diff options
Diffstat (limited to 'storage')
387 files changed, 53735 insertions, 18162 deletions
diff --git a/storage/archive/ha_archive.cc b/storage/archive/ha_archive.cc index 342f8be956e..5a6251a5de6 100644 --- a/storage/archive/ha_archive.cc +++ b/storage/archive/ha_archive.cc @@ -699,7 +699,7 @@ int ha_archive::create(const char *name, TABLE *table_arg, { KEY *pos= table_arg->key_info+key; KEY_PART_INFO *key_part= pos->key_part; - KEY_PART_INFO *key_part_end= key_part + pos->key_parts; + KEY_PART_INFO *key_part_end= key_part + pos->user_defined_key_parts; for (; key_part != key_part_end; key_part++) { diff --git a/storage/csv/ha_tina.cc b/storage/csv/ha_tina.cc index 916c7b151de..c25bc4f2713 100644 --- a/storage/csv/ha_tina.cc +++ b/storage/csv/ha_tina.cc @@ -1308,7 +1308,7 @@ bool ha_tina::get_write_pos(my_off_t *end_pos, tina_set *closest_hole) if (closest_hole == chain_ptr) /* no more chains */ *end_pos= file_buff->end(); else - *end_pos= min(file_buff->end(), closest_hole->begin); + *end_pos= MY_MIN(file_buff->end(), closest_hole->begin); return (closest_hole != chain_ptr) && (*end_pos == closest_hole->begin); } @@ -1545,7 +1545,7 @@ int ha_tina::repair(THD* thd, HA_CHECK_OPT* check_opt) /* write repaired file */ while (1) { - write_end= min(file_buff->end(), current_position); + write_end= MY_MIN(file_buff->end(), current_position); if ((write_end - write_begin) && (mysql_file_write(repair_file, (uchar*)file_buff->ptr(), (size_t) (write_end - write_begin), MYF_RW))) diff --git a/storage/federated/ha_federated.cc b/storage/federated/ha_federated.cc index 0c07af0a554..333e3b0b672 100644 --- a/storage/federated/ha_federated.cc +++ b/storage/federated/ha_federated.cc @@ -584,7 +584,7 @@ static int parse_url_error(FEDERATED_SHARE *share, TABLE *table, int error_num) size_t buf_len; DBUG_ENTER("ha_federated parse_url_error"); - buf_len= min(table->s->connect_string.length, + buf_len= MY_MIN(table->s->connect_string.length, FEDERATED_QUERY_BUFFER_SIZE-1); strmake(buf, table->s->connect_string.str, buf_len); my_error(error_num, MYF(0), buf); @@ -1317,7 
+1317,7 @@ bool ha_federated::create_where_from_key(String *to, } for (key_part= key_info->key_part, - remainder= key_info->key_parts, + remainder= key_info->user_defined_key_parts, length= ranges[i]->length, ptr= ranges[i]->key; ; remainder--, @@ -1325,7 +1325,7 @@ bool ha_federated::create_where_from_key(String *to, { Field *field= key_part->field; uint store_length= key_part->store_length; - uint part_length= min(store_length, length); + uint part_length= MY_MIN(store_length, length); needs_quotes= field->str_needs_quotes(); DBUG_DUMP("key, start of loop", ptr, length); diff --git a/storage/federatedx/ha_federatedx.cc b/storage/federatedx/ha_federatedx.cc index e1c2a38964a..f5cb284c7c4 100644 --- a/storage/federatedx/ha_federatedx.cc +++ b/storage/federatedx/ha_federatedx.cc @@ -522,7 +522,7 @@ static int parse_url_error(FEDERATEDX_SHARE *share, TABLE *table, int error_num) int buf_len; DBUG_ENTER("ha_federatedx parse_url_error"); - buf_len= min(table->s->connect_string.length, + buf_len= MY_MIN(table->s->connect_string.length, FEDERATEDX_QUERY_BUFFER_SIZE-1); strmake(buf, table->s->connect_string.str, buf_len); my_error(error_num, MYF(0), buf); @@ -1246,7 +1246,7 @@ bool ha_federatedx::create_where_from_key(String *to, { Field *field= key_part->field; uint store_length= key_part->store_length; - uint part_length= min(store_length, length); + uint part_length= MY_MIN(store_length, length); needs_quotes= field->str_needs_quotes(); DBUG_DUMP("key, start of loop", ptr, length); diff --git a/storage/heap/ha_heap.cc b/storage/heap/ha_heap.cc index 8e63799680b..66d64c54b89 100644 --- a/storage/heap/ha_heap.cc +++ b/storage/heap/ha_heap.cc @@ -221,14 +221,14 @@ void ha_heap::update_key_stats() if (key->algorithm != HA_KEY_ALG_BTREE) { if (key->flags & HA_NOSAME) - key->rec_per_key[key->key_parts-1]= 1; + key->rec_per_key[key->user_defined_key_parts-1]= 1; else { ha_rows hash_buckets= file->s->keydef[i].hash_buckets; uint no_records= hash_buckets ? 
(uint) (file->s->records/hash_buckets) : 2; if (no_records < 2) no_records= 2; - key->rec_per_key[key->key_parts-1]= no_records; + key->rec_per_key[key->user_defined_key_parts-1]= no_records; } } } @@ -611,7 +611,7 @@ ha_rows ha_heap::records_in_range(uint inx, key_range *min_key, /* Assert that info() did run. We need current statistics here. */ DBUG_ASSERT(key_stat_version == file->s->key_stat_version); - return key->rec_per_key[key->key_parts-1]; + return key->rec_per_key[key->user_defined_key_parts-1]; } @@ -630,7 +630,7 @@ heap_prepare_hp_create_info(TABLE *table_arg, bool internal_table, bzero(hp_create_info, sizeof(*hp_create_info)); for (key= parts= 0; key < keys; key++) - parts+= table_arg->key_info[key].key_parts; + parts+= table_arg->key_info[key].user_defined_key_parts; if (!(keydef= (HP_KEYDEF*) my_malloc(keys * sizeof(HP_KEYDEF) + parts * sizeof(HA_KEYSEG), @@ -641,9 +641,9 @@ heap_prepare_hp_create_info(TABLE *table_arg, bool internal_table, { KEY *pos= table_arg->key_info+key; KEY_PART_INFO *key_part= pos->key_part; - KEY_PART_INFO *key_part_end= key_part + pos->key_parts; + KEY_PART_INFO *key_part_end= key_part + pos->user_defined_key_parts; - keydef[key].keysegs= (uint) pos->key_parts; + keydef[key].keysegs= (uint) pos->user_defined_key_parts; keydef[key].flag= (pos->flags & (HA_NOSAME | HA_NULL_ARE_EQUAL)); keydef[key].seg= seg; diff --git a/storage/heap/hp_create.c b/storage/heap/hp_create.c index a8bc8e63810..e286ff69e61 100644 --- a/storage/heap/hp_create.c +++ b/storage/heap/hp_create.c @@ -254,18 +254,18 @@ static void init_block(HP_BLOCK *block, uint reclength, ulong min_records, If not min_records and max_records are given, optimize for 1000 rows */ if (!min_records) - min_records= min(1000, max_records); + min_records= MY_MIN(1000, max_records); if (!max_records) - max_records= max(min_records, 1000); + max_records= MY_MAX(min_records, 1000); /* We don't want too few records_in_block as otherwise the overhead of of the HP_PTRS block will 
be too notable */ - records_in_block= max(1000, min_records); - records_in_block= min(records_in_block, max_records); + records_in_block= MY_MAX(1000, min_records); + records_in_block= MY_MIN(records_in_block, max_records); /* If big max_records is given, allocate bigger blocks */ - records_in_block= max(records_in_block, max_records / 10); + records_in_block= MY_MAX(records_in_block, max_records / 10); /* We don't want too few blocks per row either */ if (records_in_block < 10) records_in_block= 10; diff --git a/storage/heap/hp_test2.c b/storage/heap/hp_test2.c index 058a2904697..13b49fbb7ec 100644 --- a/storage/heap/hp_test2.c +++ b/storage/heap/hp_test2.c @@ -132,7 +132,7 @@ int main(int argc, char *argv[]) for (i=0 ; i < recant ; i++) { - n1=rnd(1000); n2=rnd(100); n3=rnd(min(recant*5,MAX_RECORDS)); + n1=rnd(1000); n2=rnd(100); n3=rnd(MY_MIN(recant*5,MAX_RECORDS)); make_record(record,n1,n2,n3,"Pos",write_count); if (heap_write(file,record)) @@ -208,7 +208,7 @@ int main(int argc, char *argv[]) printf("- Update\n"); for (i=0 ; i < write_count/10 ; i++) { - n1=rnd(1000); n2=rnd(100); n3=rnd(min(recant*2,MAX_RECORDS)); + n1=rnd(1000); n2=rnd(100); n3=rnd(MY_MIN(recant*2,MAX_RECORDS)); make_record(record2, n1, n2, n3, "XXX", update); if (rnd(2) == 1) { diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt index 318b45e43ae..ee8758a08d2 100644 --- a/storage/innobase/CMakeLists.txt +++ b/storage/innobase/CMakeLists.txt @@ -54,6 +54,8 @@ SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DUNIV_DEBUG -DUNIV_SYNC_DEB #SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wconversion") #ENDIF() +CHECK_FUNCTION_EXISTS(sched_getcpu HAVE_SCHED_GETCPU) + IF(NOT MSVC) # either define HAVE_IB_GCC_ATOMIC_BUILTINS or not IF(NOT CMAKE_CROSSCOMPILING) @@ -95,12 +97,36 @@ IF(NOT CMAKE_CROSSCOMPILING) }" HAVE_IB_GCC_ATOMIC_BUILTINS ) + CHECK_C_SOURCE_RUNS( + "#include<stdint.h> + int main() + { + int64_t x,y,res; + + x = 10; + y = 123; + res = __sync_sub_and_fetch(&y, 
x); + if (res != y || y != 113) { + return(1); + } + res = __sync_add_and_fetch(&y, x); + if (res != y || y != 123) { + return(1); + } + return(0); + }" + HAVE_IB_GCC_ATOMIC_BUILTINS_64 + ) ENDIF() IF(HAVE_IB_GCC_ATOMIC_BUILTINS) ADD_DEFINITIONS(-DHAVE_IB_GCC_ATOMIC_BUILTINS=1) ENDIF() +IF(HAVE_IB_GCC_ATOMIC_BUILTINS_64) + ADD_DEFINITIONS(-DHAVE_IB_GCC_ATOMIC_BUILTINS_64=1) +ENDIF() + # either define HAVE_IB_ATOMIC_PTHREAD_T_GCC or not IF(NOT CMAKE_CROSSCOMPILING) CHECK_C_SOURCE_RUNS( @@ -129,7 +155,8 @@ ENDIF() ENDIF(NOT MSVC) -SET(LINKER_SCRIPT) +CHECK_FUNCTION_EXISTS(asprintf HAVE_ASPRINTF) +CHECK_FUNCTION_EXISTS(vasprintf HAVE_VASPRINTF) # Solaris atomics IF(CMAKE_SYSTEM_NAME STREQUAL "SunOS") @@ -150,10 +177,6 @@ IF(CMAKE_SYSTEM_NAME STREQUAL "SunOS") ADD_DEFINITIONS(-DHAVE_IB_SOLARIS_ATOMICS=1) ENDIF() - IF(CMAKE_COMPILER_IS_GNUCC AND NOT HAVE_VISIBILITY_HIDDEN) - SET(LINKER_SCRIPT "-Wl,-M${CMAKE_CURRENT_SOURCE_DIR}/plugin_exports") - ENDIF() - IF(NOT CMAKE_CROSSCOMPILING) # either define HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS or not CHECK_C_SOURCE_COMPILES( @@ -233,13 +256,16 @@ ENDIF() IF(MSVC) # Avoid "unreferenced label" warning in generated file GET_FILENAME_COMPONENT(_SRC_DIR ${CMAKE_CURRENT_LIST_FILE} PATH) - SET_SOURCE_FILES_PROPERTIES(${_SRC_DIR}/pars/pars0grm.cc + SET_SOURCE_FILES_PROPERTIES(${_SRC_DIR}/pars/pars0grm.c PROPERTIES COMPILE_FLAGS "/wd4102") - SET_SOURCE_FILES_PROPERTIES(${_SRC_DIR}/pars/lexyy.cc + SET_SOURCE_FILES_PROPERTIES(${_SRC_DIR}/pars/lexyy.c PROPERTIES COMPILE_FLAGS "/wd4003") ENDIF() - + + SET(INNOBASE_SOURCES + api/api0api.cc + api/api0misc.cc btr/btr0btr.cc btr/btr0cur.cc btr/btr0pcur.cc @@ -260,6 +286,7 @@ SET(INNOBASE_SOURCES dict/dict0load.cc dict/dict0mem.cc dict/dict0stats.cc + dict/dict0stats_bg.cc dyn/dyn0dyn.cc eval/eval0eval.cc eval/eval0proc.cc @@ -311,9 +338,11 @@ SET(INNOBASE_SOURCES rem/rem0rec.cc row/row0ext.cc row/row0ftsort.cc + row/row0import.cc row/row0ins.cc row/row0merge.cc row/row0mysql.cc + row/row0log.cc 
row/row0purge.cc row/row0row.cc row/row0sel.cc @@ -321,6 +350,7 @@ SET(INNOBASE_SOURCES row/row0umod.cc row/row0undo.cc row/row0upd.cc + row/row0quiesce.cc row/row0vers.cc srv/srv0conc.cc srv/srv0mon.cc @@ -355,7 +385,18 @@ IF(WITH_INNODB) SET(WITH_INNOBASE_STORAGE_ENGINE TRUE) ENDIF() + +# On solaris, reduce symbol visibility, so loader does not mix +# the same symbols from builtin innodb and from shared one. +# Only required for old GCC (3.4.3) that does not support hidden visibility +IF(CMAKE_SYSTEM_NAME MATCHES "SunOS" AND CMAKE_COMPILER_IS_GNUCC + AND NOT HAVE_VISIBILITY_HIDDEN) + SET(LINKER_SCRIPT "-Wl,-M${CMAKE_CURRENT_SOURCE_DIR}/plugin_exports") +ELSE() + SET(LINKER_SCRIPT) +ENDIF() + MYSQL_ADD_PLUGIN(innobase ${INNOBASE_SOURCES} STORAGE_ENGINE DEFAULT MODULE_OUTPUT_NAME ha_innodb - LINK_LIBRARIES ${ZLIB_LIBRARY}) + LINK_LIBRARIES ${ZLIB_LIBRARY} ${LINKER_SCRIPT}) diff --git a/storage/innobase/api/api0api.cc b/storage/innobase/api/api0api.cc new file mode 100644 index 00000000000..5f9762a1846 --- /dev/null +++ b/storage/innobase/api/api0api.cc @@ -0,0 +1,3859 @@ +/***************************************************************************** + +Copyright (c) 2008, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file api/api0api.cc +InnoDB Native API + +2008-08-01 Created Sunny Bains +3/20/2011 Jimmy Yang extracted from Embedded InnoDB +*******************************************************/ + +#include "univ.i" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdarg.h> +#ifdef HAVE_UNISTD_H +#include <unistd.h> +#endif + +#include "api0api.h" +#include "api0misc.h" +#include "srv0start.h" +#include "dict0dict.h" +#include "btr0pcur.h" +#include "row0ins.h" +#include "row0upd.h" +#include "row0vers.h" +#include "trx0roll.h" +#include "dict0crea.h" +#include "row0merge.h" +#include "pars0pars.h" +#include "lock0types.h" +#include "row0sel.h" +#include "lock0lock.h" +#include "rem0cmp.h" +#include "ut0dbg.h" +#include "dict0priv.h" +#include "ut0ut.h" +#include "ha_prototypes.h" +#include "trx0roll.h" + +/** configure variable for binlog option with InnoDB APIs */ +my_bool ib_binlog_enabled = FALSE; + +/** configure variable for MDL option with InnoDB APIs */ +my_bool ib_mdl_enabled = FALSE; + +/** configure variable for disable rowlock with InnoDB APIs */ +my_bool ib_disable_row_lock = FALSE; + +/** configure variable for Transaction isolation levels */ +ulong ib_trx_level_setting = IB_TRX_READ_UNCOMMITTED; + +/** configure variable for background commit interval in seconds */ +ulong ib_bk_commit_interval = 0; + +/** InnoDB tuple types. */ +enum ib_tuple_type_t{ + TPL_TYPE_ROW, /*!< Data row tuple */ + TPL_TYPE_KEY /*!< Index key tuple */ +}; + +/** Query types supported. 
*/ +enum ib_qry_type_t{ + QRY_NON, /*!< None/Sentinel */ + QRY_INS, /*!< Insert operation */ + QRY_UPD, /*!< Update operation */ + QRY_SEL /*!< Select operation */ +}; + +/** Query graph types. */ +struct ib_qry_grph_t { + que_fork_t* ins; /*!< Innobase SQL query graph used + in inserts */ + que_fork_t* upd; /*!< Innobase SQL query graph used + in updates or deletes */ + que_fork_t* sel; /*!< dummy query graph used in + selects */ +}; + +/** Query node types. */ +struct ib_qry_node_t { + ins_node_t* ins; /*!< Innobase SQL insert node + used to perform inserts to the table */ + upd_node_t* upd; /*!< Innobase SQL update node + used to perform updates and deletes */ + sel_node_t* sel; /*!< Innobase SQL select node + used to perform selects on the table */ +}; + +/** Query processing fields. */ +struct ib_qry_proc_t { + + ib_qry_node_t node; /*!< Query node*/ + + ib_qry_grph_t grph; /*!< Query graph */ +}; + +/** Cursor instance for traversing tables/indexes. This will eventually +become row_prebuilt_t. */ +struct ib_cursor_t { + mem_heap_t* heap; /*!< Instance heap */ + + mem_heap_t* query_heap; /*!< Heap to use for query graphs */ + + ib_qry_proc_t q_proc; /*!< Query processing info */ + + ib_match_mode_t match_mode; /*!< ib_cursor_moveto match mode */ + + row_prebuilt_t* prebuilt; /*!< For reading rows */ + + bool valid_trx; /*!< Valid transaction attached */ +}; + +/** InnoDB table columns used during table and index schema creation. */ +struct ib_col_t { + const char* name; /*!< Name of column */ + + ib_col_type_t ib_col_type; /*!< Main type of the column */ + + ulint len; /*!< Length of the column */ + + ib_col_attr_t ib_col_attr; /*!< Column attributes */ + +}; + +/** InnoDB index columns used during index and index schema creation. 
*/ +struct ib_key_col_t { + const char* name; /*!< Name of column */ + + ulint prefix_len; /*!< Column index prefix len or 0 */ +}; + +struct ib_table_def_t; + +/** InnoDB index schema used during index creation */ +struct ib_index_def_t { + mem_heap_t* heap; /*!< Heap used to build this and all + its columns in the list */ + + const char* name; /*!< Index name */ + + dict_table_t* table; /*!< Parent InnoDB table */ + + ib_table_def_t* schema; /*!< Parent table schema that owns + this instance */ + + ibool clustered; /*!< True if clustered index */ + + ibool unique; /*!< True if unique index */ + + ib_vector_t* cols; /*!< Vector of columns */ + + trx_t* usr_trx; /*!< User transacton covering the + DDL operations */ +}; + +/** InnoDB table schema used during table creation */ +struct ib_table_def_t { + mem_heap_t* heap; /*!< Heap used to build this and all + its columns in the list */ + const char* name; /*!< Table name */ + + ib_tbl_fmt_t ib_tbl_fmt; /*!< Row format */ + + ulint page_size; /*!< Page size */ + + ib_vector_t* cols; /*!< Vector of columns */ + + ib_vector_t* indexes; /*!< Vector of indexes */ + + dict_table_t* table; /* Table read from or NULL */ +}; + +/** InnoDB tuple used for key operations. */ +struct ib_tuple_t { + mem_heap_t* heap; /*!< Heap used to build + this and for copying + the column values. */ + + ib_tuple_type_t type; /*!< Tuple discriminitor. */ + + const dict_index_t* index; /*!< Index for tuple can be either + secondary or cluster index. */ + + dtuple_t* ptr; /*!< The internal tuple + instance */ +}; + +/** The following counter is used to convey information to InnoDB +about server activity: in selects it is not sensible to call +srv_active_wake_master_thread after each fetch or search, we only do +it every INNOBASE_WAKE_INTERVAL'th step. */ + +#define INNOBASE_WAKE_INTERVAL 32 + +/*****************************************************************//** +Check whether the Innodb persistent cursor is positioned. 
+@return IB_TRUE if positioned */ +UNIV_INLINE +ib_bool_t +ib_btr_cursor_is_positioned( +/*========================*/ + btr_pcur_t* pcur) /*!< in: InnoDB persistent cursor */ +{ + return(pcur->old_stored == BTR_PCUR_OLD_STORED + && (pcur->pos_state == BTR_PCUR_IS_POSITIONED + || pcur->pos_state == BTR_PCUR_WAS_POSITIONED)); +} + + +/********************************************************************//** +Open a table using the table id, if found then increment table ref count. +@return table instance if found */ +static +dict_table_t* +ib_open_table_by_id( +/*================*/ + ib_id_u64_t tid, /*!< in: table id to lookup */ + ib_bool_t locked) /*!< in: TRUE if own dict mutex */ +{ + dict_table_t* table; + table_id_t table_id; + + table_id = tid; + + if (!locked) { + dict_mutex_enter_for_mysql(); + } + + table = dict_table_open_on_id(table_id, FALSE, FALSE); + + if (table != NULL && table->ibd_file_missing) { + table = NULL; + } + + if (!locked) { + dict_mutex_exit_for_mysql(); + } + + return(table); +} + +/********************************************************************//** +Open a table using the table name, if found then increment table ref count. +@return table instance if found */ +UNIV_INTERN +void* +ib_open_table_by_name( +/*==================*/ + const char* name) /*!< in: table name to lookup */ +{ + dict_table_t* table; + + table = dict_table_open_on_name(name, FALSE, FALSE, + DICT_ERR_IGNORE_NONE); + + if (table != NULL && table->ibd_file_missing) { + table = NULL; + } + + return(table); +} + +/********************************************************************//** +Find table using table name. 
+@return table instance if found */ +static +dict_table_t* +ib_lookup_table_by_name( +/*====================*/ + const char* name) /*!< in: table name to lookup */ +{ + dict_table_t* table; + + table = dict_table_get_low(name); + + if (table != NULL && table->ibd_file_missing) { + table = NULL; + } + + return(table); +} + +/********************************************************************//** +Increments innobase_active_counter and every INNOBASE_WAKE_INTERVALth +time calls srv_active_wake_master_thread. This function should be used +when a single database operation may introduce a small need for +server utility activity, like checkpointing. */ +UNIV_INLINE +void +ib_wake_master_thread(void) +/*=======================*/ +{ + static ulint ib_signal_counter = 0; + + ++ib_signal_counter; + + if ((ib_signal_counter % INNOBASE_WAKE_INTERVAL) == 0) { + srv_active_wake_master_thread(); + } +} + +/*********************************************************************//** +Calculate the max row size of the columns in a cluster index. +@return max row length */ +UNIV_INLINE +ulint +ib_get_max_row_len( +/*===============*/ + dict_index_t* cluster) /*!< in: cluster index */ +{ + ulint i; + ulint max_len = 0; + ulint n_fields = cluster->n_fields; + + /* Add the size of the ordering columns in the + clustered index. */ + for (i = 0; i < n_fields; ++i) { + const dict_col_t* col; + + col = dict_index_get_nth_col(cluster, i); + + /* Use the maximum output size of + mach_write_compressed(), although the encoded + length should always fit in 2 bytes. */ + max_len += dict_col_get_max_size(col); + } + + return(max_len); +} + +/*****************************************************************//** +Read the columns from a rec into a tuple. 
*/ +static +void +ib_read_tuple( +/*==========*/ + const rec_t* rec, /*!< in: Record to read */ + ib_bool_t page_format, /*!< in: IB_TRUE if compressed format */ + ib_tuple_t* tuple) /*!< in: tuple to read into */ +{ + ulint i; + void* ptr; + rec_t* copy; + ulint rec_meta_data; + ulint n_index_fields; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + dtuple_t* dtuple = tuple->ptr; + const dict_index_t* index = tuple->index; + + rec_offs_init(offsets_); + + offsets = rec_get_offsets( + rec, index, offsets, ULINT_UNDEFINED, &tuple->heap); + + rec_meta_data = rec_get_info_bits(rec, page_format); + dtuple_set_info_bits(dtuple, rec_meta_data); + + /* Make a copy of the rec. */ + ptr = mem_heap_alloc(tuple->heap, rec_offs_size(offsets)); + copy = rec_copy(ptr, rec, offsets); + + n_index_fields = ut_min( + rec_offs_n_fields(offsets), dtuple_get_n_fields(dtuple)); + + for (i = 0; i < n_index_fields; ++i) { + ulint len; + const byte* data; + dfield_t* dfield; + + if (tuple->type == TPL_TYPE_ROW) { + const dict_col_t* col; + ulint col_no; + const dict_field_t* index_field; + + index_field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(index_field); + col_no = dict_col_get_no(col); + + dfield = dtuple_get_nth_field(dtuple, col_no); + } else { + dfield = dtuple_get_nth_field(dtuple, i); + } + + data = rec_get_nth_field(copy, offsets, i, &len); + + /* Fetch and copy any externally stored column. */ + if (rec_offs_nth_extern(offsets, i)) { + + ulint zip_size; + + zip_size = dict_table_zip_size(index->table); + + data = btr_rec_copy_externally_stored_field( + copy, offsets, zip_size, i, &len, + tuple->heap); + + ut_a(len != UNIV_SQL_NULL); + } + + dfield_set_data(dfield, data, len); + } +} + +/*****************************************************************//** +Create an InnoDB key tuple. 
+@return tuple instance created, or NULL */ +static +ib_tpl_t +ib_key_tuple_new_low( +/*=================*/ + const dict_index_t* index, /*!< in: index for which tuple + required */ + ulint n_cols, /*!< in: no. of user defined cols */ + mem_heap_t* heap) /*!< in: memory heap */ +{ + ib_tuple_t* tuple; + ulint i; + ulint n_cmp_cols; + + tuple = static_cast<ib_tuple_t*>( + mem_heap_alloc(heap, sizeof(*tuple))); + + if (tuple == NULL) { + mem_heap_free(heap); + return(NULL); + } + + tuple->heap = heap; + tuple->index = index; + tuple->type = TPL_TYPE_KEY; + + /* Is it a generated clustered index ? */ + if (n_cols == 0) { + ++n_cols; + } + + tuple->ptr = dtuple_create(heap, n_cols); + + /* Copy types and set to SQL_NULL. */ + dict_index_copy_types(tuple->ptr, index, n_cols); + + for (i = 0; i < n_cols; i++) { + + dfield_t* dfield; + + dfield = dtuple_get_nth_field(tuple->ptr, i); + dfield_set_null(dfield); + } + + n_cmp_cols = dict_index_get_n_ordering_defined_by_user(index); + + dtuple_set_n_fields_cmp(tuple->ptr, n_cmp_cols); + + return((ib_tpl_t) tuple); +} + +/*****************************************************************//** +Create an InnoDB key tuple. +@return tuple instance created, or NULL */ +static +ib_tpl_t +ib_key_tuple_new( +/*=============*/ + const dict_index_t* index, /*!< in: index of tuple */ + ulint n_cols) /*!< in: no. of user defined cols */ +{ + mem_heap_t* heap; + + heap = mem_heap_create(64); + + if (heap == NULL) { + return(NULL); + } + + return(ib_key_tuple_new_low(index, n_cols, heap)); +} + +/*****************************************************************//** +Create an InnoDB row tuple. +@return tuple instance, or NULL */ +static +ib_tpl_t +ib_row_tuple_new_low( +/*=================*/ + const dict_index_t* index, /*!< in: index of tuple */ + ulint n_cols, /*!< in: no. 
of cols in tuple */ + mem_heap_t* heap) /*!< in: memory heap */ +{ + ib_tuple_t* tuple; + + tuple = static_cast<ib_tuple_t*>(mem_heap_alloc(heap, sizeof(*tuple))); + + if (tuple == NULL) { + mem_heap_free(heap); + return(NULL); + } + + tuple->heap = heap; + tuple->index = index; + tuple->type = TPL_TYPE_ROW; + + tuple->ptr = dtuple_create(heap, n_cols); + + /* Copy types and set to SQL_NULL. */ + dict_table_copy_types(tuple->ptr, index->table); + + return((ib_tpl_t) tuple); +} + +/*****************************************************************//** +Create an InnoDB row tuple. +@return tuple instance, or NULL */ +static +ib_tpl_t +ib_row_tuple_new( +/*=============*/ + const dict_index_t* index, /*!< in: index of tuple */ + ulint n_cols) /*!< in: no. of cols in tuple */ +{ + mem_heap_t* heap; + + heap = mem_heap_create(64); + + if (heap == NULL) { + return(NULL); + } + + return(ib_row_tuple_new_low(index, n_cols, heap)); +} + +/*****************************************************************//** +Begin a transaction. +@return innobase txn handle */ +UNIV_INTERN +ib_err_t +ib_trx_start( +/*=========*/ + ib_trx_t ib_trx, /*!< in: transaction to restart */ + ib_trx_level_t ib_trx_level, /*!< in: trx isolation level */ + void* thd) /*!< in: THD */ +{ + ib_err_t err = DB_SUCCESS; + trx_t* trx = (trx_t*) ib_trx; + + ut_a(ib_trx_level <= IB_TRX_SERIALIZABLE); + + trx_start_if_not_started(trx); + + trx->isolation_level = ib_trx_level; + + /* FIXME: This is a place holder, we should add an arg that comes + from the client. */ + trx->mysql_thd = static_cast<THD*>(thd); + + return(err); +} + +/*****************************************************************//** +Begin a transaction. This will allocate a new transaction handle. +put the transaction in the active state. 
+@return innobase txn handle */ +UNIV_INTERN +ib_trx_t +ib_trx_begin( +/*=========*/ + ib_trx_level_t ib_trx_level) /*!< in: trx isolation level */ +{ + trx_t* trx; + ib_bool_t started; + + trx = trx_allocate_for_mysql(); + started = ib_trx_start((ib_trx_t) trx, ib_trx_level, NULL); + ut_a(started); + + return((ib_trx_t) trx); +} + +/*****************************************************************//** +Get the transaction's state. +@return transaction state */ +UNIV_INTERN +ib_trx_state_t +ib_trx_state( +/*=========*/ + ib_trx_t ib_trx) /*!< in: trx handle */ +{ + trx_t* trx = (trx_t*) ib_trx; + + return((ib_trx_state_t) trx->state); +} + +/*****************************************************************//** +Get a trx start time. +@return trx start_time */ +UNIV_INTERN +ib_u64_t +ib_trx_get_start_time( +/*==================*/ + ib_trx_t ib_trx) /*!< in: transaction */ +{ + trx_t* trx = (trx_t*) ib_trx; + return(static_cast<ib_u64_t>(trx->start_time)); +} +/*****************************************************************//** +Release the resources of the transaction. +@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_trx_release( +/*===========*/ + ib_trx_t ib_trx) /*!< in: trx handle */ +{ + trx_t* trx = (trx_t*) ib_trx; + + ut_ad(trx != NULL); + trx_free_for_mysql(trx); + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Commit a transaction. This function will also release the schema +latches too. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_trx_commit( +/*==========*/ + ib_trx_t ib_trx) /*!< in: trx handle */ +{ + ib_err_t err = DB_SUCCESS; + trx_t* trx = (trx_t*) ib_trx; + + if (trx->state == TRX_STATE_NOT_STARTED) { + err = ib_trx_release(ib_trx); + return(err); + } + + trx_commit(trx); + + err = ib_trx_release(ib_trx); + ut_a(err == DB_SUCCESS); + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Rollback a transaction. 
This function will also release the schema +latches too. +@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_trx_rollback( +/*============*/ + ib_trx_t ib_trx) /*!< in: trx handle */ +{ + ib_err_t err; + trx_t* trx = (trx_t*) ib_trx; + + err = static_cast<ib_err_t>(trx_rollback_for_mysql(trx)); + + /* It should always succeed */ + ut_a(err == DB_SUCCESS); + + err = ib_trx_release(ib_trx); + ut_a(err == DB_SUCCESS); + + ib_wake_master_thread(); + + return(err); +} + +/*****************************************************************//** +Find an index definition from the index vector using index name. +@return index def. if found else NULL */ +UNIV_INLINE +const ib_index_def_t* +ib_table_find_index( +/*================*/ + ib_vector_t* indexes, /*!< in: vector of indexes */ + const char* name) /*!< in: index name */ +{ + ulint i; + + for (i = 0; i < ib_vector_size(indexes); ++i) { + const ib_index_def_t* index_def; + + index_def = (ib_index_def_t*) ib_vector_get(indexes, i); + + if (innobase_strcasecmp(name, index_def->name) == 0) { + return(index_def); + } + } + + return(NULL); +} + +/*****************************************************************//** +Get the InnoDB internal precise type from the schema column definition. +@return precise type in api format */ +UNIV_INLINE +ulint +ib_col_get_prtype( +/*==============*/ + const ib_col_t* ib_col) /*!< in: column definition */ +{ + ulint prtype = 0; + + if (ib_col->ib_col_attr & IB_COL_UNSIGNED) { + prtype |= DATA_UNSIGNED; + + ut_a(ib_col->ib_col_type == IB_INT); + } + + if (ib_col->ib_col_attr & IB_COL_NOT_NULL) { + prtype |= DATA_NOT_NULL; + } + + return(prtype); +} + +/*****************************************************************//** +Get the InnoDB internal main type from the schema column definition. 
@return column main type */
UNIV_INLINE
ulint
ib_col_get_mtype(
/*==============*/
	const ib_col_t*	ib_col)	/*!< in: column definition */
{
	/* Note: The api0api.h types should map directly to
	the internal numeric codes. */
	return(ib_col->ib_col_type);
}

/*****************************************************************//**
Find a column in the column vector with the same name.
@return col. def. if found else NULL */
UNIV_INLINE
const ib_col_t*
ib_table_find_col(
/*==============*/
	const ib_vector_t*	cols,	/*!< in: column list head */
	const char*		name)	/*!< in: column name to find */
{
	ulint		i;

	for (i = 0; i < ib_vector_size(cols); ++i) {
		const ib_col_t*	ib_col;

		/* ib_vector_get() is not const-correct, hence the cast. */
		ib_col = static_cast<const ib_col_t*>(
			ib_vector_get((ib_vector_t*) cols, i));

		/* Column names are matched case-insensitively. */
		if (innobase_strcasecmp(ib_col->name, name) == 0) {
			return(ib_col);
		}
	}

	return(NULL);
}

/*****************************************************************//**
Find a column in the column list with the same name.
@return col. def. if found else NULL */
UNIV_INLINE
const ib_key_col_t*
ib_index_find_col(
/*==============*/
	ib_vector_t*	cols,	/*!< in: column list head */
	const char*	name)	/*!< in: column name to find */
{
	ulint		i;

	for (i = 0; i < ib_vector_size(cols); ++i) {
		const ib_key_col_t*	ib_col;

		ib_col = static_cast<ib_key_col_t*>(ib_vector_get(cols, i));

		if (innobase_strcasecmp(ib_col->name, name) == 0) {
			return(ib_col);
		}
	}

	return(NULL);
}

#ifdef __WIN__
/*****************************************************************//**
Convert a string to lower case, in place. */
static
void
ib_to_lower_case(
/*=============*/
	char*	ptr)		/*!< string to convert to lower case */
{
	while (*ptr) {
		*ptr = tolower(*ptr);
		++ptr;
	}
}
#endif /* __WIN__ */

/*****************************************************************//**
Normalizes a table name string. A normalized name consists of the
database name catenated to '/' and table name. An example:
test/mytable. On Windows normalization puts both the database name and the
table name always to lower case. This function can be called for system
tables and they don't have a database component. For tables that don't have
a database component, we don't normalize them to lower case on Windows.
The assumption is that they are system tables that reside in the system
table space. */
static
void
ib_normalize_table_name(
/*====================*/
	char*		norm_name,	/*!< out: normalized name as a
					null-terminated string; must be
					at least ut_strlen(name) + 1
					bytes */
	const char*	name)		/*!< in: table name string */
{
	const char*	ptr = name;

	/* Scan name from the end */

	/* NOTE(review): an empty input would make ptr point one before
	the buffer; presumably callers never pass "" -- confirm. */
	ptr += ut_strlen(name) - 1;

	/* Find the start of the table name: stop at the last path
	separator ('/' or '\\') or at the start of the string. */
	while (ptr >= name && *ptr != '\\' && *ptr != '/' && ptr > name) {
		--ptr;
	}


	/* For system tables there is no '/' or dbname. */
	ut_a(ptr >= name);

	if (ptr > name) {
		const char*	db_name;
		const char*	table_name;

		table_name = ptr + 1;

		--ptr;

		/* Find the start of the database name component. */
		while (ptr >= name && *ptr != '\\' && *ptr != '/') {
			ptr--;
		}

		db_name = ptr + 1;

		/* Copy "dbname<sep>tablename" including the terminating
		NUL; the separator is overwritten with '/' below. */
		memcpy(norm_name, db_name,
		       ut_strlen(name) + 1 - (db_name - name));

		norm_name[table_name - db_name - 1] = '/';
#ifdef __WIN__
		ib_to_lower_case(norm_name);
#endif
	} else {
		/* No separator: assume a system table name and copy
		it through unchanged. */
		ut_strcpy(norm_name, name);
	}
}

/*****************************************************************//**
Check whether the table name conforms to our requirements. Currently
we only do a simple check for the presence of a '/'.
@return DB_SUCCESS or err code */
UNIV_INTERN
ib_err_t
ib_table_name_check(
/*================*/
	const char*	name)		/*!< in: table name to check */
{
	const char*	slash = NULL;
	ulint		len = ut_strlen(name);

	/* Reject names that are too short, absolute ("/..."), that end
	in '/', or that start with a relative component ("./", "../"). */
	if (len < 2
	    || *name == '/'
	    || name[len - 1] == '/'
	    || (name[0] == '.' && name[1] == '/')
	    || (name[0] == '.' && name[1] == '.'
		&& name[2] == '/')) {

		return(DB_DATA_MISMATCH);
	}

	for ( ; *name; ++name) {
#ifdef __WIN__
		/* Check for reserved characters in DOS filenames. */
		switch (*name) {
		case ':':
		case '|':
		case '"':
		case '*':
		case '<':
		case '>':
			return(DB_DATA_MISMATCH);
		}
#endif /* __WIN__ */
		if (*name == '/') {
			/* Exactly one '/' (the dbname/tablename
			separator) is allowed. */
			if (slash) {
				return(DB_DATA_MISMATCH);
			}
			slash = name;
		}
	}

	return(slash ? DB_SUCCESS : DB_DATA_MISMATCH);
}



/*****************************************************************//**
Get an index definition that is tagged as a clustered index.
@return cluster index schema */
UNIV_INLINE
ib_index_def_t*
ib_find_clustered_index(
/*====================*/
	ib_vector_t*	indexes)	/*!< in: index defs. to search */
{
	ulint		i;
	ulint		n_indexes;

	n_indexes = ib_vector_size(indexes);

	for (i = 0; i < n_indexes; ++i) {
		ib_index_def_t*	ib_index_def;

		ib_index_def = static_cast<ib_index_def_t*>(
			ib_vector_get(indexes, i));

		if (ib_index_def->clustered) {
			return(ib_index_def);
		}
	}

	return(NULL);
}

/*****************************************************************//**
Get a table id. The caller must have acquired the dictionary mutex.
@return DB_SUCCESS if found */
static
ib_err_t
ib_table_get_id_low(
/*================*/
	const char*	table_name,	/*!< in: table to find */
	ib_id_u64_t*	table_id)	/*!< out: table id if found,
					0 otherwise */
{
	dict_table_t*	table;
	ib_err_t	err = DB_TABLE_NOT_FOUND;

	*table_id = 0;

	table = ib_lookup_table_by_name(table_name);

	if (table != NULL) {
		*table_id = (table->id);

		err = DB_SUCCESS;
	}

	return(err);
}

/*****************************************************************//**
Create an internal cursor instance.
@return DB_SUCCESS or err code */
static
ib_err_t
ib_create_cursor(
/*=============*/
	ib_crsr_t*	ib_crsr,	/*!< out: InnoDB cursor */
	dict_table_t*	table,		/*!< in: table instance */
	dict_index_t*	index,		/*!< in: index to use */
	trx_t*		trx)		/*!< in: transaction */
{
	mem_heap_t*	heap;
	ib_cursor_t*	cursor;
	ib_err_t	err = DB_SUCCESS;

	/* The cursor struct itself lives in its own heap, which is
	freed in ib_cursor_close(). */
	heap = mem_heap_create(sizeof(*cursor) * 2);

	if (heap != NULL) {
		row_prebuilt_t*	prebuilt;

		cursor = static_cast<ib_cursor_t*>(
			mem_heap_zalloc(heap, sizeof(*cursor)));

		cursor->heap = heap;

		/* Separate heap for the query graph; emptied on
		ib_cursor_reset() and freed on close. */
		cursor->query_heap = mem_heap_create(64);

		if (cursor->query_heap == NULL) {
			mem_heap_free(heap);

			return(DB_OUT_OF_MEMORY);
		}

		cursor->prebuilt = row_create_prebuilt(table, 0);

		prebuilt = cursor->prebuilt;

		prebuilt->trx = trx;

		cursor->valid_trx = TRUE;

		prebuilt->table = table;
		prebuilt->select_lock_type = LOCK_NONE;
		prebuilt->innodb_api = TRUE;

		prebuilt->index = index;

		/* NOTE(review): a NULL index trips this assertion;
		ib_create_cursor_with_index_id() can pass NULL when the
		supplied index id does not resolve -- confirm whether an
		error return would be preferable here. */
		ut_a(prebuilt->index != NULL);

		if (prebuilt->trx != NULL) {
			/* Balanced by the decrement in
			ib_cursor_reset()/ib_cursor_close(). */
			++prebuilt->trx->n_mysql_tables_in_use;

			prebuilt->index_usable =
				row_merge_is_index_usable(
					prebuilt->trx, prebuilt->index);

			/* Assign a read view if the transaction does
			not have it yet */

			trx_assign_read_view(prebuilt->trx);
		}

		*ib_crsr = (ib_crsr_t) cursor;
	} else {
		err = DB_OUT_OF_MEMORY;
	}

	return(err);
}

/*****************************************************************//**
Create an internal cursor instance, and set prebuilt->index to index
with supplied index_id.
@return DB_SUCCESS or err code */
static
ib_err_t
ib_create_cursor_with_index_id(
/*===========================*/
	ib_crsr_t*	ib_crsr,	/*!< out: InnoDB cursor */
	dict_table_t*	table,		/*!< in: table instance */
	ib_id_u64_t	index_id,	/*!< in: index id or 0 */
	trx_t*		trx)		/*!< in: transaction */
{
	dict_index_t*	index;

	if (index_id != 0) {
		/* Resolve the index by id under the dictionary mutex;
		may yield NULL for an unknown id (see the NOTE in
		ib_create_cursor()). */
		mutex_enter(&dict_sys->mutex);
		index = dict_index_find_on_id_low(index_id);
		mutex_exit(&dict_sys->mutex);
	} else {
		/* id 0 means: use the clustered (first) index. */
		index = dict_table_get_first_index(table);
	}

	return(ib_create_cursor(ib_crsr, table, index, trx));
}

/*****************************************************************//**
Open an InnoDB table and return a cursor handle to it.
@return DB_SUCCESS or err code */
UNIV_INTERN
ib_err_t
ib_cursor_open_table_using_id(
/*==========================*/
	ib_id_u64_t	table_id,	/*!< in: table id of table to open */
	ib_trx_t	ib_trx,		/*!< in: Current transaction handle
					can be NULL */
	ib_crsr_t*	ib_crsr)	/*!< out,own: InnoDB cursor */
{
	ib_err_t	err;
	dict_table_t*	table;

	/* The flag passed to ib_open_table_by_id() presumably tells it
	whether the caller already holds the schema lock exclusively --
	confirm against its definition. */
	if (ib_trx == NULL || !ib_schema_lock_is_exclusive(ib_trx)) {
		table = ib_open_table_by_id(table_id, FALSE);
	} else {
		table = ib_open_table_by_id(table_id, TRUE);
	}

	if (table == NULL) {

		return(DB_TABLE_NOT_FOUND);
	}

	/* index id 0: open on the clustered index. */
	err = ib_create_cursor_with_index_id(ib_crsr, table, 0,
					     (trx_t*) ib_trx);

	return(err);
}

/*****************************************************************//**
Open an InnoDB index and return a cursor handle to it.
@return DB_SUCCESS or err code */
UNIV_INTERN
ib_err_t
ib_cursor_open_index_using_id(
/*==========================*/
	ib_id_u64_t	index_id,	/*!< in: index id of index to open;
					the high 32 bits select the table,
					the low 32 bits the index */
	ib_trx_t	ib_trx,		/*!< in: Current transaction handle
					can be NULL */
	ib_crsr_t*	ib_crsr)	/*!< out: InnoDB cursor */
{
	ib_err_t	err;
	dict_table_t*	table;
	ulint		table_id = (ulint)( index_id >> 32);

	if (ib_trx == NULL || !ib_schema_lock_is_exclusive(ib_trx)) {
		table = ib_open_table_by_id(table_id, FALSE);
	} else {
		table = ib_open_table_by_id(table_id, TRUE);
	}

	if (table == NULL) {

		return(DB_TABLE_NOT_FOUND);
	}

	/* We only return the lower 32 bits of the dulint. */
	err = ib_create_cursor_with_index_id(
		ib_crsr, table, index_id, (trx_t*) ib_trx);

	/* NOTE(review): this tests the out-POINTER, which is never NULL
	here; presumably *ib_crsr (or err) was meant -- confirm. */
	if (ib_crsr != NULL) {
		const ib_cursor_t*	cursor;

		cursor = *(ib_cursor_t**) ib_crsr;

		if (cursor->prebuilt->index == NULL) {
			ib_err_t	crsr_err;

			crsr_err = ib_cursor_close(*ib_crsr);
			ut_a(crsr_err == DB_SUCCESS);

			*ib_crsr = NULL;
		}
	}

	return(err);
}

/*****************************************************************//**
Open an InnoDB secondary index cursor and return a cursor handle to it.
@return DB_SUCCESS or err code */
UNIV_INTERN
ib_err_t
ib_cursor_open_index_using_name(
/*============================*/
	ib_crsr_t	ib_open_crsr,	/*!< in: open/active cursor */
	const char*	index_name,	/*!< in: secondary index name */
	ib_crsr_t*	ib_crsr,	/*!< out,own: InnoDB index cursor */
	int*		idx_type,	/*!< out: index is cluster index */
	ib_id_u64_t*	idx_id)		/*!< out: index id */
{
	dict_table_t*	table;
	dict_index_t*	index;
	index_id_t	index_id = 0;
	ib_err_t	err = DB_TABLE_NOT_FOUND;
	ib_cursor_t*	cursor = (ib_cursor_t*) ib_open_crsr;

	*idx_type = 0;
	*idx_id = 0;
	*ib_crsr = NULL;

	/* We want to increment the ref count, so we do a redundant
	search. */
	table = dict_table_open_on_id(cursor->prebuilt->table->id,
				      FALSE, FALSE);
	ut_a(table != NULL);

	/* The first index is always the cluster index. */
	index = dict_table_get_first_index(table);

	/* Traverse the user defined indexes. */
	while (index != NULL) {
		if (innobase_strcasecmp(index->name, index_name) == 0) {
			index_id = index->id;
			*idx_type = index->type;
			*idx_id = index_id;
			break;
		}
		index = UT_LIST_GET_NEXT(indexes, index);
	}

	if (!index_id) {
		/* Release the reference taken above. */
		dict_table_close(table, FALSE, FALSE);
		return(DB_ERROR);
	}

	/* NOTE(review): always true after the early return above; also,
	if ib_create_cursor() fails the table reference acquired above
	appears not to be released -- confirm. */
	if (index_id > 0) {
		ut_ad(index->id == index_id);
		err = ib_create_cursor(
			ib_crsr, table, index, cursor->prebuilt->trx);
	}

	if (*ib_crsr != NULL) {
		const ib_cursor_t*	cursor;

		cursor = *(ib_cursor_t**) ib_crsr;

		if (cursor->prebuilt->index == NULL) {
			err = ib_cursor_close(*ib_crsr);
			ut_a(err == DB_SUCCESS);
			*ib_crsr = NULL;
		}
	}

	return(err);
}

/*****************************************************************//**
Open an InnoDB table and return a cursor handle to it.
+@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_open_table( +/*=================*/ + const char* name, /*!< in: table name */ + ib_trx_t ib_trx, /*!< in: Current transaction handle + can be NULL */ + ib_crsr_t* ib_crsr) /*!< out,own: InnoDB cursor */ +{ + ib_err_t err; + dict_table_t* table; + char* normalized_name; + + normalized_name = static_cast<char*>(mem_alloc(ut_strlen(name) + 1)); + ib_normalize_table_name(normalized_name, name); + + if (ib_trx != NULL) { + if (!ib_schema_lock_is_exclusive(ib_trx)) { + table = (dict_table_t*)ib_open_table_by_name( + normalized_name); + } else { + /* NOTE: We do not acquire MySQL metadata lock */ + table = ib_lookup_table_by_name(normalized_name); + } + } else { + table = (dict_table_t*)ib_open_table_by_name(normalized_name); + } + + mem_free(normalized_name); + normalized_name = NULL; + + /* It can happen that another thread has created the table but + not the cluster index or it's a broken table definition. Refuse to + open if that's the case. */ + if (table != NULL && dict_table_get_first_index(table) == NULL) { + table = NULL; + } + + if (table != NULL) { + err = ib_create_cursor_with_index_id(ib_crsr, table, 0, + (trx_t*) ib_trx); + } else { + err = DB_TABLE_NOT_FOUND; + } + + return(err); +} + +/********************************************************************//** +Free a context struct for a table handle. 
*/ +static +void +ib_qry_proc_free( +/*=============*/ + ib_qry_proc_t* q_proc) /*!< in, own: qproc struct */ +{ + que_graph_free_recursive(q_proc->grph.ins); + que_graph_free_recursive(q_proc->grph.upd); + que_graph_free_recursive(q_proc->grph.sel); + + memset(q_proc, 0x0, sizeof(*q_proc)); +} + +/*****************************************************************//** +set a cursor trx to NULL */ +UNIV_INTERN +void +ib_cursor_clear_trx( +/*================*/ + ib_crsr_t ib_crsr) /*!< in/out: InnoDB cursor */ +{ + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + + cursor->prebuilt->trx = NULL; +} + +/*****************************************************************//** +Reset the cursor. +@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_reset( +/*============*/ + ib_crsr_t ib_crsr) /*!< in/out: InnoDB cursor */ +{ + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt = cursor->prebuilt; + + if (cursor->valid_trx && prebuilt->trx != NULL + && prebuilt->trx->n_mysql_tables_in_use > 0) { + + --prebuilt->trx->n_mysql_tables_in_use; + } + + /* The fields in this data structure are allocated from + the query heap and so need to be reset too. 
*/ + ib_qry_proc_free(&cursor->q_proc); + + mem_heap_empty(cursor->query_heap); + + return(DB_SUCCESS); +} + +/*****************************************************************//** +update the cursor with new transactions and also reset the cursor +@return DB_SUCCESS or err code */ +ib_err_t +ib_cursor_new_trx( +/*==============*/ + ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */ + ib_trx_t ib_trx) /*!< in: transaction */ +{ + ib_err_t err = DB_SUCCESS; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + trx_t* trx = (trx_t*) ib_trx; + + row_prebuilt_t* prebuilt = cursor->prebuilt; + + row_update_prebuilt_trx(prebuilt, trx); + + cursor->valid_trx = TRUE; + + trx_assign_read_view(prebuilt->trx); + + ib_qry_proc_free(&cursor->q_proc); + + mem_heap_empty(cursor->query_heap); + + return(err); +} + +/*****************************************************************//** +Commit the transaction in a cursor +@return DB_SUCCESS or err code */ +ib_err_t +ib_cursor_commit_trx( +/*=================*/ + ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */ + ib_trx_t ib_trx) /*!< in: transaction */ +{ + ib_err_t err = DB_SUCCESS; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt = cursor->prebuilt; + + ut_ad(prebuilt->trx == (trx_t*) ib_trx); + err = ib_trx_commit(ib_trx); + prebuilt->trx = NULL; + cursor->valid_trx = FALSE; + return(err); +} + +/*****************************************************************//** +Close an InnoDB table and free the cursor. +@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_close( +/*============*/ + ib_crsr_t ib_crsr) /*!< in,own: InnoDB cursor */ +{ + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt; + trx_t* trx; + + if (!cursor) { + return(DB_SUCCESS); + } + + prebuilt = cursor->prebuilt; + trx = prebuilt->trx; + + ib_qry_proc_free(&cursor->q_proc); + + /* The transaction could have been detached from the cursor. 
*/ + if (cursor->valid_trx && trx != NULL + && trx->n_mysql_tables_in_use > 0) { + --trx->n_mysql_tables_in_use; + } + + row_prebuilt_free(prebuilt, FALSE); + cursor->prebuilt = NULL; + + mem_heap_free(cursor->query_heap); + mem_heap_free(cursor->heap); + cursor = NULL; + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Close the table, decrement n_ref_count count. +@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_close_table( +/*==================*/ + ib_crsr_t ib_crsr) /*!< in,own: InnoDB cursor */ +{ + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt = cursor->prebuilt; + + if (prebuilt && prebuilt->table) { + dict_table_close(prebuilt->table, FALSE, FALSE); + } + + return(DB_SUCCESS); +} +/**********************************************************************//** +Run the insert query and do error handling. +@return DB_SUCCESS or error code */ +UNIV_INLINE +ib_err_t +ib_insert_row_with_lock_retry( +/*==========================*/ + que_thr_t* thr, /*!< in: insert query graph */ + ins_node_t* node, /*!< in: insert node for the query */ + trx_savept_t* savept) /*!< in: savepoint to rollback to + in case of an error */ +{ + trx_t* trx; + ib_err_t err; + ib_bool_t lock_wait; + + trx = thr_get_trx(thr); + + do { + thr->run_node = node; + thr->prev_node = node; + + row_ins_step(thr); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + thr->lock_state = QUE_THR_LOCK_ROW; + lock_wait = ib_handle_errors(&err, trx, thr, savept); + thr->lock_state = QUE_THR_LOCK_NOLOCK; + } else { + lock_wait = FALSE; + } + } while (lock_wait); + + return(err); +} + +/*****************************************************************//** +Write a row. 
+@return DB_SUCCESS or err code */ +static +ib_err_t +ib_execute_insert_query_graph( +/*==========================*/ + dict_table_t* table, /*!< in: table where to insert */ + que_fork_t* ins_graph, /*!< in: query graph */ + ins_node_t* node) /*!< in: insert node */ +{ + trx_t* trx; + que_thr_t* thr; + trx_savept_t savept; + ib_err_t err = DB_SUCCESS; + + trx = ins_graph->trx; + + savept = trx_savept_take(trx); + + thr = que_fork_get_first_thr(ins_graph); + + que_thr_move_to_run_state_for_mysql(thr, trx); + + err = ib_insert_row_with_lock_retry(thr, node, &savept); + + if (err == DB_SUCCESS) { + que_thr_stop_for_mysql_no_error(thr, trx); + + dict_table_n_rows_inc(table); + + srv_stats.n_rows_inserted.inc(); + } + + trx->op_info = ""; + + return(err); +} + +/*****************************************************************//** +Create an insert query graph node. */ +static +void +ib_insert_query_graph_create( +/*==========================*/ + ib_cursor_t* cursor) /*!< in: Cursor instance */ +{ + ib_qry_proc_t* q_proc = &cursor->q_proc; + ib_qry_node_t* node = &q_proc->node; + trx_t* trx = cursor->prebuilt->trx; + + ut_a(trx->state != TRX_STATE_NOT_STARTED); + + if (node->ins == NULL) { + dtuple_t* row; + ib_qry_grph_t* grph = &q_proc->grph; + mem_heap_t* heap = cursor->query_heap; + dict_table_t* table = cursor->prebuilt->table; + + node->ins = ins_node_create(INS_DIRECT, table, heap); + + node->ins->select = NULL; + node->ins->values_list = NULL; + + row = dtuple_create(heap, dict_table_get_n_cols(table)); + dict_table_copy_types(row, table); + + ins_node_set_new_row(node->ins, row); + + grph->ins = static_cast<que_fork_t*>( + que_node_get_parent( + pars_complete_graph_for_exec(node->ins, trx, + heap))); + + grph->ins->state = QUE_FORK_ACTIVE; + } +} + +/*****************************************************************//** +Insert a row to a table. 
+@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_insert_row( +/*=================*/ + ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor instance */ + const ib_tpl_t ib_tpl) /*!< in: tuple to insert */ +{ + ib_ulint_t i; + ib_qry_node_t* node; + ib_qry_proc_t* q_proc; + ulint n_fields; + dtuple_t* dst_dtuple; + ib_err_t err = DB_SUCCESS; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + const ib_tuple_t* src_tuple = (const ib_tuple_t*) ib_tpl; + + ib_insert_query_graph_create(cursor); + + ut_ad(src_tuple->type == TPL_TYPE_ROW); + + q_proc = &cursor->q_proc; + node = &q_proc->node; + + node->ins->state = INS_NODE_ALLOC_ROW_ID; + dst_dtuple = node->ins->row; + + n_fields = dtuple_get_n_fields(src_tuple->ptr); + ut_ad(n_fields == dtuple_get_n_fields(dst_dtuple)); + + /* Do a shallow copy of the data fields and check for NULL + constraints on columns. */ + for (i = 0; i < n_fields; i++) { + ulint mtype; + dfield_t* src_field; + dfield_t* dst_field; + + src_field = dtuple_get_nth_field(src_tuple->ptr, i); + + mtype = dtype_get_mtype(dfield_get_type(src_field)); + + /* Don't touch the system columns. */ + if (mtype != DATA_SYS) { + ulint prtype; + + prtype = dtype_get_prtype(dfield_get_type(src_field)); + + if ((prtype & DATA_NOT_NULL) + && dfield_is_null(src_field)) { + + err = DB_DATA_MISMATCH; + break; + } + + dst_field = dtuple_get_nth_field(dst_dtuple, i); + ut_ad(mtype + == dtype_get_mtype(dfield_get_type(dst_field))); + + /* Do a shallow copy. */ + dfield_set_data( + dst_field, src_field->data, src_field->len); + + if (dst_field->len != IB_SQL_NULL) { + UNIV_MEM_ASSERT_RW(dst_field->data, + dst_field->len); + } + } + } + + if (err == DB_SUCCESS) { + err = ib_execute_insert_query_graph( + src_tuple->index->table, q_proc->grph.ins, node->ins); + } + + return(err); +} + +/*********************************************************************//** +Gets pointer to a prebuilt update vector used in updates. 
+@return update vector */ +UNIV_INLINE +upd_t* +ib_update_vector_create( +/*====================*/ + ib_cursor_t* cursor) /*!< in: current cursor */ +{ + trx_t* trx = cursor->prebuilt->trx; + mem_heap_t* heap = cursor->query_heap; + dict_table_t* table = cursor->prebuilt->table; + ib_qry_proc_t* q_proc = &cursor->q_proc; + ib_qry_grph_t* grph = &q_proc->grph; + ib_qry_node_t* node = &q_proc->node; + + ut_a(trx->state != TRX_STATE_NOT_STARTED); + + if (node->upd == NULL) { + node->upd = static_cast<upd_node_t*>( + row_create_update_node_for_mysql(table, heap)); + } + + grph->upd = static_cast<que_fork_t*>( + que_node_get_parent( + pars_complete_graph_for_exec(node->upd, trx, heap))); + + grph->upd->state = QUE_FORK_ACTIVE; + + return(node->upd->update); +} + +/**********************************************************************//** +Note that a column has changed. */ +static +void +ib_update_col( +/*==========*/ + + ib_cursor_t* cursor, /*!< in: current cursor */ + upd_field_t* upd_field, /*!< in/out: update field */ + ulint col_no, /*!< in: column number */ + dfield_t* dfield) /*!< in: updated dfield */ +{ + ulint data_len; + dict_table_t* table = cursor->prebuilt->table; + dict_index_t* index = dict_table_get_first_index(table); + + data_len = dfield_get_len(dfield); + + if (data_len == UNIV_SQL_NULL) { + dfield_set_null(&upd_field->new_val); + } else { + dfield_copy_data(&upd_field->new_val, dfield); + } + + upd_field->exp = NULL; + + upd_field->orig_len = 0; + + upd_field->field_no = dict_col_get_clust_pos( + &table->cols[col_no], index); +} + +/**********************************************************************//** +Checks which fields have changed in a row and stores the new data +to an update vector. 
+@return DB_SUCCESS or err code */ +static +ib_err_t +ib_calc_diff( +/*=========*/ + ib_cursor_t* cursor, /*!< in: current cursor */ + upd_t* upd, /*!< in/out: update vector */ + const ib_tuple_t*old_tuple, /*!< in: Old tuple in table */ + const ib_tuple_t*new_tuple) /*!< in: New tuple to update */ +{ + ulint i; + ulint n_changed = 0; + ib_err_t err = DB_SUCCESS; + ulint n_fields = dtuple_get_n_fields(new_tuple->ptr); + + ut_a(old_tuple->type == TPL_TYPE_ROW); + ut_a(new_tuple->type == TPL_TYPE_ROW); + ut_a(old_tuple->index->table == new_tuple->index->table); + + for (i = 0; i < n_fields; ++i) { + ulint mtype; + ulint prtype; + upd_field_t* upd_field; + dfield_t* new_dfield; + dfield_t* old_dfield; + + new_dfield = dtuple_get_nth_field(new_tuple->ptr, i); + old_dfield = dtuple_get_nth_field(old_tuple->ptr, i); + + mtype = dtype_get_mtype(dfield_get_type(old_dfield)); + prtype = dtype_get_prtype(dfield_get_type(old_dfield)); + + /* Skip the system columns */ + if (mtype == DATA_SYS) { + continue; + + } else if ((prtype & DATA_NOT_NULL) + && dfield_is_null(new_dfield)) { + + err = DB_DATA_MISMATCH; + break; + } + + if (dfield_get_len(new_dfield) != dfield_get_len(old_dfield) + || (!dfield_is_null(old_dfield) + && memcmp(dfield_get_data(new_dfield), + dfield_get_data(old_dfield), + dfield_get_len(old_dfield)) != 0)) { + + upd_field = &upd->fields[n_changed]; + + ib_update_col(cursor, upd_field, i, new_dfield); + + ++n_changed; + } + } + + if (err == DB_SUCCESS) { + upd->info_bits = 0; + upd->n_fields = n_changed; + } + + return(err); +} + +/**********************************************************************//** +Run the update query and do error handling. 
+@return DB_SUCCESS or error code */ +UNIV_INLINE +ib_err_t +ib_update_row_with_lock_retry( +/*==========================*/ + que_thr_t* thr, /*!< in: Update query graph */ + upd_node_t* node, /*!< in: Update node for the query */ + trx_savept_t* savept) /*!< in: savepoint to rollback to + in case of an error */ + +{ + trx_t* trx; + ib_err_t err; + ib_bool_t lock_wait; + + trx = thr_get_trx(thr); + + do { + thr->run_node = node; + thr->prev_node = node; + + row_upd_step(thr); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + if (err != DB_RECORD_NOT_FOUND) { + thr->lock_state = QUE_THR_LOCK_ROW; + + lock_wait = ib_handle_errors( + &err, trx, thr, savept); + + thr->lock_state = QUE_THR_LOCK_NOLOCK; + } else { + lock_wait = FALSE; + } + } else { + lock_wait = FALSE; + } + } while (lock_wait); + + return(err); +} + +/*********************************************************************//** +Does an update or delete of a row. +@return DB_SUCCESS or err code */ +UNIV_INLINE +ib_err_t +ib_execute_update_query_graph( +/*==========================*/ + ib_cursor_t* cursor, /*!< in: Cursor instance */ + btr_pcur_t* pcur) /*!< in: Btree persistent cursor */ +{ + ib_err_t err; + que_thr_t* thr; + upd_node_t* node; + trx_savept_t savept; + trx_t* trx = cursor->prebuilt->trx; + dict_table_t* table = cursor->prebuilt->table; + ib_qry_proc_t* q_proc = &cursor->q_proc; + + /* The transaction must be running. 
*/ + ut_a(trx->state != TRX_STATE_NOT_STARTED); + + node = q_proc->node.upd; + + ut_a(dict_index_is_clust(pcur->btr_cur.index)); + btr_pcur_copy_stored_position(node->pcur, pcur); + + ut_a(node->pcur->rel_pos == BTR_PCUR_ON); + + savept = trx_savept_take(trx); + + thr = que_fork_get_first_thr(q_proc->grph.upd); + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + que_thr_move_to_run_state_for_mysql(thr, trx); + + err = ib_update_row_with_lock_retry(thr, node, &savept); + + if (err == DB_SUCCESS) { + + que_thr_stop_for_mysql_no_error(thr, trx); + + if (node->is_delete) { + + dict_table_n_rows_dec(table); + + srv_stats.n_rows_deleted.inc(); + } else { + srv_stats.n_rows_updated.inc(); + } + + } else if (err == DB_RECORD_NOT_FOUND) { + trx->error_state = DB_SUCCESS; + } + + trx->op_info = ""; + + return(err); +} + +/*****************************************************************//** +Update a row in a table. +@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_update_row( +/*=================*/ + ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */ + const ib_tpl_t ib_old_tpl, /*!< in: Old tuple in table */ + const ib_tpl_t ib_new_tpl) /*!< in: New tuple to update */ +{ + upd_t* upd; + ib_err_t err; + btr_pcur_t* pcur; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt = cursor->prebuilt; + const ib_tuple_t*old_tuple = (const ib_tuple_t*) ib_old_tpl; + const ib_tuple_t*new_tuple = (const ib_tuple_t*) ib_new_tpl; + + if (dict_index_is_clust(prebuilt->index)) { + pcur = &cursor->prebuilt->pcur; + } else if (prebuilt->need_to_access_clustered) { + pcur = &cursor->prebuilt->clust_pcur; + } else { + return(DB_ERROR); + } + + ut_a(old_tuple->type == TPL_TYPE_ROW); + ut_a(new_tuple->type == TPL_TYPE_ROW); + + upd = ib_update_vector_create(cursor); + + err = ib_calc_diff(cursor, upd, old_tuple, new_tuple); + + if (err == DB_SUCCESS) { + /* Note that this is not a delete. 
*/ + cursor->q_proc.node.upd->is_delete = FALSE; + + err = ib_execute_update_query_graph(cursor, pcur); + } + + return(err); +} + +/**********************************************************************//** +Build the update query graph to delete a row from an index. +@return DB_SUCCESS or err code */ +static +ib_err_t +ib_delete_row( +/*==========*/ + ib_cursor_t* cursor, /*!< in: current cursor */ + btr_pcur_t* pcur, /*!< in: Btree persistent cursor */ + const rec_t* rec) /*!< in: record to delete */ +{ + ulint i; + upd_t* upd; + ib_err_t err; + ib_tuple_t* tuple; + ib_tpl_t ib_tpl; + ulint n_cols; + upd_field_t* upd_field; + ib_bool_t page_format; + dict_table_t* table = cursor->prebuilt->table; + dict_index_t* index = dict_table_get_first_index(table); + + n_cols = dict_index_get_n_ordering_defined_by_user(index); + ib_tpl = ib_key_tuple_new(index, n_cols); + + if (!ib_tpl) { + return(DB_OUT_OF_MEMORY); + } + + tuple = (ib_tuple_t*) ib_tpl; + + upd = ib_update_vector_create(cursor); + + page_format = dict_table_is_comp(index->table); + ib_read_tuple(rec, page_format, tuple); + + upd->n_fields = ib_tuple_get_n_cols(ib_tpl); + + for (i = 0; i < upd->n_fields; ++i) { + dfield_t* dfield; + + upd_field = &upd->fields[i]; + dfield = dtuple_get_nth_field(tuple->ptr, i); + + dfield_copy_data(&upd_field->new_val, dfield); + + upd_field->exp = NULL; + + upd_field->orig_len = 0; + + upd->info_bits = 0; + + upd_field->field_no = dict_col_get_clust_pos( + &table->cols[i], index); + } + + /* Note that this is a delete. */ + cursor->q_proc.node.upd->is_delete = TRUE; + + err = ib_execute_update_query_graph(cursor, pcur); + + ib_tuple_delete(ib_tpl); + + return(err); +} + +/*****************************************************************//** +Delete a row in a table. 
+@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_delete_row( +/*=================*/ + ib_crsr_t ib_crsr) /*!< in: InnoDB cursor instance */ +{ + ib_err_t err; + btr_pcur_t* pcur; + dict_index_t* index; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt = cursor->prebuilt; + + index = dict_table_get_first_index(prebuilt->index->table); + + /* Check whether this is a secondary index cursor */ + if (index != prebuilt->index) { + if (prebuilt->need_to_access_clustered) { + pcur = &prebuilt->clust_pcur; + } else { + return(DB_ERROR); + } + } else { + pcur = &prebuilt->pcur; + } + + if (ib_btr_cursor_is_positioned(pcur)) { + const rec_t* rec; + ib_bool_t page_format; + mtr_t mtr; + + page_format = dict_table_is_comp(index->table); + + mtr_start(&mtr); + + if (btr_pcur_restore_position( + BTR_SEARCH_LEAF, pcur, &mtr)) { + + rec = btr_pcur_get_rec(pcur); + } else { + rec = NULL; + } + + mtr_commit(&mtr); + + if (rec && !rec_get_deleted_flag(rec, page_format)) { + err = ib_delete_row(cursor, pcur, rec); + } else { + err = DB_RECORD_NOT_FOUND; + } + } else { + err = DB_RECORD_NOT_FOUND; + } + + return(err); +} + +/*****************************************************************//** +Read current row. +@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_read_row( +/*===============*/ + ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */ + ib_tpl_t ib_tpl) /*!< out: read cols into this tuple */ +{ + ib_err_t err; + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + + ut_a(cursor->prebuilt->trx->state != TRX_STATE_NOT_STARTED); + + /* When searching with IB_EXACT_MATCH set, row_search_for_mysql() + will not position the persistent cursor but will copy the record + found into the row cache. It should be the only entry. 
*/ + if (!ib_cursor_is_positioned(ib_crsr) ) { + err = DB_RECORD_NOT_FOUND; + } else { + mtr_t mtr; + btr_pcur_t* pcur; + row_prebuilt_t* prebuilt = cursor->prebuilt; + + if (prebuilt->need_to_access_clustered + && tuple->type == TPL_TYPE_ROW) { + pcur = &prebuilt->clust_pcur; + } else { + pcur = &prebuilt->pcur; + } + + if (pcur == NULL) { + return(DB_ERROR); + } + + mtr_start(&mtr); + + if (btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, &mtr)) { + const rec_t* rec; + ib_bool_t page_format; + + page_format = dict_table_is_comp(tuple->index->table); + rec = btr_pcur_get_rec(pcur); + + if (prebuilt->innodb_api_rec && + prebuilt->innodb_api_rec != rec) { + rec = prebuilt->innodb_api_rec; + } + + if (!rec_get_deleted_flag(rec, page_format)) { + ib_read_tuple(rec, page_format, tuple); + err = DB_SUCCESS; + } else{ + err = DB_RECORD_NOT_FOUND; + } + + } else { + err = DB_RECORD_NOT_FOUND; + } + + mtr_commit(&mtr); + } + + return(err); +} + +/*****************************************************************//** +Move cursor to the first record in the table. +@return DB_SUCCESS or err code */ +UNIV_INLINE +ib_err_t +ib_cursor_position( +/*===============*/ + ib_cursor_t* cursor, /*!< in: InnoDB cursor instance */ + ib_srch_mode_t mode) /*!< in: Search mode */ +{ + ib_err_t err; + row_prebuilt_t* prebuilt = cursor->prebuilt; + unsigned char* buf; + + buf = static_cast<unsigned char*>(mem_alloc(UNIV_PAGE_SIZE)); + + /* We want to position at one of the ends, row_search_for_mysql() + uses the search_tuple fields to work out what to do. */ + dtuple_set_n_fields(prebuilt->search_tuple, 0); + + err = static_cast<ib_err_t>(row_search_for_mysql( + buf, mode, prebuilt, 0, 0)); + + mem_free(buf); + + return(err); +} + +/*****************************************************************//** +Move cursor to the first record in the table. 
+@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_first( +/*============*/ + ib_crsr_t ib_crsr) /*!< in: InnoDB cursor instance */ +{ + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + + return(ib_cursor_position(cursor, IB_CUR_G)); +} + +/*****************************************************************//** +Move cursor to the last record in the table. +@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_last( +/*===========*/ + ib_crsr_t ib_crsr) /*!< in: InnoDB cursor instance */ +{ + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + + return(ib_cursor_position(cursor, IB_CUR_L)); +} + +/*****************************************************************//** +Move cursor to the next user record in the table. +@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_next( +/*===========*/ + ib_crsr_t ib_crsr) /*!< in: InnoDB cursor instance */ +{ + ib_err_t err; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt = cursor->prebuilt; + byte buf[UNIV_PAGE_SIZE_MAX]; + + /* We want to move to the next record */ + dtuple_set_n_fields(prebuilt->search_tuple, 0); + + err = static_cast<ib_err_t>(row_search_for_mysql( + buf, PAGE_CUR_G, prebuilt, 0, ROW_SEL_NEXT)); + + return(err); +} + +/*****************************************************************//** +Search for key. 
+@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_moveto( +/*=============*/ + ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */ + ib_tpl_t ib_tpl, /*!< in: Key to search for */ + ib_srch_mode_t ib_srch_mode) /*!< in: search mode */ +{ + ulint i; + ulint n_fields; + ib_err_t err = DB_SUCCESS; + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt = cursor->prebuilt; + dtuple_t* search_tuple = prebuilt->search_tuple; + unsigned char* buf; + + ut_a(tuple->type == TPL_TYPE_KEY); + + n_fields = dict_index_get_n_ordering_defined_by_user(prebuilt->index); + + dtuple_set_n_fields(search_tuple, n_fields); + dtuple_set_n_fields_cmp(search_tuple, n_fields); + + /* Do a shallow copy */ + for (i = 0; i < n_fields; ++i) { + dfield_copy(dtuple_get_nth_field(search_tuple, i), + dtuple_get_nth_field(tuple->ptr, i)); + } + + ut_a(prebuilt->select_lock_type <= LOCK_NUM); + + prebuilt->innodb_api_rec = NULL; + + buf = static_cast<unsigned char*>(mem_alloc(UNIV_PAGE_SIZE)); + + err = static_cast<ib_err_t>(row_search_for_mysql( + buf, ib_srch_mode, prebuilt, cursor->match_mode, 0)); + + mem_free(buf); + + return(err); +} + +/*****************************************************************//** +Set the cursor search mode. */ +UNIV_INTERN +void +ib_cursor_set_match_mode( +/*=====================*/ + ib_crsr_t ib_crsr, /*!< in: Cursor instance */ + ib_match_mode_t match_mode) /*!< in: ib_cursor_moveto match mode */ +{ + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + + cursor->match_mode = match_mode; +} + +/*****************************************************************//** +Get the dfield instance for the column in the tuple. +@return dfield instance in tuple */ +UNIV_INLINE +dfield_t* +ib_col_get_dfield( +/*==============*/ + ib_tuple_t* tuple, /*!< in: tuple instance */ + ulint col_no) /*!< in: col no. 
in tuple */ +{ + dfield_t* dfield; + + dfield = dtuple_get_nth_field(tuple->ptr, col_no); + + return(dfield); +} + +/*****************************************************************//** +Predicate to check whether a column type contains variable length data. +@return DB_SUCCESS or error code */ +UNIV_INLINE +ib_err_t +ib_col_is_capped( +/*==============*/ + const dtype_t* dtype) /*!< in: column type */ +{ + return(static_cast<ib_err_t>( + (dtype_get_mtype(dtype) == DATA_VARCHAR + || dtype_get_mtype(dtype) == DATA_CHAR + || dtype_get_mtype(dtype) == DATA_MYSQL + || dtype_get_mtype(dtype) == DATA_VARMYSQL + || dtype_get_mtype(dtype) == DATA_FIXBINARY + || dtype_get_mtype(dtype) == DATA_BINARY) + && dtype_get_len(dtype) > 0)); +} + +/*****************************************************************//** +Set a column of the tuple. Make a copy using the tuple's heap. +@return DB_SUCCESS or error code */ +UNIV_INTERN +ib_err_t +ib_col_set_value( +/*=============*/ + ib_tpl_t ib_tpl, /*!< in: tuple instance */ + ib_ulint_t col_no, /*!< in: column index in tuple */ + const void* src, /*!< in: data value */ + ib_ulint_t len) /*!< in: data value len */ +{ + const dtype_t* dtype; + dfield_t* dfield; + void* dst = NULL; + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + + dfield = ib_col_get_dfield(tuple, col_no); + + /* User wants to set the column to NULL. */ + if (len == IB_SQL_NULL) { + dfield_set_null(dfield); + return(DB_SUCCESS); + } + + dtype = dfield_get_type(dfield); + + /* Not allowed to update system columns. */ + if (dtype_get_mtype(dtype) == DATA_SYS) { + return(DB_DATA_MISMATCH); + } + + dst = dfield_get_data(dfield); + + /* Since TEXT/CLOB also map to DATA_VARCHAR we need to make an + exception. Perhaps we need to set the precise type and check + for that. 
*/ + if (ib_col_is_capped(dtype)) { + + len = ut_min(len, dtype_get_len(dtype)); + + if (dst == NULL || len > dfield_get_len(dfield)) { + dst = mem_heap_alloc(tuple->heap, dtype_get_len(dtype)); + ut_a(dst != NULL); + } + } else if (dst == NULL || len > dfield_get_len(dfield)) { + dst = mem_heap_alloc(tuple->heap, len); + } + + if (dst == NULL) { + return(DB_OUT_OF_MEMORY); + } + + switch (dtype_get_mtype(dtype)) { + case DATA_INT: { + + if (dtype_get_len(dtype) == len) { + ibool usign; + + usign = dtype_get_prtype(dtype) & DATA_UNSIGNED; + mach_write_int_type(static_cast<byte*>(dst), + static_cast<const byte*>(src), + len, usign); + + } else { + return(DB_DATA_MISMATCH); + } + break; + } + + case DATA_FLOAT: + if (len == sizeof(float)) { + mach_float_write(static_cast<byte*>(dst), *(float*)src); + } else { + return(DB_DATA_MISMATCH); + } + break; + + case DATA_DOUBLE: + if (len == sizeof(double)) { + mach_double_write(static_cast<byte*>(dst), + *(double*)src); + } else { + return(DB_DATA_MISMATCH); + } + break; + + case DATA_SYS: + ut_error; + break; + + case DATA_CHAR: { + ulint pad_char = ULINT_UNDEFINED; + + pad_char = dtype_get_pad_char( + dtype_get_mtype(dtype), dtype_get_prtype(dtype)); + + ut_a(pad_char != ULINT_UNDEFINED); + + memset((byte*) dst + len, + pad_char, + dtype_get_len(dtype) - len); + + memcpy(dst, src, len); + + len = dtype_get_len(dtype); + break; + } + case DATA_BLOB: + case DATA_BINARY: + case DATA_MYSQL: + case DATA_DECIMAL: + case DATA_VARCHAR: + case DATA_VARMYSQL: + case DATA_FIXBINARY: + memcpy(dst, src, len); + break; + + default: + ut_error; + } + + if (dst != dfield_get_data(dfield)) { + dfield_set_data(dfield, dst, len); + } else { + dfield_set_len(dfield, len); + } + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Get the size of the data available in a column of the tuple. 
+@return bytes avail or IB_SQL_NULL */ +UNIV_INTERN +ib_ulint_t +ib_col_get_len( +/*===========*/ + ib_tpl_t ib_tpl, /*!< in: tuple instance */ + ib_ulint_t i) /*!< in: column index in tuple */ +{ + const dfield_t* dfield; + ulint data_len; + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + + dfield = ib_col_get_dfield(tuple, i); + + data_len = dfield_get_len(dfield); + + return(data_len == UNIV_SQL_NULL ? IB_SQL_NULL : data_len); +} + +/*****************************************************************//** +Copy a column value from the tuple. +@return bytes copied or IB_SQL_NULL */ +UNIV_INLINE +ib_ulint_t +ib_col_copy_value_low( +/*==================*/ + ib_tpl_t ib_tpl, /*!< in: tuple instance */ + ib_ulint_t i, /*!< in: column index in tuple */ + void* dst, /*!< out: copied data value */ + ib_ulint_t len) /*!< in: max data value len to copy */ +{ + const void* data; + const dfield_t* dfield; + ulint data_len; + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + + dfield = ib_col_get_dfield(tuple, i); + + data = dfield_get_data(dfield); + data_len = dfield_get_len(dfield); + + if (data_len != UNIV_SQL_NULL) { + + const dtype_t* dtype = dfield_get_type(dfield); + + switch (dtype_get_mtype(dfield_get_type(dfield))) { + case DATA_INT: { + ibool usign; + ullint ret; + + ut_a(data_len == len); + + usign = dtype_get_prtype(dtype) & DATA_UNSIGNED; + ret = mach_read_int_type(static_cast<const byte*>(data), + data_len, usign); + + if (usign) { + if (len == 2) { + *(ib_i16_t*)dst = (ib_i16_t)ret; + } else if (len == 4) { + *(ib_i32_t*)dst = (ib_i32_t)ret; + } else { + *(ib_i64_t*)dst = (ib_i64_t)ret; + } + } else { + if (len == 2) { + *(ib_u16_t*)dst = (ib_i16_t)ret; + } else if (len == 4) { + *(ib_u32_t*)dst = (ib_i32_t)ret; + } else { + *(ib_u64_t*)dst = (ib_i64_t)ret; + } + } + + break; + } + case DATA_FLOAT: + if (len == data_len) { + float f; + + ut_a(data_len == sizeof(f)); + f = mach_float_read(static_cast<const byte*>( + data)); + memcpy(dst, &f, sizeof(f)); + } else { + 
data_len = 0; + } + break; + case DATA_DOUBLE: + if (len == data_len) { + double d; + + ut_a(data_len == sizeof(d)); + d = mach_double_read(static_cast<const byte*>( + data)); + memcpy(dst, &d, sizeof(d)); + } else { + data_len = 0; + } + break; + default: + data_len = ut_min(data_len, len); + memcpy(dst, data, data_len); + } + } else { + data_len = IB_SQL_NULL; + } + + return(data_len); +} + +/*****************************************************************//** +Copy a column value from the tuple. +@return bytes copied or IB_SQL_NULL */ +UNIV_INTERN +ib_ulint_t +ib_col_copy_value( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: tuple instance */ + ib_ulint_t i, /*!< in: column index in tuple */ + void* dst, /*!< out: copied data value */ + ib_ulint_t len) /*!< in: max data value len to copy */ +{ + return(ib_col_copy_value_low(ib_tpl, i, dst, len)); +} + +/*****************************************************************//** +Get the InnoDB column attribute from the internal column precise type. +@return precise type in api format */ +UNIV_INLINE +ib_col_attr_t +ib_col_get_attr( +/*============*/ + ulint prtype) /*!< in: column definition */ +{ + ib_col_attr_t attr = IB_COL_NONE; + + if (prtype & DATA_UNSIGNED) { + attr = static_cast<ib_col_attr_t>(attr | IB_COL_UNSIGNED); + } + + if (prtype & DATA_NOT_NULL) { + attr = static_cast<ib_col_attr_t>(attr | IB_COL_NOT_NULL); + } + + return(attr); +} + +/*****************************************************************//** +Get a column name from the tuple. 
+@return name of the column */ +UNIV_INTERN +const char* +ib_col_get_name( +/*============*/ + ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */ + ib_ulint_t i) /*!< in: column index in tuple */ +{ + const char* name; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + dict_table_t* table = cursor->prebuilt->table; + dict_col_t* col = dict_table_get_nth_col(table, i); + ulint col_no = dict_col_get_no(col); + + name = dict_table_get_col_name(table, col_no); + + return(name); +} + +/*****************************************************************//** +Get an index field name from the cursor. +@return name of the field */ +UNIV_INTERN +const char* +ib_get_idx_field_name( +/*==================*/ + ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */ + ib_ulint_t i) /*!< in: column index in tuple */ +{ + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + dict_index_t* index = cursor->prebuilt->index; + dict_field_t* field; + + if (index) { + field = dict_index_get_nth_field(cursor->prebuilt->index, i); + + if (field) { + return(field->name); + } + } + + return(NULL); +} + +/*****************************************************************//** +Get a column type, length and attributes from the tuple. +@return len of column data */ +UNIV_INLINE +ib_ulint_t +ib_col_get_meta_low( +/*================*/ + ib_tpl_t ib_tpl, /*!< in: tuple instance */ + ib_ulint_t i, /*!< in: column index in tuple */ + ib_col_meta_t* ib_col_meta) /*!< out: column meta data */ +{ + ib_u16_t prtype; + const dfield_t* dfield; + ulint data_len; + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + + dfield = ib_col_get_dfield(tuple, i); + + data_len = dfield_get_len(dfield); + + /* We assume 1-1 mapping between the ENUM and internal type codes. 
*/ + ib_col_meta->type = static_cast<ib_col_type_t>( + dtype_get_mtype(dfield_get_type(dfield))); + + ib_col_meta->type_len = dtype_get_len(dfield_get_type(dfield)); + + prtype = (ib_u16_t) dtype_get_prtype(dfield_get_type(dfield)); + + ib_col_meta->attr = ib_col_get_attr(prtype); + ib_col_meta->client_type = prtype & DATA_MYSQL_TYPE_MASK; + + return(data_len); +} + +/*************************************************************//** +Read a signed int 8 bit column from an InnoDB tuple. */ +UNIV_INLINE +ib_err_t +ib_tuple_check_int( +/*===============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_bool_t usign, /*!< in: true if unsigned */ + ulint size) /*!< in: size of integer */ +{ + ib_col_meta_t ib_col_meta; + + ib_col_get_meta_low(ib_tpl, i, &ib_col_meta); + + if (ib_col_meta.type != IB_INT) { + return(DB_DATA_MISMATCH); + } else if (ib_col_meta.type_len == IB_SQL_NULL) { + return(DB_UNDERFLOW); + } else if (ib_col_meta.type_len != size) { + return(DB_DATA_MISMATCH); + } else if ((ib_col_meta.attr & IB_COL_UNSIGNED) && !usign) { + return(DB_DATA_MISMATCH); + } + + return(DB_SUCCESS); +} + +/*************************************************************//** +Read a signed int 8 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_read_i8( +/*=============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_i8_t* ival) /*!< out: integer value */ +{ + ib_err_t err; + + err = ib_tuple_check_int(ib_tpl, i, IB_FALSE, sizeof(*ival)); + + if (err == DB_SUCCESS) { + ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival)); + } + + return(err); +} + +/*************************************************************//** +Read an unsigned int 8 bit column from an InnoDB tuple. 
+@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_read_u8( +/*=============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_u8_t* ival) /*!< out: integer value */ +{ + ib_err_t err; + + err = ib_tuple_check_int(ib_tpl, i, IB_TRUE, sizeof(*ival)); + + if (err == DB_SUCCESS) { + ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival)); + } + + return(err); +} + +/*************************************************************//** +Read a signed int 16 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_read_i16( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_i16_t* ival) /*!< out: integer value */ +{ + ib_err_t err; + + err = ib_tuple_check_int(ib_tpl, i, FALSE, sizeof(*ival)); + + if (err == DB_SUCCESS) { + ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival)); + } + + return(err); +} + +/*************************************************************//** +Read an unsigned int 16 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_read_u16( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_u16_t* ival) /*!< out: integer value */ +{ + ib_err_t err; + + err = ib_tuple_check_int(ib_tpl, i, IB_TRUE, sizeof(*ival)); + + if (err == DB_SUCCESS) { + ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival)); + } + + return(err); +} + +/*************************************************************//** +Read a signed int 32 bit column from an InnoDB tuple. 
+@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_read_i32( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_i32_t* ival) /*!< out: integer value */ +{ + ib_err_t err; + + err = ib_tuple_check_int(ib_tpl, i, FALSE, sizeof(*ival)); + + if (err == DB_SUCCESS) { + ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival)); + } + + return(err); +} + +/*************************************************************//** +Read an unsigned int 32 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_read_u32( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_u32_t* ival) /*!< out: integer value */ +{ + ib_err_t err; + + err = ib_tuple_check_int(ib_tpl, i, IB_TRUE, sizeof(*ival)); + + if (err == DB_SUCCESS) { + ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival)); + } + + return(err); +} + +/*************************************************************//** +Read a signed int 64 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_read_i64( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_i64_t* ival) /*!< out: integer value */ +{ + ib_err_t err; + + err = ib_tuple_check_int(ib_tpl, i, FALSE, sizeof(*ival)); + + if (err == DB_SUCCESS) { + ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival)); + } + + return(err); +} + +/*************************************************************//** +Read an unsigned int 64 bit column from an InnoDB tuple. 
+@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_read_u64( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_u64_t* ival) /*!< out: integer value */ +{ + ib_err_t err; + + err = ib_tuple_check_int(ib_tpl, i, IB_TRUE, sizeof(*ival)); + + if (err == DB_SUCCESS) { + ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival)); + } + + return(err); +} + +/*****************************************************************//** +Get a column value pointer from the tuple. +@return NULL or pointer to buffer */ +UNIV_INTERN +const void* +ib_col_get_value( +/*=============*/ + ib_tpl_t ib_tpl, /*!< in: tuple instance */ + ib_ulint_t i) /*!< in: column index in tuple */ +{ + const void* data; + const dfield_t* dfield; + ulint data_len; + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + + dfield = ib_col_get_dfield(tuple, i); + + data = dfield_get_data(dfield); + data_len = dfield_get_len(dfield); + + return(data_len != UNIV_SQL_NULL ? data : NULL); +} + +/*****************************************************************//** +Get a column type, length and attributes from the tuple. +@return len of column data */ +UNIV_INTERN +ib_ulint_t +ib_col_get_meta( +/*============*/ + ib_tpl_t ib_tpl, /*!< in: tuple instance */ + ib_ulint_t i, /*!< in: column index in tuple */ + ib_col_meta_t* ib_col_meta) /*!< out: column meta data */ +{ + return(ib_col_get_meta_low(ib_tpl, i, ib_col_meta)); +} + +/*****************************************************************//** +"Clear" or reset an InnoDB tuple. We free the heap and recreate the tuple. 
+@return new tuple, or NULL */ +UNIV_INTERN +ib_tpl_t +ib_tuple_clear( +/*============*/ + ib_tpl_t ib_tpl) /*!< in,own: tuple (will be freed) */ +{ + const dict_index_t* index; + ulint n_cols; + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + ib_tuple_type_t type = tuple->type; + mem_heap_t* heap = tuple->heap; + + index = tuple->index; + n_cols = dtuple_get_n_fields(tuple->ptr); + + mem_heap_empty(heap); + + if (type == TPL_TYPE_ROW) { + return(ib_row_tuple_new_low(index, n_cols, heap)); + } else { + return(ib_key_tuple_new_low(index, n_cols, heap)); + } +} + +/*****************************************************************//** +Create a new cluster key search tuple and copy the contents of the +secondary index key tuple columns that refer to the cluster index record +to the cluster key. It does a deep copy of the column data. +@return DB_SUCCESS or error code */ +UNIV_INTERN +ib_err_t +ib_tuple_get_cluster_key( +/*=====================*/ + ib_crsr_t ib_crsr, /*!< in: secondary index cursor */ + ib_tpl_t* ib_dst_tpl, /*!< out,own: destination tuple */ + const ib_tpl_t ib_src_tpl) /*!< in: source tuple */ +{ + ulint i; + ulint n_fields; + ib_err_t err = DB_SUCCESS; + ib_tuple_t* dst_tuple = NULL; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + ib_tuple_t* src_tuple = (ib_tuple_t*) ib_src_tpl; + dict_index_t* clust_index; + + clust_index = dict_table_get_first_index(cursor->prebuilt->table); + + /* We need to ensure that the src tuple belongs to the same table + as the open cursor and that it's not a tuple for a cluster index. */ + if (src_tuple->type != TPL_TYPE_KEY) { + return(DB_ERROR); + } else if (src_tuple->index->table != cursor->prebuilt->table) { + return(DB_DATA_MISMATCH); + } else if (src_tuple->index == clust_index) { + return(DB_ERROR); + } + + /* Create the cluster index key search tuple. 
*/ + *ib_dst_tpl = ib_clust_search_tuple_create(ib_crsr); + + if (!*ib_dst_tpl) { + return(DB_OUT_OF_MEMORY); + } + + dst_tuple = (ib_tuple_t*) *ib_dst_tpl; + ut_a(dst_tuple->index == clust_index); + + n_fields = dict_index_get_n_unique(dst_tuple->index); + + /* Do a deep copy of the data fields. */ + for (i = 0; i < n_fields; i++) { + ulint pos; + dfield_t* src_field; + dfield_t* dst_field; + + pos = dict_index_get_nth_field_pos( + src_tuple->index, dst_tuple->index, i); + + ut_a(pos != ULINT_UNDEFINED); + + src_field = dtuple_get_nth_field(src_tuple->ptr, pos); + dst_field = dtuple_get_nth_field(dst_tuple->ptr, i); + + if (!dfield_is_null(src_field)) { + UNIV_MEM_ASSERT_RW(src_field->data, src_field->len); + + dst_field->data = mem_heap_dup( + dst_tuple->heap, + src_field->data, + src_field->len); + + dst_field->len = src_field->len; + } else { + dfield_set_null(dst_field); + } + } + + return(err); +} + +/*****************************************************************//** +Copy the contents of source tuple to destination tuple. The tuples +must be of the same type and belong to the same table/index. +@return DB_SUCCESS or error code */ +UNIV_INTERN +ib_err_t +ib_tuple_copy( +/*==========*/ + ib_tpl_t ib_dst_tpl, /*!< in: destination tuple */ + const ib_tpl_t ib_src_tpl) /*!< in: source tuple */ +{ + ulint i; + ulint n_fields; + ib_err_t err = DB_SUCCESS; + const ib_tuple_t*src_tuple = (const ib_tuple_t*) ib_src_tpl; + ib_tuple_t* dst_tuple = (ib_tuple_t*) ib_dst_tpl; + + /* Make sure src and dst are not the same. */ + ut_a(src_tuple != dst_tuple); + + /* Make sure they are the same type and refer to the same index. */ + if (src_tuple->type != dst_tuple->type + || src_tuple->index != dst_tuple->index) { + + return(DB_DATA_MISMATCH); + } + + n_fields = dtuple_get_n_fields(src_tuple->ptr); + ut_ad(n_fields == dtuple_get_n_fields(dst_tuple->ptr)); + + /* Do a deep copy of the data fields. 
*/ + for (i = 0; i < n_fields; ++i) { + dfield_t* src_field; + dfield_t* dst_field; + + src_field = dtuple_get_nth_field(src_tuple->ptr, i); + dst_field = dtuple_get_nth_field(dst_tuple->ptr, i); + + if (!dfield_is_null(src_field)) { + UNIV_MEM_ASSERT_RW(src_field->data, src_field->len); + + dst_field->data = mem_heap_dup( + dst_tuple->heap, + src_field->data, + src_field->len); + + dst_field->len = src_field->len; + } else { + dfield_set_null(dst_field); + } + } + + return(err); +} + +/*****************************************************************//** +Create an InnoDB tuple used for index/table search. +@return own: Tuple for current index */ +UNIV_INTERN +ib_tpl_t +ib_sec_search_tuple_create( +/*=======================*/ + ib_crsr_t ib_crsr) /*!< in: Cursor instance */ +{ + ulint n_cols; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + dict_index_t* index = cursor->prebuilt->index; + + n_cols = dict_index_get_n_unique_in_tree(index); + return(ib_key_tuple_new(index, n_cols)); +} + +/*****************************************************************//** +Create an InnoDB tuple used for index/table search. +@return own: Tuple for current index */ +UNIV_INTERN +ib_tpl_t +ib_sec_read_tuple_create( +/*=====================*/ + ib_crsr_t ib_crsr) /*!< in: Cursor instance */ +{ + ulint n_cols; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + dict_index_t* index = cursor->prebuilt->index; + + n_cols = dict_index_get_n_fields(index); + return(ib_row_tuple_new(index, n_cols)); +} + +/*****************************************************************//** +Create an InnoDB tuple used for table key operations. 
+@return own: Tuple for current table */ +UNIV_INTERN +ib_tpl_t +ib_clust_search_tuple_create( +/*=========================*/ + ib_crsr_t ib_crsr) /*!< in: Cursor instance */ +{ + ulint n_cols; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + dict_index_t* index; + + index = dict_table_get_first_index(cursor->prebuilt->table); + + n_cols = dict_index_get_n_ordering_defined_by_user(index); + return(ib_key_tuple_new(index, n_cols)); +} + +/*****************************************************************//** +Create an InnoDB tuple for table row operations. +@return own: Tuple for current table */ +UNIV_INTERN +ib_tpl_t +ib_clust_read_tuple_create( +/*=======================*/ + ib_crsr_t ib_crsr) /*!< in: Cursor instance */ +{ + ulint n_cols; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + dict_index_t* index; + + index = dict_table_get_first_index(cursor->prebuilt->table); + + n_cols = dict_table_get_n_cols(cursor->prebuilt->table); + return(ib_row_tuple_new(index, n_cols)); +} + +/*****************************************************************//** +Return the number of user columns in the tuple definition. +@return number of user columns */ +UNIV_INTERN +ib_ulint_t +ib_tuple_get_n_user_cols( +/*=====================*/ + const ib_tpl_t ib_tpl) /*!< in: Tuple for current table */ +{ + const ib_tuple_t* tuple = (const ib_tuple_t*) ib_tpl; + + if (tuple->type == TPL_TYPE_ROW) { + return(dict_table_get_n_user_cols(tuple->index->table)); + } + + return(dict_index_get_n_ordering_defined_by_user(tuple->index)); +} + +/*****************************************************************//** +Return the number of columns in the tuple definition. 
+@return number of columns */ +UNIV_INTERN +ib_ulint_t +ib_tuple_get_n_cols( +/*================*/ + const ib_tpl_t ib_tpl) /*!< in: Tuple for table/index */ +{ + const ib_tuple_t* tuple = (const ib_tuple_t*) ib_tpl; + + return(dtuple_get_n_fields(tuple->ptr)); +} + +/*****************************************************************//** +Destroy an InnoDB tuple. */ +UNIV_INTERN +void +ib_tuple_delete( +/*============*/ + ib_tpl_t ib_tpl) /*!< in,own: Tuple instance to delete */ +{ + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + + if (!ib_tpl) { + return; + } + + mem_heap_free(tuple->heap); +} + +/*****************************************************************//** +Get a table id. This function will acquire the dictionary mutex. +@return DB_SUCCESS if found */ +UNIV_INTERN +ib_err_t +ib_table_get_id( +/*============*/ + const char* table_name, /*!< in: table to find */ + ib_id_u64_t* table_id) /*!< out: table id if found */ +{ + ib_err_t err; + + dict_mutex_enter_for_mysql(); + + err = ib_table_get_id_low(table_name, table_id); + + dict_mutex_exit_for_mysql(); + + return(err); +} + +/*****************************************************************//** +Get an index id. +@return DB_SUCCESS if found */ +UNIV_INTERN +ib_err_t +ib_index_get_id( +/*============*/ + const char* table_name, /*!< in: find index for this table */ + const char* index_name, /*!< in: index to find */ + ib_id_u64_t* index_id) /*!< out: index id if found */ +{ + dict_table_t* table; + char* normalized_name; + ib_err_t err = DB_TABLE_NOT_FOUND; + + *index_id = 0; + + normalized_name = static_cast<char*>( + mem_alloc(ut_strlen(table_name) + 1)); + ib_normalize_table_name(normalized_name, table_name); + + table = ib_lookup_table_by_name(normalized_name); + + mem_free(normalized_name); + normalized_name = NULL; + + if (table != NULL) { + dict_index_t* index; + + index = dict_table_get_index_on_name(table, index_name); + + if (index != NULL) { + /* We only support 32 bit table and index ids. 
Because + we need to pack the table id into the index id. */ + + *index_id = (table->id); + *index_id <<= 32; + *index_id |= (index->id); + + err = DB_SUCCESS; + } + } + + return(err); +} + +#ifdef __WIN__ +#define SRV_PATH_SEPARATOR '\\' +#else +#define SRV_PATH_SEPARATOR '/' +#endif + + +/*****************************************************************//** +Check if cursor is positioned. +@return IB_TRUE if positioned */ +UNIV_INTERN +ib_bool_t +ib_cursor_is_positioned( +/*====================*/ + const ib_crsr_t ib_crsr) /*!< in: InnoDB cursor instance */ +{ + const ib_cursor_t* cursor = (const ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt = cursor->prebuilt; + + return(ib_btr_cursor_is_positioned(&prebuilt->pcur)); +} + + +/*****************************************************************//** +Checks if the data dictionary is latched in exclusive mode. +@return TRUE if exclusive latch */ +UNIV_INTERN +ib_bool_t +ib_schema_lock_is_exclusive( +/*========================*/ + const ib_trx_t ib_trx) /*!< in: transaction */ +{ + const trx_t* trx = (const trx_t*) ib_trx; + + return(trx->dict_operation_lock_mode == RW_X_LATCH); +} + +/*****************************************************************//** +Checks if the data dictionary is latched in shared mode. +@return TRUE if shared latch */ +UNIV_INTERN +ib_bool_t +ib_schema_lock_is_shared( +/*=====================*/ + const ib_trx_t ib_trx) /*!< in: transaction */ +{ + const trx_t* trx = (const trx_t*) ib_trx; + + return(trx->dict_operation_lock_mode == RW_S_LATCH); +} + +/*****************************************************************//** +Set the Lock an InnoDB cursor/table. 
+@return DB_SUCCESS or error code */ +UNIV_INTERN +ib_err_t +ib_cursor_lock( +/*===========*/ + ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */ + ib_lck_mode_t ib_lck_mode) /*!< in: InnoDB lock mode */ +{ + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt = cursor->prebuilt; + trx_t* trx = prebuilt->trx; + dict_table_t* table = prebuilt->table; + + return(ib_trx_lock_table_with_retry( + trx, table, (enum lock_mode) ib_lck_mode)); +} + +/*****************************************************************//** +Set the Lock an InnoDB table using the table id. +@return DB_SUCCESS or error code */ +UNIV_INTERN +ib_err_t +ib_table_lock( +/*==========*/ + ib_trx_t ib_trx, /*!< in/out: transaction */ + ib_id_u64_t table_id, /*!< in: table id */ + ib_lck_mode_t ib_lck_mode) /*!< in: InnoDB lock mode */ +{ + ib_err_t err; + que_thr_t* thr; + mem_heap_t* heap; + dict_table_t* table; + ib_qry_proc_t q_proc; + trx_t* trx = (trx_t*) ib_trx; + + ut_a(trx->state != TRX_STATE_NOT_STARTED); + + table = ib_open_table_by_id(table_id, FALSE); + + if (table == NULL) { + return(DB_TABLE_NOT_FOUND); + } + + ut_a(ib_lck_mode <= static_cast<ib_lck_mode_t>(LOCK_NUM)); + + heap = mem_heap_create(128); + + q_proc.node.sel = sel_node_create(heap); + + thr = pars_complete_graph_for_exec(q_proc.node.sel, trx, heap); + + q_proc.grph.sel = static_cast<que_fork_t*>(que_node_get_parent(thr)); + q_proc.grph.sel->state = QUE_FORK_ACTIVE; + + trx->op_info = "setting table lock"; + + ut_a(ib_lck_mode == IB_LOCK_IS || ib_lck_mode == IB_LOCK_IX); + err = static_cast<ib_err_t>( + lock_table(0, table, (enum lock_mode) ib_lck_mode, thr)); + + trx->error_state = err; + + mem_heap_free(heap); + + return(err); +} + +/*****************************************************************//** +Unlock an InnoDB table. 
+@return DB_SUCCESS or error code */ +UNIV_INTERN +ib_err_t +ib_cursor_unlock( +/*=============*/ + ib_crsr_t ib_crsr) /*!< in/out: InnoDB cursor */ +{ + ib_err_t err = DB_SUCCESS; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt = cursor->prebuilt; + + if (prebuilt->trx->mysql_n_tables_locked > 0) { + --prebuilt->trx->mysql_n_tables_locked; + } else { + err = DB_ERROR; + } + + return(err); +} + +/*****************************************************************//** +Set the Lock mode of the cursor. +@return DB_SUCCESS or error code */ +UNIV_INTERN +ib_err_t +ib_cursor_set_lock_mode( +/*====================*/ + ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */ + ib_lck_mode_t ib_lck_mode) /*!< in: InnoDB lock mode */ +{ + ib_err_t err = DB_SUCCESS; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt = cursor->prebuilt; + + ut_a(ib_lck_mode <= static_cast<ib_lck_mode_t>(LOCK_NUM)); + + if (ib_lck_mode == IB_LOCK_X) { + err = ib_cursor_lock(ib_crsr, IB_LOCK_IX); + } else if (ib_lck_mode == IB_LOCK_S) { + err = ib_cursor_lock(ib_crsr, IB_LOCK_IS); + } + + if (err == DB_SUCCESS) { + prebuilt->select_lock_type = (enum lock_mode) ib_lck_mode; + ut_a(prebuilt->trx->state != TRX_STATE_NOT_STARTED); + } + + return(err); +} + +/*****************************************************************//** +Set need to access clustered index record. */ +UNIV_INTERN +void +ib_cursor_set_cluster_access( +/*=========================*/ + ib_crsr_t ib_crsr) /*!< in/out: InnoDB cursor */ +{ + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt = cursor->prebuilt; + + prebuilt->need_to_access_clustered = TRUE; +} + +/*************************************************************//** +Convert and write an INT column value to an InnoDB tuple. 
+@return DB_SUCCESS or error */ +UNIV_INLINE +ib_err_t +ib_tuple_write_int( +/*===============*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + ulint col_no, /*!< in: column number */ + const void* value, /*!< in: integer value */ + ulint value_len) /*!< in: sizeof value type */ +{ + const dfield_t* dfield; + ulint data_len; + ulint type_len; + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + + ut_a(col_no < ib_tuple_get_n_cols(ib_tpl)); + + dfield = ib_col_get_dfield(tuple, col_no); + + data_len = dfield_get_len(dfield); + type_len = dtype_get_len(dfield_get_type(dfield)); + + if (dtype_get_mtype(dfield_get_type(dfield)) != DATA_INT + || value_len != data_len) { + + return(DB_DATA_MISMATCH); + } + + return(ib_col_set_value(ib_tpl, col_no, value, type_len)); +} + +/*****************************************************************//** +Write an integer value to a column. Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_write_i8( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_i8_t val) /*!< in: value to write */ +{ + return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val))); +} + +/*****************************************************************//** +Write an integer value to a column. Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_write_i16( +/*===============*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_i16_t val) /*!< in: value to write */ +{ + return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val))); +} + +/*****************************************************************//** +Write an integer value to a column. Integers are stored in big-endian +format and will need to be converted from the host format. 
+@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_write_i32( +/*===============*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_i32_t val) /*!< in: value to write */ +{ + return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val))); +} + +/*****************************************************************//** +Write an integer value to a column. Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_write_i64( +/*===============*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_i64_t val) /*!< in: value to write */ +{ + return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val))); +} + +/*****************************************************************//** +Write an integer value to a column. Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_write_u8( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_u8_t val) /*!< in: value to write */ +{ + return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val))); +} + +/*****************************************************************//** +Write an integer value to a column. Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_write_u16( +/*===============*/ + ib_tpl_t ib_tpl, /*!< in/out: tupe to write to */ + int col_no, /*!< in: column number */ + ib_u16_t val) /*!< in: value to write */ +{ + return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val))); +} + +/*****************************************************************//** +Write an integer value to a column. 
Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_write_u32( +/*===============*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_u32_t val) /*!< in: value to write */ +{ + return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val))); +} + +/*****************************************************************//** +Write an integer value to a column. Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_write_u64( +/*===============*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_u64_t val) /*!< in: value to write */ +{ + return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val))); +} + +/*****************************************************************//** +Inform the cursor that it's the start of an SQL statement. */ +UNIV_INTERN +void +ib_cursor_stmt_begin( +/*=================*/ + ib_crsr_t ib_crsr) /*!< in: cursor */ +{ + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + + cursor->prebuilt->sql_stat_start = TRUE; +} + +/*****************************************************************//** +Write a double value to a column. 
+@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_write_double( +/*==================*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + double val) /*!< in: value to write */ +{ + const dfield_t* dfield; + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + + dfield = ib_col_get_dfield(tuple, col_no); + + if (dtype_get_mtype(dfield_get_type(dfield)) == DATA_DOUBLE) { + return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val))); + } else { + return(DB_DATA_MISMATCH); + } +} + +/*************************************************************//** +Read a double column value from an InnoDB tuple. +@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_read_double( +/*=================*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t col_no, /*!< in: column number */ + double* dval) /*!< out: double value */ +{ + ib_err_t err; + const dfield_t* dfield; + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + + dfield = ib_col_get_dfield(tuple, col_no); + + if (dtype_get_mtype(dfield_get_type(dfield)) == DATA_DOUBLE) { + ib_col_copy_value_low(ib_tpl, col_no, dval, sizeof(*dval)); + err = DB_SUCCESS; + } else { + err = DB_DATA_MISMATCH; + } + + return(err); +} + +/*****************************************************************//** +Write a float value to a column. +@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_write_float( +/*=================*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + float val) /*!< in: value to write */ +{ + const dfield_t* dfield; + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + + dfield = ib_col_get_dfield(tuple, col_no); + + if (dtype_get_mtype(dfield_get_type(dfield)) == DATA_FLOAT) { + return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val))); + } else { + return(DB_DATA_MISMATCH); + } +} + +/*************************************************************//** +Read a float value from an InnoDB tuple. 
+@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_read_float( +/*================*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t col_no, /*!< in: column number */ + float* fval) /*!< out: float value */ +{ + ib_err_t err; + const dfield_t* dfield; + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + + dfield = ib_col_get_dfield(tuple, col_no); + + if (dtype_get_mtype(dfield_get_type(dfield)) == DATA_FLOAT) { + ib_col_copy_value_low(ib_tpl, col_no, fval, sizeof(*fval)); + err = DB_SUCCESS; + } else { + err = DB_DATA_MISMATCH; + } + + return(err); +} + +/*****************************************************************//** +Truncate a table. The cursor handle will be closed and set to NULL +on success. +@return DB_SUCCESS or error code */ +UNIV_INTERN +ib_err_t +ib_cursor_truncate( +/*===============*/ + ib_crsr_t* ib_crsr, /*!< in/out: cursor for table + to truncate */ + ib_id_u64_t* table_id) /*!< out: new table id */ +{ + ib_err_t err; + ib_cursor_t* cursor = *(ib_cursor_t**) ib_crsr; + row_prebuilt_t* prebuilt = cursor->prebuilt; + + *table_id = 0; + + err = ib_cursor_lock(*ib_crsr, IB_LOCK_X); + + if (err == DB_SUCCESS) { + trx_t* trx; + dict_table_t* table = prebuilt->table; + + /* We are going to free the cursor and the prebuilt. Store + the transaction handle locally. */ + trx = prebuilt->trx; + err = ib_cursor_close(*ib_crsr); + ut_a(err == DB_SUCCESS); + + *ib_crsr = NULL; + + /* A temp go around for assertion in trx_start_for_ddl_low + we already start the trx */ + if (trx->state == TRX_STATE_ACTIVE) { +#ifdef UNIV_DEBUG + trx->start_file = 0; +#endif /* UNIV_DEBUG */ + trx->dict_operation = TRX_DICT_OP_TABLE; + } + + /* This function currently commits the transaction + on success. */ + err = static_cast<ib_err_t>( + row_truncate_table_for_mysql(table, trx)); + + if (err == DB_SUCCESS) { + *table_id = (table->id); + } + } + + return(err); +} + +/*****************************************************************//** +Truncate a table. 
+@return DB_SUCCESS or error code */ +UNIV_INTERN +ib_err_t +ib_table_truncate( +/*==============*/ + const char* table_name, /*!< in: table name */ + ib_id_u64_t* table_id) /*!< out: new table id */ +{ + ib_err_t err; + dict_table_t* table; + ib_err_t trunc_err; + ib_trx_t ib_trx = NULL; + ib_crsr_t ib_crsr = NULL; + + ib_trx = ib_trx_begin(IB_TRX_SERIALIZABLE); + + dict_mutex_enter_for_mysql(); + + table = dict_table_open_on_name(table_name, TRUE, FALSE, + DICT_ERR_IGNORE_NONE); + + if (table != NULL && dict_table_get_first_index(table)) { + err = ib_create_cursor_with_index_id(&ib_crsr, table, 0, + (trx_t*) ib_trx); + } else { + err = DB_TABLE_NOT_FOUND; + } + + dict_mutex_exit_for_mysql(); + + if (err == DB_SUCCESS) { + trunc_err = ib_cursor_truncate(&ib_crsr, table_id); + ut_a(err == DB_SUCCESS); + } else { + trunc_err = err; + } + + if (ib_crsr != NULL) { + err = ib_cursor_close(ib_crsr); + ut_a(err == DB_SUCCESS); + } + + if (trunc_err == DB_SUCCESS) { + ut_a(ib_trx_state(ib_trx) == static_cast<ib_trx_state_t>( + TRX_STATE_NOT_STARTED)); + + err = ib_trx_release(ib_trx); + ut_a(err == DB_SUCCESS); + } else { + err = ib_trx_rollback(ib_trx); + ut_a(err == DB_SUCCESS); + } + + return(trunc_err); +} + +/*****************************************************************//** +Frees a possible InnoDB trx object associated with the current THD. 
+@return 0 or error number */ +UNIV_INTERN +ib_err_t +ib_close_thd( +/*=========*/ + void* thd) /*!< in: handle to the MySQL thread of the user + whose resources should be free'd */ +{ + innobase_close_thd(static_cast<THD*>(thd)); + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Return isolation configuration set by "innodb_api_trx_level" +@return trx isolation level*/ +UNIV_INTERN +ib_trx_state_t +ib_cfg_trx_level() +/*==============*/ +{ + return(static_cast<ib_trx_state_t>(ib_trx_level_setting)); +} + +/*****************************************************************//** +Return configure value for background commit interval (in seconds) +@return background commit interval (in seconds) */ +UNIV_INTERN +ib_ulint_t +ib_cfg_bk_commit_interval() +/*=======================*/ +{ + return(static_cast<ib_ulint_t>(ib_bk_commit_interval)); +} + +/*****************************************************************//** +Get generic configure status +@return configure status*/ +UNIV_INTERN +int +ib_cfg_get_cfg() +/*============*/ +{ + int cfg_status; + + cfg_status = (ib_binlog_enabled) ? IB_CFG_BINLOG_ENABLED : 0; + + if (ib_mdl_enabled) { + cfg_status |= IB_CFG_MDL_ENABLED; + } + + if (ib_disable_row_lock) { + cfg_status |= IB_CFG_DISABLE_ROWLOCK; + } + + return(cfg_status); +} diff --git a/storage/innobase/api/api0misc.cc b/storage/innobase/api/api0misc.cc new file mode 100644 index 00000000000..b2370105938 --- /dev/null +++ b/storage/innobase/api/api0misc.cc @@ -0,0 +1,206 @@ +/***************************************************************************** + +Copyright (c) 2008, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file api/api0misc.cc +InnoDB Native API + +2008-08-01 Created by Sunny Bains +3/20/2011 Jimmy Yang extracted from Embedded InnoDB +*******************************************************/ + +#include <errno.h> + +#ifdef HAVE_UNISTD_H +#include <unistd.h> +#endif /* HAVE_UNISTD_H */ + +#include "api0misc.h" +#include "trx0roll.h" +#include "srv0srv.h" +#include "dict0mem.h" +#include "dict0dict.h" +#include "pars0pars.h" +#include "row0sel.h" +#include "lock0lock.h" +#include "ha_prototypes.h" +#include <m_ctype.h> +#include <mysys_err.h> +#include <mysql/plugin.h> + +/*********************************************************************//** +Sets a lock on a table. 
+@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +ib_trx_lock_table_with_retry( +/*=========================*/ + trx_t* trx, /*!< in/out: transaction */ + dict_table_t* table, /*!< in: table to lock */ + enum lock_mode mode) /*!< in: LOCK_X or LOCK_S */ +{ + que_thr_t* thr; + dberr_t err; + mem_heap_t* heap; + sel_node_t* node; + + heap = mem_heap_create(512); + + trx->op_info = "setting table lock"; + + node = sel_node_create(heap); + thr = pars_complete_graph_for_exec(node, trx, heap); + thr->graph->state = QUE_FORK_ACTIVE; + + /* We use the select query graph as the dummy graph needed + in the lock module call */ + + thr = que_fork_get_first_thr(static_cast<que_fork_t*>( + que_node_get_parent(thr))); + que_thr_move_to_run_state_for_mysql(thr, trx); + +run_again: + thr->run_node = thr; + thr->prev_node = thr->common.parent; + + err = lock_table(0, table, mode, thr); + + trx->error_state = err; + + if (UNIV_LIKELY(err == DB_SUCCESS)) { + que_thr_stop_for_mysql_no_error(thr, trx); + } else { + que_thr_stop_for_mysql(thr); + + if (err != DB_QUE_THR_SUSPENDED) { + ibool was_lock_wait; + + was_lock_wait = ib_handle_errors(&err, trx, thr, NULL); + + if (was_lock_wait) { + goto run_again; + } + } else { + que_thr_t* run_thr; + que_node_t* parent; + + parent = que_node_get_parent(thr); + run_thr = que_fork_start_command( + static_cast<que_fork_t*>(parent)); + + ut_a(run_thr == thr); + + /* There was a lock wait but the thread was not + in a ready to run or running state. */ + trx->error_state = DB_LOCK_WAIT; + + goto run_again; + } + } + + que_graph_free(thr->graph); + trx->op_info = ""; + + return(err); +} +/****************************************************************//** +Handles user errors and lock waits detected by the database engine. 
+@return TRUE if it was a lock wait and we should continue running +the query thread */ +UNIV_INTERN +ibool +ib_handle_errors( +/*=============*/ + dberr_t* new_err,/*!< out: possible new error encountered in + lock wait, or if no new error, the value + of trx->error_state at the entry of this + function */ + trx_t* trx, /*!< in: transaction */ + que_thr_t* thr, /*!< in: query thread */ + trx_savept_t* savept) /*!< in: savepoint or NULL */ +{ + dberr_t err; +handle_new_error: + err = trx->error_state; + + ut_a(err != DB_SUCCESS); + + trx->error_state = DB_SUCCESS; + + switch (err) { + case DB_LOCK_WAIT_TIMEOUT: + trx_rollback_for_mysql(trx); + break; + /* fall through */ + case DB_DUPLICATE_KEY: + case DB_FOREIGN_DUPLICATE_KEY: + case DB_TOO_BIG_RECORD: + case DB_ROW_IS_REFERENCED: + case DB_NO_REFERENCED_ROW: + case DB_CANNOT_ADD_CONSTRAINT: + case DB_TOO_MANY_CONCURRENT_TRXS: + case DB_OUT_OF_FILE_SPACE: + if (savept) { + /* Roll back the latest, possibly incomplete + insertion or update */ + + trx_rollback_to_savepoint(trx, savept); + } + break; + case DB_LOCK_WAIT: + lock_wait_suspend_thread(thr); + + if (trx->error_state != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + goto handle_new_error; + } + + *new_err = err; + + return(TRUE); /* Operation needs to be retried. 
*/ + + case DB_DEADLOCK: + case DB_LOCK_TABLE_FULL: + /* Roll back the whole transaction; this resolution was added + to version 3.23.43 */ + + trx_rollback_for_mysql(trx); + break; + + case DB_MUST_GET_MORE_FILE_SPACE: + + exit(1); + + case DB_CORRUPTION: + case DB_FOREIGN_EXCEED_MAX_CASCADE: + break; + default: + ut_error; + } + + if (trx->error_state != DB_SUCCESS) { + *new_err = trx->error_state; + } else { + *new_err = err; + } + + trx->error_state = DB_SUCCESS; + + return(FALSE); +} diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc index 8b7a19777ab..e3e127c3ace 100644 --- a/storage/innobase/btr/btr0btr.cc +++ b/storage/innobase/btr/btr0btr.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -697,14 +698,16 @@ btr_root_fseg_validate( #endif /* UNIV_BTR_DEBUG */ /**************************************************************//** -Gets the root node of a tree and x-latches it. -@return root page, x-latched */ +Gets the root node of a tree and x- or s-latches it. 
+@return root page, x- or s-latched */ static buf_block_t* btr_root_block_get( /*===============*/ - dict_index_t* index, /*!< in: index tree */ - mtr_t* mtr) /*!< in: mtr */ + const dict_index_t* index, /*!< in: index tree */ + ulint mode, /*!< in: either RW_S_LATCH + or RW_X_LATCH */ + mtr_t* mtr) /*!< in: mtr */ { ulint space; ulint zip_size; @@ -715,8 +718,7 @@ btr_root_block_get( zip_size = dict_table_zip_size(index->table); root_page_no = dict_index_get_page(index); - block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, - index, mtr); + block = btr_block_get(space, zip_size, root_page_no, mode, index, mtr); btr_assert_not_corrupted(block, index); #ifdef UNIV_BTR_DEBUG if (!dict_index_is_ibuf(index)) { @@ -739,10 +741,162 @@ UNIV_INTERN page_t* btr_root_get( /*=========*/ + const dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr) /*!< in: mtr */ +{ + return(buf_block_get_frame(btr_root_block_get(index, RW_X_LATCH, + mtr))); +} + +/**************************************************************//** +Gets the height of the B-tree (the level of the root, when the leaf +level is assumed to be 0). The caller must hold an S or X latch on +the index. +@return tree height (level of the root) */ +UNIV_INTERN +ulint +btr_height_get( +/*===========*/ dict_index_t* index, /*!< in: index tree */ - mtr_t* mtr) /*!< in: mtr */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint height; + buf_block_t* root_block; + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_S_LOCK) + || mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + + /* S latches the page */ + root_block = btr_root_block_get(index, RW_S_LATCH, mtr); + + height = btr_page_get_level(buf_block_get_frame(root_block), mtr); + + /* Release the S latch on the root page. 
*/ + mtr_memo_release(mtr, root_block, MTR_MEMO_PAGE_S_FIX); +#ifdef UNIV_SYNC_DEBUG + sync_thread_reset_level(&root_block->lock); +#endif /* UNIV_SYNC_DEBUG */ + + return(height); +} + +/**************************************************************//** +Checks a file segment header within a B-tree root page and updates +the segment header space id. +@return TRUE if valid */ +static +bool +btr_root_fseg_adjust_on_import( +/*===========================*/ + fseg_header_t* seg_header, /*!< in/out: segment header */ + page_zip_des_t* page_zip, /*!< in/out: compressed page, + or NULL */ + ulint space, /*!< in: tablespace identifier */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { - return(buf_block_get_frame(btr_root_block_get(index, mtr))); + ulint offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET); + + if (offset < FIL_PAGE_DATA + || offset > UNIV_PAGE_SIZE - FIL_PAGE_DATA_END) { + + return(FALSE); + + } else if (page_zip) { + mach_write_to_4(seg_header + FSEG_HDR_SPACE, space); + page_zip_write_header(page_zip, seg_header + FSEG_HDR_SPACE, + 4, mtr); + } else { + mlog_write_ulint(seg_header + FSEG_HDR_SPACE, + space, MLOG_4BYTES, mtr); + } + + return(TRUE); +} + +/**************************************************************//** +Checks and adjusts the root node of a tree during IMPORT TABLESPACE. 
+@return error code, or DB_SUCCESS */ +UNIV_INTERN +dberr_t +btr_root_adjust_on_import( +/*======================*/ + const dict_index_t* index) /*!< in: index tree */ +{ + dberr_t err; + mtr_t mtr; + page_t* page; + buf_block_t* block; + page_zip_des_t* page_zip; + dict_table_t* table = index->table; + ulint space_id = dict_index_get_space(index); + ulint zip_size = dict_table_zip_size(table); + ulint root_page_no = dict_index_get_page(index); + + mtr_start(&mtr); + + mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); + + DBUG_EXECUTE_IF("ib_import_trigger_corruption_3", + return(DB_CORRUPTION);); + + block = btr_block_get( + space_id, zip_size, root_page_no, RW_X_LATCH, index, &mtr); + + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + + /* Check that this is a B-tree page and both the PREV and NEXT + pointers are FIL_NULL, because the root page does not have any + siblings. */ + if (fil_page_get_type(page) != FIL_PAGE_INDEX + || fil_page_get_prev(page) != FIL_NULL + || fil_page_get_next(page) != FIL_NULL) { + + err = DB_CORRUPTION; + + } else if (dict_index_is_clust(index)) { + bool page_is_compact_format; + + page_is_compact_format = page_is_comp(page) > 0; + + /* Check if the page format and table format agree. */ + if (page_is_compact_format != dict_table_is_comp(table)) { + err = DB_CORRUPTION; + } else { + + /* Check that the table flags and the tablespace + flags match. */ + ulint flags = fil_space_get_flags(table->space); + + if (flags + && flags != dict_tf_to_fsp_flags(table->flags)) { + + err = DB_CORRUPTION; + } else { + err = DB_SUCCESS; + } + } + } else { + err = DB_SUCCESS; + } + + /* Check and adjust the file segment headers, if all OK so far. 
*/ + if (err == DB_SUCCESS + && (!btr_root_fseg_adjust_on_import( + FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + page, page_zip, space_id, &mtr) + || !btr_root_fseg_adjust_on_import( + FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + page, page_zip, space_id, &mtr))) { + + err = DB_CORRUPTION; + } + + mtr_commit(&mtr); + + return(err); } /*************************************************************//** @@ -1033,8 +1187,7 @@ btr_get_size( ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), MTR_MEMO_S_LOCK)); - if (index->page == FIL_NULL - || index->to_be_dropped + if (index->page == FIL_NULL || dict_index_is_online_ddl(index) || *index->name == TEMP_INDEX_PREFIX) { return(ULINT_UNDEFINED); } @@ -1584,6 +1737,8 @@ btr_page_reorganize_low( there cannot exist locks on the page, and a hash index should not be dropped: it cannot exist */ + ulint compression_level,/*!< in: compression level to be used + if dealing with compressed page */ buf_block_t* block, /*!< in: page to be reorganized */ dict_index_t* index, /*!< in: record descriptor */ mtr_t* mtr) /*!< in: mtr */ @@ -1601,6 +1756,8 @@ btr_page_reorganize_low( ulint max_ins_size1; ulint max_ins_size2; ibool success = FALSE; + byte type; + byte* log_ptr; ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); btr_assert_not_corrupted(block, index); @@ -1612,9 +1769,23 @@ btr_page_reorganize_low( #ifndef UNIV_HOTBACKUP /* Write the log record */ - mlog_open_and_write_index(mtr, page, index, page_is_comp(page) - ? MLOG_COMP_PAGE_REORGANIZE - : MLOG_PAGE_REORGANIZE, 0); + if (page_zip) { + type = MLOG_ZIP_PAGE_REORGANIZE; + } else if (page_is_comp(page)) { + type = MLOG_COMP_PAGE_REORGANIZE; + } else { + type = MLOG_PAGE_REORGANIZE; + } + + log_ptr = mlog_open_and_write_index( + mtr, page, index, type, page_zip ? 1 : 0); + + /* For compressed pages write the compression level. 
*/ + if (log_ptr && page_zip) { + mach_write_to_1(log_ptr, compression_level); + mlog_close(mtr, log_ptr + 1); + } + #endif /* !UNIV_HOTBACKUP */ /* Turn logging off */ @@ -1662,7 +1833,9 @@ btr_page_reorganize_low( ut_ad(max_trx_id != 0 || recovery); } - if (page_zip && !page_zip_compress(page_zip, page, index, NULL)) { + if (page_zip + && !page_zip_compress(page_zip, page, index, + compression_level, NULL)) { /* Restore the old page and exit. */ btr_blob_dbg_restore(page, temp_page, index, @@ -1750,7 +1923,8 @@ btr_page_reorganize( dict_index_t* index, /*!< in: record descriptor */ mtr_t* mtr) /*!< in: mtr */ { - return(btr_page_reorganize_low(FALSE, block, index, mtr)); + return(btr_page_reorganize_low(FALSE, page_compression_level, + block, index, mtr)); } #endif /* !UNIV_HOTBACKUP */ @@ -1762,18 +1936,32 @@ byte* btr_parse_page_reorganize( /*======================*/ byte* ptr, /*!< in: buffer */ - byte* end_ptr __attribute__((unused)), - /*!< in: buffer end */ + byte* end_ptr,/*!< in: buffer end */ dict_index_t* index, /*!< in: record descriptor */ + bool compressed,/*!< in: true if compressed page */ buf_block_t* block, /*!< in: page to be reorganized, or NULL */ mtr_t* mtr) /*!< in: mtr or NULL */ { + ulint level = page_compression_level; + ut_ad(ptr && end_ptr); - /* The record is empty, except for the record initial part */ + /* If dealing with a compressed page the record has the + compression level used during original compression written in + one byte. Otherwise record is empty. 
*/ + if (compressed) { + if (ptr == end_ptr) { + return(NULL); + } + + level = (ulint)mach_read_from_1(ptr); + + ut_a(level <= 9); + ++ptr; + } if (block != NULL) { - btr_page_reorganize_low(TRUE, block, index, mtr); + btr_page_reorganize_low(TRUE, level, block, index, mtr); } return(ptr); @@ -1827,10 +2015,13 @@ UNIV_INTERN rec_t* btr_root_raise_and_insert( /*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ btr_cur_t* cursor, /*!< in: cursor at which to insert: must be on the root page; when the function returns, the cursor is positioned on the predecessor of the inserted record */ + ulint** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ const dtuple_t* tuple, /*!< in: tuple to insert */ ulint n_ext, /*!< in: number of externally stored columns */ mtr_t* mtr) /*!< in: mtr */ @@ -1840,7 +2031,6 @@ btr_root_raise_and_insert( page_t* new_page; ulint new_page_no; rec_t* rec; - mem_heap_t* heap; dtuple_t* node_ptr; ulint level; rec_t* node_ptr_rec; @@ -1926,7 +2116,9 @@ btr_root_raise_and_insert( lock_update_root_raise(new_block, root_block); /* Create a memory heap where the node pointer is stored */ - heap = mem_heap_create(100); + if (!*heap) { + *heap = mem_heap_create(1000); + } rec = page_rec_get_next(page_get_infimum_rec(new_page)); new_page_no = buf_block_get_page_no(new_block); @@ -1934,8 +2126,8 @@ btr_root_raise_and_insert( /* Build the node pointer (= node key and page address) for the child */ - node_ptr = dict_index_build_node_ptr(index, rec, new_page_no, heap, - level); + node_ptr = dict_index_build_node_ptr( + index, rec, new_page_no, *heap, level); /* The node pointer must be marked as the predefined minimum record, as there is no lower alphabetical limit to records in the leftmost node of a level: */ @@ -1961,15 +2153,12 @@ btr_root_raise_and_insert( page_cur_set_before_first(root_block, page_cursor); node_ptr_rec = 
page_cur_tuple_insert(page_cursor, node_ptr, - index, 0, mtr); + index, offsets, heap, 0, mtr); /* The root page should only contain the node pointer to new_page at this point. Thus, the data should fit. */ ut_a(node_ptr_rec); - /* Free the memory heap */ - mem_heap_free(heap); - /* We play safe and reset the free bits for the new page */ #if 0 @@ -1985,7 +2174,8 @@ btr_root_raise_and_insert( PAGE_CUR_LE, page_cursor); /* Split the child and insert tuple */ - return(btr_page_split_and_insert(cursor, tuple, n_ext, mtr)); + return(btr_page_split_and_insert(flags, cursor, offsets, heap, + tuple, n_ext, mtr)); } /*************************************************************//** @@ -2213,9 +2403,9 @@ func_exit: /*************************************************************//** Returns TRUE if the insert fits on the appropriate half-page with the chosen split_rec. -@return TRUE if fits */ -static -ibool +@return true if fits */ +static __attribute__((nonnull(1,3,4,6), warn_unused_result)) +bool btr_page_insert_fits( /*=================*/ btr_cur_t* cursor, /*!< in: cursor at which insert @@ -2223,11 +2413,11 @@ btr_page_insert_fits( const rec_t* split_rec,/*!< in: suggestion for first record on upper half-page, or NULL if tuple to be inserted should be first */ - const ulint* offsets,/*!< in: rec_get_offsets( - split_rec, cursor->index) */ + ulint** offsets,/*!< in: rec_get_offsets( + split_rec, cursor->index); out: garbage */ const dtuple_t* tuple, /*!< in: tuple to insert */ ulint n_ext, /*!< in: number of externally stored columns */ - mem_heap_t* heap) /*!< in: temporary memory heap */ + mem_heap_t** heap) /*!< in: temporary memory heap */ { page_t* page; ulint insert_size; @@ -2236,15 +2426,13 @@ btr_page_insert_fits( ulint total_n_recs; const rec_t* rec; const rec_t* end_rec; - ulint* offs; page = btr_cur_get_page(cursor); - ut_ad(!split_rec == !offsets); - ut_ad(!offsets - || !page_is_comp(page) == !rec_offs_comp(offsets)); - ut_ad(!offsets - || 
rec_offs_validate(split_rec, cursor->index, offsets)); + ut_ad(!split_rec + || !page_is_comp(page) == !rec_offs_comp(*offsets)); + ut_ad(!split_rec + || rec_offs_validate(split_rec, cursor->index, *offsets)); insert_size = rec_get_converted_size(cursor->index, tuple, n_ext); free_space = page_get_free_space_of_empty(page_is_comp(page)); @@ -2262,7 +2450,7 @@ btr_page_insert_fits( rec = page_rec_get_next(page_get_infimum_rec(page)); end_rec = page_rec_get_next(btr_cur_get_rec(cursor)); - } else if (cmp_dtuple_rec(tuple, split_rec, offsets) >= 0) { + } else if (cmp_dtuple_rec(tuple, split_rec, *offsets) >= 0) { rec = page_rec_get_next(page_get_infimum_rec(page)); end_rec = split_rec; @@ -2277,19 +2465,17 @@ btr_page_insert_fits( /* Ok, there will be enough available space on the half page where the tuple is inserted */ - return(TRUE); + return(true); } - offs = NULL; - while (rec != end_rec) { /* In this loop we calculate the amount of reserved space after rec is removed from page. */ - offs = rec_get_offsets(rec, cursor->index, offs, - ULINT_UNDEFINED, &heap); + *offsets = rec_get_offsets(rec, cursor->index, *offsets, + ULINT_UNDEFINED, heap); - total_data -= rec_offs_size(offs); + total_data -= rec_offs_size(*offsets); total_n_recs--; if (total_data + page_dir_calc_reserved_space(total_n_recs) @@ -2298,13 +2484,13 @@ btr_page_insert_fits( /* Ok, there will be enough available space on the half page where the tuple is inserted */ - return(TRUE); + return(true); } rec = page_rec_get_next_const(rec); } - return(FALSE); + return(false); } /*******************************************************//** @@ -2314,6 +2500,7 @@ UNIV_INTERN void btr_insert_on_non_leaf_level_func( /*==============================*/ + ulint flags, /*!< in: undo logging and locking flags */ dict_index_t* index, /*!< in: index */ ulint level, /*!< in: level, must be > 0 */ dtuple_t* tuple, /*!< in: the record to be inserted */ @@ -2323,8 +2510,10 @@ btr_insert_on_non_leaf_level_func( { big_rec_t* 
dummy_big_rec; btr_cur_t cursor; - ulint err; + dberr_t err; rec_t* rec; + ulint* offsets = NULL; + mem_heap_t* heap = NULL; ut_ad(level > 0); @@ -2335,26 +2524,35 @@ btr_insert_on_non_leaf_level_func( ut_ad(cursor.flag == BTR_CUR_BINARY); err = btr_cur_optimistic_insert( - BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG - | BTR_NO_UNDO_LOG_FLAG, &cursor, tuple, &rec, - &dummy_big_rec, 0, NULL, mtr); + flags + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG + | BTR_NO_UNDO_LOG_FLAG, + &cursor, &offsets, &heap, + tuple, &rec, &dummy_big_rec, 0, NULL, mtr); if (err == DB_FAIL) { - err = btr_cur_pessimistic_insert( - BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG - | BTR_NO_UNDO_LOG_FLAG, - &cursor, tuple, &rec, &dummy_big_rec, 0, NULL, mtr); + err = btr_cur_pessimistic_insert(flags + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG + | BTR_NO_UNDO_LOG_FLAG, + &cursor, &offsets, &heap, + tuple, &rec, + &dummy_big_rec, 0, NULL, mtr); ut_a(err == DB_SUCCESS); } + mem_heap_free(heap); } /**************************************************************//** Attaches the halves of an index page on the appropriate level in an index tree. */ -static +static __attribute__((nonnull)) void btr_attach_half_pages( /*==================*/ + ulint flags, /*!< in: undo logging and + locking flags */ dict_index_t* index, /*!< in: the index tree */ buf_block_t* block, /*!< in/out: page to be split */ const rec_t* split_rec, /*!< in: first record on upper @@ -2432,7 +2630,8 @@ btr_attach_half_pages( /* Insert it next to the pointer to the lower half. Note that this may generate recursion leading to a split on the higher level. */ - btr_insert_on_non_leaf_level(index, level + 1, node_ptr_upper, mtr); + btr_insert_on_non_leaf_level(flags, index, level + 1, + node_ptr_upper, mtr); /* Free the memory heap */ mem_heap_free(heap); @@ -2484,13 +2683,13 @@ btr_attach_half_pages( /*************************************************************//** Determine if a tuple is smaller than any record on the page. 
@return TRUE if smaller */ -static -ibool +static __attribute__((nonnull, warn_unused_result)) +bool btr_page_tuple_smaller( /*===================*/ btr_cur_t* cursor, /*!< in: b-tree cursor */ const dtuple_t* tuple, /*!< in: tuple to consider */ - ulint* offsets,/*!< in/out: temporary storage */ + ulint** offsets,/*!< in/out: temporary storage */ ulint n_uniq, /*!< in: number of unique fields in the index page records */ mem_heap_t** heap) /*!< in/out: heap for offsets */ @@ -2505,11 +2704,11 @@ btr_page_tuple_smaller( page_cur_move_to_next(&pcur); first_rec = page_cur_get_rec(&pcur); - offsets = rec_get_offsets( - first_rec, cursor->index, offsets, + *offsets = rec_get_offsets( + first_rec, cursor->index, *offsets, n_uniq, heap); - return(cmp_dtuple_rec(tuple, first_rec, offsets) < 0); + return(cmp_dtuple_rec(tuple, first_rec, *offsets) < 0); } /*************************************************************//** @@ -2525,9 +2724,12 @@ UNIV_INTERN rec_t* btr_page_split_and_insert( /*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ btr_cur_t* cursor, /*!< in: cursor at which to insert; when the function returns, the cursor is positioned on the predecessor of the inserted record */ + ulint** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ const dtuple_t* tuple, /*!< in: tuple to insert */ ulint n_ext, /*!< in: number of externally stored columns */ mtr_t* mtr) /*!< in: mtr */ @@ -2553,18 +2755,21 @@ btr_page_split_and_insert( ibool insert_left; ulint n_iterations = 0; rec_t* rec; - mem_heap_t* heap; ulint n_uniq; - ulint* offsets; - heap = mem_heap_create(1024); + if (!*heap) { + *heap = mem_heap_create(1024); + } n_uniq = dict_index_get_n_unique_in_tree(cursor->index); func_start: - mem_heap_empty(heap); - offsets = NULL; + mem_heap_empty(*heap); + *offsets = NULL; ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index), MTR_MEMO_X_LOCK)); + 
ut_ad(!dict_index_is_online_ddl(cursor->index) + || (flags & BTR_CREATE_FLAG) + || dict_index_is_clust(cursor->index)); #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(dict_index_get_lock(cursor->index), RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ @@ -2590,7 +2795,7 @@ func_start: if (split_rec == NULL) { insert_left = btr_page_tuple_smaller( - cursor, tuple, offsets, n_uniq, &heap); + cursor, tuple, offsets, n_uniq, heap); } } else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) { direction = FSP_UP; @@ -2612,7 +2817,7 @@ func_start: if (page_get_n_recs(page) > 1) { split_rec = page_get_middle_rec(page); } else if (btr_page_tuple_smaller(cursor, tuple, - offsets, n_uniq, &heap)) { + offsets, n_uniq, heap)) { split_rec = page_rec_get_next( page_get_infimum_rec(page)); } else { @@ -2635,10 +2840,10 @@ func_start: if (split_rec) { first_rec = move_limit = split_rec; - offsets = rec_get_offsets(split_rec, cursor->index, offsets, - n_uniq, &heap); + *offsets = rec_get_offsets(split_rec, cursor->index, *offsets, + n_uniq, heap); - insert_left = cmp_dtuple_rec(tuple, split_rec, offsets) < 0; + insert_left = cmp_dtuple_rec(tuple, split_rec, *offsets) < 0; if (!insert_left && new_page_zip && n_iterations > 0) { /* If a compressed page has already been split, @@ -2665,7 +2870,7 @@ insert_empty: /* 4. 
Do first the modifications in the tree structure */ - btr_attach_half_pages(cursor->index, block, + btr_attach_half_pages(flags, cursor->index, block, first_rec, new_block, direction, mtr); /* If the split is made on the leaf level and the insert will fit @@ -2685,10 +2890,11 @@ insert_empty: insert_will_fit = !new_page_zip && btr_page_insert_fits(cursor, NULL, - NULL, tuple, n_ext, heap); + offsets, tuple, n_ext, heap); } - if (insert_will_fit && page_is_leaf(page)) { + if (insert_will_fit && page_is_leaf(page) + && !dict_index_is_online_ddl(cursor->index)) { mtr_memo_release(mtr, dict_index_get_lock(cursor->index), MTR_MEMO_X_LOCK); @@ -2805,8 +3011,8 @@ insert_empty: page_cur_search(insert_block, cursor->index, tuple, PAGE_CUR_LE, page_cursor); - rec = page_cur_tuple_insert(page_cursor, tuple, - cursor->index, n_ext, mtr); + rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, + offsets, heap, n_ext, mtr); #ifdef UNIV_ZIP_DEBUG { @@ -2837,7 +3043,7 @@ insert_empty: page_cur_search(insert_block, cursor->index, tuple, PAGE_CUR_LE, page_cursor); rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, - n_ext, mtr); + offsets, heap, n_ext, mtr); if (rec == NULL) { /* The insert did not fit on the page: loop back to the @@ -2878,7 +3084,7 @@ func_exit: ut_ad(page_validate(buf_block_get_frame(left_block), cursor->index)); ut_ad(page_validate(buf_block_get_frame(right_block), cursor->index)); - mem_heap_free(heap); + ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets)); return(rec); } @@ -3058,15 +3264,15 @@ btr_node_ptr_delete( { btr_cur_t cursor; ibool compressed; - ulint err; + dberr_t err; ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); /* Delete node pointer on father page */ btr_page_get_father(index, block, mtr, &cursor); - compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor, RB_NONE, - mtr); + compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor, + BTR_CREATE_FLAG, RB_NONE, mtr); ut_a(err == DB_SUCCESS); if 
(!compressed) { @@ -3098,6 +3304,8 @@ btr_lift_page_up( buf_block_t* blocks[BTR_MAX_LEVELS]; ulint n_blocks; /*!< last used index in blocks[] */ ulint i; + bool lift_father_up; + buf_block_t* block_orig = block; ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL); ut_ad(btr_page_get_next(page, mtr) == FIL_NULL); @@ -3108,11 +3316,13 @@ btr_lift_page_up( { btr_cur_t cursor; - mem_heap_t* heap = mem_heap_create(100); - ulint* offsets; + ulint* offsets = NULL; + mem_heap_t* heap = mem_heap_create( + sizeof(*offsets) + * (REC_OFFS_HEADER_SIZE + 1 + 1 + index->n_fields)); buf_block_t* b; - offsets = btr_page_get_father_block(NULL, heap, index, + offsets = btr_page_get_father_block(offsets, heap, index, block, mtr, &cursor); father_block = btr_cur_get_block(&cursor); father_page_zip = buf_block_get_page_zip(father_block); @@ -3136,6 +3346,29 @@ btr_lift_page_up( blocks[n_blocks++] = b = btr_cur_get_block(&cursor); } + lift_father_up = (n_blocks && page_level == 0); + if (lift_father_up) { + /* The father page also should be the only on its level (not + root). We should lift up the father page at first. + Because the leaf page should be lifted up only for root page. + The freeing page is based on page_level (==0 or !=0) + to choose segment. 
If the page_level is changed ==0 from !=0, + later freeing of the page doesn't find the page allocation + to be freed.*/ + + block = father_block; + page = buf_block_get_frame(block); + page_level = btr_page_get_level(page, mtr); + + ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL); + ut_ad(btr_page_get_next(page, mtr) == FIL_NULL); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + + father_block = blocks[0]; + father_page_zip = buf_block_get_page_zip(father_block); + father_page = buf_block_get_frame(father_block); + } + mem_heap_free(heap); } @@ -3143,6 +3376,7 @@ btr_lift_page_up( /* Make the father empty */ btr_page_empty(father_block, father_page_zip, index, page_level, mtr); + page_level++; /* Copy the records to the father page one by one. */ if (0 @@ -3174,7 +3408,7 @@ btr_lift_page_up( lock_update_copy_and_discard(father_block, block); /* Go upward to root page, decrementing levels by one. */ - for (i = 0; i < n_blocks; i++, page_level++) { + for (i = lift_father_up ? 1 : 0; i < n_blocks; i++, page_level++) { page_t* page = buf_block_get_frame(blocks[i]); page_zip_des_t* page_zip= buf_block_get_page_zip(blocks[i]); @@ -3196,7 +3430,7 @@ btr_lift_page_up( ut_ad(page_validate(father_page, index)); ut_ad(btr_check_node_ptr(index, father_block, mtr)); - return(father_block); + return(lift_father_up ? block_orig : father_block); } /*************************************************************//** @@ -3267,6 +3501,7 @@ btr_compress( if (adjust) { nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor)); + ut_ad(nth_rec > 0); } /* Decide the page to which we try to merge and which will inherit @@ -3323,6 +3558,16 @@ err_exit: return(FALSE); } + /* If compression padding tells us that merging will result in + too packed up page i.e.: which is likely to cause compression + failure then don't merge the pages. 
*/ + if (zip_size && page_is_leaf(merge_page) + && (page_get_data_size(merge_page) + data_size + >= dict_index_zip_pad_optimal_page_size(index))) { + + goto err_exit; + } + ut_ad(page_validate(merge_page, index)); max_ins_size = page_get_max_insert_size(merge_page, n_recs); @@ -3502,6 +3747,7 @@ func_exit: mem_heap_free(heap); if (adjust) { + ut_ad(nth_rec > 0); btr_cur_position( index, page_rec_get_nth(merge_block->frame, nth_rec), @@ -3818,7 +4064,7 @@ btr_print_index( mtr_start(&mtr); - root = btr_root_block_get(index, &mtr); + root = btr_root_block_get(index, RW_X_LATCH, &mtr); btr_print_recursive(index, root, width, &heap, &offsets, &mtr); if (heap) { @@ -3827,7 +4073,7 @@ btr_print_index( mtr_commit(&mtr); - btr_validate_index(index, NULL); + btr_validate_index(index, 0); } #endif /* UNIV_BTR_PRINT */ @@ -4013,8 +4259,22 @@ btr_index_page_validate( { page_cur_t cur; ibool ret = TRUE; +#ifndef DBUG_OFF + ulint nth = 1; +#endif /* !DBUG_OFF */ page_cur_set_before_first(block, &cur); + + /* Directory slot 0 should only contain the infimum record. */ + DBUG_EXECUTE_IF("check_table_rec_next", + ut_a(page_rec_get_nth_const( + page_cur_get_page(&cur), 0) + == cur.rec); + ut_a(page_dir_slot_get_n_owned( + page_dir_get_nth_slot( + page_cur_get_page(&cur), 0)) + == 1);); + page_cur_move_to_next(&cur); for (;;) { @@ -4028,6 +4288,16 @@ btr_index_page_validate( return(FALSE); } + /* Verify that page_rec_get_nth_const() is correctly + retrieving each record. */ + DBUG_EXECUTE_IF("check_table_rec_next", + ut_a(cur.rec == page_rec_get_nth_const( + page_cur_get_page(&cur), + page_rec_get_n_recs_before( + cur.rec))); + ut_a(nth++ == page_rec_get_n_recs_before( + cur.rec));); + page_cur_move_to_next(&cur); } @@ -4078,14 +4348,15 @@ btr_validate_report2( Validates index tree level. 
@return TRUE if ok */ static -ibool +bool btr_validate_level( /*===============*/ dict_index_t* index, /*!< in: index tree */ - trx_t* trx, /*!< in: transaction or NULL */ + const trx_t* trx, /*!< in: transaction or NULL */ ulint level) /*!< in: level number */ { ulint space; + ulint space_flags; ulint zip_size; buf_block_t* block; page_t* page; @@ -4099,9 +4370,10 @@ btr_validate_level( ulint left_page_no; page_cur_t cursor; dtuple_t* node_ptr_tuple; - ibool ret = TRUE; + bool ret = true; mtr_t mtr; mem_heap_t* heap = mem_heap_create(256); + fseg_header_t* seg; ulint* offsets = NULL; ulint* offsets2= NULL; #ifdef UNIV_ZIP_DEBUG @@ -4112,15 +4384,39 @@ btr_validate_level( mtr_x_lock(dict_index_get_lock(index), &mtr); - block = btr_root_block_get(index, &mtr); + block = btr_root_block_get(index, RW_X_LATCH, &mtr); page = buf_block_get_frame(block); + seg = page + PAGE_HEADER + PAGE_BTR_SEG_TOP; space = dict_index_get_space(index); zip_size = dict_table_zip_size(index->table); + fil_space_get_latch(space, &space_flags); + + if (zip_size != dict_tf_get_zip_size(space_flags)) { + + ib_logf(IB_LOG_LEVEL_WARN, + "Flags mismatch: table=%lu, tablespace=%lu", + (ulint) index->table->flags, (ulint) space_flags); + + mtr_commit(&mtr); + + return(false); + } + while (level != btr_page_get_level(page, &mtr)) { const rec_t* node_ptr; + if (fseg_page_is_free(seg, + block->page.space, block->page.offset)) { + + btr_validate_report1(index, level, block); + + ib_logf(IB_LOG_LEVEL_WARN, "page is free"); + + ret = false; + } + ut_a(space == buf_block_get_space(block)); ut_a(space == page_get_space_id(page)); #ifdef UNIV_ZIP_DEBUG @@ -4141,12 +4437,13 @@ btr_validate_level( /* Now we are on the desired level. Loop through the pages on that level. */ -loop: - if (trx_is_interrupted(trx)) { - mtr_commit(&mtr); - mem_heap_free(heap); - return(ret); + + if (level == 0) { + /* Leaf pages are managed in their own file segment. 
*/ + seg -= PAGE_BTR_SEG_TOP - PAGE_BTR_SEG_LEAF; } + +loop: mem_heap_empty(heap); offsets = offsets2 = NULL; mtr_x_lock(dict_index_get_lock(index), &mtr); @@ -4156,20 +4453,35 @@ loop: ut_a(!page_zip || page_zip_validate(page_zip, page, index)); #endif /* UNIV_ZIP_DEBUG */ - /* Check ordering etc. of records */ + ut_a(block->page.space == space); + + if (fseg_page_is_free(seg, block->page.space, block->page.offset)) { + + btr_validate_report1(index, level, block); + + ib_logf(IB_LOG_LEVEL_WARN, "Page is marked as free"); + ret = false; + + } else if (btr_page_get_index_id(page) != index->id) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Page index id " IB_ID_FMT " != data dictionary " + "index id " IB_ID_FMT, + btr_page_get_index_id(page), index->id); + + ret = false; + + } else if (!page_validate(page, index)) { - if (!page_validate(page, index)) { btr_validate_report1(index, level, block); + ret = false; + + } else if (level == 0 && !btr_index_page_validate(block, index)) { - ret = FALSE; - } else if (level == 0) { /* We are on level 0. Check that the records have the right number of fields, and field lengths are right. 
*/ - if (!btr_index_page_validate(block, index)) { - - ret = FALSE; - } + ret = false; } ut_a(btr_page_get_level(page, &mtr) == level); @@ -4195,7 +4507,7 @@ loop: buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH); buf_page_print(right_page, 0, BUF_PAGE_PRINT_NO_CRASH); - ret = FALSE; + ret = false; } if (page_is_comp(right_page) != page_is_comp(page)) { @@ -4204,7 +4516,7 @@ loop: buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH); buf_page_print(right_page, 0, BUF_PAGE_PRINT_NO_CRASH); - ret = FALSE; + ret = false; goto node_ptr_fails; } @@ -4237,7 +4549,7 @@ loop: rec_print(stderr, rec, index); putc('\n', stderr); - ret = FALSE; + ret = false; } } @@ -4288,7 +4600,7 @@ loop: fputs("InnoDB: record on page ", stderr); rec_print_new(stderr, rec, offsets); putc('\n', stderr); - ret = FALSE; + ret = false; goto node_ptr_fails; } @@ -4318,7 +4630,7 @@ loop: fputs("InnoDB: first rec ", stderr); rec_print(stderr, first_rec, index); putc('\n', stderr); - ret = FALSE; + ret = false; goto node_ptr_fails; } @@ -4346,7 +4658,7 @@ loop: if (btr_cur_get_rec(&right_node_cur) != right_node_ptr) { - ret = FALSE; + ret = false; fputs("InnoDB: node pointer to" " the right page is wrong\n", stderr); @@ -4372,7 +4684,7 @@ loop: != page_rec_get_next( page_get_infimum_rec( right_father_page))) { - ret = FALSE; + ret = false; fputs("InnoDB: node pointer 2 to" " the right page is wrong\n", stderr); @@ -4397,7 +4709,7 @@ loop: if (page_get_page_no(right_father_page) != btr_page_get_next(father_page, &mtr)) { - ret = FALSE; + ret = false; fputs("InnoDB: node pointer 3 to" " the right page is wrong\n", stderr); @@ -4428,17 +4740,23 @@ node_ptr_fails: on the next loop. The page has already been checked. */ mtr_commit(&mtr); - if (right_page_no != FIL_NULL) { + if (trx_is_interrupted(trx)) { + /* On interrupt, return the current status. 
*/ + } else if (right_page_no != FIL_NULL) { + mtr_start(&mtr); - block = btr_block_get(space, zip_size, right_page_no, - RW_X_LATCH, index, &mtr); + block = btr_block_get( + space, zip_size, right_page_no, + RW_X_LATCH, index, &mtr); + page = buf_block_get_frame(block); goto loop; } mem_heap_free(heap); + return(ret); } @@ -4446,40 +4764,39 @@ node_ptr_fails: Checks the consistency of an index tree. @return TRUE if ok */ UNIV_INTERN -ibool +bool btr_validate_index( /*===============*/ dict_index_t* index, /*!< in: index */ - trx_t* trx) /*!< in: transaction or NULL */ + const trx_t* trx) /*!< in: transaction or NULL */ { - mtr_t mtr; - page_t* root; - ulint i; - ulint n; - /* Full Text index are implemented by auxiliary tables, not the B-tree */ - if (index->type & DICT_FTS) { - return(TRUE); + if (dict_index_is_online_ddl(index) || (index->type & DICT_FTS)) { + return(true); } + mtr_t mtr; + mtr_start(&mtr); - mtr_x_lock(dict_index_get_lock(index), &mtr); - root = btr_root_get(index, &mtr); - n = btr_page_get_level(root, &mtr); + mtr_x_lock(dict_index_get_lock(index), &mtr); - for (i = 0; i <= n && !trx_is_interrupted(trx); i++) { - if (!btr_validate_level(index, trx, n - i)) { + bool ok = true; + page_t* root = btr_root_get(index, &mtr); + ulint n = btr_page_get_level(root, &mtr); - mtr_commit(&mtr); + for (ulint i = 0; i <= n; ++i) { - return(FALSE); + if (!btr_validate_level(index, trx, n - i)) { + ok = false; + break; } } mtr_commit(&mtr); - return(TRUE); + return(ok); } + #endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index aeb16200f80..913b2088f24 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -2,6 +2,7 @@ Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. +Copyright (c) 2012, Facebook Inc. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -57,6 +58,7 @@ Created 10/16/1994 Heikki Tuuri #include "buf0lru.h" #include "btr0btr.h" #include "btr0sea.h" +#include "row0log.h" #include "row0purge.h" #include "row0upd.h" #include "trx0rec.h" @@ -69,13 +71,13 @@ Created 10/16/1994 Heikki Tuuri #include "zlib.h" /** Buffered B-tree operation types, introduced as part of delete buffering. */ -typedef enum btr_op_enum { +enum btr_op_t { BTR_NO_OP = 0, /*!< Not buffered */ BTR_INSERT_OP, /*!< Insert, do not ignore UNIQUE */ BTR_INSERT_IGNORE_UNIQUE_OP, /*!< Insert, ignoring UNIQUE */ BTR_DELETE_OP, /*!< Purge a delete-marked record */ BTR_DELMARK_OP /*!< Mark a record for deletion */ -} btr_op_t; +}; #ifdef UNIV_DEBUG /** If the following is set to TRUE, this module prints a lot of @@ -97,6 +99,11 @@ srv_refresh_innodb_monitor_stats(). Referenced by srv_printf_innodb_monitor(). */ UNIV_INTERN ulint btr_cur_n_sea_old = 0; +#ifdef UNIV_DEBUG +/* Flag to limit optimistic insert records */ +UNIV_INTERN uint btr_cur_limit_optimistic_insert_debug = 0; +#endif /* UNIV_DEBUG */ + /** In the optimistic insert, if the insert does not fit, but this much space can be released by page reorganize, then it is reorganized */ #define BTR_CUR_PAGE_REORGANIZE_LIMIT (UNIV_PAGE_SIZE / 32) @@ -425,6 +432,14 @@ btr_cur_search_to_nth_level( cursor->low_match = ULINT_UNDEFINED; #endif + ibool s_latch_by_caller; + + s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED; + + ut_ad(!s_latch_by_caller + || mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_S_LOCK)); + /* These flags are mutually exclusive, they are lumped together with the latch mode for historical reasons. It's possible for none of the flags to be set. */ @@ -460,11 +475,11 @@ btr_cur_search_to_nth_level( estimate = latch_mode & BTR_ESTIMATE; /* Turn the flags unrelated to the latch mode off. 
*/ - latch_mode &= ~(BTR_INSERT - | BTR_DELETE_MARK - | BTR_DELETE - | BTR_ESTIMATE - | BTR_IGNORE_SEC_UNIQUE); + latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); + + ut_ad(!s_latch_by_caller + || latch_mode == BTR_SEARCH_LEAF + || latch_mode == BTR_MODIFY_LEAF); cursor->flag = BTR_CUR_BINARY; cursor->index = index; @@ -478,16 +493,16 @@ btr_cur_search_to_nth_level( #ifdef BTR_CUR_HASH_ADAPT -#ifdef UNIV_SEARCH_PERF_STAT +# ifdef UNIV_SEARCH_PERF_STAT info->n_searches++; -#endif +# endif if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_NOT_LOCKED && latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ && !estimate -#ifdef PAGE_CUR_LE_OR_EXTENDS +# ifdef PAGE_CUR_LE_OR_EXTENDS && mode != PAGE_CUR_LE_OR_EXTENDS -#endif /* PAGE_CUR_LE_OR_EXTENDS */ +# endif /* PAGE_CUR_LE_OR_EXTENDS */ /* If !has_search_latch, we do a dirty read of btr_search_enabled below, and btr_search_guess_on_hash() will have to check it again. */ @@ -508,7 +523,7 @@ btr_cur_search_to_nth_level( return; } -#endif /* BTR_CUR_HASH_ADAPT */ +# endif /* BTR_CUR_HASH_ADAPT */ #endif /* BTR_CUR_ADAPT */ btr_cur_n_non_sea++; @@ -525,15 +540,19 @@ btr_cur_search_to_nth_level( savepoint = mtr_set_savepoint(mtr); - if (latch_mode == BTR_MODIFY_TREE) { + switch (latch_mode) { + case BTR_MODIFY_TREE: mtr_x_lock(dict_index_get_lock(index), mtr); - - } else if (latch_mode == BTR_CONT_MODIFY_TREE) { + break; + case BTR_CONT_MODIFY_TREE: /* Do nothing */ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), MTR_MEMO_X_LOCK)); - } else { - mtr_s_lock(dict_index_get_lock(index), mtr); + break; + default: + if (!s_latch_by_caller) { + mtr_s_lock(dict_index_get_lock(index), mtr); + } } page_cursor = btr_cur_get_page_cur(cursor); @@ -687,6 +706,7 @@ retry_page_get: ? 
SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE); } + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); ut_ad(index->id == btr_page_get_index_id(page)); if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) { @@ -711,13 +731,17 @@ retry_page_get: cursor, mtr); } - if (latch_mode != BTR_MODIFY_TREE - && latch_mode != BTR_CONT_MODIFY_TREE) { - - /* Release the tree s-latch */ - - mtr_release_s_latch_at_savepoint( - mtr, savepoint, dict_index_get_lock(index)); + switch (latch_mode) { + case BTR_MODIFY_TREE: + case BTR_CONT_MODIFY_TREE: + break; + default: + if (!s_latch_by_caller) { + /* Release the tree s-latch */ + mtr_release_s_latch_at_savepoint( + mtr, savepoint, + dict_index_get_lock(index)); + } } page_mode = mode; @@ -784,8 +808,7 @@ retry_page_get: will properly check btr_search_enabled again in btr_search_build_page_hash_index() before building a page hash index, while holding btr_search_latch. */ - if (UNIV_LIKELY(btr_search_enabled)) { - + if (btr_search_enabled) { btr_search_info_update(index, cursor); } #endif @@ -815,14 +838,16 @@ UNIV_INTERN void btr_cur_open_at_index_side_func( /*============================*/ - ibool from_left, /*!< in: TRUE if open to the low end, - FALSE if to the high end */ + bool from_left, /*!< in: true if open to the low end, + false if to the high end */ dict_index_t* index, /*!< in: index */ ulint latch_mode, /*!< in: latch mode */ - btr_cur_t* cursor, /*!< in: cursor */ + btr_cur_t* cursor, /*!< in/out: cursor */ + ulint level, /*!< in: level to search for + (0=leaf). 
*/ const char* file, /*!< in: file name */ ulint line, /*!< in: line where called */ - mtr_t* mtr) /*!< in: mtr */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { page_cur_t* page_cursor; ulint page_no; @@ -839,16 +864,27 @@ btr_cur_open_at_index_side_func( rec_offs_init(offsets_); estimate = latch_mode & BTR_ESTIMATE; - latch_mode = latch_mode & ~BTR_ESTIMATE; + latch_mode &= ~BTR_ESTIMATE; + + ut_ad(level != ULINT_UNDEFINED); /* Store the position of the tree latch we push to mtr so that we know how to release it when we have latched the leaf node */ savepoint = mtr_set_savepoint(mtr); - if (latch_mode == BTR_MODIFY_TREE) { + switch (latch_mode) { + case BTR_CONT_MODIFY_TREE: + break; + case BTR_MODIFY_TREE: mtr_x_lock(dict_index_get_lock(index), mtr); - } else { + break; + case BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED: + case BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED: + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_S_LOCK)); + break; + default: mtr_s_lock(dict_index_get_lock(index), mtr); } @@ -868,6 +904,7 @@ btr_cur_open_at_index_side_func( RW_NO_LATCH, NULL, BUF_GET, file, line, mtr); page = buf_block_get_frame(block); + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); ut_ad(index->id == btr_page_get_index_id(page)); block->check_index_page_at_flush = TRUE; @@ -877,26 +914,40 @@ btr_cur_open_at_index_side_func( height = btr_page_get_level(page, mtr); root_height = height; + ut_a(height >= level); + } else { + /* TODO: flag the index corrupted if this fails */ + ut_ad(height == btr_page_get_level(page, mtr)); } - if (height == 0) { - btr_cur_latch_leaves(page, space, zip_size, page_no, - latch_mode, cursor, mtr); - - /* In versions <= 3.23.52 we had forgotten to - release the tree latch here. If in an index scan - we had to scan far to find a record visible to the - current transaction, that could starve others - waiting for the tree latch. 
*/ - - if ((latch_mode != BTR_MODIFY_TREE) - && (latch_mode != BTR_CONT_MODIFY_TREE)) { + if (height == level) { + btr_cur_latch_leaves( + page, space, zip_size, page_no, + latch_mode & ~BTR_ALREADY_S_LATCHED, + cursor, mtr); - /* Release the tree s-latch */ + if (height == 0) { + /* In versions <= 3.23.52 we had + forgotten to release the tree latch + here. If in an index scan we had to + scan far to find a record visible to + the current transaction, that could + starve others waiting for the tree + latch. */ + + switch (latch_mode) { + case BTR_MODIFY_TREE: + case BTR_CONT_MODIFY_TREE: + case BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED: + case BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED: + break; + default: + /* Release the tree s-latch */ - mtr_release_s_latch_at_savepoint( - mtr, savepoint, - dict_index_get_lock(index)); + mtr_release_s_latch_at_savepoint( + mtr, savepoint, + dict_index_get_lock(index)); + } } } @@ -906,7 +957,7 @@ btr_cur_open_at_index_side_func( page_cur_set_after_last(block, page_cursor); } - if (height == 0) { + if (height == level) { if (estimate) { btr_cur_add_path_info(cursor, height, root_height); @@ -965,9 +1016,12 @@ btr_cur_open_at_rnd_pos_func( ulint* offsets = offsets_; rec_offs_init(offsets_); - if (latch_mode == BTR_MODIFY_TREE) { + switch (latch_mode) { + case BTR_MODIFY_TREE: mtr_x_lock(dict_index_get_lock(index), mtr); - } else { + break; + default: + ut_ad(latch_mode != BTR_CONT_MODIFY_TREE); mtr_s_lock(dict_index_get_lock(index), mtr); } @@ -988,6 +1042,7 @@ btr_cur_open_at_rnd_pos_func( RW_NO_LATCH, NULL, BUF_GET, file, line, mtr); page = buf_block_get_frame(block); + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); ut_ad(index->id == btr_page_get_index_id(page)); if (height == ULINT_UNDEFINED) { @@ -1032,7 +1087,7 @@ be freed by reorganizing. Differs from btr_cur_optimistic_insert because no heuristics is applied to whether it pays to use CPU time for reorganizing the page or not. 
@return pointer to inserted record if succeed, else NULL */ -static +static __attribute__((nonnull, warn_unused_result)) rec_t* btr_cur_insert_if_possible( /*=======================*/ @@ -1040,6 +1095,8 @@ btr_cur_insert_if_possible( cursor stays valid */ const dtuple_t* tuple, /*!< in: tuple to insert; the size info need not have been stored to tuple */ + ulint** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ ulint n_ext, /*!< in: number of externally stored columns */ mtr_t* mtr) /*!< in: mtr */ { @@ -1055,8 +1112,8 @@ btr_cur_insert_if_possible( page_cursor = btr_cur_get_page_cur(cursor); /* Now, try the insert */ - rec = page_cur_tuple_insert(page_cursor, tuple, - cursor->index, n_ext, mtr); + rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, + offsets, heap, n_ext, mtr); if (UNIV_UNLIKELY(!rec)) { /* If record did not fit, reorganize */ @@ -1066,19 +1123,21 @@ btr_cur_insert_if_possible( page_cur_search(block, cursor->index, tuple, PAGE_CUR_LE, page_cursor); - rec = page_cur_tuple_insert(page_cursor, tuple, - cursor->index, n_ext, mtr); + rec = page_cur_tuple_insert( + page_cursor, tuple, cursor->index, + offsets, heap, n_ext, mtr); } } + ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets)); return(rec); } /*************************************************************//** For an insert, checks the locks and does the undo logging if desired. 
@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */ -UNIV_INLINE -ulint +UNIV_INLINE __attribute__((warn_unused_result, nonnull(2,3,5,6))) +dberr_t btr_cur_ins_lock_and_undo( /*======================*/ ulint flags, /*!< in: undo logging and locking flags: if @@ -1093,7 +1152,7 @@ btr_cur_ins_lock_and_undo( successor record */ { dict_index_t* index; - ulint err; + dberr_t err; rec_t* rec; roll_ptr_t roll_ptr; @@ -1103,6 +1162,10 @@ btr_cur_ins_lock_and_undo( rec = btr_cur_get_rec(cursor); index = cursor->index; + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + err = lock_rec_insert_check_and_lock(flags, rec, btr_cur_get_block(cursor), index, thr, mtr, inherit); @@ -1115,7 +1178,7 @@ btr_cur_ins_lock_and_undo( err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP, thr, index, entry, - NULL, 0, NULL, + NULL, 0, NULL, NULL, &roll_ptr); if (err != DB_SUCCESS) { @@ -1140,13 +1203,13 @@ static void btr_cur_trx_report( /*===============*/ - trx_t* trx, /*!< in: transaction */ + trx_id_t trx_id, /*!< in: transaction id */ const dict_index_t* index, /*!< in: index */ const char* op) /*!< in: operation */ { - fprintf(stderr, "Trx with id " TRX_ID_FMT " going to ", trx->id); + fprintf(stderr, "Trx with id " TRX_ID_FMT " going to ", trx_id); fputs(op, stderr); - dict_index_name_print(stderr, trx, index); + dict_index_name_print(stderr, NULL, index); putc('\n', stderr); } #endif /* UNIV_DEBUG */ @@ -1159,7 +1222,7 @@ one record on the page, the insert will always succeed; this is to prevent trying to split a page with just one record. 
@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */ UNIV_INTERN -ulint +dberr_t btr_cur_optimistic_insert( /*======================*/ ulint flags, /*!< in: undo logging and locking flags: if not @@ -1167,6 +1230,8 @@ btr_cur_optimistic_insert( specified */ btr_cur_t* cursor, /*!< in: cursor on page after which to insert; cursor stays valid */ + ulint** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ dtuple_t* entry, /*!< in/out: entry to insert */ rec_t** rec, /*!< out: pointer to inserted record if succeed */ @@ -1193,13 +1258,16 @@ btr_cur_optimistic_insert( ibool inherit; ulint zip_size; ulint rec_size; - ulint err; + dberr_t err; *big_rec = NULL; block = btr_cur_get_block(cursor); page = buf_block_get_frame(block); index = cursor->index; + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); zip_size = buf_block_get_zip_size(block); #ifdef UNIV_DEBUG_VALGRIND if (zip_size) { @@ -1214,7 +1282,7 @@ btr_cur_optimistic_insert( } #ifdef UNIV_DEBUG if (btr_cur_print_record_ops && thr) { - btr_cur_trx_report(thr_get_trx(thr), index, "insert into "); + btr_cur_trx_report(thr_get_trx(thr)->id, index, "insert "); dtuple_print(stderr, entry); } #endif /* UNIV_DEBUG */ @@ -1276,6 +1344,9 @@ btr_cur_optimistic_insert( } } + LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), + goto fail); + /* If there have been many consecutive inserts, and we are on the leaf level, check if we have to split the page to reserve enough free space for future updates of records. */ @@ -1305,6 +1376,15 @@ fail_err: goto fail; } + /* If compression padding tells us that insertion will result in + too packed up page i.e.: which is likely to cause compression + failure then don't do an optimistic insertion. 
*/ + if (zip_size && leaf + && (page_get_data_size(page) + rec_size + >= dict_index_zip_pad_optimal_page_size(index))) { + + goto fail; + } /* Check locks and write to the undo log, if specified */ err = btr_cur_ins_lock_and_undo(flags, cursor, entry, thr, mtr, &inherit); @@ -1321,7 +1401,7 @@ fail_err: { const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor); *rec = page_cur_tuple_insert(page_cursor, entry, index, - n_ext, mtr); + offsets, heap, n_ext, mtr); reorg = page_cursor_rec != page_cur_get_rec(page_cursor); if (UNIV_UNLIKELY(reorg)) { @@ -1351,7 +1431,7 @@ fail_err: page_cur_search(block, index, entry, PAGE_CUR_LE, page_cursor); *rec = page_cur_tuple_insert(page_cursor, entry, index, - n_ext, mtr); + offsets, heap, n_ext, mtr); if (UNIV_UNLIKELY(!*rec)) { if (zip_size != 0) { @@ -1426,7 +1506,7 @@ made on the leaf level, to avoid deadlocks, mtr must also own x-latches to brothers of page, if those brothers exist. @return DB_SUCCESS or error number */ UNIV_INTERN -ulint +dberr_t btr_cur_pessimistic_insert( /*=======================*/ ulint flags, /*!< in: undo logging and locking flags: if not @@ -1437,6 +1517,9 @@ btr_cur_pessimistic_insert( insertion will certainly succeed */ btr_cur_t* cursor, /*!< in: cursor after which to insert; cursor stays valid */ + ulint** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap + that can be emptied, or NULL */ dtuple_t* entry, /*!< in/out: entry to insert */ rec_t** rec, /*!< out: pointer to inserted record if succeed */ @@ -1450,8 +1533,7 @@ btr_cur_pessimistic_insert( dict_index_t* index = cursor->index; ulint zip_size = dict_table_zip_size(index->table); big_rec_t* big_rec_vec = NULL; - mem_heap_t* heap = NULL; - ulint err; + dberr_t err; ibool dummy_inh; ibool success; ulint n_extents = 0; @@ -1466,6 +1548,9 @@ btr_cur_pessimistic_insert( MTR_MEMO_X_LOCK)); ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor), MTR_MEMO_PAGE_X_FIX)); + 
ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); cursor->flag = BTR_CUR_BINARY; @@ -1523,13 +1608,11 @@ btr_cur_pessimistic_insert( == buf_block_get_page_no(btr_cur_get_block(cursor))) { /* The page is the root page */ - *rec = btr_root_raise_and_insert(cursor, entry, n_ext, mtr); + *rec = btr_root_raise_and_insert( + flags, cursor, offsets, heap, entry, n_ext, mtr); } else { - *rec = btr_page_split_and_insert(cursor, entry, n_ext, mtr); - } - - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); + *rec = btr_page_split_and_insert( + flags, cursor, offsets, heap, entry, n_ext, mtr); } ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec); @@ -1556,29 +1639,36 @@ btr_cur_pessimistic_insert( /*************************************************************//** For an update, checks the locks and does the undo logging. @return DB_SUCCESS, DB_WAIT_LOCK, or error number */ -UNIV_INLINE -ulint +UNIV_INLINE __attribute__((warn_unused_result, nonnull(2,3,6,7))) +dberr_t btr_cur_upd_lock_and_undo( /*======================*/ ulint flags, /*!< in: undo logging and locking flags */ btr_cur_t* cursor, /*!< in: cursor on record to update */ + const ulint* offsets,/*!< in: rec_get_offsets() on cursor */ const upd_t* update, /*!< in: update vector */ ulint cmpl_info,/*!< in: compiler info on secondary index updates */ - que_thr_t* thr, /*!< in: query thread */ + que_thr_t* thr, /*!< in: query thread + (can be NULL if BTR_NO_LOCKING_FLAG) */ mtr_t* mtr, /*!< in/out: mini-transaction */ roll_ptr_t* roll_ptr)/*!< out: roll pointer */ { dict_index_t* index; - rec_t* rec; - ulint err; + const rec_t* rec; + dberr_t err; - ut_ad(cursor && update && thr && roll_ptr); + ut_ad(thr || (flags & BTR_NO_LOCKING_FLAG)); rec = btr_cur_get_rec(cursor); index = cursor->index; + ut_ad(rec_offs_validate(rec, index, offsets)); + if (!dict_index_is_clust(index)) { + ut_ad(dict_index_is_online_ddl(index) + == !!(flags & BTR_CREATE_FLAG)); + /* We 
do undo logging only when we update a clustered index record */ return(lock_sec_rec_modify_check_and_lock( @@ -1589,50 +1679,39 @@ btr_cur_upd_lock_and_undo( /* Check if we have to wait for a lock: enqueue an explicit lock request if yes */ - err = DB_SUCCESS; - if (!(flags & BTR_NO_LOCKING_FLAG)) { - mem_heap_t* heap = NULL; - ulint offsets_[REC_OFFS_NORMAL_SIZE]; - rec_offs_init(offsets_); - err = lock_clust_rec_modify_check_and_lock( flags, btr_cur_get_block(cursor), rec, index, - rec_get_offsets(rec, index, offsets_, - ULINT_UNDEFINED, &heap), thr); - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } + offsets, thr); if (err != DB_SUCCESS) { - return(err); } } /* Append the info about the update in the undo log */ - err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr, - index, NULL, update, - cmpl_info, rec, roll_ptr); - return(err); + return(trx_undo_report_row_operation( + flags, TRX_UNDO_MODIFY_OP, thr, + index, NULL, update, + cmpl_info, rec, offsets, roll_ptr)); } /***********************************************************//** Writes a redo log record of updating a record in-place. 
*/ -UNIV_INLINE +UNIV_INLINE __attribute__((nonnull)) void btr_cur_update_in_place_log( /*========================*/ ulint flags, /*!< in: flags */ - rec_t* rec, /*!< in: record */ - dict_index_t* index, /*!< in: index where cursor positioned */ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in: index of the record */ const upd_t* update, /*!< in: update vector */ - trx_t* trx, /*!< in: transaction */ + trx_id_t trx_id, /*!< in: transaction id */ roll_ptr_t roll_ptr, /*!< in: roll ptr */ mtr_t* mtr) /*!< in: mtr */ { - byte* log_ptr; - page_t* page = page_align(rec); + byte* log_ptr; + const page_t* page = page_align(rec); ut_ad(flags < 256); ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); @@ -1657,8 +1736,8 @@ btr_cur_update_in_place_log( mach_write_to_1(log_ptr, flags); log_ptr++; - log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr, - mtr); + log_ptr = row_upd_write_sys_vals_to_log( + index, trx_id, roll_ptr, log_ptr, mtr); mach_write_to_2(log_ptr, page_offset(rec)); log_ptr += 2; @@ -1761,6 +1840,13 @@ btr_cur_update_alloc_zip( FALSE=update-in-place */ mtr_t* mtr) /*!< in: mini-transaction */ { + + /* Have a local copy of the variables as these can change + dynamically. */ + bool log_compressed = page_log_compressed_pages; + ulint compression_level = page_compression_level; + page_t* page = buf_block_get_frame(block); + ut_a(page_zip == buf_block_get_page_zip(block)); ut_ad(page_zip); ut_ad(!dict_index_is_ibuf(index)); @@ -1776,12 +1862,27 @@ btr_cur_update_alloc_zip( return(FALSE); } - if (!page_zip_compress(page_zip, buf_block_get_frame(block), - index, mtr)) { + page = buf_block_get_frame(block); + + if (create && page_is_leaf(page) + && (length + page_get_data_size(page) + >= dict_index_zip_pad_optimal_page_size(index))) { + + return(FALSE); + } + + if (!page_zip_compress( + page_zip, page, index, compression_level, + log_compressed ? 
mtr : NULL)) { /* Unable to compress the page */ return(FALSE); } + if (mtr && !log_compressed) { + page_zip_compress_write_log_no_data( + compression_level, page, index, mtr); + } + /* After recompressing a page, we must make sure that the free bits in the insert buffer bitmap will not exceed the free space on the page. Because this function will not attempt @@ -1795,8 +1896,7 @@ btr_cur_update_alloc_zip( if (!page_zip_available(page_zip, dict_index_is_clust(index), length, create)) { /* Out of space: reset the free bits. */ - if (!dict_index_is_clust(index) - && page_is_leaf(buf_block_get_frame(block))) { + if (!dict_index_is_clust(index) && page_is_leaf(page)) { ibuf_reset_free_bits(block); } return(FALSE); @@ -1810,45 +1910,50 @@ Updates a record when the update causes no size changes in its fields. We assume here that the ordering fields of the record do not change. @return DB_SUCCESS or error number */ UNIV_INTERN -ulint +dberr_t btr_cur_update_in_place( /*====================*/ ulint flags, /*!< in: undo logging and locking flags */ btr_cur_t* cursor, /*!< in: cursor on the record to update; cursor stays valid and positioned on the same record */ + const ulint* offsets,/*!< in: offsets on cursor->page_cur.rec */ const upd_t* update, /*!< in: update vector */ ulint cmpl_info,/*!< in: compiler info on secondary index updates */ - que_thr_t* thr, /*!< in: query thread */ + que_thr_t* thr, /*!< in: query thread, or NULL if + appropriate flags are set */ + trx_id_t trx_id, /*!< in: transaction id */ mtr_t* mtr) /*!< in: mtr; must be committed before latching any further pages */ { dict_index_t* index; buf_block_t* block; page_zip_des_t* page_zip; - ulint err; + dberr_t err; rec_t* rec; roll_ptr_t roll_ptr = 0; - trx_t* trx; ulint was_delete_marked; ibool is_hashed; - mem_heap_t* heap = NULL; - ulint offsets_[REC_OFFS_NORMAL_SIZE]; - ulint* offsets = offsets_; - rec_offs_init(offsets_); rec = btr_cur_get_rec(cursor); index = cursor->index; + 
ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); /* The insert buffer tree should never be updated in place. */ ut_ad(!dict_index_is_ibuf(index)); + ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG) + || dict_index_is_clust(index)); + ut_ad(!thr || thr_get_trx(thr)->id == trx_id); + ut_ad(thr || (flags & ~BTR_KEEP_POS_FLAG) + == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG)); + ut_ad(fil_page_get_type(btr_cur_get_page(cursor)) == FIL_PAGE_INDEX); + ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id); - trx = thr_get_trx(thr); - offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); #ifdef UNIV_DEBUG - if (btr_cur_print_record_ops && thr) { - btr_cur_trx_report(trx, index, "update "); + if (btr_cur_print_record_ops) { + btr_cur_trx_report(trx_id, index, "update "); rec_print_new(stderr, rec, offsets); } #endif /* UNIV_DEBUG */ @@ -1864,19 +1969,17 @@ btr_cur_update_in_place( } /* Do lock checking and undo logging */ - err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info, + err = btr_cur_upd_lock_and_undo(flags, cursor, offsets, + update, cmpl_info, thr, mtr, &roll_ptr); if (UNIV_UNLIKELY(err != DB_SUCCESS)) { - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } return(err); } if (!(flags & BTR_KEEP_SYS_FLAG)) { - row_upd_rec_sys_fields(rec, NULL, - index, offsets, trx, roll_ptr); + row_upd_rec_sys_fields(rec, NULL, index, offsets, + thr_get_trx(thr), roll_ptr); } was_delete_marked = rec_get_deleted_flag( @@ -1917,7 +2020,7 @@ btr_cur_update_in_place( } btr_cur_update_in_place_log(flags, rec, index, update, - trx, roll_ptr, mtr); + trx_id, roll_ptr, mtr); if (was_delete_marked && !rec_get_deleted_flag( @@ -1929,9 +2032,6 @@ btr_cur_update_in_place( rec, index, offsets, mtr); } - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } return(DB_SUCCESS); } @@ -1945,24 +2045,28 @@ fields of the 
record do not change. DB_UNDERFLOW if the page would become too empty, or DB_ZIP_OVERFLOW if there is not enough space left on the compressed page */ UNIV_INTERN -ulint +dberr_t btr_cur_optimistic_update( /*======================*/ ulint flags, /*!< in: undo logging and locking flags */ btr_cur_t* cursor, /*!< in: cursor on the record to update; cursor stays valid and positioned on the same record */ + ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ const upd_t* update, /*!< in: update vector; this must also contain trx id and roll ptr fields */ ulint cmpl_info,/*!< in: compiler info on secondary index updates */ - que_thr_t* thr, /*!< in: query thread */ + que_thr_t* thr, /*!< in: query thread, or NULL if + appropriate flags are set */ + trx_id_t trx_id, /*!< in: transaction id */ mtr_t* mtr) /*!< in: mtr; must be committed before latching any further pages */ { dict_index_t* index; page_cur_t* page_cursor; - ulint err; + dberr_t err; buf_block_t* block; page_t* page; page_zip_des_t* page_zip; @@ -1972,10 +2076,8 @@ btr_cur_optimistic_update( ulint old_rec_size; dtuple_t* new_entry; roll_ptr_t roll_ptr; - mem_heap_t* heap; ulint i; ulint n_ext; - ulint* offsets; block = btr_cur_get_block(cursor); page = buf_block_get_frame(block); @@ -1985,39 +2087,46 @@ btr_cur_optimistic_update( ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); /* The insert buffer tree should never be updated in place. 
*/ ut_ad(!dict_index_is_ibuf(index)); - - heap = mem_heap_create(1024); - offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); + ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG) + || dict_index_is_clust(index)); + ut_ad(!thr || thr_get_trx(thr)->id == trx_id); + ut_ad(thr || (flags & ~BTR_KEEP_POS_FLAG) + == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG)); + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_ad(btr_page_get_index_id(page) == index->id); + + *offsets = rec_get_offsets(rec, index, *offsets, + ULINT_UNDEFINED, heap); #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG - ut_a(!rec_offs_any_null_extern(rec, offsets) + ut_a(!rec_offs_any_null_extern(rec, *offsets) || trx_is_recv(thr_get_trx(thr))); #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ #ifdef UNIV_DEBUG - if (btr_cur_print_record_ops && thr) { - btr_cur_trx_report(thr_get_trx(thr), index, "update "); - rec_print_new(stderr, rec, offsets); + if (btr_cur_print_record_ops) { + btr_cur_trx_report(trx_id, index, "update "); + rec_print_new(stderr, rec, *offsets); } #endif /* UNIV_DEBUG */ - if (!row_upd_changes_field_size_or_external(index, offsets, update)) { + if (!row_upd_changes_field_size_or_external(index, *offsets, update)) { /* The simplest and the most common case: the update does not change the size of any field and none of the updated fields is externally stored in rec or update, and there is enough space on the compressed page to log the update. 
*/ - mem_heap_free(heap); - return(btr_cur_update_in_place(flags, cursor, update, - cmpl_info, thr, mtr)); + return(btr_cur_update_in_place( + flags, cursor, *offsets, update, + cmpl_info, thr, trx_id, mtr)); } - if (rec_offs_any_extern(offsets)) { + if (rec_offs_any_extern(*offsets)) { any_extern: /* Externally stored fields are treated in pessimistic update */ - mem_heap_free(heap); return(DB_OVERFLOW); } @@ -2030,8 +2139,14 @@ any_extern: page_cursor = btr_cur_get_page_cur(cursor); - new_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets, - &n_ext, heap); + if (!*heap) { + *heap = mem_heap_create( + rec_offs_size(*offsets) + + DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets))); + } + + new_entry = row_rec_to_index_entry(rec, index, *offsets, + &n_ext, *heap); /* We checked above that there are no externally stored fields. */ ut_a(!n_ext); @@ -2039,8 +2154,8 @@ any_extern: corresponding to new_entry is latched in mtr. Thus the following call is safe. */ row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update, - FALSE, heap); - old_rec_size = rec_offs_size(offsets); + FALSE, *heap); + old_rec_size = rec_offs_size(*offsets); new_rec_size = rec_get_converted_size(index, new_entry, 0); page_zip = buf_block_get_page_zip(block); @@ -2051,16 +2166,14 @@ any_extern: if (page_zip && !btr_cur_update_alloc_zip(page_zip, block, index, new_rec_size, TRUE, mtr)) { - err = DB_ZIP_OVERFLOW; - goto err_exit; + return(DB_ZIP_OVERFLOW); } if (UNIV_UNLIKELY(new_rec_size >= (page_get_free_space_of_empty(page_is_comp(page)) / 2))) { - err = DB_OVERFLOW; - goto err_exit; + return(DB_OVERFLOW); } if (UNIV_UNLIKELY(page_get_data_size(page) @@ -2069,8 +2182,7 @@ any_extern: /* The page would become too empty */ - err = DB_UNDERFLOW; - goto err_exit; + return(DB_UNDERFLOW); } /* We do not attempt to reorganize if the page is compressed. 
@@ -2088,16 +2200,16 @@ any_extern: reorganize: for simplicity, we decide what to do assuming a reorganization is needed, though it might not be necessary */ - err = DB_OVERFLOW; - goto err_exit; + return(DB_OVERFLOW); } /* Do lock checking and undo logging */ - err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info, + err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets, + update, cmpl_info, thr, mtr, &roll_ptr); if (err != DB_SUCCESS) { - goto err_exit; + return(err); } /* Ok, we may do the replacement. Store on the page infimum the @@ -2108,13 +2220,7 @@ any_extern: btr_search_update_hash_on_delete(cursor); - /* The call to row_rec_to_index_entry(ROW_COPY_DATA, ...) above - invokes rec_offs_make_valid() to point to the copied record that - the fields of new_entry point to. We have to undo it here. */ - ut_ad(rec_offs_validate(NULL, index, offsets)); - rec_offs_make_valid(page_cur_get_rec(page_cursor), index, offsets); - - page_cur_delete_rec(page_cursor, index, offsets, mtr); + page_cur_delete_rec(page_cursor, index, *offsets, mtr); page_cur_move_to_prev(page_cursor); @@ -2122,11 +2228,12 @@ any_extern: row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR, roll_ptr); row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID, - thr_get_trx(thr)->id); + trx_id); } /* There are no externally stored columns in new_entry */ - rec = btr_cur_insert_if_possible(cursor, new_entry, 0/*n_ext*/, mtr); + rec = btr_cur_insert_if_possible( + cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr); ut_a(rec); /* <- We calculated above the insert would fit */ if (page_zip && !dict_index_is_clust(index) @@ -2141,10 +2248,7 @@ any_extern: page_cur_move_to_next(page_cursor); - err = DB_SUCCESS; -err_exit: - mem_heap_free(heap); - return(err); + return(DB_SUCCESS); } /*************************************************************//** @@ -2203,7 +2307,7 @@ own x-latches to brothers of page, if those brothers exist. 
We assume here that the ordering fields of the record do not change. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t btr_cur_pessimistic_update( /*=======================*/ ulint flags, /*!< in: undo logging, locking, and rollback @@ -2211,7 +2315,13 @@ btr_cur_pessimistic_update( btr_cur_t* cursor, /*!< in/out: cursor on the record to update; cursor may become invalid if *big_rec == NULL || !(flags & BTR_KEEP_POS_FLAG) */ - mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** offsets_heap, + /*!< in/out: pointer to memory heap + that can be emptied, or NULL */ + mem_heap_t* entry_heap, + /*!< in/out: memory heap for allocating + big_rec and the index tuple */ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to be stored externally by the caller, or NULL */ const upd_t* update, /*!< in: update vector; this is allowed also @@ -2219,7 +2329,9 @@ btr_cur_pessimistic_update( the values in update vector have no effect */ ulint cmpl_info,/*!< in: compiler info on secondary index updates */ - que_thr_t* thr, /*!< in: query thread */ + que_thr_t* thr, /*!< in: query thread, or NULL if + appropriate flags are set */ + trx_id_t trx_id, /*!< in: transaction id */ mtr_t* mtr) /*!< in: mtr; must be committed before latching any further pages */ { @@ -2231,17 +2343,15 @@ btr_cur_pessimistic_update( page_zip_des_t* page_zip; rec_t* rec; page_cur_t* page_cursor; - dtuple_t* new_entry; - ulint err; - ulint optim_err; + dberr_t err; + dberr_t optim_err; roll_ptr_t roll_ptr; - trx_t* trx; ibool was_first; ulint n_extents = 0; ulint n_reserved; ulint n_ext; - ulint* offsets = NULL; + *offsets = NULL; *big_rec = NULL; block = btr_cur_get_block(cursor); @@ -2258,9 +2368,16 @@ btr_cur_pessimistic_update( #endif /* UNIV_ZIP_DEBUG */ /* The insert buffer tree should never be updated in place. 
*/ ut_ad(!dict_index_is_ibuf(index)); + ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG) + || dict_index_is_clust(index)); + ut_ad(!thr || thr_get_trx(thr)->id == trx_id); + ut_ad(thr || (flags & ~BTR_KEEP_POS_FLAG) + == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG)); - optim_err = btr_cur_optimistic_update(flags, cursor, update, - cmpl_info, thr, mtr); + optim_err = btr_cur_optimistic_update( + flags, cursor, offsets, offsets_heap, update, + cmpl_info, thr, trx_id, mtr); switch (optim_err) { case DB_UNDERFLOW: @@ -2272,7 +2389,8 @@ btr_cur_pessimistic_update( } /* Do lock checking and undo logging */ - err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info, + err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets, + update, cmpl_info, thr, mtr, &roll_ptr); if (err != DB_SUCCESS) { @@ -2300,20 +2418,11 @@ btr_cur_pessimistic_update( } } - if (!*heap) { - *heap = mem_heap_create(1024); - } - offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, heap); - - trx = thr_get_trx(thr); + *offsets = rec_get_offsets( + rec, index, *offsets, ULINT_UNDEFINED, offsets_heap); - new_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets, - &n_ext, *heap); - /* The call to row_rec_to_index_entry(ROW_COPY_DATA, ...) above - invokes rec_offs_make_valid() to point to the copied record that - the fields of new_entry point to. We have to undo it here. */ - ut_ad(rec_offs_validate(NULL, index, offsets)); - rec_offs_make_valid(rec, index, offsets); + dtuple_t* new_entry = row_rec_to_index_entry( + rec, index, *offsets, &n_ext, entry_heap); /* The page containing the clustered index record corresponding to new_entry is latched in mtr. If the @@ -2322,15 +2431,15 @@ btr_cur_pessimistic_update( purge would also have removed the clustered index record itself. Thus the following call is safe. 
*/ row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update, - FALSE, *heap); + FALSE, entry_heap); if (!(flags & BTR_KEEP_SYS_FLAG)) { row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR, roll_ptr); row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID, - trx->id); + trx_id); } - if ((flags & BTR_NO_UNDO_LOG_FLAG) && rec_offs_any_extern(offsets)) { + if ((flags & BTR_NO_UNDO_LOG_FLAG) && rec_offs_any_extern(*offsets)) { /* We are in a transaction rollback undoing a row update: we must free possible externally stored fields which got new values in the update, if they are not @@ -2341,16 +2450,17 @@ btr_cur_pessimistic_update( ut_ad(big_rec_vec == NULL); btr_rec_free_updated_extern_fields( - index, rec, page_zip, offsets, update, - trx_is_recv(trx) ? RB_RECOVERY : RB_NORMAL, mtr); + index, rec, page_zip, *offsets, update, + trx_is_recv(thr_get_trx(thr)) + ? RB_RECOVERY : RB_NORMAL, mtr); } /* We have to set appropriate extern storage bits in the new record to be inserted: we have to remember which fields were such */ ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec)); - ut_ad(rec_offs_validate(rec, index, offsets)); - n_ext += btr_push_update_extern_fields(new_entry, update, *heap); + ut_ad(rec_offs_validate(rec, index, *offsets)); + n_ext += btr_push_update_extern_fields(new_entry, update, entry_heap); if (page_zip) { ut_ad(page_is_comp(page)); @@ -2396,11 +2506,12 @@ make_external: #endif /* UNIV_ZIP_DEBUG */ page_cursor = btr_cur_get_page_cur(cursor); - page_cur_delete_rec(page_cursor, index, offsets, mtr); + page_cur_delete_rec(page_cursor, index, *offsets, mtr); page_cur_move_to_prev(page_cursor); - rec = btr_cur_insert_if_possible(cursor, new_entry, n_ext, mtr); + rec = btr_cur_insert_if_possible(cursor, new_entry, + offsets, offsets_heap, n_ext, mtr); if (rec) { page_cursor->rec = rec; @@ -2408,20 +2519,19 @@ make_external: lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor), rec, block); - offsets = 
rec_get_offsets(rec, index, offsets, - ULINT_UNDEFINED, heap); - - if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) { + if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) { /* The new inserted record owns its possible externally stored fields */ - btr_cur_unmark_extern_fields(page_zip, - rec, index, offsets, mtr); + btr_cur_unmark_extern_fields( + page_zip, rec, index, *offsets, mtr); } - btr_cur_compress_if_useful( - cursor, - big_rec_vec != NULL && (flags & BTR_KEEP_POS_FLAG), - mtr); + bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG); + + if (btr_cur_compress_if_useful(cursor, adjust, mtr) + && adjust) { + rec_offs_make_valid(page_cursor->rec, index, *offsets); + } if (page_zip && !dict_index_is_clust(index) && page_is_leaf(page)) { @@ -2440,8 +2550,7 @@ make_external: ut_a(page_zip || optim_err != DB_UNDERFLOW); /* Out of space: reset the free bits. */ - if (!dict_index_is_clust(index) - && page_is_leaf(page)) { + if (!dict_index_is_clust(index) && page_is_leaf(page)) { ibuf_reset_free_bits(block); } } @@ -2473,11 +2582,13 @@ make_external: err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG, - cursor, new_entry, &rec, + cursor, offsets, offsets_heap, + new_entry, &rec, &dummy_big_rec, n_ext, NULL, mtr); ut_a(rec); ut_a(err == DB_SUCCESS); ut_a(dummy_big_rec == NULL); + ut_ad(rec_offs_validate(rec, cursor->index, *offsets)); page_cursor->rec = rec; if (dict_index_is_sec_or_ibuf(index)) { @@ -2490,10 +2601,10 @@ make_external: page_update_max_trx_id(rec_block, buf_block_get_page_zip(rec_block), - trx->id, mtr); + trx_id, mtr); } - if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) { + if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) { /* The new inserted record owns its possible externally stored fields */ buf_block_t* rec_block = btr_cur_get_block(cursor); @@ -2504,10 +2615,8 @@ make_external: #endif /* UNIV_ZIP_DEBUG */ page_zip = buf_block_get_page_zip(rec_block); - offsets = 
rec_get_offsets(rec, index, offsets, - ULINT_UNDEFINED, heap); btr_cur_unmark_extern_fields(page_zip, - rec, index, offsets, mtr); + rec, index, *offsets, mtr); } lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor), @@ -2546,17 +2655,13 @@ UNIV_INLINE void btr_cur_del_mark_set_clust_rec_log( /*===============================*/ - ulint flags, /*!< in: flags */ rec_t* rec, /*!< in: record */ dict_index_t* index, /*!< in: index of the record */ - ibool val, /*!< in: value to set */ - trx_t* trx, /*!< in: deleting transaction */ + trx_id_t trx_id, /*!< in: transaction id */ roll_ptr_t roll_ptr,/*!< in: roll ptr to the undo log record */ mtr_t* mtr) /*!< in: mtr */ { byte* log_ptr; - ut_ad(flags < 256); - ut_ad(val <= 1); ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); @@ -2572,13 +2677,11 @@ btr_cur_del_mark_set_clust_rec_log( return; } - mach_write_to_1(log_ptr, flags); - log_ptr++; - mach_write_to_1(log_ptr, val); - log_ptr++; + *log_ptr++ = 0; + *log_ptr++ = 1; - log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr, - mtr); + log_ptr = row_upd_write_sys_vals_to_log( + index, trx_id, roll_ptr, log_ptr, mtr); mach_write_to_2(log_ptr, page_offset(rec)); log_ptr += 2; @@ -2675,20 +2778,18 @@ of the deleting transaction, and in the roll ptr field pointer to the undo log record created. 
@return DB_SUCCESS, DB_LOCK_WAIT, or error number */ UNIV_INTERN -ulint +dberr_t btr_cur_del_mark_set_clust_rec( /*===========================*/ - ulint flags, /*!< in: undo logging and locking flags */ buf_block_t* block, /*!< in/out: buffer block of the record */ rec_t* rec, /*!< in/out: record */ dict_index_t* index, /*!< in: clustered index of the record */ const ulint* offsets,/*!< in: rec_get_offsets(rec) */ - ibool val, /*!< in: value to set */ que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr) /*!< in: mtr */ { roll_ptr_t roll_ptr; - ulint err; + dberr_t err; page_zip_des_t* page_zip; trx_t* trx; @@ -2700,7 +2801,7 @@ btr_cur_del_mark_set_clust_rec( #ifdef UNIV_DEBUG if (btr_cur_print_record_ops && thr) { - btr_cur_trx_report(thr_get_trx(thr), index, "del mark "); + btr_cur_trx_report(thr_get_trx(thr)->id, index, "del mark "); rec_print_new(stderr, rec, offsets); } #endif /* UNIV_DEBUG */ @@ -2708,7 +2809,7 @@ btr_cur_del_mark_set_clust_rec( ut_ad(dict_index_is_clust(index)); ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets))); - err = lock_clust_rec_modify_check_and_lock(flags, block, + err = lock_clust_rec_modify_check_and_lock(BTR_NO_LOCKING_FLAG, block, rec, index, offsets, thr); if (err != DB_SUCCESS) { @@ -2716,8 +2817,8 @@ btr_cur_del_mark_set_clust_rec( return(err); } - err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr, - index, NULL, NULL, 0, rec, + err = trx_undo_report_row_operation(0, TRX_UNDO_MODIFY_OP, thr, + index, NULL, NULL, 0, rec, offsets, &roll_ptr); if (err != DB_SUCCESS) { @@ -2730,17 +2831,21 @@ btr_cur_del_mark_set_clust_rec( page_zip = buf_block_get_page_zip(block); - btr_blob_dbg_set_deleted_flag(rec, index, offsets, val); - btr_rec_set_deleted_flag(rec, page_zip, val); + btr_blob_dbg_set_deleted_flag(rec, index, offsets, TRUE); + btr_rec_set_deleted_flag(rec, page_zip, TRUE); trx = thr_get_trx(thr); - if (!(flags & BTR_KEEP_SYS_FLAG)) { - row_upd_rec_sys_fields(rec, page_zip, - index, offsets, trx, 
roll_ptr); + if (dict_index_is_online_ddl(index)) { + row_log_table_delete( + rec, index, offsets, + trx_read_trx_id(row_get_trx_id_offset(index, offsets) + + rec)); } - btr_cur_del_mark_set_clust_rec_log(flags, rec, index, val, trx, + row_upd_rec_sys_fields(rec, page_zip, index, offsets, trx, roll_ptr); + + btr_cur_del_mark_set_clust_rec_log(rec, index, trx->id, roll_ptr, mtr); return(err); @@ -2829,7 +2934,7 @@ btr_cur_parse_del_mark_set_sec_rec( Sets a secondary index record delete mark to TRUE or FALSE. @return DB_SUCCESS, DB_LOCK_WAIT, or error number */ UNIV_INTERN -ulint +dberr_t btr_cur_del_mark_set_sec_rec( /*=========================*/ ulint flags, /*!< in: locking flag */ @@ -2840,14 +2945,14 @@ btr_cur_del_mark_set_sec_rec( { buf_block_t* block; rec_t* rec; - ulint err; + dberr_t err; block = btr_cur_get_block(cursor); rec = btr_cur_get_rec(cursor); #ifdef UNIV_DEBUG if (btr_cur_print_record_ops && thr) { - btr_cur_trx_report(thr_get_trx(thr), cursor->index, + btr_cur_trx_report(thr_get_trx(thr)->id, cursor->index, "del mark "); rec_print(stderr, rec, cursor->index); } @@ -2937,12 +3042,15 @@ positioned, but no latch on the whole tree. 
@return TRUE if success, i.e., the page did not become too empty */ UNIV_INTERN ibool -btr_cur_optimistic_delete( -/*======================*/ +btr_cur_optimistic_delete_func( +/*===========================*/ btr_cur_t* cursor, /*!< in: cursor on leaf page, on the record to delete; cursor stays valid: if deletion succeeds, on function exit it points to the successor of the deleted record */ +#ifdef UNIV_DEBUG + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ +#endif /* UNIV_DEBUG */ mtr_t* mtr) /*!< in: mtr; if this function returns TRUE on a leaf page of a secondary index, the mtr must be committed @@ -2956,6 +3064,7 @@ btr_cur_optimistic_delete( ibool no_compress_needed; rec_offs_init(offsets_); + ut_ad(flags == 0 || flags == BTR_CREATE_FLAG); ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor), MTR_MEMO_PAGE_X_FIX)); /* This is intended only for leaf page deletions */ @@ -2963,6 +3072,9 @@ btr_cur_optimistic_delete( block = btr_cur_get_block(cursor); ut_ad(page_is_leaf(buf_block_get_frame(block))); + ut_ad(!dict_index_is_online_ddl(cursor->index) + || dict_index_is_clust(cursor->index) + || (flags & BTR_CREATE_FLAG)); rec = btr_cur_get_rec(cursor); offsets = rec_get_offsets(rec, cursor->index, offsets, @@ -3030,7 +3142,7 @@ UNIV_INTERN ibool btr_cur_pessimistic_delete( /*=======================*/ - ulint* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE; + dberr_t* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE; the latter may occur because we may have to update node pointers on upper levels, and in the case of variable length keys @@ -3043,6 +3155,7 @@ btr_cur_pessimistic_delete( if compression does not occur, the cursor stays valid: it points to successor of deleted record on function exit */ + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ enum trx_rb_ctx rb_ctx, /*!< in: rollback context */ mtr_t* mtr) /*!< in: mtr */ { @@ -3051,7 +3164,6 @@ btr_cur_pessimistic_delete( page_zip_des_t* page_zip; dict_index_t* index; rec_t* rec; - dtuple_t* node_ptr; ulint 
n_extents = 0; ulint n_reserved; ibool success; @@ -3064,6 +3176,10 @@ btr_cur_pessimistic_delete( page = buf_block_get_frame(block); index = btr_cur_get_index(cursor); + ut_ad(flags == 0 || flags == BTR_CREATE_FLAG); + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), MTR_MEMO_X_LOCK)); ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); @@ -3112,13 +3228,15 @@ btr_cur_pessimistic_delete( btr_discard_page(cursor, mtr); - *err = DB_SUCCESS; ret = TRUE; goto return_after_reservations; } - lock_update_delete(block, rec); + if (flags == 0) { + lock_update_delete(block, rec); + } + level = btr_page_get_level(page, mtr); if (level > 0 @@ -3147,12 +3265,12 @@ btr_cur_pessimistic_delete( btr_node_ptr_delete(index, block, mtr); - node_ptr = dict_index_build_node_ptr( + dtuple_t* node_ptr = dict_index_build_node_ptr( index, next_rec, buf_block_get_page_no(block), heap, level); - btr_insert_on_non_leaf_level(index, - level + 1, node_ptr, mtr); + btr_insert_on_non_leaf_level( + flags, index, level + 1, node_ptr, mtr); } } @@ -3165,9 +3283,9 @@ btr_cur_pessimistic_delete( ut_ad(btr_check_node_ptr(index, block, mtr)); +return_after_reservations: *err = DB_SUCCESS; -return_after_reservations: mem_heap_free(heap); if (ret == FALSE) { @@ -3194,8 +3312,8 @@ btr_cur_add_path_info( ulint root_height) /*!< in: root node height in tree */ { btr_path_t* slot; - rec_t* rec; - page_t* page; + const rec_t* rec; + const page_t* page; ut_a(cursor->path_arr); @@ -3407,6 +3525,9 @@ btr_estimate_n_rows_in_range( ibool is_n_rows_exact; ulint i; mtr_t mtr; + ib_int64_t table_n_rows; + + table_n_rows = dict_table_get_n_rows(index->table); mtr_start(&mtr); @@ -3419,9 +3540,9 @@ btr_estimate_n_rows_in_range( &cursor, 0, __FILE__, __LINE__, &mtr); } else { - btr_cur_open_at_index_side(TRUE, index, + btr_cur_open_at_index_side(true, index, BTR_SEARCH_LEAF | BTR_ESTIMATE, - &cursor, 
&mtr); + &cursor, 0, &mtr); } mtr_commit(&mtr); @@ -3437,9 +3558,9 @@ btr_estimate_n_rows_in_range( &cursor, 0, __FILE__, __LINE__, &mtr); } else { - btr_cur_open_at_index_side(FALSE, index, + btr_cur_open_at_index_side(false, index, BTR_SEARCH_LEAF | BTR_ESTIMATE, - &cursor, &mtr); + &cursor, 0, &mtr); } mtr_commit(&mtr); @@ -3471,20 +3592,21 @@ btr_estimate_n_rows_in_range( n_rows = n_rows * 2; } + DBUG_EXECUTE_IF("bug14007649", return(n_rows);); + /* Do not estimate the number of rows in the range to over 1 / 2 of the estimated rows in the whole table */ - if (n_rows > index->table->stat_n_rows / 2 - && !is_n_rows_exact) { + if (n_rows > table_n_rows / 2 && !is_n_rows_exact) { - n_rows = index->table->stat_n_rows / 2; + n_rows = table_n_rows / 2; /* If there are just 0 or 1 rows in the table, then we estimate all rows are in the range */ if (n_rows == 0) { - n_rows = index->table->stat_n_rows; + n_rows = table_n_rows; } } @@ -3544,9 +3666,9 @@ btr_estimate_n_rows_in_range( /*******************************************************************//** Record the number of non_null key values in a given index for -each n-column prefix of the index where n < dict_index_get_n_unique(index). +each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index). The estimates are eventually stored in the array: -index->stat_n_non_null_key_vals. */ +index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. 
*/ static void btr_record_not_null_field_in_rec( @@ -3557,7 +3679,7 @@ btr_record_not_null_field_in_rec( const ulint* offsets, /*!< in: rec_get_offsets(rec, index), its size could be for all fields or that of "n_unique" */ - ib_int64_t* n_not_null) /*!< in/out: array to record number of + ib_uint64_t* n_not_null) /*!< in/out: array to record number of not null rows for n-column prefix */ { ulint i; @@ -3579,11 +3701,12 @@ btr_record_not_null_field_in_rec( /*******************************************************************//** Estimates the number of different key values in a given index, for -each n-column prefix of the index where n <= dict_index_get_n_unique(index). -The estimates are stored in the array index->stat_n_diff_key_vals[] and -the number of pages that were sampled is saved in index->stat_n_sample_sizes[]. -If innodb_stats_method is "nulls_ignored", we also record the number of -non-null values for each prefix and store the estimates in +each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index). +The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed +0..n_uniq-1) and the number of pages that were sampled is saved in +index->stat_n_sample_sizes[]. +If innodb_stats_method is nulls_ignored, we also record the number of +non-null values for each prefix and stored the estimates in array index->stat_n_non_null_key_vals. 
*/ UNIV_INTERN void @@ -3597,8 +3720,8 @@ btr_estimate_number_of_different_key_vals( ulint n_cols; ulint matched_fields; ulint matched_bytes; - ib_int64_t* n_diff; - ib_int64_t* n_not_null; + ib_uint64_t* n_diff; + ib_uint64_t* n_not_null; ibool stats_null_not_equal; ullint n_sample_pages; /* number of pages to sample */ ulint not_empty_flag = 0; @@ -3614,13 +3737,13 @@ btr_estimate_number_of_different_key_vals( n_cols = dict_index_get_n_unique(index); heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null) - * (n_cols + 1) + * n_cols + dict_index_get_n_fields(index) * (sizeof *offsets_rec + sizeof *offsets_next_rec)); - n_diff = (ib_int64_t*) mem_heap_zalloc(heap, (n_cols + 1) - * sizeof(ib_int64_t)); + n_diff = (ib_uint64_t*) mem_heap_zalloc( + heap, n_cols * sizeof(ib_int64_t)); n_not_null = NULL; @@ -3629,8 +3752,8 @@ btr_estimate_number_of_different_key_vals( considered equal (by setting stats_null_not_equal value) */ switch (srv_innodb_stats_method) { case SRV_STATS_NULLS_IGNORED: - n_not_null = (ib_int64_t*) mem_heap_zalloc(heap, (n_cols + 1) - * sizeof *n_not_null); + n_not_null = (ib_uint64_t*) mem_heap_zalloc( + heap, n_cols * sizeof *n_not_null); /* fall through */ case SRV_STATS_NULLS_UNEQUAL: @@ -3681,7 +3804,7 @@ btr_estimate_number_of_different_key_vals( offsets_rec = rec_get_offsets(rec, index, offsets_rec, ULINT_UNDEFINED, &heap); - if (n_not_null) { + if (n_not_null != NULL) { btr_record_not_null_field_in_rec( n_cols, offsets_rec, n_not_null); } @@ -3709,14 +3832,14 @@ btr_estimate_number_of_different_key_vals( &matched_fields, &matched_bytes); - for (j = matched_fields + 1; j <= n_cols; j++) { + for (j = matched_fields; j < n_cols; j++) { /* We add one if this index record has a different prefix from the previous */ n_diff[j]++; } - if (n_not_null) { + if (n_not_null != NULL) { btr_record_not_null_field_in_rec( n_cols, offsets_next_rec, n_not_null); } @@ -3751,7 +3874,7 @@ btr_estimate_number_of_different_key_vals( if 
(btr_page_get_prev(page, &mtr) != FIL_NULL || btr_page_get_next(page, &mtr) != FIL_NULL) { - n_diff[n_cols]++; + n_diff[n_cols - 1]++; } } @@ -3766,7 +3889,7 @@ btr_estimate_number_of_different_key_vals( also the pages used for external storage of fields (those pages are included in index->stat_n_leaf_pages) */ - for (j = 0; j <= n_cols; j++) { + for (j = 0; j < n_cols; j++) { index->stat_n_diff_key_vals[j] = BTR_TABLE_STATS_FROM_SAMPLE( n_diff[j], index, n_sample_pages, @@ -3796,7 +3919,7 @@ btr_estimate_number_of_different_key_vals( sampled result. stat_n_non_null_key_vals[] is created and initialized to zero in dict_index_add_to_cache(), along with stat_n_diff_key_vals[] array */ - if (n_not_null != NULL && (j < n_cols)) { + if (n_not_null != NULL) { index->stat_n_non_null_key_vals[j] = BTR_TABLE_STATS_FROM_SAMPLE( n_not_null[j], index, n_sample_pages, @@ -4146,7 +4269,7 @@ The fields are stored on pages allocated from leaf node file segment of the index tree. @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ UNIV_INTERN -enum db_err +dberr_t btr_store_big_rec_extern_fields( /*============================*/ dict_index_t* index, /*!< in: index of rec; the index tree @@ -4180,7 +4303,7 @@ btr_store_big_rec_extern_fields( z_stream c_stream; buf_block_t** freed_pages = NULL; ulint n_freed_pages = 0; - enum db_err error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(rec_offs_any_extern(offsets)); @@ -4211,7 +4334,7 @@ btr_store_big_rec_extern_fields( heap = mem_heap_create(250000); page_zip_set_alloc(&c_stream, heap); - err = deflateInit2(&c_stream, Z_DEFAULT_COMPRESSION, + err = deflateInit2(&c_stream, page_compression_level, Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY); ut_a(err == Z_OK); } @@ -5083,6 +5206,7 @@ btr_copy_zblob_prefix( " page %lu space %lu\n", (ulong) fil_page_get_type(bpage->zip.data), (ulong) page_no, (ulong) space_id); + ut_ad(0); goto end_of_blob; } diff --git a/storage/innobase/btr/btr0pcur.cc 
b/storage/innobase/btr/btr0pcur.cc index 5a67afc7e69..aceb6bd1d41 100644 --- a/storage/innobase/btr/btr0pcur.cc +++ b/storage/innobase/btr/btr0pcur.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -133,6 +133,8 @@ btr_pcur_store_position( ut_a(btr_page_get_next(page, mtr) == FIL_NULL); ut_a(btr_page_get_prev(page, mtr) == FIL_NULL); + ut_ad(page_is_leaf(page)); + ut_ad(page_get_page_no(page) == index->page); cursor->old_stored = BTR_PCUR_OLD_STORED; @@ -258,7 +260,8 @@ btr_pcur_restore_position_func( btr_cur_open_at_index_side( cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE, - index, latch_mode, btr_pcur_get_btr_cur(cursor), mtr); + index, latch_mode, + btr_pcur_get_btr_cur(cursor), 0, mtr); cursor->latch_mode = latch_mode; cursor->pos_state = BTR_PCUR_IS_POSITIONED; @@ -326,13 +329,19 @@ btr_pcur_restore_position_func( /* Save the old search mode of the cursor */ old_mode = cursor->search_mode; - if (UNIV_LIKELY(cursor->rel_pos == BTR_PCUR_ON)) { + switch (cursor->rel_pos) { + case BTR_PCUR_ON: mode = PAGE_CUR_LE; - } else if (cursor->rel_pos == BTR_PCUR_AFTER) { + break; + case BTR_PCUR_AFTER: mode = PAGE_CUR_G; - } else { - ut_ad(cursor->rel_pos == BTR_PCUR_BEFORE); + break; + case BTR_PCUR_BEFORE: mode = PAGE_CUR_L; + break; + default: + ut_error; + mode = 0; } btr_pcur_open_with_no_init_func(index, tuple, mode, latch_mode, @@ -341,25 +350,39 @@ btr_pcur_restore_position_func( /* Restore the old search mode */ cursor->search_mode = old_mode; - if (cursor->rel_pos == BTR_PCUR_ON - && btr_pcur_is_on_user_rec(cursor) - && 0 == cmp_dtuple_rec(tuple, btr_pcur_get_rec(cursor), - rec_get_offsets( - 
btr_pcur_get_rec(cursor), index, - NULL, ULINT_UNDEFINED, &heap))) { - - /* We have to store the NEW value for the modify clock, since - the cursor can now be on a different page! But we can retain - the value of old_rec */ - - cursor->block_when_stored = btr_pcur_get_block(cursor); - cursor->modify_clock = buf_block_get_modify_clock( - cursor->block_when_stored); - cursor->old_stored = BTR_PCUR_OLD_STORED; - - mem_heap_free(heap); - - return(TRUE); + switch (cursor->rel_pos) { + case BTR_PCUR_ON: + if (btr_pcur_is_on_user_rec(cursor) + && !cmp_dtuple_rec( + tuple, btr_pcur_get_rec(cursor), + rec_get_offsets(btr_pcur_get_rec(cursor), + index, NULL, + ULINT_UNDEFINED, &heap))) { + + /* We have to store the NEW value for + the modify clock, since the cursor can + now be on a different page! But we can + retain the value of old_rec */ + + cursor->block_when_stored = + btr_pcur_get_block(cursor); + cursor->modify_clock = + buf_block_get_modify_clock( + cursor->block_when_stored); + cursor->old_stored = BTR_PCUR_OLD_STORED; + + mem_heap_free(heap); + + return(TRUE); + } +#ifdef UNIV_DEBUG + /* fall through */ + case BTR_PCUR_BEFORE: + case BTR_PCUR_AFTER: + break; + default: + ut_error; +#endif /* UNIV_DEBUG */ } mem_heap_free(heap); diff --git a/storage/innobase/btr/btr0sea.cc b/storage/innobase/btr/btr0sea.cc index 7e6e2ef1cb1..432fef05dd5 100644 --- a/storage/innobase/btr/btr0sea.cc +++ b/storage/innobase/btr/btr0sea.cc @@ -42,7 +42,6 @@ Created 2/17/1996 Heikki Tuuri #include "btr0pcur.h" #include "btr0btr.h" #include "ha0ha.h" -#include "srv0mon.h" /** Flag: has the search system been enabled? Protected by btr_search_latch. 
*/ @@ -1077,6 +1076,7 @@ btr_search_drop_page_hash_index( mem_heap_t* heap; const dict_index_t* index; ulint* offsets; + btr_search_t* info; #ifdef UNIV_SYNC_DEBUG ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED)); @@ -1102,6 +1102,27 @@ retry: } ut_a(!dict_index_is_ibuf(index)); +#ifdef UNIV_DEBUG + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_CREATION: + /* The index is being created (bulk loaded). */ + case ONLINE_INDEX_COMPLETE: + /* The index has been published. */ + case ONLINE_INDEX_ABORTED: + /* Either the index creation was aborted due to an + error observed by InnoDB (in which case there should + not be any adaptive hash index entries), or it was + completed and then flagged aborted in + rollback_inplace_alter_table(). */ + break; + case ONLINE_INDEX_ABORTED_DROPPED: + /* The index should have been dropped from the tablespace + already, and the adaptive hash index entries should have + been dropped as well. */ + ut_error; + } +#endif /* UNIV_DEBUG */ + table = btr_search_sys->hash_index; #ifdef UNIV_SYNC_DEBUG @@ -1196,8 +1217,9 @@ next_rec: ha_remove_all_nodes_to_page(table, folds[i], page); } - ut_a(index->search_info->ref_count > 0); - index->search_info->ref_count--; + info = btr_search_get_info(block->index); + ut_a(info->ref_count > 0); + info->ref_count--; block->index = NULL; diff --git a/storage/innobase/buf/buf0buddy.cc b/storage/innobase/buf/buf0buddy.cc index b6774aede8e..e34216dbc8f 100644 --- a/storage/innobase/buf/buf0buddy.cc +++ b/storage/innobase/buf/buf0buddy.cc @@ -335,7 +335,7 @@ buf_buddy_relocate( { buf_page_t* bpage; const ulint size = BUF_BUDDY_LOW << i; - mutex_t* mutex; + ib_mutex_t* mutex; ulint space; ulint page_no; diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 96821478e60..6efa14e6791 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -53,7 +53,6 @@ Created 11/5/1995 Heikki Tuuri #include "page0zip.h" #include "srv0mon.h" 
#include "buf0checksum.h" -#include "buf0dblwr.h" /* IMPLEMENTATION OF THE BUFFER POOL @@ -372,10 +371,6 @@ buf_get_total_list_len( buf_pool = buf_pool_from_array(i); - if (!buf_pool) { - continue; - } - *LRU_len += UT_LIST_GET_LEN(buf_pool->LRU); *free_len += UT_LIST_GET_LEN(buf_pool->free); *flush_list_len += UT_LIST_GET_LEN(buf_pool->flush_list); @@ -383,6 +378,32 @@ buf_get_total_list_len( } /********************************************************************//** +Get total list size in bytes from all buffer pools. */ +UNIV_INTERN +void +buf_get_total_list_size_in_bytes( +/*=============================*/ + buf_pools_list_size_t* buf_pools_list_size) /*!< out: list sizes + in all buffer pools */ +{ + ut_ad(buf_pools_list_size); + memset(buf_pools_list_size, 0, sizeof(*buf_pools_list_size)); + + for (ulint i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + /* We don't need mutex protection since this is + for statistics purpose */ + buf_pools_list_size->LRU_bytes += buf_pool->stat.LRU_bytes; + buf_pools_list_size->unzip_LRU_bytes += + UT_LIST_GET_LEN(buf_pool->unzip_LRU) * UNIV_PAGE_SIZE; + buf_pools_list_size->flush_list_bytes += + buf_pool->stat.flush_list_bytes; + } +} + +/********************************************************************//** Get total buffer pool statistics. 
*/ UNIV_INTERN void @@ -400,10 +421,6 @@ buf_get_total_stat( buf_pool = buf_pool_from_array(i); - if (!buf_pool) { - continue; - } - buf_stat = &buf_pool->stat; tot_stat->n_page_gets += buf_stat->n_page_gets; tot_stat->n_pages_read += buf_stat->n_pages_read; @@ -456,6 +473,8 @@ UNIV_INTERN ibool buf_page_is_corrupted( /*==================*/ + bool check_lsn, /*!< in: true if we need to check + and complain about the LSN */ const byte* read_buf, /*!< in: a database page */ ulint zip_size) /*!< in: size of compressed page; 0 for uncompressed pages */ @@ -480,14 +499,17 @@ buf_page_is_corrupted( if (recv_lsn_checks_on) { lsn_t current_lsn; - if (log_peek_lsn(¤t_lsn) - && UNIV_UNLIKELY - (current_lsn - < mach_read_from_8(read_buf + FIL_PAGE_LSN))) { + /* Since we are going to reset the page LSN during the import + phase it makes no sense to spam the log with error messages. */ + + if (check_lsn + && log_peek_lsn(¤t_lsn) + && current_lsn + < mach_read_from_8(read_buf + FIL_PAGE_LSN)) { ut_print_timestamp(stderr); fprintf(stderr, - " InnoDB: Error: page %lu log sequence number" + " InnoDB: Error: page %lu log sequence number" " " LSN_PF "\n" "InnoDB: is in the future! Current system " "log sequence number " LSN_PF ".\n" @@ -673,6 +695,8 @@ buf_page_is_corrupted( is added and not handled here */ } + DBUG_EXECUTE_IF("buf_page_is_corrupt_failure", return(TRUE); ); + return(FALSE); } @@ -885,7 +909,7 @@ pfs_register_buffer_block( PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER); for (i = 0; i < num_to_register; i++) { - mutex_t* mutex; + ib_mutex_t* mutex; rw_lock_t* rwlock; # ifdef UNIV_PFS_MUTEX @@ -1267,7 +1291,7 @@ buf_pool_init_instance( SYNC_BUF_FLUSH_LIST); for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) { - buf_pool->no_flush[i] = os_event_create(NULL); + buf_pool->no_flush[i] = os_event_create(); } buf_pool->watch = (buf_page_t*) mem_zalloc( @@ -1334,7 +1358,7 @@ buf_pool_free_instance( Creates the buffer pool. 
@return DB_SUCCESS if success, DB_ERROR if not enough memory or error */ UNIV_INTERN -ulint +dberr_t buf_pool_init( /*==========*/ ulint total_size, /*!< in: size of the total pool in bytes */ @@ -1731,7 +1755,7 @@ buf_pool_watch_unset( ut_a(bpage); if (UNIV_UNLIKELY(!buf_pool_watch_is_sentinel(buf_pool, bpage))) { - mutex_t* mutex = buf_page_get_mutex(bpage); + ib_mutex_t* mutex = buf_page_get_mutex(bpage); mutex_enter(mutex); ut_a(bpage->buf_fix_count > 0); @@ -1802,34 +1826,24 @@ buf_page_make_young( } /********************************************************************//** -Sets the time of the first access of a page and moves a page to the -start of the buffer pool LRU list if it is too old. This high-level -function can be used to prevent an important page from slipping -out of the buffer pool. */ +Moves a page to the start of the buffer pool LRU list if it is too old. +This high-level function can be used to prevent an important page from +slipping out of the buffer pool. */ static void -buf_page_set_accessed_make_young( -/*=============================*/ - buf_page_t* bpage, /*!< in/out: buffer block of a +buf_page_make_young_if_needed( +/*==========================*/ + buf_page_t* bpage) /*!< in/out: buffer block of a file page */ - unsigned access_time) /*!< in: bpage->access_time - read under mutex protection, - or 0 if unknown */ { +#ifdef UNIV_DEBUG buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(!buf_pool_mutex_own(buf_pool)); +#endif /* UNIV_DEBUG */ ut_a(buf_page_in_file(bpage)); if (buf_page_peek_if_too_old(bpage)) { - buf_pool_mutex_enter(buf_pool); - buf_LRU_make_block_young(bpage); - buf_pool_mutex_exit(buf_pool); - } else if (!access_time) { - ulint time_ms = ut_time_ms(); - buf_pool_mutex_enter(buf_pool); - buf_page_set_accessed(bpage, time_ms); - buf_pool_mutex_exit(buf_pool); + buf_page_make_young(bpage); } } @@ -1880,7 +1894,7 @@ buf_page_set_file_page_was_freed( &hash_lock); if (bpage) { - mutex_t* block_mutex = 
buf_page_get_mutex(bpage); + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage)); mutex_enter(block_mutex); rw_lock_s_unlock(hash_lock); @@ -1913,7 +1927,7 @@ buf_page_reset_file_page_was_freed( bpage = buf_page_hash_get_s_locked(buf_pool, space, offset, &hash_lock); if (bpage) { - mutex_t* block_mutex = buf_page_get_mutex(bpage); + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage)); mutex_enter(block_mutex); rw_lock_s_unlock(hash_lock); @@ -1974,11 +1988,10 @@ buf_page_get_zip( ulint offset) /*!< in: page number */ { buf_page_t* bpage; - mutex_t* block_mutex; + ib_mutex_t* block_mutex; rw_lock_t* hash_lock; ibool discard_attempted = FALSE; ibool must_read; - unsigned access_time; buf_pool_t* buf_pool = buf_pool_get(space, offset); buf_pool->stat.n_page_gets++; @@ -2051,15 +2064,17 @@ err_exit: got_block: must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ; - access_time = buf_page_is_accessed(bpage); rw_lock_s_unlock(hash_lock); #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG ut_a(!bpage->file_page_was_freed); #endif + + buf_page_set_accessed(bpage); + mutex_exit(block_mutex); - buf_page_set_accessed_make_young(bpage, access_time); + buf_page_make_young_if_needed(bpage); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG ut_a(++buf_dbg_counter % 5771 || buf_validate()); @@ -2372,6 +2387,28 @@ buf_block_is_uncompressed( return(buf_pointer_is_block_field_instance(buf_pool, (void*) block)); } +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG +/********************************************************************//** +Return true if probe is enabled. +@return true if probe enabled. */ +static +bool +buf_debug_execute_is_force_flush() +/*==============================*/ +{ + DBUG_EXECUTE_IF("ib_buf_force_flush", return(true); ); + + /* This is used during queisce testing, we want to ensure maximum + buffering by the change buffer. 
*/ + + if (srv_ibuf_disable_background_merge) { + return(true); + } + + return(false); +} +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + /********************************************************************//** This is the general function used to get access to a database page. @return pointer to the block or NULL */ @@ -2398,7 +2435,7 @@ buf_page_get_gen( ulint fix_type; ibool must_read; rw_lock_t* hash_lock; - mutex_t* block_mutex; + ib_mutex_t* block_mutex; buf_page_t* hash_bpage; ulint retries = 0; buf_pool_t* buf_pool = buf_pool_get(space, offset); @@ -2666,27 +2703,38 @@ wait_until_unfixed: block->page.buf_fix_count = 1; buf_block_set_io_fix(block, BUF_IO_READ); - rw_lock_x_lock_func(&block->lock, 0, file, line); + rw_lock_x_lock_inline(&block->lock, 0, file, line); UNIV_MEM_INVALID(bpage, sizeof *bpage); rw_lock_x_unlock(hash_lock); - mutex_exit(&block->mutex); - mutex_exit(&buf_pool->zip_mutex); - buf_pool->n_pend_unzip++; + buf_pool->n_pend_unzip++; buf_pool_mutex_exit(buf_pool); + access_time = buf_page_is_accessed(&block->page); + mutex_exit(&block->mutex); + mutex_exit(&buf_pool->zip_mutex); + buf_page_free_descriptor(bpage); - /* Decompress the page and apply buffered operations - while not holding buf_pool->mutex or block->mutex. */ + /* Decompress the page while not holding + buf_pool->mutex or block->mutex. */ - ut_a(buf_zip_decompress(block, TRUE)); + /* Page checksum verification is already done when + the page is read from disk. Hence page checksum + verification is not necessary when decompressing the page. */ + ut_a(buf_zip_decompress(block, FALSE)); if (UNIV_LIKELY(!recv_no_ibuf_operations)) { - ibuf_merge_or_delete_for_page(block, space, offset, - zip_size, TRUE); + if (access_time) { +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a(ibuf_count_get(space, offset) == 0); +#endif /* UNIV_IBUF_COUNT_DEBUG */ + } else { + ibuf_merge_or_delete_for_page( + block, space, offset, zip_size, TRUE); + } } /* Unfix and unlatch the block. 
*/ @@ -2723,8 +2771,9 @@ wait_until_unfixed: UNIV_MEM_ASSERT_RW(&block->page, sizeof block->page); #endif #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG + if ((mode == BUF_GET_IF_IN_POOL || mode == BUF_GET_IF_IN_POOL_OR_WATCH) - && ibuf_debug) { + && (ibuf_debug || buf_debug_execute_is_force_flush())) { /* Try to evict the block from the buffer pool, to use the insert buffer (change buffer) as much as possible. */ @@ -2759,19 +2808,18 @@ wait_until_unfixed: buf_pool, space, offset, fold); } - if (UNIV_LIKELY_NULL(block)) { - block_mutex = buf_page_get_mutex( - &block->page); - /* The page entered the buffer - pool for some reason. Try to - evict it again. */ - mutex_enter(block_mutex); - rw_lock_x_unlock(hash_lock); + rw_lock_x_unlock(hash_lock); - goto got_block; + if (UNIV_LIKELY_NULL(block)) { + /* Either the page has been read in or + a watch was set on that in the window + where we released the buf_pool::mutex + and before we acquire the hash_lock + above. Try again. */ + guess = block; + goto loop; } - rw_lock_x_unlock(hash_lock); fprintf(stderr, "innodb_change_buffering_debug evict %u %u\n", (unsigned) space, (unsigned) offset); @@ -2799,14 +2847,15 @@ wait_until_unfixed: ut_a(mode == BUF_GET_POSSIBLY_FREED || !block->page.file_page_was_freed); #endif - mutex_exit(&block->mutex); - /* Check if this is the first access to the page */ - access_time = buf_page_is_accessed(&block->page); - if (UNIV_LIKELY(mode != BUF_PEEK_IF_IN_POOL)) { - buf_page_set_accessed_make_young(&block->page, access_time); + buf_page_set_accessed(&block->page); + + mutex_exit(&block->mutex); + + if (mode != BUF_PEEK_IF_IN_POOL) { + buf_page_make_young_if_needed(&block->page); } #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG @@ -2842,14 +2891,14 @@ wait_until_unfixed: break; case RW_S_LATCH: - rw_lock_s_lock_func(&(block->lock), 0, file, line); + rw_lock_s_lock_inline(&(block->lock), 0, file, line); fix_type = MTR_MEMO_PAGE_S_FIX; break; default: ut_ad(rw_latch == RW_X_LATCH); - 
rw_lock_x_lock_func(&(block->lock), 0, file, line); + rw_lock_x_lock_inline(&(block->lock), 0, file, line); fix_type = MTR_MEMO_PAGE_X_FIX; break; @@ -2857,7 +2906,7 @@ wait_until_unfixed: mtr_memo_push(mtr, block, fix_type); - if (UNIV_LIKELY(mode != BUF_PEEK_IF_IN_POOL) && !access_time) { + if (mode != BUF_PEEK_IF_IN_POOL && !access_time) { /* In the case of a first access, try to apply linear read-ahead */ @@ -2912,15 +2961,13 @@ buf_page_optimistic_get( buf_block_buf_fix_inc(block, file, line); - mutex_exit(&block->mutex); + access_time = buf_page_is_accessed(&block->page); - /* Check if this is the first access to the page. - We do a dirty read on purpose, to avoid mutex contention. - This field is only used for heuristic purposes; it does not - affect correctness. */ + buf_page_set_accessed(&block->page); - access_time = buf_page_is_accessed(&block->page); - buf_page_set_accessed_make_young(&block->page, access_time); + mutex_exit(&block->mutex); + + buf_page_make_young_if_needed(&block->page); ut_ad(!ibuf_inside(mtr) || ibuf_page(buf_block_get_space(block), @@ -2932,8 +2979,8 @@ buf_page_optimistic_get( file, line); fix_type = MTR_MEMO_PAGE_S_FIX; } else { - success = rw_lock_x_lock_func_nowait(&(block->lock), - file, line); + success = rw_lock_x_lock_func_nowait_inline(&(block->lock), + file, line); fix_type = MTR_MEMO_PAGE_X_FIX; } @@ -2975,7 +3022,7 @@ buf_page_optimistic_get( mutex_exit(&block->mutex); #endif - if (UNIV_UNLIKELY(!access_time)) { + if (!access_time) { /* In the case of a first access, try to apply linear read-ahead */ @@ -3038,24 +3085,14 @@ buf_page_get_known_nowait( buf_block_buf_fix_inc(block, file, line); + buf_page_set_accessed(&block->page); + mutex_exit(&block->mutex); buf_pool = buf_pool_from_block(block); - if (mode == BUF_MAKE_YOUNG && buf_page_peek_if_too_old(&block->page)) { - buf_pool_mutex_enter(buf_pool); - buf_LRU_make_block_young(&block->page); - buf_pool_mutex_exit(buf_pool); - } else if 
(!buf_page_is_accessed(&block->page)) { - /* Above, we do a dirty read on purpose, to avoid - mutex contention. The field buf_page_t::access_time - is only used for heuristic purposes. Writes to the - field must be protected by mutex, however. */ - ulint time_ms = ut_time_ms(); - - buf_pool_mutex_enter(buf_pool); - buf_page_set_accessed(&block->page, time_ms); - buf_pool_mutex_exit(buf_pool); + if (mode == BUF_MAKE_YOUNG) { + buf_page_make_young_if_needed(&block->page); } ut_ad(!ibuf_inside(mtr) || mode == BUF_KEEP_OLD); @@ -3065,8 +3102,8 @@ buf_page_get_known_nowait( file, line); fix_type = MTR_MEMO_PAGE_S_FIX; } else { - success = rw_lock_x_lock_func_nowait(&(block->lock), - file, line); + success = rw_lock_x_lock_func_nowait_inline(&(block->lock), + file, line); fix_type = MTR_MEMO_PAGE_X_FIX; } @@ -3167,8 +3204,8 @@ buf_page_try_get_func( S-latch. */ fix_type = MTR_MEMO_PAGE_X_FIX; - success = rw_lock_x_lock_func_nowait(&block->lock, - file, line); + success = rw_lock_x_lock_func_nowait_inline(&block->lock, + file, line); } if (!success) { @@ -3234,6 +3271,7 @@ buf_page_init( ulint offset, /*!< in: offset of the page within space in units of a page */ ulint fold, /*!< in: buf_page_address_fold(space,offset) */ + ulint zip_size,/*!< in: compressed page size, or 0 */ buf_block_t* block) /*!< in/out: block to init */ { buf_page_t* hash_page; @@ -3302,6 +3340,9 @@ buf_page_init( ut_d(block->page.in_page_hash = TRUE); HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, &block->page); + if (zip_size) { + page_zip_set_size(&block->page.zip, zip_size); + } } /********************************************************************//** @@ -3318,7 +3359,7 @@ UNIV_INTERN buf_page_t* buf_page_init_for_read( /*===================*/ - ulint* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */ + dberr_t* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */ ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ... 
*/ ulint space, /*!< in: space id */ ulint zip_size,/*!< in: compressed page size, or 0 */ @@ -3407,7 +3448,7 @@ err_exit: ut_ad(buf_pool_from_bpage(bpage) == buf_pool); - buf_page_init(buf_pool, space, offset, fold, block); + buf_page_init(buf_pool, space, offset, fold, zip_size, block); rw_lock_x_unlock(hash_lock); /* The block must be put to the LRU list, to the old blocks */ @@ -3426,8 +3467,6 @@ err_exit: buf_page_set_io_fix(bpage, BUF_IO_READ); if (zip_size) { - page_zip_set_size(&block->page.zip, zip_size); - /* buf_pool->mutex may be released and reacquired by buf_buddy_alloc(). Thus, we must release block->mutex in order not to @@ -3528,7 +3567,8 @@ err_exit: rw_lock_x_unlock(hash_lock); - /* The block must be put to the LRU list, to the old blocks */ + /* The block must be put to the LRU list, to the old blocks. + The zip_size is already set into the page zip */ buf_LRU_add_block(bpage, TRUE/* to old blocks */); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG buf_LRU_insert_zip_clean(bpage); @@ -3578,7 +3618,6 @@ buf_page_create( buf_block_t* block; ulint fold; buf_block_t* free_block = NULL; - ulint time_ms = ut_time_ms(); buf_pool_t* buf_pool = buf_pool_get(space, offset); rw_lock_t* hash_lock; @@ -3630,7 +3669,7 @@ buf_page_create( mutex_enter(&block->mutex); - buf_page_init(buf_pool, space, offset, fold, block); + buf_page_init(buf_pool, space, offset, fold, zip_size, block); rw_lock_x_unlock(hash_lock); @@ -3651,7 +3690,6 @@ buf_page_create( buf_page_set_io_fix(&block->page, BUF_IO_READ); rw_lock_x_lock(&block->lock); - page_zip_set_size(&block->page.zip, zip_size); mutex_exit(&block->mutex); /* buf_pool->mutex may be released and reacquired by buf_buddy_alloc(). 
Thus, we must release block->mutex @@ -3675,12 +3713,12 @@ buf_page_create( rw_lock_x_unlock(&block->lock); } - buf_page_set_accessed(&block->page, time_ms); - buf_pool_mutex_exit(buf_pool); mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX); + buf_page_set_accessed(&block->page); + mutex_exit(&block->mutex); /* Delete possible entries for the page from the insert buffer: @@ -3849,6 +3887,8 @@ buf_mark_space_corrupt( BUF_IO_READ); } + mutex_exit(buf_page_get_mutex(bpage)); + /* Find the table with specified space id, and mark it corrupted */ if (dict_set_corrupted_by_space(space)) { buf_LRU_free_one_page(bpage); @@ -3859,7 +3899,6 @@ buf_mark_space_corrupt( ut_ad(buf_pool->n_pend_reads > 0); buf_pool->n_pend_reads--; - mutex_exit(buf_page_get_mutex(bpage)); buf_pool_mutex_exit(buf_pool); return(ret); @@ -3868,9 +3907,9 @@ buf_mark_space_corrupt( /********************************************************************//** Completes an asynchronous read or write request of a file page to or from the buffer pool. 
-@return TRUE if successful */ +@return true if successful */ UNIV_INTERN -ibool +bool buf_page_io_complete( /*=================*/ buf_page_t* bpage) /*!< in: pointer to the block in question */ @@ -3952,8 +3991,20 @@ buf_page_io_complete( /* From version 3.23.38 up we store the page checksum to the 4 first bytes of the page end lsn field */ - if (buf_page_is_corrupted(frame, + if (buf_page_is_corrupted(true, frame, buf_page_get_zip_size(bpage))) { + + /* Not a real corruption if it was triggered by + error injection */ + DBUG_EXECUTE_IF("buf_page_is_corrupt_failure", + if (bpage->space > TRX_SYS_SPACE + && buf_mark_space_corrupt(bpage)) { + ib_logf(IB_LOG_LEVEL_INFO, + "Simulated page corruption"); + return(true); + } + goto page_not_corrupt; + ;); corrupt: fprintf(stderr, "InnoDB: Database page corruption on disk" @@ -3997,7 +4048,7 @@ corrupt: table as corrupted instead of crashing server */ if (bpage->space > TRX_SYS_SPACE && buf_mark_space_corrupt(bpage)) { - return(FALSE); + return(false); } else { fputs("InnoDB: Ending processing" " because of" @@ -4008,6 +4059,9 @@ corrupt: } } + DBUG_EXECUTE_IF("buf_page_is_corrupt_failure", + page_not_corrupt: bpage = bpage; ); + if (recv_recovery_is_on()) { /* Pages must be uncompressed for crash recovery. 
*/ ut_a(uncompressed); @@ -4090,7 +4144,7 @@ corrupt: mutex_exit(buf_page_get_mutex(bpage)); buf_pool_mutex_exit(buf_pool); - return(TRUE); + return(true); } /*********************************************************************//** @@ -5118,9 +5172,7 @@ void buf_refresh_io_stats_all(void) /*==========================*/ { - ulint i; - - for (i = 0; i < srv_buf_pool_instances; i++) { + for (ulint i = 0; i < srv_buf_pool_instances; i++) { buf_pool_t* buf_pool; buf_pool = buf_pool_from_array(i); @@ -5137,9 +5189,7 @@ ibool buf_all_freed(void) /*===============*/ { - ulint i; - - for (i = 0; i < srv_buf_pool_instances; i++) { + for (ulint i = 0; i < srv_buf_pool_instances; i++) { buf_pool_t* buf_pool; buf_pool = buf_pool_from_array(i); diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index ad6ef7c4cef..fb853fe1543 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -25,16 +25,16 @@ Created 2011/12/19 #include "buf0dblwr.h" +#ifdef UNIV_NONINL +#include "buf0buf.ic" +#endif + #include "buf0buf.h" -#include "buf0lru.h" -#include "buf0flu.h" #include "buf0checksum.h" #include "srv0start.h" #include "srv0srv.h" #include "page0zip.h" #include "trx0sys.h" -#include "page0page.h" -#include "mtr0log.h" #ifndef UNIV_HOTBACKUP @@ -195,22 +195,20 @@ start_again: return; } - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Doublewrite buffer not found:" - " creating new\n"); + ib_logf(IB_LOG_LEVEL_INFO, + "Doublewrite buffer not found: creating new"); if (buf_pool_get_curr_size() < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE + FSP_EXTENT_SIZE / 2 + 100) * UNIV_PAGE_SIZE)) { - fprintf(stderr, - "InnoDB: Cannot create doublewrite buffer:" - " you must\n" - "InnoDB: increase your buffer pool size.\n" - "InnoDB: Cannot continue operation.\n"); - exit(1); + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot create doublewrite buffer: you must " + "increase your buffer pool size. 
Cannot continue " + "operation."); + + exit(EXIT_FAILURE); } block2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO, @@ -223,16 +221,15 @@ start_again: buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK); if (block2 == NULL) { - fprintf(stderr, - "InnoDB: Cannot create doublewrite buffer:" - " you must\n" - "InnoDB: increase your tablespace size.\n" - "InnoDB: Cannot continue operation.\n"); + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot create doublewrite buffer: you must " + "increase your tablespace size. " + "Cannot continue operation."); /* We exit without committing the mtr to prevent its modifications to the database getting to disk */ - exit(1); + exit(EXIT_FAILURE); } fseg_header = doublewrite + TRX_SYS_DOUBLEWRITE_FSEG; @@ -243,15 +240,12 @@ start_again: new_block = fseg_alloc_free_page( fseg_header, prev_page_no + 1, FSP_UP, &mtr); if (new_block == NULL) { - fprintf(stderr, - "InnoDB: Cannot create doublewrite" - " buffer: you must\n" - "InnoDB: increase your" - " tablespace size.\n" - "InnoDB: Cannot continue operation.\n" - ); - - exit(1); + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot create doublewrite buffer: you must " + "increase your tablespace size. " + "Cannot continue operation."); + + exit(EXIT_FAILURE); } /* We read the allocated pages to the buffer pool; @@ -331,8 +325,7 @@ start_again: /* Remove doublewrite pages from LRU */ buf_pool_invalidate(); - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Doublewrite buffer created\n"); + ib_logf(IB_LOG_LEVEL_INFO, "Doublewrite buffer created"); goto start_again; } @@ -391,7 +384,7 @@ buf_dblwr_init_or_restore_pages( } if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED) - != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) { + != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) { /* We are upgrading from a version < 4.1.x to a version where multiple tablespaces are supported. 
We must reset the space id @@ -401,9 +394,8 @@ buf_dblwr_init_or_restore_pages( reset_space_ids = TRUE; - fprintf(stderr, - "InnoDB: Resetting space id's in the" - " doublewrite buffer\n"); + ib_logf(IB_LOG_LEVEL_INFO, + "Resetting space id's in the doublewrite buffer"); } /* Read the pages from the doublewrite buffer to memory */ @@ -459,12 +451,11 @@ buf_dblwr_init_or_restore_pages( } else if (!fil_check_adress_in_tablespace(space_id, page_no)) { - fprintf(stderr, - "InnoDB: Warning: a page in the" - " doublewrite buffer is not within space\n" - "InnoDB: bounds; space id %lu" - " page number %lu, page %lu in" - " doublewrite buf.\n", + ib_logf(IB_LOG_LEVEL_WARN, + "A page in the doublewrite buffer is not " + "within space bounds; space id %lu " + "page number %lu, page %lu in " + "doublewrite buf.", (ulong) space_id, (ulong) page_no, (ulong) i); } else if (space_id == TRX_SYS_SPACE @@ -489,8 +480,7 @@ buf_dblwr_init_or_restore_pages( /* Check if the page is corrupt */ - if (UNIV_UNLIKELY - (buf_page_is_corrupted(read_buf, zip_size))) { + if (buf_page_is_corrupted(true, read_buf, zip_size)) { fprintf(stderr, "InnoDB: Warning: database page" @@ -501,7 +491,8 @@ buf_dblwr_init_or_restore_pages( " the doublewrite buffer.\n", (ulong) space_id, (ulong) page_no); - if (buf_page_is_corrupted(page, zip_size)) { + if (buf_page_is_corrupted(true, + page, zip_size)) { fprintf(stderr, "InnoDB: Dump of the page:\n"); buf_page_print( @@ -538,9 +529,10 @@ buf_dblwr_init_or_restore_pages( zip_size, page_no, 0, zip_size ? 
zip_size : UNIV_PAGE_SIZE, page, NULL); - fprintf(stderr, - "InnoDB: Recovered the page from" - " the doublewrite buffer.\n"); + + ib_logf(IB_LOG_LEVEL_INFO, + "Recovered the page from" + " the doublewrite buffer."); } } @@ -595,6 +587,7 @@ buf_dblwr_update(void) ut_ad(buf_dblwr->batch_running); ut_ad(buf_dblwr->b_reserved > 0); + ut_ad(buf_dblwr->b_reserved <= buf_dblwr->first_free); buf_dblwr->b_reserved--; if (buf_dblwr->b_reserved == 0) { @@ -705,23 +698,29 @@ static void buf_dblwr_write_block_to_datafile( /*==============================*/ - const buf_block_t* block) /*!< in: block to write */ + const buf_page_t* bpage) /*!< in: page to write */ { - ut_a(block); - ut_a(buf_page_in_file(&block->page)); + ut_a(bpage); + ut_a(buf_page_in_file(bpage)); - if (block->page.zip.data) { + /* Increment the counter of I/O operations used + for selecting LRU policy. */ + buf_LRU_stat_inc_io(); + + if (bpage->zip.data) { fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, - FALSE, buf_page_get_space(&block->page), - buf_page_get_zip_size(&block->page), - buf_page_get_page_no(&block->page), 0, - buf_page_get_zip_size(&block->page), - (void*) block->page.zip.data, - (void*) block); - - goto exit; + FALSE, buf_page_get_space(bpage), + buf_page_get_zip_size(bpage), + buf_page_get_page_no(bpage), 0, + buf_page_get_zip_size(bpage), + (void*) bpage->zip.data, + (void*) bpage); + + return; } + + const buf_block_t* block = (buf_block_t*) bpage; ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); buf_dblwr_check_page_lsn(block->frame); @@ -729,11 +728,6 @@ buf_dblwr_write_block_to_datafile( FALSE, buf_block_get_space(block), 0, buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE, (void*) block->frame, (void*) block); - -exit: - /* Increment the counter of I/O operations used - for selecting LRU policy. 
*/ - buf_LRU_stat_inc_io(); } /********************************************************************//** @@ -748,9 +742,8 @@ buf_dblwr_flush_buffered_writes(void) /*=================================*/ { byte* write_buf; + ulint first_free; ulint len; - ulint len2; - ulint i; if (!srv_use_doublewrite_buf || buf_dblwr == NULL) { /* Sync the writes to the disk. */ @@ -782,10 +775,12 @@ try_again: } ut_a(!buf_dblwr->batch_running); + ut_ad(buf_dblwr->first_free == buf_dblwr->b_reserved); /* Disallow anyone else to post to doublewrite buffer or to start another batch of flushing. */ buf_dblwr->batch_running = TRUE; + first_free = buf_dblwr->first_free; /* Now safe to release the mutex. Note that though no other thread is allowed to post to the doublewrite batch flushing @@ -795,7 +790,7 @@ try_again: write_buf = buf_dblwr->write_buf; - for (len2 = 0, i = 0; + for (ulint len2 = 0, i = 0; i < buf_dblwr->first_free; len2 += UNIV_PAGE_SIZE, i++) { @@ -845,8 +840,8 @@ try_again: flush: /* increment the doublewrite flushed pages counter */ - srv_dblwr_pages_written += buf_dblwr->first_free; - srv_dblwr_writes++; + srv_stats.dblwr_pages_written.add(buf_dblwr->first_free); + srv_stats.dblwr_writes.inc(); /* Now flush the doublewrite buffer data to disk */ fil_flush(TRX_SYS_SPACE); @@ -855,11 +850,21 @@ flush: and in recovery we will find them in the doublewrite buffer blocks. Next do the writes to the intended positions. */ - for (i = 0; i < buf_dblwr->first_free; i++) { - const buf_block_t* block = (buf_block_t*) - buf_dblwr->buf_block_arr[i]; - - buf_dblwr_write_block_to_datafile(block); + /* Up to this point first_free and buf_dblwr->first_free are + same because we have set the buf_dblwr->batch_running flag + disallowing any other thread to post any request but we + can't safely access buf_dblwr->first_free in the loop below. 
+ This is so because it is possible that after we are done with + the last iteration and before we terminate the loop, the batch + gets finished in the IO helper thread and another thread posts + a new batch setting buf_dblwr->first_free to a higher value. + If this happens and we are using buf_dblwr->first_free in the + loop termination condition then we'll end up dispatching + the same block twice from two different threads. */ + ut_ad(first_free == buf_dblwr->first_free); + for (ulint i = 0; i < first_free; i++) { + buf_dblwr_write_block_to_datafile( + buf_dblwr->buf_block_arr[i]); } /* Wake possible simulated aio thread to actually post the @@ -935,6 +940,8 @@ try_again: buf_dblwr->first_free++; buf_dblwr->b_reserved++; + ut_ad(!buf_dblwr->batch_running); + ut_ad(buf_dblwr->first_free == buf_dblwr->b_reserved); ut_ad(buf_dblwr->b_reserved <= srv_doublewrite_batch_size); if (buf_dblwr->first_free == srv_doublewrite_batch_size) { @@ -1065,7 +1072,7 @@ retry: /* We know that the write has been flushed to disk now and during recovery we will find it in the doublewrite buffer blocks. Next do the write to the intended position. */ - buf_dblwr_write_block_to_datafile((buf_block_t*) bpage); + buf_dblwr_write_block_to_datafile(bpage); /* Sync the writes to the disk. */ buf_flush_sync_datafiles(); @@ -1077,8 +1084,8 @@ retry: buf_dblwr->in_use[i] = FALSE; /* increment the doublewrite flushed pages counter */ - srv_dblwr_pages_written += buf_dblwr->first_free; - srv_dblwr_writes++; + srv_stats.dblwr_pages_written.inc(); + srv_stats.dblwr_writes.inc(); mutex_exit(&(buf_dblwr->mutex)); diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc index 27757241c3e..467f817a2d1 100644 --- a/storage/innobase/buf/buf0dump.cc +++ b/storage/innobase/buf/buf0dump.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved. 
+Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -23,14 +23,14 @@ Implements a buffer pool dump/load. Created April 08, 2011 Vasil Dimov *******************************************************/ +#include "univ.i" + #include <stdarg.h> /* va_* */ #include <string.h> /* strerror() */ -#include "univ.i" - #include "buf0buf.h" /* buf_pool_mutex_enter(), srv_buf_pool_instances */ #include "buf0dump.h" -#include "db0err.h" /* enum db_err */ +#include "db0err.h" #include "dict0dict.h" /* dict_operation_lock */ #include "os0file.h" /* OS_FILE_MAX_PATH */ #include "os0sync.h" /* os_event* */ @@ -40,7 +40,6 @@ Created April 08, 2011 Vasil Dimov #include "sync0rw.h" /* rw_lock_s_lock() */ #include "ut0byte.h" /* ut_ull_create() */ #include "ut0sort.h" /* UT_SORT_FUNCTION_BODY */ -#include "buf0rea.h" /* buf_read_page_async() */ enum status_severity { STATUS_INFO, @@ -579,6 +578,8 @@ DECLARE_THREAD(buf_dump_thread)( void* arg __attribute__((unused))) /*!< in: a dummy parameter required by os_thread_create */ { + ut_ad(!srv_read_only_mode); + srv_buf_dump_thread_active = TRUE; buf_dump_status(STATUS_INFO, "not started"); diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 023ed766c62..542c1669667 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -25,6 +25,10 @@ Created 11/11/1995 Heikki Tuuri #include "buf0flu.h" +#ifdef UNIV_NONINL +#include "buf0flu.ic" +#endif + #include "buf0buf.h" #include "buf0checksum.h" #include "srv0start.h" @@ -44,39 +48,6 @@ Created 11/11/1995 Heikki Tuuri #include "srv0mon.h" #include "mysql/plugin.h" #include "mysql/service_thd_wait.h" -#include "buf0dblwr.h" - -#ifdef UNIV_NONINL -#include "buf0flu.ic" -#endif - -/********************************************************************** -These 
statistics are generated for heuristics used in estimating the -rate at which we should flush the dirty blocks to avoid bursty IO -activity. Note that the rate of flushing not only depends on how many -dirty pages we have in the buffer pool but it is also a fucntion of -how much redo the workload is generating and at what rate. */ -/* @{ */ - -/** Number of intervals for which we keep the history of these stats. -Each interval is 1 second, defined by the rate at which -srv_error_monitor_thread() calls buf_flush_stat_update(). */ -#define BUF_FLUSH_STAT_N_INTERVAL 20 - -/** Sampled values buf_flush_stat_cur. -Not protected by any mutex. Updated by buf_flush_stat_update(). */ -static buf_flush_stat_t buf_flush_stat_arr[BUF_FLUSH_STAT_N_INTERVAL]; - -/** Cursor to buf_flush_stat_arr[]. Updated in a round-robin fashion. */ -static ulint buf_flush_stat_arr_ind; - -/** Values at start of the current interval. Reset by -buf_flush_stat_update(). */ -static buf_flush_stat_t buf_flush_stat_cur; - -/** Running sum of past values of buf_flush_stat_cur. -Updated by buf_flush_stat_update(). Not protected by any mutex. */ -static buf_flush_stat_t buf_flush_stat_sum; /** Number of pages flushed through non flush_list flushes. */ static ulint buf_lru_flush_page_count = 0; @@ -104,6 +75,22 @@ in thrashing. */ /* @} */ +/******************************************************************//** +Increases flush_list size in bytes with zip_size for compressed page, +UNIV_PAGE_SIZE for uncompressed page in inline function */ +static inline +void +incr_flush_list_size_in_bytes( +/*==========================*/ + buf_block_t* block, /*!< in: control block */ + buf_pool_t* buf_pool) /*!< in: buffer pool instance */ +{ + ut_ad(buf_flush_list_mutex_own(buf_pool)); + ulint zip_size = page_zip_get_size(&block->page.zip); + buf_pool->stat.flush_list_bytes += zip_size ? 
zip_size : UNIV_PAGE_SIZE; + ut_ad(buf_pool->stat.flush_list_bytes <= buf_pool->curr_pool_size); +} + #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG /******************************************************************//** Validates the flush list. @@ -333,6 +320,7 @@ buf_flush_insert_into_flush_list( ut_d(block->page.in_flush_list = TRUE); block->page.oldest_modification = lsn; UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page); + incr_flush_list_size_in_bytes(block, buf_pool); #ifdef UNIV_DEBUG_VALGRIND { @@ -437,7 +425,7 @@ buf_flush_insert_sorted_into_flush_list( prev_b, &block->page); } - MONITOR_INC(MONITOR_PAGE_INFLUSH); + incr_flush_list_size_in_bytes(block, buf_pool); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG ut_a(buf_flush_validate_low(buf_pool)); @@ -538,6 +526,7 @@ buf_flush_remove( buf_page_t* bpage) /*!< in: pointer to the block in question */ { buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ulint zip_size; ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(mutex_own(buf_page_get_mutex(bpage))); @@ -576,14 +565,15 @@ buf_flush_remove( because we assert on in_flush_list in comparison function. */ ut_d(bpage->in_flush_list = FALSE); + zip_size = page_zip_get_size(&bpage->zip); + buf_pool->stat.flush_list_bytes -= zip_size ? 
zip_size : UNIV_PAGE_SIZE; + bpage->oldest_modification = 0; #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG ut_a(buf_flush_validate_skip(buf_pool)); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - MONITOR_DEC(MONITOR_PAGE_INFLUSH); - buf_flush_list_mutex_exit(buf_pool); } @@ -606,7 +596,7 @@ buf_flush_relocate_on_flush_list( buf_page_t* dpage) /*!< in/out: destination block */ { buf_page_t* prev; - buf_page_t* prev_b = NULL; + buf_page_t* prev_b = NULL; buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); ut_ad(buf_pool_mutex_own(buf_pool)); @@ -710,6 +700,27 @@ buf_flush_write_complete( #endif /* !UNIV_HOTBACKUP */ /********************************************************************//** +Calculate the checksum of a page from compressed table and update the page. */ +UNIV_INTERN +void +buf_flush_update_zip_checksum( +/*==========================*/ + buf_frame_t* page, /*!< in/out: Page to update */ + ulint zip_size, /*!< in: Compressed page size */ + lsn_t lsn) /*!< in: Lsn to stamp on the page */ +{ + ut_a(zip_size > 0); + + ib_uint32_t checksum = page_zip_calc_checksum( + page, zip_size, + static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm)); + + mach_write_to_8(page + FIL_PAGE_LSN, lsn); + memset(page + FIL_PAGE_FILE_FLUSH_LSN, 0, 8); + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum); +} + +/********************************************************************//** Initializes a page for writing to the tablespace. 
*/ UNIV_INTERN void @@ -747,17 +758,10 @@ buf_flush_init_for_writing( case FIL_PAGE_TYPE_ZBLOB: case FIL_PAGE_TYPE_ZBLOB2: case FIL_PAGE_INDEX: - checksum = page_zip_calc_checksum( - page_zip->data, zip_size, - static_cast<srv_checksum_algorithm_t>( - srv_checksum_algorithm)); - - mach_write_to_8(page_zip->data - + FIL_PAGE_LSN, newest_lsn); - memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8); - mach_write_to_4(page_zip->data - + FIL_PAGE_SPACE_OR_CHKSUM, - checksum); + + buf_flush_update_zip_checksum( + page_zip->data, zip_size, newest_lsn); + return; } @@ -865,7 +869,7 @@ buf_flush_write_block_low( #endif #ifdef UNIV_LOG_DEBUG - static ibool univ_log_debug_warned; + static ibool univ_log_debug_warned; #endif /* UNIV_LOG_DEBUG */ ut_ad(buf_page_in_file(bpage)); @@ -949,15 +953,15 @@ os_aio_simulated_wake_handler_threads after we have posted a batch of writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be held upon entering this function, and they will be released by this function. */ -static +UNIV_INTERN void buf_flush_page( /*===========*/ buf_pool_t* buf_pool, /*!< in: buffer pool instance */ buf_page_t* bpage, /*!< in: buffer control block */ - enum buf_flush flush_type) /*!< in: type of flush */ + buf_flush flush_type) /*!< in: type of flush */ { - mutex_t* block_mutex; + ib_mutex_t* block_mutex; ibool is_uncompressed; ut_ad(flush_type < BUF_FLUSH_N_TYPES); @@ -1091,6 +1095,56 @@ buf_flush_page_try( } # endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ /***********************************************************//** +Check the page is in buffer pool and can be flushed. +@return true if the page can be flushed. 
*/ +static +bool +buf_flush_check_neighbor( +/*=====================*/ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: page offset */ + enum buf_flush flush_type) /*!< in: BUF_FLUSH_LRU or + BUF_FLUSH_LIST */ +{ + buf_page_t* bpage; + buf_pool_t* buf_pool = buf_pool_get(space, offset); + bool ret; + + ut_ad(flush_type == BUF_FLUSH_LRU + || flush_type == BUF_FLUSH_LIST); + + buf_pool_mutex_enter(buf_pool); + + /* We only want to flush pages from this buffer pool. */ + bpage = buf_page_hash_get(buf_pool, space, offset); + + if (!bpage) { + + buf_pool_mutex_exit(buf_pool); + return(false); + } + + ut_a(buf_page_in_file(bpage)); + + /* We avoid flushing 'non-old' blocks in an LRU flush, + because the flushed blocks are soon freed */ + + ret = false; + if (flush_type != BUF_FLUSH_LRU || buf_page_is_old(bpage)) { + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); + + mutex_enter(block_mutex); + if (buf_flush_ready_for_flush(bpage, flush_type)) { + ret = true; + } + mutex_exit(block_mutex); + } + buf_pool_mutex_exit(buf_pool); + + return(ret); +} + +/***********************************************************//** Flushes to disk all flushable pages within the flush area. @return number of pages flushed */ static @@ -1115,7 +1169,7 @@ buf_flush_try_neighbors( ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN - || !srv_flush_neighbors) { + || srv_flush_neighbors == 0) { /* If there is little space or neighbor flushing is not enabled then just flush the victim. 
*/ low = offset; @@ -1133,6 +1187,30 @@ buf_flush_try_neighbors( low = (offset / buf_flush_area) * buf_flush_area; high = (offset / buf_flush_area + 1) * buf_flush_area; + + if (srv_flush_neighbors == 1) { + /* adjust 'low' and 'high' to limit + for contiguous dirty area */ + if (offset > low) { + for (i = offset - 1; + i >= low + && buf_flush_check_neighbor( + space, i, flush_type); + i--) { + /* do nothing */ + } + low = i + 1; + } + + for (i = offset + 1; + i < high + && buf_flush_check_neighbor( + space, i, flush_type); + i++) { + /* do nothing */ + } + high = i; + } } /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */ @@ -1181,7 +1259,7 @@ buf_flush_try_neighbors( if (flush_type != BUF_FLUSH_LRU || i == offset || buf_page_is_old(bpage)) { - mutex_t* block_mutex = buf_page_get_mutex(bpage); + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); mutex_enter(block_mutex); @@ -1240,7 +1318,7 @@ buf_flush_page_and_try_neighbors( ulint* count) /*!< in/out: number of pages flushed */ { - mutex_t* block_mutex; + ib_mutex_t* block_mutex; ibool flushed = FALSE; #ifdef UNIV_DEBUG buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); @@ -1374,7 +1452,7 @@ buf_flush_LRU_list_batch( && free_len < srv_LRU_scan_depth && lru_len > BUF_LRU_MIN_LEN) { - mutex_t* block_mutex = buf_page_get_mutex(bpage); + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); ibool evict; mutex_enter(block_mutex); @@ -1576,8 +1654,7 @@ NOTE 1: in the case of an LRU flush the calling thread may own latches to pages: to avoid deadlocks, this function must be written so that it cannot end up waiting for these latches! NOTE 2: in the case of a flush list flush, the calling thread is not allowed to own any latches on pages! 
-@return number of blocks for which the write request was queued; -ULINT_UNDEFINED if there was a flush of the same type already running */ +@return number of blocks for which the write request was queued */ static ulint buf_flush_batch( @@ -1621,8 +1698,6 @@ buf_flush_batch( buf_pool_mutex_exit(buf_pool); - buf_dblwr_flush_buffered_writes(); - #ifdef UNIV_DEBUG if (buf_debug_prints && count > 0) { fprintf(stderr, flush_type == BUF_FLUSH_LRU @@ -1632,8 +1707,6 @@ buf_flush_batch( } #endif /* UNIV_DEBUG */ - srv_buf_pool_flushed += count; - return(count); } @@ -1659,14 +1732,7 @@ buf_flush_common( } #endif /* UNIV_DEBUG */ - srv_buf_pool_flushed += page_count; - - if (flush_type == BUF_FLUSH_LRU) { - /* We keep track of all flushes happening as part of LRU - flush. When estimating the desired rate at which flush_list - should be flushed we factor in this value. */ - buf_lru_flush_page_count += page_count; - } + srv_stats.buf_pool_flushed.add(page_count); } /******************************************************************//** @@ -1750,7 +1816,7 @@ buf_flush_wait_batch_end( } } else { thd_wait_begin(NULL, THD_WAIT_DISKIO); - os_event_wait(buf_pool->no_flush[type]); + os_event_wait(buf_pool->no_flush[type]); thd_wait_end(NULL); } } @@ -1760,21 +1826,28 @@ This utility flushes dirty blocks from the end of the LRU list and also puts replaceable clean pages from the end of the LRU list to the free list. NOTE: The calling thread is not allowed to own any latches on pages! -@return number of blocks for which the write request was queued; -ULINT_UNDEFINED if there was a flush of the same type already running */ +@return true if a batch was queued successfully. false if another batch +of same type was already running. 
*/ static -ulint +bool buf_flush_LRU( /*==========*/ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ - ulint min_n) /*!< in: wished minimum mumber of blocks + ulint min_n, /*!< in: wished minimum mumber of blocks flushed (it is not guaranteed that the actual number is that big, though) */ + ulint* n_processed) /*!< out: the number of pages + which were processed is passed + back to caller. Ignored if NULL */ { ulint page_count; + if (n_processed) { + *n_processed = 0; + } + if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) { - return(ULINT_UNDEFINED); + return(false); } page_count = buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0); @@ -1783,31 +1856,43 @@ buf_flush_LRU( buf_flush_common(BUF_FLUSH_LRU, page_count); - return(page_count); + if (n_processed) { + *n_processed = page_count; + } + + return(true); } /*******************************************************************//** This utility flushes dirty blocks from the end of the flush list of all buffer pool instances. NOTE: The calling thread is not allowed to own any latches on pages! -@return number of blocks for which the write request was queued; -ULINT_UNDEFINED if there was a flush of the same type already running */ +@return true if a batch was queued successfully for each buffer pool +instance. false if another batch of same type was already running in +at least one of the buffer pool instance */ UNIV_INTERN -ulint +bool buf_flush_list( /*===========*/ ulint min_n, /*!< in: wished minimum mumber of blocks flushed (it is not guaranteed that the actual number is that big, though) */ - lsn_t lsn_limit) /*!< in the case BUF_FLUSH_LIST all + lsn_t lsn_limit, /*!< in the case BUF_FLUSH_LIST all blocks whose oldest_modification is smaller than this should be flushed (if their number does not exceed min_n), otherwise ignored */ + ulint* n_processed) /*!< out: the number of pages + which were processed is passed + back to caller. 
Ignored if NULL */ + { ulint i; - ulint total_page_count = 0; - ibool skipped = FALSE; + bool success = true; + + if (n_processed) { + *n_processed = 0; + } if (min_n != ULINT_MAX) { /* Ensure that flushing is spread evenly amongst the @@ -1836,7 +1921,7 @@ buf_flush_list( pools based on the assumption that it will help in the retry which will follow the failure. */ - skipped = TRUE; + success = false; continue; } @@ -1848,7 +1933,9 @@ buf_flush_list( buf_flush_common(BUF_FLUSH_LIST, page_count); - total_page_count += page_count; + if (n_processed) { + *n_processed += page_count; + } if (page_count) { MONITOR_INC_VALUE_CUMULATIVE( @@ -1859,8 +1946,7 @@ buf_flush_list( } } - return(lsn_limit != LSN_MAX && skipped - ? ULINT_UNDEFINED : total_page_count); + return(success); } /******************************************************************//** @@ -1879,7 +1965,7 @@ buf_flush_single_page_from_LRU( { ulint scanned; buf_page_t* bpage; - mutex_t* block_mutex; + ib_mutex_t* block_mutex; ibool freed; ibool evict_zip; @@ -1957,128 +2043,6 @@ buf_flush_single_page_from_LRU( return(freed); } -/********************************************************************* -Update the historical stats that we are collecting for flush rate -heuristics at the end of each interval. -Flush rate heuristic depends on (a) rate of redo log generation and -(b) the rate at which LRU flush is happening. */ -UNIV_INTERN -void -buf_flush_stat_update(void) -/*=======================*/ -{ - buf_flush_stat_t* item; - lsn_t lsn_diff; - lsn_t lsn; - ulint n_flushed; - - lsn = log_get_lsn(); - if (buf_flush_stat_cur.redo == 0) { - /* First time around. Just update the current LSN - and return. 
*/ - buf_flush_stat_cur.redo = lsn; - return; - } - - item = &buf_flush_stat_arr[buf_flush_stat_arr_ind]; - - /* values for this interval */ - lsn_diff = lsn - buf_flush_stat_cur.redo; - n_flushed = buf_lru_flush_page_count - - buf_flush_stat_cur.n_flushed; - - /* add the current value and subtract the obsolete entry. */ - buf_flush_stat_sum.redo += lsn_diff - item->redo; - buf_flush_stat_sum.n_flushed += n_flushed - item->n_flushed; - - /* put current entry in the array. */ - item->redo = lsn_diff; - item->n_flushed = n_flushed; - - /* update the index */ - buf_flush_stat_arr_ind++; - buf_flush_stat_arr_ind %= BUF_FLUSH_STAT_N_INTERVAL; - - /* reset the current entry. */ - buf_flush_stat_cur.redo = lsn; - buf_flush_stat_cur.n_flushed = buf_lru_flush_page_count; -} - -/********************************************************************* -Determines the fraction of dirty pages that need to be flushed based -on the speed at which we generate redo log. Note that if redo log -is generated at a significant rate without corresponding increase -in the number of dirty pages (for example, an in-memory workload) -it can cause IO bursts of flushing. This function implements heuristics -to avoid this burstiness. -@return number of dirty pages to be flushed / second */ -static -ulint -buf_flush_get_desired_flush_rate(void) -/*==================================*/ -{ - ulint i; - lsn_t redo_avg; - ulint n_dirty = 0; - ib_uint64_t n_flush_req; - ib_uint64_t lru_flush_avg; - lsn_t lsn = log_get_lsn(); - lsn_t log_capacity = log_get_capacity(); - - /* log_capacity should never be zero after the initialization - of log subsystem. */ - ut_ad(log_capacity != 0); - - /* Get total number of dirty pages. It is OK to access - flush_list without holding any mutex as we are using this - only for heuristics. 
*/ - for (i = 0; i < srv_buf_pool_instances; i++) { - buf_pool_t* buf_pool; - - buf_pool = buf_pool_from_array(i); - n_dirty += UT_LIST_GET_LEN(buf_pool->flush_list); - } - - /* An overflow can happen if we generate more than 2^32 bytes - of redo in this interval i.e.: 4G of redo in 1 second. We can - safely consider this as infinity because if we ever come close - to 4G we'll start a synchronous flush of dirty pages. */ - /* redo_avg below is average at which redo is generated in - past BUF_FLUSH_STAT_N_INTERVAL + redo generated in the current - interval. */ - redo_avg = buf_flush_stat_sum.redo / BUF_FLUSH_STAT_N_INTERVAL - + (lsn - buf_flush_stat_cur.redo); - - /* An overflow can happen possibly if we flush more than 2^32 - pages in BUF_FLUSH_STAT_N_INTERVAL. This is a very very - unlikely scenario. Even when this happens it means that our - flush rate will be off the mark. It won't affect correctness - of any subsystem. */ - /* lru_flush_avg below is rate at which pages are flushed as - part of LRU flush in past BUF_FLUSH_STAT_N_INTERVAL + the - number of pages flushed in the current interval. */ - lru_flush_avg = buf_flush_stat_sum.n_flushed - / BUF_FLUSH_STAT_N_INTERVAL - + (buf_lru_flush_page_count - - buf_flush_stat_cur.n_flushed); - - n_flush_req = (n_dirty * redo_avg) / log_capacity; - - /* The number of pages that we want to flush from the flush - list is the difference between the required rate and the - number of pages that we are historically flushing from the - LRU list */ - if (n_flush_req <= lru_flush_avg) { - return(0); - } else { - ib_uint64_t rate; - - rate = n_flush_req - lru_flush_avg; - - return((ulint) (rate < PCT_IO(100) ? 
rate : PCT_IO(100))); - } -} - /*********************************************************************//** Clears up tail of the LRU lists: * Put replaceable pages at the tail of LRU to the free list @@ -2086,36 +2050,35 @@ Clears up tail of the LRU lists: The depth to which we scan each buffer pool is controlled by dynamic config parameter innodb_LRU_scan_depth. @return total pages flushed */ -UNIV_INLINE +UNIV_INTERN ulint -page_cleaner_flush_LRU_tail(void) -/*=============================*/ +buf_flush_LRU_tail(void) +/*====================*/ { - ulint i; - ulint j; ulint total_flushed = 0; - for (i = 0; i < srv_buf_pool_instances; i++) { + for (ulint i = 0; i < srv_buf_pool_instances; i++) { buf_pool_t* buf_pool = buf_pool_from_array(i); /* We divide LRU flush into smaller chunks because there may be user threads waiting for the flush to end in buf_LRU_get_free_block(). */ - for (j = 0; + for (ulint j = 0; j < srv_LRU_scan_depth; j += PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE) { - ulint n_flushed = buf_flush_LRU(buf_pool, - PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE); + ulint n_flushed = 0; /* Currently page_cleaner is the only thread that can trigger an LRU flush. It is possible that a batch triggered during last iteration is still running, */ - if (n_flushed != ULINT_UNDEFINED) { - total_flushed += n_flushed; - } + buf_flush_LRU(buf_pool, + PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE, + &n_flushed); + + total_flushed += n_flushed; } } @@ -2132,14 +2095,12 @@ page_cleaner_flush_LRU_tail(void) /*********************************************************************//** Wait for any possible LRU flushes that are in progress to end. 
*/ -UNIV_INLINE +UNIV_INTERN void -page_cleaner_wait_LRU_flush(void) -/*=============================*/ +buf_flush_wait_LRU_batch_end(void) +/*==============================*/ { - ulint i; - - for (i = 0; i < srv_buf_pool_instances; i++) { + for (ulint i = 0; i < srv_buf_pool_instances; i++) { buf_pool_t* buf_pool; buf_pool = buf_pool_from_array(i); @@ -2166,22 +2127,87 @@ ulint page_cleaner_do_flush_batch( /*========================*/ ulint n_to_flush, /*!< in: number of pages that - we should attempt to flush. If - an lsn_limit is provided then - this value will have no affect */ + we should attempt to flush. */ lsn_t lsn_limit) /*!< in: LSN up to which flushing must happen */ { ulint n_flushed; - ut_ad(n_to_flush == ULINT_MAX || lsn_limit == LSN_MAX); + buf_flush_list(n_to_flush, lsn_limit, &n_flushed); + + return(n_flushed); +} - n_flushed = buf_flush_list(n_to_flush, lsn_limit); - if (n_flushed == ULINT_UNDEFINED) { - n_flushed = 0; +/*********************************************************************//** +Calculates if flushing is required based on number of dirty pages in +the buffer pool. +@return percent of io_capacity to flush to manage dirty page ratio */ +static +ulint +af_get_pct_for_dirty() +/*==================*/ +{ + ulint dirty_pct = buf_get_modified_ratio_pct(); + + ut_a(srv_max_dirty_pages_pct_lwm + <= srv_max_buf_pool_modified_pct); + + if (srv_max_dirty_pages_pct_lwm == 0) { + /* The user has not set the option to preflush dirty + pages as we approach the high water mark. */ + if (dirty_pct > srv_max_buf_pool_modified_pct) { + /* We have crossed the high water mark of dirty + pages In this case we start flushing at 100% of + innodb_io_capacity. */ + return(100); + } + } else if (dirty_pct > srv_max_dirty_pages_pct_lwm) { + /* We should start flushing pages gradually. 
*/ + return((dirty_pct * 100) + / (srv_max_buf_pool_modified_pct + 1)); } - return(n_flushed); + return(0); +} + +/*********************************************************************//** +Calculates if flushing is required based on redo generation rate. +@return percent of io_capacity to flush to manage redo space */ +static +ulint +af_get_pct_for_lsn( +/*===============*/ + lsn_t age) /*!< in: current age of LSN. */ +{ + lsn_t max_async_age; + lsn_t lsn_age_factor; + lsn_t af_lwm = (srv_adaptive_flushing_lwm + * log_get_capacity()) / 100; + + if (age < af_lwm) { + /* No adaptive flushing. */ + return(0); + } + + max_async_age = log_get_max_modified_age_async(); + + if (age < max_async_age && !srv_adaptive_flushing) { + /* We have still not reached the max_async point and + the user has disabled adaptive flushing. */ + return(0); + } + + /* If we are here then we know that either: + 1) User has enabled adaptive flushing + 2) User may have disabled adaptive flushing but we have reached + max_async_age. 
*/ + lsn_age_factor = (age * 100) / max_async_age; + + ut_ad(srv_max_io_capacity >= srv_io_capacity); + return(static_cast<ulint>( + ((srv_max_io_capacity / srv_io_capacity) + * (lsn_age_factor * sqrt((double)lsn_age_factor))) + / 7.5)); } /*********************************************************************//** @@ -2195,78 +2221,103 @@ ulint page_cleaner_flush_pages_if_needed(void) /*====================================*/ { - ulint n_pages_flushed = 0; - lsn_t lsn_limit = log_async_flush_lsn(); + static lsn_t lsn_avg_rate = 0; + static lsn_t prev_lsn = 0; + static lsn_t last_lsn = 0; + static ulint sum_pages = 0; + static ulint last_pages = 0; + static ulint prev_pages = 0; + static ulint avg_page_rate = 0; + static ulint n_iterations = 0; + lsn_t oldest_lsn; + lsn_t cur_lsn; + lsn_t age; + lsn_t lsn_rate; + ulint n_pages = 0; + ulint pct_for_dirty = 0; + ulint pct_for_lsn = 0; + ulint pct_total = 0; + int age_factor = 0; + + cur_lsn = log_get_lsn(); + + if (prev_lsn == 0) { + /* First time around. */ + prev_lsn = cur_lsn; + return(0); + } - /* Currently we decide whether or not to flush and how much to - flush based on three factors. + if (prev_lsn == cur_lsn) { + return(0); + } - 1) If the amount of LSN for which pages are not flushed to disk - yet is greater than log_sys->max_modified_age_async. This is - the most urgent type of flush and we attempt to cleanup enough - of the tail of the flush_list to avoid flushing inside user - threads. + /* We update our variables every srv_flushing_avg_loops + iterations to smooth out transition in workload. */ + if (++n_iterations >= srv_flushing_avg_loops) { - 2) If modified page ratio is greater than the one specified by - the user. In that case we flush full 100% IO_CAPACITY of the - server. Note that 1 and 2 are not mutually exclusive. We can - end up executing both steps. 
+ avg_page_rate = ((sum_pages / srv_flushing_avg_loops) + + avg_page_rate) / 2; - 3) If adaptive_flushing is set by the user and neither of 1 - or 2 has occurred above then we flush a batch based on our - heuristics. */ + /* How much LSN we have generated since last call. */ + lsn_rate = (cur_lsn - prev_lsn) / srv_flushing_avg_loops; - if (lsn_limit != LSN_MAX) { + lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2; - /* async flushing is requested */ - n_pages_flushed = page_cleaner_do_flush_batch(ULINT_MAX, - lsn_limit); + prev_lsn = cur_lsn; - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_FLUSH_ASYNC_TOTAL_PAGE, - MONITOR_FLUSH_ASYNC_COUNT, - MONITOR_FLUSH_ASYNC_PAGES, - n_pages_flushed); + n_iterations = 0; + + sum_pages = 0; } - if (UNIV_UNLIKELY(n_pages_flushed < PCT_IO(100) - && buf_get_modified_ratio_pct() - > srv_max_buf_pool_modified_pct)) { + oldest_lsn = buf_pool_get_oldest_modification(); - /* Try to keep the number of modified pages in the - buffer pool under the limit wished by the user */ + ut_ad(oldest_lsn <= cur_lsn); - n_pages_flushed += page_cleaner_do_flush_batch(PCT_IO(100), - LSN_MAX); + age = cur_lsn - oldest_lsn; - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_FLUSH_MAX_DIRTY_TOTAL_PAGE, - MONITOR_FLUSH_MAX_DIRTY_COUNT, - MONITOR_FLUSH_MAX_DIRTY_PAGES, - n_pages_flushed); + pct_for_dirty = af_get_pct_for_dirty(); + pct_for_lsn = af_get_pct_for_lsn(age); + + pct_total = ut_max(pct_for_dirty, pct_for_lsn); + + /* Cap the maximum IO capacity that we are going to use by + max_io_capacity. */ + n_pages = (PCT_IO(pct_total) + avg_page_rate) / 2; + + if (n_pages > srv_max_io_capacity) { + n_pages = srv_max_io_capacity; } - if (srv_adaptive_flushing && n_pages_flushed == 0) { + if (last_pages && cur_lsn - last_lsn > lsn_avg_rate / 2) { + age_factor = prev_pages / last_pages; + } - /* Try to keep the rate of flushing of dirty - pages such that redo log generation does not - produce bursts of IO at checkpoint time. 
*/ - ulint n_flush = buf_flush_get_desired_flush_rate(); + MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages); - ut_ad(n_flush <= PCT_IO(100)); - if (n_flush) { - n_pages_flushed = page_cleaner_do_flush_batch( - n_flush, LSN_MAX); + prev_pages = n_pages; + n_pages = page_cleaner_do_flush_batch( + n_pages, oldest_lsn + lsn_avg_rate * (age_factor + 1)); - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, - MONITOR_FLUSH_ADAPTIVE_COUNT, - MONITOR_FLUSH_ADAPTIVE_PAGES, - n_pages_flushed); - } + last_lsn= cur_lsn; + last_pages= n_pages + 1; + + MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate); + MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate); + MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty); + MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn); + + if (n_pages) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, + MONITOR_FLUSH_ADAPTIVE_COUNT, + MONITOR_FLUSH_ADAPTIVE_PAGES, + n_pages); + + sum_pages += n_pages; } - return(n_pages_flushed); + return(n_pages); } /*********************************************************************//** @@ -2306,7 +2357,8 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( ulint next_loop_time = ut_time_ms() + 1000; ulint n_flushed = 0; ulint last_activity = srv_get_activity_count(); - ulint i; + + ut_ad(!srv_read_only_mode); #ifdef UNIV_PFS_THREAD pfs_register_thread(buf_page_cleaner_thread_key); @@ -2336,7 +2388,7 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( last_activity = srv_get_activity_count(); /* Flush pages from end of LRU if required */ - n_flushed = page_cleaner_flush_LRU_tail(); + n_flushed = buf_flush_LRU_tail(); /* Flush pages from flush_list if required */ n_flushed += page_cleaner_flush_pages_if_needed(); @@ -2396,19 +2448,21 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( sweep and we'll come out of the loop leaving behind dirty pages in the flush_list */ buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); - page_cleaner_wait_LRU_flush(); + 
buf_flush_wait_LRU_batch_end(); + + bool success; do { - n_flushed = buf_flush_list(PCT_IO(100), LSN_MAX); + success = buf_flush_list(PCT_IO(100), LSN_MAX, &n_flushed); buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); - } while (n_flushed > 0); + } while (!success || n_flushed > 0); /* Some sanity checks */ ut_a(srv_get_active_thread_type() == SRV_NONE); ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE); - for (i = 0; i < srv_buf_pool_instances; i++) { + for (ulint i = 0; i < srv_buf_pool_instances; i++) { buf_pool_t* buf_pool = buf_pool_from_array(i); ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == 0); } @@ -2521,3 +2575,66 @@ buf_flush_validate( } #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ #endif /* !UNIV_HOTBACKUP */ + +#ifdef UNIV_DEBUG +/******************************************************************//** +Check if there are any dirty pages that belong to a space id in the flush +list in a particular buffer pool. +@return number of dirty pages present in a single buffer pool */ +UNIV_INTERN +ulint +buf_pool_get_dirty_pages_count( +/*===========================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool */ + ulint id) /*!< in: space id to check */ + +{ + ulint count = 0; + + buf_pool_mutex_enter(buf_pool); + buf_flush_list_mutex_enter(buf_pool); + + buf_page_t* bpage; + + for (bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); + bpage != 0; + bpage = UT_LIST_GET_NEXT(list, bpage)) { + + ut_ad(buf_page_in_file(bpage)); + ut_ad(bpage->in_flush_list); + ut_ad(bpage->oldest_modification > 0); + + if (buf_page_get_space(bpage) == id) { + ++count; + } + } + + buf_flush_list_mutex_exit(buf_pool); + buf_pool_mutex_exit(buf_pool); + + return(count); +} + +/******************************************************************//** +Check if there are any dirty pages that belong to a space id in the flush list. 
+@return number of dirty pages present in all the buffer pools */ +UNIV_INTERN +ulint +buf_flush_get_dirty_pages_count( +/*============================*/ + ulint id) /*!< in: space id to check */ + +{ + ulint count = 0; + + for (ulint i = 0; i < srv_buf_pool_instances; ++i) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + count += buf_pool_get_dirty_pages_count(buf_pool, id); + } + + return(count); +} +#endif /* UNIV_DEBUG */ diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index c35d84cb985..270263d95f1 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -51,6 +51,9 @@ Created 11/5/1995 Heikki Tuuri #include "log0recv.h" #include "srv0srv.h" #include "srv0mon.h" +#include "lock0lock.h" + +#include "ha_prototypes.h" /** The number of blocks from the LRU_old pointer onward, including the block pointed to, must be buf_pool->LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV @@ -158,6 +161,22 @@ buf_LRU_block_free_hashed_page( be in a state where it can be freed */ /******************************************************************//** +Increases LRU size in bytes with zip_size for compressed page, +UNIV_PAGE_SIZE for uncompressed page in inline function */ +static inline +void +incr_LRU_size_in_bytes( +/*===================*/ + buf_page_t* bpage, /*!< in: control block */ + buf_pool_t* buf_pool) /*!< in: buffer pool instance */ +{ + ut_ad(buf_pool_mutex_own(buf_pool)); + ulint zip_size = page_zip_get_size(&bpage->zip); + buf_pool->stat.LRU_bytes += zip_size ? zip_size : UNIV_PAGE_SIZE; + ut_ad(buf_pool->stat.LRU_bytes <= buf_pool->curr_pool_size); +} + +/******************************************************************//** Determines if the unzip_LRU list should be used for evicting a victim instead of the general LRU list. 
@return TRUE if should use unzip_LRU */ @@ -342,39 +361,338 @@ next_page: } /******************************************************************//** +While flushing (or removing dirty) pages from a tablespace we don't +want to hog the CPU and resources. Release the buffer pool and block +mutex and try to force a context switch. Then reacquire the same mutexes. +The current page is "fixed" before the release of the mutexes and then +"unfixed" again once we have reacquired the mutexes. */ +static __attribute__((nonnull)) +void +buf_flush_yield( +/*============*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ + buf_page_t* bpage) /*!< in/out: current page */ +{ + ib_mutex_t* block_mutex; + + ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(buf_page_in_file(bpage)); + + block_mutex = buf_page_get_mutex(bpage); + + mutex_enter(block_mutex); + /* "Fix" the block so that the position cannot be + changed after we release the buffer pool and + block mutexes. */ + buf_page_set_sticky(bpage); + + /* Now it is safe to release the buf_pool->mutex. */ + buf_pool_mutex_exit(buf_pool); + + mutex_exit(block_mutex); + /* Try and force a context switch. */ + os_thread_yield(); + + buf_pool_mutex_enter(buf_pool); + + mutex_enter(block_mutex); + /* "Unfix" the block now that we have both the + buffer pool and block mutex again. */ + buf_page_unset_sticky(bpage); + mutex_exit(block_mutex); +} + +/******************************************************************//** +If we have hogged the resources for too long then release the buffer +pool and flush list mutex and do a thread yield. Set the current page +to "sticky" so that it is not relocated during the yield. 
+@return true if yielded */ +static __attribute__((nonnull(1), warn_unused_result)) +bool +buf_flush_try_yield( +/*================*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ + buf_page_t* bpage, /*!< in/out: bpage to remove */ + ulint processed) /*!< in: number of pages processed */ +{ + /* Every BUF_LRU_DROP_SEARCH_SIZE iterations in the + loop we release buf_pool->mutex to let other threads + do their job but only if the block is not IO fixed. This + ensures that the block stays in its position in the + flush_list. */ + + if (bpage != NULL + && processed >= BUF_LRU_DROP_SEARCH_SIZE + && buf_page_get_io_fix(bpage) == BUF_IO_NONE) { + + buf_flush_list_mutex_exit(buf_pool); + + /* Release the buffer pool and block mutex + to give the other threads a go. */ + + buf_flush_yield(buf_pool, bpage); + + buf_flush_list_mutex_enter(buf_pool); + + /* Should not have been removed from the flush + list during the yield. However, this check is + not sufficient to catch a remove -> add. */ + + ut_ad(bpage->in_flush_list); + + return(true); + } + + return(false); +} + +/******************************************************************//** +Removes a single page from a given tablespace inside a specific +buffer pool instance. +@return true if page was removed. */ +static __attribute__((nonnull, warn_unused_result)) +bool +buf_flush_or_remove_page( +/*=====================*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ + buf_page_t* bpage, /*!< in/out: bpage to remove */ + bool flush) /*!< in: flush to disk if true but + don't remove else remove without + flushing to disk */ +{ + ib_mutex_t* block_mutex; + bool processed = false; + + ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(buf_flush_list_mutex_own(buf_pool)); + + block_mutex = buf_page_get_mutex(bpage); + + /* bpage->space and bpage->io_fix are protected by + buf_pool->mutex and block_mutex. It is safe to check + them while holding buf_pool->mutex only. 
*/ + + if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) { + + /* We cannot remove this page during this scan + yet; maybe the system is currently reading it + in, or flushing the modifications to the file */ + + } else { + + /* We have to release the flush_list_mutex to obey the + latching order. We are however guaranteed that the page + will stay in the flush_list because buf_flush_remove() + needs buf_pool->mutex as well (for the non-flush case). */ + + buf_flush_list_mutex_exit(buf_pool); + + mutex_enter(block_mutex); + + ut_ad(bpage->oldest_modification != 0); + + if (bpage->buf_fix_count > 0) { + + mutex_exit(block_mutex); + + /* We cannot remove this page yet; + maybe the system is currently reading + it in, or flushing the modifications + to the file */ + + } else if (!flush) { + + buf_flush_remove(bpage); + + mutex_exit(block_mutex); + + processed = true; + + } else if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) { + + /* Check the status again after releasing the flush + list mutex and acquiring the block mutex. The background + flush thread may be in the process of flushing this + page when we released the flush list mutex. */ + + /* The following call will release the buffer pool + and block mutex. */ + buf_flush_page(buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE); + + /* Wake possible simulated aio thread to actually + post the writes to the operating system */ + os_aio_simulated_wake_handler_threads(); + + buf_pool_mutex_enter(buf_pool); + + processed = true; + } else { + mutex_exit(block_mutex); + } + + buf_flush_list_mutex_enter(buf_pool); + } + + ut_ad(!mutex_own(block_mutex)); + + return(processed); +} + +/******************************************************************//** Remove all dirty pages belonging to a given tablespace inside a specific buffer pool instance when we are deleting the data file(s) of that tablespace. The pages still remain a part of LRU and are evicted from -the list as they age towards the tail of the LRU. 
*/ -static +the list as they age towards the tail of the LRU. +@retval DB_SUCCESS if all freed +@retval DB_FAIL if not all freed +@retval DB_INTERRUPTED if the transaction was interrupted */ +static __attribute__((nonnull(1), warn_unused_result)) +dberr_t +buf_flush_or_remove_pages( +/*======================*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + ulint id, /*!< in: target space id for which + to remove or flush pages */ + bool flush, /*!< in: flush to disk if true but + don't remove else remove without + flushing to disk */ + const trx_t* trx) /*!< to check if the operation must + be interrupted, can be 0 */ +{ + buf_page_t* prev; + buf_page_t* bpage; + ulint processed = 0; + bool all_freed = true; + + buf_flush_list_mutex_enter(buf_pool); + + for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list); + bpage != NULL; + bpage = prev) { + + ut_a(buf_page_in_file(bpage)); + + /* Save the previous link because once we free the + page we can't rely on the links. */ + + prev = UT_LIST_GET_PREV(list, bpage); + + if (buf_page_get_space(bpage) != id) { + + /* Skip this block, as it does not belong to + the target space. */ + + } else if (!buf_flush_or_remove_page(buf_pool, bpage, flush)) { + + /* Remove was unsuccessful, we have to try again + by scanning the entire list from the end. */ + + all_freed = false; + } + + ++processed; + + /* Yield if we have hogged the CPU and mutexes for too long. */ + if (buf_flush_try_yield(buf_pool, prev, processed)) { + + /* Reset the batch size counter if we had to yield. */ + + processed = 0; + } + +#ifdef DBUG_OFF + if (flush) { + DBUG_EXECUTE_IF("ib_export_flush_crash", + static ulint n_pages; + if (++n_pages == 4) {DBUG_SUICIDE();}); + } +#endif /* DBUG_OFF */ + + /* The check for trx is interrupted is expensive, we want + to check every N iterations. 
*/ + if (!processed && trx && trx_is_interrupted(trx)) { + buf_flush_list_mutex_exit(buf_pool); + return(DB_INTERRUPTED); + } + } + + buf_flush_list_mutex_exit(buf_pool); + + return(all_freed ? DB_SUCCESS : DB_FAIL); +} + +/******************************************************************//** +Remove or flush all the dirty pages that belong to a given tablespace +inside a specific buffer pool instance. The pages will remain in the LRU +list and will be evicted from the LRU list as they age and move towards +the tail of the LRU list. */ +static __attribute__((nonnull(1))) void -buf_LRU_remove_dirty_pages_for_tablespace( -/*======================================*/ +buf_flush_dirty_pages( +/*==================*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + ulint id, /*!< in: space id */ + bool flush, /*!< in: flush to disk if true otherwise + remove the pages without flushing */ + const trx_t* trx) /*!< to check if the operation must + be interrupted */ +{ + dberr_t err; + + do { + buf_pool_mutex_enter(buf_pool); + + err = buf_flush_or_remove_pages(buf_pool, id, flush, trx); + + buf_pool_mutex_exit(buf_pool); + + ut_ad(buf_flush_validate(buf_pool)); + + if (err == DB_FAIL) { + os_thread_sleep(20000); + } + + /* DB_FAIL is a soft error, it means that the task wasn't + completed, needs to be retried. */ + + ut_ad(buf_flush_validate(buf_pool)); + + } while (err == DB_FAIL); +} + +/******************************************************************//** +Remove all pages that belong to a given tablespace inside a specific +buffer pool instance when we are DISCARDing the tablespace. 
*/ +static __attribute__((nonnull)) +void +buf_LRU_remove_all_pages( +/*=====================*/ buf_pool_t* buf_pool, /*!< buffer pool instance */ ulint id) /*!< in: space id */ { buf_page_t* bpage; ibool all_freed; - ulint i; scan_again: buf_pool_mutex_enter(buf_pool); - buf_flush_list_mutex_enter(buf_pool); all_freed = TRUE; - for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list), i = 0; - bpage != NULL; ++i) { + for (bpage = UT_LIST_GET_LAST(buf_pool->LRU); + bpage != NULL; + /* No op */) { + rw_lock_t* hash_lock; buf_page_t* prev_bpage; - mutex_t* block_mutex = NULL; + ib_mutex_t* block_mutex = NULL; ut_a(buf_page_in_file(bpage)); + ut_ad(bpage->in_LRU_list); - prev_bpage = UT_LIST_GET_PREV(list, bpage); + prev_bpage = UT_LIST_GET_PREV(LRU, bpage); /* bpage->space and bpage->io_fix are protected by - buf_pool->mutex and block_mutex. It is safe to check + buf_pool->mutex and the block_mutex. It is safe to check them while holding buf_pool->mutex only. */ if (buf_page_get_space(bpage) != id) { @@ -388,83 +706,103 @@ scan_again: all_freed = FALSE; goto next_page; - } + } else { + ulint fold = buf_page_address_fold( + bpage->space, bpage->offset); - /* We have to release the flush_list_mutex to obey the - latching order. We are however guaranteed that the page - will stay in the flush_list because buf_flush_remove() - needs buf_pool->mutex as well. 
*/ - buf_flush_list_mutex_exit(buf_pool); - block_mutex = buf_page_get_mutex(bpage); - mutex_enter(block_mutex); + hash_lock = buf_page_hash_lock_get(buf_pool, fold); - if (bpage->buf_fix_count > 0) { - mutex_exit(block_mutex); - buf_flush_list_mutex_enter(buf_pool); + rw_lock_x_lock(hash_lock); - /* We cannot remove this page during - this scan yet; maybe the system is - currently reading it in, or flushing - the modifications to the file */ + block_mutex = buf_page_get_mutex(bpage); + mutex_enter(block_mutex); - all_freed = FALSE; - goto next_page; - } + if (bpage->buf_fix_count > 0) { - ut_ad(bpage->oldest_modification != 0); + mutex_exit(block_mutex); - buf_flush_remove(bpage); + rw_lock_x_unlock(hash_lock); - mutex_exit(block_mutex); - buf_flush_list_mutex_enter(buf_pool); -next_page: - bpage = prev_bpage; + /* We cannot remove this page during + this scan yet; maybe the system is + currently reading it in, or flushing + the modifications to the file */ - if (!bpage) { - break; + all_freed = FALSE; + + goto next_page; + } } - /* Every BUF_LRU_DROP_SEARCH_SIZE iterations in the - loop we release buf_pool->mutex to let other threads - do their job. */ - if (i < BUF_LRU_DROP_SEARCH_SIZE) { - continue; + ut_ad(mutex_own(block_mutex)); + +#ifdef UNIV_DEBUG + if (buf_debug_prints) { + fprintf(stderr, + "Dropping space %lu page %lu\n", + (ulong) buf_page_get_space(bpage), + (ulong) buf_page_get_page_no(bpage)); } +#endif + if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { + /* Do nothing, because the adaptive hash index + covers uncompressed pages only. */ + } else if (((buf_block_t*) bpage)->index) { + ulint page_no; + ulint zip_size; - /* We IO-fix the block to make sure that the block - stays in its position in the flush_list. */ - if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) { - /* Block is already IO-fixed. We don't - want to change the value. Lets leave - this block alone. 
*/ - continue; + buf_pool_mutex_exit(buf_pool); + + zip_size = buf_page_get_zip_size(bpage); + page_no = buf_page_get_page_no(bpage); + + rw_lock_x_unlock(hash_lock); + + mutex_exit(block_mutex); + + /* Note that the following call will acquire + and release block->lock X-latch. */ + + btr_search_drop_page_hash_when_freed( + id, zip_size, page_no); + + goto scan_again; } - buf_flush_list_mutex_exit(buf_pool); - block_mutex = buf_page_get_mutex(bpage); - mutex_enter(block_mutex); - buf_page_set_sticky(bpage); - mutex_exit(block_mutex); + if (bpage->oldest_modification != 0) { - /* Now it is safe to release the buf_pool->mutex. */ - buf_pool_mutex_exit(buf_pool); - os_thread_yield(); - buf_pool_mutex_enter(buf_pool); + buf_flush_remove(bpage); + } - mutex_enter(block_mutex); - buf_page_unset_sticky(bpage); - mutex_exit(block_mutex); + ut_ad(!bpage->in_flush_list); - buf_flush_list_mutex_enter(buf_pool); - ut_ad(bpage->in_flush_list); + /* Remove from the LRU list. */ - i = 0; + if (buf_LRU_block_remove_hashed_page(bpage, TRUE) + != BUF_BLOCK_ZIP_FREE) { + + buf_LRU_block_free_hashed_page((buf_block_t*) bpage); + + } else { + /* The block_mutex should have been released + by buf_LRU_block_remove_hashed_page() when it + returns BUF_BLOCK_ZIP_FREE. */ + ut_ad(block_mutex == &buf_pool->zip_mutex); + } + + ut_ad(!mutex_own(block_mutex)); + +#ifdef UNIV_SYNC_DEBUG + /* buf_LRU_block_remove_hashed_page() releases the hash_lock */ + ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX)); + ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + +next_page: + bpage = prev_bpage; } buf_pool_mutex_exit(buf_pool); - buf_flush_list_mutex_exit(buf_pool); - - ut_ad(buf_flush_validate(buf_pool)); if (!all_freed) { os_thread_sleep(20000); @@ -474,15 +812,60 @@ next_page: } /******************************************************************//** -Invalidates all pages belonging to a given tablespace when we are deleting -the data file(s) of that tablespace. 
*/ +Remove pages belonging to a given tablespace inside a specific +buffer pool instance when we are deleting the data file(s) of that +tablespace. The pages still remain a part of LRU and are evicted from +the list as they age towards the tail of the LRU only if buf_remove +is BUF_REMOVE_FLUSH_NO_WRITE. */ +static __attribute__((nonnull(1))) +void +buf_LRU_remove_pages( +/*=================*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + ulint id, /*!< in: space id */ + buf_remove_t buf_remove, /*!< in: remove or flush strategy */ + const trx_t* trx) /*!< to check if the operation must + be interrupted */ +{ + switch (buf_remove) { + case BUF_REMOVE_ALL_NO_WRITE: + buf_LRU_remove_all_pages(buf_pool, id); + break; + + case BUF_REMOVE_FLUSH_NO_WRITE: + ut_a(trx == 0); + buf_flush_dirty_pages(buf_pool, id, false, NULL); + ut_ad(trx_is_interrupted(trx) + || buf_pool_get_dirty_pages_count(buf_pool, id) == 0); + break; + + case BUF_REMOVE_FLUSH_WRITE: + ut_a(trx != 0); + buf_flush_dirty_pages(buf_pool, id, true, trx); + ut_ad(trx_is_interrupted(trx) + || buf_pool_get_dirty_pages_count(buf_pool, id) == 0); + /* Ensure that all asynchronous IO is completed. */ + os_aio_wait_until_no_pending_writes(); + fil_flush(id); + break; + } +} + +/******************************************************************//** +Flushes all dirty pages or removes all pages belonging +to a given tablespace. A PROBLEM: if readahead is being started, what +guarantees that it will not try to read in pages after this operation +has completed? 
*/ UNIV_INTERN void -buf_LRU_invalidate_tablespace( +buf_LRU_flush_or_remove_pages( /*==========================*/ - ulint id) /*!< in: space id */ + ulint id, /*!< in: space id */ + buf_remove_t buf_remove, /*!< in: remove or flush strategy */ + const trx_t* trx) /*!< to check if the operation must + be interrupted */ { - ulint i; + ulint i; /* Before we attempt to drop pages one by one we first attempt to drop page hash index entries in batches to make @@ -494,9 +877,28 @@ buf_LRU_invalidate_tablespace( buf_pool_t* buf_pool; buf_pool = buf_pool_from_array(i); - buf_LRU_drop_page_hash_for_tablespace(buf_pool, id); - buf_LRU_remove_dirty_pages_for_tablespace(buf_pool, id); + + switch (buf_remove) { + case BUF_REMOVE_ALL_NO_WRITE: + case BUF_REMOVE_FLUSH_NO_WRITE: + buf_LRU_drop_page_hash_for_tablespace(buf_pool, id); + break; + + case BUF_REMOVE_FLUSH_WRITE: + /* We allow read-only queries against the + table, there is no need to drop the AHI entries. */ + break; + } + + buf_LRU_remove_pages(buf_pool, id, buf_remove, trx); } + +#ifdef UNIV_DEBUG + if (trx != 0 && id != 0) { + ut_ad(trx_is_interrupted(trx) + || buf_flush_get_dirty_pages_count(id) == 0); + } +#endif /* UNIV_DEBUG */ } #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG @@ -790,7 +1192,7 @@ buf_LRU_check_size_of_non_data_objects( buf_lru_switched_on_innodb_mon = TRUE; srv_print_innodb_monitor = TRUE; - os_event_set(srv_timeout_event); + os_event_set(lock_sys->timeout_event); } } else if (buf_lru_switched_on_innodb_mon) { @@ -938,7 +1340,7 @@ loop: mon_value_was = srv_print_innodb_monitor; started_monitor = TRUE; srv_print_innodb_monitor = TRUE; - os_event_set(srv_timeout_event); + os_event_set(lock_sys->timeout_event); } /* If we have scanned the whole LRU and still are unable to @@ -965,7 +1367,7 @@ loop: ++flush_failures; } - ++srv_buf_pool_wait_free; + srv_stats.buf_pool_wait_free.add(n_iterations, 1); n_iterations++; @@ -1107,6 +1509,7 @@ buf_LRU_remove_block( buf_page_t* bpage) /*!< in: control 
block */ { buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ulint zip_size; ut_ad(buf_pool); ut_ad(bpage); @@ -1142,6 +1545,9 @@ buf_LRU_remove_block( UT_LIST_REMOVE(LRU, buf_pool->LRU, bpage); ut_d(bpage->in_LRU_list = FALSE); + zip_size = page_zip_get_size(&bpage->zip); + buf_pool->stat.LRU_bytes -= zip_size ? zip_size : UNIV_PAGE_SIZE; + buf_unzip_LRU_remove_block_if_needed(bpage); /* If the LRU list is so short that LRU_old is not defined, @@ -1202,7 +1608,10 @@ buf_unzip_LRU_add_block( } /******************************************************************//** -Adds a block to the LRU list end. */ +Adds a block to the LRU list end. Please make sure that the zip_size is +already set into the page zip when invoking the function, so that we +can get correct zip_size from the buffer page when adding a block +into LRU */ UNIV_INLINE void buf_LRU_add_block_to_end_low( @@ -1221,6 +1630,8 @@ buf_LRU_add_block_to_end_low( UT_LIST_ADD_LAST(LRU, buf_pool->LRU, bpage); ut_d(bpage->in_LRU_list = TRUE); + incr_LRU_size_in_bytes(bpage, buf_pool); + if (UT_LIST_GET_LEN(buf_pool->LRU) > BUF_LRU_OLD_MIN_LEN) { ut_ad(buf_pool->LRU_old); @@ -1249,7 +1660,10 @@ buf_LRU_add_block_to_end_low( } /******************************************************************//** -Adds a block to the LRU list. */ +Adds a block to the LRU list. Please make sure that the zip_size is +already set into the page zip when invoking the function, so that we +can get correct zip_size from the buffer page when adding a block +into LRU */ UNIV_INLINE void buf_LRU_add_block_low( @@ -1291,6 +1705,8 @@ buf_LRU_add_block_low( ut_d(bpage->in_LRU_list = TRUE); + incr_LRU_size_in_bytes(bpage, buf_pool); + if (UT_LIST_GET_LEN(buf_pool->LRU) > BUF_LRU_OLD_MIN_LEN) { ut_ad(buf_pool->LRU_old); @@ -1318,7 +1734,10 @@ buf_LRU_add_block_low( } /******************************************************************//** -Adds a block to the LRU list. */ +Adds a block to the LRU list. 
Please make sure that the zip_size is +already set into the page zip when invoking the function, so that we +can get correct zip_size from the buffer page when adding a block +into LRU */ UNIV_INTERN void buf_LRU_add_block( @@ -1391,7 +1810,7 @@ buf_LRU_free_block( bpage->offset); rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold); - mutex_t* block_mutex = buf_page_get_mutex(bpage); + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(buf_page_in_file(bpage)); @@ -1540,6 +1959,8 @@ func_exit: UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, prev_b, b); + incr_LRU_size_in_bytes(b, buf_pool); + if (buf_page_is_old(b)) { buf_pool->LRU_old_len++; if (UNIV_UNLIKELY @@ -1995,24 +2416,28 @@ buf_LRU_free_one_page( be in a state where it can be freed; there may or may not be a hash index to the page */ { -#ifdef UNIV_DEBUG buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); -#endif - mutex_t* block_mutex = buf_page_get_mutex(bpage); + const ulint fold = buf_page_address_fold(bpage->space, + bpage->offset); + rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold); + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); ut_ad(buf_pool_mutex_own(buf_pool)); - ut_ad(mutex_own(block_mutex)); + + rw_lock_x_lock(hash_lock); + mutex_enter(block_mutex); if (buf_LRU_block_remove_hashed_page(bpage, TRUE) != BUF_BLOCK_ZIP_FREE) { buf_LRU_block_free_hashed_page((buf_block_t*) bpage); - } else { - /* The block_mutex should have been released by - buf_LRU_block_remove_hashed_page() when it returns - BUF_BLOCK_ZIP_FREE. 
*/ - ut_ad(block_mutex == &buf_pool->zip_mutex); - mutex_enter(block_mutex); } + + /* buf_LRU_block_remove_hashed_page() releases hash_lock and block_mutex */ +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX) + && !rw_lock_own(hash_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(!mutex_own(block_mutex)); } /**********************************************************************//** diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc index 227cb083725..3a579e251ff 100644 --- a/storage/innobase/buf/buf0rea.cc +++ b/storage/innobase/buf/buf0rea.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -61,7 +61,7 @@ buf_read_page_handle_error( buf_page_t* bpage) /*!< in: pointer to the block */ { buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - const ibool uncompressed = (buf_page_get_state(bpage) + const bool uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); /* First unfix and release lock on the bpage */ @@ -79,13 +79,14 @@ buf_read_page_handle_error( BUF_IO_READ); } + mutex_exit(buf_page_get_mutex(bpage)); + /* remove the block from LRU list */ buf_LRU_free_one_page(bpage); ut_ad(buf_pool->n_pend_reads > 0); buf_pool->n_pend_reads--; - mutex_exit(buf_page_get_mutex(bpage)); buf_pool_mutex_exit(buf_pool); } @@ -103,7 +104,7 @@ static ulint buf_read_page_low( /*==============*/ - ulint* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are + dberr_t* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are trying to read from a non-existent tablespace, or a tablespace which is just now being dropped */ ibool sync, /*!< in: TRUE if 
synchronous aio is desired */ @@ -192,13 +193,9 @@ buf_read_page_low( } thd_wait_end(NULL); - if (*err == DB_TABLESPACE_DELETED) { - buf_read_page_handle_error(bpage); - return(0); - } - if (*err != DB_SUCCESS) { - if (ignore_nonexistent_pages) { + if (ignore_nonexistent_pages || *err == DB_TABLESPACE_DELETED) { + buf_read_page_handle_error(bpage); return(0); } /* else */ @@ -248,7 +245,7 @@ buf_read_ahead_random( ulint ibuf_mode; ulint count; ulint low, high; - ulint err; + dberr_t err; ulint i; const ulint buf_read_ahead_random_area = BUF_READ_AHEAD_AREA(buf_pool); @@ -377,7 +374,7 @@ read_ahead: buf_LRU_stat_inc_io(); buf_pool->stat.n_ra_pages_read_rnd += count; - srv_buf_pool_reads += count; + srv_stats.buf_pool_reads.add(count); return(count); } @@ -397,7 +394,7 @@ buf_read_page( { ib_int64_t tablespace_version; ulint count; - ulint err; + dberr_t err; tablespace_version = fil_space_get_version(space); @@ -407,7 +404,7 @@ buf_read_page( count = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space, zip_size, FALSE, tablespace_version, offset); - srv_buf_pool_reads += count; + srv_stats.buf_pool_reads.add(count); if (err == DB_TABLESPACE_DELETED) { ut_print_timestamp(stderr); fprintf(stderr, @@ -440,7 +437,7 @@ buf_read_page_async( ulint zip_size; ib_int64_t tablespace_version; ulint count; - ulint err; + dberr_t err; zip_size = fil_space_get_zip_size(space); @@ -455,7 +452,7 @@ buf_read_page_async( | BUF_READ_IGNORE_NONEXISTENT_PAGES, space, zip_size, FALSE, tablespace_version, offset); - srv_buf_pool_reads += count; + srv_stats.buf_pool_reads.add(count); /* We do not increment number of I/O operations used for LRU policy here (buf_LRU_stat_inc_io()). 
We use this in heuristics to decide @@ -513,7 +510,7 @@ buf_read_ahead_linear( ulint fail_count; ulint ibuf_mode; ulint low, high; - ulint err; + dberr_t err; ulint i; const ulint buf_read_ahead_linear_area = BUF_READ_AHEAD_AREA(buf_pool); @@ -784,7 +781,7 @@ buf_read_ibuf_merge_pages( #endif for (i = 0; i < n_stored; i++) { - ulint err; + dberr_t err; buf_pool_t* buf_pool; ulint zip_size = fil_space_get_zip_size(space_ids[i]); @@ -850,7 +847,7 @@ buf_read_recv_pages( { ib_int64_t tablespace_version; ulint count; - ulint err; + dberr_t err; ulint i; zip_size = fil_space_get_zip_size(space); diff --git a/storage/innobase/dict/dict0boot.cc b/storage/innobase/dict/dict0boot.cc index 8e305364ac8..eea10759fcd 100644 --- a/storage/innobase/dict/dict0boot.cc +++ b/storage/innobase/dict/dict0boot.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -241,9 +241,10 @@ dict_hdr_create( /*****************************************************************//** Initializes the data dictionary memory structures when the database is -started. This function is also called when the data dictionary is created. */ +started. This function is also called when the data dictionary is created. +@return DB_SUCCESS or error code. */ UNIV_INTERN -void +dberr_t dict_boot(void) /*===========*/ { @@ -252,7 +253,7 @@ dict_boot(void) dict_hdr_t* dict_hdr; mem_heap_t* heap; mtr_t mtr; - ulint error; + dberr_t error; /* Be sure these constants do not ever change. 
To avoid bloat, only check the *NUM_FIELDS* in each table */ @@ -307,9 +308,7 @@ dict_boot(void) dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 0); /* ROW_FORMAT = (N_COLS >> 31) ? COMPACT : REDUNDANT */ dict_mem_table_add_col(table, heap, "N_COLS", DATA_INT, 0, 4); - /* If the format is UNIV_FORMAT_A, table->flags == 0, and - TYPE == 1, which is defined as SYS_TABLE_TYPE_ANTELOPE. - The low order bit of TYPE is always set to 1. If the format + /* The low order bit of TYPE is always set to 1. If the format is UNIV_FORMAT_B or higher, this field matches table->flags. */ dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4); dict_mem_table_add_col(table, heap, "MIX_ID", DATA_BINARY, 0, 0); @@ -454,14 +453,27 @@ dict_boot(void) ibuf_init_at_db_start(); - /* Load definitions of other indexes on system tables */ + dberr_t err = DB_SUCCESS; + + if (srv_read_only_mode && !ibuf_is_empty()) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Change buffer must be empty when --innodb-read-only " + "is set!"); - dict_load_sys_table(dict_sys->sys_tables); - dict_load_sys_table(dict_sys->sys_columns); - dict_load_sys_table(dict_sys->sys_indexes); - dict_load_sys_table(dict_sys->sys_fields); + err = DB_ERROR; + } else { + /* Load definitions of other indexes on system tables */ + + dict_load_sys_table(dict_sys->sys_tables); + dict_load_sys_table(dict_sys->sys_columns); + dict_load_sys_table(dict_sys->sys_indexes); + dict_load_sys_table(dict_sys->sys_fields); + } mutex_exit(&(dict_sys->mutex)); + + return(err); } /*****************************************************************//** @@ -476,9 +488,10 @@ dict_insert_initial_data(void) } /*****************************************************************//** -Creates and initializes the data dictionary at the database creation. */ +Creates and initializes the data dictionary at the server bootstrap. +@return DB_SUCCESS or error code. 
*/ UNIV_INTERN -void +dberr_t dict_create(void) /*=============*/ { @@ -490,7 +503,11 @@ dict_create(void) mtr_commit(&mtr); - dict_boot(); + dberr_t err = dict_boot(); + + if (err == DB_SUCCESS) { + dict_insert_initial_data(); + } - dict_insert_initial_data(); + return(err); } diff --git a/storage/innobase/dict/dict0crea.cc b/storage/innobase/dict/dict0crea.cc index d58b304ab92..864150b324a 100644 --- a/storage/innobase/dict/dict0crea.cc +++ b/storage/innobase/dict/dict0crea.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -43,6 +43,7 @@ Created 1/8/1996 Heikki Tuuri #include "usr0sess.h" #include "ut0vec.h" #include "dict0priv.h" +#include "fts0priv.h" /*****************************************************************//** Based on a table object, this function builds the entry to be inserted @@ -244,8 +245,8 @@ dict_create_sys_columns_tuple( /***************************************************************//** Builds a table definition to insert. 
@return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t dict_build_table_def_step( /*======================*/ que_thr_t* thr, /*!< in: query thread */ @@ -253,9 +254,8 @@ dict_build_table_def_step( { dict_table_t* table; dtuple_t* row; - ulint error; - const char* path_or_name; - ibool is_path; + dberr_t error; + const char* path; mtr_t mtr; ulint space = 0; bool use_tablespace; @@ -263,7 +263,7 @@ dict_build_table_def_step( ut_ad(mutex_own(&(dict_sys->mutex))); table = node->table; - use_tablespace = !!(table->flags2 & DICT_TF2_USE_TABLESPACE); + use_tablespace = DICT_TF2_FLAG_IS_SET(table, DICT_TF2_USE_TABLESPACE); dict_hdr_get_new_id(&table->id, NULL, NULL); @@ -274,6 +274,11 @@ dict_build_table_def_step( Get a new space id. */ dict_hdr_get_new_id(NULL, NULL, &space); + DBUG_EXECUTE_IF( + "ib_create_table_fail_out_of_space_ids", + space = ULINT_UNDEFINED; + ); + if (UNIV_UNLIKELY(space == ULINT_UNDEFINED)) { return(DB_ERROR); } @@ -286,26 +291,19 @@ dict_build_table_def_step( - page 3 will contain the root of the clustered index of the table we create here. */ - if (table->dir_path_of_temp_table) { - /* We place tables created with CREATE TEMPORARY - TABLE in the tmp dir of mysqld server */ - - path_or_name = table->dir_path_of_temp_table; - is_path = TRUE; - } else { - path_or_name = table->name; - is_path = FALSE; - } + path = table->data_dir_path ? 
table->data_dir_path + : table->dir_path_of_temp_table; ut_ad(dict_table_get_format(table) <= UNIV_FORMAT_MAX); ut_ad(!dict_table_zip_size(table) || dict_table_get_format(table) >= UNIV_FORMAT_B); error = fil_create_new_single_table_tablespace( - space, path_or_name, is_path, + space, table->name, path, dict_tf_to_fsp_flags(table->flags), table->flags2, FIL_IBD_FILE_INITIAL_SIZE); + table->space = (unsigned int) space; if (error != DB_SUCCESS) { @@ -333,10 +331,9 @@ dict_build_table_def_step( } /***************************************************************//** -Builds a column definition to insert. -@return DB_SUCCESS */ +Builds a column definition to insert. */ static -ulint +void dict_build_col_def_step( /*====================*/ tab_node_t* node) /*!< in: table create node */ @@ -346,8 +343,6 @@ dict_build_col_def_step( row = dict_create_sys_columns_tuple(node->table, node->col_no, node->heap); ins_node_set_new_row(node->col_def, row); - - return(DB_SUCCESS); } /*****************************************************************//** @@ -571,8 +566,8 @@ dict_create_search_tuple( /***************************************************************//** Builds an index definition row to insert. @return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t dict_build_index_def_step( /*======================*/ que_thr_t* thr, /*!< in: query thread */ @@ -595,7 +590,10 @@ dict_build_index_def_step( return(DB_TABLE_NOT_FOUND); } - trx->table_id = table->id; + if (!trx->table_id) { + /* Record only the first table id. */ + trx->table_id = table->id; + } node->table = table; @@ -616,15 +614,16 @@ dict_build_index_def_step( /* Note that the index was created by this transaction. */ index->trx_id = trx->id; + ut_ad(table->def_trx_id <= trx->id); + table->def_trx_id = trx->id; return(DB_SUCCESS); } /***************************************************************//** -Builds a field definition row to insert. 
-@return DB_SUCCESS */ +Builds a field definition row to insert. */ static -ulint +void dict_build_field_def_step( /*======================*/ ind_node_t* node) /*!< in: index create node */ @@ -637,15 +636,13 @@ dict_build_field_def_step( row = dict_create_sys_fields_tuple(index, node->field_no, node->heap); ins_node_set_new_row(node->field_def, row); - - return(DB_SUCCESS); } /***************************************************************//** Creates an index tree for the index if it is not a member of a cluster. @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t dict_create_index_tree_step( /*========================*/ ind_node_t* node) /*!< in: index create node */ @@ -653,7 +650,6 @@ dict_create_index_tree_step( dict_index_t* index; dict_table_t* sys_indexes; dtuple_t* search_tuple; - ulint zip_size; btr_pcur_t pcur; mtr_t mtr; @@ -682,25 +678,37 @@ dict_create_index_tree_step( btr_pcur_move_to_next_user_rec(&pcur, &mtr); - zip_size = dict_table_zip_size(index->table); - node->page_no = btr_create(index->type, index->space, zip_size, - index->id, index, &mtr); - /* printf("Created a new index tree in space %lu root page %lu\n", - index->space, node->page_no); */ + dberr_t err = DB_SUCCESS; + ulint zip_size = dict_table_zip_size(index->table); - page_rec_write_field(btr_pcur_get_rec(&pcur), - DICT_FLD__SYS_INDEXES__PAGE_NO, - node->page_no, &mtr); - btr_pcur_close(&pcur); - mtr_commit(&mtr); + if (node->index->table->ibd_file_missing + || dict_table_is_discarded(node->index->table)) { + + node->page_no = FIL_NULL; + } else { + node->page_no = btr_create( + index->type, index->space, zip_size, + index->id, index, &mtr); - if (node->page_no == FIL_NULL) { + if (node->page_no == FIL_NULL) { + err = DB_OUT_OF_FILE_SPACE; + } - return(DB_OUT_OF_FILE_SPACE); + DBUG_EXECUTE_IF("ib_import_create_index_failure_1", + node->page_no = FIL_NULL; + err = DB_OUT_OF_FILE_SPACE; ); } - return(DB_SUCCESS); + 
page_rec_write_field( + btr_pcur_get_rec(&pcur), DICT_FLD__SYS_INDEXES__PAGE_NO, + node->page_no, &mtr); + + btr_pcur_close(&pcur); + + mtr_commit(&mtr); + + return(err); } /*******************************************************************//** @@ -883,7 +891,7 @@ create: for (index = UT_LIST_GET_FIRST(table->indexes); index; index = UT_LIST_GET_NEXT(indexes, index)) { - if (index->id == index_id) { + if (index->id == index_id && !(index->type & DICT_FTS)) { root_page_no = btr_create(type, space, zip_size, index_id, index, mtr); index->page = (unsigned int) root_page_no; @@ -910,7 +918,9 @@ tab_create_graph_create( /*====================*/ dict_table_t* table, /*!< in: table to create, built as a memory data structure */ - mem_heap_t* heap) /*!< in: heap where created */ + mem_heap_t* heap, /*!< in: heap where created */ + bool commit) /*!< in: true if the commit node should be + added to the query graph */ { tab_node_t* node; @@ -932,8 +942,12 @@ tab_create_graph_create( heap); node->col_def->common.parent = node; - node->commit_node = trx_commit_node_create(heap); - node->commit_node->common.parent = node; + if (commit) { + node->commit_node = trx_commit_node_create(heap); + node->commit_node->common.parent = node; + } else { + node->commit_node = 0; + } return(node); } @@ -947,7 +961,9 @@ ind_create_graph_create( /*====================*/ dict_index_t* index, /*!< in: index to create, built as a memory data structure */ - mem_heap_t* heap) /*!< in: heap where created */ + mem_heap_t* heap, /*!< in: heap where created */ + bool commit) /*!< in: true if the commit node should be + added to the query graph */ { ind_node_t* node; @@ -970,8 +986,12 @@ ind_create_graph_create( dict_sys->sys_fields, heap); node->field_def->common.parent = node; - node->commit_node = trx_commit_node_create(heap); - node->commit_node->common.parent = node; + if (commit) { + node->commit_node = trx_commit_node_create(heap); + node->commit_node->common.parent = node; + } else { + 
node->commit_node = 0; + } return(node); } @@ -986,7 +1006,7 @@ dict_create_table_step( que_thr_t* thr) /*!< in: query thread */ { tab_node_t* node; - ulint err = DB_ERROR; + dberr_t err = DB_ERROR; trx_t* trx; ut_ad(thr); @@ -1025,12 +1045,7 @@ dict_create_table_step( if (node->col_no < (node->table)->n_def) { - err = dict_build_col_def_step(node); - - if (err != DB_SUCCESS) { - - goto function_exit; - } + dict_build_col_def_step(node); node->col_no++; @@ -1063,7 +1078,7 @@ dict_create_table_step( } function_exit: - trx->error_state = (enum db_err) err; + trx->error_state = err; if (err == DB_SUCCESS) { /* Ok: do nothing */ @@ -1093,7 +1108,7 @@ dict_create_index_step( que_thr_t* thr) /*!< in: query thread */ { ind_node_t* node; - ulint err = DB_ERROR; + dberr_t err = DB_ERROR; trx_t* trx; ut_ad(thr); @@ -1130,12 +1145,7 @@ dict_create_index_step( if (node->field_no < (node->index)->n_fields) { - err = dict_build_field_def_step(node); - - if (err != DB_SUCCESS) { - - goto function_exit; - } + dict_build_field_def_step(node); node->field_no++; @@ -1172,7 +1182,37 @@ dict_create_index_step( err = dict_create_index_tree_step(node); + DBUG_EXECUTE_IF("ib_dict_create_index_tree_fail", + err = DB_OUT_OF_MEMORY;); + if (err != DB_SUCCESS) { + /* If this is a FTS index, we will need to remove + it from fts->cache->indexes list as well */ + if ((node->index->type & DICT_FTS) + && node->table->fts) { + fts_index_cache_t* index_cache; + + rw_lock_x_lock( + &node->table->fts->cache->init_lock); + + index_cache = (fts_index_cache_t*) + fts_find_index_cache( + node->table->fts->cache, + node->index); + + if (index_cache->words) { + rbt_free(index_cache->words); + index_cache->words = 0; + } + + ib_vector_remove( + node->table->fts->cache->indexes, + *reinterpret_cast<void**>(index_cache)); + + rw_lock_x_unlock( + &node->table->fts->cache->init_lock); + } + dict_index_remove_from_cache(node->table, node->index); node->index = NULL; @@ -1180,6 +1220,11 @@ dict_create_index_step( 
} node->index->page = node->page_no; + /* These should have been set in + dict_build_index_def_step() and + dict_index_add_to_cache(). */ + ut_ad(node->index->trx_id == trx->id); + ut_ad(node->index->table->def_trx_id == trx->id); node->state = INDEX_COMMIT_WORK; } @@ -1197,7 +1242,7 @@ dict_create_index_step( } function_exit: - trx->error_state = static_cast<enum db_err>(err); + trx->error_state = err; if (err == DB_SUCCESS) { /* Ok: do nothing */ @@ -1217,93 +1262,107 @@ function_exit: } /****************************************************************//** -Check whether the system foreign key tables exist. Additionally, If -they exist then move them to non-LRU end of the table LRU list. -@return TRUE if they exist. */ +Check whether a system table exists. Additionally, if it exists, +move it to the non-LRU end of the table LRU list. This is oly used +for system tables that can be upgraded or added to an older database, +which include SYS_FOREIGN, SYS_FOREIGN_COLS, SYS_TABLESPACES and +SYS_DATAFILES. 
+@return DB_SUCCESS if the sys table exists, DB_CORRUPTION if it exists +but is not current, DB_TABLE_NOT_FOUND if it does not exist*/ static -ibool -dict_check_sys_foreign_tables_exist(void) -/*=====================================*/ +dberr_t +dict_check_if_system_table_exists( +/*==============================*/ + const char* tablename, /*!< in: name of table */ + ulint num_fields, /*!< in: number of fields */ + ulint num_indexes) /*!< in: number of indexes */ { - dict_table_t* sys_foreign; - ibool exists = FALSE; - dict_table_t* sys_foreign_cols; + dict_table_t* sys_table; + dberr_t error = DB_SUCCESS; ut_a(srv_get_active_thread_type() == SRV_NONE); mutex_enter(&dict_sys->mutex); - sys_foreign = dict_table_get_low("SYS_FOREIGN"); - sys_foreign_cols = dict_table_get_low("SYS_FOREIGN_COLS"); + sys_table = dict_table_get_low(tablename); - if (sys_foreign != NULL - && sys_foreign_cols != NULL - && UT_LIST_GET_LEN(sys_foreign->indexes) == 3 - && UT_LIST_GET_LEN(sys_foreign_cols->indexes) == 1) { + if (sys_table == NULL) { + error = DB_TABLE_NOT_FOUND; - /* Foreign constraint system tables have already been - created, and they are ok. Ensure that they can't be - evicted from the table LRU cache. */ + } else if (UT_LIST_GET_LEN(sys_table->indexes) != num_indexes + || sys_table->n_cols != num_fields) { + error = DB_CORRUPTION; - dict_table_move_from_lru_to_non_lru(sys_foreign); - dict_table_move_from_lru_to_non_lru(sys_foreign_cols); + } else { + /* This table has already been created, and it is OK. + Ensure that it can't be evicted from the table LRU cache. 
*/ - exists = TRUE; + dict_table_move_from_lru_to_non_lru(sys_table); } mutex_exit(&dict_sys->mutex); - return(exists); + return(error); } /****************************************************************//** Creates the foreign key constraints system tables inside InnoDB -at database creation or database start if they are not found or are +at server bootstrap or server start if they are not found or are not of the right form. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t dict_create_or_check_foreign_constraint_tables(void) /*================================================*/ { trx_t* trx; - ulint error; - ibool success; - ibool srv_file_per_table_backup; + my_bool srv_file_per_table_backup; + dberr_t err; + dberr_t sys_foreign_err; + dberr_t sys_foreign_cols_err; ut_a(srv_get_active_thread_type() == SRV_NONE); /* Note: The master thread has not been started at this point. */ - if (dict_check_sys_foreign_tables_exist()) { + + sys_foreign_err = dict_check_if_system_table_exists( + "SYS_FOREIGN", DICT_NUM_FIELDS__SYS_FOREIGN + 1, 3); + sys_foreign_cols_err = dict_check_if_system_table_exists( + "SYS_FOREIGN_COLS", DICT_NUM_FIELDS__SYS_FOREIGN_COLS + 1, 1); + + if (sys_foreign_err == DB_SUCCESS + && sys_foreign_cols_err == DB_SUCCESS) { return(DB_SUCCESS); } trx = trx_allocate_for_mysql(); + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + trx->op_info = "creating foreign key sys tables"; row_mysql_lock_data_dictionary(trx); /* Check which incomplete table definition to drop. 
*/ - if (dict_table_get_low("SYS_FOREIGN") != NULL) { - fprintf(stderr, - "InnoDB: dropping incompletely created" - " SYS_FOREIGN table\n"); + if (sys_foreign_err == DB_CORRUPTION) { + ib_logf(IB_LOG_LEVEL_WARN, + "Dropping incompletely created " + "SYS_FOREIGN table."); row_drop_table_for_mysql("SYS_FOREIGN", trx, TRUE); } - if (dict_table_get_low("SYS_FOREIGN_COLS") != NULL) { - fprintf(stderr, - "InnoDB: dropping incompletely created" - " SYS_FOREIGN_COLS table\n"); + if (sys_foreign_cols_err == DB_CORRUPTION) { + ib_logf(IB_LOG_LEVEL_WARN, + "Dropping incompletely created " + "SYS_FOREIGN_COLS table."); row_drop_table_for_mysql("SYS_FOREIGN_COLS", trx, TRUE); } - fprintf(stderr, - "InnoDB: Creating foreign key constraint system tables\n"); + ib_logf(IB_LOG_LEVEL_WARN, + "Creating foreign key constraint system tables."); /* NOTE: in dict_load_foreigns we use the fact that there are 2 secondary indexes on SYS_FOREIGN, and they @@ -1315,50 +1374,50 @@ dict_create_or_check_foreign_constraint_tables(void) VARBINARY, like in other InnoDB system tables, to get a clean design. */ - srv_file_per_table_backup = (ibool) srv_file_per_table; + srv_file_per_table_backup = srv_file_per_table; /* We always want SYSTEM tables to be created inside the system tablespace. 
*/ srv_file_per_table = 0; - error = que_eval_sql(NULL, - "PROCEDURE CREATE_FOREIGN_SYS_TABLES_PROC () IS\n" - "BEGIN\n" - "CREATE TABLE\n" - "SYS_FOREIGN(ID CHAR, FOR_NAME CHAR," - " REF_NAME CHAR, N_COLS INT);\n" - "CREATE UNIQUE CLUSTERED INDEX ID_IND" - " ON SYS_FOREIGN (ID);\n" - "CREATE INDEX FOR_IND" - " ON SYS_FOREIGN (FOR_NAME);\n" - "CREATE INDEX REF_IND" - " ON SYS_FOREIGN (REF_NAME);\n" - "CREATE TABLE\n" - "SYS_FOREIGN_COLS(ID CHAR, POS INT," - " FOR_COL_NAME CHAR, REF_COL_NAME CHAR);\n" - "CREATE UNIQUE CLUSTERED INDEX ID_IND" - " ON SYS_FOREIGN_COLS (ID, POS);\n" - "END;\n" - , FALSE, trx); - - if (error != DB_SUCCESS) { - fprintf(stderr, "InnoDB: error %lu in creation\n", - (ulong) error); - - ut_a(error == DB_OUT_OF_FILE_SPACE - || error == DB_TOO_MANY_CONCURRENT_TRXS); - - fprintf(stderr, - "InnoDB: creation failed\n" - "InnoDB: tablespace is full\n" - "InnoDB: dropping incompletely created" - " SYS_FOREIGN tables\n"); + err = que_eval_sql( + NULL, + "PROCEDURE CREATE_FOREIGN_SYS_TABLES_PROC () IS\n" + "BEGIN\n" + "CREATE TABLE\n" + "SYS_FOREIGN(ID CHAR, FOR_NAME CHAR," + " REF_NAME CHAR, N_COLS INT);\n" + "CREATE UNIQUE CLUSTERED INDEX ID_IND" + " ON SYS_FOREIGN (ID);\n" + "CREATE INDEX FOR_IND" + " ON SYS_FOREIGN (FOR_NAME);\n" + "CREATE INDEX REF_IND" + " ON SYS_FOREIGN (REF_NAME);\n" + "CREATE TABLE\n" + "SYS_FOREIGN_COLS(ID CHAR, POS INT," + " FOR_COL_NAME CHAR, REF_COL_NAME CHAR);\n" + "CREATE UNIQUE CLUSTERED INDEX ID_IND" + " ON SYS_FOREIGN_COLS (ID, POS);\n" + "END;\n", + FALSE, trx); + + if (err != DB_SUCCESS) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Creation of SYS_FOREIGN and SYS_FOREIGN_COLS " + "has failed with error %lu. Tablespace is full. 
" + "Dropping incompletely created tables.", + (ulong) err); + + ut_ad(err == DB_OUT_OF_FILE_SPACE + || err == DB_TOO_MANY_CONCURRENT_TRXS); row_drop_table_for_mysql("SYS_FOREIGN", trx, TRUE); row_drop_table_for_mysql("SYS_FOREIGN_COLS", trx, TRUE); - error = DB_MUST_GET_MORE_FILE_SPACE; + if (err == DB_OUT_OF_FILE_SPACE) { + err = DB_MUST_GET_MORE_FILE_SPACE; + } } trx_commit_for_mysql(trx); @@ -1367,28 +1426,31 @@ dict_create_or_check_foreign_constraint_tables(void) trx_free_for_mysql(trx); - if (error == DB_SUCCESS) { - fprintf(stderr, - "InnoDB: Foreign key constraint system tables" - " created\n"); + srv_file_per_table = srv_file_per_table_backup; + + if (err == DB_SUCCESS) { + ib_logf(IB_LOG_LEVEL_INFO, + "Foreign key constraint system tables created"); } /* Note: The master thread has not been started at this point. */ /* Confirm and move to the non-LRU part of the table LRU list. */ + sys_foreign_err = dict_check_if_system_table_exists( + "SYS_FOREIGN", DICT_NUM_FIELDS__SYS_FOREIGN + 1, 3); + ut_a(sys_foreign_err == DB_SUCCESS); - success = dict_check_sys_foreign_tables_exist(); - ut_a(success); - - srv_file_per_table = (my_bool) srv_file_per_table_backup; + sys_foreign_cols_err = dict_check_if_system_table_exists( + "SYS_FOREIGN_COLS", DICT_NUM_FIELDS__SYS_FOREIGN_COLS + 1, 1); + ut_a(sys_foreign_cols_err == DB_SUCCESS); - return(error); + return(err); } /****************************************************************//** Evaluate the given foreign key SQL statement. 
@return error code or DB_SUCCESS */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t dict_foreign_eval_sql( /*==================*/ pars_info_t* info, /*!< in: info struct, or NULL */ @@ -1397,8 +1459,8 @@ dict_foreign_eval_sql( dict_foreign_t* foreign,/*!< in: foreign */ trx_t* trx) /*!< in: transaction */ { - ulint error; - FILE* ef = dict_foreign_err_file; + dberr_t error; + FILE* ef = dict_foreign_err_file; error = que_eval_sql(info, sql, FALSE, trx); @@ -1453,8 +1515,8 @@ dict_foreign_eval_sql( Add a single foreign key field definition to the data dictionary tables in the database. @return error code or DB_SUCCESS */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t dict_create_add_foreign_field_to_dictionary( /*========================================*/ ulint field_nr, /*!< in: foreign field number */ @@ -1492,17 +1554,17 @@ databasename/tablename_ibfk_NUMBER, where the numbers start from 1, and are given locally for this table, that is, the number is not global, as in the old format constraints < 4.0.18 it used to be. @return error code or DB_SUCCESS */ -static -ulint +UNIV_INTERN +dberr_t dict_create_add_foreign_to_dictionary( /*==================================*/ ulint* id_nr, /*!< in/out: number to use in id generation; incremented if used */ dict_table_t* table, /*!< in: table */ dict_foreign_t* foreign,/*!< in: foreign */ - trx_t* trx) /*!< in: transaction */ + trx_t* trx) /*!< in/out: dictionary transaction */ { - ulint error; + dberr_t error; ulint i; pars_info_t* info = pars_info_create(); @@ -1553,12 +1615,6 @@ dict_create_add_foreign_to_dictionary( } } - trx->op_info = "committing foreign key definitions"; - - trx_commit(trx); - - trx->op_info = ""; - return(error); } @@ -1566,7 +1622,7 @@ dict_create_add_foreign_to_dictionary( Adds foreign key definitions to data dictionary tables in the database. 
@return error code or DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t dict_create_add_foreigns_to_dictionary( /*===================================*/ ulint start_id,/*!< in: if we are actually doing ALTER TABLE @@ -1582,7 +1638,7 @@ dict_create_add_foreigns_to_dictionary( { dict_foreign_t* foreign; ulint number = start_id + 1; - ulint error; + dberr_t error; ut_ad(mutex_own(&(dict_sys->mutex))); @@ -1607,5 +1663,188 @@ dict_create_add_foreigns_to_dictionary( } } + trx->op_info = "committing foreign key definitions"; + + trx_commit(trx); + + trx->op_info = ""; + return(DB_SUCCESS); } + +/****************************************************************//** +Creates the tablespaces and datafiles system tables inside InnoDB +at server bootstrap or server start if they are not found or are +not of the right form. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_create_or_check_sys_tablespace(void) +/*=====================================*/ +{ + trx_t* trx; + my_bool srv_file_per_table_backup; + dberr_t err; + dberr_t sys_tablespaces_err; + dberr_t sys_datafiles_err; + + ut_a(srv_get_active_thread_type() == SRV_NONE); + + /* Note: The master thread has not been started at this point. */ + + sys_tablespaces_err = dict_check_if_system_table_exists( + "SYS_TABLESPACES", DICT_NUM_FIELDS__SYS_TABLESPACES + 1, 1); + sys_datafiles_err = dict_check_if_system_table_exists( + "SYS_DATAFILES", DICT_NUM_FIELDS__SYS_DATAFILES + 1, 1); + + if (sys_tablespaces_err == DB_SUCCESS + && sys_datafiles_err == DB_SUCCESS) { + return(DB_SUCCESS); + } + + trx = trx_allocate_for_mysql(); + + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + + trx->op_info = "creating tablepace and datafile sys tables"; + + row_mysql_lock_data_dictionary(trx); + + /* Check which incomplete table definition to drop. 
*/ + + if (sys_tablespaces_err == DB_CORRUPTION) { + ib_logf(IB_LOG_LEVEL_WARN, + "Dropping incompletely created " + "SYS_TABLESPACES table."); + row_drop_table_for_mysql("SYS_TABLESPACES", trx, TRUE); + } + + if (sys_datafiles_err == DB_CORRUPTION) { + ib_logf(IB_LOG_LEVEL_WARN, + "Dropping incompletely created " + "SYS_DATAFILES table."); + + row_drop_table_for_mysql("SYS_DATAFILES", trx, TRUE); + } + + ib_logf(IB_LOG_LEVEL_INFO, + "Creating tablespace and datafile system tables."); + + /* We always want SYSTEM tables to be created inside the system + tablespace. */ + srv_file_per_table_backup = srv_file_per_table; + srv_file_per_table = 0; + + err = que_eval_sql( + NULL, + "PROCEDURE CREATE_SYS_TABLESPACE_PROC () IS\n" + "BEGIN\n" + "CREATE TABLE SYS_TABLESPACES(\n" + " SPACE INT, NAME CHAR, FLAGS INT);\n" + "CREATE UNIQUE CLUSTERED INDEX SYS_TABLESPACES_SPACE" + " ON SYS_TABLESPACES (SPACE);\n" + "CREATE TABLE SYS_DATAFILES(\n" + " SPACE INT, PATH CHAR);\n" + "CREATE UNIQUE CLUSTERED INDEX SYS_DATAFILES_SPACE" + " ON SYS_DATAFILES (SPACE);\n" + "END;\n", + FALSE, trx); + + if (err != DB_SUCCESS) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Creation of SYS_TABLESPACES and SYS_DATAFILES " + "has failed with error %lu. Tablespace is full. " + "Dropping incompletely created tables.", + (ulong) err); + + ut_a(err == DB_OUT_OF_FILE_SPACE + || err == DB_TOO_MANY_CONCURRENT_TRXS); + + row_drop_table_for_mysql("SYS_TABLESPACES", trx, TRUE); + row_drop_table_for_mysql("SYS_DATAFILES", trx, TRUE); + + if (err == DB_OUT_OF_FILE_SPACE) { + err = DB_MUST_GET_MORE_FILE_SPACE; + } + } + + trx_commit_for_mysql(trx); + + row_mysql_unlock_data_dictionary(trx); + + trx_free_for_mysql(trx); + + srv_file_per_table = srv_file_per_table_backup; + + if (err == DB_SUCCESS) { + ib_logf(IB_LOG_LEVEL_INFO, + "Tablespace and datafile system tables created."); + } + + /* Note: The master thread has not been started at this point. */ + /* Confirm and move to the non-LRU part of the table LRU list. 
*/ + + sys_tablespaces_err = dict_check_if_system_table_exists( + "SYS_TABLESPACES", DICT_NUM_FIELDS__SYS_TABLESPACES + 1, 1); + ut_a(sys_tablespaces_err == DB_SUCCESS); + + sys_datafiles_err = dict_check_if_system_table_exists( + "SYS_DATAFILES", DICT_NUM_FIELDS__SYS_DATAFILES + 1, 1); + ut_a(sys_datafiles_err == DB_SUCCESS); + + return(err); +} + +/********************************************************************//** +Add a single tablespace definition to the data dictionary tables in the +database. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +dict_create_add_tablespace_to_dictionary( +/*=====================================*/ + ulint space, /*!< in: tablespace id */ + const char* name, /*!< in: tablespace name */ + ulint flags, /*!< in: tablespace flags */ + const char* path, /*!< in: tablespace path */ + trx_t* trx, /*!< in/out: transaction */ + bool commit) /*!< in: if true then commit the + transaction */ +{ + dberr_t error; + + pars_info_t* info = pars_info_create(); + + ut_a(space > TRX_SYS_SPACE); + + pars_info_add_int4_literal(info, "space", space); + + pars_info_add_str_literal(info, "name", name); + + pars_info_add_int4_literal(info, "flags", flags); + + pars_info_add_str_literal(info, "path", path); + + error = que_eval_sql(info, + "PROCEDURE P () IS\n" + "BEGIN\n" + "INSERT INTO SYS_TABLESPACES VALUES" + "(:space, :name, :flags);\n" + "INSERT INTO SYS_DATAFILES VALUES" + "(:space, :path);\n" + "END;\n", + FALSE, trx); + + if (error != DB_SUCCESS) { + return(error); + } + + if (commit) { + trx->op_info = "committing tablespace and datafile definition"; + trx_commit(trx); + } + + trx->op_info = ""; + + return(error); +} diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc index 8282dafda0c..8e111645880 100644 --- a/storage/innobase/dict/dict0dict.cc +++ b/storage/innobase/dict/dict0dict.cc @@ -1,6 +1,7 @@ /***************************************************************************** -Copyright (c) 1996, 
2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -25,6 +26,7 @@ Created 1/8/1996 Heikki Tuuri #include "dict0dict.h" #include "fts0fts.h" +#include "fil0fil.h" #ifdef UNIV_NONINL #include "dict0dict.ic" @@ -56,7 +58,6 @@ UNIV_INTERN dict_index_t* dict_ind_compact; #include "rem0cmp.h" #include "fts0fts.h" #include "fts0types.h" -#include "row0merge.h" #include "m_ctype.h" /* my_isspace() */ #include "ha_prototypes.h" /* innobase_strcasecmp(), innobase_casedn_str() */ #include "srv0mon.h" @@ -64,6 +65,14 @@ UNIV_INTERN dict_index_t* dict_ind_compact; #include "lock0lock.h" #include "dict0priv.h" #include "row0upd.h" +#include "row0mysql.h" +#include "row0merge.h" +#include "row0log.h" +#include "ut0ut.h" /* ut_format_name() */ +#include "m_string.h" +#include "my_sys.h" +#include "mysqld.h" /* system_charset_info */ +#include "strfunc.h" /* strconvert() */ #include <ctype.h> @@ -77,17 +86,27 @@ backround operations purge, rollback, foreign key checks reserve this in S-mode; we cannot trust that MySQL protects implicit or background operations a table drop since MySQL does not know of them; therefore we need this; NOTE: a transaction which reserves this must keep book -on the mode in trx_struct::dict_operation_lock_mode */ +on the mode in trx_t::dict_operation_lock_mode */ UNIV_INTERN rw_lock_t dict_operation_lock; +/** Percentage of compression failures that are allowed in a single +round */ +UNIV_INTERN ulong zip_failure_threshold_pct = 5; + +/** Maximum percentage of a page that can be allowed as a pad to avoid +compression failures */ +UNIV_INTERN ulong zip_pad_max = 50; + /* Keys to register rwlocks and mutexes with performance schema */ #ifdef UNIV_PFS_RWLOCK UNIV_INTERN mysql_pfs_key_t 
dict_operation_lock_key; UNIV_INTERN mysql_pfs_key_t index_tree_rw_lock_key; +UNIV_INTERN mysql_pfs_key_t index_online_log_key; UNIV_INTERN mysql_pfs_key_t dict_table_stats_latch_key; #endif /* UNIV_PFS_RWLOCK */ #ifdef UNIV_PFS_MUTEX +UNIV_INTERN mysql_pfs_key_t zip_pad_mutex_key; UNIV_INTERN mysql_pfs_key_t dict_sys_mutex_key; UNIV_INTERN mysql_pfs_key_t dict_foreign_err_mutex_key; #endif /* UNIV_PFS_MUTEX */ @@ -157,13 +176,6 @@ dict_index_build_internal_fts( dict_table_t* table, /*!< in: table */ dict_index_t* index); /*!< in: user representation of an FTS index */ /**********************************************************************//** -Removes a foreign constraint struct from the dictionary cache. */ -static -void -dict_foreign_remove_from_cache( -/*===========================*/ - dict_foreign_t* foreign); /*!< in, own: foreign constraint */ -/**********************************************************************//** Prints a column data. */ static void @@ -185,14 +197,6 @@ void dict_field_print_low( /*=================*/ const dict_field_t* field); /*!< in: field */ -#ifndef UNIV_HOTBACKUP -/*********************************************************************//** -Frees a foreign key struct. */ -static -void -dict_foreign_free( -/*==============*/ - dict_foreign_t* foreign); /*!< in, own: foreign key struct */ /**********************************************************************//** Removes an index from the dictionary cache. */ @@ -216,14 +220,14 @@ dict_table_remove_from_cache_low( /**********************************************************************//** Validate the dictionary table LRU list. @return TRUE if validate OK */ -UNIV_INTERN +static ibool dict_lru_validate(void); /*===================*/ /**********************************************************************//** Check if table is in the dictionary table LRU list. 
@return TRUE if table found */ -UNIV_INTERN +static ibool dict_lru_find_table( /*================*/ @@ -239,11 +243,11 @@ dict_non_lru_find_table( #endif /* UNIV_DEBUG */ /* Stream for storing detailed information about the latest foreign key -and unique key errors */ +and unique key errors. Only created if !srv_read_only_mode */ UNIV_INTERN FILE* dict_foreign_err_file = NULL; /* mutex protecting the foreign and unique error buffers */ -UNIV_INTERN mutex_t dict_foreign_err_mutex; -#endif /* !UNIV_HOTBACKUP */ +UNIV_INTERN ib_mutex_t dict_foreign_err_mutex; + /******************************************************************//** Makes all characters in a NUL-terminated UTF-8 string lower case. */ UNIV_INTERN @@ -330,7 +334,7 @@ dict_mutex_exit_for_mysql(void) /** Get the latch that protects the stats of a given table */ #define GET_TABLE_STATS_LATCH(table) \ - (&dict_table_stats_latches[ut_fold_ull(table->id) \ + (&dict_table_stats_latches[ut_fold_ull((ib_uint64_t) table) \ % DICT_TABLE_STATS_LATCHES_SIZE]) /**********************************************************************//** @@ -389,6 +393,75 @@ dict_table_stats_unlock( } } +/**********************************************************************//** +Try to drop any indexes after an aborted index creation. +This can also be after a server kill during DROP INDEX. 
*/ +static +void +dict_table_try_drop_aborted( +/*========================*/ + dict_table_t* table, /*!< in: table, or NULL if it + needs to be looked up again */ + table_id_t table_id, /*!< in: table identifier */ + ulint ref_count) /*!< in: expected table->n_ref_count */ +{ + trx_t* trx; + + trx = trx_allocate_for_background(); + trx->op_info = "try to drop any indexes after an aborted index creation"; + row_mysql_lock_data_dictionary(trx); + trx_set_dict_operation(trx, TRX_DICT_OP_INDEX); + + if (table == NULL) { + table = dict_table_open_on_id_low(table_id); + } else { + ut_ad(table->id == table_id); + } + + if (table && table->n_ref_count == ref_count && table->drop_aborted) { + /* Silence a debug assertion in row_merge_drop_indexes(). */ + ut_d(table->n_ref_count++); + row_merge_drop_indexes(trx, table, TRUE); + ut_d(table->n_ref_count--); + ut_ad(table->n_ref_count == ref_count); + trx_commit_for_mysql(trx); + } + + row_mysql_unlock_data_dictionary(trx); + trx_free_for_background(trx); +} + +/**********************************************************************//** +When opening a table, +try to drop any indexes after an aborted index creation. +Release the dict_sys->mutex. */ +static +void +dict_table_try_drop_aborted_and_mutex_exit( +/*=======================================*/ + dict_table_t* table, /*!< in: table (may be NULL) */ + ibool try_drop) /*!< in: FALSE if should try to + drop indexes whose online creation + was aborted */ +{ + if (try_drop + && table != NULL + && table->drop_aborted + && table->n_ref_count == 1 + && dict_table_get_first_index(table)) { + + /* Attempt to drop the indexes whose online creation + was aborted. */ + table_id_t table_id = table->id; + + mutex_exit(&dict_sys->mutex); + + dict_table_try_drop_aborted(table, table_id, 1); + } else { + mutex_exit(&dict_sys->mutex); + } +} + /********************************************************************//** Decrements the count of open handles to a table. 
*/ UNIV_INTERN @@ -396,7 +469,10 @@ void dict_table_close( /*=============*/ dict_table_t* table, /*!< in/out: table */ - ibool dict_locked) /*!< in: TRUE=data dictionary locked */ + ibool dict_locked, /*!< in: TRUE=data dictionary locked */ + ibool try_drop) /*!< in: TRUE=try to drop any orphan + indexes after an aborted online + index creation */ { if (!dict_locked) { mutex_enter(&dict_sys->mutex); @@ -407,6 +483,18 @@ dict_table_close( --table->n_ref_count; + /* Force persistent stats re-read upon next open of the table + so that FLUSH TABLE can be used to forcibly fetch stats from disk + if they have been manually modified. We reset table->stat_initialized + only if table reference count is 0 because we do not want too frequent + stats re-reads (e.g. in other cases than FLUSH TABLE). */ + if (strchr(table->name, '/') != NULL + && table->n_ref_count == 0 + && dict_stats_is_persistent_enabled(table)) { + + dict_stats_deinit(table); + } + MONITOR_DEC(MONITOR_TABLE_REFERENCE); ut_ad(dict_lru_validate()); @@ -420,7 +508,19 @@ dict_table_close( #endif /* UNIV_DEBUG */ if (!dict_locked) { + table_id_t table_id = table->id; + ibool drop_aborted; + + drop_aborted = try_drop + && table->drop_aborted + && table->n_ref_count == 1 + && dict_table_get_first_index(table); + mutex_exit(&dict_sys->mutex); + + if (drop_aborted) { + dict_table_try_drop_aborted(NULL, table_id, 0); + } } } #endif /* !UNIV_HOTBACKUP */ @@ -550,33 +650,6 @@ dict_table_autoinc_unlock( { mutex_exit(&table->autoinc_mutex); } - -/**********************************************************************//** -Looks for an index with the given table and index id. -Note: Does not reserve the dictionary mutex. 
-@return index or NULL if not found in cache */ -UNIV_INTERN -dict_index_t* -dict_index_get_on_id_low( -/*=====================*/ - dict_table_t* table, /*!< in: table */ - index_id_t id) /*!< in: index id */ -{ - dict_index_t* index; - - for (index = dict_table_get_first_index(table); - index != NULL; - index = dict_table_get_next_index(index)) { - - if (id == index->id) { - /* Found */ - - return(index); - } - } - - return(NULL); -} #endif /* !UNIV_HOTBACKUP */ /********************************************************************//** @@ -712,7 +785,10 @@ dict_table_t* dict_table_open_on_id( /*==================*/ table_id_t table_id, /*!< in: table id */ - ibool dict_locked) /*!< in: TRUE=data dictionary locked */ + ibool dict_locked, /*!< in: TRUE=data dictionary locked */ + ibool try_drop) /*!< in: TRUE=try to drop any orphan + indexes after an aborted online + index creation */ { dict_table_t* table; @@ -736,7 +812,7 @@ dict_table_open_on_id( } if (!dict_locked) { - mutex_exit(&dict_sys->mutex); + dict_table_try_drop_aborted_and_mutex_exit(table, try_drop); } return(table); @@ -815,11 +891,13 @@ dict_init(void) rw_lock_create(dict_operation_lock_key, &dict_operation_lock, SYNC_DICT_OPERATION); - dict_foreign_err_file = os_file_create_tmpfile(); - ut_a(dict_foreign_err_file); + if (!srv_read_only_mode) { + dict_foreign_err_file = os_file_create_tmpfile(); + ut_a(dict_foreign_err_file); - mutex_create(dict_foreign_err_mutex_key, - &dict_foreign_err_mutex, SYNC_NO_ORDER_CHECK); + mutex_create(dict_foreign_err_mutex_key, + &dict_foreign_err_mutex, SYNC_NO_ORDER_CHECK); + } for (i = 0; i < DICT_TABLE_STATS_LATCHES_SIZE; i++) { rw_lock_create(dict_table_stats_latch_key, @@ -849,14 +927,20 @@ dict_move_to_mru( } /**********************************************************************//** -Returns a table object and increments its open handle count. +Returns a table object and increment its open handle count. +NOTE! 
This is a high-level function to be used mainly from outside the +'dict' module. Inside this directory dict_table_get_low +is usually the appropriate function. @return table, NULL if does not exist */ -static +UNIV_INTERN dict_table_t* -dict_table_open_on_name_low( -/*========================*/ +dict_table_open_on_name( +/*====================*/ const char* table_name, /*!< in: table name */ ibool dict_locked, /*!< in: TRUE=data dictionary locked */ + ibool try_drop, /*!< in: TRUE=try to drop any orphan + indexes after an aborted online + index creation */ dict_err_ignore_t ignore_err) /*!< in: error to be ignored when loading a table definition */ @@ -915,61 +999,11 @@ dict_table_open_on_name_low( ut_ad(dict_lru_validate()); if (!dict_locked) { - mutex_exit(&(dict_sys->mutex)); + dict_table_try_drop_aborted_and_mutex_exit(table, try_drop); } return(table); } - -/**********************************************************************//** -Returns a table object and increment its open handle count. -NOTE! This is a high-level function to be used mainly from outside the -'dict' directory. Inside this directory dict_table_get_low -is usually the appropriate function. -@return table, NULL if does not exist */ -UNIV_INTERN -dict_table_t* -dict_table_open_on_name( -/*====================*/ - const char* table_name, /*!< in: table name */ - ibool dict_locked) /*!< in: TRUE=data dictionary locked */ -{ - dict_table_t* table; - - table = dict_table_open_on_name_low(table_name, dict_locked, - DICT_ERR_IGNORE_NONE); - - if (table != NULL) { - /* If table->ibd_file_missing == TRUE, this will - print an error message and return without doing - anything. */ - dict_stats_update(table, - DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY, - dict_locked); - } - - return(table); -} - -/**********************************************************************//** -Returns a table object and increment its open handle count. Table -statistics will not be updated if they are not initialized. 
-Call this function when dropping a table. -@return table, NULL if does not exist */ -UNIV_INTERN -dict_table_t* -dict_table_open_on_name_no_stats( -/*=============================*/ - const char* table_name, /*!< in: table name */ - ibool dict_locked, /*!< in: TRUE=data dictionary locked */ - dict_err_ignore_t - ignore_err) /*!< in: error to be ignored during - table open */ -{ - return(dict_table_open_on_name_low(table_name, dict_locked, - ignore_err)); -} - #endif /* !UNIV_HOTBACKUP */ /**********************************************************************//** @@ -1156,7 +1190,7 @@ dict_table_can_be_evicted( index != NULL; index = dict_table_get_next_index(index)) { - btr_search_t* info = index->search_info; + btr_search_t* info = btr_search_get_info(index); /* We are not allowed to free the in-memory index struct dict_index_t until all entries in the adaptive @@ -1358,7 +1392,7 @@ dict_index_find_on_id_low( Renames a table object. @return TRUE if success */ UNIV_INTERN -ibool +dberr_t dict_table_rename_in_cache( /*=======================*/ dict_table_t* table, /*!< in/out: table */ @@ -1372,7 +1406,6 @@ dict_table_rename_in_cache( ulint fold; char old_name[MAX_FULL_NAME_LEN + 1]; - ut_ad(table); ut_ad(mutex_own(&(dict_sys->mutex))); /* store the old/current name to an automatic variable */ @@ -1389,28 +1422,59 @@ dict_table_rename_in_cache( fold = ut_fold_string(new_name); /* Look for a table with the same name: error if such exists */ - { - dict_table_t* table2; - HASH_SEARCH(name_hash, dict_sys->table_hash, fold, - dict_table_t*, table2, ut_ad(table2->cached), - (ut_strcmp(table2->name, new_name) == 0)); - if (UNIV_LIKELY_NULL(table2)) { - ut_print_timestamp(stderr); - fputs(" InnoDB: Error: dictionary cache" - " already contains a table ", stderr); - ut_print_name(stderr, NULL, TRUE, new_name); - fputs("\n" - "InnoDB: cannot rename table ", stderr); - ut_print_name(stderr, NULL, TRUE, old_name); - putc('\n', stderr); - return(FALSE); - } + dict_table_t* 
table2; + HASH_SEARCH(name_hash, dict_sys->table_hash, fold, + dict_table_t*, table2, ut_ad(table2->cached), + (ut_strcmp(table2->name, new_name) == 0)); + DBUG_EXECUTE_IF("dict_table_rename_in_cache_failure", + if (table2 == NULL) { + table2 = (dict_table_t*) -1; + } ); + if (table2) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot rename table '%s' to '%s' since the " + "dictionary cache already contains '%s'.", + old_name, new_name, new_name); + return(DB_ERROR); } /* If the table is stored in a single-table tablespace, rename the - .ibd file */ + .ibd file and rebuild the .isl file if needed. */ + + if (dict_table_is_discarded(table)) { + os_file_type_t type; + ibool exists; + char* filepath; + + ut_ad(table->space != TRX_SYS_SPACE);
+
+ if (DICT_TF_HAS_DATA_DIR(table->flags)) { + + dict_get_and_save_data_dir_path(table, true); + ut_a(table->data_dir_path); + + filepath = os_file_make_remote_pathname( + table->data_dir_path, table->name, "ibd"); + } else { + filepath = fil_make_ibd_name(table->name, false); + } + + fil_delete_tablespace(table->space, BUF_REMOVE_FLUSH_NO_WRITE);
+ + /* Delete any temp file hanging around. */ + if (os_file_status(filepath, &exists, &type) + && exists + && !os_file_delete_if_exists(filepath)) { + + ib_logf(IB_LOG_LEVEL_INFO, + "Delete of %s failed.", filepath); + } + + mem_free(filepath); + + } else if (table->space != TRX_SYS_SPACE) { + char* new_path = NULL; - if (table->space != 0) { if (table->dir_path_of_temp_table != NULL) { ut_print_timestamp(stderr); fputs(" InnoDB: Error: trying to rename a" @@ -1420,10 +1484,40 @@ dict_table_rename_in_cache( ut_print_filename(stderr, table->dir_path_of_temp_table); fputs(" )\n", stderr); - return(FALSE); - } else if (!fil_rename_tablespace(old_name, table->space, - new_name)) { - return(FALSE); + return(DB_ERROR); + + } else if (DICT_TF_HAS_DATA_DIR(table->flags)) { + char* old_path; + + old_path = fil_space_get_first_path(table->space); + + new_path = os_file_make_new_pathname( + old_path, new_name); + + mem_free(old_path); + + dberr_t err = fil_create_link_file( + new_name, new_path); + + if (err != DB_SUCCESS) { + mem_free(new_path); + return(DB_TABLESPACE_EXISTS); + } + } + + ibool success = fil_rename_tablespace( + old_name, table->space, new_name, new_path); + + /* If the tablespace is remote, a new .isl file was created + If success, delete the old one. If not, delete the new one. */ + if (new_path) { + + mem_free(new_path); + fil_delete_link_file(success ? 
old_name : new_name); + } + + if (!success) { + return(DB_ERROR); } } @@ -1450,12 +1544,11 @@ dict_table_rename_in_cache( ut_a(dict_sys->size > 0); /* Update the table_name field in indexes */ - index = dict_table_get_first_index(table); + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { - while (index != NULL) { index->table_name = table->name; - - index = dict_table_get_next_index(index); } if (!rename_also_foreigns) { @@ -1490,7 +1583,7 @@ dict_table_rename_in_cache( UT_LIST_INIT(table->referenced_list); - return(TRUE); + return(DB_SUCCESS); } /* Update the table name fields in foreign constraints, and update also @@ -1571,9 +1664,10 @@ dict_table_rename_in_cache( foreign = UT_LIST_GET_NEXT(foreign_list, foreign); } - foreign = UT_LIST_GET_FIRST(table->referenced_list); + for (foreign = UT_LIST_GET_FIRST(table->referenced_list); + foreign != NULL; + foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) { - while (foreign != NULL) { if (ut_strlen(foreign->referenced_table_name) < ut_strlen(table->name)) { /* Allocate a longer name buffer; @@ -1581,16 +1675,19 @@ dict_table_rename_in_cache( foreign->referenced_table_name = mem_heap_strdup( foreign->heap, table->name); - dict_mem_referenced_table_name_lookup_set(foreign, TRUE); + + dict_mem_referenced_table_name_lookup_set( + foreign, TRUE); } else { /* Use the same buffer */ strcpy(foreign->referenced_table_name, table->name); - dict_mem_referenced_table_name_lookup_set(foreign, FALSE); + + dict_mem_referenced_table_name_lookup_set( + foreign, FALSE); } - foreign = UT_LIST_GET_NEXT(referenced_list, foreign); } - return(TRUE); + return(DB_SUCCESS); } /**********************************************************************//** @@ -1692,6 +1789,30 @@ dict_table_remove_from_cache_low( ut_ad(dict_lru_validate()); + if (lru_evict && table->drop_aborted) { + /* Do as dict_table_try_drop_aborted() does. 
*/ + + trx_t* trx = trx_allocate_for_background(); + + ut_ad(mutex_own(&dict_sys->mutex)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + /* Mimic row_mysql_lock_data_dictionary(). */ + trx->dict_operation_lock_mode = RW_X_LATCH; + + trx_set_dict_operation(trx, TRX_DICT_OP_INDEX); + + /* Silence a debug assertion in row_merge_drop_indexes(). */ + ut_d(table->n_ref_count++); + row_merge_drop_indexes(trx, table, TRUE); + ut_d(table->n_ref_count--); + ut_ad(table->n_ref_count == 0); + trx_commit_for_mysql(trx); + trx->dict_operation_lock_mode = 0; + trx_free_for_background(trx); + } + size = mem_heap_get_size(table->heap) + strlen(table->name) + 1; ut_ad(dict_sys->size >= size); @@ -1777,6 +1898,12 @@ dict_index_too_big_for_undo( + 10 + FIL_PAGE_DATA_END /* trx_undo_left() */ + 2/* pointer to previous undo log record */; + /* FTS index consists of auxiliary tables, they shall be excluded from + index row size check */ + if (new_index->type & DICT_FTS) { + return(false); + } + if (!clust_index) { ut_a(dict_index_is_clust(new_index)); clust_index = new_index; @@ -1900,6 +2027,12 @@ dict_index_too_big_for_tree( /* maximum allowed size of a node pointer record */ ulint page_ptr_max; + /* FTS index consists of auxiliary tables, they shall be excluded from + index row size check */ + if (new_index->type & DICT_FTS) { + return(false); + } + comp = dict_table_is_comp(table); zip_size = dict_table_zip_size(table); @@ -2032,7 +2165,7 @@ add_field_size: Adds an index to the dictionary cache. 
@return DB_SUCCESS, DB_TOO_BIG_RECORD, or DB_CORRUPTION */ UNIV_INTERN -ulint +dberr_t dict_index_add_to_cache( /*====================*/ dict_table_t* table, /*!< in: table on which the index is */ @@ -2051,6 +2184,7 @@ dict_index_add_to_cache( ut_ad(mutex_own(&(dict_sys->mutex))); ut_ad(index->n_def == index->n_fields); ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + ut_ad(!dict_index_is_online_ddl(index)); ut_ad(mem_heap_validate(index->heap)); ut_a(!dict_index_is_clust(index) @@ -2077,6 +2211,7 @@ dict_index_add_to_cache( number of fields in the cache internal representation */ new_index->n_fields = new_index->n_def; + new_index->trx_id = index->trx_id; if (strict && dict_index_too_big_for_tree(table, new_index)) { too_big: @@ -2169,51 +2304,41 @@ undo_size_ok: } } - /* Add the new index as the last index for the table */ - - UT_LIST_ADD_LAST(indexes, table->indexes, new_index); - new_index->table = table; - new_index->table_name = table->name; - - new_index->search_info = btr_search_info_create(new_index->heap); - - new_index->stat_index_size = 1; - new_index->stat_n_leaf_pages = 1; - - new_index->page = page_no; - rw_lock_create(index_tree_rw_lock_key, &new_index->lock, - dict_index_is_ibuf(index) - ? 
SYNC_IBUF_INDEX_TREE : SYNC_INDEX_TREE); - if (!dict_index_is_univ(new_index)) { new_index->stat_n_diff_key_vals = - static_cast<ib_uint64_t*>(mem_heap_alloc( + static_cast<ib_uint64_t*>(mem_heap_zalloc( new_index->heap, - (1 + dict_index_get_n_unique(new_index)) + dict_index_get_n_unique(new_index) * sizeof(*new_index->stat_n_diff_key_vals))); new_index->stat_n_sample_sizes = - static_cast<ib_uint64_t*>(mem_heap_alloc( + static_cast<ib_uint64_t*>(mem_heap_zalloc( new_index->heap, - (1 + dict_index_get_n_unique(new_index)) + dict_index_get_n_unique(new_index) * sizeof(*new_index->stat_n_sample_sizes))); new_index->stat_n_non_null_key_vals = static_cast<ib_uint64_t*>(mem_heap_zalloc( new_index->heap, - (1 + dict_index_get_n_unique(new_index)) + dict_index_get_n_unique(new_index) * sizeof(*new_index->stat_n_non_null_key_vals))); + } - /* Give some sensible values to stat_n_... in case we do - not calculate statistics quickly enough */ + new_index->stat_index_size = 1; + new_index->stat_n_leaf_pages = 1; - for (i = 0; i <= dict_index_get_n_unique(new_index); i++) { + /* Add the new index as the last index for the table */ - new_index->stat_n_diff_key_vals[i] = 100; - new_index->stat_n_sample_sizes[i] = 0; - } - } + UT_LIST_ADD_LAST(indexes, table->indexes, new_index); + new_index->table = table; + new_index->table_name = table->name; + new_index->search_info = btr_search_info_create(new_index->heap); + + new_index->page = page_no; + rw_lock_create(index_tree_rw_lock_key, &new_index->lock, + dict_index_is_ibuf(index) + ? SYNC_IBUF_INDEX_TREE : SYNC_INDEX_TREE); dict_sys->size += mem_heap_get_size(new_index->heap); @@ -2242,9 +2367,17 @@ dict_index_remove_from_cache_low( ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); ut_ad(mutex_own(&(dict_sys->mutex))); + /* No need to acquire the dict_index_t::lock here because + there can't be any active operations on this index (or table). 
*/ + + if (index->online_log) { + ut_ad(index->online_status == ONLINE_INDEX_CREATION); + row_log_free(index->online_log); + } + /* We always create search info whether or not adaptive hash index is enabled or not. */ - info = index->search_info; + info = btr_search_get_info(index); ut_ad(info); /* We are not allowed to free the in-memory index struct @@ -2270,15 +2403,15 @@ dict_index_remove_from_cache_low( if (retries % 500 == 0) { /* No luck after 5 seconds of wait. */ fprintf(stderr, "InnoDB: Error: Waited for" - " %lu secs for hash index" - " ref_count (%lu) to drop" - " to 0.\n" - "index: \"%s\"" - " table: \"%s\"\n", - retries/100, - ref_count, - index->name, - table->name); + " %lu secs for hash index" + " ref_count (%lu) to drop" + " to 0.\n" + "index: \"%s\"" + " table: \"%s\"\n", + retries/100, + ref_count, + index->name, + table->name); } /* To avoid a hang here we commit suicide if the @@ -2821,8 +2954,6 @@ dict_index_build_internal_fts( return(new_index); } - -#ifndef UNIV_HOTBACKUP /*====================== FOREIGN KEY PROCESSING ========================*/ /*********************************************************************//** @@ -2889,8 +3020,7 @@ dict_table_get_foreign_constraint( foreign; foreign = UT_LIST_GET_NEXT(foreign_list, foreign)) { - if (foreign->foreign_index == index - || foreign->referenced_index == index) { + if (foreign->foreign_index == index) { return(foreign); } @@ -2901,7 +3031,7 @@ dict_table_get_foreign_constraint( /*********************************************************************//** Frees a foreign key struct. */ -static +UNIV_INTERN void dict_foreign_free( /*==============*/ @@ -2912,7 +3042,7 @@ dict_foreign_free( /**********************************************************************//** Removes a foreign constraint struct from the dictionary cache. 
*/ -static +UNIV_INTERN void dict_foreign_remove_from_cache( /*===========================*/ @@ -2976,84 +3106,50 @@ dict_foreign_find( return(NULL); } + /*********************************************************************//** Tries to find an index whose first fields are the columns in the array, in the same order and is not marked for deletion and is not the same as types_idx. @return matching index, NULL if not found */ -static +UNIV_INTERN dict_index_t* dict_foreign_find_index( /*====================*/ - dict_table_t* table, /*!< in: table */ - const char** columns,/*!< in: array of column names */ - ulint n_cols, /*!< in: number of columns */ - dict_index_t* types_idx, /*!< in: NULL or an index to whose types the - column types must match */ - ibool check_charsets, - /*!< in: whether to check charsets. - only has an effect if types_idx != NULL */ - ulint check_null) - /*!< in: nonzero if none of the columns must - be declared NOT NULL */ + const dict_table_t* table, /*!< in: table */ + const char** columns,/*!< in: array of column names */ + ulint n_cols, /*!< in: number of columns */ + const dict_index_t* types_idx, + /*!< in: NULL or an index + whose types the column types + must match */ + ibool check_charsets, + /*!< in: whether to check + charsets. 
only has an effect + if types_idx != NULL */ + ulint check_null) + /*!< in: nonzero if none of + the columns must be declared + NOT NULL */ { dict_index_t* index; + ut_ad(mutex_own(&dict_sys->mutex)); + index = dict_table_get_first_index(table); while (index != NULL) { /* Ignore matches that refer to the same instance - or the index is to be dropped */ - if (index->to_be_dropped || types_idx == index - || index->type & DICT_FTS) { + (or the index is to be dropped) */ + if (types_idx == index || index->type & DICT_FTS + || index->to_be_dropped) { goto next_rec; - } else if (dict_index_get_n_fields(index) >= n_cols) { - ulint i; - - for (i = 0; i < n_cols; i++) { - dict_field_t* field; - const char* col_name; - - field = dict_index_get_nth_field(index, i); - - col_name = dict_table_get_col_name( - table, dict_col_get_no(field->col)); - - if (field->prefix_len != 0) { - /* We do not accept column prefix - indexes here */ - - break; - } - - if (0 != innobase_strcasecmp(columns[i], - col_name)) { - break; - } - - if (check_null - && (field->col->prtype & DATA_NOT_NULL)) { - - return(NULL); - } - - if (types_idx && !cmp_cols_are_equal( - dict_index_get_nth_col(index, i), - dict_index_get_nth_col(types_idx, - i), - check_charsets)) { - - break; - } - } - - if (i == n_cols) { - /* We found a matching index */ - - return(index); - } + } else if (dict_foreign_qualify_index( + table, columns, n_cols, index, types_idx, + check_charsets, check_null)) { + return(index); } next_rec: @@ -3064,90 +3160,6 @@ next_rec: } /**********************************************************************//** -Find an index that is equivalent to the one passed in and is not marked -for deletion. 
-@return index equivalent to foreign->foreign_index, or NULL */ -UNIV_INTERN -dict_index_t* -dict_foreign_find_equiv_index( -/*==========================*/ - dict_foreign_t* foreign)/*!< in: foreign key */ -{ - ut_a(foreign != NULL); - - /* Try to find an index which contains the columns as the - first fields and in the right order, and the types are the - same as in foreign->foreign_index */ - - return(dict_foreign_find_index( - foreign->foreign_table, - foreign->foreign_col_names, foreign->n_fields, - foreign->foreign_index, TRUE, /* check types */ - FALSE/* allow columns to be NULL */)); -} - -#endif /* !UNIV_HOTBACKUP */ -/**********************************************************************//** -Returns an index object by matching on the name and column names and -if more than one index matches return the index with the max id -@return matching index, NULL if not found */ -UNIV_INTERN -dict_index_t* -dict_table_get_index_by_max_id( -/*===========================*/ - dict_table_t* table, /*!< in: table */ - const char* name, /*!< in: the index name to find */ - const char** columns,/*!< in: array of column names */ - ulint n_cols) /*!< in: number of columns */ -{ - dict_index_t* index; - dict_index_t* found; - - found = NULL; - index = dict_table_get_first_index(table); - - while (index != NULL) { - if (ut_strcmp(index->name, name) == 0 - && dict_index_get_n_ordering_defined_by_user(index) - == n_cols) { - - ulint i; - - for (i = 0; i < n_cols; i++) { - dict_field_t* field; - const char* col_name; - - field = dict_index_get_nth_field(index, i); - - col_name = dict_table_get_col_name( - table, dict_col_get_no(field->col)); - - if (0 != innobase_strcasecmp( - columns[i], col_name)) { - - break; - } - } - - if (i == n_cols) { - /* We found a matching index, select - the index with the higher id*/ - - if (!found || index->id > found->id) { - - found = index; - } - } - } - - index = dict_table_get_next_index(index); - } - - return(found); -} - -#ifndef 
UNIV_HOTBACKUP -/**********************************************************************//** Report an error in a foreign key definition. */ static void @@ -3196,7 +3208,7 @@ At least one of the foreign table and the referenced table must already be in the dictionary cache! @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t dict_foreign_add_to_cache( /*======================*/ dict_foreign_t* foreign, /*!< in, own: foreign key constraint */ @@ -3325,7 +3337,6 @@ dict_foreign_add_to_cache( return(DB_SUCCESS); } -#endif /* !UNIV_HOTBACKUP */ /*********************************************************************//** Scans from pointer onwards. Stops if is at the start of a copy of 'string' where characters are compared without case sensitivity, and @@ -3579,6 +3590,67 @@ dict_scan_col( return(ptr); } + +/*********************************************************************//** +Open a table from its database and table name, this is currently used by +foreign constraint parser to get the referenced table. 
+@return complete table name with database and table name, allocated from +heap memory passed in */ +UNIV_INTERN +char* +dict_get_referenced_table( +/*======================*/ + const char* name, /*!< in: foreign key table name */ + const char* database_name, /*!< in: table db name */ + ulint database_name_len, /*!< in: db name length */ + const char* table_name, /*!< in: table name */ + ulint table_name_len, /*!< in: table name length */ + dict_table_t** table, /*!< out: table object or NULL */ + mem_heap_t* heap) /*!< in/out: heap memory */ +{ + char* ref; + const char* db_name; + + if (!database_name) { + /* Use the database name of the foreign key table */ + + db_name = name; + database_name_len = dict_get_db_name_len(name); + } else { + db_name = database_name; + } + + /* Copy database_name, '/', table_name, '\0' */ + ref = static_cast<char*>( + mem_heap_alloc(heap, database_name_len + table_name_len + 2)); + + memcpy(ref, db_name, database_name_len); + ref[database_name_len] = '/'; + memcpy(ref + database_name_len + 1, table_name, table_name_len + 1); + + /* Values; 0 = Store and compare as given; case sensitive + 1 = Store and compare in lower; case insensitive + 2 = Store as given, compare in lower; case semi-sensitive */ + if (innobase_get_lower_case_table_names() == 2) { + innobase_casedn_str(ref); + *table = dict_table_get_low(ref); + memcpy(ref, db_name, database_name_len); + ref[database_name_len] = '/'; + memcpy(ref + database_name_len + 1, table_name, table_name_len + 1); + + } else { +#ifndef __WIN__ + if (innobase_get_lower_case_table_names() == 1) { + innobase_casedn_str(ref); + } +#else + innobase_casedn_str(ref); +#endif /* !__WIN__ */ + *table = dict_table_get_low(ref); + } + + return(ref); +} /*********************************************************************//** Scans a table name from an SQL string. 
@return scanned to */ @@ -3598,9 +3670,7 @@ dict_scan_table_name( const char* database_name = NULL; ulint database_name_len = 0; const char* table_name = NULL; - ulint table_name_len; const char* scan_name; - char* ref; *success = FALSE; *table = NULL; @@ -3648,46 +3718,11 @@ dict_scan_table_name( table_name = scan_name; } - if (database_name == NULL) { - /* Use the database name of the foreign key table */ - - database_name = name; - database_name_len = dict_get_db_name_len(name); - } - - table_name_len = strlen(table_name); - - /* Copy database_name, '/', table_name, '\0' */ - ref = static_cast<char*>( - mem_heap_alloc(heap, database_name_len + table_name_len + 2)); - - memcpy(ref, database_name, database_name_len); - ref[database_name_len] = '/'; - memcpy(ref + database_name_len + 1, table_name, table_name_len + 1); - - /* Values; 0 = Store and compare as given; case sensitive - 1 = Store and compare in lower; case insensitive - 2 = Store as given, compare in lower; case semi-sensitive */ - if (innobase_get_lower_case_table_names() == 2) { - innobase_casedn_str(ref); - *table = dict_table_get_low(ref); - memcpy(ref, database_name, database_name_len); - ref[database_name_len] = '/'; - memcpy(ref + database_name_len + 1, table_name, table_name_len + 1); - - } else { -#ifndef __WIN__ - if (innobase_get_lower_case_table_names() == 1) { - innobase_casedn_str(ref); - } -#else - innobase_casedn_str(ref); -#endif /* !__WIN__ */ - *table = dict_table_get_low(ref); - } + *ref_name = dict_get_referenced_table( + name, database_name, database_name_len, + table_name, strlen(table_name), table, heap); *success = TRUE; - *ref_name = ref; return(ptr); } @@ -3810,13 +3845,12 @@ end_of_string: } } -#ifndef UNIV_HOTBACKUP /*********************************************************************//** Finds the highest [number] for foreign key constraints of the table. Looks only at the >= 4.0.18-format id's, which are of the form databasename/tablename_ibfk_[number]. 
@return highest number, 0 if table has no new format foreign key constraints */ -static +UNIV_INTERN ulint dict_table_get_highest_foreign_id( /*==============================*/ @@ -3871,6 +3905,8 @@ dict_foreign_report_syntax_err( in the SQL string */ const char* ptr) /*!< in: place of the syntax error */ { + ut_ad(!srv_read_only_mode); + FILE* ef = dict_foreign_err_file; mutex_enter(&dict_foreign_err_mutex); @@ -3888,7 +3924,7 @@ be accompanied with indexes in both participating tables. The indexes are allowed to contain more fields than mentioned in the constraint. @return error code or DB_SUCCESS */ static -ulint +dberr_t dict_create_foreign_constraints_low( /*================================*/ trx_t* trx, /*!< in: transaction */ @@ -3919,7 +3955,7 @@ dict_create_foreign_constraints_low( FILE* ef = dict_foreign_err_file; const char* constraint_name; ibool success; - ulint error; + dberr_t error; const char* ptr1; const char* ptr2; ulint i; @@ -3931,6 +3967,7 @@ dict_create_foreign_constraints_low( const char* column_names[500]; const char* referenced_table_name; + ut_ad(!srv_read_only_mode); ut_ad(mutex_own(&(dict_sys->mutex))); table = dict_table_get_low(name); @@ -4470,11 +4507,11 @@ UNIV_INTERN ibool dict_str_starts_with_keyword( /*=========================*/ - void* mysql_thd, /*!< in: MySQL thread handle */ + THD* thd, /*!< in: MySQL thread handle */ const char* str, /*!< in: string to scan for keyword */ const char* keyword) /*!< in: keyword to look for */ { - struct charset_info_st* cs = innobase_get_charset(mysql_thd); + struct charset_info_st* cs = innobase_get_charset(thd); ibool success; dict_accept(cs, str, keyword, &success); @@ -4489,7 +4526,7 @@ be accompanied with indexes in both participating tables. The indexes are allowed to contain more fields than mentioned in the constraint. 
@return error code or DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t dict_create_foreign_constraints( /*============================*/ trx_t* trx, /*!< in: transaction */ @@ -4509,9 +4546,9 @@ dict_create_foreign_constraints( code DB_CANNOT_ADD_CONSTRAINT if any foreign keys are found. */ { - char* str; - ulint err; - mem_heap_t* heap; + char* str; + dberr_t err; + mem_heap_t* heap; ut_a(trx); ut_a(trx->mysql_thd); @@ -4534,7 +4571,7 @@ Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement. @return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the constraint id does not match */ UNIV_INTERN -ulint +dberr_t dict_foreign_parse_drop_constraints( /*================================*/ mem_heap_t* heap, /*!< in: heap from which we can @@ -4552,7 +4589,6 @@ dict_foreign_parse_drop_constraints( size_t len; const char* ptr; const char* id; - FILE* ef = dict_foreign_err_file; struct charset_info_st* cs; ut_a(trx); @@ -4618,10 +4654,11 @@ loop: foreign = UT_LIST_GET_FIRST(table->foreign_list); while (foreign != NULL) { - if (0 == strcmp(foreign->id, id) + if (0 == innobase_strcasecmp(foreign->id, id) || (strchr(foreign->id, '/') - && 0 == strcmp(id, - dict_remove_db_name(foreign->id)))) { + && 0 == innobase_strcasecmp( + id, + dict_remove_db_name(foreign->id)))) { /* Found */ break; } @@ -4629,20 +4666,26 @@ loop: foreign = UT_LIST_GET_NEXT(foreign_list, foreign); } + if (foreign == NULL) { - mutex_enter(&dict_foreign_err_mutex); - rewind(ef); - ut_print_timestamp(ef); - fputs(" Error in dropping of a foreign key constraint" - " of table ", ef); - ut_print_name(ef, NULL, TRUE, table->name); - fputs(",\n" - "in SQL command\n", ef); - fputs(str, ef); - fputs("\nCannot find a constraint with the given id ", ef); - ut_print_name(ef, NULL, FALSE, id); - fputs(".\n", ef); - mutex_exit(&dict_foreign_err_mutex); + + if (!srv_read_only_mode) { + FILE* ef = dict_foreign_err_file; + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + 
fputs(" Error in dropping of a foreign key " + "constraint of table ", ef); + ut_print_name(ef, NULL, TRUE, table->name); + fputs(",\nin SQL command\n", ef); + fputs(str, ef); + fputs("\nCannot find a constraint with the " + "given id ", ef); + ut_print_name(ef, NULL, FALSE, id); + fputs(".\n", ef); + mutex_exit(&dict_foreign_err_mutex); + } mem_free(str); @@ -4652,15 +4695,19 @@ loop: goto loop; syntax_error: - mutex_enter(&dict_foreign_err_mutex); - rewind(ef); - ut_print_timestamp(ef); - fputs(" Syntax error in dropping of a" - " foreign key constraint of table ", ef); - ut_print_name(ef, NULL, TRUE, table->name); - fprintf(ef, ",\n" - "close to:\n%s\n in SQL command\n%s\n", ptr, str); - mutex_exit(&dict_foreign_err_mutex); + if (!srv_read_only_mode) { + FILE* ef = dict_foreign_err_file; + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fputs(" Syntax error in dropping of a" + " foreign key constraint of table ", ef); + ut_print_name(ef, NULL, TRUE, table->name); + fprintf(ef, ",\n" + "close to:\n%s\n in SQL command\n%s\n", ptr, str); + mutex_exit(&dict_foreign_err_mutex); + } mem_free(str); @@ -4668,7 +4715,7 @@ syntax_error: } /*==================== END OF FOREIGN KEY PROCESSING ====================*/ -#endif /* !UNIV_HOTBACKUP */ + /**********************************************************************//** Returns an index object if it is found in the dictionary cache. Assumes that dict_sys->mutex is already being held. @@ -4908,7 +4955,6 @@ dict_index_calc_min_rec_len( return(sum); } -#ifndef UNIV_HOTBACKUP /**********************************************************************//** Prints info of a foreign key constraint. */ static @@ -4939,7 +4985,6 @@ dict_foreign_print_low( fputs(" )\n", stderr); } -#endif /* !UNIV_HOTBACKUP */ /**********************************************************************//** Prints a table data. 
*/ UNIV_INTERN @@ -4948,60 +4993,29 @@ dict_table_print( /*=============*/ dict_table_t* table) /*!< in: table */ { - mutex_enter(&(dict_sys->mutex)); - dict_table_print_low(table); - mutex_exit(&(dict_sys->mutex)); -} - -/**********************************************************************//** -Prints a table data when we know the table name. */ -UNIV_INTERN -void -dict_table_print_by_name( -/*=====================*/ - const char* name) /*!< in: table name */ -{ - dict_table_t* table; - - mutex_enter(&(dict_sys->mutex)); - - table = dict_table_get_low(name); - - ut_a(table); - - dict_table_print_low(table); - mutex_exit(&(dict_sys->mutex)); -} - -/**********************************************************************//** -Prints a table data. */ -UNIV_INTERN -void -dict_table_print_low( -/*=================*/ - dict_table_t* table) /*!< in: table */ -{ dict_index_t* index; dict_foreign_t* foreign; ulint i; ut_ad(mutex_own(&(dict_sys->mutex))); - dict_stats_update(table, DICT_STATS_FETCH, TRUE); + dict_table_stats_lock(table, RW_X_LATCH); - dict_table_stats_lock(table, RW_S_LATCH); + if (!table->stat_initialized) { + dict_stats_update_transient(table); + } fprintf(stderr, "--------------------------------------\n" "TABLE: name %s, id %llu, flags %lx, columns %lu," - " indexes %lu, appr.rows %lu\n" + " indexes %lu, appr.rows " UINT64PF "\n" " COLUMNS: ", table->name, (ullint) table->id, (ulong) table->flags, (ulong) table->n_cols, (ulong) UT_LIST_GET_LEN(table->indexes), - (ulong) table->stat_n_rows); + table->stat_n_rows); for (i = 0; i < (ulint) table->n_cols; i++) { dict_col_print_low(table, dict_table_get_nth_col(table, i)); @@ -5017,7 +5031,9 @@ dict_table_print_low( index = UT_LIST_GET_NEXT(indexes, index); } - dict_table_stats_unlock(table, RW_S_LATCH); + table->stat_initialized = FALSE; + + dict_table_stats_unlock(table, RW_X_LATCH); foreign = UT_LIST_GET_FIRST(table->foreign_list); @@ -5065,13 +5081,15 @@ dict_index_print_low( ib_int64_t n_vals; ulint i; 
+ ut_a(index->table->stat_initialized); + ut_ad(mutex_own(&(dict_sys->mutex))); if (index->n_user_defined_cols > 0) { n_vals = index->stat_n_diff_key_vals[ - index->n_user_defined_cols]; + index->n_user_defined_cols - 1]; } else { - n_vals = index->stat_n_diff_key_vals[1]; + n_vals = index->stat_n_diff_key_vals[0]; } fprintf(stderr, @@ -5121,7 +5139,6 @@ dict_field_print_low( } } -#ifndef UNIV_HOTBACKUP /**********************************************************************//** Outputs info on a foreign key of a table in a format suitable for CREATE TABLE. */ @@ -5310,7 +5327,6 @@ dict_print_info_on_foreign_keys( mutex_exit(&(dict_sys->mutex)); } -#endif /* !UNIV_HOTBACKUP */ /********************************************************************//** Displays the names of the index and the table. */ UNIV_INTERN @@ -5318,7 +5334,7 @@ void dict_index_name_print( /*==================*/ FILE* file, /*!< in: output stream */ - trx_t* trx, /*!< in: transaction */ + const trx_t* trx, /*!< in: transaction */ const dict_index_t* index) /*!< in: index to print */ { fputs("index ", file); @@ -5393,7 +5409,9 @@ UNIV_INTERN void dict_set_corrupted( /*===============*/ - dict_index_t* index) /*!< in/out: index */ + dict_index_t* index, /*!< in/out: index */ + trx_t* trx, /*!< in/out: transaction */ + const char* ctx) /*!< in: context */ { mem_heap_t* heap; mtr_t mtr; @@ -5401,8 +5419,14 @@ dict_set_corrupted( dtuple_t* tuple; dfield_t* dfield; byte* buf; + char* table_name; const char* status; btr_cur_t cursor; + bool locked = RW_X_LATCH == trx->dict_operation_lock_mode; + + if (!locked) { + row_mysql_lock_data_dictionary(trx); + } ut_ad(index); ut_ad(mutex_own(&dict_sys->mutex)); @@ -5422,7 +5446,7 @@ dict_set_corrupted( if (index->type & DICT_CORRUPT) { /* The index was already flagged corrupted. 
*/ ut_ad(!dict_index_is_clust(index) || index->table->corrupted); - return; + goto func_exit; } heap = mem_heap_create(sizeof(dtuple_t) + 2 * (sizeof(dfield_t) @@ -5463,19 +5487,29 @@ dict_set_corrupted( goto fail; } mlog_write_ulint(field, index->type, MLOG_4BYTES, &mtr); - status = " InnoDB: Flagged corruption of "; + status = "Flagged"; } else { fail: - status = " InnoDB: Unable to flag corruption of "; + status = "Unable to flag"; } mtr_commit(&mtr); + mem_heap_empty(heap); + table_name = static_cast<char*>(mem_heap_alloc(heap, FN_REFLEN + 1)); + *innobase_convert_name( + table_name, FN_REFLEN, + index->table_name, strlen(index->table_name), + NULL, TRUE) = 0; + + ib_logf(IB_LOG_LEVEL_ERROR, "%s corruption of %s in table %s in %s", + status, index->name, table_name, ctx); + mem_heap_free(heap); - ut_print_timestamp(stderr); - fputs(status, stderr); - dict_index_name_print(stderr, NULL, index); - putc('\n', stderr); +func_exit: + if (!locked) { + row_mysql_unlock_data_dictionary(trx); + } } /**********************************************************************//** @@ -5582,7 +5616,7 @@ dict_table_get_index_on_name( /* If name is NULL, just return */ if (!name) { - return NULL; + return(NULL); } index = dict_table_get_first_index(table); @@ -5597,42 +5631,47 @@ dict_table_get_index_on_name( } return(NULL); - } /**********************************************************************//** -Replace the index passed in with another equivalent index in the tables -foreign key list. */ +Replace the index passed in with another equivalent index in the +foreign key lists of the table. 
*/ UNIV_INTERN void -dict_table_replace_index_in_foreign_list( -/*=====================================*/ - dict_table_t* table, /*!< in/out: table */ - dict_index_t* index, /*!< in: index to be replaced */ - const trx_t* trx) /*!< in: transaction handle */ +dict_foreign_replace_index( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + const dict_index_t* index, /*!< in: index to be replaced */ + const trx_t* trx) /*!< in: transaction handle */ { dict_foreign_t* foreign; + ut_ad(index->to_be_dropped); + for (foreign = UT_LIST_GET_FIRST(table->foreign_list); foreign; foreign = UT_LIST_GET_NEXT(foreign_list, foreign)) { - if (foreign->foreign_index == index) { - dict_index_t* new_index - = dict_foreign_find_equiv_index(foreign); + dict_index_t* new_index; - /* There must exist an alternative index if - check_foreigns (FOREIGN_KEY_CHECKS) is on, - since ha_innobase::prepare_drop_index had done - the check before we reach here. */ + if (foreign->foreign_index == index) { + ut_ad(foreign->foreign_table == index->table); + new_index = dict_foreign_find_index( + foreign->foreign_table, + foreign->foreign_col_names, + foreign->n_fields, index, + /*check_charsets=*/TRUE, /*check_null=*/FALSE); + /* There must exist an alternative index, + since this must have been checked earlier. */ ut_a(new_index || !trx->check_foreigns); + ut_ad(!new_index || new_index->table == index->table); + ut_ad(!new_index || !new_index->to_be_dropped); foreign->foreign_index = new_index; } } - for (foreign = UT_LIST_GET_FIRST(table->referenced_list); foreign; foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) { @@ -5647,8 +5686,11 @@ dict_table_replace_index_in_foreign_list( foreign->referenced_col_names, foreign->n_fields, index, /*check_charsets=*/TRUE, /*check_null=*/FALSE); - ut_ad(new_index || !trx->check_foreigns); + /* There must exist an alternative index, + since this must have been checked earlier. 
*/ + ut_a(new_index || !trx->check_foreigns); ut_ad(!new_index || new_index->table == index->table); + ut_ad(!new_index || !new_index->to_be_dropped); foreign->referenced_index = new_index; } @@ -5696,8 +5738,8 @@ dict_table_check_for_dup_indexes( /*=============================*/ const dict_table_t* table, /*!< in: Check for dup indexes in this table */ - ibool tmp_ok) /*!< in: TRUE=allow temporary - index names */ + enum check_name check) /*!< in: whether and when to allow + temporary index names */ { /* Check for duplicates, ignoring indexes that are marked as to be dropped */ @@ -5713,17 +5755,32 @@ dict_table_check_for_dup_indexes( index1 = UT_LIST_GET_FIRST(table->indexes); do { - ut_ad(tmp_ok || *index1->name != TEMP_INDEX_PREFIX); - - index2 = UT_LIST_GET_NEXT(indexes, index1); - - while (index2) { - - if (!index2->to_be_dropped) { - ut_ad(ut_strcmp(index1->name, index2->name)); + if (*index1->name == TEMP_INDEX_PREFIX) { + ut_a(!dict_index_is_clust(index1)); + + switch (check) { + case CHECK_ALL_COMPLETE: + ut_error; + case CHECK_ABORTED_OK: + switch (dict_index_get_online_status(index1)) { + case ONLINE_INDEX_COMPLETE: + case ONLINE_INDEX_CREATION: + ut_error; + break; + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + break; + } + /* fall through */ + case CHECK_PARTIAL_OK: + break; } + } - index2 = UT_LIST_GET_NEXT(indexes, index2); + for (index2 = UT_LIST_GET_NEXT(indexes, index1); + index2 != NULL; + index2 = UT_LIST_GET_NEXT(indexes, index2)) { + ut_ad(ut_strcmp(index1->name, index2->name)); } index1 = UT_LIST_GET_NEXT(indexes, index1); @@ -5739,17 +5796,17 @@ The caller must own the dictionary mutex. 
dict_table_schema_check() @{ @return DB_SUCCESS if the table exists and contains the necessary columns */ UNIV_INTERN -enum db_err +dberr_t dict_table_schema_check( /*====================*/ dict_table_schema_t* req_schema, /*!< in/out: required table schema */ char* errstr, /*!< out: human readable error - message if != DB_SUCCESS and - != DB_TABLE_NOT_FOUND is + message if != DB_SUCCESS is returned */ size_t errstr_sz) /*!< in: errstr size */ { + char buf[MAX_FULL_NAME_LEN]; dict_table_t* table; ulint i; @@ -5757,8 +5814,24 @@ dict_table_schema_check( table = dict_table_get_low(req_schema->table_name); - if (table == NULL || table->ibd_file_missing) { - /* no such table or missing tablespace */ + if (table == NULL) { + /* no such table */ + + ut_snprintf(errstr, errstr_sz, + "Table %s not found.", + ut_format_name(req_schema->table_name, + TRUE, buf, sizeof(buf))); + + return(DB_TABLE_NOT_FOUND); + } + + if (table->ibd_file_missing) { + /* missing tablespace */ + + ut_snprintf(errstr, errstr_sz, + "Tablespace for table %s is missing.", + ut_format_name(req_schema->table_name, + TRUE, buf, sizeof(buf))); return(DB_TABLE_NOT_FOUND); } @@ -5769,7 +5842,8 @@ dict_table_schema_check( ut_snprintf(errstr, errstr_sz, "%s has %d columns but should have %lu.", - req_schema->table_name, + ut_format_name(req_schema->table_name, + TRUE, buf, sizeof(buf)), table->n_def - DATA_N_SYS_COLS, req_schema->n_cols); @@ -5814,9 +5888,12 @@ dict_table_schema_check( if (j == table->n_def) { ut_snprintf(errstr, errstr_sz, - "required column %s.%s not found.", - req_schema->table_name, - req_schema->columns[i].name); + "required column %s " + "not found in table %s.", + req_schema->columns[i].name, + ut_format_name( + req_schema->table_name, + TRUE, buf, sizeof(buf))); return(DB_ERROR); } @@ -5839,10 +5916,11 @@ dict_table_schema_check( if (req_schema->columns[i].len != table->cols[j].len) { ut_snprintf(errstr, errstr_sz, - "Column %s.%s is %s but should be %s " - "(length mismatch).", - 
req_schema->table_name, + "Column %s in table %s is %s " + "but should be %s (length mismatch).", req_schema->columns[i].name, + ut_format_name(req_schema->table_name, + TRUE, buf, sizeof(buf)), actual_type, req_type); return(DB_ERROR); @@ -5852,10 +5930,11 @@ dict_table_schema_check( if (req_schema->columns[i].mtype != table->cols[j].mtype) { ut_snprintf(errstr, errstr_sz, - "Column %s.%s is %s but should be %s " - "(type mismatch).", - req_schema->table_name, + "Column %s in table %s is %s " + "but should be %s (type mismatch).", req_schema->columns[i].name, + ut_format_name(req_schema->table_name, + TRUE, buf, sizeof(buf)), actual_type, req_type); return(DB_ERROR); @@ -5868,20 +5947,110 @@ dict_table_schema_check( != req_schema->columns[i].prtype_mask) { ut_snprintf(errstr, errstr_sz, - "Column %s.%s is %s but should be %s " - "(flags mismatch).", - req_schema->table_name, + "Column %s in table %s is %s " + "but should be %s (flags mismatch).", req_schema->columns[i].name, + ut_format_name(req_schema->table_name, + TRUE, buf, sizeof(buf)), actual_type, req_type); return(DB_ERROR); } } + if (req_schema->n_foreign != UT_LIST_GET_LEN(table->foreign_list)) { + ut_snprintf( + errstr, errstr_sz, + "Table %s has %lu foreign key(s) pointing to other " + "tables, but it must have %lu.", + ut_format_name(req_schema->table_name, + TRUE, buf, sizeof(buf)), + UT_LIST_GET_LEN(table->foreign_list), + req_schema->n_foreign); + return(DB_ERROR); + } + + if (req_schema->n_referenced != UT_LIST_GET_LEN(table->referenced_list)) { + ut_snprintf( + errstr, errstr_sz, + "There are %lu foreign key(s) pointing to %s, " + "but there must be %lu.", + UT_LIST_GET_LEN(table->referenced_list), + ut_format_name(req_schema->table_name, + TRUE, buf, sizeof(buf)), + req_schema->n_referenced); + return(DB_ERROR); + } + return(DB_SUCCESS); } /* @} */ +/*********************************************************************//** +Converts a database and table name from filesystem encoding +(e.g. 
d@i1b/a@q1b@1Kc, same format as used in dict_table_t::name) in two +strings in UTF8 encoding (e.g. dцb and aюbØc). The output buffers must be +at least MAX_DB_UTF8_LEN and MAX_TABLE_UTF8_LEN bytes. */ +UNIV_INTERN +void +dict_fs2utf8( +/*=========*/ + const char* db_and_table, /*!< in: database and table names, + e.g. d@i1b/a@q1b@1Kc */ + char* db_utf8, /*!< out: database name, e.g. dцb */ + size_t db_utf8_size, /*!< in: dbname_utf8 size */ + char* table_utf8, /*!< out: table name, e.g. aюbØc */ + size_t table_utf8_size)/*!< in: table_utf8 size */ +{ + char db[MAX_DATABASE_NAME_LEN + 1]; + ulint db_len; + uint errors; + + db_len = dict_get_db_name_len(db_and_table); + + ut_a(db_len <= sizeof(db)); + + memcpy(db, db_and_table, db_len); + db[db_len] = '\0'; + + strconvert( + &my_charset_filename, db, + system_charset_info, db_utf8, db_utf8_size, + &errors); + + /* convert each # to @0023 in table name and store the result in buf */ + const char* table = dict_remove_db_name(db_and_table); + const char* table_p; + char buf[MAX_TABLE_NAME_LEN * 5 + 1]; + char* buf_p; + for (table_p = table, buf_p = buf; table_p[0] != '\0'; table_p++) { + if (table_p[0] != '#') { + buf_p[0] = table_p[0]; + buf_p++; + } else { + buf_p[0] = '@'; + buf_p[1] = '0'; + buf_p[2] = '0'; + buf_p[3] = '2'; + buf_p[4] = '3'; + buf_p += 5; + } + ut_a((size_t) (buf_p - buf) < sizeof(buf)); + } + buf_p[0] = '\0'; + + errors = 0; + strconvert( + &my_charset_filename, buf, + system_charset_info, table_utf8, table_utf8_size, + &errors); + + if (errors != 0) { + ut_snprintf(table_utf8, table_utf8_size, "%s%s", + srv_mysql50_table_name_prefix, table); + } +} + /**********************************************************************//** Closes the data dictionary module. 
*/ UNIV_INTERN @@ -5929,7 +6098,9 @@ dict_close(void) rw_lock_free(&dict_operation_lock); memset(&dict_operation_lock, 0x0, sizeof(dict_operation_lock)); - mutex_free(&dict_foreign_err_mutex); + if (!srv_read_only_mode) { + mutex_free(&dict_foreign_err_mutex); + } mem_free(dict_sys); dict_sys = NULL; @@ -5943,7 +6114,7 @@ dict_close(void) /**********************************************************************//** Validate the dictionary table LRU list. @return TRUE if valid */ -UNIV_INTERN +static ibool dict_lru_validate(void) /*===================*/ @@ -5972,7 +6143,7 @@ dict_lru_validate(void) /**********************************************************************//** Check if a table exists in the dict table LRU list. @return TRUE if table found in LRU list */ -UNIV_INTERN +static ibool dict_lru_find_table( /*================*/ @@ -6025,4 +6196,279 @@ dict_non_lru_find_table( return(FALSE); } # endif /* UNIV_DEBUG */ +/*********************************************************************//** +Check an index to see whether its first fields are the columns in the array, +in the same order and is not marked for deletion and is not the same +as types_idx. +@return true if the index qualifies, otherwise false */ +UNIV_INTERN +bool +dict_foreign_qualify_index( +/*=======================*/ + const dict_table_t* table, /*!< in: table */ + const char** columns,/*!< in: array of column names */ + ulint n_cols, /*!< in: number of columns */ + const dict_index_t* index, /*!< in: index to check */ + const dict_index_t* types_idx, + /*!< in: NULL or an index + whose types the column types + must match */ + ibool check_charsets, + /*!< in: whether to check + charsets. 
only has an effect + if types_idx != NULL */ + ulint check_null) + /*!< in: nonzero if none of + the columns must be declared + NOT NULL */ +{ + ulint i; + + if (dict_index_get_n_fields(index) < n_cols) { + return(false); + } + + for (i= 0; i < n_cols; i++) { + dict_field_t* field; + const char* col_name; + + field = dict_index_get_nth_field(index, i); + + col_name = dict_table_get_col_name( + table, dict_col_get_no(field->col)); + + if (field->prefix_len != 0) { + /* We do not accept column prefix + indexes here */ + + break; + } + + if (0 != innobase_strcasecmp(columns[i], + col_name)) { + break; + } + + if (check_null + && (field->col->prtype & DATA_NOT_NULL)) { + + break; + } + + if (types_idx && !cmp_cols_are_equal( + dict_index_get_nth_col(index, i), + dict_index_get_nth_col(types_idx, + i), + check_charsets)) { + + break; + } + } + + return((i == n_cols) ? true : false); +} + +/*********************************************************************//** +Update the state of compression failure padding heuristics. This is +called whenever a compression operation succeeds or fails. +The caller must be holding info->mutex */ +static +void +dict_index_zip_pad_update( +/*======================*/ + zip_pad_info_t* info, /*<! in/out: info to be updated */ + ulint zip_threshold) /*<! in: zip threshold value */ +{ + ulint total; + ulint fail_pct; + + ut_ad(info); + + total = info->success + info->failure; + + ut_ad(total > 0); + + if(zip_threshold == 0) { + /* User has just disabled the padding. */ + return; + } + + if (total < ZIP_PAD_ROUND_LEN) { + /* We are in middle of a round. Do nothing. */ + return; + } + + /* We are at a 'round' boundary. Reset the values but first + calculate fail rate for our heuristic. */ + fail_pct = (info->failure * 100) / total; + info->failure = 0; + info->success = 0; + + if (fail_pct > zip_threshold) { + /* Compression failures are more then user defined + threshold. Increase the pad size to reduce chances of + compression failures. 
*/ + ut_ad(info->pad % ZIP_PAD_INCR == 0); + + /* Only do increment if it won't increase padding + beyond max pad size. */ + if (info->pad + ZIP_PAD_INCR + < (UNIV_PAGE_SIZE * zip_pad_max) / 100) { +#ifdef HAVE_ATOMIC_BUILTINS + /* Use atomics even though we have the mutex. + This is to ensure that we are able to read + info->pad atomically where atomics are + supported. */ + os_atomic_increment_ulint(&info->pad, ZIP_PAD_INCR); +#else /* HAVE_ATOMIC_BUILTINS */ + info->pad += ZIP_PAD_INCR; +#endif /* HAVE_ATOMIC_BUILTINS */ + + MONITOR_INC(MONITOR_PAD_INCREMENTS); + } + + info->n_rounds = 0; + + } else { + /* Failure rate was OK. Another successful round + completed. */ + ++info->n_rounds; + + /* If enough successful rounds are completed with + compression failure rate in control, decrease the + padding. */ + if (info->n_rounds >= ZIP_PAD_SUCCESSFUL_ROUND_LIMIT + && info->pad > 0) { + + ut_ad(info->pad % ZIP_PAD_INCR == 0); +#ifdef HAVE_ATOMIC_BUILTINS + /* Use atomics even though we have the mutex. + This is to ensure that we are able to read + info->pad atomically where atomics are + supported. */ + os_atomic_decrement_ulint(&info->pad, ZIP_PAD_INCR); +#else /* HAVE_ATOMIC_BUILTINS */ + info->pad -= ZIP_PAD_INCR; +#endif /* HAVE_ATOMIC_BUILTINS */ + + info->n_rounds = 0; + + MONITOR_INC(MONITOR_PAD_DECREMENTS); + } + } +} + +/*********************************************************************//** +This function should be called whenever a page is successfully +compressed. Updates the compression padding information. */ +UNIV_INTERN +void +dict_index_zip_success( +/*===================*/ + dict_index_t* index) /*!< in/out: index to be updated. */ +{ + ut_ad(index); + + ulint zip_threshold = zip_failure_threshold_pct; + if (!zip_threshold) { + /* Disabled by user. 
*/ + return; + } + + os_fast_mutex_lock(&index->zip_pad.mutex); + ++index->zip_pad.success; + dict_index_zip_pad_update(&index->zip_pad, zip_threshold); + os_fast_mutex_unlock(&index->zip_pad.mutex); +} + +/*********************************************************************//** +This function should be called whenever a page compression attempt +fails. Updates the compression padding information. */ +UNIV_INTERN +void +dict_index_zip_failure( +/*===================*/ + dict_index_t* index) /*!< in/out: index to be updated. */ +{ + ut_ad(index); + + ulint zip_threshold = zip_failure_threshold_pct; + if (!zip_threshold) { + /* Disabled by user. */ + return; + } + + os_fast_mutex_lock(&index->zip_pad.mutex); + ++index->zip_pad.failure; + dict_index_zip_pad_update(&index->zip_pad, zip_threshold); + os_fast_mutex_unlock(&index->zip_pad.mutex); +} + + +/*********************************************************************//** +Return the optimal page size, for which page will likely compress. +@return page size beyond which page might not compress */ +UNIV_INTERN +ulint +dict_index_zip_pad_optimal_page_size( +/*=================================*/ + dict_index_t* index) /*!< in: index for which page size + is requested */ +{ + ulint pad; + ulint min_sz; + ulint sz; + + ut_ad(index); + + if (!zip_failure_threshold_pct) { + /* Disabled by user. */ + return(UNIV_PAGE_SIZE); + } + + /* We use atomics to read index->zip_pad.pad. Here we use zero + as increment as are not changing the value of the 'pad'. On + platforms where atomics are not available we grab the mutex. */ + +#ifdef HAVE_ATOMIC_BUILTINS + pad = os_atomic_increment_ulint(&index->zip_pad.pad, 0); +#else /* HAVE_ATOMIC_BUILTINS */ + os_fast_mutex_lock(&index->zip_pad.mutex); + pad = index->zip_pad.pad; + os_fast_mutex_unlock(&index->zip_pad.mutex); +#endif /* HAVE_ATOMIC_BUILTINS */ + + ut_ad(pad < UNIV_PAGE_SIZE); + sz = UNIV_PAGE_SIZE - pad; + + /* Min size allowed by user. 
*/ + ut_ad(zip_pad_max < 100); + min_sz = (UNIV_PAGE_SIZE * (100 - zip_pad_max)) / 100; + + return(ut_max(sz, min_sz)); +} + +/*************************************************************//** +Convert table flag to row format string. +@return row format name. */ +UNIV_INTERN +const char* +dict_tf_to_row_format_string( +/*=========================*/ + ulint table_flag) /*!< in: row format setting */ +{ + switch (dict_tf_get_rec_format(table_flag)) { + case REC_FORMAT_REDUNDANT: + return("ROW_TYPE_REDUNDANT"); + case REC_FORMAT_COMPACT: + return("ROW_TYPE_COMPACT"); + case REC_FORMAT_COMPRESSED: + return("ROW_TYPE_COMPRESSED"); + case REC_FORMAT_DYNAMIC: + return("ROW_TYPE_DYNAMIC"); + } + + ut_error; + return(0); +} #endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/dict/dict0load.cc b/storage/innobase/dict/dict0load.cc index ff93be3e76a..46d72786ac6 100644 --- a/storage/innobase/dict/dict0load.cc +++ b/storage/innobase/dict/dict0load.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -41,18 +41,22 @@ Created 4/24/1996 Heikki Tuuri #include "rem0cmp.h" #include "srv0start.h" #include "srv0srv.h" +#include "dict0crea.h" #include "dict0priv.h" #include "ha_prototypes.h" /* innobase_casedn_str() */ #include "fts0priv.h" -/** Following are six InnoDB system tables */ +/** Following are the InnoDB system tables. The positions in +this array are referenced by enum dict_system_table_id. 
*/ static const char* SYSTEM_TABLE_NAME[] = { "SYS_TABLES", "SYS_INDEXES", "SYS_COLUMNS", "SYS_FIELDS", "SYS_FOREIGN", - "SYS_FOREIGN_COLS" + "SYS_FOREIGN_COLS", + "SYS_TABLESPACES", + "SYS_DATAFILES" }; /* If this flag is TRUE, then we will load the cluster index's (and tables') @@ -183,7 +187,8 @@ dict_print(void) os_increment_counter_by_amount( server_mutex, - srv_fatal_semaphore_wait_threshold, 7200/*2 hours*/); + srv_fatal_semaphore_wait_threshold, + SRV_SEMAPHORE_WAIT_EXTENSION); heap = mem_heap_create(1000); mutex_enter(&(dict_sys->mutex)); @@ -196,13 +201,11 @@ dict_print(void) err_msg = static_cast<const char*>( dict_process_sys_tables_rec_and_mtr_commit( - heap, rec, &table, - static_cast<dict_table_info_t>( - DICT_TABLE_LOAD_FROM_CACHE - | DICT_TABLE_UPDATE_STATS), &mtr)); + heap, rec, &table, DICT_TABLE_LOAD_FROM_CACHE, + &mtr)); if (!err_msg) { - dict_table_print_low(table); + dict_table_print(table); } else { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: %s\n", err_msg); @@ -221,7 +224,8 @@ dict_print(void) /* Restore the fatal semaphore wait timeout */ os_decrement_counter_by_amount( server_mutex, - srv_fatal_semaphore_wait_threshold, 7200/*2 hours*/); + srv_fatal_semaphore_wait_threshold, + SRV_SEMAPHORE_WAIT_EXTENSION); } /********************************************************************//** @@ -278,8 +282,8 @@ dict_startscan_system( clust_index = UT_LIST_GET_FIRST(system_table->indexes); - btr_pcur_open_at_index_side(TRUE, clust_index, BTR_SEARCH_LEAF, pcur, - TRUE, mtr); + btr_pcur_open_at_index_side(true, clust_index, BTR_SEARCH_LEAF, pcur, + true, 0, mtr); rec = dict_getnext_system_low(pcur, mtr); @@ -307,6 +311,7 @@ dict_getnext_system( return(rec); } + /********************************************************************//** This function processes one SYS_TABLES record and populate the dict_table_t struct for the table. 
Extracted out of dict_print() to be used by @@ -362,15 +367,6 @@ dict_process_sys_tables_rec_and_mtr_commit( return(err_msg); } - if ((status & DICT_TABLE_UPDATE_STATS) - && dict_table_get_first_index(*table)) { - - /* Update statistics member fields in *table if - DICT_TABLE_UPDATE_STATS is set */ - ut_ad(mutex_own(&dict_sys->mutex)); - dict_stats_update(*table, DICT_STATS_FETCH, TRUE); - } - return(NULL); } @@ -401,6 +397,7 @@ dict_process_sys_indexes_rec( return(err_msg); } + /********************************************************************//** This function parses a SYS_COLUMNS record and populate a dict_column_t structure with the information from the record. @@ -423,6 +420,7 @@ dict_process_sys_columns_rec( return(err_msg); } + /********************************************************************//** This function parses a SYS_FIELDS record and populates a dict_field_t structure with the information from the record. @@ -475,7 +473,7 @@ dict_process_sys_foreign_rec( const byte* field; ulint n_fields_and_type; - if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, 0))) { + if (rec_get_deleted_flag(rec, 0)) { return("delete-marked record in SYS_FOREIGN"); } @@ -485,7 +483,7 @@ dict_process_sys_foreign_rec( field = rec_get_nth_field_old( rec, DICT_FLD__SYS_FOREIGN__ID, &len); - if (UNIV_UNLIKELY(len < 1 || len == UNIV_SQL_NULL)) { + if (len == 0 || len == UNIV_SQL_NULL) { err_len: return("incorrect column length in SYS_FOREIGN"); } @@ -512,7 +510,7 @@ err_len: field = rec_get_nth_field_old( rec, DICT_FLD__SYS_FOREIGN__FOR_NAME, &len); - if (len < 1 || len == UNIV_SQL_NULL) { + if (len == 0 || len == UNIV_SQL_NULL) { goto err_len; } foreign->foreign_table_name = mem_heap_strdupl( @@ -520,7 +518,7 @@ err_len: field = rec_get_nth_field_old( rec, DICT_FLD__SYS_FOREIGN__REF_NAME, &len); - if (len < 1 || len == UNIV_SQL_NULL) { + if (len == 0 || len == UNIV_SQL_NULL) { goto err_len; } foreign->referenced_table_name = mem_heap_strdupl( @@ -568,7 +566,7 @@ 
dict_process_sys_foreign_col_rec( field = rec_get_nth_field_old( rec, DICT_FLD__SYS_FOREIGN_COLS__ID, &len); - if (len < 1 || len == UNIV_SQL_NULL) { + if (len == 0 || len == UNIV_SQL_NULL) { err_len: return("incorrect column length in SYS_FOREIGN_COLS"); } @@ -594,14 +592,14 @@ err_len: field = rec_get_nth_field_old( rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME, &len); - if (len < 1 || len == UNIV_SQL_NULL) { + if (len == 0 || len == UNIV_SQL_NULL) { goto err_len; } *for_col_name = mem_heap_strdupl(heap, (char*) field, len); field = rec_get_nth_field_old( rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME, &len); - if (len < 1 || len == UNIV_SQL_NULL) { + if (len == 0 || len == UNIV_SQL_NULL) { goto err_len; } *ref_col_name = mem_heap_strdupl(heap, (char*) field, len); @@ -610,6 +608,127 @@ err_len: } /********************************************************************//** +This function parses a SYS_TABLESPACES record, extracts necessary +information from the record and returns to caller. 
+@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_process_sys_tablespaces( +/*=========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_TABLESPACES rec */ + ulint* space, /*!< out: space id */ + const char** name, /*!< out: tablespace name */ + ulint* flags) /*!< out: tablespace flags */ +{ + ulint len; + const byte* field; + + /* Initialize the output values */ + *space = ULINT_UNDEFINED; + *name = NULL; + *flags = ULINT_UNDEFINED; + + if (rec_get_deleted_flag(rec, 0)) { + return("delete-marked record in SYS_TABLESPACES"); + } + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_TABLESPACES) { + return("wrong number of columns in SYS_TABLESPACES record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLESPACES__SPACE, &len); + if (len != DICT_FLD_LEN_SPACE) { +err_len: + return("incorrect column length in SYS_TABLESPACES"); + } + *space = mach_read_from_4(field); + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLESPACES__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLESPACES__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLESPACES__NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + *name = mem_heap_strdupl(heap, (char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLESPACES__FLAGS, &len); + if (len != DICT_FLD_LEN_FLAGS) { + goto err_len; + } + *flags = mach_read_from_4(field); + + return(NULL); +} + +/********************************************************************//** +This function parses a SYS_DATAFILES record, extracts necessary +information from the record and returns it to the caller. 
+@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_process_sys_datafiles( +/*=======================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_DATAFILES rec */ + ulint* space, /*!< out: space id */ + const char** path) /*!< out: datafile paths */ +{ + ulint len; + const byte* field; + + if (rec_get_deleted_flag(rec, 0)) { + return("delete-marked record in SYS_DATAFILES"); + } + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_DATAFILES) { + return("wrong number of columns in SYS_DATAFILES record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_DATAFILES__SPACE, &len); + if (len != DICT_FLD_LEN_SPACE) { +err_len: + return("incorrect column length in SYS_DATAFILES"); + } + *space = mach_read_from_4(field); + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_DATAFILES__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_DATAFILES__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_DATAFILES__PATH, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + *path = mem_heap_strdupl(heap, (char*) field, len); + + return(NULL); +} + +/********************************************************************//** Determine the flags of a table as stored in SYS_TABLES.TYPE and N_COLS. @return ULINT_UNDEFINED if error, else a valid dict_table_t::flags. */ static @@ -629,11 +748,9 @@ dict_sys_tables_get_flags( ut_a(len == 4); type = mach_read_from_4(field); - /* The low order bit of SYS_TABLES.TYPE is always set to 1. If no - other bits are used, that is defined as SYS_TABLE_TYPE_ANTELOPE. - But in dict_table_t::flags the low order bit is used to determine - if the row format is Redundant or Compact when the format is - Antelope. 
+ /* The low order bit of SYS_TABLES.TYPE is always set to 1. But in + dict_table_t::flags the low order bit is used to determine if the + row format is Redundant or Compact when the format is Antelope. Read the 4 byte N_COLS field and look at the high order bit. It should be set for COMPACT and later. It should not be set for REDUNDANT. */ @@ -645,10 +762,193 @@ dict_sys_tables_get_flags( /* This validation function also combines the DICT_N_COLS_COMPACT flag in n_cols into the type field to effectively make it a dict_table_t::flags. */ - return(dict_sys_tables_type_validate(type, n_cols)); + + if (ULINT_UNDEFINED == dict_sys_tables_type_validate(type, n_cols)) { + return(ULINT_UNDEFINED); + } + + return(dict_sys_tables_type_to_tf(type, n_cols)); } /********************************************************************//** +Gets the filepath for a spaceid from SYS_DATAFILES and checks it against +the contents of a link file. This function is called when there is no +fil_node_t entry for this space ID so both durable locations on disk +must be checked and compared. +We use a temporary heap here for the table lookup, but not for the path +returned which the caller must free. +This function can return NULL if the space ID is not found in SYS_DATAFILES, +then the caller will assume that the ibd file is in the normal datadir. +@return own: A copy of the first datafile found in SYS_DATAFILES.PATH for +the given space ID. NULL if space ID is zero or not found. 
*/ +UNIV_INTERN +char* +dict_get_first_path( +/*================*/ + ulint space, /*!< in: space id */ + const char* name) /*!< in: tablespace name */ +{ + mtr_t mtr; + dict_table_t* sys_datafiles; + dict_index_t* sys_index; + dtuple_t* tuple; + dfield_t* dfield; + byte* buf; + btr_pcur_t pcur; + const rec_t* rec; + const byte* field; + ulint len; + char* dict_filepath = NULL; + mem_heap_t* heap = mem_heap_create(1024); + + ut_ad(mutex_own(&(dict_sys->mutex))); + + mtr_start(&mtr); + + sys_datafiles = dict_table_get_low("SYS_DATAFILES"); + sys_index = UT_LIST_GET_FIRST(sys_datafiles->indexes); + ut_ad(!dict_table_is_comp(sys_datafiles)); + ut_ad(name_of_col_is(sys_datafiles, sys_index, + DICT_FLD__SYS_DATAFILES__SPACE, "SPACE")); + ut_ad(name_of_col_is(sys_datafiles, sys_index, + DICT_FLD__SYS_DATAFILES__PATH, "PATH")); + + tuple = dtuple_create(heap, 1); + dfield = dtuple_get_nth_field(tuple, DICT_FLD__SYS_DATAFILES__SPACE); + + buf = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(buf, space); + + dfield_set_data(dfield, buf, 4); + dict_index_copy_types(tuple, sys_index, 1); + + btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); + + rec = btr_pcur_get_rec(&pcur); + + /* If the file-per-table tablespace was created with + an earlier version of InnoDB, then this record is not + in SYS_DATAFILES. But a link file still might exist. */ + + if (btr_pcur_is_on_user_rec(&pcur)) { + /* A record for this space ID was found. */ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_DATAFILES__PATH, &len); + ut_a(len > 0 || len == UNIV_SQL_NULL); + ut_a(len < OS_FILE_MAX_PATH); + dict_filepath = mem_strdupl((char*) field, len); + ut_a(dict_filepath); + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mem_heap_free(heap); + + return(dict_filepath); +} + +/********************************************************************//** +Update the record for space_id in SYS_TABLESPACES to this filepath. 
+@return DB_SUCCESS if OK, dberr_t if the insert failed */ +UNIV_INTERN +dberr_t +dict_update_filepath( +/*=================*/ + ulint space_id, /*!< in: space id */ + const char* filepath) /*!< in: filepath */ +{ + dberr_t err = DB_SUCCESS; + trx_t* trx; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mutex_own(&(dict_sys->mutex))); + + trx = trx_allocate_for_background(); + trx->op_info = "update filepath"; + trx->dict_operation_lock_mode = RW_X_LATCH; + trx_start_for_ddl(trx, TRX_DICT_OP_INDEX); + + pars_info_t* info = pars_info_create(); + + pars_info_add_int4_literal(info, "space", space_id); + pars_info_add_str_literal(info, "path", filepath); + + err = que_eval_sql(info, + "PROCEDURE UPDATE_FILEPATH () IS\n" + "BEGIN\n" + "UPDATE SYS_DATAFILES" + " SET PATH = :path\n" + " WHERE SPACE = :space;\n" + "END;\n", FALSE, trx); + + trx_commit_for_mysql(trx); + trx->dict_operation_lock_mode = 0; + trx_free_for_background(trx); + + if (err == DB_SUCCESS) { + /* We just updated SYS_DATAFILES due to the contents in + a link file. Make a note that we did this. */ + ib_logf(IB_LOG_LEVEL_INFO, + "The InnoDB data dictionary table SYS_DATAFILES " + "for tablespace ID %lu was updated to use file %s.", + (ulong) space_id, filepath); + } else { + ib_logf(IB_LOG_LEVEL_WARN, + "Problem updating InnoDB data dictionary table " + "SYS_DATAFILES for tablespace ID %lu to file %s.", + (ulong) space_id, filepath); + } + + return(err); +} + +/********************************************************************//** +Insert records into SYS_TABLESPACES and SYS_DATAFILES. 
+@return DB_SUCCESS if OK, dberr_t if the insert failed */ +UNIV_INTERN +dberr_t +dict_insert_tablespace_and_filepath( +/*================================*/ + ulint space, /*!< in: space id */ + const char* name, /*!< in: talespace name */ + const char* filepath, /*!< in: filepath */ + ulint fsp_flags) /*!< in: tablespace flags */ +{ + dberr_t err = DB_SUCCESS; + trx_t* trx; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(filepath); + + trx = trx_allocate_for_background(); + trx->op_info = "insert tablespace and filepath"; + trx->dict_operation_lock_mode = RW_X_LATCH; + trx_start_for_ddl(trx, TRX_DICT_OP_INDEX); + + /* A record for this space ID was not found in + SYS_DATAFILES. Assume the record is also missing in + SYS_TABLESPACES. Insert records onto them both. */ + err = dict_create_add_tablespace_to_dictionary( + space, name, fsp_flags, filepath, trx, false); + + trx_commit_for_mysql(trx); + trx->dict_operation_lock_mode = 0; + trx_free_for_background(trx); + + return(err); +} + +/********************************************************************//** +This function looks at each table defined in SYS_TABLES. It checks the +tablespace for any table with a space_id > 0. It looks up the tablespace +in SYS_DATAFILES to ensure the correct path. + In a crash recovery we already have all the tablespace objects created. This function compares the space id information in the InnoDB data dictionary to what we already read with fil_load_single_table_tablespaces(). 
@@ -669,6 +969,7 @@ dict_check_tablespaces_and_store_max_id( ulint max_space_id; mtr_t mtr; + rw_lock_x_lock(&dict_operation_lock); mutex_enter(&(dict_sys->mutex)); mtr_start(&mtr); @@ -682,8 +983,8 @@ dict_check_tablespaces_and_store_max_id( MLOG_4BYTES, &mtr); fil_set_max_space_id_if_bigger(max_space_id); - btr_pcur_open_at_index_side(TRUE, sys_index, BTR_SEARCH_LEAF, &pcur, - TRUE, &mtr); + btr_pcur_open_at_index_side(true, sys_index, BTR_SEARCH_LEAF, &pcur, + true, 0, &mtr); loop: btr_pcur_move_to_next_user_rec(&pcur, &mtr); @@ -703,6 +1004,7 @@ loop: fil_set_max_space_id_if_bigger(max_space_id); mutex_exit(&(dict_sys->mutex)); + rw_lock_x_unlock(&dict_operation_lock); return; } @@ -718,8 +1020,14 @@ loop: field = rec_get_nth_field_old( rec, DICT_FLD__SYS_TABLES__NAME, &len); + name = mem_strdupl((char*) field, len); + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), name, FALSE); + flags = dict_sys_tables_get_flags(rec); if (UNIV_UNLIKELY(flags == ULINT_UNDEFINED)) { /* Read again the 4 bytes from rec. */ @@ -728,13 +1036,9 @@ loop: ut_ad(len == 4); /* this was checked earlier */ flags = mach_read_from_4(field); - ut_print_timestamp(stderr); - fputs(" InnoDB: Error: table ", stderr); - ut_print_filename(stderr, name); - fprintf(stderr, "\n" - "InnoDB: in InnoDB data dictionary" - " has unknown type %lx.\n", - (ulong) flags); + ib_logf(IB_LOG_LEVEL_ERROR, + "Table '%s' in InnoDB data dictionary" + " has unknown type %lx", table_name, flags); goto loop; } @@ -749,43 +1053,84 @@ loop: mtr_commit(&mtr); + /* For tables created with old versions of InnoDB, + SYS_TABLES.MIX_LEN may contain garbage. Such tables + would always be in ROW_FORMAT=REDUNDANT. Pretend that + all such tables are non-temporary. That is, do not + suppress error printouts about temporary or discarded + tablespaces not being found. 
*/ + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__MIX_LEN, &len); + + bool is_temp = false; + bool discarded = false; + ib_uint32_t flags2 = mach_read_from_4(field); + + /* Check that the tablespace (the .ibd file) really + exists; print a warning to the .err log if not. + Do not print warnings for temporary tables or for + tablespaces that have been discarded. */ + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__N_COLS, &len); + + /* MIX_LEN valid only for ROW_FORMAT > REDUNDANT. */ + if (mach_read_from_4(field) & DICT_N_COLS_COMPACT) { + + is_temp = !!(flags2 & DICT_TF2_TEMPORARY); + discarded = !!(flags2 & DICT_TF2_DISCARDED); + } + if (space_id == 0) { /* The system tablespace always exists. */ + ut_ad(!discarded); } else if (in_crash_recovery) { - /* Check that the tablespace (the .ibd file) really - exists; print a warning to the .err log if not. - Do not print warnings for temporary tables. */ - ibool is_temp; + /* All tablespaces should have been found in + fil_load_single_table_tablespaces(). */ - field = rec_get_nth_field_old( - rec, DICT_FLD__SYS_TABLES__N_COLS, &len); - if (mach_read_from_4(field) & DICT_N_COLS_COMPACT) { - /* ROW_FORMAT=COMPACT: read the is_temp - flag from SYS_TABLES.MIX_LEN. */ - field = rec_get_nth_field_old( - rec, 7/*MIX_LEN*/, &len); - is_temp = !!(mach_read_from_4(field) - & DICT_TF2_TEMPORARY); - } else { - /* For tables created with old versions - of InnoDB, SYS_TABLES.MIX_LEN may contain - garbage. Such tables would always be - in ROW_FORMAT=REDUNDANT. Pretend that - all such tables are non-temporary. That is, - do not suppress error printouts about - temporary tables not being found. */ - is_temp = FALSE; + fil_space_for_table_exists_in_mem( + space_id, name, TRUE, !(is_temp || discarded), + false, NULL, 0); + + } else if (!discarded) { + + /* It is a normal database startup: create the + space object and check that the .ibd file exists. 
+ If the table uses a remote tablespace, look for the + space_id in SYS_DATAFILES to find the filepath */ + + /* Use the remote filepath if known. */ + char* filepath = NULL; + if (DICT_TF_HAS_DATA_DIR(flags)) { + filepath = dict_get_first_path( + space_id, name); } - fil_space_for_table_exists_in_mem( - space_id, name, TRUE, !is_temp); - } else { - /* It is a normal database startup: create the space - object and check that the .ibd file exists. */ + /* We set the 2nd param (fix_dict = true) + here because we already have an x-lock on + dict_operation_lock and dict_sys->mutex. Besides, + this is at startup and we are now single threaded. + If the filepath is not known, it will need to + be discovered. */ + dberr_t err = fil_open_single_table_tablespace( + false, srv_read_only_mode ? false : true, + space_id, dict_tf_to_fsp_flags(flags), + name, filepath); + + if (err != DB_SUCCESS) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Tablespace open failed for '%s', " + "ignored.", table_name); + } - fil_open_single_table_tablespace( - FALSE, space_id, - dict_tf_to_fsp_flags(flags), name); + if (filepath) { + mem_free(filepath); + } + } else { + ib_logf(IB_LOG_LEVEL_INFO, + "DISCARD flag set for table '%s', ignored.", + table_name); } mem_free(name); @@ -879,7 +1224,7 @@ err_len: field = rec_get_nth_field_old( rec, DICT_FLD__SYS_COLUMNS__NAME, &len); - if (len < 1 || len == UNIV_SQL_NULL) { + if (len == 0 || len == UNIV_SQL_NULL) { goto err_len; } @@ -1003,6 +1348,11 @@ dict_load_columns( err_msg = dict_load_column_low(table, heap, NULL, NULL, &name, rec); + if (err_msg) { + fprintf(stderr, "InnoDB: %s\n", err_msg); + ut_error; + } + /* Note: Currently we have one DOC_ID column that is shared by all FTS indexes on a table. 
*/ if (innobase_strcasecmp(name, @@ -1037,11 +1387,6 @@ dict_load_columns( table->fts->doc_col = i; } - if (err_msg) { - fprintf(stderr, "InnoDB: %s\n", err_msg); - ut_error; - } - btr_pcur_move_to_next_user_rec(&pcur, &mtr); } @@ -1154,7 +1499,7 @@ err_len: field = rec_get_nth_field_old( rec, DICT_FLD__SYS_FIELDS__COL_NAME, &len); - if (len < 1 || len == UNIV_SQL_NULL) { + if (len == 0 || len == UNIV_SQL_NULL) { goto err_len; } @@ -1194,7 +1539,7 @@ dict_load_fields( byte* buf; ulint i; mtr_t mtr; - ulint error; + dberr_t error; ut_ad(mutex_own(&(dict_sys->mutex))); @@ -1394,8 +1739,8 @@ Loads definitions for table indexes. Adds them to the data dictionary cache. @return DB_SUCCESS if ok, DB_CORRUPTION if corruption of dictionary table or DB_UNSUPPORTED if table has unknown index type */ -static -ulint +static __attribute__((nonnull)) +dberr_t dict_load_indexes( /*==============*/ dict_table_t* table, /*!< in/out: table */ @@ -1412,7 +1757,7 @@ dict_load_indexes( const rec_t* rec; byte* buf; mtr_t mtr; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; ut_ad(mutex_own(&(dict_sys->mutex))); @@ -1443,6 +1788,21 @@ dict_load_indexes( if (!btr_pcur_is_on_user_rec(&pcur)) { + /* We should allow the table to open even + without index when DICT_ERR_IGNORE_CORRUPT is set. + DICT_ERR_IGNORE_CORRUPT is currently only set + for drop table */ + if (dict_table_get_first_index(table) == NULL + && !(ignore_err & DICT_ERR_IGNORE_CORRUPT)) { + ib_logf(IB_LOG_LEVEL_WARN, + "Cannot load table %s " + "because it has no indexes in " + "InnoDB internal data dictionary.", + table->name); + error = DB_CORRUPTION; + goto func_exit; + } + break; } @@ -1456,6 +1816,20 @@ dict_load_indexes( if (err_msg == dict_load_index_id_err) { /* TABLE_ID mismatch means that we have run out of index definitions for the table. 
*/ + + if (dict_table_get_first_index(table) == NULL + && !(ignore_err & DICT_ERR_IGNORE_CORRUPT)) { + ib_logf(IB_LOG_LEVEL_WARN, + "Failed to load the " + "clustered index for table %s " + "because of the following error: %s. " + "Refusing to load the rest of the " + "indexes (if any) and the whole table " + "altogether.", table->name, err_msg); + error = DB_CORRUPTION; + goto func_exit; + } + break; } else if (err_msg == dict_load_index_del) { /* Skip delete-marked records. */ @@ -1510,15 +1884,15 @@ dict_load_indexes( subsequent checks are relevant for the supported types. */ if (index->type & ~(DICT_CLUSTERED | DICT_UNIQUE | DICT_CORRUPT | DICT_FTS)) { - fprintf(stderr, - "InnoDB: Error: unknown type %lu" - " of index %s of table %s\n", + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown type %lu of index %s of table %s", (ulong) index->type, index->name, table->name); error = DB_UNSUPPORTED; dict_mem_index_free(index); goto func_exit; } else if (index->page == FIL_NULL + && !table->ibd_file_missing && (!(index->type & DICT_FTS))) { fprintf(stderr, @@ -1560,7 +1934,7 @@ corrupted: " is not clustered!\n", stderr); goto corrupted; - } else if (table->id < DICT_HDR_FIRST_ID + } else if (dict_is_sys_table(table->id) && (dict_index_is_clust(index) || ((table == dict_sys->sys_tables) && !strcmp("ID_IND", index->name)))) { @@ -1570,8 +1944,10 @@ corrupted: dict_mem_index_free(index); } else { dict_load_fields(index, heap); - error = dict_index_add_to_cache(table, index, - index->page, FALSE); + + error = dict_index_add_to_cache( + table, index, index->page, FALSE); + /* The data dictionary tables should never contain invalid index definitions. 
If we ignored this error and simply did not load this index definition, the @@ -1629,7 +2005,7 @@ dict_load_table_low( rec_get_nth_field_offs_old( rec, DICT_FLD__SYS_TABLES__NAME, &len); - if (len < 1 || len == UNIV_SQL_NULL) { + if (len == 0 || len == UNIV_SQL_NULL) { err_len: return("incorrect column length in SYS_TABLES"); } @@ -1751,6 +2127,77 @@ err_len: } /********************************************************************//** +Using the table->heap, copy the null-terminated filepath into +table->data_dir_path and replace the 'databasename/tablename.ibd' +portion with 'tablename'. +This allows SHOW CREATE TABLE to return the correct DATA DIRECTORY path. +Make this data directory path only if it has not yet been saved. */ +UNIV_INTERN +void +dict_save_data_dir_path( +/*====================*/ + dict_table_t* table, /*!< in/out: table */ + char* filepath) /*!< in: filepath of tablespace */ +{ + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_a(DICT_TF_HAS_DATA_DIR(table->flags)); + + ut_a(!table->data_dir_path); + ut_a(filepath); + + /* Be sure this filepath is not the default filepath. */ + char* default_filepath = fil_make_ibd_name(table->name, false); + if (strcmp(filepath, default_filepath)) { + ulint pathlen = strlen(filepath); + ut_a(pathlen < OS_FILE_MAX_PATH); + ut_a(0 == strcmp(filepath + pathlen - 4, ".ibd")); + + table->data_dir_path = mem_heap_strdup(table->heap, filepath); + os_file_make_data_dir_path(table->data_dir_path); + } else { + /* This does not change SYS_DATAFILES or SYS_TABLES + or FSP_FLAGS on the header page of the tablespace, + but it makes dict_table_t consistent */ + table->flags &= ~DICT_TF_MASK_DATA_DIR; + } + mem_free(default_filepath); +} + +/*****************************************************************//** +Make sure the data_file_name is saved in dict_table_t if needed. Try to +read it from the file dictionary first, then from SYS_DATAFILES. 
*/ +UNIV_INTERN +void +dict_get_and_save_data_dir_path( +/*============================*/ + dict_table_t* table, /*!< in/out: table */ + bool dict_mutex_own) /*!< in: true if dict_sys->mutex + is owned already */ +{ + if (DICT_TF_HAS_DATA_DIR(table->flags) + && (!table->data_dir_path)) { + char* path = fil_space_get_first_path(table->space); + + if (!dict_mutex_own) { + dict_mutex_enter_for_mysql(); + } + if (!path) { + path = dict_get_first_path( + table->space, table->name); + } + + if (path) { + dict_save_data_dir_path(table, path); + mem_free(path); + } + + if (!dict_mutex_own) { + dict_mutex_exit_for_mysql(); + } + } +} + +/********************************************************************//** Loads a table definition and also all its index definitions, and also the cluster definition if the table is a member in a cluster. Also loads all foreign key constraints where the foreign key is in the table or where @@ -1770,6 +2217,7 @@ dict_load_table( /*!< in: error to be ignored when loading table and its indexes' definition */ { + dberr_t err; dict_table_t* table; dict_table_t* sys_tables; btr_pcur_t pcur; @@ -1780,7 +2228,7 @@ dict_load_table( const rec_t* rec; const byte* field; ulint len; - ulint err; + char* filepath = NULL; const char* err_msg; mtr_t mtr; @@ -1843,39 +2291,71 @@ err_exit: goto err_exit; } + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name(table_name, sizeof(table_name), name, FALSE); + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + if (table->space == 0) { /* The system tablespace is always available. 
*/ + } else if (table->flags2 & DICT_TF2_DISCARDED) { + + ib_logf(IB_LOG_LEVEL_WARN, + "Table '%s' tablespace is set as discarded.", + table_name); + + table->ibd_file_missing = TRUE; + } else if (!fil_space_for_table_exists_in_mem( - table->space, name, FALSE, FALSE)) { + table->space, name, FALSE, FALSE, true, heap, + table->id)) { - if (table->flags2 & DICT_TF2_TEMPORARY) { + if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_TEMPORARY)) { /* Do not bother to retry opening temporary tables. */ table->ibd_file_missing = TRUE; + } else { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: error: space object of table "); - ut_print_filename(stderr, name); - fprintf(stderr, ",\n" - "InnoDB: space id %lu did not exist in memory." - " Retrying an open.\n", - (ulong) table->space); - /* Try to open the tablespace */ - if (!fil_open_single_table_tablespace( - TRUE, table->space, + ib_logf(IB_LOG_LEVEL_ERROR, + "Failed to find tablespace for table '%s' " + "in the cache. Attempting to load the " + "tablespace with space id %lu.", + table_name, (ulong) table->space); + + /* Use the remote filepath if needed. */ + if (DICT_TF_HAS_DATA_DIR(table->flags)) { + /* This needs to be added to the table + from SYS_DATAFILES */ + dict_get_and_save_data_dir_path(table, true); + + if (table->data_dir_path) { + filepath = os_file_make_remote_pathname( + table->data_dir_path, + table->name, "ibd"); + } + } + + /* Try to open the tablespace. 
We set the + 2nd param (fix_dict = false) here because we + do not have an x-lock on dict_operation_lock */ + err = fil_open_single_table_tablespace( + true, false, table->space, dict_tf_to_fsp_flags(table->flags), - name)) { + name, filepath); + + if (err != DB_SUCCESS) { /* We failed to find a sensible tablespace file */ table->ibd_file_missing = TRUE; } + if (filepath) { + mem_free(filepath); + } } } - btr_pcur_close(&pcur); - mtr_commit(&mtr); - dict_load_columns(table, heap); if (cached) { @@ -1886,7 +2366,15 @@ err_exit: mem_heap_empty(heap); - err = dict_load_indexes(table, heap, ignore_err); + /* If there is no tablespace for the table then we only need to + load the index definitions. So that we can IMPORT the tablespace + later. */ + if (table->ibd_file_missing) { + err = dict_load_indexes( + table, heap, DICT_ERR_IGNORE_ALL); + } else { + err = dict_load_indexes(table, heap, ignore_err); + } if (err == DB_INDEX_CORRUPT) { /* Refuse to load the table if the table has a corrupted @@ -1920,7 +2408,8 @@ err_exit: of the error condition, since the user may want to dump data from the clustered index. However we load the foreign key information only if all indexes were loaded. */ - if (!cached) { + if (!cached || table->ibd_file_missing) { + /* Don't attempt to load the indexes from disk. */ } else if (err == DB_SUCCESS) { err = dict_load_foreigns(table->name, TRUE, TRUE); @@ -1937,11 +2426,15 @@ err_exit: Otherwise refuse to load the table */ index = dict_table_get_first_index(table); - if (!srv_force_recovery || !index + if (!srv_force_recovery + || !index || !dict_index_is_clust(index)) { + dict_table_remove_from_cache(table); table = NULL; - } else if (dict_index_is_corrupted(index)) { + + } else if (dict_index_is_corrupted(index) + && !table->ibd_file_missing) { /* It is possible we force to load a corrupted clustered index if srv_load_corrupted is set. 
@@ -1949,36 +2442,28 @@ err_exit: table->corrupted = TRUE; } } -#if 0 - if (err != DB_SUCCESS && table != NULL) { - mutex_enter(&dict_foreign_err_mutex); - - ut_print_timestamp(stderr); - - fprintf(stderr, - " InnoDB: Error: could not make a foreign key" - " definition to match\n" - "InnoDB: the foreign key table" - " or the referenced table!\n" - "InnoDB: The data dictionary of InnoDB is corrupt." - " You may need to drop\n" - "InnoDB: and recreate the foreign key table" - " or the referenced table.\n" - "InnoDB: Submit a detailed bug report" - " to http://bugs.mysql.com\n" - "InnoDB: Latest foreign key error printout:\n%s\n", - dict_foreign_err_buf); - - mutex_exit(&dict_foreign_err_mutex); - } -#endif /* 0 */ func_exit: mem_heap_free(heap); - ut_ad(!table || ignore_err != DICT_ERR_IGNORE_NONE + ut_ad(!table + || ignore_err != DICT_ERR_IGNORE_NONE + || table->ibd_file_missing || !table->corrupted); + if (table && table->fts) { + if (!(dict_table_has_fts_index(table) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID))) { + /* the table->fts could be created in dict_load_column + when a user defined FTS_DOC_ID is present, but no + FTS */ + fts_free(table); + } else { + fts_optimize_add_table(table); + } + } + return(table); } @@ -2019,6 +2504,7 @@ dict_load_table_on_id( sys_table_ids = dict_table_get_next_index( dict_table_get_first_index(sys_tables)); ut_ad(!dict_table_is_comp(sys_tables)); + ut_ad(!dict_index_is_clust(sys_table_ids)); heap = mem_heap_create(256); tuple = dtuple_create(heap, 1); @@ -2099,15 +2585,20 @@ dict_load_sys_table( } /********************************************************************//** -Loads foreign key constraint col names (also for the referenced table). */ +Loads foreign key constraint col names (also for the referenced table). 
+Members that must be set (and valid) in foreign: +foreign->heap +foreign->n_fields +foreign->id ('\0'-terminated) +Members that will be created and set by this function: +foreign->foreign_col_names[i] +foreign->referenced_col_names[i] +(for i=0..foreign->n_fields-1) */ static void dict_load_foreign_cols( /*===================*/ - const char* id, /*!< in: foreign constraint id, not - necessary '\0'-terminated */ - ulint id_len, /*!< in: id length */ - dict_foreign_t* foreign)/*!< in: foreign constraint object */ + dict_foreign_t* foreign)/*!< in/out: foreign constraint object */ { dict_table_t* sys_foreign_cols; dict_index_t* sys_index; @@ -2119,9 +2610,12 @@ dict_load_foreign_cols( ulint len; ulint i; mtr_t mtr; + size_t id_len; ut_ad(mutex_own(&(dict_sys->mutex))); + id_len = strlen(foreign->id); + foreign->foreign_col_names = static_cast<const char**>( mem_heap_alloc(foreign->heap, foreign->n_fields * sizeof(void*))); @@ -2140,7 +2634,7 @@ dict_load_foreign_cols( tuple = dtuple_create(foreign->heap, 1); dfield = dtuple_get_nth_field(tuple, 0); - dfield_set_data(dfield, id, id_len); + dfield_set_data(dfield, foreign->id, id_len); dict_index_copy_types(tuple, sys_index, 1); btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE, @@ -2154,8 +2648,42 @@ dict_load_foreign_cols( field = rec_get_nth_field_old( rec, DICT_FLD__SYS_FOREIGN_COLS__ID, &len); - ut_a(len == id_len); - ut_a(ut_memcmp(id, field, len) == 0); + + if (len != id_len || ut_memcmp(foreign->id, field, len) != 0) { + const rec_t* pos; + ulint pos_len; + const rec_t* for_col_name; + ulint for_col_name_len; + const rec_t* ref_col_name; + ulint ref_col_name_len; + + pos = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__POS, + &pos_len); + + for_col_name = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME, + &for_col_name_len); + + ref_col_name = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME, + &ref_col_name_len); + + ib_logf(IB_LOG_LEVEL_ERROR, + 
"Unable to load columns names for foreign " + "key '%s' because it was not found in " + "InnoDB internal table SYS_FOREIGN_COLS. The " + "closest entry we found is: " + "(ID='%.*s', POS=%lu, FOR_COL_NAME='%.*s', " + "REF_COL_NAME='%.*s')", + foreign->id, + (int) len, field, + mach_read_from_4(pos), + (int) for_col_name_len, for_col_name, + (int) ref_col_name_len, ref_col_name); + + ut_error; + } field = rec_get_nth_field_old( rec, DICT_FLD__SYS_FOREIGN_COLS__POS, &len); @@ -2182,13 +2710,12 @@ dict_load_foreign_cols( /***********************************************************************//** Loads a foreign key constraint to the dictionary cache. @return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t dict_load_foreign( /*==============*/ - const char* id, /*!< in: foreign constraint id, not - necessary '\0'-terminated */ - ulint id_len, /*!< in: id length */ + const char* id, /*!< in: foreign constraint id, must be + '\0'-terminated */ ibool check_charsets, /*!< in: TRUE=check charset compatibility */ ibool check_recursive) @@ -2210,9 +2737,12 @@ dict_load_foreign( mtr_t mtr; dict_table_t* for_table; dict_table_t* ref_table; + size_t id_len; ut_ad(mutex_own(&(dict_sys->mutex))); + id_len = strlen(id); + heap2 = mem_heap_create(1000); mtr_start(&mtr); @@ -2238,8 +2768,8 @@ dict_load_foreign( fprintf(stderr, "InnoDB: Error: cannot load foreign constraint " - "%.*s: could not find the relevant record in " - "SYS_FOREIGN\n", (int) id_len, id); + "%s: could not find the relevant record in " + "SYS_FOREIGN\n", id); btr_pcur_close(&pcur); mtr_commit(&mtr); @@ -2255,8 +2785,8 @@ dict_load_foreign( fprintf(stderr, "InnoDB: Error: cannot load foreign constraint " - "%.*s: found %.*s instead in SYS_FOREIGN\n", - (int) id_len, id, (int) len, field); + "%s: found %.*s instead in SYS_FOREIGN\n", + id, (int) len, field); btr_pcur_close(&pcur); mtr_commit(&mtr); @@ -2301,7 +2831,7 @@ dict_load_foreign( 
btr_pcur_close(&pcur); mtr_commit(&mtr); - dict_load_foreign_cols(id, id_len, foreign); + dict_load_foreign_cols(foreign); ref_table = dict_table_check_if_in_cache_low( foreign->referenced_table_name_lookup); @@ -2371,7 +2901,7 @@ cache already contains all constraints where the other relevant table is already in the dictionary cache. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t dict_load_foreigns( /*===============*/ const char* table_name, /*!< in: table name */ @@ -2389,7 +2919,7 @@ dict_load_foreigns( const rec_t* rec; const byte* field; ulint len; - ulint err; + dberr_t err; mtr_t mtr; ut_ad(mutex_own(&(dict_sys->mutex))); @@ -2414,6 +2944,7 @@ dict_load_foreigns( sec_index = dict_table_get_next_index( dict_table_get_first_index(sys_foreign)); + ut_ad(!dict_index_is_clust(sec_index)); start_load: tuple = dtuple_create_from_mem(tuple_buf, sizeof(tuple_buf), 1); @@ -2436,7 +2967,6 @@ loop: /* Now we have the record in the secondary index containing a table name and a foreign constraint ID */ - rec = btr_pcur_get_rec(&pcur); field = rec_get_nth_field_old( rec, DICT_FLD__SYS_FOREIGN_FOR_NAME__NAME, &len); @@ -2475,14 +3005,21 @@ loop: field = rec_get_nth_field_old( rec, DICT_FLD__SYS_FOREIGN_FOR_NAME__ID, &len); + /* Copy the string because the page may be modified or evicted + after mtr_commit() below. 
*/ + char fk_id[MAX_TABLE_NAME_LEN + 1]; + + ut_a(len <= MAX_TABLE_NAME_LEN); + memcpy(fk_id, field, len); + fk_id[len] = '\0'; + btr_pcur_store_position(&pcur, &mtr); mtr_commit(&mtr); /* Load the foreign constraint definition to the dictionary cache */ - err = dict_load_foreign((char*) field, len, check_charsets, - check_recursive); + err = dict_load_foreign(fk_id, check_charsets, check_recursive); if (err != DB_SUCCESS) { btr_pcur_close(&pcur); diff --git a/storage/innobase/dict/dict0mem.cc b/storage/innobase/dict/dict0mem.cc index 28b935d2e58..116a6a6d96a 100644 --- a/storage/innobase/dict/dict0mem.cc +++ b/storage/innobase/dict/dict0mem.cc @@ -1,6 +1,7 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -35,8 +36,9 @@ Created 1/8/1996 Heikki Tuuri #include "dict0dict.h" #include "fts0priv.h" #ifndef UNIV_HOTBACKUP -#include "ha_prototypes.h" /* innobase_casedn_str(), +# include "ha_prototypes.h" /* innobase_casedn_str(), innobase_get_lower_case_table_names */ +# include "mysql_com.h" /* NAME_LEN */ # include "lock0lock.h" #endif /* !UNIV_HOTBACKUP */ #ifdef UNIV_BLOB_DEBUG @@ -51,6 +53,10 @@ Created 1/8/1996 Heikki Tuuri UNIV_INTERN mysql_pfs_key_t autoinc_mutex_key; #endif /* UNIV_PFS_MUTEX */ +/** Prefix for tmp tables, adopted from sql/table.h */ +#define tmp_file_prefix "#sql" +#define tmp_file_prefix_length 4 + /**********************************************************************//** Creates a table memory object. 
@return own: table object */ @@ -60,9 +66,7 @@ dict_mem_table_create( /*==================*/ const char* name, /*!< in: table name */ ulint space, /*!< in: space where the clustered index of - the table is placed; this parameter is - ignored if the table is made a member of - a cluster */ + the table is placed */ ulint n_cols, /*!< in: number of columns */ ulint flags, /*!< in: table flags */ ulint flags2) /*!< in: table flags2 */ @@ -71,7 +75,7 @@ dict_mem_table_create( mem_heap_t* heap; ut_ad(name); - dict_tf_validate(flags); + ut_a(dict_tf_is_valid(flags)); ut_a(!(flags2 & ~DICT_TF2_BIT_MASK)); heap = mem_heap_create(DICT_HEAP_SIZE); @@ -115,7 +119,6 @@ dict_mem_table_create( || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) { table->fts = fts_create(table); table->fts->cache = fts_cache_create(table); - fts_optimize_add_table(table); } else { table->fts = NULL; } @@ -243,6 +246,156 @@ dict_mem_table_add_col( dict_mem_fill_column_struct(col, i, mtype, prtype, len); } +/**********************************************************************//** +Renames a column of a table in the data dictionary cache. */ +static __attribute__((nonnull)) +void +dict_mem_table_col_rename_low( +/*==========================*/ + dict_table_t* table, /*!< in/out: table */ + unsigned i, /*!< in: column offset corresponding to s */ + const char* to, /*!< in: new column name */ + const char* s) /*!< in: pointer to table->col_names */ +{ + size_t from_len = strlen(s), to_len = strlen(to); + + ut_ad(i < table->n_def); + ut_ad(from_len <= NAME_LEN); + ut_ad(to_len <= NAME_LEN); + + if (from_len == to_len) { + /* The easy case: simply replace the column name in + table->col_names. */ + strcpy(const_cast<char*>(s), to); + } else { + /* We need to adjust all affected index->field + pointers, as in dict_index_add_col(). First, copy + table->col_names. 
*/ + ulint prefix_len = s - table->col_names; + + for (; i < table->n_def; i++) { + s += strlen(s) + 1; + } + + ulint full_len = s - table->col_names; + char* col_names; + + if (to_len > from_len) { + col_names = static_cast<char*>( + mem_heap_alloc( + table->heap, + full_len + to_len - from_len)); + + memcpy(col_names, table->col_names, prefix_len); + } else { + col_names = const_cast<char*>(table->col_names); + } + + memcpy(col_names + prefix_len, to, to_len); + memmove(col_names + prefix_len + to_len, + table->col_names + (prefix_len + from_len), + full_len - (prefix_len + from_len)); + + /* Replace the field names in every index. */ + for (dict_index_t* index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + ulint n_fields = dict_index_get_n_fields(index); + + for (ulint i = 0; i < n_fields; i++) { + dict_field_t* field + = dict_index_get_nth_field( + index, i); + ulint name_ofs + = field->name - table->col_names; + if (name_ofs <= prefix_len) { + field->name = col_names + name_ofs; + } else { + ut_a(name_ofs < full_len); + field->name = col_names + + name_ofs + to_len - from_len; + } + } + } + + table->col_names = col_names; + } + + /* Replace the field names in every foreign key constraint. */ + for (dict_foreign_t* foreign = UT_LIST_GET_FIRST(table->foreign_list); + foreign != NULL; + foreign = UT_LIST_GET_NEXT(foreign_list, foreign)) { + for (unsigned f = 0; f < foreign->n_fields; f++) { + /* These can point straight to + table->col_names, because the foreign key + constraints will be freed at the same time + when the table object is freed. 
*/ + foreign->foreign_col_names[f] + = dict_index_get_nth_field( + foreign->foreign_index, f)->name; + } + } + + for (dict_foreign_t* foreign = UT_LIST_GET_FIRST( + table->referenced_list); + foreign != NULL; + foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) { + for (unsigned f = 0; f < foreign->n_fields; f++) { + /* foreign->referenced_col_names[] need to be + copies, because the constraint may become + orphan when foreign_key_checks=0 and the + parent table is dropped. */ + + const char* col_name = dict_index_get_nth_field( + foreign->referenced_index, f)->name; + + if (strcmp(foreign->referenced_col_names[f], + col_name)) { + char** rc = const_cast<char**>( + foreign->referenced_col_names + f); + size_t col_name_len_1 = strlen(col_name) + 1; + + if (col_name_len_1 <= strlen(*rc) + 1) { + memcpy(*rc, col_name, col_name_len_1); + } else { + *rc = static_cast<char*>( + mem_heap_dup( + foreign->heap, + col_name, + col_name_len_1)); + } + } + } + } +} + +/**********************************************************************//** +Renames a column of a table in the data dictionary cache. */ +UNIV_INTERN +void +dict_mem_table_col_rename( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + unsigned nth_col,/*!< in: column index */ + const char* from, /*!< in: old column name */ + const char* to) /*!< in: new column name */ +{ + const char* s = table->col_names; + + ut_ad(nth_col < table->n_def); + + for (unsigned i = 0; i < nth_col; i++) { + size_t len = strlen(s); + ut_ad(len > 0); + s += len + 1; + } + + /* This could fail if the data dictionaries are out of sync. + Proceed with the renaming anyway. 
*/ + ut_ad(!strcmp(from, s)); + + dict_mem_table_col_rename_low(table, nth_col, to, s); +} /**********************************************************************//** This function populates a dict_col_t memory structure with @@ -304,6 +457,8 @@ dict_mem_index_create( dict_mem_fill_index_struct(index, heap, table_name, index_name, space, type, n_fields); + os_fast_mutex_init(zip_pad_mutex_key, &index->zip_pad.mutex); + return(index); } @@ -436,5 +591,31 @@ dict_mem_index_free( } #endif /* UNIV_BLOB_DEBUG */ + os_fast_mutex_free(&index->zip_pad.mutex); + mem_heap_free(index->heap); } + +/*******************************************************************//** +Create a temporary tablename. +@return temporary tablename suitable for InnoDB use */ +UNIV_INTERN +char* +dict_mem_create_temporary_tablename( +/*================================*/ + mem_heap_t* heap, /*!< in: memory heap */ + const char* dbtab, /*!< in: database/table name */ + table_id_t id) /*!< in: InnoDB table id */ +{ + const char* dbend = strchr(dbtab, '/'); + ut_ad(dbend); + size_t dblen = dbend - dbtab + 1; + size_t size = tmp_file_prefix_length + 4 + 9 + 9 + dblen; + + char* name = static_cast<char*>(mem_heap_alloc(heap, size)); + memcpy(name, dbtab, dblen); + ut_snprintf(name + dblen, size - dblen, + tmp_file_prefix "-ib" UINT64PF, id); + return(name); +} + diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc index eebf6b1ec26..ff7e1ce642c 100644 --- a/storage/innobase/dict/dict0stats.cc +++ b/storage/innobase/dict/dict0stats.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2009, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2009, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -29,27 +29,27 @@ Created Jan 06, 2010 Vasil Dimov #include "btr0btr.h" /* btr_get_size() */ #include "btr0cur.h" /* btr_estimate_number_of_different_key_vals() */ -#include "dict0dict.h" /* dict_table_get_first_index() */ +#include "dict0dict.h" /* dict_table_get_first_index(), dict_fs2utf8() */ #include "dict0mem.h" /* DICT_TABLE_MAGIC_N */ #include "dict0stats.h" #include "data0type.h" /* dtype_t */ -#include "db0err.h" /* db_err */ +#include "db0err.h" /* dberr_t */ #include "dyn0dyn.h" /* dyn_array* */ +#include "page0page.h" /* page_align() */ #include "pars0pars.h" /* pars_info_create() */ #include "pars0types.h" /* pars_info_t */ #include "que0que.h" /* que_eval_sql() */ #include "rem0cmp.h" /* REC_MAX_N_FIELDS,cmp_rec_rec_with_match() */ -#include "row0sel.h" /* sel_node_struct */ +#include "row0sel.h" /* sel_node_t */ #include "row0types.h" /* sel_node_t */ #include "trx0trx.h" /* trx_create() */ #include "trx0roll.h" /* trx_rollback_to_savepoint() */ #include "ut0rnd.h" /* ut_rnd_interval() */ - -#include "ha_prototypes.h" /* innobase_strcasecmp() */ +#include "ut0ut.h" /* ut_format_name(), ut_time() */ /* Sampling algorithm description @{ -The algorithm is controlled by one number - srv_stats_persistent_sample_pages, +The algorithm is controlled by one number - N_SAMPLE_PAGES(index), let it be A, which is the number of leaf pages to analyze for a given index for each n-prefix (if the index is on 3 columns, then 3*A leaf pages will be analyzed). @@ -124,126 +124,34 @@ where n=1..n_uniq. #define DEBUG_PRINTF(fmt, ...) 
/* noop */ #endif /* UNIV_STATS_DEBUG */ -/* number of distinct records on a given level that are required to stop -descending to lower levels and fetch -srv_stats_persistent_sample_pages records from that level */ -#define N_DIFF_REQUIRED (srv_stats_persistent_sample_pages * 10) +/* Gets the number of leaf pages to sample in persistent stats estimation */ +#define N_SAMPLE_PAGES(index) \ + ((index)->table->stats_sample_pages != 0 ? \ + (index)->table->stats_sample_pages : \ + srv_stats_persistent_sample_pages) -/** Open handles on the stats tables. Currently this is used to increase the -reference count of the stats tables. */ -typedef struct dict_stats_struct { - dict_table_t* table_stats; /*!< Handle to open TABLE_STATS_NAME */ - dict_table_t* index_stats; /*!< Handle to open INDEX_STATS_NAME */ -} dict_stats_t; +/* number of distinct records on a given level that are required to stop +descending to lower levels and fetch N_SAMPLE_PAGES(index) records +from that level */ +#define N_DIFF_REQUIRED(index) (N_SAMPLE_PAGES(index) * 10) /*********************************************************************//** -Calculates new estimates for table and index statistics. This function -is relatively quick and is used to calculate transient statistics that -are not saved on disk. -This was the only way to calculate statistics before the -Persistent Statistics feature was introduced. 
-dict_stats_update_transient() @{ */ -static -void -dict_stats_update_transient( -/*========================*/ - dict_table_t* table) /*!< in/out: table */ +Checks whether an index should be ignored in stats manipulations: +* stats fetch +* stats recalc +* stats save +dict_stats_should_ignore_index() @{ +@return true if exists and all tables are ok */ +UNIV_INLINE +bool +dict_stats_should_ignore_index( +/*===========================*/ + const dict_index_t* index) /*!< in: index */ { - dict_index_t* index; - ulint sum_of_index_sizes = 0; - - /* Find out the sizes of the indexes and how many different values - for the key they approximately have */ - - index = dict_table_get_first_index(table); - - if (index == NULL) { - /* Table definition is corrupt */ - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: table %s has no indexes. " - "Cannot calculate statistics.\n", table->name); - return; - } - - do { - - if (index->type & DICT_FTS) { - index = dict_table_get_next_index(index); - continue; - } - - if (UNIV_LIKELY - (srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE - || (srv_force_recovery < SRV_FORCE_NO_LOG_REDO - && dict_index_is_clust(index)))) { - mtr_t mtr; - ulint size; - - mtr_start(&mtr); - mtr_s_lock(dict_index_get_lock(index), &mtr); - - size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr); - - if (size != ULINT_UNDEFINED) { - index->stat_index_size = size; - - size = btr_get_size( - index, BTR_N_LEAF_PAGES, &mtr); - } - - mtr_commit(&mtr); - - switch (size) { - case ULINT_UNDEFINED: - goto fake_statistics; - case 0: - /* The root node of the tree is a leaf */ - size = 1; - } - - sum_of_index_sizes += index->stat_index_size; - - index->stat_n_leaf_pages = size; - - btr_estimate_number_of_different_key_vals(index); - } else { - /* If we have set a high innodb_force_recovery - level, do not calculate statistics, as a badly - corrupted index can cause a crash in it. 
- Initialize some bogus index cardinality - statistics, so that the data can be queried in - various means, also via secondary indexes. */ - ulint i; - -fake_statistics: - sum_of_index_sizes++; - index->stat_index_size = index->stat_n_leaf_pages = 1; - - for (i = dict_index_get_n_unique(index); i; ) { - index->stat_n_diff_key_vals[i--] = 1; - } - - memset(index->stat_n_non_null_key_vals, 0, - (1 + dict_index_get_n_unique(index)) - * sizeof(*index->stat_n_non_null_key_vals)); - } - - index = dict_table_get_next_index(index); - } while (index); - - index = dict_table_get_first_index(table); - - table->stat_n_rows = index->stat_n_diff_key_vals[ - dict_index_get_n_unique(index)]; - - table->stat_clustered_index_size = index->stat_index_size; - - table->stat_sum_of_other_index_sizes = sum_of_index_sizes - - index->stat_index_size; - - table->stat_modified_counter = 0; - - table->stat_initialized = TRUE; + return((index->type & DICT_FTS) + || dict_index_is_corrupted(index) + || index->to_be_dropped + || *index->name == TEMP_INDEX_PREFIX); } /* @} */ @@ -251,24 +159,24 @@ fake_statistics: Checks whether the persistent statistics storage exists and that all tables have the proper structure. 
dict_stats_persistent_storage_check() @{ -@return TRUE if exists and all tables are ok */ +@return true if exists and all tables are ok */ static -ibool +bool dict_stats_persistent_storage_check( /*================================*/ - ibool caller_has_dict_sys_mutex) /*!< in: TRUE if the caller + bool caller_has_dict_sys_mutex) /*!< in: true if the caller owns dict_sys->mutex */ { /* definition for the table TABLE_STATS_NAME */ dict_col_meta_t table_stats_columns[] = { {"database_name", DATA_VARMYSQL, - DATA_NOT_NULL, 192 /* NAME_LEN from mysql_com.h */}, + DATA_NOT_NULL, 192}, {"table_name", DATA_VARMYSQL, - DATA_NOT_NULL, 192 /* NAME_LEN from mysql_com.h */}, + DATA_NOT_NULL, 192}, - {"last_update", DATA_INT, - DATA_NOT_NULL | DATA_UNSIGNED, 4}, + {"last_update", DATA_FIXBINARY, + DATA_NOT_NULL, 4}, {"n_rows", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8}, @@ -282,22 +190,24 @@ dict_stats_persistent_storage_check( dict_table_schema_t table_stats_schema = { TABLE_STATS_NAME, UT_ARR_SIZE(table_stats_columns), - table_stats_columns + table_stats_columns, + 0 /* n_foreign */, + 0 /* n_referenced */ }; /* definition for the table INDEX_STATS_NAME */ dict_col_meta_t index_stats_columns[] = { {"database_name", DATA_VARMYSQL, - DATA_NOT_NULL, 192 /* NAME_LEN from mysql_com.h */}, + DATA_NOT_NULL, 192}, {"table_name", DATA_VARMYSQL, - DATA_NOT_NULL, 192 /* NAME_LEN from mysql_com.h */}, + DATA_NOT_NULL, 192}, {"index_name", DATA_VARMYSQL, - DATA_NOT_NULL, 192 /* NAME_LEN from mysql_com.h */}, + DATA_NOT_NULL, 192}, - {"last_update", DATA_INT, - DATA_NOT_NULL | DATA_UNSIGNED, 4}, + {"last_update", DATA_FIXBINARY, + DATA_NOT_NULL, 4}, {"stat_name", DATA_VARMYSQL, DATA_NOT_NULL, 64*3}, @@ -314,11 +224,13 @@ dict_stats_persistent_storage_check( dict_table_schema_t index_stats_schema = { INDEX_STATS_NAME, UT_ARR_SIZE(index_stats_columns), - index_stats_columns + index_stats_columns, + 0 /* n_foreign */, + 0 /* n_referenced */ }; char errstr[512]; - enum db_err ret; + dberr_t 
ret; if (!caller_has_dict_sys_mutex) { mutex_enter(&(dict_sys->mutex)); @@ -339,24 +251,660 @@ dict_stats_persistent_storage_check( mutex_exit(&(dict_sys->mutex)); } - if (ret != DB_SUCCESS && ret != DB_TABLE_NOT_FOUND) { + if (ret != DB_SUCCESS) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error: %s\n", errstr); + return(false); + } + /* else */ + + return(true); +} +/* @} */ + +/*********************************************************************//** +Executes a given SQL statement using the InnoDB internal SQL parser +in its own transaction and commits it. +This function will free the pinfo object. +@return DB_SUCCESS or error code */ +static +dberr_t +dict_stats_exec_sql( +/*================*/ + pars_info_t* pinfo, /*!< in/out: pinfo to pass to que_eval_sql() + must already have any literals bound to it */ + const char* sql) /*!< in: SQL string to execute */ +{ + trx_t* trx; + dberr_t err; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mutex_own(&dict_sys->mutex)); + + if (!dict_stats_persistent_storage_check(true)) { + pars_info_free(pinfo); + return(DB_STATS_DO_NOT_EXIST); + } + + trx = trx_allocate_for_background(); + trx_start_if_not_started(trx); + + err = que_eval_sql(pinfo, sql, FALSE, trx); /* pinfo is freed here */ + + if (err == DB_SUCCESS) { + trx_commit_for_mysql(trx); + } else { + trx->op_info = "rollback of internal trx on stats tables"; + trx->dict_operation_lock_mode = RW_X_LATCH; + trx_rollback_to_savepoint(trx, NULL); + trx->dict_operation_lock_mode = 0; + trx->op_info = ""; + ut_a(trx->error_state == DB_SUCCESS); + } + + trx_free_for_background(trx); + + return(err); +} + +/*********************************************************************//** +Duplicate a table object and its indexes. 
+This function creates a dummy dict_table_t object and initializes the +following table and index members: +dict_table_t::id (copied) +dict_table_t::heap (newly created) +dict_table_t::name (copied) +dict_table_t::corrupted (copied) +dict_table_t::indexes<> (newly created) +dict_table_t::magic_n +for each entry in dict_table_t::indexes, the following are initialized: +(indexes that have DICT_FTS set in index->type are skipped) +dict_index_t::id (copied) +dict_index_t::name (copied) +dict_index_t::table_name (points to the copied table name) +dict_index_t::table (points to the above semi-initialized object) +dict_index_t::type (copied) +dict_index_t::to_be_dropped (copied) +dict_index_t::online_status (copied) +dict_index_t::n_uniq (copied) +dict_index_t::fields[] (newly created, only first n_uniq, only fields[i].name) +dict_index_t::indexes<> (newly created) +dict_index_t::stat_n_diff_key_vals[] (only allocated, left uninitialized) +dict_index_t::stat_n_sample_sizes[] (only allocated, left uninitialized) +dict_index_t::stat_n_non_null_key_vals[] (only allocated, left uninitialized) +dict_index_t::magic_n +The returned object should be freed with dict_stats_table_clone_free() +when no longer needed. 
+@return incomplete table object */ +static +dict_table_t* +dict_stats_table_clone_create( +/*==========================*/ + const dict_table_t* table) /*!< in: table whose stats to copy */ +{ + size_t heap_size; + dict_index_t* index; + + /* Estimate the size needed for the table and all of its indexes */ + + heap_size = 0; + heap_size += sizeof(dict_table_t); + heap_size += strlen(table->name) + 1; + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (dict_stats_should_ignore_index(index)) { + continue; + } + + ut_ad(!dict_index_is_univ(index)); + + ulint n_uniq = dict_index_get_n_unique(index); + + heap_size += sizeof(dict_index_t); + heap_size += strlen(index->name) + 1; + heap_size += n_uniq * sizeof(index->fields[0]); + for (ulint i = 0; i < n_uniq; i++) { + heap_size += strlen(index->fields[i].name) + 1; + } + heap_size += n_uniq * sizeof(index->stat_n_diff_key_vals[0]); + heap_size += n_uniq * sizeof(index->stat_n_sample_sizes[0]); + heap_size += n_uniq * sizeof(index->stat_n_non_null_key_vals[0]); + } + + /* Allocate the memory and copy the members */ + + mem_heap_t* heap; + + heap = mem_heap_create(heap_size); + + dict_table_t* t; + + t = (dict_table_t*) mem_heap_alloc(heap, sizeof(*t)); + + UNIV_MEM_ASSERT_RW_ABORT(&table->id, sizeof(table->id)); + t->id = table->id; + + t->heap = heap; + + UNIV_MEM_ASSERT_RW_ABORT(table->name, strlen(table->name) + 1); + t->name = (char*) mem_heap_strdup(heap, table->name); + + t->corrupted = table->corrupted; + + UT_LIST_INIT(t->indexes); + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (dict_stats_should_ignore_index(index)) { + continue; + } + + ut_ad(!dict_index_is_univ(index)); + + dict_index_t* idx; + + idx = (dict_index_t*) mem_heap_alloc(heap, sizeof(*idx)); + + UNIV_MEM_ASSERT_RW_ABORT(&index->id, sizeof(index->id)); + idx->id = index->id; + + 
UNIV_MEM_ASSERT_RW_ABORT(index->name, strlen(index->name) + 1); + idx->name = (char*) mem_heap_strdup(heap, index->name); + + idx->table_name = t->name; + + idx->table = t; + + idx->type = index->type; + + idx->to_be_dropped = 0; + + idx->online_status = ONLINE_INDEX_COMPLETE; + + idx->n_uniq = index->n_uniq; + + idx->fields = (dict_field_t*) mem_heap_alloc( + heap, idx->n_uniq * sizeof(idx->fields[0])); + + for (ulint i = 0; i < idx->n_uniq; i++) { + UNIV_MEM_ASSERT_RW_ABORT(index->fields[i].name, strlen(index->fields[i].name) + 1); + idx->fields[i].name = (char*) mem_heap_strdup( + heap, index->fields[i].name); + } + + /* hook idx into t->indexes */ + UT_LIST_ADD_LAST(indexes, t->indexes, idx); + + idx->stat_n_diff_key_vals = (ib_uint64_t*) mem_heap_alloc( + heap, + idx->n_uniq * sizeof(idx->stat_n_diff_key_vals[0])); + + idx->stat_n_sample_sizes = (ib_uint64_t*) mem_heap_alloc( + heap, + idx->n_uniq * sizeof(idx->stat_n_sample_sizes[0])); + + idx->stat_n_non_null_key_vals = (ib_uint64_t*) mem_heap_alloc( + heap, + idx->n_uniq * sizeof(idx->stat_n_non_null_key_vals[0])); + ut_d(idx->magic_n = DICT_INDEX_MAGIC_N); + } + + ut_d(t->magic_n = DICT_TABLE_MAGIC_N); + + return(t); +} + +/*********************************************************************//** +Free the resources occupied by an object returned by +dict_stats_table_clone_create(). +dict_stats_table_clone_free() @{ */ +static +void +dict_stats_table_clone_free( +/*========================*/ + dict_table_t* t) /*!< in: dummy table object to free */ +{ + mem_heap_free(t->heap); +} +/* @} */ + +/*********************************************************************//** +Write all zeros (or 1 where it makes sense) into an index +statistics members. The resulting stats correspond to an empty index. 
+The caller must own index's table stats latch in X mode +(dict_table_stats_lock(table, RW_X_LATCH)) +dict_stats_empty_index() @{ */ +static +void +dict_stats_empty_index( +/*===================*/ + dict_index_t* index) /*!< in/out: index */ +{ + ut_ad(!(index->type & DICT_FTS)); + ut_ad(!dict_index_is_univ(index)); + + ulint n_uniq = index->n_uniq; + + for (ulint i = 0; i < n_uniq; i++) { + index->stat_n_diff_key_vals[i] = 0; + index->stat_n_sample_sizes[i] = 1; + index->stat_n_non_null_key_vals[i] = 0; + } + + index->stat_index_size = 1; + index->stat_n_leaf_pages = 1; +} +/* @} */ + +/*********************************************************************//** +Write all zeros (or 1 where it makes sense) into a table and its indexes' +statistics members. The resulting stats correspond to an empty table. +dict_stats_empty_table() @{ */ +static +void +dict_stats_empty_table( +/*===================*/ + dict_table_t* table) /*!< in/out: table */ +{ + /* Zero the stats members */ + + dict_table_stats_lock(table, RW_X_LATCH); + + table->stat_n_rows = 0; + table->stat_clustered_index_size = 1; + /* 1 page for each index, not counting the clustered */ + table->stat_sum_of_other_index_sizes + = UT_LIST_GET_LEN(table->indexes) - 1; + table->stat_modified_counter = 0; + + dict_index_t* index; + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (index->type & DICT_FTS) { + continue; + } + + ut_ad(!dict_index_is_univ(index)); + + dict_stats_empty_index(index); + } + + table->stat_initialized = TRUE; + + dict_table_stats_unlock(table, RW_X_LATCH); +} +/* @} */ + +/*********************************************************************//** +Check whether index's stats are initialized (assert if they are not). 
*/ +static +void +dict_stats_assert_initialized_index( +/*================================*/ + const dict_index_t* index) /*!< in: index */ +{ + UNIV_MEM_ASSERT_RW_ABORT( + index->stat_n_diff_key_vals, + index->n_uniq * sizeof(index->stat_n_diff_key_vals[0])); + + UNIV_MEM_ASSERT_RW_ABORT( + index->stat_n_sample_sizes, + index->n_uniq * sizeof(index->stat_n_sample_sizes[0])); + + UNIV_MEM_ASSERT_RW_ABORT( + index->stat_n_non_null_key_vals, + index->n_uniq * sizeof(index->stat_n_non_null_key_vals[0])); + + UNIV_MEM_ASSERT_RW_ABORT( + &index->stat_index_size, + sizeof(index->stat_index_size)); + + UNIV_MEM_ASSERT_RW_ABORT( + &index->stat_n_leaf_pages, + sizeof(index->stat_n_leaf_pages)); +} +/*********************************************************************//** +Check whether table's stats are initialized (assert if they are not). */ +static +void +dict_stats_assert_initialized( +/*==========================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_a(table->stat_initialized); + + UNIV_MEM_ASSERT_RW_ABORT(&table->stats_last_recalc, + sizeof(table->stats_last_recalc)); + + UNIV_MEM_ASSERT_RW_ABORT(&table->stat_persistent, + sizeof(table->stat_persistent)); + + UNIV_MEM_ASSERT_RW_ABORT(&table->stats_auto_recalc, + sizeof(table->stats_auto_recalc)); + + UNIV_MEM_ASSERT_RW_ABORT(&table->stats_sample_pages, + sizeof(table->stats_sample_pages)); + + UNIV_MEM_ASSERT_RW_ABORT(&table->stat_n_rows, + sizeof(table->stat_n_rows)); + + UNIV_MEM_ASSERT_RW_ABORT(&table->stat_clustered_index_size, + sizeof(table->stat_clustered_index_size)); + + UNIV_MEM_ASSERT_RW_ABORT(&table->stat_sum_of_other_index_sizes, + sizeof(table->stat_sum_of_other_index_sizes)); + + UNIV_MEM_ASSERT_RW_ABORT(&table->stat_modified_counter, + sizeof(table->stat_modified_counter)); + + UNIV_MEM_ASSERT_RW_ABORT(&table->stats_bg_flag, + sizeof(table->stats_bg_flag)); + + for (dict_index_t* index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) 
{ + + if (!dict_stats_should_ignore_index(index)) { + dict_stats_assert_initialized_index(index); + } + } +} + +#define INDEX_EQ(i1, i2) \ + ((i1) != NULL \ + && (i2) != NULL \ + && (i1)->id == (i2)->id \ + && strcmp((i1)->name, (i2)->name) == 0) +/*********************************************************************//** +Copy table and index statistics from one table to another, including index +stats. Extra indexes in src are ignored and extra indexes in dst are +initialized to correspond to an empty index. */ +static +void +dict_stats_copy( +/*============*/ + dict_table_t* dst, /*!< in/out: destination table */ + const dict_table_t* src) /*!< in: source table */ +{ + dst->stats_last_recalc = src->stats_last_recalc; + dst->stat_n_rows = src->stat_n_rows; + dst->stat_clustered_index_size = src->stat_clustered_index_size; + dst->stat_sum_of_other_index_sizes = src->stat_sum_of_other_index_sizes; + dst->stat_modified_counter = src->stat_modified_counter; + + dict_index_t* dst_idx; + dict_index_t* src_idx; + + for (dst_idx = dict_table_get_first_index(dst), + src_idx = dict_table_get_first_index(src); + dst_idx != NULL; + dst_idx = dict_table_get_next_index(dst_idx), + (src_idx != NULL + && (src_idx = dict_table_get_next_index(src_idx)))) { + + if (dict_stats_should_ignore_index(dst_idx)) { + continue; + } + + ut_ad(!dict_index_is_univ(dst_idx)); + + if (!INDEX_EQ(src_idx, dst_idx)) { + for (src_idx = dict_table_get_first_index(src); + src_idx != NULL; + src_idx = dict_table_get_next_index(src_idx)) { + + if (INDEX_EQ(src_idx, dst_idx)) { + break; + } + } + } + + if (!INDEX_EQ(src_idx, dst_idx)) { + dict_stats_empty_index(dst_idx); + continue; + } + + ulint n_copy_el; + + if (dst_idx->n_uniq > src_idx->n_uniq) { + n_copy_el = src_idx->n_uniq; + /* Since src is smaller some elements in dst + will remain untouched by the following memmove(), + thus we init all of them here. 
*/ + dict_stats_empty_index(dst_idx); + } else { + n_copy_el = dst_idx->n_uniq; + } + + memmove(dst_idx->stat_n_diff_key_vals, + src_idx->stat_n_diff_key_vals, + n_copy_el * sizeof(dst_idx->stat_n_diff_key_vals[0])); + + memmove(dst_idx->stat_n_sample_sizes, + src_idx->stat_n_sample_sizes, + n_copy_el * sizeof(dst_idx->stat_n_sample_sizes[0])); + + memmove(dst_idx->stat_n_non_null_key_vals, + src_idx->stat_n_non_null_key_vals, + n_copy_el * sizeof(dst_idx->stat_n_non_null_key_vals[0])); + + dst_idx->stat_index_size = src_idx->stat_index_size; + + dst_idx->stat_n_leaf_pages = src_idx->stat_n_leaf_pages; + } + + dst->stat_initialized = TRUE; +} + +/*********************************************************************//** +Duplicate the stats of a table and its indexes. +This function creates a dummy dict_table_t object and copies the input +table's stats into it. The returned table object is not in the dictionary +cache and cannot be accessed by any other threads. In addition to the +members copied in dict_stats_table_clone_create() this function initializes +the following: +dict_table_t::stat_initialized +dict_table_t::stat_persistent +dict_table_t::stat_n_rows +dict_table_t::stat_clustered_index_size +dict_table_t::stat_sum_of_other_index_sizes +dict_table_t::stat_modified_counter +dict_index_t::stat_n_diff_key_vals[] +dict_index_t::stat_n_sample_sizes[] +dict_index_t::stat_n_non_null_key_vals[] +dict_index_t::stat_index_size +dict_index_t::stat_n_leaf_pages +The returned object should be freed with dict_stats_snapshot_free() +when no longer needed. 
+@return incomplete table object */ +static +dict_table_t* +dict_stats_snapshot_create( +/*=======================*/ + const dict_table_t* table) /*!< in: table whose stats to copy */ +{ + mutex_enter(&dict_sys->mutex); + + dict_table_stats_lock(table, RW_S_LATCH); + + dict_stats_assert_initialized(table); + + dict_table_t* t; + + t = dict_stats_table_clone_create(table); + + dict_stats_copy(t, table); + + t->stat_persistent = table->stat_persistent; + t->stats_auto_recalc = table->stats_auto_recalc; + t->stats_sample_pages = table->stats_sample_pages; + t->stats_bg_flag = table->stats_bg_flag; + + dict_table_stats_unlock(table, RW_S_LATCH); + + mutex_exit(&dict_sys->mutex); + + return(t); +} + +/*********************************************************************//** +Free the resources occupied by an object returned by +dict_stats_snapshot_create(). +dict_stats_snapshot_free() @{ */ +static +void +dict_stats_snapshot_free( +/*=====================*/ + dict_table_t* t) /*!< in: dummy table object to free */ +{ + dict_stats_table_clone_free(t); +} +/* @} */ + +/*********************************************************************//** +Calculates new estimates for index statistics. This function is +relatively quick and is used to calculate transient statistics that +are not saved on disk. This was the only way to calculate statistics +before the Persistent Statistics feature was introduced. 
+dict_stats_update_transient_for_index() @{ */ +static +void +dict_stats_update_transient_for_index( +/*==================================*/ + dict_index_t* index) /*!< in/out: index */ +{ + if (UNIV_LIKELY + (srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE + || (srv_force_recovery < SRV_FORCE_NO_LOG_REDO + && dict_index_is_clust(index)))) { + mtr_t mtr; + ulint size; + mtr_start(&mtr); + mtr_s_lock(dict_index_get_lock(index), &mtr); + + size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr); + + if (size != ULINT_UNDEFINED) { + index->stat_index_size = size; + + size = btr_get_size( + index, BTR_N_LEAF_PAGES, &mtr); + } + + mtr_commit(&mtr); + + switch (size) { + case ULINT_UNDEFINED: + dict_stats_empty_index(index); + return; + case 0: + /* The root node of the tree is a leaf */ + size = 1; + } + + index->stat_n_leaf_pages = size; + + btr_estimate_number_of_different_key_vals(index); + } else { + /* If we have set a high innodb_force_recovery + level, do not calculate statistics, as a badly + corrupted index can cause a crash in it. + Initialize some bogus index cardinality + statistics, so that the data can be queried in + various means, also via secondary indexes. */ + dict_stats_empty_index(index); + } +} +/* @} */ + +/*********************************************************************//** +Calculates new estimates for table and index statistics. This function +is relatively quick and is used to calculate transient statistics that +are not saved on disk. +This was the only way to calculate statistics before the +Persistent Statistics feature was introduced. 
+dict_stats_update_transient() @{ */ +UNIV_INTERN +void +dict_stats_update_transient( +/*========================*/ + dict_table_t* table) /*!< in/out: table */ +{ + dict_index_t* index; + ulint sum_of_index_sizes = 0; + + /* Find out the sizes of the indexes and how many different values + for the key they approximately have */ + + index = dict_table_get_first_index(table); + + if (dict_table_is_discarded(table)) { + /* Nothing to do. */ + dict_stats_empty_table(table); + return; + } else if (index == NULL) { + /* Table definition is corrupt */ + + char buf[MAX_FULL_NAME_LEN]; ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: %s\n", errstr); + fprintf(stderr, " InnoDB: table %s has no indexes. " + "Cannot calculate statistics.\n", + ut_format_name(table->name, TRUE, buf, sizeof(buf))); + dict_stats_empty_table(table); + return; + } + + for (; index != NULL; index = dict_table_get_next_index(index)) { + + ut_ad(!dict_index_is_univ(index)); + + if (index->type & DICT_FTS) { + continue; + } + + dict_stats_empty_index(index); + + if (dict_stats_should_ignore_index(index)) { + continue; + } + + dict_stats_update_transient_for_index(index); + + sum_of_index_sizes += index->stat_index_size; } - /* We return silently if some of the tables are not present because - this code is executed during open table. By design we check if the - persistent statistics storage is present and whether there are stats - for the table being opened and if so, then we use them, otherwise we - silently switch back to using the transient stats. 
*/ - return(ret == DB_SUCCESS); + index = dict_table_get_first_index(table); + + table->stat_n_rows = index->stat_n_diff_key_vals[ + dict_index_get_n_unique(index) - 1]; + + table->stat_clustered_index_size = index->stat_index_size; + + table->stat_sum_of_other_index_sizes = sum_of_index_sizes + - index->stat_index_size; + + table->stats_last_recalc = ut_time(); + + table->stat_modified_counter = 0; + + table->stat_initialized = TRUE; } /* @} */ /* @{ Pseudo code about the relation between the following functions -let N = srv_stats_persistent_sample_pages +let N = N_SAMPLE_PAGES(index) dict_stats_analyze_index() for each n_prefix @@ -375,14 +923,11 @@ dict_stats_analyze_index() /*********************************************************************//** Find the total number and the number of distinct keys on a given level in an index. Each of the 1..n_uniq prefixes are looked up and the results are -saved in the array n_diff[]. Notice that n_diff[] must be able to store -n_uniq+1 numbers because the results are saved in -n_diff[1] .. n_diff[n_uniq]. The total number of records on the level is -saved in total_recs. +saved in the array n_diff[0] .. n_diff[n_uniq - 1]. The total number of +records on the level is saved in total_recs. Also, the index of the last record in each group of equal records is saved -in n_diff_boundaries[1..n_uniq], records indexing starts from the leftmost -record on the level and continues cross pages boundaries, counting from 0. -dict_stats_analyze_index_level() @{ */ +in n_diff_boundaries[0..n_uniq - 1], records indexing starts from the leftmost +record on the level and continues cross pages boundaries, counting from 0. 
*/ static void dict_stats_analyze_index_level( @@ -393,78 +938,87 @@ dict_stats_analyze_index_level( distinct keys for all prefixes */ ib_uint64_t* total_recs, /*!< out: total number of records */ ib_uint64_t* total_pages, /*!< out: total number of pages */ - dyn_array_t* n_diff_boundaries)/*!< out: boundaries of the groups + dyn_array_t* n_diff_boundaries,/*!< out: boundaries of the groups of distinct keys */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { ulint n_uniq; mem_heap_t* heap; - dtuple_t* dtuple; btr_pcur_t pcur; - mtr_t mtr; const page_t* page; const rec_t* rec; const rec_t* prev_rec; + bool prev_rec_is_copied; byte* prev_rec_buf = NULL; ulint prev_rec_buf_size = 0; + ulint* rec_offsets; + ulint* prev_rec_offsets; ulint i; DEBUG_PRINTF(" %s(table=%s, index=%s, level=%lu)\n", __func__, index->table->name, index->name, level); + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_S_LOCK)); + n_uniq = dict_index_get_n_unique(index); - /* elements in the n_diff array are 1..n_uniq (inclusive) */ - memset(n_diff, 0x0, (n_uniq + 1) * sizeof(*n_diff)); + /* elements in the n_diff array are 0..n_uniq-1 (inclusive) */ + memset(n_diff, 0x0, n_uniq * sizeof(n_diff[0])); + + /* Allocate space for the offsets header (the allocation size at + offsets[0] and the REC_OFFS_HEADER_SIZE bytes), and n_fields + 1, + so that this will never be less than the size calculated in + rec_get_offsets_func(). 
*/ + i = (REC_OFFS_HEADER_SIZE + 1 + 1) + index->n_fields; - heap = mem_heap_create(256); + heap = mem_heap_create((2 * sizeof *rec_offsets) * i); + rec_offsets = static_cast<ulint*>( + mem_heap_alloc(heap, i * sizeof *rec_offsets)); + prev_rec_offsets = static_cast<ulint*>( + mem_heap_alloc(heap, i * sizeof *prev_rec_offsets)); + rec_offs_set_n_alloc(rec_offsets, i); + rec_offs_set_n_alloc(prev_rec_offsets, i); - /* reset the dynamic arrays n_diff_boundaries[1..n_uniq]; - n_diff_boundaries[0] is ignored to follow the same convention - as n_diff[] */ + /* reset the dynamic arrays n_diff_boundaries[0..n_uniq-1] */ if (n_diff_boundaries != NULL) { - for (i = 1; i <= n_uniq; i++) { + for (i = 0; i < n_uniq; i++) { dyn_array_free(&n_diff_boundaries[i]); dyn_array_create(&n_diff_boundaries[i]); } } - /* craft a record that is always smaller than the others, - this way we are sure that the cursor pcur will be positioned - on the leftmost record on the leftmost page on the desired level */ - dtuple = dtuple_create(heap, dict_index_get_n_unique(index)); - dict_table_copy_types(dtuple, index->table); - dtuple_set_info_bits(dtuple, REC_INFO_MIN_REC_FLAG); - - mtr_start(&mtr); + /* Position pcur on the leftmost record on the leftmost page + on the desired level. */ - btr_pcur_open_low(index, level, dtuple, PAGE_CUR_LE, BTR_SEARCH_LEAF, - &pcur, __FILE__, __LINE__, &mtr); + btr_pcur_open_at_index_side( + true, index, BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED, + &pcur, true, level, mtr); + btr_pcur_move_to_next_on_page(&pcur); page = btr_pcur_get_page(&pcur); + /* The page must not be empty, except when + it is the root page (and the whole index is empty). 
*/ + ut_ad(btr_pcur_is_on_user_rec(&pcur) || page_is_leaf(page)); + ut_ad(btr_pcur_get_rec(&pcur) + == page_rec_get_next_const(page_get_infimum_rec(page))); + /* check that we are indeed on the desired level */ - ut_a(btr_page_get_level(page, &mtr) == level); + ut_a(btr_page_get_level(page, mtr) == level); /* there should not be any pages on the left */ - ut_a(btr_page_get_prev(page, &mtr) == FIL_NULL); + ut_a(btr_page_get_prev(page, mtr) == FIL_NULL); /* check whether the first record on the leftmost page is marked as such, if we are on a non-leaf level */ - ut_a(level == 0 - || (REC_INFO_MIN_REC_FLAG & rec_get_info_bits( - page_rec_get_next_const(page_get_infimum_rec(page)), - page_is_comp(page)))); - - if (btr_pcur_is_before_first_on_page(&pcur)) { - btr_pcur_move_to_next_on_page(&pcur); - } - - if (btr_pcur_is_after_last_on_page(&pcur)) { - btr_pcur_move_to_prev_on_page(&pcur); - } + ut_a((level == 0) + == !(REC_INFO_MIN_REC_FLAG & rec_get_info_bits( + btr_pcur_get_rec(&pcur), page_is_comp(page)))); prev_rec = NULL; + prev_rec_is_copied = false; /* no records by default */ *total_recs = 0; @@ -476,56 +1030,83 @@ dict_stats_analyze_index_level( X and the fist on page X+1 */ for (; btr_pcur_is_on_user_rec(&pcur); - btr_pcur_move_to_next_user_rec(&pcur, &mtr)) { + btr_pcur_move_to_next_user_rec(&pcur, mtr)) { ulint matched_fields = 0; ulint matched_bytes = 0; - ulint offsets_rec_onstack[REC_OFFS_NORMAL_SIZE]; - ulint* offsets_rec; - - rec_offs_init(offsets_rec_onstack); + bool rec_is_last_on_page; rec = btr_pcur_get_rec(&pcur); + /* If rec and prev_rec are on different pages, then prev_rec + must have been copied, because we hold latch only on the page + where rec resides. 
*/ + if (prev_rec != NULL + && page_align(rec) != page_align(prev_rec)) { + + ut_a(prev_rec_is_copied); + } + + rec_is_last_on_page = + page_rec_is_supremum(page_rec_get_next_const(rec)); + /* increment the pages counter at the end of each page */ - if (page_rec_is_supremum(page_rec_get_next_const(rec))) { + if (rec_is_last_on_page) { (*total_pages)++; } - /* skip delete-marked records */ - if (rec_get_deleted_flag(rec, page_is_comp( - btr_pcur_get_page(&pcur)))) { + /* Skip delete-marked records on the leaf level. If we + do not skip them, then ANALYZE quickly after DELETE + could count them or not (purge may have already wiped + them away) which brings non-determinism. We skip only + leaf-level delete marks because delete marks on + non-leaf level do not make sense. */ + if (level == 0 && + rec_get_deleted_flag( + rec, + page_is_comp(btr_pcur_get_page(&pcur)))) { + + if (rec_is_last_on_page + && !prev_rec_is_copied + && prev_rec != NULL) { + /* copy prev_rec */ + + prev_rec_offsets = rec_get_offsets( + prev_rec, index, prev_rec_offsets, + n_uniq, &heap); + + prev_rec = rec_copy_prefix_to_buf( + prev_rec, index, + rec_offs_n_fields(prev_rec_offsets), + &prev_rec_buf, &prev_rec_buf_size); + + prev_rec_is_copied = true; + } continue; } - offsets_rec = rec_get_offsets(rec, index, offsets_rec_onstack, - n_uniq, &heap); + rec_offsets = rec_get_offsets( + rec, index, rec_offsets, n_uniq, &heap); (*total_recs)++; if (prev_rec != NULL) { - - ulint offsets_prev_rec_onstack[REC_OFFS_NORMAL_SIZE]; - ulint* offsets_prev_rec; - - rec_offs_init(offsets_prev_rec_onstack); - - offsets_prev_rec = rec_get_offsets( - prev_rec, index, offsets_prev_rec_onstack, + prev_rec_offsets = rec_get_offsets( + prev_rec, index, prev_rec_offsets, n_uniq, &heap); cmp_rec_rec_with_match(rec, prev_rec, - offsets_rec, - offsets_prev_rec, + rec_offsets, + prev_rec_offsets, index, FALSE, &matched_fields, &matched_bytes); - for (i = matched_fields + 1; i <= n_uniq; i++) { + for (i = matched_fields; i < 
n_uniq; i++) { if (n_diff_boundaries != NULL) { /* push the index of the previous @@ -553,17 +1134,18 @@ dict_stats_analyze_index_level( } /* increment the number of different keys - for n_prefix=i */ + for n_prefix=i+1 (e.g. if i=0 then we increment + for n_prefix=1 which is stored in n_diff[0]) */ n_diff[i]++; } } else { /* this is the first non-delete marked record */ - for (i = 1; i <= n_uniq; i++) { + for (i = 0; i < n_uniq; i++) { n_diff[i] = 1; } } - if (page_rec_is_supremum(page_rec_get_next_const(rec))) { + if (rec_is_last_on_page) { /* end of a page has been reached */ /* we need to copy the record instead of assigning @@ -574,8 +1156,9 @@ dict_stats_analyze_index_level( btr_pcur_move_to_next_user_rec() will release the latch on the page that prev_rec is on */ prev_rec = rec_copy_prefix_to_buf( - rec, index, rec_offs_n_fields(offsets_rec), + rec, index, rec_offs_n_fields(rec_offsets), &prev_rec_buf, &prev_rec_buf_size); + prev_rec_is_copied = true; } else { /* still on the same page, the next call to @@ -584,12 +1167,14 @@ dict_stats_analyze_index_level( instead of copying the records like above */ prev_rec = rec; + prev_rec_is_copied = false; } } /* if *total_pages is left untouched then the above loop was not entered at all and there is one page in the whole tree which is - empty */ + empty or the loop was entered but this is level 0, contains one page + and all records are delete-marked */ if (*total_pages == 0) { ut_ad(level == 0); @@ -605,7 +1190,7 @@ dict_stats_analyze_index_level( /* remember the index of the last record on the level as the last one from the last group of equal keys; this holds for all possible prefixes */ - for (i = 1; i <= n_uniq; i++) { + for (i = 0; i < n_uniq; i++) { void* p; ib_uint64_t idx; @@ -619,10 +1204,10 @@ dict_stats_analyze_index_level( } /* now in n_diff_boundaries[i] there are exactly n_diff[i] integers, - for i=1..n_uniq */ + for i=0..n_uniq-1 */ #ifdef UNIV_STATS_DEBUG - for (i = 1; i <= n_uniq; i++) { + for (i = 
0; i < n_uniq; i++) { DEBUG_PRINTF(" %s(): total recs: " UINT64PF ", total pages: " UINT64PF @@ -654,9 +1239,11 @@ dict_stats_analyze_index_level( } #endif /* UNIV_STATS_DEBUG */ - btr_pcur_close(&pcur); + /* Release the latch on the last page, because that is not done by + btr_pcur_close(). This function works also for non-leaf pages. */ + btr_leaf_page_release(btr_pcur_get_block(&pcur), BTR_SEARCH_LEAF, mtr); - mtr_commit(&mtr); + btr_pcur_close(&pcur); if (prev_rec_buf != NULL) { @@ -665,15 +1252,16 @@ dict_stats_analyze_index_level( mem_heap_free(heap); } -/* @} */ /* aux enum for controlling the behavior of dict_stats_scan_page() @{ */ -typedef enum page_scan_method_enum { - COUNT_ALL_NON_BORING, /* scan all records on the given page - and count the number of distinct ones */ +enum page_scan_method_t { + COUNT_ALL_NON_BORING_AND_SKIP_DEL_MARKED,/* scan all records on + the given page and count the number of + distinct ones, also ignore delete marked + records */ QUIT_ON_FIRST_NON_BORING/* quit when the first record that differs from its right neighbor is found */ -} page_scan_method_t; +}; /* @} */ /*********************************************************************//** @@ -715,11 +1303,18 @@ dict_stats_scan_page( Because offsets1,offsets2 should be big enough, this memory heap should never be used. 
*/ mem_heap_t* heap = NULL; + const rec_t* (*get_next)(const rec_t*); - rec = page_rec_get_next_const(page_get_infimum_rec(page)); + if (scan_method == COUNT_ALL_NON_BORING_AND_SKIP_DEL_MARKED) { + get_next = page_rec_get_next_non_del_marked; + } else { + get_next = page_rec_get_next_const; + } + + rec = get_next(page_get_infimum_rec(page)); if (page_rec_is_supremum(rec)) { - /* the page is empty */ + /* the page is empty or contains only delete-marked records */ *n_diff = 0; *out_rec = NULL; return(NULL); @@ -728,7 +1323,7 @@ dict_stats_scan_page( offsets_rec = rec_get_offsets(rec, index, offsets_rec, ULINT_UNDEFINED, &heap); - next_rec = page_rec_get_next_const(rec); + next_rec = get_next(rec); *n_diff = 1; @@ -777,7 +1372,8 @@ dict_stats_scan_page( offsets_rec = offsets_next_rec; offsets_next_rec = offsets_tmp; } - next_rec = page_rec_get_next_const(next_rec); + + next_rec = get_next(next_rec); } func_exit: @@ -814,7 +1410,6 @@ dict_stats_analyze_index_below_cur( ulint* offsets1; ulint* offsets2; ulint* offsets_rec; - ulint root_height; ib_uint64_t n_diff; /* the result */ ulint size; @@ -841,8 +1436,6 @@ dict_stats_analyze_index_below_cur( rec_offs_set_n_alloc(offsets1, size); rec_offs_set_n_alloc(offsets2, size); - root_height = btr_page_get_level(btr_root_get(index, mtr), mtr); - space = dict_index_get_space(index); zip_size = dict_table_zip_size(index->table); @@ -907,14 +1500,7 @@ dict_stats_analyze_index_below_cur( offsets_rec = dict_stats_scan_page( &rec, offsets1, offsets2, index, page, n_prefix, - COUNT_ALL_NON_BORING, &n_diff); - - if (root_height > 0) { - - /* empty pages are allowed only if the whole B-tree is empty - and contains a single empty page */ - ut_a(offsets_rec != NULL); - } + COUNT_ALL_NON_BORING_AND_SKIP_DEL_MARKED, &n_diff); #if 0 DEBUG_PRINTF(" %s(): n_diff below page_no=%lu: " UINT64PF "\n", @@ -928,42 +1514,40 @@ dict_stats_analyze_index_below_cur( /* @} */ /*********************************************************************//** 
-For a given level in an index select srv_stats_persistent_sample_pages +For a given level in an index select N_SAMPLE_PAGES(index) (or less) records from that level and dive below them to the corresponding leaf pages, then scan those leaf pages and save the sampling results in -index->stat_n_diff_key_vals[n_prefix] and the number of pages scanned in -index->stat_n_sample_sizes[n_prefix]. -dict_stats_analyze_index_for_n_prefix() @{ */ +index->stat_n_diff_key_vals[n_prefix - 1] and the number of pages scanned in +index->stat_n_sample_sizes[n_prefix - 1]. */ static void dict_stats_analyze_index_for_n_prefix( /*==================================*/ - dict_index_t* index, /*!< in/out: index */ - ulint level, /*!< in: level, - must be >= 1 */ - ib_uint64_t total_recs_on_level, /*!< in: total number of - records on the given level */ - ulint n_prefix, /*!< in: look at first - n_prefix columns when - comparing records */ - ib_uint64_t n_diff_for_this_prefix, /*!< in: number of distinct - records on the given level, - when looking at the first - n_prefix columns */ - dyn_array_t* boundaries) /*!< in: array that contains - n_diff_for_this_prefix - integers each of which - represents the index (on the - level, counting from - left/smallest to right/biggest - from 0) of the last record - from each group of distinct - keys */ + dict_index_t* index, /*!< in/out: index */ + ulint level, /*!< in: level, must be >= 1 */ + ib_uint64_t total_recs_on_level, + /*!< in: total number of + records on the given level */ + ulint n_prefix, /*!< in: look at first + n_prefix columns when + comparing records */ + ib_uint64_t n_diff_for_this_prefix, + /*!< in: number of distinct + records on the given level, + when looking at the first + n_prefix columns */ + dyn_array_t* boundaries, /*!< in: array that contains + n_diff_for_this_prefix + integers each of which + represents the index (on the + level, counting from + left/smallest to right/biggest + from 0) of the last record + from each group of 
distinct + keys */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { - mem_heap_t* heap; - dtuple_t* dtuple; btr_pcur_t pcur; - mtr_t mtr; const page_t* page; ib_uint64_t rec_idx; ib_uint64_t last_idx_on_level; @@ -978,51 +1562,45 @@ dict_stats_analyze_index_for_n_prefix( n_prefix, n_diff_for_this_prefix); #endif + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_S_LOCK)); + /* if some of those is 0 then this means that there is exactly one page in the B-tree and it is empty and we should have done full scan and should not be here */ ut_ad(total_recs_on_level > 0); ut_ad(n_diff_for_this_prefix > 0); - /* this is configured to be min 1, someone has changed the code */ - ut_ad(srv_stats_persistent_sample_pages > 0); + /* this must be at least 1 */ + ut_ad(N_SAMPLE_PAGES(index) > 0); - heap = mem_heap_create(256); + /* Position pcur on the leftmost record on the leftmost page + on the desired level. */ - /* craft a record that is always smaller than the others, - this way we are sure that the cursor pcur will be positioned - on the leftmost record on the leftmost page on the desired level */ - dtuple = dtuple_create(heap, dict_index_get_n_unique(index)); - dict_table_copy_types(dtuple, index->table); - dtuple_set_info_bits(dtuple, REC_INFO_MIN_REC_FLAG); - - mtr_start(&mtr); - - btr_pcur_open_low(index, level, dtuple, PAGE_CUR_LE, BTR_SEARCH_LEAF, - &pcur, __FILE__, __LINE__, &mtr); + btr_pcur_open_at_index_side( + true, index, BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED, + &pcur, true, level, mtr); + btr_pcur_move_to_next_on_page(&pcur); page = btr_pcur_get_page(&pcur); + /* The page must not be empty, except when + it is the root page (and the whole index is empty). 
*/ + ut_ad(btr_pcur_is_on_user_rec(&pcur) || page_is_leaf(page)); + ut_ad(btr_pcur_get_rec(&pcur) + == page_rec_get_next_const(page_get_infimum_rec(page))); + /* check that we are indeed on the desired level */ - ut_a(btr_page_get_level(page, &mtr) == level); + ut_a(btr_page_get_level(page, mtr) == level); /* there should not be any pages on the left */ - ut_a(btr_page_get_prev(page, &mtr) == FIL_NULL); + ut_a(btr_page_get_prev(page, mtr) == FIL_NULL); /* check whether the first record on the leftmost page is marked as such, if we are on a non-leaf level */ - ut_a(level == 0 || REC_INFO_MIN_REC_FLAG - & rec_get_info_bits(page_rec_get_next_const( - page_get_infimum_rec(page)), - page_is_comp(page))); - - if (btr_pcur_is_before_first_on_page(&pcur)) { - btr_pcur_move_to_next_on_page(&pcur); - } - - if (btr_pcur_is_after_last_on_page(&pcur)) { - btr_pcur_move_to_prev_on_page(&pcur); - } + ut_a((level == 0) + == !(REC_INFO_MIN_REC_FLAG & rec_get_info_bits( + btr_pcur_get_rec(&pcur), page_is_comp(page)))); last_idx_on_level = *(ib_uint64_t*) dyn_array_get_element(boundaries, (ulint) ((n_diff_for_this_prefix - 1) * sizeof(ib_uint64_t))); @@ -1031,7 +1609,7 @@ dict_stats_analyze_index_for_n_prefix( n_diff_sum_of_all_analyzed_pages = 0; - n_recs_to_dive_below = ut_min(srv_stats_persistent_sample_pages, + n_recs_to_dive_below = ut_min(N_SAMPLE_PAGES(index), n_diff_for_this_prefix); for (i = 0; i < n_recs_to_dive_below; i++) { @@ -1093,7 +1671,7 @@ dict_stats_analyze_index_for_n_prefix( while (rec_idx < dive_below_idx && btr_pcur_is_on_user_rec(&pcur)) { - btr_pcur_move_to_next_user_rec(&pcur, &mtr); + btr_pcur_move_to_next_user_rec(&pcur, mtr); rec_idx++; } @@ -1107,12 +1685,20 @@ dict_stats_analyze_index_for_n_prefix( break; } + /* it could be that the tree has changed in such a way that + the record under dive_below_idx is the supremum record, in + this case rec_idx == dive_below_idx and pcur is positioned + on the supremum, we do not want to dive below it */ + if 
(!btr_pcur_is_on_user_rec(&pcur)) { + break; + } + ut_a(rec_idx == dive_below_idx); ib_uint64_t n_diff_on_leaf_page; n_diff_on_leaf_page = dict_stats_analyze_index_below_cur( - btr_pcur_get_btr_cur(&pcur), n_prefix, &mtr); + btr_pcur_get_btr_cur(&pcur), n_prefix, mtr); /* We adjust n_diff_on_leaf_page here to avoid counting one record twice - once as the last on some page and once @@ -1135,12 +1721,13 @@ dict_stats_analyze_index_for_n_prefix( n_diff_sum_of_all_analyzed_pages += n_diff_on_leaf_page; } - if (n_diff_sum_of_all_analyzed_pages == 0) { - n_diff_sum_of_all_analyzed_pages = 1; - } + /* n_diff_sum_of_all_analyzed_pages can be 0 here if all the leaf + pages sampled contained only delete-marked records. In this case + we should assign 0 to index->stat_n_diff_key_vals[n_prefix - 1], which + the formula below does. */ /* See REF01 for an explanation of the algorithm */ - index->stat_n_diff_key_vals[n_prefix] + index->stat_n_diff_key_vals[n_prefix - 1] = index->stat_n_leaf_pages * n_diff_for_this_prefix @@ -1149,31 +1736,25 @@ dict_stats_analyze_index_for_n_prefix( * n_diff_sum_of_all_analyzed_pages / n_recs_to_dive_below; - index->stat_n_sample_sizes[n_prefix] = n_recs_to_dive_below; + index->stat_n_sample_sizes[n_prefix - 1] = n_recs_to_dive_below; DEBUG_PRINTF(" %s(): n_diff=" UINT64PF " for n_prefix=%lu " "(%lu" " * " UINT64PF " / " UINT64PF " * " UINT64PF " / " UINT64PF ")\n", - __func__, index->stat_n_diff_key_vals[n_prefix], + __func__, index->stat_n_diff_key_vals[n_prefix - 1], n_prefix, index->stat_n_leaf_pages, n_diff_for_this_prefix, total_recs_on_level, n_diff_sum_of_all_analyzed_pages, n_recs_to_dive_below); btr_pcur_close(&pcur); - - mtr_commit(&mtr); - - mem_heap_free(heap); } -/* @} */ /*********************************************************************//** Calculates new statistics for a given index and saves them to the index members stat_n_diff_key_vals[], stat_n_sample_sizes[], stat_index_size and -stat_n_leaf_pages. 
This function could be slow. -dict_stats_analyze_index() @{ */ +stat_n_leaf_pages. This function could be slow. */ static void dict_stats_analyze_index( @@ -1182,7 +1763,7 @@ dict_stats_analyze_index( { ulint root_level; ulint level; - ibool level_is_analyzed; + bool level_is_analyzed; ulint n_uniq; ulint n_prefix; ib_uint64_t* n_diff_on_level; @@ -1191,10 +1772,11 @@ dict_stats_analyze_index( dyn_array_t* n_diff_boundaries; mtr_t mtr; ulint size; - ulint i; DEBUG_PRINTF(" %s(index=%s)\n", __func__, index->name); + dict_stats_empty_index(index); + mtr_start(&mtr); mtr_s_lock(dict_index_get_lock(index), &mtr); @@ -1206,19 +1788,12 @@ dict_stats_analyze_index( size = btr_get_size(index, BTR_N_LEAF_PAGES, &mtr); } + /* Release the X locks on the root page taken by btr_get_size() */ + mtr_commit(&mtr); + switch (size) { case ULINT_UNDEFINED: - mtr_commit(&mtr); - /* Fake some statistics. */ - index->stat_index_size = index->stat_n_leaf_pages = 1; - - for (i = dict_index_get_n_unique(index); i; ) { - index->stat_n_diff_key_vals[i--] = 1; - } - - memset(index->stat_n_non_null_key_vals, 0, - (1 + dict_index_get_n_unique(index)) - * sizeof(*index->stat_n_non_null_key_vals)); + dict_stats_assert_initialized_index(index); return; case 0: /* The root node of the tree is a leaf */ @@ -1227,23 +1802,25 @@ dict_stats_analyze_index( index->stat_n_leaf_pages = size; - root_level = btr_page_get_level(btr_root_get(index, &mtr), &mtr); + mtr_start(&mtr); + + mtr_s_lock(dict_index_get_lock(index), &mtr); - mtr_commit(&mtr); + root_level = btr_height_get(index, &mtr); n_uniq = dict_index_get_n_unique(index); - /* if the tree has just one level (and one page) or if the user - has requested to sample too many pages then do full scan */ + /* If the tree has just one level (and one page) or if the user + has requested to sample too many pages then do full scan. 
+ + For each n-column prefix (for n=1..n_uniq) N_SAMPLE_PAGES(index) + will be sampled, so in total N_SAMPLE_PAGES(index) * n_uniq leaf + pages will be sampled. If that number is bigger than the total + number of leaf pages then do full scan of the leaf level instead + since it will be faster and will give better results. */ + if (root_level == 0 - /* for each n-column prefix (for n=1..n_uniq) - srv_stats_persistent_sample_pages will be sampled, so in total - srv_stats_persistent_sample_pages * n_uniq leaf pages will be - sampled. If that number is bigger than the total number of leaf - pages then do full scan of the leaf level instead since it will - be faster and will give better results. */ - || srv_stats_persistent_sample_pages * n_uniq - > index->stat_n_leaf_pages) { + || N_SAMPLE_PAGES(index) * n_uniq > index->stat_n_leaf_pages) { if (root_level == 0) { DEBUG_PRINTF(" %s(): just one page, " @@ -1261,27 +1838,28 @@ dict_stats_analyze_index( index->stat_n_diff_key_vals, &total_recs, &total_pages, - NULL /*boundaries not needed*/); + NULL /* boundaries not needed */, + &mtr); - for (i = 1; i <= n_uniq; i++) { + for (ulint i = 0; i < n_uniq; i++) { index->stat_n_sample_sizes[i] = total_pages; } + mtr_commit(&mtr); + + dict_stats_assert_initialized_index(index); return; } - /* else */ /* set to zero */ - n_diff_on_level = (ib_uint64_t*) mem_zalloc((n_uniq + 1) - * sizeof(ib_uint64_t)); + n_diff_on_level = reinterpret_cast<ib_uint64_t*> + (mem_zalloc(n_uniq * sizeof(ib_uint64_t))); - n_diff_boundaries = (dyn_array_t*) mem_alloc((n_uniq + 1) - * sizeof(dyn_array_t)); + n_diff_boundaries = reinterpret_cast<dyn_array_t*> + (mem_alloc(n_uniq * sizeof(dyn_array_t))); - for (i = 1; i <= n_uniq; i++) { - /* initialize the dynamic arrays, the first one - (index=0) is ignored to follow the same indexing - scheme as n_diff_on_level[] */ + for (ulint i = 0; i < n_uniq; i++) { + /* initialize the dynamic arrays */ dyn_array_create(&n_diff_boundaries[i]); } @@ -1299,25 +1877,42 
@@ dict_stats_analyze_index( So if we find that the first level containing D distinct keys (on n_prefix columns) is L, we continue from L when searching for D distinct keys on n_prefix-1 columns. */ - level = (long) root_level; - level_is_analyzed = FALSE; + level = root_level; + level_is_analyzed = false; + for (n_prefix = n_uniq; n_prefix >= 1; n_prefix--) { DEBUG_PRINTF(" %s(): searching level with >=%llu " "distinct records, n_prefix=%lu\n", - __func__, N_DIFF_REQUIRED, n_prefix); + __func__, N_DIFF_REQUIRED(index), n_prefix); + + /* Commit the mtr to release the tree S lock to allow + other threads to do some work too. */ + mtr_commit(&mtr); + mtr_start(&mtr); + mtr_s_lock(dict_index_get_lock(index), &mtr); + if (root_level != btr_height_get(index, &mtr)) { + /* Just quit if the tree has changed beyond + recognition here. The old stats from previous + runs will remain in the values that we have + not calculated yet. Initially when the index + object is created the stats members are given + some sensible values so leaving them untouched + here even the first time will not cause us to + read uninitialized memory later. 
*/ + break; + } /* check whether we should pick the current level; we pick level 1 even if it does not have enough distinct records because we do not want to scan the leaf level because it may contain too many records */ if (level_is_analyzed - && (n_diff_on_level[n_prefix] >= N_DIFF_REQUIRED + && (n_diff_on_level[n_prefix - 1] >= N_DIFF_REQUIRED(index) || level == 1)) { goto found_level; } - /* else */ /* search for a level that contains enough distinct records */ @@ -1325,12 +1920,14 @@ dict_stats_analyze_index( /* if this does not hold we should be on "found_level" instead of here */ - ut_ad(n_diff_on_level[n_prefix] < N_DIFF_REQUIRED); + ut_ad(n_diff_on_level[n_prefix - 1] + < N_DIFF_REQUIRED(index)); level--; - level_is_analyzed = FALSE; + level_is_analyzed = false; } + /* descend into the tree, searching for "good enough" level */ for (;;) { /* make sure we do not scan the leaf level @@ -1349,18 +1946,19 @@ dict_stats_analyze_index( total_recs is left from the previous iteration when we scanned one level upper or we have not scanned any levels yet in which case total_recs is 1. 
*/ - if (total_recs > srv_stats_persistent_sample_pages) { + if (total_recs > N_SAMPLE_PAGES(index)) { - /* if the above cond is true then we are not - at the root level since on the root level - total_recs == 1 and cannot - be > srv_stats_persistent_sample_pages */ + /* if the above cond is true then we are + not at the root level since on the root + level total_recs == 1 (set before we + enter the n-prefix loop) and cannot + be > N_SAMPLE_PAGES(index) */ ut_a(level != root_level); /* step one level back and be satisfied with whatever it contains */ level++; - level_is_analyzed = TRUE; + level_is_analyzed = true; break; } @@ -1370,27 +1968,28 @@ dict_stats_analyze_index( n_diff_on_level, &total_recs, &total_pages, - n_diff_boundaries); + n_diff_boundaries, + &mtr); - level_is_analyzed = TRUE; + level_is_analyzed = true; - if (n_diff_on_level[n_prefix] >= N_DIFF_REQUIRED + if (n_diff_on_level[n_prefix - 1] + >= N_DIFF_REQUIRED(index) || level == 1) { /* we found a good level with many distinct records or we have reached the last level we could scan */ break; } - /* else */ level--; - level_is_analyzed = FALSE; + level_is_analyzed = false; } found_level: DEBUG_PRINTF(" %s(): found level %lu that has " UINT64PF " distinct records for n_prefix=%lu\n", - __func__, level, n_diff_on_level[n_prefix], + __func__, level, n_diff_on_level[n_prefix - 1], n_prefix); /* here we are either on level 1 or the level that we are on @@ -1406,28 +2005,30 @@ found_level: dict_stats_analyze_index_for_n_prefix( index, level, total_recs, n_prefix, - n_diff_on_level[n_prefix], - &n_diff_boundaries[n_prefix]); + n_diff_on_level[n_prefix - 1], + &n_diff_boundaries[n_prefix - 1], &mtr); } - for (i = 1; i <= n_uniq; i++) { + mtr_commit(&mtr); + + for (ulint i = 0; i < n_uniq; i++) { dyn_array_free(&n_diff_boundaries[i]); } mem_free(n_diff_boundaries); mem_free(n_diff_on_level); + + dict_stats_assert_initialized_index(index); } -/* @} */ 
/*********************************************************************//** Calculates new estimates for table and index statistics. This function is relatively slow and is used to calculate persistent statistics that will be saved on disk. -dict_stats_update_persistent() @{ @return DB_SUCCESS or error code */ static -enum db_err +dberr_t dict_stats_update_persistent( /*=========================*/ dict_table_t* table) /*!< in/out: table */ @@ -1436,21 +2037,30 @@ dict_stats_update_persistent( DEBUG_PRINTF("%s(table=%s)\n", __func__, table->name); - /* XXX quit if interrupted, e.g. SIGTERM */ + dict_table_stats_lock(table, RW_X_LATCH); /* analyze the clustered index first */ index = dict_table_get_first_index(table); - if (index == NULL) { + if (index == NULL + || dict_index_is_corrupted(index) + || (index->type | DICT_UNIQUE) != (DICT_CLUSTERED | DICT_UNIQUE)) { + /* Table definition is corrupt */ + dict_table_stats_unlock(table, RW_X_LATCH); + dict_stats_empty_table(table); + return(DB_CORRUPTION); } + ut_ad(!dict_index_is_univ(index)); + dict_stats_analyze_index(index); - table->stat_n_rows - = index->stat_n_diff_key_vals[dict_index_get_n_unique(index)]; + ulint n_unique = dict_index_get_n_unique(index); + + table->stat_n_rows = index->stat_n_diff_key_vals[n_unique - 1]; table->stat_clustered_index_size = index->stat_index_size; @@ -1462,31 +2072,47 @@ dict_stats_update_persistent( index != NULL; index = dict_table_get_next_index(index)) { + ut_ad(!dict_index_is_univ(index)); + if (index->type & DICT_FTS) { continue; } - dict_stats_analyze_index(index); + dict_stats_empty_index(index); + + if (dict_stats_should_ignore_index(index)) { + continue; + } + + if (!(table->stats_bg_flag & BG_STAT_SHOULD_QUIT)) { + dict_stats_analyze_index(index); + } table->stat_sum_of_other_index_sizes += index->stat_index_size; } + table->stats_last_recalc = ut_time(); + table->stat_modified_counter = 0; table->stat_initialized = TRUE; + dict_stats_assert_initialized(table); + + 
dict_table_stats_unlock(table, RW_X_LATCH); + return(DB_SUCCESS); } -/* @} */ +#include "mysql_com.h" /*********************************************************************//** Save an individual index's statistic into the persistent statistics storage. dict_stats_save_index_stat() @{ @return DB_SUCCESS or error code */ static -enum db_err +dberr_t dict_stats_save_index_stat( /*=======================*/ dict_index_t* index, /*!< in: index */ @@ -1494,95 +2120,114 @@ dict_stats_save_index_stat( const char* stat_name, /*!< in: name of the stat */ ib_uint64_t stat_value, /*!< in: value of the stat */ ib_uint64_t* sample_size, /*!< in: n pages sampled or NULL */ - const char* stat_description,/*!< in: description of the stat */ - trx_t* trx, /*!< in/out: transaction to use */ - ibool caller_has_dict_sys_mutex)/*!< in: TRUE if the caller - owns dict_sys->mutex */ + const char* stat_description)/*!< in: description of the stat */ { pars_info_t* pinfo; - enum db_err ret; + dberr_t ret; + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; - pinfo = pars_info_create(); - - pars_info_add_literal(pinfo, "database_name", index->table->name, - dict_get_db_name_len(index->table->name), - DATA_VARCHAR, 0); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mutex_own(&dict_sys->mutex)); - pars_info_add_str_literal(pinfo, "table_name", - dict_remove_db_name(index->table->name)); + dict_fs2utf8(index->table->name, db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + pinfo = pars_info_create(); + pars_info_add_str_literal(pinfo, "database_name", db_utf8); + pars_info_add_str_literal(pinfo, "table_name", table_utf8); + UNIV_MEM_ASSERT_RW_ABORT(index->name, strlen(index->name)); pars_info_add_str_literal(pinfo, "index_name", index->name); - + UNIV_MEM_ASSERT_RW_ABORT(&last_update, 4); pars_info_add_int4_literal(pinfo, "last_update", last_update); - + UNIV_MEM_ASSERT_RW_ABORT(stat_name, 
strlen(stat_name)); pars_info_add_str_literal(pinfo, "stat_name", stat_name); - + UNIV_MEM_ASSERT_RW_ABORT(&stat_value, 8); pars_info_add_ull_literal(pinfo, "stat_value", stat_value); - if (sample_size != NULL) { + UNIV_MEM_ASSERT_RW_ABORT(sample_size, 8); pars_info_add_ull_literal(pinfo, "sample_size", *sample_size); } else { pars_info_add_literal(pinfo, "sample_size", NULL, UNIV_SQL_NULL, DATA_FIXBINARY, 0); } - + UNIV_MEM_ASSERT_RW_ABORT(stat_description, strlen(stat_description)); pars_info_add_str_literal(pinfo, "stat_description", stat_description); - ret = que_eval_sql(pinfo, - "PROCEDURE INDEX_STATS_SAVE () IS\n" - "dummy CHAR;\n" - "BEGIN\n" - - "SELECT database_name INTO dummy\n" - "FROM \"" INDEX_STATS_NAME "\"\n" - "WHERE\n" - "database_name = :database_name AND\n" - "table_name = :table_name AND\n" - "index_name = :index_name AND\n" - "stat_name = :stat_name\n" - "FOR UPDATE;\n" - - "IF (SQL % NOTFOUND) THEN\n" - " INSERT INTO \"" INDEX_STATS_NAME "\"\n" - " VALUES\n" - " (\n" - " :database_name,\n" - " :table_name,\n" - " :index_name,\n" - " :last_update,\n" - " :stat_name,\n" - " :stat_value,\n" - " :sample_size,\n" - " :stat_description\n" - " );\n" - "ELSE\n" - " UPDATE \"" INDEX_STATS_NAME "\" SET\n" - " last_update = :last_update,\n" - " stat_value = :stat_value,\n" - " sample_size = :sample_size,\n" - " stat_description = :stat_description\n" - " WHERE\n" - " database_name = :database_name AND\n" - " table_name = :table_name AND\n" - " index_name = :index_name AND\n" - " stat_name = :stat_name;\n" - "END IF;\n" - "END;", - !caller_has_dict_sys_mutex, trx); - - /* pinfo is freed by que_eval_sql() */ + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE INDEX_STATS_SAVE_INSERT () IS\n" + "BEGIN\n" + "INSERT INTO \"" INDEX_STATS_NAME "\"\n" + "VALUES\n" + "(\n" + ":database_name,\n" + ":table_name,\n" + ":index_name,\n" + ":last_update,\n" + ":stat_name,\n" + ":stat_value,\n" + ":sample_size,\n" + ":stat_description\n" + ");\n" + "END;"); + + if (ret 
== DB_DUPLICATE_KEY) { + + pinfo = pars_info_create(); + pars_info_add_str_literal(pinfo, "database_name", db_utf8); + pars_info_add_str_literal(pinfo, "table_name", table_utf8); + UNIV_MEM_ASSERT_RW_ABORT(index->name, strlen(index->name)); + pars_info_add_str_literal(pinfo, "index_name", index->name); + UNIV_MEM_ASSERT_RW_ABORT(&last_update, 4); + pars_info_add_int4_literal(pinfo, "last_update", last_update); + UNIV_MEM_ASSERT_RW_ABORT(stat_name, strlen(stat_name)); + pars_info_add_str_literal(pinfo, "stat_name", stat_name); + UNIV_MEM_ASSERT_RW_ABORT(&stat_value, 8); + pars_info_add_ull_literal(pinfo, "stat_value", stat_value); + if (sample_size != NULL) { + UNIV_MEM_ASSERT_RW_ABORT(sample_size, 8); + pars_info_add_ull_literal(pinfo, "sample_size", *sample_size); + } else { + pars_info_add_literal(pinfo, "sample_size", NULL, + UNIV_SQL_NULL, DATA_FIXBINARY, 0); + } + UNIV_MEM_ASSERT_RW_ABORT(stat_description, strlen(stat_description)); + pars_info_add_str_literal(pinfo, "stat_description", + stat_description); + + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE INDEX_STATS_SAVE_UPDATE () IS\n" + "BEGIN\n" + "UPDATE \"" INDEX_STATS_NAME "\" SET\n" + "last_update = :last_update,\n" + "stat_value = :stat_value,\n" + "sample_size = :sample_size,\n" + "stat_description = :stat_description\n" + "WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name AND\n" + "index_name = :index_name AND\n" + "stat_name = :stat_name;\n" + "END;"); + } if (ret != DB_SUCCESS) { + char buf_table[MAX_FULL_NAME_LEN]; + char buf_index[MAX_FULL_NAME_LEN]; ut_print_timestamp(stderr); fprintf(stderr, - " InnoDB: Error while trying to save index " - "statistics for table %s, index %s, " - "stat name %s: %s\n", - index->table->name, index->name, + " InnoDB: Cannot save index statistics for table " + "%s, index %s, stat name \"%s\": %s\n", + ut_format_name(index->table->name, TRUE, + buf_table, sizeof(buf_table)), + ut_format_name(index->name, FALSE, + buf_index, 
sizeof(buf_index)), stat_name, ut_strerr(ret)); - - trx->error_state = DB_SUCCESS; } return(ret); @@ -1594,196 +2239,165 @@ Save the table's statistics into the persistent statistics storage. dict_stats_save() @{ @return DB_SUCCESS or error code */ static -enum db_err +dberr_t dict_stats_save( /*============*/ - dict_table_t* table, /*!< in: table */ - ibool caller_has_dict_sys_mutex)/*!< in: TRUE if the caller - owns dict_sys->mutex */ + dict_table_t* table_orig) /*!< in: table */ { - trx_t* trx; pars_info_t* pinfo; - dict_index_t* index; lint now; - enum db_err ret; + dberr_t ret; + dict_table_t* table; + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + + table = dict_stats_snapshot_create(table_orig); + + dict_fs2utf8(table->name, db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + + rw_lock_x_lock(&dict_operation_lock); + mutex_enter(&dict_sys->mutex); /* MySQL's timestamp is 4 byte, so we use pars_info_add_int4_literal() which takes a lint arg, so "now" is lint */ now = (lint) ut_time(); - trx = trx_allocate_for_background(); - - /* Use 'read-uncommitted' so that the SELECTs we execute - do not get blocked in case some user has locked the rows we - are SELECTing */ - - trx->isolation_level = TRX_ISO_READ_UNCOMMITTED; - - trx_start_if_not_started(trx); +#define PREPARE_PINFO_FOR_TABLE_SAVE(p, t, n) \ + do { \ + pars_info_add_str_literal((p), "database_name", db_utf8); \ + pars_info_add_str_literal((p), "table_name", table_utf8); \ + pars_info_add_int4_literal((p), "last_update", (n)); \ + pars_info_add_ull_literal((p), "n_rows", (t)->stat_n_rows); \ + pars_info_add_ull_literal((p), "clustered_index_size", \ + (t)->stat_clustered_index_size); \ + pars_info_add_ull_literal((p), "sum_of_other_index_sizes", \ + (t)->stat_sum_of_other_index_sizes); \ + } while(false); pinfo = pars_info_create(); - pars_info_add_literal(pinfo, "database_name", table->name, - dict_get_db_name_len(table->name), - DATA_VARCHAR, 0); - - 
pars_info_add_str_literal(pinfo, "table_name", - dict_remove_db_name(table->name)); - - pars_info_add_int4_literal(pinfo, "last_update", now); - - pars_info_add_ull_literal(pinfo, "n_rows", table->stat_n_rows); - - pars_info_add_ull_literal(pinfo, "clustered_index_size", - table->stat_clustered_index_size); - - pars_info_add_ull_literal(pinfo, "sum_of_other_index_sizes", - table->stat_sum_of_other_index_sizes); - - ret = que_eval_sql(pinfo, - "PROCEDURE TABLE_STATS_SAVE () IS\n" - "dummy CHAR;\n" - "BEGIN\n" - - "SELECT database_name INTO dummy\n" - "FROM \"" TABLE_STATS_NAME "\"\n" - "WHERE\n" - "database_name = :database_name AND\n" - "table_name = :table_name\n" - "FOR UPDATE;\n" - - "IF (SQL % NOTFOUND) THEN\n" - " INSERT INTO \"" TABLE_STATS_NAME "\"\n" - " VALUES\n" - " (\n" - " :database_name,\n" - " :table_name,\n" - " :last_update,\n" - " :n_rows,\n" - " :clustered_index_size,\n" - " :sum_of_other_index_sizes\n" - " );\n" - "ELSE\n" - " UPDATE \"" TABLE_STATS_NAME "\" SET\n" - " last_update = :last_update,\n" - " n_rows = :n_rows,\n" - " clustered_index_size = :clustered_index_size,\n" - " sum_of_other_index_sizes = " - " :sum_of_other_index_sizes\n" - " WHERE\n" - " database_name = :database_name AND\n" - " table_name = :table_name;\n" - "END IF;\n" - "END;", - !caller_has_dict_sys_mutex, trx); - - /* pinfo is freed by que_eval_sql() */ + PREPARE_PINFO_FOR_TABLE_SAVE(pinfo, table, now); + + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE TABLE_STATS_SAVE_INSERT () IS\n" + "BEGIN\n" + "INSERT INTO \"" TABLE_STATS_NAME "\"\n" + "VALUES\n" + "(\n" + ":database_name,\n" + ":table_name,\n" + ":last_update,\n" + ":n_rows,\n" + ":clustered_index_size,\n" + ":sum_of_other_index_sizes\n" + ");\n" + "END;"); + + if (ret == DB_DUPLICATE_KEY) { + pinfo = pars_info_create(); + + PREPARE_PINFO_FOR_TABLE_SAVE(pinfo, table, now); + + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE TABLE_STATS_SAVE_UPDATE () IS\n" + "BEGIN\n" + "UPDATE \"" TABLE_STATS_NAME "\" SET\n" + 
"last_update = :last_update,\n" + "n_rows = :n_rows,\n" + "clustered_index_size = :clustered_index_size,\n" + "sum_of_other_index_sizes = " + " :sum_of_other_index_sizes\n" + "WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name;\n" + "END;"); + } if (ret != DB_SUCCESS) { - + char buf[MAX_FULL_NAME_LEN]; ut_print_timestamp(stderr); fprintf(stderr, - " InnoDB: Error while trying to save table " - "statistics for table %s: %s\n", - table->name, ut_strerr(ret)); - - goto end_rollback; + " InnoDB: Cannot save table statistics for table " + "%s: %s\n", + ut_format_name(table->name, TRUE, buf, sizeof(buf)), + ut_strerr(ret)); + goto end; } + dict_index_t* index; + for (index = dict_table_get_first_index(table); index != NULL; index = dict_table_get_next_index(index)) { - ib_uint64_t stat_n_diff_key_vals[REC_MAX_N_FIELDS]; - ib_uint64_t stat_n_sample_sizes[REC_MAX_N_FIELDS]; - ulint n_uniq; - ulint i; + if (dict_stats_should_ignore_index(index)) { + continue; + } + + ut_ad(!dict_index_is_univ(index)); ret = dict_stats_save_index_stat(index, now, "size", index->stat_index_size, NULL, "Number of pages " - "in the index", - trx, - caller_has_dict_sys_mutex); + "in the index"); if (ret != DB_SUCCESS) { - goto end_rollback; + goto end; } ret = dict_stats_save_index_stat(index, now, "n_leaf_pages", index->stat_n_leaf_pages, NULL, "Number of leaf pages " - "in the index", - trx, - caller_has_dict_sys_mutex); + "in the index"); if (ret != DB_SUCCESS) { - goto end_rollback; + goto end; } - n_uniq = dict_index_get_n_unique(index); - - ut_ad(n_uniq + 1 <= UT_ARR_SIZE(stat_n_diff_key_vals)); - - memcpy(stat_n_diff_key_vals, index->stat_n_diff_key_vals, - (n_uniq + 1) * sizeof(index->stat_n_diff_key_vals[0])); - - ut_ad(n_uniq + 1 <= UT_ARR_SIZE(stat_n_sample_sizes)); - - memcpy(stat_n_sample_sizes, index->stat_n_sample_sizes, - (n_uniq + 1) * sizeof(index->stat_n_sample_sizes[0])); - - for (i = 1; i <= n_uniq; i++) { + for (ulint i = 0; i < index->n_uniq; 
i++) { char stat_name[16]; char stat_description[1024]; ulint j; ut_snprintf(stat_name, sizeof(stat_name), - "n_diff_pfx%02lu", i); + "n_diff_pfx%02lu", i + 1); /* craft a string that contains the columns names */ ut_snprintf(stat_description, sizeof(stat_description), "%s", index->fields[0].name); - for (j = 2; j <= i; j++) { + for (j = 1; j <= i; j++) { size_t len; len = strlen(stat_description); ut_snprintf(stat_description + len, sizeof(stat_description) - len, - ",%s", index->fields[j - 1].name); + ",%s", index->fields[j].name); } ret = dict_stats_save_index_stat( index, now, stat_name, - stat_n_diff_key_vals[i], - &stat_n_sample_sizes[i], - stat_description, trx, - caller_has_dict_sys_mutex); + index->stat_n_diff_key_vals[i], + &index->stat_n_sample_sizes[i], + stat_description); if (ret != DB_SUCCESS) { - goto end_rollback; + goto end; } } } - trx_commit_for_mysql(trx); - ret = DB_SUCCESS; - goto end_free; - -end_rollback: - - trx->op_info = "rollback of internal transaction on stats tables"; - trx_rollback_to_savepoint(trx, NULL); - trx->op_info = ""; - ut_a(trx->error_state == DB_SUCCESS); +end: + mutex_exit(&dict_sys->mutex); + rw_lock_x_unlock(&dict_operation_lock); -end_free: - - trx_free_for_background(trx); + dict_stats_snapshot_free(table); return(ret); } @@ -1875,11 +2489,11 @@ dict_stats_fetch_table_stats_step( /** Aux struct used to pass a table and a boolean to dict_stats_fetch_index_stats_step(). 
*/ -typedef struct index_fetch_struct { +struct index_fetch_t { dict_table_t* table; /*!< table whose indexes are to be modified */ - ibool stats_were_modified; /*!< will be set to TRUE if at + bool stats_were_modified; /*!< will be set to true if at least one index stats were modified */ -} index_fetch_t; +}; /*********************************************************************//** Called for the rows that are selected by @@ -2036,12 +2650,12 @@ dict_stats_fetch_index_stats_step( if (stat_name_len == 4 /* strlen("size") */ && strncasecmp("size", stat_name, stat_name_len) == 0) { index->stat_index_size = (ulint) stat_value; - arg->stats_were_modified = TRUE; + arg->stats_were_modified = true; } else if (stat_name_len == 12 /* strlen("n_leaf_pages") */ && strncasecmp("n_leaf_pages", stat_name, stat_name_len) == 0) { index->stat_n_leaf_pages = (ulint) stat_value; - arg->stats_were_modified = TRUE; + arg->stats_were_modified = true; } else if (stat_name_len > PFX_LEN /* e.g. stat_name=="n_diff_pfx01" */ && strncasecmp(PFX, stat_name, PFX_LEN) == 0) { @@ -2057,19 +2671,24 @@ dict_stats_fetch_index_stats_step( || num_ptr[0] < '0' || num_ptr[0] > '9' || num_ptr[1] < '0' || num_ptr[1] > '9') { + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + + dict_fs2utf8(table->name, db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Ignoring strange row from " "%s WHERE " - "database_name = '%.*s' AND " + "database_name = '%s' AND " "table_name = '%s' AND " "index_name = '%s' AND " "stat_name = '%.*s'; because stat_name " "is malformed\n", INDEX_STATS_NAME_PRINT, - (int) dict_get_db_name_len(table->name), - table->name, - dict_remove_db_name(table->name), + db_utf8, + table_utf8, index->name, (int) stat_name_len, stat_name); @@ -2081,41 +2700,50 @@ dict_stats_fetch_index_stats_step( note that stat_name does not have a terminating '\0' */ n_pfx = (num_ptr[0] - '0') * 10 + (num_ptr[1] - '0'); - if 
(n_pfx == 0 || n_pfx > dict_index_get_n_unique(index)) { + ulint n_uniq = index->n_uniq; + + if (n_pfx == 0 || n_pfx > n_uniq) { + + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + + dict_fs2utf8(table->name, db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Ignoring strange row from " "%s WHERE " - "database_name = '%.*s' AND " + "database_name = '%s' AND " "table_name = '%s' AND " "index_name = '%s' AND " "stat_name = '%.*s'; because stat_name is " "out of range, the index has %lu unique " "columns\n", INDEX_STATS_NAME_PRINT, - (int) dict_get_db_name_len(table->name), - table->name, - dict_remove_db_name(table->name), + db_utf8, + table_utf8, index->name, (int) stat_name_len, stat_name, - dict_index_get_n_unique(index)); + n_uniq); return(TRUE); } /* else */ - index->stat_n_diff_key_vals[n_pfx] = stat_value; + index->stat_n_diff_key_vals[n_pfx - 1] = stat_value; if (sample_size != UINT64_UNDEFINED) { - index->stat_n_sample_sizes[n_pfx] = sample_size; + index->stat_n_sample_sizes[n_pfx - 1] = sample_size; } else { /* hmm, strange... the user must have UPDATEd the table manually and SET sample_size = NULL */ - index->stat_n_sample_sizes[n_pfx] = 0; + index->stat_n_sample_sizes[n_pfx - 1] = 0; } - arg->stats_were_modified = TRUE; + index->stat_n_non_null_key_vals[n_pfx - 1] = 0; + + arg->stats_were_modified = true; } else { /* silently ignore rows with unknown stat_name, the user may have developed her own stats */ @@ -2131,19 +2759,25 @@ Read table's statistics from the persistent statistics storage. 
dict_stats_fetch_from_ps() @{ @return DB_SUCCESS or error code */ static -enum db_err +dberr_t dict_stats_fetch_from_ps( /*=====================*/ - dict_table_t* table, /*!< in/out: table */ - ibool caller_has_dict_sys_mutex)/*!< in: TRUE if the caller - owns dict_sys->mutex */ + dict_table_t* table) /*!< in/out: table */ { index_fetch_t index_fetch_arg; trx_t* trx; pars_info_t* pinfo; - enum db_err ret; + dberr_t ret; + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + + ut_ad(!mutex_own(&dict_sys->mutex)); - ut_ad(mutex_own(&dict_sys->mutex) == caller_has_dict_sys_mutex); + /* Initialize all stats to dummy values before fetching because if + the persistent storage contains incomplete stats (e.g. missing stats + for some index) then we would end up with (partially) uninitialized + stats. */ + dict_stats_empty_table(table); trx = trx_allocate_for_background(); @@ -2155,14 +2789,14 @@ dict_stats_fetch_from_ps( trx_start_if_not_started(trx); + dict_fs2utf8(table->name, db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + pinfo = pars_info_create(); - pars_info_add_literal(pinfo, "database_name", table->name, - dict_get_db_name_len(table->name), - DATA_VARCHAR, 0); + pars_info_add_str_literal(pinfo, "database_name", db_utf8); - pars_info_add_str_literal(pinfo, "table_name", - dict_remove_db_name(table->name)); + pars_info_add_str_literal(pinfo, "table_name", table_utf8); pars_info_bind_function(pinfo, "fetch_table_stats_step", @@ -2170,7 +2804,7 @@ dict_stats_fetch_from_ps( table); index_fetch_arg.table = table; - index_fetch_arg.stats_were_modified = FALSE; + index_fetch_arg.stats_were_modified = false; pars_info_bind_function(pinfo, "fetch_index_stats_step", dict_stats_fetch_index_stats_step, @@ -2230,19 +2864,9 @@ dict_stats_fetch_from_ps( "CLOSE index_stats_cur;\n" "END;", - !caller_has_dict_sys_mutex, trx); - + TRUE, trx); /* pinfo is freed by que_eval_sql() */ - /* XXX If mysql.innodb_index_stats contained less rows than the 
number - of indexes in the table, then some of the indexes of the table - were left uninitialized. Currently this is ignored and those - indexes are left with uninitialized stats until ANALYZE TABLE is - run. This condition happens when the user creates a new index - on a table. We could return DB_STATS_DO_NOT_EXIST from here, - forcing the usage of transient stats until mysql.innodb_index_stats - is complete. */ - trx_commit_for_mysql(trx); trx_free_for_background(trx); @@ -2256,32 +2880,67 @@ dict_stats_fetch_from_ps( /* @} */ /*********************************************************************//** +Fetches or calculates new estimates for index statistics. +dict_stats_update_for_index() @{ */ +UNIV_INTERN +void +dict_stats_update_for_index( +/*========================*/ + dict_index_t* index) /*!< in/out: index */ +{ + ut_ad(!mutex_own(&dict_sys->mutex)); + + if (dict_stats_is_persistent_enabled(index->table)) { + + if (dict_stats_persistent_storage_check(false)) { + dict_table_stats_lock(index->table, RW_X_LATCH); + dict_stats_analyze_index(index); + dict_table_stats_unlock(index->table, RW_X_LATCH); + dict_stats_save(index->table); + return; + } + /* else */ + + /* Fall back to transient stats since the persistent + storage is not present or is corrupted */ + char buf_table[MAX_FULL_NAME_LEN]; + char buf_index[MAX_FULL_NAME_LEN]; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Recalculation of persistent statistics " + "requested for table %s index %s but the required " + "persistent statistics storage is not present or is " + "corrupted. 
Using transient stats instead.\n", + ut_format_name(index->table->name, TRUE, + buf_table, sizeof(buf_table)), + ut_format_name(index->name, FALSE, + buf_index, sizeof(buf_index))); + } + + dict_table_stats_lock(index->table, RW_X_LATCH); + dict_stats_update_transient_for_index(index); + dict_table_stats_unlock(index->table, RW_X_LATCH); +} +/* @} */ + +/*********************************************************************//** Calculates new estimates for table and index statistics. The statistics are used in query optimization. -dict_stats_update() @{ -@return DB_* error code or DB_SUCCESS */ +@return DB_SUCCESS or error code */ UNIV_INTERN -enum db_err +dberr_t dict_stats_update( /*==============*/ dict_table_t* table, /*!< in/out: table */ - dict_stats_upd_option_t stats_upd_option, + dict_stats_upd_option_t stats_upd_option) /*!< in: whether to (re) calc the stats or to fetch them from the persistent statistics storage */ - ibool caller_has_dict_sys_mutex) - /*!< in: TRUE if the caller - owns dict_sys->mutex */ { - enum db_err ret = DB_ERROR; + char buf[MAX_FULL_NAME_LEN]; - /* check whether caller_has_dict_sys_mutex is set correctly; - note that mutex_own() is not implemented in non-debug code so - we cannot avoid having this extra param to the current function */ - ut_ad(caller_has_dict_sys_mutex - ? mutex_own(&dict_sys->mutex) - : !mutex_own(&dict_sys->mutex)); + ut_ad(!mutex_own(&dict_sys->mutex)); if (table->ibd_file_missing) { ut_print_timestamp(stderr); @@ -2289,83 +2948,61 @@ dict_stats_update( " InnoDB: cannot calculate statistics for table %s " "because the .ibd file is missing. For help, please " "refer to " REFMAN "innodb-troubleshooting.html\n", - table->name); - + ut_format_name(table->name, TRUE, buf, sizeof(buf))); + dict_stats_empty_table(table); return(DB_TABLESPACE_DELETED); - } - - /* If we have set a high innodb_force_recovery level, do not calculate - statistics, as a badly corrupted index can cause a crash in it. 
*/ - - if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) { - + } else if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) { + /* If we have set a high innodb_force_recovery level, do + not calculate statistics, as a badly corrupted index can + cause a crash in it. */ + dict_stats_empty_table(table); return(DB_SUCCESS); } switch (stats_upd_option) { case DICT_STATS_RECALC_PERSISTENT: - case DICT_STATS_RECALC_PERSISTENT_SILENT: + + ut_ad(!srv_read_only_mode); + /* Persistent recalculation requested, called from - ANALYZE TABLE or from TRUNCATE TABLE */ - - /* FTS auxiliary tables do not need persistent stats */ - if ((ut_strcount(table->name, "FTS") > 0 - && (ut_strcount(table->name, "CONFIG") > 0 - || ut_strcount(table->name, "INDEX") > 0 - || ut_strcount(table->name, "DELETED") > 0 - || ut_strcount(table->name, "DOC_ID") > 0 - || ut_strcount(table->name, "ADDED") > 0))) { - goto transient; - } + 1) ANALYZE TABLE, or + 2) the auto recalculation background thread, or + 3) open table if stats do not exist on disk and auto recalc + is enabled */ + + /* InnoDB internal tables (e.g. SYS_TABLES) cannot have + persistent stats enabled */ + ut_a(strchr(table->name, '/') != NULL); /* check if the persistent statistics storage exists before calling the potentially slow function dict_stats_update_persistent(); that is a prerequisite for dict_stats_save() succeeding */ - if (dict_stats_persistent_storage_check( - caller_has_dict_sys_mutex)) { - - dict_table_stats_lock(table, RW_X_LATCH); + if (dict_stats_persistent_storage_check(false)) { - ret = dict_stats_update_persistent(table); + dberr_t err; - /* XXX Currently dict_stats_save() would read the - stats from the table without dict_table_stats_lock() - which means it could save inconsistent data on the - disk. This is because we must call - dict_table_stats_lock() after locking dict_sys->mutex. 
- A solution is to copy here the stats to a temporary - buffer while holding the _stats_lock(), release it, - and pass that buffer to dict_stats_save(). */ + err = dict_stats_update_persistent(table); - dict_table_stats_unlock(table, RW_X_LATCH); - - if (ret == DB_SUCCESS) { - ret = dict_stats_save( - table, - caller_has_dict_sys_mutex); + if (err != DB_SUCCESS) { + return(err); } - return(ret); + err = dict_stats_save(table); + + return(err); } - /* else */ /* Fall back to transient stats since the persistent storage is not present or is corrupted */ - if (stats_upd_option == DICT_STATS_RECALC_PERSISTENT) { - - ut_print_timestamp(stderr); - /* XXX add link to the doc about storage - creation */ - fprintf(stderr, - " InnoDB: Recalculation of persistent " - "statistics requested but the required " - "persistent statistics storage is not " - "present or is corrupted. " - "Using quick transient stats " - "instead.\n"); - } + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Recalculation of persistent statistics " + "requested for table %s but the required persistent " + "statistics storage is not present or is corrupted. 
" + "Using transient stats instead.\n", + ut_format_name(table->name, TRUE, buf, sizeof(buf))); goto transient; @@ -2373,265 +3010,317 @@ dict_stats_update( goto transient; - case DICT_STATS_FETCH: - case DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY: - /* fetch requested, either fetch from persistent statistics - storage or use the old method */ + case DICT_STATS_EMPTY_TABLE: - dict_table_stats_lock(table, RW_X_LATCH); + dict_stats_empty_table(table); - if (stats_upd_option == DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY - && table->stat_initialized) { + /* If table is using persistent stats, + then save the stats on disk */ - dict_table_stats_unlock(table, RW_X_LATCH); - return(DB_SUCCESS); + if (dict_stats_is_persistent_enabled(table)) { + + if (dict_stats_persistent_storage_check(false)) { + + return(dict_stats_save(table)); + } + + return(DB_STATS_DO_NOT_EXIST); } - /* else */ - /* Must unlock because otherwise there is a lock order - violation with dict_sys->mutex below. Declare stats to be - initialized before unlocking. 
*/ - table->stat_initialized = TRUE; - dict_table_stats_unlock(table, RW_X_LATCH); + return(DB_SUCCESS); + + case DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY: - if (strchr(table->name, '/') == NULL - || strcmp(table->name, INDEX_STATS_NAME) == 0 - || strcmp(table->name, TABLE_STATS_NAME) == 0 - || (ut_strcount(table->name, "FTS") > 0 - && (ut_strcount(table->name, "CONFIG") > 0 - || ut_strcount(table->name, "INDEX") > 0 - || ut_strcount(table->name, "DELETED") > 0 - || ut_strcount(table->name, "DOC_ID") > 0 - || ut_strcount(table->name, "ADDED") > 0))) { - /* Use the quick transient stats method for - InnoDB internal tables, because we know the - persistent stats storage does not contain data - for them */ + /* fetch requested, either fetch from persistent statistics + storage or use the old method */ - goto transient; + if (table->stat_initialized) { + return(DB_SUCCESS); } - /* else */ - if (dict_stats_persistent_storage_check( - caller_has_dict_sys_mutex)) { + /* InnoDB internal tables (e.g. SYS_TABLES) cannot have + persistent stats enabled */ + ut_a(strchr(table->name, '/') != NULL); - ret = dict_stats_fetch_from_ps(table, - caller_has_dict_sys_mutex); + if (!dict_stats_persistent_storage_check(false)) { + /* persistent statistics storage does not exist + or is corrupted, calculate the transient stats */ - if (ret == DB_STATS_DO_NOT_EXIST - || (ret != DB_SUCCESS && stats_upd_option - == DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY)) { - /* Stats for this particular table do not - exist or we have been called from open table - which needs to initialize the stats, - calculate the quick transient statistics */ - goto transient; - } - /* else */ + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: Fetch of persistent " + "statistics requested for table %s but the " + "required system tables %s and %s are not " + "present or have unexpected structure. 
" + "Using transient stats instead.\n", + ut_format_name(table->name, TRUE, + buf, sizeof(buf)), + TABLE_STATS_NAME_PRINT, + INDEX_STATS_NAME_PRINT); - return(ret); - } else { - /* persistent statistics storage does not exist, - calculate the transient stats */ goto transient; } - break; + dict_table_t* t; - /* no "default:" in order to produce a compilation warning - about unhandled enumeration value */ - } + ut_ad(!srv_read_only_mode); -transient: + /* Create a dummy table object with the same name and + indexes, suitable for fetching the stats into it. */ + t = dict_stats_table_clone_create(table); - dict_table_stats_lock(table, RW_X_LATCH); + dberr_t err = dict_stats_fetch_from_ps(t); - dict_stats_update_transient(table); + t->stats_last_recalc = table->stats_last_recalc; + t->stat_modified_counter = 0; - dict_table_stats_unlock(table, RW_X_LATCH); + switch (err) { + case DB_SUCCESS: - return(DB_SUCCESS); -} -/* @} */ + dict_table_stats_lock(table, RW_X_LATCH); -/*********************************************************************//** -Close the stats tables. Should always be called after successful -dict_stats_open(). It will free the dict_stats handle. -dict_stats_close() @{ */ -UNIV_INLINE -void -dict_stats_close( -/*=============*/ - dict_stats_t* dict_stats) /*!< in/own: Handle to open - statistics tables */ -{ - if (dict_stats->table_stats != NULL) { - dict_table_close(dict_stats->table_stats, FALSE); - dict_stats->table_stats = NULL; - } + /* Initialize all stats to dummy values before + copying because dict_stats_table_clone_create() does + skip corrupted indexes so our dummy object 't' may + have less indexes than the real object 'table'. 
*/ + dict_stats_empty_table(table); - if (dict_stats->index_stats != NULL) { - dict_table_close(dict_stats->index_stats, FALSE); - dict_stats->index_stats = NULL; - } + dict_stats_copy(table, t); - mem_free(dict_stats); -} -/* @} */ + dict_stats_assert_initialized(table); -/*********************************************************************//** -Open stats tables to prevent these tables from being DROPped. -Also check whether they have the correct structure. The caller -must call dict_stats_close() when he has finished DMLing the tables. -dict_stats_open() @{ -@return pointer to open tables or NULL on failure */ -UNIV_INLINE -dict_stats_t* -dict_stats_open(void) -/*=================*/ -{ - dict_stats_t* dict_stats; + dict_table_stats_unlock(table, RW_X_LATCH); + + dict_stats_table_clone_free(t); + + return(DB_SUCCESS); + case DB_STATS_DO_NOT_EXIST: + + dict_stats_table_clone_free(t); - dict_stats = static_cast<dict_stats_t*>( - mem_zalloc(sizeof(*dict_stats))); + if (dict_stats_auto_recalc_is_enabled(table)) { + return(dict_stats_update( + table, + DICT_STATS_RECALC_PERSISTENT)); + } - dict_stats->table_stats = dict_table_open_on_name_no_stats( - TABLE_STATS_NAME, FALSE, DICT_ERR_IGNORE_NONE); + ut_format_name(table->name, TRUE, buf, sizeof(buf)); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Trying to use table %s which has " + "persistent statistics enabled, but auto " + "recalculation turned off and the statistics " + "do not exist in %s and %s. Please either run " + "\"ANALYZE TABLE %s;\" manually or enable the " + "auto recalculation with " + "\"ALTER TABLE %s STATS_AUTO_RECALC=1;\". 
" + "InnoDB will now use transient statistics for " + "%s.\n", + buf, TABLE_STATS_NAME, INDEX_STATS_NAME, buf, + buf, buf); - dict_stats->index_stats = dict_table_open_on_name_no_stats( - INDEX_STATS_NAME, FALSE, DICT_ERR_IGNORE_NONE); + goto transient; + default: - /* Check if the tables have the correct structure, if yes then - after this function we can safely DELETE from them without worrying - that they may get DROPped or DDLed because the open will have - increased the reference count. */ + dict_stats_table_clone_free(t); - if (dict_stats->table_stats == NULL - || dict_stats->index_stats == NULL - || !dict_stats_persistent_storage_check(FALSE)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error fetching persistent statistics " + "for table %s from %s and %s: %s. " + "Using transient stats method instead.\n", + ut_format_name(table->name, TRUE, buf, + sizeof(buf)), + TABLE_STATS_NAME, + INDEX_STATS_NAME, + ut_strerr(err)); - /* There was an error, close the tables and free the handle. */ - dict_stats_close(dict_stats); - dict_stats = NULL; + goto transient; + } + /* no "default:" in order to produce a compilation warning + about unhandled enumeration value */ } - return(dict_stats); +transient: + + dict_table_stats_lock(table, RW_X_LATCH); + + dict_stats_update_transient(table); + + dict_table_stats_unlock(table, RW_X_LATCH); + + return(DB_SUCCESS); } -/* @} */ /*********************************************************************//** Removes the information for a particular index's stats from the persistent storage if it exists and if there is data stored for this index. -The transaction is not committed, it must not be committed in this -function because this is the user trx that is running DROP INDEX. -The transaction will be committed at the very end when dropping an -index. +This function creates its own trx and commits it. 
A note from Marko why we cannot edit user and sys_* tables in one trx: marko: The problem is that ibuf merges should be disabled while we are rolling back dict transactions. marko: If ibuf merges are not disabled, we need to scan the *.ibd files. But we shouldn't open *.ibd files before we have rolled back dict transactions and opened the SYS_* records for the *.ibd files. -dict_stats_delete_index_stats() @{ +dict_stats_drop_index() @{ @return DB_SUCCESS or error code */ UNIV_INTERN -enum db_err -dict_stats_delete_index_stats( -/*==========================*/ - dict_index_t* index, /*!< in: index */ - trx_t* trx, /*!< in: transaction to use */ +dberr_t +dict_stats_drop_index( +/*==================*/ + const char* db_and_table,/*!< in: db and table, e.g. 'db/table' */ + const char* iname, /*!< in: index name */ char* errstr, /*!< out: error message if != DB_SUCCESS is returned */ ulint errstr_sz)/*!< in: size of the errstr buffer */ { - char database_name[MAX_DATABASE_NAME_LEN + 1]; - const char* table_name; + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; pars_info_t* pinfo; - enum db_err ret; - dict_stats_t* dict_stats; - void* mysql_thd = trx->mysql_thd; + dberr_t ret; + + ut_ad(!mutex_own(&dict_sys->mutex)); /* skip indexes whose table names do not contain a database name e.g. if we are dropping an index from SYS_TABLES */ - if (strchr(index->table_name, '/') == NULL) { - - return(DB_SUCCESS); - } + if (strchr(db_and_table, '/') == NULL) { - /* Increment table reference count to prevent the tables from - being DROPped just before que_eval_sql(). 
*/ - dict_stats = dict_stats_open(); - - if (dict_stats == NULL) { - /* stats tables do not exist or have unexpected structure */ return(DB_SUCCESS); } - /* the stats tables cannot be DROPped now */ - - ut_snprintf(database_name, sizeof(database_name), "%.*s", - (int) dict_get_db_name_len(index->table_name), - index->table_name); - - table_name = dict_remove_db_name(index->table_name); + dict_fs2utf8(db_and_table, db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); pinfo = pars_info_create(); - pars_info_add_str_literal(pinfo, "database_name", database_name); + pars_info_add_str_literal(pinfo, "database_name", db_utf8); - pars_info_add_str_literal(pinfo, "table_name", table_name); + pars_info_add_str_literal(pinfo, "table_name", table_utf8); - pars_info_add_str_literal(pinfo, "index_name", index->name); + pars_info_add_str_literal(pinfo, "index_name", iname); - /* Force lock wait timeout to be instantaneous because the incoming - transaction was created via MySQL. */ + rw_lock_x_lock(&dict_operation_lock); + mutex_enter(&dict_sys->mutex); - mysql_thd = trx->mysql_thd; - trx->mysql_thd = NULL; + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE DROP_INDEX_STATS () IS\n" + "BEGIN\n" + "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name AND\n" + "index_name = :index_name;\n" + "END;\n"); - ret = que_eval_sql(pinfo, - "PROCEDURE DROP_INDEX_STATS () IS\n" - "BEGIN\n" - "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n" - "database_name = :database_name AND\n" - "table_name = :table_name AND\n" - "index_name = :index_name;\n" - "END;\n", - TRUE, - trx); - - trx->mysql_thd = mysql_thd; - - /* pinfo is freed by que_eval_sql() */ + mutex_exit(&dict_sys->mutex); + rw_lock_x_unlock(&dict_operation_lock); - /* do not to commit here, see the function's comment */ + if (ret == DB_STATS_DO_NOT_EXIST) { + ret = DB_SUCCESS; + } if (ret != DB_SUCCESS) { - ut_snprintf(errstr, errstr_sz, "Unable to delete statistics 
for index %s " - "from %s%s. They can be deleted later using " + "from %s%s: %s. They can be deleted later using " "DELETE FROM %s WHERE " "database_name = '%s' AND " "table_name = '%s' AND " "index_name = '%s';", - index->name, + iname, INDEX_STATS_NAME_PRINT, (ret == DB_LOCK_WAIT_TIMEOUT ? " because the rows are locked" : ""), + ut_strerr(ret), INDEX_STATS_NAME_PRINT, - database_name, - table_name, - index->name); + db_utf8, + table_utf8, + iname); ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: %s\n", errstr); - - trx->error_state = DB_SUCCESS; } - dict_stats_close(dict_stats); + return(ret); +} +/* @} */ + +/*********************************************************************//** +Executes +DELETE FROM mysql.innodb_table_stats +WHERE database_name = '...' AND table_name = '...'; +Creates its own transaction and commits it. +dict_stats_delete_from_table_stats() @{ +@return DB_SUCCESS or error code */ +UNIV_INLINE +dberr_t +dict_stats_delete_from_table_stats( +/*===============================*/ + const char* database_name, /*!< in: database name, e.g. 'db' */ + const char* table_name) /*!< in: table name, e.g. 'table' */ +{ + pars_info_t* pinfo; + dberr_t ret; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_STAT */ + ut_ad(mutex_own(&dict_sys->mutex)); + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "database_name", database_name); + pars_info_add_str_literal(pinfo, "table_name", table_name); + + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE DELETE_FROM_TABLE_STATS () IS\n" + "BEGIN\n" + "DELETE FROM \"" TABLE_STATS_NAME "\" WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name;\n" + "END;\n"); + + return(ret); +} +/* @} */ + +/*********************************************************************//** +Executes +DELETE FROM mysql.innodb_index_stats +WHERE database_name = '...' AND table_name = '...'; +Creates its own transaction and commits it. 
+dict_stats_delete_from_index_stats() @{ +@return DB_SUCCESS or error code */ +UNIV_INLINE +dberr_t +dict_stats_delete_from_index_stats( +/*===============================*/ + const char* database_name, /*!< in: database name, e.g. 'db' */ + const char* table_name) /*!< in: table name, e.g. 'table' */ +{ + pars_info_t* pinfo; + dberr_t ret; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_STAT */ + ut_ad(mutex_own(&dict_sys->mutex)); + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "database_name", database_name); + pars_info_add_str_literal(pinfo, "table_name", table_name); + + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE DELETE_FROM_INDEX_STATS () IS\n" + "BEGIN\n" + "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name;\n" + "END;\n"); return(ret); } @@ -2640,130 +3329,332 @@ dict_stats_delete_index_stats( /*********************************************************************//** Removes the statistics for a table and all of its indexes from the persistent statistics storage if it exists and if there is data stored for -the table. This function creates its own transaction and commits it. -dict_stats_delete_table_stats() @{ +the table. This function creates its own transaction and commits it. +dict_stats_drop_table() @{ @return DB_SUCCESS or error code */ UNIV_INTERN -enum db_err -dict_stats_delete_table_stats( -/*==========================*/ - const char* table_name, /*!< in: table name */ +dberr_t +dict_stats_drop_table( +/*==================*/ + const char* db_and_table, /*!< in: db and table, e.g. 
'db/table' */ char* errstr, /*!< out: error message if != DB_SUCCESS is returned */ ulint errstr_sz) /*!< in: size of errstr buffer */ { - char database_name[MAX_DATABASE_NAME_LEN + 1]; - const char* table_name_strip; /* without leading db name */ - trx_t* trx; - pars_info_t* pinfo; - enum db_err ret = DB_ERROR; - dict_stats_t* dict_stats; + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + dberr_t ret; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_STAT */ + ut_ad(mutex_own(&dict_sys->mutex)); /* skip tables that do not contain a database name e.g. if we are dropping SYS_TABLES */ - if (strchr(table_name, '/') == NULL) { + if (strchr(db_and_table, '/') == NULL) { return(DB_SUCCESS); } /* skip innodb_table_stats and innodb_index_stats themselves */ - if (strcmp(table_name, TABLE_STATS_NAME) == 0 - || strcmp(table_name, INDEX_STATS_NAME) == 0) { + if (strcmp(db_and_table, TABLE_STATS_NAME) == 0 + || strcmp(db_and_table, INDEX_STATS_NAME) == 0) { return(DB_SUCCESS); } - /* Create a new private trx */ + dict_fs2utf8(db_and_table, db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); - trx = trx_allocate_for_background(); + ret = dict_stats_delete_from_table_stats(db_utf8, table_utf8); - /* Use 'read-uncommitted' so that the SELECTs we execute - do not get blocked in case some user has locked the rows we - are SELECTing */ + if (ret == DB_SUCCESS) { + ret = dict_stats_delete_from_index_stats(db_utf8, table_utf8); + } - trx->isolation_level = TRX_ISO_READ_UNCOMMITTED; + if (ret == DB_STATS_DO_NOT_EXIST) { + ret = DB_SUCCESS; + } - trx_start_if_not_started(trx); + if (ret != DB_SUCCESS) { - /* Increment table reference count to prevent the tables from - being DROPped just before que_eval_sql(). */ - dict_stats = dict_stats_open(); + ut_snprintf(errstr, errstr_sz, + "Unable to delete statistics for table %s.%s: %s. 
" + "They can be deleted later using " - if (dict_stats == NULL) { - /* stats tables do not exist or have unexpected structure */ - ret = DB_SUCCESS; - goto commit_and_return; + "DELETE FROM %s WHERE " + "database_name = '%s' AND " + "table_name = '%s'; " + + "DELETE FROM %s WHERE " + "database_name = '%s' AND " + "table_name = '%s';", + + db_utf8, table_utf8, + ut_strerr(ret), + + INDEX_STATS_NAME_PRINT, + db_utf8, table_utf8, + + TABLE_STATS_NAME_PRINT, + db_utf8, table_utf8); } - ut_snprintf(database_name, sizeof(database_name), "%.*s", - (int) dict_get_db_name_len(table_name), - table_name); + return(ret); +} +/* @} */ + +/*********************************************************************//** +Executes +UPDATE mysql.innodb_table_stats SET +database_name = '...', table_name = '...' +WHERE database_name = '...' AND table_name = '...'; +Creates its own transaction and commits it. +dict_stats_rename_in_table_stats() @{ +@return DB_SUCCESS or error code */ +UNIV_INLINE +dberr_t +dict_stats_rename_in_table_stats( +/*=============================*/ + const char* old_dbname_utf8,/*!< in: database name, e.g. 'olddb' */ + const char* old_tablename_utf8,/*!< in: table name, e.g. 'oldtable' */ + const char* new_dbname_utf8,/*!< in: database name, e.g. 'newdb' */ + const char* new_tablename_utf8)/*!< in: table name, e.g. 
'newtable' */ +{ + pars_info_t* pinfo; + dberr_t ret; - table_name_strip = dict_remove_db_name(table_name); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_STAT */ + ut_ad(mutex_own(&dict_sys->mutex)); pinfo = pars_info_create(); - pars_info_add_str_literal(pinfo, "database_name", database_name); + pars_info_add_str_literal(pinfo, "old_dbname_utf8", old_dbname_utf8); + pars_info_add_str_literal(pinfo, "old_tablename_utf8", old_tablename_utf8); + pars_info_add_str_literal(pinfo, "new_dbname_utf8", new_dbname_utf8); + pars_info_add_str_literal(pinfo, "new_tablename_utf8", new_tablename_utf8); + + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE RENAME_IN_TABLE_STATS () IS\n" + "BEGIN\n" + "UPDATE \"" TABLE_STATS_NAME "\" SET\n" + "database_name = :new_dbname_utf8,\n" + "table_name = :new_tablename_utf8\n" + "WHERE\n" + "database_name = :old_dbname_utf8 AND\n" + "table_name = :old_tablename_utf8;\n" + "END;\n"); - pars_info_add_str_literal(pinfo, "table_name", table_name_strip); + return(ret); +} +/* @} */ - ret = que_eval_sql(pinfo, - "PROCEDURE DROP_TABLE_STATS () IS\n" - "BEGIN\n" +/*********************************************************************//** +Executes +UPDATE mysql.innodb_index_stats SET +database_name = '...', table_name = '...' +WHERE database_name = '...' AND table_name = '...'; +Creates its own transaction and commits it. +dict_stats_rename_in_index_stats() @{ +@return DB_SUCCESS or error code */ +UNIV_INLINE +dberr_t +dict_stats_rename_in_index_stats( +/*=============================*/ + const char* old_dbname_utf8,/*!< in: database name, e.g. 'olddb' */ + const char* old_tablename_utf8,/*!< in: table name, e.g. 'oldtable' */ + const char* new_dbname_utf8,/*!< in: database name, e.g. 'newdb' */ + const char* new_tablename_utf8)/*!< in: table name, e.g. 
'newtable' */ +{ + pars_info_t* pinfo; + dberr_t ret; - "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n" - "database_name = :database_name AND\n" - "table_name = :table_name;\n" +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_STAT */ + ut_ad(mutex_own(&dict_sys->mutex)); - "DELETE FROM \"" TABLE_STATS_NAME "\" WHERE\n" - "database_name = :database_name AND\n" - "table_name = :table_name;\n" + pinfo = pars_info_create(); - "END;\n", - TRUE, - trx); + pars_info_add_str_literal(pinfo, "old_dbname_utf8", old_dbname_utf8); + pars_info_add_str_literal(pinfo, "old_tablename_utf8", old_tablename_utf8); + pars_info_add_str_literal(pinfo, "new_dbname_utf8", new_dbname_utf8); + pars_info_add_str_literal(pinfo, "new_tablename_utf8", new_tablename_utf8); + + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE RENAME_IN_INDEX_STATS () IS\n" + "BEGIN\n" + "UPDATE \"" INDEX_STATS_NAME "\" SET\n" + "database_name = :new_dbname_utf8,\n" + "table_name = :new_tablename_utf8\n" + "WHERE\n" + "database_name = :old_dbname_utf8 AND\n" + "table_name = :old_tablename_utf8;\n" + "END;\n"); - /* pinfo is freed by que_eval_sql() */ + return(ret); +} +/* @} */ - if (ret != DB_SUCCESS) { +/*********************************************************************//** +Renames a table in InnoDB persistent stats storage. +This function creates its own transaction and commits it. +dict_stats_rename_table() @{ +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_rename_table( +/*====================*/ + const char* old_name, /*!< in: old name, e.g. 'db/table' */ + const char* new_name, /*!< in: new name, e.g. 
'db/table' */ + char* errstr, /*!< out: error string if != DB_SUCCESS + is returned */ + size_t errstr_sz) /*!< in: errstr size */ +{ + char old_db_utf8[MAX_DB_UTF8_LEN]; + char new_db_utf8[MAX_DB_UTF8_LEN]; + char old_table_utf8[MAX_TABLE_UTF8_LEN]; + char new_table_utf8[MAX_TABLE_UTF8_LEN]; + dberr_t ret; - ut_snprintf(errstr, errstr_sz, - "Unable to delete statistics for table %s.%s " - "from %s or %s%s. " - "They can be deleted later using " +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_STAT */ + ut_ad(!mutex_own(&dict_sys->mutex)); - "DELETE FROM %s WHERE " - "database_name = '%s' AND " - "table_name = '%s'; " + /* skip innodb_table_stats and innodb_index_stats themselves */ + if (strcmp(old_name, TABLE_STATS_NAME) == 0 + || strcmp(old_name, INDEX_STATS_NAME) == 0 + || strcmp(new_name, TABLE_STATS_NAME) == 0 + || strcmp(new_name, INDEX_STATS_NAME) == 0) { - "DELETE FROM %s WHERE " - "database_name = '%s' AND " - "table_name = '%s';", + return(DB_SUCCESS); + } - database_name, table_name_strip, - TABLE_STATS_NAME_PRINT, INDEX_STATS_NAME_PRINT, + dict_fs2utf8(old_name, old_db_utf8, sizeof(old_db_utf8), + old_table_utf8, sizeof(old_table_utf8)); - (ret == DB_LOCK_WAIT_TIMEOUT - ? 
" because the rows are locked" - : ""), + dict_fs2utf8(new_name, new_db_utf8, sizeof(new_db_utf8), + new_table_utf8, sizeof(new_table_utf8)); - INDEX_STATS_NAME_PRINT, - database_name, table_name_strip, + rw_lock_x_lock(&dict_operation_lock); + mutex_enter(&dict_sys->mutex); + + ulint n_attempts = 0; + do { + n_attempts++; + + ret = dict_stats_rename_in_table_stats( + old_db_utf8, old_table_utf8, + new_db_utf8, new_table_utf8); + + if (ret == DB_DUPLICATE_KEY) { + dict_stats_delete_from_table_stats( + new_db_utf8, new_table_utf8); + } + + if (ret == DB_STATS_DO_NOT_EXIST) { + ret = DB_SUCCESS; + } + + if (ret != DB_SUCCESS) { + mutex_exit(&dict_sys->mutex); + rw_lock_x_unlock(&dict_operation_lock); + os_thread_sleep(200000 /* 0.2 sec */); + rw_lock_x_lock(&dict_operation_lock); + mutex_enter(&dict_sys->mutex); + } + } while ((ret == DB_DEADLOCK + || ret == DB_DUPLICATE_KEY + || ret == DB_LOCK_WAIT_TIMEOUT) + && n_attempts < 5); + + if (ret != DB_SUCCESS) { + ut_snprintf(errstr, errstr_sz, + "Unable to rename statistics from " + "%s.%s to %s.%s in %s: %s. 
" + "They can be renamed later using " + + "UPDATE %s SET " + "database_name = '%s', " + "table_name = '%s' " + "WHERE " + "database_name = '%s' AND " + "table_name = '%s';", + old_db_utf8, old_table_utf8, + new_db_utf8, new_table_utf8, TABLE_STATS_NAME_PRINT, - database_name, table_name_strip); + ut_strerr(ret), - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: %s\n", errstr); + TABLE_STATS_NAME_PRINT, + new_db_utf8, new_table_utf8, + old_db_utf8, old_table_utf8); + mutex_exit(&dict_sys->mutex); + rw_lock_x_unlock(&dict_operation_lock); + return(ret); } + /* else */ - dict_stats_close(dict_stats); + n_attempts = 0; + do { + n_attempts++; -commit_and_return: + ret = dict_stats_rename_in_index_stats( + old_db_utf8, old_table_utf8, + new_db_utf8, new_table_utf8); - trx_commit_for_mysql(trx); + if (ret == DB_DUPLICATE_KEY) { + dict_stats_delete_from_index_stats( + new_db_utf8, new_table_utf8); + } - trx_free_for_background(trx); + if (ret == DB_STATS_DO_NOT_EXIST) { + ret = DB_SUCCESS; + } + + if (ret != DB_SUCCESS) { + mutex_exit(&dict_sys->mutex); + rw_lock_x_unlock(&dict_operation_lock); + os_thread_sleep(200000 /* 0.2 sec */); + rw_lock_x_lock(&dict_operation_lock); + mutex_enter(&dict_sys->mutex); + } + } while ((ret == DB_DEADLOCK + || ret == DB_DUPLICATE_KEY + || ret == DB_LOCK_WAIT_TIMEOUT) + && n_attempts < 5); + + mutex_exit(&dict_sys->mutex); + rw_lock_x_unlock(&dict_operation_lock); + + if (ret != DB_SUCCESS) { + ut_snprintf(errstr, errstr_sz, + "Unable to rename statistics from " + "%s.%s to %s.%s in %s: %s. 
" + "They can be renamed later using " + + "UPDATE %s SET " + "database_name = '%s', " + "table_name = '%s' " + "WHERE " + "database_name = '%s' AND " + "table_name = '%s';", + + old_db_utf8, old_table_utf8, + new_db_utf8, new_table_utf8, + INDEX_STATS_NAME_PRINT, + ut_strerr(ret), + + INDEX_STATS_NAME_PRINT, + new_db_utf8, new_table_utf8, + old_db_utf8, old_table_utf8); + } return(ret); } @@ -2933,13 +3824,13 @@ test_dict_stats_save() dict_table_t table; dict_index_t index1; dict_field_t index1_fields[1]; - ib_uint64_t index1_stat_n_diff_key_vals[2]; - ib_uint64_t index1_stat_n_sample_sizes[2]; + ib_uint64_t index1_stat_n_diff_key_vals[1]; + ib_uint64_t index1_stat_n_sample_sizes[1]; dict_index_t index2; dict_field_t index2_fields[4]; - ib_uint64_t index2_stat_n_diff_key_vals[5]; - ib_uint64_t index2_stat_n_sample_sizes[5]; - enum db_err ret; + ib_uint64_t index2_stat_n_diff_key_vals[4]; + ib_uint64_t index2_stat_n_sample_sizes[4]; + dberr_t ret; /* craft a dummy dict_table_t */ table.name = (char*) (TEST_DATABASE_NAME "/" TEST_TABLE_NAME); @@ -2949,16 +3840,11 @@ test_dict_stats_save() UT_LIST_INIT(table.indexes); UT_LIST_ADD_LAST(indexes, table.indexes, &index1); UT_LIST_ADD_LAST(indexes, table.indexes, &index2); -#ifdef UNIV_DEBUG - table.magic_n = DICT_TABLE_MAGIC_N; -#endif /* UNIV_DEBUG */ + ut_d(table.magic_n = DICT_TABLE_MAGIC_N); + ut_d(index1.magic_n = DICT_INDEX_MAGIC_N); index1.name = TEST_IDX1_NAME; index1.table = &table; -#ifdef UNIV_DEBUG - index1.magic_n = DICT_INDEX_MAGIC_N; -#endif /* UNIV_DEBUG */ - index1.to_be_dropped = 0; index1.cached = 1; index1.n_uniq = 1; index1.fields = index1_fields; @@ -2967,17 +3853,12 @@ test_dict_stats_save() index1.stat_index_size = TEST_IDX1_INDEX_SIZE; index1.stat_n_leaf_pages = TEST_IDX1_N_LEAF_PAGES; index1_fields[0].name = TEST_IDX1_COL1_NAME; - index1_stat_n_diff_key_vals[0] = 1; /* dummy */ - index1_stat_n_diff_key_vals[1] = TEST_IDX1_N_DIFF1; - index1_stat_n_sample_sizes[0] = 0; /* dummy */ - 
index1_stat_n_sample_sizes[1] = TEST_IDX1_N_DIFF1_SAMPLE_SIZE; + index1_stat_n_diff_key_vals[0] = TEST_IDX1_N_DIFF1; + index1_stat_n_sample_sizes[0] = TEST_IDX1_N_DIFF1_SAMPLE_SIZE; + ut_d(index2.magic_n = DICT_INDEX_MAGIC_N); index2.name = TEST_IDX2_NAME; index2.table = &table; -#ifdef UNIV_DEBUG - index2.magic_n = DICT_INDEX_MAGIC_N; -#endif /* UNIV_DEBUG */ - index2.to_be_dropped = 0; index2.cached = 1; index2.n_uniq = 4; index2.fields = index2_fields; @@ -2989,18 +3870,16 @@ test_dict_stats_save() index2_fields[1].name = TEST_IDX2_COL2_NAME; index2_fields[2].name = TEST_IDX2_COL3_NAME; index2_fields[3].name = TEST_IDX2_COL4_NAME; - index2_stat_n_diff_key_vals[0] = 1; /* dummy */ - index2_stat_n_diff_key_vals[1] = TEST_IDX2_N_DIFF1; - index2_stat_n_diff_key_vals[2] = TEST_IDX2_N_DIFF2; - index2_stat_n_diff_key_vals[3] = TEST_IDX2_N_DIFF3; - index2_stat_n_diff_key_vals[4] = TEST_IDX2_N_DIFF4; - index2_stat_n_sample_sizes[0] = 0; /* dummy */ - index2_stat_n_sample_sizes[1] = TEST_IDX2_N_DIFF1_SAMPLE_SIZE; - index2_stat_n_sample_sizes[2] = TEST_IDX2_N_DIFF2_SAMPLE_SIZE; - index2_stat_n_sample_sizes[3] = TEST_IDX2_N_DIFF3_SAMPLE_SIZE; - index2_stat_n_sample_sizes[4] = TEST_IDX2_N_DIFF4_SAMPLE_SIZE; - - ret = dict_stats_save(&table, FALSE); + index2_stat_n_diff_key_vals[0] = TEST_IDX2_N_DIFF1; + index2_stat_n_diff_key_vals[1] = TEST_IDX2_N_DIFF2; + index2_stat_n_diff_key_vals[2] = TEST_IDX2_N_DIFF3; + index2_stat_n_diff_key_vals[3] = TEST_IDX2_N_DIFF4; + index2_stat_n_sample_sizes[0] = TEST_IDX2_N_DIFF1_SAMPLE_SIZE; + index2_stat_n_sample_sizes[1] = TEST_IDX2_N_DIFF2_SAMPLE_SIZE; + index2_stat_n_sample_sizes[2] = TEST_IDX2_N_DIFF3_SAMPLE_SIZE; + index2_stat_n_sample_sizes[3] = TEST_IDX2_N_DIFF4_SAMPLE_SIZE; + + ret = dict_stats_save(&table); ut_a(ret == DB_SUCCESS); @@ -3098,41 +3977,35 @@ test_dict_stats_fetch_from_ps() { dict_table_t table; dict_index_t index1; - ib_uint64_t index1_stat_n_diff_key_vals[2]; - ib_uint64_t index1_stat_n_sample_sizes[2]; + ib_uint64_t 
index1_stat_n_diff_key_vals[1]; + ib_uint64_t index1_stat_n_sample_sizes[1]; dict_index_t index2; - ib_uint64_t index2_stat_n_diff_key_vals[5]; - ib_uint64_t index2_stat_n_sample_sizes[5]; - enum db_err ret; + ib_uint64_t index2_stat_n_diff_key_vals[4]; + ib_uint64_t index2_stat_n_sample_sizes[4]; + dberr_t ret; /* craft a dummy dict_table_t */ table.name = (char*) (TEST_DATABASE_NAME "/" TEST_TABLE_NAME); UT_LIST_INIT(table.indexes); UT_LIST_ADD_LAST(indexes, table.indexes, &index1); UT_LIST_ADD_LAST(indexes, table.indexes, &index2); -#ifdef UNIV_DEBUG - table.magic_n = DICT_TABLE_MAGIC_N; -#endif /* UNIV_DEBUG */ + ut_d(table.magic_n = DICT_TABLE_MAGIC_N); index1.name = TEST_IDX1_NAME; -#ifdef UNIV_DEBUG - index1.magic_n = DICT_INDEX_MAGIC_N; -#endif /* UNIV_DEBUG */ + ut_d(index1.magic_n = DICT_INDEX_MAGIC_N); index1.cached = 1; index1.n_uniq = 1; index1.stat_n_diff_key_vals = index1_stat_n_diff_key_vals; index1.stat_n_sample_sizes = index1_stat_n_sample_sizes; index2.name = TEST_IDX2_NAME; -#ifdef UNIV_DEBUG - index2.magic_n = DICT_INDEX_MAGIC_N; -#endif /* UNIV_DEBUG */ + ut_d(index2.magic_n = DICT_INDEX_MAGIC_N); index2.cached = 1; index2.n_uniq = 4; index2.stat_n_diff_key_vals = index2_stat_n_diff_key_vals; index2.stat_n_sample_sizes = index2_stat_n_sample_sizes; - ret = dict_stats_fetch_from_ps(&table, FALSE); + ret = dict_stats_fetch_from_ps(&table); ut_a(ret == DB_SUCCESS); @@ -3143,19 +4016,19 @@ test_dict_stats_fetch_from_ps() ut_a(index1.stat_index_size == TEST_IDX1_INDEX_SIZE); ut_a(index1.stat_n_leaf_pages == TEST_IDX1_N_LEAF_PAGES); - ut_a(index1_stat_n_diff_key_vals[1] == TEST_IDX1_N_DIFF1); - ut_a(index1_stat_n_sample_sizes[1] == TEST_IDX1_N_DIFF1_SAMPLE_SIZE); + ut_a(index1_stat_n_diff_key_vals[0] == TEST_IDX1_N_DIFF1); + ut_a(index1_stat_n_sample_sizes[0] == TEST_IDX1_N_DIFF1_SAMPLE_SIZE); ut_a(index2.stat_index_size == TEST_IDX2_INDEX_SIZE); ut_a(index2.stat_n_leaf_pages == TEST_IDX2_N_LEAF_PAGES); - ut_a(index2_stat_n_diff_key_vals[1] == 
TEST_IDX2_N_DIFF1); - ut_a(index2_stat_n_sample_sizes[1] == TEST_IDX2_N_DIFF1_SAMPLE_SIZE); - ut_a(index2_stat_n_diff_key_vals[2] == TEST_IDX2_N_DIFF2); - ut_a(index2_stat_n_sample_sizes[2] == TEST_IDX2_N_DIFF2_SAMPLE_SIZE); - ut_a(index2_stat_n_diff_key_vals[3] == TEST_IDX2_N_DIFF3); - ut_a(index2_stat_n_sample_sizes[3] == TEST_IDX2_N_DIFF3_SAMPLE_SIZE); - ut_a(index2_stat_n_diff_key_vals[4] == TEST_IDX2_N_DIFF4); - ut_a(index2_stat_n_sample_sizes[4] == TEST_IDX2_N_DIFF4_SAMPLE_SIZE); + ut_a(index2_stat_n_diff_key_vals[0] == TEST_IDX2_N_DIFF1); + ut_a(index2_stat_n_sample_sizes[0] == TEST_IDX2_N_DIFF1_SAMPLE_SIZE); + ut_a(index2_stat_n_diff_key_vals[1] == TEST_IDX2_N_DIFF2); + ut_a(index2_stat_n_sample_sizes[1] == TEST_IDX2_N_DIFF2_SAMPLE_SIZE); + ut_a(index2_stat_n_diff_key_vals[2] == TEST_IDX2_N_DIFF3); + ut_a(index2_stat_n_sample_sizes[2] == TEST_IDX2_N_DIFF3_SAMPLE_SIZE); + ut_a(index2_stat_n_diff_key_vals[3] == TEST_IDX2_N_DIFF4); + ut_a(index2_stat_n_sample_sizes[3] == TEST_IDX2_N_DIFF4_SAMPLE_SIZE); printf("OK: fetch successful\n"); } diff --git a/storage/innobase/dict/dict0stats_bg.cc b/storage/innobase/dict/dict0stats_bg.cc new file mode 100644 index 00000000000..7a30b748e7f --- /dev/null +++ b/storage/innobase/dict/dict0stats_bg.cc @@ -0,0 +1,392 @@ +/***************************************************************************** + +Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file dict/dict0stats_bg.cc +Code used for background table and index stats gathering. + +Created Apr 25, 2012 Vasil Dimov +*******************************************************/ + +#include "row0mysql.h" +#include "srv0start.h" +#include "dict0stats.h" +#include "dict0stats_bg.h" + +#include <vector> + +/** Minimum time interval between stats recalc for a given table */ +#define MIN_RECALC_INTERVAL 10 /* seconds */ + +#define SHUTTING_DOWN() (srv_shutdown_state != SRV_SHUTDOWN_NONE) + +/** Event to wake up the stats thread */ +UNIV_INTERN os_event_t dict_stats_event = NULL; + +/** This mutex protects the "recalc_pool" variable. */ +static ib_mutex_t recalc_pool_mutex; +#ifdef HAVE_PSI_INTERFACE +static mysql_pfs_key_t recalc_pool_mutex_key; +#endif /* HAVE_PSI_INTERFACE */ + +/** The number of tables that can be added to "recalc_pool" before +it is enlarged */ +static const ulint RECALC_POOL_INITIAL_SLOTS = 128; + +/** The multitude of tables whose stats are to be automatically +recalculated - an STL vector */ +typedef std::vector<table_id_t> recalc_pool_t; +static recalc_pool_t recalc_pool; + +typedef recalc_pool_t::iterator recalc_pool_iterator_t; + +/*****************************************************************//** +Initialize the recalc pool, called once during thread initialization. */ +static +void +dict_stats_recalc_pool_init() +/*=========================*/ +{ + ut_ad(!srv_read_only_mode); + + recalc_pool.reserve(RECALC_POOL_INITIAL_SLOTS); +} + +/*****************************************************************//** +Free the resources occupied by the recalc pool, called once during +thread de-initialization. 
*/ +static +void +dict_stats_recalc_pool_deinit() +/*===========================*/ +{ + ut_ad(!srv_read_only_mode); + + recalc_pool.clear(); +} + +/*****************************************************************//** +Add a table to the recalc pool, which is processed by the +background stats gathering thread. Only the table id is added to the +list, so the table can be closed after being enqueued and it will be +opened when needed. If the table does not exist later (has been DROPped), +then it will be removed from the pool and skipped. +dict_stats_recalc_pool_add() @{ */ +UNIV_INTERN +void +dict_stats_recalc_pool_add( +/*=======================*/ + const dict_table_t* table) /*!< in: table to add */ +{ + ut_ad(!srv_read_only_mode); + + mutex_enter(&recalc_pool_mutex); + + /* quit if already in the list */ + for (recalc_pool_iterator_t iter = recalc_pool.begin(); + iter != recalc_pool.end(); + ++iter) { + + if (*iter == table->id) { + mutex_exit(&recalc_pool_mutex); + return; + } + } + + recalc_pool.push_back(table->id); + + mutex_exit(&recalc_pool_mutex); + + os_event_set(dict_stats_event); +} +/* @} */ + +/*****************************************************************//** +Get a table from the auto recalc pool. The returned table id is removed +from the pool. +dict_stats_recalc_pool_get() @{ +@return true if the pool was non-empty and "id" was set, false otherwise */ +static +bool +dict_stats_recalc_pool_get( +/*=======================*/ + table_id_t* id) /*!< out: table id, or unmodified if list is + empty */ +{ + ut_ad(!srv_read_only_mode); + + mutex_enter(&recalc_pool_mutex); + + if (recalc_pool.empty()) { + mutex_exit(&recalc_pool_mutex); + return(false); + } + + *id = recalc_pool[0]; + + recalc_pool.erase(recalc_pool.begin()); + + mutex_exit(&recalc_pool_mutex); + + return(true); +} +/* @} */ + +/*****************************************************************//** +Delete a given table from the auto recalc pool. 
+dict_stats_recalc_pool_del() */ +UNIV_INTERN +void +dict_stats_recalc_pool_del( +/*=======================*/ + const dict_table_t* table) /*!< in: table to remove */ +{ + ut_ad(!srv_read_only_mode); + ut_ad(mutex_own(&dict_sys->mutex)); + + mutex_enter(&recalc_pool_mutex); + + ut_ad(table->id > 0); + + for (recalc_pool_iterator_t iter = recalc_pool.begin(); + iter != recalc_pool.end(); + ++iter) { + + if (*iter == table->id) { + /* erase() invalidates the iterator */ + recalc_pool.erase(iter); + break; + } + } + + mutex_exit(&recalc_pool_mutex); +} + +/*****************************************************************//** +Wait until background stats thread has stopped using the specified table(s). +The caller must have locked the data dictionary using +row_mysql_lock_data_dictionary() and this function may unlock it temporarily +and restore the lock before it exits. +The background stats thead is guaranteed not to start using the specified +tables after this function returns and before the caller unlocks the data +dictionary because it sets the BG_STAT_IN_PROGRESS bit in table->stats_bg_flag +under dict_sys->mutex. 
+dict_stats_wait_bg_to_stop_using_table() @{ */ +UNIV_INTERN +void +dict_stats_wait_bg_to_stop_using_tables( +/*====================================*/ + dict_table_t* table1, /*!< in/out: table1 */ + dict_table_t* table2, /*!< in/out: table2, could be NULL */ + trx_t* trx) /*!< in/out: transaction to use for + unlocking/locking the data dict */ +{ + ut_ad(!srv_read_only_mode); + + while ((table1->stats_bg_flag & BG_STAT_IN_PROGRESS) + || (table2 != NULL + && (table2->stats_bg_flag & BG_STAT_IN_PROGRESS))) { + + table1->stats_bg_flag |= BG_STAT_SHOULD_QUIT; + if (table2 != NULL) { + table2->stats_bg_flag |= BG_STAT_SHOULD_QUIT; + } + + row_mysql_unlock_data_dictionary(trx); + os_thread_sleep(250000); + row_mysql_lock_data_dictionary(trx); + } +} +/* @} */ + +/*****************************************************************//** +Initialize global variables needed for the operation of dict_stats_thread() +Must be called before dict_stats_thread() is started. +dict_stats_thread_init() @{ */ +UNIV_INTERN +void +dict_stats_thread_init() +/*====================*/ +{ + ut_a(!srv_read_only_mode); + + dict_stats_event = os_event_create(); + + /* The recalc_pool_mutex is acquired from: + 1) the background stats gathering thread before any other latch + and released without latching anything else in between (thus + any level would do here) + 2) from row_update_statistics_if_needed() + and released without latching anything else in between. We know + that dict_sys->mutex (SYNC_DICT) is not acquired when + row_update_statistics_if_needed() is called and it may be acquired + inside that function (thus a level <=SYNC_DICT would do). + 3) from row_drop_table_for_mysql() after dict_sys->mutex (SYNC_DICT) + and dict_operation_lock (SYNC_DICT_OPERATION) have been locked + (thus a level <SYNC_DICT && <SYNC_DICT_OPERATION would do) + So we choose SYNC_STATS_AUTO_RECALC to be about below SYNC_DICT. 
*/ + mutex_create(recalc_pool_mutex_key, &recalc_pool_mutex, + SYNC_STATS_AUTO_RECALC); + + dict_stats_recalc_pool_init(); +} +/* @} */ + +/*****************************************************************//** +Free resources allocated by dict_stats_thread_init(), must be called +after dict_stats_thread() has exited. +dict_stats_thread_deinit() @{ */ +UNIV_INTERN +void +dict_stats_thread_deinit() +/*======================*/ +{ + ut_a(!srv_read_only_mode); + ut_ad(!srv_dict_stats_thread_active); + + dict_stats_recalc_pool_deinit(); + + mutex_free(&recalc_pool_mutex); + memset(&recalc_pool_mutex, 0x0, sizeof(recalc_pool_mutex)); + + os_event_free(dict_stats_event); + dict_stats_event = NULL; +} +/* @} */ + +/*****************************************************************//** +Get the first table that has been added for auto recalc and eventually +update its stats. +dict_stats_process_entry_from_recalc_pool() @{ */ +static +void +dict_stats_process_entry_from_recalc_pool() +/*=======================================*/ +{ + table_id_t table_id; + + ut_ad(!srv_read_only_mode); + + /* pop the first table from the auto recalc pool */ + if (!dict_stats_recalc_pool_get(&table_id)) { + /* no tables for auto recalc */ + return; + } + + dict_table_t* table; + + mutex_enter(&dict_sys->mutex); + + table = dict_table_open_on_id(table_id, TRUE, FALSE); + + if (table == NULL) { + /* table does not exist, must have been DROPped + after its id was enqueued */ + mutex_exit(&dict_sys->mutex); + return; + } + + /* Check whether table is corrupted */ + if (table->corrupted) { + dict_table_close(table, TRUE, FALSE); + mutex_exit(&dict_sys->mutex); + return; + } + + table->stats_bg_flag = BG_STAT_IN_PROGRESS; + + mutex_exit(&dict_sys->mutex); + + /* ut_time() could be expensive, the current function + is called once every time a table has been changed more than 10% and + on a system with lots of small tables, this could become hot. 
If we + find out that this is a problem, then the check below could eventually + be replaced with something else, though a time interval is the natural + approach. */ + + if (ut_difftime(ut_time(), table->stats_last_recalc) + < MIN_RECALC_INTERVAL) { + + /* Stats were (re)calculated not long ago. To avoid + too frequent stats updates we put back the table on + the auto recalc list and do nothing. */ + + dict_stats_recalc_pool_add(table); + + } else { + + dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT); + } + + mutex_enter(&dict_sys->mutex); + + table->stats_bg_flag = BG_STAT_NONE; + + dict_table_close(table, TRUE, FALSE); + + mutex_exit(&dict_sys->mutex); +} +/* @} */ + +/*****************************************************************//** +This is the thread for background stats gathering. It pops tables, from +the auto recalc list and proceeds them, eventually recalculating their +statistics. +dict_stats_thread() @{ +@return this function does not return, it calls os_thread_exit() */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(dict_stats_thread)( +/*==============================*/ + void* arg __attribute__((unused))) /*!< in: a dummy parameter + required by os_thread_create */ +{ + ut_a(!srv_read_only_mode); + + srv_dict_stats_thread_active = TRUE; + + while (!SHUTTING_DOWN()) { + + /* Wake up periodically even if not signaled. This is + because we may lose an event - if the below call to + dict_stats_process_entry_from_recalc_pool() puts the entry back + in the list, the os_event_set() will be lost by the subsequent + os_event_reset(). */ + os_event_wait_time( + dict_stats_event, MIN_RECALC_INTERVAL * 1000000); + + if (SHUTTING_DOWN()) { + break; + } + + dict_stats_process_entry_from_recalc_pool(); + + os_event_reset(dict_stats_event); + } + + srv_dict_stats_thread_active = FALSE; + + /* We count the number of threads in os_thread_exit(). A created + thread should always use that to exit instead of return(). 
*/ + os_thread_exit(NULL); + + OS_THREAD_DUMMY_RETURN; +} +/* @} */ + +/* vim: set foldmethod=marker foldmarker=@{,@}: */ diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 2e6835fe0c0..a89875352c6 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -25,6 +25,9 @@ Created 10/25/1995 Heikki Tuuri #include "fil0fil.h" +#include <debug_sync.h> +#include <my_dbug.h> + #include "mem0mem.h" #include "hash0hash.h" #include "os0file.h" @@ -41,7 +44,7 @@ Created 10/25/1995 Heikki Tuuri #include "page0page.h" #include "page0zip.h" #include "trx0sys.h" -#include "buf0rea.h" +#include "row0mysql.h" #ifndef UNIV_HOTBACKUP # include "buf0lru.h" # include "ibuf0ibuf.h" @@ -138,7 +141,7 @@ UNIV_INTERN mysql_pfs_key_t fil_space_latch_key; #endif /* UNIV_PFS_RWLOCK */ /** File node of a tablespace or the log data space */ -struct fil_node_struct { +struct fil_node_t { fil_space_t* space; /*!< backpointer to the space where this node belongs */ char* name; /*!< path to the file */ @@ -172,11 +175,11 @@ struct fil_node_struct { ulint magic_n;/*!< FIL_NODE_MAGIC_N */ }; -/** Value of fil_node_struct::magic_n */ +/** Value of fil_node_t::magic_n */ #define FIL_NODE_MAGIC_N 89389 /** Tablespace or log data space: let us call them by a common name space */ -struct fil_space_struct { +struct fil_space_t { char* name; /*!< space name = the path to the first file in it */ ulint id; /*!< space id */ @@ -215,7 +218,8 @@ struct fil_space_struct { last incomplete megabytes in data files may be ignored if space == 0 */ ulint 
flags; /*!< tablespace flags; see - fsp_flags_validate(), fsp_flags_get_zip_size() */ + fsp_flags_is_valid(), + fsp_flags_get_zip_size() */ ulint n_reserved_extents; /*!< number of reserved free extents for ongoing operations like B-tree page split */ @@ -238,26 +242,23 @@ struct fil_space_struct { UT_LIST_NODE_T(fil_space_t) unflushed_spaces; /*!< list of spaces with at least one unflushed file we have written to */ - ibool is_in_unflushed_spaces; /*!< TRUE if this space is - currently in unflushed_spaces */ + bool is_in_unflushed_spaces; + /*!< true if this space is currently in + unflushed_spaces */ UT_LIST_NODE_T(fil_space_t) space_list; /*!< list of all spaces */ ulint magic_n;/*!< FIL_SPACE_MAGIC_N */ }; -/** Value of fil_space_struct::magic_n */ +/** Value of fil_space_t::magic_n */ #define FIL_SPACE_MAGIC_N 89472 -/** The tablespace memory cache */ -typedef struct fil_system_struct fil_system_t; - /** The tablespace memory cache; also the totality of logs (the log data space) is stored here; below we talk about tablespaces, but also the ib_logfiles form a 'space' and it is handled here */ - -struct fil_system_struct { +struct fil_system_t { #ifndef UNIV_HOTBACKUP - mutex_t mutex; /*!< The mutex protecting the cache */ + ib_mutex_t mutex; /*!< The mutex protecting the cache */ #endif /* !UNIV_HOTBACKUP */ hash_table_t* spaces; /*!< The hash table of spaces in the system; they are hashed on the space @@ -313,7 +314,17 @@ initialized. */ static fil_system_t* fil_system = NULL; /** Determine if (i) is a user tablespace id or not. */ -# define fil_is_user_tablespace_id(i) ((i) > srv_undo_tablespaces) +# define fil_is_user_tablespace_id(i) ((i) > srv_undo_tablespaces_open) + +/** Determine if user has explicitly disabled fsync(). 
*/ +#ifndef __WIN__ +# define fil_buffering_disabled(s) \ + ((s)->purpose == FIL_TABLESPACE \ + && srv_unix_file_flush_method \ + == SRV_UNIX_O_DIRECT_NO_FSYNC) +#else /* __WIN__ */ +# define fil_buffering_disabled(s) (0) +#endif /* __WIN__ */ #ifdef UNIV_DEBUG /** Try fil_validate() every this many times */ @@ -384,16 +395,6 @@ fil_node_complete_io( the node as modified if type == OS_FILE_WRITE */ /*******************************************************************//** -Checks if a single-table tablespace for a given table name exists in the -tablespace memory cache. -@return space id, ULINT_UNDEFINED if not found */ -static -ulint -fil_get_space_id_for_table( -/*=======================*/ - const char* name); /*!< in: table name in the standard - 'databasename/tablename' format */ -/*******************************************************************//** Frees a space object from the tablespace memory cache. Closes the files in the chain but does not delete them. There must not be any pending i/o's or flushes on the files. @@ -412,7 +413,7 @@ calculating the byte offset within a space. @return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do i/o on a tablespace which does not exist */ UNIV_INLINE -ulint +dberr_t fil_read( /*=====*/ ibool sync, /*!< in: TRUE if synchronous aio is desired */ @@ -441,7 +442,7 @@ calculating the byte offset within a space. 
@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do i/o on a tablespace which does not exist */ UNIV_INLINE -ulint +dberr_t fil_write( /*======*/ ibool sync, /*!< in: TRUE if synchronous aio is desired */ @@ -459,6 +460,8 @@ fil_write( void* message) /*!< in: message for aio handler if non-sync aio used, else ignored */ { + ut_ad(!srv_read_only_mode); + return(fil_io(OS_FILE_WRITE, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message)); } @@ -592,9 +595,9 @@ fil_space_get_type( /**********************************************************************//** Checks if all the file nodes in a space are flushed. The caller must hold the fil_system mutex. -@return TRUE if all are flushed */ +@return true if all are flushed */ static -ibool +bool fil_space_is_flushed( /*=================*/ fil_space_t* space) /*!< in: space */ @@ -608,19 +611,21 @@ fil_space_is_flushed( while (node) { if (node->modification_counter > node->flush_counter) { - return(FALSE); + ut_ad(!fil_buffering_disabled(space)); + return(false); } node = UT_LIST_GET_NEXT(chain, node); } - return(TRUE); + return(true); } /*******************************************************************//** -Appends a new file to the chain of files of a space. File must be closed. */ +Appends a new file to the chain of files of a space. File must be closed. 
+@return pointer to the file name, or NULL on error */ UNIV_INTERN -void +char* fil_node_create( /*============*/ const char* name, /*!< in: file name (file must be closed) */ @@ -663,7 +668,7 @@ fil_node_create( mutex_exit(&fil_system->mutex); - return; + return(NULL); } space->size += size; @@ -678,6 +683,8 @@ fil_node_create( } mutex_exit(&fil_system->mutex); + + return(node->name); } /********************************************************************//** @@ -718,7 +725,7 @@ fil_node_open_file( OS_FILE_READ_ONLY, &success); if (!success) { /* The following call prints an error message */ - os_file_get_last_error(TRUE); + os_file_get_last_error(true); ut_print_timestamp(stderr); @@ -798,9 +805,9 @@ fil_node_open_file( != page_size)) { fprintf(stderr, "InnoDB: Error: tablespace file %s" - " has page size %lx\n" + " has page size 0x%lx\n" "InnoDB: but the data dictionary" - " expects page size %lx!\n", + " expects page size 0x%lx!\n", node->name, flags, fsp_flags_get_page_size(space->flags)); @@ -809,9 +816,9 @@ fil_node_open_file( if (UNIV_UNLIKELY(space->flags != flags)) { fprintf(stderr, - "InnoDB: Error: table flags are %lx" + "InnoDB: Error: table flags are 0x%lx" " in the data dictionary\n" - "InnoDB: but the flags in file %s are %lx!\n", + "InnoDB: but the flags in file %s are 0x%lx!\n", space->flags, node->name, flags); ut_error; @@ -971,6 +978,7 @@ fil_try_to_close_file_in_LRU( ", because mod_count %ld != fl_count %ld\n", (long) node->modification_counter, (long) node->flush_counter); + } if (node->being_extended) { @@ -1143,10 +1151,15 @@ fil_node_free( node->modification_counter = node->flush_counter; - if (space->is_in_unflushed_spaces - && fil_space_is_flushed(space)) { + if (fil_buffering_disabled(space)) { + + ut_ad(!space->is_in_unflushed_spaces); + ut_ad(fil_space_is_flushed(space)); - space->is_in_unflushed_spaces = FALSE; + } else if (space->is_in_unflushed_spaces + && fil_space_is_flushed(space)) { + + space->is_in_unflushed_spaces = false; 
UT_LIST_REMOVE(unflushed_spaces, system->unflushed_spaces, @@ -1215,82 +1228,50 @@ fil_space_create( { fil_space_t* space; - fsp_flags_validate(flags); - -try_again: - /*printf( - "InnoDB: Adding tablespace %lu of name %s, purpose %lu\n", id, name, - purpose);*/ + DBUG_EXECUTE_IF("fil_space_create_failure", return(false);); ut_a(fil_system); - ut_a(name); + ut_a(fsp_flags_is_valid(flags)); - mutex_enter(&fil_system->mutex); + /* Look for a matching tablespace and if found free it. */ + do { + mutex_enter(&fil_system->mutex); - space = fil_space_get_by_name(name); + space = fil_space_get_by_name(name); - if (UNIV_LIKELY_NULL(space)) { - ibool success; - ulint namesake_id; + if (space != 0) { + ib_logf(IB_LOG_LEVEL_WARN, + "Tablespace '%s' exists in the cache " + "with id %lu", name, (ulong) id); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Warning: trying to init to the" - " tablespace memory cache\n" - "InnoDB: a tablespace %lu of name ", (ulong) id); - ut_print_filename(stderr, name); - fprintf(stderr, ",\n" - "InnoDB: but a tablespace %lu of the same name\n" - "InnoDB: already exists in the" - " tablespace memory cache!\n", - (ulong) space->id); + if (id == 0 || purpose != FIL_TABLESPACE) { - if (id == 0 || purpose != FIL_TABLESPACE) { + mutex_exit(&fil_system->mutex); - mutex_exit(&fil_system->mutex); + return(FALSE); + } - return(FALSE); - } + ib_logf(IB_LOG_LEVEL_WARN, + "Freeing existing tablespace '%s' entry " + "from the cache with id %lu", + name, (ulong) id); - fprintf(stderr, - "InnoDB: We assume that InnoDB did a crash recovery," - " and you had\n" - "InnoDB: an .ibd file for which the table" - " did not exist in the\n" - "InnoDB: InnoDB internal data dictionary in the" - " ibdata files.\n" - "InnoDB: We assume that you later removed the" - " .ibd and .frm files,\n" - "InnoDB: and are now trying to recreate the table." 
- " We now remove the\n" - "InnoDB: conflicting tablespace object" - " from the memory cache and try\n" - "InnoDB: the init again.\n"); - - namesake_id = space->id; - - success = fil_space_free(namesake_id, FALSE); - ut_a(success); + ibool success = fil_space_free(space->id, FALSE); + ut_a(success); - mutex_exit(&fil_system->mutex); + mutex_exit(&fil_system->mutex); + } - goto try_again; - } + } while (space != 0); space = fil_space_get_by_id(id); - if (UNIV_LIKELY_NULL(space)) { - fprintf(stderr, - "InnoDB: Error: trying to add tablespace %lu" - " of name ", (ulong) id); - ut_print_filename(stderr, name); - fprintf(stderr, "\n" - "InnoDB: to the tablespace memory cache," - " but tablespace\n" - "InnoDB: %lu of name ", (ulong) space->id); - ut_print_filename(stderr, space->name); - fputs(" already exists in the tablespace\n" - "InnoDB: memory cache!\n", stderr); + if (space != 0) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Trying to add tablespace '%s' with id %lu " + "to the tablespace memory cache, but tablespace '%s' " + "with id %lu already exists in the cache!", + name, (ulong) id, space->name, (ulong) space->id); mutex_exit(&fil_system->mutex); @@ -1306,15 +1287,15 @@ try_again: space->tablespace_version = fil_system->tablespace_version; space->mark = FALSE; - if (UNIV_LIKELY(purpose == FIL_TABLESPACE && !recv_recovery_on) - && UNIV_UNLIKELY(id > fil_system->max_assigned_id)) { + if (purpose == FIL_TABLESPACE && !recv_recovery_on + && id > fil_system->max_assigned_id) { + if (!fil_system->space_id_reuse_warned) { fil_system->space_id_reuse_warned = TRUE; - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Warning: allocated tablespace %lu," - " old maximum was %lu\n", + ib_logf(IB_LOG_LEVEL_WARN, + "Allocated tablespace %lu, old maximum " + "was %lu", (ulong) id, (ulong) fil_system->max_assigned_id); } @@ -1333,7 +1314,7 @@ try_again: HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash, ut_fold_string(name), space); - space->is_in_unflushed_spaces = 
FALSE; + space->is_in_unflushed_spaces = false; UT_LIST_ADD_LAST(space_list, fil_system->space_list, space); @@ -1418,7 +1399,6 @@ fil_space_free( { fil_space_t* space; fil_space_t* fnamespace; - fil_node_t* fil_node; ut_ad(mutex_own(&fil_system->mutex)); @@ -1444,7 +1424,9 @@ fil_space_free( ut_fold_string(space->name), space); if (space->is_in_unflushed_spaces) { - space->is_in_unflushed_spaces = FALSE; + + ut_ad(!fil_buffering_disabled(space)); + space->is_in_unflushed_spaces = false; UT_LIST_REMOVE(unflushed_spaces, fil_system->unflushed_spaces, space); @@ -1455,12 +1437,11 @@ fil_space_free( ut_a(space->magic_n == FIL_SPACE_MAGIC_N); ut_a(0 == space->n_pending_flushes); - fil_node = UT_LIST_GET_FIRST(space->chain); + for (fil_node_t* fil_node = UT_LIST_GET_FIRST(space->chain); + fil_node != NULL; + fil_node = UT_LIST_GET_FIRST(space->chain)) { - while (fil_node != NULL) { fil_node_free(fil_node, fil_system, space); - - fil_node = UT_LIST_GET_FIRST(space->chain); } ut_a(0 == UT_LIST_GET_LEN(space->chain)); @@ -1478,34 +1459,30 @@ fil_space_free( } /*******************************************************************//** -Returns the size of the space in pages. The tablespace must be cached in the -memory cache. -@return space size, 0 if space not found */ -UNIV_INTERN -ulint -fil_space_get_size( -/*===============*/ +Returns a pointer to the file_space_t that is in the memory cache +associated with a space id. The caller must lock fil_system->mutex. 
+@return file_space_t pointer, NULL if space not found */ +UNIV_INLINE +fil_space_t* +fil_space_get_space( +/*================*/ ulint id) /*!< in: space id */ { - fil_node_t* node; fil_space_t* space; - ulint size; + fil_node_t* node; ut_ad(fil_system); - fil_mutex_enter_and_prepare_for_io(id); - space = fil_space_get_by_id(id); - if (space == NULL) { - mutex_exit(&fil_system->mutex); - - return(0); + return(NULL); } if (space->size == 0 && space->purpose == FIL_TABLESPACE) { ut_a(id != 0); + /* The following code must change when InnoDB supports + multiple datafiles per tablespace. */ ut_a(1 == UT_LIST_GET_LEN(space->chain)); node = UT_LIST_GET_FIRST(space->chain); @@ -1518,7 +1495,69 @@ fil_space_get_size( fil_node_complete_io(node, fil_system, OS_FILE_READ); } - size = space->size; + return(space); +} + +/*******************************************************************//** +Returns the path from the first fil_node_t found for the space ID sent. +The caller is responsible for freeing the memory allocated here for the +value returned. +@return own: A copy of fil_node_t::path, NULL if space ID is zero +or not found. */ +UNIV_INTERN +char* +fil_space_get_first_path( +/*=====================*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + fil_node_t* node; + char* path; + + ut_ad(fil_system); + ut_a(id); + + fil_mutex_enter_and_prepare_for_io(id); + + space = fil_space_get_space(id); + + if (space == NULL) { + mutex_exit(&fil_system->mutex); + + return(NULL); + } + + ut_ad(mutex_own(&fil_system->mutex)); + + node = UT_LIST_GET_FIRST(space->chain); + + path = mem_strdup(node->name); + + mutex_exit(&fil_system->mutex); + + return(path); +} + +/*******************************************************************//** +Returns the size of the space in pages. The tablespace must be cached in the +memory cache. 
+@return space size, 0 if space not found */ +UNIV_INTERN +ulint +fil_space_get_size( +/*===============*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + ulint size; + + ut_ad(fil_system); + + fil_mutex_enter_and_prepare_for_io(id); + + space = fil_space_get_space(id); + + size = space ? space->size : 0; mutex_exit(&fil_system->mutex); @@ -1535,19 +1574,18 @@ fil_space_get_flags( /*================*/ ulint id) /*!< in: space id */ { - fil_node_t* node; fil_space_t* space; ulint flags; ut_ad(fil_system); - if (UNIV_UNLIKELY(!id)) { + if (!id) { return(0); } fil_mutex_enter_and_prepare_for_io(id); - space = fil_space_get_by_id(id); + space = fil_space_get_space(id); if (space == NULL) { mutex_exit(&fil_system->mutex); @@ -1555,21 +1593,6 @@ fil_space_get_flags( return(ULINT_UNDEFINED); } - if (space->size == 0 && space->purpose == FIL_TABLESPACE) { - ut_a(id != 0); - - ut_a(1 == UT_LIST_GET_LEN(space->chain)); - - node = UT_LIST_GET_FIRST(space->chain); - - /* It must be a single-table tablespace and we have not opened - the file yet; the following calls will open it and update the - size fields */ - - fil_node_prepare_for_io(node, fil_system, space); - fil_node_complete_io(node, fil_system, OS_FILE_READ); - } - flags = space->flags; mutex_exit(&fil_system->mutex); @@ -1744,6 +1767,49 @@ fil_close_all_files(void) } /*******************************************************************//** +Closes the redo log files. There must not be any pending i/o's or not +flushed modifications in the files. 
*/ +UNIV_INTERN +void +fil_close_log_files( +/*================*/ + bool free) /*!< in: whether to free the memory object */ +{ + fil_space_t* space; + + mutex_enter(&fil_system->mutex); + + space = UT_LIST_GET_FIRST(fil_system->space_list); + + while (space != NULL) { + fil_node_t* node; + fil_space_t* prev_space = space; + + if (space->purpose != FIL_LOG) { + space = UT_LIST_GET_NEXT(space_list, space); + continue; + } + + for (node = UT_LIST_GET_FIRST(space->chain); + node != NULL; + node = UT_LIST_GET_NEXT(chain, node)) { + + if (node->open) { + fil_node_close_file(node, fil_system); + } + } + + space = UT_LIST_GET_NEXT(space_list, space); + + if (free) { + fil_space_free(prev_space->id, FALSE); + } + } + + mutex_exit(&fil_system->mutex); +} + +/*******************************************************************//** Sets the max tablespace id counter if the given number is bigger than the previous value. */ UNIV_INTERN @@ -1773,8 +1839,8 @@ fil_set_max_space_id_if_bigger( Writes the flushed lsn and the latest archived log number to the page header of the first page of a data file of the system tablespace (space 0), which is uncompressed. 
*/ -static -ulint +static __attribute__((warn_unused_result)) +dberr_t fil_write_lsn_and_arch_no_to_file( /*==============================*/ ulint space, /*!< in: space to write to */ @@ -1786,19 +1852,23 @@ fil_write_lsn_and_arch_no_to_file( { byte* buf1; byte* buf; + dberr_t err; buf1 = static_cast<byte*>(mem_alloc(2 * UNIV_PAGE_SIZE)); buf = static_cast<byte*>(ut_align(buf1, UNIV_PAGE_SIZE)); - fil_read(TRUE, space, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL); - - mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn); + err = fil_read(TRUE, space, 0, sum_of_sizes, 0, + UNIV_PAGE_SIZE, buf, NULL); + if (err == DB_SUCCESS) { + mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn); - fil_write(TRUE, space, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL); + err = fil_write(TRUE, space, 0, sum_of_sizes, 0, + UNIV_PAGE_SIZE, buf, NULL); + } mem_free(buf1); - return(DB_SUCCESS); + return(err); } /****************************************************************//** @@ -1806,7 +1876,7 @@ Writes the flushed lsn and the latest archived log number to the page header of the first page of each data file in the system tablespace. 
@return DB_SUCCESS or error number */ UNIV_INTERN -ulint +dberr_t fil_write_flushed_lsn_to_data_files( /*================================*/ lsn_t lsn, /*!< in: lsn to write */ @@ -1814,7 +1884,7 @@ fil_write_flushed_lsn_to_data_files( { fil_space_t* space; fil_node_t* node; - ulint err; + dberr_t err; mutex_enter(&fil_system->mutex); @@ -1830,7 +1900,6 @@ fil_write_flushed_lsn_to_data_files( if (space->purpose == FIL_TABLESPACE && !fil_is_user_tablespace_id(space->id)) { - ulint sum_of_sizes = 0; for (node = UT_LIST_GET_FIRST(space->chain); @@ -1872,6 +1941,7 @@ fil_read_first_page( parameters below already contain sensible data */ ulint* flags, /*!< out: tablespace flags */ + ulint* space_id, /*!< out: tablespace ID */ #ifdef UNIV_LOG_ARCHIVE ulint* min_arch_log_no, /*!< out: min of archived log numbers in data files */ @@ -1897,7 +1967,9 @@ fil_read_first_page( *flags = fsp_header_get_flags(page); - flushed_lsn = mach_read_from_8(page+ FIL_PAGE_FILE_FLUSH_LSN); + *space_id = fsp_header_get_space_id(page); + + flushed_lsn = mach_read_from_8(page + FIL_PAGE_FILE_FLUSH_LSN); ut_free(buf); @@ -2102,6 +2174,12 @@ created does not exist, then we create the directory, too. Note that ibbackup --apply-log sets fil_path_to_mysql_datadir to point to the datadir that we should use in replaying the file operations. + +InnoDB recovery does not replay these fully since it always sets the space id +to zero. But ibbackup does replay them. TODO: If remote tablespaces are used, +ibbackup will only create tables in the default directory since MLOG_FILE_CREATE +and MLOG_FILE_CREATE2 only know the tablename, not the path. 
+ @return end of log record, or NULL if the record was not completely contained between ptr and end_ptr */ UNIV_INTERN @@ -2197,7 +2275,9 @@ fil_op_log_parse_or_replay( switch (type) { case MLOG_FILE_DELETE: if (fil_tablespace_exists_in_mem(space_id)) { - ut_a(fil_delete_tablespace(space_id)); + dberr_t err = fil_delete_tablespace( + space_id, BUF_REMOVE_FLUSH_NO_WRITE); + ut_a(err == DB_SUCCESS); } break; @@ -2218,10 +2298,10 @@ fil_op_log_parse_or_replay( if (fil_get_space_id_for_table(new_name) == ULINT_UNDEFINED) { - /* We do not care of the old name, that is - why we pass NULL as the first argument */ + /* We do not care about the old name, that + is why we pass NULL as the first argument. */ if (!fil_rename_tablespace(NULL, space_id, - new_name)) { + new_name, NULL)) { ut_error; } } @@ -2239,12 +2319,14 @@ fil_op_log_parse_or_replay( } else if (log_flags & MLOG_FILE_FLAG_TEMP) { /* Temporary table, do nothing */ } else { + const char* path = NULL; + /* Create the database directory for name, if it does not exist yet */ fil_create_directory_for_tablename(name); if (fil_create_new_single_table_tablespace( - space_id, name, FALSE, flags, + space_id, name, path, flags, DICT_TF2_USE_TABLESPACE, FIL_IBD_FILE_INITIAL_SIZE) != DB_SUCCESS) { ut_error; @@ -2261,118 +2343,271 @@ fil_op_log_parse_or_replay( } /*******************************************************************//** -Deletes a single-table tablespace. The tablespace must be cached in the -memory cache. -@return TRUE if success */ -UNIV_INTERN -ibool -fil_delete_tablespace( -/*==================*/ - ulint id) /*!< in: space id */ +Allocates a file name for the EXPORT/IMPORT config file name. The +string must be freed by caller with mem_free(). 
+@return own: file name */ +static +char* +fil_make_cfg_name( +/*==============*/ + const char* filepath) /*!< in: .ibd file name */ { - ibool success; - fil_space_t* space; - fil_node_t* node; - ulint count = 0; - char* path; + char* cfg_name; - ut_a(id != 0); -stop_new_ops: - mutex_enter(&fil_system->mutex); + /* Create a temporary file path by replacing the .ibd suffix + with .cfg. */ - space = fil_space_get_by_id(id); + ut_ad(strlen(filepath) > 4); - if (space != NULL) { - space->stop_new_ops = TRUE; + cfg_name = mem_strdup(filepath); + ut_snprintf(cfg_name + strlen(cfg_name) - 3, 4, "cfg"); + return(cfg_name); +} - if (space->n_pending_ops == 0) { - mutex_exit(&fil_system->mutex); +/*******************************************************************//** +Check for change buffer merges. +@return 0 if no merges else count + 1. */ +static +ulint +fil_ibuf_check_pending_ops( +/*=======================*/ + fil_space_t* space, /*!< in/out: Tablespace to check */ + ulint count) /*!< in: number of attempts so far */ +{ + ut_ad(mutex_own(&fil_system->mutex)); - count = 0; + if (space != 0 && space->n_pending_ops != 0) { - goto try_again; - } else { - if (count > 5000) { - ut_print_timestamp(stderr); - fputs(" InnoDB: Warning: trying to" - " delete tablespace ", stderr); - ut_print_filename(stderr, space->name); - fprintf(stderr, ",\n" - "InnoDB: but there are %lu pending" - " operations (most likely ibuf merges)" - " on it.\n" - "InnoDB: Loop %lu.\n", - (ulong) space->n_pending_ops, - (ulong) count); - } + if (count > 5000) { + ib_logf(IB_LOG_LEVEL_WARN, + "Trying to close/delete tablespace " + "'%s' but there are %lu pending change " + "buffer merges on it.", + space->name, + (ulong) space->n_pending_ops); + } - mutex_exit(&fil_system->mutex); + return(count + 1); + } - os_thread_sleep(20000); - count++; + return(0); +} + +/*******************************************************************//** +Check for pending IO. +@return 0 if no pending else count + 1. 
*/ +static +ulint +fil_check_pending_io( +/*=================*/ + fil_space_t* space, /*!< in/out: Tablespace to check */ + fil_node_t** node, /*!< out: Node in space list */ + ulint count) /*!< in: number of attempts so far */ +{ + ut_ad(mutex_own(&fil_system->mutex)); + ut_a(space->n_pending_ops == 0); + + /* The following code must change when InnoDB supports + multiple datafiles per tablespace. */ + ut_a(UT_LIST_GET_LEN(space->chain) == 1); + + *node = UT_LIST_GET_FIRST(space->chain); + + if (space->n_pending_flushes > 0 || (*node)->n_pending > 0) { + + ut_a(!(*node)->being_extended); - goto stop_new_ops; + if (count > 1000) { + ib_logf(IB_LOG_LEVEL_WARN, + "Trying to close/delete tablespace '%s' " + "but there are %lu flushes " + " and %lu pending i/o's on it.", + space->name, + (ulong) space->n_pending_flushes, + (ulong) (*node)->n_pending); } + + return(count + 1); } - mutex_exit(&fil_system->mutex); - count = 0; + return(0); +} + +/*******************************************************************//** +Check pending operations on a tablespace. +@return DB_SUCCESS or error failure. */ +static +dberr_t +fil_check_pending_operations( +/*=========================*/ + ulint id, /*!< in: space id */ + fil_space_t** space, /*!< out: tablespace instance in memory */ + char** path) /*!< out/own: tablespace path */ +{ + ulint count = 0; + + ut_a(id != TRX_SYS_SPACE); + ut_ad(space); + + *space = 0; -try_again: mutex_enter(&fil_system->mutex); + fil_space_t* sp = fil_space_get_by_id(id); + if (sp) { + sp->stop_new_ops = TRUE; + } + mutex_exit(&fil_system->mutex); - space = fil_space_get_by_id(id); + /* Check for pending change buffer merges. 
*/ - if (space == NULL) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Error: cannot delete tablespace %lu\n" - "InnoDB: because it is not found in the" - " tablespace memory cache.\n", - (ulong) id); + do { + mutex_enter(&fil_system->mutex); + + sp = fil_space_get_by_id(id); + + count = fil_ibuf_check_pending_ops(sp, count); mutex_exit(&fil_system->mutex); - return(FALSE); - } + if (count > 0) { + os_thread_sleep(20000); + } - ut_a(space->stop_new_ops); - ut_a(space->n_pending_ops == 0); + } while (count > 0); - /* TODO: The following code must change when InnoDB supports - multiple datafiles per tablespace. */ - ut_a(UT_LIST_GET_LEN(space->chain) == 1); + /* Check for pending IO. */ - node = UT_LIST_GET_FIRST(space->chain); + *path = 0; - if (space->n_pending_flushes > 0 || node->n_pending > 0 - || node->being_extended) { - if (count > 1000) { - ut_print_timestamp(stderr); - fputs(" InnoDB: Warning: trying to" - " delete tablespace ", stderr); - ut_print_filename(stderr, space->name); - fprintf(stderr, ",\n" - "InnoDB: but there are %lu flushes" - " and %lu pending i/o's on it\n" - "InnoDB: Or it is being extended\n" - "InnoDB: Loop %lu.\n", - (ulong) space->n_pending_flushes, - (ulong) node->n_pending, - (ulong) count); + do { + mutex_enter(&fil_system->mutex); + + sp = fil_space_get_by_id(id); + + if (sp == NULL) { + mutex_exit(&fil_system->mutex); + return(DB_TABLESPACE_NOT_FOUND); + } + + fil_node_t* node; + + count = fil_check_pending_io(sp, &node, count); + + if (count == 0) { + *path = mem_strdup(node->name); } + mutex_exit(&fil_system->mutex); - os_thread_sleep(20000); - count++; + if (count > 0) { + os_thread_sleep(20000); + } + + } while (count > 0); + + ut_ad(sp); + + *space = sp; + return(DB_SUCCESS); +} + +/*******************************************************************//** +Closes a single-table tablespace. The tablespace must be cached in the +memory cache. Free all pages used by the tablespace. 
+@return DB_SUCCESS or error */ +UNIV_INTERN +dberr_t +fil_close_tablespace( +/*=================*/ + trx_t* trx, /*!< in/out: Transaction covering the close */ + ulint id) /*!< in: space id */ +{ + char* path = 0; + fil_space_t* space = 0; + + ut_a(id != TRX_SYS_SPACE); + + dberr_t err = fil_check_pending_operations(id, &space, &path); - goto try_again; + if (err != DB_SUCCESS) { + return(err); } - path = mem_strdup(node->name); + ut_a(space); + ut_a(path != 0); + + rw_lock_x_lock(&space->latch); + +#ifndef UNIV_HOTBACKUP + /* Invalidate in the buffer pool all pages belonging to the + tablespace. Since we have set space->stop_new_ops = TRUE, readahead + or ibuf merge can no longer read more pages of this tablespace to the + buffer pool. Thus we can clean the tablespace out of the buffer pool + completely and permanently. The flag stop_new_ops also prevents + fil_flush() from being applied to this tablespace. */ + + buf_LRU_flush_or_remove_pages(id, BUF_REMOVE_FLUSH_WRITE, trx); +#endif + mutex_enter(&fil_system->mutex); + + /* If the free is successful, the X lock will be released before + the space memory data structure is freed. */ + + if (!fil_space_free(id, TRUE)) { + rw_lock_x_unlock(&space->latch); + err = DB_TABLESPACE_NOT_FOUND; + } else { + err = DB_SUCCESS; + } mutex_exit(&fil_system->mutex); + /* If it is a delete then also delete any generated files, otherwise + when we drop the database the remove directory will fail. */ + + char* cfg_name = fil_make_cfg_name(path); + + os_file_delete_if_exists(cfg_name); + + mem_free(path); + mem_free(cfg_name); + + return(err); +} + +/*******************************************************************//** +Deletes a single-table tablespace. The tablespace must be cached in the +memory cache. 
+@return DB_SUCCESS or error */ +UNIV_INTERN +dberr_t +fil_delete_tablespace( +/*==================*/ + ulint id, /*!< in: space id */ + buf_remove_t buf_remove) /*!< in: specify the action to take + on the tables pages in the buffer + pool */ +{ + char* path = 0; + fil_space_t* space = 0; + + ut_a(id != TRX_SYS_SPACE); + + dberr_t err = fil_check_pending_operations(id, &space, &path); + + if (err != DB_SUCCESS) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot delete tablespace %lu because it is not " + "found in the tablespace memory cache.", + (ulong) id); + + return(err); + } + + ut_a(space); + ut_a(path != 0); + /* Important: We rely on the data dictionary mutex to ensure that a race is not possible here. It should serialize the tablespace drop/free. We acquire an X latch only to avoid a race condition @@ -2407,9 +2642,22 @@ try_again: To deal with potential read requests by checking the ::stop_new_ops flag in fil_io() */ - buf_LRU_invalidate_tablespace(id); -#endif - /* printf("Deleting tablespace %s id %lu\n", space->name, id); */ + buf_LRU_flush_or_remove_pages(id, buf_remove, 0); + +#endif /* !UNIV_HOTBACKUP */ + + /* If it is a delete then also delete any generated files, otherwise + when we drop the database the remove directory will fail. */ + { + char* cfg_name = fil_make_cfg_name(path); + os_file_delete_if_exists(cfg_name); + mem_free(cfg_name); + } + + /* Delete the link file pointing to the ibd file we are deleting. 
*/ + if (FSP_FLAGS_HAS_DATA_DIR(space->flags)) { + fil_delete_link_file(space->name); + } mutex_enter(&fil_system->mutex); @@ -2418,25 +2666,27 @@ try_again: if (fil_space_get_by_id(id)) { ut_a(space->n_pending_ops == 0); ut_a(UT_LIST_GET_LEN(space->chain) == 1); - node = UT_LIST_GET_FIRST(space->chain); + fil_node_t* node = UT_LIST_GET_FIRST(space->chain); ut_a(node->n_pending == 0); } - success = fil_space_free(id, TRUE); + if (!fil_space_free(id, TRUE)) { + err = DB_TABLESPACE_NOT_FOUND; + } mutex_exit(&fil_system->mutex); - if (success) { - success = os_file_delete(path); - - if (!success) { - success = os_file_delete_if_exists(path); - } - } else { + if (err != DB_SUCCESS) { rw_lock_x_unlock(&space->latch); + } else if (!os_file_delete(path) && !os_file_delete_if_exists(path)) { + + /* Note: This is because we have removed the + tablespace instance from the cache. */ + + err = DB_IO_ERROR; } - if (success) { + if (err == DB_SUCCESS) { #ifndef UNIV_HOTBACKUP /* Write a log record about the deletion of the .ibd file, so that ibbackup can replay it in the @@ -2451,14 +2701,12 @@ try_again: fil_op_write_log(MLOG_FILE_DELETE, id, 0, 0, path, NULL, &mtr); mtr_commit(&mtr); #endif - mem_free(path); - - return(TRUE); + err = DB_SUCCESS; } mem_free(path); - return(FALSE); + return(err); } /*******************************************************************//** @@ -2490,36 +2738,49 @@ fil_tablespace_is_being_deleted( /*******************************************************************//** Discards a single-table tablespace. The tablespace must be cached in the memory cache. Discarding is like deleting a tablespace, but -1) we do not drop the table from the data dictionary; -2) we remove all insert buffer entries for the tablespace immediately; in DROP -TABLE they are only removed gradually in the background; -3) when the user does IMPORT TABLESPACE, the tablespace will have the same id -as it originally had. -@return TRUE if success */ + + 1. 
We do not drop the table from the data dictionary; + + 2. We remove all insert buffer entries for the tablespace immediately; + in DROP TABLE they are only removed gradually in the background; + + 3. Free all the pages in use by the tablespace. +@return DB_SUCCESS or error */ UNIV_INTERN -ibool +dberr_t fil_discard_tablespace( /*===================*/ ulint id) /*!< in: space id */ { - ibool success; + dberr_t err; - success = fil_delete_tablespace(id); + switch (err = fil_delete_tablespace(id, BUF_REMOVE_ALL_NO_WRITE)) { + case DB_SUCCESS: + break; - if (!success) { - fprintf(stderr, - "InnoDB: Warning: cannot delete tablespace %lu" - " in DISCARD TABLESPACE.\n" - "InnoDB: But let us remove the" - " insert buffer entries for this tablespace.\n", - (ulong) id); + case DB_IO_ERROR: + ib_logf(IB_LOG_LEVEL_WARN, + "While deleting tablespace %lu in DISCARD TABLESPACE." + " File rename/delete failed: %s", + (ulong) id, ut_strerr(err)); + break; + + case DB_TABLESPACE_NOT_FOUND: + ib_logf(IB_LOG_LEVEL_WARN, + "Cannot delete tablespace %lu in DISCARD " + "TABLESPACE. %s", + (ulong) id, ut_strerr(err)); + break; + + default: + ut_error; } /* Remove all insert buffer entries for the tablespace */ ibuf_delete_for_discarded_space(id); - return(success); + return(err); } #endif /* !UNIV_HOTBACKUP */ @@ -2575,30 +2836,27 @@ fil_rename_tablespace_in_mem( Allocates a file name for a single-table tablespace. The string must be freed by caller with mem_free(). 
@return own: file name */ -static +UNIV_INTERN char* fil_make_ibd_name( /*==============*/ - const char* name, /*!< in: table name or a dir path of a - TEMPORARY table */ - ibool is_temp) /*!< in: TRUE if it is a dir path */ + const char* name, /*!< in: table name or a dir path */ + bool is_full_path) /*!< in: TRUE if it is a dir path */ { char* filename; ulint namelen = strlen(name); ulint dirlen = strlen(fil_path_to_mysql_datadir); + ulint pathlen = dirlen + namelen + sizeof "/.ibd"; - filename = static_cast<char*>( - mem_alloc(namelen + dirlen + sizeof "/.ibd")); + filename = static_cast<char*>(mem_alloc(pathlen)); - if (is_temp) { + if (is_full_path) { memcpy(filename, name, namelen); memcpy(filename + namelen, ".ibd", sizeof ".ibd"); } else { - memcpy(filename, fil_path_to_mysql_datadir, dirlen); - filename[dirlen] = '/'; + ut_snprintf(filename, pathlen, "%s/%s.ibd", + fil_path_to_mysql_datadir, name); - memcpy(filename + dirlen + 1, name, namelen); - memcpy(filename + dirlen + namelen + 1, ".ibd", sizeof ".ibd"); } srv_normalize_path_for_win(filename); @@ -2607,6 +2865,31 @@ fil_make_ibd_name( } /*******************************************************************//** +Allocates a file name for a tablespace ISL file (InnoDB Symbolic Link). +The string must be freed by caller with mem_free(). +@return own: file name */ +UNIV_INTERN +char* +fil_make_isl_name( +/*==============*/ + const char* name) /*!< in: table name */ +{ + char* filename; + ulint namelen = strlen(name); + ulint dirlen = strlen(fil_path_to_mysql_datadir); + ulint pathlen = dirlen + namelen + sizeof "/.isl"; + + filename = static_cast<char*>(mem_alloc(pathlen)); + + ut_snprintf(filename, pathlen, "%s/%s.isl", + fil_path_to_mysql_datadir, name); + + srv_normalize_path_for_win(filename); + + return(filename); +} + +/*******************************************************************//** Renames a single-table tablespace. The tablespace must be cached in the tablespace memory cache. 
@return TRUE if success */ @@ -2614,14 +2897,19 @@ UNIV_INTERN ibool fil_rename_tablespace( /*==================*/ - const char* old_name_in, /*!< in: old table name in the standard - databasename/tablename format of - InnoDB, or NULL if we do the rename - based on the space id only */ + const char* old_name_in, /*!< in: old table name in the + standard databasename/tablename + format of InnoDB, or NULL if we + do the rename based on the space + id only */ ulint id, /*!< in: space id */ - const char* new_name) /*!< in: new table name in the standard - databasename/tablename format - of InnoDB */ + const char* new_name, /*!< in: new table name in the + standard databasename/tablename + format of InnoDB */ + const char* new_path_in) /*!< in: new full datafile path + if the tablespace is remotely + located, or NULL if it is located + in the normal data directory. */ { ibool success; fil_space_t* space; @@ -2651,14 +2939,14 @@ retry: space = fil_space_get_by_id(id); + DBUG_EXECUTE_IF("fil_rename_tablespace_failure_1", space = NULL; ); + if (space == NULL) { - fprintf(stderr, - "InnoDB: Error: cannot find space id %lu" - " in the tablespace memory cache\n" - "InnoDB: though the table ", (ulong) id); - ut_print_filename(stderr, - old_name_in ? old_name_in : not_given); - fputs(" in a rename operation should have that id\n", stderr); + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot find space id %lu in the tablespace " + "memory cache, though the table '%s' in a " + "rename operation should have that id.", + (ulong) id, old_name_in ? old_name_in : not_given); mutex_exit(&fil_system->mutex); return(FALSE); @@ -2677,10 +2965,13 @@ retry: space->stop_ios = TRUE; + /* The following code must change when InnoDB supports + multiple datafiles per tablespace. 
*/ ut_a(UT_LIST_GET_LEN(space->chain) == 1); node = UT_LIST_GET_FIRST(space->chain); - if (node->n_pending > 0 || node->n_pending_flushes > 0 + if (node->n_pending > 0 + || node->n_pending_flushes > 0 || node->being_extended) { /* There are pending i/o's or flushes or the file is currently being extended, sleep for a while and @@ -2713,24 +3004,31 @@ retry: if (old_name_in) { old_name = mem_strdup(old_name_in); - old_path = fil_make_ibd_name(old_name, FALSE); - ut_a(strcmp(space->name, old_name) == 0); - ut_a(strcmp(node->name, old_path) == 0); } else { old_name = mem_strdup(space->name); - old_path = mem_strdup(node->name); } + old_path = mem_strdup(node->name); /* Rename the tablespace and the node in the memory cache */ - new_path = fil_make_ibd_name(new_name, FALSE); + new_path = new_path_in ? mem_strdup(new_path_in) + : fil_make_ibd_name(new_name, false); + success = fil_rename_tablespace_in_mem( space, node, new_name, new_path); if (success) { + + DBUG_EXECUTE_IF("fil_rename_tablespace_failure_2", + goto skip_second_rename; ); + success = os_file_rename( innodb_file_data_key, old_path, new_path); + DBUG_EXECUTE_IF("fil_rename_tablespace_failure_2", +skip_second_rename: + success = FALSE; ); + if (!success) { /* We have to revert the changes we made to the tablespace memory cache */ @@ -2745,7 +3043,7 @@ retry: mutex_exit(&fil_system->mutex); #ifndef UNIV_HOTBACKUP - if (success) { + if (success && !recv_recovery_on) { mtr_t mtr; mtr_start(&mtr); @@ -2754,7 +3052,7 @@ retry: &mtr); mtr_commit(&mtr); } -#endif +#endif /* !UNIV_HOTBACKUP */ mem_free(new_path); mem_free(old_path); @@ -2764,23 +3062,202 @@ retry: } /*******************************************************************//** +Creates a new InnoDB Symbolic Link (ISL) file. It is always created +under the 'datadir' of MySQL. The datadir is the directory of a +running mysqld program. We can refer to it by simply using the path '.'. 
+@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fil_create_link_file( +/*=================*/ + const char* tablename, /*!< in: tablename */ + const char* filepath) /*!< in: pathname of tablespace */ +{ + os_file_t file; + ibool success; + dberr_t err = DB_SUCCESS; + char* link_filepath; + char* prev_filepath = fil_read_link_file(tablename); + + ut_ad(!srv_read_only_mode); + + if (prev_filepath) { + /* Truncate will call this with an existing + link file which contains the same filepath. */ + if (0 == strcmp(prev_filepath, filepath)) { + mem_free(prev_filepath); + return(DB_SUCCESS); + } + mem_free(prev_filepath); + } + + link_filepath = fil_make_isl_name(tablename); + + file = os_file_create_simple_no_error_handling( + innodb_file_data_key, link_filepath, + OS_FILE_CREATE, OS_FILE_READ_WRITE, &success); + + if (!success) { + /* The following call will print an error message */ + ulint error = os_file_get_last_error(true); + + ut_print_timestamp(stderr); + fputs(" InnoDB: Cannot create file ", stderr); + ut_print_filename(stderr, link_filepath); + fputs(".\n", stderr); + + if (error == OS_FILE_ALREADY_EXISTS) { + fputs("InnoDB: The link file: ", stderr); + ut_print_filename(stderr, filepath); + fputs(" already exists.\n", stderr); + err = DB_TABLESPACE_EXISTS; + + } else if (error == OS_FILE_DISK_FULL) { + err = DB_OUT_OF_FILE_SPACE; + + } else { + err = DB_ERROR; + } + + /* file is not open, no need to close it. */ + mem_free(link_filepath); + return(err); + } + + if (!os_file_write(link_filepath, file, filepath, 0, + strlen(filepath))) { + err = DB_ERROR; + } + + /* Close the file, we only need it at startup */ + os_file_close(file); + + mem_free(link_filepath); + + return(err); +} + +/*******************************************************************//** +Deletes an InnoDB Symbolic Link (ISL) file. 
*/ +UNIV_INTERN +void +fil_delete_link_file( +/*=================*/ + const char* tablename) /*!< in: name of table */ +{ + char* link_filepath = fil_make_isl_name(tablename); + + os_file_delete_if_exists(link_filepath); + + mem_free(link_filepath); +} + +/*******************************************************************//** +Reads an InnoDB Symbolic Link (ISL) file. +It is always created under the 'datadir' of MySQL. The name is of the +form {databasename}/{tablename}. and the isl file is expected to be in a +'{databasename}' directory called '{tablename}.isl'. The caller must free +the memory of the null-terminated path returned if it is not null. +@return own: filepath found in link file, NULL if not found. */ +UNIV_INTERN +char* +fil_read_link_file( +/*===============*/ + const char* name) /*!< in: tablespace name */ +{ + char* filepath = NULL; + char* link_filepath; + FILE* file = NULL; + + /* The .isl file is in the 'normal' tablespace location. */ + link_filepath = fil_make_isl_name(name); + + file = fopen(link_filepath, "r+b"); + + mem_free(link_filepath); + + if (file) { + filepath = static_cast<char*>(mem_alloc(OS_FILE_MAX_PATH)); + + os_file_read_string(file, filepath, OS_FILE_MAX_PATH); + fclose(file); + + if (strlen(filepath)) { + /* Trim whitespace from end of filepath */ + ulint lastch = strlen(filepath) - 1; + while (lastch > 4 && filepath[lastch] <= 0x20) { + filepath[lastch--] = 0x00; + } + srv_normalize_path_for_win(filepath); + } + } + + return(filepath); +} + +/*******************************************************************//** +Opens a handle to the file linked to in an InnoDB Symbolic Link file. +@return TRUE if remote linked tablespace file is found and opened. 
*/ +UNIV_INTERN +ibool +fil_open_linked_file( +/*===============*/ + const char* tablename, /*!< in: database/tablename */ + char** remote_filepath,/*!< out: remote filepath */ + os_file_t* remote_file) /*!< out: remote file handle */ + +{ + ibool success; + + *remote_filepath = fil_read_link_file(tablename); + if (*remote_filepath == NULL) { + return(FALSE); + } + + /* The filepath provided is different from what was + found in the link file. */ + *remote_file = os_file_create_simple_no_error_handling( + innodb_file_data_key, *remote_filepath, + OS_FILE_OPEN, OS_FILE_READ_ONLY, + &success); + + if (!success) { + char* link_filepath = fil_make_isl_name(tablename); + + /* The following call prints an error message */ + os_file_get_last_error(true); + + ib_logf(IB_LOG_LEVEL_ERROR, + "A link file was found named '%s' " + "but the linked tablespace '%s' " + "could not be opened.", + link_filepath, *remote_filepath); + + mem_free(link_filepath); + mem_free(*remote_filepath); + *remote_filepath = NULL; + } + + return(success); +} + +/*******************************************************************//** Creates a new single-table tablespace to a database directory of MySQL. Database directories are under the 'datadir' of MySQL. The datadir is the directory of a running mysqld program. We can refer to it by simply the path '.'. Tables created with CREATE TEMPORARY TABLE we place in the temp dir of the mysqld server. 
+ @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fil_create_new_single_table_tablespace( /*===================================*/ ulint space_id, /*!< in: space id */ const char* tablename, /*!< in: the table name in the usual databasename/tablename format - of InnoDB, or a dir path to a temp - table */ - ibool is_temp, /*!< in: TRUE if a table created with - CREATE TEMPORARY TABLE */ + of InnoDB */ + const char* dir_path, /*!< in: NULL or a dir path */ ulint flags, /*!< in: tablespace flags */ ulint flags2, /*!< in: table flags2 */ ulint size) /*!< in: the initial size of the @@ -2789,18 +3266,40 @@ fil_create_new_single_table_tablespace( { os_file_t file; ibool ret; - ulint err; + dberr_t err; byte* buf2; byte* page; char* path; ibool success; + /* TRUE if a table is created with CREATE TEMPORARY TABLE */ + bool is_temp = !!(flags2 & DICT_TF2_TEMPORARY); + bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags); ut_a(space_id > 0); + ut_ad(!srv_read_only_mode); ut_a(space_id < SRV_LOG_SPACE_FIRST_ID); ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE); - fsp_flags_validate(flags); + ut_a(fsp_flags_is_valid(flags)); - path = fil_make_ibd_name(tablename, is_temp); + if (is_temp) { + /* Temporary table filepath */ + ut_ad(dir_path); + path = fil_make_ibd_name(dir_path, true); + } else if (has_data_dir) { + ut_ad(dir_path); + path = os_file_make_remote_pathname(dir_path, tablename, "ibd"); + + /* Since this tablespace file will be created in a + remote directory, let's create the subdirectories + in the path, if they are not there already. 
*/ + success = os_file_create_subdirs_if_needed(path); + if (!success) { + err = DB_ERROR; + goto error_exit_3; + } + } else { + path = fil_make_ibd_name(tablename, false); + } file = os_file_create( innodb_file_data_key, path, @@ -2810,58 +3309,44 @@ fil_create_new_single_table_tablespace( &ret); if (ret == FALSE) { - ut_print_timestamp(stderr); - fputs(" InnoDB: Error creating file ", stderr); - ut_print_filename(stderr, path); - fputs(".\n", stderr); - /* The following call will print an error message */ - - err = os_file_get_last_error(TRUE); - - if (err == OS_FILE_ALREADY_EXISTS) { - fputs("InnoDB: The file already exists though" - " the corresponding table did not\n" - "InnoDB: exist in the InnoDB data dictionary." - " Have you moved InnoDB\n" - "InnoDB: .ibd files around without using the" - " SQL commands\n" - "InnoDB: DISCARD TABLESPACE and" - " IMPORT TABLESPACE, or did\n" - "InnoDB: mysqld crash in the middle of" - " CREATE TABLE? You can\n" - "InnoDB: resolve the problem by" - " removing the file ", stderr); - ut_print_filename(stderr, path); - fputs("\n" - "InnoDB: under the 'datadir' of MySQL.\n", - stderr); - - mem_free(path); - return(DB_TABLESPACE_ALREADY_EXISTS); + ulint error = os_file_get_last_error(true); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot create file '%s'\n", path); + + if (error == OS_FILE_ALREADY_EXISTS) { + ib_logf(IB_LOG_LEVEL_ERROR, + "The file '%s' already exists though the " + "corresponding table did not exist " + "in the InnoDB data dictionary. " + "Have you moved InnoDB .ibd files " + "around without using the SQL commands " + "DISCARD TABLESPACE and IMPORT TABLESPACE, " + "or did mysqld crash in the middle of " + "CREATE TABLE? 
" + "You can resolve the problem by removing " + "the file '%s' under the 'datadir' of MySQL.", + path, path); + + err = DB_TABLESPACE_EXISTS; + goto error_exit_3; } - if (err == OS_FILE_DISK_FULL) { - - mem_free(path); - return(DB_OUT_OF_FILE_SPACE); + if (error == OS_FILE_DISK_FULL) { + err = DB_OUT_OF_FILE_SPACE; + goto error_exit_3; } - mem_free(path); - return(DB_ERROR); + err = DB_ERROR; + goto error_exit_3; } ret = os_file_set_size(path, file, size * UNIV_PAGE_SIZE); if (!ret) { err = DB_OUT_OF_FILE_SPACE; -error_exit: - os_file_close(file); -error_exit2: - os_file_delete(path); - - mem_free(path); - return(err); + goto error_exit_2; } /* printf("Creating tablespace %s id %lu\n", path, space_id); */ @@ -2910,356 +3395,486 @@ error_exit2: ut_free(buf2); if (!ret) { - fputs("InnoDB: Error: could not write the first page" - " to tablespace ", stderr); - ut_print_filename(stderr, path); - putc('\n', stderr); + ib_logf(IB_LOG_LEVEL_ERROR, + "Could not write the first page to tablespace " + "'%s'", path); + err = DB_ERROR; - goto error_exit; + goto error_exit_2; } ret = os_file_flush(file); if (!ret) { - fputs("InnoDB: Error: file flush of tablespace ", stderr); - ut_print_filename(stderr, path); - fputs(" failed\n", stderr); + ib_logf(IB_LOG_LEVEL_ERROR, + "File flush of tablespace '%s' failed", path); err = DB_ERROR; - goto error_exit; + goto error_exit_2; } - os_file_close(file); + if (has_data_dir) { + /* Now that the IBD file is created, make the ISL file. 
*/ + err = fil_create_link_file(tablename, path); + if (err != DB_SUCCESS) { + goto error_exit_2; + } + } success = fil_space_create(tablename, space_id, flags, FIL_TABLESPACE); - - if (!success) { + if (!success || !fil_node_create(path, size, space_id, FALSE)) { err = DB_ERROR; - goto error_exit2; + goto error_exit_1; } - fil_node_create(path, size, space_id, FALSE); - #ifndef UNIV_HOTBACKUP { mtr_t mtr; + ulint mlog_file_flag = 0; + + if (is_temp) { + mlog_file_flag |= MLOG_FILE_FLAG_TEMP; + } mtr_start(&mtr); fil_op_write_log(flags ? MLOG_FILE_CREATE2 : MLOG_FILE_CREATE, - space_id, - is_temp ? MLOG_FILE_FLAG_TEMP : 0, - flags, + space_id, mlog_file_flag, flags, tablename, NULL, &mtr); mtr_commit(&mtr); } #endif + err = DB_SUCCESS; + + /* Error code is set. Cleanup the various variables used. + These labels reflect the order in which variables are assigned or + actions are done. */ +error_exit_1: + if (has_data_dir && err != DB_SUCCESS) { + fil_delete_link_file(tablename); + } +error_exit_2: + os_file_close(file); + if (err != DB_SUCCESS) { + os_file_delete(path); + } +error_exit_3: mem_free(path); - return(DB_SUCCESS); + + return(err); } #ifndef UNIV_HOTBACKUP /********************************************************************//** -It is possible, though very improbable, that the lsn's in the tablespace to be -imported have risen above the current system lsn, if a lengthy purge, ibuf -merge, or rollback was performed on a backup taken with ibbackup. If that is -the case, reset page lsn's in the file. We assume that mysqld was shut down -after it performed these cleanup operations on the .ibd file, so that it at -the shutdown stamped the latest lsn to the FIL_PAGE_FILE_FLUSH_LSN in the -first page of the .ibd file, and we can determine whether we need to reset the -lsn's just by looking at that flush lsn. 
-@return TRUE if success */ -UNIV_INTERN -ibool -fil_reset_too_high_lsns( -/*====================*/ - const char* name, /*!< in: table name in the - databasename/tablename format */ - lsn_t current_lsn) /*!< in: reset lsn's if the lsn stamped - to FIL_PAGE_FILE_FLUSH_LSN in the - first page is too high */ +Report information about a bad tablespace. */ +static +void +fil_report_bad_tablespace( +/*======================*/ + char* filepath, /*!< in: filepath */ + ulint found_id, /*!< in: found space ID */ + ulint found_flags, /*!< in: found flags */ + ulint expected_id, /*!< in: expected space id */ + ulint expected_flags) /*!< in: expected flags */ { - os_file_t file; - char* filepath; - byte* page; - byte* buf2; - lsn_t flush_lsn; - ulint space_id; - os_offset_t file_size; - os_offset_t offset; - ulint zip_size; - ibool success; - page_zip_des_t page_zip; - - filepath = fil_make_ibd_name(name, FALSE); - - file = os_file_create_simple_no_error_handling( - innodb_file_data_key, filepath, OS_FILE_OPEN, - OS_FILE_READ_WRITE, &success); - if (!success) { - /* The following call prints an error message */ - os_file_get_last_error(TRUE); - - ut_print_timestamp(stderr); + ib_logf(IB_LOG_LEVEL_ERROR, + "In file '%s', tablespace id and flags are %lu and %lu, " + "but in the InnoDB data dictionary they are %lu and %lu. " + "Have you moved InnoDB .ibd files around without using the " + "commands DISCARD TABLESPACE and IMPORT TABLESPACE? " + "Please refer to " + REFMAN "innodb-troubleshooting-datadict.html " + "for how to resolve the issue.", + filepath, (ulong) found_id, (ulong) found_flags, + (ulong) expected_id, (ulong) expected_flags); +} - fputs(" InnoDB: Error: trying to open a table," - " but could not\n" - "InnoDB: open the tablespace file ", stderr); - ut_print_filename(stderr, filepath); - fputs("!\n", stderr); - mem_free(filepath); +struct fsp_open_info { + ibool success; /*!< Has the tablespace been opened? */ + ibool valid; /*!< Is the tablespace valid? 
*/ + os_file_t file; /*!< File handle */ + char* filepath; /*!< File path to open */ + lsn_t lsn; /*!< Flushed LSN from header page */ + ulint id; /*!< Space ID */ + ulint flags; /*!< Tablespace flags */ +#ifdef UNIV_LOG_ARCHIVE + ulint arch_log_no; /*!< latest archived log file number */ +#endif /* UNIV_LOG_ARCHIVE */ +}; - return(FALSE); - } +/********************************************************************//** +Tries to open a single-table tablespace and optionally checks that the +space id in it is correct. If this does not succeed, print an error message +to the .err log. This function is used to open a tablespace when we start +mysqld after the dictionary has been booted, and also in IMPORT TABLESPACE. - /* Read the first page of the tablespace */ +NOTE that we assume this operation is used either at the database startup +or under the protection of the dictionary mutex, so that two users cannot +race here. This operation does not leave the file associated with the +tablespace open, but closes it after we have looked at the space id in it. - buf2 = static_cast<byte*>(ut_malloc(3 * UNIV_PAGE_SIZE)); - /* Align the memory for file i/o if we might have O_DIRECT set */ - page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE)); +If the validate boolean is set, we read the first page of the file and +check that the space id in the file is what we expect. We assume that +this function runs much faster if no check is made, since accessing the +file inode probably is much faster (the OS caches them) than accessing +the first page of the file. This boolean may be initially FALSE, but if +a remote tablespace is found it will be changed to true. - success = os_file_read(file, page, 0, UNIV_PAGE_SIZE); - if (!success) { +If the fix_dict boolean is set, then it is safe to use an internal SQL +statement to update the dictionary tables if they are incorrect. 
- goto func_exit; +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fil_open_single_table_tablespace( +/*=============================*/ + bool validate, /*!< in: Do we validate tablespace? */ + bool fix_dict, /*!< in: Can we fix the dictionary? */ + ulint id, /*!< in: space id */ + ulint flags, /*!< in: tablespace flags */ + const char* tablename, /*!< in: table name in the + databasename/tablename format */ + const char* path_in) /*!< in: tablespace filepath */ +{ + dberr_t err = DB_SUCCESS; + bool dict_filepath_same_as_default = false; + bool link_file_found = false; + bool link_file_is_bad = false; + fsp_open_info def; + fsp_open_info dict; + fsp_open_info remote; + ulint tablespaces_found = 0; + ulint valid_tablespaces_found = 0; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!fix_dict || rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(!fix_dict || mutex_own(&(dict_sys->mutex))); + + if (!fsp_flags_is_valid(flags)) { + return(DB_CORRUPTION); + } + + /* If the tablespace was relocated, we do not + compare the DATA_DIR flag */ + ulint mod_flags = flags & ~FSP_FLAGS_MASK_DATA_DIR; + + memset(&def, 0, sizeof(def)); + memset(&dict, 0, sizeof(dict)); + memset(&remote, 0, sizeof(remote)); + + /* Discover the correct filepath. We will always look for an ibd + in the default location. If it is remote, it should not be here. */ + def.filepath = fil_make_ibd_name(tablename, false); + + /* The path_in was read from SYS_DATAFILES. */ + if (path_in) { + if (strcmp(def.filepath, path_in)) { + dict.filepath = mem_strdup(path_in); + /* possibility of multiple files. 
*/ + validate = true; + } else { + dict_filepath_same_as_default = true; + } } - /* We have to read the file flush lsn from the header of the file */ - - flush_lsn = mach_read_from_8(page + FIL_PAGE_FILE_FLUSH_LSN); + link_file_found = fil_open_linked_file( + tablename, &remote.filepath, &remote.file); + remote.success = link_file_found; + if (remote.success) { + /* possibility of multiple files. */ + validate = true; + tablespaces_found++; + + /* A link file was found. MySQL does not allow a DATA + DIRECTORY to be be the same as the default filepath. */ + ut_a(strcmp(def.filepath, remote.filepath)); + + /* If there was a filepath found in SYS_DATAFILES, + we hope it was the same as this remote.filepath found + in the ISL file. */ + if (dict.filepath + && (0 == strcmp(dict.filepath, remote.filepath))) { + remote.success = FALSE; + os_file_close(remote.file); + mem_free(remote.filepath); + remote.filepath = NULL; + tablespaces_found--; + } + } - if (current_lsn >= flush_lsn) { - /* Ok */ - success = TRUE; + /* Attempt to open the tablespace at other possible filepaths. */ + if (dict.filepath) { + dict.file = os_file_create_simple_no_error_handling( + innodb_file_data_key, dict.filepath, OS_FILE_OPEN, + OS_FILE_READ_ONLY, &dict.success); + if (dict.success) { + /* possibility of multiple files. */ + validate = true; + tablespaces_found++; + } + } - goto func_exit; + /* Always look for a file at the default location. */ + ut_a(def.filepath); + def.file = os_file_create_simple_no_error_handling( + innodb_file_data_key, def.filepath, OS_FILE_OPEN, + OS_FILE_READ_ONLY, &def.success); + if (def.success) { + tablespaces_found++; } - space_id = fsp_header_get_space_id(page); - zip_size = fsp_header_get_zip_size(page); + /* We have now checked all possible tablespace locations and + have a count of how many we found. If things are normal, we + only found 1. 
*/ + if (!validate && tablespaces_found == 1) { + goto skip_validate; + } - page_zip_des_init(&page_zip); - page_zip_set_size(&page_zip, zip_size); - if (zip_size) { - page_zip.data = page + UNIV_PAGE_SIZE; + /* Read the first page of the datadir tablespace, if found. */ + if (def.success) { + fil_read_first_page( + def.file, FALSE, &def.flags, &def.id, +#ifdef UNIV_LOG_ARCHIVE + &space_arch_log_no, &space_arch_log_no, +#endif /* UNIV_LOG_ARCHIVE */ + &def.lsn, &def.lsn); + + /* Validate this single-table-tablespace with SYS_TABLES, + but do not compare the DATA_DIR flag, in case the + tablespace was relocated. */ + ulint mod_def_flags = def.flags & ~FSP_FLAGS_MASK_DATA_DIR; + if (def.id == id && mod_def_flags == mod_flags) { + valid_tablespaces_found++; + def.valid = TRUE; + } else { + /* Do not use this tablespace. */ + fil_report_bad_tablespace( + def.filepath, def.id, + def.flags, id, flags); + } } - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Flush lsn in the tablespace file %lu" - " to be imported\n" - "InnoDB: is " LSN_PF ", which exceeds current" - " system lsn " LSN_PF ".\n" - "InnoDB: We reset the lsn's in the file ", - (ulong) space_id, - flush_lsn, current_lsn); - ut_print_filename(stderr, filepath); - fputs(".\n", stderr); - - ut_a(ut_is_2pow(zip_size)); - ut_a(zip_size <= UNIV_ZIP_SIZE_MAX); - - /* Loop through all the pages in the tablespace and reset the lsn and - the page checksum if necessary */ - - file_size = os_file_get_size(file); - ut_a(file_size != (os_offset_t) -1); + /* Read the first page of the remote tablespace */ + if (remote.success) { + fil_read_first_page( + remote.file, FALSE, &remote.flags, &remote.id, +#ifdef UNIV_LOG_ARCHIVE + &remote.arch_log_no, &remote.arch_log_no, +#endif /* UNIV_LOG_ARCHIVE */ + &remote.lsn, &remote.lsn); + + /* Validate this single-table-tablespace with SYS_TABLES, + but do not compare the DATA_DIR flag, in case the + tablespace was relocated. 
*/ + ulint mod_remote_flags = remote.flags & ~FSP_FLAGS_MASK_DATA_DIR; + if (remote.id == id && mod_remote_flags == mod_flags) { + valid_tablespaces_found++; + remote.valid = TRUE; + } else { + /* Do not use this linked tablespace. */ + fil_report_bad_tablespace( + remote.filepath, remote.id, + remote.flags, id, flags); + link_file_is_bad = true; + } + } - for (offset = 0; offset < file_size; - offset += zip_size ? zip_size : UNIV_PAGE_SIZE) { - success = os_file_read(file, page, offset, - zip_size ? zip_size : UNIV_PAGE_SIZE); - if (!success) { + /* Read the first page of the datadir tablespace, if found. */ + if (dict.success) { + fil_read_first_page( + dict.file, FALSE, &dict.flags, &dict.id, +#ifdef UNIV_LOG_ARCHIVE + &dict.arch_log_no, &dict.arch_log_no, +#endif /* UNIV_LOG_ARCHIVE */ + &dict.lsn, &dict.lsn); + + /* Validate this single-table-tablespace with SYS_TABLES, + but do not compare the DATA_DIR flag, in case the + tablespace was relocated. */ + ulint mod_dict_flags = dict.flags & ~FSP_FLAGS_MASK_DATA_DIR; + if (dict.id == id && mod_dict_flags == mod_flags) { + valid_tablespaces_found++; + dict.valid = TRUE; + } else { + /* Do not use this tablespace. */ + fil_report_bad_tablespace( + dict.filepath, dict.id, + dict.flags, id, flags); + } + } - goto func_exit; + /* Make sense of these three possible locations. + First, bail out if no tablespace files were found. */ + if (valid_tablespaces_found == 0) { + /* The following call prints an error message */ + os_file_get_last_error(true); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Could not find a valid tablespace file for '%s'. " + "See " REFMAN "innodb-troubleshooting-datadict.html " + "for how to resolve the issue.", + tablename); + + err = DB_CORRUPTION; + + goto cleanup_and_exit; + } + + /* Do not open any tablespaces if more than one tablespace with + the correct space ID and flags were found. 
*/ + if (tablespaces_found > 1) { + ib_logf(IB_LOG_LEVEL_ERROR, + "A tablespace for %s has been found in " + "multiple places;", tablename); + if (def.success) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Default location; %s, LSN=" LSN_PF + ", Space ID=%lu, Flags=%lu", + def.filepath, def.lsn, + (ulong) def.id, (ulong) def.flags); + } + if (remote.success) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Remote location; %s, LSN=" LSN_PF + ", Space ID=%lu, Flags=%lu", + remote.filepath, remote.lsn, + (ulong) remote.id, (ulong) remote.flags); } - if (mach_read_from_8(page + FIL_PAGE_LSN) > current_lsn) { - /* We have to reset the lsn */ - - if (zip_size) { - memcpy(page_zip.data, page, zip_size); - buf_flush_init_for_writing( - page, &page_zip, current_lsn); - success = os_file_write( - filepath, file, page_zip.data, - offset, zip_size); + if (dict.success) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Dictionary location; %s, LSN=" LSN_PF + ", Space ID=%lu, Flags=%lu", + dict.filepath, dict.lsn, + (ulong) dict.id, (ulong) dict.flags); + } + + /* Force-recovery will allow some tablespaces to be + skipped by REDO if there was more than one file found. + Unlike during the REDO phase of recovery, we now know + if the tablespace is valid according to the dictionary, + which was not available then. So if we did not force + recovery and there is only one good tablespace, ignore + any bad tablespaces. */ + if (valid_tablespaces_found > 1 || srv_force_recovery > 0) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Will not open the tablespace for '%s'", + tablename); + + if (def.success != def.valid + || dict.success != dict.valid + || remote.success != remote.valid) { + err = DB_CORRUPTION; } else { - buf_flush_init_for_writing( - page, NULL, current_lsn); - success = os_file_write( - filepath, file, page, - offset, UNIV_PAGE_SIZE); + err = DB_ERROR; } + goto cleanup_and_exit; + } - if (!success) { + /* There is only one valid tablespace found and we did + not use srv_force_recovery during REDO. 
Use this one + tablespace and clean up invalid tablespace pointers */ + if (def.success && !def.valid) { + def.success = false; + os_file_close(def.file); + tablespaces_found--; + } + if (dict.success && !dict.valid) { + dict.success = false; + os_file_close(dict.file); + /* Leave dict.filepath so that SYS_DATAFILES + can be corrected below. */ + tablespaces_found--; + } + if (remote.success && !remote.valid) { + remote.success = false; + os_file_close(remote.file); + mem_free(remote.filepath); + remote.filepath = NULL; + tablespaces_found--; + } + } - goto func_exit; + /* At this point, there should be only one filepath. */ + ut_a(tablespaces_found == 1); + ut_a(valid_tablespaces_found == 1); + + /* Only fix the dictionary at startup when there is only one thread. + Calls to dict_load_table() can be done while holding other latches. */ + if (!fix_dict) { + goto skip_validate; + } + + /* We may need to change what is stored in SYS_DATAFILES or + SYS_TABLESPACES or adjust the link file. + Since a failure to update SYS_TABLESPACES or SYS_DATAFILES does + not prevent opening and using the single_table_tablespace either + this time or the next, we do not check the return code or fail + to open the tablespace. But dict_update_filepath() will issue a + warning to the log. */ + if (dict.filepath) { + if (remote.success) { + dict_update_filepath(id, remote.filepath); + } else if (def.success) { + dict_update_filepath(id, def.filepath); + if (link_file_is_bad) { + fil_delete_link_file(tablename); } + } else if (!link_file_found || link_file_is_bad) { + ut_ad(dict.success); + /* Fix the link file if we got our filepath + from the dictionary but a link file did not + exist or it did not point to a valid file. 
*/ + fil_delete_link_file(tablename); + fil_create_link_file(tablename, dict.filepath); } - } - success = os_file_flush(file); - if (!success) { + } else if (remote.success && dict_filepath_same_as_default) { + dict_update_filepath(id, remote.filepath); - goto func_exit; + } else if (remote.success && path_in == NULL) { + /* SYS_DATAFILES record for this space ID was not found. */ + dict_insert_tablespace_and_filepath( + id, tablename, remote.filepath, flags); } - /* We now update the flush_lsn stamp at the start of the file */ - success = os_file_read(file, page, 0, - zip_size ? zip_size : UNIV_PAGE_SIZE); - if (!success) { +skip_validate: + if (err != DB_SUCCESS) { + ; // Don't load the tablespace into the cache + } else if (!fil_space_create(tablename, id, flags, FIL_TABLESPACE)) { + err = DB_ERROR; + } else { + /* We do not measure the size of the file, that is why + we pass the 0 below */ - goto func_exit; + if (!fil_node_create(remote.success ? remote.filepath : + dict.success ? dict.filepath : + def.filepath, 0, id, FALSE)) { + err = DB_ERROR; + } } - mach_write_to_8(page + FIL_PAGE_FILE_FLUSH_LSN, current_lsn); - - success = os_file_write(filepath, file, page, 0, - zip_size ? zip_size : UNIV_PAGE_SIZE); - if (!success) { - - goto func_exit; +cleanup_and_exit: + if (remote.success) { + os_file_close(remote.file); } - success = os_file_flush(file); -func_exit: - os_file_close(file); - ut_free(buf2); - mem_free(filepath); - - return(success); -} - -/********************************************************************//** -Tries to open a single-table tablespace and optionally checks the space id is -right in it. If does not succeed, prints an error message to the .err log. This -function is used to open a tablespace when we start up mysqld, and also in -IMPORT TABLESPACE. -NOTE that we assume this operation is used either at the database startup -or under the protection of the dictionary mutex, so that two users cannot -race here. 
This operation does not leave the file associated with the -tablespace open, but closes it after we have looked at the space id in it. -@return TRUE if success */ -UNIV_INTERN -ibool -fil_open_single_table_tablespace( -/*=============================*/ - ibool check_space_id, /*!< in: should we check that the space - id in the file is right; we assume - that this function runs much faster - if no check is made, since accessing - the file inode probably is much - faster (the OS caches them) than - accessing the first page of the file */ - ulint id, /*!< in: space id */ - ulint flags, /*!< in: tablespace flags */ - const char* tablename) /*!< in: table name in the - databasename/tablename format */ -{ - os_file_t file; - char* filepath; - ibool success; - byte* buf2; - byte* page; - ulint space_id; - ulint space_flags; - - filepath = fil_make_ibd_name(tablename, FALSE); - - fsp_flags_validate(flags); - - file = os_file_create_simple_no_error_handling( - innodb_file_data_key, filepath, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &success); - if (!success) { - /* The following call prints an error message */ - os_file_get_last_error(TRUE); - - ut_print_timestamp(stderr); - - fputs(" InnoDB: Error: trying to open a table," - " but could not\n" - "InnoDB: open the tablespace file ", stderr); - ut_print_filename(stderr, filepath); - fputs("!\n" - "InnoDB: Have you moved InnoDB .ibd files around" - " without using the\n" - "InnoDB: commands DISCARD TABLESPACE and" - " IMPORT TABLESPACE?\n" - "InnoDB: It is also possible that this is" - " a temporary table #sql...,\n" - "InnoDB: and MySQL removed the .ibd file for this.\n" - "InnoDB: Please refer to\n" - "InnoDB: " REFMAN - "innodb-troubleshooting-datadict.html\n" - "InnoDB: for how to resolve the issue.\n", stderr); - - mem_free(filepath); - - return(FALSE); + if (remote.filepath) { + mem_free(remote.filepath); } - - if (!check_space_id) { - space_id = id; - - goto skip_check; + if (dict.success) { + os_file_close(dict.file); } - - 
/* Read the first page of the tablespace */ - - buf2 = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE)); - /* Align the memory for file i/o if we might have O_DIRECT set */ - page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE)); - - success = os_file_read(file, page, 0, UNIV_PAGE_SIZE); - - /* We have to read the tablespace id and flags from the file. */ - - space_id = fsp_header_get_space_id(page); - space_flags = fsp_header_get_flags(page); - - ut_free(buf2); - - if (UNIV_UNLIKELY(space_id != id || space_flags != flags)) { - ut_print_timestamp(stderr); - - fputs(" InnoDB: Error: tablespace id and flags in file ", - stderr); - ut_print_filename(stderr, filepath); - fprintf(stderr, " are %lu and %lu, but in the InnoDB\n" - "InnoDB: data dictionary they are %lu and %lu.\n" - "InnoDB: Have you moved InnoDB .ibd files" - " around without using the\n" - "InnoDB: commands DISCARD TABLESPACE and" - " IMPORT TABLESPACE?\n" - "InnoDB: Please refer to\n" - "InnoDB: " REFMAN "innodb-troubleshooting-datadict.html\n" - "InnoDB: for how to resolve the issue.\n", - (ulong) space_id, (ulong) space_flags, - (ulong) id, (ulong) flags); - - success = FALSE; - - goto func_exit; + if (dict.filepath) { + mem_free(dict.filepath); } - -skip_check: - success = fil_space_create(tablename, space_id, flags, FIL_TABLESPACE); - - if (!success) { - goto func_exit; + if (def.success) { + os_file_close(def.file); } + mem_free(def.filepath); - /* We do not measure the size of the file, that is why we pass the 0 - below */ - - fil_node_create(filepath, 0, space_id, FALSE); -func_exit: - os_file_close(file); - mem_free(filepath); - - return(success); + return(err); } #endif /* !UNIV_HOTBACKUP */ @@ -3282,13 +3897,64 @@ fil_make_ibbackup_old_name( memcpy(path, name, len); memcpy(path + len, suffix, (sizeof suffix) - 1); - ut_sprintf_timestamp_without_extra_chars(path + len + sizeof suffix); + ut_sprintf_timestamp_without_extra_chars( + path + len + ((sizeof suffix) - 1)); return(path); } #endif 
/* UNIV_HOTBACKUP */ /********************************************************************//** Opens an .ibd file and adds the associated single-table tablespace to the +InnoDB fil0fil.cc data structures. +Set fsp->success to TRUE if tablespace is valid, FALSE if not. */ +static +void +fil_validate_single_table_tablespace( +/*=================================*/ + const char* tablename, /*!< in: database/tablename */ + fsp_open_info* fsp) /*!< in/out: tablespace info */ +{ + fil_read_first_page( + fsp->file, FALSE, &fsp->flags, &fsp->id, +#ifdef UNIV_LOG_ARCHIVE + &fsp->arch_log_no, &fsp->arch_log_no, +#endif /* UNIV_LOG_ARCHIVE */ + &fsp->lsn, &fsp->lsn); + + if (fsp->id == ULINT_UNDEFINED || fsp->id == 0) { + fprintf(stderr, + " InnoDB: Error: Tablespace is not sensible;" + " Table: %s Space ID: %lu Filepath: %s\n", + tablename, (ulong) fsp->id, fsp->filepath); + fsp->success = FALSE; + return; + } + + mutex_enter(&fil_system->mutex); + fil_space_t* space = fil_space_get_by_id(fsp->id); + mutex_exit(&fil_system->mutex); + if (space != NULL) { + char* prev_filepath = fil_space_get_first_path(fsp->id); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Attempted to open a previously opened tablespace. " + "Previous tablespace %s uses space ID: %lu at " + "filepath: %s. Cannot open tablespace %s which uses " + "space ID: %lu at filepath: %s", + space->name, (ulong) space->id, prev_filepath, + tablename, (ulong) fsp->id, fsp->filepath); + + mem_free(prev_filepath); + fsp->success = FALSE; + return; + } + + fsp->success = TRUE; +} + + +/********************************************************************//** +Opens an .ibd file and adds the associated single-table tablespace to the InnoDB fil0fil.cc data structures. 
*/ static void @@ -3296,34 +3962,49 @@ fil_load_single_table_tablespace( /*=============================*/ const char* dbname, /*!< in: database name */ const char* filename) /*!< in: file name (not a path), - including the .ibd extension */ + including the .ibd or .isl extension */ { - os_file_t file; - char* filepath; char* tablename; - ibool success; - byte* buf2; - byte* page; - ulint space_id; - ulint flags; + ulint tablename_len; + ulint dbname_len = strlen(dbname); + ulint filename_len = strlen(filename); + fsp_open_info def; + fsp_open_info remote; os_offset_t size; #ifdef UNIV_HOTBACKUP fil_space_t* space; #endif - filepath = static_cast<char*>( - mem_alloc( - strlen(dbname) - + strlen(filename) - + strlen(fil_path_to_mysql_datadir) + 3)); - sprintf(filepath, "%s/%s/%s", fil_path_to_mysql_datadir, dbname, - filename); - srv_normalize_path_for_win(filepath); + memset(&def, 0, sizeof(def)); + memset(&remote, 0, sizeof(remote)); + /* The caller assured that the extension is ".ibd" or ".isl". */ + ut_ad(0 == memcmp(filename + filename_len - 4, ".ibd", 4) + || 0 == memcmp(filename + filename_len - 4, ".isl", 4)); + + /* Build up the tablename in the standard form database/table. */ tablename = static_cast<char*>( - mem_alloc(strlen(dbname) + strlen(filename) + 2)); + mem_alloc(dbname_len + filename_len + 2)); sprintf(tablename, "%s/%s", dbname, filename); - tablename[strlen(tablename) - strlen(".ibd")] = 0; + tablename_len = strlen(tablename) - strlen(".ibd"); + tablename[tablename_len] = '\0'; + + /* There may be both .ibd and .isl file in the directory. + And it is possible that the .isl file refers to a different + .ibd file. If so, we open and compare them the first time + one of them is sent to this function. 
So if this table has + already been loaded, there is nothing to do.*/ + mutex_enter(&fil_system->mutex); + if (fil_space_get_by_name(tablename)) { + mem_free(tablename); + mutex_exit(&fil_system->mutex); + return; + } + mutex_exit(&fil_system->mutex); + + /* Build up the filepath of the .ibd tablespace in the datadir. + This must be freed independent of def.success. */ + def.filepath = fil_make_ibd_name(tablename, false); #ifdef __WIN__ # ifndef UNIV_HOTBACKUP @@ -3333,31 +4014,56 @@ fil_load_single_table_tablespace( file path to lower case, so that we are consistent with InnoDB's internal data dictionary. */ - dict_casedn_str(filepath); + dict_casedn_str(def.filepath); # endif /* !UNIV_HOTBACKUP */ #endif - file = os_file_create_simple_no_error_handling( - innodb_file_data_key, filepath, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &success); - if (!success) { - /* The following call prints an error message */ - os_file_get_last_error(TRUE); + /* Check for a link file which locates a remote tablespace. */ + remote.success = fil_open_linked_file( + tablename, &remote.filepath, &remote.file); + + /* Read the first page of the remote tablespace */ + if (remote.success) { + fil_validate_single_table_tablespace(tablename, &remote); + if (!remote.success) { + os_file_close(remote.file); + mem_free(remote.filepath); + } + } + + + /* Try to open the tablespace in the datadir. 
*/ + def.file = os_file_create_simple_no_error_handling( + innodb_file_data_key, def.filepath, OS_FILE_OPEN, + OS_FILE_READ_ONLY, &def.success); + + /* Read the first page of the remote tablespace */ + if (def.success) { + fil_validate_single_table_tablespace(tablename, &def); + if (!def.success) { + os_file_close(def.file); + } + } + + if (!def.success && !remote.success) { + /* The following call prints an error message */ + os_file_get_last_error(true); + fprintf(stderr, + "InnoDB: Error: could not open single-table" + " tablespace file %s\n", def.filepath); +no_good_file: fprintf(stderr, - "InnoDB: Error: could not open single-table tablespace" - " file\n" - "InnoDB: %s!\n" "InnoDB: We do not continue the crash recovery," " because the table may become\n" - "InnoDB: corrupt if we cannot apply the log records" - " in the InnoDB log to it.\n" + "InnoDB: corrupt if we cannot apply the log" + " records in the InnoDB log to it.\n" "InnoDB: To fix the problem and start mysqld:\n" "InnoDB: 1) If there is a permission problem" " in the file and mysqld cannot\n" "InnoDB: open the file, you should" " modify the permissions.\n" - "InnoDB: 2) If the table is not needed, or you can" - " restore it from a backup,\n" + "InnoDB: 2) If the table is not needed, or you" + " can restore it from a backup,\n" "InnoDB: then you can remove the .ibd file," " and InnoDB will do a normal\n" "InnoDB: crash recovery and ignore that table.\n" @@ -3366,123 +4072,84 @@ fil_load_single_table_tablespace( "InnoDB: the .ibd file, you can set" " innodb_force_recovery > 0 in my.cnf\n" "InnoDB: and force InnoDB to continue crash" - " recovery here.\n", filepath); - + " recovery here.\n"); +will_not_choose: mem_free(tablename); - mem_free(filepath); - - if (srv_force_recovery > 0) { - fprintf(stderr, - "InnoDB: innodb_force_recovery" - " was set to %lu. 
Continuing crash recovery\n" - "InnoDB: even though we cannot access" - " the .ibd file of this table.\n", - srv_force_recovery); - return; + if (remote.success) { + mem_free(remote.filepath); } - - exit(1); - } - - size = os_file_get_size(file); - - if (UNIV_UNLIKELY(size == (os_offset_t) -1)) { - /* The following call prints an error message */ - os_file_get_last_error(TRUE); - - fprintf(stderr, - "InnoDB: Error: could not measure the size" - " of single-table tablespace file\n" - "InnoDB: %s!\n" - "InnoDB: We do not continue crash recovery," - " because the table will become\n" - "InnoDB: corrupt if we cannot apply the log records" - " in the InnoDB log to it.\n" - "InnoDB: To fix the problem and start mysqld:\n" - "InnoDB: 1) If there is a permission problem" - " in the file and mysqld cannot\n" - "InnoDB: access the file, you should" - " modify the permissions.\n" - "InnoDB: 2) If the table is not needed," - " or you can restore it from a backup,\n" - "InnoDB: then you can remove the .ibd file," - " and InnoDB will do a normal\n" - "InnoDB: crash recovery and ignore that table.\n" - "InnoDB: 3) If the file system or the disk is broken," - " and you cannot remove\n" - "InnoDB: the .ibd file, you can set" - " innodb_force_recovery > 0 in my.cnf\n" - "InnoDB: and force InnoDB to continue" - " crash recovery here.\n", filepath); - - os_file_close(file); - mem_free(tablename); - mem_free(filepath); + mem_free(def.filepath); if (srv_force_recovery > 0) { - fprintf(stderr, - "InnoDB: innodb_force_recovery" - " was set to %lu. Continuing crash recovery\n" - "InnoDB: even though we cannot access" - " the .ibd file of this table.\n", + ib_logf(IB_LOG_LEVEL_INFO, + "innodb_force_recovery was set to %lu. " + "Continuing crash recovery even though we " + "cannot access the .ibd file of this table.", srv_force_recovery); return; } + /* If debug code, cause a core dump and call stack. For + release builds just exit and rely on the messages above. 
*/ + ut_ad(0); exit(1); } - /* TODO: What to do in other cases where we cannot access an .ibd - file during a crash recovery? */ + if (def.success && remote.success) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Tablespaces for %s have been found in two places;\n" + "Location 1: SpaceID: %lu LSN: %lu File: %s\n" + "Location 2: SpaceID: %lu LSN: %lu File: %s\n" + "You must delete one of them.", + tablename, (ulong) def.id, (ulong) def.lsn, + def.filepath, (ulong) remote.id, (ulong) remote.lsn, + remote.filepath); - /* Every .ibd file is created >= 4 pages in size. Smaller files - cannot be ok. */ + def.success = FALSE; + os_file_close(def.file); + os_file_close(remote.file); + goto will_not_choose; + } -#ifndef UNIV_HOTBACKUP - if (size < FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) { - fprintf(stderr, - "InnoDB: Error: the size of single-table" - " tablespace file %s\n" - "InnoDB: is only " UINT64PF - ", should be at least %lu!\n", - filepath, - size, (ulong) (4 * UNIV_PAGE_SIZE)); - os_file_close(file); - mem_free(tablename); - mem_free(filepath); + /* At this point, only one tablespace is open */ + ut_a(def.success == !remote.success); - return; - } -#endif - /* Read the first page of the tablespace if the size is big enough */ + fsp_open_info* fsp = def.success ? &def : &remote; - buf2 = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE)); - /* Align the memory for file i/o if we might have O_DIRECT set */ - page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE)); + /* Get and test the file size. 
*/ + size = os_file_get_size(fsp->file); - if (size >= FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) { - success = os_file_read(file, page, 0, UNIV_PAGE_SIZE); + if (size == (os_offset_t) -1) { + /* The following call prints an error message */ + os_file_get_last_error(true); - /* We have to read the tablespace id from the file */ + ib_logf(IB_LOG_LEVEL_ERROR, + "could not measure the size of single-table " + "tablespace file %s", fsp->filepath); - space_id = fsp_header_get_space_id(page); - flags = fsp_header_get_flags(page); - } else { - space_id = ULINT_UNDEFINED; - flags = 0; + os_file_close(fsp->file); + goto no_good_file; } + /* Every .ibd file is created >= 4 pages in size. Smaller files + cannot be ok. */ + ulong minimum_size = FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE; + if (size < minimum_size) { #ifndef UNIV_HOTBACKUP - if (space_id == ULINT_UNDEFINED || space_id == 0) { - fprintf(stderr, - "InnoDB: Error: tablespace id %lu in file %s" - " is not sensible\n", - (ulong) space_id, - filepath); - goto func_exit; - } + ib_logf(IB_LOG_LEVEL_ERROR, + "The size of single-table tablespace file %s " + "is only " UINT64PF ", should be at least %lu!", + fsp->filepath, size, minimum_size); + os_file_close(fsp->file); + goto no_good_file; #else - if (space_id == ULINT_UNDEFINED || space_id == 0) { + fsp->id = ULINT_UNDEFINED; + fsp->flags = 0; +#endif /* !UNIV_HOTBACKUP */ + } + +#ifdef UNIV_HOTBACKUP + if (fsp->id == ULINT_UNDEFINED || fsp->id == 0) { char* new_path; fprintf(stderr, @@ -3494,18 +4161,19 @@ fil_load_single_table_tablespace( " is not sensible.\n" "InnoDB: This can happen in an ibbackup run," " and is not dangerous.\n", - filepath, space_id, filepath, size); - os_file_close(file); + fsp->filepath, fsp->id, fsp->filepath, size); + os_file_close(fsp->file); - new_path = fil_make_ibbackup_old_name(filepath); - ut_a(os_file_rename(innodb_file_data_key, filepath, new_path)); + new_path = fil_make_ibbackup_old_name(fsp->filepath); + + bool success = 
os_file_rename( + innodb_file_data_key, fsp->filepath, new_path)); + + ut_a(success); - ut_free(buf2); - mem_free(tablename); - mem_free(filepath); mem_free(new_path); - return; + goto func_exit_after_close; } /* A backup may contain the same space several times, if the space got @@ -3517,7 +4185,7 @@ fil_load_single_table_tablespace( mutex_enter(&fil_system->mutex); - space = fil_space_get_by_id(space_id); + space = fil_space_get_by_id(fsp->id); if (space) { char* new_path; @@ -3529,52 +4197,64 @@ fil_load_single_table_tablespace( "InnoDB: was scanned earlier. This can happen" " if you have renamed tables\n" "InnoDB: during an ibbackup run.\n", - filepath, space_id, filepath, + fsp->filepath, fsp->id, fsp->filepath, space->name); - os_file_close(file); + os_file_close(fsp->file); - new_path = fil_make_ibbackup_old_name(filepath); + new_path = fil_make_ibbackup_old_name(fsp->filepath); mutex_exit(&fil_system->mutex); - ut_a(os_file_rename(innodb_file_data_key, filepath, new_path)); + bool success = os_file_rename( + innodb_file_data_key, fsp->filepath, new_path); + + ut_a(success); - ut_free(buf2); - mem_free(tablename); - mem_free(filepath); mem_free(new_path); - return; + goto func_exit_after_close; } mutex_exit(&fil_system->mutex); -#endif - success = fil_space_create(tablename, space_id, flags, FIL_TABLESPACE); - - if (!success) { +#endif /* UNIV_HOTBACKUP */ + ibool file_space_create_success = fil_space_create( + tablename, fsp->id, fsp->flags, FIL_TABLESPACE); + if (!file_space_create_success) { if (srv_force_recovery > 0) { fprintf(stderr, - "InnoDB: innodb_force_recovery" - " was set to %lu. Continuing crash recovery\n" - "InnoDB: even though the tablespace creation" - " of this table failed.\n", + "InnoDB: innodb_force_recovery was set" + " to %lu. Continuing crash recovery\n" + "InnoDB: even though the tablespace" + " creation of this table failed.\n", srv_force_recovery); goto func_exit; } - exit(1); + /* Exit here with a core dump, stack, etc. 
*/ + ut_a(file_space_create_success); } /* We do not use the size information we have about the file, because the rounding formula for extents and pages is somewhat complex; we let fil_node_open() do that task. */ - fil_node_create(filepath, 0, space_id, FALSE); + if (!fil_node_create(fsp->filepath, 0, fsp->id, FALSE)) { + ut_error; + } + func_exit: - os_file_close(file); - ut_free(buf2); + os_file_close(fsp->file); + +#ifdef UNIV_HOTBACKUP +func_exit_after_close: +#else + ut_ad(!mutex_own(&fil_system->mutex)); +#endif mem_free(tablename); - mem_free(filepath); + if (remote.success) { + mem_free(remote.filepath); + } + mem_free(def.filepath); } /***********************************************************************//** @@ -3587,29 +4267,25 @@ static int fil_file_readdir_next_file( /*=======================*/ - ulint* err, /*!< out: this is set to DB_ERROR if an error + dberr_t* err, /*!< out: this is set to DB_ERROR if an error was encountered, otherwise not changed */ const char* dirname,/*!< in: directory name or path */ os_file_dir_t dir, /*!< in: directory stream */ - os_file_stat_t* info) /*!< in/out: buffer where the info is returned */ + os_file_stat_t* info) /*!< in/out: buffer where the + info is returned */ { - ulint i; - int ret; - - for (i = 0; i < 100; i++) { - ret = os_file_readdir_next_file(dirname, dir, info); + for (ulint i = 0; i < 100; i++) { + int ret = os_file_readdir_next_file(dirname, dir, info); if (ret != -1) { return(ret); } - fprintf(stderr, - "InnoDB: Error: os_file_readdir_next_file()" - " returned -1 in\n" - "InnoDB: directory %s\n" - "InnoDB: Crash recovery may have failed" - " for some .ibd files!\n", dirname); + ib_logf(IB_LOG_LEVEL_ERROR, + "os_file_readdir_next_file() returned -1 in " + "directory %s, crash recovery may have failed " + "for some .ibd files!", dirname); *err = DB_ERROR; } @@ -3626,7 +4302,7 @@ in the doublewrite buffer, also to know where to apply log records where the space id is != 0. 
@return DB_SUCCESS or error number */ UNIV_INTERN -ulint +dberr_t fil_load_single_table_tablespaces(void) /*===================================*/ { @@ -3637,7 +4313,7 @@ fil_load_single_table_tablespaces(void) os_file_dir_t dbdir; os_file_stat_t dbinfo; os_file_stat_t fileinfo; - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; /* The datadir of MySQL is always the default directory of mysqld */ @@ -3686,7 +4362,6 @@ fil_load_single_table_tablespaces(void) dbdir = os_file_opendir(dbpath, FALSE); if (dbdir != NULL) { - /* printf("Opened dir %s\n", dbinfo.name); */ /* We found a database directory; loop through it, looking for possible .ibd files in it */ @@ -3694,8 +4369,6 @@ fil_load_single_table_tablespaces(void) ret = fil_file_readdir_next_file(&err, dbpath, dbdir, &fileinfo); while (ret == 0) { - /* printf( - " Looking at file %s\n", fileinfo.name); */ if (fileinfo.type == OS_FILE_TYPE_DIR) { @@ -3704,11 +4377,14 @@ fil_load_single_table_tablespaces(void) /* We found a symlink or a file */ if (strlen(fileinfo.name) > 4 - && 0 == strcmp(fileinfo.name + && (0 == strcmp(fileinfo.name + + strlen(fileinfo.name) - 4, + ".ibd") + || 0 == strcmp(fileinfo.name + strlen(fileinfo.name) - 4, - ".ibd")) { - /* The name ends in .ibd; try opening - the file */ + ".isl"))) { + /* The name ends in .ibd or .isl; + try opening the file */ fil_load_single_table_tablespace( dbinfo.name, fileinfo.name); } @@ -3808,6 +4484,29 @@ fil_tablespace_exists_in_mem( } /*******************************************************************//** +Report that a tablespace for a table was not found. 
*/ +static +void +fil_report_missing_tablespace( +/*===========================*/ + const char* name, /*!< in: table name */ + ulint space_id) /*!< in: table's space id */ +{ + char index_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name(index_name, sizeof(index_name), name, TRUE); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Table %s in the InnoDB data dictionary has tablespace id %lu, " + "but tablespace with that id or name does not exist. Have " + "you deleted or moved .ibd files? This may also be a table " + "created with CREATE TEMPORARY TABLE whose .ibd and .frm " + "files MySQL automatically removed, but the table still " + "exists in the InnoDB internal data dictionary.", + name, space_id); +} + +/*******************************************************************//** Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory cache. Note that if we have not done a crash recovery at the database startup, there may be many tablespaces which are not yet in the memory cache. @@ -3817,19 +4516,25 @@ ibool fil_space_for_table_exists_in_mem( /*==============================*/ ulint id, /*!< in: space id */ - const char* name, /*!< in: table name in the standard - 'databasename/tablename' format */ + const char* name, /*!< in: table name used in + fil_space_create(). 
Either the + standard 'dbname/tablename' format + or table->dir_path_of_temp_table */ ibool mark_space, /*!< in: in crash recovery, at database startup we mark all spaces which have an associated table in the InnoDB data dictionary, so that we can print a warning about orphaned tablespaces */ - ibool print_error_if_does_not_exist) + ibool print_error_if_does_not_exist, /*!< in: print detailed error information to the .err log if a matching tablespace is not found from memory */ + bool adjust_space, /*!< in: whether to adjust space id + when find table space mismatch */ + mem_heap_t* heap, /*!< in: heap memory */ + table_id_t table_id) /*!< in: table id */ { fil_space_t* fnamespace; fil_space_t* space; @@ -3858,6 +4563,47 @@ fil_space_for_table_exists_in_mem( return(TRUE); } + /* Info from "fnamespace" comes from the ibd file itself, it can + be different from data obtained from System tables since it is + not transactional. If adjust_space is set, and the mismatching + space are between a user table and its temp table, we shall + adjust the ibd file name according to system table info */ + if (adjust_space + && space != NULL + && row_is_mysql_tmp_table_name(space->name) + && !row_is_mysql_tmp_table_name(name)) { + + mutex_exit(&fil_system->mutex); + + DBUG_EXECUTE_IF("ib_crash_before_adjust_fil_space", + DBUG_SUICIDE();); + + if (fnamespace) { + char* tmp_name; + + tmp_name = dict_mem_create_temporary_tablename( + heap, name, table_id); + + fil_rename_tablespace(fnamespace->name, fnamespace->id, + tmp_name, NULL); + } + + DBUG_EXECUTE_IF("ib_crash_after_adjust_one_fil_space", + DBUG_SUICIDE();); + + fil_rename_tablespace(space->name, id, name, NULL); + + DBUG_EXECUTE_IF("ib_crash_after_adjust_fil_space", + DBUG_SUICIDE();); + + mutex_enter(&fil_system->mutex); + fnamespace = fil_space_get_by_name(name); + ut_ad(space == fnamespace); + mutex_exit(&fil_system->mutex); + + return(TRUE); + } + if (!print_error_if_does_not_exist) { mutex_exit(&fil_system->mutex); @@ 
-3867,22 +4613,9 @@ fil_space_for_table_exists_in_mem( if (space == NULL) { if (fnamespace == NULL) { - ut_print_timestamp(stderr); - fputs(" InnoDB: Error: table ", stderr); - ut_print_filename(stderr, name); - fprintf(stderr, "\n" - "InnoDB: in InnoDB data dictionary" - " has tablespace id %lu,\n" - "InnoDB: but tablespace with that id" - " or name does not exist. Have\n" - "InnoDB: you deleted or moved .ibd files?\n" - "InnoDB: This may also be a table created with" - " CREATE TEMPORARY TABLE\n" - "InnoDB: whose .ibd and .frm files" - " MySQL automatically removed, but the\n" - "InnoDB: table still exists in the" - " InnoDB internal data dictionary.\n", - (ulong) id); + if (print_error_if_does_not_exist) { + fil_report_missing_tablespace(name, id); + } } else { ut_print_timestamp(stderr); fputs(" InnoDB: Error: table ", stderr); @@ -3941,7 +4674,7 @@ error_exit: Checks if a single-table tablespace for a given table name exists in the tablespace memory cache. @return space id, ULINT_UNDEFINED if not found */ -static +UNIV_INTERN ulint fil_get_space_id_for_table( /*=======================*/ @@ -3996,6 +4729,8 @@ fil_extend_space_to_desired_size( ulint pages_added; ibool success; + ut_ad(!srv_read_only_mode); + retry: pages_added = 0; success = TRUE; @@ -4070,7 +4805,7 @@ retry: node->name, node->handle, buf, offset, page_size * n_pages, NULL, NULL); -#endif +#endif /* UNIV_HOTBACKUP */ if (success) { os_has_said_disk_full = FALSE; } else { @@ -4143,7 +4878,7 @@ fil_extend_tablespaces_to_stored_len(void) byte* buf; ulint actual_size; ulint size_in_header; - ulint error; + dberr_t error; ibool success; buf = mem_alloc(UNIV_PAGE_SIZE); @@ -4177,7 +4912,7 @@ fil_extend_tablespaces_to_stored_len(void) "InnoDB: Check that you have free disk space" " and retry!\n", space->name, size_in_header, actual_size); - exit(1); + ut_a(success); } mutex_enter(&fil_system->mutex); @@ -4347,12 +5082,21 @@ fil_node_complete_io( node->n_pending--; if (type == OS_FILE_WRITE) { + 
ut_ad(!srv_read_only_mode); system->modification_counter++; node->modification_counter = system->modification_counter; - if (!node->space->is_in_unflushed_spaces) { + if (fil_buffering_disabled(node->space)) { + + /* We don't need to keep track of unflushed + changes as user has explicitly disabled + buffering. */ + ut_ad(!node->space->is_in_unflushed_spaces); + node->flush_counter = node->modification_counter; - node->space->is_in_unflushed_spaces = TRUE; + } else if (!node->space->is_in_unflushed_spaces) { + + node->space->is_in_unflushed_spaces = true; UT_LIST_ADD_FIRST(unflushed_spaces, system->unflushed_spaces, node->space); @@ -4399,7 +5143,7 @@ Reads or writes data. This operation is asynchronous (aio). @return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do i/o on a tablespace which does not exist */ UNIV_INTERN -ulint +dberr_t fil_io( /*===*/ ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE, @@ -4462,9 +5206,11 @@ fil_io( #ifndef UNIV_HOTBACKUP # ifndef UNIV_LOG_DEBUG /* ibuf bitmap pages must be read in the sync aio mode: */ - ut_ad(recv_no_ibuf_operations || (type == OS_FILE_WRITE) + ut_ad(recv_no_ibuf_operations + || type == OS_FILE_WRITE || !ibuf_bitmap_page(zip_size, block_offset) - || sync || is_log); + || sync + || is_log); # endif /* UNIV_LOG_DEBUG */ if (sync) { mode = OS_AIO_SYNC; @@ -4483,9 +5229,10 @@ fil_io( #endif /* !UNIV_HOTBACKUP */ if (type == OS_FILE_READ) { - srv_data_read+= len; + srv_stats.data_read.add(len); } else if (type == OS_FILE_WRITE) { - srv_data_written+= len; + ut_ad(!srv_read_only_mode); + srv_stats.data_written.add(len); } /* Reserve the fil_system mutex and make sure that we can open at @@ -4497,48 +5244,43 @@ fil_io( /* If we are deleting a tablespace we don't allow any read operations on that. However, we do allow write operations. 
*/ - if (!space || (type == OS_FILE_READ && space->stop_new_ops)) { + if (space == 0 || (type == OS_FILE_READ && space->stop_new_ops)) { mutex_exit(&fil_system->mutex); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Error: trying to do i/o" - " to a tablespace which does not exist.\n" - "InnoDB: i/o type %lu, space id %lu," - " page no. %lu, i/o length %lu bytes\n", + ib_logf(IB_LOG_LEVEL_ERROR, + "Trying to do i/o to a tablespace which does " + "not exist. i/o type %lu, space id %lu, " + "page no. %lu, i/o length %lu bytes", (ulong) type, (ulong) space_id, (ulong) block_offset, (ulong) len); return(DB_TABLESPACE_DELETED); } - ut_ad((mode != OS_AIO_IBUF) || (space->purpose == FIL_TABLESPACE)); + ut_ad(mode != OS_AIO_IBUF || space->purpose == FIL_TABLESPACE); node = UT_LIST_GET_FIRST(space->chain); for (;;) { - if (UNIV_UNLIKELY(node == NULL)) { + if (node == NULL) { if (ignore_nonexistent_pages) { mutex_exit(&fil_system->mutex); return(DB_ERROR); } - /* else */ fil_report_invalid_page_access( block_offset, space_id, space->name, byte_offset, len, type); ut_error; - } - if (fil_is_user_tablespace_id(space->id) && node->size == 0) { + } else if (fil_is_user_tablespace_id(space->id) + && node->size == 0) { + /* We do not know the size of a single-table tablespace before we open the file */ - break; - } - - if (node->size > block_offset) { + } else if (node->size > block_offset) { /* Found! 
*/ break; } else { @@ -4600,6 +5342,7 @@ fil_io( if (type == OS_FILE_READ) { ret = os_file_read(node->handle, buf, offset, len); } else { + ut_ad(!srv_read_only_mode); ret = os_file_write(node->name, node->handle, buf, offset, len); } @@ -4607,7 +5350,7 @@ fil_io( /* Queue the aio request */ ret = os_aio(type, mode | wake_later, node->name, node->handle, buf, offset, len, node, message); -#endif +#endif /* UNIV_HOTBACKUP */ ut_a(ret); if (mode == OS_AIO_SYNC) { @@ -4649,24 +5392,24 @@ fil_aio_wait( if (srv_use_native_aio) { srv_set_io_thread_op_info(segment, "native aio handle"); #ifdef WIN_ASYNC_IO - ret = os_aio_windows_handle(segment, 0, &fil_node, - &message, &type); + ret = os_aio_windows_handle( + segment, 0, &fil_node, &message, &type); #elif defined(LINUX_NATIVE_AIO) - ret = os_aio_linux_handle(segment, &fil_node, - &message, &type); + ret = os_aio_linux_handle( + segment, &fil_node, &message, &type); #else ut_error; ret = 0; /* Eliminate compiler warning */ -#endif +#endif /* WIN_ASYNC_IO */ } else { srv_set_io_thread_op_info(segment, "simulated aio handle"); - ret = os_aio_simulated_handle(segment, &fil_node, - &message, &type); + ret = os_aio_simulated_handle( + segment, &fil_node, &message, &type); } ut_a(ret); - if (UNIV_UNLIKELY(fil_node == NULL)) { + if (fil_node == NULL) { ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS); return; } @@ -4722,6 +5465,28 @@ fil_flush( return; } + if (fil_buffering_disabled(space)) { + + /* No need to flush. User has explicitly disabled + buffering. 
*/ + ut_ad(!space->is_in_unflushed_spaces); + ut_ad(fil_space_is_flushed(space)); + ut_ad(space->n_pending_flushes == 0); + +#ifdef UNIV_DEBUG + for (node = UT_LIST_GET_FIRST(space->chain); + node != NULL; + node = UT_LIST_GET_NEXT(chain, node)) { + ut_ad(node->modification_counter + == node->flush_counter); + ut_ad(node->n_pending_flushes == 0); + } +#endif /* UNIV_DEBUG */ + + mutex_exit(&fil_system->mutex); + return; + } + space->n_pending_flushes++; /*!< prevent dropping of the space while we are flushing */ node = UT_LIST_GET_FIRST(space->chain); @@ -4745,7 +5510,7 @@ fil_flush( goto skip_flush; } -#endif +#endif /* __WIN__ */ retry: if (node->n_pending_flushes > 0) { /* We want to avoid calling os_file_flush() on @@ -4788,7 +5553,7 @@ skip_flush: if (space->is_in_unflushed_spaces && fil_space_is_flushed(space)) { - space->is_in_unflushed_spaces = FALSE; + space->is_in_unflushed_spaces = false; UT_LIST_REMOVE( unflushed_spaces, @@ -5025,3 +5790,401 @@ fil_close(void) fil_system = NULL; } + +/********************************************************************//** +Initializes a buffer control block when the buf_pool is created. */ +static +void +fil_buf_block_init( +/*===============*/ + buf_block_t* block, /*!< in: pointer to control block */ + byte* frame) /*!< in: pointer to buffer frame */ +{ + UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE); + + block->frame = frame; + + block->page.io_fix = BUF_IO_NONE; + /* There are assertions that check for this. 
*/ + block->page.buf_fix_count = 1; + block->page.state = BUF_BLOCK_READY_FOR_USE; + + page_zip_des_init(&block->page.zip); +} + +struct fil_iterator_t { + os_file_t file; /*!< File handle */ + const char* filepath; /*!< File path name */ + os_offset_t start; /*!< From where to start */ + os_offset_t end; /*!< Where to stop */ + os_offset_t file_size; /*!< File size in bytes */ + ulint page_size; /*!< Page size */ + ulint n_io_buffers; /*!< Number of pages to use + for IO */ + byte* io_buffer; /*!< Buffer to use for IO */ +}; + +/********************************************************************//** +TODO: This can be made parallel trivially by chunking up the file and creating +a callback per thread. . Main benefit will be to use multiple CPUs for +checksums and compressed tables. We have to do compressed tables block by +block right now. Secondly we need to decompress/compress and copy too much +of data. These are CPU intensive. + +Iterate over all the pages in the tablespace. +@param iter - Tablespace iterator +@param block - block to use for IO +@param callback - Callback to inspect and update page contents +@retval DB_SUCCESS or error code */ +static +dberr_t +fil_iterate( +/*========*/ + const fil_iterator_t& iter, + buf_block_t* block, + PageCallback& callback) +{ + os_offset_t offset; + ulint page_no = 0; + ulint space_id = callback.get_space_id(); + ulint n_bytes = iter.n_io_buffers * iter.page_size; + + ut_ad(!srv_read_only_mode); + + /* TODO: For compressed tables we do a lot of useless + copying for non-index pages. 
Unfortunately, it is + required by buf_zip_decompress() */ + + for (offset = iter.start; offset < iter.end; offset += n_bytes) { + + byte* io_buffer = iter.io_buffer; + + block->frame = io_buffer; + + if (callback.get_zip_size() > 0) { + page_zip_des_init(&block->page.zip); + page_zip_set_size(&block->page.zip, iter.page_size); + block->page.zip.data = block->frame + UNIV_PAGE_SIZE; + ut_d(block->page.zip.m_external = true); + ut_ad(iter.page_size == callback.get_zip_size()); + + /* Zip IO is done in the compressed page buffer. */ + io_buffer = block->page.zip.data; + } else { + io_buffer = iter.io_buffer; + } + + /* We have to read the exact number of bytes. Otherwise the + InnoDB IO functions croak on failed reads. */ + + n_bytes = static_cast<ulint>( + ut_min(static_cast<os_offset_t>(n_bytes), + iter.end - offset)); + + ut_ad(n_bytes > 0); + ut_ad(!(n_bytes % iter.page_size)); + + if (!os_file_read(iter.file, io_buffer, offset, + (ulint) n_bytes)) { + + ib_logf(IB_LOG_LEVEL_ERROR, "os_file_read() failed"); + + return(DB_IO_ERROR); + } + + bool updated = false; + os_offset_t page_off = offset; + ulint n_pages_read = (ulint) n_bytes / iter.page_size; + + for (ulint i = 0; i < n_pages_read; ++i) { + + buf_block_set_file_page(block, space_id, page_no++); + + dberr_t err; + + if ((err = callback(page_off, block)) != DB_SUCCESS) { + + return(err); + + } else if (!updated) { + updated = buf_block_get_state(block) + == BUF_BLOCK_FILE_PAGE; + } + + buf_block_set_state(block, BUF_BLOCK_NOT_USED); + buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE); + + page_off += iter.page_size; + block->frame += iter.page_size; + } + + /* A page was updated in the set, write back to disk. 
*/ + if (updated + && !os_file_write( + iter.filepath, iter.file, io_buffer, + offset, (ulint) n_bytes)) { + + ib_logf(IB_LOG_LEVEL_ERROR, "os_file_write() failed"); + + return(DB_IO_ERROR); + } + } + + return(DB_SUCCESS); +} + +/********************************************************************//** +Iterate over all the pages in the tablespace. +@param table - the table definiton in the server +@param n_io_buffers - number of blocks to read and write together +@param callback - functor that will do the page updates +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fil_tablespace_iterate( +/*===================*/ + dict_table_t* table, + ulint n_io_buffers, + PageCallback& callback) +{ + dberr_t err; + os_file_t file; + char* filepath; + + ut_a(n_io_buffers > 0); + ut_ad(!srv_read_only_mode); + + DBUG_EXECUTE_IF("ib_import_trigger_corruption_1", + return(DB_CORRUPTION);); + + if (DICT_TF_HAS_DATA_DIR(table->flags)) { + dict_get_and_save_data_dir_path(table, false); + ut_a(table->data_dir_path); + + filepath = os_file_make_remote_pathname( + table->data_dir_path, table->name, "ibd"); + } else { + filepath = fil_make_ibd_name(table->name, false); + } + + { + ibool success; + + file = os_file_create_simple_no_error_handling( + innodb_file_data_key, filepath, + OS_FILE_OPEN, OS_FILE_READ_WRITE, &success); + + DBUG_EXECUTE_IF("fil_tablespace_iterate_failure", + { + static bool once; + + if (!once || ut_rnd_interval(0, 10) == 5) { + once = true; + success = FALSE; + os_file_close(file); + } + }); + + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(true); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Trying to import a tablespace, but could not " + "open the tablespace file %s", filepath); + + mem_free(filepath); + + return(DB_TABLESPACE_NOT_FOUND); + + } else { + err = DB_SUCCESS; + } + } + + callback.set_file(filepath, file); + + os_offset_t file_size = os_file_get_size(file); + ut_a(file_size != (os_offset_t) -1); + + /* The 
block we will use for every physical page */ + buf_block_t block; + + memset(&block, 0x0, sizeof(block)); + + /* Allocate a page to read in the tablespace header, so that we + can determine the page size and zip_size (if it is compressed). + We allocate an extra page in case it is a compressed table. One + page is to ensure alignement. */ + + void* page_ptr = mem_alloc(3 * UNIV_PAGE_SIZE); + byte* page = static_cast<byte*>(ut_align(page_ptr, UNIV_PAGE_SIZE)); + + fil_buf_block_init(&block, page); + + /* Read the first page and determine the page and zip size. */ + + if (!os_file_read(file, page, 0, UNIV_PAGE_SIZE)) { + + err = DB_IO_ERROR; + + } else if ((err = callback.init(file_size, &block)) == DB_SUCCESS) { + fil_iterator_t iter; + + iter.file = file; + iter.start = 0; + iter.end = file_size; + iter.filepath = filepath; + iter.file_size = file_size; + iter.n_io_buffers = n_io_buffers; + iter.page_size = callback.get_page_size(); + + /* Compressed pages can't be optimised for block IO for now. + We do the IMPORT page by page. */ + + if (callback.get_zip_size() > 0) { + iter.n_io_buffers = 1; + ut_a(iter.page_size == callback.get_zip_size()); + } + + /** Add an extra page for compressed page scratch area. */ + + void* io_buffer = mem_alloc( + (2 + iter.n_io_buffers) * UNIV_PAGE_SIZE); + + iter.io_buffer = static_cast<byte*>( + ut_align(io_buffer, UNIV_PAGE_SIZE)); + + err = fil_iterate(iter, &block, callback); + + mem_free(io_buffer); + } + + if (err == DB_SUCCESS) { + + ib_logf(IB_LOG_LEVEL_INFO, "Sync to disk"); + + if (!os_file_flush(file)) { + ib_logf(IB_LOG_LEVEL_INFO, "os_file_flush() failed!"); + err = DB_IO_ERROR; + } else { + ib_logf(IB_LOG_LEVEL_INFO, "Sync to disk - done!"); + } + } + + os_file_close(file); + + mem_free(page_ptr); + mem_free(filepath); + + return(err); +} + +/** +Set the tablespace compressed table size. 
+@return DB_SUCCESS if it is valie or DB_CORRUPTION if not */ +dberr_t +PageCallback::set_zip_size(const buf_frame_t* page) UNIV_NOTHROW +{ + m_zip_size = fsp_header_get_zip_size(page); + + if (!ut_is_2pow(m_zip_size) || m_zip_size > UNIV_ZIP_SIZE_MAX) { + return(DB_CORRUPTION); + } + + return(DB_SUCCESS); +} + +/********************************************************************//** +Delete the tablespace file and any related files like .cfg. +This should not be called for temporary tables. */ +UNIV_INTERN +void +fil_delete_file( +/*============*/ + const char* ibd_name) /*!< in: filepath of the ibd + tablespace */ +{ + /* Force a delete of any stale .ibd files that are lying around. */ + + ib_logf(IB_LOG_LEVEL_INFO, "Deleting %s", ibd_name); + + os_file_delete_if_exists(ibd_name); + + char* cfg_name = fil_make_cfg_name(ibd_name); + + os_file_delete_if_exists(cfg_name); + + mem_free(cfg_name); +} + +/** +Iterate over all the spaces in the space list and fetch the +tablespace names. It will return a copy of the name that must be +freed by the caller using: delete[]. +@return DB_SUCCESS if all OK. */ +UNIV_INTERN +dberr_t +fil_get_space_names( +/*================*/ + space_name_list_t& space_name_list) + /*!< in/out: List to append to */ +{ + fil_space_t* space; + dberr_t err = DB_SUCCESS; + + mutex_enter(&fil_system->mutex); + + for (space = UT_LIST_GET_FIRST(fil_system->space_list); + space != NULL; + space = UT_LIST_GET_NEXT(space_list, space)) { + + if (space->purpose == FIL_TABLESPACE) { + ulint len; + char* name; + + len = strlen(space->name); + name = new(std::nothrow) char[len + 1]; + + if (name == 0) { + /* Caller to free elements allocated so far. 
*/ + err = DB_OUT_OF_MEMORY; + break; + } + + memcpy(name, space->name, len); + name[len] = 0; + + space_name_list.push_back(name); + } + } + + mutex_exit(&fil_system->mutex); + + return(err); +} + +/****************************************************************//** +Generate redo logs for swapping two .ibd files */ +UNIV_INTERN +void +fil_mtr_rename_log( +/*===============*/ + ulint old_space_id, /*!< in: tablespace id of the old + table. */ + const char* old_name, /*!< in: old table name */ + ulint new_space_id, /*!< in: tablespace id of the new + table */ + const char* new_name, /*!< in: new table name */ + const char* tmp_name) /*!< in: temp table name used while + swapping */ +{ + mtr_t mtr; + mtr_start(&mtr); + fil_op_write_log(MLOG_FILE_RENAME, old_space_id, + 0, 0, old_name, tmp_name, &mtr); + fil_op_write_log(MLOG_FILE_RENAME, new_space_id, + 0, 0, new_name, old_name, &mtr); + mtr_commit(&mtr); +} + diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc index 398dd24afed..dc843a89fb9 100644 --- a/storage/innobase/fsp/fsp0fsp.cc +++ b/storage/innobase/fsp/fsp0fsp.cc @@ -93,15 +93,13 @@ fseg_n_reserved_pages_low( /********************************************************************//** Marks a page used. The page must reside within the extents of the given segment. */ -static +static __attribute__((nonnull)) void fseg_mark_page_used( /*================*/ fseg_inode_t* seg_inode,/*!< in: segment inode */ - ulint space, /*!< in: space id */ - ulint zip_size,/*!< in: compressed page size in bytes - or 0 for uncompressed pages */ ulint page, /*!< in: page offset */ + xdes_t* descr, /*!< in: extent descriptor */ mtr_t* mtr); /*!< in/out: mini-transaction */ /**********************************************************************//** Returns the first extent descriptor for a segment. We think of the extent @@ -214,30 +212,18 @@ Gets a descriptor bit of a page. 
@return TRUE if free */ UNIV_INLINE ibool -xdes_get_bit( -/*=========*/ +xdes_mtr_get_bit( +/*=============*/ const xdes_t* descr, /*!< in: descriptor */ ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */ ulint offset, /*!< in: page offset within extent: 0 ... FSP_EXTENT_SIZE - 1 */ - mtr_t* mtr) /*!< in/out: mini-transaction */ + mtr_t* mtr) /*!< in: mini-transaction */ { - ulint index; - ulint byte_index; - ulint bit_index; - + ut_ad(mtr->state == MTR_ACTIVE); ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX)); - ut_ad((bit == XDES_FREE_BIT) || (bit == XDES_CLEAN_BIT)); - ut_ad(offset < FSP_EXTENT_SIZE); - index = bit + XDES_BITS_PER_PAGE * offset; - - byte_index = index / 8; - bit_index = index % 8; - - return(ut_bit_get_nth(mtr_read_ulint(descr + XDES_BITMAP + byte_index, - MLOG_1BYTE, mtr), - bit_index)); + return(xdes_get_bit(descr, bit, offset)); } /**********************************************************************//** @@ -287,7 +273,8 @@ xdes_find_bit( xdes_t* descr, /*!< in: descriptor */ ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */ ibool val, /*!< in: desired bit value */ - ulint hint, /*!< in: hint of which bit position would be desirable */ + ulint hint, /*!< in: hint of which bit position would + be desirable */ mtr_t* mtr) /*!< in/out: mini-transaction */ { ulint i; @@ -297,14 +284,14 @@ xdes_find_bit( ut_ad(hint < FSP_EXTENT_SIZE); ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX)); for (i = hint; i < FSP_EXTENT_SIZE; i++) { - if (val == xdes_get_bit(descr, bit, i, mtr)) { + if (val == xdes_mtr_get_bit(descr, bit, i, mtr)) { return(i); } } for (i = 0; i < hint; i++) { - if (val == xdes_get_bit(descr, bit, i, mtr)) { + if (val == xdes_mtr_get_bit(descr, bit, i, mtr)) { return(i); } @@ -324,7 +311,8 @@ xdes_find_bit_downward( xdes_t* descr, /*!< in: descriptor */ ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */ ibool val, /*!< in: desired bit value */ - ulint hint, /*!< in: hint of which bit position 
would be desirable */ + ulint hint, /*!< in: hint of which bit position would + be desirable */ mtr_t* mtr) /*!< in/out: mini-transaction */ { ulint i; @@ -334,14 +322,14 @@ xdes_find_bit_downward( ut_ad(hint < FSP_EXTENT_SIZE); ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX)); for (i = hint + 1; i > 0; i--) { - if (val == xdes_get_bit(descr, bit, i - 1, mtr)) { + if (val == xdes_mtr_get_bit(descr, bit, i - 1, mtr)) { return(i - 1); } } for (i = FSP_EXTENT_SIZE - 1; i > hint; i--) { - if (val == xdes_get_bit(descr, bit, i, mtr)) { + if (val == xdes_mtr_get_bit(descr, bit, i, mtr)) { return(i); } @@ -360,13 +348,12 @@ xdes_get_n_used( const xdes_t* descr, /*!< in: descriptor */ mtr_t* mtr) /*!< in/out: mini-transaction */ { - ulint i; ulint count = 0; ut_ad(descr && mtr); ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX)); - for (i = 0; i < FSP_EXTENT_SIZE; i++) { - if (FALSE == xdes_get_bit(descr, XDES_FREE_BIT, i, mtr)) { + for (ulint i = 0; i < FSP_EXTENT_SIZE; ++i) { + if (FALSE == xdes_mtr_get_bit(descr, XDES_FREE_BIT, i, mtr)) { count++; } } @@ -471,76 +458,11 @@ xdes_init( } /********************************************************************//** -Calculates the page where the descriptor of a page resides. 
-@return descriptor page offset */ -UNIV_INLINE -ulint -xdes_calc_descriptor_page( -/*======================*/ - ulint zip_size, /*!< in: compressed page size in bytes; - 0 for uncompressed pages */ - ulint offset) /*!< in: page offset */ -{ -#ifndef DOXYGEN /* Doxygen gets confused of these */ -# if UNIV_PAGE_SIZE_MAX <= XDES_ARR_OFFSET \ - + (UNIV_PAGE_SIZE_MAX / FSP_EXTENT_SIZE_MAX) \ - * XDES_SIZE_MAX -# error -# endif -# if UNIV_ZIP_SIZE_MIN <= XDES_ARR_OFFSET \ - + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE_MIN) \ - * XDES_SIZE_MIN -# error -# endif -#endif /* !DOXYGEN */ - - ut_ad(UNIV_PAGE_SIZE > XDES_ARR_OFFSET - + (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE) - * XDES_SIZE); - ut_ad(UNIV_ZIP_SIZE_MIN > XDES_ARR_OFFSET - + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE) - * XDES_SIZE); - - ut_ad(ut_is_2pow(zip_size)); - - if (!zip_size) { - return(ut_2pow_round(offset, UNIV_PAGE_SIZE)); - } else { - ut_ad(zip_size > XDES_ARR_OFFSET - + (zip_size / FSP_EXTENT_SIZE) * XDES_SIZE); - return(ut_2pow_round(offset, zip_size)); - } -} - -/********************************************************************//** -Calculates the descriptor index within a descriptor page. -@return descriptor index */ -UNIV_INLINE -ulint -xdes_calc_descriptor_index( -/*=======================*/ - ulint zip_size, /*!< in: compressed page size in bytes; - 0 for uncompressed pages */ - ulint offset) /*!< in: page offset */ -{ - ut_ad(ut_is_2pow(zip_size)); - - if (!zip_size) { - return(ut_2pow_remainder(offset, UNIV_PAGE_SIZE) - / FSP_EXTENT_SIZE); - } else { - return(ut_2pow_remainder(offset, zip_size) / FSP_EXTENT_SIZE); - } -} - -/********************************************************************//** Gets pointer to a the extent descriptor of a page. The page where the extent -descriptor resides is x-locked. If the page offset is equal to the free limit -of the space, adds new extents from above the free limit to the space free -list, if not free limit == space size. 
This adding is necessary to make the -descriptor defined, as they are uninitialized above the free limit. +descriptor resides is x-locked. This function no longer extends the data +file. @return pointer to the extent descriptor, NULL if the page does not -exist in the space or if the offset exceeds the free limit */ +exist in the space or if the offset is >= the free limit */ UNIV_INLINE __attribute__((nonnull, warn_unused_result)) xdes_t* xdes_get_descriptor_with_space_hdr( @@ -570,19 +492,10 @@ xdes_get_descriptor_with_space_hdr( zip_size = fsp_flags_get_zip_size( mach_read_from_4(sp_header + FSP_SPACE_FLAGS)); - /* If offset is >= size or > limit, return NULL */ - - if ((offset >= size) || (offset > limit)) { - + if ((offset >= size) || (offset >= limit)) { return(NULL); } - /* If offset is == limit, fill free list of the space. */ - - if (offset == limit) { - fsp_fill_free_list(FALSE, space, sp_header, mtr); - } - descr_page_no = xdes_calc_descriptor_page(zip_size, offset); if (descr_page_no == 0) { @@ -668,7 +581,7 @@ UNIV_INLINE ulint xdes_get_offset( /*============*/ - xdes_t* descr) /*!< in: extent descriptor */ + const xdes_t* descr) /*!< in: extent descriptor */ { ut_ad(descr); @@ -784,7 +697,7 @@ fsp_header_init_fields( ulint space_id, /*!< in: space id */ ulint flags) /*!< in: tablespace flags (FSP_SPACE_FLAGS) */ { - fsp_flags_validate(flags); + ut_a(fsp_flags_is_valid(flags)); mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_ID + page, space_id); @@ -872,11 +785,13 @@ fsp_header_get_space_id( id = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + DBUG_EXECUTE_IF("fsp_header_get_space_id_failure", + id = ULINT_UNDEFINED;); + if (id != fsp_id) { - fprintf(stderr, - "InnoDB: Error: space id in fsp header %lu," - " but in the page header %lu\n", - (ulong) fsp_id, (ulong) id); + ib_logf(IB_LOG_LEVEL_ERROR, + "Space id in fsp header %lu,but in the page header " + "%lu", fsp_id, id); return(ULINT_UNDEFINED); } @@ -1348,7 +1263,7 @@ 
fsp_alloc_from_free_frag( ulint frag_n_used; ut_ad(xdes_get_state(descr, mtr) == XDES_FREE_FRAG); - ut_a(xdes_get_bit(descr, XDES_FREE_BIT, bit, mtr)); + ut_a(xdes_mtr_get_bit(descr, XDES_FREE_BIT, bit, mtr)); xdes_set_bit(descr, XDES_FREE_BIT, bit, FALSE, mtr); /* Update the FRAG_N_USED field */ @@ -1583,7 +1498,9 @@ fsp_free_page( ut_error; } - if (xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)) { + if (xdes_mtr_get_bit(descr, XDES_FREE_BIT, + page % FSP_EXTENT_SIZE, mtr)) { + fprintf(stderr, "InnoDB: Error: File space extent descriptor" " of page %lu says it is free\n" @@ -1728,16 +1645,15 @@ fsp_seg_inode_page_find_free( ulint zip_size,/*!< in: compressed page size, or 0 */ mtr_t* mtr) /*!< in/out: mini-transaction */ { - fseg_inode_t* inode; - for (; i < FSP_SEG_INODES_PER_PAGE(zip_size); i++) { + fseg_inode_t* inode; + inode = fsp_seg_inode_page_get_nth_inode( page, i, zip_size, mtr); if (!mach_read_from_8(inode + FSEG_ID)) { /* This is unused */ - return(i); } @@ -1763,11 +1679,11 @@ fsp_alloc_seg_inode_page( page_t* page; ulint space; ulint zip_size; - ulint i; ut_ad(page_offset(space_header) == FSP_HEADER_OFFSET); space = page_get_space_id(page_align(space_header)); + zip_size = fsp_flags_get_zip_size( mach_read_from_4(FSP_SPACE_FLAGS + space_header)); @@ -1788,16 +1704,18 @@ fsp_alloc_seg_inode_page( mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_INODE, MLOG_2BYTES, mtr); - for (i = 0; i < FSP_SEG_INODES_PER_PAGE(zip_size); i++) { + for (ulint i = 0; i < FSP_SEG_INODES_PER_PAGE(zip_size); i++) { - inode = fsp_seg_inode_page_get_nth_inode(page, i, - zip_size, mtr); + inode = fsp_seg_inode_page_get_nth_inode( + page, i, zip_size, mtr); mlog_write_ull(inode + FSEG_ID, 0, mtr); } - flst_add_last(space_header + FSP_SEG_INODES_FREE, - page + FSEG_INODE_PAGE_NODE, mtr); + flst_add_last( + space_header + FSP_SEG_INODES_FREE, + page + FSEG_INODE_PAGE_NODE, mtr); + return(TRUE); } @@ -2486,8 +2404,8 @@ fseg_alloc_free_page_low( 
/*-------------------------------------------------------------*/ if ((xdes_get_state(descr, mtr) == XDES_FSEG) && mach_read_from_8(descr + XDES_ID) == seg_id - && (xdes_get_bit(descr, XDES_FREE_BIT, - hint % FSP_EXTENT_SIZE, mtr) == TRUE)) { + && (xdes_mtr_get_bit(descr, XDES_FREE_BIT, + hint % FSP_EXTENT_SIZE, mtr) == TRUE)) { take_hinted_page: /* 1. We can take the hinted page =================================*/ @@ -2652,10 +2570,12 @@ got_hinted_page: ut_ad(xdes_get_descriptor(space, zip_size, ret_page, mtr) == ret_descr); - ut_ad(xdes_get_bit(ret_descr, XDES_FREE_BIT, - ret_page % FSP_EXTENT_SIZE, mtr) == TRUE); - fseg_mark_page_used(seg_inode, space, zip_size, ret_page, mtr); + ut_ad(xdes_mtr_get_bit( + ret_descr, XDES_FREE_BIT, + ret_page % FSP_EXTENT_SIZE, mtr)); + + fseg_mark_page_used(seg_inode, ret_page, ret_descr, mtr); } return(fsp_page_create( @@ -3053,27 +2973,21 @@ fsp_get_available_space_in_free_extents( /********************************************************************//** Marks a page used. The page must reside within the extents of the given segment. 
*/ -static +static __attribute__((nonnull)) void fseg_mark_page_used( /*================*/ fseg_inode_t* seg_inode,/*!< in: segment inode */ - ulint space, /*!< in: space id */ - ulint zip_size,/*!< in: compressed page size in bytes - or 0 for uncompressed pages */ ulint page, /*!< in: page offset */ + xdes_t* descr, /*!< in: extent descriptor */ mtr_t* mtr) /*!< in/out: mini-transaction */ { - xdes_t* descr; ulint not_full_n_used; - ut_ad(seg_inode && mtr); ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); - descr = xdes_get_descriptor(space, zip_size, page, mtr); - ut_ad(mtr_read_ulint(seg_inode + FSEG_ID, MLOG_4BYTES, mtr) == mtr_read_ulint(descr + XDES_ID, MLOG_4BYTES, mtr)); @@ -3086,8 +3000,9 @@ fseg_mark_page_used( descr + XDES_FLST_NODE, mtr); } - ut_ad(xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr) - == TRUE); + ut_ad(xdes_mtr_get_bit( + descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)); + /* We mark the page as used */ xdes_set_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, FALSE, mtr); @@ -3142,8 +3057,8 @@ fseg_free_page_low( descr = xdes_get_descriptor(space, zip_size, page, mtr); - ut_a(descr); - if (xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)) { + if (xdes_mtr_get_bit(descr, XDES_FREE_BIT, + page % FSP_EXTENT_SIZE, mtr)) { fputs("InnoDB: Dump of the tablespace extent descriptor: ", stderr); ut_print_buf(stderr, descr, 40); @@ -3278,6 +3193,49 @@ fseg_free_page( } /**********************************************************************//** +Checks if a single page of a segment is free. 
+@return true if free */ +UNIV_INTERN +bool +fseg_page_is_free( +/*==============*/ + fseg_header_t* seg_header, /*!< in: segment header */ + ulint space, /*!< in: space id */ + ulint page) /*!< in: page offset */ +{ + mtr_t mtr; + ibool is_free; + ulint flags; + rw_lock_t* latch; + xdes_t* descr; + ulint zip_size; + fseg_inode_t* seg_inode; + + latch = fil_space_get_latch(space, &flags); + zip_size = dict_tf_get_zip_size(flags); + + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + seg_inode = fseg_inode_get(seg_header, space, zip_size, &mtr); + + ut_a(seg_inode); + ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) + == FSEG_MAGIC_N_VALUE); + ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + + descr = xdes_get_descriptor(space, zip_size, page, &mtr); + ut_a(descr); + + is_free = xdes_mtr_get_bit( + descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, &mtr); + + mtr_commit(&mtr); + + return(is_free); +} + +/**********************************************************************//** Frees an extent of a segment to the space free list. 
*/ static void @@ -3308,7 +3266,7 @@ fseg_free_extent( first_page_in_extent = page - (page % FSP_EXTENT_SIZE); for (i = 0; i < FSP_EXTENT_SIZE; i++) { - if (FALSE == xdes_get_bit(descr, XDES_FREE_BIT, i, mtr)) { + if (!xdes_mtr_get_bit(descr, XDES_FREE_BIT, i, mtr)) { /* Drop search system page hash index if the page is found in the pool and is hashed */ @@ -3388,9 +3346,9 @@ fseg_free_step( /* Check that the header resides on a page which has not been freed yet */ - ut_a(descr); - ut_a(xdes_get_bit(descr, XDES_FREE_BIT, - header_page % FSP_EXTENT_SIZE, mtr) == FALSE); + ut_a(xdes_mtr_get_bit(descr, XDES_FREE_BIT, + header_page % FSP_EXTENT_SIZE, mtr) == FALSE); + inode = fseg_inode_try_get(header, space, zip_size, mtr); if (UNIV_UNLIKELY(inode == NULL)) { diff --git a/storage/innobase/fts/fts0ast.cc b/storage/innobase/fts/fts0ast.cc index c01c43a021f..972f5acf461 100644 --- a/storage/innobase/fts/fts0ast.cc +++ b/storage/innobase/fts/fts0ast.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2007, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -98,9 +98,21 @@ fts_ast_create_node_text( void* arg, /*!< in: ast state instance */ const char* ptr) /*!< in: ast text string */ { + ulint len = strlen(ptr); + fts_ast_node_t* node = NULL; + + ut_ad(len >= 2); + + if (len == 2) { + ut_ad(ptr[0] == '\"'); + ut_ad(ptr[1] == '\"'); + return(NULL); + } + + node = fts_ast_node_create(); + /*!< We ignore the actual quotes "" */ - ulint len = strlen(ptr) - 2; - fts_ast_node_t* node = fts_ast_node_create(); + len -= 2; node->type = FTS_AST_TEXT; node->text.ptr = static_cast<byte*>(ut_malloc(len + 1)); @@ -381,34 +393,100 @@ fts_ast_node_print( } /******************************************************************//** -Traverse the AST - in-order traversal. +Traverse the AST - in-order traversal, except for the FTS_IGNORE +nodes, which will be ignored in the first pass of each level, and +visited in a second pass after all other nodes in the same level are visited. @return DB_SUCCESS if all went well */ UNIV_INTERN -ulint +dberr_t fts_ast_visit( /*==========*/ fts_ast_oper_t oper, /*!< in: current operator */ fts_ast_node_t* node, /*!< in: current root node */ fts_ast_callback visitor, /*!< in: callback function */ - void* arg) /*!< in: arg for callback */ + void* arg, /*!< in: arg for callback */ + bool* has_ignore) /*!< out: true, if the operator + was ignored during processing, + currently we only ignore + FTS_IGNORE operator */ { - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; + fts_ast_node_t* oper_node = NULL; + fts_ast_node_t* start_node; + bool revisit = false; + bool will_be_ignored = false; + + start_node = node->list.head; ut_a(node->type == FTS_AST_LIST || node->type == FTS_AST_SUBEXP_LIST); + /* In the first pass of the tree, at the leaf level of the + tree, FTS_IGNORE operation will be ignored. 
It will be + repeated at the level above the leaf level */ for (node = node->list.head; - node && error == DB_SUCCESS; + node && (error == DB_SUCCESS); node = node->next) { if (node->type == FTS_AST_LIST) { - error = fts_ast_visit(oper, node, visitor, arg); + error = fts_ast_visit(oper, node, visitor, + arg, &will_be_ignored); + + /* If will_be_ignored is set to true, then + we encountered and ignored a FTS_IGNORE operator, + and a second pass is needed to process FTS_IGNORE + operator */ + if (will_be_ignored) { + revisit = true; + } } else if (node->type == FTS_AST_SUBEXP_LIST) { error = fts_ast_visit_sub_exp(node, visitor, arg); } else if (node->type == FTS_AST_OPER) { oper = node->oper; + oper_node = node; } else { - visitor(oper, node, arg); + if (node->visited) { + continue; + } + + ut_a(oper == FTS_NONE || !oper_node + || oper_node->oper == oper); + + if (oper == FTS_IGNORE) { + *has_ignore = true; + /* Change the operator to FTS_IGNORE_SKIP, + so that it is processed in the second pass */ + oper_node->oper = FTS_IGNORE_SKIP; + continue; + } + + if (oper == FTS_IGNORE_SKIP) { + /* This must be the second pass, now we process + the FTS_IGNORE operator */ + visitor(FTS_IGNORE, node, arg); + } else { + visitor(oper, node, arg); + } + + node->visited = true; + } + } + + /* Second pass to process the skipped FTS_IGNORE operation. 
+ It is only performed at the level above leaf level */ + if (revisit) { + for (node = start_node; + node && error == DB_SUCCESS; + node = node->next) { + + if (node->type == FTS_AST_LIST) { + /* In this pass, it will process all those + operators ignored in the first pass, and those + whose operators are set to FTS_IGNORE_SKIP */ + error = fts_ast_visit( + oper, node, visitor, arg, + &will_be_ignored); + } } } diff --git a/storage/innobase/fts/fts0blex.cc b/storage/innobase/fts/fts0blex.cc index b3350010db0..1abd737ec06 100644 --- a/storage/innobase/fts/fts0blex.cc +++ b/storage/innobase/fts/fts0blex.cc @@ -35,7 +35,7 @@ #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, - * if you want the limit (max/min) macros for int types. + * if you want the limit (max/min) macros for int types. */ #ifndef __STDC_LIMIT_MACROS #define __STDC_LIMIT_MACROS 1 @@ -247,7 +247,7 @@ struct yy_buffer_state int yy_bs_lineno; /**< The line count. */ int yy_bs_column; /**< The column count. */ - + /* Whether to try to fill the input buffer when we reach the * end of it. 
*/ @@ -305,9 +305,9 @@ YY_BUFFER_STATE fts0b_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner YY_BUFFER_STATE fts0b_scan_string (yyconst char *yy_str ,yyscan_t yyscanner ); YY_BUFFER_STATE fts0b_scan_bytes (yyconst char *bytes,int len ,yyscan_t yyscanner ); -void *fts0balloc (yy_size_t , yyscan_t yyscanner __attribute__((unused)) ); -void *fts0brealloc (void *,yy_size_t , yyscan_t yyscanner __attribute__((unused)) ); -void fts0bfree (void * , yyscan_t yyscanner __attribute__((unused)) ); +void *fts0balloc (yy_size_t , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) ); +void *fts0brealloc (void *,yy_size_t , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) ); +void fts0bfree (void * , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) ); #define yy_new_buffer fts0b_create_buffer @@ -347,7 +347,7 @@ typedef int yy_state_type; static yy_state_type yy_get_previous_state (yyscan_t yyscanner ); static yy_state_type yy_try_NUL_trans (yy_state_type current_state ,yyscan_t yyscanner); static int yy_get_next_buffer (yyscan_t yyscanner ); -static void yy_fatal_error (yyconst char msg[] , yyscan_t yyscanner __attribute__((unused)) ); +static void yy_fatal_error (yyconst char msg[] , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) ); /* Done after the current pattern has been matched and before the * corresponding action - sets up yytext. 
@@ -368,10 +368,10 @@ struct yy_trans_info flex_int32_t yy_verify; flex_int32_t yy_nxt; }; -static yyconst flex_int16_t yy_accept[18] = +static yyconst flex_int16_t yy_accept[19] = { 0, - 4, 4, 8, 4, 1, 6, 1, 7, 2, 3, - 4, 1, 1, 0, 5, 3, 0 + 4, 4, 8, 4, 1, 6, 1, 7, 7, 2, + 3, 4, 1, 1, 0, 5, 3, 0 } ; static yyconst flex_int32_t yy_ec[256] = @@ -379,17 +379,17 @@ static yyconst flex_int32_t yy_ec[256] = 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 4, 1, 5, 1, 1, 1, 1, 1, 6, - 6, 6, 6, 1, 6, 1, 1, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 1, 1, 6, - 1, 6, 1, 6, 1, 1, 1, 1, 1, 1, + 1, 4, 1, 5, 1, 1, 6, 1, 1, 7, + 7, 7, 7, 1, 7, 1, 1, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 1, 1, 7, + 1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 6, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -406,35 +406,39 @@ static yyconst flex_int32_t yy_ec[256] = 1, 1, 1, 1, 1 } ; -static yyconst flex_int32_t yy_meta[8] = +static yyconst flex_int32_t yy_meta[9] = { 0, - 1, 2, 3, 4, 5, 5, 1 + 1, 2, 3, 4, 5, 5, 5, 1 } ; -static yyconst flex_int16_t yy_base[21] = +static yyconst flex_int16_t yy_base[22] = { 0, - 0, 0, 21, 0, 6, 22, 0, 13, 22, 7, - 0, 0, 0, 4, 22, 0, 22, 10, 11, 15 + 0, 0, 22, 0, 7, 23, 0, 14, 23, 23, + 7, 0, 0, 0, 5, 23, 0, 23, 11, 12, + 16 } ; -static yyconst flex_int16_t yy_def[21] = +static yyconst flex_int16_t yy_def[22] = { 0, - 17, 1, 17, 18, 18, 17, 19, 20, 17, 18, - 18, 5, 19, 20, 17, 10, 0, 17, 17, 17 + 18, 1, 18, 19, 19, 18, 20, 21, 18, 18, + 19, 19, 5, 20, 21, 18, 11, 0, 18, 18, + 18 } ; -static yyconst flex_int16_t yy_nxt[30] = +static yyconst flex_int16_t yy_nxt[32] = { 0, - 4, 5, 6, 7, 8, 9, 10, 12, 15, 13, - 11, 11, 13, 16, 13, 14, 14, 15, 14, 14, - 17, 3, 17, 17, 17, 17, 17, 17, 17 + 4, 5, 6, 7, 8, 
9, 10, 11, 13, 16, + 14, 12, 12, 14, 17, 14, 15, 15, 16, 15, + 15, 18, 3, 18, 18, 18, 18, 18, 18, 18, + 18 } ; -static yyconst flex_int16_t yy_chk[30] = +static yyconst flex_int16_t yy_chk[32] = { 0, - 1, 1, 1, 1, 1, 1, 1, 5, 14, 5, - 18, 18, 19, 10, 19, 20, 20, 8, 20, 20, - 3, 17, 17, 17, 17, 17, 17, 17, 17 + 1, 1, 1, 1, 1, 1, 1, 1, 5, 15, + 5, 19, 19, 20, 11, 20, 21, 21, 8, 21, + 21, 3, 18, 18, 18, 18, 18, 18, 18, 18, + 18 } ; /* The intent behind this definition is that it'll catch @@ -477,7 +481,7 @@ this program; if not, write to the Free Software Foundation, Inc., #define YY_DECL int fts_blexer(YYSTYPE* val, yyscan_t yyscanner) #define YY_NO_INPUT 1 -#line 480 "fts0blex.cc" +#line 484 "fts0blex.cc" #define INITIAL 0 @@ -575,11 +579,11 @@ extern int fts0bwrap (yyscan_t yyscanner ); #endif #ifndef yytext_ptr -static void yy_flex_strncpy (char *,yyconst char *,int , yyscan_t yyscanner __attribute__((unused))); +static void yy_flex_strncpy (char *,yyconst char *,int , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused))); #endif #ifdef YY_NEED_STRLEN -static int yy_flex_strlen (yyconst char * , yyscan_t yyscanner __attribute__((unused))); +static int yy_flex_strlen (yyconst char * , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused))); #endif #ifndef YY_NO_INPUT @@ -699,12 +703,12 @@ YY_DECL register yy_state_type yy_current_state; register char *yy_cp, *yy_bp; register int yy_act; - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; #line 43 "fts0blex.l" -#line 707 "fts0blex.cc" +#line 711 "fts0blex.cc" if ( !yyg->yy_init ) { @@ -757,13 +761,13 @@ yy_match: while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) { yy_current_state = (int) yy_def[yy_current_state]; - if ( yy_current_state >= 18 ) + if ( yy_current_state >= 19 ) yy_c = yy_meta[(unsigned int) yy_c]; } 
yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; ++yy_cp; } - while ( yy_current_state != 17 ); + while ( yy_current_state != 18 ); yy_cp = yyg->yy_last_accepting_cpos; yy_current_state = yyg->yy_last_accepting_state; @@ -835,7 +839,7 @@ YY_RULE_SETUP #line 73 "fts0blex.l" ECHO; YY_BREAK -#line 838 "fts0blex.cc" +#line 842 "fts0blex.cc" case YY_STATE_EOF(INITIAL): yyterminate(); @@ -978,7 +982,7 @@ case YY_STATE_EOF(INITIAL): */ static int yy_get_next_buffer (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf; register char *source = yyg->yytext_ptr; register int number_to_move, i; @@ -1044,9 +1048,9 @@ static int yy_get_next_buffer (yyscan_t yyscanner) else b->yy_buf_size *= 2; - b->yy_ch_buf = (char*) + b->yy_ch_buf = (char *) /* Include room in for 2 EOB chars. */ - fts0brealloc((void*) b->yy_ch_buf,b->yy_buf_size + 2 ,yyscanner ); + fts0brealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2 ,yyscanner ); } else /* Can't grow it, we don't own it. */ @@ -1095,7 +1099,7 @@ static int yy_get_next_buffer (yyscan_t yyscanner) if ((yy_size_t) (yyg->yy_n_chars + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) { /* Extend the array by 50%, plus the number we really need. */ yy_size_t new_size = yyg->yy_n_chars + number_to_move + (yyg->yy_n_chars >> 1); - YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char*) fts0brealloc((void*) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size ,yyscanner ); + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) fts0brealloc((void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size ,yyscanner ); if ( ! 
YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" ); } @@ -1115,7 +1119,7 @@ static int yy_get_next_buffer (yyscan_t yyscanner) { register yy_state_type yy_current_state; register char *yy_cp; - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; yy_current_state = yyg->yy_start; @@ -1130,7 +1134,7 @@ static int yy_get_next_buffer (yyscan_t yyscanner) while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) { yy_current_state = (int) yy_def[yy_current_state]; - if ( yy_current_state >= 18 ) + if ( yy_current_state >= 19 ) yy_c = yy_meta[(unsigned int) yy_c]; } yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; @@ -1147,7 +1151,7 @@ static int yy_get_next_buffer (yyscan_t yyscanner) static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state , yyscan_t yyscanner) { register int yy_is_jam; - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; /* This var may be unused depending upon options. */ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* This var may be unused depending upon options. */ register char *yy_cp = yyg->yy_c_buf_p; register YY_CHAR yy_c = 1; @@ -1159,11 +1163,11 @@ static int yy_get_next_buffer (yyscan_t yyscanner) while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) { yy_current_state = (int) yy_def[yy_current_state]; - if ( yy_current_state >= 18 ) + if ( yy_current_state >= 19 ) yy_c = yy_meta[(unsigned int) yy_c]; } yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; - yy_is_jam = (yy_current_state == 17); + yy_is_jam = (yy_current_state == 18); return yy_is_jam ? 
0 : yy_current_state; } @@ -1177,7 +1181,7 @@ static int yy_get_next_buffer (yyscan_t yyscanner) { int c; - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; *yyg->yy_c_buf_p = yyg->yy_hold_char; @@ -1235,7 +1239,7 @@ static int yy_get_next_buffer (yyscan_t yyscanner) } } - c = *(unsigned char*) yyg->yy_c_buf_p; /* cast for 8-bit char's */ + c = *(unsigned char *) yyg->yy_c_buf_p; /* cast for 8-bit char's */ *yyg->yy_c_buf_p = '\0'; /* preserve yytext */ yyg->yy_hold_char = *++yyg->yy_c_buf_p; @@ -1250,7 +1254,7 @@ static int yy_get_next_buffer (yyscan_t yyscanner) */ void fts0brestart (FILE * input_file , yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; if ( ! YY_CURRENT_BUFFER ){ fts0bensure_buffer_stack (yyscanner); @@ -1268,7 +1272,7 @@ static int yy_get_next_buffer (yyscan_t yyscanner) */ void fts0b_switch_to_buffer (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* TODO. We should be able to replace this entire function body * with @@ -1300,7 +1304,7 @@ static int yy_get_next_buffer (yyscan_t yyscanner) static void fts0b_load_buffer_state (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; yyg->yytext_ptr = yyg->yy_c_buf_p = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos; yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file; @@ -1316,7 +1320,7 @@ static void fts0b_load_buffer_state (yyscan_t yyscanner) YY_BUFFER_STATE fts0b_create_buffer (FILE * file, int size , yyscan_t yyscanner) { YY_BUFFER_STATE b; - + b = (YY_BUFFER_STATE) fts0balloc(sizeof( struct yy_buffer_state ) ,yyscanner ); if ( ! 
b ) YY_FATAL_ERROR( "out of dynamic memory in fts0b_create_buffer()" ); @@ -1326,7 +1330,7 @@ static void fts0b_load_buffer_state (yyscan_t yyscanner) /* yy_ch_buf has to be 2 characters longer than the size given because * we need to put in 2 end-of-buffer characters. */ - b->yy_ch_buf = (char*) fts0balloc(b->yy_buf_size + 2 ,yyscanner ); + b->yy_ch_buf = (char *) fts0balloc(b->yy_buf_size + 2 ,yyscanner ); if ( ! b->yy_ch_buf ) YY_FATAL_ERROR( "out of dynamic memory in fts0b_create_buffer()" ); @@ -1343,7 +1347,7 @@ static void fts0b_load_buffer_state (yyscan_t yyscanner) */ void fts0b_delete_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; if ( ! b ) return; @@ -1352,9 +1356,9 @@ static void fts0b_load_buffer_state (yyscan_t yyscanner) YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; if ( b->yy_is_our_buffer ) - fts0bfree((void*) b->yy_ch_buf ,yyscanner ); + fts0bfree((void *) b->yy_ch_buf ,yyscanner ); - fts0bfree((void*) b ,yyscanner ); + fts0bfree((void *) b ,yyscanner ); } /* Initializes or reinitializes a buffer. @@ -1365,7 +1369,7 @@ static void fts0b_load_buffer_state (yyscan_t yyscanner) { int oerrno = errno; - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; fts0b_flush_buffer(b ,yyscanner); @@ -1382,7 +1386,7 @@ static void fts0b_load_buffer_state (yyscan_t yyscanner) } b->yy_is_interactive = 0; - + errno = oerrno; } @@ -1392,7 +1396,7 @@ static void fts0b_load_buffer_state (yyscan_t yyscanner) */ void fts0b_flush_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; if ( ! 
b ) return; @@ -1422,7 +1426,7 @@ static void fts0b_load_buffer_state (yyscan_t yyscanner) */ void fts0bpush_buffer_state (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; if (new_buffer == NULL) return; @@ -1453,7 +1457,7 @@ void fts0bpush_buffer_state (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner) */ void fts0bpop_buffer_state (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; if (!YY_CURRENT_BUFFER) return; @@ -1474,7 +1478,7 @@ void fts0bpop_buffer_state (yyscan_t yyscanner) static void fts0bensure_buffer_stack (yyscan_t yyscanner) { int num_to_alloc; - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; if (!yyg->yy_buffer_stack) { @@ -1483,14 +1487,14 @@ static void fts0bensure_buffer_stack (yyscan_t yyscanner) * immediate realloc on the next call. */ num_to_alloc = 1; - yyg->yy_buffer_stack = (struct yy_buffer_state**) fts0balloc + yyg->yy_buffer_stack = (struct yy_buffer_state**)fts0balloc (num_to_alloc * sizeof(struct yy_buffer_state*) , yyscanner); if ( ! 
yyg->yy_buffer_stack ) YY_FATAL_ERROR( "out of dynamic memory in fts0bensure_buffer_stack()" ); - + memset(yyg->yy_buffer_stack, 0, num_to_alloc * sizeof(struct yy_buffer_state*)); - + yyg->yy_buffer_stack_max = num_to_alloc; yyg->yy_buffer_stack_top = 0; return; @@ -1502,7 +1506,7 @@ static void fts0bensure_buffer_stack (yyscan_t yyscanner) int grow_size = 8 /* arbitrary grow size */; num_to_alloc = yyg->yy_buffer_stack_max + grow_size; - yyg->yy_buffer_stack = (struct yy_buffer_state**) fts0brealloc + yyg->yy_buffer_stack = (struct yy_buffer_state**)fts0brealloc (yyg->yy_buffer_stack, num_to_alloc * sizeof(struct yy_buffer_state*) , yyscanner); @@ -1519,12 +1523,12 @@ static void fts0bensure_buffer_stack (yyscan_t yyscanner) * @param base the character buffer * @param size the size in bytes of the character buffer * @param yyscanner The scanner object. - * @return the newly allocated buffer state object. + * @return the newly allocated buffer state object. */ YY_BUFFER_STATE fts0b_scan_buffer (char * base, yy_size_t size , yyscan_t yyscanner) { YY_BUFFER_STATE b; - + if ( size < 2 || base[size-2] != YY_END_OF_BUFFER_CHAR || base[size-1] != YY_END_OF_BUFFER_CHAR ) @@ -1560,7 +1564,7 @@ YY_BUFFER_STATE fts0b_scan_buffer (char * base, yy_size_t size , yyscan_t yysc */ YY_BUFFER_STATE fts0b_scan_string (yyconst char * yystr , yyscan_t yyscanner) { - + return fts0b_scan_bytes(yystr,strlen(yystr) ,yyscanner); } @@ -1577,10 +1581,10 @@ YY_BUFFER_STATE fts0b_scan_bytes (yyconst char * yybytes, int _yybytes_len , y char *buf; yy_size_t n; int i; - + /* Get memory for full buffer, including space for trailing EOB's. */ n = _yybytes_len + 2; - buf = (char*) fts0balloc(n ,yyscanner ); + buf = (char *) fts0balloc(n ,yyscanner ); if ( ! 
buf ) YY_FATAL_ERROR( "out of dynamic memory in fts0b_scan_bytes()" ); @@ -1605,7 +1609,7 @@ YY_BUFFER_STATE fts0b_scan_bytes (yyconst char * yybytes, int _yybytes_len , y #define YY_EXIT_FAILURE 2 #endif -static void yy_fatal_error (yyconst char* msg , yyscan_t yyscanner __attribute__((unused))) +static void yy_fatal_error (yyconst char* msg , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused))) { (void) fprintf( stderr, "%s\n", msg ); exit( YY_EXIT_FAILURE ); @@ -1635,7 +1639,7 @@ static void yy_fatal_error (yyconst char* msg , yyscan_t yyscanner __attribute_ */ YY_EXTRA_TYPE fts0bget_extra (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; return yyextra; } @@ -1644,11 +1648,11 @@ YY_EXTRA_TYPE fts0bget_extra (yyscan_t yyscanner) */ int fts0bget_lineno (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; - + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if (! YY_CURRENT_BUFFER) return 0; - + return yylineno; } @@ -1657,11 +1661,11 @@ int fts0bget_lineno (yyscan_t yyscanner) */ int fts0bget_column (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; - + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if (! 
YY_CURRENT_BUFFER) return 0; - + return yycolumn; } @@ -1670,7 +1674,7 @@ int fts0bget_column (yyscan_t yyscanner) */ FILE *fts0bget_in (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; return yyin; } @@ -1679,7 +1683,7 @@ FILE *fts0bget_in (yyscan_t yyscanner) */ FILE *fts0bget_out (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; return yyout; } @@ -1688,7 +1692,7 @@ FILE *fts0bget_out (yyscan_t yyscanner) */ int fts0bget_leng (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; return yyleng; } @@ -1698,7 +1702,7 @@ int fts0bget_leng (yyscan_t yyscanner) char *fts0bget_text (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; return yytext; } @@ -1708,7 +1712,7 @@ char *fts0bget_text (yyscan_t yyscanner) */ void fts0bset_extra (YY_EXTRA_TYPE user_defined , yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; yyextra = user_defined ; } @@ -1718,12 +1722,12 @@ void fts0bset_extra (YY_EXTRA_TYPE user_defined , yyscan_t yyscanner) */ void fts0bset_lineno (int line_number , yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* lineno is only valid if an input buffer exists. */ if (! 
YY_CURRENT_BUFFER ) - yy_fatal_error( "fts0bset_lineno called with no buffer" , yyscanner); - + yy_fatal_error( "fts0bset_lineno called with no buffer" , yyscanner); + yylineno = line_number; } @@ -1733,12 +1737,12 @@ void fts0bset_lineno (int line_number , yyscan_t yyscanner) */ void fts0bset_column (int column_no , yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* column is only valid if an input buffer exists. */ if (! YY_CURRENT_BUFFER ) - yy_fatal_error( "fts0bset_column called with no buffer" , yyscanner); - + yy_fatal_error( "fts0bset_column called with no buffer" , yyscanner); + yycolumn = column_no; } @@ -1750,25 +1754,25 @@ void fts0bset_column (int column_no , yyscan_t yyscanner) */ void fts0bset_in (FILE * in_str , yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; yyin = in_str ; } void fts0bset_out (FILE * out_str , yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; yyout = out_str ; } int fts0bget_debug (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; return yy_flex_debug; } void fts0bset_debug (int bdebug , yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; yy_flex_debug = bdebug ; } @@ -1821,26 +1825,26 @@ int fts0blex_init_extra(YY_EXTRA_TYPE yy_user_defined,yyscan_t* ptr_yy_globals ) errno = EINVAL; return 1; } - + *ptr_yy_globals = (yyscan_t) fts0balloc ( sizeof( struct yyguts_t ), &dummy_yyguts ); - + if (*ptr_yy_globals == NULL){ errno = ENOMEM; return 1; } - + /* By setting to 0xAA, we expose bugs in yy_init_globals. Leave at 0x00 for releases. 
*/ memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t)); - + fts0bset_extra (yy_user_defined, *ptr_yy_globals); - + return yy_init_globals ( *ptr_yy_globals ); } static int yy_init_globals (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* Initialization is the same as for the non-reentrant scanner. * This function is called from fts0blex_destroy(), so don't allocate here. */ @@ -1848,7 +1852,7 @@ static int yy_init_globals (yyscan_t yyscanner) yyg->yy_buffer_stack = 0; yyg->yy_buffer_stack_top = 0; yyg->yy_buffer_stack_max = 0; - yyg->yy_c_buf_p = (char*) 0; + yyg->yy_c_buf_p = (char *) 0; yyg->yy_init = 0; yyg->yy_start = 0; @@ -1861,8 +1865,8 @@ static int yy_init_globals (yyscan_t yyscanner) yyin = stdin; yyout = stdout; #else - yyin = (FILE*) 0; - yyout = (FILE*) 0; + yyin = (FILE *) 0; + yyout = (FILE *) 0; #endif /* For future reference: Set errno on error, since we are called by @@ -1874,7 +1878,7 @@ static int yy_init_globals (yyscan_t yyscanner) /* fts0blex_destroy is for both reentrant and non-reentrant scanners. */ int fts0blex_destroy (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* Pop the buffer stack, destroying each element. 
*/ while(YY_CURRENT_BUFFER){ @@ -1906,7 +1910,7 @@ int fts0blex_destroy (yyscan_t yyscanner) */ #ifndef yytext_ptr -static void yy_flex_strncpy (char* s1, yyconst char * s2, int n , yyscan_t yyscanner __attribute__((unused))) +static void yy_flex_strncpy (char* s1, yyconst char * s2, int n , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused))) { register int i; for ( i = 0; i < n; ++i ) @@ -1915,7 +1919,7 @@ static void yy_flex_strncpy (char* s1, yyconst char * s2, int n , yyscan_t yysc #endif #ifdef YY_NEED_STRLEN -static int yy_flex_strlen (yyconst char * s , yyscan_t yyscanner __attribute__((unused))) +static int yy_flex_strlen (yyconst char * s , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused))) { register int n; for ( n = 0; s[n]; ++n ) @@ -1925,26 +1929,26 @@ static int yy_flex_strlen (yyconst char * s , yyscan_t yyscanner __attribute__( } #endif -void *fts0balloc (yy_size_t size , yyscan_t yyscanner __attribute__((unused))) +void *fts0balloc (yy_size_t size , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused))) { - return (void*) malloc( size ); + return (void *) malloc( size ); } -void *fts0brealloc (void * ptr, yy_size_t size , yyscan_t yyscanner __attribute__((unused))) +void *fts0brealloc (void * ptr, yy_size_t size , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused))) { - /* The cast to (char*) in the following accommodates both + /* The cast to (char *) in the following accommodates both * implementations that use char* generic pointers, and those * that use void* generic pointers. It works with the latter * because both ANSI C and C++ allow castless assignment from * any pointer type to void*, and deal with argument conversions * as though doing an assignment. 
*/ - return (void*) realloc( (char*) ptr, size ); + return (void *) realloc( (char *) ptr, size ); } -void fts0bfree (void * ptr , yyscan_t yyscanner __attribute__((unused))) +void fts0bfree (void * ptr , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused))) { - free( (char*) ptr ); /* see fts0brealloc() for (char*) cast */ + free( (char *) ptr ); /* see fts0brealloc() for (char *) cast */ } #define YYTABLES_NAME "yytables" diff --git a/storage/innobase/fts/fts0blex.l b/storage/innobase/fts/fts0blex.l index b84b0cea294..6193f0df187 100644 --- a/storage/innobase/fts/fts0blex.l +++ b/storage/innobase/fts/fts0blex.l @@ -56,7 +56,7 @@ this program; if not, write to the Free Software Foundation, Inc., return(FTS_NUMB); } -[^" \n*()+\-<>~@]* { +[^" \n*()+\-<>~@%]* { val->token = strdup(fts0bget_text(yyscanner)); return(FTS_TERM); diff --git a/storage/innobase/fts/fts0config.cc b/storage/innobase/fts/fts0config.cc index 3f849ef183c..9cac680101c 100644 --- a/storage/innobase/fts/fts0config.cc +++ b/storage/innobase/fts/fts0config.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2007, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -70,7 +70,7 @@ Get value from the config table. The caller must ensure that enough space is allocated for value to hold the column contents. 
@return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_config_get_value( /*=================*/ trx_t* trx, /*!< transaction */ @@ -83,7 +83,7 @@ fts_config_get_value( { pars_info_t* info; que_t* graph; - ulint error; + dberr_t error; ulint name_len = strlen(name); info = pars_info_create(); @@ -162,7 +162,7 @@ must ensure that enough space is allocated for value to hold the column contents. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_config_get_index_value( /*=======================*/ trx_t* trx, /*!< transaction */ @@ -173,7 +173,7 @@ fts_config_get_index_value( config table */ { char* name; - ulint error; + dberr_t error; fts_table_t fts_table; FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, @@ -193,7 +193,7 @@ fts_config_get_index_value( Set the value in the config table for name. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_config_set_value( /*=================*/ trx_t* trx, /*!< transaction */ @@ -206,7 +206,7 @@ fts_config_set_value( { pars_info_t* info; que_t* graph; - ulint error; + dberr_t error; undo_no_t undo_no; undo_no_t n_rows_updated; ulint name_len = strlen(name); @@ -262,7 +262,7 @@ fts_config_set_value( Set the value specific to an FTS index in the config table. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_config_set_index_value( /*=======================*/ trx_t* trx, /*!< transaction */ @@ -273,7 +273,7 @@ fts_config_set_index_value( config table */ { char* name; - ulint error; + dberr_t error; fts_table_t fts_table; FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, @@ -293,7 +293,7 @@ fts_config_set_index_value( Get an ulint value from the config table. 
@return DB_SUCCESS if all OK else error code */ UNIV_INTERN -ulint +dberr_t fts_config_get_index_ulint( /*=======================*/ trx_t* trx, /*!< in: transaction */ @@ -301,7 +301,7 @@ fts_config_get_index_ulint( const char* name, /*!< in: param name */ ulint* int_value) /*!< out: value */ { - ulint error; + dberr_t error; fts_string_t value; /* We set the length of value to the max bytes it can hold. This @@ -314,8 +314,8 @@ fts_config_get_index_ulint( if (UNIV_UNLIKELY(error != DB_SUCCESS)) { ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error: (%lu) reading `%s'\n", - error, name); + fprintf(stderr, " InnoDB: Error: (%s) reading `%s'\n", + ut_strerr(error), name); } else { *int_value = strtoul((char*) value.f_str, NULL, 10); } @@ -329,7 +329,7 @@ fts_config_get_index_ulint( Set an ulint value in the config table. @return DB_SUCCESS if all OK else error code */ UNIV_INTERN -ulint +dberr_t fts_config_set_index_ulint( /*=======================*/ trx_t* trx, /*!< in: transaction */ @@ -337,7 +337,7 @@ fts_config_set_index_ulint( const char* name, /*!< in: param name */ ulint int_value) /*!< in: value */ { - ulint error; + dberr_t error; fts_string_t value; /* We set the length of value to the max bytes it can hold. This @@ -356,8 +356,8 @@ fts_config_set_index_ulint( if (UNIV_UNLIKELY(error != DB_SUCCESS)) { ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error: (%lu) writing `%s'\n", - error, name); + fprintf(stderr, " InnoDB: Error: (%s) writing `%s'\n", + ut_strerr(error), name); } ut_free(value.f_str); @@ -369,7 +369,7 @@ fts_config_set_index_ulint( Get an ulint value from the config table. 
@return DB_SUCCESS if all OK else error code */ UNIV_INTERN -ulint +dberr_t fts_config_get_ulint( /*=================*/ trx_t* trx, /*!< in: transaction */ @@ -378,7 +378,7 @@ fts_config_get_ulint( const char* name, /*!< in: param name */ ulint* int_value) /*!< out: value */ { - ulint error; + dberr_t error; fts_string_t value; /* We set the length of value to the max bytes it can hold. This @@ -391,8 +391,8 @@ fts_config_get_ulint( if (UNIV_UNLIKELY(error != DB_SUCCESS)) { ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error: (%lu) reading `%s'\n", - error, name); + fprintf(stderr, " InnoDB: Error: (%s) reading `%s'\n", + ut_strerr(error), name); } else { *int_value = strtoul((char*) value.f_str, NULL, 10); } @@ -406,7 +406,7 @@ fts_config_get_ulint( Set an ulint value in the config table. @return DB_SUCCESS if all OK else error code */ UNIV_INTERN -ulint +dberr_t fts_config_set_ulint( /*=================*/ trx_t* trx, /*!< in: transaction */ @@ -415,7 +415,7 @@ fts_config_set_ulint( const char* name, /*!< in: param name */ ulint int_value) /*!< in: value */ { - ulint error; + dberr_t error; fts_string_t value; /* We set the length of value to the max bytes it can hold. This @@ -434,8 +434,8 @@ fts_config_set_ulint( if (UNIV_UNLIKELY(error != DB_SUCCESS)) { ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error: (%lu) writing `%s'\n", - error, name); + fprintf(stderr, " InnoDB: Error: (%s) writing `%s'\n", + ut_strerr(error), name); } ut_free(value.f_str); @@ -447,7 +447,7 @@ fts_config_set_ulint( Increment the value in the config table for column name. 
@return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_config_increment_value( /*=======================*/ trx_t* trx, /*!< transaction */ @@ -458,7 +458,7 @@ fts_config_increment_value( ulint delta) /*!< in: increment by this much */ { - ulint error; + dberr_t error; fts_string_t value; que_t* graph = NULL; ulint name_len = strlen(name); @@ -520,8 +520,8 @@ fts_config_increment_value( ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error: (%lu) " - "while incrementing %s.\n", error, name); + fprintf(stderr, " InnoDB: Error: (%s) " + "while incrementing %s.\n", ut_strerr(error), name); } ut_free(value.f_str); @@ -533,7 +533,7 @@ fts_config_increment_value( Increment the per index value in the config table for column name. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_config_increment_index_value( /*=============================*/ trx_t* trx, /*!< transaction */ @@ -544,7 +544,7 @@ fts_config_increment_index_value( much */ { char* name; - ulint error; + dberr_t error; fts_table_t fts_table; FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, diff --git a/storage/innobase/fts/fts0fts.cc b/storage/innobase/fts/fts0fts.cc index f716b980501..a81d3043e9c 100644 --- a/storage/innobase/fts/fts0fts.cc +++ b/storage/innobase/fts/fts0fts.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -36,12 +36,8 @@ Full Text Search interface #include "dict0priv.h" #include "dict0stats.h" #include "btr0pcur.h" -#include "row0row.h" -#include "ha_prototypes.h" -#ifdef UNIV_NONINL -#include "fts0priv.ic" -#endif +#include "ha_prototypes.h" #define FTS_MAX_ID_LEN 32 @@ -63,9 +59,6 @@ UNIV_INTERN ulong fts_min_token_size; ib_time_t elapsed_time = 0; ulint n_nodes = 0; -typedef struct fts_schema_struct fts_schema_t; -typedef struct fts_sys_table_struct fts_sys_table_t; - /** Error condition reported by fts_utf8_decode() */ const ulint UTF8_ERROR = 0xFFFFFFFF; @@ -142,7 +135,7 @@ const char *fts_default_stopword[] = }; /** For storing table info when checking for orphaned tables. */ -struct fts_sys_table_struct { +struct fts_aux_table_t { table_id_t id; /*!< Table id */ table_id_t parent_id; /*!< Parent table id */ table_id_t index_id; /*!< Table FT index id */ @@ -246,7 +239,7 @@ static const char* fts_config_table_insert_values_sql = FTS_OPTIMIZE_LIMIT_IN_SECS "', '180');\n" "" "INSERT INTO %s VALUES ('" - FTS_SYNCED_DOC_ID "', '1');\n" + FTS_SYNCED_DOC_ID "', '0');\n" "" "INSERT INTO %s VALUES ('" FTS_TOTAL_DELETED_COUNT "', '0');\n" @@ -257,12 +250,13 @@ static const char* fts_config_table_insert_values_sql = /****************************************************************//** Run SYNC on the table, i.e., write out data from the cache to the FTS auxiliary INDEX table and clear the cache at the end. -@return DB_SUCCESS if all OK */ +@return DB_SUCCESS if all OK */ static -ulint +dberr_t fts_sync( /*=====*/ - fts_sync_t* sync); /*!< in: sync state */ + fts_sync_t* sync) /*!< in: sync state */ + __attribute__((nonnull)); /****************************************************************//** Release all resources help by the words rb tree e.g., the node ilist. 
*/ @@ -270,7 +264,8 @@ static void fts_words_free( /*===========*/ - ib_rbt_t* words); /*!< in: rb tree of words */ + ib_rbt_t* words) /*!< in: rb tree of words */ + __attribute__((nonnull)); #ifdef FTS_CACHE_SIZE_DEBUG /****************************************************************//** Read the max cache size parameter from the config table. */ @@ -294,19 +289,35 @@ fts_add_doc_by_id( doc_id_t doc_id, /*!< in: doc id */ ib_vector_t* fts_indexes __attribute__((unused))); /*!< in: affected fts indexes */ +#ifdef FTS_DOC_STATS_DEBUG /****************************************************************//** Check whether a particular word (term) exists in the FTS index. @return DB_SUCCESS if all went fine */ static -ulint +dberr_t fts_is_word_in_index( /*=================*/ trx_t* trx, /*!< in: FTS query state */ que_t** graph, /*!< out: Query graph */ fts_table_t* fts_table, /*!< in: table instance */ const fts_string_t* word, /*!< in: the word to check */ - ibool* found); /*!< out: TRUE if exists */ + ibool* found) /*!< out: TRUE if exists */ + __attribute__((nonnull, warn_unused_result)); +#endif /* FTS_DOC_STATS_DEBUG */ +/******************************************************************//** +Update the last document id. This function could create a new +transaction to update the last document id. +@return DB_SUCCESS if OK */ +static +dberr_t +fts_update_sync_doc_id( +/*===================*/ + const dict_table_t* table, /*!< in: table */ + const char* table_name, /*!< in: table name, or NULL */ + doc_id_t doc_id, /*!< in: last document id */ + trx_t* trx) /*!< in: update trx, or NULL */ + __attribute__((nonnull(1))); /******************************************************************** Check if we should stop. 
*/ UNIV_INLINE @@ -443,7 +454,7 @@ fts_load_user_stopword( { pars_info_t* info; que_t* graph; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; ibool ret = TRUE; trx_t* trx; ibool has_lock = fts->fts_status & TABLE_DICT_LOCKED; @@ -507,9 +518,9 @@ fts_load_user_stopword( trx->error_state = DB_SUCCESS; } else { - fprintf(stderr, " InnoDB: Error: %lu " + fprintf(stderr, " InnoDB: Error '%s' " "while reading user stopword table.\n", - error); + ut_strerr(error)); ret = FALSE; break; } @@ -542,7 +553,7 @@ fts_index_cache_init( index_cache->words = rbt_create_arg_cmp( sizeof(fts_tokenizer_word_t), innobase_fts_text_cmp, - index_cache->charset); + (void*) index_cache->charset); ut_a(index_cache->doc_stats == NULL); @@ -670,7 +681,7 @@ fts_add_index( ib_vector_push(fts->indexes, &index); - index_cache = (fts_index_cache_t*) fts_find_index_cache(cache, index); + index_cache = fts_find_index_cache(cache, index); if (!index_cache) { /* Add new index cache structure */ @@ -805,7 +816,7 @@ fts_check_cached_index( Drop auxiliary tables related to an FTS index @return DB_SUCCESS or error number */ UNIV_INTERN -ulint +dberr_t fts_drop_index( /*===========*/ dict_table_t* table, /*!< in: Table where indexes are dropped */ @@ -813,7 +824,7 @@ fts_drop_index( trx_t* trx) /*!< in: Transaction for the drop */ { ib_vector_t* indexes = table->fts->indexes; - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; ut_a(indexes); @@ -821,6 +832,8 @@ fts_drop_index( && (index == static_cast<dict_index_t*>( ib_vector_getp(table->fts->indexes, 0)))) || ib_vector_is_empty(indexes)) { + doc_id_t current_doc_id; + doc_id_t first_doc_id; /* If we are dropping the only FTS index of the table, remove it from optimize thread */ @@ -844,17 +857,20 @@ fts_drop_index( return(err); } + current_doc_id = table->fts->cache->next_doc_id; + first_doc_id = table->fts->cache->first_doc_id; fts_cache_clear(table->fts->cache, TRUE); fts_cache_destroy(table->fts->cache); table->fts->cache = 
fts_cache_create(table); + table->fts->cache->next_doc_id = current_doc_id; + table->fts->cache->first_doc_id = first_doc_id; } else { fts_cache_t* cache = table->fts->cache; fts_index_cache_t* index_cache; rw_lock_x_lock(&cache->init_lock); - index_cache = (fts_index_cache_t*) fts_find_index_cache( - cache, index); + index_cache = fts_find_index_cache(cache, index); if (index_cache->words) { fts_words_free(index_cache->words); @@ -1215,7 +1231,7 @@ fts_tokenizer_word_get( if (rbt_search(cache->stopword_info.cached_stopword, &parent, text) == 0) { - return NULL; + return(NULL); } /* Check if we found a match, if not then add word to tree. */ @@ -1445,38 +1461,40 @@ fts_cache_add_doc( /****************************************************************//** Drops a table. If the table can't be found we return a SUCCESS code. @return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_drop_table( /*===========*/ trx_t* trx, /*!< in: transaction */ const char* table_name) /*!< in: table to drop */ { - ulint error = DB_SUCCESS; + dict_table_t* table; + dberr_t error = DB_SUCCESS; - /* Check that the table exists in our data dictionary. */ - if (dict_table_get_low(table_name)) { + /* Check that the table exists in our data dictionary. + Similar to regular drop table case, we will open table with + DICT_ERR_IGNORE_INDEX_ROOT and DICT_ERR_IGNORE_CORRUPT option */ + table = dict_table_open_on_name( + table_name, TRUE, FALSE, + static_cast<dict_err_ignore_t>( + DICT_ERR_IGNORE_INDEX_ROOT | DICT_ERR_IGNORE_CORRUPT)); -#ifdef FTS_INTERNAL_DIAG_PRINT - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Dropping %s\n", table_name); -#endif + if (table != 0) { - error = row_drop_table_for_mysql(table_name, trx, TRUE); + dict_table_close(table, TRUE, FALSE); + + /* Pass nonatomic=false (dont allow data dict unlock), + because the transaction may hold locks on SYS_* tables from + previous calls to fts_drop_table(). 
*/ + error = row_drop_table_for_mysql(table_name, trx, true, false); - /* We only return the status of the last error. */ if (error != DB_SUCCESS) { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error: (%lu) dropping " - "FTS index table %s\n", error, table_name); + ib_logf(IB_LOG_LEVEL_ERROR, + "Unable to drop FTS index aux table %s: %s", + table_name, ut_strerr(error)); } } else { - ut_print_timestamp(stderr); - - /* FIXME: Should provide appropriate error return code - rather than printing message indiscriminately. */ - fprintf(stderr, " InnoDB: %s not found.\n", - table_name); + error = DB_FAIL; } return(error); @@ -1487,8 +1505,8 @@ Drops the common ancillary tables needed for supporting an FTS index on the given table. row_mysql_lock_data_dictionary must have been called before this. @return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_drop_common_tables( /*===================*/ trx_t* trx, /*!< in: transaction */ @@ -1496,10 +1514,10 @@ fts_drop_common_tables( index */ { ulint i; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; for (i = 0; fts_common_tables[i] != NULL; ++i) { - ulint err; + dberr_t err; char* table_name; fts_table->suffix = fts_common_tables[i]; @@ -1509,7 +1527,7 @@ fts_drop_common_tables( err = fts_drop_table(trx, table_name); /* We only return the status of the last error. */ - if (err != DB_SUCCESS) { + if (err != DB_SUCCESS && err != DB_FAIL) { error = err; } @@ -1520,11 +1538,11 @@ fts_drop_common_tables( } /****************************************************************//** -Since we do a horizontal split on the index table, we need to drop the +Since we do a horizontal split on the index table, we need to drop all the split tables. 
@return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_drop_index_split_tables( /*========================*/ trx_t* trx, /*!< in: transaction */ @@ -1533,12 +1551,12 @@ fts_drop_index_split_tables( { ulint i; fts_table_t fts_table; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; FTS_INIT_INDEX_TABLE(&fts_table, NULL, FTS_INDEX_TABLE, index); for (i = 0; fts_index_selector[i].value; ++i) { - ulint err; + dberr_t err; char* table_name; fts_table.suffix = fts_get_suffix(i); @@ -1548,7 +1566,7 @@ fts_drop_index_split_tables( err = fts_drop_table(trx, table_name); /* We only return the status of the last error. */ - if (err != DB_SUCCESS) { + if (err != DB_SUCCESS && err != DB_FAIL) { error = err; } @@ -1562,23 +1580,21 @@ fts_drop_index_split_tables( Drops FTS auxiliary tables for an FTS index @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_drop_index_tables( /*==================*/ trx_t* trx, /*!< in: transaction */ dict_index_t* index) /*!< in: Index to drop */ { - ulint err; - ulint error = DB_SUCCESS; fts_table_t fts_table; - ulint j; + dberr_t error = DB_SUCCESS; static const char* index_tables[] = { "DOC_ID", NULL }; - err = fts_drop_index_split_tables(trx, index); + dberr_t err = fts_drop_index_split_tables(trx, index); /* We only return the status of the last error. */ if (err != DB_SUCCESS) { @@ -1587,18 +1603,17 @@ fts_drop_index_tables( FTS_INIT_INDEX_TABLE(&fts_table, NULL, FTS_INDEX_TABLE, index); - for (j = 0; index_tables[j] != NULL; ++j) { - ulint err; + for (ulint i = 0; index_tables[i] != NULL; ++i) { char* table_name; - fts_table.suffix = index_tables[j]; + fts_table.suffix = index_tables[i]; table_name = fts_get_table_name(&fts_table); err = fts_drop_table(trx, table_name); /* We only return the status of the last error. 
*/ - if (err != DB_SUCCESS) { + if (err != DB_SUCCESS && err != DB_FAIL) { error = err; } @@ -1613,18 +1628,20 @@ Drops FTS ancillary tables needed for supporting an FTS index on the given table. row_mysql_lock_data_dictionary must have been called before this. @return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_drop_all_index_tables( /*======================*/ trx_t* trx, /*!< in: transaction */ fts_t* fts) /*!< in: fts instance */ { - ulint i; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; - for (i = 0; i < ib_vector_size(fts->indexes); ++i) { - ulint err; + for (ulint i = 0; + fts->indexes != 0 && i < ib_vector_size(fts->indexes); + ++i) { + + dberr_t err; dict_index_t* index; index = static_cast<dict_index_t*>( @@ -1646,17 +1663,19 @@ given table. row_mysql_lock_data_dictionary must have been called before this. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_drop_tables( /*============*/ trx_t* trx, /*!< in: transaction */ dict_table_t* table) /*!< in: table has the FTS index */ { - ulint error; + dberr_t error; fts_table_t fts_table; FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table); + /* TODO: This is not atomic and can cause problems during recovery. */ + error = fts_drop_common_tables(trx, &fts_table); if (error == DB_SUCCESS) { @@ -1692,20 +1711,20 @@ on the given table. row_mysql_lock_data_dictionary must have been called before this. 
@return DB_SUCCESS if succeed */ UNIV_INTERN -ulint +dberr_t fts_create_common_tables( /*=====================*/ - trx_t* trx, /*!< in: transaction */ - const dict_table_t* table, /*!< in: table with FTS index */ - const char* name, /*!< in: table name normalized.*/ - ibool skip_doc_id_index) /*!< in: Skip index on doc id */ - + trx_t* trx, /*!< in: transaction */ + const dict_table_t* table, /*!< in: table with FTS index */ + const char* name, /*!< in: table name normalized.*/ + bool skip_doc_id_index)/*!< in: Skip index on doc id */ { char* sql; - ulint error; + dberr_t error; que_t* graph; fts_table_t fts_table; mem_heap_t* heap = mem_heap_create(1024); + pars_info_t* info; FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table); @@ -1744,17 +1763,23 @@ fts_create_common_tables( goto func_exit; } + info = pars_info_create(); + + pars_info_bind_id(info, TRUE, "table_name", name); + pars_info_bind_id(info, TRUE, "index_name", FTS_DOC_ID_INDEX_NAME); + pars_info_bind_id(info, TRUE, "doc_id_col_name", FTS_DOC_ID_COL_NAME); + /* Create the FTS DOC_ID index on the hidden column. Currently this is common for any FT index created on the table. 
*/ graph = fts_parse_sql_no_dict_lock( NULL, - NULL, + info, mem_heap_printf( heap, "BEGIN\n" "" - "CREATE UNIQUE INDEX %s ON %s(%s);\n", - FTS_DOC_ID_INDEX_NAME, name, FTS_DOC_ID_COL_NAME)); + "CREATE UNIQUE INDEX $index_name ON $table_name(" + "$doc_id_col_name);\n")); error = fts_eval_sql(trx, graph); que_graph_free(graph); @@ -1794,7 +1819,7 @@ fts_create_one_index_table( dict_field_t* field; dict_table_t* new_table = NULL; char* table_name = fts_get_table_name(fts_table); - ulint error; + dberr_t error; CHARSET_INFO* charset; ut_ad(index->type & DICT_FTS); @@ -1828,14 +1853,14 @@ fts_create_one_index_table( dict_mem_table_add_col(new_table, heap, "ilist", DATA_BLOB, 4130048, 0); - error = row_create_table_for_mysql(new_table, trx); + error = row_create_table_for_mysql(new_table, trx, true); if (error != DB_SUCCESS) { - trx->error_state = static_cast<db_err>(error); + trx->error_state = error; dict_mem_table_free(new_table); new_table = NULL; - fprintf(stderr, " InnoDB: Warning: Fail to create FTS " - " index table %s \n", table_name); + ib_logf(IB_LOG_LEVEL_WARN, + "Fail to create FTS index table %s", table_name); } mem_free(table_name); @@ -1848,7 +1873,7 @@ Wrapper function of fts_create_index_tables_low(), create auxiliary tables for an FTS index @return: DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_create_index_tables_low( /*========================*/ trx_t* trx, /*!< in: transaction */ @@ -1862,7 +1887,7 @@ fts_create_index_tables_low( char* sql; que_t* graph; fts_table_t fts_table; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; mem_heap_t* heap = mem_heap_create(1024); fts_table.type = FTS_INDEX_TABLE; @@ -1874,6 +1899,7 @@ fts_create_index_tables_low( /* Create the FTS auxiliary tables that are specific to an FTS index. 
*/ sql = fts_prepare_sql(&fts_table, fts_create_index_tables_sql); + graph = fts_parse_sql_no_dict_lock(NULL, NULL, sql); mem_free(sql); @@ -1903,9 +1929,7 @@ fts_create_index_tables_low( que_graph_free(graph); } - if (error == DB_SUCCESS) { - error = fts_sql_commit(trx); - } else { + if (error != DB_SUCCESS) { /* We have special error handling here */ trx->error_state = DB_SUCCESS; @@ -1928,18 +1952,25 @@ FTS index on the given table. row_mysql_lock_data_dictionary must have been called before this. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_create_index_tables( /*====================*/ trx_t* trx, /*!< in: transaction */ const dict_index_t* index) /*!< in: the index instance */ { + dberr_t err; dict_table_t* table; table = dict_table_get_low(index->table_name); ut_a(table != NULL); - return(fts_create_index_tables_low(trx, index, table->name, table->id)); + err = fts_create_index_tables_low(trx, index, table->name, table->id); + + if (err == DB_SUCCESS) { + trx_commit(trx); + } + + return(err); } #if 0 /******************************************************************//** @@ -1953,22 +1984,22 @@ fts_get_state_str( { switch (state) { case FTS_INSERT: - return "INSERT"; + return("INSERT"); case FTS_MODIFY: - return "MODIFY"; + return("MODIFY"); case FTS_DELETE: - return "DELETE"; + return("DELETE"); case FTS_NOTHING: - return "NOTHING"; + return("NOTHING"); case FTS_INVALID: - return "INVALID"; + return("INVALID"); default: - return "UNKNOWN"; + return("UNKNOWN"); } } #endif @@ -2321,7 +2352,7 @@ fts_get_max_cache_size( trx_t* trx, /*!< in: transaction */ fts_table_t* fts_table) /*!< in: table instance */ { - ulint error; + dberr_t error; fts_string_t value; ulint cache_size_in_mb; @@ -2381,32 +2412,19 @@ fts_get_max_cache_size( } #endif -/*********************************************************************//** -Get the total number of documents in the FTS. 
-@return estimated number of rows in the table */ -UNIV_INTERN -ulint -fts_get_total_document_count( -/*=========================*/ - dict_table_t* table) /*!< in: table instance */ -{ - ut_ad(table->stat_initialized); - - return((ulint) table->stat_n_rows); -} - +#ifdef FTS_DOC_STATS_DEBUG /*********************************************************************//** Get the total number of words in the FTS for a particular FTS index. @return DB_SUCCESS if all OK else error code */ UNIV_INTERN -ulint +dberr_t fts_get_total_word_count( /*=====================*/ trx_t* trx, /*!< in: transaction */ dict_index_t* index, /*!< in: for this index */ ulint* total) /* out: total words */ { - ulint error; + dberr_t error; fts_string_t value; *total = 0; @@ -2426,14 +2444,15 @@ fts_get_total_word_count( *total = strtoul((char*) value.f_str, NULL, 10); } else { ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error: (%lu) reading total words " - "value from config table\n", error); + fprintf(stderr, " InnoDB: Error: (%s) reading total words " + "value from config table\n", ut_strerr(error)); } ut_free(value.f_str); return(error); } +#endif /* FTS_DOC_STATS_DEBUG */ /*********************************************************************//** Update the next and last Doc ID in the CONFIG table to be the input @@ -2443,8 +2462,9 @@ UNIV_INTERN void fts_update_next_doc_id( /*===================*/ + trx_t* trx, /*!< in/out: transaction */ const dict_table_t* table, /*!< in: table */ - const char* table_name, /*!< in: table name */ + const char* table_name, /*!< in: table name, or NULL */ doc_id_t doc_id) /*!< in: DOC ID to set */ { table->fts->cache->synced_doc_id = doc_id; @@ -2453,7 +2473,7 @@ fts_update_next_doc_id( table->fts->cache->first_doc_id = table->fts->cache->next_doc_id; fts_update_sync_doc_id( - table, table_name, table->fts->cache->synced_doc_id, NULL); + table, table_name, table->fts->cache->synced_doc_id, trx); } @@ -2461,7 +2481,7 @@ fts_update_next_doc_id( Get 
the next available document id. @return DB_SUCCESS if OK */ UNIV_INTERN -ulint +dberr_t fts_get_next_doc_id( /*================*/ const dict_table_t* table, /*!< in: table */ @@ -2494,8 +2514,8 @@ fts_get_next_doc_id( This function fetch the Doc ID from CONFIG table, and compare with the Doc ID supplied. And store the larger one to the CONFIG table. @return DB_SUCCESS if OK */ -UNIV_INTERN -ulint +static __attribute__((nonnull)) +dberr_t fts_cmp_set_sync_doc_id( /*====================*/ const dict_table_t* table, /*!< in: table */ @@ -2509,7 +2529,7 @@ fts_cmp_set_sync_doc_id( { trx_t* trx; pars_info_t* info; - ulint error; + dberr_t error; fts_table_t fts_table; que_t* graph = NULL; fts_cache_t* cache = table->fts->cache; @@ -2559,8 +2579,6 @@ retry: goto func_exit; } - ut_a(*doc_id > 0); - if (read_only) { goto func_exit; } @@ -2594,8 +2612,8 @@ func_exit: *doc_id = 0; ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error: (%lu) " - "while getting next doc id.\n", error); + fprintf(stderr, " InnoDB: Error: (%s) " + "while getting next doc id.\n", ut_strerr(error)); fts_sql_rollback(trx); @@ -2614,23 +2632,23 @@ func_exit: Update the last document id. This function could create a new transaction to update the last document id. 
@return DB_SUCCESS if OK */ -UNIV_INTERN -ulint +static +dberr_t fts_update_sync_doc_id( /*===================*/ const dict_table_t* table, /*!< in: table */ - const char* table_name, /*!< in: table name */ + const char* table_name, /*!< in: table name, or NULL */ doc_id_t doc_id, /*!< in: last document id */ - trx_t* trx) /*!< in: update trx */ + trx_t* trx) /*!< in: update trx, or NULL */ { byte id[FTS_MAX_ID_LEN]; pars_info_t* info; fts_table_t fts_table; ulint id_len; que_t* graph = NULL; - ulint error; + dberr_t error; ibool local_trx = FALSE; - fts_cache_t* cache = table->fts->cache;; + fts_cache_t* cache = table->fts->cache; fts_table.suffix = "CONFIG"; fts_table.table_id = table->id; @@ -2651,8 +2669,7 @@ fts_update_sync_doc_id( info = pars_info_create(); - // FIXME: Get rid of snprintf - id_len = snprintf( + id_len = ut_snprintf( (char*) id, sizeof(id), FTS_DOC_ID_FORMAT, doc_id + 1); pars_info_bind_varchar_literal(info, "doc_id", id, id_len); @@ -2672,9 +2689,10 @@ fts_update_sync_doc_id( fts_sql_commit(trx); cache->synced_doc_id = doc_id; } else { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error: (%lu) " - "while updating last doc id.\n", error); + + ib_logf(IB_LOG_LEVEL_ERROR, + "(%s) while updating last doc id.", + ut_strerr(error)); fts_sql_rollback(trx); } @@ -2725,15 +2743,15 @@ fts_doc_ids_free( /*********************************************************************//** Do commit-phase steps necessary for the insertion of a new row. 
@return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_add( /*====*/ fts_trx_table_t*ftt, /*!< in: FTS trx table */ fts_trx_row_t* row) /*!< in: row */ { dict_table_t* table = ftt->table; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; doc_id_t doc_id = row->doc_id; ut_a(row->state == FTS_INSERT || row->state == FTS_MODIFY); @@ -2757,8 +2775,8 @@ fts_add( /*********************************************************************//** Do commit-phase steps necessary for the deletion of a row. @return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_delete( /*=======*/ fts_trx_table_t*ftt, /*!< in: FTS trx table */ @@ -2766,7 +2784,7 @@ fts_delete( { que_t* graph; fts_table_t fts_table; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; doc_id_t write_doc_id; dict_table_t* table = ftt->table; doc_id_t doc_id = row->doc_id; @@ -2848,14 +2866,14 @@ fts_delete( /*********************************************************************//** Do commit-phase steps necessary for the modification of a row. @return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_modify( /*=======*/ fts_trx_table_t* ftt, /*!< in: FTS trx table */ fts_trx_row_t* row) /*!< in: row */ { - ulint error; + dberr_t error; ut_a(row->state == FTS_MODIFY); @@ -2872,7 +2890,7 @@ fts_modify( Create a new document id. @return DB_SUCCESS if all went well else error */ UNIV_INTERN -ulint +dberr_t fts_create_doc_id( /*==============*/ dict_table_t* table, /*!< in: row is of this table. 
*/ @@ -2882,7 +2900,7 @@ fts_create_doc_id( mem_heap_t* heap) /*!< in: heap */ { doc_id_t doc_id; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; ut_a(table->fts->doc_col != ULINT_UNDEFINED); @@ -2919,15 +2937,15 @@ fts_create_doc_id( The given transaction is about to be committed; do whatever is necessary from the FTS system's POV. @return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_commit_table( /*=============*/ fts_trx_table_t* ftt) /*!< in: FTS table to commit*/ { const ib_rbt_node_t* node; ib_rbt_t* rows; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; fts_cache_t* cache = ftt->table->fts->cache; trx_t* trx = trx_allocate_for_background(); @@ -2979,13 +2997,13 @@ The given transaction is about to be committed; do whatever is necessary from the FTS system's POV. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_commit( /*=======*/ trx_t* trx) /*!< in: transaction */ { const ib_rbt_node_t* node; - ulint error; + dberr_t error; ib_rbt_t* tables; fts_savepoint_t* savepoint; @@ -3008,10 +3026,9 @@ fts_commit( } /*********************************************************************//** -Create a new empty document. -@return new document */ +Initialize a document. */ UNIV_INTERN -fts_doc_t* +void fts_doc_init( /*=========*/ fts_doc_t* doc) /*!< in: doc to initialize */ @@ -3021,8 +3038,6 @@ fts_doc_init( memset(doc, 0, sizeof(*doc)); doc->self_heap = ib_heap_allocator_create(heap); - - return(doc); } /*********************************************************************//** @@ -3075,7 +3090,7 @@ fts_fetch_row_id( /*********************************************************************//** Callback function for fetch that stores the text of an FTS document, converting each column to UTF-16. 
-@return: always returns FALSE */ +@return always FALSE */ UNIV_INTERN ibool fts_query_expansion_fetch_doc( @@ -3467,13 +3482,15 @@ fts_get_max_doc_id( dfield = dict_index_get_nth_field(index, 0); +#if 0 /* This can fail when renaming a column to FTS_DOC_ID_COL_NAME. */ ut_ad(innobase_strcasecmp(FTS_DOC_ID_COL_NAME, dfield->name) == 0); +#endif mtr_start(&mtr); /* fetch the largest indexes value */ btr_pcur_open_at_index_side( - FALSE, index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr); + false, index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr); if (page_get_n_recs(btr_pcur_get_page(&pcur)) > 0) { const rec_t* rec = NULL; @@ -3516,13 +3533,14 @@ func_exit: Fetch document with the given document id. @return DB_SUCCESS if OK else error */ UNIV_INTERN -ulint +dberr_t fts_doc_fetch_by_doc_id( /*====================*/ fts_get_doc_t* get_doc, /*!< in: state */ doc_id_t doc_id, /*!< in: id of document to fetch */ - dict_index_t* index_to_use, /*!< in: caller supplied FTS index */ + dict_index_t* index_to_use, /*!< in: caller supplied FTS index, + or NULL */ ulint option, /*!< in: search option, if it is greater than doc_id or equal */ fts_sql_callback @@ -3530,7 +3548,7 @@ fts_doc_fetch_by_doc_id( void* arg) /*!< in: callback arg */ { pars_info_t* info; - ulint error; + dberr_t error; const char* select_str; doc_id_t write_doc_id; dict_index_t* index; @@ -3555,6 +3573,7 @@ fts_doc_fetch_by_doc_id( pars_info_bind_function(info, "my_func", callback, arg); select_str = fts_get_select_columns_str(index, info, info->heap); + pars_info_bind_id(info, TRUE, "table_name", index->table_name); if (!get_doc || !get_doc->get_document_graph) { if (option == FTS_FETCH_DOC_BY_ID_EQUAL) { @@ -3564,7 +3583,7 @@ fts_doc_fetch_by_doc_id( mem_heap_printf(info->heap, "DECLARE FUNCTION my_func;\n" "DECLARE CURSOR c IS" - " SELECT %s FROM %s" + " SELECT %s FROM $table_name" " WHERE %s = :doc_id;\n" "BEGIN\n" "" @@ -3576,20 +3595,32 @@ fts_doc_fetch_by_doc_id( " END IF;\n" "END LOOP;\n" "CLOSE c;", - 
select_str, index->table_name, - FTS_DOC_ID_COL_NAME)); + select_str, FTS_DOC_ID_COL_NAME)); } else { ut_ad(option == FTS_FETCH_DOC_BY_ID_LARGE); + /* This is used for crash recovery of table with + hidden DOC ID or FTS indexes. We will scan the table + to re-processing user table rows whose DOC ID or + FTS indexed documents have not been sync-ed to disc + during recent crash. + In the case that all fulltext indexes are dropped + for a table, we will keep the "hidden" FTS_DOC_ID + column, and this scan is to retreive the largest + DOC ID being used in the table to determine the + appropriate next DOC ID. + In the case of there exists fulltext index(es), this + operation will re-tokenize any docs that have not + been sync-ed to the disk, and re-prime the FTS + cached */ graph = fts_parse_sql( NULL, info, mem_heap_printf(info->heap, "DECLARE FUNCTION my_func;\n" "DECLARE CURSOR c IS" - " SELECT %s, %s FROM %s" - " WHERE %s > :doc_id" - " ORDER BY %s;\n" + " SELECT %s, %s FROM $table_name" + " WHERE %s > :doc_id;\n" "BEGIN\n" "" "OPEN c;\n" @@ -3601,9 +3632,7 @@ fts_doc_fetch_by_doc_id( "END LOOP;\n" "CLOSE c;", FTS_DOC_ID_COL_NAME, - select_str, index->table_name, - FTS_DOC_ID_COL_NAME, - FTS_DOC_ID_COL_NAME)); + select_str, FTS_DOC_ID_COL_NAME)); } if (get_doc) { get_doc->get_document_graph = graph; @@ -3633,7 +3662,7 @@ fts_doc_fetch_by_doc_id( Write out a single word's data as new entry/entries in the INDEX table. @return DB_SUCCESS if all OK. */ UNIV_INTERN -ulint +dberr_t fts_write_node( /*===========*/ trx_t* trx, /*!< in: transaction */ @@ -3643,7 +3672,7 @@ fts_write_node( fts_node_t* node) /*!< in: node columns */ { pars_info_t* info; - ulint error; + dberr_t error; ib_uint32_t doc_count; ib_time_t start_time; doc_id_t last_doc_id; @@ -3698,8 +3727,8 @@ fts_write_node( /*********************************************************************//** Add rows to the DELETED_CACHE table. 
@return DB_SUCCESS if all went well else error code*/ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_sync_add_deleted_cache( /*=======================*/ fts_sync_t* sync, /*!< in: sync state */ @@ -3710,7 +3739,7 @@ fts_sync_add_deleted_cache( que_t* graph; fts_table_t fts_table; doc_id_t dummy = 0; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; ulint n_elems = ib_vector_size(doc_ids); ut_a(ib_vector_size(doc_ids) > 0); @@ -3748,9 +3777,10 @@ fts_sync_add_deleted_cache( } /*********************************************************************//** -Write the words and ilist to disk.*/ -static -ulint +Write the words and ilist to disk. +@return DB_SUCCESS if all went well else error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_sync_write_words( /*=================*/ trx_t* trx, /*!< in: transaction */ @@ -3761,10 +3791,12 @@ fts_sync_write_words( ulint n_nodes = 0; ulint n_words = 0; const ib_rbt_node_t* rbt_node; - ulint n_new_words = 0; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; ibool print_error = FALSE; +#ifdef FTS_DOC_STATS_DEBUG dict_table_t* table = index_cache->index->table; + ulint n_new_words = 0; +#endif /* FTS_DOC_STATS_DEBUG */ FTS_INIT_INDEX_TABLE( &fts_table, NULL, FTS_INDEX_TABLE, index_cache->index); @@ -3789,9 +3821,10 @@ fts_sync_write_words( fts_table.suffix = fts_get_suffix(selected); +#ifdef FTS_DOC_STATS_DEBUG /* Check if the word exists in the FTS index and if not then we need to increment the total word count stats. 
*/ - if (error == DB_SUCCESS) { + if (error == DB_SUCCESS && fts_enable_diag_print) { ibool found = FALSE; error = fts_is_word_in_index( @@ -3805,6 +3838,7 @@ fts_sync_write_words( ++n_new_words; } } +#endif /* FTS_DOC_STATS_DEBUG */ n_nodes += ib_vector_size(word->nodes); @@ -3829,9 +3863,9 @@ fts_sync_write_words( if (error != DB_SUCCESS && !print_error) { ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error (%lu) writing " + fprintf(stderr, " InnoDB: Error (%s) writing " "word node to FTS auxiliary index " - "table.\n", error); + "table.\n", ut_strerr(error)); print_error = TRUE; } @@ -3840,19 +3874,23 @@ fts_sync_write_words( ut_free(rbt_remove_node(index_cache->words, rbt_node)); } - if (error == DB_SUCCESS && n_new_words > 0) { +#ifdef FTS_DOC_STATS_DEBUG + if (error == DB_SUCCESS && n_new_words > 0 && fts_enable_diag_print) { fts_table_t fts_table; FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table); /* Increment the total number of words in the FTS index */ - fts_config_increment_index_value( + error = fts_config_increment_index_value( trx, index_cache->index, FTS_TOTAL_WORD_COUNT, n_new_words); } +#endif /* FTS_DOC_STATS_DEBUG */ - printf("Avg number of nodes: %lf\n", - (double) n_nodes / (double) (n_words > 1 ? n_words : 1)); + if (fts_enable_diag_print) { + printf("Avg number of nodes: %lf\n", + (double) n_nodes / (double) (n_words > 1 ? n_words : 1)); + } return(error); } @@ -3861,8 +3899,8 @@ fts_sync_write_words( /*********************************************************************//** Write a single documents statistics to disk. 
@return DB_SUCCESS if all went well else error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_sync_write_doc_stat( /*====================*/ trx_t* trx, /*!< in: transaction */ @@ -3872,7 +3910,7 @@ fts_sync_write_doc_stat( { pars_info_t* info; doc_id_t doc_id; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; ib_uint32_t word_count; if (*graph) { @@ -3918,9 +3956,9 @@ fts_sync_write_doc_stat( trx->error_state = DB_SUCCESS; } else { - fprintf(stderr, " InnoDB: Error: %lu " + fprintf(stderr, " InnoDB: Error: (%s) " "while writing to FTS doc_id.\n", - error); + ut_strerr(error)); break; /* Exit the loop. */ } @@ -3940,7 +3978,7 @@ fts_sync_write_doc_stats( trx_t* trx, /*!< in: transaction */ const fts_index_cache_t*index_cache) /*!< in: index cache */ { - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; que_t* graph = NULL; fts_doc_stats_t* doc_stat; @@ -3973,7 +4011,6 @@ fts_sync_write_doc_stats( return(error); } -#endif /* FTS_DOC_STATS_DEBUG */ /*********************************************************************//** Callback to check the existince of a word. @@ -4007,13 +4044,12 @@ fts_lookup_word( } /*********************************************************************//** -Check whether a particular word (term) exists in the FTS index. */ +Check whether a particular word (term) exists in the FTS index. 
+@return DB_SUCCESS if all went well else error code */ static -ulint +dberr_t fts_is_word_in_index( /*=================*/ - /* out: DB_SUCCESS if all went - well else error code */ trx_t* trx, /*!< in: FTS query state */ que_t** graph, /* out: Query graph */ fts_table_t* fts_table, /*!< in: table instance */ @@ -4022,7 +4058,7 @@ fts_is_word_in_index( ibool* found) /* out: TRUE if exists */ { pars_info_t* info; - ulint error; + dberr_t error; trx->op_info = "looking up word in FTS index"; @@ -4073,8 +4109,9 @@ fts_is_word_in_index( trx->error_state = DB_SUCCESS; } else { - fprintf(stderr, " InnoDB: Error: %lu " - "while reading FTS index.\n", error); + fprintf(stderr, " InnoDB: Error: (%s) " + "while reading FTS index.\n", + ut_strerr(error)); break; /* Exit the loop. */ } @@ -4083,6 +4120,7 @@ fts_is_word_in_index( return(error); } +#endif /* FTS_DOC_STATS_DEBUG */ /*********************************************************************//** Begin Sync, create transaction, acquire locks, etc. */ @@ -4101,29 +4139,36 @@ fts_sync_begin( sync->trx = trx_allocate_for_background(); - ut_print_timestamp(stderr); - fprintf(stderr, " SYNC deleted count: %ld size: %lu bytes\n", - ib_vector_size(cache->deleted_doc_ids), cache->total_size); + if (fts_enable_diag_print) { + ib_logf(IB_LOG_LEVEL_INFO, + "FTS SYNC for table %s, deleted count: %ld size: " + "%lu bytes", + sync->table->name, + ib_vector_size(cache->deleted_doc_ids), + cache->total_size); + } } /*********************************************************************//** Run SYNC on the table, i.e., write out data from the index specific -cache to the FTS aux INDEX table and FTS aux doc id stats table. */ -static -ulint +cache to the FTS aux INDEX table and FTS aux doc id stats table. 
+@return DB_SUCCESS if all OK */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_sync_index( /*===========*/ - /* out: DB_SUCCESS if all OK */ fts_sync_t* sync, /*!< in: sync state */ fts_index_cache_t* index_cache) /*!< in: index cache */ { trx_t* trx = sync->trx; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; trx->op_info = "doing SYNC index"; - ut_print_timestamp(stderr); - fprintf(stderr, " SYNC words: %ld\n", rbt_size(index_cache->words)); + if (fts_enable_diag_print) { + ib_logf(IB_LOG_LEVEL_INFO, + "SYNC words: %ld", rbt_size(index_cache->words)); + } ut_ad(rbt_validate(index_cache->words)); @@ -4146,13 +4191,13 @@ fts_sync_index( /*********************************************************************//** Commit the SYNC, change state of processed doc ids etc. @return DB_SUCCESS if all OK */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_sync_commit( /*============*/ fts_sync_t* sync) /*!< in: sync state */ { - ulint error; + dberr_t error; trx_t* trx = sync->trx; fts_cache_t* cache = sync->table->fts->cache; doc_id_t last_doc_id; @@ -4191,13 +4236,18 @@ fts_sync_commit( fts_sql_rollback(trx); ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error: (%lu) during SYNC.\n", error); + fprintf(stderr, " InnoDB: Error: (%s) during SYNC.\n", + ut_strerr(error)); } - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: SYNC time : %lusecs: elapsed %lf ins/sec\n", - (ulong) (ut_time() - sync->start_time), - (double) n_nodes/ (double) elapsed_time); + if (fts_enable_diag_print && elapsed_time) { + ib_logf(IB_LOG_LEVEL_INFO, + "SYNC for table %s: SYNC time : %lu secs: " + "elapsed %lf ins/sec", + sync->table->name, + (ulong) (ut_time() - sync->start_time), + (double) n_nodes/ (double) elapsed_time); + } trx_free_for_background(trx); @@ -4226,13 +4276,13 @@ Run SYNC on the table, i.e., write out data from the cache to the FTS auxiliary INDEX table and clear the cache at the end. 
@return DB_SUCCESS if all OK */ static -ulint +dberr_t fts_sync( /*=====*/ fts_sync_t* sync) /*!< in: sync state */ { ulint i; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; fts_cache_t* cache = sync->table->fts->cache; rw_lock_x_lock(&cache->lock); @@ -4275,34 +4325,28 @@ fts_sync( /****************************************************************//** Run SYNC on the table, i.e., write out data from the cache to the -FTS auxiliary INDEX table and clear the cache at the end. -@return DB_SUCCESS if all OK */ +FTS auxiliary INDEX table and clear the cache at the end. */ UNIV_INTERN -ulint +void fts_sync_table( /*===========*/ dict_table_t* table) /*!< in: table */ { - ulint error = DB_SUCCESS; - ut_ad(table->fts); if (table->fts->cache) { fts_sync(table->fts->cache->sync); } - - return(error); } /******************************************************************** Process next token from document starting at the given position, i.e., add -the token's start position to the token's list of positions. */ +the token's start position to the token's list of positions. +@return number of characters handled in this call */ static ulint fts_process_token( /*==============*/ - /* out: number of characters - handled in this call */ fts_doc_t* doc, /* in/out: document to tokenize */ fts_doc_t* result, /* out: if provided, save @@ -4406,7 +4450,7 @@ fts_tokenize_document( ut_a(doc->charset); doc->tokens = rbt_create_arg_cmp( - sizeof(fts_token_t), innobase_fts_text_cmp, doc->charset); + sizeof(fts_token_t), innobase_fts_text_cmp, (void*) doc->charset); for (ulint i = 0; i < doc->text.f_len; i += inc) { inc = fts_process_token(doc, result, i, 0); @@ -4473,6 +4517,7 @@ fts_get_docs_create( memset(get_doc, 0x0, sizeof(*get_doc)); get_doc->index_cache = fts_get_index_cache(cache, *index); + get_doc->cache = cache; /* Must find the index cache. 
*/ ut_a(get_doc->index_cache != NULL); @@ -4520,11 +4565,14 @@ fts_init_doc_id( rw_lock_x_lock(&table->fts->cache->lock); + /* Return if the table is already initialized for DOC ID */ if (table->fts->cache->first_doc_id != FTS_NULL_DOC_ID) { rw_lock_x_unlock(&table->fts->cache->lock); return(0); } + DEBUG_SYNC_C("fts_initialize_doc_id"); + /* Then compare this value with the ID value stored in the CONFIG table. The larger one will be our new initial Doc ID */ fts_cmp_set_sync_doc_id(table, 0, FALSE, &max_doc_id); @@ -4591,7 +4639,7 @@ fts_get_rows_count( trx_t* trx; pars_info_t* info; que_t* graph; - ulint error; + dberr_t error; ulint count = 0; trx = trx_allocate_for_background(); @@ -4639,9 +4687,9 @@ fts_get_rows_count( trx->error_state = DB_SUCCESS; } else { - fprintf(stderr, " InnoDB: Error: %lu " + fprintf(stderr, " InnoDB: Error: (%s) " "while reading FTS table.\n", - error); + ut_strerr(error)); break; /* Exit the loop. */ } @@ -4678,7 +4726,7 @@ fts_update_max_cache_size( trx_free_for_background(trx); } -#endif +#endif /* FTS_CACHE_SIZE_DEBUG */ /*********************************************************************//** Free the modified rows of a table. */ @@ -4861,13 +4909,13 @@ fts_get_doc_id_from_rec( col_no = dict_col_get_clust_pos( &table->cols[table->fts->doc_col], clust_index); + ut_ad(col_no != ULINT_UNDEFINED); - /* We have no choice but to cast rec here :-( */ - data = rec_get_nth_field((rec_t*) rec, offsets, col_no, &len); + data = rec_get_nth_field(rec, offsets, col_no, &len); ut_a(len == 8); - ut_a(len == sizeof(doc_id)); - doc_id = (doc_id_t) mach_read_from_8(data); + ut_ad(8 == sizeof(doc_id)); + doc_id = static_cast<doc_id_t>(mach_read_from_8(data)); return(doc_id); } @@ -4876,7 +4924,7 @@ fts_get_doc_id_from_rec( Search the index specific cache for a particular FTS index. 
@return the index specific cache else NULL */ UNIV_INTERN -const fts_index_cache_t* +fts_index_cache_t* fts_find_index_cache( /*=================*/ const fts_cache_t* cache, /*!< in: cache to search */ @@ -4884,7 +4932,8 @@ fts_find_index_cache( { /* We cast away the const because our internal function, takes non-const cache arg and returns a non-const pointer. */ - return(fts_get_index_cache((fts_cache_t*) cache, index)); + return(static_cast<fts_index_cache_t*>( + fts_get_index_cache((fts_cache_t*) cache, index))); } /*********************************************************************//** @@ -4960,7 +5009,7 @@ fts_cache_append_deleted_doc_ids( { ulint i; - mutex_enter((mutex_t*) &cache->deleted_lock); + mutex_enter((ib_mutex_t*) &cache->deleted_lock); for (i = 0; i < ib_vector_size(cache->deleted_doc_ids); ++i) { fts_update_t* update; @@ -4971,7 +5020,7 @@ fts_cache_append_deleted_doc_ids( ib_vector_push(vector, &update->doc_id); } - mutex_exit((mutex_t*) &cache->deleted_lock); + mutex_exit((ib_mutex_t*) &cache->deleted_lock); } /*********************************************************************//** @@ -5043,11 +5092,11 @@ UNIV_INTERN void fts_add_doc_id_column( /*==================*/ - dict_table_t* table) /*!< in/out: Table with FTS index */ + dict_table_t* table, /*!< in/out: Table with FTS index */ + mem_heap_t* heap) /*!< in: temporary memory heap, or NULL */ { dict_mem_table_add_col( - table, - table->heap, + table, heap, FTS_DOC_ID_COL_NAME, DATA_INT, dtype_form_prtype( @@ -5069,7 +5118,7 @@ fts_update_doc_id( doc_id_t* next_doc_id) /*!< in/out: buffer for writing */ { doc_id_t doc_id; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; if (*next_doc_id) { doc_id = *next_doc_id; @@ -5236,13 +5285,12 @@ fts_savepoint_copy( ftt_dst = fts_trx_table_clone(*ftt_src); - rbt_insert(dst->tables, &ftt_dst->table->id, &ftt_dst); + rbt_insert(dst->tables, &ftt_dst, &ftt_dst); } } /*********************************************************************//** 
-Take a FTS savepoint. -@return DB_SUCCESS or error code */ +Take a FTS savepoint. */ UNIV_INTERN void fts_savepoint_take( @@ -5312,7 +5360,6 @@ fts_savepoint_release( const char* name) /*!< in: savepoint name */ { ulint i; - fts_savepoint_t* prev; ib_vector_t* savepoints; ulint top_of_stack = 0; @@ -5322,9 +5369,6 @@ fts_savepoint_release( ut_a(ib_vector_size(savepoints) > 0); - prev = static_cast<fts_savepoint_t*>( - ib_vector_get(savepoints, top_of_stack)); - /* Skip the implied savepoint (first element). */ for (i = 1; i < ib_vector_size(savepoints); ++i) { fts_savepoint_t* savepoint; @@ -5338,17 +5382,6 @@ fts_savepoint_release( we have to skip deleted/released entries. */ if (savepoint->name != NULL && strcmp(name, savepoint->name) == 0) { - - fts_savepoint_t* last; - fts_savepoint_t temp; - - last = static_cast<fts_savepoint_t*>( - ib_vector_last(savepoints)); - - /* Swap the entries. */ - memcpy(&temp, last, sizeof(temp)); - memcpy(last, prev, sizeof(*last)); - memcpy(prev, &temp, sizeof(prev)); break; /* Track the previous savepoint instance that will @@ -5357,8 +5390,6 @@ fts_savepoint_release( /* We need to delete all entries greater than this element. */ top_of_stack = i; - - prev = savepoint; } } @@ -5395,8 +5426,7 @@ fts_savepoint_release( } /**********************************************************************//** -Refresh last statement savepoint. -@return DB_SUCCESS or error code */ +Refresh last statement savepoint. */ UNIV_INTERN void fts_savepoint_laststmt_refresh( @@ -5588,7 +5618,7 @@ static ibool fts_is_aux_table_name( /*==================*/ - fts_sys_table_t*table, /*!< out: table info */ + fts_aux_table_t*table, /*!< out: table info */ const char* name, /*!< in: table name */ ulint len) /*!< in: length of table name */ { @@ -5614,7 +5644,6 @@ fts_is_aux_table_name( if (ptr != NULL && len > 20 && strncmp(ptr, "FTS_", 4) == 0) { ulint i; - /* Skip the prefix. 
*/ ptr += 4; len -= 4; @@ -5689,7 +5718,7 @@ fts_read_tables( void* user_arg) /*!< in: pointer to ib_vector_t */ { int i; - fts_sys_table_t*table; + fts_aux_table_t*table; mem_heap_t* heap; ibool done = FALSE; ib_vector_t* tables = static_cast<ib_vector_t*>(user_arg); @@ -5701,7 +5730,7 @@ fts_read_tables( /* We will use this heap for allocating strings. */ heap = static_cast<mem_heap_t*>(tables->allocator->arg); - table = static_cast<fts_sys_table_t*>(ib_vector_push(tables, NULL)); + table = static_cast<fts_aux_table_t*>(ib_vector_push(tables, NULL)); memset(table, 0x0, sizeof(*table)); @@ -5726,9 +5755,9 @@ fts_read_tables( } table->name = static_cast<char*>( - mem_heap_dup(heap, data, len + 1)); - table->name[len] = '\0'; - printf("Found [%.*s]\n", (int) len, table->name); + mem_heap_alloc(heap, len + 1)); + memcpy(table->name, data, len); + table->name[len] = 0; break; case 1: /* ID */ @@ -5749,41 +5778,41 @@ fts_read_tables( Check and drop all orphaned FTS auxiliary tables, those that don't have a parent table or FTS index defined on them. 
@return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull)) +void fts_check_and_drop_orphaned_tables( /*===============================*/ trx_t* trx, /*!< in: transaction */ ib_vector_t* tables) /*!< in: tables to check */ { - ulint i; - ulint error = DB_SUCCESS; - - for (i = 0; i < ib_vector_size(tables); ++i) { + for (ulint i = 0; i < ib_vector_size(tables); ++i) { dict_table_t* table; - fts_sys_table_t* sys_table; - ibool drop = FALSE; + fts_aux_table_t* aux_table; + bool drop = false; - sys_table = static_cast<fts_sys_table_t*>( + aux_table = static_cast<fts_aux_table_t*>( ib_vector_get(tables, i)); - table = dict_table_open_on_id(sys_table->parent_id, FALSE); + table = dict_table_open_on_id( + aux_table->parent_id, TRUE, FALSE); if (table == NULL || table->fts == NULL) { - drop = TRUE; + drop = true; - } else if (sys_table->index_id != 0) { - ulint j; + } else if (aux_table->index_id != 0) { index_id_t id; - fts_t* fts; + fts_t* fts; - drop = TRUE; + drop = true; fts = table->fts; - id = sys_table->index_id; + id = aux_table->index_id; /* Search for the FT index in the table's list. */ - for (j = 0; j < ib_vector_size(fts->indexes); ++j) { + for (ulint j = 0; + j < ib_vector_size(fts->indexes); + ++j) { + const dict_index_t* index; index = static_cast<const dict_index_t*>( @@ -5791,28 +5820,36 @@ fts_check_and_drop_orphaned_tables( if (index->id == id) { - drop = FALSE; + drop = false; break; } } } if (table) { - dict_table_close(table, FALSE); + dict_table_close(table, TRUE, FALSE); } if (drop) { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Warning: Parent table of " - "FT auxiliary table %s not found.\n", - sys_table->name); - /* We ignore drop errors. 
*/ - fts_drop_table(trx, sys_table->name); + ib_logf(IB_LOG_LEVEL_WARN, + "Parent table of FTS auxiliary table %s not " + "found.", aux_table->name); + + dberr_t err = fts_drop_table(trx, aux_table->name); + + if (err == DB_FAIL) { + char* path; + + path = fil_make_ibd_name( + aux_table->name, false); + + os_file_delete_if_exists(path); + + mem_free(path); + } } } - - return(error); } /**********************************************************************//** @@ -5823,19 +5860,62 @@ void fts_drop_orphaned_tables(void) /*==========================*/ { - trx_t* trx; - pars_info_t* info; - mem_heap_t* heap; - que_t* graph; - ib_vector_t* tables; - ib_alloc_t* heap_alloc; - ulint error = DB_SUCCESS; + trx_t* trx; + pars_info_t* info; + mem_heap_t* heap; + que_t* graph; + ib_vector_t* tables; + ib_alloc_t* heap_alloc; + space_name_list_t space_name_list; + dberr_t error = DB_SUCCESS; + + /* Note: We have to free the memory after we are done with the list. */ + error = fil_get_space_names(space_name_list); + + if (error == DB_OUT_OF_MEMORY) { + ib_logf(IB_LOG_LEVEL_ERROR, "Out of memory"); + ut_error; + } heap = mem_heap_create(1024); heap_alloc = ib_heap_allocator_create(heap); /* We store the table ids of all the FTS indexes that were found. */ - tables = ib_vector_create(heap_alloc, sizeof(fts_sys_table_t), 128); + tables = ib_vector_create(heap_alloc, sizeof(fts_aux_table_t), 128); + + /* Get the list of all known .ibd files and check for orphaned + FTS auxiliary files in that list. We need to remove them because + users can't map them back to table names and this will create + unnecessary clutter. 
*/ + + for (space_name_list_t::iterator it = space_name_list.begin(); + it != space_name_list.end(); + ++it) { + + fts_aux_table_t* fts_aux_table; + + fts_aux_table = static_cast<fts_aux_table_t*>( + ib_vector_push(tables, NULL)); + + memset(fts_aux_table, 0x0, sizeof(*fts_aux_table)); + + if (!fts_is_aux_table_name(fts_aux_table, *it, strlen(*it))) { + ib_vector_pop(tables); + } else { + ulint len = strlen(*it); + + fts_aux_table->id = fil_get_space_id_for_table(*it); + + /* We got this list from fil0fil.cc. The tablespace + with this name must exist. */ + ut_a(fts_aux_table->id != ULINT_UNDEFINED); + + fts_aux_table->name = static_cast<char*>( + mem_heap_dup(heap, *it, len + 1)); + + fts_aux_table->name[len] = 0; + } + } trx = trx_allocate_for_background(); trx->op_info = "dropping orphaned FTS tables"; @@ -5867,10 +5947,7 @@ fts_drop_orphaned_tables(void) error = fts_eval_sql(trx, graph); if (error == DB_SUCCESS) { - error = fts_check_and_drop_orphaned_tables(trx, tables); - } - - if (error == DB_SUCCESS) { + fts_check_and_drop_orphaned_tables(trx, tables); fts_sql_commit(trx); break; /* Exit the loop. */ } else { @@ -5881,15 +5958,15 @@ fts_drop_orphaned_tables(void) ut_print_timestamp(stderr); if (error == DB_LOCK_WAIT_TIMEOUT) { - fprintf(stderr, " InnoDB: Warning: lock wait " - "timeout reading SYS_TABLES. " - "Retrying!\n"); + ib_logf(IB_LOG_LEVEL_WARN, + "lock wait timeout reading SYS_TABLES. " + "Retrying!"); trx->error_state = DB_SUCCESS; } else { - fprintf(stderr, " InnoDB: Error: %lu " - "while reading SYS_TABLES.\n", - error); + ib_logf(IB_LOG_LEVEL_ERROR, + "(%s) while reading SYS_TABLES.", + ut_strerr(error)); break; /* Exit the loop. */ } @@ -5905,6 +5982,14 @@ fts_drop_orphaned_tables(void) if (heap != NULL) { mem_heap_free(heap); } + + /** Free the memory allocated to store the .ibd names. 
*/ + for (space_name_list_t::iterator it = space_name_list.begin(); + it != space_name_list.end(); + ++it) { + + delete[] *it; + } } /**********************************************************************//** @@ -5986,7 +6071,7 @@ fts_load_stopword( { fts_table_t fts_table; fts_string_t str; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; ulint use_stopword; fts_cache_t* cache; const char* stopword_to_use = NULL; @@ -6086,6 +6171,43 @@ cleanup: /**********************************************************************//** Callback function when we initialize the FTS at the start up +time. It recovers the maximum Doc IDs presented in the current table. +@return: always returns TRUE */ +static +ibool +fts_init_get_doc_id( +/*================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: fts cache */ +{ + doc_id_t doc_id = FTS_NULL_DOC_ID; + sel_node_t* node = static_cast<sel_node_t*>(row); + que_node_t* exp = node->select_list; + fts_cache_t* cache = static_cast<fts_cache_t*>(user_arg); + + ut_ad(ib_vector_is_empty(cache->get_docs)); + + /* Copy each indexed column content into doc->text.f_str */ + if (exp) { + dfield_t* dfield = que_node_get_val(exp); + dtype_t* type = dfield_get_type(dfield); + void* data = dfield_get_data(dfield); + + ut_a(dtype_get_mtype(type) == DATA_INT); + + doc_id = static_cast<doc_id_t>(mach_read_from_8( + static_cast<const byte*>(data))); + + if (doc_id >= cache->next_doc_id) { + cache->next_doc_id = doc_id + 1; + } + } + + return(TRUE); +} + +/**********************************************************************//** +Callback function when we initialize the FTS at the start up time. It recovers Doc IDs that have not sync-ed to the auxiliary table, and require to bring them back into FTS index. 
@return: always returns TRUE */ @@ -6100,22 +6222,16 @@ fts_init_recover_doc( fts_doc_t doc; ulint doc_len = 0; ulint field_no = 0; - ibool has_fts = TRUE; - fts_get_doc_t* get_doc = NULL; + fts_get_doc_t* get_doc = static_cast<fts_get_doc_t*>(user_arg); doc_id_t doc_id = FTS_NULL_DOC_ID; sel_node_t* node = static_cast<sel_node_t*>(row); que_node_t* exp = node->select_list; - fts_cache_t* cache = static_cast<fts_cache_t*>(user_arg); + fts_cache_t* cache = get_doc->cache; - if (ib_vector_is_empty(cache->get_docs)) { - has_fts = FALSE; - } else { - get_doc = static_cast<fts_get_doc_t*>( - ib_vector_get(cache->get_docs, 0)); + fts_doc_init(&doc); + doc.found = TRUE; - fts_doc_init(&doc); - doc.found = TRUE; - } + ut_ad(cache); /* Copy each indexed column content into doc->text.f_str */ while (exp) { @@ -6131,18 +6247,11 @@ fts_init_recover_doc( doc_id = static_cast<doc_id_t>(mach_read_from_8( static_cast<const byte*>(data))); - /* Just need to fetch the Doc ID */ - if (!has_fts) { - goto func_exit; - } - field_no++; exp = que_node_get_next(exp); continue; } - ut_a(has_fts); - if (len == UNIV_SQL_NULL) { exp = que_node_get_next(exp); continue; @@ -6196,7 +6305,6 @@ fts_init_recover_doc( cache->added++; -func_exit: if (doc_id >= cache->next_doc_id) { cache->next_doc_id = doc_id + 1; } @@ -6223,6 +6331,9 @@ fts_init_index( fts_get_doc_t* get_doc = NULL; ibool has_fts = TRUE; fts_cache_t* cache = table->fts->cache; + bool need_init = false; + + ut_ad(!mutex_own(&dict_sys->mutex)); /* First check cache->get_docs is initialized */ if (!has_cache_lock) { @@ -6239,6 +6350,8 @@ fts_init_index( goto func_exit; } + need_init = true; + start_doc = cache->synced_doc_id; if (!start_doc) { @@ -6250,28 +6363,32 @@ fts_init_index( dropped, and we re-initialize the Doc ID system for subsequent insertion */ if (ib_vector_is_empty(cache->get_docs)) { - index = dict_table_get_first_index(table); + index = dict_table_get_index_on_name(table, FTS_DOC_ID_INDEX_NAME); + + ut_a(index); + 
has_fts = FALSE; + fts_doc_fetch_by_doc_id(NULL, start_doc, index, + FTS_FETCH_DOC_BY_ID_LARGE, + fts_init_get_doc_id, cache); } else { - /* We only have one FTS index per table */ - get_doc = static_cast<fts_get_doc_t*>( - ib_vector_get(cache->get_docs, 0)); + for (ulint i = 0; i < ib_vector_size(cache->get_docs); ++i) { + get_doc = static_cast<fts_get_doc_t*>( + ib_vector_get(cache->get_docs, i)); - index = get_doc->index_cache->index; - } + index = get_doc->index_cache->index; - fts_doc_fetch_by_doc_id(NULL, start_doc, index, - FTS_FETCH_DOC_BY_ID_LARGE, - fts_init_recover_doc, cache); + fts_doc_fetch_by_doc_id(NULL, start_doc, index, + FTS_FETCH_DOC_BY_ID_LARGE, + fts_init_recover_doc, get_doc); + } + } if (has_fts) { if (table->fts->cache->stopword_info.status & STOPWORD_NOT_INIT) { fts_load_stopword(table, NULL, NULL, NULL, TRUE, TRUE); } - - /* Register the table with the optimize thread. */ - fts_optimize_add_table(table); } table->fts->fts_status |= ADDED_TABLE_SYNCED; @@ -6283,5 +6400,12 @@ func_exit: rw_lock_x_unlock(&cache->lock); } + if (need_init) { + mutex_enter(&dict_sys->mutex); + /* Register the table with the optimize thread. */ + fts_optimize_add_table(table); + mutex_exit(&dict_sys->mutex); + } + return(TRUE); } diff --git a/storage/innobase/fts/fts0opt.cc b/storage/innobase/fts/fts0opt.cc index 92e040d2715..9abeeccac91 100644 --- a/storage/innobase/fts/fts0opt.cc +++ b/storage/innobase/fts/fts0opt.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2007, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -39,26 +39,29 @@ Completed 2011/7/10 Sunny and Jimmy Yang #include "fts0vlc.ic" #endif -/* The FTS optimize thread's work queue. 
*/ +/** The FTS optimize thread's work queue. */ static ib_wqueue_t* fts_optimize_wq; -/* The number of document ids to delete in one statement. */ +/** The number of document ids to delete in one statement. */ static const ulint FTS_MAX_DELETE_DOC_IDS = 1000; -/* Time to wait for a message. */ +/** Time to wait for a message. */ static const ulint FTS_QUEUE_WAIT_IN_USECS = 5000000; -/* Default optimize interval in secs. */ +/** Default optimize interval in secs. */ static const ulint FTS_OPTIMIZE_INTERVAL_IN_SECS = 300; +/** Server is shutting down, so does we exiting the optimize thread */ +static bool fts_opt_start_shutdown = false; + #if 0 -/* Check each table in round robin to see whether they'd +/** Check each table in round robin to see whether they'd need to be "optimized" */ static ulint fts_optimize_sync_iterator = 0; #endif /** State of a table within the optimization sub system. */ -enum fts_state_enum { +enum fts_state_t { FTS_STATE_LOADED, FTS_STATE_RUNNING, FTS_STATE_SUSPENDED, @@ -67,7 +70,7 @@ enum fts_state_enum { }; /** FTS optimize thread message types. */ -enum fts_msg_type_enum { +enum fts_msg_type_t { FTS_MSG_START, /*!< Start optimizing thread */ FTS_MSG_PAUSE, /*!< Pause optimizing thread */ @@ -83,21 +86,9 @@ enum fts_msg_type_enum { threads work queue */ }; -typedef enum fts_state_enum fts_state_t; -typedef struct fts_zip_struct fts_zip_t; -typedef struct fts_msg_struct fts_msg_t; -typedef struct fts_slot_struct fts_slot_t; -typedef struct fts_encode_struct fts_encode_t; -typedef enum fts_msg_type_enum fts_msg_type_t; -typedef struct fts_msg_del_struct fts_msg_del_t; -typedef struct fts_msg_stop_struct fts_msg_stop_t; -typedef struct fts_optimize_struct fts_optimize_t; -typedef struct fts_msg_optimize_struct fts_msg_optimize_t; -typedef struct fts_optimize_graph_struct fts_optimize_graph_t; - /** Compressed list of words that have been read from FTS INDEX that needs to be optimized. 
*/ -struct fts_zip_struct { +struct fts_zip_t { ulint status; /*!< Status of (un)/zip operation */ ulint n_words; /*!< Number of words compressed */ @@ -128,7 +119,7 @@ struct fts_zip_struct { }; /** Prepared statemets used during optimize */ -struct fts_optimize_graph_struct { +struct fts_optimize_graph_t { /*!< Delete a word from FTS INDEX */ que_t* delete_nodes_graph; /*!< Insert a word into FTS INDEX */ @@ -140,7 +131,7 @@ struct fts_optimize_graph_struct { }; /** Used by fts_optimize() to store state. */ -struct fts_optimize_struct { +struct fts_optimize_t { trx_t* trx; /*!< The transaction used for all SQL */ ib_alloc_t* self_heap; /*!< Heap to use for allocations */ @@ -183,14 +174,14 @@ struct fts_optimize_struct { }; /** Used by the optimize, to keep state during compacting nodes. */ -struct fts_encode_struct { +struct fts_encode_t { doc_id_t src_last_doc_id;/*!< Last doc id read from src node */ byte* src_ilist_ptr; /*!< Current ptr within src ilist */ }; /** We use this information to determine when to start the optimize cycle for a table. */ -struct fts_slot_struct { +struct fts_slot_t { dict_table_t* table; /*!< Table to optimize */ fts_state_t state; /*!< State of this slot */ @@ -210,7 +201,7 @@ struct fts_slot_struct { }; /** A table remove message for the FTS optimize thread. */ -struct fts_msg_del_struct { +struct fts_msg_del_t { dict_table_t* table; /*!< The table to remove */ os_event_t event; /*!< Event to synchronize acknowledgement @@ -219,12 +210,12 @@ struct fts_msg_del_struct { }; /** Stop the optimize thread. */ -struct fts_msg_optimize_struct { +struct fts_msg_optimize_t { dict_table_t* table; /*!< Table to optimize */ }; /** The FTS optimize message work queue message type. 
*/ -struct fts_msg_struct { +struct fts_msg_t { fts_msg_type_t type; /*!< Message type */ void* ptr; /*!< The message contents */ @@ -466,9 +457,9 @@ fts_optimize_index_fetch_node( /**********************************************************************//** Read the rows from the FTS inde. -@return vector of rows fetched */ +@return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_index_fetch_nodes( /*==================*/ trx_t* trx, /*!< in: transaction */ @@ -479,7 +470,7 @@ fts_index_fetch_nodes( fts_fetch_t* fetch) /*!< in: fetch callback.*/ { pars_info_t* info; - ulint error; + dberr_t error; trx->op_info = "fetching FTS index nodes"; @@ -543,8 +534,9 @@ fts_index_fetch_nodes( trx->error_state = DB_SUCCESS; } else { - fprintf(stderr, " InnoDB: Error: %lu " - "while reading FTS index.\n", error); + fprintf(stderr, " InnoDB: Error: (%s) " + "while reading FTS index.\n", + ut_strerr(error)); break; /* Exit the loop. */ } @@ -781,8 +773,8 @@ fts_zip_deflate_end( Read the words from the FTS INDEX. 
@return DB_SUCCESS if all OK, DB_TABLE_NOT_FOUND if no more indexes to search else error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_index_fetch_words( /*==================*/ fts_optimize_t* optim, /*!< in: optimize scratch pad */ @@ -794,7 +786,7 @@ fts_index_fetch_words( que_t* graph; ulint selected; fts_zip_t* zip = NULL; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; mem_heap_t* heap = static_cast<mem_heap_t*>(optim->self_heap->arg); ibool inited = FALSE; @@ -849,13 +841,14 @@ fts_index_fetch_words( zip = optim->zip; for(;;) { + int err; - if (!inited && ((error = deflateInit(zip->zp, 9)) + if (!inited && ((err = deflateInit(zip->zp, 9)) != Z_OK)) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Error: ZLib deflateInit() " - "failed: %lu\n", error); + "failed: %d\n", err); error = DB_ERROR; break; @@ -885,9 +878,9 @@ fts_index_fetch_words( optim->trx->error_state = DB_SUCCESS; } else { - fprintf(stderr, " InnoDB: Error: %lu " + fprintf(stderr, " InnoDB: Error: (%s) " "while reading document.\n", - error); + ut_strerr(error)); break; /* Exit the loop. */ } @@ -962,14 +955,14 @@ fts_fetch_doc_ids( Read the rows from a FTS common auxiliary table. 
@return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_table_fetch_doc_ids( /*====================*/ trx_t* trx, /*!< in: transaction */ fts_table_t* fts_table, /*!< in: table */ fts_doc_ids_t* doc_ids) /*!< in: For collecting doc ids */ { - ulint error; + dberr_t error; que_t* graph; pars_info_t* info = pars_info_create(); ibool alloc_bk_trx = FALSE; @@ -1114,8 +1107,8 @@ fts_optimize_lookup( /**********************************************************************//** Encode the word pos list into the node @return DB_SUCCESS or error code*/ -static -ulint +static __attribute__((nonnull)) +dberr_t fts_optimize_encode_node( /*=====================*/ fts_node_t* node, /*!< in: node to fill*/ @@ -1126,7 +1119,7 @@ fts_optimize_encode_node( ulint enc_len; ulint pos_enc_len; doc_id_t doc_id_delta; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; byte* src = enc->src_ilist_ptr; if (node->first_doc_id == 0) { @@ -1202,8 +1195,8 @@ fts_optimize_encode_node( /**********************************************************************//** Optimize the data contained in a node. @return DB_SUCCESS or error code*/ -static -ulint +static __attribute__((nonnull)) +dberr_t fts_optimize_node( /*==============*/ ib_vector_t* del_vec, /*!< in: vector of doc ids to delete*/ @@ -1213,7 +1206,7 @@ fts_optimize_node( fts_encode_t* enc) /*!< in: encoding state */ { ulint copied; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; doc_id_t doc_id = enc->src_last_doc_id; if (!enc->src_ilist_ptr) { @@ -1299,8 +1292,8 @@ test_again: /**********************************************************************//** Determine the starting pos within the deleted doc id vector for a word. 
-@return DB_SUCCESS or error code */ -static +@return delete position */ +static __attribute__((nonnull, warn_unused_result)) int fts_optimize_deleted_pos( /*=====================*/ @@ -1428,8 +1421,8 @@ fts_optimize_word( /**********************************************************************//** Update the FTS index table. This is a delete followed by an insert. @return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_optimize_write_word( /*====================*/ trx_t* trx, /*!< in: transaction */ @@ -1441,7 +1434,7 @@ fts_optimize_write_word( pars_info_t* info; que_t* graph; ulint selected; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; char* table_name = fts_get_table_name(fts_table); info = pars_info_create(); @@ -1470,8 +1463,9 @@ fts_optimize_write_word( if (error != DB_SUCCESS) { ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error: (%lu) during optimize, " - "when deleting a word from the FTS index.\n", error); + fprintf(stderr, " InnoDB: Error: (%s) during optimize, " + "when deleting a word from the FTS index.\n", + ut_strerr(error)); } fts_que_graph_free(graph); @@ -1491,9 +1485,10 @@ fts_optimize_write_word( if (error != DB_SUCCESS) { ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error: (%lu) " + fprintf(stderr, " InnoDB: Error: (%s) " "during optimize, while adding a " - "word to the FTS index.\n", error); + "word to the FTS index.\n", + ut_strerr(error)); } } @@ -1529,8 +1524,8 @@ fts_word_free( /**********************************************************************//** Optimize the word ilist and rewrite data to the FTS index. 
@return status one of RESTART, EXIT, ERROR */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_optimize_compact( /*=================*/ fts_optimize_t* optim, /*!< in: optimize state data */ @@ -1538,7 +1533,7 @@ fts_optimize_compact( ib_time_t start_time) /*!< in: optimize start time */ { ulint i; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; ulint size = ib_vector_size(optim->words); for (i = 0; i < size && error == DB_SUCCESS && !optim->done; ++i) { @@ -1622,77 +1617,63 @@ fts_optimize_create( /**********************************************************************//** Get optimize start time of an FTS index. @return DB_SUCCESS if all OK else error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_optimize_get_index_start_time( /*==============================*/ trx_t* trx, /*!< in: transaction */ dict_index_t* index, /*!< in: FTS index */ ib_time_t* start_time) /*!< out: time in secs */ { - ulint error; - - error = fts_config_get_index_ulint( - trx, index, FTS_OPTIMIZE_START_TIME, (ulint*) start_time); - - return(error); + return(fts_config_get_index_ulint( + trx, index, FTS_OPTIMIZE_START_TIME, + (ulint*) start_time)); } /**********************************************************************//** Set the optimize start time of an FTS index. 
@return DB_SUCCESS if all OK else error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_optimize_set_index_start_time( /*==============================*/ trx_t* trx, /*!< in: transaction */ dict_index_t* index, /*!< in: FTS index */ ib_time_t start_time) /*!< in: start time */ { - ulint error; - - error = fts_config_set_index_ulint( - trx, index, FTS_OPTIMIZE_START_TIME, (ulint) start_time); - - return(error); + return(fts_config_set_index_ulint( + trx, index, FTS_OPTIMIZE_START_TIME, + (ulint) start_time)); } /**********************************************************************//** Get optimize end time of an FTS index. @return DB_SUCCESS if all OK else error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_optimize_get_index_end_time( /*============================*/ trx_t* trx, /*!< in: transaction */ dict_index_t* index, /*!< in: FTS index */ ib_time_t* end_time) /*!< out: time in secs */ { - ulint error; - - error = fts_config_get_index_ulint( - trx, index, FTS_OPTIMIZE_END_TIME, (ulint*) end_time); - - return(error); + return(fts_config_get_index_ulint( + trx, index, FTS_OPTIMIZE_END_TIME, (ulint*) end_time)); } /**********************************************************************//** Set the optimize end time of an FTS index. 
@return DB_SUCCESS if all OK else error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_optimize_set_index_end_time( /*============================*/ trx_t* trx, /*!< in: transaction */ dict_index_t* index, /*!< in: FTS index */ ib_time_t end_time) /*!< in: end time */ { - ulint error; - - error = fts_config_set_index_ulint( - trx, index, FTS_OPTIMIZE_END_TIME, (ulint) end_time); - - return(error); + return(fts_config_set_index_ulint( + trx, index, FTS_OPTIMIZE_END_TIME, (ulint) end_time)); } #endif @@ -1798,7 +1779,7 @@ fts_optimize_words( fprintf(stderr, "%.*s\n", (int) word->f_len, word->f_str); while(!optim->done) { - ulint error; + dberr_t error; trx_t* trx = optim->trx; ulint selected; @@ -1901,15 +1882,15 @@ fts_optimize_set_next_word( Optimize is complete. Set the completion time, and reset the optimize start string for this FTS index to "". @return DB_SUCCESS if all OK */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_optimize_index_completed( /*=========================*/ fts_optimize_t* optim, /*!< in: optimize instance */ dict_index_t* index) /*!< in: table with one FTS index */ { fts_string_t word; - ulint error; + dberr_t error; byte buf[sizeof(ulint)]; #ifdef FTS_OPTIMIZE_DEBUG ib_time_t end_time = ut_time(); @@ -1929,8 +1910,8 @@ fts_optimize_index_completed( if (error != DB_SUCCESS) { - fprintf(stderr, "InnoDB: Error: (%lu) while " - "updating last optimized word!\n", error); + fprintf(stderr, "InnoDB: Error: (%s) while " + "updating last optimized word!\n", ut_strerr(error)); } return(error); @@ -1941,15 +1922,15 @@ fts_optimize_index_completed( Read the list of words from the FTS auxiliary index that will be optimized in this pass. 
@return DB_SUCCESS if all OK */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_optimize_index_read_words( /*==========================*/ fts_optimize_t* optim, /*!< in: optimize instance */ dict_index_t* index, /*!< in: table with one FTS index */ fts_string_t* word) /*!< in: buffer to use */ { - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; if (optim->del_list_regenerated) { word->f_len = 0; @@ -1998,15 +1979,15 @@ fts_optimize_index_read_words( Run OPTIMIZE on the given FTS index. Note: this can take a very long time (hours). @return DB_SUCCESS if all OK */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_optimize_index( /*===============*/ fts_optimize_t* optim, /*!< in: optimize instance */ dict_index_t* index) /*!< in: table with one FTS index */ { fts_string_t word; - ulint error; + dberr_t error; byte str[FTS_MAX_WORD_LEN + 1]; /* Set the current index that we have to optimize. */ @@ -2069,8 +2050,8 @@ fts_optimize_index( /**********************************************************************//** Delete the document ids in the delete, and delete cache tables. @return DB_SUCCESS if all OK */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_optimize_purge_deleted_doc_ids( /*===============================*/ fts_optimize_t* optim) /*!< in: optimize instance */ @@ -2081,7 +2062,7 @@ fts_optimize_purge_deleted_doc_ids( fts_update_t* update; char* sql_str; doc_id_t write_doc_id; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; info = pars_info_create(); @@ -2138,13 +2119,13 @@ fts_optimize_purge_deleted_doc_ids( /**********************************************************************//** Delete the document ids in the pending delete, and delete tables. 
@return DB_SUCCESS if all OK */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_optimize_purge_deleted_doc_id_snapshot( /*=======================================*/ fts_optimize_t* optim) /*!< in: optimize instance */ { - ulint error; + dberr_t error; que_t* graph; char* sql_str; @@ -2188,13 +2169,13 @@ Copy the deleted doc ids that will be purged during this optimize run to the being deleted FTS auxiliary tables. The transaction is committed upon successfull copy and rolled back on DB_DUPLICATE_KEY error. @return DB_SUCCESS if all OK */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_optimize_create_deleted_doc_id_snapshot( /*========================================*/ fts_optimize_t* optim) /*!< in: optimize instance */ { - ulint error; + dberr_t error; que_t* graph; char* sql_str; @@ -2226,13 +2207,13 @@ fts_optimize_create_deleted_doc_id_snapshot( Read in the document ids that are to be purged during optimize. The transaction is committed upon successfully read. @return DB_SUCCESS if all OK */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_optimize_read_deleted_doc_id_snapshot( /*======================================*/ fts_optimize_t* optim) /*!< in: optimize instance */ { - ulint error; + dberr_t error; optim->fts_common_table.suffix = "BEING_DELETED"; @@ -2263,14 +2244,14 @@ Optimze all the FTS indexes, skipping those that have already been optimized, since the FTS auxiliary indexes are not guaranteed to be of the same cardinality. @return DB_SUCCESS if all OK */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_optimize_indexes( /*=================*/ fts_optimize_t* optim) /*!< in: optimize instance */ { ulint i; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; fts_t* fts = optim->table->fts; /* Optimize the FTS indexes. 
*/ @@ -2333,13 +2314,13 @@ fts_optimize_indexes( /*********************************************************************//** Cleanup the snapshot tables and the master deleted table. @return DB_SUCCESS if all OK */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_optimize_purge_snapshot( /*========================*/ fts_optimize_t* optim) /*!< in: optimize instance */ { - ulint error; + dberr_t error; /* Delete the doc ids from the master deleted tables, that were in the snapshot that was taken at the start of optimize. */ @@ -2362,13 +2343,13 @@ fts_optimize_purge_snapshot( /*********************************************************************//** Reset the start time to 0 so that a new optimize can be started. @return DB_SUCCESS if all OK */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_optimize_reset_start_time( /*==========================*/ fts_optimize_t* optim) /*!< in: optimize instance */ { - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; #ifdef FTS_OPTIMIZE_DEBUG fts_t* fts = optim->table->fts; @@ -2401,13 +2382,13 @@ fts_optimize_reset_start_time( /*********************************************************************//** Run OPTIMIZE on the given table by a background thread. @return DB_SUCCESS if all OK */ -static -ulint +static __attribute__((nonnull)) +dberr_t fts_optimize_table_bk( /*==================*/ fts_slot_t* slot) /*!< in: table to optimiza */ { - ulint error; + dberr_t error; dict_table_t* table = slot->table; fts_t* fts = table->fts; @@ -2440,12 +2421,12 @@ fts_optimize_table_bk( Run OPTIMIZE on the given table. 
@return DB_SUCCESS if all OK */ UNIV_INTERN -ulint +dberr_t fts_optimize_table( /*===============*/ dict_table_t* table) /*!< in: table to optimiza */ { - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; fts_optimize_t* optim = NULL; fts_t* fts = table->fts; @@ -2567,6 +2548,11 @@ fts_optimize_add_table( return; } + /* Make sure table with FTS index cannot be evicted */ + if (table->can_be_evicted) { + dict_table_move_from_lru_to_non_lru(table); + } + msg = fts_optimize_create_msg(FTS_MSG_ADD_TABLE, table); ib_wqueue_add(fts_optimize_wq, msg, msg->heap); @@ -2602,18 +2588,26 @@ fts_optimize_remove_table( dict_table_t* table) /*!< in: table to remove */ { fts_msg_t* msg; - os_event_t event; - fts_msg_del_t* remove; + os_event_t event; + fts_msg_del_t* remove; /* if the optimize system not yet initialized, return */ if (!fts_optimize_wq) { return; } + /* FTS optimizer thread is already exited */ + if (fts_opt_start_shutdown) { + ib_logf(IB_LOG_LEVEL_INFO, + "Try to remove table %s after FTS optimize" + " thread exiting.", table->name); + return; + } + msg = fts_optimize_create_msg(FTS_MSG_DEL_TABLE, NULL); /* We will wait on this event until signalled by the consumer. 
*/ - event = os_event_create(table->name); + event = os_event_create(); remove = static_cast<fts_msg_del_t*>( mem_heap_alloc(msg->heap, sizeof(*remove))); @@ -2889,6 +2883,8 @@ fts_optimize_thread( ulint n_optimize = 0; ib_wqueue_t* wq = (ib_wqueue_t*) arg; + ut_ad(!srv_read_only_mode); + heap = mem_heap_create(sizeof(dict_table_t*) * 64); heap_alloc = ib_heap_allocator_create(heap); @@ -3010,10 +3006,10 @@ fts_optimize_thread( ib_vector_get(tables, i)); if (slot->state != FTS_STATE_EMPTY) { - dict_table_t* table; + dict_table_t* table = NULL; - table = dict_table_open_on_name_no_stats( - slot->table->name, FALSE, + table = dict_table_open_on_name( + slot->table->name, FALSE, FALSE, DICT_ERR_IGNORE_INDEX_ROOT); if (table) { @@ -3022,8 +3018,11 @@ fts_optimize_thread( fts_sync_table(table); } - fts_free(table); - dict_table_close(table, FALSE); + if (table->fts) { + fts_free(table); + } + + dict_table_close(table, FALSE, FALSE); } } } @@ -3031,10 +3030,7 @@ fts_optimize_thread( ib_vector_free(tables); - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: FTS optimize thread exiting.\n"); - - ib_wqueue_free(wq); + ib_logf(IB_LOG_LEVEL_INFO, "FTS optimize thread exiting."); os_event_set(exit_event); @@ -3052,6 +3048,8 @@ void fts_optimize_init(void) /*===================*/ { + ut_ad(!srv_read_only_mode); + /* For now we only support one optimize thread. */ ut_a(fts_optimize_wq == NULL); @@ -3074,18 +3072,30 @@ fts_optimize_is_init(void) /**********************************************************************//** Signal the optimize thread to prepare for shutdown. 
*/ - +UNIV_INTERN void fts_optimize_start_shutdown(void) /*=============================*/ { + ut_ad(!srv_read_only_mode); + fts_msg_t* msg; os_event_t event; + /* If there is an ongoing activity on dictionary, such as + srv_master_evict_from_table_cache(), wait for it */ + dict_mutex_enter_for_mysql(); + + /* Tells FTS optimizer system that we are exiting from + optimizer thread, message send their after will not be + processed */ + fts_opt_start_shutdown = true; + dict_mutex_exit_for_mysql(); + /* We tell the OPTIMIZE thread to switch to state done, we can't delete the work queue here because the add thread needs deregister the FTS tables. */ - event = os_event_create(NULL); + event = os_event_create(); msg = fts_optimize_create_msg(FTS_MSG_STOP, NULL); msg->ptr = event; @@ -3094,15 +3104,20 @@ fts_optimize_start_shutdown(void) os_event_wait(event); os_event_free(event); + + ib_wqueue_free(fts_optimize_wq); + } /**********************************************************************//** Reset the work queue. */ - +UNIV_INTERN void fts_optimize_end(void) /*==================*/ { + ut_ad(!srv_read_only_mode); + // FIXME: Potential race condition here: We should wait for // the optimize thread to confirm shutdown. 
fts_optimize_wq = NULL; diff --git a/storage/innobase/fts/fts0pars.cc b/storage/innobase/fts/fts0pars.cc index 4fdfff5ca42..dd2984b1beb 100644 --- a/storage/innobase/fts/fts0pars.cc +++ b/storage/innobase/fts/fts0pars.cc @@ -105,7 +105,7 @@ extern int ftserror(const char* p); typedef int (*fts_scanner_alt)(YYSTYPE* val, yyscan_t yyscanner); typedef int (*fts_scanner)(); -struct fts_lexer_struct { +struct fts_lexer_t { fts_scanner scanner; void* yyscanner; }; diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc index 58b429a8406..5c757b4f176 100644 --- a/storage/innobase/fts/fts0que.cc +++ b/storage/innobase/fts/fts0que.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2007, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -24,6 +24,7 @@ Created 2007/03/27 Sunny Bains Completed 2011/7/10 Sunny and Jimmy Yang *******************************************************/ +#include "dict0dict.h" /* dict_table_get_n_rows() */ #include "ut0rbt.h" #include "row0sel.h" #include "fts0fts.h" @@ -57,15 +58,10 @@ static const double FTS_NORMALIZE_COEFF = 0.0115F; /* For parsing the search phrase */ static const char* FTS_PHRASE_DELIMITER = "\t "; -typedef struct fts_match_struct fts_match_t; -typedef struct fts_query_struct fts_query_t; -typedef struct fts_phrase_struct fts_phrase_t; -typedef struct fts_select_struct fts_select_t; -typedef struct fts_doc_freq_struct fts_doc_freq_t; -typedef struct fts_word_freq_struct fts_word_freq_t; +struct fts_word_freq_t; /** State of an FTS query. 
*/ -struct fts_query_struct { +struct fts_query_t { mem_heap_t* heap; /*!< Heap to use for allocations */ trx_t* trx; /*!< The query transaction */ @@ -126,11 +122,11 @@ struct fts_query_struct { position info for each matched word in the word list */ - ulint total_docs; /*!< The total number of documents */ + ib_uint64_t total_docs; /*!< The total number of documents */ ulint total_words; /*!< The total number of words */ - ulint error; /*!< Error code if any, that is + dberr_t error; /*!< Error code if any, that is encountered during query processing */ ib_rbt_t* word_freqs; /*!< RB tree of word frequencies per @@ -144,7 +140,7 @@ struct fts_query_struct { /** For phrase matching, first we collect the documents and the positions then we match. */ -struct fts_match_struct { +struct fts_match_t { doc_id_t doc_id; /*!< Document id */ ulint start; /*!< Start the phrase match from @@ -158,7 +154,7 @@ struct fts_match_struct { /** For matching tokens in a phrase search. We use this data structure in the callback that determines whether a document should be accepted or rejected for a phrase search. */ -struct fts_select_struct { +struct fts_select_t { doc_id_t doc_id; /*!< The document id to match */ ulint min_pos; /*!< For found to be TRUE at least @@ -173,8 +169,23 @@ struct fts_select_struct { the FTS index */ }; +/** structure defines a set of ranges for original documents, each of which +has a minimum position and maximum position. Text in such range should +contain all words in the proximity search. 
We will need to count the +words in such range to make sure it is less than the specified distance +of the proximity search */ +struct fts_proximity_t { + ulint n_pos; /*!< number of position set, defines + a range (min to max) containing all + matching words */ + ulint* min_pos; /*!< the minimum position (in bytes) + of the range */ + ulint* max_pos; /*!< the maximum position (in bytes) + of the range */ +}; + /** The match positions and tokesn to match */ -struct fts_phrase_struct { +struct fts_phrase_t { ibool found; /*!< Match result */ const fts_match_t* @@ -188,23 +199,26 @@ struct fts_phrase_struct { CHARSET_INFO* charset; /*!< Phrase match charset */ mem_heap_t* heap; /*!< Heap for word processing */ ulint zip_size; /*!< row zip size */ + fts_proximity_t*proximity_pos; /*!< position info for proximity + search verification. Records the min + and max position of words matched */ }; /** For storing the frequncy of a word/term in a document */ -struct fts_doc_freq_struct { +struct fts_doc_freq_t { doc_id_t doc_id; /*!< Document id */ ulint freq; /*!< Frequency of a word in a document */ }; /** To determine the word frequency per document. */ -struct fts_word_freq_struct { +struct fts_word_freq_t { byte* word; /*!< Word for which we need the freq, it's allocated on the query heap */ ib_rbt_t* doc_freqs; /*!< RB Tree for storing per document word frequencies. The elements are of type fts_doc_freq_t */ - ulint doc_count; /*!< Total number of documents that + ib_uint64_t doc_count; /*!< Total number of documents that contain this word */ double idf; /*!< Inverse document frequency */ }; @@ -257,37 +271,46 @@ search arguments to search the document again, thus "expand" the search result set. 
@return DB_SUCCESS if success, otherwise the error code */ static -ulint +dberr_t fts_expand_query( /*=============*/ dict_index_t* index, /*!< in: FTS index to search */ - fts_query_t* query); /*!< in: query result, to be freed + fts_query_t* query) /*!< in: query result, to be freed by the client */ + __attribute__((nonnull, warn_unused_result)); /*************************************************************//** This function finds documents that contain all words in a phrase or proximity search. And if proximity search, verify -the words are close to each other enough, as in specified distance. +the words are close enough to each other, as in specified distance. This function is called for phrase and proximity search. @return TRUE if documents are found, FALSE if otherwise */ static ibool -fts_check_phrase_proximity( -/*=======================*/ - fts_query_t* query, /*!< in: query instance */ +fts_phrase_or_proximity_search( +/*===========================*/ + fts_query_t* query, /*!< in/out: query instance + query->doc_ids might be instantiated + with qualified doc IDs */ ib_vector_t* tokens); /*!< in: Tokens contain words */ /*************************************************************//** -This function check the words in result document are close to each -other enough (within proximity rnage). This is used for proximity search. -@return TRUE if words are close to each other, FALSE if otherwise */ +This function checks whether words in result documents are close to +each other (within proximity range as specified by "distance"). +If "distance" is MAX_ULINT, then it will find all combinations of +positions of matching words and store min and max positions +in the "qualified_pos" for later verification. 
+@return true if words are close to each other, false if otherwise */ static -ulint -fts_proximity_check_position( -/*=========================*/ - fts_match_t** match, /*!< in: query instance */ - ulint num_match, /*!< in: number of matching - items */ - ulint distance); /*!< in: distance value - for proximity search */ +bool +fts_proximity_get_positions( +/*========================*/ + fts_match_t** match, /*!< in: query instance */ + ulint num_match, /*!< in: number of matching + items */ + ulint distance, /*!< in: distance value + for proximity search */ + fts_proximity_t* qualified_pos); /*!< out: the position info + records ranges containing + all matching words. */ #if 0 /******************************************************************** Get the total number of words in a documents. */ @@ -954,8 +977,8 @@ cont_search: /*****************************************************************//** Set difference. @return DB_SUCCESS if all went well */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_query_difference( /*=================*/ fts_query_t* query, /*!< in: query instance */ @@ -993,15 +1016,21 @@ fts_query_difference( ut_a(index_cache != NULL); /* Search the cache for a matching word first. 
*/ - nodes = fts_cache_find_word(index_cache, token); + if (query->cur_node->term.wildcard + && query->flags != FTS_PROXIMITY + && query->flags != FTS_PHRASE) { + fts_cache_find_wildcard(query, index_cache, token); + } else { + nodes = fts_cache_find_word(index_cache, token); - for (i = 0; nodes && i < ib_vector_size(nodes); ++i) { - const fts_node_t* node; + for (i = 0; nodes && i < ib_vector_size(nodes); ++i) { + const fts_node_t* node; - node = static_cast<const fts_node_t*>( - ib_vector_get_const(nodes, i)); + node = static_cast<const fts_node_t*>( + ib_vector_get_const(nodes, i)); - fts_query_check_node(query, token, node); + fts_query_check_node(query, token, node); + } } rw_lock_x_unlock(&cache->lock); @@ -1026,8 +1055,8 @@ fts_query_difference( /*****************************************************************//** Intersect the token doc ids with the current set. @return DB_SUCCESS if all went well */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_query_intersect( /*================*/ fts_query_t* query, /*!< in: query instance */ @@ -1216,8 +1245,8 @@ fts_query_cache( /*****************************************************************//** Set union. @return DB_SUCCESS if all went well */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_query_union( /*============*/ fts_query_t* query, /*!< in: query instance */ @@ -1248,13 +1277,7 @@ fts_query_union( /* Single '%' would confuse parser in pars_like_rebind(). 
In addition, our wildcard search only supports prefix search */ - if (*token->f_str == '%') { - if (token->f_len == 1) { - return(query->error); - } - token->f_str++; - token->f_len--; - } + ut_ad(*token->f_str != '%'); fts_query_cache(query, token); @@ -1485,6 +1508,67 @@ fts_query_match_phrase_terms( } /*****************************************************************//** +Callback function to count the number of words in position ranges, +and see whether the word count is in specified "phrase->distance" +@return true if the number of characters is less than the "distance" */ +static +bool +fts_proximity_is_word_in_range( +/*===========================*/ + const fts_phrase_t* + phrase, /*!< in: phrase with the search info */ + byte* start, /*!< in: text to search */ + ulint total_len) /*!< in: length of text */ +{ + fts_proximity_t* proximity_pos = phrase->proximity_pos; + + /* Search each matched position pair (with min and max positions) + and count the number of words in the range */ + for (ulint i = 0; i < proximity_pos->n_pos; i++) { + ulint cur_pos = proximity_pos->min_pos[i]; + ulint n_word = 0; + + ut_ad(proximity_pos->max_pos[i] <= total_len); + + /* Walk through words in the range and count them */ + while (cur_pos <= proximity_pos->max_pos[i]) { + ulint len; + fts_string_t str; + ulint offset = 0; + + len = innobase_mysql_fts_get_token( + phrase->charset, + start + cur_pos, + start + total_len, &str, &offset); + + if (len == 0) { + break; + } + + /* Advances position with "len" bytes */ + cur_pos += len; + + /* Record the number of words */ + if (str.f_n_char > 0) { + n_word++; + } + + if (n_word > phrase->distance) { + break; + } + } + + /* Check if the number of words is less than specified + "distance" */ + if (n_word && n_word <= phrase->distance) { + return(true); + } + } + + return(false); +} + +/*****************************************************************//** Callback function to fetch and search the document. 
@return TRUE if matched else FALSE */ static @@ -1594,31 +1678,77 @@ fts_query_fetch_document( sel_node_t* node = static_cast<sel_node_t*>(row); fts_phrase_t* phrase = static_cast<fts_phrase_t*>(user_arg); ulint prev_len = 0; + ulint total_len = 0; + byte* document_text = NULL; exp = node->select_list; phrase->found = FALSE; + /* For proximity search, we will need to get the whole document + from all fields, so first count the total length of the document + from all the fields */ + if (phrase->proximity_pos) { + while (exp) { + ulint field_len; + dfield_t* dfield = que_node_get_val(exp); + byte* data = static_cast<byte*>( + dfield_get_data(dfield)); + + if (dfield_is_ext(dfield)) { + ulint local_len = dfield_get_len(dfield); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + field_len = mach_read_from_4( + data + local_len + BTR_EXTERN_LEN + 4); + } else { + field_len = dfield_get_len(dfield); + } + + if (field_len != UNIV_SQL_NULL) { + total_len += field_len + 1; + } + + exp = que_node_get_next(exp); + } + + document_text = static_cast<byte*>(mem_heap_zalloc( + phrase->heap, total_len)); + + if (!document_text) { + return(FALSE); + } + } + + exp = node->select_list; + while (exp) { dfield_t* dfield = que_node_get_val(exp); - void* data = NULL; + byte* data = static_cast<byte*>( + dfield_get_data(dfield)); ulint cur_len; if (dfield_is_ext(dfield)) { data = btr_copy_externally_stored_field( - &cur_len, static_cast<const byte*>(data), - phrase->zip_size, + &cur_len, data, phrase->zip_size, dfield_get_len(dfield), phrase->heap); } else { - data = dfield_get_data(dfield); cur_len = dfield_get_len(dfield); } if (cur_len != UNIV_SQL_NULL && cur_len != 0) { - phrase->found = - fts_query_match_phrase( - phrase, static_cast<byte*>(data), - cur_len, prev_len, phrase->heap); + if (phrase->proximity_pos) { + memcpy(document_text + prev_len, data, cur_len); + } else { + /* For phrase search */ + phrase->found = + fts_query_match_phrase( + phrase, + static_cast<byte*>(data), + 
cur_len, prev_len, + phrase->heap); + } } if (phrase->found) { @@ -1633,6 +1763,13 @@ fts_query_fetch_document( exp = que_node_get_next(exp); } + if (phrase->proximity_pos) { + ut_ad(prev_len <= total_len); + + phrase->found = fts_proximity_is_word_in_range( + phrase, document_text, total_len); + } + return(phrase->found); } @@ -1689,13 +1826,12 @@ fts_query_select( /******************************************************************** Read the rows from the FTS index, that match word and where the -doc id is between first and last doc id. */ -static -ulint +doc id is between first and last doc id. +@return DB_SUCCESS if all went well else error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_query_find_term( /*================*/ - /*!< out: DB_SUCCESS if all went well - else error code */ fts_query_t* query, /*!< in: FTS query state */ que_t** graph, /*!< in: prepared statement */ const fts_string_t* word, /*!< in: the word to fetch */ @@ -1705,7 +1841,7 @@ fts_query_find_term( ibool* found) /*!< out: TRUE if found else FALSE */ { pars_info_t* info; - ulint error; + dberr_t error; fts_select_t select; doc_id_t match_doc_id; trx_t* trx = query->trx; @@ -1830,19 +1966,18 @@ fts_query_sum( } /******************************************************************** -Calculate the total documents that contain a particular word (term). */ -static -ulint +Calculate the total documents that contain a particular word (term). 
+@return DB_SUCCESS if all went well else error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_query_total_docs_containing_term( /*=================================*/ - /*!< out: DB_SUCCESS if all went well - else error code */ fts_query_t* query, /*!< in: FTS query state */ const fts_string_t* word, /*!< in: the word to check */ ulint* total) /*!< out: documents containing word */ { pars_info_t* info; - ulint error; + dberr_t error; que_t* graph; ulint selected; trx_t* trx = query->trx; @@ -1910,19 +2045,18 @@ fts_query_total_docs_containing_term( } /******************************************************************** -Get the total number of words in a documents. */ -static -ulint +Get the total number of words in a documents. +@return DB_SUCCESS if all went well else error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_query_terms_in_document( /*========================*/ - /*!< out: DB_SUCCESS if all went well - else error code */ fts_query_t* query, /*!< in: FTS query state */ doc_id_t doc_id, /*!< in: the word to check */ ulint* total) /*!< out: total words in document */ { pars_info_t* info; - ulint error; + dberr_t error; que_t* graph; doc_id_t read_doc_id; trx_t* trx = query->trx; @@ -1993,9 +2127,9 @@ fts_query_terms_in_document( /*****************************************************************//** Retrieve the document and match the phrase tokens. 
-@return TRUE if matches else FALSE */ -static -ulint +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_query_match_document( /*=====================*/ ib_vector_t* tokens, /*!< in: phrase tokens */ @@ -2004,7 +2138,7 @@ fts_query_match_document( ulint distance, /*!< in: proximity distance */ ibool* found) /*!< out: TRUE if phrase found */ { - ulint error; + dberr_t error; fts_phrase_t phrase; memset(&phrase, 0x0, sizeof(phrase)); @@ -2025,8 +2159,8 @@ fts_query_match_document( if (error != DB_SUCCESS) { ut_print_timestamp(stderr); - fprintf(stderr, "InnoDB: Error: (%lu) matching document.\n", - error); + fprintf(stderr, "InnoDB: Error: (%s) matching document.\n", + ut_strerr(error)); } else { *found = phrase.found; } @@ -2037,11 +2171,66 @@ fts_query_match_document( } /*****************************************************************//** +This function fetches the original documents and count the +words in between matching words to see that is in specified distance +@return DB_SUCCESS if all OK */ +static __attribute__((nonnull, warn_unused_result)) +bool +fts_query_is_in_proximity_range( +/*============================*/ + const fts_query_t* query, /*!< in: query instance */ + fts_match_t** match, /*!< in: query instance */ + fts_proximity_t* qualified_pos) /*!< in: position info for + qualified ranges */ +{ + fts_get_doc_t get_doc; + fts_cache_t* cache = query->index->table->fts->cache; + dberr_t err; + fts_phrase_t phrase; + + memset(&get_doc, 0x0, sizeof(get_doc)); + memset(&phrase, 0x0, sizeof(phrase)); + + rw_lock_x_lock(&cache->lock); + get_doc.index_cache = fts_find_index_cache(cache, query->index); + rw_lock_x_unlock(&cache->lock); + ut_a(get_doc.index_cache != NULL); + + phrase.distance = query->distance; + phrase.charset = get_doc.index_cache->charset; + phrase.zip_size = dict_table_zip_size( + get_doc.index_cache->index->table); + phrase.heap = mem_heap_create(512); + phrase.proximity_pos = 
qualified_pos; + phrase.found = FALSE; + + err = fts_doc_fetch_by_doc_id( + &get_doc, match[0]->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL, + fts_query_fetch_document, &phrase); + + if (err != DB_SUCCESS) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Error: (%s) in verification phase of proximity " + "search", ut_strerr(err)); + } + + /* Free the prepared statement. */ + if (get_doc.get_document_graph) { + fts_que_graph_free(get_doc.get_document_graph); + get_doc.get_document_graph = NULL; + } + + mem_heap_free(phrase.heap); + + return(err == DB_SUCCESS && phrase.found); +} + +/*****************************************************************//** Iterate over the matched document ids and search the for the actual phrase in the text. @return DB_SUCCESS if all OK */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_query_search_phrase( /*====================*/ fts_query_t* query, /*!< in: query instance */ @@ -2050,8 +2239,6 @@ fts_query_search_phrase( ulint i; fts_get_doc_t get_doc; ulint n_matched; - // FIXME: Debug code - ulint searched = 0; fts_cache_t* cache = query->index->table->fts->cache; n_matched = ib_vector_size(query->matched); @@ -2061,9 +2248,7 @@ fts_query_search_phrase( rw_lock_x_lock(&cache->lock); - // FIXME: We shouldn't have to cast here. - get_doc.index_cache = (fts_index_cache_t*) - fts_find_index_cache(cache, query->index); + get_doc.index_cache = fts_find_index_cache(cache, query->index); /* Must find the index cache */ ut_a(get_doc.index_cache != NULL); @@ -2089,9 +2274,6 @@ fts_query_search_phrase( an earlier pass. 
*/ if (match->doc_id != 0) { - // FIXME: Debug code - ++searched; - query->error = fts_query_match_document( tokens, &get_doc, match, query->distance, &found); @@ -2119,18 +2301,14 @@ fts_query_search_phrase( get_doc.get_document_graph = NULL; } - // FIXME: Debug code - ut_print_timestamp(stderr); - printf(" End: %lu, %lu\n", searched, ib_vector_size(query->matched)); - return(query->error); } /*****************************************************************//** Text/Phrase search. -@return count of doc ids added */ -static -ulint +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_query_phrase_search( /*====================*/ fts_query_t* query, /*!< in: query instance */ @@ -2290,7 +2468,7 @@ fts_query_phrase_search( /* If we are doing proximity search, verify the distance between all words, and check they are in specified distance. */ if (query->flags & FTS_PROXIMITY) { - fts_check_phrase_proximity(query, tokens); + fts_phrase_or_proximity_search(query, tokens); } else { ibool matched; @@ -2301,7 +2479,7 @@ fts_query_phrase_search( and then doing a search through the text. Isolated testing shows this also helps in mitigating disruption of the buffer cache. */ - matched = fts_check_phrase_proximity(query, tokens); + matched = fts_phrase_or_proximity_search(query, tokens); query->matched = query->match_array[0]; /* Read the actual text in and search for the phrase. */ @@ -2329,8 +2507,8 @@ func_exit: /*****************************************************************//** Find the word and evaluate. 
@return DB_SUCCESS if all went well */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_query_execute( /*==============*/ fts_query_t* query, /*!< in: query instance */ @@ -2477,13 +2655,12 @@ fts_query_visitor( /*****************************************************************//** Process (nested) sub-expression, create a new result set to store the sub-expression result by processing nodes under current sub-expression -list. Merge the sub-expression result with that of parent expression list. */ - -ulint +list. Merge the sub-expression result with that of parent expression list. +@return DB_SUCCESS if all went well */ +UNIV_INTERN +dberr_t fts_ast_visit_sub_exp( /*==================*/ - /*!< out: DB_SUCCESS if all - went well */ fts_ast_node_t* node, /*!< in,out: current root node */ fts_ast_callback visitor, /*!< in: callback function */ void* arg) /*!< in,out: arg for callback */ @@ -2492,8 +2669,9 @@ fts_ast_visit_sub_exp( fts_query_t* query = static_cast<fts_query_t*>(arg); ib_rbt_t* parent_doc_ids; ib_rbt_t* subexpr_doc_ids; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; ibool inited = query->inited; + bool will_be_ignored = false; ut_a(node->type == FTS_AST_SUBEXP_LIST); @@ -2521,7 +2699,8 @@ fts_ast_visit_sub_exp( /* Process nodes in current sub-expression and store its result set in query->doc_ids we created above. */ - error = fts_ast_visit(FTS_NONE, node->next, visitor, arg); + error = fts_ast_visit(FTS_NONE, node->next, visitor, + arg, &will_be_ignored); /* Reinstate parent node state and prepare for merge. */ query->inited = inited; @@ -2757,6 +2936,8 @@ fts_query_read_node( ut_a(query->cur_node->type == FTS_AST_TERM || query->cur_node->type == FTS_AST_TEXT); + memset(&node, 0, sizeof(node)); + /* Need to consider the wildcard search case, the word frequency is created on the search string not the actual word. So we need to assign the frequency on search string behalf. 
*/ @@ -2879,8 +3060,8 @@ fts_query_calculate_idf( /*====================*/ fts_query_t* query) /*!< in: Query state */ { - const ib_rbt_node_t* node; - double total_docs = query->total_docs; + const ib_rbt_node_t* node; + ib_uint64_t total_docs = query->total_docs; /* We need to free any instances of fts_doc_freq_t that we may have allocated. */ @@ -2893,7 +3074,7 @@ fts_query_calculate_idf( word_freq = rbt_value(fts_word_freq_t, node); if (word_freq->doc_count > 0) { - if (total_docs == (double) word_freq->doc_count) { + if (total_docs == word_freq->doc_count) { /* QP assume ranking > 0 if we find a match. Since Log10(1) = 0, we cannot make IDF a zero value if do find a @@ -2907,10 +3088,13 @@ fts_query_calculate_idf( } } - fprintf(stderr,"'%s' -> %lu/%lu %6.5lf\n", - word_freq->word, - query->total_docs, word_freq->doc_count, - word_freq->idf); + if (fts_enable_diag_print) { + fprintf(stderr,"'%s' -> " UINT64PF "/" UINT64PF + " %6.5lf\n", + word_freq->word, + query->total_docs, word_freq->doc_count, + word_freq->idf); + } } } @@ -3017,7 +3201,7 @@ fts_retrieve_ranking( ranking = rbt_value(fts_ranking_t, parent.last); - return (ranking->rank); + return(ranking->rank); } return(0); @@ -3184,7 +3368,7 @@ fts_query_parse( FTS Query entry point. @return DB_SUCCESS if successful otherwise error code */ UNIV_INTERN -ulint +dberr_t fts_query( /*======*/ trx_t* trx, /*!< in: transaction */ @@ -3196,7 +3380,7 @@ fts_query( fts_result_t** result) /*!< in/out: result doc ids */ { fts_query_t query; - ulint error; + dberr_t error = DB_SUCCESS; byte* lc_query_str; ulint lc_query_str_len; ulint result_len; @@ -3204,6 +3388,7 @@ fts_query( trx_t* query_trx; CHARSET_INFO* charset; ulint start_time_ms; + bool will_be_ignored = false; boolean_mode = flags & FTS_BOOL; @@ -3237,20 +3422,24 @@ fts_query( /* Setup the RB tree that will be used to collect per term statistics. 
*/ query.word_freqs = rbt_create_arg_cmp( - sizeof(fts_word_freq_t), innobase_fts_string_cmp, charset); + sizeof(fts_word_freq_t), innobase_fts_string_cmp, + (void*) charset); - query.total_docs = fts_get_total_document_count(index->table); + query.total_docs = dict_table_get_n_rows(index->table); - error = fts_get_total_word_count(trx, query.index, &query.total_words); +#ifdef FTS_DOC_STATS_DEBUG + if (ft_enable_diag_print) { + error = fts_get_total_word_count( + trx, query.index, &query.total_words); - if (error != DB_SUCCESS) { - goto func_exit; - } + if (error != DB_SUCCESS) { + goto func_exit; + } -#ifdef FTS_INTERNAL_DIAG_PRINT - fprintf(stderr, "Total docs: %lu Total words: %lu\n", - query.total_docs, query.total_words); -#endif + fprintf(stderr, "Total docs: " UINT64PF " Total words: %lu\n", + query.total_docs, query.total_words); + } +#endif /* FTS_DOC_STATS_DEBUG */ query.fts_common_table.suffix = "DELETED"; @@ -3299,13 +3488,14 @@ fts_query( sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); /* Parse the input query string. */ - if (fts_query_parse(&query, lc_query_str, query_len)) { + if (fts_query_parse(&query, lc_query_str, result_len)) { fts_ast_node_t* ast = query.root; /* Traverse the Abstract Syntax Tree (AST) and execute the query. */ query.error = fts_ast_visit( - FTS_NONE, ast, fts_query_visitor, &query); + FTS_NONE, ast, fts_query_visitor, + &query, &will_be_ignored); /* If query expansion is requested, extend the search with first search pass result */ @@ -3453,8 +3643,8 @@ words in documents found in the first search pass will be used as search arguments to search the document again, thus "expand" the search result set. 
@return DB_SUCCESS if success, otherwise the error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t fts_expand_query( /*=============*/ dict_index_t* index, /*!< in: FTS index to search */ @@ -3463,7 +3653,7 @@ fts_expand_query( const ib_rbt_node_t* node; const ib_rbt_node_t* token_node; fts_doc_t result_doc; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; const fts_index_cache_t*index_cache; /* If no doc is found in first search pass, return */ @@ -3482,7 +3672,7 @@ fts_expand_query( result_doc.tokens = rbt_create_arg_cmp( sizeof(fts_token_t), innobase_fts_text_cmp, - index_cache->charset); + (void*) index_cache->charset); result_doc.charset = index_cache->charset; @@ -3557,14 +3747,16 @@ fts_expand_query( /*************************************************************//** This function finds documents that contain all words in a phrase or proximity search. And if proximity search, verify -the words are close to each other enough, as in specified distance. +the words are close enough to each other, as in specified distance. This function is called for phrase and proximity search. @return TRUE if documents are found, FALSE if otherwise */ static ibool -fts_check_phrase_proximity( -/*=======================*/ - fts_query_t* query, /*!< in: query instance */ +fts_phrase_or_proximity_search( +/*===========================*/ + fts_query_t* query, /*!< in/out: query instance. + query->doc_ids might be instantiated + with qualified doc IDs */ ib_vector_t* tokens) /*!< in: Tokens contain words */ { ulint n_matched; @@ -3581,8 +3773,13 @@ fts_check_phrase_proximity( walk through the list and find common documents that contain all the matching words. 
*/ for (i = 0; i < n_matched; i++) { - ulint j; - ulint k = 0; + ulint j; + ulint k = 0; + fts_proximity_t qualified_pos; + ulint qualified_pos_buf[MAX_PROXIMITY_ITEM * 2]; + + qualified_pos.min_pos = &qualified_pos_buf[0]; + qualified_pos.max_pos = &qualified_pos_buf[MAX_PROXIMITY_ITEM]; match[0] = static_cast<fts_match_t*>( ib_vector_get(query->match_array[0], i)); @@ -3647,24 +3844,31 @@ fts_check_phrase_proximity( /* For this matching doc, we need to further verify whether the words in the doc are close - to each other, and with in distance specified + to each other, and within the distance specified in the proximity search */ if (query->flags & FTS_PHRASE) { matched = TRUE; - } else if (fts_proximity_check_position( - match, num_token, query->distance)) { - ulint z; - /* If so, mark we find a matching doc */ - fts_query_process_doc_id(query, match[0]->doc_id, 0); + } else if (fts_proximity_get_positions( + match, num_token, ULINT_MAX, &qualified_pos)) { + + /* Fetch the original documents and count the + words in between matching words to see that is in + specified distance */ + if (fts_query_is_in_proximity_range( + query, match, &qualified_pos)) { + /* If so, mark we find a matching doc */ + fts_query_process_doc_id( + query, match[0]->doc_id, 0); - matched = TRUE; - for (z = 0; z < num_token; z++) { - fts_string_t* token; - token = static_cast<fts_string_t*>( - ib_vector_get(tokens, z)); - fts_query_add_word_to_document( - query, match[0]->doc_id, - token->f_str); + matched = TRUE; + for (ulint z = 0; z < num_token; z++) { + fts_string_t* token; + token = static_cast<fts_string_t*>( + ib_vector_get(tokens, z)); + fts_query_add_word_to_document( + query, match[0]->doc_id, + token->f_str); + } } } @@ -3678,24 +3882,32 @@ func_exit: } /*************************************************************//** -This function check the words in result document are close to each -other (within proximity range). This is used for proximity search. 
-@return TRUE if words are close to each other, FALSE if otherwise */ +This function checks whether words in result documents are close to +each other (within proximity range as specified by "distance"). +If "distance" is MAX_ULINT, then it will find all combinations of +positions of matching words and store min and max positions +in the "qualified_pos" for later verification. +@return true if words are close to each other, false if otherwise */ static -ulint -fts_proximity_check_position( -/*=========================*/ - fts_match_t** match, /*!< in: query instance */ - ulint num_match, /*!< in: number of matching - items */ - ulint distance) /*!< in: distance value - for proximity search */ +bool +fts_proximity_get_positions( +/*========================*/ + fts_match_t** match, /*!< in: query instance */ + ulint num_match, /*!< in: number of matching + items */ + ulint distance, /*!< in: distance value + for proximity search */ + fts_proximity_t* qualified_pos) /*!< out: the position info + records ranges containing + all matching words. */ { ulint i; ulint idx[MAX_PROXIMITY_ITEM]; ulint num_pos[MAX_PROXIMITY_ITEM]; ulint min_idx; + qualified_pos->n_pos = 0; + ut_a(num_match < MAX_PROXIMITY_ITEM); /* Each word could appear multiple times in a doc. 
So @@ -3747,14 +3959,21 @@ fts_proximity_check_position( find a good match */ if (max_pos - min_pos <= distance && (i >= num_match || position[i] != ULINT_UNDEFINED)) { - return(TRUE); - } else { - /* Otherwise, move to the next position is the - list for the word with the smallest position */ - idx[min_idx]++; + /* The charset has variable character + length encoding, record the min_pos and + max_pos, we will need to verify the actual + number of characters */ + qualified_pos->min_pos[qualified_pos->n_pos] = min_pos; + qualified_pos->max_pos[qualified_pos->n_pos] = max_pos; + qualified_pos->n_pos++; } + + /* Otherwise, move to the next position is the + list for the word with the smallest position */ + idx[min_idx]++; } - /* Failed to find all words within the range for the doc */ - return(FALSE); + ut_ad(qualified_pos->n_pos <= MAX_PROXIMITY_ITEM); + + return(qualified_pos->n_pos != 0); } diff --git a/storage/innobase/fts/fts0sql.cc b/storage/innobase/fts/fts0sql.cc index 8e60a5f1132..03c19d93af6 100644 --- a/storage/innobase/fts/fts0sql.cc +++ b/storage/innobase/fts/fts0sql.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2007, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -262,7 +262,7 @@ fts_parse_sql_no_dict_lock( Evaluate an SQL query graph. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_eval_sql( /*=========*/ trx_t* trx, /*!< in: transaction */ @@ -327,16 +327,16 @@ fts_get_select_columns_str( Commit a transaction. 
@return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_sql_commit( /*===========*/ trx_t* trx) /*!< in: transaction */ { - ulint error; + dberr_t error; error = trx_commit_for_mysql(trx); - /* Commit above returns 0 on success, it should always succeed */ + /* Commit should always succeed */ ut_a(error == DB_SUCCESS); return(DB_SUCCESS); @@ -346,7 +346,7 @@ fts_sql_commit( Rollback a transaction. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_sql_rollback( /*=============*/ trx_t* trx) /*!< in: transaction */ diff --git a/storage/innobase/fts/fts0tlex.cc b/storage/innobase/fts/fts0tlex.cc index 69b859716d5..44434c4ea25 100644 --- a/storage/innobase/fts/fts0tlex.cc +++ b/storage/innobase/fts/fts0tlex.cc @@ -35,7 +35,7 @@ #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, - * if you want the limit (max/min) macros for int types. + * if you want the limit (max/min) macros for int types. */ #ifndef __STDC_LIMIT_MACROS #define __STDC_LIMIT_MACROS 1 @@ -247,7 +247,7 @@ struct yy_buffer_state int yy_bs_lineno; /**< The line count. */ int yy_bs_column; /**< The column count. */ - + /* Whether to try to fill the input buffer when we reach the * end of it. 
*/ @@ -368,10 +368,10 @@ struct yy_trans_info flex_int32_t yy_verify; flex_int32_t yy_nxt; }; -static yyconst flex_int16_t yy_accept[16] = +static yyconst flex_int16_t yy_accept[17] = { 0, - 4, 4, 7, 4, 1, 5, 1, 6, 2, 4, - 1, 1, 0, 3, 0 + 4, 4, 7, 4, 1, 5, 1, 6, 6, 2, + 4, 1, 1, 0, 3, 0 } ; static yyconst flex_int32_t yy_ec[256] = @@ -379,8 +379,8 @@ static yyconst flex_int32_t yy_ec[256] = 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 4, 1, 5, 1, 1, 1, 1, 1, 1, - 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 4, 1, 5, 1, 1, 6, 1, 1, 1, + 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -406,35 +406,35 @@ static yyconst flex_int32_t yy_ec[256] = 1, 1, 1, 1, 1 } ; -static yyconst flex_int32_t yy_meta[7] = +static yyconst flex_int32_t yy_meta[8] = { 0, - 1, 2, 3, 4, 5, 1 + 1, 2, 3, 4, 5, 5, 1 } ; -static yyconst flex_int16_t yy_base[19] = +static yyconst flex_int16_t yy_base[20] = { 0, - 0, 0, 17, 0, 5, 20, 0, 8, 0, 0, - 0, 0, 3, 20, 20, 9, 10, 14 + 0, 0, 18, 0, 6, 21, 0, 9, 21, 0, + 0, 0, 0, 4, 21, 21, 10, 11, 15 } ; -static yyconst flex_int16_t yy_def[19] = +static yyconst flex_int16_t yy_def[20] = { 0, - 15, 1, 15, 16, 16, 15, 17, 18, 16, 16, - 5, 17, 18, 15, 0, 15, 15, 15 + 16, 1, 16, 17, 17, 16, 18, 19, 16, 17, + 17, 5, 18, 19, 16, 0, 16, 16, 16 } ; -static yyconst flex_int16_t yy_nxt[27] = +static yyconst flex_int16_t yy_nxt[29] = { 0, - 4, 5, 6, 7, 8, 9, 11, 14, 12, 10, - 10, 12, 14, 12, 13, 13, 15, 13, 13, 3, - 15, 15, 15, 15, 15, 15 + 4, 5, 6, 7, 8, 9, 10, 12, 15, 13, + 11, 11, 13, 15, 13, 14, 14, 16, 14, 14, + 3, 16, 16, 16, 16, 16, 16, 16 } ; -static yyconst flex_int16_t yy_chk[27] = +static yyconst flex_int16_t yy_chk[29] = { 0, - 1, 1, 1, 1, 1, 1, 5, 13, 5, 16, - 16, 17, 8, 17, 18, 18, 3, 18, 18, 15, - 15, 15, 15, 15, 15, 15 + 1, 1, 1, 1, 1, 1, 1, 5, 14, 5, + 17, 17, 18, 8, 18, 19, 19, 3, 19, 19, + 16, 16, 16, 16, 16, 16, 16, 16 } ; /* The 
intent behind this definition is that it'll catch @@ -699,7 +699,7 @@ YY_DECL register yy_state_type yy_current_state; register char *yy_cp, *yy_bp; register int yy_act; - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; #line 44 "fts0tlex.l" @@ -757,13 +757,13 @@ yy_match: while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) { yy_current_state = (int) yy_def[yy_current_state]; - if ( yy_current_state >= 16 ) + if ( yy_current_state >= 17 ) yy_c = yy_meta[(unsigned int) yy_c]; } yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; ++yy_cp; } - while ( yy_current_state != 15 ); + while ( yy_current_state != 16 ); yy_cp = yyg->yy_last_accepting_cpos; yy_current_state = yyg->yy_last_accepting_state; @@ -969,7 +969,7 @@ case YY_STATE_EOF(INITIAL): */ static int yy_get_next_buffer (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf; register char *source = yyg->yytext_ptr; register int number_to_move, i; @@ -1035,9 +1035,9 @@ static int yy_get_next_buffer (yyscan_t yyscanner) else b->yy_buf_size *= 2; - b->yy_ch_buf = (char*) + b->yy_ch_buf = (char *) /* Include room in for 2 EOB chars. */ - fts0trealloc((void*) b->yy_ch_buf,b->yy_buf_size + 2 ,yyscanner ); + fts0trealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2 ,yyscanner ); } else /* Can't grow it, we don't own it. */ @@ -1086,7 +1086,7 @@ static int yy_get_next_buffer (yyscan_t yyscanner) if ((yy_size_t) (yyg->yy_n_chars + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) { /* Extend the array by 50%, plus the number we really need. 
*/ yy_size_t new_size = yyg->yy_n_chars + number_to_move + (yyg->yy_n_chars >> 1); - YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char*) fts0trealloc((void*) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size ,yyscanner ); + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) fts0trealloc((void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size ,yyscanner ); if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" ); } @@ -1106,7 +1106,7 @@ static int yy_get_next_buffer (yyscan_t yyscanner) { register yy_state_type yy_current_state; register char *yy_cp; - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; yy_current_state = yyg->yy_start; @@ -1121,7 +1121,7 @@ static int yy_get_next_buffer (yyscan_t yyscanner) while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) { yy_current_state = (int) yy_def[yy_current_state]; - if ( yy_current_state >= 16 ) + if ( yy_current_state >= 17 ) yy_c = yy_meta[(unsigned int) yy_c]; } yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; @@ -1138,7 +1138,7 @@ static int yy_get_next_buffer (yyscan_t yyscanner) static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state , yyscan_t yyscanner) { register int yy_is_jam; - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; /* This var may be unused depending upon options. */ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* This var may be unused depending upon options. 
*/ register char *yy_cp = yyg->yy_c_buf_p; register YY_CHAR yy_c = 1; @@ -1150,11 +1150,11 @@ static int yy_get_next_buffer (yyscan_t yyscanner) while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) { yy_current_state = (int) yy_def[yy_current_state]; - if ( yy_current_state >= 16 ) + if ( yy_current_state >= 17 ) yy_c = yy_meta[(unsigned int) yy_c]; } yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; - yy_is_jam = (yy_current_state == 15); + yy_is_jam = (yy_current_state == 16); return yy_is_jam ? 0 : yy_current_state; } @@ -1168,7 +1168,7 @@ static int yy_get_next_buffer (yyscan_t yyscanner) { int c; - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; *yyg->yy_c_buf_p = yyg->yy_hold_char; @@ -1226,7 +1226,7 @@ static int yy_get_next_buffer (yyscan_t yyscanner) } } - c = *(unsigned char*) yyg->yy_c_buf_p; /* cast for 8-bit char's */ + c = *(unsigned char *) yyg->yy_c_buf_p; /* cast for 8-bit char's */ *yyg->yy_c_buf_p = '\0'; /* preserve yytext */ yyg->yy_hold_char = *++yyg->yy_c_buf_p; @@ -1241,7 +1241,7 @@ static int yy_get_next_buffer (yyscan_t yyscanner) */ void fts0trestart (FILE * input_file , yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; if ( ! YY_CURRENT_BUFFER ){ fts0tensure_buffer_stack (yyscanner); @@ -1259,7 +1259,7 @@ static int yy_get_next_buffer (yyscan_t yyscanner) */ void fts0t_switch_to_buffer (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* TODO. 
We should be able to replace this entire function body * with @@ -1291,7 +1291,7 @@ static int yy_get_next_buffer (yyscan_t yyscanner) static void fts0t_load_buffer_state (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; yyg->yytext_ptr = yyg->yy_c_buf_p = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos; yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file; @@ -1317,7 +1317,7 @@ static void fts0t_load_buffer_state (yyscan_t yyscanner) /* yy_ch_buf has to be 2 characters longer than the size given because * we need to put in 2 end-of-buffer characters. */ - b->yy_ch_buf = (char*) fts0talloc(b->yy_buf_size + 2 ,yyscanner ); + b->yy_ch_buf = (char *) fts0talloc(b->yy_buf_size + 2 ,yyscanner ); if ( ! b->yy_ch_buf ) YY_FATAL_ERROR( "out of dynamic memory in fts0t_create_buffer()" ); @@ -1334,7 +1334,7 @@ static void fts0t_load_buffer_state (yyscan_t yyscanner) */ void fts0t_delete_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; if ( ! b ) return; @@ -1343,9 +1343,9 @@ static void fts0t_load_buffer_state (yyscan_t yyscanner) YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; if ( b->yy_is_our_buffer ) - fts0tfree((void*) b->yy_ch_buf ,yyscanner ); + fts0tfree((void *) b->yy_ch_buf ,yyscanner ); - fts0tfree((void*) b ,yyscanner ); + fts0tfree((void *) b ,yyscanner ); } /* Initializes or reinitializes a buffer. 
@@ -1356,7 +1356,7 @@ static void fts0t_load_buffer_state (yyscan_t yyscanner) { int oerrno = errno; - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; fts0t_flush_buffer(b ,yyscanner); @@ -1383,7 +1383,7 @@ static void fts0t_load_buffer_state (yyscan_t yyscanner) */ void fts0t_flush_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; if ( ! b ) return; @@ -1413,7 +1413,7 @@ static void fts0t_load_buffer_state (yyscan_t yyscanner) */ void fts0tpush_buffer_state (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; if (new_buffer == NULL) return; @@ -1444,7 +1444,7 @@ void fts0tpush_buffer_state (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner) */ void fts0tpop_buffer_state (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; if (!YY_CURRENT_BUFFER) return; @@ -1465,7 +1465,7 @@ void fts0tpop_buffer_state (yyscan_t yyscanner) static void fts0tensure_buffer_stack (yyscan_t yyscanner) { int num_to_alloc; - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; if (!yyg->yy_buffer_stack) { @@ -1474,7 +1474,7 @@ static void fts0tensure_buffer_stack (yyscan_t yyscanner) * immediate realloc on the next call. */ num_to_alloc = 1; - yyg->yy_buffer_stack = (struct yy_buffer_state**) fts0talloc + yyg->yy_buffer_stack = (struct yy_buffer_state**)fts0talloc (num_to_alloc * sizeof(struct yy_buffer_state*) , yyscanner); if ( ! 
yyg->yy_buffer_stack ) @@ -1493,7 +1493,7 @@ static void fts0tensure_buffer_stack (yyscan_t yyscanner) int grow_size = 8 /* arbitrary grow size */; num_to_alloc = yyg->yy_buffer_stack_max + grow_size; - yyg->yy_buffer_stack = (struct yy_buffer_state**) fts0trealloc + yyg->yy_buffer_stack = (struct yy_buffer_state**)fts0trealloc (yyg->yy_buffer_stack, num_to_alloc * sizeof(struct yy_buffer_state*) , yyscanner); @@ -1510,7 +1510,7 @@ static void fts0tensure_buffer_stack (yyscan_t yyscanner) * @param base the character buffer * @param size the size in bytes of the character buffer * @param yyscanner The scanner object. - * @return the newly allocated buffer state object. + * @return the newly allocated buffer state object. */ YY_BUFFER_STATE fts0t_scan_buffer (char * base, yy_size_t size , yyscan_t yyscanner) { @@ -1571,7 +1571,7 @@ YY_BUFFER_STATE fts0t_scan_bytes (yyconst char * yybytes, int _yybytes_len , y /* Get memory for full buffer, including space for trailing EOB's. */ n = _yybytes_len + 2; - buf = (char*) fts0talloc(n ,yyscanner ); + buf = (char *) fts0talloc(n ,yyscanner ); if ( ! buf ) YY_FATAL_ERROR( "out of dynamic memory in fts0t_scan_bytes()" ); @@ -1626,7 +1626,7 @@ static void yy_fatal_error (yyconst char* msg , yyscan_t yyscanner __attribute_ */ YY_EXTRA_TYPE fts0tget_extra (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; return yyextra; } @@ -1635,7 +1635,7 @@ YY_EXTRA_TYPE fts0tget_extra (yyscan_t yyscanner) */ int fts0tget_lineno (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; if (! YY_CURRENT_BUFFER) return 0; @@ -1648,7 +1648,7 @@ int fts0tget_lineno (yyscan_t yyscanner) */ int fts0tget_column (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; if (! 
YY_CURRENT_BUFFER) return 0; @@ -1661,7 +1661,7 @@ int fts0tget_column (yyscan_t yyscanner) */ FILE *fts0tget_in (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; return yyin; } @@ -1670,7 +1670,7 @@ FILE *fts0tget_in (yyscan_t yyscanner) */ FILE *fts0tget_out (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; return yyout; } @@ -1679,7 +1679,7 @@ FILE *fts0tget_out (yyscan_t yyscanner) */ int fts0tget_leng (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; return yyleng; } @@ -1689,7 +1689,7 @@ int fts0tget_leng (yyscan_t yyscanner) char *fts0tget_text (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; return yytext; } @@ -1699,7 +1699,7 @@ char *fts0tget_text (yyscan_t yyscanner) */ void fts0tset_extra (YY_EXTRA_TYPE user_defined , yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; yyextra = user_defined ; } @@ -1709,11 +1709,11 @@ void fts0tset_extra (YY_EXTRA_TYPE user_defined , yyscan_t yyscanner) */ void fts0tset_lineno (int line_number , yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* lineno is only valid if an input buffer exists. */ if (! 
YY_CURRENT_BUFFER ) - yy_fatal_error( "fts0tset_lineno called with no buffer" , yyscanner); + yy_fatal_error( "fts0tset_lineno called with no buffer" , yyscanner); yylineno = line_number; } @@ -1724,11 +1724,11 @@ void fts0tset_lineno (int line_number , yyscan_t yyscanner) */ void fts0tset_column (int column_no , yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* column is only valid if an input buffer exists. */ if (! YY_CURRENT_BUFFER ) - yy_fatal_error( "fts0tset_column called with no buffer" , yyscanner); + yy_fatal_error( "fts0tset_column called with no buffer" , yyscanner); yycolumn = column_no; } @@ -1741,25 +1741,25 @@ void fts0tset_column (int column_no , yyscan_t yyscanner) */ void fts0tset_in (FILE * in_str , yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; yyin = in_str ; } void fts0tset_out (FILE * out_str , yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; yyout = out_str ; } int fts0tget_debug (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; return yy_flex_debug; } void fts0tset_debug (int bdebug , yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; yy_flex_debug = bdebug ; } @@ -1819,19 +1819,19 @@ int fts0tlex_init_extra(YY_EXTRA_TYPE yy_user_defined,yyscan_t* ptr_yy_globals ) errno = ENOMEM; return 1; } - + /* By setting to 0xAA, we expose bugs in yy_init_globals. Leave at 0x00 for releases. 
*/ memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t)); fts0tset_extra (yy_user_defined, *ptr_yy_globals); - + return yy_init_globals ( *ptr_yy_globals ); } static int yy_init_globals (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* Initialization is the same as for the non-reentrant scanner. * This function is called from fts0tlex_destroy(), so don't allocate here. */ @@ -1839,7 +1839,7 @@ static int yy_init_globals (yyscan_t yyscanner) yyg->yy_buffer_stack = 0; yyg->yy_buffer_stack_top = 0; yyg->yy_buffer_stack_max = 0; - yyg->yy_c_buf_p = (char*) 0; + yyg->yy_c_buf_p = (char *) 0; yyg->yy_init = 0; yyg->yy_start = 0; @@ -1852,8 +1852,8 @@ static int yy_init_globals (yyscan_t yyscanner) yyin = stdin; yyout = stdout; #else - yyin = (FILE*) 0; - yyout = (FILE*) 0; + yyin = (FILE *) 0; + yyout = (FILE *) 0; #endif /* For future reference: Set errno on error, since we are called by @@ -1865,7 +1865,7 @@ static int yy_init_globals (yyscan_t yyscanner) /* fts0tlex_destroy is for both reentrant and non-reentrant scanners. */ int fts0tlex_destroy (yyscan_t yyscanner) { - struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* Pop the buffer stack, destroying each element. */ while(YY_CURRENT_BUFFER){ @@ -1918,24 +1918,24 @@ static int yy_flex_strlen (yyconst char * s , yyscan_t yyscanner __attribute__( void *fts0talloc (yy_size_t size , yyscan_t yyscanner __attribute__((unused))) { - return (void*) malloc( size ); + return (void *) malloc( size ); } void *fts0trealloc (void * ptr, yy_size_t size , yyscan_t yyscanner __attribute__((unused))) { - /* The cast to (char*) in the following accommodates both + /* The cast to (char *) in the following accommodates both * implementations that use char* generic pointers, and those * that use void* generic pointers. 
It works with the latter * because both ANSI C and C++ allow castless assignment from * any pointer type to void*, and deal with argument conversions * as though doing an assignment. */ - return (void*) realloc( (char*) ptr, size ); + return (void *) realloc( (char *) ptr, size ); } void fts0tfree (void * ptr , yyscan_t yyscanner __attribute__((unused))) { - free( (char*) ptr ); /* see fts0trealloc() for (char*) cast */ + free( (char *) ptr ); /* see fts0trealloc() for (char *) cast */ } #define YYTABLES_NAME "yytables" diff --git a/storage/innobase/fts/fts0tlex.l b/storage/innobase/fts/fts0tlex.l index 8b04a9fecf1..8c42678ac7a 100644 --- a/storage/innobase/fts/fts0tlex.l +++ b/storage/innobase/fts/fts0tlex.l @@ -57,7 +57,7 @@ this program; if not, write to the Free Software Foundation, Inc., return(FTS_TEXT); } -[^" \n]* { +[^" \n\%]* { val->token = strdup(fts0tget_text(yyscanner)); return(FTS_TERM); diff --git a/storage/innobase/ha/ha0ha.cc b/storage/innobase/ha/ha0ha.cc index b58dc486cfa..3ec778f3bec 100644 --- a/storage/innobase/ha/ha0ha.cc +++ b/storage/innobase/ha/ha0ha.cc @@ -32,9 +32,7 @@ Created 8/22/1994 Heikki Tuuri #ifdef UNIV_DEBUG # include "buf0buf.h" #endif /* UNIV_DEBUG */ -#ifndef UNIV_HOTBACKUP # include "btr0sea.h" -#endif /* !UNIV_HOTBACKUP */ #include "page0page.h" /*************************************************************//** @@ -79,7 +77,6 @@ ha_create_func( return(table); } -#ifndef UNIV_HOTBACKUP if (type == MEM_HEAP_FOR_PAGE_HASH) { /* We create a hash table protected by rw_locks for buf_pool->page_hash. */ @@ -97,7 +94,6 @@ ha_create_func( table->heaps[i] = mem_heap_create_typed(4096, type); ut_a(table->heaps[i]); } -#endif /* !UNIV_HOTBACKUP */ return(table); } @@ -120,7 +116,6 @@ ha_clear( || rw_lock_own(&btr_search_latch, RW_LOCK_EXCLUSIVE)); #endif /* UNIV_SYNC_DEBUG */ -#ifndef UNIV_HOTBACKUP /* Free the memory heaps. 
*/ n = table->n_sync_obj; @@ -151,7 +146,6 @@ ha_clear( table->n_sync_obj = 0; table->type = HASH_TABLE_SYNC_NONE; -#endif /* !UNIV_HOTBACKUP */ /* Clear the hash table. */ n = hash_get_n_cells(table); @@ -179,7 +173,7 @@ ha_insert_for_fold_func( #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG buf_block_t* block, /*!< in: buffer block containing the data */ #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ - const rec_t* data) /*!< in: data, must not be NULL */ + rec_t* data) /*!< in: data, must not be NULL */ { hash_cell_t* cell; ha_node_t* node; @@ -215,7 +209,7 @@ ha_insert_for_fold_func( prev_node->block = block; #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ - prev_node->data = (rec_t*) data; + prev_node->data = data; return(TRUE); } @@ -237,7 +231,7 @@ ha_insert_for_fold_func( return(FALSE); } - ha_node_set_data(node, block, (rec_t*) data); + ha_node_set_data(node, block, data); #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG if (table->adaptive) { diff --git a/storage/innobase/ha/hash0hash.cc b/storage/innobase/ha/hash0hash.cc index 99128a676d5..174b6bcb57e 100644 --- a/storage/innobase/ha/hash0hash.cc +++ b/storage/innobase/ha/hash0hash.cc @@ -106,14 +106,14 @@ void hash_mutex_exit_all_but( /*====================*/ hash_table_t* table, /*!< in: hash table */ - mutex_t* keep_mutex) /*!< in: mutex to keep */ + ib_mutex_t* keep_mutex) /*!< in: mutex to keep */ { ulint i; ut_ad(table->type == HASH_TABLE_SYNC_MUTEX); for (i = 0; i < table->n_sync_obj; i++) { - mutex_t* mutex = table->sync_obj.mutexes + i; + ib_mutex_t* mutex = table->sync_obj.mutexes + i; if (UNIV_LIKELY(keep_mutex != mutex)) { mutex_exit(mutex); } @@ -373,8 +373,8 @@ hash_create_sync_obj_func( switch (type) { case HASH_TABLE_SYNC_MUTEX: - table->sync_obj.mutexes = static_cast<mutex_t*>( - mem_alloc(n_sync_obj * sizeof(mutex_t))); + table->sync_obj.mutexes = static_cast<ib_mutex_t*>( + mem_alloc(n_sync_obj * sizeof(ib_mutex_t))); for (i = 0; i < n_sync_obj; i++) { mutex_create(hash_table_mutex_key, diff 
--git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index e19fe47e81a..44bbe20c8d3 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -3,6 +3,7 @@ Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2008, 2009 Google Inc. Copyright (c) 2009, Percona Inc. +Copyright (c) 2012, Facebook Inc. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -36,8 +37,10 @@ this program; if not, write to the Free Software Foundation, Inc., #include <sql_acl.h> // PROCESS_ACL #include <debug_sync.h> // DEBUG_SYNC +#include <my_base.h> // HA_OPTION_* #include <mysys_err.h> #include <innodb_priv.h> + #ifdef _WIN32 #include <io.h> #endif @@ -57,8 +60,10 @@ this program; if not, write to the Free Software Foundation, Inc., #include "srv0srv.h" #include "trx0roll.h" #include "trx0trx.h" + #include "trx0sys.h" #include "mtr0mtr.h" +#include "rem0types.h" #include "row0ins.h" #include "row0mysql.h" #include "row0sel.h" @@ -75,14 +80,24 @@ this program; if not, write to the Free Software Foundation, Inc., #include "row0merge.h" #include "dict0boot.h" #include "dict0stats.h" +#include "dict0stats_bg.h" #include "ha_prototypes.h" #include "ut0mem.h" #include "ibuf0ibuf.h" #include "dict0dict.h" #include "srv0mon.h" +#include "api0api.h" +#include "api0misc.h" #include "pars0pars.h" #include "fts0fts.h" #include "fts0types.h" +#include "row0import.h" +#include "row0quiesce.h" +#ifdef UNIV_DEBUG +#include "trx0purge.h" +#endif /* UNIV_DEBUG */ +#include "fts0priv.h" +#include "page0zip.h" #include "ha_innodb.h" #include "i_s.h" @@ -112,11 +127,9 @@ static const long AUTOINC_NEW_STYLE_LOCKING = 1; static const long AUTOINC_NO_LOCKING = 2; static long innobase_mirrored_log_groups; -static long innobase_log_files_in_group; static long innobase_log_buffer_size; static long 
innobase_additional_mem_pool_size; static long innobase_file_io_threads; -static long innobase_force_recovery; static long innobase_open_files; static long innobase_autoinc_lock_mode; static ulong innobase_commit_concurrency = 0; @@ -134,12 +147,13 @@ static uint innobase_old_blocks_pct; of the buffer pool. */ static uint innobase_change_buffer_max_size = CHANGE_BUFFER_DEFAULT_SIZE; +static ulong innobase_compression_level = DEFAULT_COMPRESSION_LEVEL; + /* The default values for the following char* start-up parameters are determined in innobase_init below: */ static char* innobase_data_home_dir = NULL; static char* innobase_data_file_path = NULL; -static char* innobase_log_group_home_dir = NULL; static char* innobase_file_format_name = NULL; static char* innobase_change_buffering = NULL; static char* innobase_enable_monitor_counter = NULL; @@ -176,7 +190,6 @@ static my_bool innobase_stats_on_metadata = TRUE; static my_bool innobase_large_prefix = FALSE; static my_bool innodb_optimize_fulltext_only = FALSE; - static char* internal_innobase_data_file_path = NULL; static char* innodb_version_str = (char*) INNODB_VERSION_STR; @@ -250,6 +263,11 @@ const struct _ft_vft ft_vft_result = {NULL, innobase_fts_retrieve_ranking, NULL}; +const struct _ft_vft_ext ft_vft_ext_result = {innobase_fts_get_version, + innobase_fts_flags, + innobase_fts_retrieve_docid, + innobase_fts_count_matches}; + #ifdef HAVE_PSI_INTERFACE /* Keys to register pthread mutexes/cond in the current file with performance schema */ @@ -262,8 +280,7 @@ static mysql_pfs_key_t pending_checkpoint_mutex_key; static PSI_mutex_info all_pthread_mutexes[] = { {&commit_threads_m_key, "commit_threads_m", 0}, {&commit_cond_mutex_key, "commit_cond_mutex", 0}, - {&innobase_share_mutex_key, "innobase_share_mutex", 0}, - {&pending_checkpoint_mutex_key, "pending_checkpoint_mutex", 0} + {&innobase_share_mutex_key, "innobase_share_mutex", 0} }; static PSI_cond_info all_innodb_conds[] = { @@ -306,8 +323,10 @@ static 
PSI_mutex_info all_innodb_mutexes[] = { # endif /* UNIV_MEM_DEBUG */ {&mem_pool_mutex_key, "mem_pool_mutex", 0}, {&mutex_list_mutex_key, "mutex_list_mutex", 0}, + {&page_zip_stat_per_index_mutex_key, "page_zip_stat_per_index_mutex", 0}, {&purge_sys_bh_mutex_key, "purge_sys_bh_mutex", 0}, {&recv_sys_mutex_key, "recv_sys_mutex", 0}, + {&recv_writer_mutex_key, "recv_writer_mutex", 0}, {&rseg_mutex_key, "rseg_mutex", 0}, # ifdef UNIV_SYNC_DEBUG {&rw_lock_debug_mutex_key, "rw_lock_debug_mutex", 0}, @@ -336,8 +355,12 @@ static PSI_mutex_info all_innodb_mutexes[] = { #ifndef HAVE_ATOMIC_BUILTINS {&srv_conc_mutex_key, "srv_conc_mutex", 0}, #endif /* !HAVE_ATOMIC_BUILTINS */ +#ifndef HAVE_ATOMIC_BUILTINS_64 + {&monitor_mutex_key, "monitor_mutex", 0}, +#endif /* !HAVE_ATOMIC_BUILTINS_64 */ {&ut_list_mutex_key, "ut_list_mutex", 0}, {&trx_sys_mutex_key, "trx_sys_mutex", 0}, + {&zip_pad_mutex_key, "zip_pad_mutex", 0}, }; # endif /* UNIV_PFS_MUTEX */ @@ -364,6 +387,7 @@ static PSI_rwlock_info all_innodb_rwlocks[] = { {&trx_i_s_cache_lock_key, "trx_i_s_cache_lock", 0}, {&trx_purge_latch_key, "trx_purge_latch", 0}, {&index_tree_rw_lock_key, "index_tree_rw_lock", 0}, + {&index_online_log_key, "index_online_log", 0}, {&dict_table_stats_latch_key, "dict_table_stats", 0}, {&hash_table_rw_lock_key, "hash table locks", 0} }; @@ -381,7 +405,8 @@ static PSI_thread_info all_innodb_threads[] = { {&srv_monitor_thread_key, "srv_monitor_thread", 0}, {&srv_master_thread_key, "srv_master_thread", 0}, {&srv_purge_thread_key, "srv_purge_thread", 0}, - {&buf_page_cleaner_thread_key, "page_cleaner_thread", 0} + {&buf_page_cleaner_thread_key, "page_cleaner_thread", 0}, + {&recv_writer_thread_key, "recovery writer thread", 0} }; # endif /* UNIV_PFS_THREAD */ @@ -396,6 +421,70 @@ static PSI_file_info all_innodb_files[] = { # endif /* UNIV_PFS_IO */ #endif /* HAVE_PSI_INTERFACE */ +/** Always normalize table name to lower case on Windows */ +#ifdef __WIN__ +#define normalize_table_name(norm_name, name) 
\ + normalize_table_name_low(norm_name, name, TRUE) +#else +#define normalize_table_name(norm_name, name) \ + normalize_table_name_low(norm_name, name, FALSE) +#endif /* __WIN__ */ + +/** Set up InnoDB API callback function array */ +ib_cb_t innodb_api_cb[] = { + (ib_cb_t) ib_cursor_open_table, + (ib_cb_t) ib_cursor_read_row, + (ib_cb_t) ib_cursor_insert_row, + (ib_cb_t) ib_cursor_delete_row, + (ib_cb_t) ib_cursor_update_row, + (ib_cb_t) ib_cursor_moveto, + (ib_cb_t) ib_cursor_first, + (ib_cb_t) ib_cursor_next, + (ib_cb_t) ib_cursor_last, + (ib_cb_t) ib_cursor_set_match_mode, + (ib_cb_t) ib_sec_search_tuple_create, + (ib_cb_t) ib_clust_read_tuple_create, + (ib_cb_t) ib_tuple_delete, + (ib_cb_t) ib_tuple_copy, + (ib_cb_t) ib_tuple_read_u32, + (ib_cb_t) ib_tuple_write_u32, + (ib_cb_t) ib_tuple_read_u64, + (ib_cb_t) ib_tuple_write_u64, + (ib_cb_t) ib_tuple_read_i32, + (ib_cb_t) ib_tuple_write_i32, + (ib_cb_t) ib_tuple_read_i64, + (ib_cb_t) ib_tuple_write_i64, + (ib_cb_t) ib_tuple_get_n_cols, + (ib_cb_t) ib_col_set_value, + (ib_cb_t) ib_col_get_value, + (ib_cb_t) ib_col_get_meta, + (ib_cb_t) ib_trx_begin, + (ib_cb_t) ib_trx_commit, + (ib_cb_t) ib_trx_rollback, + (ib_cb_t) ib_trx_start, + (ib_cb_t) ib_trx_release, + (ib_cb_t) ib_trx_state, + (ib_cb_t) ib_cursor_lock, + (ib_cb_t) ib_cursor_close, + (ib_cb_t) ib_cursor_new_trx, + (ib_cb_t) ib_cursor_reset, + (ib_cb_t) ib_open_table_by_name, + (ib_cb_t) ib_col_get_name, + (ib_cb_t) ib_table_truncate, + (ib_cb_t) ib_cursor_open_index_using_name, + (ib_cb_t) ib_close_thd, + (ib_cb_t) ib_cfg_get_cfg, + (ib_cb_t) ib_cursor_set_cluster_access, + (ib_cb_t) ib_cursor_commit_trx, + (ib_cb_t) ib_cfg_trx_level, + (ib_cb_t) ib_tuple_get_n_user_cols, + (ib_cb_t) ib_cursor_set_lock_mode, + (ib_cb_t) ib_cursor_clear_trx, + (ib_cb_t) ib_get_idx_field_name, + (ib_cb_t) ib_trx_get_start_time, + (ib_cb_t) ib_cfg_bk_commit_interval +}; + /*************************************************************//** Check whether valid argument given to 
innodb_ft_*_stopword_table. This function is registered as a callback with MySQL. @@ -410,24 +499,10 @@ innodb_stopword_table_validate( void* save, /*!< out: immediate result for update function */ struct st_mysql_value* value); /*!< in: incoming string */ -/****************************************************************//** -Update the session variable innodb_session_stopword_table -with the "saved" stopword table name value. This function -is registered as a callback with MySQL. */ -static -void -innodb_session_stopword_update( -/*===========================*/ - THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ - void* var_ptr,/*!< out: where the - formal string goes */ - const void* save); /*!< in: immediate result - from check function */ -/** "GEN_CLUST_INDEX" is the name reserved for Innodb default -system primary index. */ -static const char innobase_index_reserve_name[]= "GEN_CLUST_INDEX"; + +/** "GEN_CLUST_INDEX" is the name reserved for InnoDB default +system clustered index when there is no primary key. */ +const char innobase_index_reserve_name[] = "GEN_CLUST_INDEX"; static const char innobase_hton_name[]= "InnoDB"; @@ -450,19 +525,14 @@ static MYSQL_THDVAR_BOOL(ft_enable_stopword, PLUGIN_VAR_OPCMDARG, NULL, NULL, /* default */ TRUE); -static MYSQL_THDVAR_BOOL(analyze_is_persistent, PLUGIN_VAR_OPCMDARG, - "ANALYZE TABLE in InnoDB uses a more precise (and slow) sampling " - "algorithm and saves the results persistently.", - /* check_func */ NULL, /* update_func */ NULL, - /* default */ FALSE); - static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG, "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. 
Values above 100000000 disable the timeout.", NULL, NULL, 50, 1, 1024 * 1024 * 1024, 0); -static MYSQL_THDVAR_STR(ft_user_stopword_table, PLUGIN_VAR_OPCMDARG, +static MYSQL_THDVAR_STR(ft_user_stopword_table, + PLUGIN_VAR_OPCMDARG|PLUGIN_VAR_MEMALLOC, "User supplied stopword table name, effective in the session level.", - innodb_stopword_table_validate, innodb_session_stopword_update, NULL); + innodb_stopword_table_validate, NULL, NULL); static SHOW_VAR innodb_status_variables[]= { {"buffer_pool_dump_status", @@ -471,8 +541,12 @@ static SHOW_VAR innodb_status_variables[]= { (char*) &export_vars.innodb_buffer_pool_load_status, SHOW_CHAR}, {"buffer_pool_pages_data", (char*) &export_vars.innodb_buffer_pool_pages_data, SHOW_LONG}, + {"buffer_pool_bytes_data", + (char*) &export_vars.innodb_buffer_pool_bytes_data, SHOW_LONG}, {"buffer_pool_pages_dirty", (char*) &export_vars.innodb_buffer_pool_pages_dirty, SHOW_LONG}, + {"buffer_pool_bytes_dirty", + (char*) &export_vars.innodb_buffer_pool_bytes_dirty, SHOW_LONG}, {"buffer_pool_pages_flushed", (char*) &export_vars.innodb_buffer_pool_pages_flushed, SHOW_LONG}, {"buffer_pool_pages_free", @@ -567,6 +641,12 @@ static SHOW_VAR innodb_status_variables[]= { (char*) &export_vars.innodb_truncated_status_writes, SHOW_LONG}, {"available_undo_logs", (char*) &export_vars.innodb_available_undo_logs, SHOW_LONG}, +#ifdef UNIV_DEBUG + {"purge_trx_id_age", + (char*) &export_vars.innodb_purge_trx_id_age, SHOW_LONG}, + {"purge_view_trx_id_age", + (char*) &export_vars.innodb_purge_view_trx_id_age, SHOW_LONG}, +#endif /* UNIV_DEBUG */ {NullS, NullS, SHOW_LONG} }; @@ -598,18 +678,8 @@ innobase_close_connection( THD* thd); /*!< in: MySQL thread handle for which to close the connection */ -static -void -innobase_commit_ordered( -/*======================*/ - handlerton *hton, /*!< in/out: Innodb handlerton */ - THD* thd, /*!< in: MySQL thread handle */ - bool all); /*!< in: TRUE - commit transaction - FALSE - the current SQL statement - ended */ 
-static -void -innobase_kill_query(handlerton *hton, THD* thd, enum thd_kill_levels level); +static void innobase_kill_query(handlerton *hton, THD* thd, enum thd_kill_levels level); +static void innobase_commit_ordered(handlerton *hton, THD* thd, bool all); /*****************************************************************//** Commits a transaction in an InnoDB database or marks an SQL statement @@ -684,14 +754,7 @@ innobase_release_savepoint( savepoint should be released */ void* savepoint); /*!< in: savepoint data */ -/*****************************************************************//** -Handle a commit checkpoint request from server layer. -We simply flush the redo log immediately and do the notify call.*/ -static -void -innobase_checkpoint_request( - handlerton *hton, - void *cookie); +static void innobase_checkpoint_request(handlerton *hton, void *cookie); /************************************************************************//** Function for constructing an InnoDB table handler instance. */ @@ -745,13 +808,6 @@ int innobase_file_format_validate_and_set( /*==================================*/ const char* format_max); /*!< in: parameter value */ -/****************************************************************//** -Return alter table flags supported in an InnoDB database. */ -static -uint -innobase_alter_table_flags( -/*=======================*/ - uint flags); /*******************************************************************//** This function is used to prepare an X/Open XA distributed transaction. @@ -925,6 +981,21 @@ innodb_enable_monitor_at_startup( /*=============================*/ char* str); /*!< in: monitor counter enable list */ +/********************************************************************* +Normalizes a table name string. A normalized name consists of the +database name catenated to '/' and table name. An example: +test/mytable. 
On Windows normalization puts both the database name and the +table name always to lower case if "set_lower_case" is set to TRUE. */ +static +void +normalize_table_name_low( +/*=====================*/ + char* norm_name, /* out: normalized name as a + null-terminated string */ + const char* name, /* in: table name string */ + ibool set_lower_case); /* in: TRUE if we want to set + name to lower case */ + /*************************************************************//** Check for a valid value of innobase_commit_concurrency. @return 0 for valid innodb_commit_concurrency */ @@ -967,7 +1038,7 @@ innobase_create_handler( TABLE_SHARE* table, MEM_ROOT* mem_root) { - return new (mem_root) ha_innobase(hton, table); + return(new (mem_root) ha_innobase(hton, table)); } /* General functions */ @@ -1008,9 +1079,22 @@ UNIV_INTERN ibool thd_is_replication_slave_thread( /*============================*/ - void* thd) /*!< in: thread handle (THD*) */ + THD* thd) /*!< in: thread handle */ +{ + return((ibool) thd_slave_thread(thd)); +} + +/******************************************************************//** +Gets information on the durability property requested by thread. +Used when writing either a prepare or commit record to the log +buffer. @return the durability property. */ +UNIV_INTERN +enum durability_properties +thd_requested_durability( +/*=====================*/ + const THD* thd) /*!< in: thread handle */ { - return((ibool) thd_slave_thread((THD*) thd)); + return(thd_get_durability_property(thd)); } /******************************************************************//** @@ -1020,10 +1104,9 @@ UNIV_INTERN ibool thd_trx_is_read_only( /*=================*/ - void* thd) /*!< in: thread handle (THD*) */ + THD* thd) /*!< in: thread handle */ { - /* Waiting on WL#6046 to complete. 
*/ - return(FALSE); + return(thd != 0 && thd_tx_is_read_only(thd)); } /******************************************************************//** @@ -1034,11 +1117,11 @@ UNIV_INTERN ibool thd_trx_is_auto_commit( /*===================*/ - void* thd) /*!< in: thread handle (THD*) can be NULL */ + THD* thd) /*!< in: thread handle, can be NULL */ { return(thd != NULL && !thd_test_options( - static_cast<THD*>(thd), + thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN) && thd_is_select(thd)); } @@ -1114,6 +1197,17 @@ innobase_srv_conc_force_exit_innodb( } /******************************************************************//** +Returns the NUL terminated value of glob_hostname. +@return pointer to glob_hostname. */ +UNIV_INTERN +const char* +server_get_hostname() +/*=================*/ +{ + return(glob_hostname); +} + +/******************************************************************//** Returns true if the transaction this thread is processing has edited non-transactional tables. Used by the deadlock detector when deciding which transaction to rollback in case of a deadlock - we try to avoid @@ -1123,9 +1217,9 @@ UNIV_INTERN ibool thd_has_edited_nontrans_tables( /*===========================*/ - void* thd) /*!< in: thread handle (THD*) */ + THD* thd) /*!< in: thread handle */ { - return((ibool) thd_non_transactional_update((THD*) thd)); + return((ibool) thd_non_transactional_update(thd)); } /******************************************************************//** @@ -1135,9 +1229,9 @@ UNIV_INTERN ibool thd_is_select( /*==========*/ - const void* thd) /*!< in: thread handle (THD*) */ + const THD* thd) /*!< in: thread handle */ { - return(thd_sql_command((const THD*) thd) == SQLCOM_SELECT); + return(thd_sql_command(thd) == SQLCOM_SELECT); } /******************************************************************//** @@ -1148,10 +1242,10 @@ UNIV_INTERN ibool thd_supports_xa( /*============*/ - void* thd) /*!< in: thread handle (THD*), or NULL to query + THD* thd) /*!< in: thread handle, or 
NULL to query the global innodb_supports_xa */ { - return(THDVAR((THD*) thd, support_xa)); + return(THDVAR(thd, support_xa)); } /******************************************************************//** @@ -1161,12 +1255,12 @@ UNIV_INTERN ulong thd_lock_wait_timeout( /*==================*/ - void* thd) /*!< in: thread handle (THD*), or NULL to query + THD* thd) /*!< in: thread handle, or NULL to query the global innodb_lock_wait_timeout */ { /* According to <mysql/plugin.h>, passing thd == NULL returns the global value of the session variable. */ - return(THDVAR((THD*) thd, lock_wait_timeout)); + return(THDVAR(thd, lock_wait_timeout)); } /******************************************************************//** @@ -1175,17 +1269,18 @@ UNIV_INTERN void thd_set_lock_wait_time( /*===================*/ - void* thd, /*!< in: thread handle (THD*) */ + THD* thd, /*!< in/out: thread handle */ ulint value) /*!< in: time waited for the lock */ { if (thd) { - thd_storage_lock_wait((THD*) thd, value); + thd_storage_lock_wait(thd, value); } } /********************************************************************//** Obtain the InnoDB transaction of a MySQL thread. @return reference to transaction pointer */ +__attribute__((warn_unused_result, nonnull)) static inline trx_t*& thd_to_trx( @@ -1245,11 +1340,11 @@ Converts an InnoDB error code to a MySQL error code and also tells to MySQL about a possible transaction rollback inside InnoDB caused by a lock wait timeout or a deadlock. 
@return MySQL error code */ -UNIV_INTERN +static int convert_error_code_to_mysql( /*========================*/ - int error, /*!< in: InnoDB error code */ + dberr_t error, /*!< in: InnoDB error code */ ulint flags, /*!< in: InnoDB table flags, or 0 */ THD* thd) /*!< in: user thread handle or NULL */ { @@ -1287,7 +1382,7 @@ convert_error_code_to_mysql( return(HA_ERR_FOUND_DUPP_KEY); case DB_READ_ONLY: - return(HA_ERR_READ_ONLY_TRANSACTION); + return(HA_ERR_TABLE_READONLY); case DB_FOREIGN_DUPLICATE_KEY: return(HA_ERR_FOREIGN_DUPLICATE_KEY); @@ -1344,12 +1439,19 @@ convert_error_code_to_mysql( case DB_OUT_OF_FILE_SPACE: return(HA_ERR_RECORD_FILE_FULL); + case DB_TABLE_IN_FK_CHECK: + return(HA_ERR_TABLE_IN_FK_CHECK); + case DB_TABLE_IS_BEING_USED: return(HA_ERR_WRONG_COMMAND); + case DB_TABLESPACE_DELETED: case DB_TABLE_NOT_FOUND: return(HA_ERR_NO_SUCH_TABLE); + case DB_TABLESPACE_NOT_FOUND: + return(HA_ERR_NO_SUCH_TABLE); + case DB_TOO_BIG_RECORD: { /* If prefix is true then a 768-byte prefix is stored locally for BLOB fields. Refer to dict_table_get_format() */ @@ -1365,7 +1467,7 @@ convert_error_code_to_mysql( "or ROW_FORMAT=COMPRESSED ": "", prefix ? DICT_MAX_FIXED_COL_LEN : 0); return(HA_ERR_TO_BIG_ROW); - } + } case DB_TOO_BIG_INDEX_COL: my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0), @@ -1386,27 +1488,21 @@ convert_error_code_to_mysql( return(HA_ERR_LOCK_TABLE_FULL); - case DB_PRIMARY_KEY_IS_NULL: - return(ER_PRIMARY_CANT_HAVE_NULL); - case DB_FTS_INVALID_DOCID: return(HA_FTS_INVALID_DOCID); case DB_TOO_MANY_CONCURRENT_TRXS: - /* New error code HA_ERR_TOO_MANY_CONCURRENT_TRXS is only - available in 5.1.38 and later, but the plugin should still - work with previous versions of MySQL. 
*/ -#ifdef HA_ERR_TOO_MANY_CONCURRENT_TRXS return(HA_ERR_TOO_MANY_CONCURRENT_TRXS); -#else /* HA_ERR_TOO_MANY_CONCURRENT_TRXS */ - return(HA_ERR_RECORD_FILE_FULL); -#endif /* HA_ERR_TOO_MANY_CONCURRENT_TRXS */ case DB_UNSUPPORTED: return(HA_ERR_UNSUPPORTED); case DB_INDEX_CORRUPT: return(HA_ERR_INDEX_CORRUPT); case DB_UNDO_RECORD_TOO_BIG: return(HA_ERR_UNDO_REC_TOO_BIG); + case DB_OUT_OF_MEMORY: + return(HA_ERR_OUT_OF_MEM); + case DB_TABLESPACE_EXISTS: + return(HA_ERR_TABLESPACE_EXISTS); } } @@ -1417,18 +1513,30 @@ void innobase_mysql_print_thd( /*=====================*/ FILE* f, /*!< in: output stream */ - void* thd, /*!< in: pointer to a MySQL THD object */ + THD* thd, /*!< in: MySQL THD object */ uint max_query_len) /*!< in: max query length to print, or 0 to use the default max length */ { char buffer[1024]; - fputs(thd_security_context((THD*) thd, buffer, sizeof buffer, + fputs(thd_security_context(thd, buffer, sizeof buffer, max_query_len), f); putc('\n', f); } /******************************************************************//** +Get the error message format string. +@return the format string or 0 if not found. */ +UNIV_INTERN +const char* +innobase_get_err_msg( +/*=================*/ + int error_code) /*!< in: MySQL error code */ +{ + return(my_get_err_msg(error_code)); +} + +/******************************************************************//** Get the variable length bounds of the given character set. */ UNIV_INTERN void @@ -1456,7 +1564,7 @@ innobase_get_cset_width( /* Fix bug#46256: allow tables to be dropped if the collation is not found, but issue a warning. 
*/ - if ((global_system_variables.log_warnings) + if ((log_warnings) && (cset != 0)){ sql_print_warning( @@ -1572,9 +1680,9 @@ UNIV_INTERN struct charset_info_st* innobase_get_charset( /*=================*/ - void* mysql_thd) /*!< in: MySQL thread handle */ + THD* mysql_thd) /*!< in: MySQL thread handle */ { - return(thd_charset((THD*) mysql_thd)); + return(thd_charset(mysql_thd)); } /**********************************************************************//** @@ -1584,12 +1692,12 @@ UNIV_INTERN const char* innobase_get_stmt( /*==============*/ - void* mysql_thd, /*!< in: MySQL thread handle */ + THD* thd, /*!< in: MySQL thread handle */ size_t* length) /*!< out: length of the SQL statement */ { LEX_STRING* stmt; - stmt = thd_query_string((THD*) mysql_thd); + stmt = thd_query_string(thd); *length = stmt->length; return(stmt->str); } @@ -1621,99 +1729,6 @@ innobase_get_lower_case_table_names(void) return(lower_case_table_names); } -#if defined (__WIN__) && defined (MYSQL_DYNAMIC_PLUGIN) -extern MYSQL_PLUGIN_IMPORT MY_TMPDIR mysql_tmpdir_list; -/*******************************************************************//** -Map an OS error to an errno value. The OS error number is stored in -_doserrno and the mapped value is stored in errno) */ -void __cdecl -_dosmaperr( - unsigned long); /*!< in: OS error value */ - -/*********************************************************************//** -Creates a temporary file. -@return temporary file descriptor, or < 0 on error */ -UNIV_INTERN -int -innobase_mysql_tmpfile(void) -/*========================*/ -{ - int fd; /* handle of opened file */ - HANDLE osfh; /* OS handle of opened file */ - char* tmpdir; /* point to the directory - where to create file */ - TCHAR path_buf[MAX_PATH - 14]; /* buffer for tmp file path. - The length cannot be longer - than MAX_PATH - 14, or - GetTempFileName will fail. 
*/ - char filename[MAX_PATH]; /* name of the tmpfile */ - DWORD fileaccess = GENERIC_READ /* OS file access */ - | GENERIC_WRITE - | DELETE; - DWORD fileshare = FILE_SHARE_READ /* OS file sharing mode */ - | FILE_SHARE_WRITE - | FILE_SHARE_DELETE; - DWORD filecreate = CREATE_ALWAYS; /* OS method of open/create */ - DWORD fileattrib = /* OS file attribute flags */ - FILE_ATTRIBUTE_NORMAL - | FILE_FLAG_DELETE_ON_CLOSE - | FILE_ATTRIBUTE_TEMPORARY - | FILE_FLAG_SEQUENTIAL_SCAN; - - DBUG_ENTER("innobase_mysql_tmpfile"); - - tmpdir = my_tmpdir(&mysql_tmpdir_list); - - /* The tmpdir parameter can not be NULL for GetTempFileName. */ - if (!tmpdir) { - uint ret; - - /* Use GetTempPath to determine path for temporary files. */ - ret = GetTempPath(sizeof(path_buf), path_buf); - if (ret > sizeof(path_buf) || (ret == 0)) { - - _dosmaperr(GetLastError()); /* map error */ - DBUG_RETURN(-1); - } - - tmpdir = path_buf; - } - - /* Use GetTempFileName to generate a unique filename. */ - if (!GetTempFileName(tmpdir, "ib", 0, filename)) { - - _dosmaperr(GetLastError()); /* map error */ - DBUG_RETURN(-1); - } - - DBUG_PRINT("info", ("filename: %s", filename)); - - /* Open/Create the file. */ - osfh = CreateFile(filename, fileaccess, fileshare, NULL, - filecreate, fileattrib, NULL); - if (osfh == INVALID_HANDLE_VALUE) { - - /* open/create file failed! */ - _dosmaperr(GetLastError()); /* map error */ - DBUG_RETURN(-1); - } - - do { - /* Associates a CRT file descriptor with the OS file handle. */ - fd = _open_osfhandle((intptr_t) osfh, 0); - } while (fd == -1 && errno == EINTR); - - if (fd == -1) { - /* Open failed, close the file handle. */ - - _dosmaperr(GetLastError()); /* map error */ - CloseHandle(osfh); /* no need to check if - CloseHandle fails */ - } - - DBUG_RETURN(fd); -} -#else /*********************************************************************//** Creates a temporary file. 
@return temporary file descriptor, or < 0 on error */ @@ -1724,6 +1739,9 @@ innobase_mysql_tmpfile(void) { int fd2 = -1; File fd = mysql_tmpfile("ib"); + + DBUG_EXECUTE_IF("innobase_tmpfile_creation_failure", return(-1);); + if (fd >= 0) { /* Copy the file descriptor, so that the additional resources allocated by create_temp_file() can be freed by invoking @@ -1767,7 +1785,6 @@ innobase_mysql_tmpfile(void) } return(fd2); } -#endif /* defined (__WIN__) && defined (MYSQL_DYNAMIC_PLUGIN) */ /*********************************************************************//** Wrapper around MySQL's copy_and_convert function. @@ -1845,11 +1862,11 @@ values we want to reserve for multi-value inserts e.g., INSERT INTO T VALUES(), (), (); -innobase_next_autoinc() will be called with increment set to -n * 3 where autoinc_lock_mode != TRADITIONAL because we want -to reserve 3 values for the multi-value INSERT above. +innobase_next_autoinc() will be called with increment set to 3 where +autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for +the multi-value INSERT above. @return the next value */ -static +UNIV_INTERN ulonglong innobase_next_autoinc( /*==================*/ @@ -1886,6 +1903,7 @@ innobase_next_autoinc( in reality a negative value.The visual studio compilers converts large double values automatically into unsigned long long datatype maximum value */ + if (block >= max_value || offset > max_value || current >= max_value @@ -2055,7 +2073,7 @@ trx_deregister_from_2pc( trx_t* trx) /* in: transaction */ { trx->is_registered = 0; - trx->active_commit_ordered = 0; + trx->active_commit_ordered = 0; } /*********************************************************************//** @@ -2082,6 +2100,78 @@ trx_is_started( } /*********************************************************************//** +Copy table flags from MySQL's HA_CREATE_INFO into an InnoDB table object. 
+Those flags are stored in .frm file and end up in the MySQL table object, +but are frequently used inside InnoDB so we keep their copies into the +InnoDB table object. */ +UNIV_INTERN +void +innobase_copy_frm_flags_from_create_info( +/*=====================================*/ + dict_table_t* innodb_table, /*!< in/out: InnoDB table */ + HA_CREATE_INFO* create_info) /*!< in: create info */ +{ + ibool ps_on; + ibool ps_off; + + if (dict_table_is_temporary(innodb_table) || srv_read_only_mode) { + /* Temp tables do not use persistent stats. */ + ps_on = FALSE; + ps_off = TRUE; + } else { + ps_on = create_info->table_options + & HA_OPTION_STATS_PERSISTENT; + ps_off = create_info->table_options + & HA_OPTION_NO_STATS_PERSISTENT; + } + + dict_stats_set_persistent(innodb_table, ps_on, ps_off); + + dict_stats_auto_recalc_set( + innodb_table, + create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON, + create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF); + + innodb_table->stats_sample_pages = create_info->stats_sample_pages; +} + +/*********************************************************************//** +Copy table flags from MySQL's TABLE_SHARE into an InnoDB table object. +Those flags are stored in .frm file and end up in the MySQL table object, +but are frequently used inside InnoDB so we keep their copies into the +InnoDB table object. 
*/ +UNIV_INTERN +void +innobase_copy_frm_flags_from_table_share( +/*=====================================*/ + dict_table_t* innodb_table, /*!< in/out: InnoDB table */ + TABLE_SHARE* table_share) /*!< in: table share */ +{ + ibool ps_on; + ibool ps_off; + + if (dict_table_is_temporary(innodb_table) || srv_read_only_mode) { + /* Temp tables do not use persistent stats */ + ps_on = FALSE; + ps_off = TRUE; + } else { + ps_on = table_share->db_create_options + & HA_OPTION_STATS_PERSISTENT; + ps_off = table_share->db_create_options + & HA_OPTION_NO_STATS_PERSISTENT; + } + + dict_stats_set_persistent(innodb_table, ps_on, ps_off); + + dict_stats_auto_recalc_set( + innodb_table, + table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON, + table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF); + + innodb_table->stats_sample_pages = table_share->stats_sample_pages; +} + +/*********************************************************************//** Construct ha_innobase handler. */ UNIV_INTERN ha_innobase::ha_innobase( @@ -2090,14 +2180,15 @@ ha_innobase::ha_innobase( TABLE_SHARE* table_arg) :handler(hton, table_arg), int_table_flags(HA_REC_NOT_IN_SEQ | - HA_NULL_IN_KEY | HA_CAN_VIRTUAL_COLUMNS | + HA_NULL_IN_KEY | HA_CAN_INDEX_BLOBS | HA_CAN_SQL_HANDLER | HA_PRIMARY_KEY_REQUIRED_FOR_POSITION | HA_PRIMARY_KEY_IN_READ_INDEX | HA_BINLOG_ROW_CAPABLE | HA_CAN_GEOMETRY | HA_PARTIAL_COLUMN_READ | - HA_TABLE_SCAN_ON_INDEX | HA_CAN_FULLTEXT), + HA_TABLE_SCAN_ON_INDEX | HA_CAN_FULLTEXT | + HA_CAN_FULLTEXT_EXT | HA_CAN_EXPORT), start_of_scan(0), num_write_row(0) {} @@ -2122,6 +2213,9 @@ ha_innobase::update_thd( { trx_t* trx; + /* The table should have been opened in ha_innobase::open(). */ + DBUG_ASSERT(prebuilt->table->n_ref_count > 0); + trx = check_trx_exists(thd); if (prebuilt->trx != trx) { @@ -2209,7 +2303,9 @@ invalidation to the transaction commit. 2) To store or retrieve a value from the query cache of an InnoDB table TBL, any query must first ask InnoDB's permission. 
We must pass the thd as a parameter because InnoDB will look at the trx id, if any, associated with -that thd. +that thd. Also the full_name which is used as key to search for the table +object. The full_name is a string containing the normalized path to the +table in the canonical format. 3) Use of the query cache for InnoDB tables is now allowed also when AUTOCOMMIT==0 or we are inside BEGIN ... COMMIT. Thus transactions no longer @@ -2244,11 +2340,9 @@ innobase_query_caching_of_table_permitted( THD* thd, /*!< in: thd of the user who is trying to store a result to the query cache or retrieve it */ - char* full_name, /*!< in: concatenation of database name, - the null character NUL, and the table - name */ - uint full_name_len, /*!< in: length of the full name, i.e. - len(dbname) + len(tablename) + 1 */ + char* full_name, /*!< in: normalized path to the table */ + uint full_name_len, /*!< in: length of the normalized path + to the table */ ulonglong *unused) /*!< unused for this engine */ { ibool is_autocommit; @@ -2308,16 +2402,7 @@ innobase_query_caching_of_table_permitted( } /* Normalize the table name to InnoDB format */ - - memcpy(norm_name, full_name, full_name_len); - - norm_name[strlen(norm_name)] = '/'; /* InnoDB uses '/' as the - separator between db and - table */ - norm_name[full_name_len] = '\0'; -#ifdef __WIN__ - innobase_casedn_str(norm_name); -#endif + normalize_table_name(norm_name, full_name); innobase_register_trx(innodb_hton_ptr, thd, trx); @@ -2355,7 +2440,7 @@ innobase_invalidate_query_cache( /* Argument TRUE below means we are using transactions */ #ifdef HAVE_QUERY_CACHE - mysql_query_cache_invalidate4((THD*) trx->mysql_thd, + mysql_query_cache_invalidate4(trx->mysql_thd, full_name, (uint32) full_name_len, TRUE); @@ -2374,7 +2459,7 @@ innobase_convert_identifier( ulint buflen, /*!< in: length of buf, in bytes */ const char* id, /*!< in: identifier to convert */ ulint idlen, /*!< in: length of id, in bytes */ - void* thd, /*!< in: MySQL 
connection thread, or NULL */ + THD* thd, /*!< in: MySQL connection thread, or NULL */ ibool file_id)/*!< in: TRUE=id is a table or database name; FALSE=id is an UTF-8 string */ { @@ -2397,7 +2482,7 @@ innobase_convert_identifier( nz[idlen] = 0; s = nz2; - idlen = explain_filename((THD*) thd, nz, nz2, sizeof nz2, + idlen = explain_filename(thd, nz, nz2, sizeof nz2, EXPLAIN_PARTITIONS_AS_COMMENT); goto no_quote; } @@ -2406,7 +2491,7 @@ innobase_convert_identifier( if (UNIV_UNLIKELY(!thd)) { q = '"'; } else { - q = get_quote_char_for_identifier((THD*) thd, s, (int) idlen); + q = get_quote_char_for_identifier(thd, s, (int) idlen); } if (q == EOF) { @@ -2462,7 +2547,7 @@ innobase_convert_name( ulint buflen, /*!< in: length of buf, in bytes */ const char* id, /*!< in: identifier to convert */ ulint idlen, /*!< in: length of id, in bytes */ - void* thd, /*!< in: MySQL connection thread, or NULL */ + THD* thd, /*!< in: MySQL connection thread, or NULL */ ibool table_id)/*!< in: TRUE=id is a table or database name; FALSE=id is an index name */ { @@ -2504,14 +2589,13 @@ no_db_name: } return(s); - } /*****************************************************************//** A wrapper function of innobase_convert_name(), convert a table or index name to the MySQL system_charset_info (UTF-8) and quote it if needed. 
@return pointer to the end of buf */ -static inline +UNIV_INTERN void innobase_format_name( /*==================*/ @@ -2537,9 +2621,9 @@ UNIV_INTERN ibool trx_is_interrupted( /*===============*/ - trx_t* trx) /*!< in: transaction */ + const trx_t* trx) /*!< in: transaction */ { - return(trx && trx->mysql_thd && thd_kill_level((THD*) trx->mysql_thd)); + return(trx && trx->mysql_thd && thd_kill_level(trx->mysql_thd)); } /**********************************************************************//** @@ -2551,8 +2635,20 @@ trx_is_strict( /*==========*/ trx_t* trx) /*!< in: transaction */ { - return(trx && trx->mysql_thd - && THDVAR((THD*) trx->mysql_thd, strict_mode)); + return(trx && trx->mysql_thd && THDVAR(trx->mysql_thd, strict_mode)); +} + +/**********************************************************************//** +Determines if the current MySQL thread is running in strict mode. +If thd==NULL, THDVAR returns the global value of innodb-strict-mode. +@return TRUE if strict */ +UNIV_INLINE +ibool +thd_is_strict( +/*==========*/ + THD* thd) /*!< in: MySQL thread descriptor */ +{ + return(THDVAR(thd, strict_mode)); } /**************************************************************//** @@ -2568,6 +2664,7 @@ ha_innobase::reset_template(void) prebuilt->keep_other_fields_on_keyread = 0; prebuilt->read_just_key = 0; + prebuilt->in_fts_query = 0; /* Reset index condition pushdown state. 
*/ if (prebuilt->idx_cond) { prebuilt->idx_cond = NULL; @@ -2663,14 +2760,14 @@ innobase_init( innobase_hton->savepoint_rollback = innobase_rollback_to_savepoint; innobase_hton->savepoint_release = innobase_release_savepoint; innobase_hton->prepare_ordered= NULL; - innobase_hton->commit_ordered= innobase_commit_ordered; + innobase_hton->commit_ordered= innobase_commit_ordered; innobase_hton->commit = innobase_commit; innobase_hton->rollback = innobase_rollback; innobase_hton->prepare = innobase_xa_prepare; innobase_hton->recover = innobase_xa_recover; innobase_hton->commit_by_xid = innobase_commit_by_xid; innobase_hton->rollback_by_xid = innobase_rollback_by_xid; - innobase_hton->commit_checkpoint_request=innobase_checkpoint_request; + innobase_hton->commit_checkpoint_request=innobase_checkpoint_request; innobase_hton->create_cursor_read_view = innobase_create_cursor_view; innobase_hton->set_cursor_read_view = innobase_set_cursor_view; innobase_hton->close_cursor_read_view = innobase_close_cursor_view; @@ -2687,9 +2784,8 @@ innobase_init( innobase_hton->release_temporary_latches = innobase_release_temporary_latches; - - innobase_hton->alter_table_flags = innobase_alter_table_flags; - innobase_hton->kill_query = innobase_kill_query; + innobase_hton->kill_query = innobase_kill_query; + innobase_hton->data = &innodb_api_cb; ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR); @@ -2756,12 +2852,12 @@ innobase_init( srv_data_home = (innobase_data_home_dir ? innobase_data_home_dir : default_path); - /* Set default InnoDB data file size to 10 MB and let it be + /* Set default InnoDB data file size to 12 MB and let it be auto-extending. Thus users can use InnoDB in >= 4.0 without having to specify any startup options. 
*/ if (!innobase_data_file_path) { - innobase_data_file_path = (char*) "ibdata1:10M:autoextend"; + innobase_data_file_path = (char*) "ibdata1:12M:autoextend"; } /* Since InnoDB edits the argument in the next call, we make another @@ -2785,8 +2881,8 @@ mem_free_and_error: /* The default dir for log files is the datadir of MySQL */ - if (!innobase_log_group_home_dir) { - innobase_log_group_home_dir = default_path; + if (!srv_log_group_home_dir) { + srv_log_group_home_dir = default_path; } #ifdef UNIV_LOG_ARCHIVE @@ -2799,12 +2895,12 @@ mem_free_and_error: srv_arch_dir = innobase_log_arch_dir; #endif /* UNIG_LOG_ARCHIVE */ - ret = (bool) - srv_parse_log_group_home_dirs(innobase_log_group_home_dir); + srv_normalize_path_for_win(srv_log_group_home_dir); - if (ret == FALSE || innobase_mirrored_log_groups != 1) { - sql_print_error("syntax error in innodb_log_group_home_dir, or a " - "wrong number of mirrored log groups"); + if (strchr(srv_log_group_home_dir, ';') + || innobase_mirrored_log_groups != 1) { + sql_print_error("syntax error in innodb_log_group_home_dir, " + "or a wrong number of mirrored log groups"); goto mem_free_and_error; } @@ -2896,12 +2992,52 @@ innobase_change_buffering_inited_ok: innobase_change_buffering = (char*) innobase_change_buffering_values[ibuf_use]; + /* Check that interdependent parameters have sane values. */ + if (srv_max_buf_pool_modified_pct < srv_max_dirty_pages_pct_lwm) { + sql_print_warning("InnoDB: innodb_max_dirty_pages_pct_lwm" + " cannot be set higher than" + " innodb_max_dirty_pages_pct.\n" + "InnoDB: Setting" + " innodb_max_dirty_pages_pct_lwm to %lu\n", + srv_max_buf_pool_modified_pct); + + srv_max_dirty_pages_pct_lwm = srv_max_buf_pool_modified_pct; + } + + if (srv_max_io_capacity == SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT) { + + if (srv_io_capacity >= SRV_MAX_IO_CAPACITY_LIMIT / 2) { + /* Avoid overflow. */ + srv_max_io_capacity = SRV_MAX_IO_CAPACITY_LIMIT; + } else { + /* The user has not set the value. 
We should + set it based on innodb_io_capacity. */ + srv_max_io_capacity = + ut_max(2 * srv_io_capacity, 2000); + } + + } else if (srv_max_io_capacity < srv_io_capacity) { + sql_print_warning("InnoDB: innodb_io_capacity" + " cannot be set higher than" + " innodb_io_capacity_max.\n" + "InnoDB: Setting" + " innodb_io_capacity to %lu\n", + srv_max_io_capacity); + + srv_io_capacity = srv_max_io_capacity; + } + + if (!is_filename_allowed(srv_buf_dump_filename, + strlen(srv_buf_dump_filename), FALSE)) { + sql_print_error("InnoDB: innodb_buffer_pool_filename" + " cannot have colon (:) in the file name."); + goto mem_free_and_error; + } + /* --------------------------------------------------*/ srv_file_flush_method_str = innobase_file_flush_method; - srv_n_log_groups = (ulint) innobase_mirrored_log_groups; - srv_n_log_files = (ulint) innobase_log_files_in_group; srv_log_file_size = (ib_uint64_t) innobase_log_file_size; #ifdef UNIV_LOG_ARCHIVE @@ -2927,6 +3063,18 @@ innobase_change_buffering_inited_ok: srv_log_buffer_size = (ulint) innobase_log_buffer_size; + if (innobase_buffer_pool_instances == 0) { + innobase_buffer_pool_instances = 8; + +#if defined(__WIN__) && !defined(_WIN64) + if (innobase_buffer_pool_size > 1331 * 1024 * 1024) { + innobase_buffer_pool_instances + = ut_min(MAX_BUFFER_POOLS, + (long) (innobase_buffer_pool_size + / (128 * 1024 * 1024))); + } +#endif /* defined(__WIN__) && !defined(_WIN64) */ + } srv_buf_pool_size = (ulint) innobase_buffer_pool_size; srv_buf_pool_instances = (ulint) innobase_buffer_pool_instances; @@ -2959,9 +3107,10 @@ innobase_change_buffering_inited_ok: srv_n_read_io_threads = (ulint) innobase_read_io_threads; srv_n_write_io_threads = (ulint) innobase_write_io_threads; - srv_force_recovery = (ulint) innobase_force_recovery; - srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite; + + page_compression_level = (ulint) innobase_compression_level; + if (!innobase_use_checksums) { ut_print_timestamp(stderr); fprintf(stderr, @@ 
-2992,6 +3141,12 @@ innobase_change_buffering_inited_ok: "level instead, see " REFMAN "set-transaction.html.\n"); } + if (innobase_open_files < 10) { + innobase_open_files = 300; + if (srv_file_per_table && table_cache_size > 300) { + innobase_open_files = table_cache_size; + } + } srv_max_n_open_files = (ulint) innobase_open_files; srv_innodb_status = (ibool) innobase_create_status_file; @@ -3059,7 +3214,7 @@ innobase_change_buffering_inited_ok: /* Since we in this module access directly the fields of a trx struct, and due to different headers and flags it might happen that - mutex_t has a different size in this module and in InnoDB + ib_mutex_t has a different size in this module and in InnoDB modules, we check at run time that the size is the same in these compilation modules. */ @@ -3174,28 +3329,13 @@ innobase_flush_logs( DBUG_ENTER("innobase_flush_logs"); DBUG_ASSERT(hton == innodb_hton_ptr); - log_buffer_flush_to_disk(); + if (!srv_read_only_mode) { + log_buffer_flush_to_disk(); + } DBUG_RETURN(result); } -/****************************************************************//** -Return alter table flags supported in an InnoDB database. */ -static -uint -innobase_alter_table_flags( -/*=======================*/ - uint flags) -{ - return(HA_INPLACE_ADD_INDEX_NO_READ_WRITE - | HA_INPLACE_ADD_INDEX_NO_WRITE - | HA_INPLACE_DROP_INDEX_NO_READ_WRITE - | HA_INPLACE_ADD_UNIQUE_INDEX_NO_READ_WRITE - | HA_INPLACE_ADD_UNIQUE_INDEX_NO_WRITE - | HA_INPLACE_DROP_UNIQUE_INDEX_NO_READ_WRITE - | HA_INPLACE_ADD_PK_INDEX_NO_READ_WRITE); -} - /*****************************************************************//** Commits a transaction in an InnoDB database. */ static @@ -3410,9 +3550,6 @@ innobase_commit( innobase_commit_ordered_2(trx, thd); } - /* We were instructed to commit the whole transaction, or - this is an SQL statement end and autocommit is on */ - /* We did the first part already in innobase_commit_ordered(), Now finish by doing a write + flush of logs. 
*/ trx_commit_complete_for_mysql(trx); @@ -3462,7 +3599,7 @@ innobase_rollback( transaction FALSE - rollback the current statement only */ { - int error = 0; + dberr_t error; trx_t* trx; DBUG_ENTER("innobase_rollback"); @@ -3511,7 +3648,7 @@ innobase_rollback_trx( /*==================*/ trx_t* trx) /*!< in: transaction */ { - int error = 0; + dberr_t error = DB_SUCCESS; DBUG_ENTER("innobase_rollback_trx"); DBUG_PRINT("trans", ("aborting transaction")); @@ -3610,6 +3747,7 @@ innobase_checkpoint_request( Log code calls this whenever log has been written and/or flushed up to a new position. We use this to notify upper layer of a new commit checkpoint when necessary.*/ +extern "C" UNIV_INTERN void innobase_mysql_log_notify( /*===============*/ @@ -3692,7 +3830,7 @@ innobase_rollback_to_savepoint( void* savepoint) /*!< in: savepoint data */ { ib_int64_t mysql_binlog_cache_pos; - int error = 0; + dberr_t error; trx_t* trx; char name[64]; @@ -3713,7 +3851,7 @@ innobase_rollback_to_savepoint( longlong2str((ulint) savepoint, name, 36); - error = (int) trx_rollback_to_savepoint_for_mysql( + error = trx_rollback_to_savepoint_for_mysql( trx, name, &mysql_binlog_cache_pos); if (error == DB_SUCCESS && trx->fts_trx != NULL) { @@ -3737,7 +3875,7 @@ innobase_release_savepoint( savepoint should be released */ void* savepoint) /*!< in: savepoint data */ { - int error = 0; + dberr_t error; trx_t* trx; char name[64]; @@ -3750,7 +3888,7 @@ innobase_release_savepoint( longlong2str((ulint) savepoint, name, 36); - error = (int) trx_release_savepoint_for_mysql(trx, name); + error = trx_release_savepoint_for_mysql(trx, name); if (error == DB_SUCCESS && trx->fts_trx != NULL) { fts_savepoint_release(trx, name); @@ -3770,7 +3908,7 @@ innobase_savepoint( THD* thd, /*!< in: handle to the MySQL thread */ void* savepoint) /*!< in: savepoint data */ { - int error = 0; + dberr_t error; trx_t* trx; DBUG_ENTER("innobase_savepoint"); @@ -3797,7 +3935,7 @@ innobase_savepoint( char name[64]; 
longlong2str((ulint) savepoint,name,36); - error = (int) trx_savepoint_for_mysql(trx, name, (ib_int64_t)0); + error = trx_savepoint_for_mysql(trx, name, (ib_int64_t)0); if (error == DB_SUCCESS && trx->fts_trx != NULL) { fts_savepoint_take(trx, name); @@ -3831,7 +3969,7 @@ innobase_close_connection( "but transaction is active"); } - if (trx_is_started(trx) && global_system_variables.log_warnings) { + if (trx_is_started(trx) && log_warnings) { sql_print_warning( "MySQL is closing a connection that has an active " @@ -3848,6 +3986,27 @@ innobase_close_connection( } /*****************************************************************//** +Frees a possible InnoDB trx object associated with the current THD. +@return 0 or error number */ +UNIV_INTERN +int +innobase_close_thd( +/*===============*/ + THD* thd) /*!< in: handle to the MySQL thread of the user + whose resources should be free'd */ +{ + trx_t* trx = thd_to_trx(thd); + + if (!trx) { + return(0); + } + + return(innobase_close_connection(innodb_hton_ptr, thd)); +} + +UNIV_INTERN void lock_cancel_waiting_and_release(lock_t* lock); + +/*****************************************************************//** Cancel any pending lock request associated with the current THD. */ static void @@ -3862,10 +4021,17 @@ innobase_kill_query( DBUG_ASSERT(hton == innodb_hton_ptr); trx = thd_to_trx(thd); - /* Cancel a pending lock request. */ - if (trx) { - lock_trx_handle_wait(trx); - } + + if (trx) + { + /* Cancel a pending lock request. 
*/ + lock_mutex_enter(); + trx_mutex_enter(trx); + if (trx->lock.wait_lock) + lock_cancel_waiting_and_release(trx->lock.wait_lock); + trx_mutex_exit(trx); + lock_mutex_exit(); + } DBUG_VOID_RETURN; } @@ -3981,9 +4147,9 @@ ha_innobase::index_flags( uint, bool) const { - ulong extra_flag= 0; - if (key == table_share->primary_key) - extra_flag= HA_CLUSTERED_INDEX; + ulong extra_flag= 0; + if (table && key == table->s->primary_key) + extra_flag= HA_CLUSTERED_INDEX; return((table_share->key_info[key].algorithm == HA_KEY_ALG_FULLTEXT) ? 0 : (HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER @@ -4065,19 +4231,10 @@ ha_innobase::primary_key_is_clustered() return(true); } -/** Always normalize table name to lower case on Windows */ -#ifdef __WIN__ -#define normalize_table_name(norm_name, name) \ - normalize_table_name_low(norm_name, name, TRUE) -#else -#define normalize_table_name(norm_name, name) \ - normalize_table_name_low(norm_name, name, FALSE) -#endif /* __WIN__ */ - /*****************************************************************//** Normalizes a table name string. A normalized name consists of the -database name catenated to '/' and table name. An example: -test/mytable. On Windows normalization puts both the database name and the +database name catenated to '/' and table name. Example: test/mytable. +On Windows normalization puts both the database name and the table name always to lower case if "set_lower_case" is set to TRUE. 
*/ static void @@ -4090,9 +4247,11 @@ normalize_table_name_low( to lower case */ { char* name_ptr; + ulint name_len; char* db_ptr; ulint db_len; char* ptr; + ulint norm_len; /* Scan name from the end */ @@ -4104,6 +4263,7 @@ normalize_table_name_low( } name_ptr = ptr + 1; + name_len = strlen(name_ptr); /* skip any number of path separators */ while (ptr >= name && (*ptr == '\\' || *ptr == '/')) { @@ -4122,11 +4282,15 @@ normalize_table_name_low( db_ptr = ptr + 1; + norm_len = db_len + name_len + sizeof "/"; + ut_a(norm_len < FN_REFLEN - 1); + memcpy(norm_name, db_ptr, db_len); norm_name[db_len] = '/'; - memcpy(norm_name + db_len + 1, name_ptr, strlen(name_ptr) + 1); + /* Copy the name and null-byte. */ + memcpy(norm_name + db_len + 1, name_ptr, name_len + 1); if (set_lower_case) { innobase_casedn_str(norm_name); @@ -4141,7 +4305,7 @@ void test_normalize_table_name_low() /*===========================*/ { - char norm_name[128]; + char norm_name[FN_REFLEN]; const char* test_data[][2] = { /* input, expected result */ {"./mysqltest/t1", "mysqltest/t1"}, @@ -4197,12 +4361,84 @@ test_normalize_table_name_low() } } } + +/********************************************************************* +Test ut_format_name(). 
*/ +static +void +test_ut_format_name() +/*=================*/ +{ + char buf[NAME_LEN * 3]; + + struct { + const char* name; + ibool is_table; + ulint buf_size; + const char* expected; + } test_data[] = { + {"test/t1", TRUE, sizeof(buf), "\"test\".\"t1\""}, + {"test/t1", TRUE, 12, "\"test\".\"t1\""}, + {"test/t1", TRUE, 11, "\"test\".\"t1"}, + {"test/t1", TRUE, 10, "\"test\".\"t"}, + {"test/t1", TRUE, 9, "\"test\".\""}, + {"test/t1", TRUE, 8, "\"test\"."}, + {"test/t1", TRUE, 7, "\"test\""}, + {"test/t1", TRUE, 6, "\"test"}, + {"test/t1", TRUE, 5, "\"tes"}, + {"test/t1", TRUE, 4, "\"te"}, + {"test/t1", TRUE, 3, "\"t"}, + {"test/t1", TRUE, 2, "\""}, + {"test/t1", TRUE, 1, ""}, + {"test/t1", TRUE, 0, "BUF_NOT_CHANGED"}, + {"table", TRUE, sizeof(buf), "\"table\""}, + {"ta'le", TRUE, sizeof(buf), "\"ta'le\""}, + {"ta\"le", TRUE, sizeof(buf), "\"ta\"\"le\""}, + {"ta`le", TRUE, sizeof(buf), "\"ta`le\""}, + {"index", FALSE, sizeof(buf), "\"index\""}, + {"ind/ex", FALSE, sizeof(buf), "\"ind/ex\""}, + }; + + for (size_t i = 0; i < UT_ARR_SIZE(test_data); i++) { + + memcpy(buf, "BUF_NOT_CHANGED", strlen("BUF_NOT_CHANGED") + 1); + + char* ret; + + ret = ut_format_name(test_data[i].name, + test_data[i].is_table, + buf, + test_data[i].buf_size); + + ut_a(ret == buf); + + if (strcmp(buf, test_data[i].expected) == 0) { + fprintf(stderr, + "ut_format_name(%s, %s, buf, %lu), " + "expected %s, OK\n", + test_data[i].name, + test_data[i].is_table ? "TRUE" : "FALSE", + test_data[i].buf_size, + test_data[i].expected); + } else { + fprintf(stderr, + "ut_format_name(%s, %s, buf, %lu), " + "expected %s, ERROR: got %s\n", + test_data[i].name, + test_data[i].is_table ? "TRUE" : "FALSE", + test_data[i].buf_size, + test_data[i].expected, + buf); + ut_error; + } + } +} #endif /* !DBUG_OFF */ /********************************************************************//** Get the upper limit of the MySQL integral and floating-point type. 
@return maximum allowed value for the field */ -static +UNIV_INTERN ulonglong innobase_get_int_col_max_value( /*===========================*/ @@ -4282,12 +4518,13 @@ innobase_match_index_columns( DBUG_ENTER("innobase_match_index_columns"); /* Check whether user defined index column count matches */ - if (key_info->key_parts != index_info->n_user_defined_cols) { + if (key_info->user_defined_key_parts != + index_info->n_user_defined_cols) { DBUG_RETURN(FALSE); } key_part = key_info->key_part; - key_end = key_part + key_info->key_parts; + key_end = key_part + key_info->user_defined_key_parts; innodb_idx_fld = index_info->fields; innodb_idx_fld_end = index_info->fields + index_info->n_fields; @@ -4546,6 +4783,7 @@ ha_innobase::innobase_initialize_autoinc() auto_inc = innobase_next_autoinc( read_auto_inc, 1, 1, 0, col_max_value); + break; } case DB_RECORD_NOT_FOUND: @@ -4595,12 +4833,12 @@ ha_innobase::open( uint test_if_locked) /*!< in: not used */ { dict_table_t* ib_table; - char norm_name[1000]; + char norm_name[FN_REFLEN]; THD* thd; ulint retries = 0; char* is_part = NULL; ibool par_case_name_set = FALSE; - char par_case_name[MAX_FULL_NAME_LEN + 1]; + char par_case_name[FN_REFLEN]; DBUG_ENTER("ha_innobase::open"); @@ -4642,7 +4880,31 @@ ha_innobase::open( retry: /* Get pointer to a table object in InnoDB dictionary cache */ - ib_table = dict_table_open_on_name(norm_name, FALSE); + ib_table = dict_table_open_on_name(norm_name, FALSE, TRUE, + DICT_ERR_IGNORE_NONE); + + if (ib_table + && ((!DICT_TF2_FLAG_IS_SET(ib_table, DICT_TF2_FTS_HAS_DOC_ID) + && table->s->fields != dict_table_get_n_user_cols(ib_table)) + || (DICT_TF2_FLAG_IS_SET(ib_table, DICT_TF2_FTS_HAS_DOC_ID) + && (table->s->fields + != dict_table_get_n_user_cols(ib_table) - 1)))) { + ib_logf(IB_LOG_LEVEL_WARN, + "table %s contains %lu user defined columns " + "in InnoDB, but %lu columns in MySQL. 
Please " + "check INFORMATION_SCHEMA.INNODB_SYS_COLUMNS and " + REFMAN "innodb-troubleshooting.html " + "for how to resolve it", + norm_name, (ulong) dict_table_get_n_user_cols(ib_table), + (ulong) table->s->fields); + + /* Mark this table as corrupted, so the drop table + or force recovery can still use it, but not others. */ + ib_table->corrupted = true; + dict_table_close(ib_table, FALSE, FALSE); + ib_table = NULL; + is_part = NULL; + } if (NULL == ib_table) { if (is_part && retries < 10) { @@ -4656,13 +4918,13 @@ retry: 1) If boot against an installation from Windows platform, then its partition table name could - be all be in lower case in system tables. So we - will need to check lower case name when load table. + be in lower case in system tables. So we will + need to check lower case name when load table. - 2) If we boot an installation from other case + 2) If we boot an installation from other case sensitive platform in Windows, we might need to - check the existence of table name without lowering - case them in the system table. */ + check the existence of table name without lower + case in the system table. */ if (innobase_get_lower_case_table_names() == 1) { if (!par_case_name_set) { @@ -4670,9 +4932,7 @@ retry: /* Check for the table using lower case name, including the partition separator "P" */ - memcpy(par_case_name, norm_name, - strlen(norm_name)); - par_case_name[strlen(norm_name)] = 0; + strcpy(par_case_name, norm_name); innobase_casedn_str(par_case_name); #else /* On Windows platfrom, check @@ -4686,7 +4946,8 @@ retry: } ib_table = dict_table_open_on_name( - par_case_name, FALSE); + par_case_name, FALSE, TRUE, + DICT_ERR_IGNORE_NONE); } if (!ib_table) { @@ -4724,21 +4985,13 @@ retry: retries); } - sql_print_error("Cannot find or open table %s from\n" - "the internal data dictionary of InnoDB " - "though the .frm file for the\n" - "table exists. 
Maybe you have deleted and " - "recreated InnoDB data\n" - "files but have forgotten to delete the " - "corresponding .frm files\n" - "of InnoDB tables, or you have moved .frm " - "files to another database?\n" - "or, the table contains indexes that this " - "version of the engine\n" - "doesn't support.\n" - "See " REFMAN "innodb-troubleshooting.html\n" - "how you can resolve the problem.\n", - norm_name); + ib_logf(IB_LOG_LEVEL_WARN, + "Cannot open table %s from the internal data " + "dictionary of InnoDB though the .frm file " + "for the table exists. See " + REFMAN "innodb-troubleshooting.html for how " + "you can resolve the problem.", norm_name); + free_share(share); my_errno = ENOENT; @@ -4747,21 +5000,47 @@ retry: table_opened: + innobase_copy_frm_flags_from_table_share(ib_table, table->s); + + dict_stats_init(ib_table); + MONITOR_INC(MONITOR_TABLE_OPEN); - if (ib_table->ibd_file_missing && !thd_tablespace_op(thd)) { - sql_print_error("MySQL is trying to open a table handle but " - "the .ibd file for\ntable %s does not exist.\n" - "Have you deleted the .ibd file from the " - "database directory under\nthe MySQL datadir, " - "or have you used DISCARD TABLESPACE?\n" - "See " REFMAN "innodb-troubleshooting.html\n" - "how you can resolve the problem.\n", - norm_name); + bool no_tablespace; + + if (dict_table_is_discarded(ib_table)) { + + ib_senderrf(thd, + IB_LOG_LEVEL_WARN, ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + /* Allow an open because a proper DISCARD should have set + all the flags and index root page numbers to FIL_NULL that + should prevent any DML from running but it should allow DDL + operations. */ + + no_tablespace = false; + + } else if (ib_table->ibd_file_missing) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, + ER_TABLESPACE_MISSING, norm_name); + + /* This means we have no idea what happened to the tablespace + file, best to play it safe. 
*/ + + no_tablespace = true; + } else { + no_tablespace = false; + } + + if (!thd_tablespace_op(thd) && no_tablespace) { free_share(share); my_errno = ENOENT; - dict_table_close(ib_table, FALSE); + dict_table_close(ib_table, FALSE, FALSE); + DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); } @@ -4909,7 +5188,9 @@ table_opened: } /* Only if the table has an AUTOINC column. */ - if (prebuilt->table != NULL && table->found_next_number_field != NULL) { + if (prebuilt->table != NULL + && !prebuilt->table->ibd_file_missing + && table->found_next_number_field != NULL) { dict_table_autoinc_lock(prebuilt->table); /* Since a table can already be "open" in InnoDB's internal @@ -4930,6 +5211,31 @@ table_opened: } UNIV_INTERN +handler* +ha_innobase::clone( +/*===============*/ + const char* name, /*!< in: table name */ + MEM_ROOT* mem_root) /*!< in: memory context */ +{ + ha_innobase* new_handler; + + DBUG_ENTER("ha_innobase::clone"); + + new_handler = static_cast<ha_innobase*>(handler::clone(name, + mem_root)); + if (new_handler) { + DBUG_ASSERT(new_handler->prebuilt != NULL); + DBUG_ASSERT(new_handler->user_thd == user_thd); + DBUG_ASSERT(new_handler->prebuilt->trx == prebuilt->trx); + + new_handler->prebuilt->select_lock_type + = prebuilt->select_lock_type; + } + + DBUG_RETURN(new_handler); +} + +UNIV_INTERN uint ha_innobase::max_supported_key_part_length() const /*==============================================*/ @@ -4994,36 +5300,6 @@ get_field_offset( return((uint) (field->ptr - table->record[0])); } -/**************************************************************//** -Checks if a field in a record is SQL NULL. Uses the record format -information in table to track the null bit in record. 
-@return 1 if NULL, 0 otherwise */ -static inline -uint -field_in_record_is_null( -/*====================*/ - TABLE* table, /*!< in: MySQL table object */ - Field* field, /*!< in: MySQL field object */ - char* record) /*!< in: a row in MySQL format */ -{ - int null_offset; - - if (!field->null_ptr) { - - return(0); - } - - null_offset = (uint) ((char*) field->null_ptr - - (char*) table->record[0]); - - if (record[null_offset] & field->null_bit) { - - return(1); - } - - return(0); -} - /*************************************************************//** InnoDB uses this function to compare two data fields for which the data type is such that we must use MySQL code to compare them. NOTE that the prototype @@ -5483,6 +5759,7 @@ get_innobase_type_from_mysql_type( case HA_KEYTYPE_END: ut_error; } + return(0); } @@ -5512,7 +5789,7 @@ innobase_read_from_2_little_endian( /*===============================*/ const uchar* buf) /*!< in: from where to read */ { - return (uint) ((ulint)(buf[0]) + 256 * ((ulint)(buf[1]))); + return((uint) ((ulint)(buf[0]) + 256 * ((ulint)(buf[1])))); } /*******************************************************************//** @@ -5530,7 +5807,8 @@ ha_innobase::store_key_val_for_row( { KEY* key_info = table->key_info + keynr; KEY_PART_INFO* key_part = key_info->key_part; - KEY_PART_INFO* end = key_part + key_info->key_parts; + KEY_PART_INFO* end = + key_part + key_info->user_defined_key_parts; char* buff_start = buff; enum_field_types mysql_type; Field* field; @@ -5906,10 +6184,9 @@ build_template_field( templ->rec_field_no = dict_index_get_nth_col_pos(index, i); } - if (field->null_ptr) { + if (field->real_maybe_null()) { templ->mysql_null_byte_offset = - (ulint) ((char*) field->null_ptr - - (char*) table->record[0]); + field->null_offset(); templ->mysql_null_bit_mask = (ulint) field->null_bit; } else { @@ -6011,6 +6288,10 @@ ha_innobase::build_template( prebuilt->need_to_access_clustered = (index == clust_index); + /* Either prebuilt->index should 
be a secondary index, or it + should be the clustered index. */ + ut_ad(dict_index_is_clust(index) == (index == clust_index)); + /* Below we check column by column if we need to access the clustered index. */ @@ -6227,11 +6508,13 @@ min value of the autoinc interval. Once that is fixed we can get rid of the special lock handling. @return DB_SUCCESS if all OK else error code */ UNIV_INTERN -ulint +dberr_t ha_innobase::innobase_lock_autoinc(void) /*====================================*/ { - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; + + ut_ad(!srv_read_only_mode); switch (innobase_autoinc_lock_mode) { case AUTOINC_NO_LOCKING: @@ -6276,19 +6559,19 @@ ha_innobase::innobase_lock_autoinc(void) ut_error; } - return(ulong(error)); + return(error); } /********************************************************************//** Reset the autoinc value in the table. @return DB_SUCCESS if all went well else error code */ UNIV_INTERN -ulint +dberr_t ha_innobase::innobase_reset_autoinc( /*================================*/ ulonglong autoinc) /*!< in: value to store */ { - ulint error; + dberr_t error; error = innobase_lock_autoinc(); @@ -6299,7 +6582,7 @@ ha_innobase::innobase_reset_autoinc( dict_table_autoinc_unlock(prebuilt->table); } - return(ulong(error)); + return(error); } /********************************************************************//** @@ -6307,12 +6590,12 @@ Store the autoinc value in the table. The autoinc value is only set if it's greater than the existing autoinc value in the table. 
@return DB_SUCCESS if all went well else error code */ UNIV_INTERN -ulint +dberr_t ha_innobase::innobase_set_max_autoinc( /*==================================*/ ulonglong auto_inc) /*!< in: value to store */ { - ulint error; + dberr_t error; error = innobase_lock_autoinc(); @@ -6323,7 +6606,7 @@ ha_innobase::innobase_set_max_autoinc( dict_table_autoinc_unlock(prebuilt->table); } - return(ulong(error)); + return(error); } /********************************************************************//** @@ -6336,7 +6619,7 @@ ha_innobase::write_row( /*===================*/ uchar* record) /*!< in: a row in MySQL format */ { - ulint error = 0; + dberr_t error; int error_result= 0; ibool auto_inc_used= FALSE; ulint sql_command; @@ -6344,7 +6627,10 @@ ha_innobase::write_row( DBUG_ENTER("ha_innobase::write_row"); - if (prebuilt->trx != trx) { + if (srv_read_only_mode) { + ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } else if (prebuilt->trx != trx) { sql_print_error("The transaction object for the table handle " "is at %p, but for the current thread it is at " "%p", @@ -6362,6 +6648,8 @@ ha_innobase::write_row( ++trx->will_lock; } + ha_statistic_increment(&SSV::ha_write_count); + sql_command = thd_sql_command(user_thd); if ((sql_command == SQLCOM_ALTER_TABLE @@ -6441,7 +6729,7 @@ no_commit: innobase_get_auto_increment(). */ prebuilt->autoinc_error = DB_SUCCESS; - if ((error = update_auto_increment())) { + if ((error_result = update_auto_increment())) { /* We don't want to mask autoinc overflow errors. */ /* Handle the case where the AUTOINC sub-system @@ -6452,15 +6740,11 @@ no_commit: my_error(ER_AUTOINC_READ_FAILED, MYF(0)); goto func_exit; } else if (prebuilt->autoinc_error != DB_SUCCESS) { - error = (int) prebuilt->autoinc_error; + error = prebuilt->autoinc_error; goto report_error; } - /* MySQL errors are passed straight back. except for - HA_ERR_AUTO_INC_READ_FAILED. This can only happen - for values out of range. 
- */ - error_result = (int) error; + /* MySQL errors are passed straight back. */ goto func_exit; } @@ -6479,10 +6763,10 @@ no_commit: innobase_srv_conc_enter_innodb(prebuilt->trx); error = row_insert_for_mysql((byte*) record, prebuilt); + DEBUG_SYNC(user_thd, "ib_after_row_insert"); /* Handle duplicate key errors */ if (auto_inc_used) { - ulint err; ulonglong auto_inc; ulonglong col_max_value; @@ -6544,6 +6828,7 @@ set_max_autoinc: ulonglong offset; ulonglong increment; + dberr_t err; offset = prebuilt->autoinc_offset; increment = prebuilt->autoinc_increment; @@ -6562,13 +6847,22 @@ set_max_autoinc: } } break; + default: + break; } } innobase_srv_conc_exit_innodb(prebuilt->trx); report_error: - error_result = convert_error_code_to_mysql((int) error, + if (error == DB_TABLESPACE_DELETED) { + ib_senderrf( + trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + } + + error_result = convert_error_code_to_mysql(error, prebuilt->table->flags, user_thd); @@ -6585,9 +6879,9 @@ func_exit: /**********************************************************************//** Checks which fields have changed in a row and stores information of them to an update vector. 
-@return error number or 0 */ +@return DB_SUCCESS or error code */ static -int +dberr_t calc_row_difference( /*================*/ upd_t* uvect, /*!< in/out: update vector */ @@ -6617,12 +6911,13 @@ calc_row_difference( dfield_t dfield; dict_index_t* clust_index; uint i; - ulint error = DB_SUCCESS; ibool changes_fts_column = FALSE; ibool changes_fts_doc_col = FALSE; trx_t* trx = thd_to_trx(thd); doc_id_t doc_id = FTS_NULL_DOC_ID; + ut_ad(!srv_read_only_mode); + n_fields = table->s->fields; clust_index = dict_table_get_first_index(prebuilt->table); @@ -6694,14 +6989,12 @@ calc_row_difference( } - if (field->null_ptr) { - if (field_in_record_is_null(table, field, - (char*) old_row)) { + if (field->real_maybe_null()) { + if (field->is_null_in_record(old_row)) { o_len = UNIV_SQL_NULL; } - if (field_in_record_is_null(table, field, - (char*) new_row)) { + if (field->is_null_in_record(new_row)) { n_len = UNIV_SQL_NULL; } } @@ -6838,13 +7131,7 @@ calc_row_difference( fts_update_doc_id( innodb_table, ufield, &trx->fts_next_doc_id); - if (error == DB_SUCCESS) { - ++n_changed; - } else { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error (%lu) while updating " - "doc id in calc_row_difference().\n", error); - } + ++n_changed; } else { /* We have a Doc ID column, but none of FTS indexed columns are touched, nor the Doc ID column, so set @@ -6858,7 +7145,7 @@ calc_row_difference( ut_a(buf <= (byte*) original_upd_buff + buff_len); - return(error); + return(DB_SUCCESS); } /**********************************************************************//** @@ -6877,14 +7164,17 @@ ha_innobase::update_row( uchar* new_row) /*!< in: new row in MySQL format */ { upd_t* uvect; - int error = 0; + dberr_t error; trx_t* trx = thd_to_trx(user_thd); DBUG_ENTER("ha_innobase::update_row"); ut_a(prebuilt->trx == trx); - if (!trx_is_started(trx)) { + if (srv_read_only_mode) { + ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } else if 
(!trx_is_started(trx)) { ++trx->will_lock; } @@ -6905,6 +7195,8 @@ ha_innobase::update_row( } } + ha_statistic_increment(&SSV::ha_update_count); + if (prebuilt->upd_node) { uvect = prebuilt->upd_node->update; } else { @@ -6972,18 +7264,18 @@ ha_innobase::update_row( innobase_srv_conc_exit_innodb(trx); func_exit: - error = convert_error_code_to_mysql(error, + int err = convert_error_code_to_mysql(error, prebuilt->table->flags, user_thd); /* If success and no columns were updated. */ - if (error == 0 && uvect->n_fields == 0) { + if (err == 0 && uvect->n_fields == 0) { /* This is the same as success, but instructs MySQL that the row is not really updated and it should not increase the count of updated rows. This is fix for http://bugs.mysql.com/29157 */ - error = HA_ERR_RECORD_IS_THE_SAME; - } else if (error == HA_FTS_INVALID_DOCID) { + err = HA_ERR_RECORD_IS_THE_SAME; + } else if (err == HA_FTS_INVALID_DOCID) { my_error(HA_FTS_INVALID_DOCID, MYF(0)); } @@ -6992,7 +7284,7 @@ func_exit: innobase_active_small(); - DBUG_RETURN(error); + DBUG_RETURN(err); } /**********************************************************************//** @@ -7004,17 +7296,22 @@ ha_innobase::delete_row( /*====================*/ const uchar* record) /*!< in: a row in MySQL format */ { - int error = 0; + dberr_t error; trx_t* trx = thd_to_trx(user_thd); DBUG_ENTER("ha_innobase::delete_row"); ut_a(prebuilt->trx == trx); - if (!trx_is_started(trx)) { + if (srv_read_only_mode) { + ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } else if (!trx_is_started(trx)) { ++trx->will_lock; } + ha_statistic_increment(&SSV::ha_delete_count); + if (!prebuilt->upd_node) { row_get_prebuilt_update_vector(prebuilt); } @@ -7029,15 +7326,13 @@ ha_innobase::delete_row( innobase_srv_conc_exit_innodb(trx); - error = convert_error_code_to_mysql( - error, prebuilt->table->flags, user_thd); - /* Tell the InnoDB server that there might be work for utility threads: */ 
innobase_active_small(); - DBUG_RETURN(error); + DBUG_RETURN(convert_error_code_to_mysql( + error, prebuilt->table->flags, user_thd)); } /**********************************************************************//** @@ -7270,21 +7565,19 @@ ha_innobase::index_read( dict_index_t* index; ulint match_mode = 0; int error; - ulint ret; + dberr_t ret; DBUG_ENTER("index_read"); DEBUG_SYNC_C("ha_innobase_index_read_begin"); ut_a(prebuilt->trx == thd_to_trx(user_thd)); + ut_ad(key_len != 0 || find_flag != HA_READ_KEY_EXACT); + + ha_statistic_increment(&SSV::ha_read_key_count); index = prebuilt->index; if (UNIV_UNLIKELY(index == NULL) || dict_index_is_corrupted(index)) { - DBUG_PRINT("error", ("index: %p index_corrupt: %d data_corrupt: %d", - index, - index ? test(index->type & DICT_CORRUPT) : 0, - (index && index->table ? - test(index->table->corrupted) : 0))); prebuilt->index_usable = FALSE; DBUG_RETURN(HA_ERR_CRASHED); } @@ -7357,6 +7650,7 @@ ha_innobase::index_read( case DB_SUCCESS: error = 0; table->status = 0; + srv_stats.n_rows_read.add((size_t) prebuilt->trx->id, 1); break; case DB_RECORD_NOT_FOUND: error = HA_ERR_KEY_NOT_FOUND; @@ -7366,10 +7660,30 @@ ha_innobase::index_read( error = HA_ERR_KEY_NOT_FOUND; table->status = STATUS_NOT_FOUND; break; + case DB_TABLESPACE_DELETED: + + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_NO_SUCH_TABLE; + break; + case DB_TABLESPACE_NOT_FOUND: + + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_MISSING, MYF(0), + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_NO_SUCH_TABLE; + break; default: - error = convert_error_code_to_mysql((int) ret, - prebuilt->table->flags, - user_thd); + error = convert_error_code_to_mysql( + ret, prebuilt->table->flags, user_thd); + table->status = STATUS_NOT_FOUND; break; } @@ -7571,8 +7885,8 @@ 
ha_innobase::general_fetch( uint match_mode) /*!< in: 0, ROW_SEL_EXACT, or ROW_SEL_EXACT_PREFIX */ { - ulint ret; - int error = 0; + dberr_t ret; + int error; DBUG_ENTER("general_fetch"); @@ -7589,6 +7903,7 @@ ha_innobase::general_fetch( case DB_SUCCESS: error = 0; table->status = 0; + srv_stats.n_rows_read.add((size_t) prebuilt->trx->id, 1); break; case DB_RECORD_NOT_FOUND: error = HA_ERR_END_OF_FILE; @@ -7598,9 +7913,30 @@ ha_innobase::general_fetch( error = HA_ERR_END_OF_FILE; table->status = STATUS_NOT_FOUND; break; + case DB_TABLESPACE_DELETED: + + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_NO_SUCH_TABLE; + break; + case DB_TABLESPACE_NOT_FOUND: + + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_MISSING, + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_NO_SUCH_TABLE; + break; default: error = convert_error_code_to_mysql( - (int) ret, prebuilt->table->flags, user_thd); + ret, prebuilt->table->flags, user_thd); + table->status = STATUS_NOT_FOUND; break; } @@ -7619,6 +7955,8 @@ ha_innobase::index_next( uchar* buf) /*!< in/out: buffer for next row in MySQL format */ { + ha_statistic_increment(&SSV::ha_read_next_count); + return(general_fetch(buf, ROW_SEL_NEXT, 0)); } @@ -7633,6 +7971,8 @@ ha_innobase::index_next_same( const uchar* key, /*!< in: key value */ uint keylen) /*!< in: key value length */ { + ha_statistic_increment(&SSV::ha_read_next_count); + return(general_fetch(buf, ROW_SEL_NEXT, last_match_mode)); } @@ -7646,6 +7986,8 @@ ha_innobase::index_prev( /*====================*/ uchar* buf) /*!< in/out: buffer for previous row in MySQL format */ { + ha_statistic_increment(&SSV::ha_read_prev_count); + return(general_fetch(buf, ROW_SEL_PREV, 0)); } @@ -7662,6 +8004,7 @@ ha_innobase::index_first( int error; DBUG_ENTER("index_first"); + 
ha_statistic_increment(&SSV::ha_read_first_count); error = index_read(buf, NULL, 0, HA_READ_AFTER_KEY); @@ -7687,6 +8030,7 @@ ha_innobase::index_last( int error; DBUG_ENTER("index_last"); + ha_statistic_increment(&SSV::ha_read_last_count); error = index_read(buf, NULL, 0, HA_READ_BEFORE_KEY); @@ -7756,6 +8100,7 @@ ha_innobase::rnd_next( int error; DBUG_ENTER("rnd_next"); + ha_statistic_increment(&SSV::ha_read_rnd_next_count); if (start_of_scan) { error = index_first(buf); @@ -7789,6 +8134,8 @@ ha_innobase::rnd_pos( DBUG_ENTER("rnd_pos"); DBUG_DUMP("key", pos, ref_length); + ha_statistic_increment(&SSV::ha_read_rnd_count); + ut_a(prebuilt->trx == thd_to_trx(ha_thd())); /* Note that we assume the length of the row reference is fixed @@ -7813,8 +8160,6 @@ ha_innobase::ft_init() { DBUG_ENTER("ft_init"); - fprintf(stderr, "ft_init()\n"); - trx_t* trx = check_trx_exists(ha_thd()); /* FTS queries are not treated as autocommit non-locking selects. @@ -7853,15 +8198,15 @@ ha_innobase::ft_init_ext( ulint buf_tmp_used; uint num_errors; - fprintf(stderr, "ft_init_ext()\n"); - - fprintf(stderr, "keynr=%u, '%.*s'\n", - keynr, (int) key->length(), (byte*) key->ptr()); + if (fts_enable_diag_print) { + fprintf(stderr, "keynr=%u, '%.*s'\n", + keynr, (int) key->length(), (byte*) key->ptr()); - if (flags & FT_BOOL) { - fprintf(stderr, "BOOL search\n"); - } else { - fprintf(stderr, "NL search\n"); + if (flags & FT_BOOL) { + fprintf(stderr, "BOOL search\n"); + } else { + fprintf(stderr, "NL search\n"); + } } /* FIXME: utf32 and utf16 are not compatible with some @@ -7908,7 +8253,7 @@ ha_innobase::ft_init_ext( if (!index || index->type != DICT_FTS) { my_error(ER_TABLE_HAS_NO_FT, MYF(0)); - return NULL; + return(NULL); } if (!(table->fts->fts_status & ADDED_TABLE_SYNCED)) { @@ -7919,25 +8264,69 @@ ha_innobase::ft_init_ext( error = fts_query(trx, index, flags, query, query_len, &result); - prebuilt->result = result; - // FIXME: Proper error handling and diagnostic if (error != DB_SUCCESS) 
{ fprintf(stderr, "Error processing query\n"); } else { - /* Must return an instance of a result even if it's empty */ - ut_a(prebuilt->result); - /* Allocate FTS handler, and instantiate it before return */ fts_hdl = (NEW_FT_INFO*) my_malloc(sizeof(NEW_FT_INFO), MYF(0)); fts_hdl->please = (struct _ft_vft*)(&ft_vft_result); + fts_hdl->could_you = (struct _ft_vft_ext*)(&ft_vft_ext_result); fts_hdl->ft_prebuilt = prebuilt; fts_hdl->ft_result = result; + + /* FIXME: Re-evluate the condition when Bug 14469540 + is resolved */ + prebuilt->in_fts_query = true; } - return ((FT_INFO*) fts_hdl); + return((FT_INFO*) fts_hdl); +} + +/*****************************************************************//** +Set up search tuple for a query through FTS_DOC_ID_INDEX on +supplied Doc ID. This is used by MySQL to retrieve the documents +once the search result (Doc IDs) is available */ +static +void +innobase_fts_create_doc_id_key( +/*===========================*/ + dtuple_t* tuple, /* in/out: prebuilt->search_tuple */ + const dict_index_t* + index, /* in: index (FTS_DOC_ID_INDEX) */ + doc_id_t* doc_id) /* in/out: doc id to search, value + could be changed to storage format + used for search. 
*/ +{ + doc_id_t temp_doc_id; + dfield_t* dfield = dtuple_get_nth_field(tuple, 0); + + ut_a(dict_index_get_n_unique(index) == 1); + + dtuple_set_n_fields(tuple, index->n_fields); + dict_index_copy_types(tuple, index, index->n_fields); + +#ifdef UNIV_DEBUG + /* The unique Doc ID field should be an eight-bytes integer */ + dict_field_t* field = dict_index_get_nth_field(index, 0); + ut_a(field->col->mtype == DATA_INT); + ut_ad(sizeof(*doc_id) == field->fixed_len); + ut_ad(innobase_strcasecmp(index->name, FTS_DOC_ID_INDEX_NAME) == 0); +#endif /* UNIV_DEBUG */ + + /* Convert to storage byte order */ + mach_write_to_8(reinterpret_cast<byte*>(&temp_doc_id), *doc_id); + *doc_id = temp_doc_id; + dfield_set_data(dfield, doc_id, sizeof(*doc_id)); + + dtuple_set_n_fields_cmp(tuple, 1); + + for (ulint i = 1; i < index->n_fields; i++) { + dfield = dtuple_get_nth_field(tuple, i); + dfield_set_null(dfield); + } } /**********************************************************************//** @@ -7984,6 +8373,14 @@ next_record: if (result->current != NULL) { dict_index_t* index; dtuple_t* tuple = prebuilt->search_tuple; + doc_id_t search_doc_id; + + /* If we only need information from result we can return + without fetching the table row */ + if (ft_prebuilt->read_just_key) { + table->status= 0; + return(0); + } index = dict_table_get_index_on_name( prebuilt->table, FTS_DOC_ID_INDEX_NAME); @@ -7997,48 +8394,74 @@ next_record: fts_ranking_t* ranking = rbt_value( fts_ranking_t, result->current); - /* We pass a pointer to the doc_id because we need to - convert it to storage byte order. */ - row_create_key(tuple, index, &ranking->doc_id); + search_doc_id = ranking->doc_id; + + /* We pass a pointer of search_doc_id because it will be + converted to storage byte order used in the search + tuple. 
*/ + innobase_fts_create_doc_id_key(tuple, index, &search_doc_id); innobase_srv_conc_enter_innodb(prebuilt->trx); - ulint ret = row_search_for_mysql( + dberr_t ret = row_search_for_mysql( (byte*) buf, PAGE_CUR_GE, prebuilt, ROW_SEL_EXACT, 0); innobase_srv_conc_exit_innodb(prebuilt->trx); - - if (ret == DB_SUCCESS) { + switch (ret) { + case DB_SUCCESS: error = 0; table->status = 0; - - } else if (ret == DB_RECORD_NOT_FOUND) { - + break; + case DB_RECORD_NOT_FOUND: result->current = const_cast<ib_rbt_node_t*>( rbt_next(result->rankings_by_rank, result->current)); if (!result->current) { - error = HA_ERR_KEY_NOT_FOUND; + /* exhaust the result set, should return + HA_ERR_END_OF_FILE just like + ha_innobase::general_fetch() and/or + ha_innobase::index_first() etc. */ + error = HA_ERR_END_OF_FILE; table->status = STATUS_NOT_FOUND; } else { goto next_record; } + break; + case DB_END_OF_INDEX: + error = HA_ERR_END_OF_FILE; + table->status = STATUS_NOT_FOUND; + break; + case DB_TABLESPACE_DELETED: - } else if (ret == DB_END_OF_INDEX) { + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); - error = HA_ERR_KEY_NOT_FOUND; table->status = STATUS_NOT_FOUND; - } else { + error = HA_ERR_NO_SUCH_TABLE; + break; + case DB_TABLESPACE_NOT_FOUND: + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_MISSING, + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_NO_SUCH_TABLE; + break; + default: error = convert_error_code_to_mysql( - (int) ret, 0, user_thd); + ret, 0, user_thd); table->status = STATUS_NOT_FOUND; + break; } - return (error); + return(error); } return(HA_ERR_END_OF_FILE); @@ -8052,11 +8475,6 @@ ha_innobase::ft_end() { fprintf(stderr, "ft_end()\n"); - if (prebuilt->result != NULL) { - fts_query_free_result(prebuilt->result); - prebuilt->result = NULL; - } - rnd_end(); } @@ -8110,23 +8528,21 @@ See http://bugs.mysql.com/32710 for expl. 
why we choose PROCESS. */ /*****************************************************************//** Check whether there exist a column named as "FTS_DOC_ID", which is reserved for InnoDB FTS Doc ID -@return TRUE if there exist a "FTS_DOC_ID" column */ +@return true if there exist a "FTS_DOC_ID" column */ static -ibool +bool create_table_check_doc_id_col( /*==========================*/ trx_t* trx, /*!< in: InnoDB transaction handle */ - TABLE* form, /*!< in: information on table + const TABLE* form, /*!< in: information on table columns and indexes */ ulint* doc_id_col) /*!< out: Doc ID column number if - there exist a FTS_DOC_ID column, ULINT_UNDEFINED if column is of the + there exist a FTS_DOC_ID column, + ULINT_UNDEFINED if column is of the wrong type/name/size */ { - ibool find_doc_id = FALSE; - ulint i; - - for (i = 0; i < form->s->fields; i++) { - Field* field; + for (ulint i = 0; i < form->s->fields; i++) { + const Field* field; ulint col_type; ulint col_len; ulint unsigned_type; @@ -8141,21 +8557,19 @@ create_table_check_doc_id_col( if (innobase_strcasecmp(field->field_name, FTS_DOC_ID_COL_NAME) == 0) { - find_doc_id = TRUE; - /* Note the name is case sensitive due to our internal query parser */ if (col_type == DATA_INT - && !field->null_ptr + && !field->real_maybe_null() && col_len == sizeof(doc_id_t) && (strcmp(field->field_name, FTS_DOC_ID_COL_NAME) == 0)) { *doc_id_col = i; } else { push_warning_printf( - (THD*) trx->mysql_thd, + trx->mysql_thd, Sql_condition::WARN_LEVEL_WARN, - HA_WRONG_CREATE_OPTION, + ER_ILLEGAL_HA_CREATE_OPTION, "InnoDB: FTS_DOC_ID column must be " "of BIGINT NOT NULL type, and named " "in all capitalized characters"); @@ -8164,38 +8578,39 @@ create_table_check_doc_id_col( *doc_id_col = ULINT_UNDEFINED; } - break; + return(true); } } - return(find_doc_id); + return(false); } /*****************************************************************//** Creates a table definition to an InnoDB database. 
*/ -static +static __attribute__((nonnull, warn_unused_result)) int create_table_def( /*=============*/ trx_t* trx, /*!< in: InnoDB transaction handle */ - TABLE* form, /*!< in: information on table + const TABLE* form, /*!< in: information on table columns and indexes */ const char* table_name, /*!< in: table name */ - const char* path_of_temp_table,/*!< in: if this is a table explicitly + const char* temp_path, /*!< in: if this is a table explicitly created by the user with the TEMPORARY keyword, then this parameter is the dir path where the table should be placed if we create an .ibd file for it (no .ibd extension - in the path, though); otherwise this - is NULL */ + in the path, though). Otherwise this + is a zero length-string */ + const char* remote_path, /*!< in: Remote path or zero length-string */ ulint flags, /*!< in: table flags */ ulint flags2) /*!< in: table flags2 */ { - Field* field; + THD* thd = trx->mysql_thd; dict_table_t* table; ulint n_cols; - int error; + dberr_t err; ulint col_type; ulint col_len; ulint nulls_allowed; @@ -8206,17 +8621,18 @@ create_table_def( ulint i; ulint doc_id_col = 0; ibool has_doc_id_col = FALSE; + mem_heap_t* heap; DBUG_ENTER("create_table_def"); DBUG_PRINT("enter", ("table_name: %s", table_name)); - ut_a(trx->mysql_thd != NULL); + DBUG_ASSERT(thd != NULL); /* MySQL does the name length check. 
But we do additional check on the name length here */ if (strlen(table_name) > MAX_FULL_NAME_LEN) { push_warning_printf( - (THD*) trx->mysql_thd, Sql_condition::WARN_LEVEL_WARN, + thd, Sql_condition::WARN_LEVEL_WARN, ER_TABLE_NAME, "InnoDB: Table Name or Database Name is too long"); @@ -8228,7 +8644,7 @@ create_table_def( if (strcmp(strchr(table_name, '/') + 1, "innodb_table_monitor") == 0) { push_warning( - (THD*) trx->mysql_thd, Sql_condition::WARN_LEVEL_WARN, + thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_WRONG_COMMAND, DEPRECATED_MSG_INNODB_TABLE_MONITOR); } @@ -8242,7 +8658,7 @@ create_table_def( if (doc_id_col == ULINT_UNDEFINED) { trx_commit_for_mysql(trx); - error = DB_ERROR; + err = DB_ERROR; goto error_ret; } else { has_doc_id_col = TRUE; @@ -8270,42 +8686,41 @@ create_table_def( flags, flags2); } - if (path_of_temp_table) { + if (flags2 & DICT_TF2_TEMPORARY) { + ut_a(strlen(temp_path)); table->dir_path_of_temp_table = - mem_heap_strdup(table->heap, path_of_temp_table); + mem_heap_strdup(table->heap, temp_path); + } + + if (DICT_TF_HAS_DATA_DIR(flags)) { + ut_a(strlen(remote_path)); + table->data_dir_path = mem_heap_strdup(table->heap, remote_path); + } else { + table->data_dir_path = NULL; } + heap = mem_heap_create(1000); for (i = 0; i < n_cols; i++) { - field = form->field[i]; + Field* field = form->field[i]; col_type = get_innobase_type_from_mysql_type(&unsigned_type, field); if (!col_type) { push_warning_printf( - (THD*) trx->mysql_thd, - Sql_condition::WARN_LEVEL_WARN, + thd, Sql_condition::WARN_LEVEL_WARN, ER_CANT_CREATE_TABLE, "Error creating table '%s' with " "column '%s'. 
Please check its " "column type and try to re-create " "the table with an appropriate " "column type.", - table->name, (char*) field->field_name); + table->name, field->field_name); goto err_col; } - if (field->null_ptr) { - nulls_allowed = 0; - } else { - nulls_allowed = DATA_NOT_NULL; - } - - if (field->binary()) { - binary_type = DATA_BINARY_TYPE; - } else { - binary_type = 0; - } + nulls_allowed = field->real_maybe_null() ? 0 : DATA_NOT_NULL; + binary_type = field->binary() ? DATA_BINARY_TYPE : 0; charset_no = 0; @@ -8317,13 +8732,13 @@ create_table_def( /* in data0type.h we assume that the number fits in one byte in prtype */ push_warning_printf( - (THD*) trx->mysql_thd, - Sql_condition::WARN_LEVEL_WARN, + thd, Sql_condition::WARN_LEVEL_WARN, ER_CANT_CREATE_TABLE, "In InnoDB, charset-collation codes" " must be below 256." " Unsupported code %lu.", (ulong) charset_no); + mem_heap_free(heap); DBUG_RETURN(ER_CANT_CREATE_TABLE); } } @@ -8355,14 +8770,15 @@ create_table_def( field->field_name); err_col: dict_mem_table_free(table); + mem_heap_free(heap); trx_commit_for_mysql(trx); - error = DB_ERROR; + err = DB_ERROR; goto error_ret; } - dict_mem_table_add_col(table, table->heap, - (char*) field->field_name, + dict_mem_table_add_col(table, heap, + field->field_name, col_type, dtype_form_prtype( (ulint) field->type() @@ -8374,25 +8790,33 @@ err_col: /* Add the FTS doc_id hidden column. 
*/ if (flags2 & DICT_TF2_FTS && !has_doc_id_col) { - fts_add_doc_id_column(table); + fts_add_doc_id_column(table, heap); } - error = row_create_table_for_mysql(table, trx); + err = row_create_table_for_mysql(table, trx, false); - if (error == DB_DUPLICATE_KEY) { - char buf[100]; + mem_heap_free(heap); + + if (err == DB_DUPLICATE_KEY || err == DB_TABLESPACE_EXISTS) { + char display_name[FN_REFLEN]; char* buf_end = innobase_convert_identifier( - buf, sizeof buf - 1, table_name, strlen(table_name), - trx->mysql_thd, TRUE); + display_name, sizeof(display_name) - 1, + table_name, strlen(table_name), + thd, TRUE); *buf_end = '\0'; - my_error(ER_TABLE_EXISTS_ERROR, MYF(0), buf); + + my_error(err == DB_DUPLICATE_KEY + ? ER_TABLE_EXISTS_ERROR + : ER_TABLESPACE_EXISTS, MYF(0), display_name); } -error_ret: - error = convert_error_code_to_mysql(error, flags, NULL); + if (err == DB_SUCCESS && (flags2 & DICT_TF2_FTS)) { + fts_optimize_add_table(table); + } - DBUG_RETURN(error); +error_ret: + DBUG_RETURN(convert_error_code_to_mysql(err, flags, thd)); } /*****************************************************************//** @@ -8402,108 +8826,113 @@ int create_index( /*=========*/ trx_t* trx, /*!< in: InnoDB transaction handle */ - TABLE* form, /*!< in: information on table + const TABLE* form, /*!< in: information on table columns and indexes */ ulint flags, /*!< in: InnoDB table flags */ const char* table_name, /*!< in: table name */ uint key_num) /*!< in: index number */ { - Field* field; dict_index_t* index; int error; - ulint n_fields; - KEY* key; - KEY_PART_INFO* key_part; + const KEY* key; ulint ind_type; - ulint col_type; - ulint prefix_len = 0; - ulint is_unsigned; - ulint i; - ulint j; - ulint* field_lengths = NULL; + ulint* field_lengths; DBUG_ENTER("create_index"); key = form->key_info + key_num; - n_fields = key->key_parts; - /* Assert that "GEN_CLUST_INDEX" cannot be used as non-primary index */ ut_a(innobase_strcasecmp(key->name, innobase_index_reserve_name) != 0); - 
ind_type = 0; - if (key->flags & HA_FULLTEXT) { - ind_type = DICT_FTS; - } else { - if (key_num == form->s->primary_key) { - ind_type = ind_type | DICT_CLUSTERED; + index = dict_mem_index_create(table_name, key->name, 0, + DICT_FTS, + key->user_defined_key_parts); + + for (ulint i = 0; i < key->user_defined_key_parts; i++) { + KEY_PART_INFO* key_part = key->key_part + i; + dict_mem_index_add_field( + index, key_part->field->field_name, 0); } - if (key->flags & HA_NOSAME ) { - ind_type = ind_type | DICT_UNIQUE; - } - } + DBUG_RETURN(convert_error_code_to_mysql( + row_create_index_for_mysql( + index, trx, NULL), + flags, NULL)); - /* We pass 0 as the space id, and determine at a lower level the space - id where to store the table */ + } - index = dict_mem_index_create(table_name, key->name, 0, - ind_type, n_fields); + ind_type = 0; - if (ind_type != DICT_FTS) { - field_lengths = (ulint*) my_malloc( - sizeof(ulint) * n_fields, MYF(MY_FAE)); + if (key_num == form->s->primary_key) { + ind_type |= DICT_CLUSTERED; + } - ut_ad(!(index->type & DICT_FTS)); + if (key->flags & HA_NOSAME) { + ind_type |= DICT_UNIQUE; } - for (i = 0; i < n_fields; i++) { - key_part = key->key_part + i; + field_lengths = (ulint*) my_malloc( + key->user_defined_key_parts * sizeof * + field_lengths, MYF(MY_FAE)); - if (ind_type != DICT_FTS) { + /* We pass 0 as the space id, and determine at a lower level the space + id where to store the table */ - /* (The flag HA_PART_KEY_SEG denotes in MySQL a - column prefix field in an index: we only store a - specified number of first bytes of the column to - the index field.) The flag does not seem to be - properly set by MySQL. Let us fall back on testing - the length of the key part versus the column. 
*/ + index = dict_mem_index_create(table_name, key->name, 0, + ind_type, key->user_defined_key_parts); - field = NULL; + for (ulint i = 0; i < key->user_defined_key_parts; i++) { + KEY_PART_INFO* key_part = key->key_part + i; + ulint prefix_len; + ulint col_type; + ulint is_unsigned; - for (j = 0; j < form->s->fields; j++) { - field = form->field[j]; + /* (The flag HA_PART_KEY_SEG denotes in MySQL a + column prefix field in an index: we only store a + specified number of first bytes of the column to + the index field.) The flag does not seem to be + properly set by MySQL. Let us fall back on testing + the length of the key part versus the column. */ - if (0 == innobase_strcasecmp( - field->field_name, - key_part->field->field_name)) { - /* Found the corresponding column */ + Field* field = NULL; - break; - } - } + for (ulint j = 0; j < form->s->fields; j++) { - ut_a(j < form->s->fields); + field = form->field[j]; - col_type = get_innobase_type_from_mysql_type( - &is_unsigned, key_part->field); + if (0 == innobase_strcasecmp( + field->field_name, + key_part->field->field_name)) { + /* Found the corresponding column */ - if (DATA_BLOB == col_type - || (key_part->length < field->pack_length() - && field->type() != MYSQL_TYPE_VARCHAR) - || (field->type() == MYSQL_TYPE_VARCHAR - && key_part->length < field->pack_length() - - ((Field_varstring*) field)->length_bytes)) { + goto found; + } + } + ut_error; +found: + col_type = get_innobase_type_from_mysql_type( + &is_unsigned, key_part->field); + + if (DATA_BLOB == col_type + || (key_part->length < field->pack_length() + && field->type() != MYSQL_TYPE_VARCHAR) + || (field->type() == MYSQL_TYPE_VARCHAR + && key_part->length < field->pack_length() + - ((Field_varstring*) field)->length_bytes)) { + + switch (col_type) { + default: prefix_len = key_part->length; - - if (col_type == DATA_INT - || col_type == DATA_FLOAT - || col_type == DATA_DOUBLE - || col_type == DATA_DECIMAL) { - sql_print_error( + break; + case DATA_INT: + 
case DATA_FLOAT: + case DATA_DOUBLE: + case DATA_DECIMAL: + sql_print_error( "MySQL is trying to create a column " "prefix index field, on an " "inappropriate data type. Table " @@ -8511,17 +8940,16 @@ create_index( table_name, key_part->field->field_name); - prefix_len = 0; - } - } else { prefix_len = 0; } - - field_lengths[i] = key_part->length; + } else { + prefix_len = 0; } - dict_mem_index_add_field(index, - (char*) key_part->field->field_name, prefix_len); + field_lengths[i] = key_part->length; + + dict_mem_index_add_field( + index, key_part->field->field_name, prefix_len); } ut_ad(key->flags & HA_FULLTEXT || !(index->type & DICT_FTS)); @@ -8529,9 +8957,10 @@ create_index( /* Even though we've defined max_supported_key_part_length, we still do our own checking using field_lengths to be absolutely sure we don't create too long indexes. */ - error = row_create_index_for_mysql(index, trx, field_lengths); - error = convert_error_code_to_mysql(error, flags, NULL); + error = convert_error_code_to_mysql( + row_create_index_for_mysql(index, trx, field_lengths), + flags, NULL); my_free(field_lengths); @@ -8550,7 +8979,7 @@ create_clustered_index_when_no_primary( const char* table_name) /*!< in: table name */ { dict_index_t* index; - int error; + dberr_t error; /* We pass 0 as the space id, and determine at a lower level the space id where to store the table */ @@ -8560,9 +8989,7 @@ create_clustered_index_when_no_primary( error = row_create_index_for_mysql(index, trx, NULL); - error = convert_error_code_to_mysql(error, flags, NULL); - - return(error); + return(convert_error_code_to_mysql(error, flags, NULL)); } /*****************************************************************//** @@ -8599,11 +9026,11 @@ get_row_format_name( if (!use_tablespace) { \ push_warning_printf( \ thd, Sql_condition::WARN_LEVEL_WARN, \ - HA_WRONG_CREATE_OPTION, \ + ER_ILLEGAL_HA_CREATE_OPTION, \ "InnoDB: ROW_FORMAT=%s requires" \ " innodb_file_per_table.", \ get_row_format_name(row_format)); \ 
- ret = FALSE; \ + ret = "ROW_FORMAT"; \ } /** If file-format is Antelope, issue warning and set ret false */ @@ -8611,11 +9038,11 @@ get_row_format_name( if (srv_file_format < UNIV_FORMAT_B) { \ push_warning_printf( \ thd, Sql_condition::WARN_LEVEL_WARN, \ - HA_WRONG_CREATE_OPTION, \ + ER_ILLEGAL_HA_CREATE_OPTION, \ "InnoDB: ROW_FORMAT=%s requires" \ " innodb_file_format > Antelope.", \ get_row_format_name(row_format)); \ - ret = FALSE; \ + ret = "ROW_FORMAT"; \ } @@ -8624,11 +9051,11 @@ Validates the create options. We may build on this function in future. For now, it checks two specifiers: KEY_BLOCK_SIZE and ROW_FORMAT If innodb_strict_mode is not set then this function is a no-op -@return TRUE if valid. */ -static -ibool -create_options_are_valid( -/*=====================*/ +@return NULL if valid, string if not. */ +UNIV_INTERN +const char* +create_options_are_invalid( +/*=======================*/ THD* thd, /*!< in: connection thread. */ TABLE* form, /*!< in: information on table columns and indexes */ @@ -8636,14 +9063,14 @@ create_options_are_valid( bool use_tablespace) /*!< in: srv_file_per_table */ { ibool kbs_specified = FALSE; - ibool ret = TRUE; + const char* ret = NULL; enum row_type row_format = form->s->row_type; ut_ad(thd != NULL); /* If innodb_strict_mode is not set don't do any validation. 
*/ if (!(THDVAR(thd, strict_mode))) { - return(TRUE); + return(NULL); } ut_ad(form != NULL); @@ -8663,18 +9090,18 @@ create_options_are_valid( if (!use_tablespace) { push_warning( thd, Sql_condition::WARN_LEVEL_WARN, - HA_WRONG_CREATE_OPTION, + ER_ILLEGAL_HA_CREATE_OPTION, "InnoDB: KEY_BLOCK_SIZE requires" " innodb_file_per_table."); - ret = FALSE; + ret = "KEY_BLOCK_SIZE"; } if (srv_file_format < UNIV_FORMAT_B) { push_warning( thd, Sql_condition::WARN_LEVEL_WARN, - HA_WRONG_CREATE_OPTION, + ER_ILLEGAL_HA_CREATE_OPTION, "InnoDB: KEY_BLOCK_SIZE requires" " innodb_file_format > Antelope."); - ret = FALSE; + ret = "KEY_BLOCK_SIZE"; } /* The maximum KEY_BLOCK_SIZE (KBS) is 16. But if @@ -8686,22 +9113,22 @@ create_options_are_valid( if (create_info->key_block_size > kbs_max) { push_warning_printf( thd, Sql_condition::WARN_LEVEL_WARN, - HA_WRONG_CREATE_OPTION, + ER_ILLEGAL_HA_CREATE_OPTION, "InnoDB: KEY_BLOCK_SIZE=%ld" " cannot be larger than %ld.", create_info->key_block_size, kbs_max); - ret = FALSE; + ret = "KEY_BLOCK_SIZE"; } break; default: push_warning_printf( thd, Sql_condition::WARN_LEVEL_WARN, - HA_WRONG_CREATE_OPTION, + ER_ILLEGAL_HA_CREATE_OPTION, "InnoDB: invalid KEY_BLOCK_SIZE = %lu." 
" Valid values are [1, 2, 4, 8, 16]", create_info->key_block_size); - ret = FALSE; + ret = "KEY_BLOCK_SIZE"; break; } } @@ -8722,11 +9149,11 @@ create_options_are_valid( if (kbs_specified) { push_warning_printf( thd, Sql_condition::WARN_LEVEL_WARN, - HA_WRONG_CREATE_OPTION, + ER_ILLEGAL_HA_CREATE_OPTION, "InnoDB: cannot specify ROW_FORMAT = %s" " with KEY_BLOCK_SIZE.", get_row_format_name(row_format)); - ret = FALSE; + ret = "KEY_BLOCK_SIZE"; } break; case ROW_TYPE_DEFAULT: @@ -8736,12 +9163,42 @@ create_options_are_valid( case ROW_TYPE_NOT_USED: push_warning( thd, Sql_condition::WARN_LEVEL_WARN, - HA_WRONG_CREATE_OPTION, \ + ER_ILLEGAL_HA_CREATE_OPTION, \ "InnoDB: invalid ROW_FORMAT specifier."); - ret = FALSE; + ret = "ROW_TYPE"; break; } + /* Use DATA DIRECTORY only with file-per-table. */ + if (create_info->data_file_name && !use_tablespace) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: DATA DIRECTORY requires" + " innodb_file_per_table."); + ret = "DATA DIRECTORY"; + } + + /* Do not use DATA DIRECTORY with TEMPORARY TABLE. */ + if (create_info->data_file_name + && create_info->options & HA_LEX_CREATE_TMP_TABLE) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: DATA DIRECTORY cannot be used" + " for TEMPORARY tables."); + ret = "DATA DIRECTORY"; + } + + /* Do not allow INDEX_DIRECTORY */ + if (create_info->index_file_name) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: INDEX DIRECTORY is not supported"); + ret = "INDEX DIRECTORY"; + } + return(ret); } @@ -8757,11 +9214,18 @@ ha_innobase::update_create_info( ha_innobase::info(HA_STATUS_AUTO); create_info->auto_increment_value = stats.auto_increment_value; } + + /* Update the DATA DIRECTORY name from SYS_DATAFILES. 
*/ + dict_get_and_save_data_dir_path(prebuilt->table, false); + + if (prebuilt->table->data_dir_path) { + create_info->data_file_name = prebuilt->table->data_dir_path; + } } /*****************************************************************//** Initialize the table FTS stopword list -@TRUE if succeed */ +@return TRUE if success */ UNIV_INTERN ibool innobase_fts_load_stopword( @@ -8770,68 +9234,38 @@ innobase_fts_load_stopword( trx_t* trx, /*!< in: transaction */ THD* thd) /*!< in: current thread */ { - return (fts_load_stopword(table, trx, - fts_server_stopword_table, - THDVAR(thd, ft_user_stopword_table), - THDVAR(thd, ft_enable_stopword), FALSE)); + return(fts_load_stopword(table, trx, + fts_server_stopword_table, + THDVAR(thd, ft_user_stopword_table), + THDVAR(thd, ft_enable_stopword), FALSE)); } + /*****************************************************************//** -Creates a new table to an InnoDB database. -@return error number */ +Parses the table name into normal name and either temp path or remote path +if needed. 
+@return 0 if successful, otherwise, error number */ UNIV_INTERN int -ha_innobase::create( -/*================*/ - const char* name, /*!< in: table name */ - TABLE* form, /*!< in: information on table - columns and indexes */ - HA_CREATE_INFO* create_info) /*!< in: more information of the +ha_innobase::parse_table_name( +/*==========================*/ + const char* name, /*!< in/out: table name provided*/ + HA_CREATE_INFO* create_info, /*!< in: more information of the created table, contains also the create statement string */ + ulint flags, /*!< in: flags*/ + ulint flags2, /*!< in: flags2*/ + char* norm_name, /*!< out: normalized table name */ + char* temp_path, /*!< out: absolute path of table */ + char* remote_path) /*!< out: remote path of table */ { - int error; - trx_t* parent_trx; - trx_t* trx; - int primary_key_no; - uint i; - char name2[FN_REFLEN]; - char norm_name[FN_REFLEN]; THD* thd = ha_thd(); - ib_int64_t auto_inc_value; - ulint fts_indexes = 0; - ibool zip_allowed = TRUE; - enum row_type row_format; - rec_format_t innodb_row_format = REC_FORMAT_COMPACT; - - /* Cache the global variable "srv_file_per_table" to a local - variable before using it. Note that "srv_file_per_table" - is not under dict_sys mutex protection, and could be changed - while creating the table. So we read the current value here - and make all further decisions based on this. */ - bool use_tablespace = srv_file_per_table; - - /* Zip Shift Size - log2 - 9 of compressed page size, - zero for uncompressed */ - ulint zip_ssize = 0; - ulint flags = 0; - ulint flags2 = 0; - dict_table_t* innobase_table = NULL; - - /* Cache the value of innodb_file_format, in case it is - modified by another thread while the table is being created. 
*/ - const ulint file_format_allowed = srv_file_format; - const char* stmt; - size_t stmt_len; - - DBUG_ENTER("ha_innobase::create"); - - DBUG_ASSERT(thd != NULL); - DBUG_ASSERT(create_info != NULL); + bool use_tablespace = flags2 & DICT_TF2_USE_TABLESPACE; + DBUG_ENTER("ha_innobase::parse_table_name"); #ifdef __WIN__ /* Names passed in from server are in two formats: 1. <database_name>/<table_name>: for normal table creation - 2. full path: for temp table creation, or sym link + 2. full path: for temp table creation, or DATA DIRECTORY. When srv_file_per_table is on and mysqld_embedded is off, check for full path pattern, i.e. @@ -8842,7 +9276,7 @@ ha_innobase::create( if (use_tablespace && !mysqld_embedded - && (!create_info->options & HA_LEX_CREATE_TMP_TABLE)) { + && !(create_info->options & HA_LEX_CREATE_TMP_TABLE)) { if ((name[1] == ':') || (name[0] == '\\' && name[1] == '\\')) { @@ -8852,26 +9286,113 @@ ha_innobase::create( } #endif - if (form->s->fields > 1000) { - /* The limit probably should be REC_MAX_N_FIELDS - 3 = 1020, - but we play safe here */ + normalize_table_name(norm_name, name); + temp_path[0] = '\0'; + remote_path[0] = '\0'; - DBUG_RETURN(HA_ERR_TO_BIG_ROW); + /* A full path is used for TEMPORARY TABLE and DATA DIRECTORY. + In the case of; + CREATE TEMPORARY TABLE ... DATA DIRECTORY={path} ... ; + We ignore the DATA DIRECTORY. */ + if (create_info->options & HA_LEX_CREATE_TMP_TABLE) { + strncpy(temp_path, name, FN_REFLEN - 1); } + if (create_info->data_file_name) { + bool ignore = false; + + /* Use DATA DIRECTORY only with file-per-table. */ + if (!use_tablespace) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: DATA DIRECTORY requires" + " innodb_file_per_table."); + ignore = true; + } + + /* Do not use DATA DIRECTORY with TEMPORARY TABLE. 
*/ + if (create_info->options & HA_LEX_CREATE_TMP_TABLE) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: DATA DIRECTORY cannot be" + " used for TEMPORARY tables."); + ignore = true; + } + + if (ignore) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + WARN_OPTION_IGNORED, + ER_DEFAULT(WARN_OPTION_IGNORED), + "DATA DIRECTORY"); + } else { + strncpy(remote_path, create_info->data_file_name, + FN_REFLEN - 1); + } + } + + if (create_info->index_file_name) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + WARN_OPTION_IGNORED, + ER_DEFAULT(WARN_OPTION_IGNORED), + "INDEX DIRECTORY"); + } + + DBUG_RETURN(0); +} + +/*****************************************************************//** +Determines InnoDB table flags. +@retval true if successful, false if error */ +UNIV_INTERN +bool +innobase_table_flags( +/*=================*/ + const TABLE* form, /*!< in: table */ + const HA_CREATE_INFO* create_info, /*!< in: information + on table columns and indexes */ + THD* thd, /*!< in: connection */ + bool use_tablespace, /*!< in: whether to create + outside system tablespace */ + ulint* flags, /*!< out: DICT_TF flags */ + ulint* flags2) /*!< out: DICT_TF2 flags */ +{ + DBUG_ENTER("innobase_table_flags"); + + const char* fts_doc_id_index_bad = NULL; + bool zip_allowed = true; + ulint zip_ssize = 0; + enum row_type row_format; + rec_format_t innodb_row_format = REC_FORMAT_COMPACT; + bool use_data_dir; + + /* Cache the value of innodb_file_format, in case it is + modified by another thread while the table is being created. */ + const ulint file_format_allowed = srv_file_format; + + *flags = 0; + *flags2 = 0; + /* Check if there are any FTS indexes defined on this table. 
*/ - for (i = 0; i < form->s->keys; i++) { - KEY* key = form->key_info + i; + for (uint i = 0; i < form->s->keys; i++) { + const KEY* key = &form->key_info[i]; if (key->flags & HA_FULLTEXT) { - ++fts_indexes; + *flags2 |= DICT_TF2_FTS; /* We don't support FTS indexes in temporary tables. */ if (create_info->options & HA_LEX_CREATE_TMP_TABLE) { my_error(ER_INNODB_NO_FT_TEMP_TABLE, MYF(0)); - DBUG_RETURN(-1); + DBUG_RETURN(false); + } + + if (fts_doc_id_index_bad) { + goto index_bad; } } @@ -8884,41 +9405,15 @@ ha_innobase::create( || strcmp(key->name, FTS_DOC_ID_INDEX_NAME) || strcmp(key->key_part[0].field->field_name, FTS_DOC_ID_COL_NAME)) { - push_warning_printf(thd, - Sql_condition::WARN_LEVEL_WARN, - ER_WRONG_NAME_FOR_INDEX, - " InnoDB: Index name %s is reserved" - " for the unique index on" - " FTS_DOC_ID column for FTS" - " document ID indexing" - " on table %s. Please check" - " the index definition to" - " make sure it is of correct" - " type\n", - FTS_DOC_ID_INDEX_NAME, - name); - my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), - FTS_DOC_ID_INDEX_NAME); - DBUG_RETURN(-1); + fts_doc_id_index_bad = key->name; } - } - - strcpy(name2, name); - - normalize_table_name(norm_name, name2); - /* Create the table definition in InnoDB */ - - flags = 0; - - if (fts_indexes > 0) { - flags2 = DICT_TF2_FTS; - } - - /* Validate create options if innodb_strict_mode is set. 
*/ - if (!create_options_are_valid( - thd, form, create_info, use_tablespace)) { - DBUG_RETURN(HA_WRONG_CREATE_OPTION); + if (fts_doc_id_index_bad && (*flags2 & DICT_TF2_FTS)) { +index_bad: + my_error(ER_INNODB_FT_WRONG_DOCID_INDEX, MYF(0), + fts_doc_id_index_bad); + DBUG_RETURN(false); + } } if (create_info->key_block_size) { @@ -8942,7 +9437,7 @@ ha_innobase::create( if (!use_tablespace) { push_warning( thd, Sql_condition::WARN_LEVEL_WARN, - HA_WRONG_CREATE_OPTION, + ER_ILLEGAL_HA_CREATE_OPTION, "InnoDB: KEY_BLOCK_SIZE requires" " innodb_file_per_table."); zip_allowed = FALSE; @@ -8951,7 +9446,7 @@ ha_innobase::create( if (file_format_allowed < UNIV_FORMAT_B) { push_warning( thd, Sql_condition::WARN_LEVEL_WARN, - HA_WRONG_CREATE_OPTION, + ER_ILLEGAL_HA_CREATE_OPTION, "InnoDB: KEY_BLOCK_SIZE requires" " innodb_file_format > Antelope."); zip_allowed = FALSE; @@ -8962,7 +9457,7 @@ ha_innobase::create( PAGE_ZIP_SSIZE_MAX)) { push_warning_printf( thd, Sql_condition::WARN_LEVEL_WARN, - HA_WRONG_CREATE_OPTION, + ER_ILLEGAL_HA_CREATE_OPTION, "InnoDB: ignoring KEY_BLOCK_SIZE=%lu.", create_info->key_block_size); } @@ -8984,7 +9479,7 @@ ha_innobase::create( with ALTER TABLE anyway. 
*/ push_warning_printf( thd, Sql_condition::WARN_LEVEL_WARN, - HA_WRONG_CREATE_OPTION, + ER_ILLEGAL_HA_CREATE_OPTION, "InnoDB: ignoring KEY_BLOCK_SIZE=%lu" " unless ROW_FORMAT=COMPRESSED.", create_info->key_block_size); @@ -9012,14 +9507,14 @@ ha_innobase::create( if (!use_tablespace) { push_warning_printf( thd, Sql_condition::WARN_LEVEL_WARN, - HA_WRONG_CREATE_OPTION, + ER_ILLEGAL_HA_CREATE_OPTION, "InnoDB: ROW_FORMAT=%s requires" " innodb_file_per_table.", get_row_format_name(row_format)); } else if (file_format_allowed == UNIV_FORMAT_A) { push_warning_printf( thd, Sql_condition::WARN_LEVEL_WARN, - HA_WRONG_CREATE_OPTION, + ER_ILLEGAL_HA_CREATE_OPTION, "InnoDB: ROW_FORMAT=%s requires" " innodb_file_format > Antelope.", get_row_format_name(row_format)); @@ -9036,7 +9531,7 @@ ha_innobase::create( case ROW_TYPE_PAGE: push_warning( thd, Sql_condition::WARN_LEVEL_WARN, - HA_WRONG_CREATE_OPTION, + ER_ILLEGAL_HA_CREATE_OPTION, "InnoDB: assuming ROW_FORMAT=COMPACT."); case ROW_TYPE_DEFAULT: /* If we fell through, set row format to Compact. */ @@ -9049,12 +9544,100 @@ ha_innobase::create( if (!zip_allowed) { zip_ssize = 0; } - dict_tf_set(&flags, innodb_row_format, zip_ssize); + + use_data_dir = use_tablespace + && ((create_info->data_file_name != NULL) + && !(create_info->options & HA_LEX_CREATE_TMP_TABLE)); + + dict_tf_set(flags, innodb_row_format, zip_ssize, use_data_dir); + + if (create_info->options & HA_LEX_CREATE_TMP_TABLE) { + *flags2 |= DICT_TF2_TEMPORARY; + } + + if (use_tablespace) { + *flags2 |= DICT_TF2_USE_TABLESPACE; + } + + DBUG_RETURN(true); +} + +/*****************************************************************//** +Creates a new table to an InnoDB database. 
+@return error number */ +UNIV_INTERN +int +ha_innobase::create( +/*================*/ + const char* name, /*!< in: table name */ + TABLE* form, /*!< in: information on table + columns and indexes */ + HA_CREATE_INFO* create_info) /*!< in: more information of the + created table, contains also the + create statement string */ +{ + int error; + trx_t* parent_trx; + trx_t* trx; + int primary_key_no; + uint i; + char norm_name[FN_REFLEN]; /* {database}/{tablename} */ + char temp_path[FN_REFLEN]; /* absolute path of temp frm */ + char remote_path[FN_REFLEN]; /* absolute path of table */ + THD* thd = ha_thd(); + ib_int64_t auto_inc_value; + + /* Cache the global variable "srv_file_per_table" to a local + variable before using it. Note that "srv_file_per_table" + is not under dict_sys mutex protection, and could be changed + while creating the table. So we read the current value here + and make all further decisions based on this. */ + bool use_tablespace = srv_file_per_table; + + /* Zip Shift Size - log2 - 9 of compressed page size, + zero for uncompressed */ + ulint flags; + ulint flags2; + dict_table_t* innobase_table = NULL; + + const char* stmt; + size_t stmt_len; + + DBUG_ENTER("ha_innobase::create"); + + DBUG_ASSERT(thd != NULL); + DBUG_ASSERT(create_info != NULL); + + if (form->s->fields > REC_MAX_N_USER_FIELDS) { + DBUG_RETURN(HA_ERR_TOO_MANY_FIELDS); + } else if (srv_read_only_mode) { + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + + /* Create the table definition in InnoDB */ + + /* Validate create options if innodb_strict_mode is set. 
*/ + if (create_options_are_invalid( + thd, form, create_info, use_tablespace)) { + DBUG_RETURN(HA_WRONG_CREATE_OPTION); + } + + if (!innobase_table_flags(form, create_info, + thd, use_tablespace, + &flags, &flags2)) { + DBUG_RETURN(-1); + } + + error = parse_table_name(name, create_info, flags, flags2, + norm_name, temp_path, remote_path); + if (error) { + DBUG_RETURN(error); + } /* Look for a primary key */ primary_key_no = (form->s->primary_key != MAX_KEY ? - (int) form->s->primary_key : - -1); + (int) form->s->primary_key : + -1); /* Our function innobase_get_mysql_key_number_for_index assumes the primary key is always number 0, if it exists */ @@ -9071,14 +9654,6 @@ ha_innobase::create( DBUG_RETURN(HA_ERR_GENERIC); } - if (create_info->options & HA_LEX_CREATE_TMP_TABLE) { - flags2 |= DICT_TF2_TEMPORARY; - } - - if (use_tablespace) { - flags2 |= DICT_TF2_USE_TABLESPACE; - } - /* Get the transaction associated with the current thd, or create one if not yet created */ @@ -9097,10 +9672,8 @@ ha_innobase::create( row_mysql_lock_data_dictionary(trx); - error = create_table_def(trx, form, norm_name, - create_info->options & HA_LEX_CREATE_TMP_TABLE ? name2 : NULL, - flags, flags2); - + error = create_table_def(trx, form, norm_name, temp_path, + remote_path, flags, flags2); if (error) { goto cleanup; } @@ -9130,20 +9703,20 @@ ha_innobase::create( /* Create the ancillary tables that are common to all FTS indexes on this table. 
*/ - if (fts_indexes > 0) { - ulint ret = 0; + if (flags2 & DICT_TF2_FTS) { + enum fts_doc_id_index_enum ret; - innobase_table = dict_table_open_on_name_no_stats( - norm_name, TRUE, DICT_ERR_IGNORE_NONE); + innobase_table = dict_table_open_on_name( + norm_name, TRUE, FALSE, DICT_ERR_IGNORE_NONE); ut_a(innobase_table); - /* Check whether there alreadys exist FTS_DOC_ID_INDEX */ + /* Check whether there already exists FTS_DOC_ID_INDEX */ ret = innobase_fts_check_doc_id_index_in_def( form->s->keys, form->s->key_info); - /* Raise error if FTS_DOC_ID_INDEX is of wrong format */ - if (ret == FTS_INCORRECT_DOC_ID_INDEX) { + switch (ret) { + case FTS_INCORRECT_DOC_ID_INDEX: push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_WRONG_NAME_FOR_INDEX, @@ -9162,20 +9735,23 @@ ha_innobase::create( fts_free(innobase_table); } - dict_table_close(innobase_table, TRUE); + dict_table_close(innobase_table, TRUE, FALSE); my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), FTS_DOC_ID_INDEX_NAME); error = -1; goto cleanup; + case FTS_EXIST_DOC_ID_INDEX: + case FTS_NOT_EXIST_DOC_ID_INDEX: + break; } - error = fts_create_common_tables( + dberr_t err = fts_create_common_tables( trx, innobase_table, norm_name, (ret == FTS_EXIST_DOC_ID_INDEX)); - error = convert_error_code_to_mysql(error, 0, NULL); + error = convert_error_code_to_mysql(err, 0, NULL); - dict_table_close(innobase_table, TRUE); + dict_table_close(innobase_table, TRUE, FALSE); if (error) { goto cleanup; @@ -9196,11 +9772,11 @@ ha_innobase::create( stmt = innobase_get_stmt(thd, &stmt_len); if (stmt) { - error = row_table_add_foreign_constraints( + dberr_t err = row_table_add_foreign_constraints( trx, stmt, stmt_len, norm_name, create_info->options & HA_LEX_CREATE_TMP_TABLE); - switch (error) { + switch (err) { case DB_PARENT_NO_INDEX: push_warning_printf( @@ -9221,9 +9797,11 @@ ha_innobase::create( " table where referencing columns appear" " as the first columns.\n", norm_name); break; + default: + break; } - error = 
convert_error_code_to_mysql(error, flags, NULL); + error = convert_error_code_to_mysql(err, flags, NULL); if (error) { goto cleanup; @@ -9231,7 +9809,7 @@ ha_innobase::create( } /* Cache all the FTS indexes on this table in the FTS specific structure. They are used for FTS indexed column update handling. */ - if (fts_indexes > 0) { + if (flags2 & DICT_TF2_FTS) { fts_t* fts = innobase_table->fts; ut_a(fts != NULL); @@ -9249,10 +9827,15 @@ ha_innobase::create( log_buffer_flush_to_disk(); - innobase_table = dict_table_open_on_name(norm_name, FALSE); + innobase_table = dict_table_open_on_name( + norm_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE); DBUG_ASSERT(innobase_table != 0); + innobase_copy_frm_flags_from_create_info(innobase_table, create_info); + + dict_stats_update(innobase_table, DICT_STATS_EMPTY_TABLE); + if (innobase_table) { /* We update the highest file format in the system table space, if this table has higher file format setting. */ @@ -9263,9 +9846,9 @@ ha_innobase::create( } /* Load server stopword into FTS cache */ - if (fts_indexes > 0) { + if (flags2 & DICT_TF2_FTS) { if (!innobase_fts_load_stopword(innobase_table, NULL, thd)) { - dict_table_close(innobase_table, FALSE); + dict_table_close(innobase_table, FALSE, FALSE); srv_active_wake_master_thread(); trx_free_for_mysql(trx); DBUG_RETURN(-1); @@ -9302,7 +9885,7 @@ ha_innobase::create( dict_table_autoinc_unlock(innobase_table); } - dict_table_close(innobase_table, FALSE); + dict_table_close(innobase_table, FALSE, FALSE); /* Tell the InnoDB server that there might be work for utility threads: */ @@ -9314,7 +9897,7 @@ ha_innobase::create( DBUG_RETURN(0); cleanup: - innobase_commit_low(trx); + trx_rollback_for_mysql(trx); row_mysql_unlock_data_dictionary(trx); @@ -9332,9 +9915,8 @@ ha_innobase::discard_or_import_tablespace( /*======================================*/ my_bool discard) /*!< in: TRUE if discard, else import */ { + dberr_t err; dict_table_t* dict_table; - trx_t* trx; - int err; 
DBUG_ENTER("ha_innobase::discard_or_import_tablespace"); @@ -9342,18 +9924,85 @@ ha_innobase::discard_or_import_tablespace( ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N); ut_a(prebuilt->trx == thd_to_trx(ha_thd())); + if (srv_read_only_mode) { + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + dict_table = prebuilt->table; - trx = prebuilt->trx; - if (discard) { - err = row_discard_tablespace_for_mysql(dict_table->name, trx); + if (dict_table->space == TRX_SYS_SPACE) { + + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_IN_SYSTEM_TABLESPACE, + table->s->table_name.str); + + DBUG_RETURN(HA_ERR_TABLE_NEEDS_UPGRADE); + } + + trx_start_if_not_started(prebuilt->trx); + + /* In case MySQL calls this in the middle of a SELECT query, release + possible adaptive hash latch to avoid deadlocks of threads. */ + trx_search_latch_release_if_reserved(prebuilt->trx); + + /* Obtain an exclusive lock on the table. */ + err = row_mysql_lock_table( + prebuilt->trx, dict_table, LOCK_X, + discard ? "setting table lock for DISCARD TABLESPACE" + : "setting table lock for IMPORT TABLESPACE"); + + if (err != DB_SUCCESS) { + /* unable to lock the table: do nothing */ + } else if (discard) { + + /* Discarding an already discarded tablespace should be an + idempotent operation. Also, if the .ibd file is missing the + user may want to set the DISCARD flag in order to IMPORT + a new tablespace. */ + + if (dict_table->ibd_file_missing) { + ib_senderrf( + prebuilt->trx->mysql_thd, + IB_LOG_LEVEL_WARN, ER_TABLESPACE_MISSING, + table->s->table_name.str); + } + + err = row_discard_tablespace_for_mysql( + dict_table->name, prebuilt->trx); + + } else if (!dict_table->ibd_file_missing) { + /* Commit the transaction in order to + release the table lock. 
*/ + trx_commit_for_mysql(prebuilt->trx); + + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_EXISTS, table->s->table_name.str); + + DBUG_RETURN(HA_ERR_TABLE_EXIST); } else { - err = row_import_tablespace_for_mysql(dict_table->name, trx); + err = row_import_for_mysql(dict_table, prebuilt); + + if (err == DB_SUCCESS) { + + if (table->found_next_number_field) { + dict_table_autoinc_lock(dict_table); + innobase_initialize_autoinc(); + dict_table_autoinc_unlock(dict_table); + } + + info(HA_STATUS_TIME + | HA_STATUS_CONST + | HA_STATUS_VARIABLE + | HA_STATUS_AUTO); + } } - err = convert_error_code_to_mysql(err, dict_table->flags, NULL); + /* Commit the transaction in order to release the table lock. */ + trx_commit_for_mysql(prebuilt->trx); - DBUG_RETURN(err); + DBUG_RETURN(convert_error_code_to_mysql(err, dict_table->flags, NULL)); } /*****************************************************************//** @@ -9364,10 +10013,15 @@ int ha_innobase::truncate() /*===================*/ { + dberr_t err; int error; DBUG_ENTER("ha_innobase::truncate"); + if (srv_read_only_mode) { + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + /* Get the transaction associated with the current thd, or create one if not yet created, and update prebuilt->trx */ @@ -9378,11 +10032,28 @@ ha_innobase::truncate() } /* Truncate the table in InnoDB */ - error = row_truncate_table_for_mysql(prebuilt->table, prebuilt->trx); + err = row_truncate_table_for_mysql(prebuilt->table, prebuilt->trx); - error = convert_error_code_to_mysql(error, prebuilt->table->flags, - NULL); + switch (err) { + case DB_TABLESPACE_DELETED: + case DB_TABLESPACE_NOT_FOUND: + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + (err == DB_TABLESPACE_DELETED ? 
+ ER_TABLESPACE_DISCARDED : ER_TABLESPACE_MISSING), + table->s->table_name.str); + table->status = STATUS_NOT_FOUND; + error = HA_ERR_NO_SUCH_TABLE; + break; + + default: + error = convert_error_code_to_mysql( + err, prebuilt->table->flags, + prebuilt->trx->mysql_thd); + table->status = STATUS_NOT_FOUND; + break; + } DBUG_RETURN(error); } @@ -9400,12 +10071,11 @@ ha_innobase::delete_table( const char* name) /*!< in: table name */ { ulint name_len; - int error; + dberr_t err; trx_t* parent_trx; trx_t* trx; - THD *thd = ha_thd(); - char norm_name[1000]; - char errstr[1024]; + THD* thd = ha_thd(); + char norm_name[FN_REFLEN]; DBUG_ENTER("ha_innobase::delete_table"); @@ -9413,29 +10083,21 @@ ha_innobase::delete_table( "test_normalize_table_name_low", test_normalize_table_name_low(); ); + DBUG_EXECUTE_IF( + "test_ut_format_name", + test_ut_format_name(); + ); /* Strangely, MySQL passes the table name without the '.frm' extension, in contrast to ::create */ normalize_table_name(norm_name, name); - if (IS_MAGIC_TABLE_AND_USER_DENIED_ACCESS(norm_name, thd)) { + if (srv_read_only_mode) { + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } else if (IS_MAGIC_TABLE_AND_USER_DENIED_ACCESS(norm_name, thd)) { DBUG_RETURN(HA_ERR_GENERIC); } - /* Remove stats for this table and all of its indexes from the - persistent storage if it exists and if there are stats for this - table in there. This function creates its own trx and commits - it. */ - error = dict_stats_delete_table_stats(norm_name, - errstr, sizeof(errstr)); - if (error != DB_SUCCESS) { - push_warning(thd, Sql_condition::WARN_LEVEL_WARN, - ER_LOCK_WAIT_TIMEOUT, errstr); - } - - /* Get the transaction associated with the current thd, or create one - if not yet created */ - parent_trx = check_trx_exists(thd); /* In case MySQL calls this in the middle of a SELECT query, release @@ -9456,14 +10118,14 @@ ha_innobase::delete_table( /* We are doing a DDL operation. 
*/ ++trx->will_lock; + trx->ddl = true; /* Drop the table in InnoDB */ - error = row_drop_table_for_mysql(norm_name, trx, - thd_sql_command(thd) - == SQLCOM_DROP_DB); + err = row_drop_table_for_mysql( + norm_name, trx, thd_sql_command(thd) == SQLCOM_DROP_DB); - if (error == DB_TABLE_NOT_FOUND + if (err == DB_TABLE_NOT_FOUND && innobase_get_lower_case_table_names() == 1) { char* is_part = NULL; #ifdef __WIN__ @@ -9473,25 +10135,25 @@ ha_innobase::delete_table( #endif /* __WIN__ */ if (is_part) { - char par_case_name[MAX_FULL_NAME_LEN + 1]; + char par_case_name[FN_REFLEN]; #ifndef __WIN__ /* Check for the table using lower case name, including the partition separator "P" */ - memcpy(par_case_name, norm_name, strlen(norm_name)); - par_case_name[strlen(norm_name)] = 0; + strcpy(par_case_name, norm_name); innobase_casedn_str(par_case_name); #else /* On Windows platfrom, check whether there exists table name in system table whose name is not being normalized to lower case */ - normalize_table_name_low(par_case_name, name, FALSE); + normalize_table_name_low( + par_case_name, name, FALSE); #endif - error = row_drop_table_for_mysql(par_case_name, trx, - thd_sql_command(thd) - == SQLCOM_DROP_DB); + err = row_drop_table_for_mysql( + par_case_name, trx, + thd_sql_command(thd) == SQLCOM_DROP_DB); } } @@ -9510,9 +10172,7 @@ ha_innobase::delete_table( trx_free_for_mysql(trx); - error = convert_error_code_to_mysql(error, 0, NULL); - - DBUG_RETURN(error); + DBUG_RETURN(convert_error_code_to_mysql(err, 0, NULL)); } /*****************************************************************//** @@ -9538,6 +10198,10 @@ innobase_drop_database( DBUG_ASSERT(hton == innodb_hton_ptr); + if (srv_read_only_mode) { + return; + } + /* In the Windows plugin, thd = current_thd is always NULL */ if (thd) { trx_t* parent_trx = check_trx_exists(thd); @@ -9593,36 +10257,36 @@ innobase_drop_database( innobase_commit_low(trx); trx_free_for_mysql(trx); } + 
/*********************************************************************//** Renames an InnoDB table. -@return 0 or error code */ -static -int +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t innobase_rename_table( /*==================*/ trx_t* trx, /*!< in: transaction */ const char* from, /*!< in: old name of the table */ - const char* to, /*!< in: new name of the table */ - ibool lock_and_commit) - /*!< in: TRUE=lock data dictionary and commit */ + const char* to) /*!< in: new name of the table */ { - int error; - char* norm_to; - char* norm_from; + dberr_t error; + char norm_to[FN_REFLEN]; + char norm_from[FN_REFLEN]; - // Magic number 64 arbitrary - norm_to = (char*) my_malloc(strlen(to) + 64, MYF(0)); - norm_from = (char*) my_malloc(strlen(from) + 64, MYF(0)); + DBUG_ENTER("innobase_rename_table"); + DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); + + ut_ad(!srv_read_only_mode); normalize_table_name(norm_to, to); normalize_table_name(norm_from, from); + DEBUG_SYNC_C("innodb_rename_table_ready"); + /* Serialize data dictionary operations with dictionary mutex: no deadlocks can occur then in these operations */ - if (lock_and_commit) { - row_mysql_lock_data_dictionary(trx); - } + row_mysql_lock_data_dictionary(trx); /* Transaction must be flagged as a locking transaction or it hasn't been started yet. 
*/ @@ -9630,7 +10294,7 @@ innobase_rename_table( ut_a(trx->will_lock > 0); error = row_rename_table_for_mysql( - norm_from, norm_to, trx, lock_and_commit); + norm_from, norm_to, trx, TRUE); if (error != DB_SUCCESS) { if (error == DB_TABLE_NOT_FOUND @@ -9643,39 +10307,36 @@ innobase_rename_table( #endif /* __WIN__ */ if (is_part) { - char par_case_name[MAX_FULL_NAME_LEN + 1]; - + char par_case_name[FN_REFLEN]; #ifndef __WIN__ /* Check for the table using lower case name, including the partition separator "P" */ - memcpy(par_case_name, norm_from, - strlen(norm_from)); - par_case_name[strlen(norm_from)] = 0; + strcpy(par_case_name, norm_from); innobase_casedn_str(par_case_name); #else /* On Windows platfrom, check whether there exists table name in system table whose name is not being normalized to lower case */ - normalize_table_name_low(par_case_name, - from, FALSE); + normalize_table_name_low( + par_case_name, from, FALSE); #endif error = row_rename_table_for_mysql( - par_case_name, norm_to, trx, - lock_and_commit); - + par_case_name, norm_to, trx, TRUE); } } if (error != DB_SUCCESS) { - FILE* ef = dict_foreign_err_file; - - fputs("InnoDB: Renaming table ", ef); - ut_print_name(ef, trx, TRUE, norm_from); - fputs(" to ", ef); - ut_print_name(ef, trx, TRUE, norm_to); - fputs(" failed!\n", ef); + if (!srv_read_only_mode) { + FILE* ef = dict_foreign_err_file; + + fputs("InnoDB: Renaming table ", ef); + ut_print_name(ef, trx, TRUE, norm_from); + fputs(" to ", ef); + ut_print_name(ef, trx, TRUE, norm_to); + fputs(" failed!\n", ef); + } } else { #ifndef __WIN__ sql_print_warning("Rename partition table %s " @@ -9696,20 +10357,15 @@ innobase_rename_table( } } - if (lock_and_commit) { - row_mysql_unlock_data_dictionary(trx); - - /* Flush the log to reduce probability that the .frm - files and the InnoDB data dictionary get out-of-sync - if the user runs with innodb_flush_log_at_trx_commit = 0 */ + row_mysql_unlock_data_dictionary(trx); - log_buffer_flush_to_disk(); - } + /* 
Flush the log to reduce probability that the .frm + files and the InnoDB data dictionary get out-of-sync + if the user runs with innodb_flush_log_at_trx_commit = 0 */ - my_free(norm_to); - my_free(norm_from); + log_buffer_flush_to_disk(); - return(error); + DBUG_RETURN(error); } /*********************************************************************//** @@ -9723,12 +10379,17 @@ ha_innobase::rename_table( const char* to) /*!< in: new name of the table */ { trx_t* trx; - int error; + dberr_t error; trx_t* parent_trx; THD* thd = ha_thd(); DBUG_ENTER("ha_innobase::rename_table"); + if (srv_read_only_mode) { + ib_senderrf(thd, IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + /* Get the transaction associated with the current thd, or create one if not yet created */ @@ -9741,15 +10402,11 @@ ha_innobase::rename_table( trx = innobase_trx_allocate(thd); - /* Either the transaction is already flagged as a locking transaction - or it hasn't been started yet. */ - - ut_a(!trx_is_started(trx) || trx->will_lock > 0); - /* We are doing a DDL operation. */ ++trx->will_lock; + trx_set_dict_operation(trx, TRX_DICT_OP_INDEX); - error = innobase_rename_table(trx, from, to, TRUE); + error = innobase_rename_table(trx, from, to); DEBUG_SYNC(thd, "after_innobase_rename_table"); @@ -9761,6 +10418,27 @@ ha_innobase::rename_table( innobase_commit_low(trx); trx_free_for_mysql(trx); + if (error == DB_SUCCESS) { + char norm_from[MAX_FULL_NAME_LEN]; + char norm_to[MAX_FULL_NAME_LEN]; + char errstr[512]; + dberr_t ret; + + normalize_table_name(norm_from, from); + normalize_table_name(norm_to, to); + + ret = dict_stats_rename_table(norm_from, norm_to, + errstr, sizeof(errstr)); + + if (ret != DB_SUCCESS) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: %s\n", errstr); + + push_warning(thd, Sql_condition::WARN_LEVEL_WARN, + ER_LOCK_WAIT_TIMEOUT, errstr); + } + } + /* Add a special case to handle the Duplicated Key error and return DB_ERROR instead. 
This is to avoid a possible SIGSEGV error from mysql error @@ -9773,15 +10451,13 @@ ha_innobase::rename_table( the dup key error here is due to an existing table whose name is the one we are trying to rename to) and return the generic error code. */ - if (error == (int) DB_DUPLICATE_KEY) { + if (error == DB_DUPLICATE_KEY) { my_error(ER_TABLE_EXISTS_ERROR, MYF(0), to); error = DB_ERROR; } - error = convert_error_code_to_mysql(error, 0, NULL); - - DBUG_RETURN(error); + DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); } /*********************************************************************//** @@ -9840,7 +10516,7 @@ ha_innobase::records_in_range( goto func_exit; } - key_parts= key->key_parts; + key_parts= key->ext_key_parts; if ((min_key && min_key->keypart_map>=(key_part_map) (1<<key_parts)) || (max_key && max_key->keypart_map>=(key_part_map) (1<<key_parts))) key_parts= key->ext_key_parts; @@ -9848,11 +10524,11 @@ ha_innobase::records_in_range( heap = mem_heap_create(2 * (key_parts * sizeof(dfield_t) + sizeof(dtuple_t))); - range_start = dtuple_create(heap, key_parts); - dict_index_copy_types(range_start, index, key_parts); + range_start = dtuple_create(heap, key_parts); + dict_index_copy_types(range_start, index, key_parts); - range_end = dtuple_create(heap, key_parts); - dict_index_copy_types(range_end, index, key_parts); + range_end = dtuple_create(heap, key_parts); + dict_index_copy_types(range_end, index, key_parts); row_sel_convert_mysql_key_to_innobase( range_start, @@ -9921,10 +10597,10 @@ ha_rows ha_innobase::estimate_rows_upper_bound() /*====================================*/ { - dict_index_t* index; - ulonglong estimate; - ulonglong local_data_file_length; - ulint stat_n_leaf_pages; + const dict_index_t* index; + ulonglong estimate; + ulonglong local_data_file_length; + ulint stat_n_leaf_pages; DBUG_ENTER("estimate_rows_upper_bound"); @@ -9934,8 +10610,7 @@ ha_innobase::estimate_rows_upper_bound() update_thd(ha_thd()); - prebuilt->trx->op_info = 
(char*) - "calculating upper bound for table rows"; + prebuilt->trx->op_info = "calculating upper bound for table rows"; /* In case MySQL calls this in the middle of a SELECT query, release possible adaptive hash latch to avoid deadlocks of threads */ @@ -9951,16 +10626,15 @@ ha_innobase::estimate_rows_upper_bound() local_data_file_length = ((ulonglong) stat_n_leaf_pages) * UNIV_PAGE_SIZE; - /* Calculate a minimum length for a clustered index record and from that an upper bound for the number of rows. Since we only calculate new statistics in row0mysql.cc when a table has grown by a threshold factor, we must add a safety factor 2 in front of the formula below. */ - estimate = 2 * local_data_file_length / - dict_index_calc_min_rec_len(index); + estimate = 2 * local_data_file_length + / dict_index_calc_min_rec_len(index); - prebuilt->trx->op_info = (char*)""; + prebuilt->trx->op_info = ""; DBUG_RETURN((ha_rows) estimate); } @@ -9980,7 +10654,32 @@ ha_innobase::scan_time() as a random disk read, that is, we do not divide the following by 10, which would be physically realistic. */ - return((double) (prebuilt->table->stat_clustered_index_size)); + /* The locking below is disabled for performance reasons. Without + it we could end up returning uninitialized value to the caller, + which in the worst case could make some query plan go bogus or + issue a Valgrind warning. 
*/ +#if 0 + /* avoid potential lock order violation with dict_table_stats_lock() + below */ + update_thd(ha_thd()); + trx_search_latch_release_if_reserved(prebuilt->trx); +#endif + + ulint stat_clustered_index_size; + +#if 0 + dict_table_stats_lock(prebuilt->table, RW_S_LATCH); +#endif + + ut_a(prebuilt->table->stat_initialized); + + stat_clustered_index_size = prebuilt->table->stat_clustered_index_size; + +#if 0 + dict_table_stats_unlock(prebuilt->table, RW_S_LATCH); +#endif + + return((double) stat_clustered_index_size); } /******************************************************************//** @@ -10016,6 +10715,16 @@ ha_innobase::read_time( return(ranges + (double) rows / (double) total_rows * time_for_scan); } +/******************************************************************//** +Return the size of the InnoDB memory buffer. */ +UNIV_INTERN +longlong +ha_innobase::get_memory_buffer_size() const +/*=======================================*/ +{ + return(innobase_buffer_pool_size); +} + /*********************************************************************//** Calculates the key number used inside MySQL for an Innobase index. We will first check the "index translation table" for a match of the index to get @@ -10041,9 +10750,6 @@ innobase_get_mysql_key_number_for_index( unsigned int i; ut_a(index); - /* - ut_ad(strcmp(index->table->name, ib_table->name) == 0); - */ /* If index does not belong to the table object of share structure (ib_table comes from the share structure) search the index->table @@ -10074,12 +10780,9 @@ innobase_get_mysql_key_number_for_index( } } - /* If index_count in translation table is set to 0, it - is possible we are in the process of rebuilding table, - do not spit error in this case */ - if (share->idx_trans_tbl.index_count) { - /* Print an error message if we cannot find the index - ** in the "index translation table". */ + /* Print an error message if we cannot find the index + in the "index translation table". 
*/ + if (*index->name != TEMP_INDEX_PREFIX) { sql_print_error("Cannot find index %s in InnoDB index " "translation table.", index->name); } @@ -10103,10 +10806,16 @@ innobase_get_mysql_key_number_for_index( ind != NULL; ind = dict_table_get_next_index(ind)) { if (index == ind) { - sql_print_error("Find index %s in InnoDB index list " + /* Temp index is internal to InnoDB, that is + not present in the MySQL index list, so no + need to print such mismatch warning. */ + if (*(index->name) != TEMP_INDEX_PREFIX) { + sql_print_warning( + "Find index %s in InnoDB index list " "but not its MySQL index number " "It could be an InnoDB internal index.", index->name); + } return(-1); } } @@ -10130,45 +10839,49 @@ innodb_rec_per_key( ha_rows records) /*!< in: estimated total records */ { ha_rows rec_per_key; + ib_uint64_t n_diff; + + ut_a(index->table->stat_initialized); ut_ad(i < dict_index_get_n_unique(index)); - /* Note the stat_n_diff_key_vals[] stores the diff value with - n-prefix indexing, so it is always stat_n_diff_key_vals[i + 1] */ - if (index->stat_n_diff_key_vals[i + 1] == 0) { + n_diff = index->stat_n_diff_key_vals[i]; + + if (n_diff == 0) { rec_per_key = records; } else if (srv_innodb_stats_method == SRV_STATS_NULLS_IGNORED) { - ib_uint64_t num_null; + ib_uint64_t n_null; + ib_uint64_t n_non_null; + + n_non_null = index->stat_n_non_null_key_vals[i]; /* In theory, index->stat_n_non_null_key_vals[i] should always be less than the number of records. Since this is statistics value, the value could have slight discrepancy. But we will make sure the number of null values is not a negative number. */ - if (records < index->stat_n_non_null_key_vals[i]) { - num_null = 0; + if (records < n_non_null) { + n_null = 0; } else { - num_null = records - index->stat_n_non_null_key_vals[i]; + n_null = records - n_non_null; } /* If the number of NULL values is the same as or large than that of the distinct values, we could consider that the table consists mostly of NULL value. 
Set rec_per_key to 1. */ - if (index->stat_n_diff_key_vals[i + 1] <= num_null) { + if (n_diff <= n_null) { rec_per_key = 1; } else { /* Need to exclude rows with NULL values from rec_per_key calculation */ - rec_per_key = (ha_rows)( - (records - num_null) - / (index->stat_n_diff_key_vals[i + 1] - - num_null)); + rec_per_key = (ha_rows) + ((records - n_null) / (n_diff - n_null)); } } else { - rec_per_key = (ha_rows) - (records / index->stat_n_diff_key_vals[i + 1]); + DEBUG_SYNC_C("after_checking_for_0"); + rec_per_key = (ha_rows) (records / n_diff); } return(rec_per_key); @@ -10182,17 +10895,12 @@ UNIV_INTERN int ha_innobase::info_low( /*==================*/ - uint flag, /*!< in: what information MySQL - requests */ - dict_stats_upd_option_t stats_upd_option) - /*!< in: whether to (re) calc - the stats or to fetch them from - the persistent storage */ + uint flag, /*!< in: what information is requested */ + bool is_analyze) { dict_table_t* ib_table; - dict_index_t* index; ha_rows rec_per_key; - ib_int64_t n_rows; + ib_uint64_t n_rows; char path[FN_REFLEN]; os_file_stat_t stat_info; @@ -10216,37 +10924,52 @@ ha_innobase::info_low( trx_search_latch_release_if_reserved(prebuilt->trx); ib_table = prebuilt->table; + DBUG_ASSERT(ib_table->n_ref_count > 0); if (flag & HA_STATUS_TIME) { - if (stats_upd_option != DICT_STATS_FETCH - || innobase_stats_on_metadata) { - /* In sql_show we call with this flag: update - then statistics so that they are up-to-date */ - enum db_err ret; + if (is_analyze || innobase_stats_on_metadata) { + + dict_stats_upd_option_t opt; + dberr_t ret; prebuilt->trx->op_info = "updating table statistics"; + if (dict_stats_is_persistent_enabled(ib_table)) { + + ut_ad(!srv_read_only_mode); + + if (is_analyze) { + opt = DICT_STATS_RECALC_PERSISTENT; + } else { + /* This is e.g. 'SHOW INDEXES', fetch + the persistent stats from disk. 
*/ + opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY; + } + } else { + opt = DICT_STATS_RECALC_TRANSIENT; + } + ut_ad(!mutex_own(&dict_sys->mutex)); - ret = dict_stats_update(ib_table, stats_upd_option, - FALSE); + ret = dict_stats_update(ib_table, opt); if (ret != DB_SUCCESS) { prebuilt->trx->op_info = ""; DBUG_RETURN(HA_ERR_GENERIC); } - prebuilt->trx->op_info = "returning various info to MySQL"; + prebuilt->trx->op_info = + "returning various info to MySQL"; } my_snprintf(path, sizeof(path), "%s/%s%s", - mysql_data_home, ib_table->name, reg_ext); + mysql_data_home, ib_table->name, reg_ext); unpack_filename(path,path); /* Note that we do not know the access time of the table, nor the CHECK TABLE time, nor the UPDATE or INSERT time. */ - if (os_file_get_status(path,&stat_info)) { + if (os_file_get_status(path, &stat_info, false) == DB_SUCCESS) { stats.create_time = (ulong) stat_info.ctime; } } @@ -10254,13 +10977,28 @@ ha_innobase::info_low( if (flag & HA_STATUS_VARIABLE) { ulint page_size; + ulint stat_clustered_index_size; + ulint stat_sum_of_other_index_sizes; + + if (!(flag & HA_STATUS_NO_LOCK)) { + dict_table_stats_lock(ib_table, RW_S_LATCH); + } + + ut_a(ib_table->stat_initialized); n_rows = ib_table->stat_n_rows; - /* Because we do not protect stat_n_rows by any mutex in a - delete, it is theoretically possible that the value can be - smaller than zero! TODO: fix this race. + stat_clustered_index_size + = ib_table->stat_clustered_index_size; + + stat_sum_of_other_index_sizes + = ib_table->stat_sum_of_other_index_sizes; + + if (!(flag & HA_STATUS_NO_LOCK)) { + dict_table_stats_unlock(ib_table, RW_S_LATCH); + } + /* The MySQL optimizer seems to assume in a left join that n_rows is an accurate estimate if it is zero. Of course, it is not, since we do not have any locks on the rows yet at this phase. @@ -10270,10 +11008,6 @@ ha_innobase::info_low( set. That way SHOW TABLE STATUS will show the best estimate, while the optimizer never sees the table empty. 
*/ - if (n_rows < 0) { - n_rows = 0; - } - if (n_rows == 0 && !(flag & HA_STATUS_TIME)) { n_rows++; } @@ -10303,10 +11037,10 @@ ha_innobase::info_low( stats.records = (ha_rows) n_rows; stats.deleted = 0; stats.data_file_length - = ((ulonglong) ib_table->stat_clustered_index_size) + = ((ulonglong) stat_clustered_index_size) * page_size; - stats.index_file_length = - ((ulonglong) ib_table->stat_sum_of_other_index_sizes) + stats.index_file_length + = ((ulonglong) stat_sum_of_other_index_sizes) * page_size; /* Since fsp_get_available_space_in_free_extents() is @@ -10346,8 +11080,8 @@ ha_innobase::info_low( "space for table %s but its " "tablespace has been discarded or " "the .ibd file is missing. Setting " - "the free space to zero. " - "(Errcode: %M)", + "the free space to zero. " + "(errno: %M)", ib_table->name, errno); stats.delete_length = 0; @@ -10357,7 +11091,7 @@ ha_innobase::info_low( } stats.check_time = 0; - stats.mrr_length_per_rec = ref_length + sizeof(void*); + stats.mrr_length_per_rec= ref_length + 8; // 8 = max(sizeof(void *)); if (stats.records == 0) { stats.mean_rec_length = 0; @@ -10373,12 +11107,40 @@ ha_innobase::info_low( matches up. If prebuilt->clust_index_was_generated holds, InnoDB defines GEN_CLUST_INDEX internally */ ulint num_innodb_index = UT_LIST_GET_LEN(ib_table->indexes) - - prebuilt->clust_index_was_generated; + - prebuilt->clust_index_was_generated; + if (table->s->keys < num_innodb_index) { + /* If there are too many indexes defined + inside InnoDB, ignore those that are being + created, because MySQL will only consider + the fully built indexes here. */ + + for (const dict_index_t* index + = UT_LIST_GET_FIRST(ib_table->indexes); + index != NULL; + index = UT_LIST_GET_NEXT(indexes, index)) { + + /* First, online index creation is + completed inside InnoDB, and then + MySQL attempts to upgrade the + meta-data lock so that it can rebuild + the .frm file. 
If we get here in that + time frame, dict_index_is_online_ddl() + would not hold and the index would + still not be included in TABLE_SHARE. */ + if (*index->name == TEMP_INDEX_PREFIX) { + num_innodb_index--; + } + } + + if (table->s->keys < num_innodb_index + && innobase_fts_check_doc_id_index( + ib_table, NULL, NULL) + == FTS_EXIST_DOC_ID_INDEX) { + num_innodb_index--; + } + } - if (table->s->keys != num_innodb_index - && (innobase_fts_check_doc_id_index(ib_table, NULL) - == FTS_EXIST_DOC_ID_INDEX - && table->s->keys != (num_innodb_index - 1))) { + if (table->s->keys != num_innodb_index) { sql_print_error("InnoDB: Table %s contains %lu " "indexes inside InnoDB, which " "is different from the number of " @@ -10387,6 +11149,12 @@ ha_innobase::info_low( table->s->keys); } + if (!(flag & HA_STATUS_NO_LOCK)) { + dict_table_stats_lock(ib_table, RW_S_LATCH); + } + + ut_a(ib_table->stat_initialized); + for (i = 0; i < table->s->keys; i++) { ulong j; rec_per_key = 1; @@ -10395,7 +11163,7 @@ ha_innobase::info_low( The identity of index (match up index name with that of table->key_info[i]) is already verified in innobase_get_index(). 
*/ - index = innobase_get_index(i); + dict_index_t* index = innobase_get_index(i); if (index == NULL) { sql_print_error("Table %s contains fewer " @@ -10410,7 +11178,7 @@ ha_innobase::info_low( break; } - for (j = 0; j < table->key_info[i].key_parts; j++) { + for (j = 0; j < table->key_info[i].ext_key_parts; j++) { if (table->key_info[i].flags & HA_FULLTEXT) { /* The whole concept has no validity @@ -10459,13 +11227,15 @@ ha_innobase::info_low( key_part_map ext_key_part_map= key_info->ext_key_part_map; - if (key_info->key_parts != key_info->ext_key_parts) { + if (key_info->user_defined_key_parts != + key_info->ext_key_parts) + { KEY *pk_key_info= key_info+ table->s->primary_key; - uint k = key_info->key_parts; + uint k = key_info->user_defined_key_parts; ha_rows k_rec_per_key = rec_per_key; - uint pk_parts = pk_key_info->key_parts; + uint pk_parts = pk_key_info->user_defined_key_parts; index= innobase_get_index( table->s->primary_key); @@ -10500,6 +11270,10 @@ ha_innobase::info_low( } } } + + if (!(flag & HA_STATUS_NO_LOCK)) { + dict_table_stats_unlock(ib_table, RW_S_LATCH); + } } if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) { @@ -10522,7 +11296,7 @@ ha_innobase::info_low( errkey = (unsigned int) ( (prebuilt->trx->error_key_num == ULINT_UNDEFINED) - ? -1 + ? 
~0 : prebuilt->trx->error_key_num); } } @@ -10545,9 +11319,9 @@ UNIV_INTERN int ha_innobase::info( /*==============*/ - uint flag) /*!< in: what information MySQL requests */ + uint flag) /*!< in: what information is requested */ { - return(info_low(flag, DICT_STATS_FETCH)); + return(this->info_low(flag, false /* not ANALYZE */)); } /**********************************************************************//** @@ -10561,19 +11335,13 @@ ha_innobase::analyze( THD* thd, /*!< in: connection thread handle */ HA_CHECK_OPT* check_opt) /*!< in: currently ignored */ { - dict_stats_upd_option_t upd_option; - int ret; + int ret; - if (THDVAR(thd, analyze_is_persistent)) { - upd_option = DICT_STATS_RECALC_PERSISTENT; - } else { - upd_option = DICT_STATS_RECALC_TRANSIENT; - } - - /* Simply call ::info_low() with all the flags + /* Simply call this->info_low() with all the flags and request recalculation of the statistics */ - ret = info_low(HA_STATUS_TIME | HA_STATUS_CONST | HA_STATUS_VARIABLE, - upd_option); + ret = this->info_low( + HA_STATUS_TIME | HA_STATUS_CONST | HA_STATUS_VARIABLE, + true /* this is ANALYZE */); if (ret != 0) { return(HA_ADMIN_FAILED); @@ -10646,19 +11414,23 @@ ha_innobase::check( build_template(true); } - if (prebuilt->table->ibd_file_missing) { - sql_print_error("InnoDB: Error:\n" - "InnoDB: MySQL is trying to use a table handle" - " but the .ibd file for\n" - "InnoDB: table %s does not exist.\n" - "InnoDB: Have you deleted the .ibd file" - " from the database directory under\n" - "InnoDB: the MySQL datadir, or have you" - " used DISCARD TABLESPACE?\n" - "InnoDB: Please refer to\n" - "InnoDB: " REFMAN "innodb-troubleshooting.html\n" - "InnoDB: how you can resolve the problem.\n", - prebuilt->table->name); + if (dict_table_is_discarded(prebuilt->table)) { + + ib_senderrf( + thd, + IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + DBUG_RETURN(HA_ADMIN_CORRUPT); + + } else if (prebuilt->table->ibd_file_missing) { + + 
ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_MISSING, + table->s->table_name.str); + DBUG_RETURN(HA_ADMIN_CORRUPT); } @@ -10684,27 +11456,23 @@ ha_innobase::check( /* Enlarge the fatal lock wait timeout during CHECK TABLE. */ os_increment_counter_by_amount( server_mutex, - srv_fatal_semaphore_wait_threshold, 7200/*2 hours*/); + srv_fatal_semaphore_wait_threshold, + SRV_SEMAPHORE_WAIT_EXTENSION); for (index = dict_table_get_first_index(prebuilt->table); index != NULL; index = dict_table_get_next_index(index)) { char index_name[MAX_FULL_NAME_LEN + 1]; -#if 0 - fputs("Validating index ", stderr); - ut_print_name(stderr, trx, FALSE, index->name); - putc('\n', stderr); -#endif - /* If this is an index being created, break */ + /* If this is an index being created or dropped, break */ if (*index->name == TEMP_INDEX_PREFIX) { break; - } else if (!btr_validate_index(index, prebuilt->trx)) { + } else if (!btr_validate_index(index, prebuilt->trx)) { is_ok = FALSE; innobase_format_name( index_name, sizeof index_name, - prebuilt->index->name, TRUE); + index->name, TRUE); push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_NOT_KEYFILE, @@ -10768,9 +11536,8 @@ ha_innobase::check( " index %s is corrupted.", index_name); is_ok = FALSE; - row_mysql_lock_data_dictionary(prebuilt->trx); - dict_set_corrupted(index); - row_mysql_unlock_data_dictionary(prebuilt->trx); + dict_set_corrupted( + index, prebuilt->trx, "CHECK TABLE"); } if (thd_kill_level(user_thd)) { @@ -10805,9 +11572,8 @@ ha_innobase::check( index = dict_table_get_first_index(prebuilt->table); if (!dict_index_is_corrupted(index)) { - mutex_enter(&dict_sys->mutex); - dict_set_corrupted(index); - mutex_exit(&dict_sys->mutex); + dict_set_corrupted( + index, prebuilt->trx, "CHECK TABLE"); } prebuilt->table->corrupted = TRUE; } @@ -10828,7 +11594,8 @@ ha_innobase::check( /* Restore the fatal lock wait timeout after CHECK TABLE. 
*/ os_decrement_counter_by_amount( server_mutex, - srv_fatal_semaphore_wait_threshold, 7200/*2 hours*/); + srv_fatal_semaphore_wait_threshold, + SRV_SEMAPHORE_WAIT_EXTENSION); prebuilt->trx->op_info = ""; if (thd_kill_level(user_thd)) { @@ -10873,40 +11640,47 @@ ha_innobase::update_table_comment( /* output the data to a temporary file */ - mutex_enter(&srv_dict_tmpfile_mutex); - rewind(srv_dict_tmpfile); + if (!srv_read_only_mode) { - fprintf(srv_dict_tmpfile, "InnoDB free: %llu kB", - fsp_get_available_space_in_free_extents( - prebuilt->table->space)); + mutex_enter(&srv_dict_tmpfile_mutex); - dict_print_info_on_foreign_keys(FALSE, srv_dict_tmpfile, - prebuilt->trx, prebuilt->table); - flen = ftell(srv_dict_tmpfile); - if (flen < 0) { - flen = 0; - } else if (length + flen + 3 > 64000) { - flen = 64000 - 3 - length; - } + rewind(srv_dict_tmpfile); - /* allocate buffer for the full string, and - read the contents of the temporary file */ + fprintf(srv_dict_tmpfile, "InnoDB free: %llu kB", + fsp_get_available_space_in_free_extents( + prebuilt->table->space)); - str = (char*) my_malloc(length + flen + 3, MYF(0)); + dict_print_info_on_foreign_keys( + FALSE, srv_dict_tmpfile, prebuilt->trx, + prebuilt->table); - if (str) { - char* pos = str + length; - if (length) { - memcpy(str, comment, length); - *pos++ = ';'; - *pos++ = ' '; + flen = ftell(srv_dict_tmpfile); + + if (flen < 0) { + flen = 0; + } else if (length + flen + 3 > 64000) { + flen = 64000 - 3 - length; } - rewind(srv_dict_tmpfile); - flen = (uint) fread(pos, 1, flen, srv_dict_tmpfile); - pos[flen] = 0; - } - mutex_exit(&srv_dict_tmpfile_mutex); + /* allocate buffer for the full string, and + read the contents of the temporary file */ + + str = (char*) my_malloc(length + flen + 3, MYF(0)); + + if (str) { + char* pos = str + length; + if (length) { + memcpy(str, comment, length); + *pos++ = ';'; + *pos++ = ' '; + } + rewind(srv_dict_tmpfile); + flen = (uint) fread(pos, 1, flen, srv_dict_tmpfile); + pos[flen] = 
0; + } + + mutex_exit(&srv_dict_tmpfile_mutex); + } prebuilt->trx->op_info = (char*)""; @@ -10923,8 +11697,8 @@ char* ha_innobase::get_foreign_key_create_info(void) /*==========================================*/ { - char* str = 0; long flen; + char* str = 0; ut_a(prebuilt != NULL); @@ -10942,31 +11716,36 @@ ha_innobase::get_foreign_key_create_info(void) trx_search_latch_release_if_reserved(prebuilt->trx); - mutex_enter(&srv_dict_tmpfile_mutex); - rewind(srv_dict_tmpfile); + if (!srv_read_only_mode) { + mutex_enter(&srv_dict_tmpfile_mutex); + rewind(srv_dict_tmpfile); - /* output the data to a temporary file */ - dict_print_info_on_foreign_keys(TRUE, srv_dict_tmpfile, - prebuilt->trx, prebuilt->table); - prebuilt->trx->op_info = (char*)""; + /* Output the data to a temporary file */ + dict_print_info_on_foreign_keys( + TRUE, srv_dict_tmpfile, prebuilt->trx, + prebuilt->table); - flen = ftell(srv_dict_tmpfile); - if (flen < 0) { - flen = 0; - } + prebuilt->trx->op_info = (char*)""; - /* allocate buffer for the string, and - read the contents of the temporary file */ + flen = ftell(srv_dict_tmpfile); - str = (char*) my_malloc(flen + 1, MYF(0)); + if (flen < 0) { + flen = 0; + } - if (str) { - rewind(srv_dict_tmpfile); - flen = (uint) fread(str, 1, flen, srv_dict_tmpfile); - str[flen] = 0; - } + /* Allocate buffer for the string, and + read the contents of the temporary file */ + + str = (char*) my_malloc(flen + 1, MYF(0)); + + if (str) { + rewind(srv_dict_tmpfile); + flen = (uint) fread(str, 1, flen, srv_dict_tmpfile); + str[flen] = 0; + } - mutex_exit(&srv_dict_tmpfile_mutex); + mutex_exit(&srv_dict_tmpfile_mutex); + } return(str); } @@ -11180,17 +11959,16 @@ ha_innobase::can_switch_engines(void) bool can_switch; DBUG_ENTER("ha_innobase::can_switch_engines"); - - ut_a(prebuilt->trx == thd_to_trx(ha_thd())); + update_thd(); prebuilt->trx->op_info = "determining if there are foreign key constraints"; - row_mysql_lock_data_dictionary(prebuilt->trx); + 
row_mysql_freeze_data_dictionary(prebuilt->trx); can_switch = !UT_LIST_GET_FIRST(prebuilt->table->referenced_list) && !UT_LIST_GET_FIRST(prebuilt->table->foreign_list); - row_mysql_unlock_data_dictionary(prebuilt->trx); + row_mysql_unfreeze_data_dictionary(prebuilt->trx); prebuilt->trx->op_info = ""; DBUG_RETURN(can_switch); @@ -11239,50 +12017,52 @@ ha_innobase::extra( enum ha_extra_function operation) /*!< in: HA_EXTRA_FLUSH or some other flag */ { + check_trx_exists(ha_thd()); + /* Warning: since it is not sure that MySQL calls external_lock before calling this function, the trx field in prebuilt can be obsolete! */ switch (operation) { - case HA_EXTRA_FLUSH: - if (prebuilt->blob_heap) { - row_mysql_prebuilt_free_blob_heap(prebuilt); - } - break; - case HA_EXTRA_RESET_STATE: - reset_template(); - thd_to_trx(ha_thd())->duplicates = 0; - break; - case HA_EXTRA_NO_KEYREAD: - prebuilt->read_just_key = 0; - break; - case HA_EXTRA_KEYREAD: - prebuilt->read_just_key = 1; - break; - case HA_EXTRA_KEYREAD_PRESERVE_FIELDS: - prebuilt->keep_other_fields_on_keyread = 1; - break; + case HA_EXTRA_FLUSH: + if (prebuilt->blob_heap) { + row_mysql_prebuilt_free_blob_heap(prebuilt); + } + break; + case HA_EXTRA_RESET_STATE: + reset_template(); + thd_to_trx(ha_thd())->duplicates = 0; + break; + case HA_EXTRA_NO_KEYREAD: + prebuilt->read_just_key = 0; + break; + case HA_EXTRA_KEYREAD: + prebuilt->read_just_key = 1; + break; + case HA_EXTRA_KEYREAD_PRESERVE_FIELDS: + prebuilt->keep_other_fields_on_keyread = 1; + break; - /* IMPORTANT: prebuilt->trx can be obsolete in - this method, because it is not sure that MySQL - calls external_lock before this method with the - parameters below. We must not invoke update_thd() - either, because the calling threads may change. - CAREFUL HERE, OR MEMORY CORRUPTION MAY OCCUR! 
*/ - case HA_EXTRA_INSERT_WITH_UPDATE: - thd_to_trx(ha_thd())->duplicates |= TRX_DUP_IGNORE; - break; - case HA_EXTRA_NO_IGNORE_DUP_KEY: - thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_IGNORE; - break; - case HA_EXTRA_WRITE_CAN_REPLACE: - thd_to_trx(ha_thd())->duplicates |= TRX_DUP_REPLACE; - break; - case HA_EXTRA_WRITE_CANNOT_REPLACE: - thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_REPLACE; - break; - default:/* Do nothing */ - ; + /* IMPORTANT: prebuilt->trx can be obsolete in + this method, because it is not sure that MySQL + calls external_lock before this method with the + parameters below. We must not invoke update_thd() + either, because the calling threads may change. + CAREFUL HERE, OR MEMORY CORRUPTION MAY OCCUR! */ + case HA_EXTRA_INSERT_WITH_UPDATE: + thd_to_trx(ha_thd())->duplicates |= TRX_DUP_IGNORE; + break; + case HA_EXTRA_NO_IGNORE_DUP_KEY: + thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_IGNORE; + break; + case HA_EXTRA_WRITE_CAN_REPLACE: + thd_to_trx(ha_thd())->duplicates |= TRX_DUP_REPLACE; + break; + case HA_EXTRA_WRITE_CANNOT_REPLACE: + thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_REPLACE; + break; + default:/* Do nothing */ + ; } return(0); @@ -11391,14 +12171,6 @@ ha_innobase::start_stmt( ++trx->will_lock; } - if (prebuilt->result) { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Warning: FTS result set not NULL\n"); - - fts_query_free_result(prebuilt->result); - prebuilt->result = NULL; - } - return(0); } @@ -11471,6 +12243,24 @@ ha_innobase::external_lock( } } + /* Check for UPDATEs in read-only mode. 
*/ + if (srv_read_only_mode + && (thd_sql_command(thd) == SQLCOM_UPDATE + || thd_sql_command(thd) == SQLCOM_INSERT + || thd_sql_command(thd) == SQLCOM_REPLACE + || thd_sql_command(thd) == SQLCOM_DROP_TABLE + || thd_sql_command(thd) == SQLCOM_ALTER_TABLE + || thd_sql_command(thd) == SQLCOM_OPTIMIZE + || thd_sql_command(thd) == SQLCOM_CREATE_TABLE + || thd_sql_command(thd) == SQLCOM_CREATE_INDEX + || thd_sql_command(thd) == SQLCOM_DROP_INDEX + || thd_sql_command(thd) == SQLCOM_DELETE)) { + + ib_senderrf(thd, IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + trx = prebuilt->trx; prebuilt->sql_stat_start = TRUE; @@ -11478,6 +12268,41 @@ ha_innobase::external_lock( reset_template(); + switch (prebuilt->table->quiesce) { + case QUIESCE_START: + /* Check for FLUSH TABLE t WITH READ LOCK; */ + if (!srv_read_only_mode + && thd_sql_command(thd) == SQLCOM_FLUSH + && lock_type == F_RDLCK) { + + row_quiesce_table_start(prebuilt->table, trx); + + /* Use the transaction instance to track UNLOCK + TABLES. It can be done via START TRANSACTION; too + implicitly. */ + + ++trx->flush_tables; + } + break; + + case QUIESCE_COMPLETE: + /* Check for UNLOCK TABLES; implicit or explicit + or trx interruption. */ + if (trx->flush_tables > 0 + && (lock_type == F_UNLCK || trx_is_interrupted(trx))) { + + row_quiesce_table_complete(prebuilt->table, trx); + + ut_a(trx->flush_tables > 0); + --trx->flush_tables; + } + + break; + + case QUIESCE_NONE: + break; + } + if (lock_type == F_WRLCK) { /* If this is a SELECT, then it is in UPDATE TABLE ... 
@@ -11528,13 +12353,13 @@ ha_innobase::external_lock( && thd_test_options(thd, OPTION_NOT_AUTOCOMMIT) && thd_in_lock_tables(thd)) { - ulint error = row_lock_table_for_mysql( + dberr_t error = row_lock_table_for_mysql( prebuilt, NULL, 0); if (error != DB_SUCCESS) { - error = convert_error_code_to_mysql( - (int) error, 0, thd); - DBUG_RETURN((int) error); + DBUG_RETURN( + convert_error_code_to_mysql( + error, 0, thd)); } } @@ -11624,19 +12449,23 @@ ha_innobase::transactional_table_lock( update_thd(thd); - if (prebuilt->table->ibd_file_missing && !thd_tablespace_op(thd)) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: MySQL is trying to use a table handle" - " but the .ibd file for\n" - "InnoDB: table %s does not exist.\n" - "InnoDB: Have you deleted the .ibd file" - " from the database directory under\n" - "InnoDB: the MySQL datadir?" - "InnoDB: See " REFMAN - "innodb-troubleshooting.html\n" - "InnoDB: how you can resolve the problem.\n", - prebuilt->table->name); + if (!thd_tablespace_op(thd)) { + + if (dict_table_is_discarded(prebuilt->table)) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + } else if (prebuilt->table->ibd_file_missing) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_MISSING, + table->s->table_name.str); + } + DBUG_RETURN(HA_ERR_CRASHED); } @@ -11654,11 +12483,12 @@ ha_innobase::transactional_table_lock( prebuilt->select_lock_type = LOCK_S; prebuilt->stored_select_lock_type = LOCK_S; } else { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB error:\n" -"MySQL is trying to set transactional table lock with corrupted lock type\n" -"to table %s, lock type %d does not exist.\n", - prebuilt->table->name, lock_type); + ib_logf(IB_LOG_LEVEL_ERROR, + "MySQL is trying to set transactional table lock " + "with corrupted lock type to table %s, lock type " + "%d does not exist.", + table->s->table_name.str, lock_type); + DBUG_RETURN(HA_ERR_CRASHED); } @@ -11667,14 
+12497,14 @@ ha_innobase::transactional_table_lock( innobase_register_trx(ht, thd, trx); if (THDVAR(thd, table_locks) && thd_in_lock_tables(thd)) { - ulint error = DB_SUCCESS; + dberr_t error; error = row_lock_table_for_mysql(prebuilt, NULL, 0); if (error != DB_SUCCESS) { - error = convert_error_code_to_mysql( - (int) error, prebuilt->table->flags, thd); - DBUG_RETURN((int) error); + DBUG_RETURN( + convert_error_code_to_mysql( + error, prebuilt->table->flags, thd)); } if (thd_test_options( @@ -11725,6 +12555,13 @@ innodb_show_status( DBUG_ENTER("innodb_show_status"); DBUG_ASSERT(hton == innodb_hton_ptr); + /* We don't create the temp files or associated + mutexes in read-only-mode */ + + if (srv_read_only_mode) { + DBUG_RETURN(0); + } + trx = check_trx_exists(thd); trx_search_latch_release_if_reserved(trx); @@ -11814,11 +12651,11 @@ innodb_mutex_show_status( { char buf1[IO_SIZE]; char buf2[IO_SIZE]; - mutex_t* mutex; + ib_mutex_t* mutex; rw_lock_t* lock; ulint block_mutex_oswait_count = 0; ulint block_lock_oswait_count = 0; - mutex_t* block_mutex = NULL; + ib_mutex_t* block_mutex = NULL; rw_lock_t* block_lock = NULL; #ifdef UNIV_DEBUG ulint rw_lock_count= 0; @@ -11850,41 +12687,7 @@ innodb_mutex_show_status( block_mutex_oswait_count += mutex->count_os_wait; continue; } -#ifdef UNIV_DEBUG - if (mutex->mutex_type != 1) { - if (mutex->count_using > 0) { - buf1len= my_snprintf(buf1, sizeof(buf1), - "%s:%s", - mutex->cmutex_name, - innobase_basename(mutex->cfile_name)); - buf2len= my_snprintf(buf2, sizeof(buf2), - "count=%lu, spin_waits=%lu," - " spin_rounds=%lu, " - "os_waits=%lu, os_yields=%lu," - " os_wait_times=%lu", - mutex->count_using, - mutex->count_spin_loop, - mutex->count_spin_rounds, - mutex->count_os_wait, - mutex->count_os_yield, - (ulong) (mutex->lspent_time/1000)); - - if (stat_print(thd, innobase_hton_name, - hton_name_len, buf1, buf1len, - buf2, buf2len)) { - mutex_exit(&mutex_list_mutex); - DBUG_RETURN(1); - } - } - } else { - rw_lock_count += 
mutex->count_using; - rw_lock_count_spin_loop += mutex->count_spin_loop; - rw_lock_count_spin_rounds += mutex->count_spin_rounds; - rw_lock_count_os_wait += mutex->count_os_wait; - rw_lock_count_os_yield += mutex->count_os_yield; - rw_lock_wait_time += mutex->lspent_time; - } -#else /* UNIV_DEBUG */ + buf1len= (uint) my_snprintf(buf1, sizeof(buf1), "%s:%lu", innobase_basename(mutex->cfile_name), (ulong) mutex->cline); @@ -11897,7 +12700,6 @@ innodb_mutex_show_status( mutex_exit(&mutex_list_mutex); DBUG_RETURN(1); } -#endif /* UNIV_DEBUG */ } if (block_mutex) { @@ -12170,12 +12972,52 @@ ha_innobase::store_lock( const bool in_lock_tables = thd_in_lock_tables(thd); const uint sql_command = thd_sql_command(thd); - if (sql_command == SQLCOM_DROP_TABLE) { + if (srv_read_only_mode + && (sql_command == SQLCOM_UPDATE + || sql_command == SQLCOM_INSERT + || sql_command == SQLCOM_REPLACE + || sql_command == SQLCOM_DROP_TABLE + || sql_command == SQLCOM_ALTER_TABLE + || sql_command == SQLCOM_OPTIMIZE + || sql_command == SQLCOM_CREATE_TABLE + || sql_command == SQLCOM_CREATE_INDEX + || sql_command == SQLCOM_DROP_INDEX + || sql_command == SQLCOM_DELETE)) { + + ib_senderrf(trx->mysql_thd, + IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + + } else if (sql_command == SQLCOM_FLUSH + && lock_type == TL_READ_NO_INSERT) { + + /* Check for FLUSH TABLES ... WITH READ LOCK */ + + /* Note: This call can fail, but there is no way to return + the error to the caller. We simply ignore it for now here + and push the error code to the caller where the error is + detected in the function. 
*/ + + dberr_t err = row_quiesce_set_state( + prebuilt->table, QUIESCE_START, trx); + + ut_a(err == DB_SUCCESS || err == DB_UNSUPPORTED); + + if (trx->isolation_level == TRX_ISO_SERIALIZABLE) { + prebuilt->select_lock_type = LOCK_S; + prebuilt->stored_select_lock_type = LOCK_S; + } else { + prebuilt->select_lock_type = LOCK_NONE; + prebuilt->stored_select_lock_type = LOCK_NONE; + } + + /* Check for DROP TABLE */ + } else if (sql_command == SQLCOM_DROP_TABLE) { /* MySQL calls this function in DROP TABLE though this table handle may belong to another thd that is running a query. Let us in that case skip any changes to the prebuilt struct. */ + /* Check for LOCK TABLE t1,...,tn WITH SHARED LOCKS */ } else if ((lock_type == TL_READ && in_lock_tables) || (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) || lock_type == TL_READ_WITH_SHARED_LOCKS @@ -12201,18 +13043,18 @@ ha_innobase::store_lock( unexpected if an obsolete consistent read view would be used. */ - ulint isolation_level; - - isolation_level = trx->isolation_level; + /* Use consistent read for checksum table */ - if ((srv_locks_unsafe_for_binlog - || isolation_level <= TRX_ISO_READ_COMMITTED) - && isolation_level != TRX_ISO_SERIALIZABLE - && (lock_type == TL_READ || lock_type == TL_READ_NO_INSERT) - && (sql_command == SQLCOM_INSERT_SELECT - || sql_command == SQLCOM_REPLACE_SELECT - || sql_command == SQLCOM_UPDATE - || sql_command == SQLCOM_CREATE_TABLE)) { + if (sql_command == SQLCOM_CHECKSUM + || ((srv_locks_unsafe_for_binlog + || trx->isolation_level <= TRX_ISO_READ_COMMITTED) + && trx->isolation_level != TRX_ISO_SERIALIZABLE + && (lock_type == TL_READ + || lock_type == TL_READ_NO_INSERT) + && (sql_command == SQLCOM_INSERT_SELECT + || sql_command == SQLCOM_REPLACE_SELECT + || sql_command == SQLCOM_UPDATE + || sql_command == SQLCOM_CREATE_TABLE))) { /* If we either have innobase_locks_unsafe_for_binlog option set or this session is using READ COMMITTED @@ -12226,11 +13068,6 @@ ha_innobase::store_lock( 
prebuilt->select_lock_type = LOCK_NONE; prebuilt->stored_select_lock_type = LOCK_NONE; - } else if (sql_command == SQLCOM_CHECKSUM) { - /* Use consistent read for checksum table */ - - prebuilt->select_lock_type = LOCK_NONE; - prebuilt->stored_select_lock_type = LOCK_NONE; } else { prebuilt->select_lock_type = LOCK_S; prebuilt->stored_select_lock_type = LOCK_S; @@ -12330,7 +13167,7 @@ the AUTOINC value. If SUCCESS then the table AUTOINC mutex will be locked on return and all relevant locks acquired. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t ha_innobase::innobase_get_autoinc( /*==============================*/ ulonglong* value) /*!< out: autoinc value */ @@ -12387,12 +13224,7 @@ ha_innobase::innobase_peek_autoinc(void) } /*********************************************************************//** -This function initializes the auto-inc counter if it has not been -initialized yet. This function does not change the value of the auto-inc -counter if it already has been initialized. Returns the value of the -auto-inc counter in *first_value, and ULONGLONG_MAX in *nb_reserved_values (as -we have a table-level lock). offset, increment, nb_desired_values are ignored. -*first_value is set to -1 if error (deadlock or lock wait timeout) */ +Returns the value of the auto-inc counter in *first_value and ~0 on failure. 
*/ UNIV_INTERN void ha_innobase::get_auto_increment( @@ -12407,7 +13239,7 @@ ha_innobase::get_auto_increment( values */ { trx_t* trx; - ulint error; + dberr_t error; ulonglong autoinc = 0; /* Prepare prebuilt->trx in the table handle */ @@ -12521,18 +13353,15 @@ ha_innobase::reset_auto_increment( { DBUG_ENTER("ha_innobase::reset_auto_increment"); - int error; + dberr_t error; update_thd(ha_thd()); error = row_lock_table_autoinc_for_mysql(prebuilt); if (error != DB_SUCCESS) { - error = convert_error_code_to_mysql(error, - prebuilt->table->flags, - user_thd); - - DBUG_RETURN(error); + DBUG_RETURN(convert_error_code_to_mysql( + error, prebuilt->table->flags, user_thd)); } /* The next value can never be 0. */ @@ -12601,7 +13430,7 @@ ha_innobase::get_foreign_dup_key( /* else */ /* copy table name (and convert from filename-safe encoding to - system_charset_info, e.g. "foo_@0J@00b6" -> "foo_ö") */ + system_charset_info) */ char* p; p = strchr(err_index->table->name, '/'); /* strip ".../" prefix if any */ @@ -12654,7 +13483,7 @@ ha_innobase::cmp_ref( key_part = table->key_info[table->s->primary_key].key_part; key_part_end = key_part - + table->key_info[table->s->primary_key].key_parts; + + table->key_info[table->s->primary_key].user_defined_key_parts; for (; key_part != key_part_end; ++key_part) { field = key_part->field; @@ -12699,11 +13528,10 @@ my_bool ha_innobase::register_query_cache_table( /*====================================*/ THD* thd, /*!< in: user thread handle */ - char* table_key, /*!< in: concatenation of database name, - the null character NUL, - and the table name */ - uint key_length, /*!< in: length of the full name, i.e. 
- len(dbname) + len(tablename) + 1 */ + char* table_key, /*!< in: normalized path to the + table */ + uint key_length, /*!< in: length of the normalized + path to the table */ qc_engine_callback* call_back, /*!< out: pointer to function for checking if query caching @@ -12825,8 +13653,8 @@ innobase_xa_prepare( false - the current SQL statement ended */ { - int error = 0; - trx_t* trx = check_trx_exists(thd); + int error = 0; + trx_t* trx = check_trx_exists(thd); DBUG_ASSERT(hton == innodb_hton_ptr); @@ -13019,124 +13847,6 @@ innobase_set_cursor_view( } /*******************************************************************//** -If col_name is not NULL, check whether the named column is being -renamed in the table. If col_name is not provided, check -whether any one of columns in the table is being renamed. -@return true if the column is being renamed */ -static -bool -check_column_being_renamed( -/*=======================*/ - const TABLE* table, /*!< in: MySQL table */ - const char* col_name) /*!< in: name of the column */ -{ - uint k; - Field* field; - - for (k = 0; k < table->s->fields; k++) { - field = table->field[k]; - - if (field->flags & FIELD_IS_RENAMED) { - - /* If col_name is not provided, return - if the field is marked as being renamed. */ - if (!col_name) { - return(true); - } - - /* If col_name is provided, return only - if names match */ - if (innobase_strcasecmp(field->field_name, - col_name) == 0) { - return(true); - } - } - } - - return(false); -} - -/*******************************************************************//** -Check whether any of the given columns is being renamed in the table. 
-@return true if any of col_names is being renamed in table */ -static -bool -column_is_being_renamed( -/*====================*/ - TABLE* table, /*!< in: MySQL table */ - uint n_cols, /*!< in: number of columns */ - const char** col_names) /*!< in: names of the columns */ -{ - uint j; - - for (j = 0; j < n_cols; j++) { - if (check_column_being_renamed(table, col_names[j])) { - return(true); - } - } - - return(false); -} - -/*******************************************************************//** -Check whether a column in table "table" is being renamed and if this column -is part of a foreign key, either part of another table, referencing this -table or part of this table, referencing another table. -@return true if a column that participates in a foreign key definition -is being renamed */ -static -bool -foreign_key_column_is_being_renamed( -/*================================*/ - row_prebuilt_t* prebuilt, /* in: InnoDB prebuilt struct */ - TABLE* table) /* in: MySQL table */ -{ - dict_foreign_t* foreign; - - /* check whether there are foreign keys at all */ - if (UT_LIST_GET_LEN(prebuilt->table->foreign_list) == 0 - && UT_LIST_GET_LEN(prebuilt->table->referenced_list) == 0) { - /* no foreign keys involved with prebuilt->table */ - - return(false); - } - - row_mysql_lock_data_dictionary(prebuilt->trx); - - /* Check whether any column in the foreign key constraints which refer - to this table is being renamed. */ - for (foreign = UT_LIST_GET_FIRST(prebuilt->table->referenced_list); - foreign != NULL; - foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) { - - if (column_is_being_renamed(table, foreign->n_fields, - foreign->referenced_col_names)) { - - row_mysql_unlock_data_dictionary(prebuilt->trx); - return(true); - } - } - - /* Check whether any column in the foreign key constraints in the - table is being renamed. 
*/ - for (foreign = UT_LIST_GET_FIRST(prebuilt->table->foreign_list); - foreign != NULL; - foreign = UT_LIST_GET_NEXT(foreign_list, foreign)) { - - if (column_is_being_renamed(table, foreign->n_fields, - foreign->foreign_col_names)) { - - row_mysql_unlock_data_dictionary(prebuilt->trx); - return(true); - } - } - - row_mysql_unlock_data_dictionary(prebuilt->trx); - - return(false); -} - -/*******************************************************************//** */ UNIV_INTERN bool @@ -13145,6 +13855,8 @@ ha_innobase::check_if_incompatible_data( HA_CREATE_INFO* info, uint table_changes) { + innobase_copy_frm_flags_from_create_info(prebuilt->table, info); + if (table_changes != IS_EQUAL_YES) { return(COMPATIBLE_DATA_NO); @@ -13157,25 +13869,8 @@ ha_innobase::check_if_incompatible_data( return(COMPATIBLE_DATA_NO); } - /* For column rename operation, MySQL does not supply enough - information (new column name etc.) for InnoDB to make appropriate - system metadata change. To avoid system metadata inconsistency, - currently we can just request a table rebuild/copy by returning - COMPATIBLE_DATA_NO */ - if (check_column_being_renamed(table, NULL)) { - return(COMPATIBLE_DATA_NO); - } - - /* Check if a column participating in a foreign key is being renamed. - There is no mechanism for updating InnoDB foreign key definitions. */ - if (foreign_key_column_is_being_renamed(prebuilt, table)) { - - return(COMPATIBLE_DATA_NO); - } - /* Check that row format didn't change */ if ((info->used_fields & HA_CREATE_USED_ROW_FORMAT) - && info->row_type != ROW_TYPE_DEFAULT && info->row_type != get_row_type()) { return(COMPATIBLE_DATA_NO); @@ -13189,6 +13884,135 @@ ha_innobase::check_if_incompatible_data( return(COMPATIBLE_DATA_YES); } +/****************************************************************//** +Update the system variable innodb_io_capacity_max using the "saved" +value. This function is registered as a callback with MySQL. 
*/ +static +void +innodb_io_capacity_max_update( +/*===========================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + ulong in_val = *static_cast<const ulong*>(save); + if (in_val < srv_io_capacity) { + in_val = srv_io_capacity; + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "innodb_io_capacity_max cannot be" + " set lower than innodb_io_capacity."); + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Setting innodb_io_capacity_max to %lu", + srv_io_capacity); + } + + srv_max_io_capacity = in_val; +} + +/****************************************************************//** +Update the system variable innodb_io_capacity using the "saved" +value. This function is registered as a callback with MySQL. */ +static +void +innodb_io_capacity_update( +/*======================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + ulong in_val = *static_cast<const ulong*>(save); + if (in_val > srv_max_io_capacity) { + in_val = srv_max_io_capacity; + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "innodb_io_capacity cannot be set" + " higher than innodb_io_capacity_max."); + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Setting innodb_io_capacity to %lu", + srv_max_io_capacity); + } + + srv_io_capacity = in_val; +} + +/****************************************************************//** +Update the system variable innodb_max_dirty_pages_pct using the "saved" +value. This function is registered as a callback with MySQL. 
*/ +static +void +innodb_max_dirty_pages_pct_update( +/*==============================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + ulong in_val = *static_cast<const ulong*>(save); + if (in_val < srv_max_dirty_pages_pct_lwm) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "innodb_max_dirty_pages_pct cannot be" + " set lower than" + " innodb_max_dirty_pages_pct_lwm."); + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Lowering" + " innodb_max_dirty_page_pct_lwm to %lu", + in_val); + + srv_max_dirty_pages_pct_lwm = in_val; + } + + srv_max_buf_pool_modified_pct = in_val; +} + +/****************************************************************//** +Update the system variable innodb_max_dirty_pages_pct_lwm using the +"saved" value. This function is registered as a callback with MySQL. 
*/ +static +void +innodb_max_dirty_pages_pct_lwm_update( +/*==================================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + ulong in_val = *static_cast<const ulong*>(save); + if (in_val > srv_max_buf_pool_modified_pct) { + in_val = srv_max_buf_pool_modified_pct; + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "innodb_max_dirty_pages_pct_lwm" + " cannot be set higher than" + " innodb_max_dirty_pages_pct."); + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Setting innodb_max_dirty_page_pct_lwm" + " to %lu", + in_val); + } + + srv_max_dirty_pages_pct_lwm = in_val; +} + /************************************************************//** Validate the file format name and return its corresponding id. @return valid file format id */ @@ -13554,8 +14378,8 @@ innodb_internal_table_validate( return(0); } - user_table = dict_table_open_on_name_no_stats( - table_name, FALSE, DICT_ERR_IGNORE_NONE); + user_table = dict_table_open_on_name( + table_name, FALSE, TRUE, DICT_ERR_IGNORE_NONE); if (user_table) { if (dict_table_has_fts_index(user_table)) { @@ -13563,7 +14387,7 @@ innodb_internal_table_validate( ret = 0; } - dict_table_close(user_table, FALSE); + dict_table_close(user_table, FALSE, TRUE); } return(ret); @@ -13608,13 +14432,12 @@ innodb_internal_table_update( } /****************************************************************//** -Update the session variable innodb_session_stopword_table -with the "saved" stopword table name value. This function -is registered as a callback with MySQL. */ +Update the system variable innodb_adaptive_hash_index using the "saved" +value. This function is registered as a callback with MySQL. 
*/ static void -innodb_session_stopword_update( -/*===========================*/ +innodb_adaptive_hash_index_update( +/*==============================*/ THD* thd, /*!< in: thread handle */ struct st_mysql_sys_var* var, /*!< in: pointer to system variable */ @@ -13623,32 +14446,20 @@ innodb_session_stopword_update( const void* save) /*!< in: immediate result from check function */ { - const char* stopword_table_name; - char* old; - - ut_a(save != NULL); - ut_a(var_ptr != NULL); - - stopword_table_name = *static_cast<const char*const*>(save); - old = *(char**) var_ptr; - - if (stopword_table_name) { - *(char**) var_ptr = my_strdup(stopword_table_name, MYF(0)); + if (*(my_bool*) save) { + btr_search_enable(); } else { - *(char**) var_ptr = NULL; - } - - if (old) { - my_free(old); + btr_search_disable(); } } + /****************************************************************//** -Update the system variable innodb_adaptive_hash_index using the "saved" +Update the system variable innodb_cmp_per_index using the "saved" value. This function is registered as a callback with MySQL. */ static void -innodb_adaptive_hash_index_update( -/*==============================*/ +innodb_cmp_per_index_update( +/*========================*/ THD* thd, /*!< in: thread handle */ struct st_mysql_sys_var* var, /*!< in: pointer to system variable */ @@ -13657,11 +14468,13 @@ innodb_adaptive_hash_index_update( const void* save) /*!< in: immediate result from check function */ { - if (*(my_bool*) save) { - btr_search_enable(); - } else { - btr_search_disable(); + /* Reset the stats whenever we enable the table + INFORMATION_SCHEMA.innodb_cmp_per_index. 
*/ + if (!srv_cmp_per_index_enabled && *(my_bool*) save) { + page_zip_reset_stat_per_index(); } + + srv_cmp_per_index_enabled = !!(*(my_bool*) save); } /****************************************************************//** @@ -14233,6 +15046,53 @@ exit: return; } +#ifdef __WIN__ +/*************************************************************//** +Validate if passed-in "value" is a valid value for +innodb_buffer_pool_filename. On Windows, file names with colon (:) +are not allowed. + +@return 0 for valid name */ +static +int +innodb_srv_buf_dump_filename_validate( +/*==================================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to system + variable */ + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value) /*!< in: incoming string */ +{ + const char* buf_name; + char buff[OS_FILE_MAX_PATH]; + int len= sizeof(buff); + + ut_a(save != NULL); + ut_a(value != NULL); + + buf_name = value->val_str(value, buff, &len); + + if (buf_name) { + if (is_filename_allowed(buf_name, len, FALSE)){ + *static_cast<const char**>(save) = buf_name; + return(0); + } else { + push_warning_printf(thd, + Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "InnoDB: innodb_buffer_pool_filename " + "cannot have colon (:) in the file name."); + + } + } + + return(1); +} +#else /* __WIN__ */ +# define innodb_srv_buf_dump_filename_validate NULL +#endif /* __WIN__ */ + /****************************************************************//** Update the system variable innodb_monitor_enable and enable specified monitor counter. @@ -14310,6 +15170,29 @@ innodb_reset_all_monitor_update( } /****************************************************************//** +Update the system variable innodb_compression_level using the "saved" +value. This function is registered as a callback with MySQL. 
*/ +static +void +innodb_compression_level_update( +/*============================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + /* We have this call back just to avoid confusion between + ulong and ulint datatypes. */ + innobase_compression_level = + (*static_cast<const ulong*>(save)); + page_compression_level = + (static_cast<const ulint>(innobase_compression_level)); +} + +/****************************************************************//** Parse and enable InnoDB monitor counters during server startup. User can list the monitor counters/groups to be enable by specifying "loose-innodb_monitor_enable=monitor_name1;monitor_name2..." @@ -14427,6 +15310,12 @@ innobase_fts_retrieve_ranking( ft_prebuilt = ((NEW_FT_INFO*) fts_hdl)->ft_prebuilt; + if (ft_prebuilt->read_just_key) { + fts_ranking_t* ranking = + rbt_value(fts_ranking_t, result->current); + return(ranking->rank); + } + /* Retrieve the ranking value for doc_id with value of prebuilt->fts_doc_id */ return(fts_retrieve_ranking(result, ft_prebuilt->fts_doc_id)); @@ -14441,20 +15330,16 @@ innobase_fts_close_ranking( FT_INFO * fts_hdl) { fts_result_t* result; - row_prebuilt_t* ft_prebuilt; - ft_prebuilt = ((NEW_FT_INFO*) fts_hdl)->ft_prebuilt; + ((NEW_FT_INFO*) fts_hdl)->ft_prebuilt->in_fts_query = false; result = ((NEW_FT_INFO*) fts_hdl)->ft_result; fts_query_free_result(result); - if (result == ft_prebuilt->result) { - ft_prebuilt->result = NULL; - } - my_free((uchar*) fts_hdl); + return; } @@ -14478,7 +15363,120 @@ innobase_fts_find_ranking( /* Retrieve the ranking value for doc_id with value of prebuilt->fts_doc_id */ - return fts_retrieve_ranking(result, ft_prebuilt->fts_doc_id); + return(fts_retrieve_ranking(result, ft_prebuilt->fts_doc_id)); +} + +#ifdef UNIV_DEBUG +static my_bool innodb_purge_run_now = 
TRUE; +static my_bool innodb_purge_stop_now = TRUE; + +/****************************************************************//** +Set the purge state to RUN. If purge is disabled then it +is a no-op. This function is registered as a callback with MySQL. */ +static +void +purge_run_now_set( +/*==============*/ + THD* thd /*!< in: thread handle */ + __attribute__((unused)), + struct st_mysql_sys_var* var /*!< in: pointer to system + variable */ + __attribute__((unused)), + void* var_ptr /*!< out: where the formal + string goes */ + __attribute__((unused)), + const void* save) /*!< in: immediate result from + check function */ +{ + if (*(my_bool*) save && trx_purge_state() != PURGE_STATE_DISABLED) { + trx_purge_run(); + } +} + +/****************************************************************//** +Set the purge state to STOP. If purge is disabled then it +is a no-op. This function is registered as a callback with MySQL. */ +static +void +purge_stop_now_set( +/*===============*/ + THD* thd /*!< in: thread handle */ + __attribute__((unused)), + struct st_mysql_sys_var* var /*!< in: pointer to system + variable */ + __attribute__((unused)), + void* var_ptr /*!< out: where the formal + string goes */ + __attribute__((unused)), + const void* save) /*!< in: immediate result from + check function */ +{ + if (*(my_bool*) save && trx_purge_state() != PURGE_STATE_DISABLED) { + trx_purge_stop(); + } +} +#endif /* UNIV_DEBUG */ + +/*********************************************************************** +@return version of the extended FTS API */ +uint +innobase_fts_get_version() +/*======================*/ +{ + /* Currently this doesn't make much sense as returning + HA_CAN_FULLTEXT_EXT automatically mean this version is supported. + This supposed to ease future extensions. 
*/ + return(2); +} + +/*********************************************************************** +@return Which part of the extended FTS API is supported */ +ulonglong +innobase_fts_flags() +/*================*/ +{ + return(FTS_ORDERED_RESULT | FTS_DOCID_IN_RESULT); +} + + +/*********************************************************************** +Find and Retrieve the FTS doc_id for the current result row +@return the document ID */ +ulonglong +innobase_fts_retrieve_docid( +/*========================*/ + FT_INFO_EXT * fts_hdl) /*!< in: FTS handler */ +{ + row_prebuilt_t* ft_prebuilt; + fts_result_t* result; + + ft_prebuilt = ((NEW_FT_INFO *)fts_hdl)->ft_prebuilt; + result = ((NEW_FT_INFO *)fts_hdl)->ft_result; + + if (ft_prebuilt->read_just_key) { + fts_ranking_t* ranking = + rbt_value(fts_ranking_t, result->current); + return(ranking->doc_id); + } + + return(ft_prebuilt->fts_doc_id); +} + +/*********************************************************************** +Find and retrieve the size of the current result +@return number of matching rows */ +ulonglong +innobase_fts_count_matches( +/*=======================*/ + FT_INFO_EXT* fts_hdl) /*!< in: FTS handler */ +{ + NEW_FT_INFO* handle = (NEW_FT_INFO *) fts_hdl; + + if (handle->ft_result->rankings_by_id != 0) { + return rbt_size(handle->ft_result->rankings_by_id); + } else { + return(0); + } } /* These variables are never read by InnoDB or changed. They are a kind of @@ -14515,7 +15513,7 @@ buffer_pool_dump_now( const void* save) /*!< in: immediate result from check function */ { - if (*(my_bool*) save) { + if (*(my_bool*) save && !srv_read_only_mode) { buf_dump_start(); } } @@ -14622,7 +15620,26 @@ static MYSQL_SYSVAR_BOOL(doublewrite, innobase_use_doublewrite, static MYSQL_SYSVAR_ULONG(io_capacity, srv_io_capacity, PLUGIN_VAR_RQCMDARG, "Number of IOPs the server can do. 
Tunes the background IO rate", - NULL, NULL, 200, 100, ~0UL, 0); + NULL, innodb_io_capacity_update, 200, 100, ~0UL, 0); + +static MYSQL_SYSVAR_ULONG(io_capacity_max, srv_max_io_capacity, + PLUGIN_VAR_RQCMDARG, + "Limit to which innodb_io_capacity can be inflated.", + NULL, innodb_io_capacity_max_update, + SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT, 100, + SRV_MAX_IO_CAPACITY_LIMIT, 0); + +#ifdef UNIV_DEBUG +static MYSQL_SYSVAR_BOOL(purge_run_now, innodb_purge_run_now, + PLUGIN_VAR_OPCMDARG, + "Set purge state to RUN", + NULL, purge_run_now_set, FALSE); + +static MYSQL_SYSVAR_BOOL(purge_stop_now, innodb_purge_stop_now, + PLUGIN_VAR_OPCMDARG, + "Set purge state to STOP", + NULL, purge_stop_now_set, FALSE); +#endif /* UNIV_DEBUG */ static MYSQL_SYSVAR_ULONG(purge_batch_size, srv_purge_batch_size, PLUGIN_VAR_OPCMDARG, @@ -14634,7 +15651,7 @@ static MYSQL_SYSVAR_ULONG(purge_batch_size, srv_purge_batch_size, static MYSQL_SYSVAR_ULONG(purge_threads, srv_n_purge_threads, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, - "Purge threads can be from 0 to 32. Default is 0.", + "Purge threads can be from 1 to 32. 
Default is 1.", NULL, NULL, 1, /* Default setting */ 1, /* Minimum value */ @@ -14657,7 +15674,7 @@ static MYSQL_SYSVAR_ULONG(fast_shutdown, innobase_fast_shutdown, static MYSQL_SYSVAR_BOOL(file_per_table, srv_file_per_table, PLUGIN_VAR_NOCMDARG, "Stores each InnoDB table to an .ibd file in the database dir.", - NULL, NULL, FALSE); + NULL, NULL, TRUE); static MYSQL_SYSVAR_STR(file_format, innobase_file_format_name, PLUGIN_VAR_RQCMDARG, @@ -14693,6 +15710,11 @@ static MYSQL_SYSVAR_STR(ft_server_stopword_table, innobase_server_stopword_table innodb_stopword_table_update, NULL); +static MYSQL_SYSVAR_UINT(flush_log_at_timeout, srv_flush_log_at_timeout, + PLUGIN_VAR_OPCMDARG, + "Write and flush logs every (n) second.", + NULL, NULL, 1, 0, 2700, 0); + static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit, PLUGIN_VAR_OPCMDARG, "Controls the durability/speed trade-off for commits." @@ -14738,20 +15760,38 @@ static MYSQL_SYSVAR_BOOL(log_archive, innobase_log_archive, "Set to 1 if you want to have logs archived.", NULL, NULL, FALSE); #endif /* UNIV_LOG_ARCHIVE */ -static MYSQL_SYSVAR_STR(log_group_home_dir, innobase_log_group_home_dir, +static MYSQL_SYSVAR_STR(log_group_home_dir, srv_log_group_home_dir, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Path to InnoDB log files.", NULL, NULL, NULL); static MYSQL_SYSVAR_ULONG(max_dirty_pages_pct, srv_max_buf_pool_modified_pct, PLUGIN_VAR_RQCMDARG, "Percentage of dirty pages allowed in bufferpool.", - NULL, NULL, 75, 0, 99, 0); + NULL, innodb_max_dirty_pages_pct_update, 75, 0, 99, 0); + +static MYSQL_SYSVAR_ULONG(max_dirty_pages_pct_lwm, + srv_max_dirty_pages_pct_lwm, + PLUGIN_VAR_RQCMDARG, + "Percentage of dirty pages at which flushing kicks in.", + NULL, innodb_max_dirty_pages_pct_lwm_update, 0, 0, 99, 0); + +static MYSQL_SYSVAR_ULONG(adaptive_flushing_lwm, + srv_adaptive_flushing_lwm, + PLUGIN_VAR_RQCMDARG, + "Percentage of log capacity below which no adaptive flushing happens.", + NULL, NULL, 10, 0, 70, 
0); static MYSQL_SYSVAR_BOOL(adaptive_flushing, srv_adaptive_flushing, PLUGIN_VAR_NOCMDARG, "Attempt flushing dirty pages to avoid IO bursts at checkpoints.", NULL, NULL, TRUE); +static MYSQL_SYSVAR_ULONG(flushing_avg_loops, + srv_flushing_avg_loops, + PLUGIN_VAR_RQCMDARG, + "Number of iterations over which the background flushing is averaged.", + NULL, NULL, 30, 1, 1000, 0); + static MYSQL_SYSVAR_ULONG(max_purge_lag, srv_max_purge_lag, PLUGIN_VAR_RQCMDARG, "Desired maximum length of the purge queue (0 = no limit)", @@ -14760,11 +15800,11 @@ static MYSQL_SYSVAR_ULONG(max_purge_lag, srv_max_purge_lag, static MYSQL_SYSVAR_ULONG(max_purge_lag_delay, srv_max_purge_lag_delay, PLUGIN_VAR_RQCMDARG, "Maximum delay of user threads in micro-seconds", - NULL, NULL, + NULL, NULL, 0L, /* Default seting */ 0L, /* Minimum value */ 10000000UL, 0); /* Maximum value */ - + static MYSQL_SYSVAR_BOOL(rollback_on_timeout, innobase_rollback_on_timeout, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, "Roll back the complete transaction on lock wait timeout, for 4.x compatibility (disabled by default)", @@ -14777,8 +15817,9 @@ static MYSQL_SYSVAR_BOOL(status_file, innobase_create_status_file, static MYSQL_SYSVAR_BOOL(stats_on_metadata, innobase_stats_on_metadata, PLUGIN_VAR_OPCMDARG, - "Enable statistics gathering for metadata commands such as SHOW TABLE STATUS (on by default)", - NULL, NULL, TRUE); + "Enable statistics gathering for metadata commands such as " + "SHOW TABLE STATUS for tables that use transient statistics (off by default)", + NULL, NULL, FALSE); static MYSQL_SYSVAR_ULONGLONG(stats_sample_pages, srv_stats_transient_sample_pages, PLUGIN_VAR_RQCMDARG, @@ -14792,6 +15833,20 @@ static MYSQL_SYSVAR_ULONGLONG(stats_transient_sample_pages, "statistics (if persistent statistics are not used, default 8)", NULL, NULL, 8, 1, ~0ULL, 0); +static MYSQL_SYSVAR_BOOL(stats_persistent, srv_stats_persistent, + PLUGIN_VAR_OPCMDARG, + "InnoDB persistent statistics enabled for all tables unless 
overridden " + "at table level", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_BOOL(stats_auto_recalc, srv_stats_auto_recalc, + PLUGIN_VAR_OPCMDARG, + "InnoDB automatic recalculation of persistent statistics enabled for all " + "tables unless overridden at table level (automatic recalculation is only " + "done when InnoDB decides that the table has changed too much and needs a " + "new statistics)", + NULL, NULL, TRUE); + static MYSQL_SYSVAR_ULONGLONG(stats_persistent_sample_pages, srv_stats_persistent_sample_pages, PLUGIN_VAR_RQCMDARG, @@ -14811,6 +15866,13 @@ static MYSQL_SYSVAR_ULONG(replication_delay, srv_replication_delay, "innodb_thread_concurrency is reached (0 by default)", NULL, NULL, 0, 0, ~0UL, 0); +static MYSQL_SYSVAR_ULONG(compression_level, innobase_compression_level, + PLUGIN_VAR_RQCMDARG, + "Compression level used for compressed row format. 0 is no compression" + ", 1 is fastest, 9 is best compression and default is 6.", + NULL, innodb_compression_level_update, + DEFAULT_COMPRESSION_LEVEL, 0, 9, 0); + static MYSQL_SYSVAR_LONG(additional_mem_pool_size, innobase_additional_mem_pool_size, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "DEPRECATED. 
This option may be removed in future releases, " @@ -14822,7 +15884,7 @@ static MYSQL_SYSVAR_LONG(additional_mem_pool_size, innobase_additional_mem_pool_ static MYSQL_SYSVAR_ULONG(autoextend_increment, srv_auto_extend_increment, PLUGIN_VAR_RQCMDARG, "Data file autoextend increment in megabytes", - NULL, NULL, 8L, 1L, 1000L, 0); + NULL, NULL, 64L, 1L, 1000L, 0); static MYSQL_SYSVAR_LONGLONG(buffer_pool_size, innobase_buffer_pool_size, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, @@ -14844,12 +15906,12 @@ static MYSQL_SYSVAR_ULONG(doublewrite_batch_size, srv_doublewrite_batch_size, static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Number of buffer pool instances, set to higher value on high-end machines to increase scalability", - NULL, NULL, 1L, 1L, MAX_BUFFER_POOLS, 1L); + NULL, NULL, 0L, 0L, MAX_BUFFER_POOLS, 1L); static MYSQL_SYSVAR_STR(buffer_pool_filename, srv_buf_dump_filename, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, "Filename to/from which to dump/load the InnoDB buffer pool", - NULL, NULL, SRV_BUF_DUMP_FILENAME_DEFAULT); + innodb_srv_buf_dump_filename_validate, NULL, SRV_BUF_DUMP_FILENAME_DEFAULT); static MYSQL_SYSVAR_BOOL(buffer_pool_dump_now, innodb_buffer_pool_dump_now, PLUGIN_VAR_RQCMDARG, @@ -14882,10 +15944,13 @@ static MYSQL_SYSVAR_ULONG(lru_scan_depth, srv_LRU_scan_depth, "How deep to scan LRU to keep it clean", NULL, NULL, 1024, 100, ~0UL, 0); -static MYSQL_SYSVAR_BOOL(flush_neighbors, srv_flush_neighbors, - PLUGIN_VAR_NOCMDARG, - "Flush neighbors from buffer pool when flushing a block.", - NULL, NULL, TRUE); +static MYSQL_SYSVAR_ULONG(flush_neighbors, srv_flush_neighbors, + PLUGIN_VAR_OPCMDARG, + "Set to 0 (don't flush neighbors from buffer pool)," + " 1 (flush contiguous neighbors from buffer pool)" + " or 2 (flush neighbors from buffer pool)," + " when flushing a block", + NULL, NULL, 1, 0, 2, 0); static MYSQL_SYSVAR_ULONG(commit_concurrency, innobase_commit_concurrency, 
PLUGIN_VAR_RQCMDARG, @@ -14895,7 +15960,7 @@ static MYSQL_SYSVAR_ULONG(commit_concurrency, innobase_commit_concurrency, static MYSQL_SYSVAR_ULONG(concurrency_tickets, srv_n_free_tickets_to_enter, PLUGIN_VAR_RQCMDARG, "Number of times a thread is allowed to enter InnoDB within the same SQL query after it has once got the ticket", - NULL, NULL, 500L, 1L, ~0UL, 0); + NULL, NULL, 5000L, 1L, ~0UL, 0); static MYSQL_SYSVAR_LONG(file_io_threads, innobase_file_io_threads, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOSYSVAR, @@ -14905,7 +15970,7 @@ static MYSQL_SYSVAR_LONG(file_io_threads, innobase_file_io_threads, static MYSQL_SYSVAR_BOOL(ft_enable_diag_print, fts_enable_diag_print, PLUGIN_VAR_OPCMDARG, "Whether to enable additional FTS diagnostic printout ", - NULL, NULL, TRUE); + NULL, NULL, FALSE); static MYSQL_SYSVAR_BOOL(disable_sort_file_cache, srv_disable_sort_file_cache, PLUGIN_VAR_OPCMDARG, @@ -14921,7 +15986,7 @@ static MYSQL_SYSVAR_STR(ft_aux_table, fts_internal_tbl_name, static MYSQL_SYSVAR_ULONG(ft_cache_size, fts_max_cache_size, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "InnoDB Fulltext search cache size in bytes", - NULL, NULL, 32000000, 1600000, 80000000, 0); + NULL, NULL, 8000000, 1600000, 80000000, 0); static MYSQL_SYSVAR_ULONG(ft_min_token_size, fts_min_token_size, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, @@ -14947,7 +16012,12 @@ static MYSQL_SYSVAR_ULONG(ft_sort_pll_degree, fts_sort_pll_degree, static MYSQL_SYSVAR_ULONG(sort_buffer_size, srv_sort_buf_size, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Memory buffer size for index creation", - NULL, NULL, 1048576, 524288, 64<<20, 0); + NULL, NULL, 1048576, 65536, 64<<20, 0); + +static MYSQL_SYSVAR_ULONGLONG(online_alter_log_max_size, srv_online_max_size, + PLUGIN_VAR_RQCMDARG, + "Maximum modification log file size for online index creation", + NULL, NULL, 128<<20, 65536, ~0ULL, 0); static MYSQL_SYSVAR_BOOL(optimize_fulltext_only, innodb_optimize_fulltext_only, PLUGIN_VAR_NOCMDARG, @@ 
-14964,11 +16034,18 @@ static MYSQL_SYSVAR_ULONG(write_io_threads, innobase_write_io_threads, "Number of background write I/O threads in InnoDB.", NULL, NULL, 4, 1, 64, 0); -static MYSQL_SYSVAR_LONG(force_recovery, innobase_force_recovery, +static MYSQL_SYSVAR_ULONG(force_recovery, srv_force_recovery, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Helps to save your data in case the disk image of the database becomes corrupt.", NULL, NULL, 0, 0, 6, 0); +#ifndef DBUG_OFF +static MYSQL_SYSVAR_ULONG(force_recovery_crash, srv_force_recovery_crash, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Kills the server during crash recovery.", + NULL, NULL, 0, 0, 10, 0); +#endif /* !DBUG_OFF */ + static MYSQL_SYSVAR_ULONG(page_size, srv_page_size, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, "Page size to use for all InnoDB tablespaces.", @@ -14983,12 +16060,12 @@ static MYSQL_SYSVAR_LONG(log_buffer_size, innobase_log_buffer_size, static MYSQL_SYSVAR_LONGLONG(log_file_size, innobase_log_file_size, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Size of each log file in a log group.", - NULL, NULL, 5*1024*1024L, 1*1024*1024L, LONGLONG_MAX, 1024*1024L); + NULL, NULL, 48*1024*1024L, 1*1024*1024L, LONGLONG_MAX, 1024*1024L); -static MYSQL_SYSVAR_LONG(log_files_in_group, innobase_log_files_in_group, +static MYSQL_SYSVAR_ULONG(log_files_in_group, srv_n_log_files, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, - "Number of log files in the log group. InnoDB writes to the files in a circular fashion. Value 3 is recommended here.", - NULL, NULL, 2, 2, 100, 0); + "Number of log files in the log group. 
InnoDB writes to the files in a circular fashion.", + NULL, NULL, 2, 2, SRV_N_LOG_FILES_MAX, 0); static MYSQL_SYSVAR_LONG(mirrored_log_groups, innobase_mirrored_log_groups, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, @@ -15004,13 +16081,13 @@ static MYSQL_SYSVAR_UINT(old_blocks_time, buf_LRU_old_threshold_ms, PLUGIN_VAR_RQCMDARG, "Move blocks to the 'new' end of the buffer pool if the first access" " was at least this many milliseconds ago." - " The timeout is disabled if 0 (the default).", - NULL, NULL, 0, 0, UINT_MAX32, 0); + " The timeout is disabled if 0.", + NULL, NULL, 1000, 0, UINT_MAX32, 0); static MYSQL_SYSVAR_LONG(open_files, innobase_open_files, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "How many files at the maximum InnoDB keeps open at the same time.", - NULL, NULL, 300L, 10L, LONG_MAX, 0); + NULL, NULL, 0L, 0L, LONG_MAX, 0); static MYSQL_SYSVAR_ULONG(sync_spin_loops, srv_n_spin_wait_rounds, PLUGIN_VAR_RQCMDARG, @@ -15110,6 +16187,37 @@ static MYSQL_SYSVAR_BOOL(use_native_aio, srv_use_native_aio, "Use native AIO if supported on this platform.", NULL, NULL, TRUE); +static MYSQL_SYSVAR_BOOL(api_enable_binlog, ib_binlog_enabled, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Enable binlog for applications direct access InnoDB through InnoDB APIs", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(api_enable_mdl, ib_mdl_enabled, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Enable MDL for applications direct access InnoDB through InnoDB APIs", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(api_disable_rowlock, ib_disable_row_lock, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Disable row lock when direct access InnoDB through InnoDB APIs", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_ULONG(api_trx_level, ib_trx_level_setting, + PLUGIN_VAR_OPCMDARG, + "InnoDB API transaction isolation level", + NULL, NULL, + 0, /* Default setting */ + 0, /* Minimum value */ + 3, 0); /* Maximum value */ + +static MYSQL_SYSVAR_ULONG(api_bk_commit_interval, 
ib_bk_commit_interval, + PLUGIN_VAR_OPCMDARG, + "Background commit interval in seconds", + NULL, NULL, + 5, /* Default setting */ + 1, /* Minimum value */ + 1024 * 1024 * 1024, 0); /* Maximum value */ + static MYSQL_SYSVAR_STR(change_buffering, innobase_change_buffering, PLUGIN_VAR_RQCMDARG, "Buffer changes to reduce random access: " @@ -15137,6 +16245,12 @@ static MYSQL_SYSVAR_UINT(change_buffering_debug, ibuf_debug, PLUGIN_VAR_RQCMDARG, "Debug flags for InnoDB change buffering (0=none, 2=crash at merge)", NULL, NULL, 0, 0, 2, 0); + +static MYSQL_SYSVAR_BOOL(disable_background_merge, + srv_ibuf_disable_background_merge, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_RQCMDARG, + "Disable change buffering merges by the master thread", + NULL, NULL, FALSE); #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ static MYSQL_SYSVAR_BOOL(random_read_ahead, srv_random_read_ahead, @@ -15179,15 +16293,53 @@ static MYSQL_SYSVAR_BOOL(print_all_deadlocks, srv_print_all_deadlocks, "Print all deadlocks to MySQL error log (off by default)", NULL, NULL, FALSE); +static MYSQL_SYSVAR_ULONG(compression_failure_threshold_pct, + zip_failure_threshold_pct, PLUGIN_VAR_OPCMDARG, + "If the compression failure rate of a table is greater than this number" + " more padding is added to the pages to reduce the failures. 
A value of" + " zero implies no padding", + NULL, NULL, 5, 0, 100, 0); + +static MYSQL_SYSVAR_ULONG(compression_pad_pct_max, + zip_pad_max, PLUGIN_VAR_OPCMDARG, + "Percentage of empty space on a data page that can be reserved" + " to make the page compressible.", + NULL, NULL, 50, 0, 75, 0); + +static MYSQL_SYSVAR_BOOL(read_only, srv_read_only_mode, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Start InnoDB in read only mode (off by default)", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(cmp_per_index_enabled, srv_cmp_per_index_enabled, + PLUGIN_VAR_OPCMDARG, + "Enable INFORMATION_SCHEMA.innodb_cmp_per_index, " + "may have negative impact on performance (off by default)", + NULL, innodb_cmp_per_index_update, FALSE); + #ifdef UNIV_DEBUG_never static MYSQL_SYSVAR_UINT(trx_rseg_n_slots_debug, trx_rseg_n_slots_debug, PLUGIN_VAR_RQCMDARG, "Debug flags for InnoDB to limit TRX_RSEG_N_SLOTS for trx_rsegf_undo_find_free()", NULL, NULL, 0, 0, 1024, 0); + +static MYSQL_SYSVAR_UINT(limit_optimistic_insert_debug, + btr_cur_limit_optimistic_insert_debug, PLUGIN_VAR_RQCMDARG, + "Artificially limit the number of records per B-tree page (0=unlimited).", + NULL, NULL, 0, 0, UINT_MAX32, 0); + +static MYSQL_SYSVAR_BOOL(trx_purge_view_update_only_debug, + srv_purge_view_update_only_debug, PLUGIN_VAR_NOCMDARG, + "Pause actual purging any delete-marked records, but merely update the purge view. 
" + "It is to create artificially the situation the purge view have been updated " + "but the each purges were not done yet.", + NULL, NULL, FALSE); #endif /* UNIV_DEBUG */ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(additional_mem_pool_size), + MYSQL_SYSVAR(api_trx_level), + MYSQL_SYSVAR(api_bk_commit_interval), MYSQL_SYSVAR(autoextend_increment), MYSQL_SYSVAR(buffer_pool_size), MYSQL_SYSVAR(buffer_pool_instances), @@ -15203,9 +16355,13 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(checksums), MYSQL_SYSVAR(commit_concurrency), MYSQL_SYSVAR(concurrency_tickets), + MYSQL_SYSVAR(compression_level), MYSQL_SYSVAR(data_file_path), MYSQL_SYSVAR(data_home_dir), MYSQL_SYSVAR(doublewrite), + MYSQL_SYSVAR(api_enable_binlog), + MYSQL_SYSVAR(api_enable_mdl), + MYSQL_SYSVAR(api_disable_rowlock), MYSQL_SYSVAR(fast_shutdown), MYSQL_SYSVAR(file_io_threads), MYSQL_SYSVAR(read_io_threads), @@ -15214,9 +16370,13 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(file_format), MYSQL_SYSVAR(file_format_check), MYSQL_SYSVAR(file_format_max), + MYSQL_SYSVAR(flush_log_at_timeout), MYSQL_SYSVAR(flush_log_at_trx_commit), MYSQL_SYSVAR(flush_method), MYSQL_SYSVAR(force_recovery), +#ifndef DBUG_OFF + MYSQL_SYSVAR(force_recovery_crash), +#endif /* !DBUG_OFF */ MYSQL_SYSVAR(ft_cache_size), MYSQL_SYSVAR(ft_enable_stopword), MYSQL_SYSVAR(ft_max_token_size), @@ -15237,7 +16397,10 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(log_files_in_group), MYSQL_SYSVAR(log_group_home_dir), MYSQL_SYSVAR(max_dirty_pages_pct), + MYSQL_SYSVAR(max_dirty_pages_pct_lwm), + MYSQL_SYSVAR(adaptive_flushing_lwm), MYSQL_SYSVAR(adaptive_flushing), + MYSQL_SYSVAR(flushing_avg_loops), MYSQL_SYSVAR(max_purge_lag), MYSQL_SYSVAR(max_purge_lag_delay), MYSQL_SYSVAR(mirrored_log_groups), @@ -15254,7 +16417,9 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(stats_on_metadata), 
MYSQL_SYSVAR(stats_sample_pages), MYSQL_SYSVAR(stats_transient_sample_pages), + MYSQL_SYSVAR(stats_persistent), MYSQL_SYSVAR(stats_persistent_sample_pages), + MYSQL_SYSVAR(stats_auto_recalc), MYSQL_SYSVAR(adaptive_hash_index), MYSQL_SYSVAR(stats_method), MYSQL_SYSVAR(replication_delay), @@ -15262,7 +16427,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(strict_mode), MYSQL_SYSVAR(support_xa), MYSQL_SYSVAR(sort_buffer_size), - MYSQL_SYSVAR(analyze_is_persistent), + MYSQL_SYSVAR(online_alter_log_max_size), MYSQL_SYSVAR(sync_spin_loops), MYSQL_SYSVAR(spin_wait_delay), MYSQL_SYSVAR(table_locks), @@ -15279,33 +16444,45 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(change_buffer_max_size), #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG MYSQL_SYSVAR(change_buffering_debug), + MYSQL_SYSVAR(disable_background_merge), #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ MYSQL_SYSVAR(random_read_ahead), MYSQL_SYSVAR(read_ahead_threshold), + MYSQL_SYSVAR(read_only), MYSQL_SYSVAR(io_capacity), + MYSQL_SYSVAR(io_capacity_max), MYSQL_SYSVAR(monitor_enable), MYSQL_SYSVAR(monitor_disable), MYSQL_SYSVAR(monitor_reset), MYSQL_SYSVAR(monitor_reset_all), MYSQL_SYSVAR(purge_threads), MYSQL_SYSVAR(purge_batch_size), +#ifdef UNIV_DEBUG + MYSQL_SYSVAR(purge_run_now), + MYSQL_SYSVAR(purge_stop_now), +#endif /* UNIV_DEBUG */ #if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG MYSQL_SYSVAR(page_hash_locks), MYSQL_SYSVAR(doublewrite_batch_size), #endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */ MYSQL_SYSVAR(print_all_deadlocks), + MYSQL_SYSVAR(cmp_per_index_enabled), MYSQL_SYSVAR(undo_logs), MYSQL_SYSVAR(rollback_segments), MYSQL_SYSVAR(undo_directory), MYSQL_SYSVAR(undo_tablespaces), MYSQL_SYSVAR(sync_array_size), + MYSQL_SYSVAR(compression_failure_threshold_pct), + MYSQL_SYSVAR(compression_pad_pct_max), #ifdef UNIV_DEBUG_never /* disable this flag. 
--innodb-trx becomes ambiguous */ MYSQL_SYSVAR(trx_rseg_n_slots_debug), + MYSQL_SYSVAR(limit_optimistic_insert_debug), + MYSQL_SYSVAR(trx_purge_view_update_only_debug), #endif /* UNIV_DEBUG */ NULL }; -maria_declare_plugin(innobase) +mysql_declare_plugin(innobase) { MYSQL_STORAGE_ENGINE_PLUGIN, &innobase_storage_engine, @@ -15318,8 +16495,8 @@ maria_declare_plugin(innobase) INNODB_VERSION_SHORT, innodb_status_variables_export,/* status variables */ innobase_system_variables, /* system variables */ - INNODB_VERSION_STR, /* string version */ - MariaDB_PLUGIN_MATURITY_STABLE /* maturity */ + NULL, /* reserved */ + 0, /* flags */ }, i_s_innodb_trx, i_s_innodb_locks, @@ -15328,6 +16505,8 @@ i_s_innodb_cmp, i_s_innodb_cmp_reset, i_s_innodb_cmpmem, i_s_innodb_cmpmem_reset, +i_s_innodb_cmp_per_index, +i_s_innodb_cmp_per_index_reset, i_s_innodb_buffer_page, i_s_innodb_buffer_page_lru, i_s_innodb_buffer_stats, @@ -15345,9 +16524,11 @@ i_s_innodb_sys_indexes, i_s_innodb_sys_columns, i_s_innodb_sys_fields, i_s_innodb_sys_foreign, -i_s_innodb_sys_foreign_cols +i_s_innodb_sys_foreign_cols, +i_s_innodb_sys_tablespaces, +i_s_innodb_sys_datafiles -maria_declare_plugin_end; +mysql_declare_plugin_end; /** @brief Initialize the default value of innodb_commit_concurrency. 
@@ -15384,7 +16565,7 @@ innobase_undo_logs_init_default_max() #ifdef UNIV_COMPILE_TEST_FUNCS -typedef struct innobase_convert_name_test_struct { +struct innobase_convert_name_test_t { char* buf; ulint buflen; const char* id; @@ -15393,7 +16574,7 @@ typedef struct innobase_convert_name_test_struct { ibool file_id; const char* expected; -} innobase_convert_name_test_t; +}; void test_innobase_convert_name() @@ -15512,62 +16693,52 @@ test_innobase_convert_name() * Multi Range Read interface, DS-MRR calls */ -int -ha_innobase::multi_range_read_init( - RANGE_SEQ_IF* seq, - void* seq_init_param, - uint n_ranges, - uint mode, - HANDLER_BUFFER* buf) +int ha_innobase::multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param, + uint n_ranges, uint mode, + HANDLER_BUFFER *buf) { - return(ds_mrr.dsmrr_init(this, seq, seq_init_param, - n_ranges, mode, buf)); + return ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, mode, buf); } -int -ha_innobase::multi_range_read_next( - range_id_t *range_info) +int ha_innobase::multi_range_read_next(range_id_t *range_info) { - return(ds_mrr.dsmrr_next(range_info)); + return ds_mrr.dsmrr_next(range_info); } -ha_rows -ha_innobase::multi_range_read_info_const( - uint keyno, - RANGE_SEQ_IF* seq, - void* seq_init_param, - uint n_ranges, - uint* bufsz, - uint* flags, - Cost_estimate* cost) +ha_rows ha_innobase::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, + void *seq_init_param, + uint n_ranges, uint *bufsz, + uint *flags, + Cost_estimate *cost) { - /* See comments in ha_myisam::multi_range_read_info_const */ - ds_mrr.init(this, table); - return(ds_mrr.dsmrr_info_const(keyno, seq, seq_init_param, - n_ranges, bufsz, flags, cost)); + /* See comments in ha_myisam::multi_range_read_info_const */ + ds_mrr.init(this, table); + + if (prebuilt->select_lock_type != LOCK_NONE) + *flags |= HA_MRR_USE_DEFAULT_IMPL; + + ha_rows res= ds_mrr.dsmrr_info_const(keyno, seq, seq_init_param, n_ranges, + bufsz, flags, cost); + return res; } 
-ha_rows -ha_innobase::multi_range_read_info( - uint keyno, - uint n_ranges, - uint keys, - uint key_parts, - uint* bufsz, - uint* flags, - Cost_estimate* cost) +ha_rows ha_innobase::multi_range_read_info(uint keyno, uint n_ranges, + uint keys, uint key_parts, + uint *bufsz, uint *flags, + Cost_estimate *cost) { - ds_mrr.init(this, table); - return(ds_mrr.dsmrr_info(keyno, n_ranges, keys, key_parts, bufsz, - flags, cost)); + ds_mrr.init(this, table); + ha_rows res= ds_mrr.dsmrr_info(keyno, n_ranges, keys, key_parts, bufsz, + flags, cost); + return res; } -int ha_innobase::multi_range_read_explain_info(uint mrr_mode, char *str, size_t size) +int ha_innobase::multi_range_read_explain_info(uint mrr_mode, char *str, + size_t size) { return ds_mrr.dsmrr_explain_info(mrr_mode, str, size); } - /** * Index Condition Pushdown interface implementation */ @@ -15581,7 +16752,7 @@ innobase_index_cond( /*================*/ void* file) /*!< in/out: pointer to ha_innobase */ { - return handler_index_cond_check(file); + return handler_index_cond_check(file); } /** Attempt to push down an index condition. @@ -15606,3 +16777,181 @@ ha_innobase::idx_cond_push( DBUG_RETURN(NULL); } +/******************************************************************//** +Use this when the args are passed to the format string from +errmsg-utf8.txt directly as is. + +Push a warning message to the client, it is a wrapper around: + +void push_warning_printf( + THD *thd, Sql_condition::enum_warning_level level, + uint code, const char *format, ...); +*/ +UNIV_INTERN +void +ib_senderrf( +/*========*/ + THD* thd, /*!< in/out: session */ + ib_log_level_t level, /*!< in: warning level */ + ib_uint32_t code, /*!< MySQL error code */ + ...) /*!< Args */ +{ + char* str; + va_list args; + const char* format = innobase_get_err_msg(code); + + /* If the caller wants to push a message to the client then + the caller must pass a valid session handle. 
*/ + + ut_a(thd != 0); + + /* The error code must exist in the errmsg-utf8.txt file. */ + ut_a(format != 0); + + va_start(args, code); + +#ifdef __WIN__ + int size = _vscprintf(format, args) + 1; + str = static_cast<char*>(malloc(size)); + str[size - 1] = 0x0; + vsnprintf(str, size, format, args); +#elif HAVE_VASPRINTF + (void) vasprintf(&str, format, args); +#else + /* Use a fixed length string. */ + str = static_cast<char*>(malloc(BUFSIZ)); + my_vsnprintf(str, BUFSIZ, format, args); +#endif /* __WIN__ */ + + Sql_condition::enum_warning_level l; + + l = Sql_condition::WARN_LEVEL_NOTE; + + switch(level) { + case IB_LOG_LEVEL_INFO: + break; + case IB_LOG_LEVEL_WARN: + l = Sql_condition::WARN_LEVEL_WARN; + break; + case IB_LOG_LEVEL_ERROR: + /* We can't use push_warning_printf(), it is a hard error. */ + my_printf_error(code, "%s", MYF(0), str); + break; + case IB_LOG_LEVEL_FATAL: + l = Sql_condition::WARN_LEVEL_END; + break; + } + + if (level != IB_LOG_LEVEL_ERROR) { + push_warning_printf(thd, l, code, "InnoDB: %s", str); + } + + va_end(args); + free(str); + + if (level == IB_LOG_LEVEL_FATAL) { + ut_error; + } +} + +/******************************************************************//** +Use this when the args are first converted to a formatted string and then +passed to the format string from errmsg-utf8.txt. The error message format +must be: "Some string ... %s". + +Push a warning message to the client, it is a wrapper around: + +void push_warning_printf( + THD *thd, Sql_condition::enum_warning_level level, + uint code, const char *format, ...); +*/ +UNIV_INTERN +void +ib_errf( +/*====*/ + THD* thd, /*!< in/out: session */ + ib_log_level_t level, /*!< in: warning level */ + ib_uint32_t code, /*!< MySQL error code */ + const char* format, /*!< printf format */ + ...) /*!< Args */ +{ + char* str; + va_list args; + + /* If the caller wants to push a message to the client then + the caller must pass a valid session handle. 
*/ + + ut_a(thd != 0); + ut_a(format != 0); + + va_start(args, format); + +#ifdef __WIN__ + int size = _vscprintf(format, args) + 1; + str = static_cast<char*>(malloc(size)); + str[size - 1] = 0x0; + vsnprintf(str, size, format, args); +#elif HAVE_VASPRINTF + (void) vasprintf(&str, format, args); +#else + /* Use a fixed length string. */ + str = static_cast<char*>(malloc(BUFSIZ)); + my_vsnprintf(str, BUFSIZ, format, args); +#endif /* __WIN__ */ + + ib_senderrf(thd, level, code, str); + + va_end(args); + free(str); +} + +/******************************************************************//** +Write a message to the MySQL log, prefixed with "InnoDB: " */ +UNIV_INTERN +void +ib_logf( +/*====*/ + ib_log_level_t level, /*!< in: warning level */ + const char* format, /*!< printf format */ + ...) /*!< Args */ +{ + char* str; + va_list args; + + va_start(args, format); + +#ifdef __WIN__ + int size = _vscprintf(format, args) + 1; + str = static_cast<char*>(malloc(size)); + str[size - 1] = 0x0; + vsnprintf(str, size, format, args); +#elif HAVE_VASPRINTF + (void) vasprintf(&str, format, args); +#else + /* Use a fixed length string. 
*/ + str = static_cast<char*>(malloc(BUFSIZ)); + my_vsnprintf(str, BUFSIZ, format, args); +#endif /* __WIN__ */ + + switch(level) { + case IB_LOG_LEVEL_INFO: + sql_print_information("InnoDB: %s", str); + break; + case IB_LOG_LEVEL_WARN: + sql_print_warning("InnoDB: %s", str); + break; + case IB_LOG_LEVEL_ERROR: + sql_print_error("InnoDB: %s", str); + break; + case IB_LOG_LEVEL_FATAL: + sql_print_error("InnoDB: %s", str); + break; + } + + va_end(args); + free(str); + + if (level == IB_LOG_LEVEL_FATAL) { + ut_error; + } +} diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h index e56a1ec52e3..ece9f7cf58a 100644 --- a/storage/innobase/handler/ha_innodb.h +++ b/storage/innobase/handler/ha_innodb.h @@ -27,14 +27,14 @@ this program; if not, write to the Free Software Foundation, Inc., /* Structure defines translation table between mysql index and innodb index structures */ -typedef struct innodb_idx_translate_struct { +struct innodb_idx_translate_t { ulint index_count; /*!< number of valid index entries in the index_mapping array */ ulint array_size; /*!< array size of index_mapping */ dict_index_t** index_mapping; /*!< index pointer array directly maps to index in Innodb from MySQL array index */ -} innodb_idx_translate_t; +}; /** InnoDB table share */ @@ -53,15 +53,8 @@ typedef struct st_innobase_share { } INNOBASE_SHARE; -/** InnoDB B-tree index */ -struct dict_index_struct; -/** Prebuilt structures in an Innobase table handle used within MySQL */ -struct row_prebuilt_struct; - -/** InnoDB B-tree index */ -typedef struct dict_index_struct dict_index_t; -/** Prebuilt structures in an Innobase table handle used within MySQL */ -typedef struct row_prebuilt_struct row_prebuilt_t; +/** Prebuilt structures in an InnoDB table handle used within MySQL */ +struct row_prebuilt_t; /** The class defining a handle to an Innodb table */ class ha_innobase: public handler @@ -101,15 +94,13 @@ class ha_innobase: public handler void update_thd(); 
int change_active_index(uint keynr); int general_fetch(uchar* buf, uint direction, uint match_mode); - ulint innobase_lock_autoinc(); + dberr_t innobase_lock_autoinc(); ulonglong innobase_peek_autoinc(); - ulint innobase_set_max_autoinc(ulonglong auto_inc); - ulint innobase_reset_autoinc(ulonglong auto_inc); - ulint innobase_get_autoinc(ulonglong* value); - ulint innobase_update_autoinc(ulonglong auto_inc); + dberr_t innobase_set_max_autoinc(ulonglong auto_inc); + dberr_t innobase_reset_autoinc(ulonglong auto_inc); + dberr_t innobase_get_autoinc(ulonglong* value); void innobase_initialize_autoinc(); dict_index_t* innobase_get_index(uint keynr); - int info_low(uint flag, dict_stats_upd_option_t stats_upd_option); /* Init values for the class: */ public: @@ -132,9 +123,11 @@ class ha_innobase: public handler const key_map* keys_to_use_for_scanning(); int open(const char *name, int mode, uint test_if_locked); + handler* clone(const char *name, MEM_ROOT *mem_root); int close(void); double scan_time(); double read_time(uint index, uint ranges, ha_rows rows); + longlong get_memory_buffer_size() const; int write_row(uchar * buf); int update_row(const uchar * old_data, uchar * new_data); @@ -182,6 +175,13 @@ class ha_innobase: public handler ha_rows estimate_rows_upper_bound(); void update_create_info(HA_CREATE_INFO* create_info); + int parse_table_name(const char*name, + HA_CREATE_INFO* create_info, + ulint flags, + ulint flags2, + char* norm_name, + char* temp_path, + char* remote_path); int create(const char *name, register TABLE *form, HA_CREATE_INFO *create_info); int truncate(); @@ -219,13 +219,76 @@ class ha_innobase: public handler static ulonglong get_mysql_bin_log_pos(); bool primary_key_is_clustered(); int cmp_ref(const uchar *ref1, const uchar *ref2); - /** Fast index creation (smart ALTER TABLE) @see handler0alter.cc @{ */ - int add_index(TABLE *table_arg, KEY *key_info, uint num_of_keys, - handler_add_index **add); - int final_add_index(handler_add_index 
*add, bool commit); - int prepare_drop_index(TABLE *table_arg, uint *key_num, - uint num_of_keys); - int final_drop_index(TABLE *table_arg); + /** On-line ALTER TABLE interface @see handler0alter.cc @{ */ + + /** Check if InnoDB supports a particular alter table in-place + @param altered_table TABLE object for new version of table. + @param ha_alter_info Structure describing changes to be done + by ALTER TABLE and holding data used during in-place alter. + + @retval HA_ALTER_INPLACE_NOT_SUPPORTED Not supported + @retval HA_ALTER_INPLACE_NO_LOCK Supported + @retval HA_ALTER_INPLACE_SHARED_LOCK_AFTER_PREPARE + Supported, but requires lock + during main phase and exclusive + lock during prepare phase. + @retval HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE + Supported, prepare phase + requires exclusive lock. + */ + enum_alter_inplace_result check_if_supported_inplace_alter( + TABLE* altered_table, + Alter_inplace_info* ha_alter_info); + /** Allows InnoDB to update internal structures with concurrent + writes blocked (provided that check_if_supported_inplace_alter() + did not return HA_ALTER_INPLACE_NO_LOCK). + This will be invoked before inplace_alter_table(). + + @param altered_table TABLE object for new version of table. + @param ha_alter_info Structure describing changes to be done + by ALTER TABLE and holding data used during in-place alter. + + @retval true Failure + @retval false Success + */ + bool prepare_inplace_alter_table( + TABLE* altered_table, + Alter_inplace_info* ha_alter_info); + + /** Alter the table structure in-place with operations + specified using HA_ALTER_FLAGS and Alter_inplace_information. + The level of concurrency allowed during this operation depends + on the return value from check_if_supported_inplace_alter(). + + @param altered_table TABLE object for new version of table. + @param ha_alter_info Structure describing changes to be done + by ALTER TABLE and holding data used during in-place alter. 
+ + @retval true Failure + @retval false Success + */ + bool inplace_alter_table( + TABLE* altered_table, + Alter_inplace_info* ha_alter_info); + + /** Commit or rollback the changes made during + prepare_inplace_alter_table() and inplace_alter_table() inside + the storage engine. Note that the allowed level of concurrency + during this operation will be the same as for + inplace_alter_table() and thus might be higher than during + prepare_inplace_alter_table(). (E.g concurrent writes were + blocked during prepare, but might not be during commit). + @param altered_table TABLE object for new version of table. + @param ha_alter_info Structure describing changes to be done + by ALTER TABLE and holding data used during in-place alter. + @param commit true => Commit, false => Rollback. + @retval true Failure + @retval false Success + */ + bool commit_inplace_alter_table( + TABLE* altered_table, + Alter_inplace_info* ha_alter_info, + bool commit); /** @} */ bool check_if_incompatible_data(HA_CREATE_INFO *info, uint table_changes); @@ -241,6 +304,8 @@ private: @see build_template() */ inline void reset_template(); + int info_low(uint, bool); + public: /** @name Multi Range Read interface @{ */ /** Initialize multi range read @see DsMrr_impl::dsmrr_init @@ -283,15 +348,12 @@ public: * @param flags * @param cost */ - ha_rows multi_range_read_info(uint keyno, - uint n_ranges, uint keys, - uint key_parts, - uint* bufsz, uint* mrr_mode, + ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys, + uint key_parts, uint* bufsz, uint* flags, Cost_estimate* cost); - int multi_range_read_explain_info(uint mrr_mode, - char *str, size_t size); - + int multi_range_read_explain_info(uint mrr_mode, char *str, + size_t size); /** Attempt to push down an index condition. 
* @param[in] keyno MySQL key number * @param[in] idx_cond Index condition to be checked @@ -364,6 +426,27 @@ bool thd_binlog_filter_ok(const MYSQL_THD thd); */ bool thd_sqlcom_can_generate_row_events(const MYSQL_THD thd); +/** + Gets information on the durability property requested by + a thread. + @param thd Thread handle + @return a durability property. +*/ +enum durability_properties thd_get_durability_property(const MYSQL_THD thd); + +/** Get the auto_increment_offset auto_increment_increment. +@param thd Thread object +@param off auto_increment_offset +@param inc auto_increment_increment */ +void thd_get_autoinc(const MYSQL_THD thd, ulong* off, ulong* inc) +__attribute__((nonnull)); + +/** Is strict sql_mode set. +@param thd Thread object +@return True if sql_mode has strict mode (all or trans), false otherwise. +*/ +bool thd_is_strict_mode(const MYSQL_THD thd) +__attribute__((nonnull)); } /* extern "C" */ /** Get the file name and position of the MySQL binlog corresponding to the @@ -371,7 +454,7 @@ bool thd_sqlcom_can_generate_row_events(const MYSQL_THD thd); */ extern void mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file); -typedef struct trx_struct trx_t; +struct trx_t; extern const struct _ft_vft ft_vft_result; @@ -379,23 +462,11 @@ extern const struct _ft_vft ft_vft_result; typedef struct new_ft_info { struct _ft_vft *please; + struct _ft_vft_ext *could_you; row_prebuilt_t* ft_prebuilt; fts_result_t* ft_result; } NEW_FT_INFO; -/********************************************************************//** -@file handler/ha_innodb.h -Converts an InnoDB error code to a MySQL error code and also tells to MySQL -about a possible transaction rollback inside InnoDB caused by a lock wait -timeout or a deadlock. 
-@return MySQL error code */ -int -convert_error_code_to_mysql( -/*========================*/ - int error, /*!< in: InnoDB error code */ - ulint flags, /*!< in: InnoDB table flags, or 0 */ - MYSQL_THD thd); /*!< in: user thread handle or NULL */ - /*********************************************************************//** Allocates an InnoDB transaction for a MySQL handler object. @return InnoDB transaction handle */ @@ -410,13 +481,50 @@ system default primary index name 'GEN_CLUST_INDEX'. If a name matches, this function pushes an warning message to the client, and returns true. @return true if the index name matches the reserved name */ +UNIV_INTERN bool innobase_index_name_is_reserved( /*============================*/ THD* thd, /*!< in/out: MySQL connection */ const KEY* key_info, /*!< in: Indexes to be created */ - ulint num_of_keys); /*!< in: Number of indexes to + ulint num_of_keys) /*!< in: Number of indexes to be created. */ + __attribute__((nonnull, warn_unused_result)); + +/*****************************************************************//** +Determines InnoDB table flags. +@retval true if successful, false if error */ +UNIV_INTERN +bool +innobase_table_flags( +/*=================*/ + const TABLE* form, /*!< in: table */ + const HA_CREATE_INFO* create_info, /*!< in: information + on table columns and indexes */ + THD* thd, /*!< in: connection */ + bool use_tablespace, /*!< in: whether to create + outside system tablespace */ + ulint* flags, /*!< out: DICT_TF flags */ + ulint* flags2) /*!< out: DICT_TF2 flags */ + __attribute__((nonnull, warn_unused_result)); + +/*****************************************************************//** +Validates the create options. We may build on this function +in future. For now, it checks two specifiers: +KEY_BLOCK_SIZE and ROW_FORMAT +If innodb_strict_mode is not set then this function is a no-op +@return NULL if valid, string if not. 
*/ +UNIV_INTERN +const char* +create_options_are_invalid( +/*=======================*/ + THD* thd, /*!< in: connection thread. */ + TABLE* form, /*!< in: information on table + columns and indexes */ + HA_CREATE_INFO* create_info, /*!< in: create info. */ + bool use_tablespace) /*!< in: srv_file_per_table */ + __attribute__((nonnull, warn_unused_result)); + /*********************************************************************//** Retrieve the FTS Relevance Ranking result for doc with doc_id of prebuilt->fts_doc_id @@ -434,7 +542,7 @@ of prebuilt->fts_doc_id UNIV_INTERN float innobase_fts_find_ranking( -/*==========================*/ +/*======================*/ FT_INFO* fts_hdl, /*!< in: FTS handler */ uchar* record, /*!< in: Unused */ uint len); /*!< in: Unused */ @@ -443,24 +551,20 @@ Free the memory for the FTS handler */ UNIV_INTERN void innobase_fts_close_ranking( -/*==========================*/ - FT_INFO* fts_hdl); /*!< in: FTS handler */ -/*********************************************************************//** -Free the memory for the FTS handler */ -void -innobase_fts_close_ranking( -/*==========================*/ - FT_INFO* fts_hdl); /*!< in: FTS handler */ +/*=======================*/ + FT_INFO* fts_hdl) /*!< in: FTS handler */ + __attribute__((nonnull)); /*****************************************************************//** Initialize the table FTS stopword list -@return TRUE is succeed */ +@return TRUE if success */ UNIV_INTERN ibool innobase_fts_load_stopword( /*=======================*/ dict_table_t* table, /*!< in: Table has the FTS */ trx_t* trx, /*!< in: transaction */ - THD* thd); /*!< in: current thread */ + THD* thd) /*!< in: current thread */ + __attribute__((nonnull(1,3), warn_unused_result)); /** Some defines for innobase_fts_check_doc_id_index() return value */ enum fts_doc_id_index_enum { @@ -472,15 +576,17 @@ enum fts_doc_id_index_enum { /*******************************************************************//** Check whether the table has 
a unique index with FTS_DOC_ID_INDEX_NAME on the Doc ID column. -@return FTS_EXIST_DOC_ID_INDEX if there exists the FTS_DOC_ID index, -FTS_INCORRECT_DOC_ID_INDEX if the FTS_DOC_ID index is of wrong format */ +@return the status of the FTS_DOC_ID index */ UNIV_INTERN enum fts_doc_id_index_enum innobase_fts_check_doc_id_index( /*============================*/ - dict_table_t* table, /*!< in: table definition */ - ulint* fts_doc_col_no);/*!< out: The column number for - Doc ID */ + const dict_table_t* table, /*!< in: table definition */ + const TABLE* altered_table, /*!< in: MySQL table + that is being altered */ + ulint* fts_doc_col_no) /*!< out: The column number for + Doc ID */ + __attribute__((warn_unused_result)); /*******************************************************************//** Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME @@ -492,4 +598,59 @@ enum fts_doc_id_index_enum innobase_fts_check_doc_id_index_in_def( /*===================================*/ ulint n_key, /*!< in: Number of keys */ - KEY* key_info); /*!< in: Key definition */ + const KEY* key_info) /*!< in: Key definitions */ + __attribute__((nonnull, warn_unused_result)); + +/*********************************************************************** +@return version of the extended FTS API */ +uint +innobase_fts_get_version(); + +/*********************************************************************** +@return Which part of the extended FTS API is supported */ +ulonglong +innobase_fts_flags(); + +/*********************************************************************** +Find and Retrieve the FTS doc_id for the current result row +@return the document ID */ +ulonglong +innobase_fts_retrieve_docid( +/*============================*/ + FT_INFO_EXT* fts_hdl); /*!< in: FTS handler */ + +/*********************************************************************** +Find and retrieve the size of the current result +@return number of matching rows */ +ulonglong +innobase_fts_count_matches( 
+/*============================*/ + FT_INFO_EXT* fts_hdl); /*!< in: FTS handler */ + +/** "GEN_CLUST_INDEX" is the name reserved for InnoDB default +system clustered index when there is no primary key. */ +extern const char innobase_index_reserve_name[]; + +/*********************************************************************//** +Copy table flags from MySQL's HA_CREATE_INFO into an InnoDB table object. +Those flags are stored in .frm file and end up in the MySQL table object, +but are frequently used inside InnoDB so we keep their copies into the +InnoDB table object. */ +UNIV_INTERN +void +innobase_copy_frm_flags_from_create_info( +/*=====================================*/ + dict_table_t* innodb_table, /*!< in/out: InnoDB table */ + HA_CREATE_INFO* create_info); /*!< in: create info */ + +/*********************************************************************//** +Copy table flags from MySQL's TABLE_SHARE into an InnoDB table object. +Those flags are stored in .frm file and end up in the MySQL table object, +but are frequently used inside InnoDB so we keep their copies into the +InnoDB table object. 
*/ +UNIV_INTERN +void +innobase_copy_frm_flags_from_table_share( +/*=====================================*/ + dict_table_t* innodb_table, /*!< in/out: InnoDB table */ + TABLE_SHARE* table_share); /*!< in: table share */ diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc index 1468bc79c04..437443979c0 100644 --- a/storage/innobase/handler/handler0alter.cc +++ b/storage/innobase/handler/handler0alter.cc @@ -23,11 +23,20 @@ Smart ALTER TABLE #include <unireg.h> #include <mysqld_error.h> -#include <sql_lex.h> // SQLCOM_CREATE_INDEX +#include <log.h> +#include <debug_sync.h> #include <innodb_priv.h> +#include <sql_alter.h> +#include <sql_class.h> +#include "dict0crea.h" +#include "dict0dict.h" +#include "dict0priv.h" #include "dict0stats.h" +#include "dict0stats_bg.h" #include "log0log.h" +#include "rem0types.h" +#include "row0log.h" #include "row0merge.h" #include "srv0srv.h" #include "trx0trx.h" @@ -36,9 +45,995 @@ Smart ALTER TABLE #include "handler0alter.h" #include "srv0mon.h" #include "fts0priv.h" +#include "pars0pars.h" #include "ha_innodb.h" +/** Operations for creating an index in place */ +static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_ONLINE_CREATE + = Alter_inplace_info::ADD_INDEX + | Alter_inplace_info::ADD_UNIQUE_INDEX; + +/** Operations for rebuilding a table in place */ +static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_INPLACE_REBUILD + = Alter_inplace_info::ADD_PK_INDEX + | Alter_inplace_info::DROP_PK_INDEX + | Alter_inplace_info::CHANGE_CREATE_OPTION + | Alter_inplace_info::ALTER_COLUMN_NULLABLE + | Alter_inplace_info::ALTER_COLUMN_NOT_NULLABLE + | Alter_inplace_info::ALTER_COLUMN_ORDER + | Alter_inplace_info::DROP_COLUMN + | Alter_inplace_info::ADD_COLUMN + /* + | Alter_inplace_info::ALTER_COLUMN_TYPE + | Alter_inplace_info::ALTER_COLUMN_EQUAL_PACK_LENGTH + */ + ; + +/** Operations for creating indexes or rebuilding a table */ +static const Alter_inplace_info::HA_ALTER_FLAGS 
INNOBASE_INPLACE_CREATE + = INNOBASE_ONLINE_CREATE | INNOBASE_INPLACE_REBUILD; + +/** Operations for altering a table that InnoDB does not care about */ +static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_INPLACE_IGNORE + = Alter_inplace_info::ALTER_COLUMN_DEFAULT + | Alter_inplace_info::ALTER_COLUMN_COLUMN_FORMAT + | Alter_inplace_info::ALTER_COLUMN_STORAGE_TYPE + | Alter_inplace_info::ALTER_RENAME; + +/** Operations that InnoDB can perform online */ +static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_ONLINE_OPERATIONS + = INNOBASE_INPLACE_IGNORE + | INNOBASE_ONLINE_CREATE + | Alter_inplace_info::DROP_INDEX + | Alter_inplace_info::DROP_UNIQUE_INDEX + | Alter_inplace_info::DROP_FOREIGN_KEY + | Alter_inplace_info::ALTER_COLUMN_NAME + | Alter_inplace_info::ADD_FOREIGN_KEY; + +/* Report an InnoDB error to the client by invoking my_error(). */ +static UNIV_COLD __attribute__((nonnull)) +void +my_error_innodb( +/*============*/ + dberr_t error, /*!< in: InnoDB error code */ + const char* table, /*!< in: table name */ + ulint flags) /*!< in: table flags */ +{ + switch (error) { + case DB_MISSING_HISTORY: + my_error(ER_TABLE_DEF_CHANGED, MYF(0)); + break; + case DB_RECORD_NOT_FOUND: + my_error(ER_KEY_NOT_FOUND, MYF(0), table); + break; + case DB_DEADLOCK: + my_error(ER_LOCK_DEADLOCK, MYF(0)); + break; + case DB_LOCK_WAIT_TIMEOUT: + my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0)); + break; + case DB_INTERRUPTED: + my_error(ER_QUERY_INTERRUPTED, MYF(0)); + break; + case DB_OUT_OF_MEMORY: + my_error(ER_OUT_OF_RESOURCES, MYF(0)); + break; + case DB_OUT_OF_FILE_SPACE: + my_error(ER_RECORD_FILE_FULL, MYF(0), table); + break; + case DB_TOO_BIG_INDEX_COL: + my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0), + DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags)); + break; + case DB_TOO_MANY_CONCURRENT_TRXS: + my_error(ER_TOO_MANY_CONCURRENT_TRXS, MYF(0)); + break; + case DB_LOCK_TABLE_FULL: + my_error(ER_LOCK_TABLE_FULL, MYF(0)); + break; + case DB_UNDO_RECORD_TOO_BIG: + 
my_error(ER_UNDO_RECORD_TOO_BIG, MYF(0)); + break; + case DB_CORRUPTION: + my_error(ER_NOT_KEYFILE, MYF(0), table); + break; + case DB_TOO_BIG_RECORD: + my_error(ER_TOO_BIG_ROWSIZE, MYF(0), + page_get_free_space_of_empty( + flags & DICT_TF_COMPACT) / 2); + break; + case DB_INVALID_NULL: + /* TODO: report the row, as we do for DB_DUPLICATE_KEY */ + my_error(ER_INVALID_USE_OF_NULL, MYF(0)); + break; +#ifdef UNIV_DEBUG + case DB_SUCCESS: + case DB_DUPLICATE_KEY: + case DB_TABLESPACE_EXISTS: + case DB_ONLINE_LOG_TOO_BIG: + /* These codes should not be passed here. */ + ut_error; +#endif /* UNIV_DEBUG */ + default: + my_error(ER_GET_ERRNO, MYF(0), error); + break; + } +} + +/** Determine if fulltext indexes exist in a given table. +@param table_share MySQL table +@return whether fulltext indexes exist on the table */ +static +bool +innobase_fulltext_exist( +/*====================*/ + const TABLE_SHARE* table_share) +{ + for (uint i = 0; i < table_share->keys; i++) { + if (table_share->key_info[i].flags & HA_FULLTEXT) { + return(true); + } + } + + return(false); +} + +/*******************************************************************//** +Determine if ALTER TABLE needs to rebuild the table. +@param ha_alter_info the DDL operation +@return whether it is necessary to rebuild the table */ +static __attribute__((nonnull, warn_unused_result)) +bool +innobase_need_rebuild( +/*==================*/ + const Alter_inplace_info* ha_alter_info) +{ + if (ha_alter_info->handler_flags + == Alter_inplace_info::CHANGE_CREATE_OPTION + && !(ha_alter_info->create_info->used_fields + & (HA_CREATE_USED_ROW_FORMAT + | HA_CREATE_USED_KEY_BLOCK_SIZE))) { + /* Any other CHANGE_CREATE_OPTION than changing + ROW_FORMAT or KEY_BLOCK_SIZE is ignored. */ + return(false); + } + + return(!!(ha_alter_info->handler_flags & INNOBASE_INPLACE_REBUILD)); +} + +/** Check if InnoDB supports a particular alter table in-place +@param altered_table TABLE object for new version of table. 
+@param ha_alter_info Structure describing changes to be done +by ALTER TABLE and holding data used during in-place alter. + +@retval HA_ALTER_INPLACE_NOT_SUPPORTED Not supported +@retval HA_ALTER_INPLACE_NO_LOCK Supported +@retval HA_ALTER_INPLACE_SHARED_LOCK_AFTER_PREPARE Supported, but requires +lock during main phase and exclusive lock during prepare phase. +@retval HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE Supported, prepare phase +requires exclusive lock (any transactions that have accessed the table +must commit or roll back first, and no transactions can access the table +while prepare_inplace_alter_table() is executing) +*/ +UNIV_INTERN +enum_alter_inplace_result +ha_innobase::check_if_supported_inplace_alter( +/*==========================================*/ + TABLE* altered_table, + Alter_inplace_info* ha_alter_info) +{ + DBUG_ENTER("check_if_supported_inplace_alter"); + + if (srv_read_only_mode) { + ha_alter_info->unsupported_reason = + innobase_get_err_msg(ER_READ_ONLY_MODE); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } else if (srv_created_new_raw || srv_force_recovery) { + ha_alter_info->unsupported_reason = + innobase_get_err_msg(ER_READ_ONLY_MODE); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + if (altered_table->s->fields > REC_MAX_N_USER_FIELDS) { + /* Deny the inplace ALTER TABLE. MySQL will try to + re-create the table and ha_innobase::create() will + return an error too. This is how we effectively + deny adding too many columns to a table. 
*/ + ha_alter_info->unsupported_reason = + innobase_get_err_msg(ER_TOO_MANY_FIELDS); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + update_thd(); + trx_search_latch_release_if_reserved(prebuilt->trx); + + if (ha_alter_info->handler_flags + & ~(INNOBASE_ONLINE_OPERATIONS | INNOBASE_INPLACE_REBUILD)) { + if (ha_alter_info->handler_flags + & (Alter_inplace_info::ALTER_COLUMN_EQUAL_PACK_LENGTH + | Alter_inplace_info::ALTER_COLUMN_TYPE)) + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_COLUMN_TYPE); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + /* Only support online add foreign key constraint when + check_foreigns is turned off */ + if ((ha_alter_info->handler_flags + & Alter_inplace_info::ADD_FOREIGN_KEY) + && prebuilt->trx->check_foreigns) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FK_CHECK); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + if (!(ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)) { + DBUG_RETURN(HA_ALTER_INPLACE_NO_LOCK); + } + + /* Only support NULL -> NOT NULL change if strict table sql_mode + is set. Fall back to COPY for conversion if not strict tables. + In-Place will fail with an error when trying to convert + NULL to a NOT NULL value. */ + if ((ha_alter_info->handler_flags + & Alter_inplace_info::ALTER_COLUMN_NOT_NULLABLE) + && !thd_is_strict_mode(user_thd)) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_NOT_NULL); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + /* InnoDB cannot IGNORE when creating unique indexes. IGNORE + should silently delete some duplicate rows. Our inplace_alter + code will not delete anything from existing indexes. 
*/ + if (ha_alter_info->ignore + && (ha_alter_info->handler_flags + & (Alter_inplace_info::ADD_PK_INDEX + | Alter_inplace_info::ADD_UNIQUE_INDEX))) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_IGNORE); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + /* DROP PRIMARY KEY is only allowed in combination with ADD + PRIMARY KEY. */ + if ((ha_alter_info->handler_flags + & (Alter_inplace_info::ADD_PK_INDEX + | Alter_inplace_info::DROP_PK_INDEX)) + == Alter_inplace_info::DROP_PK_INDEX) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_NOPK); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + /* ADD FOREIGN KEY does not currently work properly in combination + with renaming columns. (Bug#14105491) */ + if ((ha_alter_info->handler_flags + & (Alter_inplace_info::ADD_FOREIGN_KEY + | Alter_inplace_info::ALTER_COLUMN_NAME)) + == (Alter_inplace_info::ADD_FOREIGN_KEY + | Alter_inplace_info::ALTER_COLUMN_NAME)) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FK_RENAME); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + /* DROP FOREIGN KEY may not currently work properly in combination + with other operations. (Work-around for 5.6.10 only.) */ + if ((ha_alter_info->handler_flags + & Alter_inplace_info::DROP_FOREIGN_KEY) + && (ha_alter_info->handler_flags + & (Alter_inplace_info::DROP_FOREIGN_KEY + | INNOBASE_INPLACE_REBUILD)) + != Alter_inplace_info::DROP_FOREIGN_KEY) { + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + /* If a column change from NOT NULL to NULL, + and there's a implict pk on this column. the + table should be rebuild. 
The change should + only go through the "Copy" method.*/ + if ((ha_alter_info->handler_flags + & Alter_inplace_info::ALTER_COLUMN_NULLABLE)) { + uint primary_key = altered_table->s->primary_key; + + /* See if MYSQL table has no pk but we do.*/ + if (UNIV_UNLIKELY(primary_key >= MAX_KEY) + && !row_table_got_default_clust_index(prebuilt->table)) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_PRIMARY_CANT_HAVE_NULL); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + } + + /* We should be able to do the operation in-place. + See if we can do it online (LOCK=NONE). */ + bool online = true; + + List_iterator_fast<Create_field> cf_it( + ha_alter_info->alter_info->create_list); + + /* Fix the key parts. */ + for (KEY* new_key = ha_alter_info->key_info_buffer; + new_key < ha_alter_info->key_info_buffer + + ha_alter_info->key_count; + new_key++) { + for (KEY_PART_INFO* key_part = new_key->key_part; + key_part < new_key->key_part + new_key->user_defined_key_parts; + key_part++) { + const Create_field* new_field; + + DBUG_ASSERT(key_part->fieldnr + < altered_table->s->fields); + + cf_it.rewind(); + for (uint fieldnr = 0; (new_field = cf_it++); + fieldnr++) { + if (fieldnr == key_part->fieldnr) { + break; + } + } + + DBUG_ASSERT(new_field); + + key_part->field = altered_table->field[ + key_part->fieldnr]; + /* In some special cases InnoDB emits "false" + duplicate key errors with NULL key values. Let + us play safe and ensure that we can correctly + print key values even in such cases .*/ + key_part->null_offset = key_part->field->null_offset(); + key_part->null_bit = key_part->field->null_bit; + + if (new_field->field) { + /* This is an existing column. */ + continue; + } + + /* This is an added column. */ + DBUG_ASSERT(ha_alter_info->handler_flags + & Alter_inplace_info::ADD_COLUMN); + + /* We cannot replace a hidden FTS_DOC_ID + with a user-visible FTS_DOC_ID. 
*/ + if (prebuilt->table->fts + && innobase_fulltext_exist(altered_table->s) + && !my_strcasecmp( + system_charset_info, + key_part->field->field_name, + FTS_DOC_ID_COL_NAME)) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_HIDDEN_FTS); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + DBUG_ASSERT((MTYP_TYPENR(key_part->field->unireg_check) + == Field::NEXT_NUMBER) + == !!(key_part->field->flags + & AUTO_INCREMENT_FLAG)); + + if (key_part->field->flags & AUTO_INCREMENT_FLAG) { + /* We cannot assign an AUTO_INCREMENT + column values during online ALTER. */ + DBUG_ASSERT(key_part->field == altered_table + -> found_next_number_field); + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_AUTOINC); + online = false; + } + } + } + + DBUG_ASSERT(!prebuilt->table->fts || prebuilt->table->fts->doc_col + <= table->s->fields); + DBUG_ASSERT(!prebuilt->table->fts || prebuilt->table->fts->doc_col + < dict_table_get_n_user_cols(prebuilt->table)); + + if (prebuilt->table->fts + && innobase_fulltext_exist(altered_table->s)) { + /* FULLTEXT indexes are supposed to remain. */ + /* Disallow DROP INDEX FTS_DOC_ID_INDEX */ + + for (uint i = 0; i < ha_alter_info->index_drop_count; i++) { + if (!my_strcasecmp( + system_charset_info, + ha_alter_info->index_drop_buffer[i]->name, + FTS_DOC_ID_INDEX_NAME)) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_CHANGE_FTS); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + } + + /* InnoDB can have a hidden FTS_DOC_ID_INDEX on a + visible FTS_DOC_ID column as well. Prevent dropping or + renaming the FTS_DOC_ID. 
*/ + + for (Field** fp = table->field; *fp; fp++) { + if (!((*fp)->flags + & (FIELD_IS_RENAMED | FIELD_IS_DROPPED))) { + continue; + } + + if (!my_strcasecmp( + system_charset_info, + (*fp)->field_name, + FTS_DOC_ID_COL_NAME)) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_CHANGE_FTS); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + } + } + + prebuilt->trx->will_lock++; + + if (!online) { + /* We already determined that only a non-locking + operation is possible. */ + } else if (((ha_alter_info->handler_flags + & Alter_inplace_info::ADD_PK_INDEX) + || innobase_need_rebuild(ha_alter_info)) + && (innobase_fulltext_exist(altered_table->s) + || (prebuilt->table->flags2 + & DICT_TF2_FTS_HAS_DOC_ID))) { + /* Refuse to rebuild the table online, if + fulltext indexes are to survive the rebuild, + or if the table contains a hidden FTS_DOC_ID column. */ + online = false; + /* If the table already contains fulltext indexes, + refuse to rebuild the table natively altogether. */ + if (prebuilt->table->fts) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_INNODB_FT_LIMIT); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FTS); + } else if ((ha_alter_info->handler_flags + & Alter_inplace_info::ADD_INDEX)) { + /* Building a full-text index requires a lock. + We could do without a lock if the table already contains + an FTS_DOC_ID column, but in that case we would have + to apply the modification log to the full-text indexes. 
*/ + + for (uint i = 0; i < ha_alter_info->index_add_count; i++) { + const KEY* key = + &ha_alter_info->key_info_buffer[ + ha_alter_info->index_add_buffer[i]]; + if (key->flags & HA_FULLTEXT) { + DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK + & ~(HA_FULLTEXT + | HA_PACK_KEY + | HA_GENERATED_KEY + | HA_BINARY_PACK_KEY))); + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FTS); + online = false; + break; + } + } + } + + DBUG_RETURN(online + ? HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE + : HA_ALTER_INPLACE_SHARED_LOCK_AFTER_PREPARE); +} + +/*************************************************************//** +Initialize the dict_foreign_t structure with supplied info +@return true if added, false if duplicate foreign->id */ +static __attribute__((nonnull(1,3,5,7))) +bool +innobase_init_foreign( +/*==================*/ + dict_foreign_t* foreign, /*!< in/out: structure to + initialize */ + char* constraint_name, /*!< in/out: constraint name if + exists */ + dict_table_t* table, /*!< in: foreign table */ + dict_index_t* index, /*!< in: foreign key index */ + const char** column_names, /*!< in: foreign key column + names */ + ulint num_field, /*!< in: number of columns */ + const char* referenced_table_name, /*!< in: referenced table + name */ + dict_table_t* referenced_table, /*!< in: referenced table */ + dict_index_t* referenced_index, /*!< in: referenced index */ + const char** referenced_column_names,/*!< in: referenced column + names */ + ulint referenced_num_field) /*!< in: number of referenced + columns */ +{ + if (constraint_name) { + ulint db_len; + + /* Catenate 'databasename/' to the constraint name specified + by the user: we conceive the constraint as belonging to the + same MySQL 'database' as the table itself. We store the name + to foreign->id. 
*/ + + db_len = dict_get_db_name_len(table->name); + + foreign->id = static_cast<char*>(mem_heap_alloc( + foreign->heap, db_len + strlen(constraint_name) + 2)); + + ut_memcpy(foreign->id, table->name, db_len); + foreign->id[db_len] = '/'; + strcpy(foreign->id + db_len + 1, constraint_name); + } + + ut_ad(mutex_own(&dict_sys->mutex)); + + /* Check if any existing foreign key has the same id */ + + for (const dict_foreign_t* existing_foreign + = UT_LIST_GET_FIRST(table->foreign_list); + existing_foreign != 0; + existing_foreign = UT_LIST_GET_NEXT( + foreign_list, existing_foreign)) { + + if (ut_strcmp(existing_foreign->id, foreign->id) == 0) { + return(false); + } + } + + foreign->foreign_table = table; + foreign->foreign_table_name = mem_heap_strdup( + foreign->heap, table->name); + dict_mem_foreign_table_name_lookup_set(foreign, TRUE); + + foreign->foreign_index = index; + foreign->n_fields = (unsigned int) num_field; + + foreign->foreign_col_names = static_cast<const char**>( + mem_heap_alloc(foreign->heap, num_field * sizeof(void*))); + + for (ulint i = 0; i < foreign->n_fields; i++) { + foreign->foreign_col_names[i] = mem_heap_strdup( + foreign->heap, column_names[i]); + } + + foreign->referenced_index = referenced_index; + foreign->referenced_table = referenced_table; + + foreign->referenced_table_name = mem_heap_strdup( + foreign->heap, referenced_table_name); + dict_mem_referenced_table_name_lookup_set(foreign, TRUE); + + foreign->referenced_col_names = static_cast<const char**>( + mem_heap_alloc(foreign->heap, + referenced_num_field * sizeof(void*))); + + for (ulint i = 0; i < foreign->n_fields; i++) { + foreign->referenced_col_names[i] + = mem_heap_strdup(foreign->heap, + referenced_column_names[i]); + } + + return(true); +} + +/*************************************************************//** +Check whether the foreign key options is legit +@return true if it is */ +static __attribute__((nonnull, warn_unused_result)) +bool +innobase_check_fk_option( 
+/*=====================*/ + dict_foreign_t* foreign) /*!< in:InnoDB Foreign key */ +{ + if (foreign->type & (DICT_FOREIGN_ON_UPDATE_SET_NULL + | DICT_FOREIGN_ON_DELETE_SET_NULL) + && foreign->foreign_index) { + + for (ulint j = 0; j < foreign->n_fields; j++) { + if ((dict_index_get_nth_col( + foreign->foreign_index, j)->prtype) + & DATA_NOT_NULL) { + + /* It is not sensible to define + SET NULL if the column is not + allowed to be NULL! */ + return(false); + } + } + } + + return(true); +} + +/*************************************************************//** +Set foreign key options +@return true if successfully set */ +static __attribute__((nonnull, warn_unused_result)) +bool +innobase_set_foreign_key_option( +/*============================*/ + dict_foreign_t* foreign, /*!< in:InnoDB Foreign key */ + Foreign_key* fk_key) /*!< in: Foreign key info from + MySQL */ +{ + ut_ad(!foreign->type); + + switch (fk_key->delete_opt) { + case Foreign_key::FK_OPTION_NO_ACTION: + case Foreign_key::FK_OPTION_RESTRICT: + case Foreign_key::FK_OPTION_DEFAULT: + foreign->type = DICT_FOREIGN_ON_DELETE_NO_ACTION; + break; + case Foreign_key::FK_OPTION_CASCADE: + foreign->type = DICT_FOREIGN_ON_DELETE_CASCADE; + break; + case Foreign_key::FK_OPTION_SET_NULL: + foreign->type = DICT_FOREIGN_ON_DELETE_SET_NULL; + break; + } + + switch (fk_key->update_opt) { + case Foreign_key::FK_OPTION_NO_ACTION: + case Foreign_key::FK_OPTION_RESTRICT: + case Foreign_key::FK_OPTION_DEFAULT: + foreign->type |= DICT_FOREIGN_ON_UPDATE_NO_ACTION; + break; + case Foreign_key::FK_OPTION_CASCADE: + foreign->type |= DICT_FOREIGN_ON_UPDATE_CASCADE; + break; + case Foreign_key::FK_OPTION_SET_NULL: + foreign->type |= DICT_FOREIGN_ON_UPDATE_SET_NULL; + break; + } + + return(innobase_check_fk_option(foreign)); +} + +/*******************************************************************//** +Check if a foreign key constraint can make use of an index +that is being created. 
+@return useable index, or NULL if none found */ +static __attribute__((nonnull, warn_unused_result)) +const KEY* +innobase_find_equiv_index( +/*======================*/ + const char*const* col_names, + /*!< in: column names */ + uint n_cols, /*!< in: number of columns */ + const KEY* keys, /*!< in: index information */ + const uint* add, /*!< in: indexes being created */ + uint n_add) /*!< in: number of indexes to create */ +{ + for (uint i = 0; i < n_add; i++) { + const KEY* key = &keys[add[i]]; + + if (key->user_defined_key_parts < n_cols) { +no_match: + continue; + } + + for (uint j = 0; j < n_cols; j++) { + const KEY_PART_INFO& key_part = key->key_part[j]; + uint32 col_len + = key_part.field->pack_length(); + + /* The MySQL pack length contains 1 or 2 bytes + length field for a true VARCHAR. */ + + if (key_part.field->type() == MYSQL_TYPE_VARCHAR) { + col_len -= static_cast<const Field_varstring*>( + key_part.field)->length_bytes; + } + + if (key_part.length < col_len) { + + /* Column prefix indexes cannot be + used for FOREIGN KEY constraints. 
*/ + goto no_match; + } + + if (innobase_strcasecmp(col_names[j], + key_part.field->field_name)) { + /* Name mismatch */ + goto no_match; + } + } + + return(key); + } + + return(NULL); +} + +/*************************************************************//** +Found an index whose first fields are the columns in the array +in the same order and is not marked for deletion +@return matching index, NULL if not found */ +static +dict_index_t* +innobase_find_fk_index( +/*===================*/ + Alter_inplace_info* ha_alter_info, + /*!< in: alter table info */ + dict_table_t* table, /*!< in: table */ + const char** columns,/*!< in: array of column names */ + ulint n_cols) /*!< in: number of columns */ + +{ + dict_index_t* index; + dict_index_t* found_index = NULL; + + index = dict_table_get_first_index(table); + + while (index != NULL) { + if (index->type & DICT_FTS) { + goto next_rec; + } else if (dict_foreign_qualify_index( + table, columns, n_cols, index, NULL, TRUE, FALSE)) { + /* Check if this index is in the drop list */ + if (index) { + KEY** drop_key; + + drop_key = ha_alter_info->index_drop_buffer; + + for (uint i = 0; + i < ha_alter_info->index_drop_count; + i++) { + if (innobase_strcasecmp( + drop_key[i]->name, + index->name) == 0) { + goto next_rec; + } + } + } + + found_index = index; + break; + } + +next_rec: + index = dict_table_get_next_index(index); + } + + return(found_index); +} + +/*************************************************************//** +Create InnoDB foreign key structure from MySQL alter_info +@retval true if successful +@retval false on error (will call my_error()) */ +static +bool +innobase_get_foreign_key_info( +/*==========================*/ + Alter_inplace_info* + ha_alter_info, /*!< in: alter table info */ + const TABLE_SHARE* + table_share, /*!< in: the TABLE_SHARE */ + dict_table_t* table, /*!< in: table */ + dict_foreign_t**add_fk, /*!< out: foreign constraint added */ + ulint* n_add_fk, /*!< out: number of foreign + constraints 
added */ + mem_heap_t* heap, /*!< in: memory heap */ + const trx_t* trx) /*!< in: user transaction */ +{ + Key* key; + Foreign_key* fk_key; + ulint i = 0; + dict_table_t* referenced_table = NULL; + char* referenced_table_name = NULL; + ulint num_fk = 0; + Alter_info* alter_info = ha_alter_info->alter_info; + + *n_add_fk = 0; + + List_iterator<Key> key_iterator(alter_info->key_list); + + while ((key=key_iterator++)) { + if (key->type == Key::FOREIGN_KEY) { + const char* column_names[MAX_NUM_FK_COLUMNS]; + dict_index_t* index = NULL; + const char* referenced_column_names[MAX_NUM_FK_COLUMNS]; + dict_index_t* referenced_index = NULL; + ulint num_col = 0; + ulint referenced_num_col = 0; + bool correct_option; + char* db_namep = NULL; + char* tbl_namep = NULL; + ulint db_name_len = 0; + ulint tbl_name_len = 0; +#ifdef __WIN__ + char db_name[MAX_DATABASE_NAME_LEN]; + char tbl_name[MAX_TABLE_NAME_LEN]; +#endif + + fk_key= static_cast<Foreign_key*>(key); + + if (fk_key->columns.elements > 0) { + Key_part_spec* column; + List_iterator<Key_part_spec> key_part_iterator( + fk_key->columns); + + /* Get all the foreign key column info for the + current table */ + while ((column = key_part_iterator++)) { + column_names[i] = + column->field_name.str; + ut_ad(i < MAX_NUM_FK_COLUMNS); + i++; + } + + index = innobase_find_fk_index( + ha_alter_info, table, column_names, i); + + /* MySQL would add a index in the creation + list if no such index for foreign table, + so we have to use DBUG_EXECUTE_IF to simulate + the scenario */ + DBUG_EXECUTE_IF("innodb_test_no_foreign_idx", + index = NULL;); + + /* Check whether there exist such + index in the the index create clause */ + if (!index && !innobase_find_equiv_index( + column_names, i, + ha_alter_info->key_info_buffer, + ha_alter_info->index_add_buffer, + ha_alter_info->index_add_count)) { + my_error( + ER_FK_NO_INDEX_CHILD, + MYF(0), + fk_key->name.str, + table_share->table_name.str); + goto err_exit; + } + + num_col = i; + } + + 
add_fk[num_fk] = dict_mem_foreign_create(); + +#ifndef __WIN__ + tbl_namep = fk_key->ref_table.str; + tbl_name_len = fk_key->ref_table.length; + db_namep = fk_key->ref_db.str; + db_name_len = fk_key->ref_db.length; +#else + ut_ad(fk_key->ref_table.str); + + memcpy(tbl_name, fk_key->ref_table.str, + fk_key->ref_table.length); + tbl_name[fk_key->ref_table.length] = 0; + innobase_casedn_str(tbl_name); + tbl_name_len = strlen(tbl_name); + tbl_namep = &tbl_name[0]; + + if (fk_key->ref_db.str != NULL) { + memcpy(db_name, fk_key->ref_db.str, + fk_key->ref_db.length); + db_name[fk_key->ref_db.length] = 0; + innobase_casedn_str(db_name); + db_name_len = strlen(db_name); + db_namep = &db_name[0]; + } +#endif + mutex_enter(&dict_sys->mutex); + + referenced_table_name = dict_get_referenced_table( + table->name, + db_namep, + db_name_len, + tbl_namep, + tbl_name_len, + &referenced_table, + add_fk[num_fk]->heap); + + /* Test the case when referenced_table failed to + open, if trx->check_foreigns is not set, we should + still be able to add the foreign key */ + DBUG_EXECUTE_IF("innodb_test_open_ref_fail", + referenced_table = NULL;); + + if (!referenced_table && trx->check_foreigns) { + mutex_exit(&dict_sys->mutex); + my_error(ER_FK_CANNOT_OPEN_PARENT, + MYF(0), tbl_namep); + + goto err_exit; + } + + i = 0; + + if (fk_key->ref_columns.elements > 0) { + Key_part_spec* column; + List_iterator<Key_part_spec> key_part_iterator( + fk_key->ref_columns); + + while ((column = key_part_iterator++)) { + referenced_column_names[i] = + column->field_name.str; + ut_ad(i < MAX_NUM_FK_COLUMNS); + i++; + } + + if (referenced_table) { + referenced_index = + dict_foreign_find_index( + referenced_table, + referenced_column_names, + i, NULL, + TRUE, FALSE); + + DBUG_EXECUTE_IF( + "innodb_test_no_reference_idx", + referenced_index = NULL;); + + /* Check whether there exist such + index in the the index create clause */ + if (!referenced_index) { + mutex_exit(&dict_sys->mutex); + my_error( + 
ER_FK_NO_INDEX_PARENT, + MYF(0), + fk_key->name.str, + tbl_namep); + goto err_exit; + } + } else { + ut_a(!trx->check_foreigns); + } + + referenced_num_col = i; + } + + if (!innobase_init_foreign( + add_fk[num_fk], fk_key->name.str, + table, index, column_names, + num_col, referenced_table_name, + referenced_table, referenced_index, + referenced_column_names, referenced_num_col)) { + mutex_exit(&dict_sys->mutex); + my_error( + ER_FK_DUP_NAME, + MYF(0), + add_fk[num_fk]->id); + goto err_exit; + } + + mutex_exit(&dict_sys->mutex); + + correct_option = innobase_set_foreign_key_option( + add_fk[num_fk], fk_key); + + DBUG_EXECUTE_IF("innodb_test_wrong_fk_option", + correct_option = false;); + + if (!correct_option) { + my_error(ER_FK_INCORRECT_OPTION, + MYF(0), + table_share->table_name.str, + add_fk[num_fk]->id); + goto err_exit; + } + + num_fk++; + i = 0; + } + + } + + *n_add_fk = num_fk; + + return(true); +err_exit: + for (i = 0; i <= num_fk; i++) { + if (add_fk[i]) { + dict_foreign_free(add_fk[i]); + } + } + + return(false); +} + /*************************************************************//** Copies an InnoDB column to a MySQL field. This function is adapted from row_sel_field_store_in_mysql_format(). */ @@ -91,10 +1086,9 @@ innobase_col_to_mysql( break; case DATA_BLOB: - /* Store a pointer to the BLOB buffer to dest: the BLOB was - already copied to the buffer in row_sel_store_mysql_rec */ - - row_mysql_store_blob_ref(dest, flen, data, len); + /* Skip MySQL BLOBs when reporting an erroneous row + during index creation or table rebuild. 
*/ + field->set_null(); break; #ifdef UNIV_DEBUG @@ -102,8 +1096,6 @@ innobase_col_to_mysql( ut_ad(flen >= len); ut_ad(DATA_MBMAXLEN(col->mbminmaxlen) >= DATA_MBMINLEN(col->mbminmaxlen)); - ut_ad(DATA_MBMAXLEN(col->mbminmaxlen) - > DATA_MBMINLEN(col->mbminmaxlen) || flen == len); memcpy(dest, data, len); break; @@ -137,20 +1129,19 @@ UNIV_INTERN void innobase_rec_to_mysql( /*==================*/ - TABLE* table, /*!< in/out: MySQL table */ - const rec_t* rec, /*!< in: record */ - const dict_index_t* index, /*!< in: index */ - const ulint* offsets) /*!< in: rec_get_offsets( - rec, index, ...) */ + struct TABLE* table, /*!< in/out: MySQL table */ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: index */ + const ulint* offsets)/*!< in: rec_get_offsets( + rec, index, ...) */ { uint n_fields = table->s->fields; - uint i; ut_ad(n_fields == dict_table_get_n_user_cols(index->table) - || (DICT_TF2_FLAG_IS_SET(index->table, DICT_TF2_FTS_HAS_DOC_ID) - && n_fields + 1 == dict_table_get_n_user_cols(index->table))); + - !!(DICT_TF2_FLAG_IS_SET(index->table, + DICT_TF2_FTS_HAS_DOC_ID))); - for (i = 0; i < n_fields; i++) { + for (uint i = 0; i < n_fields; i++) { Field* field = table->field[i]; ulint ipos; ulint ilen; @@ -160,7 +1151,8 @@ innobase_rec_to_mysql( ipos = dict_index_get_nth_col_or_prefix_pos(index, i, TRUE); - if (UNIV_UNLIKELY(ipos == ULINT_UNDEFINED)) { + if (ipos == ULINT_UNDEFINED + || rec_offs_nth_extern(offsets, ipos)) { null_field: field->set_null(); continue; @@ -184,6 +1176,85 @@ null_field: } /*************************************************************//** +Copies an InnoDB index entry to table->record[0]. 
*/ +UNIV_INTERN +void +innobase_fields_to_mysql( +/*=====================*/ + struct TABLE* table, /*!< in/out: MySQL table */ + const dict_index_t* index, /*!< in: InnoDB index */ + const dfield_t* fields) /*!< in: InnoDB index fields */ +{ + uint n_fields = table->s->fields; + + ut_ad(n_fields == dict_table_get_n_user_cols(index->table) + - !!(DICT_TF2_FLAG_IS_SET(index->table, + DICT_TF2_FTS_HAS_DOC_ID))); + + for (uint i = 0; i < n_fields; i++) { + Field* field = table->field[i]; + ulint ipos; + + field->reset(); + + ipos = dict_index_get_nth_col_or_prefix_pos(index, i, TRUE); + + if (ipos == ULINT_UNDEFINED + || dfield_is_ext(&fields[ipos]) + || dfield_is_null(&fields[ipos])) { + + field->set_null(); + } else { + field->set_notnull(); + + const dfield_t* df = &fields[ipos]; + + innobase_col_to_mysql( + dict_field_get_col( + dict_index_get_nth_field(index, ipos)), + static_cast<const uchar*>(dfield_get_data(df)), + dfield_get_len(df), field); + } + } +} + +/*************************************************************//** +Copies an InnoDB row to table->record[0]. */ +UNIV_INTERN +void +innobase_row_to_mysql( +/*==================*/ + struct TABLE* table, /*!< in/out: MySQL table */ + const dict_table_t* itab, /*!< in: InnoDB table */ + const dtuple_t* row) /*!< in: InnoDB row */ +{ + uint n_fields = table->s->fields; + + /* The InnoDB row may contain an extra FTS_DOC_ID column at the end. 
*/ + ut_ad(row->n_fields == dict_table_get_n_cols(itab)); + ut_ad(n_fields == row->n_fields - DATA_N_SYS_COLS + - !!(DICT_TF2_FLAG_IS_SET(itab, DICT_TF2_FTS_HAS_DOC_ID))); + + for (uint i = 0; i < n_fields; i++) { + Field* field = table->field[i]; + const dfield_t* df = dtuple_get_nth_field(row, i); + + field->reset(); + + if (dfield_is_ext(df) || dfield_is_null(df)) { + field->set_null(); + } else { + field->set_notnull(); + + innobase_col_to_mysql( + dict_table_get_nth_col(itab, i), + static_cast<const uchar*>(dfield_get_data(df)), + dfield_get_len(df), field); + } + } +} + +/*************************************************************//** Resets table->record[0]. */ UNIV_INTERN void @@ -199,66 +1270,29 @@ innobase_rec_reset( } } -/******************************************************************//** -Removes the filename encoding of a database and table name. */ -static -void -innobase_convert_tablename( -/*=======================*/ - char* s) /*!< in: identifier; out: decoded identifier */ -{ - uint errors; - - char* slash = strchr(s, '/'); - - if (slash) { - char* t; - /* Temporarily replace the '/' with NUL. */ - *slash = 0; - /* Convert the database name. */ - strconvert(&my_charset_filename, s, system_charset_info, - s, slash - s + 1, &errors); - - t = s + strlen(s); - ut_ad(slash >= t); - /* Append a '.' after the database name. */ - *t++ = '.'; - slash++; - /* Convert the table name. */ - strconvert(&my_charset_filename, slash, system_charset_info, - t, slash - t + strlen(slash), &errors); - } else { - strconvert(&my_charset_filename, s, - system_charset_info, s, strlen(s), &errors); - } -} - /*******************************************************************//** This function checks that index keys are sensible. 
@return 0 or error number */ -static +static __attribute__((nonnull, warn_unused_result)) int innobase_check_index_keys( /*======================*/ - const KEY* key_info, /*!< in: Indexes to be - created */ - ulint num_of_keys, /*!< in: Number of - indexes to be created */ - const dict_table_t* table) /*!< in: Existing indexes */ + const Alter_inplace_info* info, + /*!< in: indexes to be created or dropped */ + const dict_table_t* innodb_table) + /*!< in: Existing indexes */ { - ulint key_num; - - ut_ad(key_info); - ut_ad(num_of_keys); - - for (key_num = 0; key_num < num_of_keys; key_num++) { - const KEY& key = key_info[key_num]; + for (uint key_num = 0; key_num < info->index_add_count; + key_num++) { + const KEY& key = info->key_info_buffer[ + info->index_add_buffer[key_num]]; /* Check that the same index name does not appear twice in indexes to be created. */ for (ulint i = 0; i < key_num; i++) { - const KEY& key2 = key_info[i]; + const KEY& key2 = info->key_info_buffer[ + info->index_add_buffer[i]]; if (0 == strcmp(key.name, key2.name)) { my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), @@ -270,23 +1304,36 @@ innobase_check_index_keys( /* Check that the same index name does not already exist. */ - for (const dict_index_t* index - = dict_table_get_first_index(table); - index; index = dict_table_get_next_index(index)) { + const dict_index_t* index; - if (0 == strcmp(key.name, index->name)) { - my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), - key.name); + for (index = dict_table_get_first_index(innodb_table); + index; index = dict_table_get_next_index(index)) { - return(ER_WRONG_NAME_FOR_INDEX); + if (!strcmp(key.name, index->name)) { + break; } } - /* Check that MySQL does not try to create a column - prefix index field on an inappropriate data type and - that the same column does not appear twice in the index. */ + if (index) { + /* If a key by the same name is being created and + dropped, the name clash is OK. 
*/ + for (uint i = 0; i < info->index_drop_count; + i++) { + const KEY* drop_key + = info->index_drop_buffer[i]; - for (ulint i = 0; i < key.key_parts; i++) { + if (0 == strcmp(key.name, drop_key->name)) { + goto name_ok; + } + } + + my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), key.name); + + return(ER_WRONG_NAME_FOR_INDEX); + } + +name_ok: + for (ulint i = 0; i < key.user_defined_key_parts; i++) { const KEY_PART_INFO& key_part1 = key.key_part[i]; const Field* field @@ -301,6 +1348,10 @@ innobase_check_index_keys( case DATA_FLOAT: case DATA_DOUBLE: case DATA_DECIMAL: + /* Check that MySQL does not try to + create a column prefix index field on + an inappropriate data type. */ + if (field->type() == MYSQL_TYPE_VARCHAR) { if (key_part1.length >= field->pack_length() @@ -320,17 +1371,19 @@ innobase_check_index_keys( return(ER_WRONG_KEY_COLUMN); } + /* Check that the same column does not appear + twice in the index. */ + for (ulint j = 0; j < i; j++) { const KEY_PART_INFO& key_part2 = key.key_part[j]; - if (strcmp(key_part1.field->field_name, - key_part2.field->field_name)) { + if (key_part1.fieldnr != key_part2.fieldnr) { continue; } my_error(ER_WRONG_KEY_COLUMN, MYF(0), - key_part1.field->field_name); + field->field_name); return(ER_WRONG_KEY_COLUMN); } } @@ -341,16 +1394,19 @@ innobase_check_index_keys( /*******************************************************************//** Create index field definition for key part */ -static +static __attribute__((nonnull(2,3))) void innobase_create_index_field_def( /*============================*/ - KEY_PART_INFO* key_part, /*!< in: MySQL key definition */ - mem_heap_t* heap, /*!< in: memory heap */ - merge_index_field_t* index_field) /*!< out: index field + const TABLE* altered_table, /*!< in: MySQL table that is + being altered, or NULL + if a new clustered index is + not being created */ + const KEY_PART_INFO* key_part, /*!< in: MySQL key definition */ + index_field_t* index_field) /*!< out: index field definition for key_part 
*/ { - Field* field; + const Field* field; ibool is_unsigned; ulint col_type; @@ -359,9 +1415,13 @@ innobase_create_index_field_def( ut_ad(key_part); ut_ad(index_field); - field = key_part->field; + field = altered_table + ? altered_table->field[key_part->fieldnr] + : key_part->field; ut_a(field); + index_field->col_no = key_part->fieldnr; + col_type = get_innobase_type_from_mysql_type(&is_unsigned, field); if (DATA_BLOB == col_type @@ -376,44 +1436,48 @@ innobase_create_index_field_def( index_field->prefix_len = 0; } - index_field->field_name = mem_heap_strdup(heap, field->field_name); - DBUG_VOID_RETURN; } /*******************************************************************//** Create index definition for key */ -static +static __attribute__((nonnull)) void innobase_create_index_def( /*======================*/ - KEY* key, /*!< in: key definition */ - bool new_primary, /*!< in: TRUE=generating - a new primary key + const TABLE* altered_table, /*!< in: MySQL table that is + being altered */ + const KEY* keys, /*!< in: key definitions */ + ulint key_number, /*!< in: MySQL key number */ + bool new_clustered, /*!< in: true if generating + a new clustered index on the table */ - bool key_primary, /*!< in: TRUE if this key - is a primary key */ - merge_index_def_t* index, /*!< out: index definition */ + bool key_clustered, /*!< in: true if this is + the new clustered index */ + index_def_t* index, /*!< out: index definition */ mem_heap_t* heap) /*!< in: heap where memory is allocated */ { - ulint i; - ulint len; - ulint n_fields = key->key_parts; - char* index_name; + const KEY* key = &keys[key_number]; + ulint i; + ulint len; + ulint n_fields = key->user_defined_key_parts; + char* index_name; DBUG_ENTER("innobase_create_index_def"); + DBUG_ASSERT(!key_clustered || new_clustered); - index->fields = (merge_index_field_t*) mem_heap_alloc( - heap, n_fields * sizeof *index->fields); + index->fields = static_cast<index_field_t*>( + mem_heap_alloc(heap, n_fields * sizeof 
*index->fields)); index->ind_type = 0; + index->key_number = key_number; index->n_fields = n_fields; len = strlen(key->name) + 1; - index->name = index_name = (char*) mem_heap_alloc(heap, - len + !new_primary); + index->name = index_name = static_cast<char*>( + mem_heap_alloc(heap, len + !new_clustered)); - if (UNIV_LIKELY(!new_primary)) { + if (!new_clustered) { *index_name++ = TEMP_INDEX_PREFIX; } @@ -423,144 +1487,155 @@ innobase_create_index_def( index->ind_type |= DICT_UNIQUE; } - if (key->flags & HA_FULLTEXT) { + if (key_clustered) { + DBUG_ASSERT(!(key->flags & HA_FULLTEXT)); + index->ind_type |= DICT_CLUSTERED; + } else if (key->flags & HA_FULLTEXT) { + DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK + & ~(HA_FULLTEXT + | HA_PACK_KEY + | HA_BINARY_PACK_KEY))); + DBUG_ASSERT(!(key->flags & HA_NOSAME)); + DBUG_ASSERT(!index->ind_type); index->ind_type |= DICT_FTS; } - if (key_primary) { - index->ind_type |= DICT_CLUSTERED; + if (!new_clustered) { + altered_table = NULL; } for (i = 0; i < n_fields; i++) { - innobase_create_index_field_def(&key->key_part[i], heap, - &index->fields[i]); + innobase_create_index_field_def( + altered_table, &key->key_part[i], &index->fields[i]); } DBUG_VOID_RETURN; } /*******************************************************************//** -Copy index field definition */ +Check whether the table has the FTS_DOC_ID column +@return whether there exists an FTS_DOC_ID column */ static -void -innobase_copy_index_field_def( +bool +innobase_fts_check_doc_id_col( /*==========================*/ - const dict_field_t* field, /*!< in: definition to copy */ - merge_index_field_t* index_field) /*!< out: copied definition */ + const dict_table_t* table, /*!< in: InnoDB table with + fulltext index */ + const TABLE* altered_table, + /*!< in: MySQL table with + fulltext index */ + ulint* fts_doc_col_no) + /*!< out: The column number for + Doc ID, or ULINT_UNDEFINED + if it is of wrong type */ { - DBUG_ENTER("innobase_copy_index_field_def"); - 
DBUG_ASSERT(field != NULL); - DBUG_ASSERT(index_field != NULL); - - index_field->field_name = field->name; - index_field->prefix_len = field->prefix_len; - - DBUG_VOID_RETURN; -} - -/*******************************************************************//** -Copy index definition for the index */ -static -void -innobase_copy_index_def( -/*====================*/ - const dict_index_t* index, /*!< in: index definition to copy */ - merge_index_def_t* new_index,/*!< out: Index definition */ - mem_heap_t* heap) /*!< in: heap where allocated */ -{ - ulint n_fields; - ulint i; - - DBUG_ENTER("innobase_copy_index_def"); + *fts_doc_col_no = ULINT_UNDEFINED; - /* Note that we take only those fields that user defined to be - in the index. In the internal representation more colums were - added and those colums are not copied .*/ + const uint n_cols = altered_table->s->fields; + uint i; - n_fields = index->n_user_defined_cols; + for (i = 0; i < n_cols; i++) { + const Field* field = altered_table->s->field[i]; - new_index->fields = (merge_index_field_t*) mem_heap_alloc( - heap, n_fields * sizeof *new_index->fields); + if (my_strcasecmp(system_charset_info, + field->field_name, FTS_DOC_ID_COL_NAME)) { + continue; + } - /* When adding a PRIMARY KEY, we may convert a previous - clustered index to a secondary index (UNIQUE NOT NULL). 
*/ - new_index->ind_type = index->type & ~DICT_CLUSTERED; - new_index->n_fields = n_fields; - new_index->name = index->name; + if (strcmp(field->field_name, FTS_DOC_ID_COL_NAME)) { + my_error(ER_WRONG_COLUMN_NAME, MYF(0), + field->field_name); + } else if (field->type() != MYSQL_TYPE_LONGLONG + || field->pack_length() != 8 + || field->real_maybe_null() + || !(field->flags & UNSIGNED_FLAG)) { + my_error(ER_INNODB_FT_WRONG_DOCID_COLUMN, MYF(0), + field->field_name); + } else { + *fts_doc_col_no = i; + } - for (i = 0; i < n_fields; i++) { - innobase_copy_index_field_def(&index->fields[i], - &new_index->fields[i]); + return(true); } - DBUG_VOID_RETURN; -} - -/*******************************************************************//** -Check whether the table has the FTS_DOC_ID column -@return TRUE if there exists the FTS_DOC_ID column, if TRUE but fts_doc_col_no - equal to ULINT_UNDEFINED then that means the column exists but is not - of the right type. */ -static -ibool -innobase_fts_check_doc_id_col( -/*==========================*/ - dict_table_t* table, /*!< in: table with FTS index */ - ulint* fts_doc_col_no) /*!< out: The column number for - Doc ID */ -{ - *fts_doc_col_no = ULINT_UNDEFINED; + if (!table) { + return(false); + } - for (ulint i = 0; i + DATA_N_SYS_COLS < (ulint) table->n_cols; i++) { + for (; i + DATA_N_SYS_COLS < (uint) table->n_cols; i++) { const char* name = dict_table_get_col_name(table, i); if (strcmp(name, FTS_DOC_ID_COL_NAME) == 0) { +#ifdef UNIV_DEBUG const dict_col_t* col; col = dict_table_get_nth_col(table, i); - if (col->mtype != DATA_INT || col->len != 8) { - fprintf(stderr, - " InnoDB: %s column in table %s" - " must be of the BIGINT datatype\n", - FTS_DOC_ID_COL_NAME, table->name); - } else if (!(col->prtype & DATA_NOT_NULL)) { - fprintf(stderr, - " InnoDB: %s column in table %s" - " must be NOT NULL\n", - FTS_DOC_ID_COL_NAME, table->name); - - } else if (!(col->prtype & DATA_UNSIGNED)) { - fprintf(stderr, - " InnoDB: %s column in table %s" 
- " must be UNSIGNED\n", - FTS_DOC_ID_COL_NAME, table->name); - } else { - *fts_doc_col_no = i; - } - - return(TRUE); + /* Because the FTS_DOC_ID does not exist in + the MySQL data dictionary, this must be the + internally created FTS_DOC_ID column. */ + ut_ad(col->mtype == DATA_INT); + ut_ad(col->len == 8); + ut_ad(col->prtype & DATA_NOT_NULL); + ut_ad(col->prtype & DATA_UNSIGNED); +#endif /* UNIV_DEBUG */ + *fts_doc_col_no = i; + return(true); } } - return(FALSE); + return(false); } /*******************************************************************//** Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME on the Doc ID column. -@return FTS_EXIST_DOC_ID_INDEX if there exists the FTS_DOC_ID index, -FTS_INCORRECT_DOC_ID_INDEX if the FTS_DOC_ID index is of wrong format */ +@return the status of the FTS_DOC_ID index */ UNIV_INTERN enum fts_doc_id_index_enum innobase_fts_check_doc_id_index( /*============================*/ - dict_table_t* table, /*!< in: table definition */ - ulint* fts_doc_col_no) /*!< out: The column number for - Doc ID */ + const dict_table_t* table, /*!< in: table definition */ + const TABLE* altered_table, /*!< in: MySQL table + that is being altered */ + ulint* fts_doc_col_no) /*!< out: The column number for + Doc ID, or ULINT_UNDEFINED + if it is being created in + ha_alter_info */ { - dict_index_t* index; - dict_field_t* field; + const dict_index_t* index; + const dict_field_t* field; + + if (altered_table) { + /* Check if a unique index with the name of + FTS_DOC_ID_INDEX_NAME is being created. 
*/ + + for (uint i = 0; i < altered_table->s->keys; i++) { + const KEY& key = altered_table->s->key_info[i]; + + if (innobase_strcasecmp( + key.name, FTS_DOC_ID_INDEX_NAME)) { + continue; + } + + if ((key.flags & HA_NOSAME) + && key.user_defined_key_parts == 1 + && !strcmp(key.name, FTS_DOC_ID_INDEX_NAME) + && !strcmp(key.key_part[0].field->field_name, + FTS_DOC_ID_COL_NAME)) { + if (fts_doc_col_no) { + *fts_doc_col_no = ULINT_UNDEFINED; + } + return(FTS_EXIST_DOC_ID_INDEX); + } else { + return(FTS_INCORRECT_DOC_ID_INDEX); + } + } + } + + if (!table) { + return(FTS_NOT_EXIST_DOC_ID_INDEX); + } for (index = dict_table_get_first_index(table); index; index = dict_table_get_next_index(index)) { @@ -572,6 +1647,7 @@ innobase_fts_check_doc_id_index( } if (!dict_index_is_unique(index) + || dict_index_get_n_unique(index) > 1 || strcmp(index->name, FTS_DOC_ID_INDEX_NAME)) { return(FTS_INCORRECT_DOC_ID_INDEX); } @@ -592,9 +1668,9 @@ innobase_fts_check_doc_id_index( } else { return(FTS_INCORRECT_DOC_ID_INDEX); } - } + /* Not found */ return(FTS_NOT_EXIST_DOC_ID_INDEX); } @@ -608,12 +1684,12 @@ enum fts_doc_id_index_enum innobase_fts_check_doc_id_index_in_def( /*===================================*/ ulint n_key, /*!< in: Number of keys */ - KEY * key_info) /*!< in: Key definition */ + const KEY* key_info) /*!< in: Key definition */ { /* Check whether there is a "FTS_DOC_ID_INDEX" in the to be built index list */ for (ulint j = 0; j < n_key; j++) { - KEY* key = &key_info[j]; + const KEY* key = &key_info[j]; if (innobase_strcasecmp(key->name, FTS_DOC_ID_INDEX_NAME)) { continue; @@ -622,14 +1698,15 @@ innobase_fts_check_doc_id_index_in_def( /* Do a check on FTS DOC ID_INDEX, it must be unique, named as "FTS_DOC_ID_INDEX" and on column "FTS_DOC_ID" */ if (!(key->flags & HA_NOSAME) + || key->user_defined_key_parts != 1 || strcmp(key->name, FTS_DOC_ID_INDEX_NAME) || strcmp(key->key_part[0].field->field_name, - FTS_DOC_ID_COL_NAME)) { + FTS_DOC_ID_COL_NAME)) { 
return(FTS_INCORRECT_DOC_ID_INDEX); - } + } return(FTS_EXIST_DOC_ID_INDEX); - } + } return(FTS_NOT_EXIST_DOC_ID_INDEX); } @@ -639,8 +1716,7 @@ Create an index table where indexes are ordered as follows: IF a new primary key is defined for the table THEN 1) New primary key - 2) Original secondary indexes - 3) New secondary indexes + 2) The remaining keys in key_info ELSE @@ -648,626 +1724,1272 @@ ELSE ENDIF - -@return key definitions or NULL */ -static -merge_index_def_t* -innobase_create_key_def( -/*====================*/ - trx_t* trx, /*!< in: trx */ - dict_table_t* table, /*!< in: table definition */ - mem_heap_t* heap, /*!< in: heap where space for key - definitions are allocated */ - KEY* key_info, /*!< in: Indexes to be created */ - ulint& n_keys, /*!< in/out: Number of indexes to - be created */ - ulint* num_fts_index, /*!< out: Number of FTS indexes */ - ibool* add_fts_doc_id, /*!< out: Whether we need to add - new DOC ID column for FTS index */ - ibool* add_fts_doc_id_idx)/*!< out: Whether we need to add - new index on DOC ID column */ +@return key definitions */ +static __attribute__((nonnull, warn_unused_result, malloc)) +index_def_t* +innobase_create_key_defs( +/*=====================*/ + mem_heap_t* heap, + /*!< in/out: memory heap where space for key + definitions are allocated */ + const Alter_inplace_info* ha_alter_info, + /*!< in: alter operation */ + const TABLE* altered_table, + /*!< in: MySQL table that is being altered */ + ulint& n_add, + /*!< in/out: number of indexes to be created */ + ulint& n_fts_add, + /*!< out: number of FTS indexes to be created */ + bool got_default_clust, + /*!< in: whether the table lacks a primary key */ + ulint& fts_doc_id_col, + /*!< in: The column number for Doc ID */ + bool& add_fts_doc_id, + /*!< in: whether we need to add new DOC ID + column for FTS index */ + bool& add_fts_doc_idx) + /*!< in: whether we need to add new DOC ID + index for FTS index */ { - ulint i = 0; - merge_index_def_t* indexdef; - 
merge_index_def_t* indexdefs; + index_def_t* indexdef; + index_def_t* indexdefs; bool new_primary; + const uint*const add + = ha_alter_info->index_add_buffer; + const KEY*const key_info + = ha_alter_info->key_info_buffer; - DBUG_ENTER("innobase_create_key_def"); - - indexdef = indexdefs = (merge_index_def_t*) - mem_heap_alloc(heap, sizeof *indexdef - * (n_keys + UT_LIST_GET_LEN(table->indexes))); - - *add_fts_doc_id = FALSE; - *add_fts_doc_id_idx = FALSE; + DBUG_ENTER("innobase_create_key_defs"); + DBUG_ASSERT(!add_fts_doc_id || add_fts_doc_idx); + DBUG_ASSERT(ha_alter_info->index_add_count == n_add); /* If there is a primary key, it is always the first index - defined for the table. */ + defined for the innodb_table. */ - new_primary = !my_strcasecmp(system_charset_info, - key_info->name, "PRIMARY"); + new_primary = n_add > 0 + && !my_strcasecmp(system_charset_info, + key_info[*add].name, "PRIMARY"); + n_fts_add = 0; /* If there is a UNIQUE INDEX consisting entirely of NOT NULL columns and if the index does not contain column prefix(es) (only prefix/part of the column is indexed), MySQL will treat the index as a PRIMARY KEY unless the table already has one. */ - if (!new_primary && (key_info->flags & HA_NOSAME) - && (!(key_info->flags & HA_KEY_HAS_PART_KEY_SEG)) - && row_table_got_default_clust_index(table)) { - uint key_part = key_info->key_parts; + if (n_add > 0 && !new_primary && got_default_clust + && (key_info[*add].flags & HA_NOSAME) + && !(key_info[*add].flags & HA_KEY_HAS_PART_KEY_SEG)) { + uint key_part = key_info[*add].user_defined_key_parts; - new_primary = TRUE; + new_primary = true; while (key_part--) { - if (key_info->key_part[key_part].key_type - & FIELDFLAG_MAYBE_NULL) { - new_primary = FALSE; + const uint maybe_null + = key_info[*add].key_part[key_part].key_type + & FIELDFLAG_MAYBE_NULL; + DBUG_ASSERT(!maybe_null + == !key_info[*add].key_part[key_part]. 
+ field->real_maybe_null()); + + if (maybe_null) { + new_primary = false; break; } } } - /* Check whether any indexes in the create list are Full - Text Indexes*/ - for (ulint j = 0; j < n_keys; j++) { - if (key_info[j].flags & HA_FULLTEXT) { - (*num_fts_index)++; - } - } - - /* Check whether there is a "FTS_DOC_ID_INDEX" in the to be built index - list */ - if (innobase_fts_check_doc_id_index_in_def(n_keys, key_info) - == FTS_INCORRECT_DOC_ID_INDEX) { - push_warning_printf((THD*) trx->mysql_thd, - Sql_condition::WARN_LEVEL_WARN, - ER_WRONG_NAME_FOR_INDEX, - " InnoDB: Index name %s is reserved" - " for the unique index on" - " FTS_DOC_ID column for FTS" - " document ID indexing" - " on table %s. Please check" - " the index definition to" - " make sure it is of correct" - " type\n", - FTS_DOC_ID_INDEX_NAME, - table->name); - DBUG_RETURN(NULL); - } - - /* If we are to build an FTS index, check whether the table - already has a DOC ID column, if not, we will need to add a - Doc ID hidden column and rebuild the primary index */ - if (*num_fts_index) { - enum fts_doc_id_index_enum ret; - ibool exists; - ulint doc_col_no; - ulint fts_doc_col_no; - - exists = innobase_fts_check_doc_id_col(table, &fts_doc_col_no); - - if (exists) { - - if (fts_doc_col_no == ULINT_UNDEFINED) { - - push_warning_printf( - (THD*) trx->mysql_thd, - Sql_condition::WARN_LEVEL_WARN, - ER_WRONG_COLUMN_NAME, - " InnoDB: There exists a column %s " - "in table %s, but it is the wrong " - "type. 
Create of FTS index failed.\n", - FTS_DOC_ID_COL_NAME, table->name); + const bool rebuild = new_primary || add_fts_doc_id + || innobase_need_rebuild(ha_alter_info); + /* Reserve one more space if new_primary is true, and we might + need to add the FTS_DOC_ID_INDEX */ + indexdef = indexdefs = static_cast<index_def_t*>( + mem_heap_alloc( + heap, sizeof *indexdef + * (ha_alter_info->key_count + + rebuild + + got_default_clust))); - DBUG_RETURN(NULL); - - } else if (!table->fts) { - table->fts = fts_create(table); - } - - table->fts->doc_col = fts_doc_col_no; + if (rebuild) { + ulint primary_key_number; + if (new_primary) { + DBUG_ASSERT(n_add > 0); + primary_key_number = *add; + } else if (got_default_clust) { + /* Create the GEN_CLUST_INDEX */ + index_def_t* index = indexdef++; + + index->fields = NULL; + index->n_fields = 0; + index->ind_type = DICT_CLUSTERED; + index->name = mem_heap_strdup( + heap, innobase_index_reserve_name); + index->key_number = ~0; + primary_key_number = ULINT_UNDEFINED; + goto created_clustered; } else { - *add_fts_doc_id = TRUE; - *add_fts_doc_id_idx = TRUE; - - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Rebuild table %s to add " - "DOC_ID column\n", table->name); + primary_key_number = 0; } - ret = innobase_fts_check_doc_id_index(table, &doc_col_no); + /* Create the PRIMARY key index definition */ + innobase_create_index_def( + altered_table, key_info, primary_key_number, + TRUE, TRUE, indexdef++, heap); - switch (ret) { - case FTS_NOT_EXIST_DOC_ID_INDEX: - *add_fts_doc_id_idx = TRUE; - break; - case FTS_INCORRECT_DOC_ID_INDEX: +created_clustered: + n_add = 1; - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Index %s is used for FTS" - " Doc ID indexing on table %s, it is" - " now on the wrong column or of" - " wrong format. 
Please drop it.\n", - FTS_DOC_ID_INDEX_NAME, table->name); - DBUG_RETURN(NULL); + for (ulint i = 0; i < ha_alter_info->key_count; i++) { + if (i == primary_key_number) { + continue; + } + /* Copy the index definitions. */ + innobase_create_index_def( + altered_table, key_info, i, TRUE, FALSE, + indexdef, heap); - default: - ut_ad(ret == FTS_EXIST_DOC_ID_INDEX); + if (indexdef->ind_type & DICT_FTS) { + n_fts_add++; + } - ut_ad(doc_col_no == fts_doc_col_no); + indexdef++; + n_add++; } - } - /* If DICT_TF2_FTS_ADD_DOC_ID is set, we will need to rebuild - the table to add the unique Doc ID column for FTS index. And - thus the primary index would required to be rebuilt. Copy all - the index definitions */ - if (new_primary || *add_fts_doc_id) { - const dict_index_t* index; - - if (new_primary) { - /* Create the PRIMARY key index definition */ - innobase_create_index_def(&key_info[i++], - TRUE, TRUE, - indexdef++, heap); - } + if (n_fts_add > 0) { + if (!add_fts_doc_id + && !innobase_fts_check_doc_id_col( + NULL, altered_table, + &fts_doc_id_col)) { + fts_doc_id_col = altered_table->s->fields; + add_fts_doc_id = true; + } - row_mysql_lock_data_dictionary(trx); + if (!add_fts_doc_idx) { + fts_doc_id_index_enum ret; + ulint doc_col_no; - index = dict_table_get_first_index(table); + ret = innobase_fts_check_doc_id_index( + NULL, altered_table, &doc_col_no); - /* Copy the index definitions of the old table. Skip - the old clustered index if it is a generated clustered - index or a PRIMARY KEY. If the clustered index is a - UNIQUE INDEX, it must be converted to a secondary index. 
*/ + /* This should have been checked before */ + ut_ad(ret != FTS_INCORRECT_DOC_ID_INDEX); - if (new_primary - && (dict_index_get_nth_col(index, 0)->mtype - == DATA_SYS - || !my_strcasecmp(system_charset_info, - index->name, "PRIMARY"))) { - index = dict_table_get_next_index(index); + if (ret == FTS_NOT_EXIST_DOC_ID_INDEX) { + add_fts_doc_idx = true; + } else { + ut_ad(ret == FTS_EXIST_DOC_ID_INDEX); + ut_ad(doc_col_no == ULINT_UNDEFINED + || doc_col_no == fts_doc_id_col); + } + } } + } else { + /* Create definitions for added secondary indexes. */ - while (index) { - innobase_copy_index_def(index, indexdef++, heap); + for (ulint i = 0; i < n_add; i++) { + innobase_create_index_def( + altered_table, key_info, add[i], FALSE, FALSE, + indexdef, heap); - if (new_primary && index->type & DICT_FTS) { - (*num_fts_index)++; + if (indexdef->ind_type & DICT_FTS) { + n_fts_add++; } - index = dict_table_get_next_index(index); + indexdef++; } + } - /* The primary index would be rebuilt if a FTS Doc ID - column is to be added, and the primary index definition - is just copied from old table and stored in indexdefs[0] */ - if (*add_fts_doc_id) { - indexdefs[0].ind_type |= DICT_CLUSTERED; - DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_ADD_DOC_ID); - } + DBUG_ASSERT(indexdefs + n_add == indexdef); - row_mysql_unlock_data_dictionary(trx); - } + if (add_fts_doc_idx) { + index_def_t* index = indexdef++; - /* Create definitions for added secondary indexes. 
*/ + index->fields = static_cast<index_field_t*>( + mem_heap_alloc(heap, sizeof *index->fields)); + index->n_fields = 1; + index->fields->col_no = fts_doc_id_col; + index->fields->prefix_len = 0; + index->ind_type = DICT_UNIQUE; - while (i < n_keys) { - innobase_create_index_def(&key_info[i++], new_primary, FALSE, - indexdef++, heap); - } + if (rebuild) { + index->name = mem_heap_strdup( + heap, FTS_DOC_ID_INDEX_NAME); + ut_ad(!add_fts_doc_id + || fts_doc_id_col == altered_table->s->fields); + } else { + char* index_name; + index->name = index_name = static_cast<char*>( + mem_heap_alloc( + heap, + 1 + sizeof FTS_DOC_ID_INDEX_NAME)); + *index_name++ = TEMP_INDEX_PREFIX; + memcpy(index_name, FTS_DOC_ID_INDEX_NAME, + sizeof FTS_DOC_ID_INDEX_NAME); + } - n_keys = indexdef - indexdefs; + /* TODO: assign a real MySQL key number for this */ + index->key_number = ULINT_UNDEFINED; + n_add++; + } + DBUG_ASSERT(indexdef > indexdefs); + DBUG_ASSERT((ulint) (indexdef - indexdefs) + <= ha_alter_info->key_count + + add_fts_doc_idx + got_default_clust); + DBUG_ASSERT(ha_alter_info->index_add_count <= n_add); DBUG_RETURN(indexdefs); } /*******************************************************************//** Check each index column size, make sure they do not exceed the max limit -@return HA_ERR_INDEX_COL_TOO_LONG if index column size exceeds limit */ -static -int +@return true if index column size exceeds limit */ +static __attribute__((nonnull, warn_unused_result)) +bool innobase_check_column_length( /*=========================*/ - const dict_table_t*table, /*!< in: table definition */ + ulint max_col_len, /*!< in: maximum column length */ const KEY* key_info) /*!< in: Indexes to be created */ { - ulint max_col_len = DICT_MAX_FIELD_LEN_BY_FORMAT(table); - - for (ulint key_part = 0; key_part < key_info->key_parts; key_part++) { + for (ulint key_part = 0; key_part < key_info->user_defined_key_parts; key_part++) { if (key_info->key_part[key_part].length > max_col_len) { - 
my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0), max_col_len); - return(HA_ERR_INDEX_COL_TOO_LONG); + return(true); } } - return(0); + return(false); } -/*******************************************************************//** -Create a temporary tablename using query id, thread id, and id -@return temporary tablename */ -static -char* -innobase_create_temporary_tablename( -/*================================*/ - mem_heap_t* heap, /*!< in: memory heap */ - char id, /*!< in: identifier [0-9a-zA-Z] */ - const char* table_name) /*!< in: table name */ +struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx { - char* name; - ulint len; - static const char suffix[] = "@0023 "; /* "# " */ + /** Dummy query graph */ + que_thr_t* thr; + /** InnoDB indexes being created */ + dict_index_t** add; + /** MySQL key numbers for the InnoDB indexes that are being created */ + const ulint* add_key_numbers; + /** number of InnoDB indexes being created */ + const ulint num_to_add; + /** InnoDB indexes being dropped */ + dict_index_t** drop; + /** number of InnoDB indexes being dropped */ + const ulint num_to_drop; + /** InnoDB foreign key constraints being dropped */ + dict_foreign_t** drop_fk; + /** number of InnoDB foreign key constraints being dropped */ + const ulint num_to_drop_fk; + /** InnoDB foreign key constraints being added */ + dict_foreign_t** add_fk; + /** number of InnoDB foreign key constraints being dropped */ + const ulint num_to_add_fk; + /** whether to create the indexes online */ + bool online; + /** memory heap */ + mem_heap_t* heap; + /** dictionary transaction */ + trx_t* trx; + /** table where the indexes are being created or dropped */ + dict_table_t* indexed_table; + /** mapping of old column numbers to new ones, or NULL */ + const ulint* col_map; + /** added AUTO_INCREMENT column position, or ULINT_UNDEFINED */ + const ulint add_autoinc; + /** default values of ADD COLUMN, or NULL */ + const dtuple_t* add_cols; + /** autoinc sequence to use */ + 
ib_sequence_t sequence; + + ha_innobase_inplace_ctx(trx_t* user_trx, + dict_index_t** add_arg, + const ulint* add_key_numbers_arg, + ulint num_to_add_arg, + dict_index_t** drop_arg, + ulint num_to_drop_arg, + dict_foreign_t** drop_fk_arg, + ulint num_to_drop_fk_arg, + dict_foreign_t** add_fk_arg, + ulint num_to_add_fk_arg, + bool online_arg, + mem_heap_t* heap_arg, + trx_t* trx_arg, + dict_table_t* indexed_table_arg, + const ulint* col_map_arg, + ulint add_autoinc_arg, + ulonglong autoinc_col_min_value_arg, + ulonglong autoinc_col_max_value_arg, + const dtuple_t* add_cols_arg) : + inplace_alter_handler_ctx(), + add (add_arg), add_key_numbers (add_key_numbers_arg), + num_to_add (num_to_add_arg), + drop (drop_arg), num_to_drop (num_to_drop_arg), + drop_fk (drop_fk_arg), num_to_drop_fk (num_to_drop_fk_arg), + add_fk (add_fk_arg), num_to_add_fk (num_to_add_fk_arg), + online (online_arg), heap (heap_arg), trx (trx_arg), + indexed_table (indexed_table_arg), + col_map (col_map_arg), add_autoinc (add_autoinc_arg), + add_cols (add_cols_arg), + sequence(user_trx ? 
user_trx->mysql_thd : 0, + autoinc_col_min_value_arg, autoinc_col_max_value_arg) + { +#ifdef UNIV_DEBUG + for (ulint i = 0; i < num_to_add; i++) { + ut_ad(!add[i]->to_be_dropped); + } + for (ulint i = 0; i < num_to_drop; i++) { + ut_ad(drop[i]->to_be_dropped); + } +#endif /* UNIV_DEBUG */ - len = strlen(table_name); + thr = pars_complete_graph_for_exec(NULL, user_trx, heap); + } - name = (char*) mem_heap_alloc(heap, len + sizeof suffix); - memcpy(name, table_name, len); - memcpy(name + len, suffix, sizeof suffix); - name[len + (sizeof suffix - 2)] = id; + ~ha_innobase_inplace_ctx() + { + mem_heap_free(heap); + } - return(name); -} +private: + // Disable copying + ha_innobase_inplace_ctx(const ha_innobase_inplace_ctx&); + ha_innobase_inplace_ctx& operator=(const ha_innobase_inplace_ctx&); +}; -class ha_innobase_add_index : public handler_add_index +/********************************************************************//** +Drop any indexes that we were not able to free previously due to +open table handles. */ +static +void +online_retry_drop_indexes_low( +/*==========================*/ + dict_table_t* table, /*!< in/out: table */ + trx_t* trx) /*!< in/out: transaction */ { -public: - /** table where the indexes are being created */ - dict_table_t* indexed_table; - ha_innobase_add_index(TABLE* table, KEY* key_info, uint num_of_keys, - dict_table_t* indexed_table_arg) : - handler_add_index(table, key_info, num_of_keys), - indexed_table (indexed_table_arg) {} - ~ha_innobase_add_index() {} -}; + ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); + + /* We can have table->n_ref_count > 1, because other threads + may have prebuilt->table pointing to the table. However, these + other threads should be between statements, waiting for the + next statement to execute, or for a meta-data lock. 
*/ + ut_ad(table->n_ref_count >= 1); + + if (table->drop_aborted) { + row_merge_drop_indexes(trx, table, TRUE); + } +} -/*******************************************************************//** -This is to create FTS_DOC_ID_INDEX definition on the newly added Doc ID for -the FTS indexes table -@return dict_index_t for the FTS_DOC_ID_INDEX */ -dict_index_t* -innobase_create_fts_doc_id_idx( -/*===========================*/ - dict_table_t* indexed_table, /*!< in: Table where indexes are - created */ - trx_t* trx, /*!< in: Transaction */ - mem_heap_t* heap) /*!< Heap for index definitions */ +/********************************************************************//** +Drop any indexes that we were not able to free previously due to +open table handles. */ +static __attribute__((nonnull)) +void +online_retry_drop_indexes( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + THD* user_thd) /*!< in/out: MySQL connection */ { - dict_index_t* index; - merge_index_def_t fts_index_def; - char* index_name; - - /* Create the temp index name for FTS_DOC_ID_INDEX */ - fts_index_def.name = index_name = (char*) mem_heap_alloc( - heap, FTS_DOC_ID_INDEX_NAME_LEN + 2); - *index_name++ = TEMP_INDEX_PREFIX; - memcpy(index_name, FTS_DOC_ID_INDEX_NAME, - FTS_DOC_ID_INDEX_NAME_LEN); - index_name[FTS_DOC_ID_INDEX_NAME_LEN] = 0; - - /* Only the Doc ID will be indexed */ - fts_index_def.n_fields = 1; - fts_index_def.ind_type = DICT_UNIQUE; - fts_index_def.fields = (merge_index_field_t*) mem_heap_alloc( - heap, sizeof *fts_index_def.fields); - fts_index_def.fields[0].prefix_len = 0; - fts_index_def.fields[0].field_name = mem_heap_strdup( - heap, FTS_DOC_ID_COL_NAME); - - index = row_merge_create_index(trx, indexed_table, &fts_index_def); - return(index); + if (table->drop_aborted) { + trx_t* trx = innobase_trx_allocate(user_thd); + + trx_start_for_ddl(trx, TRX_DICT_OP_INDEX); + + row_mysql_lock_data_dictionary(trx); + online_retry_drop_indexes_low(table, trx); + 
trx_commit_for_mysql(trx); + row_mysql_unlock_data_dictionary(trx); + trx_free_for_mysql(trx); + } + +#ifdef UNIV_DEBUG + mutex_enter(&dict_sys->mutex); + dict_table_check_for_dup_indexes(table, CHECK_ALL_COMPLETE); + mutex_exit(&dict_sys->mutex); + ut_a(!table->drop_aborted); +#endif /* UNIV_DEBUG */ } -/*******************************************************************//** -Clean up on ha_innobase::add_index error. */ -static +/********************************************************************//** +Commit a dictionary transaction and drop any indexes that we were not +able to free previously due to open table handles. */ +static __attribute__((nonnull)) void -innobase_add_index_cleanup( -/*=======================*/ - row_prebuilt_t* prebuilt, /*!< in/out: prebuilt */ - trx_t* trx, /*!< in/out: transaction */ - dict_table_t* table) /*!< in/out: table on which - the indexes were going to be - created */ +online_retry_drop_indexes_with_trx( +/*===============================*/ + dict_table_t* table, /*!< in/out: table */ + trx_t* trx) /*!< in/out: transaction */ { - trx_rollback_to_savepoint(trx, NULL); + ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED)); + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); - ut_a(trx != prebuilt->trx); + /* Now that the dictionary is being locked, check if we can + drop any incompletely created indexes that may have been left + behind in rollback_inplace_alter_table() earlier. */ + if (table->drop_aborted) { - trx_free_for_mysql(trx); + trx->table_id = 0; - trx_commit_for_mysql(prebuilt->trx); + trx_start_for_ddl(trx, TRX_DICT_OP_INDEX); - if (table != NULL) { + online_retry_drop_indexes_low(table, trx); + trx_commit_for_mysql(trx); + } +} - rw_lock_x_lock(&dict_operation_lock); +/** Determines if InnoDB is dropping a foreign key constraint. 
+@param foreign the constraint +@param drop_fk constraints being dropped +@param n_drop_fk number of constraints that are being dropped +@return whether the constraint is being dropped */ +inline __attribute__((pure, nonnull, warn_unused_result)) +bool +innobase_dropping_foreign( +/*======================*/ + const dict_foreign_t* foreign, + dict_foreign_t** drop_fk, + ulint n_drop_fk) +{ + while (n_drop_fk--) { + if (*drop_fk++ == foreign) { + return(true); + } + } - dict_mutex_enter_for_mysql(); + return(false); +} - /* Note: This check excludes the system tables. However, we - should be safe because users cannot add indexes to system - tables. */ +/** Determines if an InnoDB FOREIGN KEY constraint depends on a +column that is being dropped or modified to NOT NULL. +@param user_table InnoDB table as it is before the ALTER operation +@param col_name Name of the column being altered +@param drop_fk constraints being dropped +@param n_drop_fk number of constraints that are being dropped +@param drop true=drop column, false=set NOT NULL +@retval true Not allowed (will call my_error()) +@retval false Allowed +*/ +static __attribute__((pure, nonnull, warn_unused_result)) +bool +innobase_check_foreigns_low( +/*========================*/ + const dict_table_t* user_table, + dict_foreign_t** drop_fk, + ulint n_drop_fk, + const char* col_name, + bool drop) +{ + ut_ad(mutex_own(&dict_sys->mutex)); + + /* Check if any FOREIGN KEY constraints are defined on this + column. 
*/ + for (const dict_foreign_t* foreign = UT_LIST_GET_FIRST( + user_table->foreign_list); + foreign; + foreign = UT_LIST_GET_NEXT(foreign_list, foreign)) { + if (!drop && !(foreign->type + & (DICT_FOREIGN_ON_DELETE_SET_NULL + | DICT_FOREIGN_ON_UPDATE_SET_NULL))) { + continue; + } - if (UT_LIST_GET_LEN(table->foreign_list) == 0 - && UT_LIST_GET_LEN(table->referenced_list) == 0 - && !table->can_be_evicted) { + if (innobase_dropping_foreign(foreign, drop_fk, n_drop_fk)) { + continue; + } - dict_table_move_from_non_lru_to_lru(table); + for (unsigned f = 0; f < foreign->n_fields; f++) { + if (!strcmp(foreign->foreign_col_names[f], + col_name)) { + my_error(drop + ? ER_FK_COLUMN_CANNOT_DROP + : ER_FK_COLUMN_NOT_NULL, MYF(0), + col_name, foreign->id); + return(true); + } } + } + + if (!drop) { + /* SET NULL clauses on foreign key constraints of + child tables affect the child tables, not the parent table. + The column can be NOT NULL in the parent table. */ + return(false); + } - dict_table_close(table, TRUE); + /* Check if any FOREIGN KEY constraints in other tables are + referring to the column that is being dropped. 
*/ + for (const dict_foreign_t* foreign = UT_LIST_GET_FIRST( + user_table->referenced_list); + foreign; + foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) { + if (innobase_dropping_foreign(foreign, drop_fk, n_drop_fk)) { + continue; + } - dict_mutex_exit_for_mysql(); + for (unsigned f = 0; f < foreign->n_fields; f++) { + char display_name[FN_REFLEN]; - rw_lock_x_unlock(&dict_operation_lock); + if (strcmp(foreign->referenced_col_names[f], + col_name)) { + continue; + } + + char* buf_end = innobase_convert_name( + display_name, (sizeof display_name) - 1, + foreign->foreign_table_name, + strlen(foreign->foreign_table_name), + NULL, TRUE); + *buf_end = '\0'; + my_error(ER_FK_COLUMN_CANNOT_DROP_CHILD, + MYF(0), col_name, foreign->id, + display_name); + + return(true); + } } + + return(false); } -/*******************************************************************//** -Create indexes. -@return 0 or error number */ -UNIV_INTERN -int -ha_innobase::add_index( -/*===================*/ - TABLE* in_table, /*!< in: Table where indexes - are created */ - KEY* key_info, /*!< in: Indexes - to be created */ - uint num_of_keys, /*!< in: Number of indexes - to be created */ - handler_add_index** add) /*!< out: context */ +/** Determines if an InnoDB FOREIGN KEY constraint depends on a +column that is being dropped or modified to NOT NULL. 
+@param ha_alter_info Data used during in-place alter +@param altered_table MySQL table that is being altered +@param old_table MySQL table as it is before the ALTER operation +@param user_table InnoDB table as it is before the ALTER operation +@param drop_fk constraints being dropped +@param n_drop_fk number of constraints that are being dropped +@retval true Not allowed (will call my_error()) +@retval false Allowed +*/ +static __attribute__((pure, nonnull, warn_unused_result)) +bool +innobase_check_foreigns( +/*====================*/ + Alter_inplace_info* ha_alter_info, + const TABLE* altered_table, + const TABLE* old_table, + const dict_table_t* user_table, + dict_foreign_t** drop_fk, + ulint n_drop_fk) { - dict_index_t** index = NULL; /*!< Index to be created */ - dict_index_t* fts_index = NULL;/*!< FTS Index to be created */ - dict_table_t* indexed_table; /*!< Table where indexes are created */ - merge_index_def_t* index_defs; /*!< Index definitions */ - mem_heap_t* heap = NULL; /*!< Heap for index definitions */ - trx_t* trx; /*!< Transaction */ - ulint num_of_idx; - ulint num_created = 0; - ibool dict_locked = FALSE; - ulint new_primary = 0; - int error; - ulint num_fts_index = 0; - ulint num_idx_create = 0; - ibool fts_add_doc_id = FALSE; - ibool fts_add_doc_idx = FALSE; + List_iterator_fast<Create_field> cf_it( + ha_alter_info->alter_info->create_list); - DBUG_ENTER("ha_innobase::add_index"); - ut_a(table); - ut_a(key_info); - ut_a(num_of_keys); + for (Field** fp = old_table->field; *fp; fp++) { + cf_it.rewind(); + const Create_field* new_field; - *add = NULL; + ut_ad(!(*fp)->real_maybe_null() + == !!((*fp)->flags & NOT_NULL_FLAG)); - if (srv_created_new_raw || srv_force_recovery) { - DBUG_RETURN(HA_ERR_WRONG_COMMAND); + while ((new_field = cf_it++)) { + if (new_field->field == *fp) { + break; + } + } + + if (!new_field || (new_field->flags & NOT_NULL_FLAG)) { + if (innobase_check_foreigns_low( + user_table, drop_fk, n_drop_fk, + (*fp)->field_name, 
!new_field)) { + return(true); + } + } } - update_thd(); + return(false); +} - /* In case MySQL calls this in the middle of a SELECT query, release - possible adaptive hash latch to avoid deadlocks of threads. */ - trx_search_latch_release_if_reserved(prebuilt->trx); +/** Convert a default value for ADD COLUMN. - /* Check if the index name is reserved. */ - if (innobase_index_name_is_reserved(user_thd, key_info, num_of_keys)) { - DBUG_RETURN(-1); +@param heap Memory heap where allocated +@param dfield InnoDB data field to copy to +@param field MySQL value for the column +@param comp nonzero if in compact format */ +static __attribute__((nonnull)) +void +innobase_build_col_map_add( +/*=======================*/ + mem_heap_t* heap, + dfield_t* dfield, + const Field* field, + ulint comp) +{ + if (field->is_real_null()) { + dfield_set_null(dfield); + return; } - indexed_table = dict_table_open_on_name(prebuilt->table->name, FALSE); + ulint size = field->pack_length(); - if (UNIV_UNLIKELY(!indexed_table)) { - DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); - } + byte* buf = static_cast<byte*>(mem_heap_alloc(heap, size)); - ut_a(indexed_table == prebuilt->table); + row_mysql_store_col_in_innobase_format( + dfield, buf, TRUE, field->ptr, size, comp); +} - if (indexed_table->tablespace_discarded) { - DBUG_RETURN(-1); +/** Construct the translation table for reordering, dropping or +adding columns. 
+ +@param ha_alter_info Data used during in-place alter +@param altered_table MySQL table that is being altered +@param table MySQL table as it is before the ALTER operation +@param new_table InnoDB table corresponding to MySQL altered_table +@param old_table InnoDB table corresponding to MYSQL table +@param add_cols Default values for ADD COLUMN, or NULL if no ADD COLUMN +@param heap Memory heap where allocated +@return array of integers, mapping column numbers in the table +to column numbers in altered_table */ +static __attribute__((nonnull(1,2,3,4,5,7), warn_unused_result)) +const ulint* +innobase_build_col_map( +/*===================*/ + Alter_inplace_info* ha_alter_info, + const TABLE* altered_table, + const TABLE* table, + const dict_table_t* new_table, + const dict_table_t* old_table, + dtuple_t* add_cols, + mem_heap_t* heap) +{ + DBUG_ENTER("innobase_build_col_map"); + DBUG_ASSERT(altered_table != table); + DBUG_ASSERT(new_table != old_table); + DBUG_ASSERT(dict_table_get_n_cols(new_table) + >= altered_table->s->fields + DATA_N_SYS_COLS); + DBUG_ASSERT(dict_table_get_n_cols(old_table) + >= table->s->fields + DATA_N_SYS_COLS); + DBUG_ASSERT(!!add_cols == !!(ha_alter_info->handler_flags + & Alter_inplace_info::ADD_COLUMN)); + DBUG_ASSERT(!add_cols || dtuple_get_n_fields(add_cols) + == dict_table_get_n_cols(new_table)); + + ulint* col_map = static_cast<ulint*>( + mem_heap_alloc(heap, old_table->n_cols * sizeof *col_map)); + + List_iterator_fast<Create_field> cf_it( + ha_alter_info->alter_info->create_list); + uint i = 0; + + /* Any dropped columns will map to ULINT_UNDEFINED. 
*/ + for (uint old_i = 0; old_i + DATA_N_SYS_COLS < old_table->n_cols; + old_i++) { + col_map[old_i] = ULINT_UNDEFINED; } - /* Check that index keys are sensible */ - error = innobase_check_index_keys(key_info, num_of_keys, prebuilt->table); + while (const Create_field* new_field = cf_it++) { + for (uint old_i = 0; table->field[old_i]; old_i++) { + const Field* field = table->field[old_i]; + if (new_field->field == field) { + col_map[old_i] = i; + goto found_col; + } + } - if (UNIV_UNLIKELY(error)) { - dict_table_close(prebuilt->table, FALSE); - DBUG_RETURN(error); + innobase_build_col_map_add( + heap, dtuple_get_nth_field(add_cols, i), + altered_table->s->field[i], + dict_table_is_comp(new_table)); +found_col: + i++; } - /* Check each index's column length to make sure they do not - exceed limit */ - for (ulint i = 0; i < num_of_keys; i++) { - if (key_info[i].flags & HA_FULLTEXT) { - continue; + DBUG_ASSERT(i == altered_table->s->fields); + + i = table->s->fields; + + /* Add the InnoDB hidden FTS_DOC_ID column, if any. */ + if (i + DATA_N_SYS_COLS < old_table->n_cols) { + /* There should be exactly one extra field, + the FTS_DOC_ID. 
*/ + DBUG_ASSERT(DICT_TF2_FLAG_IS_SET(old_table, + DICT_TF2_FTS_HAS_DOC_ID)); + DBUG_ASSERT(i + DATA_N_SYS_COLS + 1 == old_table->n_cols); + DBUG_ASSERT(!strcmp(dict_table_get_col_name( + old_table, table->s->fields), + FTS_DOC_ID_COL_NAME)); + if (altered_table->s->fields + DATA_N_SYS_COLS + < new_table->n_cols) { + DBUG_ASSERT(DICT_TF2_FLAG_IS_SET( + new_table, + DICT_TF2_FTS_HAS_DOC_ID)); + DBUG_ASSERT(altered_table->s->fields + + DATA_N_SYS_COLS + 1 + == new_table->n_cols); + col_map[i] = altered_table->s->fields; + } else { + DBUG_ASSERT(!DICT_TF2_FLAG_IS_SET( + new_table, + DICT_TF2_FTS_HAS_DOC_ID)); + col_map[i] = ULINT_UNDEFINED; } - error = innobase_check_column_length(prebuilt->table, - &key_info[i]); + i++; + } else { + DBUG_ASSERT(!DICT_TF2_FLAG_IS_SET( + old_table, + DICT_TF2_FTS_HAS_DOC_ID)); + } + + for (; i < old_table->n_cols; i++) { + col_map[i] = i + new_table->n_cols - old_table->n_cols; + } + + DBUG_RETURN(col_map); +} + +/** Drop newly create FTS index related auxiliary table during +FIC create index process, before fts_add_index is called +@param table table that was being rebuilt online +@param trx transaction +@return DB_SUCCESS if successful, otherwise last error code +*/ +static +dberr_t +innobase_drop_fts_index_table( +/*==========================*/ + dict_table_t* table, + trx_t* trx) +{ + dberr_t ret_err = DB_SUCCESS; + + for (dict_index_t* index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + if (index->type & DICT_FTS) { + dberr_t err; + + err = fts_drop_index_tables(trx, index); - if (error) { - dict_table_close(prebuilt->table, FALSE); - DBUG_RETURN(error); + if (err != DB_SUCCESS) { + ret_err = err; + } } } - heap = mem_heap_create(1024); - trx_start_if_not_started(prebuilt->trx); + return(ret_err); +} + +/** Update internal structures with concurrent writes blocked, +while preparing ALTER TABLE. 
+ +@param ha_alter_info Data used during in-place alter +@param altered_table MySQL table that is being altered +@param old_table MySQL table as it is before the ALTER operation +@param user_table InnoDB table that is being altered +@param user_trx User transaction, for locking the table +@param table_name Table name in MySQL +@param flags Table and tablespace flags +@param flags2 Additional table flags +@param heap Memory heap, or NULL +@param drop_index Indexes to be dropped, or NULL +@param n_drop_index Number of indexes to drop +@param drop_foreign Foreign key constraints to be dropped, or NULL +@param n_drop_foreign Number of foreign key constraints to drop +@param fts_doc_id_col The column number of FTS_DOC_ID +@param add_autoinc_col The number of an added AUTO_INCREMENT column, + or ULINT_UNDEFINED if none was added +@param add_fts_doc_id Flag: add column FTS_DOC_ID? +@param add_fts_doc_id_idx Flag: add index (FTS_DOC_ID)? + +@retval true Failure +@retval false Success +*/ +static __attribute__((warn_unused_result, nonnull(1,2,3,4))) +bool +prepare_inplace_alter_table_dict( +/*=============================*/ + Alter_inplace_info* ha_alter_info, + const TABLE* altered_table, + const TABLE* old_table, + dict_table_t* user_table, + trx_t* user_trx, + const char* table_name, + ulint flags, + ulint flags2, + mem_heap_t* heap, + dict_index_t** drop_index, + ulint n_drop_index, + dict_foreign_t** drop_foreign, + ulint n_drop_foreign, + dict_foreign_t** add_foreign, + ulint n_add_foreign, + ulint fts_doc_id_col, + ulint add_autoinc_col, + ulonglong autoinc_col_max_value, + bool add_fts_doc_id, + bool add_fts_doc_id_idx) +{ + trx_t* trx; + bool dict_locked = false; + dict_index_t** add_index; /* indexes to be created */ + ulint* add_key_nums; /* MySQL key numbers */ + ulint n_add_index; + index_def_t* index_defs; /* index definitions */ + dict_index_t* fts_index = NULL; + dict_table_t* indexed_table = user_table; + ulint new_clustered = 0; + dberr_t error; + THD* 
user_thd = user_trx->mysql_thd; + const ulint* col_map = NULL; + dtuple_t* add_cols = NULL; + ulint num_fts_index; + + DBUG_ENTER("prepare_inplace_alter_table_dict"); + DBUG_ASSERT((add_autoinc_col != ULINT_UNDEFINED) + == (autoinc_col_max_value > 0)); + DBUG_ASSERT(!n_drop_index == !drop_index); + DBUG_ASSERT(!n_drop_foreign == !drop_foreign); + DBUG_ASSERT(!add_fts_doc_id || add_fts_doc_id_idx); + DBUG_ASSERT(!add_fts_doc_id_idx + || innobase_fulltext_exist(altered_table->s)); + + trx_start_if_not_started_xa(user_trx); /* Create a background transaction for the operations on the data dictionary tables. */ trx = innobase_trx_allocate(user_thd); - trx_start_if_not_started(trx); - /* We don't want this table to be evicted from the cache while we - are building an index on it. Another issue is that while we are - building the index this table could be referred to in a foreign - key relationship. In innobase_add_index_cleanup() we check for - that condition before moving it back to the LRU list. */ + trx_start_for_ddl(trx, TRX_DICT_OP_INDEX); - row_mysql_lock_data_dictionary(trx); - - if (prebuilt->table->can_be_evicted) { - dict_table_move_from_lru_to_non_lru(prebuilt->table); + if (!heap) { + heap = mem_heap_create(1024); } - row_mysql_unlock_data_dictionary(trx); - /* Create table containing all indexes to be built in this - alter table add index so that they are in the correct order + ALTER TABLE ADD INDEX so that they are in the correct order in the table. 
*/ - num_of_idx = num_of_keys; + n_add_index = ha_alter_info->index_add_count; - index_defs = innobase_create_key_def( - trx, prebuilt->table, heap, key_info, num_of_idx, - &num_fts_index, &fts_add_doc_id, &fts_add_doc_idx); + index_defs = innobase_create_key_defs( + heap, ha_alter_info, altered_table, n_add_index, + num_fts_index, row_table_got_default_clust_index(indexed_table), + fts_doc_id_col, add_fts_doc_id, add_fts_doc_id_idx); - if (!index_defs) { - error = DB_UNSUPPORTED; - goto error_handling; - } + new_clustered = DICT_CLUSTERED & index_defs[0].ind_type; + + const bool locked = + !ha_alter_info->online + || add_autoinc_col != ULINT_UNDEFINED + || num_fts_index > 0 + || (innobase_need_rebuild(ha_alter_info) + && innobase_fulltext_exist(altered_table->s)); - /* Currently, support create one single FULLTEXT index in parallel at - a time */ if (num_fts_index > 1) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Only support create ONE Fulltext index" - " at a time\n"); - error = DB_UNSUPPORTED; - goto error_handling; + my_error(ER_INNODB_FT_LIMIT, MYF(0)); + goto error_handled; } - new_primary = DICT_CLUSTERED & index_defs[0].ind_type; + if (locked && ha_alter_info->online) { + /* This should have been blocked in + check_if_supported_inplace_alter(). */ + ut_ad(0); + my_error(ER_NOT_SUPPORTED_YET, MYF(0), + thd_query_string(user_thd)->str); + goto error_handled; + } - /* If a new FTS Doc ID column is to be added, there will be - one additional index to be built on the Doc ID column itself. */ - num_idx_create = (fts_add_doc_idx) ? 
num_of_idx + 1 : num_of_idx; + /* The primary index would be rebuilt if a FTS Doc ID + column is to be added, and the primary index definition + is just copied from old table and stored in indexdefs[0] */ + DBUG_ASSERT(!add_fts_doc_id || new_clustered); + DBUG_ASSERT(!!new_clustered == + (innobase_need_rebuild(ha_alter_info) + || add_fts_doc_id)); /* Allocate memory for dictionary index definitions */ - index = (dict_index_t**) mem_heap_alloc( - heap, num_idx_create * sizeof *index); - /* Flag this transaction as a dictionary operation, so that - the data dictionary will be locked in crash recovery. */ - trx_set_dict_operation(trx, TRX_DICT_OP_INDEX); + add_index = (dict_index_t**) mem_heap_alloc( + heap, n_add_index * sizeof *add_index); + add_key_nums = (ulint*) mem_heap_alloc( + heap, n_add_index * sizeof *add_key_nums); + + /* This transaction should be dictionary operation, so that + the data dictionary will be locked during crash recovery. */ + + ut_ad(trx->dict_operation == TRX_DICT_OP_INDEX); /* Acquire a lock on the table before creating any indexes. */ - error = row_merge_lock_table(prebuilt->trx, prebuilt->table, - new_primary ? LOCK_X : LOCK_S); - if (UNIV_UNLIKELY(error != DB_SUCCESS)) { + if (locked) { + error = row_merge_lock_table( + user_trx, indexed_table, LOCK_S); - goto error_handling; + if (error != DB_SUCCESS) { + + goto error_handling; + } + } else { + error = DB_SUCCESS; } /* Latch the InnoDB data dictionary exclusively so that no deadlocks or lock waits can happen in it during an index create operation. */ row_mysql_lock_data_dictionary(trx); - dict_locked = TRUE; + dict_locked = true; - ut_d(dict_table_check_for_dup_indexes(prebuilt->table, TRUE)); + /* Wait for background stats processing to stop using the table that + we are going to alter. We know bg stats will not start using it again + until we are holding the data dict locked and we are holding it here + at least until checking ut_ad(user_table->n_ref_count == 1) below. 
+ XXX what may happen if bg stats opens the table after we + have unlocked data dictionary below? */ + dict_stats_wait_bg_to_stop_using_tables(user_table, NULL, trx); - /* If a new primary key is defined for the table we need + online_retry_drop_indexes_low(indexed_table, trx); + + ut_d(dict_table_check_for_dup_indexes( + indexed_table, CHECK_ABORTED_OK)); + + /* If a new clustered index is defined for the table we need to drop the original table and rebuild all indexes. */ - if (UNIV_UNLIKELY(new_primary)) { - /* This transaction should be the only one - operating on the table. The table get above - would have incremented the ref count to 2. */ - ut_a(prebuilt->table->n_ref_count == 2); + if (new_clustered) { + char* new_table_name = dict_mem_create_temporary_tablename( + heap, indexed_table->name, indexed_table->id); + ulint n_cols; - char* new_table_name = innobase_create_temporary_tablename( - heap, '1', prebuilt->table->name); + if (innobase_check_foreigns( + ha_alter_info, altered_table, old_table, + user_table, drop_foreign, n_drop_foreign)) { + goto new_clustered_failed; + } - /* Clone the table. */ + n_cols = altered_table->s->fields; + + if (add_fts_doc_id) { + n_cols++; + DBUG_ASSERT(flags2 & DICT_TF2_FTS); + DBUG_ASSERT(add_fts_doc_id_idx); + flags2 |= DICT_TF2_FTS_ADD_DOC_ID + | DICT_TF2_FTS_HAS_DOC_ID + | DICT_TF2_FTS; + } + + DBUG_ASSERT(!add_fts_doc_id_idx || (flags2 & DICT_TF2_FTS)); + + /* Create the table. 
*/ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); - indexed_table = row_merge_create_temporary_table( - new_table_name, index_defs, prebuilt->table, trx); - if (!indexed_table) { + if (dict_table_get_low(new_table_name)) { + my_error(ER_TABLE_EXISTS_ERROR, MYF(0), + new_table_name); + goto new_clustered_failed; + } - switch (trx->error_state) { - case DB_TABLESPACE_ALREADY_EXISTS: - case DB_DUPLICATE_KEY: - innobase_convert_tablename(new_table_name); - my_error(HA_ERR_TABLE_EXIST, MYF(0), - new_table_name); - error = HA_ERR_TABLE_EXIST; - break; - default: - error = convert_error_code_to_mysql( - trx->error_state, - prebuilt->table->flags, - user_thd); + /* The initial space id 0 may be overridden later. */ + indexed_table = dict_mem_table_create( + new_table_name, 0, n_cols, flags, flags2); + + if (DICT_TF_HAS_DATA_DIR(flags)) { + indexed_table->data_dir_path = + mem_heap_strdup(indexed_table->heap, + user_table->data_dir_path); + } + + for (uint i = 0; i < altered_table->s->fields; i++) { + const Field* field = altered_table->field[i]; + ulint is_unsigned; + ulint field_type + = (ulint) field->type(); + ulint col_type + = get_innobase_type_from_mysql_type( + &is_unsigned, field); + ulint charset_no; + ulint col_len; + + /* we assume in dtype_form_prtype() that this + fits in two bytes */ + ut_a(field_type <= MAX_CHAR_COLL_NUM); + + if (!field->real_maybe_null()) { + field_type |= DATA_NOT_NULL; + } + + if (field->binary()) { + field_type |= DATA_BINARY_TYPE; + } + + if (is_unsigned) { + field_type |= DATA_UNSIGNED; } - ut_d(dict_table_check_for_dup_indexes(prebuilt->table, - TRUE)); - row_mysql_unlock_data_dictionary(trx); - mem_heap_free(heap); + if (dtype_is_string_type(col_type)) { + charset_no = (ulint) field->charset()->number; - innobase_add_index_cleanup( - prebuilt, trx, prebuilt->table); + if (charset_no > MAX_CHAR_COLL_NUM) { + dict_mem_table_free(indexed_table); + my_error(ER_WRONG_KEY_COLUMN, MYF(0), + field->field_name); + goto new_clustered_failed; 
+ } + } else { + charset_no = 0; + } + + col_len = field->pack_length(); + + /* The MySQL pack length contains 1 or 2 bytes + length field for a true VARCHAR. Let us + subtract that, so that the InnoDB column + length in the InnoDB data dictionary is the + real maximum byte length of the actual data. */ + + if (field->type() == MYSQL_TYPE_VARCHAR) { + uint32 length_bytes + = static_cast<const Field_varstring*>( + field)->length_bytes; + + col_len -= length_bytes; + + if (length_bytes == 2) { + field_type |= DATA_LONG_TRUE_VARCHAR; + } + } - DBUG_RETURN(error); + if (dict_col_name_is_reserved(field->field_name)) { + dict_mem_table_free(indexed_table); + my_error(ER_WRONG_COLUMN_NAME, MYF(0), + field->field_name); + goto new_clustered_failed; + } + + dict_mem_table_add_col( + indexed_table, heap, + field->field_name, + col_type, + dtype_form_prtype(field_type, charset_no), + col_len); + } + + if (add_fts_doc_id) { + fts_add_doc_id_column(indexed_table, heap); + indexed_table->fts->doc_col = fts_doc_id_col; + ut_ad(fts_doc_id_col == altered_table->s->fields); + } else if (indexed_table->fts) { + indexed_table->fts->doc_col = fts_doc_id_col; } - trx->table_id = indexed_table->id; + error = row_create_table_for_mysql(indexed_table, trx, false); + + switch (error) { + dict_table_t* temp_table; + case DB_SUCCESS: + /* We need to bump up the table ref count and + before we can use it we need to open the + table. The new_table must be in the data + dictionary cache, because we are still holding + the dict_sys->mutex. */ + ut_ad(mutex_own(&dict_sys->mutex)); + temp_table = dict_table_open_on_name( + indexed_table->name, TRUE, FALSE, + DICT_ERR_IGNORE_NONE); + ut_a(indexed_table == temp_table); + /* n_ref_count must be 1, because purge cannot + be executing on this very table as we are + holding dict_operation_lock X-latch. 
*/ + DBUG_ASSERT(indexed_table->n_ref_count == 1); + break; + case DB_TABLESPACE_EXISTS: + my_error(ER_TABLESPACE_EXISTS, MYF(0), + new_table_name); + goto new_clustered_failed; + case DB_DUPLICATE_KEY: + my_error(HA_ERR_TABLE_EXIST, MYF(0), + altered_table->s->table_name.str); + goto new_clustered_failed; + default: + my_error_innodb(error, table_name, flags); + new_clustered_failed: + DBUG_ASSERT(trx != user_trx); + trx_rollback_to_savepoint(trx, NULL); + + ut_ad(user_table->n_ref_count == 1); + + online_retry_drop_indexes_with_trx(user_table, trx); + + goto err_exit; + } + + if (ha_alter_info->handler_flags + & Alter_inplace_info::ADD_COLUMN) { + + add_cols = dtuple_create( + heap, dict_table_get_n_cols(indexed_table)); + + dict_table_copy_types(add_cols, indexed_table); + } + + col_map = innobase_build_col_map( + ha_alter_info, altered_table, old_table, + indexed_table, user_table, + add_cols, heap); + } else { + DBUG_ASSERT(!innobase_need_rebuild(ha_alter_info)); + + if (!indexed_table->fts + && innobase_fulltext_exist(altered_table->s)) { + indexed_table->fts = fts_create(indexed_table); + indexed_table->fts->doc_col = fts_doc_id_col; + } } + /* Assign table_id, so that no table id of + fts_create_index_tables() will be written to the undo logs. */ + DBUG_ASSERT(indexed_table->id != 0); + trx->table_id = indexed_table->id; + /* Create the indexes in SYS_INDEXES and load into dictionary. 
*/ - for (num_created = 0; num_created < num_of_idx; num_created++) { + for (ulint num_created = 0; num_created < n_add_index; num_created++) { - index[num_created] = row_merge_create_index( + add_index[num_created] = row_merge_create_index( trx, indexed_table, &index_defs[num_created]); - if (!index[num_created]) { + add_key_nums[num_created] = index_defs[num_created].key_number; + + if (!add_index[num_created]) { error = trx->error_state; + DBUG_ASSERT(error != DB_SUCCESS); goto error_handling; } - if (index[num_created]->type & DICT_FTS) { - fts_index = index[num_created]; - fts_create_index_tables(trx, fts_index); + if (add_index[num_created]->type & DICT_FTS) { + DBUG_ASSERT(num_fts_index); + DBUG_ASSERT(!fts_index); + DBUG_ASSERT(add_index[num_created]->type == DICT_FTS); + fts_index = add_index[num_created]; + } + /* If only online ALTER TABLE operations have been + requested, allocate a modification log. If the table + will be locked anyway, the modification + log is unnecessary. When rebuilding the table + (new_clustered), we will allocate the log for the + clustered index of the old table, later. */ + if (new_clustered + || locked + || user_table->ibd_file_missing + || dict_table_is_discarded(user_table)) { + /* No need to allocate a modification log. */ + ut_ad(!add_index[num_created]->online_log); + } else if (add_index[num_created]->type & DICT_FTS) { + /* Fulltext indexes are not covered + by a modification log. 
*/ + } else { + DBUG_EXECUTE_IF("innodb_OOM_prepare_inplace_alter", + error = DB_OUT_OF_MEMORY; + goto error_handling;); + rw_lock_x_lock(&add_index[num_created]->lock); + bool ok = row_log_allocate(add_index[num_created], + NULL, true, NULL, NULL); + rw_lock_x_unlock(&add_index[num_created]->lock); + + if (!ok) { + error = DB_OUT_OF_MEMORY; + goto error_handling; + } } } - /* create FTS_DOC_ID_INDEX on the Doc ID column on the table */ - if (fts_add_doc_idx) { - index[num_of_idx] = innobase_create_fts_doc_id_idx( - indexed_table, trx, heap); - /* FTS_DOC_ID_INDEX is internal defined new index */ - num_of_idx++; - num_created++; + ut_ad(new_clustered == (indexed_table != user_table)); + + DBUG_EXECUTE_IF("innodb_OOM_prepare_inplace_alter", + error = DB_OUT_OF_MEMORY; + goto error_handling;); + + if (new_clustered && !locked) { + /* Allocate a log for online table rebuild. */ + dict_index_t* clust_index = dict_table_get_first_index( + user_table); + + rw_lock_x_lock(&clust_index->lock); + bool ok = row_log_allocate( + clust_index, indexed_table, + !(ha_alter_info->handler_flags + & Alter_inplace_info::ADD_PK_INDEX), + add_cols, col_map); + rw_lock_x_unlock(&clust_index->lock); + + if (!ok) { + error = DB_OUT_OF_MEMORY; + goto error_handling; + } + + /* Assign a consistent read view for + row_merge_read_clustered_index(). */ + trx_assign_read_view(user_trx); } - if (num_fts_index) { + if (fts_index) { + /* Ensure that the dictionary operation mode will + not change while creating the auxiliary tables. 
*/ + trx_dict_op_t op = trx_get_dict_operation(trx); + +#ifdef UNIV_DEBUG + switch (op) { + case TRX_DICT_OP_NONE: + break; + case TRX_DICT_OP_TABLE: + case TRX_DICT_OP_INDEX: + goto op_ok; + } + ut_error; +op_ok: +#endif /* UNIV_DEBUG */ + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(mutex_own(&dict_sys->mutex)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + DICT_TF2_FLAG_SET(indexed_table, DICT_TF2_FTS); + /* This function will commit the transaction and reset + the trx_t::dict_operation flag on success. */ + + error = fts_create_index_tables(trx, fts_index); + + DBUG_EXECUTE_IF("innodb_test_fail_after_fts_index_table", + error = DB_LOCK_WAIT_TIMEOUT; + goto error_handling;); + + if (error != DB_SUCCESS) { + goto error_handling; + } + + trx_start_for_ddl(trx, op); + if (!indexed_table->fts || ib_vector_size(indexed_table->fts->indexes) == 0) { - fts_create_common_tables(trx, indexed_table, - prebuilt->table->name, TRUE); + error = fts_create_common_tables( + trx, indexed_table, user_table->name, TRUE); + + DBUG_EXECUTE_IF("innodb_test_fail_after_fts_common_table", + error = DB_LOCK_WAIT_TIMEOUT; + goto error_handling;); + + if (error != DB_SUCCESS) { + goto error_handling; + } indexed_table->fts->fts_status |= TABLE_DICT_LOCKED; - innobase_fts_load_stopword( - indexed_table, trx, ha_thd()); + + error = innobase_fts_load_stopword( + indexed_table, trx, user_thd) + ? DB_SUCCESS : DB_ERROR; indexed_table->fts->fts_status &= ~TABLE_DICT_LOCKED; - } - if (new_primary && prebuilt->table->fts) { - indexed_table->fts->doc_col = prebuilt->table->fts->doc_col; + if (error != DB_SUCCESS) { + goto error_handling; + } } + + ut_ad(trx_get_dict_operation(trx) == op); } - ut_ad(error == DB_SUCCESS); + DBUG_ASSERT(error == DB_SUCCESS); /* Commit the data dictionary transaction in order to release the table locks on the system tables. 
This means that if @@ -1278,633 +3000,2212 @@ ha_innobase::add_index( trx_commit_for_mysql(trx); row_mysql_unlock_data_dictionary(trx); - dict_locked = FALSE; + dict_locked = false; ut_a(trx->lock.n_active_thrs == 0); - if (UNIV_UNLIKELY(new_primary)) { - /* A primary key is to be built. Acquire an exclusive - table lock also on the table that is being created. */ - ut_ad(indexed_table != prebuilt->table); - - error = row_merge_lock_table(prebuilt->trx, indexed_table, - LOCK_X); - - if (UNIV_UNLIKELY(error != DB_SUCCESS)) { +error_handling: + /* After an error, remove all those index definitions from the + dictionary which were defined. */ - goto error_handling; - } + switch (error) { + case DB_SUCCESS: + ut_a(!dict_locked); + + ut_d(mutex_enter(&dict_sys->mutex)); + ut_d(dict_table_check_for_dup_indexes( + user_table, CHECK_PARTIAL_OK)); + ut_d(mutex_exit(&dict_sys->mutex)); + ha_alter_info->handler_ctx = new ha_innobase_inplace_ctx( + user_trx, add_index, add_key_nums, n_add_index, + drop_index, n_drop_index, + drop_foreign, n_drop_foreign, + add_foreign, n_add_foreign, + !locked, heap, trx, indexed_table, col_map, + add_autoinc_col, + ha_alter_info->create_info->auto_increment_value, + autoinc_col_max_value, + add_cols); + DBUG_RETURN(false); + case DB_TABLESPACE_EXISTS: + my_error(ER_TABLESPACE_EXISTS, MYF(0), "(unknown)"); + break; + case DB_DUPLICATE_KEY: + my_error(ER_DUP_KEY, MYF(0), "SYS_INDEXES"); + break; + default: + my_error_innodb(error, table_name, user_table->flags); } - /* Read the clustered index of the table and build indexes - based on this information using temporary files and merge sort. */ - error = row_merge_build_indexes(prebuilt->trx, - prebuilt->table, indexed_table, - index, num_of_idx, table); +error_handled: -error_handling: - - /* After an error, remove all those index definitions from the - dictionary which were defined. 
*/ + user_trx->error_info = NULL; + trx->error_state = DB_SUCCESS; if (!dict_locked) { row_mysql_lock_data_dictionary(trx); - dict_locked = TRUE; } - switch (error) { - case DB_SUCCESS: - ut_d(dict_table_check_for_dup_indexes(prebuilt->table, TRUE)); + if (new_clustered) { + if (indexed_table != user_table) { - *add = new ha_innobase_add_index( - table, key_info, num_of_keys, indexed_table); + if (DICT_TF2_FLAG_IS_SET(indexed_table, DICT_TF2_FTS)) { + innobase_drop_fts_index_table( + indexed_table, trx); + } - dict_table_close(prebuilt->table, dict_locked); - break; + dict_table_close(indexed_table, TRUE, FALSE); - case DB_TOO_BIG_RECORD: - my_error(HA_ERR_TO_BIG_ROW, MYF(0)); - goto error_exit; - case DB_PRIMARY_KEY_IS_NULL: - my_error(ER_PRIMARY_CANT_HAVE_NULL, MYF(0)); - /* fall through */ - case DB_DUPLICATE_KEY: - if (fts_add_doc_idx - && prebuilt->trx->error_key_num == num_of_idx - 1) { - prebuilt->trx->error_key_num = ULINT_UNDEFINED; - } -error_exit: - prebuilt->trx->error_info = NULL; - /* fall through */ - default: - dict_table_close(prebuilt->table, dict_locked); +#ifdef UNIV_DDL_DEBUG + /* Nobody should have initialized the stats of the + newly created table yet. When this is the case, we + know that it has not been added for background stats + gathering. */ + ut_a(!indexed_table->stat_initialized); +#endif /* UNIV_DDL_DEBUG */ - trx->error_state = DB_SUCCESS; + row_merge_drop_table(trx, indexed_table); - if (new_primary) { - if (indexed_table != prebuilt->table) { - dict_table_close(indexed_table, dict_locked); - row_merge_drop_table(trx, indexed_table); + /* Free the log for online table rebuild, if + one was allocated. 
*/ + + dict_index_t* clust_index = dict_table_get_first_index( + user_table); + + rw_lock_x_lock(&clust_index->lock); + + if (clust_index->online_log) { + ut_ad(!locked); + row_log_abort_sec(clust_index); + clust_index->online_status + = ONLINE_INDEX_COMPLETE; } - } else { - row_merge_drop_indexes(trx, indexed_table, - index, num_created); + + rw_lock_x_unlock(&clust_index->lock); } + + trx_commit_for_mysql(trx); + /* n_ref_count must be 1, because purge cannot + be executing on this very table as we are + holding dict_operation_lock X-latch. */ + DBUG_ASSERT(user_table->n_ref_count == 1 || !locked); + + online_retry_drop_indexes_with_trx(user_table, trx); + } else { + ut_ad(indexed_table == user_table); + row_merge_drop_indexes(trx, user_table, TRUE); + trx_commit_for_mysql(trx); + } + + ut_d(dict_table_check_for_dup_indexes(user_table, CHECK_ALL_COMPLETE)); + ut_ad(!user_table->drop_aborted); + +err_exit: + /* Clear the to_be_dropped flag in the data dictionary cache. */ + for (ulint i = 0; i < n_drop_index; i++) { + DBUG_ASSERT(*drop_index[i]->name != TEMP_INDEX_PREFIX); + DBUG_ASSERT(drop_index[i]->to_be_dropped); + drop_index[i]->to_be_dropped = 0; } - ut_ad(!new_primary || prebuilt->table->n_ref_count == 1); - trx_commit_for_mysql(trx); - ut_ad(dict_locked); row_mysql_unlock_data_dictionary(trx); + trx_free_for_mysql(trx); mem_heap_free(heap); - if (prebuilt->trx) { - trx_commit_for_mysql(prebuilt->trx); - } + trx_commit_for_mysql(user_trx); /* There might be work for utility threads.*/ srv_active_wake_master_thread(); - DBUG_RETURN(convert_error_code_to_mysql(error, prebuilt->table->flags, - user_thd)); + DBUG_RETURN(true); } -/*******************************************************************//** -Finalize or undo add_index(). -@return 0 or error number */ +/* Check whether an index is needed for the foreign key constraint. +If so, if it is dropped, is there an equivalent index can play its role. 
+@return true if the index is needed and can't be dropped */ +static __attribute__((warn_unused_result)) +bool +innobase_check_foreign_key_index( +/*=============================*/ + Alter_inplace_info* ha_alter_info, /*!< in: Structure describing + changes to be done by ALTER + TABLE */ + dict_index_t* index, /*!< in: index to check */ + dict_table_t* indexed_table, /*!< in: table that owns the + foreign keys */ + trx_t* trx, /*!< in/out: transaction */ + dict_foreign_t** drop_fk, /*!< in: Foreign key constraints + to drop */ + ulint n_drop_fk) /*!< in: Number of foreign keys + to drop */ +{ + dict_foreign_t* foreign; + + ut_ad(!index->to_be_dropped); + + /* Check if the index is referenced. */ + foreign = dict_table_get_referenced_constraint(indexed_table, index); + + ut_ad(!foreign || indexed_table + == foreign->referenced_table); + + if (foreign + && !dict_foreign_find_index( + indexed_table, + foreign->referenced_col_names, + foreign->n_fields, index, + /*check_charsets=*/TRUE, + /*check_null=*/FALSE) + && !innobase_find_equiv_index( + foreign->referenced_col_names, + foreign->n_fields, + ha_alter_info->key_info_buffer, + ha_alter_info->index_add_buffer, + ha_alter_info->index_add_count) + ) { + trx->error_info = index; + return(true); + } + + /* Check if this index references some + other table */ + foreign = dict_table_get_foreign_constraint( + indexed_table, index); + + ut_ad(!foreign || indexed_table + == foreign->foreign_table); + + if (foreign + && !innobase_dropping_foreign( + foreign, drop_fk, n_drop_fk) + && !dict_foreign_find_index( + indexed_table, + foreign->foreign_col_names, + foreign->n_fields, index, + /*check_charsets=*/TRUE, + /*check_null=*/FALSE) + && !innobase_find_equiv_index( + foreign->foreign_col_names, + foreign->n_fields, + ha_alter_info->key_info_buffer, + ha_alter_info->index_add_buffer, + ha_alter_info->index_add_count) + ) { + trx->error_info = index; + return(true); + } + + return(false); +} + +/** Allows InnoDB to update 
internal structures with concurrent +writes blocked (provided that check_if_supported_inplace_alter() +did not return HA_ALTER_INPLACE_NO_LOCK). +This will be invoked before inplace_alter_table(). + +@param altered_table TABLE object for new version of table. +@param ha_alter_info Structure describing changes to be done +by ALTER TABLE and holding data used during in-place alter. + +@retval true Failure +@retval false Success +*/ UNIV_INTERN -int -ha_innobase::final_add_index( -/*=========================*/ - handler_add_index* add_arg,/*!< in: context from add_index() */ - bool commit) /*!< in: true=commit, false=rollback */ +bool +ha_innobase::prepare_inplace_alter_table( +/*=====================================*/ + TABLE* altered_table, + Alter_inplace_info* ha_alter_info) { - ha_innobase_add_index* add; - trx_t* trx; - int err = 0; + dict_index_t** drop_index; /*!< Index to be dropped */ + ulint n_drop_index; /*!< Number of indexes to drop */ + dict_foreign_t**drop_fk; /*!< Foreign key constraints to drop */ + ulint n_drop_fk; /*!< Number of foreign keys to drop */ + dict_foreign_t**add_fk = NULL; /*!< Foreign key constraints to drop */ + ulint n_add_fk; /*!< Number of foreign keys to drop */ + dict_table_t* indexed_table; /*!< Table where indexes are created */ + mem_heap_t* heap; + int error; + ulint flags; + ulint flags2; + ulint max_col_len; + ulint add_autoinc_col_no = ULINT_UNDEFINED; + ulonglong autoinc_col_max_value = 0; + ulint fts_doc_col_no = ULINT_UNDEFINED; + bool add_fts_doc_id = false; + bool add_fts_doc_id_idx = false; + + DBUG_ENTER("prepare_inplace_alter_table"); + DBUG_ASSERT(!ha_alter_info->handler_ctx); + DBUG_ASSERT(ha_alter_info->create_info); + + if (srv_read_only_mode) { + DBUG_RETURN(false); + } - DBUG_ENTER("ha_innobase::final_add_index"); + MONITOR_ATOMIC_INC(MONITOR_PENDING_ALTER_TABLE); - ut_ad(add_arg); - add = static_cast<class ha_innobase_add_index*>(add_arg); +#ifdef UNIV_DEBUG + for (dict_index_t* index = 
dict_table_get_first_index(prebuilt->table); + index; + index = dict_table_get_next_index(index)) { + ut_ad(!index->to_be_dropped); + } +#endif /* UNIV_DEBUG */ - /* Create a background transaction for the operations on - the data dictionary tables. */ - trx = innobase_trx_allocate(user_thd); - trx_start_if_not_started(trx); + ut_d(mutex_enter(&dict_sys->mutex)); + ut_d(dict_table_check_for_dup_indexes( + prebuilt->table, CHECK_ABORTED_OK)); + ut_d(mutex_exit(&dict_sys->mutex)); - /* Flag this transaction as a dictionary operation, so that - the data dictionary will be locked in crash recovery. */ - trx_set_dict_operation(trx, TRX_DICT_OP_INDEX); + if (!(ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)) { + /* Nothing to do */ + goto func_exit; + } - /* Latch the InnoDB data dictionary exclusively so that no deadlocks - or lock waits can happen in it during an index create operation. */ - row_mysql_lock_data_dictionary(trx); + if (ha_alter_info->handler_flags + == Alter_inplace_info::CHANGE_CREATE_OPTION + && !innobase_need_rebuild(ha_alter_info)) { + goto func_exit; + } - if (add->indexed_table != prebuilt->table) { - ulint error; + if (ha_alter_info->handler_flags + & Alter_inplace_info::CHANGE_CREATE_OPTION) { + if (const char* invalid_opt = create_options_are_invalid( + user_thd, altered_table, + ha_alter_info->create_info, + prebuilt->table->space != 0)) { + my_error(ER_ILLEGAL_HA_CREATE_OPTION, MYF(0), + table_type(), invalid_opt); + goto err_exit_no_heap; + } + } - /* We copied the table (new_primary). */ - if (commit) { - mem_heap_t* heap; - char* tmp_name; + /* Check if any index name is reserved. 
*/ + if (innobase_index_name_is_reserved( + user_thd, + ha_alter_info->key_info_buffer, + ha_alter_info->key_count)) { +err_exit_no_heap: + DBUG_ASSERT(prebuilt->trx->dict_operation_lock_mode == 0); + if (ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE) { + online_retry_drop_indexes(prebuilt->table, user_thd); + } + DBUG_RETURN(true); + } - heap = mem_heap_create(1024); + indexed_table = prebuilt->table; - /* A new primary key was defined for the table - and there was no error at this point. We can - now rename the old table as a temporary table, - rename the new temporary table as the old - table and drop the old table. */ - tmp_name = innobase_create_temporary_tablename( - heap, '2', prebuilt->table->name); + /* Check that index keys are sensible */ + error = innobase_check_index_keys(ha_alter_info, indexed_table); - error = row_merge_rename_tables( - prebuilt->table, add->indexed_table, - tmp_name, trx); + if (error) { + goto err_exit_no_heap; + } - ut_a(prebuilt->table->n_ref_count == 1); + /* Prohibit renaming a column to something that the table + already contains. 
*/ + if (ha_alter_info->handler_flags + & Alter_inplace_info::ALTER_COLUMN_NAME) { + List_iterator_fast<Create_field> cf_it( + ha_alter_info->alter_info->create_list); - switch (error) { - case DB_TABLESPACE_ALREADY_EXISTS: - case DB_DUPLICATE_KEY: - ut_a(add->indexed_table->n_ref_count == 0); - innobase_convert_tablename(tmp_name); - my_error(HA_ERR_TABLE_EXIST, MYF(0), tmp_name); - err = HA_ERR_TABLE_EXIST; - break; - default: - err = convert_error_code_to_mysql( - error, prebuilt->table->flags, - user_thd); - break; + for (Field** fp = table->field; *fp; fp++) { + if (!((*fp)->flags & FIELD_IS_RENAMED)) { + continue; } - mem_heap_free(heap); + const char* name = 0; + + cf_it.rewind(); + while (Create_field* cf = cf_it++) { + if (cf->field == *fp) { + name = cf->field_name; + goto check_if_ok_to_rename; + } + } + + ut_error; +check_if_ok_to_rename: + /* Prohibit renaming a column from FTS_DOC_ID + if full-text indexes exist. */ + if (!my_strcasecmp(system_charset_info, + (*fp)->field_name, + FTS_DOC_ID_COL_NAME) + && innobase_fulltext_exist(altered_table->s)) { + my_error(ER_INNODB_FT_WRONG_DOCID_COLUMN, + MYF(0), name); + goto err_exit_no_heap; + } + + /* Prohibit renaming a column to an internal column. */ + const char* s = prebuilt->table->col_names; + unsigned j; + /* Skip user columns. + MySQL should have checked these already. + We want to allow renaming of c1 to c2, c2 to c1. 
*/ + for (j = 0; j < table->s->fields; j++) { + s += strlen(s) + 1; + } + + for (; j < prebuilt->table->n_def; j++) { + if (!my_strcasecmp( + system_charset_info, name, s)) { + my_error(ER_WRONG_COLUMN_NAME, MYF(0), + s); + goto err_exit_no_heap; + } + + s += strlen(s) + 1; + } } + } - if (!commit || err) { - dict_table_close(add->indexed_table, TRUE); - error = row_merge_drop_table(trx, add->indexed_table); - trx_commit_for_mysql(prebuilt->trx); - } else { - dict_table_t* old_table = prebuilt->table; - trx_commit_for_mysql(prebuilt->trx); - row_prebuilt_free(prebuilt, TRUE); - error = row_merge_drop_table(trx, old_table); - prebuilt = row_create_prebuilt(add->indexed_table, - 0 /* XXX Do we know the mysql_row_len here? - Before the addition of this parameter to - row_create_prebuilt() the mysql_row_len - member was left 0 (from zalloc) in the - prebuilt object. */); + if (!innobase_table_flags(altered_table, + ha_alter_info->create_info, + user_thd, + srv_file_per_table + || indexed_table->space != 0, + &flags, &flags2)) { + goto err_exit_no_heap; + } + + max_col_len = DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags); + + /* Check each index's column length to make sure they do not + exceed limit */ + for (ulint i = 0; i < ha_alter_info->index_add_count; i++) { + const KEY* key = &ha_alter_info->key_info_buffer[ + ha_alter_info->index_add_buffer[i]]; + + if (key->flags & HA_FULLTEXT) { + /* The column length does not matter for + fulltext search indexes. But, UNIQUE + fulltext indexes are not supported. 
*/ + DBUG_ASSERT(!(key->flags & HA_NOSAME)); + DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK + & ~(HA_FULLTEXT + | HA_PACK_KEY + | HA_BINARY_PACK_KEY))); + continue; } - err = convert_error_code_to_mysql( - error, prebuilt->table->flags, user_thd); + if (innobase_check_column_length(max_col_len, key)) { + my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0), + max_col_len); + goto err_exit_no_heap; + } } - if (add->indexed_table == prebuilt->table - || DICT_TF2_FLAG_IS_SET(prebuilt->table, DICT_TF2_FTS_ADD_DOC_ID)) { - /* We created secondary indexes (!new_primary) or create full - text index and added a new Doc ID column, we will need to - rename the secondary index on the Doc ID column to its - official index name.. */ + /* Check existing index definitions for too-long column + prefixes as well, in case max_col_len shrunk. */ + for (const dict_index_t* index + = dict_table_get_first_index(indexed_table); + index; + index = dict_table_get_next_index(index)) { + if (index->type & DICT_FTS) { + DBUG_ASSERT(index->type == DICT_FTS + || (index->type & DICT_CORRUPT)); + continue; + } - if (commit) { - err = convert_error_code_to_mysql( - row_merge_rename_indexes(trx, prebuilt->table), - prebuilt->table->flags, user_thd); + for (ulint i = 0; i < dict_index_get_n_fields(index); i++) { + const dict_field_t* field + = dict_index_get_nth_field(index, i); + if (field->prefix_len > max_col_len) { + my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0), + max_col_len); + goto err_exit_no_heap; + } } + } + + n_drop_index = 0; + n_drop_fk = 0; + + if (ha_alter_info->handler_flags + & Alter_inplace_info::DROP_FOREIGN_KEY) { + DBUG_ASSERT(ha_alter_info->alter_info->drop_list.elements > 0); - if (!commit || err) { - dict_index_t* index; - dict_index_t* next_index; + heap = mem_heap_create(1024); - for (index = dict_table_get_first_index( - prebuilt->table); - index; index = next_index) { + drop_fk = static_cast<dict_foreign_t**>( + mem_heap_alloc( + heap, + ha_alter_info->alter_info->drop_list.elements + * 
sizeof(dict_foreign_t*))); - next_index = dict_table_get_next_index(index); + List_iterator<Alter_drop> drop_it( + ha_alter_info->alter_info->drop_list); - if (*index->name == TEMP_INDEX_PREFIX) { - row_merge_drop_index( - index, prebuilt->table, trx); + while (Alter_drop* drop = drop_it++) { + if (drop->type != Alter_drop::FOREIGN_KEY) { + continue; + } + + for (dict_foreign_t* foreign = UT_LIST_GET_FIRST( + prebuilt->table->foreign_list); + foreign != NULL; + foreign = UT_LIST_GET_NEXT( + foreign_list, foreign)) { + const char* fid = strchr(foreign->id, '/'); + + DBUG_ASSERT(fid); + /* If no database/ prefix was present in + the FOREIGN KEY constraint name, compare + to the full constraint name. */ + fid = fid ? fid + 1 : foreign->id; + + if (!my_strcasecmp(system_charset_info, + fid, drop->name)) { + drop_fk[n_drop_fk++] = foreign; + goto found_fk; } } + + my_error(ER_CANT_DROP_FIELD_OR_KEY, MYF(0), + drop->name); + goto err_exit; +found_fk: + continue; } - DICT_TF2_FLAG_UNSET(prebuilt->table, DICT_TF2_FTS_ADD_DOC_ID); + DBUG_ASSERT(n_drop_fk > 0); + DBUG_ASSERT(n_drop_fk + == ha_alter_info->alter_info->drop_list.elements); + } else { + drop_fk = NULL; + heap = NULL; } - /* If index is successfully built, we will need to rebuild index - translation table. Set valid index entry count in the translation - table to zero. 
*/ - if (err == 0 && commit) { - ibool new_primary; - dict_index_t* index; - dict_index_t* next_index; - ibool new_fts = FALSE; - dict_index_t* primary; + if (ha_alter_info->index_drop_count) { + dict_index_t* drop_primary = NULL; - new_primary = !my_strcasecmp( - system_charset_info, add->key_info[0].name, "PRIMARY"); - - primary = dict_table_get_first_index(add->indexed_table); - - if (!new_primary) { - new_primary = !my_strcasecmp( - system_charset_info, add->key_info[0].name, - primary->name); + DBUG_ASSERT(ha_alter_info->handler_flags + & (Alter_inplace_info::DROP_INDEX + | Alter_inplace_info::DROP_UNIQUE_INDEX + | Alter_inplace_info::DROP_PK_INDEX)); + /* Check which indexes to drop. */ + if (!heap) { + heap = mem_heap_create(1024); + } + drop_index = static_cast<dict_index_t**>( + mem_heap_alloc( + heap, (ha_alter_info->index_drop_count + 1) + * sizeof *drop_index)); + + for (uint i = 0; i < ha_alter_info->index_drop_count; i++) { + const KEY* key + = ha_alter_info->index_drop_buffer[i]; + dict_index_t* index + = dict_table_get_index_on_name_and_min_id( + indexed_table, key->name); + + if (!index) { + push_warning_printf( + user_thd, + Sql_condition::WARN_LEVEL_WARN, + HA_ERR_WRONG_INDEX, + "InnoDB could not find key " + "with name %s", key->name); + } else { + ut_ad(!index->to_be_dropped); + if (!dict_index_is_clust(index)) { + drop_index[n_drop_index++] = index; + } else { + drop_primary = index; + } + } } - share->idx_trans_tbl.index_count = 0; + /* If all FULLTEXT indexes were removed, drop an + internal FTS_DOC_ID_INDEX as well, unless it exists in + the table. */ + + if (innobase_fulltext_exist(table->s) + && !innobase_fulltext_exist(altered_table->s) + && !DICT_TF2_FLAG_IS_SET( + indexed_table, DICT_TF2_FTS_HAS_DOC_ID)) { + dict_index_t* fts_doc_index + = dict_table_get_index_on_name( + indexed_table, FTS_DOC_ID_INDEX_NAME); + + // Add some fault tolerance for non-debug builds. 
+ if (fts_doc_index == NULL) { + goto check_if_can_drop_indexes; + } - if (new_primary) { - for (index = primary; index; index = next_index) { + DBUG_ASSERT(!fts_doc_index->to_be_dropped); + + for (uint i = 0; i < table->s->keys; i++) { + if (!my_strcasecmp( + system_charset_info, + FTS_DOC_ID_INDEX_NAME, + table->s->key_info[i].name)) { + /* The index exists in the MySQL + data dictionary. Do not drop it, + even though it is no longer needed + by InnoDB fulltext search. */ + goto check_if_can_drop_indexes; + } + } - next_index = dict_table_get_next_index(index); + drop_index[n_drop_index++] = fts_doc_index; + } - if (index->type & DICT_FTS) { - fts_add_index(index, - add->indexed_table); - new_fts = TRUE; +check_if_can_drop_indexes: + /* Check if the indexes can be dropped. */ + + /* Prevent a race condition between DROP INDEX and + CREATE TABLE adding FOREIGN KEY constraints. */ + row_mysql_lock_data_dictionary(prebuilt->trx); + + if (prebuilt->trx->check_foreigns) { + for (uint i = 0; i < n_drop_index; i++) { + dict_index_t* index = drop_index[i]; + + if (innobase_check_foreign_key_index( + ha_alter_info, index, indexed_table, + prebuilt->trx, drop_fk, n_drop_fk)) { + row_mysql_unlock_data_dictionary( + prebuilt->trx); + prebuilt->trx->error_info = index; + print_error(HA_ERR_DROP_INDEX_FK, + MYF(0)); + goto err_exit; } } + + /* If a primary index is dropped, need to check + any depending foreign constraints get affected */ + if (drop_primary + && innobase_check_foreign_key_index( + ha_alter_info, drop_primary, indexed_table, + prebuilt->trx, drop_fk, n_drop_fk)) { + row_mysql_unlock_data_dictionary(prebuilt->trx); + print_error(HA_ERR_DROP_INDEX_FK, MYF(0)); + goto err_exit; + } + } + + if (!n_drop_index) { + drop_index = NULL; } else { - ulint i; - for (i = 0; i < add->num_of_keys; i++) { - if (add->key_info[i].flags & HA_FULLTEXT) { - dict_index_t* fts_index; - - fts_index = - dict_table_get_index_on_name( - prebuilt->table, - add->key_info[i].name); - - 
ut_ad(fts_index); - fts_add_index(fts_index, - prebuilt->table); - new_fts = TRUE; + /* Flag all indexes that are to be dropped. */ + for (ulint i = 0; i < n_drop_index; i++) { + ut_ad(!drop_index[i]->to_be_dropped); + drop_index[i]->to_be_dropped = 1; + } + } + + row_mysql_unlock_data_dictionary(prebuilt->trx); + } else { + drop_index = NULL; + } + + n_add_fk = 0; + + if (ha_alter_info->handler_flags + & Alter_inplace_info::ADD_FOREIGN_KEY) { + ut_ad(!prebuilt->trx->check_foreigns); + + if (!heap) { + heap = mem_heap_create(1024); + } + + add_fk = static_cast<dict_foreign_t**>( + mem_heap_zalloc( + heap, + ha_alter_info->alter_info->key_list.elements + * sizeof(dict_foreign_t*))); + + if (!innobase_get_foreign_key_info( + ha_alter_info, table_share, prebuilt->table, + add_fk, &n_add_fk, heap, prebuilt->trx)) { +err_exit: + if (n_drop_index) { + row_mysql_lock_data_dictionary(prebuilt->trx); + + /* Clear the to_be_dropped flags, which might + have been set at this point. */ + for (ulint i = 0; i < n_drop_index; i++) { + DBUG_ASSERT(*drop_index[i]->name + != TEMP_INDEX_PREFIX); + drop_index[i]->to_be_dropped = 0; } + + row_mysql_unlock_data_dictionary(prebuilt->trx); } + + if (heap) { + mem_heap_free(heap); + } + goto err_exit_no_heap; } + } - if (new_fts) { - fts_optimize_add_table(prebuilt->table); + if (!(ha_alter_info->handler_flags & INNOBASE_INPLACE_CREATE)) { + if (heap) { + ha_alter_info->handler_ctx + = new ha_innobase_inplace_ctx( + prebuilt->trx, 0, 0, 0, + drop_index, n_drop_index, + drop_fk, n_drop_fk, + add_fk, n_add_fk, + ha_alter_info->online, + heap, 0, indexed_table, 0, + ULINT_UNDEFINED, 0, 0, 0); } + +func_exit: + DBUG_ASSERT(prebuilt->trx->dict_operation_lock_mode == 0); + if (ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE) { + online_retry_drop_indexes(prebuilt->table, user_thd); + } + DBUG_RETURN(false); } - trx_commit_for_mysql(trx); - if (prebuilt->trx) { - trx_commit_for_mysql(prebuilt->trx); + /* If we are to build a full-text 
search index, check whether + the table already has a DOC ID column. If not, we will need to + add a Doc ID hidden column and rebuild the primary index */ + if (innobase_fulltext_exist(altered_table->s)) { + ulint doc_col_no; + + if (!innobase_fts_check_doc_id_col( + prebuilt->table, altered_table, &fts_doc_col_no)) { + fts_doc_col_no = altered_table->s->fields; + add_fts_doc_id = true; + add_fts_doc_id_idx = true; + + push_warning_printf( + user_thd, + Sql_condition::WARN_LEVEL_WARN, + HA_ERR_WRONG_INDEX, + "InnoDB rebuilding table to add column " + FTS_DOC_ID_COL_NAME); + } else if (fts_doc_col_no == ULINT_UNDEFINED) { + goto err_exit; + } + + switch (innobase_fts_check_doc_id_index( + prebuilt->table, altered_table, &doc_col_no)) { + case FTS_NOT_EXIST_DOC_ID_INDEX: + add_fts_doc_id_idx = true; + break; + case FTS_INCORRECT_DOC_ID_INDEX: + my_error(ER_INNODB_FT_WRONG_DOCID_INDEX, MYF(0), + FTS_DOC_ID_INDEX_NAME); + goto err_exit; + case FTS_EXIST_DOC_ID_INDEX: + DBUG_ASSERT(doc_col_no == fts_doc_col_no + || doc_col_no == ULINT_UNDEFINED + || (ha_alter_info->handler_flags + & (Alter_inplace_info::ALTER_COLUMN_ORDER + | Alter_inplace_info::DROP_COLUMN + | Alter_inplace_info::ADD_COLUMN))); + } } - ut_d(dict_table_check_for_dup_indexes(prebuilt->table, TRUE)); + /* See if an AUTO_INCREMENT column was added. */ + uint i = 0; + List_iterator_fast<Create_field> cf_it( + ha_alter_info->alter_info->create_list); + while (const Create_field* new_field = cf_it++) { + const Field* field; - ut_a(fts_check_cached_index(prebuilt->table)); + DBUG_ASSERT(i < altered_table->s->fields); - row_mysql_unlock_data_dictionary(trx); + for (uint old_i = 0; table->field[old_i]; old_i++) { + if (new_field->field == table->field[old_i]) { + goto found_col; + } + } - trx_free_for_mysql(trx); + /* This is an added column. 
*/ + DBUG_ASSERT(!new_field->field); + DBUG_ASSERT(ha_alter_info->handler_flags + & Alter_inplace_info::ADD_COLUMN); - /* There might be work for utility threads.*/ - srv_active_wake_master_thread(); + field = altered_table->field[i]; - delete add; - DBUG_RETURN(err); + DBUG_ASSERT((MTYP_TYPENR(field->unireg_check) + == Field::NEXT_NUMBER) + == !!(field->flags & AUTO_INCREMENT_FLAG)); + + if (field->flags & AUTO_INCREMENT_FLAG) { + if (add_autoinc_col_no != ULINT_UNDEFINED) { + /* This should have been blocked earlier. */ + ut_ad(0); + my_error(ER_WRONG_AUTO_KEY, MYF(0)); + goto err_exit; + } + add_autoinc_col_no = i; + + autoinc_col_max_value = innobase_get_int_col_max_value( + field); + } +found_col: + i++; + } + + DBUG_ASSERT(user_thd == prebuilt->trx->mysql_thd); + DBUG_RETURN(prepare_inplace_alter_table_dict( + ha_alter_info, altered_table, table, + prebuilt->table, prebuilt->trx, + table_share->table_name.str, + flags, flags2, + heap, drop_index, n_drop_index, + drop_fk, n_drop_fk, add_fk, n_add_fk, + fts_doc_col_no, add_autoinc_col_no, + autoinc_col_max_value, add_fts_doc_id, + add_fts_doc_id_idx)); } -/*******************************************************************//** -Prepare to drop some indexes of a table. -@return 0 or error number */ + +/** Alter the table structure in-place with operations +specified using Alter_inplace_info. +The level of concurrency allowed during this operation depends +on the return value from check_if_supported_inplace_alter(). + +@param altered_table TABLE object for new version of table. +@param ha_alter_info Structure describing changes to be done +by ALTER TABLE and holding data used during in-place alter. 
+ +@retval true Failure +@retval false Success +*/ UNIV_INTERN -int -ha_innobase::prepare_drop_index( -/*============================*/ - TABLE* in_table, /*!< in: Table where indexes are dropped */ - uint* key_num, /*!< in: Key nums to be dropped */ - uint num_of_keys) /*!< in: Number of keys to be dropped */ +bool +ha_innobase::inplace_alter_table( +/*=============================*/ + TABLE* altered_table, + Alter_inplace_info* ha_alter_info) { - trx_t* trx; - int err = 0; - uint n_key; + dberr_t error; - DBUG_ENTER("ha_innobase::prepare_drop_index"); - ut_ad(table); - ut_ad(key_num); - ut_ad(num_of_keys); - if (srv_created_new_raw || srv_force_recovery) { - DBUG_RETURN(HA_ERR_WRONG_COMMAND); + DBUG_ENTER("inplace_alter_table"); + + if (srv_read_only_mode) { + DBUG_RETURN(false); } - update_thd(); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); + ut_ad(!rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ - trx_search_latch_release_if_reserved(prebuilt->trx); - trx = prebuilt->trx; + DEBUG_SYNC(user_thd, "innodb_inplace_alter_table_enter"); - /* Test and mark all the indexes to be dropped */ + if (!(ha_alter_info->handler_flags & INNOBASE_INPLACE_CREATE)) { +ok_exit: + DEBUG_SYNC(user_thd, "innodb_after_inplace_alter_table"); + DBUG_RETURN(false); + } - row_mysql_lock_data_dictionary(trx); - ut_d(dict_table_check_for_dup_indexes(prebuilt->table, TRUE)); + if (ha_alter_info->handler_flags + == Alter_inplace_info::CHANGE_CREATE_OPTION + && !innobase_need_rebuild(ha_alter_info)) { + goto ok_exit; + } - /* Check that none of the indexes have previously been flagged - for deletion. 
*/ - { - const dict_index_t* index - = dict_table_get_first_index(prebuilt->table); - do { - ut_a(!index->to_be_dropped); - index = dict_table_get_next_index(index); - } while (index); + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*> + (ha_alter_info->handler_ctx); + + DBUG_ASSERT(ctx); + DBUG_ASSERT(ctx->trx); + + if (prebuilt->table->ibd_file_missing + || dict_table_is_discarded(prebuilt->table)) { + goto all_done; + } + + /* Read the clustered index of the table and build + indexes based on this information using temporary + files and merge sort. */ + DBUG_EXECUTE_IF("innodb_OOM_inplace_alter", + error = DB_OUT_OF_MEMORY; goto oom;); + error = row_merge_build_indexes( + prebuilt->trx, + prebuilt->table, ctx->indexed_table, + ctx->online, + ctx->add, ctx->add_key_numbers, ctx->num_to_add, + altered_table, ctx->add_cols, ctx->col_map, + ctx->add_autoinc, ctx->sequence); +#ifndef DBUG_OFF +oom: +#endif /* !DBUG_OFF */ + if (error == DB_SUCCESS && ctx->online + && ctx->indexed_table != prebuilt->table) { + DEBUG_SYNC_C("row_log_table_apply1_before"); + error = row_log_table_apply( + ctx->thr, prebuilt->table, altered_table); } - for (n_key = 0; n_key < num_of_keys; n_key++) { - const KEY* key; - dict_index_t* index; + DEBUG_SYNC_C("inplace_after_index_build"); - key = table->key_info + key_num[n_key]; - index = dict_table_get_index_on_name_and_min_id( - prebuilt->table, key->name); + DBUG_EXECUTE_IF("create_index_fail", + error = DB_DUPLICATE_KEY;); - if (!index) { - sql_print_error("InnoDB could not find key n:o %u " - "with name %s for table %s", - key_num[n_key], - key ? key->name : "NULL", - prebuilt->table->name); + /* After an error, remove all those index definitions + from the dictionary which were defined. 
*/ - err = HA_ERR_KEY_NOT_FOUND; - goto func_exit; + switch (error) { + KEY* dup_key; + all_done: + case DB_SUCCESS: + ut_d(mutex_enter(&dict_sys->mutex)); + ut_d(dict_table_check_for_dup_indexes( + prebuilt->table, CHECK_PARTIAL_OK)); + ut_d(mutex_exit(&dict_sys->mutex)); + /* prebuilt->table->n_ref_count can be anything here, + given that we hold at most a shared lock on the table. */ + goto ok_exit; + case DB_DUPLICATE_KEY: + if (prebuilt->trx->error_key_num == ULINT_UNDEFINED + || ha_alter_info->key_count == 0) { + /* This should be the hidden index on + FTS_DOC_ID, or there is no PRIMARY KEY in the + table. Either way, we should be seeing and + reporting a bogus duplicate key error. */ + dup_key = NULL; + } else { + DBUG_ASSERT(prebuilt->trx->error_key_num + < ha_alter_info->key_count); + dup_key = &ha_alter_info->key_info_buffer[ + prebuilt->trx->error_key_num]; } + print_keydup_error(altered_table, dup_key, MYF(0)); + break; + case DB_ONLINE_LOG_TOO_BIG: + DBUG_ASSERT(ctx->online); + my_error(ER_INNODB_ONLINE_LOG_TOO_BIG, MYF(0), + (prebuilt->trx->error_key_num == ULINT_UNDEFINED) + ? FTS_DOC_ID_INDEX_NAME + : ha_alter_info->key_info_buffer[ + prebuilt->trx->error_key_num].name); + break; + case DB_INDEX_CORRUPT: + my_error(ER_INDEX_CORRUPT, MYF(0), + (prebuilt->trx->error_key_num == ULINT_UNDEFINED) + ? FTS_DOC_ID_INDEX_NAME + : ha_alter_info->key_info_buffer[ + prebuilt->trx->error_key_num].name); + break; + default: + my_error_innodb(error, + table_share->table_name.str, + prebuilt->table->flags); + } - /* Refuse to drop the clustered index. It would be - better to automatically generate a clustered index, - but mysql_alter_table() will call this method only - after ha_innobase::add_index(). */ + /* prebuilt->table->n_ref_count can be anything here, given + that we hold at most a shared lock on the table. 
*/ + prebuilt->trx->error_info = NULL; + ctx->trx->error_state = DB_SUCCESS; - if (dict_index_is_clust(index)) { - my_error(ER_REQUIRES_PRIMARY_KEY, MYF(0)); - err = -1; - goto func_exit; - } + DBUG_RETURN(true); +} - rw_lock_x_lock(dict_index_get_lock(index)); - index->to_be_dropped = TRUE; - rw_lock_x_unlock(dict_index_get_lock(index)); +/** Free the modification log for online table rebuild. +@param table table that was being rebuilt online */ +static +void +innobase_online_rebuild_log_free( +/*=============================*/ + dict_table_t* table) +{ + dict_index_t* clust_index = dict_table_get_first_index(table); + + ut_ad(mutex_own(&dict_sys->mutex)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + rw_lock_x_lock(&clust_index->lock); + + if (clust_index->online_log) { + ut_ad(dict_index_get_online_status(clust_index) + == ONLINE_INDEX_CREATION); + clust_index->online_status = ONLINE_INDEX_COMPLETE; + row_log_free(clust_index->online_log); + DEBUG_SYNC_C("innodb_online_rebuild_log_free_aborted"); + } + + DBUG_ASSERT(dict_index_get_online_status(clust_index) + == ONLINE_INDEX_COMPLETE); + rw_lock_x_unlock(&clust_index->lock); +} + +/** Rollback a secondary index creation, drop the indexes with +temparary index prefix +@param prebuilt the prebuilt struct +@param table_share the TABLE_SHARE +@param trx the transaction +*/ +static +void +innobase_rollback_sec_index( +/*========================*/ + row_prebuilt_t* prebuilt, + const TABLE_SHARE* table_share, + trx_t* trx) +{ + row_merge_drop_indexes(trx, prebuilt->table, FALSE); + + /* Free the table->fts only if there is no FTS_DOC_ID + in the table */ + if (prebuilt->table->fts + && !DICT_TF2_FLAG_IS_SET(prebuilt->table, + DICT_TF2_FTS_HAS_DOC_ID) + && !innobase_fulltext_exist(table_share)) { + fts_free(prebuilt->table); } +} - /* If FOREIGN_KEY_CHECKS = 1 you may not drop an index defined - for a foreign key constraint because InnoDB requires that 
both - tables contain indexes for the constraint. Such index can - be dropped only if FOREIGN_KEY_CHECKS is set to 0. - Note that CREATE INDEX id ON table does a CREATE INDEX and - DROP INDEX, and we can ignore here foreign keys because a - new index for the foreign key has already been created. +/** Roll back the changes made during prepare_inplace_alter_table() +and inplace_alter_table() inside the storage engine. Note that the +allowed level of concurrency during this operation will be the same as +for inplace_alter_table() and thus might be higher than during +prepare_inplace_alter_table(). (E.g concurrent writes were blocked +during prepare, but might not be during commit). + +@param ha_alter_info Data used during in-place alter. +@param table_share the TABLE_SHARE +@param prebuilt the prebuilt struct +@retval true Failure +@retval false Success +*/ +inline +bool +rollback_inplace_alter_table( +/*=========================*/ + Alter_inplace_info* ha_alter_info, + const TABLE_SHARE* table_share, + row_prebuilt_t* prebuilt) +{ + bool fail = false; - We check for the foreign key constraints after marking the - candidate indexes for deletion, because when we check for an - equivalent foreign index we don't want to select an index that - is later deleted. */ + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*> + (ha_alter_info->handler_ctx); - if (trx->check_foreigns - && thd_sql_command(user_thd) != SQLCOM_CREATE_INDEX) { - dict_index_t* index; + DBUG_ENTER("rollback_inplace_alter_table"); - for (index = dict_table_get_first_index(prebuilt->table); - index; - index = dict_table_get_next_index(index)) { - dict_foreign_t* foreign; + if (!ctx || !ctx->trx) { + /* If we have not started a transaction yet, + (almost) nothing has been or needs to be done. 
*/ + goto func_exit; + } - if (!index->to_be_dropped) { + row_mysql_lock_data_dictionary(ctx->trx); - continue; + if (prebuilt->table != ctx->indexed_table) { + dberr_t err; + ulint flags = ctx->indexed_table->flags; + + /* DML threads can access ctx->indexed_table via the + online rebuild log. Free it first. */ + innobase_online_rebuild_log_free(prebuilt->table); + + /* Since the FTS index specific auxiliary tables has + not yet registered with "table->fts" by fts_add_index(), + we will need explicitly delete them here */ + if (DICT_TF2_FLAG_IS_SET(ctx->indexed_table, DICT_TF2_FTS)) { + + err = innobase_drop_fts_index_table( + ctx->indexed_table, ctx->trx); + + if (err != DB_SUCCESS) { + my_error_innodb( + err, table_share->table_name.str, + flags); + fail = true; } + } - /* Check if the index is referenced. */ - foreign = dict_table_get_referenced_constraint( - prebuilt->table, index); + /* Drop the table. */ + dict_table_close(ctx->indexed_table, TRUE, FALSE); - if (foreign) { -index_needed: - trx_set_detailed_error( - trx, - "Index needed in foreign key " - "constraint"); +#ifdef UNIV_DDL_DEBUG + /* Nobody should have initialized the stats of the + newly created table yet. When this is the case, we + know that it has not been added for background stats + gathering. 
*/ + ut_a(!ctx->indexed_table->stat_initialized); +#endif /* UNIV_DDL_DEBUG */ - trx->error_info = index; + err = row_merge_drop_table(ctx->trx, ctx->indexed_table); - err = HA_ERR_DROP_INDEX_FK; - break; - } else { - /* Check if this index references some - other table */ - foreign = dict_table_get_foreign_constraint( - prebuilt->table, index); + switch (err) { + case DB_SUCCESS: + break; + default: + my_error_innodb(err, table_share->table_name.str, + flags); + fail = true; + } + } else { + DBUG_ASSERT(!(ha_alter_info->handler_flags + & Alter_inplace_info::ADD_PK_INDEX)); - if (foreign) { - ut_a(foreign->foreign_index == index); + trx_start_for_ddl(ctx->trx, TRX_DICT_OP_INDEX); - /* Search for an equivalent index that - the foreign key constraint could use - if this index were to be deleted. */ - if (!dict_foreign_find_equiv_index( - foreign)) { + innobase_rollback_sec_index(prebuilt, table_share, ctx->trx); + } - goto index_needed; - } - } + trx_commit_for_mysql(ctx->trx); + row_mysql_unlock_data_dictionary(ctx->trx); + trx_free_for_mysql(ctx->trx); + + +func_exit: +#ifndef DBUG_OFF + dict_index_t* clust_index = dict_table_get_first_index( + prebuilt->table); + DBUG_ASSERT(!clust_index->online_log); + DBUG_ASSERT(dict_index_get_online_status(clust_index) + == ONLINE_INDEX_COMPLETE); +#endif /* !DBUG_OFF */ + + if (ctx) { + if (ctx->num_to_add_fk) { + for (ulint i = 0; i < ctx->num_to_add_fk; i++) { + dict_foreign_free(ctx->add_fk[i]); } } - } else if (thd_sql_command(user_thd) == SQLCOM_CREATE_INDEX) { - /* This is a drop of a foreign key constraint index that - was created by MySQL when the constraint was added. MySQL - does this when the user creates an index explicitly which - can be used in place of the automatically generated index. */ - dict_index_t* index; + if (ctx->num_to_drop) { + row_mysql_lock_data_dictionary(prebuilt->trx); + + /* Clear the to_be_dropped flags + in the data dictionary cache. 
+ The flags may already have been cleared, + in case an error was detected in + commit_inplace_alter_table(). */ + for (ulint i = 0; i < ctx->num_to_drop; i++) { + dict_index_t* index = ctx->drop[i]; + DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX); + + index->to_be_dropped = 0; + } + + row_mysql_unlock_data_dictionary(prebuilt->trx); + } + } + + trx_commit_for_mysql(prebuilt->trx); + srv_active_wake_master_thread(); + MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE); + DBUG_RETURN(fail); +} + +/** Drop a FOREIGN KEY constraint. +@param table_share the TABLE_SHARE +@param trx data dictionary transaction +@param foreign the foreign key constraint, will be freed +@retval true Failure +@retval false Success */ +static __attribute__((nonnull, warn_unused_result)) +bool +innobase_drop_foreign( +/*==================*/ + const TABLE_SHARE* table_share, + trx_t* trx, + dict_foreign_t* foreign) +{ + DBUG_ENTER("innobase_drop_foreign"); + + DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(mutex_own(&dict_sys->mutex)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + /* Drop the constraint from the data dictionary. 
*/ + static const char sql[] = + "PROCEDURE DROP_FOREIGN_PROC () IS\n" + "BEGIN\n" + "DELETE FROM SYS_FOREIGN WHERE ID=:id;\n" + "DELETE FROM SYS_FOREIGN_COLS WHERE ID=:id;\n" + "END;\n"; + + dberr_t error; + pars_info_t* info; + + info = pars_info_create(); + pars_info_add_str_literal(info, "id", foreign->id); + + trx->op_info = "dropping foreign key constraint from dictionary"; + error = que_eval_sql(info, sql, FALSE, trx); + trx->op_info = ""; + + DBUG_EXECUTE_IF("ib_drop_foreign_error", + error = DB_OUT_OF_FILE_SPACE;); + + if (error != DB_SUCCESS) { + my_error_innodb(error, table_share->table_name.str, 0); + trx->error_state = DB_SUCCESS; + DBUG_RETURN(true); + } + + /* Drop the foreign key constraint from the data dictionary cache. */ + dict_foreign_remove_from_cache(foreign); + DBUG_RETURN(false); +} + +/** Rename a column. +@param table_share the TABLE_SHARE +@param prebuilt the prebuilt struct +@param trx data dictionary transaction +@param nth_col 0-based index of the column +@param from old column name +@param to new column name +@param new_clustered whether the table has been rebuilt +@retval true Failure +@retval false Success */ +static __attribute__((nonnull, warn_unused_result)) +bool +innobase_rename_column( +/*===================*/ + const TABLE_SHARE* table_share, + row_prebuilt_t* prebuilt, + trx_t* trx, + ulint nth_col, + const char* from, + const char* to, + bool new_clustered) +{ + pars_info_t* info; + dberr_t error; - for (index = dict_table_get_first_index(prebuilt->table); - index; - index = dict_table_get_next_index(index)) { - dict_foreign_t* foreign; + DBUG_ENTER("innobase_rename_column"); - if (!index->to_be_dropped) { + DBUG_ASSERT(trx_get_dict_operation(trx) + == new_clustered ? 
TRX_DICT_OP_TABLE : TRX_DICT_OP_INDEX); + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(mutex_own(&dict_sys->mutex)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + if (new_clustered) { + goto rename_foreign; + } + + info = pars_info_create(); + + pars_info_add_ull_literal(info, "tableid", prebuilt->table->id); + pars_info_add_int4_literal(info, "nth", nth_col); + pars_info_add_str_literal(info, "old", from); + pars_info_add_str_literal(info, "new", to); + + trx->op_info = "renaming column in SYS_COLUMNS"; + + error = que_eval_sql( + info, + "PROCEDURE RENAME_SYS_COLUMNS_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_COLUMNS SET NAME=:new\n" + "WHERE TABLE_ID=:tableid AND NAME=:old\n" + "AND POS=:nth;\n" + "END;\n", + FALSE, trx); + + DBUG_EXECUTE_IF("ib_rename_column_error", + error = DB_OUT_OF_FILE_SPACE;); + + if (error != DB_SUCCESS) { +err_exit: + my_error_innodb(error, table_share->table_name.str, 0); + trx->error_state = DB_SUCCESS; + trx->op_info = ""; + DBUG_RETURN(true); + } + + trx->op_info = "renaming column in SYS_FIELDS"; + + for (dict_index_t* index = dict_table_get_first_index(prebuilt->table); + index != NULL; + index = dict_table_get_next_index(index)) { + + for (ulint i = 0; i < dict_index_get_n_fields(index); i++) { + if (strcmp(dict_index_get_nth_field(index, i)->name, + from)) { continue; } - /* Check if this index references some other table */ - foreign = dict_table_get_foreign_constraint( - prebuilt->table, index); + info = pars_info_create(); - if (foreign == NULL) { + pars_info_add_ull_literal(info, "indexid", index->id); + pars_info_add_int4_literal(info, "nth", i); + pars_info_add_str_literal(info, "old", from); + pars_info_add_str_literal(info, "new", to); - continue; + error = que_eval_sql( + info, + "PROCEDURE RENAME_SYS_FIELDS_PROC () IS\n" + "BEGIN\n" + + "UPDATE SYS_FIELDS SET COL_NAME=:new\n" + "WHERE INDEX_ID=:indexid AND COL_NAME=:old\n" + "AND POS=:nth;\n" 
+ + /* Try again, in case there is a prefix_len + encoded in SYS_FIELDS.POS */ + + "UPDATE SYS_FIELDS SET COL_NAME=:new\n" + "WHERE INDEX_ID=:indexid AND COL_NAME=:old\n" + "AND POS>=65536*:nth AND POS<65536*(:nth+1);\n" + + "END;\n", + FALSE, trx); + + if (error != DB_SUCCESS) { + goto err_exit; } + } + } - ut_a(foreign->foreign_index == index); +rename_foreign: + trx->op_info = "renaming column in SYS_FOREIGN_COLS"; - /* Search for an equivalent index that the - foreign key constraint could use if this index - were to be deleted. */ + for (dict_foreign_t* foreign = UT_LIST_GET_FIRST( + prebuilt->table->foreign_list); + foreign != NULL; + foreign = UT_LIST_GET_NEXT(foreign_list, foreign)) { + for (unsigned i = 0; i < foreign->n_fields; i++) { + if (strcmp(foreign->foreign_col_names[i], from)) { + continue; + } - if (!dict_foreign_find_equiv_index(foreign)) { - trx_set_detailed_error( - trx, - "Index needed in foreign key " - "constraint"); + info = pars_info_create(); + + pars_info_add_str_literal(info, "id", foreign->id); + pars_info_add_int4_literal(info, "nth", i); + pars_info_add_str_literal(info, "old", from); + pars_info_add_str_literal(info, "new", to); + + error = que_eval_sql( + info, + "PROCEDURE RENAME_SYS_FOREIGN_F_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_FOREIGN_COLS\n" + "SET FOR_COL_NAME=:new\n" + "WHERE ID=:id AND POS=:nth\n" + "AND FOR_COL_NAME=:old;\n" + "END;\n", + FALSE, trx); + + if (error != DB_SUCCESS) { + goto err_exit; + } + } + } - trx->error_info = foreign->foreign_index; + for (dict_foreign_t* foreign = UT_LIST_GET_FIRST( + prebuilt->table->referenced_list); + foreign != NULL; + foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) { + for (unsigned i = 0; i < foreign->n_fields; i++) { + if (strcmp(foreign->referenced_col_names[i], from)) { + continue; + } - err = HA_ERR_DROP_INDEX_FK; - break; + info = pars_info_create(); + + pars_info_add_str_literal(info, "id", foreign->id); + pars_info_add_int4_literal(info, "nth", i); + 
pars_info_add_str_literal(info, "old", from); + pars_info_add_str_literal(info, "new", to); + + error = que_eval_sql( + info, + "PROCEDURE RENAME_SYS_FOREIGN_R_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_FOREIGN_COLS\n" + "SET REF_COL_NAME=:new\n" + "WHERE ID=:id AND POS=:nth\n" + "AND REF_COL_NAME=:old;\n" + "END;\n", + FALSE, trx); + + if (error != DB_SUCCESS) { + goto err_exit; } } } -func_exit: - if (err) { - /* Undo our changes since there was some sort of error. */ - dict_index_t* index - = dict_table_get_first_index(prebuilt->table); + trx->op_info = ""; + if (!new_clustered) { + /* Rename the column in the data dictionary cache. */ + dict_mem_table_col_rename(prebuilt->table, nth_col, from, to); + } + DBUG_RETURN(false); +} + +/** Rename columns. +@param ha_alter_info Data used during in-place alter. +@param new_clustered whether the table has been rebuilt +@param table the TABLE +@param table_share the TABLE_SHARE +@param prebuilt the prebuilt struct +@param trx data dictionary transaction +@retval true Failure +@retval false Success */ +static __attribute__((nonnull, warn_unused_result)) +bool +innobase_rename_columns( +/*====================*/ + Alter_inplace_info* ha_alter_info, + bool new_clustered, + const TABLE* table, + const TABLE_SHARE* table_share, + row_prebuilt_t* prebuilt, + trx_t* trx) +{ + List_iterator_fast<Create_field> cf_it( + ha_alter_info->alter_info->create_list); + uint i = 0; + + for (Field** fp = table->field; *fp; fp++, i++) { + if (!((*fp)->flags & FIELD_IS_RENAMED)) { + continue; + } + + cf_it.rewind(); + while (Create_field* cf = cf_it++) { + if (cf->field == *fp) { + if (innobase_rename_column( + table_share, + prebuilt, trx, i, + cf->field->field_name, + cf->field_name, new_clustered)) { + return(true); + } + goto processed_field; + } + } - do { - rw_lock_x_lock(dict_index_get_lock(index)); - index->to_be_dropped = FALSE; - rw_lock_x_unlock(dict_index_get_lock(index)); - index = dict_table_get_next_index(index); - } while 
(index); + ut_error; +processed_field: + continue; } - ut_d(dict_table_check_for_dup_indexes(prebuilt->table, TRUE)); - row_mysql_unlock_data_dictionary(trx); + return(false); +} - DBUG_RETURN(err); +/** Undo the in-memory addition of foreign key on table->foreign_list +and table->referenced_list. +@param ctx saved alter table context +@param table the foreign table */ +static __attribute__((nonnull)) +void +innobase_undo_add_fk( +/*=================*/ + ha_innobase_inplace_ctx* ctx, + dict_table_t* fk_table) +{ + for (ulint i = 0; i < ctx->num_to_add_fk; i++) { + UT_LIST_REMOVE( + foreign_list, + fk_table->foreign_list, + ctx->add_fk[i]); + + if (ctx->add_fk[i]->referenced_table) { + UT_LIST_REMOVE( + referenced_list, + ctx->add_fk[i]->referenced_table + ->referenced_list, + ctx->add_fk[i]); + } + } } -/*******************************************************************//** -Drop the indexes that were passed to a successful prepare_drop_index(). -@return 0 or error number */ +/** Commit or rollback the changes made during +prepare_inplace_alter_table() and inplace_alter_table() inside +the storage engine. Note that the allowed level of concurrency +during this operation will be the same as for +inplace_alter_table() and thus might be higher than during +prepare_inplace_alter_table(). (E.g concurrent writes were +blocked during prepare, but might not be during commit). +@param altered_table TABLE object for new version of table. +@param ha_alter_info Structure describing changes to be done +by ALTER TABLE and holding data used during in-place alter. +@param commit true => Commit, false => Rollback. 
+@retval true Failure +@retval false Success +*/ UNIV_INTERN -int -ha_innobase::final_drop_index( -/*==========================*/ - TABLE* iin_table) /*!< in: Table where indexes - are dropped */ +bool +ha_innobase::commit_inplace_alter_table( +/*====================================*/ + TABLE* altered_table, + Alter_inplace_info* ha_alter_info, + bool commit) { - dict_index_t* index; /*!< Index to be dropped */ - trx_t* trx; /*!< Transaction */ - int err; - - DBUG_ENTER("ha_innobase::final_drop_index"); - ut_ad(table); + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*> + (ha_alter_info->handler_ctx); + trx_t* trx; + trx_t* fk_trx = NULL; + int err = 0; + bool new_clustered; + dict_table_t* fk_table = NULL; + ulonglong max_autoinc; + + ut_ad(!srv_read_only_mode); + + DBUG_ENTER("commit_inplace_alter_table"); + + DEBUG_SYNC_C("innodb_commit_inplace_alter_table_enter"); + + DEBUG_SYNC_C("innodb_commit_inplace_alter_table_wait"); + + if (!commit) { + /* A rollback is being requested. So far we may at + most have created some indexes. If any indexes were to + be dropped, they would actually be dropped in this + method if commit=true. */ + DBUG_RETURN(rollback_inplace_alter_table( + ha_alter_info, table_share, prebuilt)); + } - if (srv_created_new_raw || srv_force_recovery) { - DBUG_RETURN(HA_ERR_WRONG_COMMAND); + if (!altered_table->found_next_number_field) { + /* There is no AUTO_INCREMENT column in the table + after the ALTER operation. */ + max_autoinc = 0; + } else if (ctx && ctx->add_autoinc != ULINT_UNDEFINED) { + /* An AUTO_INCREMENT column was added. Get the last + value from the sequence, which may be based on a + supplied AUTO_INCREMENT value. */ + max_autoinc = ctx->sequence.last(); + } else if ((ha_alter_info->handler_flags + & Alter_inplace_info::CHANGE_CREATE_OPTION) + && (ha_alter_info->create_info->used_fields + & HA_CREATE_USED_AUTO)) { + /* An AUTO_INCREMENT value was supplied, but the table + was not rebuilt. 
Get the user-supplied value. */ + max_autoinc = ha_alter_info->create_info->auto_increment_value; + } else { + /* An AUTO_INCREMENT value was not specified. + Read the old counter value from the table. */ + ut_ad(table->found_next_number_field); + dict_table_autoinc_lock(prebuilt->table); + max_autoinc = dict_table_autoinc_read(prebuilt->table); + dict_table_autoinc_unlock(prebuilt->table); } - update_thd(); + if (!(ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)) { + DBUG_ASSERT(!ctx); + /* We may want to update table attributes. */ + goto func_exit; + } - trx_search_latch_release_if_reserved(prebuilt->trx); trx_start_if_not_started_xa(prebuilt->trx); - /* Create a background transaction for the operations on - the data dictionary tables. */ - trx = innobase_trx_allocate(user_thd); - trx_start_if_not_started_xa(trx); - - /* Flag this transaction as a dictionary operation, so that - the data dictionary will be locked in crash recovery. */ - trx_set_dict_operation(trx, TRX_DICT_OP_INDEX); - - /* Lock the table exclusively, to ensure that no active - transaction depends on an index that is being dropped. */ - err = convert_error_code_to_mysql( - row_merge_lock_table(prebuilt->trx, prebuilt->table, LOCK_X), - prebuilt->table->flags, user_thd); - - /* Delete corresponding rows from the stats table. - Marko advises not to edit both user tables and SYS_* tables in one - trx, thus we use prebuilt->trx instead of trx. Because of this the - drop from SYS_* and from the stats table cannot happen in one - transaction and eventually if a crash occurs below, between - trx_commit_for_mysql(trx); which drops the indexes from SYS_* and - trx_commit_for_mysql(prebuilt->trx); - then an orphaned rows will be left in the stats table. 
*/ - for (index = dict_table_get_first_index(prebuilt->table); - index != NULL; - index = dict_table_get_next_index(index)) { + { + /* Exclusively lock the table, to ensure that no other + transaction is holding locks on the table while we + change the table definition. The MySQL meta-data lock + should normally guarantee that no conflicting locks + exist. However, FOREIGN KEY constraints checks and any + transactions collected during crash recovery could be + holding InnoDB locks only, not MySQL locks. */ + dberr_t error = row_merge_lock_table( + prebuilt->trx, prebuilt->table, LOCK_X); + + if (error != DB_SUCCESS) { + my_error_innodb(error, table_share->table_name.str, 0); + DBUG_RETURN(true); + } - if (index->to_be_dropped) { + DEBUG_SYNC(user_thd, "innodb_alter_commit_after_lock_table"); + } - enum db_err ret; - char errstr[1024]; + if (ctx) { + if (ctx->indexed_table != prebuilt->table) { + for (dict_index_t* index = dict_table_get_first_index( + ctx->indexed_table); + index; + index = dict_table_get_next_index(index)) { + DBUG_ASSERT(dict_index_get_online_status(index) + == ONLINE_INDEX_COMPLETE); + DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX); + if (dict_index_is_corrupted(index)) { + my_error(ER_INDEX_CORRUPT, MYF(0), + index->name); + DBUG_RETURN(true); + } + } + } else { + for (ulint i = 0; i < ctx->num_to_add; i++) { + dict_index_t* index = ctx->add[i]; + DBUG_ASSERT(dict_index_get_online_status(index) + == ONLINE_INDEX_COMPLETE); + DBUG_ASSERT(*index->name == TEMP_INDEX_PREFIX); + if (dict_index_is_corrupted(index)) { + /* Report a duplicate key + error for the index that was + flagged corrupted, most likely + because a duplicate value was + inserted (directly or by + rollback) after + ha_innobase::inplace_alter_table() + completed. 
*/ + my_error(ER_DUP_UNKNOWN_IN_INDEX, + MYF(0), index->name + 1); + DBUG_RETURN(true); + } + } + } + } - ret = dict_stats_delete_index_stats( - index, prebuilt->trx, - errstr, sizeof(errstr)); + if (!ctx || !ctx->trx) { + /* Create a background transaction for the operations on + the data dictionary tables. */ + trx = innobase_trx_allocate(user_thd); - if (ret != DB_SUCCESS) { - push_warning(user_thd, - Sql_condition::WARN_LEVEL_WARN, - ER_LOCK_WAIT_TIMEOUT, - errstr); - } + trx_start_for_ddl(trx, TRX_DICT_OP_INDEX); + + new_clustered = false; + } else { + trx_dict_op_t op; + + trx = ctx->trx; + + new_clustered = ctx->indexed_table != prebuilt->table; + + op = (new_clustered) ? TRX_DICT_OP_TABLE : TRX_DICT_OP_INDEX; + + trx_start_for_ddl(trx, op); + } + + if (new_clustered) { + if (prebuilt->table->fts) { + ut_ad(!prebuilt->table->fts->add_wq); + fts_optimize_remove_table(prebuilt->table); + } + + if (ctx->indexed_table->fts) { + ut_ad(!ctx->indexed_table->fts->add_wq); + fts_optimize_remove_table(ctx->indexed_table); } } + /* Latch the InnoDB data dictionary exclusively so that no deadlocks + or lock waits can happen in it during the data dictionary operation. */ row_mysql_lock_data_dictionary(trx); - ut_d(dict_table_check_for_dup_indexes(prebuilt->table, TRUE)); - if (UNIV_UNLIKELY(err)) { + /* Wait for background stats processing to stop using the + indexes that we are going to drop (if any). */ + if (ctx) { + dict_stats_wait_bg_to_stop_using_tables( + prebuilt->table, ctx->indexed_table, trx); + } - /* Unmark the indexes to be dropped. 
*/ - for (index = dict_table_get_first_index(prebuilt->table); - index; index = dict_table_get_next_index(index)) { + /* Final phase of add foreign key processing */ + if (ctx && ctx->num_to_add_fk > 0) { + ulint highest_id_so_far; + dberr_t error; + + /* If it runs concurrently with create index or table + rebuild, we will need a separate trx to do the system + table change, since in the case of failure to rebuild/create + index, it will need to commit the trx that drops the newly + created table/index, while for FK, it needs to rollback + the metadata change */ + if (new_clustered || ctx->num_to_add) { + fk_trx = innobase_trx_allocate(user_thd); - rw_lock_x_lock(dict_index_get_lock(index)); - index->to_be_dropped = FALSE; - rw_lock_x_unlock(dict_index_get_lock(index)); + trx_start_for_ddl(fk_trx, TRX_DICT_OP_INDEX); + + fk_trx->dict_operation_lock_mode = + trx->dict_operation_lock_mode; + } else { + fk_trx = trx; } - goto func_exit; + ut_ad(ha_alter_info->handler_flags + & Alter_inplace_info::ADD_FOREIGN_KEY); + + highest_id_so_far = dict_table_get_highest_foreign_id( + prebuilt->table); + + highest_id_so_far++; + + fk_table = ctx->indexed_table; + + for (ulint i = 0; i < ctx->num_to_add_fk; i++) { + + /* Get the new dict_table_t */ + if (new_clustered) { + ctx->add_fk[i]->foreign_table + = fk_table; + } + + /* Add Foreign Key info to in-memory metadata */ + UT_LIST_ADD_LAST(foreign_list, + fk_table->foreign_list, + ctx->add_fk[i]); + + if (ctx->add_fk[i]->referenced_table) { + UT_LIST_ADD_LAST( + referenced_list, + ctx->add_fk[i]->referenced_table->referenced_list, + ctx->add_fk[i]); + } + + if (!ctx->add_fk[i]->foreign_index) { + ctx->add_fk[i]->foreign_index + = dict_foreign_find_index( + fk_table, + ctx->add_fk[i]->foreign_col_names, + ctx->add_fk[i]->n_fields, NULL, + TRUE, FALSE); + + ut_ad(ctx->add_fk[i]->foreign_index); + + if (!innobase_check_fk_option( + ctx->add_fk[i])) { + my_error(ER_FK_INCORRECT_OPTION, + MYF(0), + table_share->table_name.str, + 
ctx->add_fk[i]->id); + goto undo_add_fk; + } + } + + /* System table change */ + error = dict_create_add_foreign_to_dictionary( + &highest_id_so_far, prebuilt->table, + ctx->add_fk[i], fk_trx); + + DBUG_EXECUTE_IF( + "innodb_test_cannot_add_fk_system", + error = DB_ERROR;); + + if (error != DB_SUCCESS) { + my_error(ER_FK_FAIL_ADD_SYSTEM, MYF(0), + ctx->add_fk[i]->id); + goto undo_add_fk; + } + } + + /* Make sure the tables are moved to non-lru side of + dictionary list */ + error = dict_load_foreigns(prebuilt->table->name, FALSE, TRUE); + + if (error != DB_SUCCESS) { + my_error(ER_CANNOT_ADD_FOREIGN, MYF(0)); + +undo_add_fk: + err = -1; + + if (new_clustered) { + goto drop_new_clustered; + } else if (ctx->num_to_add > 0) { + ut_ad(trx != fk_trx); + + innobase_rollback_sec_index( + prebuilt, table_share, trx); + innobase_undo_add_fk(ctx, fk_table); + trx_rollback_for_mysql(fk_trx); + + goto trx_commit; + } else { + goto trx_rollback; + } + } + } + + if (new_clustered) { + dberr_t error; + char* tmp_name; + + /* Clear the to_be_dropped flag in the data dictionary. */ + for (ulint i = 0; i < ctx->num_to_drop; i++) { + dict_index_t* index = ctx->drop[i]; + DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX); + DBUG_ASSERT(index->to_be_dropped); + index->to_be_dropped = 0; + } + + /* We copied the table. Any indexes that were + requested to be dropped were not created in the copy + of the table. Apply any last bit of the rebuild log + and then rename the tables. */ + + if (ctx->online) { + DEBUG_SYNC_C("row_log_table_apply2_before"); + error = row_log_table_apply( + ctx->thr, prebuilt->table, altered_table); + + switch (error) { + KEY* dup_key; + case DB_SUCCESS: + break; + case DB_DUPLICATE_KEY: + if (prebuilt->trx->error_key_num + == ULINT_UNDEFINED) { + /* This should be the hidden index on + FTS_DOC_ID. 
*/ + dup_key = NULL; + } else { + DBUG_ASSERT( + prebuilt->trx->error_key_num + < ha_alter_info->key_count); + dup_key = &ha_alter_info + ->key_info_buffer[ + prebuilt->trx + ->error_key_num]; + } + print_keydup_error(altered_table, dup_key, MYF(0)); + break; + case DB_ONLINE_LOG_TOO_BIG: + my_error(ER_INNODB_ONLINE_LOG_TOO_BIG, MYF(0), + ha_alter_info->key_info_buffer[0] + .name); + break; + case DB_INDEX_CORRUPT: + my_error(ER_INDEX_CORRUPT, MYF(0), + (prebuilt->trx->error_key_num + == ULINT_UNDEFINED) + ? FTS_DOC_ID_INDEX_NAME + : ha_alter_info->key_info_buffer[ + prebuilt->trx->error_key_num] + .name); + break; + default: + my_error_innodb(error, + table_share->table_name.str, + prebuilt->table->flags); + } + + if (error != DB_SUCCESS) { + err = -1; + goto drop_new_clustered; + } + } + + if ((ha_alter_info->handler_flags + & Alter_inplace_info::ALTER_COLUMN_NAME) + && innobase_rename_columns(ha_alter_info, true, table, + table_share, prebuilt, trx)) { + err = -1; + goto drop_new_clustered; + } + + /* A new clustered index was defined for the table + and there was no error at this point. We can + now rename the old table as a temporary table, + rename the new temporary table as the old + table and drop the old table. */ + tmp_name = dict_mem_create_temporary_tablename( + ctx->heap, ctx->indexed_table->name, + ctx->indexed_table->id); + + /* Rename table will reload and refresh the in-memory + foreign key constraint metadata. This is a rename operation + in preparing for dropping the old table. Set the table + to_be_dropped bit here, so to make sure DML foreign key + constraint check does not use the stale dict_foreign_t. + This is done because WL#6049 (FK MDL) has not been + implemented yet */ + prebuilt->table->to_be_dropped = true; + + DBUG_EXECUTE_IF("ib_ddl_crash_before_rename", + DBUG_SUICIDE();); + + /* The new table must inherit the flag from the + "parent" table. 
*/ + if (dict_table_is_discarded(prebuilt->table)) { + ctx->indexed_table->ibd_file_missing = true; + ctx->indexed_table->flags2 |= DICT_TF2_DISCARDED; + } + + error = row_merge_rename_tables( + prebuilt->table, ctx->indexed_table, + tmp_name, trx); + + DBUG_EXECUTE_IF("ib_ddl_crash_after_rename", + DBUG_SUICIDE();); + + /* n_ref_count must be 1, because purge cannot + be executing on this very table as we are + holding dict_operation_lock X-latch. */ + ut_a(prebuilt->table->n_ref_count == 1); + + switch (error) { + dict_table_t* old_table; + case DB_SUCCESS: + old_table = prebuilt->table; + + DBUG_EXECUTE_IF("ib_ddl_crash_before_commit", + DBUG_SUICIDE();); + + trx_commit_for_mysql(prebuilt->trx); + + DBUG_EXECUTE_IF("ib_ddl_crash_after_commit", + DBUG_SUICIDE();); + + if (fk_trx) { + ut_ad(fk_trx != trx); + trx_commit_for_mysql(fk_trx); + } + + row_prebuilt_free(prebuilt, TRUE); + error = row_merge_drop_table(trx, old_table); + prebuilt = row_create_prebuilt( + ctx->indexed_table, table->s->reclength); + err = 0; + break; + case DB_TABLESPACE_EXISTS: + ut_a(ctx->indexed_table->n_ref_count == 1); + my_error(ER_TABLESPACE_EXISTS, MYF(0), tmp_name); + err = HA_ERR_TABLESPACE_EXISTS; + goto drop_new_clustered; + case DB_DUPLICATE_KEY: + ut_a(ctx->indexed_table->n_ref_count == 1); + my_error(ER_TABLE_EXISTS_ERROR, MYF(0), tmp_name); + err = HA_ERR_TABLE_EXIST; + goto drop_new_clustered; + default: + my_error_innodb(error, + table_share->table_name.str, + prebuilt->table->flags); + err = -1; + +drop_new_clustered: + /* Reset the to_be_dropped bit for the old table, + since we are aborting the operation and dropping + the new table due to some error conditions */ + prebuilt->table->to_be_dropped = false; + + /* Need to drop the added foreign key first */ + if (fk_trx) { + ut_ad(fk_trx != trx); + innobase_undo_add_fk(ctx, fk_table); + trx_rollback_for_mysql(fk_trx); + } + + dict_table_close(ctx->indexed_table, TRUE, FALSE); + +#ifdef UNIV_DDL_DEBUG + /* Nobody should 
have initialized the stats of the + newly created table yet. When this is the case, we + know that it has not been added for background stats + gathering. */ + ut_a(!ctx->indexed_table->stat_initialized); +#endif /* UNIV_DDL_DEBUG */ + + row_merge_drop_table(trx, ctx->indexed_table); + ctx->indexed_table = NULL; + goto trx_commit; + } + } else if (ctx) { + dberr_t error; + + /* We altered the table in place. */ + /* Lose the TEMP_INDEX_PREFIX. */ + for (ulint i = 0; i < ctx->num_to_add; i++) { + dict_index_t* index = ctx->add[i]; + DBUG_ASSERT(dict_index_get_online_status(index) + == ONLINE_INDEX_COMPLETE); + DBUG_ASSERT(*index->name + == TEMP_INDEX_PREFIX); + index->name++; + error = row_merge_rename_index_to_add( + trx, prebuilt->table->id, + index->id); + if (error != DB_SUCCESS) { + sql_print_error( + "InnoDB: rename index to add: %lu\n", + (ulong) error); + DBUG_ASSERT(0); + } + } + + /* Drop any indexes that were requested to be dropped. + Rename them to TEMP_INDEX_PREFIX in the data + dictionary first. We do not bother to rename + index->name in the dictionary cache, because the index + is about to be freed after row_merge_drop_indexes_dict(). 
*/ + + for (ulint i = 0; i < ctx->num_to_drop; i++) { + dict_index_t* index = ctx->drop[i]; + DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX); + DBUG_ASSERT(index->table == prebuilt->table); + DBUG_ASSERT(index->to_be_dropped); + + error = row_merge_rename_index_to_drop( + trx, index->table->id, index->id); + if (error != DB_SUCCESS) { + sql_print_error( + "InnoDB: rename index to drop: %lu\n", + (ulong) error); + DBUG_ASSERT(0); + } + } + } + + if (err == 0 + && (ha_alter_info->handler_flags + & Alter_inplace_info::DROP_FOREIGN_KEY)) { + DBUG_ASSERT(ctx->num_to_drop_fk > 0); + DBUG_ASSERT(ctx->num_to_drop_fk + == ha_alter_info->alter_info->drop_list.elements); + for (ulint i = 0; i < ctx->num_to_drop_fk; i++) { + DBUG_ASSERT(prebuilt->table + == ctx->drop_fk[i]->foreign_table); + + if (innobase_drop_foreign( + table_share, trx, ctx->drop_fk[i])) { + err = -1; + } + } + } + + if (err == 0 && !new_clustered + && (ha_alter_info->handler_flags + & Alter_inplace_info::ALTER_COLUMN_NAME) + && innobase_rename_columns(ha_alter_info, false, table, + table_share, prebuilt, trx)) { + err = -1; } - /* Drop indexes marked to be dropped */ + if (err == 0) { + if (fk_trx && fk_trx != trx) { + /* This needs to be placed before "trx_commit" marker, + since anyone called "goto trx_commit" has committed + or rolled back fk_trx before jumping here */ + trx_commit_for_mysql(fk_trx); + } +trx_commit: + trx_commit_for_mysql(trx); + } else { +trx_rollback: + /* undo the addition of foreign key */ + if (fk_trx) { + innobase_undo_add_fk(ctx, fk_table); - index = dict_table_get_first_index(prebuilt->table); + if (fk_trx != trx) { + trx_rollback_for_mysql(fk_trx); + } + } - while (index) { - dict_index_t* next_index; + trx_rollback_for_mysql(trx); + + /* If there are newly added secondary indexes, above + rollback will revert the rename operation and put the + new indexes with the temp index prefix, we can drop + them here */ + if (ctx && !new_clustered) { + ulint i; + + /* Need to drop the 
in-memory dict_index_t first + to avoid dict_table_check_for_dup_indexes() + assertion in row_merge_drop_indexes() in the case + of add and drop the same index */ + for (i = 0; i < ctx->num_to_add; i++) { + dict_index_t* index = ctx->add[i]; + dict_index_remove_from_cache( + prebuilt->table, index); + } - next_index = dict_table_get_next_index(index); + if (ctx->num_to_add) { + trx_start_for_ddl(trx, TRX_DICT_OP_INDEX); + row_merge_drop_indexes(trx, prebuilt->table, + FALSE); + trx_commit_for_mysql(trx); + } - if (index->to_be_dropped) { - row_merge_drop_index(index, prebuilt->table, trx); + for (i = 0; i < ctx->num_to_drop; i++) { + dict_index_t* index = ctx->drop[i]; + index->to_be_dropped = false; + } } + } - index = next_index; + /* Flush the log to reduce probability that the .frm files and + the InnoDB data dictionary get out-of-sync if the user runs + with innodb_flush_log_at_trx_commit = 0 */ + + log_buffer_flush_to_disk(); + + if (new_clustered) { + innobase_online_rebuild_log_free(prebuilt->table); } - /* Check that all flagged indexes were dropped. */ - for (index = dict_table_get_first_index(prebuilt->table); - index; index = dict_table_get_next_index(index)) { - ut_a(!index->to_be_dropped); + if (err == 0 && ctx) { + /* The changes were successfully performed. */ + bool add_fts = false; + + /* Rebuild the index translation table. + This should only be needed when !new_clustered. */ + share->idx_trans_tbl.index_count = 0; + + /* Publish the created fulltext index, if any. + Note that a fulltext index can be created without + creating the clustered index, if there already exists + a suitable FTS_DOC_ID column. 
If not, one will be + created, implying new_clustered */ + for (ulint i = 0; i < ctx->num_to_add; i++) { + dict_index_t* index = ctx->add[i]; + + if (index->type & DICT_FTS) { + DBUG_ASSERT(index->type == DICT_FTS); + fts_add_index(index, prebuilt->table); + add_fts = true; + } + } + + if (!new_clustered && ha_alter_info->index_drop_count) { + + /* Really drop the indexes that were dropped. + The transaction had to be committed first + (after renaming the indexes), so that in the + event of a crash, crash recovery will drop the + indexes, because it drops all indexes whose + names start with TEMP_INDEX_PREFIX. Once we + have started dropping an index tree, there is + no way to roll it back. */ + + trx_start_for_ddl(trx, TRX_DICT_OP_INDEX); + + for (ulint i = 0; i < ctx->num_to_drop; i++) { + dict_index_t* index = ctx->drop[i]; + DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX); + DBUG_ASSERT(index->table == prebuilt->table); + DBUG_ASSERT(index->to_be_dropped); + + /* Replace the indexes in foreign key + constraints if needed. */ + + dict_foreign_replace_index( + prebuilt->table, index, prebuilt->trx); + + /* Mark the index dropped + in the data dictionary cache. 
*/ + rw_lock_x_lock(dict_index_get_lock(index)); + index->page = FIL_NULL; + rw_lock_x_unlock(dict_index_get_lock(index)); + } + + row_merge_drop_indexes_dict(trx, prebuilt->table->id); + + for (ulint i = 0; i < ctx->num_to_drop; i++) { + dict_index_t* index = ctx->drop[i]; + DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX); + DBUG_ASSERT(index->table == prebuilt->table); + + if (index->type & DICT_FTS) { + DBUG_ASSERT(index->type == DICT_FTS + || (index->type + & DICT_CORRUPT)); + DBUG_ASSERT(prebuilt->table->fts); + fts_drop_index( + prebuilt->table, index, trx); + } + + dict_index_remove_from_cache( + prebuilt->table, index); + } + + trx_commit_for_mysql(trx); + } + + ut_d(dict_table_check_for_dup_indexes( + prebuilt->table, CHECK_ALL_COMPLETE)); + DBUG_ASSERT(new_clustered == !prebuilt->trx); + + if (add_fts) { + fts_optimize_add_table(prebuilt->table); + } } - /* We will need to rebuild index translation table. Set - valid index entry count in the translation table to zero */ - share->idx_trans_tbl.index_count = 0; + if (!prebuilt->trx) { + /* We created a new clustered index and committed the + user transaction already, so that we were able to + drop the old table. */ + update_thd(); + prebuilt->trx->will_lock++; -func_exit: - ut_d(dict_table_check_for_dup_indexes(prebuilt->table, TRUE)); + DBUG_EXECUTE_IF("ib_ddl_crash_after_user_trx_commit", + DBUG_SUICIDE();); + + trx_start_if_not_started_xa(prebuilt->trx); + } + ut_d(dict_table_check_for_dup_indexes( + prebuilt->table, CHECK_ABORTED_OK)); ut_a(fts_check_cached_index(prebuilt->table)); + row_mysql_unlock_data_dictionary(trx); + if (fk_trx && fk_trx != trx) { + fk_trx->dict_operation_lock_mode = 0; + trx_free_for_mysql(fk_trx); + } + trx_free_for_mysql(trx); + + if (ctx && trx == ctx->trx) { + ctx->trx = NULL; + } + + if (err == 0) { + /* Delete corresponding rows from the stats table. 
We update + the statistics in a separate transaction from trx, because + lock waits are not allowed in a data dictionary transaction. + (Lock waits are possible on the statistics table, because it + is directly accessible by users, not covered by the + dict_operation_lock.) + + Because the data dictionary changes were already committed, + orphaned rows may be left in the statistics table if the + system crashes. */ + + for (uint i = 0; i < ha_alter_info->index_drop_count; i++) { + const KEY* key + = ha_alter_info->index_drop_buffer[i]; + dberr_t ret; + char errstr[1024]; + + ret = dict_stats_drop_index( + prebuilt->table->name, key->name, + errstr, sizeof(errstr)); + + if (ret != DB_SUCCESS) { + push_warning(user_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_LOCK_WAIT_TIMEOUT, + errstr); + } + } + + if (ctx && !dict_table_is_discarded(prebuilt->table)) { + bool stats_init_called = false; + + for (uint i = 0; i < ctx->num_to_add; i++) { + dict_index_t* index = ctx->add[i]; + + if (!(index->type & DICT_FTS)) { + + if (!stats_init_called) { + innobase_copy_frm_flags_from_table_share( + index->table, + altered_table->s); + + dict_stats_init(index->table); + + stats_init_called = true; + } + + dict_stats_update_for_index(index); + } + } + } + } - trx_commit_for_mysql(trx); trx_commit_for_mysql(prebuilt->trx); - row_mysql_unlock_data_dictionary(trx); /* Flush the log to reduce probability that the .frm files and the InnoDB data dictionary get out-of-sync if the user runs @@ -1912,12 +5213,106 @@ func_exit: log_buffer_flush_to_disk(); - trx_free_for_mysql(trx); - /* Tell the InnoDB server that there might be work for utility threads: */ srv_active_wake_master_thread(); - DBUG_RETURN(err); +func_exit: + + if (err == 0 && altered_table->found_next_number_field != 0) { + dict_table_autoinc_lock(prebuilt->table); + dict_table_autoinc_initialize(prebuilt->table, max_autoinc); + dict_table_autoinc_unlock(prebuilt->table); + } + +#ifndef DBUG_OFF + dict_index_t* clust_index = 
dict_table_get_first_index( + prebuilt->table); + DBUG_ASSERT(!clust_index->online_log); + DBUG_ASSERT(dict_index_get_online_status(clust_index) + == ONLINE_INDEX_COMPLETE); +#endif /* !DBUG_OFF */ + +#ifdef UNIV_DEBUG + for (dict_index_t* index = dict_table_get_first_index( + prebuilt->table); + index; + index = dict_table_get_next_index(index)) { + ut_ad(!index->to_be_dropped); + } +#endif /* UNIV_DEBUG */ + + if (err == 0) { + MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE); + +#ifdef UNIV_DDL_DEBUG + /* Invoke CHECK TABLE atomically after a successful + ALTER TABLE. */ + TABLE* old_table = table; + table = altered_table; + ut_a(check(user_thd, 0) == HA_ADMIN_OK); + table = old_table; +#endif /* UNIV_DDL_DEBUG */ + } + + DBUG_RETURN(err != 0); +} + +/** +@param thd - the session +@param start_value - the lower bound +@param max_value - the upper bound (inclusive) */ +ib_sequence_t::ib_sequence_t( + THD* thd, + ulonglong start_value, + ulonglong max_value) + : + m_max_value(max_value), + m_increment(0), + m_offset(0), + m_next_value(start_value), + m_eof(false) +{ + if (thd != 0 && m_max_value > 0) { + + thd_get_autoinc(thd, &m_offset, &m_increment); + + if (m_increment > 1 || m_offset > 1) { + + /* If there is an offset or increment specified + then we need to work out the exact next value. */ + + m_next_value = innobase_next_autoinc( + start_value, 1, + m_increment, m_offset, m_max_value); + + } else if (start_value == 0) { + /* The next value can never be 0. 
*/ + m_next_value = 1; + } + } else { + m_eof = true; + } +} + +/** +Postfix increment +@return the next value to insert */ +ulonglong +ib_sequence_t::operator++(int) UNIV_NOTHROW +{ + ulonglong current = m_next_value; + + ut_ad(!m_eof); + ut_ad(m_max_value > 0); + + m_next_value = innobase_next_autoinc( + current, 1, m_increment, m_offset, m_max_value); + + if (m_next_value == m_max_value && current == m_next_value) { + m_eof = true; + } + + return(current); } diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc index 882f5040a38..4f84f477b3a 100644 --- a/storage/innobase/handler/i_s.cc +++ b/storage/innobase/handler/i_s.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2007, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -39,6 +39,7 @@ Created July 18, 2007 Vasil Dimov #include "btr0types.h" #include "buf0buddy.h" /* for i_s_cmpmem */ #include "buf0buf.h" /* for buf_pool */ +#include "dict0dict.h" /* for dict_table_stats_lock() */ #include "dict0load.h" /* for file sys_tables related info. */ #include "dict0mem.h" #include "dict0types.h" @@ -57,14 +58,12 @@ Created July 18, 2007 Vasil Dimov /** structure associates a name string with a file page type and/or buffer page state. */ -struct buffer_page_desc_str_struct{ +struct buf_page_desc_t{ const char* type_str; /*!< String explain the page type/state */ ulint type_value; /*!< Page type or page state */ }; -typedef struct buffer_page_desc_str_struct buf_page_desc_str_t; - /** Any states greater than FIL_PAGE_TYPE_LAST would be treated as unknown. 
*/ #define I_S_PAGE_TYPE_UNKNOWN (FIL_PAGE_TYPE_LAST + 1) @@ -73,7 +72,7 @@ in i_s_page_type[] array */ #define I_S_PAGE_TYPE_INDEX 1 /** Name string for File Page Types */ -static buf_page_desc_str_t i_s_page_type[] = { +static buf_page_desc_t i_s_page_type[] = { {"ALLOCATED", FIL_PAGE_TYPE_ALLOCATED}, {"INDEX", FIL_PAGE_INDEX}, {"UNDO_LOG", FIL_PAGE_UNDO_LOG}, @@ -98,7 +97,7 @@ static buf_page_desc_str_t i_s_page_type[] = { /** This structure defines information we will fetch from pages currently cached in the buffer pool. It will be used to populate table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE */ -struct buffer_page_info_struct{ +struct buf_page_info_t{ ulint block_id; /*!< Buffer Pool block ID */ unsigned space_id:32; /*!< Tablespace ID */ unsigned page_num:32; /*!< Page number/offset */ @@ -131,8 +130,6 @@ struct buffer_page_info_struct{ index_id_t index_id; /*!< Index ID if a index page */ }; -typedef struct buffer_page_info_struct buf_page_info_t; - /** maximum number of buffer page info we would cache. */ #define MAX_BUF_INFO_CACHED 10000 @@ -282,6 +279,43 @@ field_store_string( } /*******************************************************************//** +Store the name of an index in a MYSQL_TYPE_VARCHAR field. +Handles the names of incomplete secondary indexes. +@return 0 on success */ +static +int +field_store_index_name( +/*===================*/ + Field* field, /*!< in/out: target field for + storage */ + const char* index_name) /*!< in: NUL-terminated utf-8 + index name, possibly starting with + TEMP_INDEX_PREFIX */ +{ + int ret; + + ut_ad(index_name != NULL); + ut_ad(field->real_type() == MYSQL_TYPE_VARCHAR); + + /* Since TEMP_INDEX_PREFIX is not a valid UTF8, we need to convert + it to something else. 
*/ + if (index_name[0] == TEMP_INDEX_PREFIX) { + char buf[NAME_LEN + 1]; + buf[0] = '?'; + memcpy(buf + 1, index_name + 1, strlen(index_name)); + ret = field->store(buf, strlen(buf), + system_charset_info); + } else { + ret = field->store(index_name, strlen(index_name), + system_charset_info); + } + + field->set_notnull(); + + return(ret); +} + +/*******************************************************************//** Auxiliary function to store ulint value in MYSQL_TYPE_LONGLONG field. If the value is ULINT_UNDEFINED then the field it set to NULL. @return 0 on success */ @@ -713,7 +747,7 @@ static struct st_mysql_information_schema i_s_info = MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION }; -UNIV_INTERN struct st_maria_plugin i_s_innodb_trx = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_trx = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -757,9 +791,13 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_trx = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; /* Fields of the dynamic table INFORMATION_SCHEMA.innodb_locks */ @@ -923,16 +961,9 @@ fill_innodb_locks_from_cache( /* lock_index */ if (row->lock_index != NULL) { - - bufend = innobase_convert_name(buf, sizeof(buf), - row->lock_index, - strlen(row->lock_index), - thd, FALSE); - OK(fields[IDX_LOCK_INDEX]->store(buf, bufend - buf, - system_charset_info)); - fields[IDX_LOCK_INDEX]->set_notnull(); + OK(field_store_index_name(fields[IDX_LOCK_INDEX], + row->lock_index)); } else { - fields[IDX_LOCK_INDEX]->set_null(); } @@ -979,7 +1010,7 @@ innodb_locks_init( DBUG_RETURN(0); } -UNIV_INTERN struct st_maria_plugin i_s_innodb_locks = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_locks = { /* the plugin type 
(a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -1023,9 +1054,13 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_locks = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; /* Fields of the dynamic table INFORMATION_SCHEMA.innodb_lock_waits */ @@ -1162,7 +1197,7 @@ innodb_lock_waits_init( DBUG_RETURN(0); } -UNIV_INTERN struct st_maria_plugin i_s_innodb_lock_waits = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_lock_waits = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -1206,9 +1241,13 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_lock_waits = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; /*******************************************************************//** @@ -1495,7 +1534,7 @@ i_s_cmp_reset_init( DBUG_RETURN(0); } -UNIV_INTERN struct st_maria_plugin i_s_innodb_cmp = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmp = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -1539,12 +1578,16 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_cmp = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; -UNIV_INTERN struct st_maria_plugin 
i_s_innodb_cmp_reset = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmp_reset = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -1589,9 +1632,371 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_cmp_reset = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +/* Fields of the dynamic tables +information_schema.innodb_cmp_per_index and +information_schema.innodb_cmp_per_index_reset. */ +static ST_FIELD_INFO i_s_cmp_per_index_fields_info[] = +{ +#define IDX_DATABASE_NAME 0 + {STRUCT_FLD(field_name, "database_name"), + STRUCT_FLD(field_length, 192), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TABLE_NAME 1 + {STRUCT_FLD(field_name, "table_name"), + STRUCT_FLD(field_length, 192), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_INDEX_NAME 2 + {STRUCT_FLD(field_name, "index_name"), + STRUCT_FLD(field_length, 192), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_COMPRESS_OPS 3 + {STRUCT_FLD(field_name, "compress_ops"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_COMPRESS_OPS_OK 4 + {STRUCT_FLD(field_name, "compress_ops_ok"), + 
STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_COMPRESS_TIME 5 + {STRUCT_FLD(field_name, "compress_time"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_UNCOMPRESS_OPS 6 + {STRUCT_FLD(field_name, "uncompress_ops"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_UNCOMPRESS_TIME 7 + {STRUCT_FLD(field_name, "uncompress_time"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/*******************************************************************//** +Fill the dynamic table +information_schema.innodb_cmp_per_index or +information_schema.innodb_cmp_per_index_reset. 
+@return 0 on success, 1 on failure */ +static +int +i_s_cmp_per_index_fill_low( +/*=======================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* , /*!< in: condition (ignored) */ + ibool reset) /*!< in: TRUE=reset cumulated counts */ +{ + TABLE* table = tables->table; + Field** fields = table->field; + int status = 0; + + DBUG_ENTER("i_s_cmp_per_index_fill_low"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + + /* Create a snapshot of the stats so we do not bump into lock + order violations with dict_sys->mutex below. */ + mutex_enter(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index_t snap (page_zip_stat_per_index); + mutex_exit(&page_zip_stat_per_index_mutex); + + mutex_enter(&dict_sys->mutex); + + page_zip_stat_per_index_t::iterator iter; + ulint i; + + for (iter = snap.begin(), i = 0; iter != snap.end(); iter++, i++) { + + char name[192]; + dict_index_t* index = dict_index_find_on_id_low(iter->first); + + if (index != NULL) { + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + + dict_fs2utf8(index->table_name, + db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + + field_store_string(fields[IDX_DATABASE_NAME], db_utf8); + field_store_string(fields[IDX_TABLE_NAME], table_utf8); + field_store_index_name(fields[IDX_INDEX_NAME], + index->name); + } else { + /* index not found */ + ut_snprintf(name, sizeof(name), + "index_id:" IB_ID_FMT, iter->first); + field_store_string(fields[IDX_DATABASE_NAME], + "unknown"); + field_store_string(fields[IDX_TABLE_NAME], + "unknown"); + field_store_string(fields[IDX_INDEX_NAME], + name); + } + + fields[IDX_COMPRESS_OPS]->store( + iter->second.compressed); + + fields[IDX_COMPRESS_OPS_OK]->store( + iter->second.compressed_ok); + + fields[IDX_COMPRESS_TIME]->store( + (long) (iter->second.compressed_usec / 
1000000)); + + fields[IDX_UNCOMPRESS_OPS]->store( + iter->second.decompressed); + + fields[IDX_UNCOMPRESS_TIME]->store( + (long) (iter->second.decompressed_usec / 1000000)); + + if (schema_table_store_record(thd, table)) { + status = 1; + break; + } + + /* Release and reacquire the dict mutex to allow other + threads to proceed. This could eventually result in the + contents of INFORMATION_SCHEMA.innodb_cmp_per_index being + inconsistent, but it is an acceptable compromise. */ + if (i % 1000 == 0) { + mutex_exit(&dict_sys->mutex); + mutex_enter(&dict_sys->mutex); + } + } + + mutex_exit(&dict_sys->mutex); + + if (reset) { + page_zip_reset_stat_per_index(); + } + + DBUG_RETURN(status); +} + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_cmp_per_index. +@return 0 on success, 1 on failure */ +static +int +i_s_cmp_per_index_fill( +/*===================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* cond) /*!< in: condition (ignored) */ +{ + return(i_s_cmp_per_index_fill_low(thd, tables, cond, FALSE)); +} + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_cmp_per_index_reset. +@return 0 on success, 1 on failure */ +static +int +i_s_cmp_per_index_reset_fill( +/*=========================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* cond) /*!< in: condition (ignored) */ +{ + return(i_s_cmp_per_index_fill_low(thd, tables, cond, TRUE)); +} + +/*******************************************************************//** +Bind the dynamic table information_schema.innodb_cmp_per_index. 
+@return 0 on success */ +static +int +i_s_cmp_per_index_init( +/*===================*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_cmp_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_cmp_per_index_fields_info; + schema->fill_table = i_s_cmp_per_index_fill; + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table information_schema.innodb_cmp_per_index_reset. +@return 0 on success */ +static +int +i_s_cmp_per_index_reset_init( +/*=========================*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_cmp_reset_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_cmp_per_index_fields_info; + schema->fill_table = i_s_cmp_per_index_reset_fill; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmp_per_index = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_CMP_PER_INDEX"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "Statistics for the InnoDB compression (per index)"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, i_s_cmp_per_index_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, 
NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmp_per_index_reset = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_CMP_PER_INDEX_RESET"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "Statistics for the InnoDB compression (per index);" + " reset cumulated counts"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, i_s_cmp_per_index_reset_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; /* Fields of the dynamic table information_schema.innodb_cmpmem. */ @@ -1695,8 +2100,8 @@ i_s_cmpmem_fill_low( table->field[3]->store(UNIV_LIKELY(x < BUF_BUDDY_SIZES) ? 
UT_LIST_GET_LEN(buf_pool->zip_free[x]) : 0); - table->field[4]->store((longlong) - buddy_stat->relocated, true); + table->field[4]->store( + (longlong) buddy_stat->relocated, true); table->field[5]->store( (ulong) (buddy_stat->relocated_usec / 1000000)); @@ -1786,7 +2191,7 @@ i_s_cmpmem_reset_init( DBUG_RETURN(0); } -UNIV_INTERN struct st_maria_plugin i_s_innodb_cmpmem = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmpmem = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -1830,12 +2235,16 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_cmpmem = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; -UNIV_INTERN struct st_maria_plugin i_s_innodb_cmpmem_reset = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmpmem_reset = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -1880,9 +2289,13 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_cmpmem_reset = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; /* Fields of the dynamic table INFORMATION_SCHEMA.innodb_metrics */ @@ -1935,7 +2348,7 @@ static ST_FIELD_INFO innodb_metrics_fields_info[] = #define METRIC_AVG_VALUE_START 5 {STRUCT_FLD(field_name, "AVG_COUNT"), - STRUCT_FLD(field_length, 0), + STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH), STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT), STRUCT_FLD(value, 0), STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), @@ -1971,7 +2384,7 @@ static ST_FIELD_INFO 
innodb_metrics_fields_info[] = #define METRIC_AVG_VALUE_RESET 9 {STRUCT_FLD(field_name, "AVG_COUNT_RESET"), - STRUCT_FLD(field_length, 0), + STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH), STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT), STRUCT_FLD(value, 0), STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), @@ -2360,7 +2773,7 @@ innodb_metrics_init( DBUG_RETURN(0); } -UNIV_INTERN struct st_maria_plugin i_s_innodb_metrics = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_metrics = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -2404,9 +2817,13 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_metrics = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; /* Fields of the dynamic table INFORMATION_SCHEMA.innodb_ft_default_stopword */ static ST_FIELD_INFO i_s_stopword_fields_info[] = @@ -2473,7 +2890,7 @@ i_s_stopword_init( DBUG_RETURN(0); } -UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_default_stopword = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_ft_default_stopword = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -2481,7 +2898,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_default_stopword = /* pointer to type-specific plugin descriptor */ /* void* */ - STRUCT_FLD(info, &i_s_info), + STRUCT_FLD(info, &i_s_stopword_fields_info), /* plugin name */ /* const char* */ @@ -2517,9 +2934,13 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_default_stopword = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* 
Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; /* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED @@ -2571,8 +2992,8 @@ i_s_fts_deleted_generic_fill( deleted = fts_doc_ids_create(); - user_table = dict_table_open_on_name_no_stats( - fts_internal_tbl_name, FALSE, DICT_ERR_IGNORE_NONE); + user_table = dict_table_open_on_name( + fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE); if (!user_table) { DBUG_RETURN(0); @@ -2603,7 +3024,7 @@ i_s_fts_deleted_generic_fill( fts_doc_ids_free(deleted); - dict_table_close(user_table, FALSE); + dict_table_close(user_table, FALSE, FALSE); DBUG_RETURN(0); } @@ -2642,7 +3063,7 @@ i_s_fts_deleted_init( DBUG_RETURN(0); } -UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_deleted = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_ft_deleted = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -2650,7 +3071,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_deleted = /* pointer to type-specific plugin descriptor */ /* void* */ - STRUCT_FLD(info, &i_s_info), + STRUCT_FLD(info, &i_s_fts_doc_fields_info), /* plugin name */ /* const char* */ @@ -2686,9 +3107,13 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_deleted = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; /*******************************************************************//** @@ -2725,7 +3150,7 @@ i_s_fts_being_deleted_init( DBUG_RETURN(0); } -UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_being_deleted = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_ft_being_deleted = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -2733,7 +3158,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_being_deleted = /* 
pointer to type-specific plugin descriptor */ /* void* */ - STRUCT_FLD(info, &i_s_info), + STRUCT_FLD(info, &i_s_fts_doc_fields_info), /* plugin name */ /* const char* */ @@ -2769,9 +3194,13 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_being_deleted = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; /*******************************************************************//** @@ -2803,8 +3232,8 @@ i_s_fts_inserted_fill( DBUG_RETURN(0); } - user_table = dict_table_open_on_name_no_stats( - fts_internal_tbl_name, FALSE, DICT_ERR_IGNORE_NONE); + user_table = dict_table_open_on_name( + fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE); if (!user_table) { DBUG_RETURN(0); @@ -2835,7 +3264,7 @@ i_s_fts_inserted_fill( fts_doc_ids_free(inserted); - dict_table_close(user_table, FALSE); + dict_table_close(user_table, FALSE, FALSE); DBUG_RETURN(0); } @@ -2858,7 +3287,7 @@ i_s_fts_inserted_init( DBUG_RETURN(0); } -UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_inserted = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_ft_inserted = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -2866,7 +3295,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_inserted = /* pointer to type-specific plugin descriptor */ /* void* */ - STRUCT_FLD(info, &i_s_info), + STRUCT_FLD(info, &i_s_fts_doc_fields_info), /* plugin name */ /* const char* */ @@ -2902,9 +3331,13 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_inserted = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + 
STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; /* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHED and @@ -3078,8 +3511,8 @@ i_s_fts_index_cache_fill( DBUG_RETURN(0); } - user_table = dict_table_open_on_name_no_stats( - fts_internal_tbl_name, FALSE, DICT_ERR_IGNORE_NONE); + user_table = dict_table_open_on_name( + fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE); if (!user_table) { DBUG_RETURN(0); @@ -3098,7 +3531,7 @@ i_s_fts_index_cache_fill( i_s_fts_index_cache_fill_one_index(index_cache, thd, tables); } - dict_table_close(user_table, FALSE); + dict_table_close(user_table, FALSE, FALSE); DBUG_RETURN(0); } @@ -3121,7 +3554,7 @@ i_s_fts_index_cache_init( DBUG_RETURN(0); } -UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_index_cache = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_ft_index_cache = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -3129,7 +3562,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_index_cache = /* pointer to type-specific plugin descriptor */ /* void* */ - STRUCT_FLD(info, &i_s_info), + STRUCT_FLD(info, &i_s_fts_index_fields_info), /* plugin name */ /* const char* */ @@ -3165,9 +3598,13 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_index_cache = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; /*******************************************************************//** @@ -3276,6 +3713,7 @@ i_s_fts_index_table_fill_one_index( ulint num_row_fill; DBUG_ENTER("i_s_fts_index_cache_fill_one_index"); + DBUG_ASSERT(!dict_index_is_online_ddl(index)); heap = mem_heap_create(1024); @@ -3384,8 +3822,8 @@ i_s_fts_index_table_fill( 
DBUG_RETURN(0); } - user_table = dict_table_open_on_name_no_stats( - fts_internal_tbl_name, FALSE, DICT_ERR_IGNORE_NONE); + user_table = dict_table_open_on_name( + fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE); if (!user_table) { DBUG_RETURN(0); @@ -3398,7 +3836,7 @@ i_s_fts_index_table_fill( } } - dict_table_close(user_table, FALSE); + dict_table_close(user_table, FALSE, FALSE); DBUG_RETURN(0); } @@ -3421,7 +3859,7 @@ i_s_fts_index_table_init( DBUG_RETURN(0); } -UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_index_table = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_ft_index_table = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -3429,7 +3867,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_index_table = /* pointer to type-specific plugin descriptor */ /* void* */ - STRUCT_FLD(info, &i_s_info), + STRUCT_FLD(info, &i_s_fts_index_fields_info), /* plugin name */ /* const char* */ @@ -3465,9 +3903,13 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_index_table = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; /* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG */ @@ -3541,8 +3983,8 @@ i_s_fts_config_fill( fields = table->field; - user_table = dict_table_open_on_name_no_stats( - fts_internal_tbl_name, FALSE, DICT_ERR_IGNORE_NONE); + user_table = dict_table_open_on_name( + fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE); if (!user_table) { DBUG_RETURN(0); @@ -3556,6 +3998,7 @@ i_s_fts_config_fill( if (!ib_vector_is_empty(user_table->fts->indexes)) { index = (dict_index_t*) ib_vector_getp_const( user_table->fts->indexes, 0); + DBUG_ASSERT(!dict_index_is_online_ddl(index)); } while (fts_config_key[i]) { @@ 
-3567,10 +4010,10 @@ i_s_fts_config_fill( value.f_str = str; - if (strcmp(fts_config_key[i], FTS_TOTAL_WORD_COUNT) == 0 - && index) { + if (index + && strcmp(fts_config_key[i], FTS_TOTAL_WORD_COUNT) == 0) { key_name = fts_config_create_index_param_name( - fts_config_key[i], index); + fts_config_key[i], index); allocated = TRUE; } else { key_name = (char*) fts_config_key[i]; @@ -3597,7 +4040,7 @@ i_s_fts_config_fill( trx_free_for_background(trx); - dict_table_close(user_table, FALSE); + dict_table_close(user_table, FALSE, FALSE); DBUG_RETURN(0); } @@ -3620,7 +4063,7 @@ i_s_fts_config_init( DBUG_RETURN(0); } -UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_config = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_ft_config = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -3628,7 +4071,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_config = /* pointer to type-specific plugin descriptor */ /* void* */ - STRUCT_FLD(info, &i_s_info), + STRUCT_FLD(info, &i_s_fts_config_fields_info), /* plugin name */ /* const char* */ @@ -3664,9 +4107,13 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_config = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; /* Fields of the dynamic table INNODB_BUFFER_POOL_STATS. 
*/ @@ -3782,7 +4229,7 @@ static ST_FIELD_INFO i_s_innodb_buffer_stats_fields_info[] = #define IDX_BUF_STATS_PAGE_YOUNG_RATE 12 {STRUCT_FLD(field_name, "PAGES_MADE_YOUNG_RATE"), - STRUCT_FLD(field_length, 0), + STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH), STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT), STRUCT_FLD(value, 0), STRUCT_FLD(field_flags, 0), @@ -3791,7 +4238,7 @@ static ST_FIELD_INFO i_s_innodb_buffer_stats_fields_info[] = #define IDX_BUF_STATS_PAGE_NOT_YOUNG_RATE 13 {STRUCT_FLD(field_name, "PAGES_MADE_NOT_YOUNG_RATE"), - STRUCT_FLD(field_length, 0), + STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH), STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT), STRUCT_FLD(value, 0), STRUCT_FLD(field_flags, 0), @@ -3827,7 +4274,7 @@ static ST_FIELD_INFO i_s_innodb_buffer_stats_fields_info[] = #define IDX_BUF_STATS_PAGE_READ_RATE 17 {STRUCT_FLD(field_name, "PAGES_READ_RATE"), - STRUCT_FLD(field_length, 0), + STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH), STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT), STRUCT_FLD(value, 0), STRUCT_FLD(field_flags, 0), @@ -3836,7 +4283,7 @@ static ST_FIELD_INFO i_s_innodb_buffer_stats_fields_info[] = #define IDX_BUF_STATS_PAGE_CREATE_RATE 18 {STRUCT_FLD(field_name, "PAGES_CREATE_RATE"), - STRUCT_FLD(field_length, 0), + STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH), STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT), STRUCT_FLD(value, 0), STRUCT_FLD(field_flags, 0), @@ -3845,7 +4292,7 @@ static ST_FIELD_INFO i_s_innodb_buffer_stats_fields_info[] = #define IDX_BUF_STATS_PAGE_WRITTEN_RATE 19 {STRUCT_FLD(field_name, "PAGES_WRITTEN_RATE"), - STRUCT_FLD(field_length, 0), + STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH), STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT), STRUCT_FLD(value, 0), STRUCT_FLD(field_flags, 0), @@ -3908,7 +4355,7 @@ static ST_FIELD_INFO i_s_innodb_buffer_stats_fields_info[] = #define IDX_BUF_STATS_READ_AHEAD_RATE 26 {STRUCT_FLD(field_name, "READ_AHEAD_RATE"), - STRUCT_FLD(field_length, 0), + STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH), 
STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT), STRUCT_FLD(value, 0), STRUCT_FLD(field_flags, 0), @@ -3917,7 +4364,7 @@ static ST_FIELD_INFO i_s_innodb_buffer_stats_fields_info[] = #define IDX_BUF_STATS_READ_AHEAD_EVICT_RATE 27 {STRUCT_FLD(field_name, "READ_AHEAD_EVICTED_RATE"), - STRUCT_FLD(field_length, 0), + STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH), STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT), STRUCT_FLD(value, 0), STRUCT_FLD(field_flags, 0), @@ -4023,11 +4470,13 @@ i_s_innodb_stats_fill( OK(fields[IDX_BUF_STATS_PAGE_WRITTEN]->store(info->n_pages_written)); + OK(fields[IDX_BUF_STATS_GET]->store(info->n_page_gets)); + OK(fields[IDX_BUF_STATS_PAGE_READ_RATE]->store(info->pages_read_rate)); - OK(fields[IDX_BUF_STATS_PAGE_CREATED]->store(info->pages_created_rate)); + OK(fields[IDX_BUF_STATS_PAGE_CREATE_RATE]->store(info->pages_created_rate)); - OK(fields[IDX_BUF_STATS_PAGE_WRITTEN]->store(info->pages_written_rate)); + OK(fields[IDX_BUF_STATS_PAGE_WRITTEN_RATE]->store(info->pages_written_rate)); if (info->n_page_get_delta) { OK(fields[IDX_BUF_STATS_HIT_RATE]->store( @@ -4137,7 +4586,7 @@ i_s_innodb_buffer_pool_stats_init( DBUG_RETURN(0); } -UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_stats = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_buffer_stats = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -4181,9 +4630,13 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_stats = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; /* Fields of the dynamic table INNODB_BUFFER_POOL_PAGE. 
*/ @@ -4384,9 +4837,8 @@ i_s_innodb_buffer_page_fill( TABLE_LIST* tables, /*!< in/out: tables to fill */ const buf_page_info_t* info_array, /*!< in: array cached page info */ - ulint num_page, /*!< in: number of page info - cached */ - mem_heap_t* heap) /*!< in: temp heap memory */ + ulint num_page) /*!< in: number of page info + cached */ { TABLE* table; Field** fields; @@ -4400,15 +4852,13 @@ i_s_innodb_buffer_page_fill( /* Iterate through the cached array and fill the I_S table rows */ for (ulint i = 0; i < num_page; i++) { const buf_page_info_t* page_info; - const char* table_name; - const char* index_name; + char table_name[MAX_FULL_NAME_LEN + 1]; + const char* table_name_end = NULL; const char* state_str; enum buf_page_state state; page_info = info_array + i; - table_name = NULL; - index_name = NULL; state_str = NULL; OK(fields[IDX_BUFFER_POOL_ID]->store(page_info->pool_id)); @@ -4446,6 +4896,10 @@ i_s_innodb_buffer_page_fill( OK(fields[IDX_BUFFER_PAGE_ACCESS_TIME]->store( page_info->access_time)); + fields[IDX_BUFFER_PAGE_TABLE_NAME]->set_null(); + + fields[IDX_BUFFER_PAGE_INDEX_NAME]->set_null(); + /* If this is an index page, fetch the index name and table name */ if (page_info->page_type == I_S_PAGE_TYPE_INDEX) { @@ -4455,32 +4909,28 @@ i_s_innodb_buffer_page_fill( index = dict_index_get_if_in_cache_low( page_info->index_id); - /* Copy the index/table name under mutex. 
We - do not want to hold the InnoDB mutex while - filling the IS table */ if (index) { - const char* name_ptr = index->name; - - if (name_ptr[0] == TEMP_INDEX_PREFIX) { - name_ptr++; - } - - index_name = mem_heap_strdup(heap, name_ptr); - - table_name = mem_heap_strdup(heap, - index->table_name); + table_name_end = innobase_convert_name( + table_name, sizeof(table_name), + index->table_name, + strlen(index->table_name), + thd, TRUE); + + OK(fields[IDX_BUFFER_PAGE_TABLE_NAME]->store( + table_name, + table_name_end - table_name, + system_charset_info)); + fields[IDX_BUFFER_PAGE_TABLE_NAME]->set_notnull(); + + OK(field_store_index_name( + fields[IDX_BUFFER_PAGE_INDEX_NAME], + index->name)); } mutex_exit(&dict_sys->mutex); } - OK(field_store_string( - fields[IDX_BUFFER_PAGE_TABLE_NAME], table_name)); - - OK(field_store_string( - fields[IDX_BUFFER_PAGE_INDEX_NAME], index_name)); - OK(fields[IDX_BUFFER_PAGE_NUM_RECS]->store( page_info->num_recs)); @@ -4593,7 +5043,7 @@ i_s_innodb_set_page_type( /* Encountered an unknown page type */ page_info->page_type = I_S_PAGE_TYPE_UNKNOWN; } else { - /* Make sure we get the righ index into the + /* Make sure we get the right index into the i_s_page_type[] array */ ut_a(page_type == i_s_page_type[page_type].type_value); @@ -4751,7 +5201,7 @@ i_s_innodb_fill_buffer_pool( just collected from the buffer chunk scan */ status = i_s_innodb_buffer_page_fill( thd, tables, info_buffer, - num_page, heap); + num_page); /* If something goes wrong, break and return */ if (status) { @@ -4830,7 +5280,7 @@ i_s_innodb_buffer_page_init( DBUG_RETURN(0); } -UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_page = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_buffer_page = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -4874,9 +5324,13 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_page = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - 
STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; static ST_FIELD_INFO i_s_innodb_buf_page_lru_fields_info[] = @@ -5094,13 +5548,11 @@ i_s_innodb_buf_page_lru_fill( /* Iterate through the cached array and fill the I_S table rows */ for (ulint i = 0; i < num_page; i++) { const buf_page_info_t* page_info; - const char* table_name; - const char* index_name; + char table_name[MAX_FULL_NAME_LEN + 1]; + const char* table_name_end = NULL; const char* state_str; enum buf_page_state state; - table_name = NULL; - index_name = NULL; state_str = NULL; page_info = info_array + i; @@ -5140,6 +5592,10 @@ i_s_innodb_buf_page_lru_fill( OK(fields[IDX_BUF_LRU_PAGE_ACCESS_TIME]->store( page_info->access_time)); + fields[IDX_BUF_LRU_PAGE_TABLE_NAME]->set_null(); + + fields[IDX_BUF_LRU_PAGE_INDEX_NAME]->set_null(); + /* If this is an index page, fetch the index name and table name */ if (page_info->page_type == I_S_PAGE_TYPE_INDEX) { @@ -5149,30 +5605,28 @@ i_s_innodb_buf_page_lru_fill( index = dict_index_get_if_in_cache_low( page_info->index_id); - /* Copy the index/table name under mutex. 
We - do not want to hold the InnoDB mutex while - filling the IS table */ if (index) { - const char* name_ptr = index->name; - - if (name_ptr[0] == TEMP_INDEX_PREFIX) { - name_ptr++; - } - - index_name = mem_heap_strdup(heap, name_ptr); - table_name = mem_heap_strdup(heap, - index->table_name); + table_name_end = innobase_convert_name( + table_name, sizeof(table_name), + index->table_name, + strlen(index->table_name), + thd, TRUE); + + OK(fields[IDX_BUF_LRU_PAGE_TABLE_NAME]->store( + table_name, + table_name_end - table_name, + system_charset_info)); + fields[IDX_BUF_LRU_PAGE_TABLE_NAME]->set_notnull(); + + OK(field_store_index_name( + fields[IDX_BUF_LRU_PAGE_INDEX_NAME], + index->name)); } mutex_exit(&dict_sys->mutex); } - OK(field_store_string( - fields[IDX_BUF_LRU_PAGE_TABLE_NAME], table_name)); - - OK(field_store_string( - fields[IDX_BUF_LRU_PAGE_INDEX_NAME], index_name)); OK(fields[IDX_BUF_LRU_PAGE_NUM_RECS]->store( page_info->num_recs)); @@ -5372,7 +5826,7 @@ i_s_innodb_buffer_page_lru_init( DBUG_RETURN(0); } -UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_page_lru = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_buffer_page_lru = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -5416,9 +5870,13 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_page_lru = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; /*******************************************************************//** @@ -5437,10 +5895,11 @@ i_s_common_deinit( DBUG_RETURN(0); } +/** SYS_TABLES ***************************************************/ /* Fields of the dynamic table INFORMATION_SCHEMA.SYS_TABLES */ static ST_FIELD_INFO innodb_sys_tables_fields_info[] = { -#define 
SYS_TABLE_ID 0 +#define SYS_TABLES_ID 0 {STRUCT_FLD(field_name, "TABLE_ID"), STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), @@ -5449,7 +5908,7 @@ static ST_FIELD_INFO innodb_sys_tables_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, -#define SYS_TABLE_NAME 1 +#define SYS_TABLES_NAME 1 {STRUCT_FLD(field_name, "NAME"), STRUCT_FLD(field_length, MAX_FULL_NAME_LEN + 1), STRUCT_FLD(field_type, MYSQL_TYPE_STRING), @@ -5458,7 +5917,7 @@ static ST_FIELD_INFO innodb_sys_tables_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, -#define SYS_TABLE_FLAG 2 +#define SYS_TABLES_FLAG 2 {STRUCT_FLD(field_name, "FLAG"), STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), STRUCT_FLD(field_type, MYSQL_TYPE_LONG), @@ -5467,7 +5926,7 @@ static ST_FIELD_INFO innodb_sys_tables_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, -#define SYS_TABLE_NUM_COLUMN 3 +#define SYS_TABLES_NUM_COLUMN 3 {STRUCT_FLD(field_name, "N_COLS"), STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), STRUCT_FLD(field_type, MYSQL_TYPE_LONG), @@ -5476,7 +5935,7 @@ static ST_FIELD_INFO innodb_sys_tables_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, -#define SYS_TABLE_SPACE 4 +#define SYS_TABLES_SPACE 4 {STRUCT_FLD(field_name, "SPACE"), STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), STRUCT_FLD(field_type, MYSQL_TYPE_LONG), @@ -5485,6 +5944,33 @@ static ST_FIELD_INFO innodb_sys_tables_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, +#define SYS_TABLES_FILE_FORMAT 5 + {STRUCT_FLD(field_name, "FILE_FORMAT"), + STRUCT_FLD(field_length, 10), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLES_ROW_FORMAT 6 + 
{STRUCT_FLD(field_name, "ROW_FORMAT"), + STRUCT_FLD(field_length, 12), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLES_ZIP_PAGE_SIZE 7 + {STRUCT_FLD(field_name, "ZIP_PAGE_SIZE"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + END_OF_ST_FIELD_INFO }; @@ -5501,20 +5987,42 @@ i_s_dict_fill_sys_tables( TABLE* table_to_fill) /*!< in/out: fill this table */ { Field** fields; + ulint compact = DICT_TF_GET_COMPACT(table->flags); + ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(table->flags); + ulint zip_size = dict_tf_get_zip_size(table->flags); + const char* file_format; + const char* row_format; + + file_format = trx_sys_file_format_id_to_name(atomic_blobs); + if (!compact) { + row_format = "Redundant"; + } else if (!atomic_blobs) { + row_format = "Compact"; + } else if DICT_TF_GET_ZIP_SSIZE(table->flags) { + row_format = "Compressed"; + } else { + row_format = "Dynamic"; + } DBUG_ENTER("i_s_dict_fill_sys_tables"); fields = table_to_fill->field; - OK(fields[SYS_TABLE_ID]->store(longlong(table->id), TRUE)); + OK(fields[SYS_TABLES_ID]->store(longlong(table->id), TRUE)); + + OK(field_store_string(fields[SYS_TABLES_NAME], table->name)); + + OK(fields[SYS_TABLES_FLAG]->store(table->flags)); - OK(field_store_string(fields[SYS_TABLE_NAME], table->name)); + OK(fields[SYS_TABLES_NUM_COLUMN]->store(table->n_cols)); - OK(fields[SYS_TABLE_FLAG]->store(table->flags)); + OK(fields[SYS_TABLES_SPACE]->store(table->space)); - OK(fields[SYS_TABLE_NUM_COLUMN]->store(table->n_cols)); + OK(field_store_string(fields[SYS_TABLES_FILE_FORMAT], file_format)); - OK(fields[SYS_TABLE_SPACE]->store(table->space)); + 
OK(field_store_string(fields[SYS_TABLES_ROW_FORMAT], row_format)); + + OK(fields[SYS_TABLES_ZIP_PAGE_SIZE]->store(zip_size)); OK(schema_table_store_record(thd, table_to_fill)); @@ -5614,7 +6122,7 @@ innodb_sys_tables_init( DBUG_RETURN(0); } -UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_tables = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_tables = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -5658,11 +6166,16 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_tables = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; +/** SYS_TABLESTATS ***********************************************/ /* Fields of the dynamic table INFORMATION_SCHEMA.SYS_TABLESTATS */ static ST_FIELD_INFO innodb_sys_tablestats_fields_info[] = { @@ -5772,24 +6285,37 @@ i_s_dict_fill_sys_tablestats( OK(field_store_string(fields[SYS_TABLESTATS_NAME], table->name)); + dict_table_stats_lock(table, RW_S_LATCH); + if (table->stat_initialized) { OK(field_store_string(fields[SYS_TABLESTATS_INIT], "Initialized")); + + OK(fields[SYS_TABLESTATS_NROW]->store(table->stat_n_rows, + TRUE)); + + OK(fields[SYS_TABLESTATS_CLUST_SIZE]->store( + table->stat_clustered_index_size)); + + OK(fields[SYS_TABLESTATS_INDEX_SIZE]->store( + table->stat_sum_of_other_index_sizes)); + + OK(fields[SYS_TABLESTATS_MODIFIED]->store( + (ulint) table->stat_modified_counter)); } else { OK(field_store_string(fields[SYS_TABLESTATS_INIT], "Uninitialized")); - } - OK(fields[SYS_TABLESTATS_NROW]->store(table->stat_n_rows, TRUE)); + OK(fields[SYS_TABLESTATS_NROW]->store(0, TRUE)); - OK(fields[SYS_TABLESTATS_CLUST_SIZE]->store( - table->stat_clustered_index_size)); + OK(fields[SYS_TABLESTATS_CLUST_SIZE]->store(0)); - 
OK(fields[SYS_TABLESTATS_INDEX_SIZE]->store( - table->stat_sum_of_other_index_sizes)); + OK(fields[SYS_TABLESTATS_INDEX_SIZE]->store(0)); - OK(fields[SYS_TABLESTATS_MODIFIED]->store( - table->stat_modified_counter)); + OK(fields[SYS_TABLESTATS_MODIFIED]->store(0)); + } + + dict_table_stats_unlock(table, RW_S_LATCH); OK(fields[SYS_TABLESTATS_AUTONINC]->store(table->autoinc, TRUE)); @@ -5889,7 +6415,7 @@ innodb_sys_tablestats_init( DBUG_RETURN(0); } -UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_tablestats = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_tablestats = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -5933,11 +6459,16 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_tablestats = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; +/** SYS_INDEXES **************************************************/ /* Fields of the dynamic table INFORMATION_SCHEMA.SYS_INDEXES */ static ST_FIELD_INFO innodb_sysindex_fields_info[] = { @@ -6022,17 +6553,12 @@ i_s_dict_fill_sys_indexes( TABLE* table_to_fill) /*!< in/out: fill this table */ { Field** fields; - const char* name_ptr = index->name; DBUG_ENTER("i_s_dict_fill_sys_indexes"); fields = table_to_fill->field; - if (name_ptr[0] == TEMP_INDEX_PREFIX) { - name_ptr++; - } - - OK(field_store_string(fields[SYS_INDEX_NAME], name_ptr)); + OK(field_store_index_name(fields[SYS_INDEX_NAME], index->name)); OK(fields[SYS_INDEX_ID]->store(longlong(index->id), TRUE)); @@ -6144,7 +6670,7 @@ innodb_sys_indexes_init( DBUG_RETURN(0); } -UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_indexes = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_indexes = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ 
@@ -6188,12 +6714,17 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_indexes = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; -/* Fields of the dynamic table INFORMATION_SCHEMA.SYS_COLUMNS */ +/** SYS_COLUMNS **************************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_COLUMNS */ static ST_FIELD_INFO innodb_sys_columns_fields_info[] = { #define SYS_COLUMN_TABLE_ID 0 @@ -6379,7 +6910,7 @@ innodb_sys_columns_init( DBUG_RETURN(0); } -UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_columns = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_columns = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -6423,11 +6954,17 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_columns = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; -/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_sys_fields */ + +/** SYS_FIELDS ***************************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FIELDS */ static ST_FIELD_INFO innodb_sys_fields_fields_info[] = { #define SYS_FIELD_INDEX_ID 0 @@ -6586,7 +7123,7 @@ innodb_sys_fields_init( DBUG_RETURN(0); } -UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_fields = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_fields = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -6630,12 +7167,17 @@ UNIV_INTERN 
struct st_maria_plugin i_s_innodb_sys_fields = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; -/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_sys_foreign */ +/** SYS_FOREIGN ********************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FOREIGN */ static ST_FIELD_INFO innodb_sys_foreign_fields_info[] = { #define SYS_FOREIGN_ID 0 @@ -6720,6 +7262,7 @@ i_s_dict_fill_sys_foreign( DBUG_RETURN(0); } + /*******************************************************************//** Function to populate INFORMATION_SCHEMA.innodb_sys_foreign table. Loop through each record in SYS_FOREIGN, and extract the foreign key @@ -6786,6 +7329,7 @@ i_s_sys_foreign_fill_table( DBUG_RETURN(0); } + /*******************************************************************//** Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_foreign @return 0 on success */ @@ -6807,7 +7351,7 @@ innodb_sys_foreign_init( DBUG_RETURN(0); } -UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_foreign = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_foreign = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -6851,11 +7395,17 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_foreign = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; -/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_sys_foreign_cols */ + +/** SYS_FOREIGN_COLS 
********************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FOREIGN_COLS */ static ST_FIELD_INFO innodb_sys_foreign_cols_fields_info[] = { #define SYS_FOREIGN_COL_ID 0 @@ -7021,7 +7571,7 @@ innodb_sys_foreign_cols_init( DBUG_RETURN(0); } -UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_foreign_cols = +UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_foreign_cols = { /* the plugin type (a MYSQL_XXX_PLUGIN value) */ /* int */ @@ -7065,8 +7615,470 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_foreign_cols = /* struct st_mysql_sys_var** */ STRUCT_FLD(system_vars, NULL), - /* Maria extension */ - STRUCT_FLD(version_info, INNODB_VERSION_STR), - STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +/** SYS_TABLESPACES ********************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES */ +static ST_FIELD_INFO innodb_sys_tablespaces_fields_info[] = +{ +#define SYS_TABLESPACES_SPACE 0 + {STRUCT_FLD(field_name, "SPACE"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLESPACES_NAME 1 + {STRUCT_FLD(field_name, "NAME"), + STRUCT_FLD(field_length, MAX_FULL_NAME_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLESPACES_FLAGS 2 + {STRUCT_FLD(field_name, "FLAG"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), 
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLESPACES_FILE_FORMAT 3 + {STRUCT_FLD(field_name, "FILE_FORMAT"), + STRUCT_FLD(field_length, 10), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLESPACES_ROW_FORMAT 4 + {STRUCT_FLD(field_name, "ROW_FORMAT"), + STRUCT_FLD(field_length, 22), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLESPACES_PAGE_SIZE 5 + {STRUCT_FLD(field_name, "PAGE_SIZE"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLESPACES_ZIP_PAGE_SIZE 6 + {STRUCT_FLD(field_name, "ZIP_PAGE_SIZE"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO + +}; + +/**********************************************************************//** +Function to fill INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES with information +collected by scanning SYS_TABLESPACESS table. 
+@return 0 on success */ +static +int +i_s_dict_fill_sys_tablespaces( +/*==========================*/ + THD* thd, /*!< in: thread */ + ulint space, /*!< in: space ID */ + const char* name, /*!< in: tablespace name */ + ulint flags, /*!< in: tablespace flags */ + TABLE* table_to_fill) /*!< in/out: fill this table */ +{ + Field** fields; + ulint atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(flags); + ulint page_size = fsp_flags_get_page_size(flags);; + ulint zip_size = fsp_flags_get_zip_size(flags); + const char* file_format; + const char* row_format; + + DBUG_ENTER("i_s_dict_fill_sys_tablespaces"); + + file_format = trx_sys_file_format_id_to_name(atomic_blobs); + if (!atomic_blobs) { + row_format = "Compact or Redundant"; + } else if DICT_TF_GET_ZIP_SSIZE(flags) { + row_format = "Compressed"; + } else { + row_format = "Dynamic"; + } + + fields = table_to_fill->field; + + OK(fields[SYS_TABLESPACES_SPACE]->store(space)); + + OK(field_store_string(fields[SYS_TABLESPACES_NAME], name)); + + OK(fields[SYS_TABLESPACES_FLAGS]->store(flags)); + + OK(field_store_string(fields[SYS_TABLESPACES_FILE_FORMAT], + file_format)); + + OK(field_store_string(fields[SYS_TABLESPACES_ROW_FORMAT], + row_format)); + + OK(fields[SYS_TABLESPACES_PAGE_SIZE]->store(page_size)); + + OK(fields[SYS_TABLESPACES_ZIP_PAGE_SIZE]->store(zip_size)); + + OK(schema_table_store_record(thd, table_to_fill)); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Function to populate INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES table. +Loop through each record in SYS_TABLESPACES, and extract the column +information and fill the INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES table. 
+@return 0 on success */ +static +int +i_s_sys_tablespaces_fill_table( +/*===========================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + btr_pcur_t pcur; + const rec_t* rec; + mem_heap_t* heap; + mtr_t mtr; + + DBUG_ENTER("i_s_sys_tablespaces_fill_table"); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + heap = mem_heap_create(1000); + mutex_enter(&dict_sys->mutex); + mtr_start(&mtr); + + rec = dict_startscan_system(&pcur, &mtr, SYS_TABLESPACES); + + while (rec) { + const char* err_msg; + ulint space; + const char* name; + ulint flags; + + /* Extract necessary information from a SYS_TABLESPACES row */ + err_msg = dict_process_sys_tablespaces( + heap, rec, &space, &name, &flags); + + mtr_commit(&mtr); + mutex_exit(&dict_sys->mutex); + + if (!err_msg) { + i_s_dict_fill_sys_tablespaces( + thd, space, name, flags, + tables->table); + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_FIND_SYSTEM_REC, "%s", + err_msg); + } + + mem_heap_empty(heap); + + /* Get the next record */ + mutex_enter(&dict_sys->mutex); + mtr_start(&mtr); + rec = dict_getnext_system(&pcur, &mtr); + } + + mtr_commit(&mtr); + mutex_exit(&dict_sys->mutex); + mem_heap_free(heap); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES +@return 0 on success */ +static +int +innodb_sys_tablespaces_init( +/*========================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_tablespaces_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = innodb_sys_tablespaces_fields_info; + schema->fill_table = i_s_sys_tablespaces_fill_table; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_tablespaces = 
+{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_SYS_TABLESPACES"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB SYS_TABLESPACES"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, innodb_sys_tablespaces_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), }; +/** SYS_DATAFILES ************************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_DATAFILES */ +static ST_FIELD_INFO innodb_sys_datafiles_fields_info[] = +{ +#define SYS_DATAFILES_SPACE 0 + {STRUCT_FLD(field_name, "SPACE"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_DATAFILES_PATH 1 + {STRUCT_FLD(field_name, "PATH"), + STRUCT_FLD(field_length, OS_FILE_MAX_PATH), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + 
STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/**********************************************************************//** +Function to fill INFORMATION_SCHEMA.INNODB_SYS_DATAFILES with information +collected by scanning SYS_DATAFILESS table. +@return 0 on success */ +static +int +i_s_dict_fill_sys_datafiles( +/*========================*/ + THD* thd, /*!< in: thread */ + ulint space, /*!< in: space ID */ + const char* path, /*!< in: absolute path */ + TABLE* table_to_fill) /*!< in/out: fill this table */ +{ + Field** fields; + + DBUG_ENTER("i_s_dict_fill_sys_datafiles"); + + fields = table_to_fill->field; + + OK(field_store_ulint(fields[SYS_DATAFILES_SPACE], space)); + + OK(field_store_string(fields[SYS_DATAFILES_PATH], path)); + + OK(schema_table_store_record(thd, table_to_fill)); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Function to populate INFORMATION_SCHEMA.INNODB_SYS_DATAFILES table. +Loop through each record in SYS_DATAFILES, and extract the column +information and fill the INFORMATION_SCHEMA.INNODB_SYS_DATAFILES table. 
+@return 0 on success */ +static +int +i_s_sys_datafiles_fill_table( +/*=========================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + btr_pcur_t pcur; + const rec_t* rec; + mem_heap_t* heap; + mtr_t mtr; + + DBUG_ENTER("i_s_sys_datafiles_fill_table"); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + heap = mem_heap_create(1000); + mutex_enter(&dict_sys->mutex); + mtr_start(&mtr); + + rec = dict_startscan_system(&pcur, &mtr, SYS_DATAFILES); + + while (rec) { + const char* err_msg; + ulint space; + const char* path; + + /* Extract necessary information from a SYS_DATAFILES row */ + err_msg = dict_process_sys_datafiles( + heap, rec, &space, &path); + + mtr_commit(&mtr); + mutex_exit(&dict_sys->mutex); + + if (!err_msg) { + i_s_dict_fill_sys_datafiles( + thd, space, path, tables->table); + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_FIND_SYSTEM_REC, "%s", + err_msg); + } + + mem_heap_empty(heap); + + /* Get the next record */ + mutex_enter(&dict_sys->mutex); + mtr_start(&mtr); + rec = dict_getnext_system(&pcur, &mtr); + } + + mtr_commit(&mtr); + mutex_exit(&dict_sys->mutex); + mem_heap_free(heap); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_SYS_DATAFILES +@return 0 on success */ +static +int +innodb_sys_datafiles_init( +/*======================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_datafiles_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = innodb_sys_datafiles_fields_info; + schema->fill_table = i_s_sys_datafiles_fill_table; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_datafiles = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* 
int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_SYS_DATAFILES"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB SYS_DATAFILES"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, innodb_sys_datafiles_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; diff --git a/storage/innobase/handler/i_s.h b/storage/innobase/handler/i_s.h index 7fc7b091795..9e3e651706a 100644 --- a/storage/innobase/handler/i_s.h +++ b/storage/innobase/handler/i_s.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2007, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -28,30 +28,34 @@ Created July 18, 2007 Vasil Dimov const char plugin_author[] = "Oracle Corporation"; -extern struct st_maria_plugin i_s_innodb_trx; -extern struct st_maria_plugin i_s_innodb_locks; -extern struct st_maria_plugin i_s_innodb_lock_waits; -extern struct st_maria_plugin i_s_innodb_cmp; -extern struct st_maria_plugin i_s_innodb_cmp_reset; -extern struct st_maria_plugin i_s_innodb_cmpmem; -extern struct st_maria_plugin i_s_innodb_cmpmem_reset; -extern struct st_maria_plugin i_s_innodb_metrics; -extern struct st_maria_plugin i_s_innodb_ft_default_stopword; -extern struct st_maria_plugin i_s_innodb_ft_inserted; -extern struct st_maria_plugin i_s_innodb_ft_deleted; -extern struct st_maria_plugin i_s_innodb_ft_being_deleted; -extern struct st_maria_plugin i_s_innodb_ft_index_cache; -extern struct st_maria_plugin i_s_innodb_ft_index_table; -extern struct st_maria_plugin i_s_innodb_ft_config; -extern struct st_maria_plugin i_s_innodb_buffer_page; -extern struct st_maria_plugin i_s_innodb_buffer_page_lru; -extern struct st_maria_plugin i_s_innodb_buffer_stats; -extern struct st_maria_plugin i_s_innodb_sys_tables; -extern struct st_maria_plugin i_s_innodb_sys_tablestats; -extern struct st_maria_plugin i_s_innodb_sys_indexes; -extern struct st_maria_plugin i_s_innodb_sys_columns; -extern struct st_maria_plugin i_s_innodb_sys_fields; -extern struct st_maria_plugin i_s_innodb_sys_foreign; -extern struct st_maria_plugin i_s_innodb_sys_foreign_cols; +extern struct st_mysql_plugin i_s_innodb_trx; +extern struct st_mysql_plugin i_s_innodb_locks; +extern struct st_mysql_plugin i_s_innodb_lock_waits; +extern struct st_mysql_plugin i_s_innodb_cmp; +extern struct st_mysql_plugin i_s_innodb_cmp_reset; +extern struct st_mysql_plugin i_s_innodb_cmp_per_index; +extern struct st_mysql_plugin i_s_innodb_cmp_per_index_reset; 
+extern struct st_mysql_plugin i_s_innodb_cmpmem; +extern struct st_mysql_plugin i_s_innodb_cmpmem_reset; +extern struct st_mysql_plugin i_s_innodb_metrics; +extern struct st_mysql_plugin i_s_innodb_ft_default_stopword; +extern struct st_mysql_plugin i_s_innodb_ft_inserted; +extern struct st_mysql_plugin i_s_innodb_ft_deleted; +extern struct st_mysql_plugin i_s_innodb_ft_being_deleted; +extern struct st_mysql_plugin i_s_innodb_ft_index_cache; +extern struct st_mysql_plugin i_s_innodb_ft_index_table; +extern struct st_mysql_plugin i_s_innodb_ft_config; +extern struct st_mysql_plugin i_s_innodb_buffer_page; +extern struct st_mysql_plugin i_s_innodb_buffer_page_lru; +extern struct st_mysql_plugin i_s_innodb_buffer_stats; +extern struct st_mysql_plugin i_s_innodb_sys_tables; +extern struct st_mysql_plugin i_s_innodb_sys_tablestats; +extern struct st_mysql_plugin i_s_innodb_sys_indexes; +extern struct st_mysql_plugin i_s_innodb_sys_columns; +extern struct st_mysql_plugin i_s_innodb_sys_fields; +extern struct st_mysql_plugin i_s_innodb_sys_foreign; +extern struct st_mysql_plugin i_s_innodb_sys_foreign_cols; +extern struct st_mysql_plugin i_s_innodb_sys_tablespaces; +extern struct st_mysql_plugin i_s_innodb_sys_datafiles; #endif /* i_s_h */ diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc index cd9de39f3c6..168da732bc0 100644 --- a/storage/innobase/ibuf/ibuf0ibuf.cc +++ b/storage/innobase/ibuf/ibuf0ibuf.cc @@ -25,6 +25,10 @@ Created 7/19/1997 Heikki Tuuri #include "ibuf0ibuf.h" +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG +UNIV_INTERN my_bool srv_ibuf_disable_background_merge; +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + /** Number of bits describing a single page */ #define IBUF_BITS_PER_PAGE 4 #if IBUF_BITS_PER_PAGE % 2 @@ -56,6 +60,7 @@ Created 7/19/1997 Heikki Tuuri #include "log0recv.h" #include "que0que.h" #include "srv0start.h" /* srv_shutdown_state */ +#include "ha_prototypes.h" /* STRUCTURE OF AN INSERT BUFFER RECORD @@ 
-284,16 +289,16 @@ type, counter, and some flags. */ /** The mutex used to block pessimistic inserts to ibuf trees */ -static mutex_t ibuf_pessimistic_insert_mutex; +static ib_mutex_t ibuf_pessimistic_insert_mutex; /** The mutex protecting the insert buffer structs */ -static mutex_t ibuf_mutex; +static ib_mutex_t ibuf_mutex; /** The mutex protecting the insert buffer bitmaps */ -static mutex_t ibuf_bitmap_mutex; +static ib_mutex_t ibuf_bitmap_mutex; /** The area in pages from which contract looks for page numbers for merge */ -#define IBUF_MERGE_AREA 8 +#define IBUF_MERGE_AREA 8UL /** Inside the merge area, pages which have at most 1 per this number less buffered entries compared to maximum volume that can buffered for a single @@ -507,7 +512,7 @@ ibuf_init_at_db_start(void) dict_index_t* index; ulint n_used; page_t* header_page; - ulint error; + dberr_t error; ibuf = static_cast<ibuf_t*>(mem_zalloc(sizeof(ibuf_t))); @@ -2485,6 +2490,73 @@ ibuf_get_merge_page_nos_func( return(sum_volumes); } +/*******************************************************************//** +Get the matching records for space id. +@return current rec or NULL */ +static __attribute__((nonnull, warn_unused_result)) +const rec_t* +ibuf_get_user_rec( +/*===============*/ + btr_pcur_t* pcur, /*!< in: the current cursor */ + mtr_t* mtr) /*!< in: mini transaction */ +{ + do { + const rec_t* rec = btr_pcur_get_rec(pcur); + + if (page_rec_is_user_rec(rec)) { + return(rec); + } + } while (btr_pcur_move_to_next(pcur, mtr)); + + return(NULL); +} + +/*********************************************************************//** +Reads page numbers for a space id from an ibuf tree. 
+@return a lower limit for the combined volume of records which will be +merged */ +static __attribute__((nonnull, warn_unused_result)) +ulint +ibuf_get_merge_pages( +/*=================*/ + btr_pcur_t* pcur, /*!< in/out: cursor */ + ulint space, /*!< in: space for which to merge */ + ulint limit, /*!< in: max page numbers to read */ + ulint* pages, /*!< out: pages read */ + ulint* spaces, /*!< out: spaces read */ + ib_int64_t* versions,/*!< out: space versions read */ + ulint* n_pages,/*!< out: number of pages read */ + mtr_t* mtr) /*!< in: mini transaction */ +{ + const rec_t* rec; + ulint volume = 0; + ib_int64_t version = fil_space_get_version(space); + + ut_a(space != ULINT_UNDEFINED); + + *n_pages = 0; + + while ((rec = ibuf_get_user_rec(pcur, mtr)) != 0 + && ibuf_rec_get_space(mtr, rec) == space + && *n_pages < limit) { + + ulint page_no = ibuf_rec_get_page_no(mtr, rec); + + if (*n_pages == 0 || pages[*n_pages - 1] != page_no) { + spaces[*n_pages] = space; + pages[*n_pages] = page_no; + versions[*n_pages] = version; + ++*n_pages; + } + + volume += ibuf_rec_get_volume(mtr, rec); + + btr_pcur_move_to_next(pcur, mtr); + } + + return(volume); +} + /*********************************************************************//** Contracts insert buffer trees by reading pages to the buffer pool. 
@return a lower limit for the combined size in bytes of entries which @@ -2492,32 +2564,22 @@ will be merged from ibuf trees to the pages read, 0 if ibuf is empty */ static ulint -ibuf_contract_ext( -/*==============*/ - ulint* n_pages,/*!< out: number of pages to which merged */ - ibool sync) /*!< in: TRUE if the caller wants to wait for the - issued read with the highest tablespace address - to complete */ +ibuf_merge_pages( +/*=============*/ + ulint* n_pages, /*!< out: number of pages to which merged */ + bool sync) /*!< in: TRUE if the caller wants to wait for + the issued read with the highest tablespace + address to complete */ { + mtr_t mtr; btr_pcur_t pcur; + ulint sum_sizes; ulint page_nos[IBUF_MAX_N_PAGES_MERGED]; ulint space_ids[IBUF_MAX_N_PAGES_MERGED]; ib_int64_t space_versions[IBUF_MAX_N_PAGES_MERGED]; - ulint sum_sizes; - mtr_t mtr; *n_pages = 0; - /* We perform a dirty read of ibuf->empty, without latching - the insert buffer root page. We trust this dirty read except - when a slow shutdown is being executed. During a slow - shutdown, the insert buffer merge must be completed. */ - - if (UNIV_UNLIKELY(ibuf->empty) - && UNIV_LIKELY(!srv_shutdown_state)) { - return(0); - } - ibuf_mtr_start(&mtr); /* Open a cursor to a randomly chosen leaf of the tree, at a random @@ -2554,18 +2616,159 @@ ibuf_contract_ext( ibuf_mtr_commit(&mtr); btr_pcur_close(&pcur); - buf_read_ibuf_merge_pages(sync, space_ids, space_versions, page_nos, - *n_pages); + buf_read_ibuf_merge_pages( + sync, space_ids, space_versions, page_nos, *n_pages); return(sum_sizes + 1); } /*********************************************************************//** +Get the table instance from the table id. 
+@return table instance */ +static __attribute__((warn_unused_result)) +dict_table_t* +ibuf_get_table( +/*===========*/ + table_id_t table_id) /*!< in: valid table id */ +{ + rw_lock_s_lock_func(&dict_operation_lock, 0, __FILE__, __LINE__); + + dict_table_t* table = dict_table_open_on_id(table_id, FALSE, FALSE); + + rw_lock_s_unlock_gen(&dict_operation_lock, 0); + + return(table); +} + +/*********************************************************************//** Contracts insert buffer trees by reading pages to the buffer pool. @return a lower limit for the combined size in bytes of entries which will be merged from ibuf trees to the pages read, 0 if ibuf is empty */ -UNIV_INTERN +static +ulint +ibuf_merge_space( +/*=============*/ + ulint space, /*!< in: tablespace id to merge */ + ulint* n_pages)/*!< out: number of pages to which merged */ +{ + mtr_t mtr; + btr_pcur_t pcur; + mem_heap_t* heap = mem_heap_create(512); + dtuple_t* tuple = ibuf_search_tuple_build(space, 0, heap); + + ibuf_mtr_start(&mtr); + + /* Position the cursor on the first matching record. */ + + btr_pcur_open( + ibuf->index, tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur, + &mtr); + + mem_heap_free(heap); + + ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index)); + + ulint sum_sizes = 0; + ulint pages[IBUF_MAX_N_PAGES_MERGED]; + ulint spaces[IBUF_MAX_N_PAGES_MERGED]; + ib_int64_t versions[IBUF_MAX_N_PAGES_MERGED]; + + if (page_get_n_recs(btr_pcur_get_page(&pcur)) == 0) { + /* If a B-tree page is empty, it must be the root page + and the whole B-tree must be empty. InnoDB does not + allow empty B-tree pages other than the root. 
*/ + ut_ad(ibuf->empty); + ut_ad(page_get_space_id(btr_pcur_get_page(&pcur)) + == IBUF_SPACE_ID); + ut_ad(page_get_page_no(btr_pcur_get_page(&pcur)) + == FSP_IBUF_TREE_ROOT_PAGE_NO); + + } else { + + sum_sizes = ibuf_get_merge_pages( + &pcur, space, IBUF_MAX_N_PAGES_MERGED, + &pages[0], &spaces[0], &versions[0], n_pages, + &mtr); + + ++sum_sizes; + } + + ibuf_mtr_commit(&mtr); + + btr_pcur_close(&pcur); + + if (sum_sizes > 0) { + + ut_a(*n_pages > 0 || sum_sizes == 1); + +#ifdef UNIV_DEBUG + ut_ad(*n_pages <= UT_ARR_SIZE(pages)); + + for (ulint i = 0; i < *n_pages; ++i) { + ut_ad(spaces[i] == space); + ut_ad(i == 0 || versions[i] == versions[i - 1]); + } +#endif /* UNIV_DEBUG */ + + buf_read_ibuf_merge_pages( + TRUE, spaces, versions, pages, *n_pages); + } + + return(sum_sizes); +} + +/*********************************************************************//** +Contracts insert buffer trees by reading pages to the buffer pool. +@return a lower limit for the combined size in bytes of entries which +will be merged from ibuf trees to the pages read, 0 if ibuf is +empty */ +static __attribute__((nonnull, warn_unused_result)) +ulint +ibuf_merge( +/*=======*/ + table_id_t table_id, /*!< in: if merge should be + done only for a specific + table, for all tables this + should be 0 */ + ulint* n_pages, /*!< out: number of pages to + which merged */ + bool sync) /*!< in: TRUE if the caller + wants to wait for the issued + read with the highest + tablespace address to complete */ +{ + dict_table_t* table; + + *n_pages = 0; + + /* We perform a dirty read of ibuf->empty, without latching + the insert buffer root page. We trust this dirty read except + when a slow shutdown is being executed. During a slow + shutdown, the insert buffer merge must be completed. */ + + if (ibuf->empty && !srv_shutdown_state) { + return(0); + } else if (table_id == 0) { + return(ibuf_merge_pages(n_pages, sync)); + } else if ((table = ibuf_get_table(table_id)) == 0) { + /* Table has been dropped. 
*/ + return(0); + } + + ulint volume = ibuf_merge_space(table->space, n_pages); + + dict_table_close(table, FALSE, FALSE); + + return(volume); +} + +/*********************************************************************//** +Contracts insert buffer trees by reading pages to the buffer pool. +@return a lower limit for the combined size in bytes of entries which +will be merged from ibuf trees to the pages read, 0 if ibuf is +empty */ +static ulint ibuf_contract( /*==========*/ @@ -2575,7 +2778,7 @@ ibuf_contract( { ulint n_pages; - return(ibuf_contract_ext(&n_pages, sync)); + return(ibuf_merge(0, &n_pages, sync)); } /*********************************************************************//** @@ -2587,17 +2790,26 @@ UNIV_INTERN ulint ibuf_contract_in_background( /*========================*/ - ibool full) /*!< in: TRUE if the caller wants to do a full - contract based on PCT_IO(100). If FALSE then - the size of contract batch is determined based - on the current size of the ibuf tree. */ + table_id_t table_id, /*!< in: if merge should be done only + for a specific table, for all tables + this should be 0 */ + ibool full) /*!< in: TRUE if the caller wants to + do a full contract based on PCT_IO(100). + If FALSE then the size of contract + batch is determined based on the + current size of the ibuf tree. 
*/ { ulint sum_bytes = 0; ulint sum_pages = 0; - ulint n_bytes; ulint n_pag2; ulint n_pages; +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG + if (srv_ibuf_disable_background_merge && table_id == 0) { + return(0); + } +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + if (full) { /* Caller has requested a full batch */ n_pages = PCT_IO(100); @@ -2620,7 +2832,9 @@ ibuf_contract_in_background( } while (sum_pages < n_pages) { - n_bytes = ibuf_contract_ext(&n_pag2, FALSE); + ulint n_bytes; + + n_bytes = ibuf_merge(table_id, &n_pag2, FALSE); if (n_bytes == 0) { return(sum_bytes); @@ -3061,7 +3275,7 @@ ibuf_update_max_tablespace_id(void) ibuf_mtr_start(&mtr); btr_pcur_open_at_index_side( - FALSE, ibuf->index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr); + false, ibuf->index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr); ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index)); @@ -3223,8 +3437,8 @@ ibuf_get_entry_counter_func( Buffer an operation in the insert/delete buffer, instead of doing it directly to the disk page, if this is possible. 
@return DB_SUCCESS, DB_STRONG_FAIL or other error */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t ibuf_insert_low( /*============*/ ulint mode, /*!< in: BTR_MODIFY_PREV or BTR_MODIFY_TREE */ @@ -3246,7 +3460,9 @@ ibuf_insert_low( btr_pcur_t pcur; btr_cur_t* cursor; dtuple_t* ibuf_entry; + mem_heap_t* offsets_heap = NULL; mem_heap_t* heap; + ulint* offsets = NULL; ulint buffered; lint min_n_recs; rec_t* ins_rec; @@ -3254,7 +3470,7 @@ ibuf_insert_low( page_t* bitmap_page; buf_block_t* block; page_t* root; - ulint err; + dberr_t err; ibool do_merge; ulint space_ids[IBUF_MAX_N_PAGES_MERGED]; ib_int64_t space_versions[IBUF_MAX_N_PAGES_MERGED]; @@ -3294,7 +3510,7 @@ ibuf_insert_low( return(DB_STRONG_FAIL); } - heap = mem_heap_create(512); + heap = mem_heap_create(1024); /* Build the entry which contains the space id and the page number as the first fields and the type information for other fields, and @@ -3464,9 +3680,11 @@ fail_exit: cursor = btr_pcur_get_btr_cur(&pcur); if (mode == BTR_MODIFY_PREV) { - err = btr_cur_optimistic_insert(BTR_NO_LOCKING_FLAG, cursor, - ibuf_entry, &ins_rec, - &dummy_big_rec, 0, thr, &mtr); + err = btr_cur_optimistic_insert( + BTR_NO_LOCKING_FLAG, + cursor, &offsets, &offsets_heap, + ibuf_entry, &ins_rec, + &dummy_big_rec, 0, thr, &mtr); block = btr_cur_get_block(cursor); ut_ad(buf_block_get_space(block) == IBUF_SPACE_ID); @@ -3493,13 +3711,15 @@ fail_exit: err = btr_cur_optimistic_insert( BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG, - cursor, ibuf_entry, &ins_rec, + cursor, &offsets, &offsets_heap, + ibuf_entry, &ins_rec, &dummy_big_rec, 0, thr, &mtr); if (err == DB_FAIL) { err = btr_cur_pessimistic_insert( BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG, - cursor, ibuf_entry, &ins_rec, + cursor, &offsets, &offsets_heap, + ibuf_entry, &ins_rec, &dummy_big_rec, 0, thr, &mtr); } @@ -3512,6 +3732,10 @@ fail_exit: ut_ad(buf_block_get_space(block) == IBUF_SPACE_ID); } + if (offsets_heap) { + mem_heap_free(offsets_heap); 
+ } + if (err == DB_SUCCESS && op != IBUF_OP_DELETE) { /* Update the page max trx id field */ page_update_max_trx_id(block, NULL, @@ -3568,7 +3792,7 @@ ibuf_insert( ulint page_no,/*!< in: page number where to insert */ que_thr_t* thr) /*!< in: query thread */ { - ulint err; + dberr_t err; ulint entry_size; ibool no_counter; /* Read the settable global variable ibuf_use only once in @@ -3699,7 +3923,7 @@ skip_watch: /********************************************************************//** During merge, inserts to an index page a secondary index entry extracted from the insert buffer. */ -static +static __attribute__((nonnull)) void ibuf_insert_to_index_page_low( /*==========================*/ @@ -3707,6 +3931,8 @@ ibuf_insert_to_index_page_low( buf_block_t* block, /*!< in/out: index page where the buffered entry should be placed */ dict_index_t* index, /*!< in: record descriptor */ + ulint** offsets,/*!< out: offsets on *rec */ + mem_heap_t* heap, /*!< in/out: memory heap */ mtr_t* mtr, /*!< in/out: mtr */ page_cur_t* page_cur)/*!< in/out: cursor positioned on the record after which to insert the buffered entry */ @@ -3718,8 +3944,8 @@ ibuf_insert_to_index_page_low( const page_t* bitmap_page; ulint old_bits; - if (UNIV_LIKELY - (page_cur_tuple_insert(page_cur, entry, index, 0, mtr) != NULL)) { + if (page_cur_tuple_insert( + page_cur, entry, index, offsets, &heap, 0, mtr) != NULL) { return; } @@ -3730,8 +3956,8 @@ ibuf_insert_to_index_page_low( /* This time the record must fit */ - if (UNIV_LIKELY - (page_cur_tuple_insert(page_cur, entry, index, 0, mtr) != NULL)) { + if (page_cur_tuple_insert(page_cur, entry, index, + offsets, &heap, 0, mtr) != NULL) { return; } @@ -3785,6 +4011,8 @@ ibuf_insert_to_index_page( ulint low_match; page_t* page = buf_block_get_frame(block); rec_t* rec; + ulint* offsets; + mem_heap_t* heap; ut_ad(ibuf_inside(mtr)); ut_ad(dtuple_check_typed(entry)); @@ -3835,10 +4063,14 @@ dump: low_match = page_cur_search(block, index, entry, PAGE_CUR_LE, 
&page_cur); + heap = mem_heap_create( + sizeof(upd_t) + + REC_OFFS_HEADER_SIZE * sizeof(*offsets) + + dtuple_get_n_fields(entry) + * (sizeof(upd_field_t) + sizeof *offsets)); + if (UNIV_UNLIKELY(low_match == dtuple_get_n_fields(entry))) { - mem_heap_t* heap; upd_t* update; - ulint* offsets; page_zip_des_t* page_zip; rec = page_cur_get_rec(&page_cur); @@ -3847,12 +4079,10 @@ dump: row_ins_sec_index_entry_by_modify(BTR_MODIFY_LEAF). */ ut_ad(rec_get_deleted_flag(rec, page_is_comp(page))); - heap = mem_heap_create(1024); - offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); update = row_upd_build_sec_rec_difference_binary( - index, entry, rec, NULL, heap); + rec, index, offsets, entry, heap); page_zip = buf_block_get_page_zip(block); @@ -3862,9 +4092,7 @@ dump: Bug #56680 was fixed. */ btr_cur_set_deleted_flag_for_ibuf( rec, page_zip, FALSE, mtr); -updated_in_place: - mem_heap_free(heap); - return; + goto updated_in_place; } /* Copy the info bits. Clear the delete-mark. */ @@ -3908,15 +4136,20 @@ updated_in_place: lock_rec_store_on_page_infimum(block, rec); page_cur_delete_rec(&page_cur, index, offsets, mtr); page_cur_move_to_prev(&page_cur); - mem_heap_free(heap); - ibuf_insert_to_index_page_low(entry, block, index, mtr, + ibuf_insert_to_index_page_low(entry, block, index, + &offsets, heap, mtr, &page_cur); lock_rec_restore_from_page_infimum(block, rec, block); } else { - ibuf_insert_to_index_page_low(entry, block, index, mtr, + offsets = NULL; + ibuf_insert_to_index_page_low(entry, block, index, + &offsets, heap, mtr, &page_cur); } + +updated_in_place: + mem_heap_free(heap); } /****************************************************************//** @@ -3950,7 +4183,7 @@ ibuf_set_del_mark( /* Delete mark the old index record. 
According to a comment in row_upd_sec_index_entry(), it can already have been delete marked if a lock wait occurred in - row_ins_index_entry() in a previous invocation of + row_ins_sec_index_entry() in a previous invocation of row_upd_sec_index_entry(). */ if (UNIV_LIKELY @@ -4128,7 +4361,7 @@ ibuf_restore_pos( ibuf_btr_pcur_commit_specify_mtr(pcur, mtr); fputs("InnoDB: Validating insert buffer tree:\n", stderr); - if (!btr_validate_index(ibuf->index, NULL)) { + if (!btr_validate_index(ibuf->index, 0)) { ut_error; } @@ -4160,7 +4393,7 @@ ibuf_delete_rec( { ibool success; page_t* root; - ulint err; + dberr_t err; ut_ad(ibuf_inside(mtr)); ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur))); @@ -4183,7 +4416,8 @@ ibuf_delete_rec( } #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ - success = btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur), mtr); + success = btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur), + 0, mtr); if (success) { if (UNIV_UNLIKELY(!page_get_n_recs(btr_pcur_get_page(pcur)))) { @@ -4241,7 +4475,7 @@ ibuf_delete_rec( root = ibuf_tree_root_get(mtr); - btr_cur_pessimistic_delete(&err, TRUE, btr_pcur_get_btr_cur(pcur), + btr_cur_pessimistic_delete(&err, TRUE, btr_pcur_get_btr_cur(pcur), 0, RB_NONE, mtr); ut_a(err == DB_SUCCESS); @@ -4829,4 +5063,109 @@ ibuf_print( mutex_exit(&ibuf_mutex); } + +/******************************************************************//** +Checks the insert buffer bitmaps on IMPORT TABLESPACE. 
+@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +ibuf_check_bitmap_on_import( +/*========================*/ + const trx_t* trx, /*!< in: transaction */ + ulint space_id) /*!< in: tablespace identifier */ +{ + ulint zip_size; + ulint page_size; + ulint size; + ulint page_no; + + ut_ad(space_id); + ut_ad(trx->mysql_thd); + + zip_size = fil_space_get_zip_size(space_id); + + if (zip_size == ULINT_UNDEFINED) { + return(DB_TABLE_NOT_FOUND); + } + + size = fil_space_get_size(space_id); + + if (size == 0) { + return(DB_TABLE_NOT_FOUND); + } + + mutex_enter(&ibuf_mutex); + + page_size = zip_size ? zip_size : UNIV_PAGE_SIZE; + + for (page_no = 0; page_no < size; page_no += page_size) { + mtr_t mtr; + page_t* bitmap_page; + ulint i; + + if (trx_is_interrupted(trx)) { + mutex_exit(&ibuf_mutex); + return(DB_INTERRUPTED); + } + + mtr_start(&mtr); + + mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); + + ibuf_enter(&mtr); + + bitmap_page = ibuf_bitmap_get_map_page( + space_id, page_no, zip_size, &mtr); + + for (i = FSP_IBUF_BITMAP_OFFSET + 1; i < page_size; i++) { + const ulint offset = page_no + i; + + if (ibuf_bitmap_page_get_bits( + bitmap_page, offset, zip_size, + IBUF_BITMAP_IBUF, &mtr)) { + + mutex_exit(&ibuf_mutex); + ibuf_exit(&mtr); + mtr_commit(&mtr); + + ib_errf(trx->mysql_thd, + IB_LOG_LEVEL_ERROR, + ER_INNODB_INDEX_CORRUPT, + "Space %u page %u" + " is wrongly flagged to belong to the" + " insert buffer", + (unsigned) space_id, + (unsigned) offset); + + return(DB_CORRUPTION); + } + + if (ibuf_bitmap_page_get_bits( + bitmap_page, offset, zip_size, + IBUF_BITMAP_BUFFERED, &mtr)) { + + ib_errf(trx->mysql_thd, + IB_LOG_LEVEL_WARN, + ER_INNODB_INDEX_CORRUPT, + "Buffered changes" + " for space %u page %u are lost", + (unsigned) space_id, + (unsigned) offset); + + /* Tolerate this error, so that + slightly corrupted tables can be + imported and dumped. Clear the bit. 
*/ + ibuf_bitmap_page_set_bits( + bitmap_page, offset, zip_size, + IBUF_BITMAP_BUFFERED, FALSE, &mtr); + } + } + + ibuf_exit(&mtr); + mtr_commit(&mtr); + } + + mutex_exit(&ibuf_mutex); + return(DB_SUCCESS); +} #endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/api0api.h b/storage/innobase/include/api0api.h new file mode 100644 index 00000000000..5b7bfdbdde5 --- /dev/null +++ b/storage/innobase/include/api0api.h @@ -0,0 +1,1282 @@ +/***************************************************************************** + +Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/api0api.h +InnoDB Native API + +2008-08-01 Created by Sunny Bains. +3/20/2011 Jimmy Yang extracted from Embedded InnoDB +*******************************************************/ + +#ifndef api0api_h +#define api0api_h + +#include "db0err.h" +#include <stdio.h> + +#ifdef _MSC_VER +#define strncasecmp _strnicmp +#define strcasecmp _stricmp +#endif + +#if defined(__GNUC__) && (__GNUC__ > 2) && ! 
defined(__INTEL_COMPILER) +#define UNIV_NO_IGNORE __attribute__ ((warn_unused_result)) +#else +#define UNIV_NO_IGNORE +#endif /* __GNUC__ && __GNUC__ > 2 && !__INTEL_COMPILER */ + +/* See comment about ib_bool_t as to why the two macros are unsigned long. */ +/** The boolean value of "true" used internally within InnoDB */ +#define IB_TRUE 0x1UL +/** The boolean value of "false" used internally within InnoDB */ +#define IB_FALSE 0x0UL + +/* Basic types used by the InnoDB API. */ +/** All InnoDB error codes are represented by ib_err_t */ +typedef enum dberr_t ib_err_t; +/** Representation of a byte within InnoDB */ +typedef unsigned char ib_byte_t; +/** Representation of an unsigned long int within InnoDB */ +typedef unsigned long int ib_ulint_t; + +/* We assume C99 support except when using VisualStudio. */ +#if !defined(_MSC_VER) +#include <stdint.h> +#endif /* _MSC_VER */ + +/* Integer types used by the API. Microsft VS defines its own types +and we use the Microsoft types when building with Visual Studio. */ +#if defined(_MSC_VER) +/** A signed 8 bit integral type. */ +typedef __int8 ib_i8_t; +#else +/** A signed 8 bit integral type. */ +typedef int8_t ib_i8_t; +#endif + +#if defined(_MSC_VER) +/** An unsigned 8 bit integral type. */ +typedef unsigned __int8 ib_u8_t; +#else +/** An unsigned 8 bit integral type. */ +typedef uint8_t ib_u8_t; +#endif + +#if defined(_MSC_VER) +/** A signed 16 bit integral type. */ +typedef __int16 ib_i16_t; +#else +/** A signed 16 bit integral type. */ +typedef int16_t ib_i16_t; +#endif + +#if defined(_MSC_VER) +/** An unsigned 16 bit integral type. */ +typedef unsigned __int16 ib_u16_t; +#else +/** An unsigned 16 bit integral type. */ +typedef uint16_t ib_u16_t; +#endif + +#if defined(_MSC_VER) +/** A signed 32 bit integral type. */ +typedef __int32 ib_i32_t; +#else +/** A signed 32 bit integral type. */ +typedef int32_t ib_i32_t; +#endif + +#if defined(_MSC_VER) +/** An unsigned 32 bit integral type. 
*/ +typedef unsigned __int32 ib_u32_t; +#else +/** An unsigned 32 bit integral type. */ +typedef uint32_t ib_u32_t; +#endif + +#if defined(_MSC_VER) +/** A signed 64 bit integral type. */ +typedef __int64 ib_i64_t; +#else +/** A signed 64 bit integral type. */ +typedef int64_t ib_i64_t; +#endif + +#if defined(_MSC_VER) +/** An unsigned 64 bit integral type. */ +typedef unsigned __int64 ib_u64_t; +#else +/** An unsigned 64 bit integral type. */ +typedef uint64_t ib_u64_t; +#endif + +typedef void* ib_opaque_t; +typedef ib_opaque_t ib_charset_t; +typedef ib_ulint_t ib_bool_t; +typedef ib_u64_t ib_id_u64_t; + +/** @enum ib_cfg_type_t Possible types for a configuration variable. */ +typedef enum { + IB_CFG_IBOOL, /*!< The configuration parameter is + of type ibool */ + + /* XXX Can we avoid having different types for ulint and ulong? + - On Win64 "unsigned long" is 32 bits + - ulong is always defined as "unsigned long" + - On Win64 ulint is defined as 64 bit integer + => On Win64 ulint != ulong. + If we typecast all ulong and ulint variables to the smaller type + ulong, then we will cut the range of the ulint variables. + This is not a problem for most ulint variables because their max + allowed values do not exceed 2^32-1 (e.g. log_groups is ulint + but its max allowed value is 10). BUT buffer_pool_size and + log_file_size allow up to 2^64-1. */ + + IB_CFG_ULINT, /*!< The configuration parameter is + of type ulint */ + + IB_CFG_ULONG, /*!< The configuration parameter is + of type ulong */ + + IB_CFG_TEXT, /*!< The configuration parameter is + of type char* */ + + IB_CFG_CB /*!< The configuration parameter is + a callback parameter */ +} ib_cfg_type_t; + +/** @enum ib_col_type_t column types that are supported. */ +typedef enum { + IB_VARCHAR = 1, /*!< Character varying length. The + column is not padded. */ + + IB_CHAR = 2, /*!< Fixed length character string. The + column is padded to the right. 
*/ + + IB_BINARY = 3, /*!< Fixed length binary, similar to + IB_CHAR but the column is not padded + to the right. */ + + IB_VARBINARY = 4, /*!< Variable length binary */ + + IB_BLOB = 5, /*!< Binary large object, or + a TEXT type */ + + IB_INT = 6, /*!< Integer: can be any size + from 1 - 8 bytes. If the size is + 1, 2, 4 and 8 bytes then you can use + the typed read and write functions. For + other sizes you will need to use the + ib_col_get_value() function and do the + conversion yourself. */ + + IB_SYS = 8, /*!< System column, this column can + be one of DATA_TRX_ID, DATA_ROLL_PTR + or DATA_ROW_ID. */ + + IB_FLOAT = 9, /*!< C (float) floating point value. */ + + IB_DOUBLE = 10, /*!> C (double) floating point value. */ + + IB_DECIMAL = 11, /*!< Decimal stored as an ASCII + string */ + + IB_VARCHAR_ANYCHARSET = 12, /*!< Any charset, varying length */ + + IB_CHAR_ANYCHARSET = 13 /*!< Any charset, fixed length */ + +} ib_col_type_t; + +/** @enum ib_tbl_fmt_t InnoDB table format types */ +typedef enum { + IB_TBL_REDUNDANT, /*!< Redundant row format, the column + type and length is stored in the row.*/ + + IB_TBL_COMPACT, /*!< Compact row format, the column + type is not stored in the row. The + length is stored in the row but the + storage format uses a compact format + to store the length of the column data + and record data storage format also + uses less storage. */ + + IB_TBL_DYNAMIC, /*!< Compact row format. BLOB prefixes + are not stored in the clustered index */ + + IB_TBL_COMPRESSED /*!< Similar to dynamic format but + with pages compressed */ +} ib_tbl_fmt_t; + +/** @enum ib_col_attr_t InnoDB column attributes */ +typedef enum { + IB_COL_NONE = 0, /*!< No special attributes. */ + + IB_COL_NOT_NULL = 1, /*!< Column data can't be NULL. */ + + IB_COL_UNSIGNED = 2, /*!< Column is IB_INT and unsigned. */ + + IB_COL_NOT_USED = 4, /*!< Future use, reserved. 
*/ + + IB_COL_CUSTOM1 = 8, /*!< Custom precision type, this is + a bit that is ignored by InnoDB and so + can be set and queried by users. */ + + IB_COL_CUSTOM2 = 16, /*!< Custom precision type, this is + a bit that is ignored by InnoDB and so + can be set and queried by users. */ + + IB_COL_CUSTOM3 = 32 /*!< Custom precision type, this is + a bit that is ignored by InnoDB and so + can be set and queried by users. */ +} ib_col_attr_t; + +/* Note: must match lock0types.h */ +/** @enum ib_lck_mode_t InnoDB lock modes. */ +typedef enum { + IB_LOCK_IS = 0, /*!< Intention shared, an intention + lock should be used to lock tables */ + + IB_LOCK_IX, /*!< Intention exclusive, an intention + lock should be used to lock tables */ + + IB_LOCK_S, /*!< Shared locks should be used to + lock rows */ + + IB_LOCK_X, /*!< Exclusive locks should be used to + lock rows*/ + + IB_LOCK_TABLE_X, /*!< exclusive table lock */ + + IB_LOCK_NONE, /*!< This is used internally to note + consistent read */ + + IB_LOCK_NUM = IB_LOCK_NONE /*!< number of lock modes */ +} ib_lck_mode_t; + +typedef enum { + IB_CLUSTERED = 1, /*!< clustered index */ + IB_UNIQUE = 2 /*!< unique index */ +} ib_index_type_t; + +/** @enum ib_srch_mode_t InnoDB cursor search modes for ib_cursor_moveto(). 
+Note: Values must match those found in page0cur.h */ +typedef enum { + IB_CUR_G = 1, /*!< If search key is not found then + position the cursor on the row that + is greater than the search key */ + + IB_CUR_GE = 2, /*!< If the search key not found then + position the cursor on the row that + is greater than or equal to the search + key */ + + IB_CUR_L = 3, /*!< If search key is not found then + position the cursor on the row that + is less than the search key */ + + IB_CUR_LE = 4 /*!< If search key is not found then + position the cursor on the row that + is less than or equal to the search + key */ +} ib_srch_mode_t; + +/** @enum ib_match_mode_t Various match modes used by ib_cursor_moveto() */ +typedef enum { + IB_CLOSEST_MATCH, /*!< Closest match possible */ + + IB_EXACT_MATCH, /*!< Search using a complete key + value */ + + IB_EXACT_PREFIX /*!< Search using a key prefix which + must match to rows: the prefix may + contain an incomplete field (the + last field in prefix may be just + a prefix of a fixed length column) */ +} ib_match_mode_t; + +/** @struct ib_col_meta_t InnoDB column meta data. */ +typedef struct { + ib_col_type_t type; /*!< Type of the column */ + + ib_col_attr_t attr; /*!< Column attributes */ + + ib_u32_t type_len; /*!< Length of type */ + + ib_u16_t client_type; /*!< 16 bits of data relevant only to + the client. InnoDB doesn't care */ + + ib_charset_t* charset; /*!< Column charset */ +} ib_col_meta_t; + +/* Note: Must be in sync with trx0trx.h */ +/** @enum ib_trx_state_t The transaction state can be queried using the +ib_trx_state() function. The InnoDB deadlock monitor can roll back a +transaction and users should be prepared for this, especially where there +is high contention. The way to determine the state of the transaction is to +query it's state and check. 
*/ +typedef enum { + IB_TRX_NOT_STARTED, /*!< Has not started yet, the + transaction has not ben started yet.*/ + + IB_TRX_ACTIVE, /*!< The transaction is currently + active and needs to be either + committed or rolled back. */ + + IB_TRX_COMMITTED_IN_MEMORY, /*!< Not committed to disk yet */ + + IB_TRX_PREPARED /*!< Support for 2PC/XA */ +} ib_trx_state_t; + +/* Note: Must be in sync with trx0trx.h */ +/** @enum ib_trx_level_t Transaction isolation levels */ +typedef enum { + IB_TRX_READ_UNCOMMITTED = 0, /*!< Dirty read: non-locking SELECTs are + performed so that we do not look at a + possible earlier version of a record; + thus they are not 'consistent' reads + under this isolation level; otherwise + like level 2 */ + + IB_TRX_READ_COMMITTED = 1, /*!< Somewhat Oracle-like isolation, + except that in range UPDATE and DELETE + we must block phantom rows with + next-key locks; SELECT ... FOR UPDATE + and ... LOCK IN SHARE MODE only lock + the index records, NOT the gaps before + them, and thus allow free inserting; + each consistent read reads its own + snapshot */ + + IB_TRX_REPEATABLE_READ = 2, /*!< All consistent reads in the same + trx read the same snapshot; full + next-key locking used in locking reads + to block insertions into gaps */ + + IB_TRX_SERIALIZABLE = 3 /*!< All plain SELECTs are converted to + LOCK IN SHARE MODE reads */ +} ib_trx_level_t; + +/** Generical InnoDB callback prototype. */ +typedef void (*ib_cb_t)(void); + +#define IB_CFG_BINLOG_ENABLED 0x1 +#define IB_CFG_MDL_ENABLED 0x2 +#define IB_CFG_DISABLE_ROWLOCK 0x4 + +/** The first argument to the InnoDB message logging function. By default +it's set to stderr. You should treat ib_msg_stream_t as a void*, since +it will probably change in the future. */ +typedef FILE* ib_msg_stream_t; + +/** All log messages are written to this function.It should have the same +behavior as fprintf(3). 
*/ +typedef int (*ib_msg_log_t)(ib_msg_stream_t, const char*, ...); + +/* Note: This is to make it easy for API users to have type +checking for arguments to our functions. Making it ib_opaque_t +by itself will result in pointer decay resulting in subverting +of the compiler's type checking. */ + +/** InnoDB tuple handle. This handle can refer to either a cluster index +tuple or a secondary index tuple. There are two types of tuples for each +type of index, making a total of four types of tuple handles. There +is a tuple for reading the entire row contents and another for searching +on the index key. */ +typedef struct ib_tuple_t* ib_tpl_t; + +/** InnoDB transaction handle, all database operations need to be covered +by transactions. This handle represents a transaction. The handle can be +created with ib_trx_begin(), you commit your changes with ib_trx_commit() +and undo your changes using ib_trx_rollback(). If the InnoDB deadlock +monitor rolls back the transaction then you need to free the transaction +using the function ib_trx_release(). You can query the state of an InnoDB +transaction by calling ib_trx_state(). */ +typedef struct trx_t* ib_trx_t; + +/** InnoDB cursor handle */ +typedef struct ib_cursor_t* ib_crsr_t; + +/*************************************************************//** +This function is used to compare two data fields for which the data type +is such that we must use the client code to compare them. + +@param col_meta column meta data +@param p1 key +@oaram p1_len key length +@param p2 second key +@param p2_len second key length +@return 1, 0, -1, if a is greater, equal, less than b, respectively */ + +typedef int (*ib_client_cmp_t)( + const ib_col_meta_t* col_meta, + const ib_byte_t* p1, + ib_ulint_t p1_len, + const ib_byte_t* p2, + ib_ulint_t p2_len); + +/* This should be the same as univ.i */ +/** Represents SQL_NULL length */ +#define IB_SQL_NULL 0xFFFFFFFF +/** The number of system columns in a row. 
*/ +#define IB_N_SYS_COLS 3 + +/** The maximum length of a text column. */ +#define MAX_TEXT_LEN 4096 + +/* MySQL uses 3 byte UTF-8 encoding. */ +/** The maximum length of a column name in a table schema. */ +#define IB_MAX_COL_NAME_LEN (64 * 3) + +/** The maximum length of a table name (plus database name). */ +#define IB_MAX_TABLE_NAME_LEN (64 * 3) * 2 + +/*****************************************************************//** +Start a transaction that's been rolled back. This special function +exists for the case when InnoDB's deadlock detector has rolledack +a transaction. While the transaction has been rolled back the handle +is still valid and can be reused by calling this function. If you +don't want to reuse the transaction handle then you can free the handle +by calling ib_trx_release(). +@return innobase txn handle */ + +ib_err_t +ib_trx_start( +/*=========*/ + ib_trx_t ib_trx, /*!< in: transaction to restart */ + ib_trx_level_t ib_trx_level, /*!< in: trx isolation level */ + void* thd); /*!< in: THD */ + +/*****************************************************************//** +Begin a transaction. This will allocate a new transaction handle and +put the transaction in the active state. +@return innobase txn handle */ + +ib_trx_t +ib_trx_begin( +/*=========*/ + ib_trx_level_t ib_trx_level); /*!< in: trx isolation level */ + +/*****************************************************************//** +Query the transaction's state. This function can be used to check for +the state of the transaction in case it has been rolled back by the +InnoDB deadlock detector. Note that when a transaction is selected as +a victim for rollback, InnoDB will always return an appropriate error +code indicating this. 
@see DB_DEADLOCK, @see DB_LOCK_TABLE_FULL and +@see DB_LOCK_WAIT_TIMEOUT +@return transaction state */ + +ib_trx_state_t +ib_trx_state( +/*=========*/ + ib_trx_t ib_trx); /*!< in: trx handle */ + +/*****************************************************************//** +Release the resources of the transaction. If the transaction was +selected as a victim by InnoDB and rolled back then use this function +to free the transaction handle. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_trx_release( +/*===========*/ + ib_trx_t ib_trx); /*!< in: trx handle */ + +/*****************************************************************//** +Commit a transaction. This function will release the schema latches too. +It will also free the transaction handle. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_trx_commit( +/*==========*/ + ib_trx_t ib_trx); /*!< in: trx handle */ + +/*****************************************************************//** +Rollback a transaction. This function will release the schema latches too. +It will also free the transaction handle. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_trx_rollback( +/*============*/ + ib_trx_t ib_trx); /*!< in: trx handle */ + +/*****************************************************************//** +Open an InnoDB table and return a cursor handle to it. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_open_table_using_id( +/*==========================*/ + ib_id_u64_t table_id, /*!< in: table id of table to open */ + ib_trx_t ib_trx, /*!< in: Current transaction handle + can be NULL */ + ib_crsr_t* ib_crsr); /*!< out,own: InnoDB cursor */ + +/*****************************************************************//** +Open an InnoDB index and return a cursor handle to it. 
+@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_open_index_using_id( +/*==========================*/ + ib_id_u64_t index_id, /*!< in: index id of index to open */ + ib_trx_t ib_trx, /*!< in: Current transaction handle + can be NULL */ + ib_crsr_t* ib_crsr); /*!< out: InnoDB cursor */ + +/*****************************************************************//** +Open an InnoDB secondary index cursor and return a cursor handle to it. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_open_index_using_name( +/*============================*/ + ib_crsr_t ib_open_crsr, /*!< in: open/active cursor */ + const char* index_name, /*!< in: secondary index name */ + ib_crsr_t* ib_crsr, /*!< out,own: InnoDB index cursor */ + int* idx_type, /*!< out: index is cluster index */ + ib_id_u64_t* idx_id); /*!< out: index id */ + +/*****************************************************************//** +Open an InnoDB table by name and return a cursor handle to it. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_open_table( +/*=================*/ + const char* name, /*!< in: table name */ + ib_trx_t ib_trx, /*!< in: Current transaction handle + can be NULL */ + ib_crsr_t* ib_crsr); /*!< out,own: InnoDB cursor */ + +/*****************************************************************//** +Reset the cursor. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_reset( +/*============*/ + ib_crsr_t ib_crsr); /*!< in/out: InnoDB cursor */ + + +/*****************************************************************//** +set a cursor trx to NULL*/ + +void +ib_cursor_clear_trx( +/*================*/ + ib_crsr_t ib_crsr); /*!< in/out: InnoDB cursor */ + +/*****************************************************************//** +Close an InnoDB table and free the cursor. 
+@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_close( +/*============*/ + ib_crsr_t ib_crsr); /*!< in/out: InnoDB cursor */ + +/*****************************************************************//** +Close the table, decrement n_ref_count count. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_close_table( +/*==================*/ + ib_crsr_t ib_crsr); /*!< in/out: InnoDB cursor */ + +/*****************************************************************//** +update the cursor with new transactions and also reset the cursor +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_new_trx( +/*==============*/ + ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */ + ib_trx_t ib_trx); /*!< in: transaction */ + +/*****************************************************************//** +Commit the transaction in a cursor +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_commit_trx( +/*=================*/ + ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */ + ib_trx_t ib_trx); /*!< in: transaction */ + +/********************************************************************//** +Open a table using the table name, if found then increment table ref count. +@return table instance if found */ + +void* +ib_open_table_by_name( +/*==================*/ + const char* name); /*!< in: table name to lookup */ + +/*****************************************************************//** +Insert a row to a table. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_insert_row( +/*=================*/ + ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor instance */ + const ib_tpl_t ib_tpl); /*!< in: tuple to insert */ + +/*****************************************************************//** +Update a row in a table. 
+@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_update_row( +/*=================*/ + ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */ + const ib_tpl_t ib_old_tpl, /*!< in: Old tuple in table */ + const ib_tpl_t ib_new_tpl); /*!< in: New tuple to update */ + +/*****************************************************************//** +Delete a row in a table. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_delete_row( +/*=================*/ + ib_crsr_t ib_crsr); /*!< in: cursor instance */ + +/*****************************************************************//** +Read current row. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_read_row( +/*===============*/ + ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */ + ib_tpl_t ib_tpl); /*!< out: read cols into this tuple */ + +/*****************************************************************//** +Move cursor to the first record in the table. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_first( +/*============*/ + ib_crsr_t ib_crsr); /*!< in: InnoDB cursor instance */ + +/*****************************************************************//** +Move cursor to the last record in the table. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_last( +/*===========*/ + ib_crsr_t ib_crsr); /*!< in: InnoDB cursor instance */ + +/*****************************************************************//** +Move cursor to the next record in the table. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_next( +/*===========*/ + ib_crsr_t ib_crsr); /*!< in: InnoDB cursor instance */ + +/*****************************************************************//** +Search for key. 
+@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_moveto( +/*=============*/ + ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */ + ib_tpl_t ib_tpl, /*!< in: Key to search for */ + ib_srch_mode_t ib_srch_mode); /*!< in: search mode */ + +/*****************************************************************//** +Set the match mode for ib_cursor_move(). */ + +void +ib_cursor_set_match_mode( +/*=====================*/ + ib_crsr_t ib_crsr, /*!< in: Cursor instance */ + ib_match_mode_t match_mode); /*!< in: ib_cursor_moveto match mode */ + +/*****************************************************************//** +Set a column of the tuple. Make a copy using the tuple's heap. +@return DB_SUCCESS or error code */ + +ib_err_t +ib_col_set_value( +/*=============*/ + ib_tpl_t ib_tpl, /*!< in: tuple instance */ + ib_ulint_t col_no, /*!< in: column index in tuple */ + const void* src, /*!< in: data value */ + ib_ulint_t len); /*!< in: data value len */ + +/*****************************************************************//** +Get the size of the data available in the column the tuple. +@return bytes avail or IB_SQL_NULL */ + +ib_ulint_t +ib_col_get_len( +/*===========*/ + ib_tpl_t ib_tpl, /*!< in: tuple instance */ + ib_ulint_t i); /*!< in: column index in tuple */ + +/*****************************************************************//** +Copy a column value from the tuple. +@return bytes copied or IB_SQL_NULL */ + +ib_ulint_t +ib_col_copy_value( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: tuple instance */ + ib_ulint_t i, /*!< in: column index in tuple */ + void* dst, /*!< out: copied data value */ + ib_ulint_t len); /*!< in: max data value len to copy */ + +/*************************************************************//** +Read a signed int 8 bit column from an InnoDB tuple. 
+@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_i8( +/*=============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_i8_t* ival); /*!< out: integer value */ + +/*************************************************************//** +Read an unsigned int 8 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_u8( +/*=============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_u8_t* ival); /*!< out: integer value */ + +/*************************************************************//** +Read a signed int 16 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_i16( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_i16_t* ival); /*!< out: integer value */ + +/*************************************************************//** +Read an unsigned int 16 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_u16( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_u16_t* ival); /*!< out: integer value */ + +/*************************************************************//** +Read a signed int 32 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_i32( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_i32_t* ival); /*!< out: integer value */ + +/*************************************************************//** +Read an unsigned int 32 bit column from an InnoDB tuple. 
+@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_u32( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_u32_t* ival); /*!< out: integer value */ + +/*************************************************************//** +Read a signed int 64 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_i64( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_i64_t* ival); /*!< out: integer value */ + +/*************************************************************//** +Read an unsigned int 64 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_u64( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_u64_t* ival); /*!< out: integer value */ + +/*****************************************************************//** +Get a column value pointer from the tuple. +@return NULL or pointer to buffer */ + +const void* +ib_col_get_value( +/*=============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i); /*!< in: column number */ + +/*****************************************************************//** +Get a column type, length and attributes from the tuple. +@return len of column data */ + +ib_ulint_t +ib_col_get_meta( +/*============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_col_meta_t* ib_col_meta); /*!< out: column meta data */ + +/*****************************************************************//** +"Clear" or reset an InnoDB tuple. We free the heap and recreate the tuple. 
+@return new tuple, or NULL */ + +ib_tpl_t +ib_tuple_clear( +/*============*/ + ib_tpl_t ib_tpl); /*!< in: InnoDB tuple */ + +/*****************************************************************//** +Create a new cluster key search tuple and copy the contents of the +secondary index key tuple columns that refer to the cluster index record +to the cluster key. It does a deep copy of the column data. +@return DB_SUCCESS or error code */ + +ib_err_t +ib_tuple_get_cluster_key( +/*=====================*/ + ib_crsr_t ib_crsr, /*!< in: secondary index cursor */ + ib_tpl_t* ib_dst_tpl, /*!< out,own: destination tuple */ + const ib_tpl_t ib_src_tpl); /*!< in: source tuple */ + +/*****************************************************************//** +Copy the contents of source tuple to destination tuple. The tuples +must be of the same type and belong to the same table/index. +@return DB_SUCCESS or error code */ + +ib_err_t +ib_tuple_copy( +/*==========*/ + ib_tpl_t ib_dst_tpl, /*!< in: destination tuple */ + const ib_tpl_t ib_src_tpl); /*!< in: source tuple */ + +/*****************************************************************//** +Create an InnoDB tuple used for index/table search. +@return tuple for current index */ + +ib_tpl_t +ib_sec_search_tuple_create( +/*=======================*/ + ib_crsr_t ib_crsr); /*!< in: Cursor instance */ + +/*****************************************************************//** +Create an InnoDB tuple used for index/table search. +@return tuple for current index */ + +ib_tpl_t +ib_sec_read_tuple_create( +/*=====================*/ + ib_crsr_t ib_crsr); /*!< in: Cursor instance */ + +/*****************************************************************//** +Create an InnoDB tuple used for table key operations. 
+@return tuple for current table */ + +ib_tpl_t +ib_clust_search_tuple_create( +/*=========================*/ + ib_crsr_t ib_crsr); /*!< in: Cursor instance */ + +/*****************************************************************//** +Create an InnoDB tuple for table row operations. +@return tuple for current table */ + +ib_tpl_t +ib_clust_read_tuple_create( +/*=======================*/ + ib_crsr_t ib_crsr); /*!< in: Cursor instance */ + +/*****************************************************************//** +Return the number of user columns in the tuple definition. +@return number of user columns */ + +ib_ulint_t +ib_tuple_get_n_user_cols( +/*=====================*/ + const ib_tpl_t ib_tpl); /*!< in: Tuple for current table */ + +/*****************************************************************//** +Return the number of columns in the tuple definition. +@return number of columns */ + +ib_ulint_t +ib_tuple_get_n_cols( +/*================*/ + const ib_tpl_t ib_tpl); /*!< in: Tuple for current table */ + +/*****************************************************************//** +Destroy an InnoDB tuple. */ + +void +ib_tuple_delete( +/*============*/ + ib_tpl_t ib_tpl); /*!< in,own: Tuple instance to delete */ + +/*****************************************************************//** +Truncate a table. The cursor handle will be closed and set to NULL +on success. +@return DB_SUCCESS or error code */ + +ib_err_t +ib_cursor_truncate( +/*===============*/ + ib_crsr_t* ib_crsr, /*!< in/out: cursor for table + to truncate */ + ib_id_u64_t* table_id); /*!< out: new table id */ + +/*****************************************************************//** +Get a table id. +@return DB_SUCCESS if found */ + +ib_err_t +ib_table_get_id( +/*============*/ + const char* table_name, /*!< in: table to find */ + ib_id_u64_t* table_id); /*!< out: table id if found */ + +/*****************************************************************//** +Get an index id. 
+@return DB_SUCCESS if found */ + +ib_err_t +ib_index_get_id( +/*============*/ + const char* table_name, /*!< in: find index for this table */ + const char* index_name, /*!< in: index to find */ + ib_id_u64_t* index_id); /*!< out: index id if found */ + +/*****************************************************************//** +Check if cursor is positioned. +@return IB_TRUE if positioned */ + +ib_bool_t +ib_cursor_is_positioned( +/*====================*/ + const ib_crsr_t ib_crsr); /*!< in: InnoDB cursor instance */ + +/*****************************************************************//** +Checks if the data dictionary is latched in exclusive mode by a +user transaction. +@return TRUE if exclusive latch */ + +ib_bool_t +ib_schema_lock_is_exclusive( +/*========================*/ + const ib_trx_t ib_trx); /*!< in: transaction */ + +/*****************************************************************//** +Lock an InnoDB cursor/table. +@return DB_SUCCESS or error code */ + +ib_err_t +ib_cursor_lock( +/*===========*/ + ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */ + ib_lck_mode_t ib_lck_mode); /*!< in: InnoDB lock mode */ + +/*****************************************************************//** +Set the Lock an InnoDB table using the table id. +@return DB_SUCCESS or error code */ + +ib_err_t +ib_table_lock( +/*===========*/ + ib_trx_t ib_trx, /*!< in/out: transaction */ + ib_id_u64_t table_id, /*!< in: table id */ + ib_lck_mode_t ib_lck_mode); /*!< in: InnoDB lock mode */ + +/*****************************************************************//** +Set the Lock mode of the cursor. +@return DB_SUCCESS or error code */ + +ib_err_t +ib_cursor_set_lock_mode( +/*====================*/ + ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */ + ib_lck_mode_t ib_lck_mode); /*!< in: InnoDB lock mode */ + +/*****************************************************************//** +Set need to access clustered index record flag. 
*/ + +void +ib_cursor_set_cluster_access( +/*=========================*/ + ib_crsr_t ib_crsr); /*!< in/out: InnoDB cursor */ + +/*****************************************************************//** +Write an integer value to a column. Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCESS or error */ + +ib_err_t +ib_tuple_write_i8( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_i8_t val); /*!< in: value to write */ + +/*****************************************************************//** +Write an integer value to a column. Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCESS or error */ + +ib_err_t +ib_tuple_write_i16( +/*=================*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_i16_t val); /*!< in: value to write */ + +/*****************************************************************//** +Write an integer value to a column. Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCESS or error */ + +ib_err_t +ib_tuple_write_i32( +/*===============*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_i32_t val); /*!< in: value to write */ + +/*****************************************************************//** +Write an integer value to a column. Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCESS or error */ + +ib_err_t +ib_tuple_write_i64( +/*===============*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_i64_t val); /*!< in: value to write */ + +/*****************************************************************//** +Write an integer value to a column. 
Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCESS or error */ + +ib_err_t +ib_tuple_write_u8( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_u8_t val); /*!< in: value to write */ + +/*****************************************************************//** +Write an integer value to a column. Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCESS or error */ + +ib_err_t +ib_tuple_write_u16( +/*===============*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_u16_t val); /*!< in: value to write */ + +/*****************************************************************//** +Write an integer value to a column. Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCESS or error */ + +ib_err_t +ib_tuple_write_u32( +/*=================*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_u32_t val); /*!< in: value to write */ + +/*****************************************************************//** +Write an integer value to a column. Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCESS or error */ + +ib_err_t +ib_tuple_write_u64( +/*===============*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_u64_t val); /*!< in: value to write */ + +/*****************************************************************//** +Inform the cursor that it's the start of an SQL statement. */ + +void +ib_cursor_stmt_begin( +/*=================*/ + ib_crsr_t ib_crsr); /*!< in: cursor */ + +/*****************************************************************//** +Write a double value to a column. 
+@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_write_double( +/*==================*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + int col_no, /*!< in: column number */ + double val); /*!< in: value to write */ + +/*************************************************************//** +Read a double column value from an InnoDB tuple. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_double( +/*=================*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t col_no, /*!< in: column number */ + double* dval); /*!< out: double value */ + +/*****************************************************************//** +Write a float value to a column. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_write_float( +/*=================*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + float val); /*!< in: value to write */ + +/*************************************************************//** +Read a float value from an InnoDB tuple. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_float( +/*================*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t col_no, /*!< in: column number */ + float* fval); /*!< out: float value */ + +/*****************************************************************//** +Get a column type, length and attributes from the tuple. +@return len of column data */ + +const char* +ib_col_get_name( +/*============*/ + ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */ + ib_ulint_t i); /*!< in: column index in tuple */ + +/*****************************************************************//** +Get an index field name from the cursor. +@return name of the field */ + +const char* +ib_get_idx_field_name( +/*==================*/ + ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */ + ib_ulint_t i); /*!< in: column index in tuple */ + +/*****************************************************************//** +Truncate a table. 
+@return DB_SUCCESS or error code */ + +ib_err_t +ib_table_truncate( +/*==============*/ + const char* table_name, /*!< in: table name */ + ib_id_u64_t* table_id); /*!< out: new table id */ + +/*****************************************************************//** +Frees a possible InnoDB trx object associated with the current THD. +@return DB_SUCCESS or error number */ + +ib_err_t +ib_close_thd( +/*=========*/ + void* thd); /*!< in: handle to the MySQL + thread of the user whose resources + should be free'd */ + +/*****************************************************************//** +Get generic configure status +@return configure status */ + +int +ib_cfg_get_cfg(); +/*============*/ + +/*****************************************************************//** +Check whether the table name conforms to our requirements. Currently +we only do a simple check for the presence of a '/'. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_table_name_check( +/*================*/ + const char* name); /*!< in: table name to check */ + +/*****************************************************************//** +Return isolation configuration set by "innodb_api_trx_level" +@return trx isolation level */ + +ib_trx_state_t +ib_cfg_trx_level(); +/*==============*/ + +/*****************************************************************//** +Return configure value for background commit interval (in seconds) +@return background commit interval (in seconds) */ + +ib_ulint_t +ib_cfg_bk_commit_interval(); +/*=======================*/ + +/*****************************************************************//** +Get a trx start time. 
+@return trx start_time */ + +ib_u64_t +ib_trx_get_start_time( +/*==================*/ + ib_trx_t ib_trx); /*!< in: transaction */ + +#endif /* api0api_h */ diff --git a/storage/innobase/include/api0misc.h b/storage/innobase/include/api0misc.h new file mode 100644 index 00000000000..fcd748390d1 --- /dev/null +++ b/storage/innobase/include/api0misc.h @@ -0,0 +1,78 @@ +/***************************************************************************** + +Copyright (c) 2008, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/api0misc.h +InnoDB Native API + +3/20/2011 Jimmy Yang extracted from Embedded InnoDB +2008 Created by Sunny Bains +*******************************************************/ + +#ifndef api0misc_h +#define api0misc_h + +#include "univ.i" +#include "os0file.h" +#include "que0que.h" +#include "trx0trx.h" + +/** Whether binlog is enabled for applications using InnoDB APIs */ +extern my_bool ib_binlog_enabled; + +/** Whether MySQL MDL is enabled for applications using InnoDB APIs */ +extern my_bool ib_mdl_enabled; + +/** Whether InnoDB row lock is disabled for applications using InnoDB APIs */ +extern my_bool ib_disable_row_lock; + +/** configure value for transaction 
isolation level */ +extern ulong ib_trx_level_setting; + +/** configure value for background commit interval (in seconds) */ +extern ulong ib_bk_commit_interval; + +/******************************************************************** +Handles user errors and lock waits detected by the database engine. +@return TRUE if it was a lock wait and we should continue running +the query thread */ +UNIV_INTERN +ibool +ib_handle_errors( +/*=============*/ + dberr_t* new_err, /*!< out: possible new error + encountered in lock wait, or if + no new error, the value of + trx->error_state at the entry of this + function */ + trx_t* trx, /*!< in: transaction */ + que_thr_t* thr, /*!< in: query thread */ + trx_savept_t* savept); /*!< in: savepoint or NULL */ + +/************************************************************************* +Sets a lock on a table. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +ib_trx_lock_table_with_retry( +/*=========================*/ + trx_t* trx, /*!< in/out: transaction */ + dict_table_t* table, /*!< in: table to lock */ + enum lock_mode mode); /*!< in: lock mode */ + +#endif /* api0misc_h */ diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h index 5592995d4b2..b99b0c0cd7b 100644 --- a/storage/innobase/include/btr0btr.h +++ b/storage/innobase/include/btr0btr.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -92,6 +93,17 @@ insert/delete buffer when the record is not in the buffer pool. */ buffer when the record is not in the buffer pool. 
*/ #define BTR_DELETE 8192 +/** In the case of BTR_SEARCH_LEAF or BTR_MODIFY_LEAF, the caller is +already holding an S latch on the index tree */ +#define BTR_ALREADY_S_LATCHED 16384 + +#define BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode) \ + ((latch_mode) & ~(BTR_INSERT \ + | BTR_DELETE_MARK \ + | BTR_DELETE \ + | BTR_ESTIMATE \ + | BTR_IGNORE_SEC_UNIQUE \ + | BTR_ALREADY_S_LATCHED)) #endif /* UNIV_HOTBACKUP */ /**************************************************************//** @@ -118,7 +130,7 @@ btr_corruption_report( #ifdef UNIV_BLOB_DEBUG # include "ut0rbt.h" /** An index->blobs entry for keeping track of off-page column references */ -struct btr_blob_dbg_struct +struct btr_blob_dbg_t { unsigned blob_page_no:32; /*!< first BLOB page number */ unsigned ref_page_no:32; /*!< referring page number */ @@ -207,8 +219,32 @@ UNIV_INTERN page_t* btr_root_get( /*=========*/ + const dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); + +/**************************************************************//** +Checks and adjusts the root node of a tree during IMPORT TABLESPACE. +@return error code, or DB_SUCCESS */ +UNIV_INTERN +dberr_t +btr_root_adjust_on_import( +/*======================*/ + const dict_index_t* index) /*!< in: index tree */ + __attribute__((nonnull, warn_unused_result)); + +/**************************************************************//** +Gets the height of the B-tree (the level of the root, when the leaf +level is assumed to be 0). The caller must hold an S or X latch on +the index. +@return tree height (level of the root) */ +UNIV_INTERN +ulint +btr_height_get( +/*===========*/ dict_index_t* index, /*!< in: index tree */ - mtr_t* mtr); /*!< in: mtr */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull, warn_unused_result)); /**************************************************************//** Gets a buffer page and declares its latching order level. 
*/ UNIV_INLINE @@ -269,7 +305,8 @@ UNIV_INLINE index_id_t btr_page_get_index_id( /*==================*/ - const page_t* page); /*!< in: index page */ + const page_t* page) /*!< in: index page */ + __attribute__((nonnull, pure, warn_unused_result)); #ifndef UNIV_HOTBACKUP /********************************************************//** Gets the node level field in an index page. @@ -278,16 +315,9 @@ UNIV_INLINE ulint btr_page_get_level_low( /*===================*/ - const page_t* page); /*!< in: index page */ -/********************************************************//** -Gets the node level field in an index page. -@return level, leaf level == 0 */ -UNIV_INLINE -ulint -btr_page_get_level( -/*===============*/ - const page_t* page, /*!< in: index page */ - mtr_t* mtr); /*!< in: mini-transaction handle */ + const page_t* page) /*!< in: index page */ + __attribute__((nonnull, pure, warn_unused_result)); +#define btr_page_get_level(page, mtr) btr_page_get_level_low(page) /********************************************************//** Gets the next index page number. @return next page number */ @@ -296,7 +326,8 @@ ulint btr_page_get_next( /*==============*/ const page_t* page, /*!< in: index page */ - mtr_t* mtr); /*!< in: mini-transaction handle */ + mtr_t* mtr) /*!< in: mini-transaction handle */ + __attribute__((nonnull, warn_unused_result)); /********************************************************//** Gets the previous index page number. @return prev page number */ @@ -305,7 +336,8 @@ ulint btr_page_get_prev( /*==============*/ const page_t* page, /*!< in: index page */ - mtr_t* mtr); /*!< in: mini-transaction handle */ + mtr_t* mtr) /*!< in: mini-transaction handle */ + __attribute__((nonnull, warn_unused_result)); /*************************************************************//** Gets pointer to the previous user record in the tree. It is assumed that the caller has appropriate latches on the page and its neighbor. 
@@ -315,8 +347,9 @@ rec_t* btr_get_prev_user_rec( /*==================*/ rec_t* rec, /*!< in: record on leaf level */ - mtr_t* mtr); /*!< in: mtr holding a latch on the page, and if + mtr_t* mtr) /*!< in: mtr holding a latch on the page, and if needed, also to the previous page */ + __attribute__((nonnull, warn_unused_result)); /*************************************************************//** Gets pointer to the next user record in the tree. It is assumed that the caller has appropriate latches on the page and its neighbor. @@ -326,8 +359,9 @@ rec_t* btr_get_next_user_rec( /*==================*/ rec_t* rec, /*!< in: record on leaf level */ - mtr_t* mtr); /*!< in: mtr holding a latch on the page, and if + mtr_t* mtr) /*!< in: mtr holding a latch on the page, and if needed, also to the next page */ + __attribute__((nonnull, warn_unused_result)); /**************************************************************//** Releases the latch on a leaf page and bufferunfixes it. */ UNIV_INLINE @@ -337,7 +371,8 @@ btr_leaf_page_release( buf_block_t* block, /*!< in: buffer block */ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or BTR_MODIFY_LEAF */ - mtr_t* mtr); /*!< in: mtr */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); /**************************************************************//** Gets the child node file address in a node pointer. NOTE: the offsets array must contain all offsets for the record since @@ -350,7 +385,8 @@ ulint btr_node_ptr_get_child_page_no( /*===========================*/ const rec_t* rec, /*!< in: node pointer record */ - const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); /************************************************************//** Creates the root node for a new index tree. 
@return page number of the created root, FIL_NULL if did not succeed */ @@ -364,7 +400,8 @@ btr_create( or 0 for uncompressed pages */ index_id_t index_id,/*!< in: index id */ dict_index_t* index, /*!< in: index */ - mtr_t* mtr); /*!< in: mini-transaction handle */ + mtr_t* mtr) /*!< in: mini-transaction handle */ + __attribute__((nonnull)); /************************************************************//** Frees a B-tree except the root page, which MUST be freed after this by calling btr_free_root. */ @@ -386,7 +423,8 @@ btr_free_root( ulint zip_size, /*!< in: compressed page size in bytes or 0 for uncompressed pages */ ulint root_page_no, /*!< in: root page number */ - mtr_t* mtr); /*!< in/out: mini-transaction */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); /*************************************************************//** Makes tree one level higher by splitting the root, and inserts the tuple. It is assumed that mtr contains an x-latch on the tree. @@ -398,13 +436,18 @@ UNIV_INTERN rec_t* btr_root_raise_and_insert( /*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ btr_cur_t* cursor, /*!< in: cursor at which to insert: must be on the root page; when the function returns, the cursor is positioned on the predecessor of the inserted record */ + ulint** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap + that can be emptied, or NULL */ const dtuple_t* tuple, /*!< in: tuple to insert */ ulint n_ext, /*!< in: number of externally stored columns */ - mtr_t* mtr); /*!< in: mtr */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull, warn_unused_result)); /*************************************************************//** Reorganizes an index page. 
IMPORTANT: if btr_page_reorganize() is invoked on a compressed leaf @@ -418,7 +461,8 @@ btr_page_reorganize( /*================*/ buf_block_t* block, /*!< in: page to be reorganized */ dict_index_t* index, /*!< in: record descriptor */ - mtr_t* mtr); /*!< in: mtr */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); /*************************************************************//** Decides if the page should be split at the convergence point of inserts converging to left. @@ -428,9 +472,10 @@ ibool btr_page_get_split_rec_to_left( /*===========================*/ btr_cur_t* cursor, /*!< in: cursor at which to insert */ - rec_t** split_rec);/*!< out: if split recommended, + rec_t** split_rec)/*!< out: if split recommended, the first record on upper half page, or NULL if tuple should be first */ + __attribute__((nonnull, warn_unused_result)); /*************************************************************//** Decides if the page should be split at the convergence point of inserts converging to right. @@ -440,9 +485,10 @@ ibool btr_page_get_split_rec_to_right( /*============================*/ btr_cur_t* cursor, /*!< in: cursor at which to insert */ - rec_t** split_rec);/*!< out: if split recommended, + rec_t** split_rec)/*!< out: if split recommended, the first record on upper half page, or NULL if tuple should be first */ + __attribute__((nonnull, warn_unused_result)); /*************************************************************//** Splits an index page to halves and inserts the tuple. It is assumed that mtr holds an x-latch to the index tree. 
NOTE: the tree x-latch is @@ -456,12 +502,17 @@ UNIV_INTERN rec_t* btr_page_split_and_insert( /*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ btr_cur_t* cursor, /*!< in: cursor at which to insert; when the function returns, the cursor is positioned on the predecessor of the inserted record */ + ulint** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap + that can be emptied, or NULL */ const dtuple_t* tuple, /*!< in: tuple to insert */ ulint n_ext, /*!< in: number of externally stored columns */ - mtr_t* mtr); /*!< in: mtr */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull, warn_unused_result)); /*******************************************************//** Inserts a data tuple to a tree on a non-leaf level. It is assumed that mtr holds an x-latch on the tree. */ @@ -469,14 +520,16 @@ UNIV_INTERN void btr_insert_on_non_leaf_level_func( /*==============================*/ + ulint flags, /*!< in: undo logging and locking flags */ dict_index_t* index, /*!< in: index */ ulint level, /*!< in: level, must be > 0 */ dtuple_t* tuple, /*!< in: the record to be inserted */ const char* file, /*!< in: file name */ ulint line, /*!< in: line where called */ - mtr_t* mtr); /*!< in: mtr */ -# define btr_insert_on_non_leaf_level(i,l,t,m) \ - btr_insert_on_non_leaf_level_func(i,l,t,__FILE__,__LINE__,m) + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); +# define btr_insert_on_non_leaf_level(f,i,l,t,m) \ + btr_insert_on_non_leaf_level_func(f,i,l,t,__FILE__,__LINE__,m) #endif /* !UNIV_HOTBACKUP */ /****************************************************************//** Sets a record as the predefined minimum record. 
*/ @@ -485,7 +538,8 @@ void btr_set_min_rec_mark( /*=================*/ rec_t* rec, /*!< in/out: record */ - mtr_t* mtr); /*!< in: mtr */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); #ifndef UNIV_HOTBACKUP /*************************************************************//** Deletes on the upper level the node pointer to a page. */ @@ -495,7 +549,8 @@ btr_node_ptr_delete( /*================*/ dict_index_t* index, /*!< in: index tree */ buf_block_t* block, /*!< in: page whose node pointer is deleted */ - mtr_t* mtr); /*!< in: mtr */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); #ifdef UNIV_DEBUG /************************************************************//** Checks that the node pointer to a page is appropriate. @@ -506,7 +561,8 @@ btr_check_node_ptr( /*===============*/ dict_index_t* index, /*!< in: index tree */ buf_block_t* block, /*!< in: index page */ - mtr_t* mtr); /*!< in: mtr */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull, warn_unused_result)); #endif /* UNIV_DEBUG */ /*************************************************************//** Tries to merge the page first to the left immediate brother if such a @@ -540,7 +596,8 @@ btr_discard_page( /*=============*/ btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on the root page */ - mtr_t* mtr); /*!< in: mtr */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); #endif /* !UNIV_HOTBACKUP */ /****************************************************************//** Parses the redo log record for setting an index record as the predefined @@ -554,7 +611,8 @@ btr_parse_set_min_rec_mark( byte* end_ptr,/*!< in: buffer end */ ulint comp, /*!< in: nonzero=compact page format */ page_t* page, /*!< in: page or NULL */ - mtr_t* mtr); /*!< in: mtr or NULL */ + mtr_t* mtr) /*!< in: mtr or NULL */ + __attribute__((nonnull(1,2), warn_unused_result)); /***********************************************************//** Parses a redo log record of reorganizing a page. 
@return end of log record or NULL */ @@ -565,8 +623,10 @@ btr_parse_page_reorganize( byte* ptr, /*!< in: buffer */ byte* end_ptr,/*!< in: buffer end */ dict_index_t* index, /*!< in: record descriptor */ + bool compressed,/*!< in: true if compressed page */ buf_block_t* block, /*!< in: page to be reorganized, or NULL */ - mtr_t* mtr); /*!< in: mtr or NULL */ + mtr_t* mtr) /*!< in: mtr or NULL */ + __attribute__((nonnull(1,2,3), warn_unused_result)); #ifndef UNIV_HOTBACKUP /**************************************************************//** Gets the number of pages in a B-tree. @@ -612,7 +672,8 @@ btr_page_free( /*==========*/ dict_index_t* index, /*!< in: index tree */ buf_block_t* block, /*!< in: block to be freed, x-latched */ - mtr_t* mtr); /*!< in: mtr */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); /**************************************************************//** Frees a file page used in an index tree. Can be used also to BLOB external storage pages, because the page level 0 can be given as an @@ -624,7 +685,8 @@ btr_page_free_low( dict_index_t* index, /*!< in: index tree */ buf_block_t* block, /*!< in: block to be freed, x-latched */ ulint level, /*!< in: page level */ - mtr_t* mtr); /*!< in: mtr */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); #ifdef UNIV_BTR_PRINT /*************************************************************//** Prints size info of a B-tree. */ @@ -632,7 +694,8 @@ UNIV_INTERN void btr_print_size( /*===========*/ - dict_index_t* index); /*!< in: index tree */ + dict_index_t* index) /*!< in: index tree */ + __attribute__((nonnull)); /**************************************************************//** Prints directories and other info of all nodes in the index. 
*/ UNIV_INTERN @@ -640,8 +703,9 @@ void btr_print_index( /*============*/ dict_index_t* index, /*!< in: index */ - ulint width); /*!< in: print this many entries from start + ulint width) /*!< in: print this many entries from start and end */ + __attribute__((nonnull)); #endif /* UNIV_BTR_PRINT */ /************************************************************//** Checks the size and number of fields in a record based on the definition of @@ -653,18 +717,20 @@ btr_index_rec_validate( /*===================*/ const rec_t* rec, /*!< in: index record */ const dict_index_t* index, /*!< in: index */ - ibool dump_on_error); /*!< in: TRUE if the function + ibool dump_on_error) /*!< in: TRUE if the function should print hex dump of record and page on error */ + __attribute__((nonnull, warn_unused_result)); /**************************************************************//** Checks the consistency of an index tree. @return TRUE if ok */ UNIV_INTERN -ibool +bool btr_validate_index( /*===============*/ - dict_index_t* index, /*!< in: index */ - trx_t* trx); /*!< in: transaction or NULL */ + dict_index_t* index, /*!< in: index */ + const trx_t* trx) /*!< in: transaction or 0 */ + __attribute__((nonnull(1), warn_unused_result)); #define BTR_N_LEAF_PAGES 1 #define BTR_TOTAL_SIZE 2 diff --git a/storage/innobase/include/btr0btr.ic b/storage/innobase/include/btr0btr.ic index 6f7a66b12ac..00f50b5dcaf 100644 --- a/storage/innobase/include/btr0btr.ic +++ b/storage/innobase/include/btr0btr.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -126,22 +126,6 @@ btr_page_get_level_low( } /********************************************************//** -Gets the node level field in an index page. -@return level, leaf level == 0 */ -UNIV_INLINE -ulint -btr_page_get_level( -/*===============*/ - const page_t* page, /*!< in: index page */ - mtr_t* mtr __attribute__((unused))) - /*!< in: mini-transaction handle */ -{ - ut_ad(page && mtr); - - return(btr_page_get_level_low(page)); -} - -/********************************************************//** Sets the node level field in an index page. */ UNIV_INLINE void @@ -278,6 +262,7 @@ btr_node_ptr_get_child_page_no( " in a node ptr record at offset %lu\n", (ulong) page_offset(rec)); buf_page_print(page_align(rec), 0, 0); + ut_ad(0); } return(page_no); diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h index f437575579e..edba1d1d77f 100644 --- a/storage/innobase/include/btr0cur.h +++ b/storage/innobase/include/btr0cur.h @@ -31,14 +31,22 @@ Created 10/16/1994 Heikki Tuuri #include "page0cur.h" #include "btr0types.h" -/* Mode flags for btr_cur operations; these can be ORed */ -#define BTR_NO_UNDO_LOG_FLAG 1 /* do no undo logging */ -#define BTR_NO_LOCKING_FLAG 2 /* do no record lock checking */ -#define BTR_KEEP_SYS_FLAG 4 /* sys fields will be found from the - update vector or inserted entry */ -#define BTR_KEEP_POS_FLAG 8 /* btr_cur_pessimistic_update() - must keep cursor position when - moving columns to big_rec */ +/** Mode flags for btr_cur operations; these can be ORed */ +enum { + /** do no undo logging */ + BTR_NO_UNDO_LOG_FLAG = 1, + /** do no record lock checking */ + BTR_NO_LOCKING_FLAG = 2, + /** sys fields will be found in the update vector or inserted + entry */ + BTR_KEEP_SYS_FLAG = 4, + /** btr_cur_pessimistic_update() must keep cursor position + when moving columns 
to big_rec */ + BTR_KEEP_POS_FLAG = 8, + /** the caller is creating the index or wants to bypass the + index->info.online creation log */ + BTR_CREATE_FLAG = 16 +}; #ifndef UNIV_HOTBACKUP #include "que0types.h" @@ -164,16 +172,19 @@ UNIV_INTERN void btr_cur_open_at_index_side_func( /*============================*/ - ibool from_left, /*!< in: TRUE if open to the low end, - FALSE if to the high end */ + bool from_left, /*!< in: true if open to the low end, + false if to the high end */ dict_index_t* index, /*!< in: index */ ulint latch_mode, /*!< in: latch mode */ - btr_cur_t* cursor, /*!< in: cursor */ + btr_cur_t* cursor, /*!< in/out: cursor */ + ulint level, /*!< in: level to search for + (0=leaf) */ const char* file, /*!< in: file name */ ulint line, /*!< in: line where called */ - mtr_t* mtr); /*!< in: mtr */ -#define btr_cur_open_at_index_side(f,i,l,c,m) \ - btr_cur_open_at_index_side_func(f,i,l,c,__FILE__,__LINE__,m) + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); +#define btr_cur_open_at_index_side(f,i,l,c,lv,m) \ + btr_cur_open_at_index_side_func(f,i,l,c,lv,__FILE__,__LINE__,m) /**********************************************************************//** Positions a cursor at a randomly chosen position within a B-tree. */ UNIV_INTERN @@ -196,7 +207,7 @@ one record on the page, the insert will always succeed; this is to prevent trying to split a page with just one record. 
@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */ UNIV_INTERN -ulint +dberr_t btr_cur_optimistic_insert( /*======================*/ ulint flags, /*!< in: undo logging and locking flags: if not @@ -204,6 +215,8 @@ btr_cur_optimistic_insert( specified */ btr_cur_t* cursor, /*!< in: cursor on page after which to insert; cursor stays valid */ + ulint** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ dtuple_t* entry, /*!< in/out: entry to insert */ rec_t** rec, /*!< out: pointer to inserted record if succeed */ @@ -212,11 +225,12 @@ btr_cur_optimistic_insert( NULL */ ulint n_ext, /*!< in: number of externally stored columns */ que_thr_t* thr, /*!< in: query thread or NULL */ - mtr_t* mtr); /*!< in: mtr; if this function returns + mtr_t* mtr) /*!< in: mtr; if this function returns DB_SUCCESS on a leaf page of a secondary index in a compressed tablespace, the mtr must be committed before latching any further pages */ + __attribute__((nonnull(2,3,4,5,6,7,10), warn_unused_result)); /*************************************************************//** Performs an insert on a page of an index tree. It is assumed that mtr holds an x-latch on the tree and on the cursor page. If the insert is @@ -224,7 +238,7 @@ made on the leaf level, to avoid deadlocks, mtr must also own x-latches to brothers of page, if those brothers exist. 
@return DB_SUCCESS or error number */ UNIV_INTERN -ulint +dberr_t btr_cur_pessimistic_insert( /*=======================*/ ulint flags, /*!< in: undo logging and locking flags: if not @@ -235,6 +249,9 @@ btr_cur_pessimistic_insert( insertion will certainly succeed */ btr_cur_t* cursor, /*!< in: cursor after which to insert; cursor stays valid */ + ulint** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap + that can be emptied, or NULL */ dtuple_t* entry, /*!< in/out: entry to insert */ rec_t** rec, /*!< out: pointer to inserted record if succeed */ @@ -243,7 +260,8 @@ btr_cur_pessimistic_insert( NULL */ ulint n_ext, /*!< in: number of externally stored columns */ que_thr_t* thr, /*!< in: query thread or NULL */ - mtr_t* mtr); /*!< in: mtr */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull(2,3,4,5,6,7,10), warn_unused_result)); /*************************************************************//** See if there is enough place in the page modification log to log an update-in-place. @@ -264,19 +282,23 @@ btr_cur_update_alloc_zip( Updates a record when the update causes no size changes in its fields. 
@return DB_SUCCESS or error number */ UNIV_INTERN -ulint +dberr_t btr_cur_update_in_place( /*====================*/ ulint flags, /*!< in: undo logging and locking flags */ btr_cur_t* cursor, /*!< in: cursor on the record to update; cursor stays valid and positioned on the same record */ + const ulint* offsets,/*!< in: offsets on cursor->page_cur.rec */ const upd_t* update, /*!< in: update vector */ ulint cmpl_info,/*!< in: compiler info on secondary index updates */ - que_thr_t* thr, /*!< in: query thread */ - mtr_t* mtr); /*!< in: mtr; must be committed before + que_thr_t* thr, /*!< in: query thread, or NULL if + appropriate flags are set */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in: mtr; must be committed before latching any further pages */ + __attribute__((warn_unused_result, nonnull(2,3,4,8))); /*************************************************************//** Tries to update a record on a page in an index tree. It is assumed that mtr holds an x-latch on the page. The operation does not succeed if there is too @@ -286,20 +308,25 @@ so that tree compression is recommended. 
DB_UNDERFLOW if the page would become too empty, or DB_ZIP_OVERFLOW if there is not enough space left on the compressed page */ UNIV_INTERN -ulint +dberr_t btr_cur_optimistic_update( /*======================*/ ulint flags, /*!< in: undo logging and locking flags */ btr_cur_t* cursor, /*!< in: cursor on the record to update; cursor stays valid and positioned on the same record */ + ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ const upd_t* update, /*!< in: update vector; this must also contain trx id and roll ptr fields */ ulint cmpl_info,/*!< in: compiler info on secondary index updates */ - que_thr_t* thr, /*!< in: query thread */ - mtr_t* mtr); /*!< in: mtr; must be committed before + que_thr_t* thr, /*!< in: query thread, or NULL if + appropriate flags are set */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in: mtr; must be committed before latching any further pages */ + __attribute__((warn_unused_result, nonnull(2,3,4,5,9))); /*************************************************************//** Performs an update of a record on a page of a tree. It is assumed that mtr holds an x-latch on the tree and on the cursor page. If the @@ -307,7 +334,7 @@ update is made on the leaf level, to avoid deadlocks, mtr must also own x-latches to brothers of page, if those brothers exist. 
@return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t btr_cur_pessimistic_update( /*=======================*/ ulint flags, /*!< in: undo logging, locking, and rollback @@ -315,7 +342,13 @@ btr_cur_pessimistic_update( btr_cur_t* cursor, /*!< in/out: cursor on the record to update; cursor may become invalid if *big_rec == NULL || !(flags & BTR_KEEP_POS_FLAG) */ - mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** offsets_heap, + /*!< in/out: pointer to memory heap + that can be emptied, or NULL */ + mem_heap_t* entry_heap, + /*!< in/out: memory heap for allocating + big_rec and the index tuple */ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to be stored externally by the caller, or NULL */ const upd_t* update, /*!< in: update vector; this is allowed also @@ -323,9 +356,12 @@ btr_cur_pessimistic_update( the values in update vector have no effect */ ulint cmpl_info,/*!< in: compiler info on secondary index updates */ - que_thr_t* thr, /*!< in: query thread */ - mtr_t* mtr); /*!< in: mtr; must be committed before + que_thr_t* thr, /*!< in: query thread, or NULL if + appropriate flags are set */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in: mtr; must be committed before latching any further pages */ + __attribute__((warn_unused_result, nonnull(2,3,4,5,6,7,11))); /***********************************************************//** Marks a clustered index record deleted. Writes an undo log record to undo log on this delete marking. Writes in the trx id field the id @@ -333,15 +369,13 @@ of the deleting transaction, and in the roll ptr field pointer to the undo log record created. 
@return DB_SUCCESS, DB_LOCK_WAIT, or error number */ UNIV_INTERN -ulint +dberr_t btr_cur_del_mark_set_clust_rec( /*===========================*/ - ulint flags, /*!< in: undo logging and locking flags */ buf_block_t* block, /*!< in/out: buffer block of the record */ rec_t* rec, /*!< in/out: record */ dict_index_t* index, /*!< in: clustered index of the record */ const ulint* offsets,/*!< in: rec_get_offsets(rec) */ - ibool val, /*!< in: value to set */ que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr) /*!< in: mtr */ __attribute__((nonnull)); @@ -349,7 +383,7 @@ btr_cur_del_mark_set_clust_rec( Sets a secondary index record delete mark to TRUE or FALSE. @return DB_SUCCESS, DB_LOCK_WAIT, or error number */ UNIV_INTERN -ulint +dberr_t btr_cur_del_mark_set_sec_rec( /*=========================*/ ulint flags, /*!< in: locking flag */ @@ -382,16 +416,27 @@ but no latch on the whole tree. @return TRUE if success, i.e., the page did not become too empty */ UNIV_INTERN ibool -btr_cur_optimistic_delete( -/*======================*/ +btr_cur_optimistic_delete_func( +/*===========================*/ btr_cur_t* cursor, /*!< in: cursor on the record to delete; cursor stays valid: if deletion succeeds, on function exit it points to the successor of the deleted record */ - mtr_t* mtr); /*!< in: mtr; if this function returns +# ifdef UNIV_DEBUG + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ +# endif /* UNIV_DEBUG */ + mtr_t* mtr) /*!< in: mtr; if this function returns TRUE on a leaf page of a secondary index, the mtr must be committed before latching any further pages */ + __attribute__((nonnull, warn_unused_result)); +# ifdef UNIV_DEBUG +# define btr_cur_optimistic_delete(cursor, flags, mtr) \ + btr_cur_optimistic_delete_func(cursor, flags, mtr) +# else /* UNIV_DEBUG */ +# define btr_cur_optimistic_delete(cursor, flags, mtr) \ + btr_cur_optimistic_delete_func(cursor, mtr) +# endif /* UNIV_DEBUG */ /*************************************************************//** Removes the 
record on which the tree cursor is positioned. Tries to compress the page if its fillfactor drops below a threshold @@ -404,7 +449,7 @@ UNIV_INTERN ibool btr_cur_pessimistic_delete( /*=======================*/ - ulint* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE; + dberr_t* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE; the latter may occur because we may have to update node pointers on upper levels, and in the case of variable length keys @@ -417,8 +462,10 @@ btr_cur_pessimistic_delete( if compression does not occur, the cursor stays valid: it points to successor of deleted record on function exit */ + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ enum trx_rb_ctx rb_ctx, /*!< in: rollback context */ - mtr_t* mtr); /*!< in: mtr */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); #endif /* !UNIV_HOTBACKUP */ /***********************************************************//** Parses a redo log record of updating a record in-place. @@ -472,9 +519,10 @@ btr_estimate_n_rows_in_range( ulint mode2); /*!< in: search mode for range end */ /*******************************************************************//** Estimates the number of different key values in a given index, for -each n-column prefix of the index where n <= dict_index_get_n_unique(index). -The estimates are stored in the array index->stat_n_diff_key_vals[] and -the number of pages that were sampled is saved in index->stat_n_sample_sizes[]. +each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index). +The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed +0..n_uniq-1) and the number of pages that were sampled is saved in +index->stat_n_sample_sizes[]. If innodb_stats_method is nulls_ignored, we also record the number of non-null values for each prefix and stored the estimates in array index->stat_n_non_null_key_vals. */ @@ -528,7 +576,7 @@ The fields are stored on pages allocated from leaf node file segment of the index tree. 
@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ UNIV_INTERN -enum db_err +dberr_t btr_store_big_rec_extern_fields( /*============================*/ dict_index_t* index, /*!< in: index of rec; the index tree @@ -662,8 +710,7 @@ limit, merging it to a neighbor is tried */ /** A slot in the path array. We store here info on a search path down the tree. Each slot contains data on a single level of the tree. */ -typedef struct btr_path_struct btr_path_t; -struct btr_path_struct{ +struct btr_path_t{ ulint nth_rec; /*!< index of the record where the page cursor stopped on this level (index in alphabetical @@ -700,7 +747,7 @@ enum btr_cur_method { /** The tree cursor: the definition appears here only for the compiler to know struct size! */ -struct btr_cur_struct { +struct btr_cur_t { dict_index_t* index; /*!< index where positioned */ page_cur_t page_cur; /*!< page cursor */ purge_node_t* purge_node; /*!< purge node, for BTR_DELETE */ @@ -737,7 +784,7 @@ struct btr_cur_struct { for comparison to the adjacent user record if that record is on a different leaf page! (See the note in - row_ins_duplicate_key.) */ + row_ins_duplicate_error_in_clust.) */ ulint up_bytes; /*!< number of matched bytes to the right at the time cursor positioned; only used internally in searches: not @@ -822,6 +869,11 @@ srv_printf_innodb_monitor(). 
*/ extern ulint btr_cur_n_sea_old; #endif /* !UNIV_HOTBACKUP */ +#ifdef UNIV_DEBUG +/* Flag to limit optimistic insert records */ +extern uint btr_cur_limit_optimistic_insert_debug; +#endif /* UNIV_DEBUG */ + #ifndef UNIV_NONINL #include "btr0cur.ic" #endif diff --git a/storage/innobase/include/btr0cur.ic b/storage/innobase/include/btr0cur.ic index 540417e3062..080866c7465 100644 --- a/storage/innobase/include/btr0cur.ic +++ b/storage/innobase/include/btr0cur.ic @@ -27,6 +27,16 @@ Created 10/16/1994 Heikki Tuuri #include "btr0btr.h" #ifdef UNIV_DEBUG +# define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE)\ +if (btr_cur_limit_optimistic_insert_debug\ + && (NREC) >= (ulint)btr_cur_limit_optimistic_insert_debug) {\ + CODE;\ +} +#else +# define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE) +#endif /* UNIV_DEBUG */ + +#ifdef UNIV_DEBUG /*********************************************************//** Returns the page cursor component of a tree cursor. @return pointer to page cursor component */ @@ -135,6 +145,9 @@ btr_cur_compress_recommendation( page = btr_cur_get_page(cursor); + LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page) * 2, + return(FALSE)); + if ((page_get_data_size(page) < BTR_CUR_PAGE_COMPRESS_LIMIT) || ((btr_page_get_next(page, mtr) == FIL_NULL) && (btr_page_get_prev(page, mtr) == FIL_NULL))) { diff --git a/storage/innobase/include/btr0pcur.h b/storage/innobase/include/btr0pcur.h index a8eaac4690b..973fae382ab 100644 --- a/storage/innobase/include/btr0pcur.h +++ b/storage/innobase/include/btr0pcur.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -146,13 +146,16 @@ UNIV_INLINE void btr_pcur_open_at_index_side( /*========================*/ - ibool from_left, /*!< in: TRUE if open to the low end, - FALSE if to the high end */ + bool from_left, /*!< in: true if open to the low end, + false if to the high end */ dict_index_t* index, /*!< in: index */ ulint latch_mode, /*!< in: latch mode */ - btr_pcur_t* pcur, /*!< in: cursor */ - ibool do_init, /*!< in: TRUE if should be initialized */ - mtr_t* mtr); /*!< in: mtr */ + btr_pcur_t* pcur, /*!< in/out: cursor */ + bool init_pcur, /*!< in: whether to initialize pcur */ + ulint level, /*!< in: level to search for + (0=leaf) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); /**************************************************************//** Gets the up_match value for a pcur after a search. @return number of matched fields at the cursor or to the right if @@ -209,8 +212,17 @@ btr_pcur_open_at_rnd_pos_func( #define btr_pcur_open_at_rnd_pos(i,l,c,m) \ btr_pcur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m) /**************************************************************//** -Frees the possible old_rec_buf buffer of a persistent cursor and sets the -latch mode of the persistent cursor to BTR_NO_LATCHES. */ +Frees the possible memory heap of a persistent cursor and sets the latch +mode of the persistent cursor to BTR_NO_LATCHES. +WARNING: this function does not release the latch on the page where the +cursor is currently positioned. The latch is acquired by the +"move to next/previous" family of functions. 
Since recursive shared locks +are not allowed, you must take care (if using the cursor in S-mode) to +manually release the latch by either calling +btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr) +or by committing the mini-transaction right after btr_pcur_close(). +A subsequent attempt to crawl the same page in the same mtr would cause +an assertion failure. */ UNIV_INLINE void btr_pcur_close( @@ -452,14 +464,14 @@ btr_pcur_move_to_prev_on_page( /* The persistent B-tree cursor structure. This is used mainly for SQL selects, updates, and deletes. */ -struct btr_pcur_struct{ +struct btr_pcur_t{ btr_cur_t btr_cur; /*!< a B-tree cursor */ ulint latch_mode; /*!< see TODO note below! BTR_SEARCH_LEAF, BTR_MODIFY_LEAF, BTR_MODIFY_TREE, or BTR_NO_LATCHES, depending on the latching state of the page and tree where the cursor is - positioned; the last value means that + positioned; BTR_NO_LATCHES means that the cursor is not currently positioned: we say then that the cursor is detached; it can be restored to diff --git a/storage/innobase/include/btr0pcur.ic b/storage/innobase/include/btr0pcur.ic index a27033c4a7c..79afd7c322e 100644 --- a/storage/innobase/include/btr0pcur.ic +++ b/storage/innobase/include/btr0pcur.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -429,7 +429,7 @@ btr_pcur_open_low( btr_pcur_init(cursor); - cursor->latch_mode = latch_mode; + cursor->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); cursor->search_mode = mode; /* Search with the tree cursor */ @@ -496,28 +496,26 @@ UNIV_INLINE void btr_pcur_open_at_index_side( /*========================*/ - ibool from_left, /*!< in: TRUE if open to the low end, - FALSE if to the high end */ + bool from_left, /*!< in: true if open to the low end, + false if to the high end */ dict_index_t* index, /*!< in: index */ ulint latch_mode, /*!< in: latch mode */ - btr_pcur_t* pcur, /*!< in: cursor */ - ibool do_init, /*!< in: TRUE if should be initialized */ - mtr_t* mtr) /*!< in: mtr */ + btr_pcur_t* pcur, /*!< in/out: cursor */ + bool init_pcur, /*!< in: whether to initialize pcur */ + ulint level, /*!< in: level to search for + (0=leaf) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { - pcur->latch_mode = latch_mode; + pcur->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); - if (from_left) { - pcur->search_mode = PAGE_CUR_G; - } else { - pcur->search_mode = PAGE_CUR_L; - } + pcur->search_mode = from_left ? PAGE_CUR_G : PAGE_CUR_L; - if (do_init) { + if (init_pcur) { btr_pcur_init(pcur); } btr_cur_open_at_index_side(from_left, index, latch_mode, - btr_pcur_get_btr_cur(pcur), mtr); + btr_pcur_get_btr_cur(pcur), level, mtr); pcur->pos_state = BTR_PCUR_IS_POSITIONED; pcur->old_stored = BTR_PCUR_OLD_NOT_STORED; @@ -556,7 +554,16 @@ btr_pcur_open_at_rnd_pos_func( /**************************************************************//** Frees the possible memory heap of a persistent cursor and sets the latch -mode of the persistent cursor to BTR_NO_LATCHES. */ +mode of the persistent cursor to BTR_NO_LATCHES. 
+WARNING: this function does not release the latch on the page where the +cursor is currently positioned. The latch is acquired by the +"move to next/previous" family of functions. Since recursive shared locks +are not allowed, you must take care (if using the cursor in S-mode) to +manually release the latch by either calling +btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr) +or by committing the mini-transaction right after btr_pcur_close(). +A subsequent attempt to crawl the same page in the same mtr would cause +an assertion failure. */ UNIV_INLINE void btr_pcur_close( diff --git a/storage/innobase/include/btr0sea.h b/storage/innobase/include/btr0sea.h index 5316c3efd39..fea117d0aaf 100644 --- a/storage/innobase/include/btr0sea.h +++ b/storage/innobase/include/btr0sea.h @@ -68,7 +68,8 @@ UNIV_INLINE btr_search_t* btr_search_get_info( /*================*/ - dict_index_t* index); /*!< in: index */ + dict_index_t* index) /*!< in: index */ + __attribute__((nonnull)); /*****************************************************************//** Creates and initializes a search info struct. @return own: search info struct */ @@ -193,7 +194,7 @@ btr_search_validate(void); #endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */ /** The search info struct in an index */ -struct btr_search_struct{ +struct btr_search_t{ ulint ref_count; /*!< Number of blocks in this index tree that have search index built i.e. block->index points to this index. 
@@ -242,16 +243,13 @@ struct btr_search_struct{ #endif /* UNIV_SEARCH_PERF_STAT */ #ifdef UNIV_DEBUG ulint magic_n; /*!< magic number @see BTR_SEARCH_MAGIC_N */ -/** value of btr_search_struct::magic_n, used in assertions */ +/** value of btr_search_t::magic_n, used in assertions */ # define BTR_SEARCH_MAGIC_N 1112765 #endif /* UNIV_DEBUG */ }; /** The hash index system */ -typedef struct btr_search_sys_struct btr_search_sys_t; - -/** The hash index system */ -struct btr_search_sys_struct{ +struct btr_search_sys_t{ hash_table_t* hash_index; /*!< the adaptive hash index, mapping dtuple_fold values to rec_t pointers on index pages */ diff --git a/storage/innobase/include/btr0sea.ic b/storage/innobase/include/btr0sea.ic index 49ba0fd3f0b..0bd869be136 100644 --- a/storage/innobase/include/btr0sea.ic +++ b/storage/innobase/include/btr0sea.ic @@ -45,8 +45,6 @@ btr_search_get_info( /*================*/ dict_index_t* index) /*!< in: index */ { - ut_ad(index); - return(index->search_info); } diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h index 09f97b3cabd..c1a4531f861 100644 --- a/storage/innobase/include/btr0types.h +++ b/storage/innobase/include/btr0types.h @@ -33,11 +33,11 @@ Created 2/17/1996 Heikki Tuuri #include "sync0rw.h" /** Persistent cursor */ -typedef struct btr_pcur_struct btr_pcur_t; +struct btr_pcur_t; /** B-tree cursor */ -typedef struct btr_cur_struct btr_cur_t; +struct btr_cur_t; /** B-tree search information for the adaptive hash index */ -typedef struct btr_search_struct btr_search_t; +struct btr_search_t; #ifndef UNIV_HOTBACKUP @@ -68,7 +68,7 @@ extern char btr_search_enabled; #ifdef UNIV_BLOB_DEBUG # include "buf0types.h" /** An index->blobs entry for keeping track of off-page column references */ -typedef struct btr_blob_dbg_struct btr_blob_dbg_t; +struct btr_blob_dbg_t; /** Insert to index->blobs a reference to an off-page column. 
@param index the index tree diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 2284f21e3ab..74a6e203808 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -89,8 +89,6 @@ extern ibool buf_debug_prints;/*!< If this is set TRUE, the program prints info whenever read or flush occurs */ #endif /* UNIV_DEBUG */ -extern ulint srv_buf_pool_write_requests; /*!< variable to count write request - issued */ extern ulint srv_buf_pool_instances; extern ulint srv_buf_pool_curr_size; #else /* !UNIV_HOTBACKUP */ @@ -102,7 +100,7 @@ extern buf_block_t* back_block2; /*!< second block, for page reorganize */ #define BUF_NO_CHECKSUM_MAGIC 0xDEADBEEFUL /** @brief States of a control block -@see buf_page_struct +@see buf_page_t The enumeration values must be 0..7. */ enum buf_page_state { @@ -132,7 +130,7 @@ enum buf_page_state { /** This structure defines information we will fetch from each buffer pool. It will be used to print table IO stats */ -struct buf_pool_info_struct{ +struct buf_pool_info_t{ /* General buffer pool info */ ulint pool_unique_id; /*!< Buffer Pool ID */ ulint pool_size; /*!< Buffer Pool size in pages */ @@ -203,7 +201,12 @@ struct buf_pool_info_struct{ interval */ }; -typedef struct buf_pool_info_struct buf_pool_info_t; +/** The occupied bytes of lists in all buffer pools */ +struct buf_pools_list_size_t { + ulint LRU_bytes; /*!< LRU size in bytes */ + ulint unzip_LRU_bytes; /*!< unzip_LRU size in bytes */ + ulint flush_list_bytes; /*!< flush_list size in bytes */ +}; #ifndef UNIV_HOTBACKUP /********************************************************************//** @@ -222,9 +225,9 @@ buf_pool_mutex_exit_all(void); /********************************************************************//** Creates the buffer pool. 
-@return own: buf_pool object, NULL if not enough memory or error */ +@return DB_SUCCESS if success, DB_ERROR if not enough memory or error */ UNIV_INTERN -ulint +dberr_t buf_pool_init( /*=========*/ ulint size, /*!< in: Size of the total pool in bytes */ @@ -629,9 +632,12 @@ UNIV_INTERN ibool buf_page_is_corrupted( /*==================*/ + bool check_lsn, /*!< in: true if we need to check the + and complain about the LSN */ const byte* read_buf, /*!< in: a database page */ - ulint zip_size); /*!< in: size of compressed page; + ulint zip_size) /*!< in: size of compressed page; 0 for uncompressed pages */ + __attribute__((nonnull, warn_unused_result)); #ifndef UNIV_HOTBACKUP /**********************************************************************//** Gets the space id, page offset, and byte offset within page of a @@ -881,7 +887,7 @@ buf_page_belongs_to_unzip_LRU( Gets the mutex of a block. @return pointer to mutex protecting bpage */ UNIV_INLINE -mutex_t* +ib_mutex_t* buf_page_get_mutex( /*===============*/ const buf_page_t* bpage) /*!< in: pointer to control block */ @@ -1010,8 +1016,7 @@ UNIV_INLINE void buf_page_set_accessed( /*==================*/ - buf_page_t* bpage, /*!< in/out: control block */ - ulint time_ms) /*!< in: ut_time_ms() */ + buf_page_t* bpage) /*!< in/out: control block */ __attribute__((nonnull)); /*********************************************************************//** Gets the buf_block_t handle of a buffered file block if an uncompressed @@ -1152,7 +1157,7 @@ UNIV_INTERN buf_page_t* buf_page_init_for_read( /*===================*/ - ulint* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */ + dberr_t* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */ ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ... 
*/ ulint space, /*!< in: space id */ ulint zip_size,/*!< in: compressed page size, or 0 */ @@ -1164,9 +1169,9 @@ buf_page_init_for_read( /********************************************************************//** Completes an asynchronous read or write request of a file page to or from the buffer pool. -@return TRUE if successful */ +@return true if successful */ UNIV_INTERN -ibool +bool buf_page_io_complete( /*=================*/ buf_page_t* bpage); /*!< in: pointer to the block in question */ @@ -1368,6 +1373,14 @@ buf_get_total_list_len( ulint* free_len, /*!< out: length of all free lists */ ulint* flush_list_len);/*!< out: length of all flush lists */ /********************************************************************//** +Get total list size in bytes from all buffer pools. */ +UNIV_INTERN +void +buf_get_total_list_size_in_bytes( +/*=============================*/ + buf_pools_list_size_t* buf_pools_list_size); /*!< out: list sizes + in all buffer pools */ +/********************************************************************//** Get total buffer pool statistics. */ UNIV_INTERN void @@ -1385,6 +1398,16 @@ buf_get_nth_chunk_block( ulint n, /*!< in: nth chunk in the buffer pool */ ulint* chunk_size); /*!< in: chunk size */ +/********************************************************************//** +Calculate the checksum of a page from compressed table and update the page. */ +UNIV_INTERN +void +buf_flush_update_zip_checksum( +/*==========================*/ + buf_frame_t* page, /*!< in/out: Page to update */ + ulint zip_size, /*!< in: Compressed page size */ + lsn_t lsn); /*!< in: Lsn to stamp on the page */ + #endif /* !UNIV_HOTBACKUP */ /** The common buffer control block structure @@ -1393,10 +1416,10 @@ for compressed and uncompressed frames */ /** Number of bits used for buffer page states. 
*/ #define BUF_PAGE_STATE_BITS 3 -struct buf_page_struct{ +struct buf_page_t{ /** @name General fields None of these bit-fields must be modified without holding - buf_page_get_mutex() [buf_block_struct::mutex or + buf_page_get_mutex() [buf_block_t::mutex or buf_pool->zip_mutex], since they can be stored in the same machine word. Some of these fields are additionally protected by buf_pool->mutex. */ @@ -1527,7 +1550,7 @@ struct buf_page_struct{ /* @} */ /** @name LRU replacement algorithm fields These fields are protected by buf_pool->mutex only (not - buf_pool->zip_mutex or buf_block_struct::mutex). */ + buf_pool->zip_mutex or buf_block_t::mutex). */ /* @{ */ UT_LIST_NODE_T(buf_page_t) LRU; @@ -1547,23 +1570,24 @@ struct buf_page_struct{ to read this for heuristic purposes without holding any mutex or latch */ - unsigned access_time:32; /*!< time of first access, or - 0 if the block was never accessed - in the buffer pool */ /* @} */ + unsigned access_time; /*!< time of first access, or + 0 if the block was never accessed + in the buffer pool. Protected by + block mutex */ # if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG ibool file_page_was_freed; /*!< this is set to TRUE when fsp frees a page in buffer pool; protected by buf_pool->zip_mutex - or buf_block_struct::mutex. */ + or buf_block_t::mutex. 
*/ # endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ #endif /* !UNIV_HOTBACKUP */ }; /** The buffer control block structure */ -struct buf_block_struct{ +struct buf_block_t{ /** @name General fields */ /* @{ */ @@ -1587,7 +1611,7 @@ struct buf_block_struct{ decompressed LRU list; used in debugging */ #endif /* UNIV_DEBUG */ - mutex_t mutex; /*!< mutex protecting this block: + ib_mutex_t mutex; /*!< mutex protecting this block: state (also protected by the buffer pool mutex), io_fix, buf_fix_count, and accessed; we introduce this new @@ -1646,8 +1670,8 @@ struct buf_block_struct{ /** @name Hash search fields These 5 fields may only be modified when we have an x-latch on btr_search_latch AND - - we are holding an s-latch or x-latch on buf_block_struct::lock or - - we know that buf_block_struct::buf_fix_count == 0. + - we are holding an s-latch or x-latch on buf_block_t::lock or + - we know that buf_block_t::buf_fix_count == 0. An exception to this is when we init or create a page in the buffer pool in buf0buf.cc. @@ -1706,7 +1730,7 @@ Compute the hash fold value for blocks in buf_pool->zip_hash. */ /* @} */ /** @brief The buffer pool statistics structure. */ -struct buf_pool_stat_struct{ +struct buf_pool_stat_t{ ulint n_page_gets; /*!< number of page gets performed; also successful searches through the adaptive hash index are @@ -1730,10 +1754,12 @@ struct buf_pool_stat_struct{ young because the first access was not long enough ago, in buf_page_peek_if_too_old() */ + ulint LRU_bytes; /*!< LRU size in bytes */ + ulint flush_list_bytes;/*!< flush_list size in bytes */ }; /** Statistics of buddy blocks of a given size. */ -struct buf_buddy_stat_struct { +struct buf_buddy_stat_t { /** Number of blocks allocated from the buddy system. */ ulint used; /** Number of blocks relocated by the buddy system. */ @@ -1747,13 +1773,13 @@ struct buf_buddy_stat_struct { NOTE! The definition appears here only for other modules of this directory (buf) to see it. 
Do not use from outside! */ -struct buf_pool_struct{ +struct buf_pool_t{ /** @name General fields */ /* @{ */ - mutex_t mutex; /*!< Buffer pool mutex of this + ib_mutex_t mutex; /*!< Buffer pool mutex of this instance */ - mutex_t zip_mutex; /*!< Zip mutex of this buffer + ib_mutex_t zip_mutex; /*!< Zip mutex of this buffer pool instance, protects compressed only pages (of type buf_page_t, not buf_block_t */ @@ -1807,7 +1833,7 @@ struct buf_pool_struct{ /* @{ */ - mutex_t flush_list_mutex;/*!< mutex protecting the + ib_mutex_t flush_list_mutex;/*!< mutex protecting the flush list access. This mutex protects flush_list, flush_rbt and bpage::list pointers when diff --git a/storage/innobase/include/buf0buf.ic b/storage/innobase/include/buf0buf.ic index 88c29ab5603..b310efdf451 100644 --- a/storage/innobase/include/buf0buf.ic +++ b/storage/innobase/include/buf0buf.ic @@ -31,13 +31,13 @@ Created 11/5/1995 Heikki Tuuri *******************************************************/ #include "mtr0mtr.h" +#ifndef UNIV_HOTBACKUP #include "buf0flu.h" #include "buf0lru.h" #include "buf0rea.h" -#ifndef UNIV_HOTBACKUP /** A chunk of buffers. The buffer pool is allocated in chunks. */ -struct buf_chunk_struct{ +struct buf_chunk_t{ ulint mem_size; /*!< allocated size of the chunk */ ulint size; /*!< size of frames[] and blocks[] */ void* mem; /*!< pointer to the memory area which @@ -339,7 +339,7 @@ buf_page_belongs_to_unzip_LRU( Gets the mutex of a block. 
@return pointer to mutex protecting bpage */ UNIV_INLINE -mutex_t* +ib_mutex_t* buf_page_get_mutex( /*===============*/ const buf_page_t* bpage) /*!< in: pointer to control block */ @@ -419,6 +419,8 @@ buf_page_get_io_fix( /*================*/ const buf_page_t* bpage) /*!< in: pointer to the control block */ { + ut_ad(bpage != NULL); + enum buf_io_fix io_fix = (enum buf_io_fix) bpage->io_fix; #ifdef UNIV_DEBUG switch (io_fix) { @@ -614,18 +616,18 @@ UNIV_INLINE void buf_page_set_accessed( /*==================*/ - buf_page_t* bpage, /*!< in/out: control block */ - ulint time_ms) /*!< in: ut_time_ms() */ + buf_page_t* bpage) /*!< in/out: control block */ { #ifdef UNIV_DEBUG buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(!buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(buf_page_get_mutex(bpage))); #endif ut_a(buf_page_in_file(bpage)); if (!bpage->access_time) { /* Make this the time of the first access. */ - bpage->access_time = time_ms; + bpage->access_time = ut_time_ms(); } } @@ -942,7 +944,7 @@ buf_page_get_newest_modification( page frame */ { lsn_t lsn; - mutex_t* block_mutex = buf_page_get_mutex(bpage); + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); mutex_enter(block_mutex); diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h index fcc56d91405..357ba697f6a 100644 --- a/storage/innobase/include/buf0dblwr.h +++ b/storage/innobase/include/buf0dblwr.h @@ -29,7 +29,6 @@ Created 2011/12/19 Inaam Rana #include "univ.i" #include "ut0byte.h" #include "log0log.h" -#include "buf0types.h" #ifndef UNIV_HOTBACKUP @@ -113,8 +112,8 @@ buf_dblwr_write_single_page( buf_page_t* bpage); /*!< in: buffer block to write */ /** Doublewrite control struct */ -struct buf_dblwr_struct{ - mutex_t mutex; /*!< mutex protecting the first_free field and +struct buf_dblwr_t{ + ib_mutex_t mutex; /*!< mutex protecting the first_free field and write_buf */ ulint block1; /*!< the page number of the first 
doublewrite block (64 pages) */ diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h index faf577f718b..94f4e6dedd1 100644 --- a/storage/innobase/include/buf0flu.h +++ b/storage/innobase/include/buf0flu.h @@ -95,23 +95,27 @@ void buf_flush_sync_datafiles(void); /*==========================*/ /*******************************************************************//** -This utility flushes dirty blocks from the end of the flush_list of +This utility flushes dirty blocks from the end of the flush list of all buffer pool instances. NOTE: The calling thread is not allowed to own any latches on pages! -@return number of blocks for which the write request was queued; -ULINT_UNDEFINED if there was a flush of the same type already running */ +@return true if a batch was queued successfully for each buffer pool +instance. false if another batch of same type was already running in +at least one of the buffer pool instance */ UNIV_INTERN -ulint +bool buf_flush_list( -/*============*/ +/*===========*/ ulint min_n, /*!< in: wished minimum mumber of blocks flushed (it is not guaranteed that the actual number is that big, though) */ - lsn_t lsn_limit); /*!< in the case BUF_FLUSH_LIST all + lsn_t lsn_limit, /*!< in the case BUF_FLUSH_LIST all blocks whose oldest_modification is smaller than this should be flushed (if their number does not exceed min_n), otherwise ignored */ + ulint* n_processed); /*!< out: the number of pages + which were processed is passed + back to caller. 
Ignored if NULL */ /******************************************************************//** This function picks up a single dirty page from the tail of the LRU list, flushes it, removes it from page_hash and LRU list and puts @@ -176,31 +180,6 @@ buf_flush_ready_for_replace( /*========================*/ buf_page_t* bpage); /*!< in: buffer control block, must be buf_page_in_file(bpage) and in the LRU list */ - -/** @brief Statistics for selecting flush rate based on redo log -generation speed. - -These statistics are generated for heuristics used in estimating the -rate at which we should flush the dirty blocks to avoid bursty IO -activity. Note that the rate of flushing not only depends on how many -dirty pages we have in the buffer pool but it is also a fucntion of -how much redo the workload is generating and at what rate. */ - -struct buf_flush_stat_struct -{ - lsn_t redo; /**< amount of redo generated. */ - ulint n_flushed; /**< number of pages flushed. */ -}; - -/** Statistics for selecting flush rate of dirty pages. */ -typedef struct buf_flush_stat_struct buf_flush_stat_t; -/********************************************************************* -Update the historical stats that we are collecting for flush rate -heuristics at the end of each interval. */ -UNIV_INTERN -void -buf_flush_stat_update(void); -/*=======================*/ /******************************************************************//** page_cleaner thread tasked with flushing dirty pages from the buffer pools. As of now we'll have only one instance of this thread. 
@@ -211,6 +190,23 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( /*==========================================*/ void* arg); /*!< in: a dummy parameter required by os_thread_create */ +/*********************************************************************//** +Clears up tail of the LRU lists: +* Put replaceable pages at the tail of LRU to the free list +* Flush dirty pages at the tail of LRU to the disk +The depth to which we scan each buffer pool is controlled by dynamic +config parameter innodb_LRU_scan_depth. +@return total pages flushed */ +UNIV_INTERN +ulint +buf_flush_LRU_tail(void); +/*====================*/ +/*********************************************************************//** +Wait for any possible LRU flushes that are in progress to end. */ +UNIV_INTERN +void +buf_flush_wait_LRU_batch_end(void); +/*==============================*/ #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG /******************************************************************//** @@ -238,6 +234,44 @@ UNIV_INTERN void buf_flush_free_flush_rbt(void); /*==========================*/ + +/********************************************************************//** +Writes a flushable page asynchronously from the buffer pool to a file. +NOTE: in simulated aio we must call +os_aio_simulated_wake_handler_threads after we have posted a batch of +writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be +held upon entering this function, and they will be released by this +function. */ +UNIV_INTERN +void +buf_flush_page( +/*===========*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + buf_page_t* bpage, /*!< in: buffer control block */ + buf_flush flush_type) /*!< in: type of flush */ + __attribute__((nonnull)); + +#ifdef UNIV_DEBUG +/******************************************************************//** +Check if there are any dirty pages that belong to a space id in the flush +list in a particular buffer pool. 
+@return number of dirty pages present in a single buffer pool */ +UNIV_INTERN +ulint +buf_pool_get_dirty_pages_count( +/*===========================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool */ + ulint id); /*!< in: space id to check */ +/******************************************************************//** +Check if there are any dirty pages that belong to a space id in the flush list. +@return count of dirty pages present in all the buffer pools */ +UNIV_INTERN +ulint +buf_flush_get_dirty_pages_count( +/*============================*/ + ulint id); /*!< in: space id to check */ +#endif /* UNIV_DEBUG */ + #endif /* !UNIV_HOTBACKUP */ #ifndef UNIV_NONINL diff --git a/storage/innobase/include/buf0flu.ic b/storage/innobase/include/buf0flu.ic index 68a76c0b637..a763cd115fe 100644 --- a/storage/innobase/include/buf0flu.ic +++ b/storage/innobase/include/buf0flu.ic @@ -26,6 +26,7 @@ Created 11/5/1995 Heikki Tuuri #ifndef UNIV_HOTBACKUP #include "buf0buf.h" #include "mtr0mtr.h" +#include "srv0srv.h" /********************************************************************//** Inserts a modified block into the flush list. 
*/ @@ -61,7 +62,7 @@ buf_flush_note_modification( { buf_pool_t* buf_pool = buf_pool_from_block(block); - ut_ad(block); + ut_ad(!srv_read_only_mode); ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_ad(block->page.buf_fix_count > 0); #ifdef UNIV_SYNC_DEBUG @@ -91,7 +92,7 @@ buf_flush_note_modification( mutex_exit(&block->mutex); - ++srv_buf_pool_write_requests; + srv_stats.buf_pool_write_requests.inc(); } /********************************************************************//** @@ -108,7 +109,7 @@ buf_flush_recv_note_modification( { buf_pool_t* buf_pool = buf_pool_from_block(block); - ut_ad(block); + ut_ad(!srv_read_only_mode); ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_ad(block->page.buf_fix_count > 0); #ifdef UNIV_SYNC_DEBUG diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h index 527852da758..f7a69e1c9e4 100644 --- a/storage/innobase/include/buf0lru.h +++ b/storage/innobase/include/buf0lru.h @@ -31,6 +31,9 @@ Created 11/5/1995 Heikki Tuuri #include "ut0byte.h" #include "buf0types.h" +// Forward declaration +struct trx_t; + /******************************************************************//** Returns TRUE if less than 25 % of the buffer pool is available. This can be used in heuristics to prevent huge transactions eating up the whole buffer @@ -49,15 +52,19 @@ These are low-level functions #define BUF_LRU_OLD_MIN_LEN 512 /* 8 megabytes of 16k pages */ /******************************************************************//** -Invalidates all pages belonging to a given tablespace when we are deleting -the data file(s) of that tablespace. A PROBLEM: if readahead is being started, -what guarantees that it will not try to read in pages after this operation has -completed? */ +Flushes all dirty pages or removes all pages belonging +to a given tablespace. A PROBLEM: if readahead is being started, what +guarantees that it will not try to read in pages after this operation +has completed? 
*/ UNIV_INTERN void -buf_LRU_invalidate_tablespace( +buf_LRU_flush_or_remove_pages( /*==========================*/ - ulint id); /*!< in: space id */ + ulint id, /*!< in: space id */ + buf_remove_t buf_remove, /*!< in: remove or flush strategy */ + const trx_t* trx); /*!< to check if the operation must + be interrupted */ + #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG /********************************************************************//** Insert a compressed block into buf_pool->zip_clean in the LRU order. */ @@ -157,7 +164,10 @@ buf_LRU_block_free_non_file_page( /*=============================*/ buf_block_t* block); /*!< in: block, must not contain a file page */ /******************************************************************//** -Adds a block to the LRU list. */ +Adds a block to the LRU list. Please make sure that the zip_size is +already set into the page zip when invoking the function, so that we +can get correct zip_size from the buffer page when adding a block +into LRU */ UNIV_INTERN void buf_LRU_add_block( @@ -270,15 +280,12 @@ extern uint buf_LRU_old_threshold_ms; These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O and page_zip_decompress() operations. Based on the statistics we decide if we want to evict from buf_pool->unzip_LRU or buf_pool->LRU. */ -struct buf_LRU_stat_struct +struct buf_LRU_stat_t { ulint io; /**< Counter of buffer pool I/O operations. */ ulint unzip; /**< Counter of page_zip_decompress operations. */ }; -/** Statistics for selecting the LRU list for eviction. */ -typedef struct buf_LRU_stat_struct buf_LRU_stat_t; - /** Current operation counters. Not protected by any mutex. Cleared by buf_LRU_stat_update(). 
*/ extern buf_LRU_stat_t buf_LRU_stat_cur; diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h index ba54a8aeeea..5ed210d3b90 100644 --- a/storage/innobase/include/buf0types.h +++ b/storage/innobase/include/buf0types.h @@ -27,19 +27,19 @@ Created 11/17/1995 Heikki Tuuri #define buf0types_h /** Buffer page (uncompressed or compressed) */ -typedef struct buf_page_struct buf_page_t; +struct buf_page_t; /** Buffer block for which an uncompressed page exists */ -typedef struct buf_block_struct buf_block_t; +struct buf_block_t; /** Buffer pool chunk comprising buf_block_t */ -typedef struct buf_chunk_struct buf_chunk_t; +struct buf_chunk_t; /** Buffer pool comprising buf_chunk_t */ -typedef struct buf_pool_struct buf_pool_t; +struct buf_pool_t; /** Buffer pool statistics struct */ -typedef struct buf_pool_stat_struct buf_pool_stat_t; +struct buf_pool_stat_t; /** Buffer pool buddy statistics struct */ -typedef struct buf_buddy_stat_struct buf_buddy_stat_t; +struct buf_buddy_stat_t; /** Doublewrite memory struct */ -typedef struct buf_dblwr_struct buf_dblwr_t; +struct buf_dblwr_t; /** A buffer frame. @see page_t */ typedef byte buf_frame_t; @@ -54,6 +54,17 @@ enum buf_flush { BUF_FLUSH_N_TYPES /*!< index of last element + 1 */ }; +/** Algorithm to remove the pages for a tablespace from the buffer pool. +See buf_LRU_flush_or_remove_pages(). 
*/ +enum buf_remove_t { + BUF_REMOVE_ALL_NO_WRITE, /*!< Remove all pages from the buffer + pool, don't write or sync to disk */ + BUF_REMOVE_FLUSH_NO_WRITE, /*!< Remove only, from the flush list, + don't write or sync to disk */ + BUF_REMOVE_FLUSH_WRITE /*!< Flush dirty pages to disk only + don't remove from the buffer pool */ +}; + /** Flags for io_fix types */ enum buf_io_fix { BUF_IO_NONE = 0, /**< no pending I/O */ @@ -66,7 +77,7 @@ enum buf_io_fix { /** Alternatives for srv_checksum_algorithm, which can be changed by setting innodb_checksum_algorithm */ -enum srv_checksum_algorithm_enum { +enum srv_checksum_algorithm_t { SRV_CHECKSUM_ALGORITHM_CRC32, /*!< Write crc32, allow crc32, innodb or none when reading */ SRV_CHECKSUM_ALGORITHM_STRICT_CRC32, /*!< Write crc32, allow crc32 @@ -81,8 +92,6 @@ enum srv_checksum_algorithm_enum { when reading */ }; -typedef enum srv_checksum_algorithm_enum srv_checksum_algorithm_t; - /** Parameters of binary buddy system for compressed pages (buf0buddy.h) */ /* @{ */ /** Zip shift value for the smallest page size */ diff --git a/storage/innobase/include/data0data.h b/storage/innobase/include/data0data.h index 37364e891f5..a548c7b89b3 100644 --- a/storage/innobase/include/data0data.h +++ b/storage/innobase/include/data0data.h @@ -35,7 +35,7 @@ Created 5/30/1994 Heikki Tuuri /** Storage for overflow data in a big record, that is, a clustered index record which needs external storage of data fields */ -typedef struct big_rec_struct big_rec_t; +struct big_rec_t; #ifdef UNIV_DEBUG /*********************************************************************//** @@ -45,7 +45,8 @@ UNIV_INLINE dtype_t* dfield_get_type( /*============*/ - const dfield_t* field); /*!< in: SQL data field */ + const dfield_t* field) /*!< in: SQL data field */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Gets pointer to the data in a field. 
@return pointer to data */ @@ -53,7 +54,8 @@ UNIV_INLINE void* dfield_get_data( /*============*/ - const dfield_t* field); /*!< in: field */ + const dfield_t* field) /*!< in: field */ + __attribute__((nonnull, warn_unused_result)); #else /* UNIV_DEBUG */ # define dfield_get_type(field) (&(field)->type) # define dfield_get_data(field) ((field)->data) @@ -65,7 +67,8 @@ void dfield_set_type( /*============*/ dfield_t* field, /*!< in: SQL data field */ - dtype_t* type); /*!< in: pointer to data type struct */ + const dtype_t* type) /*!< in: pointer to data type struct */ + __attribute__((nonnull)); /*********************************************************************//** Gets length of field data. @return length of data; UNIV_SQL_NULL if SQL null data */ @@ -73,7 +76,8 @@ UNIV_INLINE ulint dfield_get_len( /*===========*/ - const dfield_t* field); /*!< in: field */ + const dfield_t* field) /*!< in: field */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Sets length in a field. 
*/ UNIV_INLINE @@ -81,7 +85,8 @@ void dfield_set_len( /*===========*/ dfield_t* field, /*!< in: field */ - ulint len); /*!< in: length or UNIV_SQL_NULL */ + ulint len) /*!< in: length or UNIV_SQL_NULL */ + __attribute__((nonnull)); /*********************************************************************//** Determines if a field is SQL NULL @return nonzero if SQL null data */ @@ -89,7 +94,8 @@ UNIV_INLINE ulint dfield_is_null( /*===========*/ - const dfield_t* field); /*!< in: field */ + const dfield_t* field) /*!< in: field */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Determines if a field is externally stored @return nonzero if externally stored */ @@ -97,14 +103,16 @@ UNIV_INLINE ulint dfield_is_ext( /*==========*/ - const dfield_t* field); /*!< in: field */ + const dfield_t* field) /*!< in: field */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Sets the "external storage" flag */ UNIV_INLINE void dfield_set_ext( /*===========*/ - dfield_t* field); /*!< in/out: field */ + dfield_t* field) /*!< in/out: field */ + __attribute__((nonnull)); /*********************************************************************//** Sets pointer to the data and length in a field. */ UNIV_INLINE @@ -113,14 +121,16 @@ dfield_set_data( /*============*/ dfield_t* field, /*!< in: field */ const void* data, /*!< in: data */ - ulint len); /*!< in: length or UNIV_SQL_NULL */ + ulint len) /*!< in: length or UNIV_SQL_NULL */ + __attribute__((nonnull(1))); /*********************************************************************//** Sets a data field to SQL NULL. */ UNIV_INLINE void dfield_set_null( /*============*/ - dfield_t* field); /*!< in/out: field */ + dfield_t* field) /*!< in/out: field */ + __attribute__((nonnull)); /**********************************************************************//** Writes an SQL null field full of zeros. 
*/ UNIV_INLINE @@ -128,7 +138,8 @@ void data_write_sql_null( /*================*/ byte* data, /*!< in: pointer to a buffer of size len */ - ulint len); /*!< in: SQL null size in bytes */ + ulint len) /*!< in: SQL null size in bytes */ + __attribute__((nonnull)); /*********************************************************************//** Copies the data and len fields. */ UNIV_INLINE @@ -136,7 +147,8 @@ void dfield_copy_data( /*=============*/ dfield_t* field1, /*!< out: field to copy to */ - const dfield_t* field2);/*!< in: field to copy from */ + const dfield_t* field2) /*!< in: field to copy from */ + __attribute__((nonnull)); /*********************************************************************//** Copies a data field to another. */ UNIV_INLINE @@ -144,7 +156,8 @@ void dfield_copy( /*========*/ dfield_t* field1, /*!< out: field to copy to */ - const dfield_t* field2);/*!< in: field to copy from */ + const dfield_t* field2) /*!< in: field to copy from */ + __attribute__((nonnull)); /*********************************************************************//** Copies the data pointed to by a data field. */ UNIV_INLINE @@ -152,7 +165,8 @@ void dfield_dup( /*=======*/ dfield_t* field, /*!< in/out: data field */ - mem_heap_t* heap); /*!< in: memory heap where allocated */ + mem_heap_t* heap) /*!< in: memory heap where allocated */ + __attribute__((nonnull)); #ifndef UNIV_HOTBACKUP /*********************************************************************//** Tests if two data fields are equal. @@ -187,7 +201,8 @@ UNIV_INLINE ulint dtuple_get_n_fields( /*================*/ - const dtuple_t* tuple); /*!< in: tuple */ + const dtuple_t* tuple) /*!< in: tuple */ + __attribute__((nonnull, warn_unused_result)); #ifdef UNIV_DEBUG /*********************************************************************//** Gets nth field of a tuple. 
@@ -208,7 +223,8 @@ UNIV_INLINE ulint dtuple_get_info_bits( /*=================*/ - const dtuple_t* tuple); /*!< in: tuple */ + const dtuple_t* tuple) /*!< in: tuple */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Sets info bits in a data tuple. */ UNIV_INLINE @@ -216,7 +232,8 @@ void dtuple_set_info_bits( /*=================*/ dtuple_t* tuple, /*!< in: tuple */ - ulint info_bits); /*!< in: info bits */ + ulint info_bits) /*!< in: info bits */ + __attribute__((nonnull)); /*********************************************************************//** Gets number of fields used in record comparisons. @return number of fields used in comparisons in rem0cmp.* */ @@ -224,7 +241,8 @@ UNIV_INLINE ulint dtuple_get_n_fields_cmp( /*====================*/ - const dtuple_t* tuple); /*!< in: tuple */ + const dtuple_t* tuple) /*!< in: tuple */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Gets number of fields used in record comparisons. */ UNIV_INLINE @@ -232,8 +250,9 @@ void dtuple_set_n_fields_cmp( /*====================*/ dtuple_t* tuple, /*!< in: tuple */ - ulint n_fields_cmp); /*!< in: number of fields used in + ulint n_fields_cmp) /*!< in: number of fields used in comparisons in rem0cmp.* */ + __attribute__((nonnull)); /* Estimate the number of bytes that are going to be allocated when creating a new dtuple_t object */ @@ -252,7 +271,8 @@ dtuple_create_from_mem( /*===================*/ void* buf, /*!< in, out: buffer to use */ ulint buf_size, /*!< in: buffer size */ - ulint n_fields); /*!< in: number of fields */ + ulint n_fields) /*!< in: number of fields */ + __attribute__((nonnull, warn_unused_result)); /**********************************************************//** Creates a data tuple to a memory heap. 
The default value for number @@ -265,19 +285,8 @@ dtuple_create( mem_heap_t* heap, /*!< in: memory heap where the tuple is created, DTUPLE_EST_ALLOC(n_fields) bytes will be allocated from this heap */ - ulint n_fields); /*!< in: number of fields */ - -/**********************************************************//** -Wrap data fields in a tuple. The default value for number -of fields used in record comparisons for this tuple is n_fields. -@return data tuple */ -UNIV_INLINE -const dtuple_t* -dtuple_from_fields( -/*===============*/ - dtuple_t* tuple, /*!< in: storage for data tuple */ - const dfield_t* fields, /*!< in: fields */ - ulint n_fields); /*!< in: number of fields */ + ulint n_fields)/*!< in: number of fields */ + __attribute__((nonnull, malloc)); /*********************************************************************//** Sets number of fields used in a tuple. Normally this is set in @@ -287,7 +296,8 @@ void dtuple_set_n_fields( /*================*/ dtuple_t* tuple, /*!< in: tuple */ - ulint n_fields); /*!< in: number of fields */ + ulint n_fields) /*!< in: number of fields */ + __attribute__((nonnull)); /*********************************************************************//** Copies a data tuple to another. This is a shallow copy; if a deep copy is desired, dfield_dup() will have to be invoked on each field. @@ -297,8 +307,9 @@ dtuple_t* dtuple_copy( /*========*/ const dtuple_t* tuple, /*!< in: tuple to copy from */ - mem_heap_t* heap); /*!< in: memory heap + mem_heap_t* heap) /*!< in: memory heap where the tuple is created */ + __attribute__((nonnull, malloc)); /**********************************************************//** The following function returns the sum of data lengths of a tuple. The space occupied by the field structs or the tuple struct is not counted. 
@@ -308,7 +319,8 @@ ulint dtuple_get_data_size( /*=================*/ const dtuple_t* tuple, /*!< in: typed data tuple */ - ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ + __attribute__((nonnull)); /*********************************************************************//** Computes the number of externally stored fields in a data tuple. @return number of fields */ @@ -316,7 +328,8 @@ UNIV_INLINE ulint dtuple_get_n_ext( /*=============*/ - const dtuple_t* tuple); /*!< in: tuple */ + const dtuple_t* tuple) /*!< in: tuple */ + __attribute__((nonnull)); /************************************************************//** Compare two data tuples, respecting the collation of character fields. @return 1, 0 , -1 if tuple1 is greater, equal, less, respectively, @@ -326,7 +339,8 @@ int dtuple_coll_cmp( /*============*/ const dtuple_t* tuple1, /*!< in: tuple 1 */ - const dtuple_t* tuple2);/*!< in: tuple 2 */ + const dtuple_t* tuple2) /*!< in: tuple 2 */ + __attribute__((nonnull, warn_unused_result)); /************************************************************//** Folds a prefix given as the number of fields of a tuple. @return the folded value */ @@ -339,7 +353,7 @@ dtuple_fold( ulint n_bytes,/*!< in: number of bytes to fold in an incomplete last field */ index_id_t tree_id)/*!< in: index tree id */ - __attribute__((pure)); + __attribute__((nonnull, pure, warn_unused_result)); /*******************************************************************//** Sets types of fields binary in a tuple. */ UNIV_INLINE @@ -347,7 +361,8 @@ void dtuple_set_types_binary( /*====================*/ dtuple_t* tuple, /*!< in: data tuple */ - ulint n); /*!< in: number of fields to set */ + ulint n) /*!< in: number of fields to set */ + __attribute__((nonnull)); /**********************************************************************//** Checks if a dtuple contains an SQL null value. 
@return TRUE if some field is SQL null */ @@ -355,7 +370,8 @@ UNIV_INLINE ibool dtuple_contains_null( /*=================*/ - const dtuple_t* tuple); /*!< in: dtuple */ + const dtuple_t* tuple) /*!< in: dtuple */ + __attribute__((nonnull, warn_unused_result)); /**********************************************************//** Checks that a data field is typed. Asserts an error if not. @return TRUE if ok */ @@ -363,7 +379,8 @@ UNIV_INTERN ibool dfield_check_typed( /*===============*/ - const dfield_t* field); /*!< in: data field */ + const dfield_t* field) /*!< in: data field */ + __attribute__((nonnull, warn_unused_result)); /**********************************************************//** Checks that a data tuple is typed. Asserts an error if not. @return TRUE if ok */ @@ -371,7 +388,8 @@ UNIV_INTERN ibool dtuple_check_typed( /*===============*/ - const dtuple_t* tuple); /*!< in: tuple */ + const dtuple_t* tuple) /*!< in: tuple */ + __attribute__((nonnull, warn_unused_result)); /**********************************************************//** Checks that a data tuple is typed. @return TRUE if ok */ @@ -379,7 +397,8 @@ UNIV_INTERN ibool dtuple_check_typed_no_assert( /*=========================*/ - const dtuple_t* tuple); /*!< in: tuple */ + const dtuple_t* tuple) /*!< in: tuple */ + __attribute__((nonnull, warn_unused_result)); #ifdef UNIV_DEBUG /**********************************************************//** Validates the consistency of a tuple which must be complete, i.e, @@ -389,7 +408,8 @@ UNIV_INTERN ibool dtuple_validate( /*============*/ - const dtuple_t* tuple); /*!< in: tuple */ + const dtuple_t* tuple) /*!< in: tuple */ + __attribute__((nonnull, warn_unused_result)); #endif /* UNIV_DEBUG */ /*************************************************************//** Pretty prints a dfield value according to its data type. 
*/ @@ -397,7 +417,8 @@ UNIV_INTERN void dfield_print( /*=========*/ - const dfield_t* dfield);/*!< in: dfield */ + const dfield_t* dfield) /*!< in: dfield */ + __attribute__((nonnull)); /*************************************************************//** Pretty prints a dfield value according to its data type. Also the hex string is printed if a string contains non-printable characters. */ @@ -405,7 +426,8 @@ UNIV_INTERN void dfield_print_also_hex( /*==================*/ - const dfield_t* dfield); /*!< in: dfield */ + const dfield_t* dfield) /*!< in: dfield */ + __attribute__((nonnull)); /**********************************************************//** The following function prints the contents of a tuple. */ UNIV_INTERN @@ -413,7 +435,8 @@ void dtuple_print( /*=========*/ FILE* f, /*!< in: output stream */ - const dtuple_t* tuple); /*!< in: tuple */ + const dtuple_t* tuple) /*!< in: tuple */ + __attribute__((nonnull)); /**************************************************************//** Moves parts of long fields in entry to the big record vector so that the size of tuple drops below the maximum record size allowed in the @@ -428,8 +451,9 @@ dtuple_convert_big_rec( /*===================*/ dict_index_t* index, /*!< in: index */ dtuple_t* entry, /*!< in/out: index entry */ - ulint* n_ext); /*!< in/out: number of + ulint* n_ext) /*!< in/out: number of externally stored columns */ + __attribute__((nonnull, malloc, warn_unused_result)); /**************************************************************//** Puts back to entry the data stored in vector. 
Note that to ensure the fields in entry can accommodate the data, vector must have been created @@ -440,21 +464,23 @@ dtuple_convert_back_big_rec( /*========================*/ dict_index_t* index, /*!< in: index */ dtuple_t* entry, /*!< in: entry whose data was put to vector */ - big_rec_t* vector);/*!< in, own: big rec vector; it is + big_rec_t* vector) /*!< in, own: big rec vector; it is freed in this function */ + __attribute__((nonnull)); /**************************************************************//** Frees the memory in a big rec vector. */ UNIV_INLINE void dtuple_big_rec_free( /*================*/ - big_rec_t* vector); /*!< in, own: big rec vector; it is + big_rec_t* vector) /*!< in, own: big rec vector; it is freed in this function */ + __attribute__((nonnull)); /*######################################################################*/ /** Structure for an SQL data field */ -struct dfield_struct{ +struct dfield_t{ void* data; /*!< pointer to data */ unsigned ext:1; /*!< TRUE=externally stored, FALSE=local */ unsigned len:32; /*!< data length; UNIV_SQL_NULL if SQL null */ @@ -462,7 +488,7 @@ struct dfield_struct{ }; /** Structure for an SQL data tuple of fields (logical record) */ -struct dtuple_struct { +struct dtuple_t { ulint info_bits; /*!< info bits of an index record: the default is 0; this field is used if an index record is built from @@ -482,15 +508,13 @@ struct dtuple_struct { #ifdef UNIV_DEBUG ulint magic_n; /*!< magic number, used in debug assertions */ -/** Value of dtuple_struct::magic_n */ +/** Value of dtuple_t::magic_n */ # define DATA_TUPLE_MAGIC_N 65478679 #endif /* UNIV_DEBUG */ }; /** A slot for a field in a big rec vector */ -typedef struct big_rec_field_struct big_rec_field_t; -/** A slot for a field in a big rec vector */ -struct big_rec_field_struct { +struct big_rec_field_t { ulint field_no; /*!< field number in record */ ulint len; /*!< stored data length, in bytes */ const void* data; /*!< stored data */ @@ -498,7 +522,7 @@ 
struct big_rec_field_struct { /** Storage format for overflow data in a big record, that is, a clustered index record which needs external storage of data fields */ -struct big_rec_struct { +struct big_rec_t { mem_heap_t* heap; /*!< memory heap from which allocated */ ulint n_fields; /*!< number of stored fields */ diff --git a/storage/innobase/include/data0data.ic b/storage/innobase/include/data0data.ic index da50e91e98d..6937d55d211 100644 --- a/storage/innobase/include/data0data.ic +++ b/storage/innobase/include/data0data.ic @@ -54,7 +54,7 @@ void dfield_set_type( /*============*/ dfield_t* field, /*!< in: SQL data field */ - dtype_t* type) /*!< in: pointer to data type struct */ + const dtype_t* type) /*!< in: pointer to data type struct */ { ut_ad(field && type); @@ -407,6 +407,8 @@ dtuple_create_from_mem( } } #endif + UNIV_MEM_ASSERT_W(tuple->fields, n_fields * sizeof *tuple->fields); + UNIV_MEM_INVALID(tuple->fields, n_fields * sizeof *tuple->fields); return(tuple); } @@ -434,30 +436,6 @@ dtuple_create( tuple = dtuple_create_from_mem(buf, buf_size, n_fields); -#ifdef UNIV_DEBUG - UNIV_MEM_INVALID(tuple->fields, n_fields * sizeof *tuple->fields); -#endif - - return(tuple); -} - -/**********************************************************//** -Wrap data fields in a tuple. The default value for number -of fields used in record comparisons for this tuple is n_fields. 
-@return data tuple */ -UNIV_INLINE -const dtuple_t* -dtuple_from_fields( -/*===============*/ - dtuple_t* tuple, /*!< in: storage for data tuple */ - const dfield_t* fields, /*!< in: fields */ - ulint n_fields) /*!< in: number of fields */ -{ - tuple->info_bits = 0; - tuple->n_fields = tuple->n_fields_cmp = n_fields; - tuple->fields = (dfield_t*) fields; - ut_d(tuple->magic_n = DATA_TUPLE_MAGIC_N); - return(tuple); } diff --git a/storage/innobase/include/data0type.h b/storage/innobase/include/data0type.h index c7fcf316f24..111664b0b52 100644 --- a/storage/innobase/include/data0type.h +++ b/storage/innobase/include/data0type.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -33,20 +33,20 @@ extern ulint data_mysql_default_charset_coll; #define DATA_MYSQL_BINARY_CHARSET_COLL 63 /* SQL data type struct */ -typedef struct dtype_struct dtype_t; +struct dtype_t; /* SQL Like operator comparison types */ -enum ib_like_enum { +enum ib_like_t { IB_LIKE_EXACT, /* e.g. 
STRING */ IB_LIKE_PREFIX, /* e.g., STRING% */ IB_LIKE_SUFFIX, /* e.g., %STRING */ IB_LIKE_SUBSTR, /* e.g., %STRING% */ IB_LIKE_REGEXP /* Future */ }; -typedef enum ib_like_enum ib_like_t; /*-------------------------------------------*/ /* The 'MAIN TYPE' of a column */ +#define DATA_MISSING 0 /* missing column */ #define DATA_VARCHAR 1 /* character varying of the latin1_swedish_ci charset-collation; note that the MySQL format for this, DATA_BINARY, @@ -508,7 +508,7 @@ dtype_read_for_order_and_null_size() dtype_new_read_for_order_and_null_size() sym_tab_add_null_lit() */ -struct dtype_struct{ +struct dtype_t{ unsigned prtype:32; /*!< precise type; MySQL data type, charset code, flags to indicate nullability, diff --git a/storage/innobase/include/data0type.ic b/storage/innobase/include/data0type.ic index a5e94a8edff..d489bef89a8 100644 --- a/storage/innobase/include/data0type.ic +++ b/storage/innobase/include/data0type.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -556,35 +556,18 @@ dtype_get_fixed_size_low( } else if (!comp) { return(len); } else { - /* We play it safe here and ask MySQL for - mbminlen and mbmaxlen. Although - mbminlen and mbmaxlen are - initialized if and only if prtype - is (in one of the 3 functions in this file), - it could be that none of these functions - has been called. 
*/ - +#ifdef UNIV_DEBUG ulint i_mbminlen, i_mbmaxlen; innobase_get_cset_width( dtype_get_charset_coll(prtype), &i_mbminlen, &i_mbmaxlen); - if (DATA_MBMINMAXLEN(i_mbminlen, i_mbmaxlen) - != mbminmaxlen) { - - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: " - "mbminlen=%lu, " - "mbmaxlen=%lu, " - "type->mbminlen=%lu, " - "type->mbmaxlen=%lu\n", - (ulong) i_mbminlen, - (ulong) i_mbmaxlen, - (ulong) DATA_MBMINLEN(mbminmaxlen), - (ulong) DATA_MBMAXLEN(mbminmaxlen)); - } - if (i_mbminlen == i_mbmaxlen) { + ut_ad(DATA_MBMINMAXLEN(i_mbminlen, i_mbmaxlen) + == mbminmaxlen); +#endif /* UNIV_DEBUG */ + if (DATA_MBMINLEN(mbminmaxlen) + == DATA_MBMAXLEN(mbminmaxlen)) { return(len); } } diff --git a/storage/innobase/include/data0types.h b/storage/innobase/include/data0types.h index 7d599ef2c8d..bd2bb577611 100644 --- a/storage/innobase/include/data0types.h +++ b/storage/innobase/include/data0types.h @@ -27,10 +27,10 @@ Created 9/21/2000 Heikki Tuuri #define data0types_h /* SQL data field struct */ -typedef struct dfield_struct dfield_t; +struct dfield_t; /* SQL data tuple struct */ -typedef struct dtuple_struct dtuple_t; +struct dtuple_t; #endif diff --git a/storage/innobase/include/db0err.h b/storage/innobase/include/db0err.h index 1a3499b09e0..12e9f543e94 100644 --- a/storage/innobase/include/db0err.h +++ b/storage/innobase/include/db0err.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -27,7 +27,7 @@ Created 5/24/1996 Heikki Tuuri #define db0err_h -enum db_err { +enum dberr_t { DB_SUCCESS_LOCKED_REC = 9, /*!< like DB_SUCCESS, but a new explicit record lock was created */ DB_SUCCESS = 10, @@ -68,11 +68,14 @@ enum db_err { from a table failed */ DB_NO_SAVEPOINT, /*!< no savepoint exists with the given name */ - DB_TABLESPACE_ALREADY_EXISTS, /*!< we cannot create a new single-table + DB_TABLESPACE_EXISTS, /*!< we cannot create a new single-table tablespace because a file of the same name already exists */ - DB_TABLESPACE_DELETED, /*!< tablespace does not exist or is + DB_TABLESPACE_DELETED, /*!< tablespace was deleted or is being dropped right now */ + DB_TABLESPACE_NOT_FOUND, /*<! Attempt to delete a tablespace + instance that was not found in the + tablespace hash table */ DB_LOCK_TABLE_FULL, /*!< lock structs have exhausted the buffer pool (for big transactions, InnoDB stores the lock structs in the @@ -90,8 +93,8 @@ enum db_err { work with e.g., FT indexes created by a later version of the engine. 
*/ - DB_PRIMARY_KEY_IS_NULL, /*!< a column in the PRIMARY KEY - was found to be NULL */ + DB_INVALID_NULL, /*!< a NOT NULL column was found to + be NULL during table rebuild */ DB_STATS_DO_NOT_EXIST, /*!< an operation that requires the persistent storage, used for recording @@ -115,6 +118,12 @@ enum db_err { DB_READ_ONLY, /*!< Update operation attempted in a read-only transaction */ DB_FTS_INVALID_DOCID, /* FTS Doc ID cannot be zero */ + DB_TABLE_IN_FK_CHECK, /* table is being used in foreign + key check */ + DB_ONLINE_LOG_TOO_BIG, /*!< Modification log grew too big + during online index creation */ + + DB_IO_ERROR, /*!< Generic IO error */ /* The following are partial failure codes */ DB_FAIL = 1000, @@ -123,7 +132,23 @@ enum db_err { DB_STRONG_FAIL, DB_ZIP_OVERFLOW, DB_RECORD_NOT_FOUND = 1500, - DB_END_OF_INDEX + DB_END_OF_INDEX, + DB_DICT_CHANGED, /*!< Some part of table dictionary has + changed. Such as index dropped or + foreign key dropped */ + + + /* The following are API only error codes. */ + DB_DATA_MISMATCH = 2000, /*!< Column update or read failed + because the types mismatch */ + + DB_SCHEMA_NOT_LOCKED, /*!< If an API function expects the + schema to be locked in exclusive mode + and if it's not then that API function + will return this error code */ + + DB_NOT_FOUND /*!< Generic error code for "Not found" + type of errors */ }; #endif diff --git a/storage/innobase/include/dict0boot.h b/storage/innobase/include/dict0boot.h index 364aa746638..a994c9d8ff1 100644 --- a/storage/innobase/include/dict0boot.h +++ b/storage/innobase/include/dict0boot.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2010, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -58,6 +58,13 @@ dict_hdr_get_new_id( ulint* space_id); /*!< out: space id (not assigned if NULL) */ /**********************************************************************//** +Writes the current value of the row id counter to the dictionary header file +page. */ +UNIV_INTERN +void +dict_hdr_flush_row_id(void); +/*=======================*/ +/**********************************************************************//** Returns a new row id. @return the new id */ UNIV_INLINE @@ -82,18 +89,32 @@ dict_sys_write_row_id( row_id_t row_id);/*!< in: row id */ /*****************************************************************//** Initializes the data dictionary memory structures when the database is -started. This function is also called when the data dictionary is created. */ +started. This function is also called when the data dictionary is created. +@return DB_SUCCESS or error code. */ UNIV_INTERN -void -dict_boot(void); +dberr_t +dict_boot(void) /*===========*/ + __attribute__((warn_unused_result)); + /*****************************************************************//** -Creates and initializes the data dictionary at the database creation. */ +Creates and initializes the data dictionary at the server bootstrap. +@return DB_SUCCESS or error code. */ UNIV_INTERN -void -dict_create(void); +dberr_t +dict_create(void) /*=============*/ + __attribute__((warn_unused_result)); +/*********************************************************************//** +Check if a table id belongs to system table. +@return true if the table id belongs to a system table. 
*/ +UNIV_INLINE +bool +dict_is_sys_table( +/*==============*/ + table_id_t id) /*!< in: table id to check */ + __attribute__((warn_unused_result)); /* Space id and page no where the dictionary header resides */ #define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */ @@ -273,6 +294,41 @@ enum dict_fld_sys_foreign_cols_enum { DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME = 5, DICT_NUM_FIELDS__SYS_FOREIGN_COLS = 6 }; +/* The columns in SYS_TABLESPACES */ +enum dict_col_sys_tablespaces_enum { + DICT_COL__SYS_TABLESPACES__SPACE = 0, + DICT_COL__SYS_TABLESPACES__NAME = 1, + DICT_COL__SYS_TABLESPACES__FLAGS = 2, + DICT_NUM_COLS__SYS_TABLESPACES = 3 +}; +/* The field numbers in the SYS_TABLESPACES clustered index */ +enum dict_fld_sys_tablespaces_enum { + DICT_FLD__SYS_TABLESPACES__SPACE = 0, + DICT_FLD__SYS_TABLESPACES__DB_TRX_ID = 1, + DICT_FLD__SYS_TABLESPACES__DB_ROLL_PTR = 2, + DICT_FLD__SYS_TABLESPACES__NAME = 3, + DICT_FLD__SYS_TABLESPACES__FLAGS = 4, + DICT_NUM_FIELDS__SYS_TABLESPACES = 5 +}; +/* The columns in SYS_DATAFILES */ +enum dict_col_sys_datafiles_enum { + DICT_COL__SYS_DATAFILES__SPACE = 0, + DICT_COL__SYS_DATAFILES__PATH = 1, + DICT_NUM_COLS__SYS_DATAFILES = 2 +}; +/* The field numbers in the SYS_DATAFILES clustered index */ +enum dict_fld_sys_datafiles_enum { + DICT_FLD__SYS_DATAFILES__SPACE = 0, + DICT_FLD__SYS_DATAFILES__DB_TRX_ID = 1, + DICT_FLD__SYS_DATAFILES__DB_ROLL_PTR = 2, + DICT_FLD__SYS_DATAFILES__PATH = 3, + DICT_NUM_FIELDS__SYS_DATAFILES = 4 +}; + +/* A number of the columns above occur in multiple tables. These are the +length of thos fields. 
*/ +#define DICT_FLD_LEN_SPACE 4 +#define DICT_FLD_LEN_FLAGS 4 /* When a row id which is zero modulo this number (which must be a power of two) is assigned, the field DICT_HDR_ROW_ID on the dictionary header page is diff --git a/storage/innobase/include/dict0boot.ic b/storage/innobase/include/dict0boot.ic index 0f660ab7555..2b156a4f672 100644 --- a/storage/innobase/include/dict0boot.ic +++ b/storage/innobase/include/dict0boot.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -24,15 +24,6 @@ Created 4/18/1996 Heikki Tuuri *******************************************************/ /**********************************************************************//** -Writes the current value of the row id counter to the dictionary header file -page. */ -UNIV_INTERN -void -dict_hdr_flush_row_id(void); -/*=======================*/ - - -/**********************************************************************//** Returns a new row id. @return the new id */ UNIV_INLINE @@ -90,4 +81,16 @@ dict_sys_write_row_id( mach_write_to_6(field, row_id); } +/*********************************************************************//** +Check if a table id belongs to system table. +@return true if the table id belongs to a system table. 
*/ +UNIV_INLINE +bool +dict_is_sys_table( +/*==============*/ + table_id_t id) /*!< in: table id to check */ +{ + return(id < DICT_HDR_FIRST_ID); +} + diff --git a/storage/innobase/include/dict0crea.h b/storage/innobase/include/dict0crea.h index 68fc9ba195a..217da0142ee 100644 --- a/storage/innobase/include/dict0crea.h +++ b/storage/innobase/include/dict0crea.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -42,7 +42,9 @@ tab_create_graph_create( /*====================*/ dict_table_t* table, /*!< in: table to create, built as a memory data structure */ - mem_heap_t* heap); /*!< in: heap where created */ + mem_heap_t* heap, /*!< in: heap where created */ + bool commit);/*!< in: true if the commit node should be + added to the query graph */ /*********************************************************************//** Creates an index create graph. @return own: index create node */ @@ -52,7 +54,9 @@ ind_create_graph_create( /*====================*/ dict_index_t* index, /*!< in: index to create, built as a memory data structure */ - mem_heap_t* heap); /*!< in: heap where created */ + mem_heap_t* heap, /*!< in: heap where created */ + bool commit);/*!< in: true if the commit node should be + added to the query graph */ /***********************************************************//** Creates a table. This is a high-level function used in SQL execution graphs. 
@return query thread to run next or NULL */ @@ -99,11 +103,11 @@ dict_drop_index_tree( mtr_t* mtr); /*!< in: mtr having the latch on the record page */ /****************************************************************//** Creates the foreign key constraints system tables inside InnoDB -at database creation or database start if they are not found or are +at server bootstrap or server start if they are not found or are not of the right form. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t dict_create_or_check_foreign_constraint_tables(void); /*================================================*/ /********************************************************************//** @@ -115,7 +119,7 @@ given locally for this table, that is, the number is not global, as in the old format constraints < 4.0.18 it used to be. @return error code or DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t dict_create_add_foreigns_to_dictionary( /*===================================*/ ulint start_id,/*!< in: if we are actually doing ALTER TABLE @@ -127,11 +131,56 @@ dict_create_add_foreigns_to_dictionary( so far has no constraints for which the name was generated here */ dict_table_t* table, /*!< in: table */ - trx_t* trx); /*!< in: transaction */ + trx_t* trx) /*!< in: transaction */ + __attribute__((nonnull, warn_unused_result)); +/****************************************************************//** +Creates the tablespaces and datafiles system tables inside InnoDB +at server bootstrap or server start if they are not found or are +not of the right form. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_create_or_check_sys_tablespace(void); +/*=====================================*/ +/********************************************************************//** +Add a single tablespace definition to the data dictionary tables in the +database. 
+@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +dict_create_add_tablespace_to_dictionary( +/*=====================================*/ + ulint space, /*!< in: tablespace id */ + const char* name, /*!< in: tablespace name */ + ulint flags, /*!< in: tablespace flags */ + const char* path, /*!< in: tablespace path */ + trx_t* trx, /*!< in: transaction */ + bool commit); /*!< in: if true then commit the + transaction */ +/********************************************************************//** +Table create node structure */ -/* Table create node structure */ +/********************************************************************//** +Add a single foreign key definition to the data dictionary tables in the +database. We also generate names to constraints that were not named by the +user. A generated constraint has a name of the format +databasename/tablename_ibfk_NUMBER, where the numbers start from 1, and +are given locally for this table, that is, the number is not global, as in +the old format constraints < 4.0.18 it used to be. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +dict_create_add_foreign_to_dictionary( +/*==================================*/ + ulint* id_nr, /*!< in/out: number to use in id generation; + incremented if used */ + dict_table_t* table, /*!< in: table */ + dict_foreign_t* foreign,/*!< in: foreign */ + trx_t* trx) /*!< in/out: dictionary transaction */ + __attribute__((nonnull, warn_unused_result)); -struct tab_node_struct{ +/* Table create node structure */ +struct tab_node_t{ que_common_t common; /*!< node type: QUE_NODE_TABLE_CREATE */ dict_table_t* table; /*!< table to create, built as a memory data structure with dict_mem_... functions */ @@ -160,7 +209,7 @@ struct tab_node_struct{ /* Index create node struct */ -struct ind_node_struct{ +struct ind_node_t{ que_common_t common; /*!< node type: QUE_NODE_INDEX_CREATE */ dict_index_t* index; /*!< index to create, built as a memory data structure with dict_mem_... 
functions */ diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h index 68008f95c2f..af0a5b31cc4 100644 --- a/storage/innobase/include/dict0dict.h +++ b/storage/innobase/include/dict0dict.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -40,6 +41,7 @@ Created 1/8/1996 Heikki Tuuri #include "ut0rnd.h" #include "ut0byte.h" #include "trx0types.h" +#include "row0types.h" #ifndef UNIV_HOTBACKUP # include "sync0sync.h" @@ -50,7 +52,8 @@ UNIV_INTERN void dict_casedn_str( /*============*/ - char* a); /*!< in/out: string to put in lower case */ + char* a) /*!< in/out: string to put in lower case */ + __attribute__((nonnull)); /********************************************************************//** Get the database name length in a table name. @return database name length */ @@ -58,17 +61,53 @@ UNIV_INTERN ulint dict_get_db_name_len( /*=================*/ - const char* name); /*!< in: table name in the form + const char* name) /*!< in: table name in the form dbname '/' tablename */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Open a table from its database and table name, this is currently used by +foreign constraint parser to get the referenced table. 
+@return complete table name with database and table name, allocated from +heap memory passed in */ +UNIV_INTERN +char* +dict_get_referenced_table( +/*======================*/ + const char* name, /*!< in: foreign key table name */ + const char* database_name, /*!< in: table db name */ + ulint database_name_len,/*!< in: db name length */ + const char* table_name, /*!< in: table name */ + ulint table_name_len, /*!< in: table name length */ + dict_table_t** table, /*!< out: table object or NULL */ + mem_heap_t* heap); /*!< in: heap memory */ +/*********************************************************************//** +Frees a foreign key struct. */ +UNIV_INTERN +void +dict_foreign_free( +/*==============*/ + dict_foreign_t* foreign); /*!< in, own: foreign key struct */ +/*********************************************************************//** +Finds the highest [number] for foreign key constraints of the table. Looks +only at the >= 4.0.18-format id's, which are of the form +databasename/tablename_ibfk_[number]. +@return highest number, 0 if table has no new format foreign key constraints */ +UNIV_INTERN +ulint +dict_table_get_highest_foreign_id( +/*==============================*/ + dict_table_t* table); /*!< in: table in the dictionary + memory cache */ /********************************************************************//** Return the end of table name where we have removed dbname and '/'. @return table name */ - +UNIV_INTERN const char* dict_remove_db_name( /*================*/ - const char* name); /*!< in: table name in the form + const char* name) /*!< in: table name in the form dbname '/' tablename */ + __attribute__((nonnull, warn_unused_result)); /**********************************************************************//** Returns a table object based on table id. 
@return table, NULL if does not exist */ @@ -77,7 +116,11 @@ dict_table_t* dict_table_open_on_id( /*==================*/ table_id_t table_id, /*!< in: table id */ - ibool dict_locked); /*!< in: TRUE=data dictionary locked */ + ibool dict_locked, /*!< in: TRUE=data dictionary locked */ + ibool try_drop) /*!< in: TRUE=try to drop any orphan + indexes after an aborted online + index creation */ + __attribute__((warn_unused_result)); /********************************************************************//** Decrements the count of open handles to a table. */ UNIV_INTERN @@ -85,7 +128,11 @@ void dict_table_close( /*=============*/ dict_table_t* table, /*!< in/out: table */ - ibool dict_locked); /*!< in: TRUE=data dictionary locked */ + ibool dict_locked, /*!< in: TRUE=data dictionary locked */ + ibool try_drop) /*!< in: TRUE=try to drop any orphan + indexes after an aborted online + index creation */ + __attribute__((nonnull)); /**********************************************************************//** Inits the data dictionary module. */ UNIV_INTERN @@ -109,7 +156,8 @@ UNIV_INLINE ulint dict_col_get_mbminlen( /*==================*/ - const dict_col_t* col); /*!< in: column */ + const dict_col_t* col) /*!< in: column */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Gets the maximum number of bytes per character. @return maximum multi-byte char size, in bytes */ @@ -117,7 +165,8 @@ UNIV_INLINE ulint dict_col_get_mbmaxlen( /*==================*/ - const dict_col_t* col); /*!< in: column */ + const dict_col_t* col) /*!< in: column */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Sets the minimum and maximum number of bytes per character. 
*/ UNIV_INLINE @@ -127,8 +176,9 @@ dict_col_set_mbminmaxlen( dict_col_t* col, /*!< in/out: column */ ulint mbminlen, /*!< in: minimum multi-byte character size, in bytes */ - ulint mbmaxlen); /*!< in: minimum multi-byte + ulint mbmaxlen) /*!< in: minimum multi-byte character size, in bytes */ + __attribute__((nonnull)); /*********************************************************************//** Gets the column data type. */ UNIV_INLINE @@ -136,7 +186,8 @@ void dict_col_copy_type( /*===============*/ const dict_col_t* col, /*!< in: column */ - dtype_t* type); /*!< out: data type */ + dtype_t* type) /*!< out: data type */ + __attribute__((nonnull)); /**********************************************************************//** Determine bytes of column prefix to be stored in the undo log. Please note if the table format is UNIV_FORMAT_A (< UNIV_FORMAT_B), no prefix @@ -147,9 +198,9 @@ ulint dict_max_field_len_store_undo( /*==========================*/ dict_table_t* table, /*!< in: table */ - const dict_col_t* col); /*!< in: column which index prefix + const dict_col_t* col) /*!< in: column which index prefix is based on */ - + __attribute__((nonnull, warn_unused_result)); #endif /* !UNIV_HOTBACKUP */ #ifdef UNIV_DEBUG /*********************************************************************//** @@ -160,7 +211,8 @@ ibool dict_col_type_assert_equal( /*=======================*/ const dict_col_t* col, /*!< in: column */ - const dtype_t* type); /*!< in: data type */ + const dtype_t* type) /*!< in: data type */ + __attribute__((nonnull, warn_unused_result)); #endif /* UNIV_DEBUG */ #ifndef UNIV_HOTBACKUP /***********************************************************************//** @@ -170,7 +222,8 @@ UNIV_INLINE ulint dict_col_get_min_size( /*==================*/ - const dict_col_t* col); /*!< in: column */ + const dict_col_t* col) /*!< in: column */ + __attribute__((nonnull, warn_unused_result)); /***********************************************************************//** Returns 
the maximum size of the column. @return maximum size */ @@ -178,7 +231,8 @@ UNIV_INLINE ulint dict_col_get_max_size( /*==================*/ - const dict_col_t* col); /*!< in: column */ + const dict_col_t* col) /*!< in: column */ + __attribute__((nonnull, warn_unused_result)); /***********************************************************************//** Returns the size of a fixed size column, 0 if not a fixed size column. @return fixed size, or 0 */ @@ -187,7 +241,8 @@ ulint dict_col_get_fixed_size( /*====================*/ const dict_col_t* col, /*!< in: column */ - ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ + __attribute__((nonnull, warn_unused_result)); /***********************************************************************//** Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column. For fixed length types it is the fixed length of the type, otherwise 0. @@ -197,8 +252,8 @@ ulint dict_col_get_sql_null_size( /*=======================*/ const dict_col_t* col, /*!< in: column */ - ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */ - + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Gets the column number. @return col->ind, table column position (starting from 0) */ @@ -206,7 +261,8 @@ UNIV_INLINE ulint dict_col_get_no( /*============*/ - const dict_col_t* col); /*!< in: column */ + const dict_col_t* col) /*!< in: column */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Gets the column position in the clustered index. 
*/ UNIV_INLINE @@ -214,7 +270,8 @@ ulint dict_col_get_clust_pos( /*===================*/ const dict_col_t* col, /*!< in: table column */ - const dict_index_t* clust_index); /*!< in: clustered index */ + const dict_index_t* clust_index) /*!< in: clustered index */ + __attribute__((nonnull, warn_unused_result)); /****************************************************************//** If the given column name is reserved for InnoDB system columns, return TRUE. @@ -223,14 +280,16 @@ UNIV_INTERN ibool dict_col_name_is_reserved( /*======================*/ - const char* name); /*!< in: column name */ + const char* name) /*!< in: column name */ + __attribute__((nonnull, warn_unused_result)); /********************************************************************//** Acquire the autoinc lock. */ UNIV_INTERN void dict_table_autoinc_lock( /*====================*/ - dict_table_t* table); /*!< in/out: table */ + dict_table_t* table) /*!< in/out: table */ + __attribute__((nonnull)); /********************************************************************//** Unconditionally set the autoinc counter. */ UNIV_INTERN @@ -238,7 +297,8 @@ void dict_table_autoinc_initialize( /*==========================*/ dict_table_t* table, /*!< in/out: table */ - ib_uint64_t value); /*!< in: next value to assign to a row */ + ib_uint64_t value) /*!< in: next value to assign to a row */ + __attribute__((nonnull)); /********************************************************************//** Reads the next autoinc value (== autoinc counter value), 0 if not yet initialized. @@ -247,7 +307,8 @@ UNIV_INTERN ib_uint64_t dict_table_autoinc_read( /*====================*/ - const dict_table_t* table); /*!< in: table */ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, warn_unused_result)); /********************************************************************//** Updates the autoinc counter if the value supplied is greater than the current value. 
*/ @@ -257,14 +318,16 @@ dict_table_autoinc_update_if_greater( /*=================================*/ dict_table_t* table, /*!< in/out: table */ - ib_uint64_t value); /*!< in: value which was assigned to a row */ + ib_uint64_t value) /*!< in: value which was assigned to a row */ + __attribute__((nonnull)); /********************************************************************//** Release the autoinc lock. */ UNIV_INTERN void dict_table_autoinc_unlock( /*======================*/ - dict_table_t* table); /*!< in/out: table */ + dict_table_t* table) /*!< in/out: table */ + __attribute__((nonnull)); #endif /* !UNIV_HOTBACKUP */ /**********************************************************************//** Adds system columns to a table object. */ @@ -273,7 +336,8 @@ void dict_table_add_system_columns( /*==========================*/ dict_table_t* table, /*!< in/out: table */ - mem_heap_t* heap); /*!< in: temporary heap */ + mem_heap_t* heap) /*!< in: temporary heap */ + __attribute__((nonnull)); #ifndef UNIV_HOTBACKUP /**********************************************************************//** Adds a table object to the dictionary cache. */ @@ -283,26 +347,30 @@ dict_table_add_to_cache( /*====================*/ dict_table_t* table, /*!< in: table */ ibool can_be_evicted, /*!< in: TRUE if can be evicted*/ - mem_heap_t* heap); /*!< in: temporary heap */ + mem_heap_t* heap) /*!< in: temporary heap */ + __attribute__((nonnull)); /**********************************************************************//** Removes a table object from the dictionary cache. */ UNIV_INTERN void dict_table_remove_from_cache( /*=========================*/ - dict_table_t* table); /*!< in, own: table */ + dict_table_t* table) /*!< in, own: table */ + __attribute__((nonnull)); /**********************************************************************//** Renames a table object. 
@return TRUE if success */ UNIV_INTERN -ibool +dberr_t dict_table_rename_in_cache( /*=======================*/ dict_table_t* table, /*!< in/out: table */ const char* new_name, /*!< in: new name */ - ibool rename_also_foreigns);/*!< in: in ALTER TABLE we want + ibool rename_also_foreigns) + /*!< in: in ALTER TABLE we want to preserve the original table name in constraints which reference it */ + __attribute__((nonnull, warn_unused_result)); /**********************************************************************//** Removes an index from the dictionary cache. */ UNIV_INTERN @@ -310,7 +378,8 @@ void dict_index_remove_from_cache( /*=========================*/ dict_table_t* table, /*!< in/out: table */ - dict_index_t* index); /*!< in, own: index */ + dict_index_t* index) /*!< in, own: index */ + __attribute__((nonnull)); /**********************************************************************//** Change the id of a table object in the dictionary cache. This is used in DISCARD TABLESPACE. */ @@ -319,7 +388,16 @@ void dict_table_change_id_in_cache( /*==========================*/ dict_table_t* table, /*!< in/out: table object already in cache */ - table_id_t new_id);/*!< in: new id to set */ + table_id_t new_id) /*!< in: new id to set */ + __attribute__((nonnull)); +/**********************************************************************//** +Removes a foreign constraint struct from the dictionary cache. */ +UNIV_INTERN +void +dict_foreign_remove_from_cache( +/*===========================*/ + dict_foreign_t* foreign) /*!< in, own: foreign constraint */ + __attribute__((nonnull)); /**********************************************************************//** Adds a foreign key constraint object to the dictionary cache. May free the object if there already is an object with the same identifier in. @@ -327,12 +405,13 @@ At least one of foreign table or referenced table must already be in the dictionary cache! 
@return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t dict_foreign_add_to_cache( /*======================*/ dict_foreign_t* foreign, /*!< in, own: foreign key constraint */ - ibool check_charsets);/*!< in: TRUE=check charset + ibool check_charsets) /*!< in: TRUE=check charset compatibility */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Check if the index is referenced by a foreign key, if TRUE return the matching instance NULL otherwise. @@ -343,7 +422,8 @@ dict_foreign_t* dict_table_get_referenced_constraint( /*=================================*/ dict_table_t* table, /*!< in: InnoDB table */ - dict_index_t* index); /*!< in: InnoDB index */ + dict_index_t* index) /*!< in: InnoDB index */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Checks if a table is referenced by foreign keys. @return TRUE if table is referenced by a foreign key */ @@ -351,17 +431,19 @@ UNIV_INTERN ibool dict_table_is_referenced_by_foreign_key( /*====================================*/ - const dict_table_t* table); /*!< in: InnoDB table */ + const dict_table_t* table) /*!< in: InnoDB table */ + __attribute__((nonnull, warn_unused_result)); /**********************************************************************//** -Replace the index in the foreign key list that matches this index's -definition with an equivalent index. */ +Replace the index passed in with another equivalent index in the +foreign key lists of the table. 
*/ UNIV_INTERN void -dict_table_replace_index_in_foreign_list( -/*=====================================*/ - dict_table_t* table, /*!< in/out: table */ - dict_index_t* index, /*!< in: index to be replaced */ - const trx_t* trx); /*!< in: transaction handle */ +dict_foreign_replace_index( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + const dict_index_t* index, /*!< in: index to be replaced */ + const trx_t* trx) /*!< in: transaction handle */ + __attribute__((nonnull)); /**********************************************************************//** Determines whether a string starts with the specified keyword. @return TRUE if str starts with keyword */ @@ -369,9 +451,10 @@ UNIV_INTERN ibool dict_str_starts_with_keyword( /*=========================*/ - void* mysql_thd, /*!< in: MySQL thread handle */ + THD* thd, /*!< in: MySQL thread handle */ const char* str, /*!< in: string to scan for keyword */ - const char* keyword); /*!< in: keyword to look for */ + const char* keyword) /*!< in: keyword to look for */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Checks if a index is defined for a foreign key constraint. Index is a part of a foreign key constraint if the index is referenced by foreign key @@ -383,7 +466,8 @@ dict_foreign_t* dict_table_get_foreign_constraint( /*==============================*/ dict_table_t* table, /*!< in: InnoDB table */ - dict_index_t* index); /*!< in: InnoDB index */ + dict_index_t* index) /*!< in: InnoDB index */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Scans a table create SQL string and adds to the data dictionary the foreign key constraints declared in the string. This function @@ -393,7 +477,7 @@ bot participating tables. The indexes are allowed to contain more fields than mentioned in the constraint. 
@return error code or DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t dict_create_foreign_constraints( /*============================*/ trx_t* trx, /*!< in: transaction */ @@ -409,15 +493,16 @@ dict_create_foreign_constraints( const char* name, /*!< in: table full name in the normalized form database_name/table_name */ - ibool reject_fks); /*!< in: if TRUE, fail with error + ibool reject_fks) /*!< in: if TRUE, fail with error code DB_CANNOT_ADD_CONSTRAINT if any foreign keys are found. */ + __attribute__((nonnull, warn_unused_result)); /**********************************************************************//** Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement. @return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the constraint id does not match */ UNIV_INTERN -ulint +dberr_t dict_foreign_parse_drop_constraints( /*================================*/ mem_heap_t* heap, /*!< in: heap from which we can @@ -426,8 +511,9 @@ dict_foreign_parse_drop_constraints( dict_table_t* table, /*!< in: table */ ulint* n, /*!< out: number of constraints to drop */ - const char*** constraints_to_drop); /*!< out: id's of the + const char*** constraints_to_drop) /*!< out: id's of the constraints to drop */ + __attribute__((nonnull, warn_unused_result)); /**********************************************************************//** Returns a table object and increments its open handle count. NOTE! This is a high-level function to be used mainly from outside the @@ -439,43 +525,40 @@ dict_table_t* dict_table_open_on_name( /*====================*/ const char* table_name, /*!< in: table name */ - ibool dict_locked); /*!< in: TRUE=data dictionary locked */ - -/**********************************************************************//** -Returns a table object and increment its open handle count. Table -statistics will not be updated if they are not initialized. -Call this function when dropping a table. 
-@return table, NULL if does not exist */ -UNIV_INTERN -dict_table_t* -dict_table_open_on_name_no_stats( -/*=============================*/ - const char* table_name, /*!< in: table name */ ibool dict_locked, /*!< in: TRUE=data dictionary locked */ + ibool try_drop, /*!< in: TRUE=try to drop any orphan + indexes after an aborted online + index creation */ dict_err_ignore_t - ignore_err); /*!< in: error to be ignored when + ignore_err) /*!< in: error to be ignored when loading the table */ -/**********************************************************************//** -Find an index that is equivalent to the one passed in and is not marked -for deletion. -@return index equivalent to foreign->foreign_index, or NULL */ -UNIV_INTERN -dict_index_t* -dict_foreign_find_equiv_index( -/*==========================*/ - dict_foreign_t* foreign);/*!< in: foreign key */ -/**********************************************************************//** -Returns an index object by matching on the name and column names and -if more than one index matches return the index with the max id + __attribute__((nonnull, warn_unused_result)); + +/*********************************************************************//** +Tries to find an index whose first fields are the columns in the array, +in the same order and is not marked for deletion and is not the same +as types_idx. 
@return matching index, NULL if not found */ UNIV_INTERN dict_index_t* -dict_table_get_index_by_max_id( -/*===========================*/ - dict_table_t* table, /*!< in: table */ - const char* name, /*!< in: the index name to find */ - const char** columns,/*!< in: array of column names */ - ulint n_cols);/*!< in: number of columns */ +dict_foreign_find_index( +/*====================*/ + const dict_table_t* table, /*!< in: table */ + const char** columns,/*!< in: array of column names */ + ulint n_cols, /*!< in: number of columns */ + const dict_index_t* types_idx, + /*!< in: NULL or an index + whose types the column types + must match */ + ibool check_charsets, + /*!< in: whether to check + charsets. only has an effect + if types_idx != NULL */ + ulint check_null) + /*!< in: nonzero if none of + the columns must be declared + NOT NULL */ + __attribute__((nonnull(1,2), warn_unused_result)); /**********************************************************************//** Returns a column's name. @return column name. NOTE: not guaranteed to stay valid if table is @@ -485,29 +568,16 @@ const char* dict_table_get_col_name( /*====================*/ const dict_table_t* table, /*!< in: table */ - ulint col_nr);/*!< in: column number */ - + ulint col_nr) /*!< in: column number */ + __attribute__((nonnull, warn_unused_result)); /**********************************************************************//** -Prints a table definition. */ +Prints a table data. */ UNIV_INTERN void dict_table_print( /*=============*/ - dict_table_t* table); /*!< in: table */ -/**********************************************************************//** -Prints a table data. */ -UNIV_INTERN -void -dict_table_print_low( -/*=================*/ - dict_table_t* table); /*!< in: table */ -/**********************************************************************//** -Prints a table data when we know the table name. 
*/ -UNIV_INTERN -void -dict_table_print_by_name( -/*=====================*/ - const char* name); /*!< in: table name */ + dict_table_t* table) /*!< in: table */ + __attribute__((nonnull)); /**********************************************************************//** Outputs info on foreign keys of a table. */ UNIV_INTERN @@ -520,7 +590,8 @@ dict_print_info_on_foreign_keys( of SHOW TABLE STATUS */ FILE* file, /*!< in: file where to print */ trx_t* trx, /*!< in: transaction */ - dict_table_t* table); /*!< in: table */ + dict_table_t* table) /*!< in: table */ + __attribute__((nonnull)); /**********************************************************************//** Outputs info on a foreign key of a table in a format suitable for CREATE TABLE. */ @@ -531,7 +602,8 @@ dict_print_info_on_foreign_key_in_create_format( FILE* file, /*!< in: file where to print */ trx_t* trx, /*!< in: transaction */ dict_foreign_t* foreign, /*!< in: foreign key constraint */ - ibool add_newline); /*!< in: whether to add a newline */ + ibool add_newline) /*!< in: whether to add a newline */ + __attribute__((nonnull(1,3))); /********************************************************************//** Displays the names of the index and the table. */ UNIV_INTERN @@ -539,8 +611,35 @@ void dict_index_name_print( /*==================*/ FILE* file, /*!< in: output stream */ - trx_t* trx, /*!< in: transaction */ - const dict_index_t* index); /*!< in: index to print */ + const trx_t* trx, /*!< in: transaction */ + const dict_index_t* index) /*!< in: index to print */ + __attribute__((nonnull(1,3))); +/*********************************************************************//** +Tries to find an index whose first fields are the columns in the array, +in the same order and is not marked for deletion and is not the same +as types_idx. 
+@return matching index, NULL if not found */ +UNIV_INTERN +bool +dict_foreign_qualify_index( +/*====================*/ + const dict_table_t* table, /*!< in: table */ + const char** columns,/*!< in: array of column names */ + ulint n_cols, /*!< in: number of columns */ + const dict_index_t* index, /*!< in: index to check */ + const dict_index_t* types_idx, + /*!< in: NULL or an index + whose types the column types + must match */ + ibool check_charsets, + /*!< in: whether to check + charsets. only has an effect + if types_idx != NULL */ + ulint check_null) + /*!< in: nonzero if none of + the columns must be declared + NOT NULL */ + __attribute__((nonnull(1,2), warn_unused_result)); #ifdef UNIV_DEBUG /********************************************************************//** Gets the first index on the table (the clustered index). @@ -549,7 +648,17 @@ UNIV_INLINE dict_index_t* dict_table_get_first_index( /*=======================*/ - const dict_table_t* table); /*!< in: table */ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the last index on the table. +@return index, NULL if none exists */ +UNIV_INLINE +dict_index_t* +dict_table_get_last_index( +/*=======================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, warn_unused_result)); /********************************************************************//** Gets the next index on the table. 
@return index, NULL if none left */ @@ -557,9 +666,11 @@ UNIV_INLINE dict_index_t* dict_table_get_next_index( /*======================*/ - const dict_index_t* index); /*!< in: index */ + const dict_index_t* index) /*!< in: index */ + __attribute__((nonnull, warn_unused_result)); #else /* UNIV_DEBUG */ # define dict_table_get_first_index(table) UT_LIST_GET_FIRST((table)->indexes) +# define dict_table_get_last_index(table) UT_LIST_GET_LAST((table)->indexes) # define dict_table_get_next_index(index) UT_LIST_GET_NEXT(indexes, index) #endif /* UNIV_DEBUG */ #endif /* !UNIV_HOTBACKUP */ @@ -605,15 +716,6 @@ dict_index_is_ibuf( const dict_index_t* index) /*!< in: index */ __attribute__((nonnull, pure, warn_unused_result)); /********************************************************************//** -Check whether the index is an universal index tree. -@return nonzero for universal tree, zero for other indexes */ -UNIV_INLINE -ulint -dict_index_is_univ( -/*===============*/ - const dict_index_t* index) /*!< in: index */ - __attribute__((nonnull, pure, warn_unused_result)); -/********************************************************************//** Check whether the index is a secondary index or the insert buffer tree. @return nonzero for insert buffer, zero for other indexes */ UNIV_INLINE @@ -626,13 +728,14 @@ dict_index_is_sec_or_ibuf( /************************************************************************ Gets the all the FTS indexes for the table. NOTE: must not be called for tables which do not have an FTS-index. 
*/ - +UNIV_INTERN ulint dict_table_get_all_fts_indexes( /*===========================*/ /* out: number of indexes collected */ dict_table_t* table, /* in: table */ - ib_vector_t* indexes);/* out: vector for collecting FTS indexes */ + ib_vector_t* indexes)/* out: vector for collecting FTS indexes */ + __attribute__((nonnull)); /********************************************************************//** Gets the number of user-defined columns in a table in the dictionary cache. @@ -662,6 +765,35 @@ dict_table_get_n_cols( /*==================*/ const dict_table_t* table) /*!< in: table */ __attribute__((nonnull, pure, warn_unused_result)); +/********************************************************************//** +Gets the approximately estimated number of rows in the table. +@return estimated number of rows */ +UNIV_INLINE +ib_uint64_t +dict_table_get_n_rows( +/*==================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Increment the number of rows in the table by one. +Notice that this operation is not protected by any latch, the number is +approximate. */ +UNIV_INLINE +void +dict_table_n_rows_inc( +/*==================*/ + dict_table_t* table) /*!< in/out: table */ + __attribute__((nonnull)); +/********************************************************************//** +Decrement the number of rows in the table by one. +Notice that this operation is not protected by any latch, the number is +approximate. */ +UNIV_INLINE +void +dict_table_n_rows_dec( +/*==================*/ + dict_table_t* table) /*!< in/out: table */ + __attribute__((nonnull)); #ifdef UNIV_DEBUG /********************************************************************//** Gets the nth column of a table. 
@@ -671,7 +803,8 @@ dict_col_t* dict_table_get_nth_col( /*===================*/ const dict_table_t* table, /*!< in: table */ - ulint pos); /*!< in: position of column */ + ulint pos) /*!< in: position of column */ + __attribute__((nonnull, warn_unused_result)); /********************************************************************//** Gets the given system column of a table. @return pointer to column object */ @@ -680,7 +813,8 @@ dict_col_t* dict_table_get_sys_col( /*===================*/ const dict_table_t* table, /*!< in: table */ - ulint sys); /*!< in: DATA_ROW_ID, ... */ + ulint sys) /*!< in: DATA_ROW_ID, ... */ + __attribute__((nonnull, warn_unused_result)); #else /* UNIV_DEBUG */ #define dict_table_get_nth_col(table, pos) \ ((table)->cols + (pos)) @@ -695,7 +829,8 @@ ulint dict_table_get_sys_col_no( /*======================*/ const dict_table_t* table, /*!< in: table */ - ulint sys); /*!< in: DATA_ROW_ID, ... */ + ulint sys) /*!< in: DATA_ROW_ID, ... */ + __attribute__((nonnull, warn_unused_result)); #ifndef UNIV_HOTBACKUP /********************************************************************//** Returns the minimum data size of an index record. @@ -704,7 +839,8 @@ UNIV_INLINE ulint dict_index_get_min_size( /*====================*/ - const dict_index_t* index); /*!< in: index */ + const dict_index_t* index) /*!< in: index */ + __attribute__((nonnull, warn_unused_result)); #endif /* !UNIV_HOTBACKUP */ /********************************************************************//** Check whether the table uses the compact page format. @@ -713,7 +849,8 @@ UNIV_INLINE ibool dict_table_is_comp( /*===============*/ - const dict_table_t* table); /*!< in: table */ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, warn_unused_result)); /********************************************************************//** Determine the file format of a table. 
@return file format version */ @@ -721,7 +858,8 @@ UNIV_INLINE ulint dict_table_get_format( /*==================*/ - const dict_table_t* table); /*!< in: table */ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, warn_unused_result)); /********************************************************************//** Determine the file format from a dict_table_t::flags. @return file format version */ @@ -729,7 +867,8 @@ UNIV_INLINE ulint dict_tf_get_format( /*===============*/ - ulint flags); /*!< in: dict_table_t::flags */ + ulint flags) /*!< in: dict_table_t::flags */ + __attribute__((warn_unused_result)); /********************************************************************//** Set the various values in a dict_table_t::flags pointer. */ UNIV_INLINE @@ -738,7 +877,9 @@ dict_tf_set( /*========*/ ulint* flags, /*!< in/out: table */ rec_format_t format, /*!< in: file format */ - ulint zip_ssize); /*!< in: zip shift size */ + ulint zip_ssize, /*!< in: zip shift size */ + bool remote_path) /*!< in: table uses DATA DIRECTORY */ + __attribute__((nonnull)); /********************************************************************//** Convert a 32 bit integer table flags to the 32 bit integer that is written into the tablespace header at the offset FSP_SPACE_FLAGS and is @@ -756,13 +897,6 @@ dict_tf_to_fsp_flags( /*=================*/ ulint flags) /*!< in: dict_table_t::flags */ __attribute__((const)); -/********************************************************************/ -UNIV_INLINE -ulint -dict_tf_to_sys_tables_type( -/*=======================*/ - ulint flags) /*!< in: dict_table_t::flags */ - __attribute__((const)); /********************************************************************//** Extract the compressed page size from table flags. 
@return compressed page size, or 0 if not compressed */ @@ -779,7 +913,8 @@ UNIV_INLINE ulint dict_table_zip_size( /*================*/ - const dict_table_t* table); /*!< in: table */ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, warn_unused_result)); #ifndef UNIV_HOTBACKUP /*********************************************************************//** Obtain exclusive locks on all index trees of the table. This is to prevent @@ -789,15 +924,16 @@ UNIV_INLINE void dict_table_x_lock_indexes( /*======================*/ - dict_table_t* table); /*!< in: table */ + dict_table_t* table) /*!< in: table */ + __attribute__((nonnull)); /*********************************************************************//** Release the exclusive locks on all index tree. */ UNIV_INLINE void dict_table_x_unlock_indexes( /*========================*/ - dict_table_t* table); /*!< in: table */ -#endif /* !UNIV_HOTBACKUP */ + dict_table_t* table) /*!< in: table */ + __attribute__((nonnull)); /********************************************************************//** Checks if a column is in the ordering columns of the clustered index of a table. Column prefixes are treated like whole columns. @@ -807,7 +943,8 @@ ibool dict_table_col_in_clustered_key( /*============================*/ const dict_table_t* table, /*!< in: table */ - ulint n); /*!< in: column number */ + ulint n) /*!< in: column number */ + __attribute__((nonnull, warn_unused_result)); /*******************************************************************//** Check if the table has an FTS index. @return TRUE if table has an FTS index */ @@ -815,36 +952,8 @@ UNIV_INLINE ibool dict_table_has_fts_index( /*=====================*/ - dict_table_t* table); /*!< in: table */ -/*******************************************************************//** -Validate and return the table flags. -@return Same as input after validating it as dict_table_t::flags. -If there is an error, trigger assertion failure. 
*/ -UNIV_INLINE -ulint -dict_tf_validate( -/*=============*/ - ulint flags); /*!< in: table flags */ -/********************************************************************//** -Validate a SYS_TABLES TYPE field and return it. -@return Same as input after validating it as a SYS_TABLES TYPE field. -If there is an error, return ULINT_UNDEFINED. */ -UNIV_INLINE -ulint -dict_sys_tables_type_validate( -/*==========================*/ - ulint type, /*!< in: SYS_TABLES.TYPE */ - ulint n_cols); /*!< in: SYS_TABLES.N_COLS */ -/********************************************************************//** -Determine the file format from dict_table_t::flags -The low order bit will be zero for REDUNDANT and 1 for COMPACT. For any -other row_format, file_format is > 0 and DICT_TF_COMPACT will also be set. -@return file format version */ -UNIV_INLINE -rec_format_t -dict_tf_get_rec_format( -/*===================*/ - ulint flags); /*!< in: dict_table_t::flags */ + dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, warn_unused_result)); /*******************************************************************//** Copies types of columns contained in table to tuple and sets all fields of the tuple to the SQL NULL value. This function should @@ -854,18 +963,20 @@ void dict_table_copy_types( /*==================*/ dtuple_t* tuple, /*!< in/out: data tuple */ - const dict_table_t* table); /*!< in: table */ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull)); /******************************************************************** Wait until all the background threads of the given table have exited, i.e., bg_threads == 0. Note: bg_threads_mutex must be reserved when calling this. */ - +UNIV_INTERN void dict_table_wait_for_bg_threads_to_exit( /*===================================*/ dict_table_t* table, /* in: table */ - ulint delay); /* in: time in microseconds to wait between + ulint delay) /* in: time in microseconds to wait between checks of bg_threads. 
*/ + __attribute__((nonnull)); /**********************************************************************//** Looks for an index with the given id. NOTE that we do not reserve the dictionary mutex: this function is for emergency purposes like @@ -875,7 +986,8 @@ UNIV_INTERN dict_index_t* dict_index_find_on_id_low( /*======================*/ - index_id_t id); /*!< in: index id */ + index_id_t id) /*!< in: index id */ + __attribute__((warn_unused_result)); /**********************************************************************//** Make room in the table cache by evicting an unused table. The unused table should not be part of FK relationship and currently not used in any user @@ -891,16 +1003,17 @@ dict_make_room_in_cache( Adds an index to the dictionary cache. @return DB_SUCCESS, DB_TOO_BIG_RECORD, or DB_CORRUPTION */ UNIV_INTERN -ulint +dberr_t dict_index_add_to_cache( /*====================*/ dict_table_t* table, /*!< in: table on which the index is */ dict_index_t* index, /*!< in, own: index; NOTE! The index memory object is freed in this function! */ ulint page_no,/*!< in: root page number of the index */ - ibool strict);/*!< in: TRUE=refuse to create the index + ibool strict) /*!< in: TRUE=refuse to create the index if records could be too big to fit in an B-tree page */ + __attribute__((nonnull, warn_unused_result)); /**********************************************************************//** Removes an index from the dictionary cache. */ UNIV_INTERN @@ -908,8 +1021,9 @@ void dict_index_remove_from_cache( /*=========================*/ dict_table_t* table, /*!< in/out: table */ - dict_index_t* index); /*!< in, own: index */ - + dict_index_t* index) /*!< in, own: index */ + __attribute__((nonnull)); +#endif /* !UNIV_HOTBACKUP */ /********************************************************************//** Gets the number of fields in the internal representation of an index, including fields added by the dictionary system. 
@@ -918,9 +1032,10 @@ UNIV_INLINE ulint dict_index_get_n_fields( /*====================*/ - const dict_index_t* index); /*!< in: an internal + const dict_index_t* index) /*!< in: an internal representation of index (in the dictionary cache) */ + __attribute__((nonnull, warn_unused_result)); /********************************************************************//** Gets the number of fields in the internal representation of an index that uniquely determine the position of an index entry in the index, if @@ -931,8 +1046,9 @@ UNIV_INLINE ulint dict_index_get_n_unique( /*====================*/ - const dict_index_t* index); /*!< in: an internal representation + const dict_index_t* index) /*!< in: an internal representation of index (in the dictionary cache) */ + __attribute__((nonnull, warn_unused_result)); /********************************************************************//** Gets the number of fields in the internal representation of an index which uniquely determine the position of an index entry in the index, if @@ -942,8 +1058,9 @@ UNIV_INLINE ulint dict_index_get_n_unique_in_tree( /*============================*/ - const dict_index_t* index); /*!< in: an internal representation + const dict_index_t* index) /*!< in: an internal representation of index (in the dictionary cache) */ + __attribute__((nonnull, warn_unused_result)); /********************************************************************//** Gets the number of user-defined ordering fields in the index. 
In the internal representation we add the row id to the ordering fields to make all indexes @@ -954,8 +1071,9 @@ UNIV_INLINE ulint dict_index_get_n_ordering_defined_by_user( /*======================================*/ - const dict_index_t* index); /*!< in: an internal representation + const dict_index_t* index) /*!< in: an internal representation of index (in the dictionary cache) */ + __attribute__((nonnull, warn_unused_result)); #ifdef UNIV_DEBUG /********************************************************************//** Gets the nth field of an index. @@ -965,7 +1083,8 @@ dict_field_t* dict_index_get_nth_field( /*=====================*/ const dict_index_t* index, /*!< in: index */ - ulint pos); /*!< in: position of field */ + ulint pos) /*!< in: position of field */ + __attribute__((nonnull, warn_unused_result)); #else /* UNIV_DEBUG */ # define dict_index_get_nth_field(index, pos) ((index)->fields + (pos)) #endif /* UNIV_DEBUG */ @@ -977,7 +1096,8 @@ const dict_col_t* dict_index_get_nth_col( /*===================*/ const dict_index_t* index, /*!< in: index */ - ulint pos); /*!< in: position of the field */ + ulint pos) /*!< in: position of the field */ + __attribute__((nonnull, warn_unused_result)); /********************************************************************//** Gets the column number of the nth field in an index. @return column number */ @@ -986,7 +1106,8 @@ ulint dict_index_get_nth_col_no( /*======================*/ const dict_index_t* index, /*!< in: index */ - ulint pos); /*!< in: position of the field */ + ulint pos) /*!< in: position of the field */ + __attribute__((nonnull, warn_unused_result)); /********************************************************************//** Looks for column n in an index. 
@return position in internal representation of the index; @@ -996,7 +1117,8 @@ ulint dict_index_get_nth_col_pos( /*=======================*/ const dict_index_t* index, /*!< in: index */ - ulint n); /*!< in: column number */ + ulint n) /*!< in: column number */ + __attribute__((nonnull, warn_unused_result)); /********************************************************************//** Looks for column n in an index. @return position in internal representation of the index; @@ -1007,8 +1129,9 @@ dict_index_get_nth_col_or_prefix_pos( /*=================================*/ const dict_index_t* index, /*!< in: index */ ulint n, /*!< in: column number */ - ibool inc_prefix); /*!< in: TRUE=consider + ibool inc_prefix) /*!< in: TRUE=consider column prefixes too */ + __attribute__((nonnull, warn_unused_result)); /********************************************************************//** Returns TRUE if the index contains a column or a prefix of that column. @return TRUE if contains the column or its prefix */ @@ -1017,7 +1140,8 @@ ibool dict_index_contains_col_or_prefix( /*==============================*/ const dict_index_t* index, /*!< in: index */ - ulint n); /*!< in: column number */ + ulint n) /*!< in: column number */ + __attribute__((nonnull, warn_unused_result)); /********************************************************************//** Looks for a matching field in an index. The column has to be the same. The column in index must be complete, or must contain a prefix longer than the @@ -1031,7 +1155,8 @@ dict_index_get_nth_field_pos( /*=========================*/ const dict_index_t* index, /*!< in: index from which to search */ const dict_index_t* index2, /*!< in: index */ - ulint n); /*!< in: field number in index2 */ + ulint n) /*!< in: field number in index2 */ + __attribute__((nonnull, warn_unused_result)); /********************************************************************//** Looks for column n position in the clustered index. 
@return position in internal representation of the clustered index */ @@ -1040,7 +1165,8 @@ ulint dict_table_get_nth_col_pos( /*=======================*/ const dict_table_t* table, /*!< in: table */ - ulint n); /*!< in: column number */ + ulint n) /*!< in: column number */ + __attribute__((nonnull, warn_unused_result)); /********************************************************************//** Returns the position of a system column in an index. @return position, ULINT_UNDEFINED if not contained */ @@ -1049,7 +1175,8 @@ ulint dict_index_get_sys_col_pos( /*=======================*/ const dict_index_t* index, /*!< in: index */ - ulint type); /*!< in: DATA_ROW_ID, ... */ + ulint type) /*!< in: DATA_ROW_ID, ... */ + __attribute__((nonnull, warn_unused_result)); /*******************************************************************//** Adds a column to index. */ UNIV_INTERN @@ -1059,7 +1186,8 @@ dict_index_add_col( dict_index_t* index, /*!< in/out: index */ const dict_table_t* table, /*!< in: table */ dict_col_t* col, /*!< in: column */ - ulint prefix_len); /*!< in: column prefix length */ + ulint prefix_len) /*!< in: column prefix length */ + __attribute__((nonnull)); #ifndef UNIV_HOTBACKUP /*******************************************************************//** Copies types of fields contained in index to tuple. */ @@ -1069,8 +1197,9 @@ dict_index_copy_types( /*==================*/ dtuple_t* tuple, /*!< in/out: data tuple */ const dict_index_t* index, /*!< in: index */ - ulint n_fields); /*!< in: number of + ulint n_fields) /*!< in: number of field types to copy */ + __attribute__((nonnull)); #endif /* !UNIV_HOTBACKUP */ /*********************************************************************//** Gets the field column. 
@@ -1079,7 +1208,8 @@ UNIV_INLINE const dict_col_t* dict_field_get_col( /*===============*/ - const dict_field_t* field); /*!< in: index field */ + const dict_field_t* field) /*!< in: index field */ + __attribute__((nonnull, warn_unused_result)); #ifndef UNIV_HOTBACKUP /**********************************************************************//** Returns an index object if it is found in the dictionary cache. @@ -1089,7 +1219,8 @@ UNIV_INTERN dict_index_t* dict_index_get_if_in_cache_low( /*===========================*/ - index_id_t index_id); /*!< in: index id */ + index_id_t index_id) /*!< in: index id */ + __attribute__((warn_unused_result)); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG /**********************************************************************//** Returns an index object if it is found in the dictionary cache. @@ -1098,7 +1229,8 @@ UNIV_INTERN dict_index_t* dict_index_get_if_in_cache( /*=======================*/ - index_id_t index_id); /*!< in: index id */ + index_id_t index_id) /*!< in: index id */ + __attribute__((warn_unused_result)); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ #ifdef UNIV_DEBUG /**********************************************************************//** @@ -1110,7 +1242,17 @@ ibool dict_index_check_search_tuple( /*==========================*/ const dict_index_t* index, /*!< in: index tree */ - const dtuple_t* tuple); /*!< in: tuple used in a search */ + const dtuple_t* tuple) /*!< in: tuple used in a search */ + __attribute__((nonnull, warn_unused_result)); +/** Whether and when to allow temporary index names */ +enum check_name { + /** Require all indexes to be complete. */ + CHECK_ALL_COMPLETE, + /** Allow aborted online index creation. */ + CHECK_ABORTED_OK, + /** Allow partial indexes to exist. 
*/ + CHECK_PARTIAL_OK +}; /**********************************************************************//** Check for duplicate index entries in a table [using the index name] */ UNIV_INTERN @@ -1119,8 +1261,9 @@ dict_table_check_for_dup_indexes( /*=============================*/ const dict_table_t* table, /*!< in: Check for dup indexes in this table */ - ibool tmp_ok);/*!< in: TRUE=allow temporary - index names */ + enum check_name check) /*!< in: whether and when to allow + temporary index names */ + __attribute__((nonnull)); #endif /* UNIV_DEBUG */ /**********************************************************************//** Builds a node pointer out of a physical record and a page number. @@ -1136,8 +1279,9 @@ dict_index_build_node_ptr( pointer */ mem_heap_t* heap, /*!< in: memory heap where pointer created */ - ulint level); /*!< in: level of rec in tree: + ulint level) /*!< in: level of rec in tree: 0 means leaf level */ + __attribute__((nonnull, warn_unused_result)); /**********************************************************************//** Copies an initial segment of a physical record, long enough to specify an index entry uniquely. @@ -1152,7 +1296,8 @@ dict_index_copy_rec_order_prefix( ulint* n_fields,/*!< out: number of fields copied */ byte** buf, /*!< in/out: memory buffer for the copied prefix, or NULL */ - ulint* buf_size);/*!< in/out: buffer size */ + ulint* buf_size)/*!< in/out: buffer size */ + __attribute__((nonnull, warn_unused_result)); /**********************************************************************//** Builds a typed data tuple out of a physical record. 
@return own: data tuple */ @@ -1163,7 +1308,8 @@ dict_index_build_data_tuple( dict_index_t* index, /*!< in: index */ rec_t* rec, /*!< in: record for which to build data tuple */ ulint n_fields,/*!< in: number of data fields */ - mem_heap_t* heap); /*!< in: memory heap where tuple created */ + mem_heap_t* heap) /*!< in: memory heap where tuple created */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Gets the space id of the root of the index tree. @return space id */ @@ -1171,7 +1317,8 @@ UNIV_INLINE ulint dict_index_get_space( /*=================*/ - const dict_index_t* index); /*!< in: index */ + const dict_index_t* index) /*!< in: index */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Sets the space id of the root of the index tree. */ UNIV_INLINE @@ -1179,7 +1326,8 @@ void dict_index_set_space( /*=================*/ dict_index_t* index, /*!< in/out: index */ - ulint space); /*!< in: space id */ + ulint space) /*!< in: space id */ + __attribute__((nonnull)); /*********************************************************************//** Gets the page number of the root of the index tree. @return page number */ @@ -1187,7 +1335,8 @@ UNIV_INLINE ulint dict_index_get_page( /*================*/ - const dict_index_t* tree); /*!< in: index */ + const dict_index_t* tree) /*!< in: index */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Gets the read-write lock of the index tree. @return read-write lock */ @@ -1195,7 +1344,8 @@ UNIV_INLINE rw_lock_t* dict_index_get_lock( /*================*/ - dict_index_t* index); /*!< in: index */ + dict_index_t* index) /*!< in: index */ + __attribute__((nonnull, warn_unused_result)); /********************************************************************//** Returns free space reserved for future updates of records. 
This is relevant only in the case of many consecutive inserts, as updates @@ -1205,13 +1355,48 @@ UNIV_INLINE ulint dict_index_get_space_reserve(void); /*==============================*/ + +/* Online index creation @{ */ +/********************************************************************//** +Gets the status of online index creation. +@return the status */ +UNIV_INLINE +enum online_index_status +dict_index_get_online_status( +/*=========================*/ + const dict_index_t* index) /*!< in: secondary index */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Sets the status of online index creation. */ +UNIV_INLINE +void +dict_index_set_online_status( +/*=========================*/ + dict_index_t* index, /*!< in/out: index */ + enum online_index_status status) /*!< in: status */ + __attribute__((nonnull)); +/********************************************************************//** +Determines if a secondary index is being or has been created online, +or if the table is being rebuilt online, allowing concurrent modifications +to the table. +@retval true if the index is being or has been built online, or +if this is a clustered index and the table is being or has been rebuilt online +@retval false if the index has been created or the table has been +rebuilt completely */ +UNIV_INLINE +bool +dict_index_is_online_ddl( +/*=====================*/ + const dict_index_t* index) /*!< in: index */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Calculates the minimum record length in an index. 
*/ UNIV_INTERN ulint dict_index_calc_min_rec_len( /*========================*/ - const dict_index_t* index); /*!< in: index */ + const dict_index_t* index) /*!< in: index */ + __attribute__((nonnull, warn_unused_result)); /********************************************************************//** Reserves the dictionary system mutex for MySQL. */ UNIV_INTERN @@ -1233,8 +1418,9 @@ void dict_table_stats_lock( /*==================*/ const dict_table_t* table, /*!< in: table */ - ulint latch_mode); /*!< in: RW_S_LATCH or + ulint latch_mode) /*!< in: RW_S_LATCH or RW_X_LATCH */ + __attribute__((nonnull)); /**********************************************************************//** Unlock the latch that has been locked by dict_table_stats_lock() */ UNIV_INTERN @@ -1242,8 +1428,9 @@ void dict_table_stats_unlock( /*====================*/ const dict_table_t* table, /*!< in: table */ - ulint latch_mode); /*!< in: RW_S_LATCH or + ulint latch_mode) /*!< in: RW_S_LATCH or RW_X_LATCH */ + __attribute__((nonnull)); /********************************************************************//** Checks if the database name in two table names is the same. 
@return TRUE if same db name */ @@ -1253,8 +1440,9 @@ dict_tables_have_same_db( /*=====================*/ const char* name1, /*!< in: table name in the form dbname '/' tablename */ - const char* name2); /*!< in: table name in the form + const char* name2) /*!< in: table name in the form dbname '/' tablename */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Removes an index from the cache */ UNIV_INTERN @@ -1262,7 +1450,8 @@ void dict_index_remove_from_cache( /*=========================*/ dict_table_t* table, /*!< in/out: table */ - dict_index_t* index); /*!< in, own: index */ + dict_index_t* index) /*!< in, own: index */ + __attribute__((nonnull)); /**********************************************************************//** Get index by name @return index, NULL if does not exist */ @@ -1271,7 +1460,8 @@ dict_index_t* dict_table_get_index_on_name( /*=========================*/ dict_table_t* table, /*!< in: table */ - const char* name); /*!< in: name of the index to find */ + const char* name) /*!< in: name of the index to find */ + __attribute__((nonnull, warn_unused_result)); /**********************************************************************//** In case there is more than one index with the same name return the index with the min(id). @@ -1281,7 +1471,8 @@ dict_index_t* dict_table_get_index_on_name_and_min_id( /*====================================*/ dict_table_t* table, /*!< in: table */ - const char* name); /*!< in: name of the index to find */ + const char* name) /*!< in: name of the index to find */ + __attribute__((nonnull, warn_unused_result)); /*************************************************************** Check whether a column exists in an FTS index. 
*/ UNIV_INLINE @@ -1291,32 +1482,42 @@ dict_table_is_fts_column( /* out: ULINT_UNDEFINED if no match else the offset within the vector */ ib_vector_t* indexes,/* in: vector containing only FTS indexes */ - ulint col_no);/* in: col number to search for */ + ulint col_no) /* in: col number to search for */ + __attribute__((nonnull, warn_unused_result)); /**********************************************************************//** Move a table to the non LRU end of the LRU list. */ UNIV_INTERN void dict_table_move_from_lru_to_non_lru( /*================================*/ - dict_table_t* table); /*!< in: table to move from LRU to non-LRU */ + dict_table_t* table) /*!< in: table to move from LRU to non-LRU */ + __attribute__((nonnull)); /**********************************************************************//** Move a table to the LRU list from the non-LRU list. */ UNIV_INTERN void dict_table_move_from_non_lru_to_lru( /*================================*/ - dict_table_t* table); /*!< in: table to move from non-LRU to LRU */ + dict_table_t* table) /*!< in: table to move from non-LRU to LRU */ + __attribute__((nonnull)); /**********************************************************************//** Move to the most recently used segment of the LRU list. */ UNIV_INTERN void dict_move_to_mru( /*=============*/ - dict_table_t* table); /*!< in: table to move to MRU */ + dict_table_t* table) /*!< in: table to move to MRU */ + __attribute__((nonnull)); + +/** Maximum number of columns in a foreign key constraint. 
Please Note MySQL +has a much lower limit on the number of columns allowed in a foreign key +constraint */ +#define MAX_NUM_FK_COLUMNS 500 + /* Buffers for storing detailed information about the latest foreign key and unique key errors */ extern FILE* dict_foreign_err_file; -extern mutex_t dict_foreign_err_mutex; /* mutex protecting the buffers */ +extern ib_mutex_t dict_foreign_err_mutex; /* mutex protecting the buffers */ /** the dictionary system */ extern dict_sys_t* dict_sys; @@ -1324,8 +1525,8 @@ extern dict_sys_t* dict_sys; extern rw_lock_t dict_operation_lock; /* Dictionary system struct */ -struct dict_sys_struct{ - mutex_t mutex; /*!< mutex protecting the data +struct dict_sys_t{ + ib_mutex_t mutex; /*!< mutex protecting the data dictionary; protects also the disk-based dictionary system tables; this mutex serializes CREATE TABLE @@ -1376,7 +1577,7 @@ dict_ind_init(void); /* This struct is used to specify the name and type that a column must have when checking a table's schema. 
*/ -struct dict_col_meta_struct { +struct dict_col_meta_t { const char* name; /* column name */ ulint mtype; /* required column main type */ ulint prtype_mask; /* required column precise type mask; @@ -1385,12 +1586,11 @@ struct dict_col_meta_struct { in the column's prtype */ ulint len; /* required column length */ }; -typedef struct dict_col_meta_struct dict_col_meta_t; /* This struct is used for checking whether a given table exists and whether it has a predefined schema (number of columns and columns names and types) */ -struct dict_table_schema_struct { +struct dict_table_schema_t { const char* table_name; /* the name of the table whose structure we are checking */ ulint n_cols; /* the number of columns the @@ -1398,8 +1598,15 @@ struct dict_table_schema_struct { dict_col_meta_t* columns; /* metadata for the columns; this array has n_cols elements */ + ulint n_foreign; /* number of foreign keys this + table has, pointing to other + tables (where this table is + FK child) */ + ulint n_referenced; /* number of foreign keys other + tables have, pointing to this + table (where this table is + parent) */ }; -typedef struct dict_table_schema_struct dict_table_schema_t; /* @} */ /*********************************************************************//** @@ -1410,7 +1617,7 @@ The caller must own the dictionary mutex. dict_table_schema_check() @{ @return DB_SUCCESS if the table exists and contains the necessary columns */ UNIV_INTERN -enum db_err +dberr_t dict_table_schema_check( /*====================*/ dict_table_schema_t* req_schema, /*!< in/out: required table @@ -1419,9 +1626,27 @@ dict_table_schema_check( message if != DB_SUCCESS and != DB_TABLE_NOT_FOUND is returned */ - size_t errstr_sz); /*!< in: errstr size */ + size_t errstr_sz) /*!< in: errstr size */ + __attribute__((nonnull, warn_unused_result)); /* @} */ +/*********************************************************************//** +Converts a database and table name from filesystem encoding +(e.g. 
d@i1b/a@q1b@1Kc, same format as used in dict_table_t::name) in two +strings in UTF8 encoding (e.g. dцb and aюbØc). The output buffers must be +at least MAX_DB_UTF8_LEN and MAX_TABLE_UTF8_LEN bytes. */ +UNIV_INTERN +void +dict_fs2utf8( +/*=========*/ + const char* db_and_table, /*!< in: database and table names, + e.g. d@i1b/a@q1b@1Kc */ + char* db_utf8, /*!< out: database name, e.g. dцb */ + size_t db_utf8_size, /*!< in: dbname_utf8 size */ + char* table_utf8, /*!< out: table name, e.g. aюbØc */ + size_t table_utf8_size)/*!< in: table_utf8 size */ + __attribute__((nonnull)); + /**********************************************************************//** Closes the data dictionary module. */ UNIV_INTERN @@ -1437,7 +1662,7 @@ ulint dict_table_is_corrupted( /*====================*/ const dict_table_t* table) /*!< in: table */ - __attribute__((nonnull, pure, warn_unused_result)); + __attribute__((nonnull, warn_unused_result)); /**********************************************************************//** Check whether the index is corrupted. 
@@ -1447,7 +1672,7 @@ ulint dict_index_is_corrupted( /*====================*/ const dict_index_t* index) /*!< in: index */ - __attribute__((nonnull, pure, warn_unused_result)); + __attribute__((nonnull, warn_unused_result)); #endif /* !UNIV_HOTBACKUP */ /**********************************************************************//** @@ -1457,7 +1682,9 @@ UNIV_INTERN void dict_set_corrupted( /*===============*/ - dict_index_t* index) /*!< in/out: index */ + dict_index_t* index, /*!< in/out: index */ + trx_t* trx, /*!< in/out: transaction */ + const char* ctx) /*!< in: context */ UNIV_COLD __attribute__((nonnull)); /**********************************************************************//** @@ -1469,7 +1696,8 @@ void dict_set_corrupted_index_cache_only( /*================================*/ dict_index_t* index, /*!< in/out: index */ - dict_table_t* table); /*!< in/out: table */ + dict_table_t* table) /*!< in/out: table */ + __attribute__((nonnull)); /**********************************************************************//** Flags a table with specified space_id corrupted in the table dictionary @@ -1481,6 +1709,76 @@ dict_set_corrupted_by_space( /*========================*/ ulint space_id); /*!< in: space ID */ +/********************************************************************//** +Validate the table flags. +@return true if valid. */ +UNIV_INLINE +bool +dict_tf_is_valid( +/*=============*/ + ulint flags) /*!< in: table flags */ + __attribute__((warn_unused_result)); + +/********************************************************************//** +Check if the tablespace for the table has been discarded. +@return true if the tablespace has been discarded. */ +UNIV_INLINE +bool +dict_table_is_discarded( +/*====================*/ + const dict_table_t* table) /*!< in: table to check */ + __attribute__((nonnull, pure, warn_unused_result)); + +/********************************************************************//** +Check if it is a temporary table. 
+@return true if temporary table flag is set. */ +UNIV_INLINE +bool +dict_table_is_temporary( +/*====================*/ + const dict_table_t* table) /*!< in: table to check */ + __attribute__((nonnull, pure, warn_unused_result)); + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +This function should be called whenever a page is successfully +compressed. Updates the compression padding information. */ +UNIV_INTERN +void +dict_index_zip_success( +/*===================*/ + dict_index_t* index) /*!< in/out: index to be updated. */ + __attribute__((nonnull)); +/*********************************************************************//** +This function should be called whenever a page compression attempt +fails. Updates the compression padding information. */ +UNIV_INTERN +void +dict_index_zip_failure( +/*===================*/ + dict_index_t* index) /*!< in/out: index to be updated. */ + __attribute__((nonnull)); +/*********************************************************************//** +Return the optimal page size, for which page will likely compress. +@return page size beyond which page may not compress*/ +UNIV_INTERN +ulint +dict_index_zip_pad_optimal_page_size( +/*=================================*/ + dict_index_t* index) /*!< in: index for which page size + is requested */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************//** +Convert table flag to row format string. 
+@return row format name */ +UNIV_INTERN +const char* +dict_tf_to_row_format_string( +/*=========================*/ + ulint table_flag); /*!< in: row format setting */ + +#endif /* !UNIV_HOTBACKUP */ + #ifndef UNIV_NONINL #include "dict0dict.ic" #endif diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic index f6585ea8205..83953c9325a 100644 --- a/storage/innobase/include/dict0dict.ic +++ b/storage/innobase/include/dict0dict.ic @@ -29,6 +29,7 @@ Created 1/8/1996 Heikki Tuuri #include "rem0types.h" #include "fsp0fsp.h" #include "srv0srv.h" +#include "sync0rw.h" /* RW_S_LATCH */ /*********************************************************************//** Gets the minimum number of bytes per character. @@ -223,6 +224,22 @@ dict_table_get_first_index( } /********************************************************************//** +Gets the last index on the table. +@return index, NULL if none exists */ +UNIV_INLINE +dict_index_t* +dict_table_get_last_index( +/*=======================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + return(UT_LIST_GET_LAST((const_cast<dict_table_t*>(table)) + ->indexes)); +} + +/********************************************************************//** Gets the next index on the table. @return index, NULL if none left */ UNIV_INLINE @@ -365,6 +382,56 @@ dict_table_get_n_cols( return(table->n_cols); } +/********************************************************************//** +Gets the approximately estimated number of rows in the table. +@return estimated number of rows */ +UNIV_INLINE +ib_uint64_t +dict_table_get_n_rows( +/*==================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table->stat_initialized); + + return(table->stat_n_rows); +} + +/********************************************************************//** +Increment the number of rows in the table by one. 
+Notice that this operation is not protected by any latch, the number is +approximate. */ +UNIV_INLINE +void +dict_table_n_rows_inc( +/*==================*/ + dict_table_t* table) /*!< in/out: table */ +{ + if (table->stat_initialized) { + ib_uint64_t n_rows = table->stat_n_rows; + if (n_rows < 0xFFFFFFFFFFFFFFFFULL) { + table->stat_n_rows = n_rows + 1; + } + } +} + +/********************************************************************//** +Decrement the number of rows in the table by one. +Notice that this operation is not protected by any latch, the number is +approximate. */ +UNIV_INLINE +void +dict_table_n_rows_dec( +/*==================*/ + dict_table_t* table) /*!< in/out: table */ +{ + if (table->stat_initialized) { + ib_uint64_t n_rows = table->stat_n_rows; + if (n_rows > 0) { + table->stat_n_rows = n_rows - 1; + } + } +} + #ifdef UNIV_DEBUG /********************************************************************//** Gets the nth column of a table. @@ -458,12 +525,11 @@ dict_table_has_fts_index( } /********************************************************************//** -Validate and return the table flags. -@return Same as input after validating it as dict_table_t::flags. -If there is an error, trigger assertion failure. */ +Validate the table flags. +@return true if valid. */ UNIV_INLINE -ulint -dict_tf_validate( +bool +dict_tf_is_valid( /*=============*/ ulint flags) /*!< in: table flags */ { @@ -473,31 +539,43 @@ dict_tf_validate( ulint unused = DICT_TF_GET_UNUSED(flags); /* Make sure there are no bits that we do not know about. */ - ut_a(unused == 0); + if (unused != 0) { - if (atomic_blobs) { + return(false); + + } else if (atomic_blobs) { /* Barracuda row formats COMPRESSED and DYNAMIC build on the page structure introduced for the COMPACT row format by allowing keys in secondary indexes to be made from data stored off-page in the clustered index. 
*/ - ut_a(compact); - } else { + + if (!compact) { + return(false); + } + + } else if (zip_ssize) { + /* Antelope does not support COMPRESSED row format. */ - ut_a(!zip_ssize); + return(false); } if (zip_ssize) { + /* COMPRESSED row format must have compact and atomic_blobs - bits set. */ - ut_a(compact); - ut_a(atomic_blobs); + bits set and validate the number is within allowed range. */ - /* Validate the number is within allowed range. */ - ut_a(zip_ssize <= PAGE_ZIP_SSIZE_MAX); + if (!compact + || !atomic_blobs + || zip_ssize > PAGE_ZIP_SSIZE_MAX) { + + return(false); + } } - /* Return the flags sent if we did not crash. */ - return(flags); + /* CREATE TABLE ... DATA DIRECTORY is supported for any row format, + so the DATA_DIR flag is compatible with all other table flags. */ + + return(true); } /********************************************************************//** @@ -517,9 +595,7 @@ dict_sys_tables_type_validate( ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(type); ulint unused = DICT_TF_GET_UNUSED(type); - /* If the format is UNIV_FORMAT_A, table->flags == 0, but - SYS_TABLES.TYPE == 1, which is defined as SYS_TABLE_TYPE_ANTELOPE. - The low order bit of SYS_TABLES.TYPE is always set to 1. + /* The low order bit of SYS_TABLES.TYPE is always set to 1. If the format is UNIV_FORMAT_B or higher, this field is the same as dict_table_t::flags. Zero is not allowed here. */ if (!low_order_bit) { @@ -527,12 +603,9 @@ dict_sys_tables_type_validate( } if (redundant) { - /* This is Redundant row format, only the first bit - should be set in SYS_TABLES.TYPE */ - if (type != SYS_TABLE_TYPE_ANTELOPE) { + if (zip_ssize || atomic_blobs) { return(ULINT_UNDEFINED); } - return(DICT_TF_REDUNDANT); } /* Make sure there are no bits that we do not know about. */ @@ -569,6 +642,11 @@ dict_sys_tables_type_validate( } } + /* There is nothing to validate for the data_dir field. + CREATE TABLE ... 
DATA DIRECTORY is supported for any row + format, so the DATA_DIR flag is compatible with any other + table flags. However, it is not used with TEMPORARY tables.*/ + /* Return the validated SYS_TABLES.TYPE. */ return(type); } @@ -584,7 +662,7 @@ dict_tf_get_rec_format( /*===================*/ ulint flags) /*!< in: dict_table_t::flags */ { - dict_tf_validate(flags); + ut_a(dict_tf_is_valid(flags)); if (!DICT_TF_GET_COMPACT(flags)) { return(REC_FORMAT_REDUNDANT); @@ -640,7 +718,8 @@ dict_tf_set( /*========*/ ulint* flags, /*!< in/out: table flags */ rec_format_t format, /*!< in: file format */ - ulint zip_ssize) /*!< in: zip shift size */ + ulint zip_ssize, /*!< in: zip shift size */ + bool use_data_dir) /*!< in: table uses DATA DIRECTORY */ { switch (format) { case REC_FORMAT_REDUNDANT: @@ -662,6 +741,10 @@ dict_tf_set( ut_ad(zip_ssize == 0); break; } + + if (use_data_dir) { + *flags |= (1 << DICT_TF_POS_DATA_DIR); + } } /********************************************************************//** @@ -679,15 +762,61 @@ UNIV_INLINE ulint dict_tf_to_fsp_flags( /*=================*/ - ulint flags) /*!< in: dict_table_t::flags */ + ulint table_flags) /*!< in: dict_table_t::flags */ { + ulint fsp_flags; + + DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure", + return(ULINT_UNDEFINED);); + /* Adjust bit zero. */ - flags = (flags == DICT_TF_COMPACT) ? 0 : flags; + fsp_flags = DICT_TF_HAS_ATOMIC_BLOBS(table_flags) ? 1 : 0; + + /* ZIP_SSIZE and ATOMIC_BLOBS are at the same position. */ + fsp_flags |= table_flags & DICT_TF_MASK_ZIP_SSIZE; + fsp_flags |= table_flags & DICT_TF_MASK_ATOMIC_BLOBS; /* In addition, tablespace flags also contain the page size. */ + fsp_flags |= fsp_flags_set_page_size(fsp_flags, UNIV_PAGE_SIZE); + + /* The DATA_DIR flag is in a different position in fsp_flags */ + fsp_flags |= DICT_TF_HAS_DATA_DIR(table_flags) + ? 
FSP_FLAGS_MASK_DATA_DIR : 0; - return(fsp_flags_validate(flags)); + ut_a(fsp_flags_is_valid(fsp_flags)); + + return(fsp_flags); +} + +/********************************************************************//** +Convert a 32 bit integer from SYS_TABLES.TYPE to dict_table_t::flags +The following chart shows the translation of the low order bit. +Other bits are the same. +========================= Low order bit ========================== + | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC +SYS_TABLES.TYPE | 1 | 1 | 1 +dict_table_t::flags | 0 | 1 | 1 +================================================================== +@return ulint containing dict_table_t::flags */ +UNIV_INLINE +ulint +dict_sys_tables_type_to_tf( +/*=======================*/ + ulint type, /*!< in: SYS_TABLES.TYPE field */ + ulint n_cols) /*!< in: SYS_TABLES.N_COLS field */ +{ + ulint flags; + ulint redundant = !(n_cols & DICT_N_COLS_COMPACT); + + /* Adjust bit zero. */ + flags = redundant ? 0 : 1; + + /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */ + flags |= type & (DICT_TF_MASK_ZIP_SSIZE + | DICT_TF_MASK_ATOMIC_BLOBS + | DICT_TF_MASK_DATA_DIR); + + return(flags); } /********************************************************************//** @@ -706,13 +835,19 @@ dict_tf_to_sys_tables_type( /*=======================*/ ulint flags) /*!< in: dict_table_t::flags */ { - if (!DICT_TF_HAS_ATOMIC_BLOBS(flags)) { - ut_a(flags == DICT_TF_REDUNDANT - || flags == DICT_TF_COMPACT); - return(SYS_TABLE_TYPE_ANTELOPE); - } + ulint type; + + ut_a(dict_tf_is_valid(flags)); + + /* Adjust bit zero. It is always 1 in SYS_TABLES.TYPE */ + type = 1; + + /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. 
*/ + type |= flags & (DICT_TF_MASK_ZIP_SSIZE + | DICT_TF_MASK_ATOMIC_BLOBS + | DICT_TF_MASK_DATA_DIR); - return(dict_tf_validate(flags)); + return(type); } /********************************************************************//** @@ -1064,6 +1199,103 @@ dict_index_get_space_reserve(void) return(UNIV_PAGE_SIZE / 16); } +/********************************************************************//** +Gets the status of online index creation. +@return the status */ +UNIV_INLINE +enum online_index_status +dict_index_get_online_status( +/*=========================*/ + const dict_index_t* index) /*!< in: secondary index */ +{ + enum online_index_status status; + + status = (enum online_index_status) index->online_status; + + /* Without the index->lock protection, the online + status can change from ONLINE_INDEX_CREATION to + ONLINE_INDEX_COMPLETE (or ONLINE_INDEX_ABORTED) in + row_log_apply() once log application is done. So to make + sure the status is ONLINE_INDEX_CREATION or ONLINE_INDEX_COMPLETE + you should always do the recheck after acquiring index->lock */ + +#ifdef UNIV_DEBUG + switch (status) { + case ONLINE_INDEX_COMPLETE: + case ONLINE_INDEX_CREATION: + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + return(status); + } + ut_error; +#endif /* UNIV_DEBUG */ + return(status); +} + +/********************************************************************//** +Sets the status of online index creation. 
*/ +UNIV_INLINE +void +dict_index_set_online_status( +/*=========================*/ + dict_index_t* index, /*!< in/out: index */ + enum online_index_status status) /*!< in: status */ +{ + ut_ad(!(index->type & DICT_FTS)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ +#ifdef UNIV_DEBUG + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_COMPLETE: + case ONLINE_INDEX_CREATION: + break; + case ONLINE_INDEX_ABORTED: + ut_ad(status == ONLINE_INDEX_ABORTED_DROPPED); + break; + case ONLINE_INDEX_ABORTED_DROPPED: + ut_error; + } +#endif /* UNIV_DEBUG */ + + index->online_status = status; + ut_ad(dict_index_get_online_status(index) == status); +} + +/********************************************************************//** +Determines if a secondary index is being or has been created online, +or if the table is being rebuilt online, allowing concurrent modifications +to the table. +@retval true if the index is being or has been built online, or +if this is a clustered index and the table is being or has been rebuilt online +@retval false if the index has been created or the table has been +rebuilt completely */ +UNIV_INLINE +bool +dict_index_is_online_ddl( +/*=====================*/ + const dict_index_t* index) /*!< in: index */ +{ +#ifdef UNIV_DEBUG + if (dict_index_is_clust(index)) { + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_CREATION: + return(true); + case ONLINE_INDEX_COMPLETE: + return(false); + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + break; + } + ut_ad(0); + return(false); + } +#endif /* UNIV_DEBUG */ + + return(UNIV_UNLIKELY(dict_index_get_online_status(index) + != ONLINE_INDEX_COMPLETE)); +} + /**********************************************************************//** Check whether a column exists in an FTS index. 
@return ULINT_UNDEFINED if no match else the offset within the vector */ @@ -1147,4 +1379,28 @@ dict_index_is_corrupted( || (index->table && index->table->corrupted)); } +/********************************************************************//** +Check if the tablespace for the table has been discarded. +@return true if the tablespace has been discarded. */ +UNIV_INLINE +bool +dict_table_is_discarded( +/*====================*/ + const dict_table_t* table) /*!< in: table to check */ +{ + return(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_DISCARDED)); +} + +/********************************************************************//** +Check if it is a temporary table. +@return true if temporary table flag is set. */ +UNIV_INLINE +bool +dict_table_is_temporary( +/*====================*/ + const dict_table_t* table) /*!< in: table to check */ +{ + return(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_TEMPORARY)); +} + #endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h index 13b9a121c1c..5991d58a686 100644 --- a/storage/innobase/include/dict0load.h +++ b/storage/innobase/include/dict0load.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -29,38 +29,35 @@ Created 4/24/1996 Heikki Tuuri #include "univ.i" #include "dict0types.h" +#include "trx0types.h" #include "ut0byte.h" #include "mem0mem.h" #include "btr0types.h" -/** enum that defines all 6 system table IDs */ -enum dict_system_table_id { +/** enum that defines all system table IDs. 
@see SYSTEM_TABLE_NAME[] */ +enum dict_system_id_t { SYS_TABLES = 0, SYS_INDEXES, SYS_COLUMNS, SYS_FIELDS, SYS_FOREIGN, SYS_FOREIGN_COLS, + SYS_TABLESPACES, + SYS_DATAFILES, /* This must be last item. Defines the number of system tables. */ SYS_NUM_SYSTEM_TABLES }; -typedef enum dict_system_table_id dict_system_id_t; - /** Status bit for dict_process_sys_tables_rec_and_mtr_commit() */ -enum dict_table_info { +enum dict_table_info_t { DICT_TABLE_LOAD_FROM_RECORD = 0,/*!< Directly populate a dict_table_t structure with information from a SYS_TABLES record */ - DICT_TABLE_LOAD_FROM_CACHE = 1, /*!< Check first whether dict_table_t + DICT_TABLE_LOAD_FROM_CACHE = 1 /*!< Check first whether dict_table_t is in the cache, if so, return it */ - DICT_TABLE_UPDATE_STATS = 2 /*!< whether to update statistics - when loading SYS_TABLES information. */ }; -typedef enum dict_table_info dict_table_info_t; - /********************************************************************//** In a crash recovery we already have all the tablespace objects created. This function compares the space id information in the InnoDB data dictionary @@ -157,6 +154,27 @@ dict_load_field_low( for temporary storage */ const rec_t* rec); /*!< in: SYS_FIELDS record */ /********************************************************************//** +Using the table->heap, copy the null-terminated filepath into +table->data_dir_path and put a null byte before the extension. +This allows SHOW CREATE TABLE to return the correct DATA DIRECTORY path. +Make this data directory path only if it has not yet been saved. */ +UNIV_INTERN +void +dict_save_data_dir_path( +/*====================*/ + dict_table_t* table, /*!< in/out: table */ + char* filepath); /*!< in: filepath of tablespace */ +/*****************************************************************//** +Make sure the data_file_name is saved in dict_table_t if needed. Try to +read it from the file dictionary first, then from SYS_DATAFILES. 
*/ +UNIV_INTERN +void +dict_get_and_save_data_dir_path( +/*============================*/ + dict_table_t* table, /*!< in/out: table */ + bool dict_mutex_own); /*!< in: true if dict_sys->mutex + is owned already */ +/********************************************************************//** Loads a table definition and also all its index definitions, and also the cluster definition if the table is a member in a cluster. Also loads all foreign key constraints where the foreign key is in the table or where @@ -199,14 +217,15 @@ cache already contains all constraints where the other relevant table is already in the dictionary cache. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t dict_load_foreigns( /*===============*/ const char* table_name, /*!< in: table name */ ibool check_recursive,/*!< in: Whether to check recursive load of tables chained by FK */ - ibool check_charsets);/*!< in: TRUE=check charsets + ibool check_charsets) /*!< in: TRUE=check charsets compatibility */ + __attribute__((nonnull, warn_unused_result)); /********************************************************************//** Prints to the standard output information on all tables found in the data dictionary system table. */ @@ -324,6 +343,66 @@ dict_process_sys_foreign_col_rec( const char** ref_col_name, /*!< out: referenced column name in referenced table */ ulint* pos); /*!< out: column position */ +/********************************************************************//** +This function parses a SYS_TABLESPACES record, extracts necessary +information from the record and returns to caller. 
+@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_process_sys_tablespaces( +/*=========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_TABLESPACES rec */ + ulint* space, /*!< out: pace id */ + const char** name, /*!< out: tablespace name */ + ulint* flags); /*!< out: tablespace flags */ +/********************************************************************//** +This function parses a SYS_DATAFILES record, extracts necessary +information from the record and returns to caller. +@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_process_sys_datafiles( +/*=======================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_DATAFILES rec */ + ulint* space, /*!< out: pace id */ + const char** path); /*!< out: datafile path */ +/********************************************************************//** +Get the filepath for a spaceid from SYS_DATAFILES. This function provides +a temporary heap which is used for the table lookup, but not for the path. +The caller must free the memory for the path returned. This function can +return NULL if the space ID is not found in SYS_DATAFILES, then the caller +will assume that the ibd file is in the normal datadir. +@return own: A copy of the first datafile found in SYS_DATAFILES.PATH for +the given space ID. NULL if space ID is zero or not found. */ +UNIV_INTERN +char* +dict_get_first_path( +/*================*/ + ulint space, /*!< in: space id */ + const char* name); /*!< in: tablespace name */ +/********************************************************************//** +Update the record for space_id in SYS_TABLESPACES to this filepath. 
+@return DB_SUCCESS if OK, dberr_t if the insert failed */ +UNIV_INTERN +dberr_t +dict_update_filepath( +/*=================*/ + ulint space_id, /*!< in: space id */ + const char* filepath); /*!< in: filepath */ +/********************************************************************//** +Insert records into SYS_TABLESPACES and SYS_DATAFILES. +@return DB_SUCCESS if OK, dberr_t if the insert failed */ +UNIV_INTERN +dberr_t +dict_insert_tablespace_and_filepath( +/*================================*/ + ulint space, /*!< in: space id */ + const char* name, /*!< in: talespace name */ + const char* filepath, /*!< in: filepath */ + ulint fsp_flags); /*!< in: tablespace flags */ + #ifndef UNIV_NONINL #include "dict0load.ic" #endif diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index ea7e996dfa8..671f67eb1f8 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -30,6 +31,7 @@ Created 1/8/1996 Heikki Tuuri #include "dict0types.h" #include "data0type.h" #include "mem0mem.h" +#include "row0types.h" #include "rem0types.h" #include "btr0types.h" #ifndef UNIV_HOTBACKUP @@ -46,7 +48,7 @@ Created 1/8/1996 Heikki Tuuri #include "fts0fts.h" /* Forward declaration. */ -typedef struct ib_rbt_struct ib_rbt_t; +struct ib_rbt_t; /** Type flags of an index: OR'ing of the flags is allowed to define a combination of types */ @@ -93,12 +95,9 @@ and SYS_TABLES.TYPE. Similar flags found in fil_space_t and FSP_SPACE_FLAGS are described in fsp0fsp.h. 
*/ /* @{ */ -/** SYS_TABLES.TYPE can be equal to 1 which means that the Row format -is one of two Antelope row formats, Redundant or Compact. */ -#define SYS_TABLE_TYPE_ANTELOPE 1 -/** dict_table_t::flags can be equal to 0 if the row format = Redundant */ +/** dict_table_t::flags bit 0 is equal to 0 if the row format = Redundant */ #define DICT_TF_REDUNDANT 0 /*!< Redundant row format. */ -/** dict_table_t::flags can be equal to 1 if the row format = Compact */ +/** dict_table_t::flags bit 0 is equal to 1 if the row format = Compact */ #define DICT_TF_COMPACT 1 /*!< Compact row format. */ /** This bitmask is used in SYS_TABLES.N_COLS to set and test whether @@ -115,10 +114,17 @@ Brracuda row formats store the whole blob or text field off-page atomically. Secondary indexes are created from this external data using row_ext_t to cache the BLOB prefixes. */ #define DICT_TF_WIDTH_ATOMIC_BLOBS 1 +/** If a table is created with the MYSQL option DATA DIRECTORY and +innodb-file-per-table, an older engine will not be able to find that table. +This flag prevents older engines from attempting to open the table and +allows InnoDB to update_create_info() accordingly. */ +#define DICT_TF_WIDTH_DATA_DIR 1 + /** Width of all the currently known table flags */ #define DICT_TF_BITS (DICT_TF_WIDTH_COMPACT \ + DICT_TF_WIDTH_ZIP_SSIZE \ - + DICT_TF_WIDTH_ATOMIC_BLOBS) + + DICT_TF_WIDTH_ATOMIC_BLOBS \ + + DICT_TF_WIDTH_DATA_DIR) /** A mask of all the known/used bits in table flags */ #define DICT_TF_BIT_MASK (~(~0 << DICT_TF_BITS)) @@ -131,9 +137,12 @@ to cache the BLOB prefixes. 
*/ /** Zero relative shift position of the ATOMIC_BLOBS field */ #define DICT_TF_POS_ATOMIC_BLOBS (DICT_TF_POS_ZIP_SSIZE \ + DICT_TF_WIDTH_ZIP_SSIZE) -/** Zero relative shift position of the start of the UNUSED bits */ -#define DICT_TF_POS_UNUSED (DICT_TF_POS_ATOMIC_BLOBS \ +/** Zero relative shift position of the DATA_DIR field */ +#define DICT_TF_POS_DATA_DIR (DICT_TF_POS_ATOMIC_BLOBS \ + DICT_TF_WIDTH_ATOMIC_BLOBS) +/** Zero relative shift position of the start of the UNUSED bits */ +#define DICT_TF_POS_UNUSED (DICT_TF_POS_DATA_DIR \ + + DICT_TF_WIDTH_DATA_DIR) /** Bit mask of the COMPACT field */ #define DICT_TF_MASK_COMPACT \ @@ -147,6 +156,10 @@ to cache the BLOB prefixes. */ #define DICT_TF_MASK_ATOMIC_BLOBS \ ((~(~0 << DICT_TF_WIDTH_ATOMIC_BLOBS)) \ << DICT_TF_POS_ATOMIC_BLOBS) +/** Bit mask of the DATA_DIR field */ +#define DICT_TF_MASK_DATA_DIR \ + ((~(~0 << DICT_TF_WIDTH_DATA_DIR)) \ + << DICT_TF_POS_DATA_DIR) /** Return the value of the COMPACT field */ #define DICT_TF_GET_COMPACT(flags) \ @@ -160,6 +173,10 @@ to cache the BLOB prefixes. */ #define DICT_TF_HAS_ATOMIC_BLOBS(flags) \ ((flags & DICT_TF_MASK_ATOMIC_BLOBS) \ >> DICT_TF_POS_ATOMIC_BLOBS) +/** Return the value of the ATOMIC_BLOBS field */ +#define DICT_TF_HAS_DATA_DIR(flags) \ + ((flags & DICT_TF_MASK_DATA_DIR) \ + >> DICT_TF_POS_DATA_DIR) /** Return the contents of the UNUSED bits */ #define DICT_TF_GET_UNUSED(flags) \ (flags >> DICT_TF_POS_UNUSED) @@ -174,7 +191,7 @@ ROW_FORMAT=REDUNDANT. InnoDB engines do not check these flags for unknown bits in order to protect backward incompatibility. */ /* @{ */ /** Total number of bits in table->flags2. */ -#define DICT_TF2_BITS 5 +#define DICT_TF2_BITS 6 #define DICT_TF2_BIT_MASK ~(~0 << DICT_TF2_BITS) /** TEMPORARY; TRUE for tables from CREATE TEMPORARY TABLE. */ @@ -189,6 +206,9 @@ This is a transient bit for index build */ /** This bit is used during table creation to indicate that it will use its own tablespace instead of the system tablespace. 
*/ #define DICT_TF2_USE_TABLESPACE 16 + +/** Set when we discard/detach the tablespace */ +#define DICT_TF2_DISCARDED 32 /* @} */ #define DICT_TF2_FLAG_SET(table, flag) \ @@ -225,9 +245,7 @@ dict_mem_table_create( /*==================*/ const char* name, /*!< in: table name */ ulint space, /*!< in: space where the clustered index - of the table is placed; this parameter - is ignored if the table is made - a member of a cluster */ + of the table is placed */ ulint n_cols, /*!< in: number of columns */ ulint flags, /*!< in: table flags */ ulint flags2); /*!< in: table flags2 */ @@ -249,7 +267,19 @@ dict_mem_table_add_col( const char* name, /*!< in: column name, or NULL */ ulint mtype, /*!< in: main datatype */ ulint prtype, /*!< in: precise type */ - ulint len); /*!< in: precision */ + ulint len) /*!< in: precision */ + __attribute__((nonnull(1))); +/**********************************************************************//** +Renames a column of a table in the data dictionary cache. */ +UNIV_INTERN +void +dict_mem_table_col_rename( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + unsigned nth_col,/*!< in: column index */ + const char* from, /*!< in: old column name */ + const char* to) /*!< in: new column name */ + __attribute__((nonnull)); /**********************************************************************//** This function populates a dict_col_t memory structure with supplied information. */ @@ -347,8 +377,19 @@ dict_mem_referenced_table_name_lookup_set( dict_foreign_t* foreign, /*!< in/out: foreign struct */ ibool do_alloc); /*!< in: is an alloc needed */ +/*******************************************************************//** +Create a temporary tablename. 
+@return temporary tablename suitable for InnoDB use */ +UNIV_INTERN __attribute__((nonnull, warn_unused_result)) +char* +dict_mem_create_temporary_tablename( +/*================================*/ + mem_heap_t* heap, /*!< in: memory heap */ + const char* dbtab, /*!< in: database/table name */ + table_id_t id); /*!< in: InnoDB table id */ + /** Data structure for a column in a table */ -struct dict_col_struct{ +struct dict_col_t{ /*----------------------*/ /** The following are copied from dtype_t, so that all bit-fields can be packed tightly. */ @@ -424,7 +465,7 @@ be REC_VERSION_56_MAX_INDEX_COL_LEN (3072) bytes */ #define DICT_MAX_FIXED_COL_LEN DICT_ANTELOPE_MAX_INDEX_COL_LEN /** Data structure for a field in an index */ -struct dict_field_struct{ +struct dict_field_t{ dict_col_t* col; /*!< pointer to the table column */ const char* name; /*!< name of the column */ unsigned prefix_len:12; /*!< 0 or the length of the column @@ -440,9 +481,61 @@ struct dict_field_struct{ DICT_ANTELOPE_MAX_INDEX_COL_LEN */ }; +/**********************************************************************//** +PADDING HEURISTIC BASED ON LINEAR INCREASE OF PADDING TO AVOID +COMPRESSION FAILURES +(Note: this is relevant only for compressed indexes) +GOAL: Avoid compression failures by maintaining information about the +compressibility of data. If data is not very compressible then leave +some extra space 'padding' in the uncompressed page making it more +likely that compression of less than fully packed uncompressed page will +succeed. + +This padding heuristic works by increasing the pad linearly until the +desired failure rate is reached. A "round" is a fixed number of +compression operations. +After each round, the compression failure rate for that round is +computed. If the failure rate is too high, then padding is incremented +by a fixed value, otherwise it's left intact. 
+If the compression failure is lower than the desired rate for a fixed +number of consecutive rounds, then the padding is decreased by a fixed +value. This is done to prevent overshooting the padding value, +and to accommodate the possible change in data compressibility. */ + +/** Number of zip ops in one round. */ +#define ZIP_PAD_ROUND_LEN (128) + +/** Number of successful rounds after which the padding is decreased */ +#define ZIP_PAD_SUCCESSFUL_ROUND_LIMIT (5) + +/** Amount by which padding is increased. */ +#define ZIP_PAD_INCR (128) + +/** Percentage of compression failures that are allowed in a single +round */ +extern ulong zip_failure_threshold_pct; + +/** Maximum percentage of a page that can be allowed as a pad to avoid +compression failures */ +extern ulong zip_pad_max; + +/** Data structure to hold information about about how much space in +an uncompressed page should be left as padding to avoid compression +failures. This estimate is based on a self-adapting heuristic. */ +struct zip_pad_info_t { + os_fast_mutex_t mutex; /*!< mutex protecting the info */ + ulint pad; /*!< number of bytes used as pad */ + ulint success;/*!< successful compression ops during + current round */ + ulint failure;/*!< failed compression ops during + current round */ + ulint n_rounds;/*!< number of currently successful + rounds */ +}; + /** Data structure for an index. Most fields will be initialized to 0, NULL or FALSE in dict_mem_index_create(). */ -struct dict_index_struct{ +struct dict_index_t{ index_id_t id; /*!< id of the index */ mem_heap_t* heap; /*!< memory heap */ const char* name; /*!< index name */ @@ -478,24 +571,35 @@ struct dict_index_struct{ unsigned cached:1;/*!< TRUE if the index object is in the dictionary cache */ unsigned to_be_dropped:1; - /*!< TRUE if this index is marked to be - dropped in ha_innobase::prepare_drop_index(), - otherwise FALSE. 
Protected by - dict_sys->mutex, dict_operation_lock and - index->lock.*/ + /*!< TRUE if the index is to be dropped; + protected by dict_operation_lock */ + unsigned online_status:2; + /*!< enum online_index_status. + Transitions from ONLINE_INDEX_COMPLETE (to + ONLINE_INDEX_CREATION) are protected + by dict_operation_lock and + dict_sys->mutex. Other changes are + protected by index->lock. */ dict_field_t* fields; /*!< array of field descriptions */ #ifndef UNIV_HOTBACKUP UT_LIST_NODE_T(dict_index_t) indexes;/*!< list of indexes of the table */ - btr_search_t* search_info; /*!< info used in optimistic searches */ + btr_search_t* search_info; + /*!< info used in optimistic searches */ + row_log_t* online_log; + /*!< the log of modifications + during online index creation; + valid when online_status is + ONLINE_INDEX_CREATION */ /*----------------------*/ /** Statistics for query optimization */ /* @{ */ ib_uint64_t* stat_n_diff_key_vals; /*!< approximate number of different key values for this index, for each - n-column prefix where n <= - dict_get_n_unique(index); we + n-column prefix where 1 <= n <= + dict_get_n_unique(index) (the array is + indexed from 0 to n_uniq-1); we periodically calculate new estimates */ ib_uint64_t* stat_n_sample_sizes; @@ -506,7 +610,8 @@ struct dict_index_struct{ ib_uint64_t* stat_n_non_null_key_vals; /* approximate number of non-null key values for this index, for each column where - n < dict_get_n_unique(index); This + 1 <= n <= dict_get_n_unique(index) (the array + is indexed from 0 to n_uniq-1); This is used when innodb_stats_method is "nulls_ignored". 
*/ ulint stat_index_size; @@ -521,9 +626,11 @@ struct dict_index_struct{ trx_id_t trx_id; /*!< id of the transaction that created this index, or 0 if the index existed when InnoDB was started up */ + zip_pad_info_t zip_pad;/*!< Information about state of + compression failures and successes */ #endif /* !UNIV_HOTBACKUP */ #ifdef UNIV_BLOB_DEBUG - mutex_t blobs_mutex; + ib_mutex_t blobs_mutex; /*!< mutex protecting blobs */ ib_rbt_t* blobs; /*!< map of (page_no,heap_no,field_no) to first_blob_page_no; protected by @@ -531,15 +638,35 @@ struct dict_index_struct{ #endif /* UNIV_BLOB_DEBUG */ #ifdef UNIV_DEBUG ulint magic_n;/*!< magic number */ -/** Value of dict_index_struct::magic_n */ +/** Value of dict_index_t::magic_n */ # define DICT_INDEX_MAGIC_N 76789786 #endif }; +/** The status of online index creation */ +enum online_index_status { + /** the index is complete and ready for access */ + ONLINE_INDEX_COMPLETE = 0, + /** the index is being created, online + (allowing concurrent modifications) */ + ONLINE_INDEX_CREATION, + /** secondary index creation was aborted and the index + should be dropped as soon as index->table->n_ref_count reaches 0, + or online table rebuild was aborted and the clustered index + of the original table should soon be restored to + ONLINE_INDEX_COMPLETE */ + ONLINE_INDEX_ABORTED, + /** the online index creation was aborted, the index was + dropped from the data dictionary and the tablespace, and it + should be dropped from the data dictionary cache as soon as + index->table->n_ref_count reaches 0. */ + ONLINE_INDEX_ABORTED_DROPPED +}; + /** Data structure for a foreign key constraint; an example: FOREIGN KEY (A, B) REFERENCES TABLE2 (C, D). Most fields will be initialized to 0, NULL or FALSE in dict_mem_foreign_create(). 
*/ -struct dict_foreign_struct{ +struct dict_foreign_t{ mem_heap_t* heap; /*!< this object is allocated from this memory heap */ char* id; /*!< id of the constraint as a @@ -592,7 +719,7 @@ a foreign key constraint is enforced, therefore RESTRICT just means no flag */ /** Data structure for a database table. Most fields will be initialized to 0, NULL or FALSE in dict_mem_table_create(). */ -struct dict_table_struct{ +struct dict_table_t{ table_id_t id; /*!< id of the table */ mem_heap_t* heap; /*!< memory heap */ char* name; /*!< table name */ @@ -602,6 +729,8 @@ struct dict_table_struct{ innodb_file_per_table is defined in my.cnf; in Unix this is usually /tmp/..., in Windows temp\... */ + char* data_dir_path; /*!< NULL or the directory path + specified by DATA DIRECTORY */ unsigned space:32; /*!< space where the clustered index of the table is placed */ @@ -612,13 +741,16 @@ struct dict_table_struct{ tablespace and the .ibd file is missing; then we must return in ha_innodb.cc an error if the user tries to query such an orphaned table */ - unsigned tablespace_discarded:1; - /*!< this flag is set TRUE when the user - calls DISCARD TABLESPACE on this - table, and reset to FALSE in IMPORT - TABLESPACE */ unsigned cached:1;/*!< TRUE if the table object has been added to the dictionary cache */ + unsigned to_be_dropped:1; + /*!< TRUE if the table is to be dropped, but + not yet actually dropped (could in the bk + drop list); It is turned on at the beginning + of row_drop_table_for_mysql() and turned off + just before we start to update system tables + for the drop. 
It is protected by + dict_operation_lock */ unsigned n_def:10;/*!< number of columns defined so far */ unsigned n_cols:10;/*!< number of columns */ unsigned can_be_evicted:1; @@ -626,6 +758,10 @@ struct dict_table_struct{ or a table that has no FK relationships */ unsigned corrupted:1; /*!< TRUE if table is corrupted */ + unsigned drop_aborted:1; + /*!< TRUE if some indexes should be dropped + after ONLINE_INDEX_ABORTED + or ONLINE_INDEX_ABORTED_DROPPED */ dict_col_t* cols; /*!< array of column descriptions */ const char* col_names; /*!< Column names packed in a character string @@ -659,6 +795,12 @@ struct dict_table_struct{ on the table: we cannot drop the table while there are foreign key checks running on it! */ + trx_id_t def_trx_id; + /*!< transaction id that last touched + the table definition, either when + loading the definition or CREATE + TABLE, or ALTER TABLE (prepare, + commit, and rollback phases) */ trx_id_t query_cache_inv_trx_id; /*!< transactions whose trx id is smaller than this number are not @@ -691,7 +833,55 @@ struct dict_table_struct{ unsigned stat_initialized:1; /*!< TRUE if statistics have been calculated the first time after database startup or table creation */ - ib_int64_t stat_n_rows; + ib_time_t stats_last_recalc; + /*!< Timestamp of last recalc of the stats */ + ib_uint32_t stat_persistent; + /*!< The two bits below are set in the + ::stat_persistent member and have the following + meaning: + 1. _ON=0, _OFF=0, no explicit persistent stats + setting for this table, the value of the global + srv_stats_persistent is used to determine + whether the table has persistent stats enabled + or not + 2. _ON=0, _OFF=1, persistent stats are + explicitly disabled for this table, regardless + of the value of the global srv_stats_persistent + 3. _ON=1, _OFF=0, persistent stats are + explicitly enabled for this table, regardless + of the value of the global srv_stats_persistent + 4. _ON=1, _OFF=1, not allowed, we assert if + this ever happens. 
*/ +#define DICT_STATS_PERSISTENT_ON (1 << 1) +#define DICT_STATS_PERSISTENT_OFF (1 << 2) + ib_uint32_t stats_auto_recalc; + /*!< The two bits below are set in the + ::stats_auto_recalc member and have + the following meaning: + 1. _ON=0, _OFF=0, no explicit auto recalc + setting for this table, the value of the global + srv_stats_persistent_auto_recalc is used to + determine whether the table has auto recalc + enabled or not + 2. _ON=0, _OFF=1, auto recalc is explicitly + disabled for this table, regardless of the + value of the global + srv_stats_persistent_auto_recalc + 3. _ON=1, _OFF=0, auto recalc is explicitly + enabled for this table, regardless of the + value of the global + srv_stats_persistent_auto_recalc + 4. _ON=1, _OFF=1, not allowed, we assert if + this ever happens. */ +#define DICT_STATS_AUTO_RECALC_ON (1 << 1) +#define DICT_STATS_AUTO_RECALC_OFF (1 << 2) + ulint stats_sample_pages; + /*!< the number of pages to sample for this + table during persistent stats estimation; + if this is 0, then the value of the global + srv_stats_persistent_sample_pages will be + used instead. */ + ib_uint64_t stat_n_rows; /*!< approximate number of rows in the table; we periodically calculate new estimates */ ulint stat_clustered_index_size; @@ -699,19 +889,34 @@ struct dict_table_struct{ database pages */ ulint stat_sum_of_other_index_sizes; /*!< other indexes in database pages */ - ulint stat_modified_counter; + ib_uint64_t stat_modified_counter; /*!< when a row is inserted, updated, or deleted, we add 1 to this number; we calculate new estimates for the stat_... 
values for the - table and the indexes at an interval of 2 GB - or when about 1 / 16 of table has been - modified; also when the estimate operation is + table and the indexes when about 1 / 16 of + table has been modified; + also when the estimate operation is called for MySQL SHOW TABLE STATUS; the counter is reset to zero at statistics calculation; this counter is not protected by any latch, because this is only used for heuristics */ +#define BG_STAT_NONE 0 +#define BG_STAT_IN_PROGRESS (1 << 0) + /*!< BG_STAT_IN_PROGRESS is set in + stats_bg_flag when the background + stats code is working on this table. The DROP + TABLE code waits for this to be cleared + before proceeding. */ +#define BG_STAT_SHOULD_QUIT (1 << 1) + /*!< BG_STAT_SHOULD_QUIT is set in + stats_bg_flag when DROP TABLE starts + waiting on BG_STAT_IN_PROGRESS to be cleared, + the background stats thread will detect this + and will eventually quit sooner */ + byte stats_bg_flag; + /*!< see BG_STAT_* above */ /* @} */ /*----------------------*/ /**!< The following fields are used by the @@ -737,7 +942,7 @@ struct dict_table_struct{ space from the lock heap of the trx: otherwise the lock heap would grow rapidly if we do a large insert from a select */ - mutex_t autoinc_mutex; + ib_mutex_t autoinc_mutex; /*!< mutex protecting the autoincrement counter */ ib_uint64_t autoinc;/*!< autoinc counter value to give to the @@ -758,6 +963,14 @@ struct dict_table_struct{ fts_t* fts; /* FTS specific state variables */ /* @} */ /*----------------------*/ + + ib_quiesce_t quiesce;/*!< Quiescing states, protected by the + dict_index_t::lock. ie. we can only change + the state if we acquire all the latches + (dict_index_t::lock) in X mode of this table's + indexes. */ + + /*----------------------*/ ulint n_rec_locks; /*!< Count of the number of record locks on this table. 
We use this to determine whether @@ -776,7 +989,7 @@ struct dict_table_struct{ #ifdef UNIV_DEBUG ulint magic_n;/*!< magic number */ -/** Value of dict_table_struct::magic_n */ +/** Value of dict_table_t::magic_n */ # define DICT_TABLE_MAGIC_N 76333786 #endif /* UNIV_DEBUG */ }; diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h index 879e67a0918..186f90e3694 100644 --- a/storage/innobase/include/dict0stats.h +++ b/storage/innobase/include/dict0stats.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2009, 2010, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2009, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -32,61 +32,128 @@ Created Jan 06, 2010 Vasil Dimov #include "dict0types.h" #include "trx0types.h" -enum dict_stats_upd_option { +enum dict_stats_upd_option_t { DICT_STATS_RECALC_PERSISTENT,/* (re) calculate the statistics using a precise and slow algo and save them to the persistent storage, if the persistent storage is not present then emit a warning and fall back to transient stats */ - DICT_STATS_RECALC_PERSISTENT_SILENT,/* same as - DICT_STATS_RECALC_PERSISTENT - but do not emit a warning */ DICT_STATS_RECALC_TRANSIENT,/* (re) calculate the statistics using an imprecise quick algo without saving the results persistently */ - DICT_STATS_FETCH, /* fetch the statistics from the - persistent storage */ - DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY /* only fetch the stats + DICT_STATS_EMPTY_TABLE, /* Write all zeros (or 1 where it makes sense) + into a table and its indexes' statistics + members. The resulting stats correspond to an + empty table. If the table is using persistent + statistics, then they are saved on disk. 
*/ + DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY /* fetch the stats from the persistent storage if the in-memory structures have not been initialized yet, otherwise do nothing */ }; -typedef enum dict_stats_upd_option dict_stats_upd_option_t; +/*********************************************************************//** +Calculates new estimates for table and index statistics. This function +is relatively quick and is used to calculate transient statistics that +are not saved on disk. +This was the only way to calculate statistics before the +Persistent Statistics feature was introduced. */ +UNIV_INTERN +void +dict_stats_update_transient( +/*========================*/ + dict_table_t* table); /*!< in/out: table */ + +/*********************************************************************//** +Set the persistent statistics flag for a given table. This is set only +in the in-memory table object and is not saved on disk. It will be read +from the .frm file upon first open from MySQL after a server restart. */ +UNIV_INLINE +void +dict_stats_set_persistent( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool ps_on, /*!< in: persistent stats explicitly enabled */ + ibool ps_off) /*!< in: persistent stats explicitly disabled */ + __attribute__((nonnull)); + +/*********************************************************************//** +Check whether persistent statistics is enabled for a given table. +@return TRUE if enabled, FALSE otherwise */ +UNIV_INLINE +ibool +dict_stats_is_persistent_enabled( +/*=============================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, warn_unused_result)); + +/*********************************************************************//** +Set the auto recalc flag for a given table (only honored for a persistent +stats enabled table). The flag is set only in the in-memory table object +and is not saved in InnoDB files. 
It will be read from the .frm file upon +first open from MySQL after a server restart. */ +UNIV_INLINE +void +dict_stats_auto_recalc_set( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool auto_recalc_on, /*!< in: explicitly enabled */ + ibool auto_recalc_off); /*!< in: explicitly disabled */ + +/*********************************************************************//** +Check whether auto recalc is enabled for a given table. +@return TRUE if enabled, FALSE otherwise */ +UNIV_INLINE +ibool +dict_stats_auto_recalc_is_enabled( +/*==============================*/ + const dict_table_t* table); /*!< in: table */ + +/*********************************************************************//** +Initialize table's stats for the first time when opening a table. */ +UNIV_INLINE +void +dict_stats_init( +/*============*/ + dict_table_t* table); /*!< in/out: table */ + +/*********************************************************************//** +Deinitialize table's stats after the last close of the table. This is +used to detect "FLUSH TABLE" and refresh the stats upon next open. */ +UNIV_INLINE +void +dict_stats_deinit( +/*==============*/ + dict_table_t* table) /*!< in/out: table */ + __attribute__((nonnull)); /*********************************************************************//** Calculates new estimates for table and index statistics. The statistics are used in query optimization. 
@return DB_* error code or DB_SUCCESS */ UNIV_INTERN -enum db_err +dberr_t dict_stats_update( /*==============*/ dict_table_t* table, /*!< in/out: table */ - dict_stats_upd_option_t stats_upd_option, + dict_stats_upd_option_t stats_upd_option); /*!< in: whether to (re) calc the stats or to fetch them from the persistent storage */ - ibool caller_has_dict_sys_mutex); - /*!< in: TRUE if the caller - owns dict_sys->mutex */ /*********************************************************************//** Removes the information for a particular index's stats from the persistent storage if it exists and if there is data stored for this index. -The transaction is not committed, it must not be committed in this -function because this is the user trx that is running DROP INDEX. -The transaction will be committed at the very end when dropping an -index. +This function creates its own trx and commits it. @return DB_SUCCESS or error code */ UNIV_INTERN -enum db_err -dict_stats_delete_index_stats( -/*==========================*/ - dict_index_t* index, /*!< in: index */ - trx_t* trx, /*!< in: transaction to use */ +dberr_t +dict_stats_drop_index( +/*==================*/ + const char* tname, /*!< in: table name */ + const char* iname, /*!< in: index name */ char* errstr, /*!< out: error message if != DB_SUCCESS is returned */ ulint errstr_sz);/*!< in: size of the errstr buffer */ @@ -97,12 +164,39 @@ persistent storage if it exists and if there is data stored for the table. This function creates its own transaction and commits it. 
@return DB_SUCCESS or error code */ UNIV_INTERN -enum db_err -dict_stats_delete_table_stats( -/*==========================*/ +dberr_t +dict_stats_drop_table( +/*==================*/ const char* table_name, /*!< in: table name */ char* errstr, /*!< out: error message if != DB_SUCCESS is returned */ ulint errstr_sz); /*!< in: size of errstr buffer */ +/*********************************************************************//** +Fetches or calculates new estimates for index statistics. */ +UNIV_INTERN +void +dict_stats_update_for_index( +/*========================*/ + dict_index_t* index) /*!< in/out: index */ + __attribute__((nonnull)); + +/*********************************************************************//** +Renames a table in InnoDB persistent stats storage. +This function creates its own transaction and commits it. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_rename_table( +/*====================*/ + const char* old_name, /*!< in: old table name */ + const char* new_name, /*!< in: new table name */ + char* errstr, /*!< out: error string if != DB_SUCCESS + is returned */ + size_t errstr_sz); /*!< in: errstr size */ + +#ifndef UNIV_NONINL +#include "dict0stats.ic" +#endif + #endif /* dict0stats_h */ diff --git a/storage/innobase/include/dict0stats.ic b/storage/innobase/include/dict0stats.ic new file mode 100644 index 00000000000..04763f174d0 --- /dev/null +++ b/storage/innobase/include/dict0stats.ic @@ -0,0 +1,250 @@ +/***************************************************************************** + +Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0stats.ic +Code used for calculating and manipulating table statistics. + +Created Jan 23, 2012 Vasil Dimov +*******************************************************/ + +#include "univ.i" +#include "dict0dict.h" /* dict_table_stats_lock() */ +#include "dict0types.h" /* dict_table_t */ +#include "srv0srv.h" /* srv_stats_persistent, srv_stats_auto_recalc */ + +/*********************************************************************//** +Set the persistent statistics flag for a given table. This is set only +in the in-memory table object and is not saved on disk. It will be read +from the .frm file upon first open from MySQL after a server restart. +dict_stats_set_persistent() @{ */ +UNIV_INLINE +void +dict_stats_set_persistent( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool ps_on, /*!< in: persistent stats explicitly enabled */ + ibool ps_off) /*!< in: persistent stats explicitly disabled */ +{ + /* Not allowed to have both flags set, but a CREATE or ALTER + statement that contains "STATS_PERSISTENT=0 STATS_PERSISTENT=1" would + end up having both set. In this case we clear the OFF flag. 
*/ + if (ps_on && ps_off) { + ps_off = FALSE; + } + + ib_uint32_t stat_persistent = 0; + + if (ps_on) { + stat_persistent |= DICT_STATS_PERSISTENT_ON; + } + + if (ps_off) { + stat_persistent |= DICT_STATS_PERSISTENT_OFF; + } + + /* we rely on this assignment to be atomic */ + table->stat_persistent = stat_persistent; +} +/* @} */ + +/*********************************************************************//** +Check whether persistent statistics is enabled for a given table. +dict_stats_is_persistent_enabled() @{ +@return TRUE if enabled, FALSE otherwise */ +UNIV_INLINE +ibool +dict_stats_is_persistent_enabled( +/*=============================*/ + const dict_table_t* table) /*!< in: table */ +{ + /* Because of the nature of this check (non-locking) it is possible + that a table becomes: + * PS-disabled immediately after this function has returned TRUE or + * PS-enabled immediately after this function has returned FALSE. + This means that it is possible that we do: + + dict_stats_update(DICT_STATS_RECALC_PERSISTENT) on a table that has + just been PS-disabled or + + dict_stats_update(DICT_STATS_RECALC_TRANSIENT) on a table that has + just been PS-enabled. + This is acceptable. Avoiding this would mean that we would have to + protect the ::stat_persistent with dict_table_stats_lock() like the + other ::stat_ members which would be too big performance penalty, + especially when this function is called from + row_update_statistics_if_needed(). 
*/ + + /* we rely on this read to be atomic */ + ib_uint32_t stat_persistent = table->stat_persistent; + + if (stat_persistent & DICT_STATS_PERSISTENT_ON) { + ut_ad(!(stat_persistent & DICT_STATS_PERSISTENT_OFF)); + return(TRUE); + } else if (stat_persistent & DICT_STATS_PERSISTENT_OFF) { + return(FALSE); + } else { + return(srv_stats_persistent); + } +} +/* @} */ + +/*********************************************************************//** +Set the auto recalc flag for a given table (only honored for a persistent +stats enabled table). The flag is set only in the in-memory table object +and is not saved in InnoDB files. It will be read from the .frm file upon +first open from MySQL after a server restart. +dict_stats_auto_recalc_set() @{ */ +UNIV_INLINE +void +dict_stats_auto_recalc_set( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool auto_recalc_on, /*!< in: explicitly enabled */ + ibool auto_recalc_off) /*!< in: explicitly disabled */ +{ + ut_ad(!auto_recalc_on || !auto_recalc_off); + + ib_uint32_t stats_auto_recalc = 0; + + if (auto_recalc_on) { + stats_auto_recalc |= DICT_STATS_AUTO_RECALC_ON; + } + + if (auto_recalc_off) { + stats_auto_recalc |= DICT_STATS_AUTO_RECALC_OFF; + } + + /* we rely on this assignment to be atomic */ + table->stats_auto_recalc = stats_auto_recalc; +} +/* @} */ + +/*********************************************************************//** +Check whether auto recalc is enabled for a given table. 
+dict_stats_auto_recalc_is_enabled() @{ +@return TRUE if enabled, FALSE otherwise */ +UNIV_INLINE +ibool +dict_stats_auto_recalc_is_enabled( +/*==============================*/ + const dict_table_t* table) /*!< in: table */ +{ + /* we rely on this read to be atomic */ + ib_uint32_t stats_auto_recalc = table->stats_auto_recalc; + + if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_ON) { + ut_ad(!(stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF)); + return(TRUE); + } else if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF) { + return(FALSE); + } else { + return(srv_stats_auto_recalc); + } +} +/* @} */ + +/*********************************************************************//** +Initialize table's stats for the first time when opening a table. +dict_stats_init() @{ */ +UNIV_INLINE +void +dict_stats_init( +/*============*/ + dict_table_t* table) /*!< in/out: table */ +{ + ut_ad(!mutex_own(&dict_sys->mutex)); + + if (table->stat_initialized) { + return; + } + + dict_stats_upd_option_t opt; + + if (dict_stats_is_persistent_enabled(table)) { + opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY; + } else { + opt = DICT_STATS_RECALC_TRANSIENT; + } + + dict_stats_update(table, opt); +} +/* @} */ + +/*********************************************************************//** +Deinitialize table's stats after the last close of the table. This is +used to detect "FLUSH TABLE" and refresh the stats upon next open. 
+dict_stats_deinit() @{ */ +UNIV_INLINE +void +dict_stats_deinit( +/*==============*/ + dict_table_t* table) /*!< in/out: table */ +{ + ut_ad(mutex_own(&dict_sys->mutex)); + + ut_a(table->n_ref_count == 0); + + dict_table_stats_lock(table, RW_X_LATCH); + + if (!table->stat_initialized) { + dict_table_stats_unlock(table, RW_X_LATCH); + return; + } + + table->stat_initialized = FALSE; + +#ifdef UNIV_DEBUG_VALGRIND + UNIV_MEM_INVALID(&table->stat_n_rows, + sizeof(table->stat_n_rows)); + UNIV_MEM_INVALID(&table->stat_clustered_index_size, + sizeof(table->stat_clustered_index_size)); + UNIV_MEM_INVALID(&table->stat_sum_of_other_index_sizes, + sizeof(table->stat_sum_of_other_index_sizes)); + UNIV_MEM_INVALID(&table->stat_modified_counter, + sizeof(table->stat_modified_counter)); + + dict_index_t* index; + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + ulint n_uniq = dict_index_get_n_unique(index); + + UNIV_MEM_INVALID( + index->stat_n_diff_key_vals, + n_uniq * sizeof(index->stat_n_diff_key_vals[0])); + UNIV_MEM_INVALID( + index->stat_n_sample_sizes, + n_uniq * sizeof(index->stat_n_sample_sizes[0])); + UNIV_MEM_INVALID( + index->stat_n_non_null_key_vals, + n_uniq * sizeof(index->stat_n_non_null_key_vals[0])); + UNIV_MEM_INVALID( + &index->stat_index_size, + sizeof(index->stat_index_size)); + UNIV_MEM_INVALID( + &index->stat_n_leaf_pages, + sizeof(index->stat_n_leaf_pages)); + } +#endif /* UNIV_DEBUG_VALGRIND */ + + dict_table_stats_unlock(table, RW_X_LATCH); +} +/* @} */ + +/* vim: set foldmethod=marker foldmarker=@{,@}: */ diff --git a/storage/innobase/include/dict0stats_bg.h b/storage/innobase/include/dict0stats_bg.h new file mode 100644 index 00000000000..dd85088c7ba --- /dev/null +++ b/storage/innobase/include/dict0stats_bg.h @@ -0,0 +1,116 @@ +/***************************************************************************** + +Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0stats_bg.h +Code used for background table and index stats gathering. + +Created Apr 26, 2012 Vasil Dimov +*******************************************************/ + +#ifndef dict0stats_bg_h +#define dict0stats_bg_h + +#include "univ.i" + +#include "dict0types.h" /* dict_table_t, table_id_t */ +#include "os0sync.h" /* os_event_t */ +#include "os0thread.h" /* DECLARE_THREAD */ + +/** Event to wake up the stats thread */ +extern os_event_t dict_stats_event; + +/*****************************************************************//** +Add a table to the recalc pool, which is processed by the +background stats gathering thread. Only the table id is added to the +list, so the table can be closed after being enqueued and it will be +opened when needed. If the table does not exist later (has been DROPped), +then it will be removed from the pool and skipped. +dict_stats_recalc_pool_add() @{ */ +UNIV_INTERN +void +dict_stats_recalc_pool_add( +/*=======================*/ + const dict_table_t* table); /*!< in: table to add */ +/* @} */ + +/*****************************************************************//** +Delete a given table from the auto recalc pool. 
+dict_stats_recalc_pool_del() @{ */ +UNIV_INTERN +void +dict_stats_recalc_pool_del( +/*=======================*/ + const dict_table_t* table); /*!< in: table to remove */ +/* @} */ + +/*****************************************************************//** +Wait until background stats thread has stopped using the specified table(s). +The caller must have locked the data dictionary using +row_mysql_lock_data_dictionary() and this function may unlock it temporarily +and restore the lock before it exits. +The background stats thread is guaranteed not to start using the specified +tables after this function returns and before the caller unlocks the data +dictionary because it sets the BG_STAT_IN_PROGRESS bit in table->stats_bg_flag +under dict_sys->mutex. +dict_stats_wait_bg_to_stop_using_tables() @{ */ +UNIV_INTERN +void +dict_stats_wait_bg_to_stop_using_tables( +/*====================================*/ + dict_table_t* table1, /*!< in/out: table1 */ + dict_table_t* table2, /*!< in/out: table2, could be NULL */ + trx_t* trx); /*!< in/out: transaction to use for + unlocking/locking the data dict */ +/* @} */ + +/*****************************************************************//** +Initialize global variables needed for the operation of dict_stats_thread(). +Must be called before dict_stats_thread() is started. +dict_stats_thread_init() @{ */ +UNIV_INTERN +void +dict_stats_thread_init(); +/*====================*/ +/* @} */ + +/*****************************************************************//** +Free resources allocated by dict_stats_thread_init(), must be called +after dict_stats_thread() has exited. +dict_stats_thread_deinit() @{ */ +UNIV_INTERN +void +dict_stats_thread_deinit(); +/*======================*/ +/* @} */ + +/*****************************************************************//** +This is the thread for background stats gathering. It pops tables from +the auto recalc list and processes them, eventually recalculating their +statistics. 
+dict_stats_thread() @{ +@return this function does not return, it calls os_thread_exit() */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(dict_stats_thread)( +/*==============================*/ + void* arg); /*!< in: a dummy parameter + required by os_thread_create */ +/* @} */ + +#endif /* dict0stats_bg_h */ diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h index cd2863582c1..b7f7c2d9df9 100644 --- a/storage/innobase/include/dict0types.h +++ b/storage/innobase/include/dict0types.h @@ -26,15 +26,15 @@ Created 1/8/1996 Heikki Tuuri #ifndef dict0types_h #define dict0types_h -typedef struct dict_sys_struct dict_sys_t; -typedef struct dict_col_struct dict_col_t; -typedef struct dict_field_struct dict_field_t; -typedef struct dict_index_struct dict_index_t; -typedef struct dict_table_struct dict_table_t; -typedef struct dict_foreign_struct dict_foreign_t; +struct dict_sys_t; +struct dict_col_t; +struct dict_field_t; +struct dict_index_t; +struct dict_table_t; +struct dict_foreign_t; -typedef struct ind_node_struct ind_node_t; -typedef struct tab_node_struct tab_node_t; +struct ind_node_t; +struct tab_node_t; /* Space id and page no where the dictionary header resides */ #define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */ @@ -52,7 +52,7 @@ the table and index will be marked as "corrupted", and caller will be responsible to deal with corrupted table or index. Note: please define the IGNORE_ERR_* as bits, so their value can be or-ed together */ -enum dict_err_ignore { +enum dict_err_ignore_t { DICT_ERR_IGNORE_NONE = 0, /*!< no error to ignore */ DICT_ERR_IGNORE_INDEX_ROOT = 1, /*!< ignore error if index root page is FIL_NULL or incorrect value */ @@ -60,6 +60,11 @@ enum dict_err_ignore { DICT_ERR_IGNORE_ALL = 0xFFFF /*!< ignore all errors */ }; -typedef enum dict_err_ignore dict_err_ignore_t; +/** Quiescing states for flushing tables to disk. 
*/ +enum ib_quiesce_t { + QUIESCE_NONE, + QUIESCE_START, /*!< Initialise, prepare to start */ + QUIESCE_COMPLETE /*!< All done */ +}; #endif diff --git a/storage/innobase/include/dyn0dyn.h b/storage/innobase/include/dyn0dyn.h index 5e69cb13122..ffb4f270d0e 100644 --- a/storage/innobase/include/dyn0dyn.h +++ b/storage/innobase/include/dyn0dyn.h @@ -31,10 +31,9 @@ Created 2/5/1996 Heikki Tuuri #include "mem0mem.h" /** A block in a dynamically allocated array */ -typedef struct dyn_block_struct dyn_block_t; +struct dyn_block_t; /** Dynamically allocated array */ -typedef dyn_block_t dyn_array_t; - +typedef dyn_block_t dyn_array_t; /** This is the initial 'payload' size of a dynamic array; this must be > MLOG_BUF_MARGIN + 30! */ @@ -159,7 +158,7 @@ dyn_push_string( /** @brief A block in a dynamically allocated array. NOTE! Do not access the fields of the struct directly: the definition appears here only for the compiler to know its size! */ -struct dyn_block_struct{ +struct dyn_block_t{ mem_heap_t* heap; /*!< in the first block this is != NULL if dynamic allocation has been needed */ ulint used; /*!< number of data bytes used in this block; diff --git a/storage/innobase/include/dyn0dyn.ic b/storage/innobase/include/dyn0dyn.ic index b86697d6865..39254e632a8 100644 --- a/storage/innobase/include/dyn0dyn.ic +++ b/storage/innobase/include/dyn0dyn.ic @@ -23,9 +23,9 @@ The dynamically allocated array Created 2/5/1996 Heikki Tuuri *******************************************************/ -/** Value of dyn_block_struct::magic_n */ +/** Value of dyn_block_t::magic_n */ #define DYN_BLOCK_MAGIC_N 375767 -/** Flag for dyn_block_struct::used that indicates a full block */ +/** Flag for dyn_block_t::used that indicates a full block */ #define DYN_BLOCK_FULL_FLAG 0x1000000UL /************************************************************//** diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 1e2b8049860..56fda8b39b1 100644 --- 
a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -39,6 +39,14 @@ Created 10/25/1995 Heikki Tuuri #include "log0log.h" #endif /* !UNIV_HOTBACKUP */ +#include <list> + +// Forward declaration +struct trx_t; +struct fil_space_t; + +typedef std::list<const char*> space_name_list_t; + /** When mysqld is run, the default directory "." is the mysqld datadir, but in the MySQL Embedded Server Library and ibbackup it is not the default directory, and we must set the base file path explicitly */ @@ -61,12 +69,8 @@ typedef byte fil_faddr_t; /*!< 'type' definition in C: an address #define FIL_ADDR_SIZE 6 /* address size is 6 bytes */ -/** A struct for storing a space address FIL_ADDR, when it is used -in C program data structures. */ - -typedef struct fil_addr_struct fil_addr_t; /** File space address */ -struct fil_addr_struct{ +struct fil_addr_t{ ulint page; /*!< page number within a space */ ulint boffset; /*!< byte offset within the page */ }; @@ -200,17 +204,19 @@ fil_space_get_type( ulint id); /*!< in: space id */ #endif /* !UNIV_HOTBACKUP */ /*******************************************************************//** -Appends a new file to the chain of files of a space. File must be closed. */ +Appends a new file to the chain of files of a space. File must be closed. 
+@return pointer to the file name, or NULL on error */ UNIV_INTERN -void +char* fil_node_create( /*============*/ const char* name, /*!< in: file name (file must be closed) */ ulint size, /*!< in: file size in database blocks, rounded downwards to an integer */ ulint id, /*!< in: space id where to append */ - ibool is_raw);/*!< in: TRUE if a raw device or + ibool is_raw) /*!< in: TRUE if a raw device or a raw disk partition */ + __attribute__((nonnull, warn_unused_result)); #ifdef UNIV_LOG_ARCHIVE /****************************************************************//** Drops files from the start of a file space, so that its size is cut by @@ -248,6 +254,16 @@ fil_assign_new_space_id( /*====================*/ ulint* space_id); /*!< in/out: space id */ /*******************************************************************//** +Returns the path from the first fil_node_t found for the space ID sent. +The caller is responsible for freeing the memory allocated here for the +value returned. +@return a copy of fil_node_t::path, NULL if space is zero or not found. */ +UNIV_INTERN +char* +fil_space_get_first_path( +/*=====================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** Returns the size of the space in pages. The tablespace must be cached in the memory cache. @return space size, 0 if space not found */ @@ -316,6 +332,14 @@ void fil_close_all_files(void); /*=====================*/ /*******************************************************************//** +Closes the redo log files. There must not be any pending i/o's or not +flushed modifications in the files. */ +UNIV_INTERN +void +fil_close_log_files( +/*================*/ + bool free); /*!< in: whether to free the memory object */ +/*******************************************************************//** Sets the max tablespace id counter if the given number is bigger than the previous value. 
*/ UNIV_INTERN @@ -329,7 +353,7 @@ Writes the flushed lsn and the latest archived log number to the page header of the first page of each data file in the system tablespace. @return DB_SUCCESS or error number */ UNIV_INTERN -ulint +dberr_t fil_write_flushed_lsn_to_data_files( /*================================*/ lsn_t lsn, /*!< in: lsn to write */ @@ -346,6 +370,7 @@ fil_read_first_page( parameters below already contain sensible data */ ulint* flags, /*!< out: tablespace flags */ + ulint* space_id, /*!< out: tablespace ID */ #ifdef UNIV_LOG_ARCHIVE ulint* min_arch_log_no, /*!< out: min of archived log numbers in data files */ @@ -405,25 +430,44 @@ Deletes a single-table tablespace. The tablespace must be cached in the memory cache. @return TRUE if success */ UNIV_INTERN -ibool +dberr_t fil_delete_tablespace( /*==================*/ + ulint id, /*!< in: space id */ + buf_remove_t buf_remove); /*!< in: specify the action to take + on the tables pages in the buffer + pool */ +/*******************************************************************//** +Closes a single-table tablespace. The tablespace must be cached in the +memory cache. Free all pages used by the tablespace. +@return DB_SUCCESS or error */ +UNIV_INTERN +dberr_t +fil_close_tablespace( +/*=================*/ + trx_t* trx, /*!< in/out: Transaction covering the close */ ulint id); /*!< in: space id */ #ifndef UNIV_HOTBACKUP /*******************************************************************//** Discards a single-table tablespace. The tablespace must be cached in the memory cache. Discarding is like deleting a tablespace, but -1) we do not drop the table from the data dictionary; -2) we remove all insert buffer entries for the tablespace immediately; in DROP -TABLE they are only removed gradually in the background; -3) when the user does IMPORT TABLESPACE, the tablespace will have the same id -as it originally had. -@return TRUE if success */ + + 1. We do not drop the table from the data dictionary; + + 2. 
We remove all insert buffer entries for the tablespace immediately; + in DROP TABLE they are only removed gradually in the background; + + 3. When the user does IMPORT TABLESPACE, the tablespace will have the + same id as it originally had. + + 4. Free all the pages in use by the tablespace if rename=TRUE. +@return DB_SUCCESS or error */ UNIV_INTERN -ibool +dberr_t fil_discard_tablespace( /*===================*/ - ulint id); /*!< in: space id */ + ulint id) /*!< in: space id */ + __attribute__((warn_unused_result)); #endif /* !UNIV_HOTBACKUP */ /*******************************************************************//** Renames a single-table tablespace. The tablespace must be cached in the @@ -433,16 +477,70 @@ UNIV_INTERN ibool fil_rename_tablespace( /*==================*/ - const char* old_name_in, /*!< in: old table name in the standard - databasename/tablename format of - InnoDB, or NULL if we do the rename - based on the space id only */ + const char* old_name_in, /*!< in: old table name in the + standard databasename/tablename + format of InnoDB, or NULL if we + do the rename based on the space + id only */ ulint id, /*!< in: space id */ - const char* new_name); /*!< in: new table name in the standard - databasename/tablename format - of InnoDB */ + const char* new_name, /*!< in: new table name in the + standard databasename/tablename + format of InnoDB */ + const char* new_path); /*!< in: new full datafile path + if the tablespace is remotely + located, or NULL if it is located + in the normal data directory. */ /*******************************************************************//** +Allocates a file name for a single-table tablespace. The string must be freed +by caller with mem_free(). 
+@return own: file name */ +UNIV_INTERN +char* +fil_make_ibd_name( +/*==============*/ + const char* name, /*!< in: table name or a dir path */ + bool is_full_path); /*!< in: TRUE if it is a dir path */ +/*******************************************************************//** +Allocates a file name for a tablespace ISL file (InnoDB Symbolic Link). +The string must be freed by caller with mem_free(). +@return own: file name */ +UNIV_INTERN +char* +fil_make_isl_name( +/*==============*/ + const char* name); /*!< in: table name */ +/*******************************************************************//** +Creates a new InnoDB Symbolic Link (ISL) file. It is always created +under the 'datadir' of MySQL. The datadir is the directory of a +running mysqld program. We can refer to it by simply using the path '.'. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fil_create_link_file( +/*=================*/ + const char* tablename, /*!< in: tablename */ + const char* filepath); /*!< in: pathname of tablespace */ +/*******************************************************************//** +Deletes an InnoDB Symbolic Link (ISL) file. */ +UNIV_INTERN +void +fil_delete_link_file( +/*==================*/ + const char* tablename); /*!< in: name of table */ +/*******************************************************************//** +Reads an InnoDB Symbolic Link (ISL) file. +It is always created under the 'datadir' of MySQL. The name is of the +form {databasename}/{tablename}. and the isl file is expected to be in a +'{databasename}' directory called '{tablename}.isl'. The caller must free +the memory of the null-terminated path returned if it is not null. +@return own: filepath found in link file, NULL if not found. */ +UNIV_INTERN +char* +fil_read_link_file( +/*===============*/ + const char* name); /*!< in: tablespace name */ +/*******************************************************************//** Creates a new single-table tablespace to a database directory of MySQL. 
Database directories are under the 'datadir' of MySQL. The datadir is the directory of a running mysqld program. We can refer to it by simply the @@ -450,21 +548,20 @@ path '.'. Tables created with CREATE TEMPORARY TABLE we place in the temp dir of the mysqld server. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fil_create_new_single_table_tablespace( /*===================================*/ ulint space_id, /*!< in: space id */ const char* tablename, /*!< in: the table name in the usual databasename/tablename format - of InnoDB, or a dir path to a temp - table */ - ibool is_temp, /*!< in: TRUE if a table created with - CREATE TEMPORARY TABLE */ + of InnoDB */ + const char* dir_path, /*!< in: NULL or a dir path */ ulint flags, /*!< in: tablespace flags */ ulint flags2, /*!< in: table flags2 */ - ulint size); /*!< in: the initial size of the + ulint size) /*!< in: the initial size of the tablespace file in pages, must be >= FIL_IBD_FILE_INITIAL_SIZE */ + __attribute__((nonnull, warn_unused_result)); #ifndef UNIV_HOTBACKUP /********************************************************************//** Tries to open a single-table tablespace and optionally checks the space id is @@ -475,41 +572,31 @@ NOTE that we assume this operation is used either at the database startup or under the protection of the dictionary mutex, so that two users cannot race here. This operation does not leave the file associated with the tablespace open, but closes it after we have looked at the space id in it. -@return TRUE if success */ + +If the validate boolean is set, we read the first page of the file and +check that the space id in the file is what we expect. We assume that +this function runs much faster if no check is made, since accessing the +file inode probably is much faster (the OS caches them) than accessing +the first page of the file. This boolean may be initially FALSE, but if +a remote tablespace is found it will be changed to true. 
+ +If the fix_dict boolean is set, then it is safe to use an internal SQL +statement to update the dictionary tables if they are incorrect. + +@return DB_SUCCESS or error code */ UNIV_INTERN -ibool +dberr_t fil_open_single_table_tablespace( /*=============================*/ - ibool check_space_id, /*!< in: should we check that the space - id in the file is right; we assume - that this function runs much faster - if no check is made, since accessing - the file inode probably is much - faster (the OS caches them) than - accessing the first page of the file */ + bool validate, /*!< in: Do we validate tablespace? */ + bool fix_dict, /*!< in: Can we fix the dictionary? */ ulint id, /*!< in: space id */ ulint flags, /*!< in: tablespace flags */ - const char* name); /*!< in: table name in the - databasename/tablename format */ -/********************************************************************//** -It is possible, though very improbable, that the lsn's in the tablespace to be -imported have risen above the current system lsn, if a lengthy purge, ibuf -merge, or rollback was performed on a backup taken with ibbackup. If that is -the case, reset page lsn's in the file. We assume that mysqld was shut down -after it performed these cleanup operations on the .ibd file, so that it at -the shutdown stamped the latest lsn to the FIL_PAGE_FILE_FLUSH_LSN in the -first page of the .ibd file, and we can determine whether we need to reset the -lsn's just by looking at that flush lsn. 
-@return TRUE if success */ -UNIV_INTERN -ibool -fil_reset_too_high_lsns( -/*====================*/ - const char* name, /*!< in: table name in the + const char* tablename, /*!< in: table name in the databasename/tablename format */ - lsn_t current_lsn); /*!< in: reset lsn's if the lsn stamped - to FIL_PAGE_FILE_FLUSH_LSN in the - first page is too high */ + const char* filepath) /*!< in: tablespace filepath */ + __attribute__((nonnull(5), warn_unused_result)); + #endif /* !UNIV_HOTBACKUP */ /********************************************************************//** At the server startup, if we need crash recovery, scans the database @@ -520,7 +607,7 @@ in the doublewrite buffer, also to know where to apply log records where the space id is != 0. @return DB_SUCCESS or error number */ UNIV_INTERN -ulint +dberr_t fil_load_single_table_tablespaces(void); /*===================================*/ /*******************************************************************//** @@ -562,11 +649,15 @@ fil_space_for_table_exists_in_mem( data dictionary, so that we can print a warning about orphaned tablespaces */ - ibool print_error_if_does_not_exist); + ibool print_error_if_does_not_exist, /*!< in: print detailed error information to the .err log if a matching tablespace is not found from memory */ + bool adjust_space, /*!< in: whether to adjust space id + when find table space mismatch */ + mem_heap_t* heap, /*!< in: heap memory */ + table_id_t table_id); /*!< in: table id */ #else /* !UNIV_HOTBACKUP */ /********************************************************************//** Extends all tablespaces to the size stored in the space header. During the @@ -625,7 +716,7 @@ Reads or writes data. This operation is asynchronous (aio). 
@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do i/o on a tablespace which does not exist */ UNIV_INTERN -ulint +dberr_t fil_io( /*===*/ ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE, @@ -651,8 +742,9 @@ fil_io( void* buf, /*!< in/out: buffer where to store read data or from where to write; in aio this must be appropriately aligned */ - void* message); /*!< in: message for aio handler if non-sync + void* message) /*!< in: message for aio handler if non-sync aio used, else ignored */ + __attribute__((nonnull(8))); /**********************************************************************//** Waits for an aio operation to complete. This function is used to write the handler for completed requests. The aio array of pending requests is divided @@ -739,8 +831,154 @@ fil_tablespace_is_being_deleted( /*============================*/ ulint id); /*!< in: space id */ -typedef struct fil_space_struct fil_space_t; +/********************************************************************//** +Delete the tablespace file and any related files like .cfg. +This should not be called for temporary tables. */ +UNIV_INTERN +void +fil_delete_file( +/*============*/ + const char* path); /*!< in: filepath of the ibd tablespace */ + +/** Callback functor. */ +struct PageCallback { + + /** + Default constructor */ + PageCallback() + : + m_zip_size(), + m_page_size(), + m_filepath() UNIV_NOTHROW {} + + virtual ~PageCallback() UNIV_NOTHROW {} + + /** + Called for page 0 in the tablespace file at the start. + @param file_size - size of the file in bytes + @param block - contents of the first page in the tablespace file + @retval DB_SUCCESS or error code.*/ + virtual dberr_t init( + os_offset_t file_size, + const buf_block_t* block) UNIV_NOTHROW = 0; + + /** + Called for every page in the tablespace. If the page was not + updated then its state must be set to BUF_PAGE_NOT_USED. 
For + compressed tables the page descriptor memory will be at offset: + block->frame + UNIV_PAGE_SIZE; + @param offset - physical offset within the file + @param block - block read from file, note it is not from the buffer pool + @retval DB_SUCCESS or error code. */ + virtual dberr_t operator()( + os_offset_t offset, + buf_block_t* block) UNIV_NOTHROW = 0; + + /** + Set the name of the physical file and the file handle that is used + to open it for the file that is being iterated over. + @param filename - then physical name of the tablespace file. + @param file - OS file handle */ + void set_file(const char* filename, os_file_t file) UNIV_NOTHROW + { + m_file = file; + m_filepath = filename; + } + + /** + @return the space id of the tablespace */ + virtual ulint get_space_id() const UNIV_NOTHROW = 0; + + /** The compressed page size + @return the compressed page size */ + ulint get_zip_size() const + { + return(m_zip_size); + } + + /** + Set the tablespace compressed table size. + @return DB_SUCCESS if it is valie or DB_CORRUPTION if not */ + dberr_t set_zip_size(const buf_frame_t* page) UNIV_NOTHROW; + + /** The compressed page size + @return the compressed page size */ + ulint get_page_size() const + { + return(m_page_size); + } + + /** Compressed table page size */ + ulint m_zip_size; + + /** The tablespace page size. */ + ulint m_page_size; + + /** File handle to the tablespace */ + os_file_t m_file; + + /** Physical file path. */ + const char* m_filepath; + +protected: + // Disable copying + PageCallback(const PageCallback&); + PageCallback& operator=(const PageCallback&); +}; + +/********************************************************************//** +Iterate over all the pages in the tablespace. 
+@param table - the table definiton in the server +@param n_io_buffers - number of blocks to read and write together +@param callback - functor that will do the page updates +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fil_tablespace_iterate( +/*===================*/ + dict_table_t* table, + ulint n_io_buffers, + PageCallback& callback) + __attribute__((nonnull, warn_unused_result)); -#endif /* !UNIV_INNOCHECKSUM */ +/*******************************************************************//** +Checks if a single-table tablespace for a given table name exists in the +tablespace memory cache. +@return space id, ULINT_UNDEFINED if not found */ +UNIV_INTERN +ulint +fil_get_space_id_for_table( +/*=======================*/ + const char* name); /*!< in: table name in the standard + 'databasename/tablename' format */ + +/** +Iterate over all the spaces in the space list and fetch the +tablespace names. It will return a copy of the name that must be +freed by the caller using: delete[]. +@return DB_SUCCESS if all OK. */ +UNIV_INTERN +dberr_t +fil_get_space_names( +/*================*/ + space_name_list_t& space_name_list) + /*!< in/out: Vector for collecting the names. */ + __attribute__((warn_unused_result)); -#endif +/****************************************************************//** +Generate redo logs for swapping two .ibd files */ +UNIV_INTERN +void +fil_mtr_rename_log( +/*===============*/ + ulint old_space_id, /*!< in: tablespace id of the old + table. 
*/ + const char* old_name, /*!< in: old table name */ + ulint new_space_id, /*!< in: tablespace id of the new + table */ + const char* new_name, /*!< in: new table name */ + const char* tmp_name); /*!< in: temp table name used while + swapping */ + +#endif /* !UNIV_INNOCHECKSUM */ +#endif /* fil0fil_h */ diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h index 994783c2db9..a587ccc9f20 100644 --- a/storage/innobase/include/fsp0fsp.h +++ b/storage/innobase/include/fsp0fsp.h @@ -50,11 +50,15 @@ to the two Barracuda row formats COMPRESSED and DYNAMIC. */ #define FSP_FLAGS_WIDTH_ATOMIC_BLOBS 1 /** Number of flag bits used to indicate the tablespace page size */ #define FSP_FLAGS_WIDTH_PAGE_SSIZE 4 +/** Width of the DATA_DIR flag. This flag indicates that the tablespace +is found in a remote location, not the default data directory. */ +#define FSP_FLAGS_WIDTH_DATA_DIR 1 /** Width of all the currently known tablespace flags */ #define FSP_FLAGS_WIDTH (FSP_FLAGS_WIDTH_POST_ANTELOPE \ + FSP_FLAGS_WIDTH_ZIP_SSIZE \ + FSP_FLAGS_WIDTH_ATOMIC_BLOBS \ - + FSP_FLAGS_WIDTH_PAGE_SSIZE) + + FSP_FLAGS_WIDTH_PAGE_SSIZE \ + + FSP_FLAGS_WIDTH_DATA_DIR) /** A mask of all the known/used bits in tablespace flags */ #define FSP_FLAGS_MASK (~(~0 << FSP_FLAGS_WIDTH)) @@ -71,8 +75,11 @@ to the two Barracuda row formats COMPRESSED and DYNAMIC. */ #define FSP_FLAGS_POS_PAGE_SSIZE (FSP_FLAGS_POS_ATOMIC_BLOBS \ + FSP_FLAGS_WIDTH_ATOMIC_BLOBS) /** Zero relative shift position of the start of the UNUSED bits */ -#define FSP_FLAGS_POS_UNUSED (FSP_FLAGS_POS_PAGE_SSIZE \ +#define FSP_FLAGS_POS_DATA_DIR (FSP_FLAGS_POS_PAGE_SSIZE \ + FSP_FLAGS_WIDTH_PAGE_SSIZE) +/** Zero relative shift position of the start of the UNUSED bits */ +#define FSP_FLAGS_POS_UNUSED (FSP_FLAGS_POS_DATA_DIR \ + + FSP_FLAGS_WIDTH_DATA_DIR) /** Bit mask of the POST_ANTELOPE field */ #define FSP_FLAGS_MASK_POST_ANTELOPE \ @@ -90,6 +97,10 @@ to the two Barracuda row formats COMPRESSED and DYNAMIC. 
*/ #define FSP_FLAGS_MASK_PAGE_SSIZE \ ((~(~0 << FSP_FLAGS_WIDTH_PAGE_SSIZE)) \ << FSP_FLAGS_POS_PAGE_SSIZE) +/** Bit mask of the DATA_DIR field */ +#define FSP_FLAGS_MASK_DATA_DIR \ + ((~(~0 << FSP_FLAGS_WIDTH_DATA_DIR)) \ + << FSP_FLAGS_POS_DATA_DIR) /** Return the value of the POST_ANTELOPE field */ #define FSP_FLAGS_GET_POST_ANTELOPE(flags) \ @@ -107,6 +118,10 @@ to the two Barracuda row formats COMPRESSED and DYNAMIC. */ #define FSP_FLAGS_GET_PAGE_SSIZE(flags) \ ((flags & FSP_FLAGS_MASK_PAGE_SSIZE) \ >> FSP_FLAGS_POS_PAGE_SSIZE) +/** Return the value of the DATA_DIR field */ +#define FSP_FLAGS_HAS_DATA_DIR(flags) \ + ((flags & FSP_FLAGS_MASK_DATA_DIR) \ + >> FSP_FLAGS_POS_DATA_DIR) /** Return the contents of the UNUSED bits */ #define FSP_FLAGS_GET_UNUSED(flags) \ (flags >> FSP_FLAGS_POS_UNUSED) @@ -555,6 +570,17 @@ fseg_free_page( ulint page, /*!< in: page offset */ mtr_t* mtr); /*!< in/out: mini-transaction */ /**********************************************************************//** +Checks if a single page of a segment is free. +@return true if free */ +UNIV_INTERN +bool +fseg_page_is_free( +/*==============*/ + fseg_header_t* seg_header, /*!< in: segment header */ + ulint space, /*!< in: space id */ + ulint page) /*!< in: page offset */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************************//** Frees part of a segment. This function can be used to free a segment by repeatedly calling this function in different mini-transactions. Doing the freeing in a single mini-transaction might result in @@ -643,12 +669,13 @@ tablespace header at offset FSP_SPACE_FLAGS. They should be 0 for ROW_FORMAT=COMPACT and ROW_FORMAT=REDUNDANT. The newer row formats, COMPRESSED and DYNAMIC, use a file format > Antelope so they should have a file format number plus the DICT_TF_COMPACT bit set. -@return ulint containing the validated tablespace flags. 
*/ +@return true if check ok */ UNIV_INLINE -ulint -fsp_flags_validate( +bool +fsp_flags_is_valid( /*===============*/ - ulint flags); /*!< in: tablespace flags */ + ulint flags) /*!< in: tablespace flags */ + __attribute__((warn_unused_result, const)); /********************************************************************//** Determine if the tablespace is compressed from dict_table_t::flags. @return TRUE if compressed, FALSE if not compressed */ @@ -658,6 +685,40 @@ fsp_flags_is_compressed( /*====================*/ ulint flags); /*!< in: tablespace flags */ +/********************************************************************//** +Calculates the descriptor index within a descriptor page. +@return descriptor index */ +UNIV_INLINE +ulint +xdes_calc_descriptor_index( +/*=======================*/ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint offset); /*!< in: page offset */ + +/**********************************************************************//** +Gets a descriptor bit of a page. +@return TRUE if free */ +UNIV_INLINE +ibool +xdes_get_bit( +/*=========*/ + const xdes_t* descr, /*!< in: descriptor */ + ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */ + ulint offset);/*!< in: page offset within extent: + 0 ... FSP_EXTENT_SIZE - 1 */ + +/********************************************************************//** +Calculates the page where the descriptor of a page resides. 
+@return descriptor page offset */ +UNIV_INLINE +ulint +xdes_calc_descriptor_page( +/*======================*/ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint offset); /*!< in: page offset */ + #endif /* !UNIV_INNOCHECKSUM */ /********************************************************************//** @@ -669,7 +730,7 @@ UNIV_INLINE ulint fsp_flags_get_zip_size( /*====================*/ - ulint flags); /*!< in: tablespace flags */ + ulint flags); /*!< in: tablespace flags */ /********************************************************************//** Extract the page size from tablespace flags. @return page size of the tablespace in bytes */ @@ -677,16 +738,7 @@ UNIV_INLINE ulint fsp_flags_get_page_size( /*====================*/ - ulint flags); /*!< in: tablespace flags */ - -/********************************************************************//** -Set page size */ -UNIV_INLINE -ulint -fsp_flags_set_page_size( -/*====================*/ - ulint flags, /*!< in: tablespace flags */ - ulint page_size); /*!< in: page size in bytes */ + ulint flags); /*!< in: tablespace flags */ #ifndef UNIV_NONINL #include "fsp0fsp.ic" diff --git a/storage/innobase/include/fsp0fsp.ic b/storage/innobase/include/fsp0fsp.ic index 498f9000888..0d81e817cc9 100644 --- a/storage/innobase/include/fsp0fsp.ic +++ b/storage/innobase/include/fsp0fsp.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -51,11 +51,10 @@ tablespace header at offset FSP_SPACE_FLAGS. They should be 0 for ROW_FORMAT=COMPACT and ROW_FORMAT=REDUNDANT. 
The newer row formats, COMPRESSED and DYNAMIC, use a file format > Antelope so they should have a file format number plus the DICT_TF_COMPACT bit set. -@return Same as input after validating it as FSP_SPACE_FLAGS. -If there is an error, trigger assertion failure. */ +@return true if check ok */ UNIV_INLINE -ulint -fsp_flags_validate( +bool +fsp_flags_is_valid( /*===============*/ ulint flags) /*!< in: tablespace flags */ { @@ -65,16 +64,20 @@ fsp_flags_validate( ulint page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags); ulint unused = FSP_FLAGS_GET_UNUSED(flags); - /* Make sure there are no bits that we do not know about. */ - ut_a(unused == 0); + DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return(false);); /* fsp_flags is zero unless atomic_blobs is set. */ - ut_a(flags != 1); - if (post_antelope) { + /* Make sure there are no bits that we do not know about. */ + if (unused != 0 || flags == 1) { + return(false); + } else if (post_antelope) { /* The Antelope row formats REDUNDANT and COMPACT did not use tablespace flags, so this flag and the entire 4-byte field is zero for Antelope row formats. */ - ut_a(atomic_blobs); + + if (!atomic_blobs) { + return(false); + } } if (!atomic_blobs) { @@ -82,27 +85,33 @@ fsp_flags_validate( the page structure introduced for the COMPACT row format by allowing long fields to be broken into prefix and externally stored parts. */ - ut_a(!post_antelope); - ut_a(zip_ssize == 0); - } else { - ut_a(post_antelope); - /* Validate the zip shift size is within allowed range. */ - ut_a(zip_ssize <= PAGE_ZIP_SSIZE_MAX); - } + if (post_antelope || zip_ssize != 0) { + return(false); + } + + } else if (!post_antelope || zip_ssize > PAGE_ZIP_SSIZE_MAX) { + return(false); + } else if (page_ssize > UNIV_PAGE_SSIZE_MAX) { + + /* The page size field can be used for any row type, or it may + be zero for an original 16k page size. + Validate the page shift size is within allowed range. 
*/ + + return(false); - /* The page size field can be used for any row type, or it may - be zero for an original 16k page size. - Validate the page shift size is within allowed range. */ - ut_a(page_ssize <= UNIV_PAGE_SSIZE_MAX); - ut_a((UNIV_PAGE_SIZE == UNIV_PAGE_SIZE_ORIG) || (page_ssize)); + } else if (UNIV_PAGE_SIZE != UNIV_PAGE_SIZE_ORIG && !page_ssize) { + return(false); + } #if UNIV_FORMAT_MAX != UNIV_FORMAT_B # error "UNIV_FORMAT_MAX != UNIV_FORMAT_B, Add more validations." #endif - /* Return the flags sent in if we did not fail an assert. */ - return(flags); + /* The DATA_DIR field can be used for any row type so there is + nothing here to validate. */ + + return(true); } /********************************************************************//** @@ -208,9 +217,98 @@ fsp_flags_set_page_size( flags = FSP_FLAGS_SET_PAGE_SSIZE(flags, ssize); - ut_ad(flags == fsp_flags_validate(flags)); + ut_ad(fsp_flags_is_valid(flags)); return(flags); } +/********************************************************************//** +Calculates the descriptor index within a descriptor page. +@return descriptor index */ +UNIV_INLINE +ulint +xdes_calc_descriptor_index( +/*=======================*/ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint offset) /*!< in: page offset */ +{ + ut_ad(ut_is_2pow(zip_size)); + + if (zip_size == 0) { + return(ut_2pow_remainder(offset, UNIV_PAGE_SIZE) + / FSP_EXTENT_SIZE); + } else { + return(ut_2pow_remainder(offset, zip_size) / FSP_EXTENT_SIZE); + } +} + +/**********************************************************************//** +Gets a descriptor bit of a page. +@return TRUE if free */ +UNIV_INLINE +ibool +xdes_get_bit( +/*=========*/ + const xdes_t* descr, /*!< in: descriptor */ + ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */ + ulint offset) /*!< in: page offset within extent: + 0 ... 
FSP_EXTENT_SIZE - 1 */ +{ + ut_ad(offset < FSP_EXTENT_SIZE); + ut_ad(bit == XDES_FREE_BIT || bit == XDES_CLEAN_BIT); + + ulint index = bit + XDES_BITS_PER_PAGE * offset; + + ulint bit_index = index % 8; + ulint byte_index = index / 8; + + return(ut_bit_get_nth( + mach_read_ulint(descr + XDES_BITMAP + byte_index, + MLOG_1BYTE), + bit_index)); +} + +/********************************************************************//** +Calculates the page where the descriptor of a page resides. +@return descriptor page offset */ +UNIV_INLINE +ulint +xdes_calc_descriptor_page( +/*======================*/ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint offset) /*!< in: page offset */ +{ +#ifndef DOXYGEN /* Doxygen gets confused by these */ +# if UNIV_PAGE_SIZE_MAX <= XDES_ARR_OFFSET \ + + (UNIV_PAGE_SIZE_MAX / FSP_EXTENT_SIZE_MAX) \ + * XDES_SIZE_MAX +# error +# endif +# if UNIV_ZIP_SIZE_MIN <= XDES_ARR_OFFSET \ + + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE_MIN) \ + * XDES_SIZE_MIN +# error +# endif +#endif /* !DOXYGEN */ + + ut_ad(UNIV_PAGE_SIZE > XDES_ARR_OFFSET + + (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE) + * XDES_SIZE); + ut_ad(UNIV_ZIP_SIZE_MIN > XDES_ARR_OFFSET + + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE) + * XDES_SIZE); + + ut_ad(ut_is_2pow(zip_size)); + + if (zip_size == 0) { + return(ut_2pow_round(offset, UNIV_PAGE_SIZE)); + } else { + ut_ad(zip_size > XDES_ARR_OFFSET + + (zip_size / FSP_EXTENT_SIZE) * XDES_SIZE); + return(ut_2pow_round(offset, zip_size)); + } +} + #endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/fts0ast.h b/storage/innobase/include/fts0ast.h index da40e2bbc96..7f2525dc450 100644 --- a/storage/innobase/include/fts0ast.h +++ b/storage/innobase/include/fts0ast.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2007, 2012, Oracle and/or its affiliates. 
All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -29,7 +29,7 @@ Created 2007/03/16/03 Sunny Bains #include "mem0mem.h" /* The type of AST Node */ -enum fts_ast_type_enum { +enum fts_ast_type_t { FTS_AST_OPER, /*!< Operator */ FTS_AST_NUMB, /*!< Number */ FTS_AST_TERM, /*!< Term (or word) */ @@ -39,7 +39,7 @@ enum fts_ast_type_enum { }; /* The FTS query operators that we support */ -enum fts_ast_oper_enum { +enum fts_ast_oper_t { FTS_NONE, /*!< No operator */ FTS_IGNORE, /*!< Ignore rows that contain @@ -58,20 +58,18 @@ enum fts_ast_oper_enum { FTS_DECR_RATING, /*!< Decrease the rank for this word*/ - FTS_DISTANCE /*!< Proximity distance */ + FTS_DISTANCE, /*!< Proximity distance */ + FTS_IGNORE_SKIP /*!< Transient node operator + signifies that this is a + FTS_IGNORE node, and ignored in + the first pass of + fts_ast_visit() */ }; -/* Enum types used by the FTS parser */ -typedef enum fts_ast_type_enum fts_ast_type_t; -typedef enum fts_ast_oper_enum fts_ast_oper_t; - /* Data types used by the FTS parser */ -typedef struct fts_lexer_struct fts_lexer_t; -typedef struct fts_ast_text_struct fts_ast_text_t; -typedef struct fts_ast_term_struct fts_ast_term_t; -typedef struct fts_ast_node_struct fts_ast_node_t; -typedef struct fts_ast_list_struct fts_ast_list_t; -typedef struct fts_ast_state_struct fts_ast_state_t; +struct fts_lexer_t; +struct fts_ast_node_t; +struct fts_ast_state_t; typedef ulint (*fts_ast_callback)(fts_ast_oper_t, fts_ast_node_t*, void*); @@ -180,60 +178,76 @@ fts_ast_state_free( /*===============*/ fts_ast_state_t*state); /*!< in: state instance to free */ -/******************************************************************** -Traverse the AST.*/ -ulint +/******************************************************************//** +Traverse the AST - in-order traversal. 
+@return DB_SUCCESS if all went well */ +UNIV_INTERN +dberr_t fts_ast_visit( /*==========*/ fts_ast_oper_t oper, /*!< in: FTS operator */ fts_ast_node_t* node, /*!< in: instance to traverse*/ fts_ast_callback visitor, /*!< in: callback */ - void* arg); /*!< in: callback arg */ -/******************************************************************** -Traverse the sub expression list.*/ -ulint + void* arg, /*!< in: callback arg */ + bool* has_ignore) /*!< out: whether we encounter + and ignored processing an + operator, currently we only + ignore FTS_IGNORE operator */ + __attribute__((nonnull, warn_unused_result)); +/*****************************************************************//** +Process (nested) sub-expression, create a new result set to store the +sub-expression result by processing nodes under current sub-expression +list. Merge the sub-expression result with that of parent expression list. +@return DB_SUCCESS if all went well */ +UNIV_INTERN +dberr_t fts_ast_visit_sub_exp( -/*==========*/ +/*==================*/ fts_ast_node_t* node, /*!< in: instance to traverse*/ fts_ast_callback visitor, /*!< in: callback */ - void* arg); /*!< in: callback arg */ + void* arg) /*!< in: callback arg */ + __attribute__((nonnull, warn_unused_result)); /******************************************************************** Create a lex instance.*/ +UNIV_INTERN fts_lexer_t* fts_lexer_create( /*=============*/ ibool boolean_mode, /*!< in: query type */ const byte* query, /*!< in: query string */ - ulint query_len); /*!< in: query string len */ + ulint query_len) /*!< in: query string len */ + __attribute__((nonnull, malloc, warn_unused_result)); /******************************************************************** Free an fts_lexer_t instance.*/ +UNIV_INTERN void fts_lexer_free( /*===========*/ - fts_lexer_t* fts_lexer); /*!< in: lexer instance to + fts_lexer_t* fts_lexer) /*!< in: lexer instance to free */ + __attribute__((nonnull)); /* Query term type */ -struct 
fts_ast_term_struct { +struct fts_ast_term_t { byte* ptr; /*!< Pointer to term string.*/ ibool wildcard; /*!< TRUE if wild card set.*/ }; /* Query text type */ -struct fts_ast_text_struct { +struct fts_ast_text_t { byte* ptr; /*!< Pointer to term string.*/ ulint distance; /*!< > 0 if proximity distance set */ }; /* The list of nodes in an expr list */ -struct fts_ast_list_struct { +struct fts_ast_list_t { fts_ast_node_t* head; /*!< Children list head */ fts_ast_node_t* tail; /*!< Children list tail */ }; /* FTS AST node to store the term, text, operator and sub-expressions.*/ -struct fts_ast_node_struct { +struct fts_ast_node_t { fts_ast_type_t type; /*!< The type of node */ fts_ast_text_t text; /*!< Text node */ fts_ast_term_t term; /*!< Term node */ @@ -241,10 +255,12 @@ struct fts_ast_node_struct { fts_ast_list_t list; /*!< Expression list */ fts_ast_node_t* next; /*!< Link for expr list */ fts_ast_node_t* next_alloc; /*!< For tracking allocations */ + bool visited; /*!< whether this node is + already processed */ }; /* To track state during parsing */ -struct fts_ast_state_struct { +struct fts_ast_state_t { mem_heap_t* heap; /*!< Heap to use for alloc */ fts_ast_node_t* root; /*!< If all goes OK, then this will point to the root.*/ diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h index e515772bdbd..f2f8617012a 100644 --- a/storage/innobase/include/fts0fts.h +++ b/storage/innobase/include/fts0fts.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -114,16 +114,16 @@ to mark invalid states. 
NOTE: Do not change the order or value of these, fts_trx_row_get_new_state depends on them being exactly as they are. */ -typedef enum { +enum fts_row_state { FTS_INSERT = 0, FTS_MODIFY, FTS_DELETE, FTS_NOTHING, FTS_INVALID -} fts_row_state; +}; /** The FTS table types. */ -enum fts_table_type_enum { +enum fts_table_type_t { FTS_INDEX_TABLE, /*!< FTS auxiliary table that is specific to a particular FTS index on a table */ @@ -132,21 +132,11 @@ enum fts_table_type_enum { for all FTS index on a table */ }; -typedef struct fts_struct fts_t; -typedef struct fts_doc_struct fts_doc_t; -typedef struct fts_trx_struct fts_trx_t; -typedef struct fts_table_struct fts_table_t; -typedef struct fts_cache_struct fts_cache_t; -typedef struct fts_token_struct fts_token_t; -typedef struct fts_string_struct fts_string_t; -typedef struct fts_result_struct fts_result_t; -typedef struct fts_ranking_struct fts_ranking_t; -typedef struct fts_trx_row_struct fts_trx_row_t; -typedef struct fts_doc_ids_struct fts_doc_ids_t; -typedef enum fts_table_type_enum fts_table_type_t; -typedef struct fts_trx_table_struct fts_trx_table_t; -typedef struct fts_savepoint_struct fts_savepoint_t; -typedef struct fts_index_cache_struct fts_index_cache_t; +struct fts_doc_t; +struct fts_cache_t; +struct fts_token_t; +struct fts_doc_ids_t; +struct fts_index_cache_t; /** Initialize the "fts_table" for internal query into FTS auxiliary @@ -172,7 +162,7 @@ do { \ /** Information about changes in a single transaction affecting the FTS system. */ -struct fts_trx_struct { +struct fts_trx_t { trx_t* trx; /*!< InnoDB transaction */ ib_vector_t* savepoints; /*!< Active savepoints, must have at @@ -184,7 +174,7 @@ struct fts_trx_struct { }; /** Information required for transaction savepoint handling. */ -struct fts_savepoint_struct { +struct fts_savepoint_t { char* name; /*!< First entry is always NULL, the default instance. 
Otherwise the name of the savepoint */ @@ -193,7 +183,7 @@ struct fts_savepoint_struct { }; /** Information about changed rows in a transaction for a single table. */ -struct fts_trx_table_struct { +struct fts_trx_table_t { dict_table_t* table; /*!< table */ fts_trx_t* fts_trx; /*!< link to parent */ @@ -209,7 +199,7 @@ struct fts_trx_table_struct { }; /** Information about one changed row in a transaction. */ -struct fts_trx_row_struct { +struct fts_trx_row_t { doc_id_t doc_id; /*!< Id of the ins/upd/del document */ fts_row_state state; /*!< state of the row */ @@ -220,7 +210,7 @@ struct fts_trx_row_struct { /** List of document ids that were added during a transaction. This list is passed on to a background 'Add' thread and OPTIMIZE, so it needs its own memory heap. */ -struct fts_doc_ids_struct { +struct fts_doc_ids_t { ib_vector_t* doc_ids; /*!< document ids (each element is of type doc_id_t). */ @@ -237,7 +227,7 @@ as our in-memory format. This typedef is a single such character. */ typedef unsigned short ib_uc_t; /** An UTF-16 ro UTF-8 string. */ -struct fts_string_struct { +struct fts_string_t { byte* f_str; /*!< string, not necessary terminated in any way */ ulint f_len; /*!< Length of the string in bytes */ @@ -245,7 +235,7 @@ struct fts_string_struct { }; /** Query ranked doc ids. */ -struct fts_ranking_struct { +struct fts_ranking_t { doc_id_t doc_id; /*!< Document id */ fts_rank_t rank; /*!< Rank is between 0 .. 1 */ @@ -256,7 +246,7 @@ struct fts_ranking_struct { }; /** Query result. */ -struct fts_result_struct { +struct fts_result_t { ib_rbt_node_t* current; /*!< Current element */ ib_rbt_t* rankings_by_id; /*!< RB tree of type fts_ranking_t @@ -268,7 +258,7 @@ struct fts_result_struct { /** This is used to generate the FTS auxiliary table name, we need the table id and the index id to generate the column specific FTS auxiliary table name. 
*/ -struct fts_table_struct { +struct fts_table_t { const char* parent; /*!< Parent table name, this is required only for the database name */ @@ -311,10 +301,10 @@ enum fts_status { typedef enum fts_status fts_status_t; /** The state of the FTS sub system. */ -struct fts_struct { +struct fts_t { /*!< mutex protecting bg_threads* and fts_add_wq. */ - mutex_t bg_threads_mutex; + ib_mutex_t bg_threads_mutex; ulint bg_threads; /*!< number of background threads accessing this table */ @@ -339,10 +329,10 @@ struct fts_struct { ib_vector_t* indexes; /*!< Vector of FTS indexes, this is mainly for caching purposes. */ - mem_heap_t* fts_heap; /*!< heap for fts_struct allocation */ + mem_heap_t* fts_heap; /*!< heap for fts_t allocation */ }; -typedef struct fts_stopword_struct fts_stopword_t; +struct fts_stopword_t; /** status bits for fts_stopword_t status field. */ #define STOPWORD_NOT_INIT 0x1 @@ -395,15 +385,15 @@ fts_cache_index_cache_create( /******************************************************************//** Get the next available document id. This function creates a new -transaction to generate the document id. */ +transaction to generate the document id. +@return DB_SUCCESS if OK */ UNIV_INTERN -ulint +dberr_t fts_get_next_doc_id( /*================*/ - /*!< out: DB_SUCCESS if OK */ - const dict_table_t* table, /*!< in: table */ - doc_id_t* doc_id); /*!< out: new document id */ - + const dict_table_t* table, /*!< in: table */ + doc_id_t* doc_id) /*!< out: new document id */ + __attribute__((nonnull)); /*********************************************************************//** Update the next and last Doc ID in the CONFIG table to be the input "doc_id" value (+ 1). 
We would do so after each FTS index build or @@ -412,28 +402,17 @@ UNIV_INTERN void fts_update_next_doc_id( /*===================*/ + trx_t* trx, /*!< in/out: transaction */ const dict_table_t* table, /*!< in: table */ - const char* table_name, /*!< in: table name */ - doc_id_t doc_id); /*!< in: DOC ID to set */ - -/******************************************************************//** -Update the last document id. This function could create a new -transaction to update the last document id. */ -UNIV_INTERN -ulint -fts_update_sync_doc_id( -/*===================*/ - /*!< out: DB_SUCCESS if OK */ - const dict_table_t* table, /*!< in: table */ - const char* table_name, /*!< in: table name */ - doc_id_t doc_id, /*!< in: last document id */ - trx_t* trx); /*!< in: update trx */ + const char* table_name, /*!< in: table name, or NULL */ + doc_id_t doc_id) /*!< in: DOC ID to set */ + __attribute__((nonnull(2))); /******************************************************************//** Create a new document id . @return DB_SUCCESS if all went well else error */ UNIV_INTERN -ulint +dberr_t fts_create_doc_id( /*==============*/ dict_table_t* table, /*!< in: row is of this @@ -442,8 +421,8 @@ fts_create_doc_id( value to this row. This is the current row that is being inserted. */ - mem_heap_t* heap); /*!< in: heap */ - + mem_heap_t* heap) /*!< in: heap */ + __attribute__((nonnull)); /******************************************************************//** Create a new fts_doc_ids_t. @return new fts_doc_ids_t. */ @@ -488,7 +467,7 @@ on the given table. row_mysql_lock_data_dictionary must have been called before this. 
@return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_create_common_tables( /*=====================*/ trx_t* trx, /*!< in: transaction handle */ @@ -496,27 +475,27 @@ fts_create_common_tables( table, /*!< in: table with one FTS index */ const char* name, /*!< in: table name */ - ibool skip_doc_id_index); - /*!< in: Skip index on doc id */ + bool skip_doc_id_index) /*!< in: Skip index on doc id */ + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Wrapper function of fts_create_index_tables_low(), create auxiliary tables for an FTS index @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_create_index_tables( /*====================*/ trx_t* trx, /*!< in: transaction handle */ - const dict_index_t* index); /*!< in: the FTS index + const dict_index_t* index) /*!< in: the FTS index instance */ - + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Creates the column specific ancillary tables needed for supporting an FTS index on the given table. row_mysql_lock_data_dictionary must have been called before this. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_create_index_tables_low( /*========================*/ trx_t* trx, /*!< in: transaction handle */ @@ -524,16 +503,17 @@ fts_create_index_tables_low( index, /*!< in: the FTS index instance */ const char* table_name, /*!< in: the table name */ - table_id_t table_id); /*!< in: the table id */ - + table_id_t table_id) /*!< in: the table id */ + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Add the FTS document id hidden column. 
*/ UNIV_INTERN void fts_add_doc_id_column( /*==================*/ - dict_table_t* table); /*!< in/out: Table with - FTS index */ + dict_table_t* table, /*!< in/out: Table with FTS index */ + mem_heap_t* heap) /*!< in: temporary memory heap, or NULL */ + __attribute__((nonnull(1))); /*********************************************************************//** Drops the ancillary tables needed for supporting an FTS index on the @@ -541,28 +521,29 @@ given table. row_mysql_lock_data_dictionary must have been called before this. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_drop_tables( /*============*/ trx_t* trx, /*!< in: transaction */ - dict_table_t* table); /*!< in: table has the FTS + dict_table_t* table) /*!< in: table has the FTS index */ - + __attribute__((nonnull)); /******************************************************************//** The given transaction is about to be committed; do whatever is necessary from the FTS system's POV. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_commit( /*=======*/ - trx_t* trx); /*!< in: transaction */ + trx_t* trx) /*!< in: transaction */ + __attribute__((nonnull, warn_unused_result)); /*******************************************************************//** FTS Query entry point. @return DB_SUCCESS if successful otherwise error code */ UNIV_INTERN -ulint +dberr_t fts_query( /*======*/ trx_t* trx, /*!< in: transaction */ @@ -571,8 +552,9 @@ fts_query( const byte* query, /*!< in: FTS query */ ulint query_len, /*!< in: FTS query string len in bytes */ - fts_result_t** result); /*!< out: query result, to be + fts_result_t** result) /*!< out: query result, to be freed by the caller.*/ + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Retrieve the FTS Relevance Ranking result for doc with doc_id @@ -686,10 +668,11 @@ fts_free( Run OPTIMIZE on the given table. 
@return DB_SUCCESS if all OK */ UNIV_INTERN -ulint +dberr_t fts_optimize_table( /*===============*/ - dict_table_t* table); /*!< in: table to optimiza */ + dict_table_t* table) /*!< in: table to optimiza */ + __attribute__((nonnull)); /**********************************************************************//** Startup the optimize thread and create the work queue. */ @@ -710,11 +693,12 @@ fts_optimize_is_init(void); Drops index ancillary tables for a FTS index @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_drop_index_tables( /*==================*/ trx_t* trx, /*!< in: transaction */ - dict_index_t* index); /*!< in: Index to drop */ + dict_index_t* index) /*!< in: Index to drop */ + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Remove the table from the OPTIMIZER's list. We do wait for @@ -740,24 +724,22 @@ fts_optimize_end(void); /*===================*/ /**********************************************************************//** -Take a FTS savepoint. -@return DB_SUCCESS or error code */ +Take a FTS savepoint. */ UNIV_INTERN void fts_savepoint_take( /*===============*/ trx_t* trx, /*!< in: transaction */ - const char* name); /*!< in: savepoint name */ - + const char* name) /*!< in: savepoint name */ + __attribute__((nonnull)); /**********************************************************************//** -Refresh last statement savepoint. -@return DB_SUCCESS or error code */ +Refresh last statement savepoint. */ UNIV_INTERN void fts_savepoint_laststmt_refresh( /*===========================*/ - trx_t* trx); /*!< in: transaction */ - + trx_t* trx) /*!< in: transaction */ + __attribute__((nonnull)); /**********************************************************************//** Release the savepoint data identified by name. 
*/ UNIV_INTERN @@ -821,26 +803,26 @@ fts_drop_orphaned_tables(void); /*==========================*/ /******************************************************************//** -Since we do a horizontal split on the index table, we need to drop the -all the split tables. */ +Since we do a horizontal split on the index table, we need to drop +all the split tables. +@return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_drop_index_split_tables( /*========================*/ - /*!< out: DB_SUCCESS - or error code */ trx_t* trx, /*!< in: transaction */ - dict_index_t* index); /*!< in: fts instance */ + dict_index_t* index) /*!< in: fts instance */ + __attribute__((nonnull, warn_unused_result)); /****************************************************************//** Run SYNC on the table, i.e., write out data from the cache to the -FTS auxiliary INDEX table and clear the cache at the end. -@return DB_SUCCESS if all OK */ +FTS auxiliary INDEX table and clear the cache at the end. */ UNIV_INTERN -ulint +void fts_sync_table( /*===========*/ - dict_table_t* table); /*!< in: table */ + dict_table_t* table) /*!< in: table */ + __attribute__((nonnull)); /****************************************************************//** Free the query graph but check whether dict_sys->mutex is already @@ -978,9 +960,9 @@ fts_get_docs_create( /****************************************************************//** Read the rows from the FTS index -@return vector of rows fetched */ +@return DB_SUCCESS if OK */ UNIV_INTERN -ulint +dberr_t fts_table_fetch_doc_ids( /*====================*/ trx_t* trx, /*!< in: transaction */ @@ -1011,12 +993,13 @@ fts_add_index( Drop auxiliary tables related to an FTS index @return DB_SUCCESS or error number */ UNIV_INTERN -ulint +dberr_t fts_drop_index( /*===========*/ dict_table_t* table, /*!< in: Table where indexes are dropped */ dict_index_t* index, /*!< in: Index to be dropped */ - trx_t* trx); /*!< in: Transaction for the drop */ + trx_t* trx) /*!< in: 
Transaction for the drop */ + __attribute__((nonnull)); /*******************************************************************//** Check indexes in the fts->indexes is also present in index cache and diff --git a/storage/innobase/include/fts0priv.h b/storage/innobase/include/fts0priv.h index 8524f988e47..c6aca27f6ec 100644 --- a/storage/innobase/include/fts0priv.h +++ b/storage/innobase/include/fts0priv.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -113,23 +113,25 @@ component. /******************************************************************//** Parse an SQL string. %s is replaced with the table's id. -@return DB_SUCCESS or error code */ +@return query graph */ UNIV_INTERN que_t* fts_parse_sql( /*==========*/ fts_table_t* fts_table, /*!< in: FTS aux table */ pars_info_t* info, /*!< in: info struct, or NULL */ - const char* sql); /*!< in: SQL string to evaluate */ + const char* sql) /*!< in: SQL string to evaluate */ + __attribute__((nonnull(3), malloc, warn_unused_result)); /******************************************************************//** Evaluate a parsed SQL statement @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_eval_sql( /*=========*/ trx_t* trx, /*!< in: transaction */ - que_t* graph); /*!< in: Parsed statement */ + que_t* graph) /*!< in: Parsed statement */ + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Construct the name of an ancillary FTS table for the given table. 
@return own: table name, must be freed with mem_free() */ @@ -138,7 +140,8 @@ char* fts_get_table_name( /*===============*/ const fts_table_t* - fts_table); /*!< in: FTS aux table info */ + fts_table) /*!< in: FTS aux table info */ + __attribute__((nonnull, malloc, warn_unused_result)); /******************************************************************//** Construct the column specification part of the SQL string for selecting the indexed FTS columns for the given table. Adds the necessary bound @@ -160,7 +163,8 @@ fts_get_select_columns_str( /*=======================*/ dict_index_t* index, /*!< in: FTS index */ pars_info_t* info, /*!< in/out: parser info */ - mem_heap_t* heap); /*!< in: memory heap */ + mem_heap_t* heap) /*!< in: memory heap */ + __attribute__((nonnull, warn_unused_result)); /** define for fts_doc_fetch_by_doc_id() "option" value, defines whether we want to get Doc whose ID is equal to or greater or smaller than supplied @@ -174,41 +178,45 @@ Fetch document (= a single row's indexed text) with the given document id. @return: DB_SUCCESS if fetch is successful, else error */ UNIV_INTERN -ulint +dberr_t fts_doc_fetch_by_doc_id( /*====================*/ fts_get_doc_t* get_doc, /*!< in: state */ doc_id_t doc_id, /*!< in: id of document to fetch */ - dict_index_t* index_to_use, /*!< in: caller supplied FTS index */ + dict_index_t* index_to_use, /*!< in: caller supplied FTS index, + or NULL */ ulint option, /*!< in: search option, if it is greater than doc_id or equal */ fts_sql_callback callback, /*!< in: callback to read records */ - void* arg); /*!< in: callback arg */ + void* arg) /*!< in: callback arg */ + __attribute__((nonnull(6))); /*******************************************************************//** Callback function for fetch that stores the text of an FTS document, converting each column to UTF-16. 
-@return: always returns NULL */ +@return always FALSE */ UNIV_INTERN ibool fts_query_expansion_fetch_doc( /*==========================*/ void* row, /*!< in: sel_node_t* */ - void* user_arg); /*!< in: fts_doc_t* */ + void* user_arg) /*!< in: fts_doc_t* */ + __attribute__((nonnull)); /******************************************************************** Write out a single word's data as new entry/entries in the INDEX table. @return DB_SUCCESS if all OK. */ UNIV_INTERN -ulint +dberr_t fts_write_node( /*===========*/ trx_t* trx, /*!< in: transaction */ que_t** graph, /*!< in: query graph */ fts_table_t* fts_table, /*!< in: the FTS aux index */ fts_string_t* word, /*!< in: word in UTF-8 */ - fts_node_t* node); /*!< in: node columns */ + fts_node_t* node) /*!< in: node columns */ + __attribute__((nonnull, warn_unused_result)); /*******************************************************************//** Tokenize a document. */ UNIV_INTERN @@ -217,8 +225,10 @@ fts_tokenize_document( /*==================*/ fts_doc_t* doc, /*!< in/out: document to tokenize */ - fts_doc_t* result); /*!< out: if provided, save + fts_doc_t* result) /*!< out: if provided, save result tokens here */ + __attribute__((nonnull(1))); + /*******************************************************************//** Continue to tokenize a document. */ UNIV_INTERN @@ -229,16 +239,18 @@ fts_tokenize_document_next( tokenize */ ulint add_pos, /*!< in: add this position to all tokens from this tokenization */ - fts_doc_t* result); /*!< out: if provided, save + fts_doc_t* result) /*!< out: if provided, save result tokens here */ + __attribute__((nonnull(1))); /******************************************************************//** -Create a new empty document. -@return own: new document */ +Initialize a document. 
*/ UNIV_INTERN -fts_doc_t* +void fts_doc_init( /*=========*/ - fts_doc_t* doc); /*!< in: doc to initialize */ + fts_doc_t* doc) /*!< in: doc to initialize */ + __attribute__((nonnull)); + /******************************************************************//** Do a binary search for a doc id in the array @return +ve index if found -ve index where it should be @@ -250,26 +262,29 @@ fts_bsearch( fts_update_t* array, /*!< in: array to sort */ int lower, /*!< in: lower bound of array*/ int upper, /*!< in: upper bound of array*/ - doc_id_t doc_id); /*!< in: doc id to lookup */ + doc_id_t doc_id) /*!< in: doc id to lookup */ + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Free document. */ UNIV_INTERN void fts_doc_free( /*=========*/ - fts_doc_t* doc); /*!< in: document */ + fts_doc_t* doc) /*!< in: document */ + __attribute__((nonnull)); /******************************************************************//** Free fts_optimizer_word_t instanace.*/ - +UNIV_INTERN void fts_word_free( /*==========*/ - fts_word_t* word); /*!< in: instance to free.*/ + fts_word_t* word) /*!< in: instance to free.*/ + __attribute__((nonnull)); /******************************************************************//** Read the rows from the FTS inde -@return vector of rows fetched */ +@return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_index_fetch_nodes( /*==================*/ trx_t* trx, /*!< in: transaction */ @@ -277,7 +292,8 @@ fts_index_fetch_nodes( fts_table_t* fts_table, /*!< in: FTS aux table */ const fts_string_t* word, /*!< in: the word to fetch */ - fts_fetch_t* fetch); /*!< in: fetch callback.*/ + fts_fetch_t* fetch) /*!< in: fetch callback.*/ + __attribute__((nonnull)); /******************************************************************//** Create a fts_optimizer_word_t instance. 
@return new instance */ @@ -287,7 +303,8 @@ fts_word_init( /*==========*/ fts_word_t* word, /*!< in: word to initialize */ byte* utf8, /*!< in: UTF-8 string */ - ulint len); /*!< in: length of string in bytes */ + ulint len) /*!< in: length of string in bytes */ + __attribute__((nonnull)); /******************************************************************//** Compare two fts_trx_table_t instances, we actually compare the table id's here. @@ -297,7 +314,8 @@ int fts_trx_table_cmp( /*==============*/ const void* v1, /*!< in: id1 */ - const void* v2); /*!< in: id2 */ + const void* v2) /*!< in: id2 */ + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Compare a table id with a trx_table_t table id. @return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ @@ -306,23 +324,26 @@ int fts_trx_table_id_cmp( /*=================*/ const void* p1, /*!< in: id1 */ - const void* p2); /*!< in: id2 */ + const void* p2) /*!< in: id2 */ + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Commit a transaction. @return DB_SUCCESS if all OK */ UNIV_INTERN -ulint +dberr_t fts_sql_commit( /*===========*/ - trx_t* trx); /*!< in: transaction */ + trx_t* trx) /*!< in: transaction */ + __attribute__((nonnull)); /******************************************************************//** Rollback a transaction. @return DB_SUCCESS if all OK */ UNIV_INTERN -ulint +dberr_t fts_sql_rollback( /*=============*/ - trx_t* trx); /*!< in: transaction */ + trx_t* trx) /*!< in: transaction */ + __attribute__((nonnull)); /******************************************************************//** Parse an SQL string. %s is replaced with the table's id. 
Don't acquire the dict mutex @@ -333,41 +354,44 @@ fts_parse_sql_no_dict_lock( /*=======================*/ fts_table_t* fts_table, /*!< in: table with FTS index */ pars_info_t* info, /*!< in: parser info */ - const char* sql); /*!< in: SQL string to evaluate */ + const char* sql) /*!< in: SQL string to evaluate */ + __attribute__((nonnull(3), malloc, warn_unused_result)); /******************************************************************//** Get value from config table. The caller must ensure that enough space is allocated for value to hold the column contents @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_config_get_value( /*=================*/ trx_t* trx, /* transaction */ fts_table_t* fts_table, /*!< in: the indexed FTS table */ const char* name, /*!< in: get config value for this parameter name */ - fts_string_t* value); /*!< out: value read from + fts_string_t* value) /*!< out: value read from config table */ + __attribute__((nonnull)); /******************************************************************//** Get value specific to an FTS index from the config table. The caller must ensure that enough space is allocated for value to hold the column contents. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_config_get_index_value( /*=======================*/ trx_t* trx, /*!< transaction */ dict_index_t* index, /*!< in: index */ const char* param, /*!< in: get config value for this parameter name */ - fts_string_t* value); /*!< out: value read from + fts_string_t* value) /*!< out: value read from config table */ + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Set the value in the config table for name. 
@return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_config_set_value( /*=================*/ trx_t* trx, /*!< transaction */ @@ -375,89 +399,96 @@ fts_config_set_value( const char* name, /*!< in: get config value for this parameter name */ const fts_string_t* - value); /*!< in: value to update */ + value) /*!< in: value to update */ + __attribute__((nonnull)); /****************************************************************//** Set an ulint value in the config table. @return DB_SUCCESS if all OK else error code */ UNIV_INTERN -ulint +dberr_t fts_config_set_ulint( /*=================*/ trx_t* trx, /*!< in: transaction */ fts_table_t* fts_table, /*!< in: the indexed FTS table */ const char* name, /*!< in: param name */ - ulint int_value); /*!< in: value */ - + ulint int_value) /*!< in: value */ + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Set the value specific to an FTS index in the config table. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_config_set_index_value( /*=======================*/ trx_t* trx, /*!< transaction */ dict_index_t* index, /*!< in: index */ const char* param, /*!< in: get config value for this parameter name */ - fts_string_t* value); /*!< out: value read from + fts_string_t* value) /*!< out: value read from config table */ + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Increment the value in the config table for column name. 
@return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_config_increment_value( /*=======================*/ trx_t* trx, /*!< transaction */ fts_table_t* fts_table, /*!< in: the indexed FTS table */ const char* name, /*!< in: increment config value for this parameter name */ - ulint delta); /*!< in: increment by this much */ + ulint delta) /*!< in: increment by this much */ + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Increment the per index value in the config table for column name. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_config_increment_index_value( /*=============================*/ trx_t* trx, /*!< transaction */ dict_index_t* index, /*!< in: FTS index */ const char* name, /*!< in: increment config value for this parameter name */ - ulint delta); /*!< in: increment by this much */ + ulint delta) /*!< in: increment by this much */ + __attribute__((nonnull)); /******************************************************************//** Get an ulint value from the config table. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_config_get_index_ulint( /*=======================*/ trx_t* trx, /*!< in: transaction */ dict_index_t* index, /*!< in: FTS index */ const char* name, /*!< in: param name */ - ulint* int_value); /*!< out: value */ + ulint* int_value) /*!< out: value */ + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Set an ulint value int the config table. 
@return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_config_set_index_ulint( /*=======================*/ trx_t* trx, /*!< in: transaction */ dict_index_t* index, /*!< in: FTS index */ const char* name, /*!< in: param name */ - ulint int_value); /*!< in: value */ + ulint int_value) /*!< in: value */ + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Get an ulint value from the config table. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_config_get_ulint( /*=================*/ trx_t* trx, /*!< in: transaction */ fts_table_t* fts_table, /*!< in: the indexed FTS table */ const char* name, /*!< in: param name */ - ulint* int_value); /*!< out: value */ + ulint* int_value) /*!< out: value */ + __attribute__((nonnull)); /******************************************************************//** Search cache for word. @return the word node vector if found else NULL */ @@ -468,7 +499,8 @@ fts_cache_find_word( const fts_index_cache_t* index_cache, /*!< in: cache to search */ const fts_string_t* - text); /*!< in: word to search for */ + text) /*!< in: word to search for */ + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Check cache for deleted doc id. @return TRUE if deleted */ @@ -478,7 +510,8 @@ fts_cache_is_deleted_doc_id( /*========================*/ const fts_cache_t* cache, /*!< in: cache ito search */ - doc_id_t doc_id); /*!< in: doc id to search for */ + doc_id_t doc_id) /*!< in: doc id to search for */ + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Append deleted doc ids to vector and sort the vector. 
*/ UNIV_INTERN @@ -502,35 +535,31 @@ fts_wait_for_background_thread_to_start( ulint max_wait); /*!< in: time in microseconds, if set to 0 then it disables timeout checking */ -/*********************************************************************//** -Get the total number of documents in the FTS. -@return estimated number of rows in the table */ -UNIV_INTERN -ulint -fts_get_total_document_count( -/*=========================*/ - dict_table_t* table); /*!< in: table instance */ +#ifdef FTS_DOC_STATS_DEBUG /******************************************************************//** Get the total number of words in the FTS for a particular FTS index. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t fts_get_total_word_count( /*=====================*/ trx_t* trx, /*!< in: transaction */ dict_index_t* index, /*!< in: for this index */ - ulint* total); /*!< out: total words */ + ulint* total) /*!< out: total words */ + __attribute__((nonnull, warn_unused_result)); +#endif /******************************************************************//** Search the index specific cache for a particular FTS index. @return the index specific cache else NULL */ UNIV_INTERN -const fts_index_cache_t* +fts_index_cache_t* fts_find_index_cache( /*================*/ const fts_cache_t* cache, /*!< in: cache to search */ const dict_index_t* - index); /*!< in: index to search for */ + index) /*!< in: index to search for */ + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Write the table id to the given buffer (including final NUL). Buffer must be at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long. 
@@ -539,8 +568,9 @@ UNIV_INLINE int fts_write_object_id( /*================*/ - ib_id_t id, /*!< in: a table/index id */ - char* str); /*!< in: buffer to write the id to */ + ib_id_t id, /*!< in: a table/index id */ + char* str) /*!< in: buffer to write the id to */ + __attribute__((nonnull)); /******************************************************************//** Read the table id from the string generated by fts_write_object_id(). @return TRUE if parse successful */ @@ -549,7 +579,8 @@ ibool fts_read_object_id( /*===============*/ ib_id_t* id, /*!< out: a table id */ - const char* str); /*!< in: buffer to read from */ + const char* str) /*!< in: buffer to read from */ + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Get the table id. @return number of bytes written */ @@ -559,23 +590,26 @@ fts_get_table_id( /*=============*/ const fts_table_t* fts_table, /*!< in: FTS Auxiliary table */ - char* table_id); /*!< out: table id, must be at least + char* table_id) /*!< out: table id, must be at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long */ + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Add the table to add to the OPTIMIZER's list. */ UNIV_INTERN void fts_optimize_add_table( /*===================*/ - dict_table_t* table); /*!< in: table to add */ + dict_table_t* table) /*!< in: table to add */ + __attribute__((nonnull)); /******************************************************************//** Optimize a table. */ UNIV_INTERN void fts_optimize_do_table( /*==================*/ - dict_table_t* table); /*!< in: table to optimize */ + dict_table_t* table) /*!< in: table to optimize */ + __attribute__((nonnull)); /******************************************************************//** Construct the prefix name of an FTS table. 
@return own: table name, must be freed with mem_free() */ @@ -584,7 +618,8 @@ char* fts_get_table_name_prefix( /*======================*/ const fts_table_t* - fts_table); /*!< in: Auxiliary table type */ + fts_table) /*!< in: Auxiliary table type */ + __attribute__((nonnull, malloc, warn_unused_result)); /******************************************************************//** Add node positions. */ UNIV_INTERN @@ -594,7 +629,8 @@ fts_cache_node_add_positions( fts_cache_t* cache, /*!< in: cache */ fts_node_t* node, /*!< in: word node */ doc_id_t doc_id, /*!< in: doc id */ - ib_vector_t* positions); /*!< in: fts_token_t::positions */ + ib_vector_t* positions) /*!< in: fts_token_t::positions */ + __attribute__((nonnull(2,4))); /******************************************************************//** Create the config table name for retrieving index specific value. @@ -604,7 +640,8 @@ char* fts_config_create_index_param_name( /*===============================*/ const char* param, /*!< in: base name of param */ - const dict_index_t* index); /*!< in: index for config */ + const dict_index_t* index) /*!< in: index for config */ + __attribute__((nonnull, malloc, warn_unused_result)); #ifndef UNIV_NONINL #include "fts0priv.ic" diff --git a/storage/innobase/include/fts0priv.ic b/storage/innobase/include/fts0priv.ic index 716ea4713b5..268bb7e2227 100644 --- a/storage/innobase/include/fts0priv.ic +++ b/storage/innobase/include/fts0priv.ic @@ -31,15 +31,9 @@ UNIV_INLINE int fts_write_object_id( /*================*/ - ib_id_t id, /* in: a table/index id */ + ib_id_t id, /* in: a table/index id */ char* str) /* in: buffer to write the id to */ { -#ifdef __WIN__ -# define UINT64PFx "%016I64u" -#else -# define UINT64PFx "%016"PRIx64 -# endif /* __WIN__ */ - // FIXME: Use ut_snprintf() return(sprintf(str, UINT64PFx, id)); } @@ -54,6 +48,45 @@ fts_read_object_id( ib_id_t* id, /* out: an id */ const char* str) /* in: buffer to read from */ { - return(sscanf(str, IB_ID_FMT, id) == 2); + 
return(sscanf(str, UINT64PFx, id) == 1); +} + +/******************************************************************//** +Compare two fts_trx_table_t instances. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_trx_table_cmp( +/*==============*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const dict_table_t* table1 = (*(const fts_trx_table_t**) p1)->table; + const dict_table_t* table2 = (*(const fts_trx_table_t**) p2)->table; + + return((table1->id > table2->id) + ? 1 + : (table1->id == table2->id) + ? 0 + : -1); } +/******************************************************************//** +Compare a table id with a fts_trx_table_t table id. +@return < 0 if n1 < n2, 0 if n1 == n2,> 0 if n1 > n2 */ +UNIV_INLINE +int +fts_trx_table_id_cmp( +/*=================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const ullint* table_id = (const ullint*) p1; + const dict_table_t* table2 = (*(const fts_trx_table_t**) p2)->table; + + return((*table_id > table2->id) + ? 1 + : (*table_id == table2->id) + ? 0 + : -1); +} diff --git a/storage/innobase/include/fts0types.h b/storage/innobase/include/fts0types.h index 5b28f2c9473..8fc52c9fc5e 100644 --- a/storage/innobase/include/fts0types.h +++ b/storage/innobase/include/fts0types.h @@ -32,41 +32,35 @@ Created 2007-03-27 Sunny Bains #include "ut0rbt.h" #include "fts0fts.h" -/** Types (aliases) used within FTS. */ -typedef struct fts_que_struct fts_que_t; -typedef struct fts_node_struct fts_node_t; -typedef struct fts_word_struct fts_word_t; -typedef struct fts_fetch_struct fts_fetch_t; -typedef struct fts_update_struct fts_update_t; -typedef struct fts_get_doc_struct fts_get_doc_t; -typedef struct fts_utf8_str_struct fts_utf8_str_t; -typedef struct fts_doc_stats_struct fts_doc_stats_t; -typedef struct fts_tokenizer_word_struct fts_tokenizer_word_t; -typedef struct fts_index_selector_struct fts_index_selector_t; +/** Types used within FTS. 
*/ +struct fts_que_t; +struct fts_node_t; +struct fts_utf8_str_t; /** Callbacks used within FTS. */ typedef pars_user_func_cb_t fts_sql_callback; typedef void (*fts_filter)(void*, fts_node_t*, void*, ulint len); /** Statistics relevant to a particular document, used during retrieval. */ -struct fts_doc_stats_struct { +struct fts_doc_stats_t { doc_id_t doc_id; /*!< Document id */ ulint word_count; /*!< Total words in the document */ }; /** It's main purpose is to store the SQL prepared statements that are required to retrieve a document from the database. */ -struct fts_get_doc_struct { +struct fts_get_doc_t { fts_index_cache_t* index_cache; /*!< The index cache instance */ /*!< Parsed sql statement */ que_t* get_document_graph; + fts_cache_t* cache; /*!< The parent cache */ }; /** Since we can have multiple FTS indexes on a table, we keep a per index cache of words etc. */ -struct fts_index_cache_struct { +struct fts_index_cache_t { dict_index_t* index; /*!< The FTS index instance */ ib_rbt_t* words; /*!< Nodes; indexed by fts_string_t*, @@ -88,7 +82,7 @@ struct fts_index_cache_struct { /** For supporting the tracking of updates on multiple FTS indexes we need to track which FTS indexes need to be updated. For INSERT and DELETE we update all fts indexes. */ -struct fts_update_struct { +struct fts_update_t { doc_id_t doc_id; /*!< The doc id affected */ ib_vector_t* fts_indexes; /*!< The FTS indexes that need to be @@ -100,7 +94,7 @@ struct fts_update_struct { }; /** Stop word control infotmation. */ -struct fts_stopword_struct { +struct fts_stopword_t { ulint status; /*!< Status of the stopword tree */ ib_alloc_t* heap; /*!< The memory allocator to use */ ib_rbt_t* cached_stopword;/*!< This stores all active stopwords */ @@ -109,7 +103,7 @@ struct fts_stopword_struct { /** The SYNC state of the cache. There is one instance of this struct associated with each ADD thread. 
*/ -struct fts_sync_struct { +struct fts_sync_t { trx_t* trx; /*!< The transaction used for SYNCing the cache to disk */ dict_table_t* table; /*!< Table with FTS index(es) */ @@ -131,12 +125,10 @@ struct fts_sync_struct { ib_time_t start_time; /*!< SYNC start time */ }; -typedef struct fts_sync_struct fts_sync_t; - /** The cache for the FTS system. It is a memory-based inverted index that new entries are added to, until it grows over the configured maximum size, at which time its contents are written to the INDEX table. */ -struct fts_cache_struct { +struct fts_cache_t { rw_lock_t lock; /*!< lock protecting all access to the memory buffer. FIXME: this needs to be our new upgrade-capable rw-lock */ @@ -145,11 +137,11 @@ struct fts_cache_struct { intialization, it has different SYNC level as above cache lock */ - mutex_t optimize_lock; /*!< Lock for OPTIMIZE */ + ib_mutex_t optimize_lock; /*!< Lock for OPTIMIZE */ - mutex_t deleted_lock; /*!< Lock covering deleted_doc_ids */ + ib_mutex_t deleted_lock; /*!< Lock covering deleted_doc_ids */ - mutex_t doc_id_lock; /*!< Lock covering Doc ID */ + ib_mutex_t doc_id_lock; /*!< Lock covering Doc ID */ ib_vector_t* deleted_doc_ids;/*!< Array of deleted doc ids, each element is of type fts_update_t */ @@ -200,7 +192,7 @@ struct fts_cache_struct { }; /** Columns of the FTS auxiliary INDEX table */ -struct fts_node_struct { +struct fts_node_t { doc_id_t first_doc_id; /*!< First document id in ilist. */ doc_id_t last_doc_id; /*!< Last document id in ilist. */ @@ -223,7 +215,7 @@ struct fts_node_struct { }; /** A tokenizer word. Contains information about one word. */ -struct fts_tokenizer_word_struct { +struct fts_tokenizer_word_t { fts_string_t text; /*!< Token text. 
*/ ib_vector_t* nodes; /*!< Word node ilists, each element is @@ -231,7 +223,7 @@ struct fts_tokenizer_word_struct { }; /** Word text plus it's array of nodes as on disk in FTS index */ -struct fts_word_struct { +struct fts_word_t { fts_string_t text; /*!< Word value in UTF-8 */ ib_vector_t* nodes; /*!< Nodes read from disk */ @@ -239,7 +231,7 @@ struct fts_word_struct { }; /** Callback for reading and filtering nodes that are read from FTS index */ -struct fts_fetch_struct { +struct fts_fetch_t { void* read_arg; /*!< Arg for the sql_callback */ fts_sql_callback @@ -248,7 +240,7 @@ struct fts_fetch_struct { }; /** For horizontally splitting an FTS auxiliary index */ -struct fts_index_selector_struct { +struct fts_index_selector_t { ulint value; /*!< Character value at which to split */ @@ -256,7 +248,7 @@ struct fts_index_selector_struct { }; /** This type represents a single document. */ -struct fts_doc_struct { +struct fts_doc_t { fts_string_t text; /*!< document text */ ibool found; /*!< TRUE if the document was found @@ -276,7 +268,7 @@ struct fts_doc_struct { }; /** A token and its positions within a document. */ -struct fts_token_struct { +struct fts_token_t { fts_string_t text; /*!< token text */ ib_vector_t* positions; /*!< an array of the positions the diff --git a/storage/innobase/include/fts0types.ic b/storage/innobase/include/fts0types.ic index 2734a331a86..b96c3f9dac8 100644 --- a/storage/innobase/include/fts0types.ic +++ b/storage/innobase/include/fts0types.ic @@ -37,46 +37,6 @@ extern const ulint UTF8_ERROR; #define fts_utf8_is_valid(b) (((b) & 0xC0) == 0x80) /******************************************************************//** -Compare two fts_trx_table_t instances. 
-@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ -UNIV_INLINE -int -fts_trx_table_cmp( -/*==============*/ - const void* p1, /*!< in: id1 */ - const void* p2) /*!< in: id2 */ -{ - const dict_table_t* table1 = (*(const fts_trx_table_t**) p1)->table; - const dict_table_t* table2 = (*(const fts_trx_table_t**) p2)->table; - - return((table1->id > table2->id) - ? 1 - : (table1->id == table2->id) - ? 0 - : -1); -} - -/******************************************************************//** -Compare a table id with a fts_trx_table_t table id. -@return < 0 if n1 < n2, 0 if n1 == n2,> 0 if n1 > n2 */ -UNIV_INLINE -int -fts_trx_table_id_cmp( -/*=================*/ - const void* p1, /*!< in: id1 */ - const void* p2) /*!< in: id2 */ -{ - const ullint* table_id = (const ullint*) p1; - const dict_table_t* table2 = (*(const fts_trx_table_t**) p2)->table; - - return((*table_id > table2->id) - ? 1 - : (*table_id == table2->id) - ? 0 - : -1); -} - -/******************************************************************//** Duplicate an UTF-8 string. 
@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ UNIV_INLINE diff --git a/storage/innobase/include/ha0ha.h b/storage/innobase/include/ha0ha.h index 1a2b8dac014..2e4397ea5fc 100644 --- a/storage/innobase/include/ha0ha.h +++ b/storage/innobase/include/ha0ha.h @@ -221,10 +221,7 @@ ha_print_info( #endif /* !UNIV_HOTBACKUP */ /** The hash table external chain node */ -typedef struct ha_node_struct ha_node_t; - -/** The hash table external chain node */ -struct ha_node_struct { +struct ha_node_t { ha_node_t* next; /*!< next chain node or NULL if none */ #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG buf_block_t* block; /*!< buffer block containing the data, or NULL */ diff --git a/storage/innobase/include/ha0storage.h b/storage/innobase/include/ha0storage.h index caf42abfcfe..0073930b502 100644 --- a/storage/innobase/include/ha0storage.h +++ b/storage/innobase/include/ha0storage.h @@ -39,7 +39,7 @@ constant per ha_storage's lifetime. */ #define HA_STORAGE_DEFAULT_HASH_CELLS 4096 /** Hash storage */ -typedef struct ha_storage_struct ha_storage_t; +struct ha_storage_t; /*******************************************************************//** Creates a hash storage. 
If any of the parameters is 0, then a default diff --git a/storage/innobase/include/ha0storage.ic b/storage/innobase/include/ha0storage.ic index ce6e7406b43..7150ca045ec 100644 --- a/storage/innobase/include/ha0storage.ic +++ b/storage/innobase/include/ha0storage.ic @@ -31,7 +31,7 @@ Created September 24, 2007 Vasil Dimov #include "mem0mem.h" /** Hash storage for strings */ -struct ha_storage_struct { +struct ha_storage_t { mem_heap_t* heap; /*!< memory heap from which memory is allocated */ hash_table_t* hash; /*!< hash table used to avoid @@ -39,9 +39,7 @@ struct ha_storage_struct { }; /** Objects of this type are stored in ha_storage_t */ -typedef struct ha_storage_node_struct ha_storage_node_t; -/** Objects of this type are stored in ha_storage_struct */ -struct ha_storage_node_struct { +struct ha_storage_node_t { ulint data_len;/*!< length of the data */ const void* data; /*!< pointer to data */ ha_storage_node_t* next; /*!< next node in hash chain */ diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h index 5512bf7c62f..fb4b0120bbb 100644 --- a/storage/innobase/include/ha_prototypes.h +++ b/storage/innobase/include/ha_prototypes.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2006, 2010, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2006, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -28,15 +28,19 @@ Created 5/11/2006 Osku Salerma #define HA_INNODB_PROTOTYPES_H #include "my_dbug.h" +#include "mysqld_error.h" #include "my_compare.h" #include "my_sys.h" #include "m_string.h" +#include "debug_sync.h" +#include "my_base.h" #include "trx0types.h" #include "m_ctype.h" /* CHARSET_INFO */ -// Forward declaration -typedef struct fts_string_struct fts_string_t; +// Forward declarations +class Field; +struct fts_string_t; /*********************************************************************//** Wrapper around MySQL's copy_and_convert function. @@ -105,7 +109,7 @@ innobase_convert_name( ulint buflen, /*!< in: length of buf, in bytes */ const char* id, /*!< in: identifier to convert */ ulint idlen, /*!< in: length of id, in bytes */ - void* thd, /*!< in: MySQL connection thread, or NULL */ + THD* thd, /*!< in: MySQL connection thread, or NULL */ ibool table_id);/*!< in: TRUE=id is a table or database name; FALSE=id is an index name */ @@ -120,7 +124,19 @@ UNIV_INTERN ibool thd_is_replication_slave_thread( /*============================*/ - void* thd); /*!< in: thread handle (THD*) */ + THD* thd); /*!< in: thread handle */ + +/******************************************************************//** +Gets information on the durability property requested by thread. +Used when writing either a prepare or commit record to the log +buffer. +@return the durability property. 
*/ +UNIV_INTERN +enum durability_properties +thd_requested_durability( +/*=====================*/ + const THD* thd) /*!< in: thread handle */ + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Returns true if the transaction this thread is processing has edited @@ -132,7 +148,7 @@ UNIV_INTERN ibool thd_has_edited_nontrans_tables( /*===========================*/ - void* thd); /*!< in: thread handle (THD*) */ + THD* thd); /*!< in: thread handle */ /*************************************************************//** Prints info of a THD object (== user session thread) to the given file. */ @@ -141,21 +157,10 @@ void innobase_mysql_print_thd( /*=====================*/ FILE* f, /*!< in: output stream */ - void* thd, /*!< in: pointer to a MySQL THD object */ + THD* thd, /*!< in: pointer to a MySQL THD object */ uint max_query_len); /*!< in: max query length to print, or 0 to use the default max length */ -/*****************************************************************//** -Log code calls this whenever log has been written and/or flushed up -to a new position. We use this to notify upper layer of a new commit -checkpoint when necessary.*/ -UNIV_INTERN -void -innobase_mysql_log_notify( -/*===============*/ - ib_uint64_t write_lsn, /*!< in: LSN written to log file */ - ib_uint64_t flush_lsn); /*!< in: LSN flushed to disk */ - /*************************************************************//** InnoDB uses this function to compare two data fields for which the data type is such that we must use MySQL code to compare them. @@ -173,6 +178,18 @@ innobase_mysql_cmp( unsigned int b_length) /*!< in: data field length, not UNIV_SQL_NULL */ __attribute__((nonnull, warn_unused_result)); + +/*****************************************************************//** +Log code calls this whenever log has been written and/or flushed up +to a new position. 
We use this to notify upper layer of a new commit +checkpoint when necessary.*/ +extern "C" UNIV_INTERN +void +innobase_mysql_log_notify( +/*===============*/ + ib_uint64_t write_lsn, /*!< in: LSN written to log file */ + ib_uint64_t flush_lsn); /*!< in: LSN flushed to disk */ + /**************************************************************//** Converts a MySQL type to an InnoDB type. Note that this function returns the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1 @@ -233,11 +250,11 @@ innobase_basename( /******************************************************************//** Returns true if the thread is executing a SELECT statement. @return true if thd is executing SELECT */ - +UNIV_INTERN ibool thd_is_select( /*==========*/ - const void* thd); /*!< in: thread handle (THD*) */ + const THD* thd); /*!< in: thread handle */ /******************************************************************//** Converts an identifier to a table name. */ @@ -276,7 +293,7 @@ UNIV_INTERN struct charset_info_st* innobase_get_charset( /*=================*/ - void* mysql_thd); /*!< in: MySQL thread handle */ + THD* thd); /*!< in: MySQL thread handle */ /**********************************************************************//** Determines the current SQL statement. 
@return SQL statement string */ @@ -284,7 +301,7 @@ UNIV_INTERN const char* innobase_get_stmt( /*==============*/ - void* mysql_thd, /*!< in: MySQL thread handle */ + THD* thd, /*!< in: MySQL thread handle */ size_t* length) /*!< out: length of the SQL statement */ __attribute__((nonnull)); /******************************************************************//** @@ -321,17 +338,17 @@ UNIV_INTERN ibool thd_supports_xa( /*============*/ - void* thd); /*!< in: thread handle (THD*), or NULL to query + THD* thd); /*!< in: thread handle, or NULL to query the global innodb_supports_xa */ /******************************************************************//** Returns the lock wait timeout for the current connection. @return the lock wait timeout, in seconds */ - +UNIV_INTERN ulong thd_lock_wait_timeout( /*==================*/ - void* thd); /*!< in: thread handle (THD*), or NULL to query + THD* thd); /*!< in: thread handle, or NULL to query the global innodb_lock_wait_timeout */ /******************************************************************//** Add up the time waited for the lock for the current query. */ @@ -339,7 +356,7 @@ UNIV_INTERN void thd_set_lock_wait_time( /*===================*/ - void* thd, /*!< in: thread handle (THD*) */ + THD* thd, /*!< in/out: thread handle */ ulint value); /*!< in: time waited for the lock */ /**********************************************************************//** @@ -363,6 +380,15 @@ ulint innobase_get_lower_case_table_names(void); /*=====================================*/ +/*****************************************************************//** +Frees a possible InnoDB trx object associated with the current THD. +@return 0 or error number */ +UNIV_INTERN +int +innobase_close_thd( +/*===============*/ + THD* thd); /*!< in: MySQL thread handle for + which to close the connection */ /*************************************************************//** Get the next token from the given string and store it in *token. 
*/ UNIV_INTERN @@ -414,7 +440,7 @@ UNIV_INTERN ibool thd_trx_is_read_only( /*=================*/ - void* thd); /*!< in: thread handle (THD*) */ + THD* thd); /*!< in/out: thread handle */ /******************************************************************//** Check if the transaction is an auto-commit transaction. TRUE also @@ -424,5 +450,139 @@ UNIV_INTERN ibool thd_trx_is_auto_commit( /*===================*/ - void* thd); /*!< in: thread handle (THD*) can be NULL */ + THD* thd); /*!< in: thread handle, or NULL */ + +/*****************************************************************//** +A wrapper function of innobase_convert_name(), convert a table or +index name to the MySQL system_charset_info (UTF-8) and quote it if needed. +@return pointer to the end of buf */ +UNIV_INTERN +void +innobase_format_name( +/*==================*/ + char* buf, /*!< out: buffer for converted + identifier */ + ulint buflen, /*!< in: length of buf, in bytes */ + const char* name, /*!< in: index or table name + to format */ + ibool is_index_name) /*!< in: index name */ + __attribute__((nonnull)); + +/** Corresponds to Sql_condition:enum_warning_level. */ +enum ib_log_level_t { + IB_LOG_LEVEL_INFO, + IB_LOG_LEVEL_WARN, + IB_LOG_LEVEL_ERROR, + IB_LOG_LEVEL_FATAL +}; + +/******************************************************************//** +Use this when the args are first converted to a formatted string and then +passed to the format string from errmsg-utf8.txt. The error message format +must be: "Some string ... %s". + +Push a warning message to the client, it is a wrapper around: + +void push_warning_printf( + THD *thd, Sql_condition::enum_warning_level level, + uint code, const char *format, ...); +*/ +UNIV_INTERN +void +ib_errf( +/*====*/ + THD* thd, /*!< in/out: session */ + ib_log_level_t level, /*!< in: warning level */ + ib_uint32_t code, /*!< MySQL error code */ + const char* format, /*!< printf format */ + ...) 
/*!< Args */ + __attribute__((format(printf, 4, 5))); + +/******************************************************************//** +Use this when the args are passed to the format string from +errmsg-utf8.txt directly as is. + +Push a warning message to the client, it is a wrapper around: + +void push_warning_printf( + THD *thd, Sql_condition::enum_warning_level level, + uint code, const char *format, ...); +*/ +UNIV_INTERN +void +ib_senderrf( +/*========*/ + THD* thd, /*!< in/out: session */ + ib_log_level_t level, /*!< in: warning level */ + ib_uint32_t code, /*!< MySQL error code */ + ...); /*!< Args */ + +/******************************************************************//** +Write a message to the MySQL log, prefixed with "InnoDB: ". +Wrapper around sql_print_information() */ +UNIV_INTERN +void +ib_logf( +/*====*/ + ib_log_level_t level, /*!< in: warning level */ + const char* format, /*!< printf format */ + ...) /*!< Args */ + __attribute__((format(printf, 2, 3))); + +/******************************************************************//** +Returns the NUL terminated value of glob_hostname. +@return pointer to glob_hostname. */ +UNIV_INTERN +const char* +server_get_hostname(); +/*=================*/ + +/******************************************************************//** +Get the error message format string. +@return the format string or 0 if not found. */ +UNIV_INTERN +const char* +innobase_get_err_msg( +/*=================*/ + int error_code); /*!< in: MySQL error code */ + +/*********************************************************************//** +Compute the next autoinc value. + +For MySQL replication the autoincrement values can be partitioned among +the nodes. The offset is the start or origin of the autoincrement value +for a particular node. For n nodes the increment will be n and the offset +will be in the interval [1, n]. The formula tries to allocate the next +value for a particular node. 
+ +Note: This function is also called with increment set to the number of +values we want to reserve for multi-value inserts e.g., + + INSERT INTO T VALUES(), (), (); + +innobase_next_autoinc() will be called with increment set to 3 where +autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for +the multi-value INSERT above. +@return the next value */ +UNIV_INTERN +ulonglong +innobase_next_autoinc( +/*==================*/ + ulonglong current, /*!< in: Current value */ + ulonglong need, /*!< in: count of values needed */ + ulonglong step, /*!< in: AUTOINC increment step */ + ulonglong offset, /*!< in: AUTOINC offset */ + ulonglong max_value) /*!< in: max value for type */ + __attribute__((pure, warn_unused_result)); + +/********************************************************************//** +Get the upper limit of the MySQL integral and floating-point type. +@return maximum allowed value for the field */ +UNIV_INTERN +ulonglong +innobase_get_int_col_max_value( +/*===========================*/ + const Field* field) /*!< in: MySQL field */ + __attribute__((nonnull, pure, warn_unused_result)); + #endif /* HA_INNODB_PROTOTYPES_H */ diff --git a/storage/innobase/include/handler0alter.h b/storage/innobase/include/handler0alter.h index c5d439ef21b..52aaf2d25ef 100644 --- a/storage/innobase/include/handler0alter.h +++ b/storage/innobase/include/handler0alter.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2005, 2010, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2005, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -27,11 +27,34 @@ UNIV_INTERN void innobase_rec_to_mysql( /*==================*/ - struct TABLE* table, /*!< in/out: MySQL table */ - const rec_t* rec, /*!< in: record */ - const dict_index_t* index, /*!< in: index */ - const ulint* offsets); /*!< in: rec_get_offsets( - rec, index, ...) */ + struct TABLE* table, /*!< in/out: MySQL table */ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: index */ + const ulint* offsets)/*!< in: rec_get_offsets( + rec, index, ...) */ + __attribute__((nonnull)); + +/*************************************************************//** +Copies an InnoDB index entry to table->record[0]. */ +UNIV_INTERN +void +innobase_fields_to_mysql( +/*=====================*/ + struct TABLE* table, /*!< in/out: MySQL table */ + const dict_index_t* index, /*!< in: InnoDB index */ + const dfield_t* fields) /*!< in: InnoDB index fields */ + __attribute__((nonnull)); + +/*************************************************************//** +Copies an InnoDB row to table->record[0]. */ +UNIV_INTERN +void +innobase_row_to_mysql( +/*==================*/ + struct TABLE* table, /*!< in/out: MySQL table */ + const dict_table_t* itab, /*!< in: InnoDB table */ + const dtuple_t* row) /*!< in: InnoDB row */ + __attribute__((nonnull)); /*************************************************************//** Resets table->record[0]. */ @@ -39,4 +62,53 @@ UNIV_INTERN void innobase_rec_reset( /*===============*/ - struct TABLE* table); /*!< in/out: MySQL table */ + struct TABLE* table) /*!< in/out: MySQL table */ + __attribute__((nonnull)); + +/** Generate the next autoinc based on a snapshot of the session +auto_increment_increment and auto_increment_offset variables. 
*/ +struct ib_sequence_t { + + /** + @param thd - the session + @param start_value - the lower bound + @param max_value - the upper bound (inclusive) */ + ib_sequence_t(THD* thd, ulonglong start_value, ulonglong max_value); + + /** + Postfix increment + @return the value to insert */ + ulonglong operator++(int) UNIV_NOTHROW; + + /** Check if the autoinc "sequence" is exhausted. + @return true if the sequence is exhausted */ + bool eof() const UNIV_NOTHROW + { + return(m_eof); + } + + /** + @return the next value in the sequence */ + ulonglong last() const UNIV_NOTHROW + { + ut_ad(m_next_value > 0); + + return(m_next_value); + } + + /** Maximum calumn value if adding an AUTOINC column else 0. Once + we reach the end of the sequence it will be set to ~0. */ + const ulonglong m_max_value; + + /** Value of auto_increment_increment */ + ulong m_increment; + + /** Value of auto_increment_offset */ + ulong m_offset; + + /** Next value in the sequence */ + ulonglong m_next_value; + + /** true if no more values left in the sequence */ + bool m_eof; +}; diff --git a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h index 1c19ea53a23..6f9a628df5d 100644 --- a/storage/innobase/include/hash0hash.h +++ b/storage/innobase/include/hash0hash.h @@ -33,8 +33,8 @@ Created 5/20/1997 Heikki Tuuri # include "sync0rw.h" #endif /* !UNIV_HOTBACKUP */ -typedef struct hash_table_struct hash_table_t; -typedef struct hash_cell_struct hash_cell_t; +struct hash_table_t; +struct hash_cell_t; typedef void* hash_node_t; @@ -382,7 +382,7 @@ hash_get_heap( Gets the nth mutex in a hash table. @return mutex */ UNIV_INLINE -mutex_t* +ib_mutex_t* hash_get_nth_mutex( /*===============*/ hash_table_t* table, /*!< in: hash table */ @@ -400,7 +400,7 @@ hash_get_nth_lock( Gets the mutex for a fold value in a hash table. 
@return mutex */ UNIV_INLINE -mutex_t* +ib_mutex_t* hash_get_mutex( /*===========*/ hash_table_t* table, /*!< in: hash table */ @@ -451,7 +451,7 @@ void hash_mutex_exit_all_but( /*====================*/ hash_table_t* table, /*!< in: hash table */ - mutex_t* keep_mutex); /*!< in: mutex to keep */ + ib_mutex_t* keep_mutex); /*!< in: mutex to keep */ /************************************************************//** s-lock a lock for a fold value in a hash table. */ UNIV_INTERN @@ -524,12 +524,12 @@ hash_unlock_x_all_but( # define hash_unlock_x_all_but(t, l) ((void) 0) #endif /* !UNIV_HOTBACKUP */ -struct hash_cell_struct{ +struct hash_cell_t{ void* node; /*!< hash chain node, NULL if none */ }; /* The hash table structure */ -struct hash_table_struct { +struct hash_table_t { enum hash_table_sync_t type; /*<! type of hash_table. */ #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG # ifndef UNIV_HOTBACKUP @@ -547,7 +547,7 @@ struct hash_table_struct { rw_locks depending on the type. Must be a power of 2 */ union { - mutex_t* mutexes;/* NULL, or an array of mutexes + ib_mutex_t* mutexes;/* NULL, or an array of mutexes used to protect segments of the hash table */ rw_lock_t* rw_locks;/* NULL, or an array of rw_lcoks diff --git a/storage/innobase/include/hash0hash.ic b/storage/innobase/include/hash0hash.ic index 1e5474601d5..254f3f82e5d 100644 --- a/storage/innobase/include/hash0hash.ic +++ b/storage/innobase/include/hash0hash.ic @@ -150,7 +150,7 @@ hash_get_heap( Gets the nth mutex in a hash table. @return mutex */ UNIV_INLINE -mutex_t* +ib_mutex_t* hash_get_nth_mutex( /*===============*/ hash_table_t* table, /*!< in: hash table */ @@ -168,7 +168,7 @@ hash_get_nth_mutex( Gets the mutex for a fold value in a hash table. 
@return mutex */ UNIV_INLINE -mutex_t* +ib_mutex_t* hash_get_mutex( /*===========*/ hash_table_t* table, /*!< in: hash table */ diff --git a/storage/innobase/include/ibuf0ibuf.h b/storage/innobase/include/ibuf0ibuf.h index f405ebf8d11..e64f067d364 100644 --- a/storage/innobase/include/ibuf0ibuf.h +++ b/storage/innobase/include/ibuf0ibuf.h @@ -376,24 +376,16 @@ will be merged from ibuf trees to the pages read, 0 if ibuf is empty */ UNIV_INTERN ulint -ibuf_contract( -/*==========*/ - ibool sync); /*!< in: TRUE if the caller wants to wait for the - issued read with the highest tablespace address - to complete */ -/*********************************************************************//** -Contracts insert buffer trees by reading pages to the buffer pool. -@return a lower limit for the combined size in bytes of entries which -will be merged from ibuf trees to the pages read, 0 if ibuf is -empty */ -UNIV_INTERN -ulint ibuf_contract_in_background( /*========================*/ - ibool full); /*!< in: TRUE if the caller wants to do a full - contract based on PCT_IO(100). If FALSE then - the size of contract batch is determined based - on the current size of the ibuf tree. */ + table_id_t table_id, /*!< in: if merge should be done only + for a specific table, for all tables + this should be 0 */ + ibool full); /*!< in: TRUE if the caller wants to + do a full contract based on PCT_IO(100). + If FALSE then the size of contract + batch is determined based on the + current size of the ibuf tree. */ #endif /* !UNIV_HOTBACKUP */ /*********************************************************************//** Parses a redo log record of an ibuf bitmap page init. @@ -449,6 +441,17 @@ void ibuf_close(void); /*============*/ +/******************************************************************//** +Checks the insert buffer bitmaps on IMPORT TABLESPACE. 
+@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +ibuf_check_bitmap_on_import( +/*========================*/ + const trx_t* trx, /*!< in: transaction */ + ulint space_id) /*!< in: tablespace identifier */ + __attribute__((nonnull, warn_unused_result)); + #define IBUF_HEADER_PAGE_NO FSP_IBUF_HEADER_PAGE_NO #define IBUF_TREE_ROOT_PAGE_NO FSP_IBUF_TREE_ROOT_PAGE_NO @@ -462,36 +465,6 @@ for the file segment from which the pages for the ibuf tree are allocated */ /* The insert buffer tree itself is always located in space 0. */ #define IBUF_SPACE_ID 0 -/** Insert buffer struct */ -struct ibuf_struct{ - ulint size; /*!< current size of the ibuf index - tree, in pages */ - ulint max_size; /*!< recommended maximum size of the - ibuf index tree, in pages */ - ulint seg_size; /*!< allocated pages of the file - segment containing ibuf header and - tree */ - ibool empty; /*!< Protected by the page - latch of the root page of the - insert buffer tree - (FSP_IBUF_TREE_ROOT_PAGE_NO). TRUE - if and only if the insert - buffer tree is empty. 
*/ - ulint free_list_len; /*!< length of the free list */ - ulint height; /*!< tree height */ - dict_index_t* index; /*!< insert buffer index */ - - ulint n_merges; /*!< number of pages merged */ - ulint n_merged_ops[IBUF_OP_COUNT]; - /*!< number of operations of each type - merged to index pages */ - ulint n_discarded_ops[IBUF_OP_COUNT]; - /*!< number of operations of each type - discarded without merging due to the - tablespace being deleted or the - index being dropped */ -}; - #ifndef UNIV_NONINL #include "ibuf0ibuf.ic" #endif diff --git a/storage/innobase/include/ibuf0ibuf.ic b/storage/innobase/include/ibuf0ibuf.ic index 8a4ec633b01..92ca2cbb9a2 100644 --- a/storage/innobase/include/ibuf0ibuf.ic +++ b/storage/innobase/include/ibuf0ibuf.ic @@ -58,6 +58,36 @@ ibuf_mtr_commit( mtr_commit(mtr); } +/** Insert buffer struct */ +struct ibuf_t{ + ulint size; /*!< current size of the ibuf index + tree, in pages */ + ulint max_size; /*!< recommended maximum size of the + ibuf index tree, in pages */ + ulint seg_size; /*!< allocated pages of the file + segment containing ibuf header and + tree */ + ibool empty; /*!< Protected by the page + latch of the root page of the + insert buffer tree + (FSP_IBUF_TREE_ROOT_PAGE_NO). TRUE + if and only if the insert + buffer tree is empty. */ + ulint free_list_len; /*!< length of the free list */ + ulint height; /*!< tree height */ + dict_index_t* index; /*!< insert buffer index */ + + ulint n_merges; /*!< number of pages merged */ + ulint n_merged_ops[IBUF_OP_COUNT]; + /*!< number of operations of each type + merged to index pages */ + ulint n_discarded_ops[IBUF_OP_COUNT]; + /*!< number of operations of each type + discarded without merging due to the + tablespace being deleted or the + index being dropped */ +}; + /************************************************************************//** Sets the free bit of the page in the ibuf bitmap. 
This is done in a separate mini-transaction, hence this operation does not restrict further work to only @@ -97,6 +127,7 @@ ibuf_should_try( return(ibuf_use != IBUF_USE_NONE && ibuf->max_size != 0 && !dict_index_is_clust(index) + && index->table->quiesce == QUIESCE_NONE && (ignore_sec_unique || !dict_index_is_unique(index))); } diff --git a/storage/innobase/include/ibuf0types.h b/storage/innobase/include/ibuf0types.h index e404b62a011..3fdbf078b0b 100644 --- a/storage/innobase/include/ibuf0types.h +++ b/storage/innobase/include/ibuf0types.h @@ -26,6 +26,6 @@ Created 7/29/1997 Heikki Tuuri #ifndef ibuf0types_h #define ibuf0types_h -typedef struct ibuf_struct ibuf_t; +struct ibuf_t; #endif diff --git a/storage/innobase/include/lock0iter.h b/storage/innobase/include/lock0iter.h index 42b4f7281e4..0054850b526 100644 --- a/storage/innobase/include/lock0iter.h +++ b/storage/innobase/include/lock0iter.h @@ -29,13 +29,13 @@ Created July 16, 2007 Vasil Dimov #include "univ.i" #include "lock0types.h" -typedef struct lock_queue_iterator_struct { +struct lock_queue_iterator_t { const lock_t* current_lock; /* In case this is a record lock queue (not table lock queue) then bit_no is the record number within the heap in which the record is stored. */ ulint bit_no; -} lock_queue_iterator_t; +}; /*******************************************************************//** Initialize lock queue iterator so that it starts to iterate from diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h index a1ffe87d5bd..8e6fdaed3d5 100644 --- a/storage/innobase/include/lock0lock.h +++ b/storage/innobase/include/lock0lock.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -275,7 +275,7 @@ the query thread to the lock wait state and inserts a waiting request for a gap x-lock to the lock queue. @return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ UNIV_INTERN -ulint +dberr_t lock_rec_insert_check_and_lock( /*===========================*/ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is @@ -285,10 +285,11 @@ lock_rec_insert_check_and_lock( dict_index_t* index, /*!< in: index */ que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr, /*!< in/out: mini-transaction */ - ibool* inherit);/*!< out: set to TRUE if the new + ibool* inherit)/*!< out: set to TRUE if the new inserted record maybe should inherit LOCK_GAP type locks from the successor record */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Checks if locks of other transactions prevent an immediate modify (update, delete mark, or delete unmark) of a clustered index record. If they do, @@ -298,7 +299,7 @@ lock wait state and inserts a waiting request for a record x-lock to the lock queue. @return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ UNIV_INTERN -ulint +dberr_t lock_clust_rec_modify_check_and_lock( /*=================================*/ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG @@ -308,13 +309,14 @@ lock_clust_rec_modify_check_and_lock( modified */ dict_index_t* index, /*!< in: clustered index */ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ - que_thr_t* thr); /*!< in: query thread */ + que_thr_t* thr) /*!< in: query thread */ + __attribute__((warn_unused_result, nonnull)); /*********************************************************************//** Checks if locks of other transactions prevent an immediate modify (delete mark or delete unmark) of a secondary index record. 
@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ UNIV_INTERN -ulint +dberr_t lock_sec_rec_modify_check_and_lock( /*===============================*/ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG @@ -326,15 +328,17 @@ lock_sec_rec_modify_check_and_lock( clustered index record first: see the comment below */ dict_index_t* index, /*!< in: secondary index */ - que_thr_t* thr, /*!< in: query thread */ - mtr_t* mtr); /*!< in/out: mini-transaction */ + que_thr_t* thr, /*!< in: query thread + (can be NULL if BTR_NO_LOCKING_FLAG) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((warn_unused_result, nonnull(2,3,4,6))); /*********************************************************************//** Like lock_clust_rec_read_check_and_lock(), but reads a secondary index record. @return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ UNIV_INTERN -enum db_err +dberr_t lock_sec_rec_read_check_and_lock( /*=============================*/ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG @@ -364,7 +368,7 @@ lock on the record. @return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ UNIV_INTERN -enum db_err +dberr_t lock_clust_rec_read_check_and_lock( /*===============================*/ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG @@ -395,7 +399,7 @@ lock_clust_rec_read_check_and_lock() that does not require the parameter "offsets". 
@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ UNIV_INTERN -ulint +dberr_t lock_clust_rec_read_check_and_lock_alt( /*===================================*/ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG @@ -413,13 +417,14 @@ lock_clust_rec_read_check_and_lock_alt( SELECT FOR UPDATE */ ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or LOCK_REC_NOT_GAP */ - que_thr_t* thr); /*!< in: query thread */ + que_thr_t* thr) /*!< in: query thread */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Checks that a record is seen in a consistent read. -@return TRUE if sees, or FALSE if an earlier version of the record +@return true if sees, or false if an earlier version of the record should be retrieved */ UNIV_INTERN -ibool +bool lock_clust_rec_cons_read_sees( /*==========================*/ const rec_t* rec, /*!< in: user record which should be read or @@ -431,26 +436,27 @@ lock_clust_rec_cons_read_sees( Checks that a non-clustered index record is seen in a consistent read. NOTE that a non-clustered index page contains so little information on -its modifications that also in the case FALSE, the present version of +its modifications that also in the case false, the present version of rec may be the right, but we must check this from the clustered index record. 
-@return TRUE if certainly sees, or FALSE if an earlier version of the +@return true if certainly sees, or false if an earlier version of the clustered index record might be needed */ UNIV_INTERN -ulint +bool lock_sec_rec_cons_read_sees( /*========================*/ const rec_t* rec, /*!< in: user record which should be read or passed over by a read cursor */ - const read_view_t* view); /*!< in: consistent read view */ + const read_view_t* view) /*!< in: consistent read view */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Locks the specified database table in the mode given. If the lock cannot be granted immediately, the query thread is put to wait. @return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ UNIV_INTERN -ulint +dberr_t lock_table( /*=======*/ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is set, @@ -458,7 +464,8 @@ lock_table( dict_table_t* table, /*!< in/out: database table in dictionary cache */ enum lock_mode mode, /*!< in: lock mode */ - que_thr_t* thr); /*!< in: query thread */ + que_thr_t* thr) /*!< in: query thread */ + __attribute__((nonnull, warn_unused_result)); /*************************************************************//** Removes a granted record lock of a transaction from the queue and grants locks to other transactions waiting in the queue if they now are entitled @@ -780,7 +787,7 @@ was selected as a deadlock victim, or if it has to wait then cancel the wait lock. 
@return DB_DEADLOCK, DB_LOCK_WAIT or DB_SUCCESS */ UNIV_INTERN -enum db_err +dberr_t lock_trx_handle_wait( /*=================*/ trx_t* trx) /*!< in/out: trx lock state */ @@ -864,29 +871,35 @@ lock_trx_has_sys_table_locks( remains set when the waiting lock is granted, or if the lock is inherited to a neighboring record */ -#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION)&LOCK_MODE_MASK +#define LOCK_CONV_BY_OTHER 4096 /*!< this bit is set when the lock is created + by other transaction */ +#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION|LOCK_CONV_BY_OTHER)&LOCK_MODE_MASK # error #endif -#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION)&LOCK_TYPE_MASK +#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION|LOCK_CONV_BY_OTHER)&LOCK_TYPE_MASK # error #endif /* @} */ +/** Checks if this is a waiting lock created by lock->trx itself. +@param type_mode lock->type_mode +@return whether it is a waiting lock belonging to lock->trx */ +#define lock_is_wait_not_by_other(type_mode) \ + ((type_mode & (LOCK_CONV_BY_OTHER | LOCK_WAIT)) == LOCK_WAIT) + /** Lock operation struct */ -typedef struct lock_op_struct lock_op_t; -/** Lock operation struct */ -struct lock_op_struct{ +struct lock_op_t{ dict_table_t* table; /*!< table to be locked */ enum lock_mode mode; /*!< lock mode */ }; /** The lock system struct */ -struct lock_sys_struct{ - mutex_t mutex; /*!< Mutex protecting the +struct lock_sys_t{ + ib_mutex_t mutex; /*!< Mutex protecting the locks */ hash_table_t* rec_hash; /*!< hash table of the record locks */ - mutex_t wait_mutex; /*!< Mutex protecting the + ib_mutex_t wait_mutex; /*!< Mutex protecting the next two fields */ srv_slot_t* waiting_threads; /*!< Array of user threads suspended while waiting for @@ -901,6 +914,16 @@ struct lock_sys_struct{ recovered transactions is complete. 
Protected by lock_sys->mutex */ + + ulint n_lock_max_wait_time; /*!< Max wait time */ + + os_event_t timeout_event; /*!< Set to the event that is + created in the lock wait monitor + thread. A value of 0 means the + thread is not active */ + + bool timeout_thread_active; /*!< True if the timeout thread + is running */ }; /** The lock system */ @@ -935,14 +958,6 @@ extern lock_sys_t* lock_sys; mutex_exit(&lock_sys->wait_mutex); \ } while (0) -// FIXME: Move these to lock_sys_t -extern ibool srv_lock_timeout_active; -extern ulint srv_n_lock_wait_count; -extern ulint srv_n_lock_wait_current_count; -extern ib_int64_t srv_n_lock_wait_time; -extern ulint srv_n_lock_max_wait_time; -extern os_event_t srv_lock_timeout_thread_event; - #ifndef UNIV_NONINL #include "lock0lock.ic" #endif diff --git a/storage/innobase/include/lock0priv.h b/storage/innobase/include/lock0priv.h index d516289e1f2..9f7ab9f76b6 100644 --- a/storage/innobase/include/lock0priv.h +++ b/storage/innobase/include/lock0priv.h @@ -40,9 +40,7 @@ those functions in lock/ */ #include "ut0lst.h" /** A table lock */ -typedef struct lock_table_struct lock_table_t; -/** A table lock */ -struct lock_table_struct { +struct lock_table_t { dict_table_t* table; /*!< database table in dictionary cache */ UT_LIST_NODE_T(lock_t) @@ -51,9 +49,7 @@ struct lock_table_struct { }; /** Record lock for a page */ -typedef struct lock_rec_struct lock_rec_t; -/** Record lock for a page */ -struct lock_rec_struct { +struct lock_rec_t { ulint space; /*!< space id */ ulint page_no; /*!< page number */ ulint n_bits; /*!< number of bits in the lock @@ -63,7 +59,7 @@ struct lock_rec_struct { }; /** Lock struct; protected by lock_sys->mutex */ -struct lock_struct { +struct lock_t { trx_t* trx; /*!< transaction owning the lock */ UT_LIST_NODE_T(lock_t) diff --git a/storage/innobase/include/lock0types.h b/storage/innobase/include/lock0types.h index 16e6b2e0113..cf32e72f864 100644 --- a/storage/innobase/include/lock0types.h +++ 
b/storage/innobase/include/lock0types.h @@ -27,8 +27,8 @@ Created 5/7/1996 Heikki Tuuri #define lock0types_h #define lock_t ib_lock_t -typedef struct lock_struct lock_t; -typedef struct lock_sys_struct lock_sys_t; +struct lock_t; +struct lock_sys_t; /* Basic lock modes */ enum lock_mode { diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index 5d72c7a96da..dd5e37012b7 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2011, Oracle and/or its affiliates. All rights reserved. +Copyright (c) 1995, 2012, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2009, Google Inc. Portions of this file contain modifications contributed and copyrighted by @@ -48,9 +48,9 @@ typedef ib_uint64_t lsn_t; #define LSN_PF UINT64PF /** Redo log buffer */ -typedef struct log_struct log_t; +struct log_t; /** Redo log group */ -typedef struct log_group_struct log_group_t; +struct log_group_t; #ifdef UNIV_DEBUG /** Flag: write to log file? */ @@ -67,7 +67,7 @@ extern ibool log_debug_writes; #define LOG_WAIT_ONE_GROUP 92 #define LOG_WAIT_ALL_GROUPS 93 /* @} */ -/** Maximum number of log groups in log_group_struct::checkpoint_buf */ +/** Maximum number of log groups in log_group_t::checkpoint_buf */ #define LOG_MAX_N_GROUPS 32 /*******************************************************************//** @@ -161,6 +161,14 @@ UNIV_INLINE lsn_t log_get_capacity(void); /*==================*/ +/**************************************************************** +Get log_sys::max_modified_age_async. It is OK to read the value without +holding log_sys::mutex because it is constant. +@return max_modified_age_async */ +UNIV_INLINE +lsn_t +log_get_max_modified_age_async(void); +/*================================*/ /******************************************************//** Initializes the log. 
*/ UNIV_INTERN @@ -223,15 +231,6 @@ void log_buffer_sync_in_background( /*==========================*/ ibool flush); /*<! in: flush the logs to disk */ -/****************************************************************//** -Checks if an asynchronous flushing of dirty pages is required in the -background. This function is only called from the page cleaner thread. -@return lsn to which the flushing should happen or LSN_MAX -if flushing is not required */ -UNIV_INTERN -lsn_t -log_async_flush_lsn(void); -/*=====================*/ /******************************************************//** Makes a checkpoint. Note that this function does not flush dirty blocks from the buffer pool: it only checks what is lsn of the oldest @@ -550,13 +549,19 @@ UNIV_INTERN void log_refresh_stats(void); /*===================*/ -/********************************************************** +/********************************************************//** +Closes all log groups. */ +UNIV_INTERN +void +log_group_close_all(void); +/*=====================*/ +/********************************************************//** Shutdown the log system but do not release all the memory. */ UNIV_INTERN void log_shutdown(void); /*==============*/ -/********************************************************** +/********************************************************//** Free the log system data structures. */ UNIV_INTERN void @@ -712,7 +717,7 @@ extern log_t* log_sys; /** Log group consists of a number of log files, each of the same size; a log group is implemented as a space in the sense of the module fil0fil. 
*/ -struct log_group_struct{ +struct log_group_t{ /* The following fields are protected by log_sys->mutex */ ulint id; /*!< log group id */ ulint n_files; /*!< number of files in the group */ @@ -764,7 +769,7 @@ struct log_group_struct{ }; /** Redo log buffer */ -struct log_struct{ +struct log_t{ byte pad[64]; /*!< padding to prevent other memory update hotspots from residing on the same memory cache line */ @@ -772,9 +777,9 @@ struct log_struct{ ulint buf_free; /*!< first free offset within the log buffer */ #ifndef UNIV_HOTBACKUP - mutex_t mutex; /*!< mutex protecting the log */ + ib_mutex_t mutex; /*!< mutex protecting the log */ - mutex_t log_flush_order_mutex;/*!< mutex to serialize access to + ib_mutex_t log_flush_order_mutex;/*!< mutex to serialize access to the flush list when we are putting dirty blocks in the list. The idea behind this mutex is to be able diff --git a/storage/innobase/include/log0log.ic b/storage/innobase/include/log0log.ic index ad7b7e790a2..67792395ac9 100644 --- a/storage/innobase/include/log0log.ic +++ b/storage/innobase/include/log0log.ic @@ -446,6 +446,18 @@ log_get_capacity(void) return(log_sys->log_group_capacity); } +/**************************************************************** +Get log_sys::max_modified_age_async. It is OK to read the value without +holding log_sys::mutex because it is constant. +@return max_modified_age_async */ +UNIV_INLINE +lsn_t +log_get_max_modified_age_async(void) +/*================================*/ +{ + return(log_sys->max_modified_age_async); +} + /***********************************************************************//** Checks if there is need for a log buffer flush or a new checkpoint, and does this if yes. 
Any database operation should call this when it has modified diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h index 218298a1698..dcdd4bdd8aa 100644 --- a/storage/innobase/include/log0recv.h +++ b/storage/innobase/include/log0recv.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1997, 2010, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -128,7 +128,7 @@ recv_recovery_from_checkpoint_finish should be called later to complete the recovery and free the resources used in it. @return error code or DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t recv_recovery_from_checkpoint_start_func( /*=====================================*/ #ifdef UNIV_LOG_ARCHIVE @@ -212,18 +212,18 @@ UNIV_INTERN void recv_reset_logs( /*============*/ - lsn_t lsn, /*!< in: reset to this lsn - rounded up to be divisible by - OS_FILE_LOG_BLOCK_SIZE, after - which we add - LOG_BLOCK_HDR_SIZE */ #ifdef UNIV_LOG_ARCHIVE ulint arch_log_no, /*!< in: next archived log file number */ -#endif /* UNIV_LOG_ARCHIVE */ - ibool new_logs_created);/*!< in: TRUE if resetting logs + ibool new_logs_created,/*!< in: TRUE if resetting logs is done at the log creation; FALSE if it is done after archive recovery */ +#endif /* UNIV_LOG_ARCHIVE */ + lsn_t lsn); /*!< in: reset to this lsn + rounded up to be divisible by + OS_FILE_LOG_BLOCK_SIZE, after + which we add + LOG_BLOCK_HDR_SIZE */ #ifdef UNIV_HOTBACKUP /******************************************************//** Creates new log files after a backup has been restored. 
*/ @@ -318,9 +318,7 @@ recv_recovery_from_archive_finish(void); #endif /* UNIV_LOG_ARCHIVE */ /** Block of log record data */ -typedef struct recv_data_struct recv_data_t; -/** Block of log record data */ -struct recv_data_struct{ +struct recv_data_t{ recv_data_t* next; /*!< pointer to the next block or NULL */ /*!< the log record data is stored physically immediately after this struct, max amount @@ -328,9 +326,7 @@ struct recv_data_struct{ }; /** Stored log record struct */ -typedef struct recv_struct recv_t; -/** Stored log record struct */ -struct recv_struct{ +struct recv_t{ byte type; /*!< log record type */ ulint len; /*!< log record body length in bytes */ recv_data_t* data; /*!< chain of blocks containing the log record @@ -347,7 +343,7 @@ struct recv_struct{ rec_list;/*!< list of log records for this page */ }; -/** States of recv_addr_struct */ +/** States of recv_addr_t */ enum recv_addr_state { /** not yet processed */ RECV_NOT_PROCESSED, @@ -361,9 +357,7 @@ enum recv_addr_state { }; /** Hashed page file address struct */ -typedef struct recv_addr_struct recv_addr_t; -/** Hashed page file address struct */ -struct recv_addr_struct{ +struct recv_addr_t{ enum recv_addr_state state; /*!< recovery state of the page */ unsigned space:32;/*!< space id */ @@ -374,13 +368,14 @@ struct recv_addr_struct{ }; /** Recovery system data structure */ -typedef struct recv_sys_struct recv_sys_t; -/** Recovery system data structure */ -struct recv_sys_struct{ +struct recv_sys_t{ #ifndef UNIV_HOTBACKUP - mutex_t mutex; /*!< mutex protecting the fields apply_log_recs, + ib_mutex_t mutex; /*!< mutex protecting the fields apply_log_recs, n_addrs, and the state field in each recv_addr struct */ + ib_mutex_t writer_mutex;/*!< mutex coordinating + flushing between recv_writer_thread and + the recovery thread. 
*/ #endif /* !UNIV_HOTBACKUP */ ibool apply_log_recs; /*!< this is TRUE when log rec application to diff --git a/storage/innobase/include/mach0data.h b/storage/innobase/include/mach0data.h index 3066070ef39..d0087f56aaa 100644 --- a/storage/innobase/include/mach0data.h +++ b/storage/innobase/include/mach0data.h @@ -374,6 +374,40 @@ mach_read_int_type( const byte* src, /*!< in: where to read from */ ulint len, /*!< in: length of src */ ibool unsigned_type); /*!< in: signed or unsigned flag */ +/***********************************************************//** +Convert integral type from host byte order to (big-endian) storage +byte order. */ +UNIV_INLINE +void +mach_write_int_type( +/*================*/ + byte* dest, /*!< in: where to write*/ + const byte* src, /*!< in: where to read from */ + ulint len, /*!< in: length of src */ + bool usign); /*!< in: signed or unsigned flag */ + +/************************************************************* +Convert a ulonglong integer from host byte order to (big-endian) +storage byte order. */ +UNIV_INLINE +void +mach_write_ulonglong( +/*=================*/ + byte* dest, /*!< in: where to write */ + ulonglong src, /*!< in: where to read from */ + ulint len, /*!< in: length of dest */ + bool usign); /*!< in: signed or unsigned flag */ + +/********************************************************//** +Reads 1 - 4 bytes from a file page buffered in the buffer pool. 
+@return value read */ +UNIV_INLINE +ulint +mach_read_ulint( +/*============*/ + const byte* ptr, /*!< in: pointer from where to read */ + ulint type); /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */ + #endif /* !UNIV_HOTBACKUP */ #endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/mach0data.ic b/storage/innobase/include/mach0data.ic index ec1a28bca47..fffef87f09d 100644 --- a/storage/innobase/include/mach0data.ic +++ b/storage/innobase/include/mach0data.ic @@ -776,5 +776,104 @@ mach_read_int_type( return(ret); } +/*********************************************************//** +Swap byte ordering. */ +UNIV_INLINE +void +mach_swap_byte_order( +/*=================*/ + byte* dest, /*!< out: where to write */ + const byte* from, /*!< in: where to read from */ + ulint len) /*!< in: length of src */ +{ + ut_ad(len > 0); + ut_ad(len <= 8); + + dest += len; + + switch (len & 0x7) { + case 0: *--dest = *from++; + case 7: *--dest = *from++; + case 6: *--dest = *from++; + case 5: *--dest = *from++; + case 4: *--dest = *from++; + case 3: *--dest = *from++; + case 2: *--dest = *from++; + case 1: *--dest = *from; + } +} + +/************************************************************* +Convert integral type from host byte order (big-endian) storage +byte order. */ +UNIV_INLINE +void +mach_write_int_type( +/*================*/ + byte* dest, /*!< in: where to write */ + const byte* src, /*!< in: where to read from */ + ulint len, /*!< in: length of src */ + bool usign) /*!< in: signed or unsigned flag */ +{ +#ifdef WORDS_BIGENDIAN + memcpy(dest, src, len); +#else + mach_swap_byte_order(dest, src, len); +#endif /* WORDS_BIGENDIAN */ + + if (!usign) { + *dest ^= 0x80; + } +} + +/************************************************************* +Convert a ulonglong integer from host byte order to (big-endian) +storage byte order. 
*/ +UNIV_INLINE +void +mach_write_ulonglong( +/*=================*/ + byte* dest, /*!< in: where to write */ + ulonglong src, /*!< in: where to read from */ + ulint len, /*!< in: length of dest */ + bool usign) /*!< in: signed or unsigned flag */ +{ + byte* ptr = reinterpret_cast<byte*>(&src); + + ut_ad(len <= sizeof(ulonglong)); + +#ifdef WORDS_BIGENDIAN + memcpy(dest, ptr + (sizeof(src) - len), len); +#else + mach_swap_byte_order(dest, reinterpret_cast<byte*>(ptr), len); +#endif /* WORDS_BIGENDIAN */ + + if (!usign) { + *dest ^= 0x80; + } +} + +/********************************************************//** +Reads 1 - 4 bytes from a file page buffered in the buffer pool. +@return value read */ +UNIV_INLINE +ulint +mach_read_ulint( +/*============*/ + const byte* ptr, /*!< in: pointer from where to read */ + ulint type) /*!< in: 1,2 or 4 bytes */ +{ + switch (type) { + case 1: + return(mach_read_from_1(ptr)); + case 2: + return(mach_read_from_2(ptr)); + case 4: + return(mach_read_from_4(ptr)); + default: + ut_error; + } +} + #endif /* !UNIV_HOTBACKUP */ #endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/mem0dbg.h b/storage/innobase/include/mem0dbg.h index 9f95e84c81e..cc339b82910 100644 --- a/storage/innobase/include/mem0dbg.h +++ b/storage/innobase/include/mem0dbg.h @@ -32,7 +32,7 @@ check fields whose sizes are given below */ /* The mutex which protects in the debug version the hash table containing the list of live memory heaps, and also the global variables in mem0dbg.cc. 
*/ -extern mutex_t mem_hash_mutex; +extern ib_mutex_t mem_hash_mutex; # endif /* !UNIV_HOTBACKUP */ #define MEM_FIELD_HEADER_SIZE ut_calc_align(2 * sizeof(ulint),\ diff --git a/storage/innobase/include/mem0mem.h b/storage/innobase/include/mem0mem.h index 6851a5bc01b..c36ef06b554 100644 --- a/storage/innobase/include/mem0mem.h +++ b/storage/innobase/include/mem0mem.h @@ -38,15 +38,12 @@ Created 6/9/1994 Heikki Tuuri /* -------------------- MEMORY HEAPS ----------------------------- */ -/* The info structure stored at the beginning of a heap block */ -typedef struct mem_block_info_struct mem_block_info_t; - /* A block of a memory heap consists of the info structure followed by an area of memory */ -typedef mem_block_info_t mem_block_t; +typedef struct mem_block_info_t mem_block_t; /* A memory heap is a nonempty linear list of memory blocks */ -typedef mem_block_t mem_heap_t; +typedef mem_block_t mem_heap_t; /* Types of allocation for memory heaps: DYNAMIC means allocation from the dynamic memory pool of the C compiler, BUFFER means allocation from the @@ -343,9 +340,8 @@ mem_validate_all_blocks(void); /*#######################################################################*/ -/* The info header of a block in a memory heap */ - -struct mem_block_info_struct { +/** The info structure stored at the beginning of a heap block */ +struct mem_block_info_t { ulint magic_n;/* magic number for debugging */ char file_name[8];/* file name where the mem heap was created */ ulint line; /*!< line number where the mem heap was created */ diff --git a/storage/innobase/include/mem0mem.ic b/storage/innobase/include/mem0mem.ic index eee3806dd52..7f0e128cc40 100644 --- a/storage/innobase/include/mem0mem.ic +++ b/storage/innobase/include/mem0mem.ic @@ -247,16 +247,13 @@ mem_heap_free_heap_top( { mem_block_t* block; mem_block_t* prev_block; -#ifdef UNIV_MEM_DEBUG +#if defined UNIV_MEM_DEBUG || defined UNIV_DEBUG ibool error; ulint total_size; ulint size; -#endif 
ut_ad(mem_heap_check(heap)); -#ifdef UNIV_MEM_DEBUG - /* Validate the heap and get its total allocated size */ mem_heap_validate_or_print(heap, NULL, FALSE, &error, &total_size, NULL, NULL); @@ -294,9 +291,9 @@ mem_heap_free_heap_top( /* Set the free field of block */ mem_block_set_free(block, old_top - (byte*) block); -#ifdef UNIV_MEM_DEBUG ut_ad(mem_block_get_start(block) <= mem_block_get_free(block)); - + UNIV_MEM_ASSERT_W(old_top, (byte*) block + block->len - old_top); +#if defined UNIV_MEM_DEBUG /* In the debug version erase block from top up */ mem_erase_buf(old_top, (byte*) block + block->len - old_top); @@ -304,8 +301,6 @@ mem_heap_free_heap_top( mutex_enter(&mem_hash_mutex); mem_current_allocated_memory -= (total_size - size); mutex_exit(&mem_hash_mutex); -#else /* UNIV_MEM_DEBUG */ - UNIV_MEM_ASSERT_W(old_top, (byte*) block + block->len - old_top); #endif /* UNIV_MEM_DEBUG */ UNIV_MEM_ALLOC(old_top, (byte*) block + block->len - old_top); diff --git a/storage/innobase/include/mem0pool.h b/storage/innobase/include/mem0pool.h index 451055e857f..a65ba50fdf9 100644 --- a/storage/innobase/include/mem0pool.h +++ b/storage/innobase/include/mem0pool.h @@ -30,17 +30,14 @@ Created 6/9/1994 Heikki Tuuri #include "os0file.h" #include "ut0lst.h" -/** Memory area header */ -typedef struct mem_area_struct mem_area_t; /** Memory pool */ -typedef struct mem_pool_struct mem_pool_t; +struct mem_pool_t; /** The common memory pool */ extern mem_pool_t* mem_comm_pool; /** Memory area header */ - -struct mem_area_struct{ +struct mem_area_t{ ulint size_and_free; /*!< memory area size is obtained by anding with ~MEM_AREA_FREE; area in a free list if ANDing with @@ -50,7 +47,7 @@ struct mem_area_struct{ }; /** Each memory area takes this many extra bytes for control information */ -#define MEM_AREA_EXTRA_SIZE (ut_calc_align(sizeof(struct mem_area_struct),\ +#define MEM_AREA_EXTRA_SIZE (ut_calc_align(sizeof(struct mem_area_t),\ UNIV_MEM_ALIGNMENT)) 
/********************************************************************//** diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h index 1427a981bef..18a345d050f 100644 --- a/storage/innobase/include/mtr0log.h +++ b/storage/innobase/include/mtr0log.h @@ -32,8 +32,8 @@ Created 12/7/1995 Heikki Tuuri #ifndef UNIV_HOTBACKUP /********************************************************//** -Writes 1 - 4 bytes to a file page buffered in the buffer pool. -Writes the corresponding log record to the mini-transaction log. */ +Writes 1, 2 or 4 bytes to a file page. Writes the corresponding log +record to the mini-transaction log if mtr is not NULL. */ UNIV_INTERN void mlog_write_ulint( @@ -43,8 +43,8 @@ mlog_write_ulint( byte type, /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */ mtr_t* mtr); /*!< in: mini-transaction handle */ /********************************************************//** -Writes 8 bytes to a file page buffered in the buffer pool. -Writes the corresponding log record to the mini-transaction log. */ +Writes 8 bytes to a file page. 
Writes the corresponding log +record to the mini-transaction log, only if mtr is not NULL */ UNIV_INTERN void mlog_write_ull( @@ -217,12 +217,13 @@ UNIV_INTERN byte* mlog_open_and_write_index( /*======================*/ - mtr_t* mtr, /*!< in: mtr */ - const byte* rec, /*!< in: index record or page */ - dict_index_t* index, /*!< in: record descriptor */ - byte type, /*!< in: log item type */ - ulint size); /*!< in: requested buffer size in bytes - (if 0, calls mlog_close() and returns NULL) */ + mtr_t* mtr, /*!< in: mtr */ + const byte* rec, /*!< in: index record or page */ + const dict_index_t* index, /*!< in: record descriptor */ + byte type, /*!< in: log item type */ + ulint size); /*!< in: requested buffer size in bytes + (if 0, calls mlog_close() and + returns NULL) */ #endif /* !UNIV_HOTBACKUP */ /********************************************************//** diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index fd84f1119cc..f8c1874412c 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -39,6 +40,7 @@ Created 11/26/1995 Heikki Tuuri #define MTR_LOG_ALL 21 /* default mode: log all operations modifying disk-based data */ #define MTR_LOG_NONE 22 /* log no operations */ +#define MTR_LOG_NO_REDO 23 /* Don't generate REDO */ /*#define MTR_LOG_SPACE 23 */ /* log only operations modifying file space page allocation data (operations in fsp0fsp.* ) */ @@ -180,7 +182,11 @@ For 1 - 8 bytes, the flag value must give the length also! 
@{ */ #define MLOG_ZIP_WRITE_HEADER ((byte)50) /*!< write to compressed page header */ #define MLOG_ZIP_PAGE_COMPRESS ((byte)51) /*!< compress an index page */ -#define MLOG_BIGGEST_TYPE ((byte)51) /*!< biggest value (used in +#define MLOG_ZIP_PAGE_COMPRESS_NO_DATA ((byte)52)/*!< compress an index page + without logging it's image */ +#define MLOG_ZIP_PAGE_REORGANIZE ((byte)53) /*!< reorganize a compressed + page */ +#define MLOG_BIGGEST_TYPE ((byte)53) /*!< biggest value (used in assertions) */ /* @} */ @@ -358,15 +364,14 @@ mtr_memo_push( void* object, /*!< in: object */ ulint type); /*!< in: object type: MTR_MEMO_S_LOCK, ... */ -/* Type definition of a mini-transaction memo stack slot. */ -typedef struct mtr_memo_slot_struct mtr_memo_slot_t; -struct mtr_memo_slot_struct{ +/** Mini-transaction memo stack slot. */ +struct mtr_memo_slot_t{ ulint type; /*!< type of the stored object (MTR_MEMO_S_LOCK, ...) */ void* object; /*!< pointer to the object */ }; /* Mini-transaction handle and buffer */ -struct mtr_struct{ +struct mtr_t{ #ifdef UNIV_DEBUG ulint state; /*!< MTR_ACTIVE, MTR_COMMITTING, MTR_COMMITTED */ #endif diff --git a/storage/innobase/include/mtr0mtr.ic b/storage/innobase/include/mtr0mtr.ic index dcd9826b380..bb24734c9bb 100644 --- a/storage/innobase/include/mtr0mtr.ic +++ b/storage/innobase/include/mtr0mtr.ic @@ -269,7 +269,7 @@ mtr_s_lock_func( ut_ad(mtr); ut_ad(lock); - rw_lock_s_lock_func(lock, 0, file, line); + rw_lock_s_lock_inline(lock, 0, file, line); mtr_memo_push(mtr, lock, MTR_MEMO_S_LOCK); } @@ -288,7 +288,7 @@ mtr_x_lock_func( ut_ad(mtr); ut_ad(lock); - rw_lock_x_lock_func(lock, 0, file, line); + rw_lock_x_lock_inline(lock, 0, file, line); mtr_memo_push(mtr, lock, MTR_MEMO_X_LOCK); } diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h index 7a2bcefadb9..43368c0b726 100644 --- a/storage/innobase/include/mtr0types.h +++ b/storage/innobase/include/mtr0types.h @@ -26,6 +26,6 @@ Created 11/26/1995 Heikki Tuuri 
#ifndef mtr0types_h #define mtr0types_h -typedef struct mtr_struct mtr_t; +struct mtr_t; #endif diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index 8f84193cb0f..4a744c1b268 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -1,6 +1,6 @@ /*********************************************************************** -Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Percona Inc. Portions of this file contain modifications contributed and copyrighted @@ -44,7 +44,7 @@ Created 10/21/1995 Heikki Tuuri #endif /** File node of a tablespace or the log data space */ -typedef struct fil_node_struct fil_node_t; +struct fil_node_t; extern ibool os_has_said_disk_full; /** Flag: enable debug printout for asynchronous i/o */ @@ -102,7 +102,7 @@ log. */ #define OS_FILE_LOG_BLOCK_SIZE 512 /** Options for os_file_create_func @{ */ -typedef enum os_file_create_enum { +enum os_file_create_t { OS_FILE_OPEN = 51, /*!< to open an existing file (if doesn't exist, error) */ OS_FILE_CREATE, /*!< to create new file (if @@ -122,7 +122,7 @@ typedef enum os_file_create_enum { the log unless it is a fatal error, this flag is only used if ON_ERROR_NO_EXIT is set */ -} os_file_create_t; +}; #define OS_FILE_READ_ONLY 333 #define OS_FILE_READ_WRITE 444 @@ -217,10 +217,10 @@ used to register actual file read, write and flush */ # define register_pfs_file_open_begin(state, locker, key, op, name, \ src_file, src_line) \ do { \ - locker = PSI_CALL(get_thread_file_name_locker)( \ + locker = PSI_FILE_CALL(get_thread_file_name_locker)( \ state, key, op, name, &locker); \ if (UNIV_LIKELY(locker != NULL)) { \ - PSI_CALL(start_file_open_wait)( \ + PSI_FILE_CALL(start_file_open_wait)( \ locker, src_file, src_line); \ } \ } while (0) @@ -228,7 +228,7 @@ do { \ # define register_pfs_file_open_end(locker, file) \ do { 
\ if (UNIV_LIKELY(locker != NULL)) { \ - PSI_CALL(end_file_open_wait_and_bind_to_descriptor)( \ + PSI_FILE_CALL(end_file_open_wait_and_bind_to_descriptor)(\ locker, file); \ } \ } while (0) @@ -236,10 +236,10 @@ do { \ # define register_pfs_file_io_begin(state, locker, file, count, op, \ src_file, src_line) \ do { \ - locker = PSI_CALL(get_thread_file_descriptor_locker)( \ + locker = PSI_FILE_CALL(get_thread_file_descriptor_locker)( \ state, file, op); \ if (UNIV_LIKELY(locker != NULL)) { \ - PSI_CALL(start_file_wait)( \ + PSI_FILE_CALL(start_file_wait)( \ locker, count, src_file, src_line); \ } \ } while (0) @@ -247,7 +247,7 @@ do { \ # define register_pfs_file_io_end(locker, count) \ do { \ if (UNIV_LIKELY(locker != NULL)) { \ - PSI_CALL(end_file_wait)(locker, count); \ + PSI_FILE_CALL(end_file_wait)(locker, count); \ } \ } while (0) #endif /* UNIV_PFS_IO */ @@ -345,13 +345,12 @@ to original un-instrumented file I/O APIs */ /* File types for directory entry data type */ -enum os_file_type_enum{ +enum os_file_type_t { OS_FILE_TYPE_UNKNOWN = 0, OS_FILE_TYPE_FILE, /* regular file */ OS_FILE_TYPE_DIR, /* directory */ OS_FILE_TYPE_LINK /* symbolic link */ }; -typedef enum os_file_type_enum os_file_type_t; /* Maximum path string length in bytes when referring to tables with in the './databasename/tablename.ibd' path format; we can allocate at least 2 buffers @@ -359,16 +358,18 @@ of this size from the thread stack; that is why this should not be made much bigger than 4000 bytes */ #define OS_FILE_MAX_PATH 4000 -/* Struct used in fetching information of a file in a directory */ -struct os_file_stat_struct{ +/** Struct used in fetching information of a file in a directory */ +struct os_file_stat_t { char name[OS_FILE_MAX_PATH]; /*!< path to a file */ os_file_type_t type; /*!< file type */ ib_int64_t size; /*!< file size */ time_t ctime; /*!< creation time */ time_t mtime; /*!< modification time */ time_t atime; /*!< access time */ + bool rw_perm; /*!< true if can be 
opened + in read-write mode. Only valid + if type == OS_FILE_TYPE_FILE */ }; -typedef struct os_file_stat_struct os_file_stat_t; #ifdef __WIN__ typedef HANDLE os_file_dir_t; /*!< directory stream */ @@ -525,7 +526,7 @@ os_file_create_func( Deletes a file. The file has to be closed before calling this. @return TRUE if success */ UNIV_INTERN -ibool +bool os_file_delete( /*===========*/ const char* name); /*!< in: file path as a null-terminated @@ -535,7 +536,7 @@ os_file_delete( Deletes a file if it exists. The file has to be closed before calling this. @return TRUE if success */ UNIV_INTERN -ibool +bool os_file_delete_if_exists( /*=====================*/ const char* name); /*!< in: file path as a null-terminated @@ -826,7 +827,7 @@ UNIV_INTERN ulint os_file_get_last_error( /*===================*/ - ibool report_all_errors); /*!< in: TRUE if we want an error message + bool report_all_errors); /*!< in: TRUE if we want an error message printed of all errors */ /*******************************************************************//** NOTE! Use the corresponding macro os_file_read(), not directly this function! @@ -925,6 +926,60 @@ os_file_dirname( /*============*/ const char* path); /*!< in: pathname */ /****************************************************************//** +This function returns a new path name after replacing the basename +in an old path with a new basename. The old_path is a full path +name including the extension. The tablename is in the normal +form "databasename/tablename". The new base name is found after +the forward slash. Both input strings are null terminated. + +This function allocates memory to be returned. It is the callers +responsibility to free the return value after it is no longer needed. 
+ +@return own: new full pathname */ +UNIV_INTERN +char* +os_file_make_new_pathname( +/*======================*/ + const char* old_path, /*!< in: pathname */ + const char* new_name); /*!< in: new file name */ +/****************************************************************//** +This function returns a remote path name by combining a data directory +path provided in a DATA DIRECTORY clause with the tablename which is +in the form 'database/tablename'. It strips the file basename (which +is the tablename) found after the last directory in the path provided. +The full filepath created will include the database name as a directory +under the path provided. The filename is the tablename with the '.ibd' +extension. All input and output strings are null-terminated. + +This function allocates memory to be returned. It is the callers +responsibility to free the return value after it is no longer needed. + +@return own: A full pathname; data_dir_path/databasename/tablename.ibd */ +UNIV_INTERN +char* +os_file_make_remote_pathname( +/*=========================*/ + const char* data_dir_path, /*!< in: pathname */ + const char* tablename, /*!< in: tablename */ + const char* extention); /*!< in: file extention; ibd,cfg*/ +/****************************************************************//** +This function reduces a null-terminated full remote path name into +the path that is sent by MySQL for DATA DIRECTORY clause. It replaces +the 'databasename/tablename.ibd' found at the end of the path with just +'tablename'. + +Since the result is always smaller than the path sent in, no new memory +is allocated. The caller should allocate memory for the path sent in. +This function manipulates that path in place. + +If the path format is not as expected, just return. The result is used +to inform a SHOW CREATE TABLE command. 
*/ +UNIV_INTERN +void +os_file_make_data_dir_path( +/*========================*/ + char* data_dir_path); /*!< in/out: full path/data_dir_path */ +/****************************************************************//** Creates all missing subdirectories along the given path. @return TRUE if call succeeded FALSE otherwise */ UNIV_INTERN @@ -1108,14 +1163,16 @@ os_aio_all_slots_free(void); /*******************************************************************//** This function returns information about the specified file -@return TRUE if stat information found */ +@return DB_SUCCESS if all OK */ UNIV_INTERN -ibool +dberr_t os_file_get_status( /*===============*/ - const char* path, /*!< in: pathname of the file */ - os_file_stat_t* stat_info); /*!< information of a file in a + const char* path, /*!< in: pathname of the file */ + os_file_stat_t* stat_info, /*!< information of a file in a directory */ + bool check_rw_perm); /*!< in: for testing whether the + file can be opened in RW mode */ #if !defined(UNIV_HOTBACKUP) /*********************************************************************//** diff --git a/storage/innobase/include/os0sync.h b/storage/innobase/include/os0sync.h index d68823b72ca..d3ce68253ec 100644 --- a/storage/innobase/include/os0sync.h +++ b/storage/innobase/include/os0sync.h @@ -54,22 +54,19 @@ typedef pthread_cond_t os_cond_t; /** Structure that includes Performance Schema Probe pfs_psi in the os_fast_mutex structure if UNIV_PFS_MUTEX is defined */ -typedef struct os_fast_mutex_struct { +struct os_fast_mutex_t { fast_mutex_t mutex; /*!< os_fast_mutex */ #ifdef UNIV_PFS_MUTEX struct PSI_mutex* pfs_psi;/*!< The performance schema instrumentation hook */ #endif -} os_fast_mutex_t; - +}; -/** Operating system event */ -typedef struct os_event_struct os_event_struct_t; /** Operating system event handle */ -typedef os_event_struct_t* os_event_t; +typedef struct os_event* os_event_t; /** An asynchronous signal sent between threads */ -struct os_event_struct { 
+struct os_event { #ifdef __WIN__ HANDLE handle; /*!< kernel event object, slow, used on older Windows */ @@ -84,7 +81,7 @@ struct os_event_struct { the event becomes signaled */ os_cond_t cond_var; /*!< condition variable is used in waiting for the event */ - UT_LIST_NODE_T(os_event_struct_t) os_event_list; + UT_LIST_NODE_T(os_event_t) os_event_list; /*!< list of all created events */ }; @@ -94,16 +91,11 @@ struct os_event_struct { /** Return value of os_event_wait_time() when the time is exceeded */ #define OS_SYNC_TIME_EXCEEDED 1 -/** Operating system mutex */ -typedef struct os_mutex_struct os_mutex_str_t; /** Operating system mutex handle */ -typedef os_mutex_str_t* os_mutex_t; - -/** Return value of os_event_wait_time() when the time is exceeded */ -#define OS_SYNC_TIME_EXCEEDED 1 +typedef struct os_mutex_t* os_ib_mutex_t; /** Mutex protecting counts and the event and OS 'slow' mutex lists */ -extern os_mutex_t os_sync_mutex; +extern os_ib_mutex_t os_sync_mutex; /** This is incremented by 1 in os_thread_create and decremented by 1 in os_thread_exit */ @@ -132,10 +124,8 @@ explicitly by calling sync_os_reset_event. @return the event handle */ UNIV_INTERN os_event_t -os_event_create( -/*============*/ - const char* name); /*!< in: the name of the event, if NULL - the event is created without a name */ +os_event_create(void); +/*==================*/ /**********************************************************//** Sets an event semaphore to the signaled state: lets waiting threads proceed. */ @@ -191,7 +181,7 @@ os_event_wait_low( os_event_reset(). */ #define os_event_wait(event) os_event_wait_low(event, 0) -#define os_event_wait_time(e, t) os_event_wait_time_low(event, t, 0) +#define os_event_wait_time(event, t) os_event_wait_time_low(event, t, 0) /**********************************************************//** Waits for an event object until it is in the signaled state or @@ -210,10 +200,10 @@ os_event_wait_time_low( os_event_reset(). 
*/ /*********************************************************//** Creates an operating system mutex semaphore. Because these are slow, the -mutex semaphore of InnoDB itself (mutex_t) should be used where possible. +mutex semaphore of InnoDB itself (ib_mutex_t) should be used where possible. @return the mutex handle */ UNIV_INTERN -os_mutex_t +os_ib_mutex_t os_mutex_create(void); /*=================*/ /**********************************************************//** @@ -222,21 +212,21 @@ UNIV_INTERN void os_mutex_enter( /*===========*/ - os_mutex_t mutex); /*!< in: mutex to acquire */ + os_ib_mutex_t mutex); /*!< in: mutex to acquire */ /**********************************************************//** Releases ownership of a mutex. */ UNIV_INTERN void os_mutex_exit( /*==========*/ - os_mutex_t mutex); /*!< in: mutex to release */ + os_ib_mutex_t mutex); /*!< in: mutex to release */ /**********************************************************//** Frees an mutex object. */ UNIV_INTERN void os_mutex_free( /*==========*/ - os_mutex_t mutex); /*!< in: mutex to free */ + os_ib_mutex_t mutex); /*!< in: mutex to free */ /**********************************************************//** Acquires ownership of a fast mutex. Currently in Windows this is the same as os_fast_mutex_lock! @@ -365,7 +355,11 @@ Atomic compare-and-swap and increment for InnoDB. */ #if defined(HAVE_IB_GCC_ATOMIC_BUILTINS) -#define HAVE_ATOMIC_BUILTINS +# define HAVE_ATOMIC_BUILTINS + +# ifdef HAVE_IB_GCC_ATOMIC_BUILTINS_64 +# define HAVE_ATOMIC_BUILTINS_64 +# endif /**********************************************************//** Returns true if swapped, ptr is pointer to target, old_val is value to @@ -419,6 +413,9 @@ amount to decrement. 
*/ # define os_atomic_decrement_ulint(ptr, amount) \ os_atomic_decrement(ptr, amount) +# define os_atomic_decrement_uint64(ptr, amount) \ + os_atomic_decrement(ptr, amount) + /**********************************************************//** Returns the old value of *ptr, atomically sets *ptr to new_val */ @@ -430,12 +427,13 @@ Returns the old value of *ptr, atomically sets *ptr to new_val */ #elif defined(HAVE_IB_SOLARIS_ATOMICS) -#define HAVE_ATOMIC_BUILTINS +# define HAVE_ATOMIC_BUILTINS +# define HAVE_ATOMIC_BUILTINS_64 /* If not compiling with GCC or GCC doesn't support the atomic intrinsics and running on Solaris >= 10 use Solaris atomics */ -#include <atomic.h> +# include <atomic.h> /**********************************************************//** Returns true if swapped, ptr is pointer to target, old_val is value to @@ -487,6 +485,9 @@ amount to decrement. */ # define os_atomic_decrement_ulint(ptr, amount) \ os_atomic_increment_ulint(ptr, -(amount)) +# define os_atomic_decrement_uint64(ptr, amount) \ + os_atomic_increment_uint64(ptr, -(amount)) + /**********************************************************//** Returns the old value of *ptr, atomically sets *ptr to new_val */ @@ -498,7 +499,11 @@ Returns the old value of *ptr, atomically sets *ptr to new_val */ #elif defined(HAVE_WINDOWS_ATOMICS) -#define HAVE_ATOMIC_BUILTINS +# define HAVE_ATOMIC_BUILTINS + +# ifndef _WIN32 +# define HAVE_ATOMIC_BUILTINS_64 +# endif /**********************************************************//** Atomic compare and exchange of signed integers (both 32 and 64 bit). @@ -574,8 +579,10 @@ amount of increment. 
*/ # define os_atomic_increment_ulint(ptr, amount) \ ((ulint) (win_xchg_and_add((lint*) ptr, (lint) amount) + amount)) -# define os_atomic_increment_uint64(ptr, amount) \ - ((ulint) (win_xchg_and_add(ptr, (lint) amount) + amount)) +# define os_atomic_increment_uint64(ptr, amount) \ + ((ib_uint64_t) (InterlockedExchangeAdd64( \ + (ib_int64_t*) ptr, \ + (ib_int64_t) amount) + amount)) /**********************************************************//** Returns the resulting value, ptr is pointer to target, amount is the @@ -587,6 +594,11 @@ amount to decrement. There is no atomic substract function on Windows */ # define os_atomic_decrement_ulint(ptr, amount) \ ((ulint) (win_xchg_and_add((lint*) ptr, -(lint) amount) - amount)) +# define os_atomic_decrement_uint64(ptr, amount) \ + ((ib_uint64_t) (InterlockedExchangeAdd64( \ + (ib_int64_t*) ptr, \ + -(ib_int64_t) amount) - amount)) + /**********************************************************//** Returns the old value of *ptr, atomically sets *ptr to new_val. 
InterlockedExchange() operates on LONG, and the LONG will be diff --git a/storage/innobase/include/os0sync.ic b/storage/innobase/include/os0sync.ic index 0d907b31366..33c238ceb47 100644 --- a/storage/innobase/include/os0sync.ic +++ b/storage/innobase/include/os0sync.ic @@ -66,7 +66,7 @@ pfs_os_fast_mutex_init( os_fast_mutex_t* fast_mutex) /*!< out: fast mutex */ { #ifdef HAVE_PSI_MUTEX_INTERFACE - fast_mutex->pfs_psi = PSI_CALL(init_mutex)(key, &fast_mutex->mutex); + fast_mutex->pfs_psi = PSI_MUTEX_CALL(init_mutex)(key, &fast_mutex->mutex); #else fast_mutex->pfs_psi = NULL; #endif @@ -86,7 +86,7 @@ pfs_os_fast_mutex_free( { #ifdef HAVE_PSI_MUTEX_INTERFACE if (fast_mutex->pfs_psi != NULL) - PSI_CALL(destroy_mutex)(fast_mutex->pfs_psi); + PSI_MUTEX_CALL(destroy_mutex)(fast_mutex->pfs_psi); #endif fast_mutex->pfs_psi = NULL; @@ -112,13 +112,13 @@ pfs_os_fast_mutex_lock( PSI_mutex_locker* locker; PSI_mutex_locker_state state; - locker = PSI_CALL(start_mutex_wait)(&state, fast_mutex->pfs_psi, + locker = PSI_MUTEX_CALL(start_mutex_wait)(&state, fast_mutex->pfs_psi, PSI_MUTEX_LOCK, file_name, line); os_fast_mutex_lock_func(&fast_mutex->mutex); if (locker != NULL) - PSI_CALL(end_mutex_wait)(locker, 0); + PSI_MUTEX_CALL(end_mutex_wait)(locker, 0); } else #endif @@ -141,7 +141,7 @@ pfs_os_fast_mutex_unlock( { #ifdef HAVE_PSI_MUTEX_INTERFACE if (fast_mutex->pfs_psi != NULL) - PSI_CALL(unlock_mutex)(fast_mutex->pfs_psi); + PSI_MUTEX_CALL(unlock_mutex)(fast_mutex->pfs_psi); #endif os_fast_mutex_unlock_func(&fast_mutex->mutex); diff --git a/storage/innobase/include/page0cur.h b/storage/innobase/include/page0cur.h index 52f5c5de58a..038a05edbd0 100644 --- a/storage/innobase/include/page0cur.h +++ b/storage/innobase/include/page0cur.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -170,8 +170,11 @@ page_cur_tuple_insert( page_cur_t* cursor, /*!< in/out: a page cursor */ const dtuple_t* tuple, /*!< in: pointer to a data tuple */ dict_index_t* index, /*!< in: record descriptor */ + ulint** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ ulint n_ext, /*!< in: number of externally stored columns */ - mtr_t* mtr); /*!< in: mini-transaction handle, or NULL */ + mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */ + __attribute__((nonnull(1,2,3,4,5), warn_unused_result)); #endif /* !UNIV_HOTBACKUP */ /***********************************************************//** Inserts a record next to page cursor. Returns pointer to inserted record if @@ -238,10 +241,11 @@ UNIV_INTERN void page_cur_delete_rec( /*================*/ - page_cur_t* cursor, /*!< in/out: a page cursor */ - dict_index_t* index, /*!< in: record descriptor */ - const ulint* offsets,/*!< in: rec_get_offsets(cursor->rec, index) */ - mtr_t* mtr); /*!< in: mini-transaction handle */ + page_cur_t* cursor, /*!< in/out: a page cursor */ + const dict_index_t* index, /*!< in: record descriptor */ + const ulint* offsets,/*!< in: rec_get_offsets( + cursor->rec, index) */ + mtr_t* mtr); /*!< in: mini-transaction handle */ #ifndef UNIV_HOTBACKUP /****************************************************************//** Searches the right position for a page cursor. @@ -331,10 +335,24 @@ page_cur_parse_delete_rec( buf_block_t* block, /*!< in: page or NULL */ dict_index_t* index, /*!< in: record descriptor */ mtr_t* mtr); /*!< in: mtr or NULL */ +/*******************************************************//** +Removes the record from a leaf page. This function does not log +any changes. It is used by the IMPORT tablespace functions. 
+@return true if success, i.e., the page did not become too empty */ +UNIV_INTERN +bool +page_delete_rec( +/*============*/ + const dict_index_t* index, /*!< in: The index that the record + belongs to */ + page_cur_t* pcur, /*!< in/out: page cursor on record + to delete */ + page_zip_des_t* page_zip,/*!< in: compressed page descriptor */ + const ulint* offsets);/*!< in: offsets for record */ /** Index page cursor */ -struct page_cur_struct{ +struct page_cur_t{ byte* rec; /*!< pointer to a record on page */ buf_block_t* block; /*!< pointer to the block containing rec */ }; diff --git a/storage/innobase/include/page0cur.ic b/storage/innobase/include/page0cur.ic index a065f9ff30d..90a5a690487 100644 --- a/storage/innobase/include/page0cur.ic +++ b/storage/innobase/include/page0cur.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -245,33 +245,36 @@ page_cur_tuple_insert( page_cur_t* cursor, /*!< in/out: a page cursor */ const dtuple_t* tuple, /*!< in: pointer to a data tuple */ dict_index_t* index, /*!< in: record descriptor */ + ulint** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ ulint n_ext, /*!< in: number of externally stored columns */ mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */ { - mem_heap_t* heap; - ulint* offsets; ulint size = rec_get_converted_size(index, tuple, n_ext); rec_t* rec; - heap = mem_heap_create(size - + (4 + REC_OFFS_HEADER_SIZE - + dtuple_get_n_fields(tuple)) - * sizeof *offsets); - rec = rec_convert_dtuple_to_rec((byte*) mem_heap_alloc(heap, size), + if (!*heap) { + *heap = mem_heap_create(size + + (4 + REC_OFFS_HEADER_SIZE + + dtuple_get_n_fields(tuple)) + * sizeof **offsets); + } + + rec = rec_convert_dtuple_to_rec((byte*) mem_heap_alloc(*heap, size), index, tuple, n_ext); - offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); + *offsets = rec_get_offsets( + rec, index, *offsets, ULINT_UNDEFINED, heap); if (buf_block_get_page_zip(cursor->block)) { rec = page_cur_insert_rec_zip(&cursor->rec, cursor->block, - index, rec, offsets, mtr); + index, rec, *offsets, mtr); } else { rec = page_cur_insert_rec_low(cursor->rec, - index, rec, offsets, mtr); + index, rec, *offsets, mtr); } - ut_ad(!rec || !cmp_dtuple_rec(tuple, rec, offsets)); - mem_heap_free(heap); + ut_ad(!rec || !cmp_dtuple_rec(tuple, rec, *offsets)); return(rec); } #endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h index e4571b69376..773ec4c2177 100644 --- a/storage/innobase/include/page0page.h +++ b/storage/innobase/include/page0page.h @@ -551,6 +551,16 @@ page_rec_get_next_const( 
/*====================*/ const rec_t* rec); /*!< in: pointer to record */ /************************************************************//** +Gets the pointer to the next non delete-marked record on the page. +If all subsequent records are delete-marked, then this function +will return the supremum record. +@return pointer to next non delete-marked record or pointer to supremum */ +UNIV_INLINE +const rec_t* +page_rec_get_next_non_del_marked( +/*=============================*/ + const rec_t* rec); /*!< in: pointer to record */ +/************************************************************//** Sets the pointer to the next record on the page. */ UNIV_INLINE void @@ -737,11 +747,14 @@ UNIV_INLINE void page_mem_free( /*==========*/ - page_t* page, /*!< in/out: index page */ - page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ - rec_t* rec, /*!< in: pointer to the (origin of) record */ - dict_index_t* index, /*!< in: index of rec */ - const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ + page_t* page, /*!< in/out: index page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, + or NULL */ + rec_t* rec, /*!< in: pointer to the (origin of) + record */ + const dict_index_t* index, /*!< in: index of rec */ + const ulint* offsets);/*!< in: array returned by + rec_get_offsets() */ /**********************************************************//** Create an uncompressed B-tree index page. 
@return pointer to the page */ @@ -1031,7 +1044,6 @@ page_find_rec_with_heap_no( /*=======================*/ const page_t* page, /*!< in: index page */ ulint heap_no);/*!< in: heap number */ - #ifdef UNIV_MATERIALIZE #undef UNIV_INLINE #define UNIV_INLINE UNIV_INLINE_ORIGINAL diff --git a/storage/innobase/include/page0page.ic b/storage/innobase/include/page0page.ic index e73e547e92b..c2e20d81a29 100644 --- a/storage/innobase/include/page0page.ic +++ b/storage/innobase/include/page0page.ic @@ -776,6 +776,30 @@ page_rec_get_next_const( } /************************************************************//** +Gets the pointer to the next non delete-marked record on the page. +If all subsequent records are delete-marked, then this function +will return the supremum record. +@return pointer to next non delete-marked record or pointer to supremum */ +UNIV_INLINE +const rec_t* +page_rec_get_next_non_del_marked( +/*=============================*/ + const rec_t* rec) /*!< in: pointer to record */ +{ + const rec_t* r; + ulint page_is_compact = page_rec_is_comp(rec); + + for (r = page_rec_get_next_const(rec); + !page_rec_is_supremum(r) + && rec_get_deleted_flag(r, page_is_compact); + r = page_rec_get_next_const(r)) { + /* noop */ + } + + return(r); +} + +/************************************************************//** Sets the pointer to the next record on the page. 
*/ UNIV_INLINE void @@ -1085,11 +1109,14 @@ UNIV_INLINE void page_mem_free( /*==========*/ - page_t* page, /*!< in/out: index page */ - page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ - rec_t* rec, /*!< in: pointer to the (origin of) record */ - dict_index_t* index, /*!< in: index of rec */ - const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + page_t* page, /*!< in/out: index page */ + page_zip_des_t* page_zip, /*!< in/out: compressed page, + or NULL */ + rec_t* rec, /*!< in: pointer to the + (origin of) record */ + const dict_index_t* index, /*!< in: index of rec */ + const ulint* offsets) /*!< in: array returned by + rec_get_offsets() */ { rec_t* free; ulint garbage; diff --git a/storage/innobase/include/page0types.h b/storage/innobase/include/page0types.h index da2ac1c7de2..533b0d3cf98 100644 --- a/storage/innobase/include/page0types.h +++ b/storage/innobase/include/page0types.h @@ -26,6 +26,10 @@ Created 2/2/1994 Heikki Tuuri #ifndef page0types_h #define page0types_h +using namespace std; + +#include <map> + #include "univ.i" #include "dict0types.h" #include "mtr0types.h" @@ -35,12 +39,12 @@ Created 2/2/1994 Heikki Tuuri /** Type of the index page */ typedef byte page_t; /** Index page cursor */ -typedef struct page_cur_struct page_cur_t; +struct page_cur_t; /** Compressed index page */ typedef byte page_zip_t; /** Compressed page descriptor */ -typedef struct page_zip_des_struct page_zip_des_t; +struct page_zip_des_t; /* The following definitions would better belong to page0zip.h, but we cannot include page0zip.h from rem0rec.ic, because @@ -60,12 +64,14 @@ ssize, which is the number of shifts from 512. 
*/ #endif /** Compressed page descriptor */ -struct page_zip_des_struct +struct page_zip_des_t { page_zip_t* data; /*!< compressed page data */ #ifdef UNIV_DEBUG unsigned m_start:16; /*!< start offset of modification log */ + bool m_external; /*!< Allocated externally, not from the + buffer pool */ #endif /* UNIV_DEBUG */ unsigned m_end:16; /*!< end offset of modification log */ unsigned m_nonempty:1; /*!< TRUE if the modification log @@ -80,7 +86,7 @@ struct page_zip_des_struct }; /** Compression statistics for a given page size */ -struct page_zip_stat_struct { +struct page_zip_stat_t { /** Number of page compressions */ ulint compressed; /** Number of successful page compressions */ @@ -91,13 +97,29 @@ struct page_zip_stat_struct { ib_uint64_t compressed_usec; /** Duration of page decompressions in microseconds */ ib_uint64_t decompressed_usec; + page_zip_stat_t() : + /* Initialize members to 0 so that when we do + stlmap[key].compressed++ and element with "key" does not + exist it gets inserted with zeroed members. 
*/ + compressed(0), + compressed_ok(0), + decompressed(0), + compressed_usec(0), + decompressed_usec(0) + { } }; -/** Compression statistics */ -typedef struct page_zip_stat_struct page_zip_stat_t; - -/** Statistics on compression, indexed by page_zip_des_struct::ssize - 1 */ -extern page_zip_stat_t page_zip_stat[PAGE_ZIP_SSIZE_MAX]; +/** Compression statistics types */ +typedef map<index_id_t, page_zip_stat_t> page_zip_stat_per_index_t; + +/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */ +extern page_zip_stat_t page_zip_stat[PAGE_ZIP_SSIZE_MAX]; +/** Statistics on compression, indexed by dict_index_t::id */ +extern page_zip_stat_per_index_t page_zip_stat_per_index; +extern ib_mutex_t page_zip_stat_per_index_mutex; +#ifdef HAVE_PSI_INTERFACE +extern mysql_pfs_key_t page_zip_stat_per_index_mutex_key; +#endif /* HAVE_PSI_INTERFACE */ /**********************************************************************//** Write the "deleted" flag of a record on a compressed page. The flag must diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h index 2c84f75b2ab..12781bd61b8 100644 --- a/storage/innobase/include/page0zip.h +++ b/storage/innobase/include/page0zip.h @@ -1,6 +1,7 @@ /***************************************************************************** -Copyright (c) 2005, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2005, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -39,6 +40,16 @@ Created June 2005 by Marko Makela #include "trx0types.h" #include "mem0mem.h" +/* Compression level to be used by zlib. Settable by user. */ +extern ulint page_compression_level; + +/* Default compression level. 
*/ +#define DEFAULT_COMPRESSION_LEVEL 6 + +/* Whether or not to log compressed page images to avoid possible +compression algorithm changes in zlib. */ +extern bool page_log_compressed_pages; + /**********************************************************************//** Determine the size of a compressed page in bytes. @return size in bytes */ @@ -114,6 +125,7 @@ page_zip_compress( m_start, m_end, m_nonempty */ const page_t* page, /*!< in: uncompressed page */ dict_index_t* index, /*!< in: index of the B-tree node */ + ulint level, /*!< in: compression level */ mtr_t* mtr) /*!< in: mini-transaction, or NULL */ __attribute__((nonnull(1,2,3))); @@ -337,11 +349,12 @@ UNIV_INTERN void page_zip_dir_delete( /*================*/ - page_zip_des_t* page_zip,/*!< in/out: compressed page */ - byte* rec, /*!< in: deleted record */ - dict_index_t* index, /*!< in: index of rec */ - const ulint* offsets,/*!< in: rec_get_offsets(rec) */ - const byte* free) /*!< in: previous start of the free list */ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + byte* rec, /*!< in: deleted record */ + const dict_index_t* index, /*!< in: index of rec */ + const ulint* offsets, /*!< in: rec_get_offsets(rec) */ + const byte* free) /*!< in: previous start of + the free list */ __attribute__((nonnull(1,2,3,4))); /**********************************************************************//** @@ -461,14 +474,49 @@ page_zip_verify_checksum( /*=====================*/ const void* data, /*!< in: compressed page */ ulint size); /*!< in: size of compressed page */ +/**********************************************************************//** +Write a log record of compressing an index page without the data on the page. 
*/ +UNIV_INLINE +void +page_zip_compress_write_log_no_data( +/*================================*/ + ulint level, /*!< in: compression level */ + const page_t* page, /*!< in: page that is compressed */ + dict_index_t* index, /*!< in: index */ + mtr_t* mtr); /*!< in: mtr */ +/**********************************************************************//** +Parses a log record of compressing an index page without the data. +@return end of log record or NULL */ +UNIV_INLINE +byte* +page_zip_parse_compress_no_data( +/*============================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr, /*!< in: buffer end */ + page_t* page, /*!< in: uncompressed page */ + page_zip_des_t* page_zip, /*!< out: compressed page */ + dict_index_t* index) /*!< in: index */ + __attribute__((nonnull(1,2))); + +/**********************************************************************//** +Reset the counters used for filling +INFORMATION_SCHEMA.innodb_cmp_per_index. */ +UNIV_INLINE +void +page_zip_reset_stat_per_index(); +/*===========================*/ #ifndef UNIV_HOTBACKUP /** Check if a pointer to an uncompressed page matches a compressed page. +When we IMPORT a tablespace the blocks and accompanying frames are allocated +from outside the buffer pool. @param ptr pointer to an uncompressed page frame @param page_zip compressed page descriptor @return TRUE if ptr and page_zip refer to the same block */ -# define PAGE_ZIP_MATCH(ptr, page_zip) \ - (buf_frame_get_page_zip(ptr) == (page_zip)) +# define PAGE_ZIP_MATCH(ptr, page_zip) \ + (((page_zip)->m_external \ + && (page_align(ptr) + UNIV_PAGE_SIZE == (page_zip)->data)) \ + || buf_frame_get_page_zip(ptr) == (page_zip)) #else /* !UNIV_HOTBACKUP */ /** Check if a pointer to an uncompressed page matches a compressed page. 
@param ptr pointer to an uncompressed page frame diff --git a/storage/innobase/include/page0zip.ic b/storage/innobase/include/page0zip.ic index c9300aa4e9f..0062e1cb39f 100644 --- a/storage/innobase/include/page0zip.ic +++ b/storage/innobase/include/page0zip.ic @@ -1,6 +1,7 @@ /***************************************************************************** -Copyright (c) 2005, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2005, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -29,6 +30,7 @@ Created June 2005 by Marko Makela #endif #include "page0zip.h" +#include "mtr0log.h" #include "page0page.h" /* The format of compressed pages is as follows. @@ -389,6 +391,75 @@ page_zip_write_header( } } +/**********************************************************************//** +Write a log record of compressing an index page without the data on the page. */ +UNIV_INLINE +void +page_zip_compress_write_log_no_data( +/*================================*/ + ulint level, /*!< in: compression level */ + const page_t* page, /*!< in: page that is compressed */ + dict_index_t* index, /*!< in: index */ + mtr_t* mtr) /*!< in: mtr */ +{ + byte* log_ptr = mlog_open_and_write_index( + mtr, page, index, MLOG_ZIP_PAGE_COMPRESS_NO_DATA, 1); + + if (log_ptr) { + mach_write_to_1(log_ptr, level); + mlog_close(mtr, log_ptr + 1); + } +} + +/**********************************************************************//** +Parses a log record of compressing an index page without the data. 
+@return end of log record or NULL */ +UNIV_INLINE +byte* +page_zip_parse_compress_no_data( +/*============================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr, /*!< in: buffer end */ + page_t* page, /*!< in: uncompressed page */ + page_zip_des_t* page_zip, /*!< out: compressed page */ + dict_index_t* index) /*!< in: index */ +{ + ulint level; + if (end_ptr == ptr) { + return(NULL); + } + + level = mach_read_from_1(ptr); + + /* If page compression fails then there must be something wrong + because a compress log record is logged only if the compression + was successful. Crash in this case. */ + + if (page + && !page_zip_compress(page_zip, page, index, level, NULL)) { + ut_error; + } + + return(ptr + 1); +} + +/**********************************************************************//** +Reset the counters used for filling +INFORMATION_SCHEMA.innodb_cmp_per_index. */ +UNIV_INLINE +void +page_zip_reset_stat_per_index() +/*===========================*/ +{ + mutex_enter(&page_zip_stat_per_index_mutex); + + page_zip_stat_per_index.erase( + page_zip_stat_per_index.begin(), + page_zip_stat_per_index.end()); + + mutex_exit(&page_zip_stat_per_index_mutex); +} + #ifdef UNIV_MATERIALIZE # undef UNIV_INLINE # define UNIV_INLINE UNIV_INLINE_ORIGINAL diff --git a/storage/innobase/include/pars0pars.h b/storage/innobase/include/pars0pars.h index 9eb8aeb747f..65ff7533828 100644 --- a/storage/innobase/include/pars0pars.h +++ b/storage/innobase/include/pars0pars.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -469,9 +469,10 @@ que_thr_t* pars_complete_graph_for_exec( /*=========================*/ que_node_t* node, /*!< in: root node for an incomplete - query graph */ + query graph, or NULL for dummy graph */ trx_t* trx, /*!< in: transaction handle */ - mem_heap_t* heap); /*!< in: memory heap from which allocated */ + mem_heap_t* heap) /*!< in: memory heap from which allocated */ + __attribute__((nonnull(2,3), warn_unused_result)); /****************************************************************//** Create parser info struct. @@ -618,6 +619,18 @@ pars_info_add_ull_literal( ib_uint64_t val); /*!< in: value */ /****************************************************************//** +If the literal value already exists then it rebinds otherwise it +creates a new entry. */ +UNIV_INTERN +void +pars_info_bind_ull_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const ib_uint64_t* val) /*!< in: value */ + __attribute__((nonnull)); + +/****************************************************************//** Add bound id. */ UNIV_INTERN void @@ -628,16 +641,6 @@ pars_info_add_id( const char* id); /*!< in: id */ /****************************************************************//** -Get user function with the given name. -@return user func, or NULL if not found */ -UNIV_INTERN -pars_user_func_t* -pars_info_get_user_func( -/*====================*/ - pars_info_t* info, /*!< in: info struct */ - const char* name); /*!< in: function name to find*/ - -/****************************************************************//** Get bound literal with the given name. @return bound literal, or NULL if not found */ UNIV_INTERN @@ -665,7 +668,7 @@ pars_lexer_close(void); /*==================*/ /** Extra information supplied for pars_sql(). 
*/ -struct pars_info_struct { +struct pars_info_t { mem_heap_t* heap; /*!< our own memory heap */ ib_vector_t* funcs; /*!< user functions, or NUll @@ -680,14 +683,14 @@ struct pars_info_struct { }; /** User-supplied function and argument. */ -struct pars_user_func_struct { +struct pars_user_func_t { const char* name; /*!< function name */ pars_user_func_cb_t func; /*!< function address */ void* arg; /*!< user-supplied argument */ }; /** Bound literal. */ -struct pars_bound_lit_struct { +struct pars_bound_lit_t { const char* name; /*!< name */ const void* address; /*!< address */ ulint length; /*!< length of data */ @@ -697,20 +700,20 @@ struct pars_bound_lit_struct { }; /** Bound identifier. */ -struct pars_bound_id_struct { +struct pars_bound_id_t { const char* name; /*!< name */ const char* id; /*!< identifier */ }; /** Struct used to denote a reserved word in a parsing tree */ -struct pars_res_word_struct{ +struct pars_res_word_t{ int code; /*!< the token code for the reserved word from pars0grm.h */ }; /** A predefined function or operator node in a parsing tree; this construct is also used for some non-functions like the assignment ':=' */ -struct func_node_struct{ +struct func_node_t{ que_common_t common; /*!< type: QUE_NODE_FUNC */ int func; /*!< token code of the function name */ ulint fclass; /*!< class of the function */ @@ -725,14 +728,14 @@ struct func_node_struct{ }; /** An order-by node in a select */ -struct order_node_struct{ +struct order_node_t{ que_common_t common; /*!< type: QUE_NODE_ORDER */ sym_node_t* column; /*!< order-by column */ ibool asc; /*!< TRUE if ascending, FALSE if descending */ }; /** Procedure definition node */ -struct proc_node_struct{ +struct proc_node_t{ que_common_t common; /*!< type: QUE_NODE_PROC */ sym_node_t* proc_id; /*!< procedure name symbol in the symbol table of this same procedure */ @@ -742,14 +745,14 @@ struct proc_node_struct{ }; /** elsif-element node */ -struct elsif_node_struct{ +struct elsif_node_t{ 
que_common_t common; /*!< type: QUE_NODE_ELSIF */ que_node_t* cond; /*!< if condition */ que_node_t* stat_list; /*!< statement list */ }; /** if-statement node */ -struct if_node_struct{ +struct if_node_t{ que_common_t common; /*!< type: QUE_NODE_IF */ que_node_t* cond; /*!< if condition */ que_node_t* stat_list; /*!< statement list */ @@ -758,14 +761,14 @@ struct if_node_struct{ }; /** while-statement node */ -struct while_node_struct{ +struct while_node_t{ que_common_t common; /*!< type: QUE_NODE_WHILE */ que_node_t* cond; /*!< while condition */ que_node_t* stat_list; /*!< statement list */ }; /** for-loop-statement node */ -struct for_node_struct{ +struct for_node_t{ que_common_t common; /*!< type: QUE_NODE_FOR */ sym_node_t* loop_var; /*!< loop variable: this is the dereferenced symbol from the @@ -782,24 +785,24 @@ struct for_node_struct{ }; /** exit statement node */ -struct exit_node_struct{ +struct exit_node_t{ que_common_t common; /*!< type: QUE_NODE_EXIT */ }; /** return-statement node */ -struct return_node_struct{ +struct return_node_t{ que_common_t common; /*!< type: QUE_NODE_RETURN */ }; /** Assignment statement node */ -struct assign_node_struct{ +struct assign_node_t{ que_common_t common; /*!< type: QUE_NODE_ASSIGNMENT */ sym_node_t* var; /*!< variable to set */ que_node_t* val; /*!< value to assign */ }; /** Column assignment node */ -struct col_assign_node_struct{ +struct col_assign_node_t{ que_common_t common; /*!< type: QUE_NODE_COL_ASSIGN */ sym_node_t* col; /*!< column to set */ que_node_t* val; /*!< value to assign */ diff --git a/storage/innobase/include/pars0sym.h b/storage/innobase/include/pars0sym.h index 4b3b342a533..bcf73639228 100644 --- a/storage/innobase/include/pars0sym.h +++ b/storage/innobase/include/pars0sym.h @@ -119,9 +119,9 @@ sym_tab_add_bound_id( sym_tab_t* sym_tab, /*!< in: symbol table */ const char* name); /*!< in: name of bound id */ -/** Index of sym_node_struct::field_nos corresponding to the clustered index */ +/** 
Index of sym_node_t::field_nos corresponding to the clustered index */ #define SYM_CLUST_FIELD_NO 0 -/** Index of sym_node_struct::field_nos corresponding to a secondary index */ +/** Index of sym_node_t::field_nos corresponding to a secondary index */ #define SYM_SEC_FIELD_NO 1 /** Types of a symbol table node */ @@ -143,7 +143,7 @@ enum sym_tab_entry { }; /** Symbol table node */ -struct sym_node_struct{ +struct sym_node_t{ que_common_t common; /*!< node type: QUE_NODE_SYMBOL */ /* NOTE: if the data field in 'common.val' is not NULL and the symbol @@ -227,7 +227,7 @@ struct sym_node_struct{ }; /** Symbol table */ -struct sym_tab_struct{ +struct sym_tab_t{ que_t* query_graph; /*!< query graph generated by the parser */ diff --git a/storage/innobase/include/pars0types.h b/storage/innobase/include/pars0types.h index 13ae53f3fd6..47f4b432d20 100644 --- a/storage/innobase/include/pars0types.h +++ b/storage/innobase/include/pars0types.h @@ -26,24 +26,24 @@ Created 1/11/1998 Heikki Tuuri #ifndef pars0types_h #define pars0types_h -typedef struct pars_info_struct pars_info_t; -typedef struct pars_user_func_struct pars_user_func_t; -typedef struct pars_bound_lit_struct pars_bound_lit_t; -typedef struct pars_bound_id_struct pars_bound_id_t; -typedef struct sym_node_struct sym_node_t; -typedef struct sym_tab_struct sym_tab_t; -typedef struct pars_res_word_struct pars_res_word_t; -typedef struct func_node_struct func_node_t; -typedef struct order_node_struct order_node_t; -typedef struct proc_node_struct proc_node_t; -typedef struct elsif_node_struct elsif_node_t; -typedef struct if_node_struct if_node_t; -typedef struct while_node_struct while_node_t; -typedef struct for_node_struct for_node_t; -typedef struct exit_node_struct exit_node_t; -typedef struct return_node_struct return_node_t; -typedef struct assign_node_struct assign_node_t; -typedef struct col_assign_node_struct col_assign_node_t; +struct pars_info_t; +struct pars_user_func_t; +struct pars_bound_lit_t; +struct 
pars_bound_id_t; +struct sym_node_t; +struct sym_tab_t; +struct pars_res_word_t; +struct func_node_t; +struct order_node_t; +struct proc_node_t; +struct elsif_node_t; +struct if_node_t; +struct while_node_t; +struct for_node_t; +struct exit_node_t; +struct return_node_t; +struct assign_node_t; +struct col_assign_node_t; typedef UT_LIST_BASE_NODE_T(sym_node_t) sym_node_list_t; diff --git a/storage/innobase/include/que0que.h b/storage/innobase/include/que0que.h index 531794ce688..ba8828623af 100644 --- a/storage/innobase/include/que0que.h +++ b/storage/innobase/include/que0que.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -42,7 +42,7 @@ of SQL execution in the UNIV_SQL_DEBUG version */ extern ibool que_trace_on; /** Mutex protecting the query threads. */ -extern mutex_t que_thr_mutex; +extern ib_mutex_t que_thr_mutex; /***********************************************************************//** Creates a query graph fork node. 
@@ -310,7 +310,7 @@ que_node_print_info( Evaluate the given SQL @return error code or DB_SUCCESS */ UNIV_INTERN -enum db_err +dberr_t que_eval_sql( /*=========*/ pars_info_t* info, /*!< in: info struct, or NULL */ @@ -349,7 +349,7 @@ que_close(void); /* Query graph query thread node: the fields are protected by the trx_t::mutex with the exceptions named below */ -struct que_thr_struct{ +struct que_thr_t{ que_common_t common; /*!< type: QUE_NODE_THR */ ulint magic_n; /*!< magic number to catch memory corruption */ @@ -374,7 +374,7 @@ struct que_thr_struct{ thus far */ ulint lock_state; /*!< lock state of thread (table or row) */ - struct srv_slot_struct* + struct srv_slot_t* slot; /* The thread slot in the wait array in srv_sys_t */ /*------------------------------*/ @@ -398,7 +398,7 @@ struct que_thr_struct{ #define QUE_THR_MAGIC_FREED 123461526 /* Query graph fork node: its fields are protected by the query thread mutex */ -struct que_fork_struct{ +struct que_fork_t{ que_common_t common; /*!< type: QUE_NODE_FORK */ que_t* graph; /*!< query graph of this node */ ulint fork_type; /*!< fork type */ diff --git a/storage/innobase/include/que0types.h b/storage/innobase/include/que0types.h index b165b817d87..0f11cad301a 100644 --- a/storage/innobase/include/que0types.h +++ b/storage/innobase/include/que0types.h @@ -32,18 +32,15 @@ Created 5/27/1996 Heikki Tuuri /* Pseudotype for all graph nodes */ typedef void que_node_t; -typedef struct que_fork_struct que_fork_t; - /* Query graph root is a fork node */ -typedef que_fork_t que_t; +typedef struct que_fork_t que_t; -typedef struct que_thr_struct que_thr_t; -typedef struct que_common_struct que_common_t; +struct que_thr_t; /* Common struct at the beginning of each query graph node; the name of this substruct must be 'common' */ -struct que_common_struct{ +struct que_common_t{ ulint type; /*!< query node type */ que_node_t* parent; /*!< back pointer to parent node, or NULL */ que_node_t* brother;/* pointer to a possible 
brother node */ diff --git a/storage/innobase/include/read0read.h b/storage/innobase/include/read0read.h index 6ea57fffcd2..980faddf98e 100644 --- a/storage/innobase/include/read0read.h +++ b/storage/innobase/include/read0read.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -57,12 +57,14 @@ read_view_purge_open( mem_heap_t* heap); /*!< in: memory heap from which allocated */ /*********************************************************************//** -Remove read view from the trx_sys->view_list. */ -UNIV_INTERN +Remove a read view from the trx_sys->view_list. */ +UNIV_INLINE void read_view_remove( /*=============*/ - read_view_t* view); /*!< in: read view */ + read_view_t* view, /*!< in: read view, can be 0 */ + bool own_mutex); /*!< in: true if caller owns the + trx_sys_t::mutex */ /*********************************************************************//** Closes a consistent read view for MySQL. This function is called at an SQL statement end if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */ @@ -73,13 +75,14 @@ read_view_close_for_mysql( trx_t* trx); /*!< in: trx which has a read view */ /*********************************************************************//** Checks if a read view sees the specified transaction. 
-@return TRUE if sees */ +@return true if sees */ UNIV_INLINE -ibool +bool read_view_sees_trx_id( /*==================*/ const read_view_t* view, /*!< in: read view */ - trx_id_t trx_id);/*!< in: trx id */ + trx_id_t trx_id) /*!< in: trx id */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Prints a read view to stderr. */ UNIV_INTERN @@ -119,7 +122,7 @@ read_cursor_set_for_mysql( /** Read view lists the trx ids of those transactions for which a consistent read should not see the modifications to the database. */ -struct read_view_struct{ +struct read_view_t{ ulint type; /*!< VIEW_NORMAL, VIEW_HIGH_GRANULARITY */ undo_no_t undo_no;/*!< 0 or if type is VIEW_HIGH_GRANULARITY @@ -145,7 +148,7 @@ struct read_view_struct{ trx_id_t* trx_ids;/*!< Additional trx ids which the read should not see: typically, these are the read-write active transactions at the time when the read - is serialized, except the reading transaction + is serialized, except the reading transaction itself; the trx ids in this array are in a descending order. These trx_ids should be between the "low" and "high" water marks, @@ -173,7 +176,7 @@ struct read_view_struct{ cursors. This struct holds both heap where consistent read view is allocated and pointer to a read view. */ -struct cursor_view_struct{ +struct cursor_view_t{ mem_heap_t* heap; /*!< Memory heap for the cursor view */ read_view_t* read_view; diff --git a/storage/innobase/include/read0read.ic b/storage/innobase/include/read0read.ic index 436800e1585..82c1028f12e 100644 --- a/storage/innobase/include/read0read.ic +++ b/storage/innobase/include/read0read.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -23,11 +23,64 @@ Cursor read Created 2/16/1997 Heikki Tuuri *******************************************************/ +#include "trx0sys.h" + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Validates a read view object. */ +static +bool +read_view_validate( +/*===============*/ + const read_view_t* view) /*!< in: view to validate */ +{ + ut_ad(mutex_own(&trx_sys->mutex)); + + /* Check that the view->trx_ids array is in descending order. */ + for (ulint i = 1; i < view->n_trx_ids; ++i) { + + ut_a(view->trx_ids[i] < view->trx_ids[i - 1]); + } + + return(true); +} + +/** Functor to validate the view list. */ +struct ViewCheck { + + ViewCheck() : m_prev_view(0) { } + + void operator()(const read_view_t* view) + { + ut_a(m_prev_view == NULL + || m_prev_view->low_limit_no >= view->low_limit_no); + + m_prev_view = view; + } + + const read_view_t* m_prev_view; +}; + +/*********************************************************************//** +Validates a read view list. */ +static +bool +read_view_list_validate(void) +/*=========================*/ +{ + ut_ad(mutex_own(&trx_sys->mutex)); + + ut_list_map(trx_sys->view_list, &read_view_t::view_list, ViewCheck()); + + return(true); +} +#endif /* UNIV_DEBUG */ + /*********************************************************************//** Checks if a read view sees the specified transaction. 
-@return TRUE if sees */ +@return true if sees */ UNIV_INLINE -ibool +bool read_view_sees_trx_id( /*==================*/ const read_view_t* view, /*!< in: read view */ @@ -35,10 +88,10 @@ read_view_sees_trx_id( { if (trx_id < view->up_limit_id) { - return(TRUE); + return(true); } else if (trx_id >= view->low_limit_id) { - return(FALSE); + return(false); } else { ulint lower = 0; ulint upper = view->n_trx_ids - 1; @@ -63,5 +116,33 @@ read_view_sees_trx_id( } while (lower <= upper); } - return(TRUE); + return(true); +} + +/*********************************************************************//** +Remove a read view from the trx_sys->view_list. */ +UNIV_INLINE +void +read_view_remove( +/*=============*/ + read_view_t* view, /*!< in: read view, can be 0 */ + bool own_mutex) /*!< in: true if caller owns the + trx_sys_t::mutex */ +{ + if (view != 0) { + if (!own_mutex) { + mutex_enter(&trx_sys->mutex); + } + + ut_ad(read_view_validate(view)); + + UT_LIST_REMOVE(view_list, trx_sys->view_list, view); + + ut_ad(read_view_list_validate()); + + if (!own_mutex) { + mutex_exit(&trx_sys->mutex); + } + } } + diff --git a/storage/innobase/include/read0types.h b/storage/innobase/include/read0types.h index 0b6aa132b88..969f4ebb637 100644 --- a/storage/innobase/include/read0types.h +++ b/storage/innobase/include/read0types.h @@ -26,7 +26,7 @@ Created 2/16/1997 Heikki Tuuri #ifndef read0types_h #define read0types_h -typedef struct read_view_struct read_view_t; -typedef struct cursor_view_struct cursor_view_t; +struct read_view_t; +struct cursor_view_t; #endif diff --git a/storage/innobase/include/rem0cmp.h b/storage/innobase/include/rem0cmp.h index ed6486aa603..cb3c85ac2c8 100644 --- a/storage/innobase/include/rem0cmp.h +++ b/storage/innobase/include/rem0cmp.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. 
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -156,21 +156,28 @@ respectively, when only the common first fields are compared, or until the first externally stored field in rec */ UNIV_INTERN int -cmp_dtuple_rec_with_match( -/*======================*/ +cmp_dtuple_rec_with_match_low( +/*==========================*/ const dtuple_t* dtuple, /*!< in: data tuple */ const rec_t* rec, /*!< in: physical record which differs from dtuple in some of the common fields, or which has an equal number or more fields than dtuple */ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ - ulint* matched_fields, /*!< in/out: number of already completely + ulint n_cmp, /*!< in: number of fields to compare */ + ulint* matched_fields, + /*!< in/out: number of already completely matched fields; when function returns, contains the value for current comparison */ - ulint* matched_bytes); /*!< in/out: number of already matched + ulint* matched_bytes) + /*!< in/out: number of already matched bytes within the first field not completely matched; when function returns, contains the value for current comparison */ + __attribute__((nonnull)); +#define cmp_dtuple_rec_with_match(tuple,rec,offsets,fields,bytes) \ + cmp_dtuple_rec_with_match_low( \ + tuple,rec,offsets,dtuple_get_n_fields_cmp(tuple),fields,bytes) /**************************************************************//** Compares a data tuple to a physical record. @see cmp_dtuple_rec_with_match @@ -196,7 +203,9 @@ cmp_dtuple_is_prefix_of_rec( /*************************************************************//** Compare two physical records that contain the same number of columns, none of which are stored externally. 
-@return 1, 0, -1 if rec1 is greater, equal, less, respectively, than rec2 */ +@retval 1 if rec1 (including non-ordering columns) is greater than rec2 +@retval -1 if rec1 (including non-ordering columns) is less than rec2 +@retval 0 if rec1 is a duplicate of rec2 */ UNIV_INTERN int cmp_rec_rec_simple( @@ -206,8 +215,10 @@ cmp_rec_rec_simple( const ulint* offsets1,/*!< in: rec_get_offsets(rec1, ...) */ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, ...) */ const dict_index_t* index, /*!< in: data dictionary index */ - ibool* null_eq);/*!< out: set to TRUE if - found matching null values */ + struct TABLE* table) /*!< in: MySQL table, for reporting + duplicate key value if applicable, + or NULL */ + __attribute__((nonnull(1,2,3,4), warn_unused_result)); /*************************************************************//** This function is used to compare two physical records. Only the common first fields are compared, and if an externally stored field is diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h index c6c70bb5f09..2a84aee7a6f 100644 --- a/storage/innobase/include/rem0rec.h +++ b/storage/innobase/include/rem0rec.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -66,6 +66,15 @@ The status is stored in the low-order bits. 
*/ /* Length of a B-tree node pointer, in bytes */ #define REC_NODE_PTR_SIZE 4 +/** SQL null flag in a 1-byte offset of ROW_FORMAT=REDUNDANT records */ +#define REC_1BYTE_SQL_NULL_MASK 0x80UL +/** SQL null flag in a 2-byte offset of ROW_FORMAT=REDUNDANT records */ +#define REC_2BYTE_SQL_NULL_MASK 0x8000UL + +/** In a 2-byte offset of ROW_FORMAT=REDUNDANT records, the second most +significant bit denotes that the tail of a field is stored off-page. */ +#define REC_2BYTE_EXTERN_MASK 0x4000UL + #ifdef UNIV_DEBUG /* Length of the rec_get_offsets() header */ # define REC_OFFS_HEADER_SIZE 4 @@ -88,7 +97,8 @@ const rec_t* rec_get_next_ptr_const( /*===================*/ const rec_t* rec, /*!< in: physical record */ - ulint comp); /*!< in: nonzero=compact page format */ + ulint comp) /*!< in: nonzero=compact page format */ + __attribute__((nonnull, pure, warn_unused_result)); /******************************************************//** The following function is used to get the pointer of the next chained record on the same page. @@ -98,7 +108,8 @@ rec_t* rec_get_next_ptr( /*=============*/ rec_t* rec, /*!< in: physical record */ - ulint comp); /*!< in: nonzero=compact page format */ + ulint comp) /*!< in: nonzero=compact page format */ + __attribute__((nonnull, pure, warn_unused_result)); /******************************************************//** The following function is used to get the offset of the next chained record on the same page. @@ -108,7 +119,8 @@ ulint rec_get_next_offs( /*==============*/ const rec_t* rec, /*!< in: physical record */ - ulint comp); /*!< in: nonzero=compact page format */ + ulint comp) /*!< in: nonzero=compact page format */ + __attribute__((nonnull, pure, warn_unused_result)); /******************************************************//** The following function is used to set the next record offset field of an old-style record. 
*/ @@ -117,7 +129,8 @@ void rec_set_next_offs_old( /*==================*/ rec_t* rec, /*!< in: old-style physical record */ - ulint next); /*!< in: offset of the next record */ + ulint next) /*!< in: offset of the next record */ + __attribute__((nonnull)); /******************************************************//** The following function is used to set the next record offset field of a new-style record. */ @@ -126,7 +139,8 @@ void rec_set_next_offs_new( /*==================*/ rec_t* rec, /*!< in/out: new-style physical record */ - ulint next); /*!< in: offset of the next record */ + ulint next) /*!< in: offset of the next record */ + __attribute__((nonnull)); /******************************************************//** The following function is used to get the number of fields in an old-style record. @@ -135,7 +149,8 @@ UNIV_INLINE ulint rec_get_n_fields_old( /*=================*/ - const rec_t* rec); /*!< in: physical record */ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull, pure, warn_unused_result)); /******************************************************//** The following function is used to get the number of fields in a record. @@ -145,7 +160,8 @@ ulint rec_get_n_fields( /*=============*/ const rec_t* rec, /*!< in: physical record */ - const dict_index_t* index); /*!< in: record descriptor */ + const dict_index_t* index) /*!< in: record descriptor */ + __attribute__((nonnull, pure, warn_unused_result)); /******************************************************//** The following function is used to get the number of records owned by the previous directory record. @@ -154,7 +170,8 @@ UNIV_INLINE ulint rec_get_n_owned_old( /*================*/ - const rec_t* rec); /*!< in: old-style physical record */ + const rec_t* rec) /*!< in: old-style physical record */ + __attribute__((nonnull, pure, warn_unused_result)); /******************************************************//** The following function is used to set the number of owned records. 
*/ UNIV_INLINE @@ -162,7 +179,8 @@ void rec_set_n_owned_old( /*================*/ rec_t* rec, /*!< in: old-style physical record */ - ulint n_owned); /*!< in: the number of owned */ + ulint n_owned) /*!< in: the number of owned */ + __attribute__((nonnull)); /******************************************************//** The following function is used to get the number of records owned by the previous directory record. @@ -171,7 +189,8 @@ UNIV_INLINE ulint rec_get_n_owned_new( /*================*/ - const rec_t* rec); /*!< in: new-style physical record */ + const rec_t* rec) /*!< in: new-style physical record */ + __attribute__((nonnull, pure, warn_unused_result)); /******************************************************//** The following function is used to set the number of owned records. */ UNIV_INLINE @@ -180,7 +199,8 @@ rec_set_n_owned_new( /*================*/ rec_t* rec, /*!< in/out: new-style physical record */ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ - ulint n_owned);/*!< in: the number of owned */ + ulint n_owned)/*!< in: the number of owned */ + __attribute__((nonnull(1))); /******************************************************//** The following function is used to retrieve the info bits of a record. @@ -190,7 +210,8 @@ ulint rec_get_info_bits( /*==============*/ const rec_t* rec, /*!< in: physical record */ - ulint comp); /*!< in: nonzero=compact page format */ + ulint comp) /*!< in: nonzero=compact page format */ + __attribute__((nonnull, pure, warn_unused_result)); /******************************************************//** The following function is used to set the info bits of a record. 
*/ UNIV_INLINE @@ -198,7 +219,8 @@ void rec_set_info_bits_old( /*==================*/ rec_t* rec, /*!< in: old-style physical record */ - ulint bits); /*!< in: info bits */ + ulint bits) /*!< in: info bits */ + __attribute__((nonnull)); /******************************************************//** The following function is used to set the info bits of a record. */ UNIV_INLINE @@ -206,7 +228,8 @@ void rec_set_info_bits_new( /*==================*/ rec_t* rec, /*!< in/out: new-style physical record */ - ulint bits); /*!< in: info bits */ + ulint bits) /*!< in: info bits */ + __attribute__((nonnull)); /******************************************************//** The following function retrieves the status bits of a new-style record. @return status bits */ @@ -214,7 +237,8 @@ UNIV_INLINE ulint rec_get_status( /*===========*/ - const rec_t* rec); /*!< in: physical record */ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull, pure, warn_unused_result)); /******************************************************//** The following function is used to set the status bits of a new-style record. */ @@ -223,7 +247,8 @@ void rec_set_status( /*===========*/ rec_t* rec, /*!< in/out: physical record */ - ulint bits); /*!< in: info bits */ + ulint bits) /*!< in: info bits */ + __attribute__((nonnull)); /******************************************************//** The following function is used to retrieve the info and status @@ -234,7 +259,8 @@ ulint rec_get_info_and_status_bits( /*=========================*/ const rec_t* rec, /*!< in: physical record */ - ulint comp); /*!< in: nonzero=compact page format */ + ulint comp) /*!< in: nonzero=compact page format */ + __attribute__((nonnull, pure, warn_unused_result)); /******************************************************//** The following function is used to set the info and status bits of a record. (Only compact records have status bits.) 
*/ @@ -243,7 +269,8 @@ void rec_set_info_and_status_bits( /*=========================*/ rec_t* rec, /*!< in/out: compact physical record */ - ulint bits); /*!< in: info bits */ + ulint bits) /*!< in: info bits */ + __attribute__((nonnull)); /******************************************************//** The following function tells if record is delete marked. @@ -253,7 +280,8 @@ ulint rec_get_deleted_flag( /*=================*/ const rec_t* rec, /*!< in: physical record */ - ulint comp); /*!< in: nonzero=compact page format */ + ulint comp) /*!< in: nonzero=compact page format */ + __attribute__((nonnull, pure, warn_unused_result)); /******************************************************//** The following function is used to set the deleted bit. */ UNIV_INLINE @@ -261,7 +289,8 @@ void rec_set_deleted_flag_old( /*=====================*/ rec_t* rec, /*!< in: old-style physical record */ - ulint flag); /*!< in: nonzero if delete marked */ + ulint flag) /*!< in: nonzero if delete marked */ + __attribute__((nonnull)); /******************************************************//** The following function is used to set the deleted bit. */ UNIV_INLINE @@ -270,7 +299,8 @@ rec_set_deleted_flag_new( /*=====================*/ rec_t* rec, /*!< in/out: new-style physical record */ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ - ulint flag); /*!< in: nonzero if delete marked */ + ulint flag) /*!< in: nonzero if delete marked */ + __attribute__((nonnull(1))); /******************************************************//** The following function tells if a new-style record is a node pointer. 
@return TRUE if node pointer */ @@ -278,7 +308,8 @@ UNIV_INLINE ibool rec_get_node_ptr_flag( /*==================*/ - const rec_t* rec); /*!< in: physical record */ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull, pure, warn_unused_result)); /******************************************************//** The following function is used to get the order number of an old-style record in the heap of the index page. @@ -287,7 +318,8 @@ UNIV_INLINE ulint rec_get_heap_no_old( /*================*/ - const rec_t* rec); /*!< in: physical record */ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull, pure, warn_unused_result)); /******************************************************//** The following function is used to set the heap number field in an old-style record. */ @@ -296,7 +328,8 @@ void rec_set_heap_no_old( /*================*/ rec_t* rec, /*!< in: physical record */ - ulint heap_no);/*!< in: the heap number */ + ulint heap_no)/*!< in: the heap number */ + __attribute__((nonnull)); /******************************************************//** The following function is used to get the order number of a new-style record in the heap of the index page. @@ -305,7 +338,8 @@ UNIV_INLINE ulint rec_get_heap_no_new( /*================*/ - const rec_t* rec); /*!< in: physical record */ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull, pure, warn_unused_result)); /******************************************************//** The following function is used to set the heap number field in a new-style record. */ @@ -314,7 +348,8 @@ void rec_set_heap_no_new( /*================*/ rec_t* rec, /*!< in/out: physical record */ - ulint heap_no);/*!< in: the heap number */ + ulint heap_no)/*!< in: the heap number */ + __attribute__((nonnull)); /******************************************************//** The following function is used to test whether the data offsets in the record are stored in one-byte or two-byte format. 
@@ -323,7 +358,57 @@ UNIV_INLINE ibool rec_get_1byte_offs_flag( /*====================*/ - const rec_t* rec); /*!< in: physical record */ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull, pure, warn_unused_result)); + +/******************************************************//** +The following function is used to set the 1-byte offsets flag. */ +UNIV_INLINE +void +rec_set_1byte_offs_flag( +/*====================*/ + rec_t* rec, /*!< in: physical record */ + ibool flag) /*!< in: TRUE if 1byte form */ + __attribute__((nonnull)); + +/******************************************************//** +Returns the offset of nth field end if the record is stored in the 1-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the field, SQL null flag ORed */ +UNIV_INLINE +ulint +rec_1_get_field_end_info( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ + __attribute__((nonnull, pure, warn_unused_result)); + +/******************************************************//** +Returns the offset of nth field end if the record is stored in the 2-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the field, SQL null flag and extern +storage flag ORed */ +UNIV_INLINE +ulint +rec_2_get_field_end_info( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ + __attribute__((nonnull, pure, warn_unused_result)); + +/******************************************************//** +Returns nonzero if the field is stored off-page. 
+@retval 0 if the field is stored in-page +@retval REC_2BYTE_EXTERN_MASK if the field is stored externally */ +UNIV_INLINE +ulint +rec_2_is_field_extern( +/*==================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ + __attribute__((nonnull, pure, warn_unused_result)); /******************************************************//** Determine how many of the first n columns in a compact @@ -333,9 +418,10 @@ UNIV_INTERN ulint rec_get_n_extern_new( /*=================*/ - const rec_t* rec, /*!< in: compact physical record */ - dict_index_t* index, /*!< in: record descriptor */ - ulint n); /*!< in: number of columns to scan */ + const rec_t* rec, /*!< in: compact physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint n) /*!< in: number of columns to scan */ + __attribute__((nonnull, warn_unused_result)); /******************************************************//** The following function determines the offsets to each field @@ -356,30 +442,13 @@ rec_get_offsets_func( (ULINT_UNDEFINED if all fields) */ mem_heap_t** heap, /*!< in/out: memory heap */ const char* file, /*!< in: file name where called */ - ulint line); /*!< in: line number where called */ + ulint line) /*!< in: line number where called */ + __attribute__((nonnull(1,2,5,6),warn_unused_result)); #define rec_get_offsets(rec,index,offsets,n,heap) \ rec_get_offsets_func(rec,index,offsets,n,heap,__FILE__,__LINE__) /******************************************************//** -Determine the offset to each field in a leaf-page record -in ROW_FORMAT=COMPACT. This is a special case of -rec_init_offsets() and rec_get_offsets_func(). 
*/ -UNIV_INTERN -void -rec_init_offsets_comp_ordinary( -/*===========================*/ - const rec_t* rec, /*!< in: physical record in - ROW_FORMAT=COMPACT */ - ulint extra, /*!< in: number of bytes to reserve - between the record header and - the data payload - (usually REC_N_NEW_EXTRA_BYTES) */ - const dict_index_t* index, /*!< in: record descriptor */ - ulint* offsets);/*!< in/out: array of offsets; - in: n=rec_offs_n_fields(offsets) */ - -/******************************************************//** The following function determines the offsets to each field in the record. It can reuse a previously allocated array. */ UNIV_INTERN @@ -393,9 +462,10 @@ rec_get_offsets_reverse( const dict_index_t* index, /*!< in: record descriptor */ ulint node_ptr,/*!< in: nonzero=node pointer, 0=leaf node */ - ulint* offsets);/*!< in/out: array consisting of + ulint* offsets)/*!< in/out: array consisting of offsets[0] allocated elements */ - + __attribute__((nonnull)); +#ifdef UNIV_DEBUG /************************************************************//** Validates offsets returned by rec_get_offsets(). @return TRUE if valid */ @@ -405,9 +475,9 @@ rec_offs_validate( /*==============*/ const rec_t* rec, /*!< in: record or NULL */ const dict_index_t* index, /*!< in: record descriptor or NULL */ - const ulint* offsets);/*!< in: array returned by + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ -#ifdef UNIV_DEBUG + __attribute__((nonnull(3), warn_unused_result)); /************************************************************//** Updates debug data in offsets, in order to avoid bogus rec_offs_validate() failures. 
*/ @@ -417,8 +487,9 @@ rec_offs_make_valid( /*================*/ const rec_t* rec, /*!< in: record */ const dict_index_t* index, /*!< in: record descriptor */ - ulint* offsets);/*!< in: array returned by + ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull)); #else # define rec_offs_make_valid(rec, index, offsets) ((void) 0) #endif /* UNIV_DEBUG */ @@ -433,8 +504,9 @@ rec_get_nth_field_offs_old( /*=======================*/ const rec_t* rec, /*!< in: record */ ulint n, /*!< in: index of the field */ - ulint* len); /*!< out: length of the field; UNIV_SQL_NULL + ulint* len) /*!< out: length of the field; UNIV_SQL_NULL if SQL null */ + __attribute__((nonnull)); #define rec_get_nth_field_old(rec, n, len) \ ((rec) + rec_get_nth_field_offs_old(rec, n, len)) /************************************************************//** @@ -447,7 +519,8 @@ ulint rec_get_nth_field_size( /*===================*/ const rec_t* rec, /*!< in: record */ - ulint n); /*!< in: index of the field */ + ulint n) /*!< in: index of the field */ + __attribute__((nonnull, pure, warn_unused_result)); /************************************************************//** The following function is used to get an offset to the nth data field in a record. 
@@ -458,8 +531,9 @@ rec_get_nth_field_offs( /*===================*/ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ ulint n, /*!< in: index of the field */ - ulint* len); /*!< out: length of the field; UNIV_SQL_NULL + ulint* len) /*!< out: length of the field; UNIV_SQL_NULL if SQL null */ + __attribute__((nonnull)); #define rec_get_nth_field(rec, offsets, n, len) \ ((rec) + rec_get_nth_field_offs(offsets, n, len)) /******************************************************//** @@ -470,7 +544,8 @@ UNIV_INLINE ulint rec_offs_comp( /*==========*/ - const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); /******************************************************//** Determine if the offsets are for a record containing externally stored columns. @@ -479,8 +554,8 @@ UNIV_INLINE ulint rec_offs_any_extern( /*================*/ - const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); /******************************************************//** Determine if the offsets are for a record containing null BLOB pointers. @return first field containing a null BLOB pointer, or NULL if none found */ @@ -490,8 +565,7 @@ rec_offs_any_null_extern( /*=====================*/ const rec_t* rec, /*!< in: record */ const ulint* offsets) /*!< in: rec_get_offsets(rec) */ - __attribute__((nonnull, warn_unused_result)); -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + __attribute__((nonnull, pure, warn_unused_result)); /******************************************************//** Returns nonzero if the extern bit is set in nth field of rec. 
@return nonzero if externally stored */ @@ -500,7 +574,8 @@ ulint rec_offs_nth_extern( /*================*/ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ - ulint n); /*!< in: nth field */ + ulint n) /*!< in: nth field */ + __attribute__((nonnull, pure, warn_unused_result)); /******************************************************//** Returns nonzero if the SQL NULL bit is set in nth field of rec. @return nonzero if SQL NULL */ @@ -509,7 +584,8 @@ ulint rec_offs_nth_sql_null( /*==================*/ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ - ulint n); /*!< in: nth field */ + ulint n) /*!< in: nth field */ + __attribute__((nonnull, pure, warn_unused_result)); /******************************************************//** Gets the physical size of a field. @return length of field */ @@ -518,7 +594,8 @@ ulint rec_offs_nth_size( /*==============*/ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ - ulint n); /*!< in: nth field */ + ulint n) /*!< in: nth field */ + __attribute__((nonnull, pure, warn_unused_result)); /******************************************************//** Returns the number of extern bits set in a record. @@ -527,7 +604,8 @@ UNIV_INLINE ulint rec_offs_n_extern( /*==============*/ - const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); /***********************************************************//** This is used to modify the value of an already existing field in a record. The previous value must have exactly the same size as the new value. If len @@ -542,11 +620,12 @@ rec_set_nth_field( const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ ulint n, /*!< in: index number of the field */ const void* data, /*!< in: pointer to the data if not SQL null */ - ulint len); /*!< in: length of the data or UNIV_SQL_NULL. 
+ ulint len) /*!< in: length of the data or UNIV_SQL_NULL. If not SQL null, must have the same length as the previous value. If SQL null, previous value must be SQL null. */ + __attribute__((nonnull(1,2))); /**********************************************************//** The following function returns the data size of an old-style physical record, that is the sum of field lengths. SQL null fields @@ -557,7 +636,8 @@ UNIV_INLINE ulint rec_get_data_size_old( /*==================*/ - const rec_t* rec); /*!< in: physical record */ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull, pure, warn_unused_result)); /**********************************************************//** The following function returns the number of allocated elements for an array of offsets. @@ -566,7 +646,8 @@ UNIV_INLINE ulint rec_offs_get_n_alloc( /*=================*/ - const ulint* offsets);/*!< in: array for rec_get_offsets() */ + const ulint* offsets)/*!< in: array for rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); /**********************************************************//** The following function sets the number of allocated elements for an array of offsets. 
*/ @@ -576,7 +657,8 @@ rec_offs_set_n_alloc( /*=================*/ ulint* offsets, /*!< out: array for rec_get_offsets(), must be allocated */ - ulint n_alloc); /*!< in: number of elements */ + ulint n_alloc) /*!< in: number of elements */ + __attribute__((nonnull)); #define rec_offs_init(offsets) \ rec_offs_set_n_alloc(offsets, (sizeof offsets) / sizeof *offsets) /**********************************************************//** @@ -586,7 +668,8 @@ UNIV_INLINE ulint rec_offs_n_fields( /*==============*/ - const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); /**********************************************************//** The following function returns the data size of a physical record, that is the sum of field lengths. SQL null fields @@ -597,7 +680,8 @@ UNIV_INLINE ulint rec_offs_data_size( /*===============*/ - const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); /**********************************************************//** Returns the total size of record minus data size of record. The value returned by the function is the distance from record @@ -607,7 +691,8 @@ UNIV_INLINE ulint rec_offs_extra_size( /*================*/ - const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); /**********************************************************//** Returns the total size of a physical record. 
@return size */ @@ -615,7 +700,8 @@ UNIV_INLINE ulint rec_offs_size( /*==========*/ - const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); #ifdef UNIV_DEBUG /**********************************************************//** Returns a pointer to the start of the record. @@ -625,7 +711,8 @@ byte* rec_get_start( /*==========*/ const rec_t* rec, /*!< in: pointer to record */ - const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); /**********************************************************//** Returns a pointer to the end of the record. @return pointer to end */ @@ -634,7 +721,8 @@ byte* rec_get_end( /*========*/ const rec_t* rec, /*!< in: pointer to record */ - const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); #else /* UNIV_DEBUG */ # define rec_get_start(rec, offsets) ((rec) - rec_offs_extra_size(offsets)) # define rec_get_end(rec, offsets) ((rec) + rec_offs_data_size(offsets)) @@ -648,8 +736,48 @@ rec_copy( /*=====*/ void* buf, /*!< in: buffer */ const rec_t* rec, /*!< in: physical record */ - const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull)); #ifndef UNIV_HOTBACKUP +/**********************************************************//** +Determines the size of a data tuple prefix in a temporary file. 
+@return total size */ +UNIV_INTERN +ulint +rec_get_converted_size_temp( +/*========================*/ + const dict_index_t* index, /*!< in: record descriptor */ + const dfield_t* fields, /*!< in: array of data fields */ + ulint n_fields,/*!< in: number of data fields */ + ulint* extra) /*!< out: extra size */ + __attribute__((warn_unused_result, nonnull)); + +/******************************************************//** +Determine the offset to each field in temporary file. +@see rec_convert_dtuple_to_temp() */ +UNIV_INTERN +void +rec_init_offsets_temp( +/*==================*/ + const rec_t* rec, /*!< in: temporary file record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint* offsets)/*!< in/out: array of offsets; + in: n=rec_offs_n_fields(offsets) */ + __attribute__((nonnull)); + +/*********************************************************//** +Builds a temporary file record out of a data tuple. +@see rec_init_offsets_temp() */ +UNIV_INTERN +void +rec_convert_dtuple_to_temp( +/*=======================*/ + rec_t* rec, /*!< out: record */ + const dict_index_t* index, /*!< in: record descriptor */ + const dfield_t* fields, /*!< in: array of data fields */ + ulint n_fields) /*!< in: number of fields */ + __attribute__((nonnull)); + /**************************************************************//** Copies the first n fields of a physical record to a new physical record in a buffer. @@ -665,7 +793,8 @@ rec_copy_prefix_to_buf( byte** buf, /*!< in/out: memory buffer for the copied prefix, or NULL */ - ulint* buf_size); /*!< in/out: buffer size */ + ulint* buf_size) /*!< in/out: buffer size */ + __attribute__((nonnull)); /************************************************************//** Folds a prefix of a physical record to a ulint. 
@return the folded value */ @@ -681,24 +810,9 @@ rec_fold( ulint n_bytes, /*!< in: number of bytes to fold in an incomplete last field */ index_id_t tree_id) /*!< in: index tree id */ - __attribute__((pure)); + __attribute__((nonnull, pure, warn_unused_result)); #endif /* !UNIV_HOTBACKUP */ /*********************************************************//** -Builds a ROW_FORMAT=COMPACT record out of a data tuple. */ -UNIV_INTERN -void -rec_convert_dtuple_to_rec_comp( -/*===========================*/ - rec_t* rec, /*!< in: origin of record */ - ulint extra, /*!< in: number of bytes to - reserve between the record - header and the data payload - (normally REC_N_NEW_EXTRA_BYTES) */ - const dict_index_t* index, /*!< in: record descriptor */ - ulint status, /*!< in: status bits of the record */ - const dfield_t* fields, /*!< in: array of data fields */ - ulint n_fields);/*!< in: number of data fields */ -/*********************************************************//** Builds a physical record out of a data tuple and stores it into the given buffer. @return pointer to the origin of physical record */ @@ -710,8 +824,9 @@ rec_convert_dtuple_to_rec( physical record */ const dict_index_t* index, /*!< in: record descriptor */ const dtuple_t* dtuple, /*!< in: data tuple */ - ulint n_ext); /*!< in: number of + ulint n_ext) /*!< in: number of externally stored columns */ + __attribute__((nonnull, warn_unused_result)); /**********************************************************//** Returns the extra size of an old-style physical record if we know its data size and number of fields. @@ -723,7 +838,7 @@ rec_get_converted_extra_size( ulint data_size, /*!< in: data size */ ulint n_fields, /*!< in: number of fields */ ulint n_ext) /*!< in: number of externally stored columns */ - __attribute__((const)); + __attribute__((const)); /**********************************************************//** Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT. 
@return total size */ @@ -731,13 +846,11 @@ UNIV_INTERN ulint rec_get_converted_size_comp_prefix( /*===============================*/ - const dict_index_t* index, /*!< in: record descriptor; - dict_table_is_comp() is - assumed to hold, even if - it does not */ + const dict_index_t* index, /*!< in: record descriptor */ const dfield_t* fields, /*!< in: array of data fields */ ulint n_fields,/*!< in: number of data fields */ - ulint* extra); /*!< out: extra size */ + ulint* extra) /*!< out: extra size */ + __attribute__((warn_unused_result, nonnull(1,2))); /**********************************************************//** Determines the size of a data tuple in ROW_FORMAT=COMPACT. @return total size */ @@ -752,7 +865,8 @@ rec_get_converted_size_comp( ulint status, /*!< in: status bits of the record */ const dfield_t* fields, /*!< in: array of data fields */ ulint n_fields,/*!< in: number of data fields */ - ulint* extra); /*!< out: extra size */ + ulint* extra) /*!< out: extra size */ + __attribute__((nonnull(1,3))); /**********************************************************//** The following function returns the size of a data tuple when converted to a physical record. @@ -763,7 +877,8 @@ rec_get_converted_size( /*===================*/ dict_index_t* index, /*!< in: record descriptor */ const dtuple_t* dtuple, /*!< in: data tuple */ - ulint n_ext); /*!< in: number of externally stored columns */ + ulint n_ext) /*!< in: number of externally stored columns */ + __attribute__((warn_unused_result, nonnull)); #ifndef UNIV_HOTBACKUP /**************************************************************//** Copies the first n fields of a physical record to a data tuple. 
@@ -777,7 +892,8 @@ rec_copy_prefix_to_dtuple( const dict_index_t* index, /*!< in: record descriptor */ ulint n_fields, /*!< in: number of fields to copy */ - mem_heap_t* heap); /*!< in: memory heap */ + mem_heap_t* heap) /*!< in: memory heap */ + __attribute__((nonnull)); #endif /* !UNIV_HOTBACKUP */ /***************************************************************//** Validates the consistency of a physical record. @@ -787,7 +903,8 @@ ibool rec_validate( /*=========*/ const rec_t* rec, /*!< in: physical record */ - const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull)); /***************************************************************//** Prints an old-style physical record. */ UNIV_INTERN @@ -795,7 +912,8 @@ void rec_print_old( /*==========*/ FILE* file, /*!< in: file where to print */ - const rec_t* rec); /*!< in: physical record */ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull)); #ifndef UNIV_HOTBACKUP /***************************************************************//** Prints a physical record in ROW_FORMAT=COMPACT. Ignores the @@ -806,7 +924,8 @@ rec_print_comp( /*===========*/ FILE* file, /*!< in: file where to print */ const rec_t* rec, /*!< in: physical record */ - const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull)); /***************************************************************//** Prints a physical record. 
*/ UNIV_INTERN @@ -815,7 +934,8 @@ rec_print_new( /*==========*/ FILE* file, /*!< in: file where to print */ const rec_t* rec, /*!< in: physical record */ - const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull)); /***************************************************************//** Prints a physical record. */ UNIV_INTERN @@ -824,7 +944,21 @@ rec_print( /*======*/ FILE* file, /*!< in: file where to print */ const rec_t* rec, /*!< in: physical record */ - const dict_index_t* index); /*!< in: record descriptor */ + const dict_index_t* index) /*!< in: record descriptor */ + __attribute__((nonnull)); + +# ifdef UNIV_DEBUG +/************************************************************//** +Reads the DB_TRX_ID of a clustered index record. +@return the value of DB_TRX_ID */ +UNIV_INTERN +trx_id_t +rec_get_trx_id( +/*===========*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index) /*!< in: clustered index */ + __attribute__((nonnull, warn_unused_result)); +# endif /* UNIV_DEBUG */ #endif /* UNIV_HOTBACKUP */ /* Maximum lengths for the data in a physical record if the offsets diff --git a/storage/innobase/include/rem0rec.ic b/storage/innobase/include/rem0rec.ic index 6950263fe81..18a7deb9d26 100644 --- a/storage/innobase/include/rem0rec.ic +++ b/storage/innobase/include/rem0rec.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -118,17 +118,6 @@ and the shift needed to obtain each bit-field of the record. 
*/ #define REC_INFO_BITS_MASK 0xF0UL #define REC_INFO_BITS_SHIFT 0 -/* The following masks are used to filter the SQL null bit from -one-byte and two-byte offsets */ - -#define REC_1BYTE_SQL_NULL_MASK 0x80UL -#define REC_2BYTE_SQL_NULL_MASK 0x8000UL - -/* In a 2-byte offset the second most significant bit denotes -a field stored to another page: */ - -#define REC_2BYTE_EXTERN_MASK 0x4000UL - #if REC_OLD_SHORT_MASK << (8 * (REC_OLD_SHORT - 3)) \ ^ REC_OLD_N_FIELDS_MASK << (8 * (REC_OLD_N_FIELDS - 4)) \ ^ REC_HEAP_NO_MASK << (8 * (REC_OLD_HEAP_NO - 4)) \ @@ -883,6 +872,20 @@ rec_2_get_field_end_info( return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2))); } +/******************************************************//** +Returns nonzero if the field is stored off-page. +@retval 0 if the field is stored in-page +@retval REC_2BYTE_EXTERN_MASK if the field is stored externally */ +UNIV_INLINE +ulint +rec_2_is_field_extern( +/*==================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + return(rec_2_get_field_end_info(rec, n) & REC_2BYTE_EXTERN_MASK); +} + /* Get the base address of offsets. The extra_size is stored at this position, and following positions hold the end offsets of the fields. */ @@ -1084,7 +1087,6 @@ rec_offs_any_extern( return(*rec_offs_base(offsets) & REC_OFFS_EXTERNAL); } -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG /******************************************************//** Determine if the offsets are for a record containing null BLOB pointers. @return first field containing a null BLOB pointer, or NULL if none found */ @@ -1120,7 +1122,6 @@ rec_offs_any_null_extern( return(NULL); } -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ /******************************************************//** Returns nonzero if the extern bit is set in nth field of rec. 
diff --git a/storage/innobase/include/rem0types.h b/storage/innobase/include/rem0types.h index 2f1ead43c07..f8133f77466 100644 --- a/storage/innobase/include/rem0types.h +++ b/storage/innobase/include/rem0types.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -34,6 +34,15 @@ typedef byte rec_t; #define REC_MAX_HEAP_NO (2 * 8192 - 1) #define REC_MAX_N_OWNED (16 - 1) +/* Maximum number of user defined fields/columns. The reserved columns +are the ones InnoDB adds internally: DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR. +We need "* 2" because mlog_parse_index() creates a dummy table object +possibly, with some of the system columns in it, and then adds the 3 +system columns (again) using dict_table_add_system_columns(). The problem +is that mlog_parse_index() cannot recognize the system columns by +just having n_fields, n_uniq and the lengths of the columns. */ +#define REC_MAX_N_USER_FIELDS (REC_MAX_N_FIELDS - DATA_N_SYS_COLS * 2) + /* REC_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and is the maximum indexed field length (or indexed prefix length) for indexes on tables of ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT format. 
diff --git a/storage/innobase/include/row0ext.h b/storage/innobase/include/row0ext.h index 60aaf16c09a..a098e2f9b29 100644 --- a/storage/innobase/include/row0ext.h +++ b/storage/innobase/include/row0ext.h @@ -84,7 +84,7 @@ row_ext_lookup( DICT_MAX_FIELD_LEN_BY_FORMAT() */ /** Prefixes of externally stored columns */ -struct row_ext_struct{ +struct row_ext_t{ ulint n_ext; /*!< number of externally stored columns */ const ulint* ext; /*!< col_no's of externally stored columns */ byte* buf; /*!< backing store of the column prefix cache */ diff --git a/storage/innobase/include/row0ftsort.h b/storage/innobase/include/row0ftsort.h index cc5efea026f..4a486450efc 100644 --- a/storage/innobase/include/row0ftsort.h +++ b/storage/innobase/include/row0ftsort.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2010, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2010, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -56,16 +56,16 @@ typedef UT_LIST_BASE_NODE_T(fts_doc_item_t) fts_doc_list_t; #define FTS_PLL_MERGE 1 /** Sort information passed to each individual parallel sort thread */ -typedef struct fts_psort_struct fts_psort_t; +struct fts_psort_t; /** Common info passed to each parallel sort thread */ -struct fts_psort_common_struct { - struct TABLE* table; /*!< MySQL table */ +struct fts_psort_common_t { + row_merge_dup_t* dup; /*!< descriptor of FTS index */ dict_table_t* new_table; /*!< source table */ trx_t* trx; /*!< transaction */ - dict_index_t* sort_index; /*!< FTS index */ fts_psort_t* all_info; /*!< all parallel sort info */ os_event_t sort_event; /*!< sort event */ + os_event_t merge_event; /*!< merge event */ ibool opt_doc_id_size;/*!< whether to use 4 bytes instead of 8 bytes integer to store Doc ID during sort, if @@ -73,9 +73,7 @@ struct fts_psort_common_struct { to use 8 bytes value */ }; -typedef struct fts_psort_common_struct fts_psort_common_t; - -struct fts_psort_struct { +struct fts_psort_t { ulint psort_id; /*!< Parallel sort ID */ row_merge_buf_t* merge_buf[FTS_NUM_AUX_INDEX]; /*!< sort buffer */ @@ -89,6 +87,7 @@ struct fts_psort_struct { ulint state; /*!< child thread state */ fts_doc_list_t fts_doc_list; /*!< doc list to process */ fts_psort_common_t* psort_common; /*!< ptr to all psort info */ + os_thread_t thread_hdl; /*!< thread handler */ }; /** Structure stores information from string tokenization operation */ @@ -126,6 +125,7 @@ typedef struct fts_psort_insert fts_psort_insert_t; /** status bit used for communication between parent and child thread */ #define FTS_PARENT_COMPLETE 1 #define FTS_CHILD_COMPLETE 1 +#define FTS_CHILD_EXITING 2 /** Print some debug information */ #define FTSORT_PRINT @@ -171,18 +171,19 @@ ibool row_fts_psort_info_init( /*====================*/ trx_t* trx, /*!< 
in: transaction */ - struct TABLE* table, /*!< in: MySQL table object */ + row_merge_dup_t* dup, /*!< in,own: descriptor of + FTS index being created */ const dict_table_t* new_table,/*!< in: table where indexes are created */ - dict_index_t* index, /*!< in: FTS index to be created */ ibool opt_doc_id_size, /*!< in: whether to use 4 bytes instead of 8 bytes integer to store Doc ID during sort */ fts_psort_t** psort, /*!< out: parallel sort info to be instantiated */ - fts_psort_t** merge); /*!< out: parallel merge info + fts_psort_t** merge) /*!< out: parallel merge info to be instantiated */ + __attribute__((nonnull)); /********************************************************************//** Clean up and deallocate FTS parallel sort structures, and close temparary merge sort files */ @@ -231,19 +232,6 @@ row_fts_start_parallel_merge( /*=========================*/ fts_psort_t* merge_info); /*!< in: parallel sort info */ /********************************************************************//** -Insert processed FTS data to the auxillary tables. -@return DB_SUCCESS if insertion runs fine */ -UNIV_INTERN -ulint -row_merge_write_fts_word( -/*=====================*/ - trx_t* trx, /*!< in: transaction */ - que_t** ins_graph, /*!< in: Insert query graphs */ - fts_tokenizer_word_t*word, /*!< in: sorted and tokenized - word */ - fts_table_t* fts_table, /*!< in: fts aux table instance */ - CHARSET_INFO* charset); /*!< in: charset */ -/********************************************************************//** Read sorted FTS data files and insert data tuples to auxillary tables. 
@return DB_SUCCESS or error number */ UNIV_INTERN @@ -275,13 +263,13 @@ Read sorted file containing index data tuples and insert these data tuples to the index @return DB_SUCCESS or error number */ UNIV_INTERN -ulint +dberr_t row_fts_merge_insert( /*=================*/ dict_index_t* index, /*!< in: index */ dict_table_t* table, /*!< in: new table */ fts_psort_t* psort_info, /*!< parallel sort info */ - ulint id); /* !< in: which auxiliary table's data + ulint id) /* !< in: which auxiliary table's data to insert to */ - + __attribute__((nonnull)); #endif /* row0ftsort_h */ diff --git a/storage/innobase/include/row0import.h b/storage/innobase/include/row0import.h new file mode 100644 index 00000000000..aa46fdb7c27 --- /dev/null +++ b/storage/innobase/include/row0import.h @@ -0,0 +1,91 @@ +/***************************************************************************** + +Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0import.h +Header file for import tablespace functions. 
+ +Created 2012-02-08 by Sunny Bains +*******************************************************/ + +#ifndef row0import_h +#define row0import_h + +#include "univ.i" +#include "db0err.h" +#include "dict0types.h" + +// Forward declarations +struct trx_t; +struct dict_table_t; +struct row_prebuilt_t; + +/*****************************************************************//** +Imports a tablespace. The space id in the .ibd file must match the space id +of the table in the data dictionary. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_import_for_mysql( +/*=================*/ + dict_table_t* table, /*!< in/out: table */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct + in MySQL */ + __attribute__((nonnull, warn_unused_result)); + +/*****************************************************************//** +Update the DICT_TF2_DISCARDED flag in SYS_TABLES. +@return DB_SUCCESS or error code. */ +UNIV_INTERN +dberr_t +row_import_update_discarded_flag( +/*=============================*/ + trx_t* trx, /*!< in/out: transaction that + covers the update */ + table_id_t table_id, /*!< in: Table for which we want + to set the root table->flags2 */ + bool discarded, /*!< in: set MIX_LEN column bit + to discarded, if true */ + bool dict_locked) /*!< in: Set to true if the + caller already owns the + dict_sys_t:: mutex. */ + __attribute__((nonnull, warn_unused_result)); + +/*****************************************************************//** +Update the (space, root page) of a table's indexes from the values +in the data dictionary. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +row_import_update_index_root( +/*=========================*/ + trx_t* trx, /*!< in/out: transaction that + covers the update */ + const dict_table_t* table, /*!< in: Table for which we want + to set the root page_no */ + bool reset, /*!< in: if true then set to + FIL_NUL */ + bool dict_locked) /*!< in: Set to true if the + caller already owns the + dict_sys_t:: mutex. 
*/ + __attribute__((nonnull, warn_unused_result)); +#ifndef UNIV_NONINL +#include "row0import.ic" +#endif + +#endif /* row0import_h */ diff --git a/storage/innobase/include/row0import.ic b/storage/innobase/include/row0import.ic new file mode 100644 index 00000000000..c5bbab49f6f --- /dev/null +++ b/storage/innobase/include/row0import.ic @@ -0,0 +1,25 @@ +/***************************************************************************** + +Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0import.ic + +Import tablespace inline functions. + +Created 2012-02-08 Sunny Bains +*******************************************************/ diff --git a/storage/innobase/include/row0ins.h b/storage/innobase/include/row0ins.h index 54ad7241a4f..2a892d2f5df 100644 --- a/storage/innobase/include/row0ins.h +++ b/storage/innobase/include/row0ins.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -40,7 +40,7 @@ the caller must have a shared latch on dict_foreign_key_check_lock. @return DB_SUCCESS, DB_LOCK_WAIT, DB_NO_REFERENCED_ROW, or DB_ROW_IS_REFERENCED */ UNIV_INTERN -ulint +dberr_t row_ins_check_foreign_constraint( /*=============================*/ ibool check_ref,/*!< in: TRUE If we want to check that @@ -52,7 +52,8 @@ row_ins_check_foreign_constraint( dict_table_t* table, /*!< in: if check_ref is TRUE, then the foreign table, else the referenced table */ dtuple_t* entry, /*!< in: index entry for index */ - que_thr_t* thr); /*!< in: query thread */ + que_thr_t* thr) /*!< in: query thread */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Creates an insert node struct. @return own: insert node struct */ @@ -74,21 +75,110 @@ ins_node_set_new_row( ins_node_t* node, /*!< in: insert node */ dtuple_t* row); /*!< in: new row (or first row) for the node */ /***************************************************************//** -Inserts an index entry to index. Tries first optimistic, then pessimistic -descent down the tree. If the entry matches enough to a delete marked record, -performs the insert by updating or delete unmarking the delete marked -record. -@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +Tries to insert an entry into a clustered index, ignoring foreign key +constraints. If a record with the same unique key is found, the other +record is necessarily marked deleted by a committed transaction, or a +unique key violation error occurs. The delete marked record is then +updated to an existing record, and we must write an undo log record on +the delete marked record. 
+@retval DB_SUCCESS on success +@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG) +@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed +@return error code */ UNIV_INTERN -ulint -row_ins_index_entry( -/*================*/ - dict_index_t* index, /*!< in: index */ +dberr_t +row_ins_clust_index_entry_low( +/*==========================*/ + ulint flags, /*!< in: undo logging and locking flags */ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: clustered index */ + ulint n_uniq, /*!< in: 0 or index->n_uniq */ dtuple_t* entry, /*!< in/out: index entry to insert */ ulint n_ext, /*!< in: number of externally stored columns */ - ibool foreign,/*!< in: TRUE=check foreign key constraints - (foreign=FALSE only during CREATE INDEX) */ - que_thr_t* thr); /*!< in: query thread */ + que_thr_t* thr) /*!< in: query thread or NULL */ + __attribute__((nonnull, warn_unused_result)); +/***************************************************************//** +Tries to insert an entry into a secondary index. If a record with exactly the +same fields is found, the other record is necessarily marked deleted. +It is then unmarked. Otherwise, the entry is just inserted to the index. 
+@retval DB_SUCCESS on success +@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG) +@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed +@return error code */ +UNIV_INTERN +dberr_t +row_ins_sec_index_entry_low( +/*========================*/ + ulint flags, /*!< in: undo logging and locking flags */ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: secondary index */ + mem_heap_t* offsets_heap, + /*!< in/out: memory heap that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + trx_id_t trx_id, /*!< in: PAGE_MAX_TRX_ID during + row_log_table_apply(), or 0 */ + que_thr_t* thr) /*!< in: query thread */ + __attribute__((nonnull, warn_unused_result)); +/***************************************************************//** +Tries to insert the externally stored fields (off-page columns) +of a clustered index entry. 
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +UNIV_INTERN +dberr_t +row_ins_index_entry_big_rec_func( +/*=============================*/ + const dtuple_t* entry, /*!< in/out: index entry to insert */ + const big_rec_t* big_rec,/*!< in: externally stored fields */ + ulint* offsets,/*!< in/out: rec offsets */ + mem_heap_t** heap, /*!< in/out: memory heap */ + dict_index_t* index, /*!< in: index */ + const char* file, /*!< in: file name of caller */ +#ifndef DBUG_OFF + const void* thd, /*!< in: connection, or NULL */ +#endif /* DBUG_OFF */ + ulint line) /*!< in: line number of caller */ + __attribute__((nonnull(1,2,3,4,5,6), warn_unused_result)); +#ifdef DBUG_OFF +# define row_ins_index_entry_big_rec(e,big,ofs,heap,index,thd,file,line) \ + row_ins_index_entry_big_rec_func(e,big,ofs,heap,index,file,line) +#else /* DBUG_OFF */ +# define row_ins_index_entry_big_rec(e,big,ofs,heap,index,thd,file,line) \ + row_ins_index_entry_big_rec_func(e,big,ofs,heap,index,file,thd,line) +#endif /* DBUG_OFF */ +/***************************************************************//** +Inserts an entry into a clustered index. Tries first optimistic, +then pessimistic descent down the tree. If the entry matches enough +to a delete marked record, performs the insert by updating or delete +unmarking the delete marked record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +UNIV_INTERN +dberr_t +row_ins_clust_index_entry( +/*======================*/ + dict_index_t* index, /*!< in: clustered index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + ulint n_ext) /*!< in: number of externally stored columns */ + __attribute__((nonnull, warn_unused_result)); +/***************************************************************//** +Inserts an entry into a secondary index. Tries first optimistic, +then pessimistic descent down the tree. 
If the entry matches enough +to a delete marked record, performs the insert by updating or delete +unmarking the delete marked record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +UNIV_INTERN +dberr_t +row_ins_sec_index_entry( +/*====================*/ + dict_index_t* index, /*!< in: secondary index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr) /*!< in: query thread */ + __attribute__((nonnull, warn_unused_result)); /***********************************************************//** Inserts a row to a table. This is a high-level function used in SQL execution graphs. @@ -98,17 +188,10 @@ que_thr_t* row_ins_step( /*=========*/ que_thr_t* thr); /*!< in: query thread */ -/***********************************************************//** -Creates an entry template for each index of a table. */ -UNIV_INTERN -void -ins_node_create_entry_list( -/*=======================*/ - ins_node_t* node); /*!< in: row insert node */ /* Insert node structure */ -struct ins_node_struct{ +struct ins_node_t{ que_common_t common; /*!< node type: QUE_NODE_INSERT */ ulint ins_type;/* INS_VALUES, INS_SEARCHED, or INS_DIRECT */ dtuple_t* row; /*!< row to insert */ diff --git a/storage/innobase/include/row0log.h b/storage/innobase/include/row0log.h new file mode 100644 index 00000000000..984d907d390 --- /dev/null +++ b/storage/innobase/include/row0log.h @@ -0,0 +1,241 @@ +/***************************************************************************** + +Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0log.h +Modification log for online index creation and online table rebuild + +Created 2011-05-26 Marko Makela +*******************************************************/ + +#ifndef row0log_h +#define row0log_h + +#include "univ.i" +#include "mtr0types.h" +#include "row0types.h" +#include "rem0types.h" +#include "data0types.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" + +/******************************************************//** +Allocate the row log for an index and flag the index +for online creation. +@retval true if success, false if not */ +UNIV_INTERN +bool +row_log_allocate( +/*=============*/ + dict_index_t* index, /*!< in/out: index */ + dict_table_t* table, /*!< in/out: new table being rebuilt, + or NULL when creating a secondary index */ + bool same_pk,/*!< in: whether the definition of the + PRIMARY KEY has remained the same */ + const dtuple_t* add_cols, + /*!< in: default values of + added columns, or NULL */ + const ulint* col_map)/*!< in: mapping of old column + numbers to new ones, or NULL if !table */ + __attribute__((nonnull(1), warn_unused_result)); + +/******************************************************//** +Free the row log for an index that was being created online. */ +UNIV_INTERN +void +row_log_free( +/*=========*/ + row_log_t*& log) /*!< in,own: row log */ + __attribute__((nonnull)); + +/******************************************************//** +Free the row log for an index on which online creation was aborted. 
*/ +UNIV_INLINE +void +row_log_abort_sec( +/*==============*/ + dict_index_t* index) /*!< in/out: index (x-latched) */ + __attribute__((nonnull)); + +/******************************************************//** +Try to log an operation to a secondary index that is +(or was) being created. +@retval true if the operation was logged or can be ignored +@retval false if online index creation is not taking place */ +UNIV_INLINE +bool +row_log_online_op_try( +/*==================*/ + dict_index_t* index, /*!< in/out: index, S or X latched */ + const dtuple_t* tuple, /*!< in: index tuple */ + trx_id_t trx_id) /*!< in: transaction ID for insert, + or 0 for delete */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************//** +Logs an operation to a secondary index that is (or was) being created. */ +UNIV_INTERN +void +row_log_online_op( +/*==============*/ + dict_index_t* index, /*!< in/out: index, S or X latched */ + const dtuple_t* tuple, /*!< in: index tuple */ + trx_id_t trx_id) /*!< in: transaction ID for insert, + or 0 for delete */ + UNIV_COLD __attribute__((nonnull)); + +/******************************************************//** +Gets the error status of the online index rebuild log. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +row_log_table_get_error( +/*====================*/ + const dict_index_t* index) /*!< in: clustered index of a table + that is being rebuilt online */ + __attribute__((nonnull, warn_unused_result)); + +/******************************************************//** +Logs a delete operation to a table that is being rebuilt. +This will be merged in row_log_table_apply_delete(). 
*/ +UNIV_INTERN +void +row_log_table_delete( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */ + trx_id_t trx_id) /*!< in: DB_TRX_ID of the record before + it was deleted */ + UNIV_COLD __attribute__((nonnull)); + +/******************************************************//** +Logs an update operation to a table that is being rebuilt. +This will be merged in row_log_table_apply_update(). */ +UNIV_INTERN +void +row_log_table_update( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */ + const dtuple_t* old_pk) /*!< in: row_log_table_get_pk() + before the update */ + UNIV_COLD __attribute__((nonnull(1,2,3))); + +/******************************************************//** +Constructs the old PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR +of a table that is being rebuilt. +@return tuple of PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in the rebuilt table, +or NULL if the PRIMARY KEY definition does not change */ +UNIV_INTERN +const dtuple_t* +row_log_table_get_pk( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index), + or NULL */ + mem_heap_t** heap) /*!< in/out: memory heap where allocated */ + UNIV_COLD __attribute__((nonnull(1,2,4), warn_unused_result)); + +/******************************************************//** +Logs an insert to a table that is being rebuilt. +This will be merged in row_log_table_apply_insert(). 
*/ +UNIV_INTERN +void +row_log_table_insert( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets)/*!< in: rec_get_offsets(rec,index) */ + UNIV_COLD __attribute__((nonnull)); + +/******************************************************//** +Notes that a transaction is being rolled back. */ +UNIV_INTERN +void +row_log_table_rollback( +/*===================*/ + dict_index_t* index, /*!< in/out: clustered index */ + trx_id_t trx_id) /*!< in: transaction being rolled back */ + UNIV_COLD __attribute__((nonnull)); + +/******************************************************//** +Check if a transaction rollback has been initiated. +@return true if inserts of this transaction were rolled back */ +UNIV_INTERN +bool +row_log_table_is_rollback( +/*======================*/ + const dict_index_t* index, /*!< in: clustered index */ + trx_id_t trx_id) /*!< in: transaction id */ + __attribute__((nonnull)); + +/******************************************************//** +Apply the row_log_table log to a table upon completing rebuild. +@return DB_SUCCESS, or error code on failure */ +UNIV_INTERN +dberr_t +row_log_table_apply( +/*================*/ + que_thr_t* thr, /*!< in: query graph */ + dict_table_t* old_table, + /*!< in: old table */ + struct TABLE* table) /*!< in/out: MySQL table + (for reporting duplicates) */ + __attribute__((nonnull, warn_unused_result)); + +/******************************************************//** +Get the latest transaction ID that has invoked row_log_online_op() +during online creation. 
+@return latest transaction ID, or 0 if nothing was logged */ +UNIV_INTERN +trx_id_t +row_log_get_max_trx( +/*================*/ + dict_index_t* index) /*!< in: index, must be locked */ + __attribute__((nonnull, warn_unused_result)); + +/******************************************************//** +Merge the row log to the index upon completing index creation. +@return DB_SUCCESS, or error code on failure */ +UNIV_INTERN +dberr_t +row_log_apply( +/*==========*/ + trx_t* trx, /*!< in: transaction (for checking if + the operation was interrupted) */ + dict_index_t* index, /*!< in/out: secondary index */ + struct TABLE* table) /*!< in/out: MySQL table + (for reporting duplicates) */ + __attribute__((nonnull, warn_unused_result)); + +#ifndef UNIV_NONINL +#include "row0log.ic" +#endif + +#endif /* row0log.h */ diff --git a/storage/innobase/include/row0log.ic b/storage/innobase/include/row0log.ic new file mode 100644 index 00000000000..b0f37dbd8e7 --- /dev/null +++ b/storage/innobase/include/row0log.ic @@ -0,0 +1,84 @@ +/***************************************************************************** + +Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0log.ic +Modification log for online index creation and online table rebuild + +Created 2012-10-18 Marko Makela +*******************************************************/ + +#include "dict0dict.h" + +/******************************************************//** +Free the row log for an index on which online creation was aborted. */ +UNIV_INLINE +void +row_log_abort_sec( +/*===============*/ + dict_index_t* index) /*!< in/out: index (x-latched) */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + ut_ad(!dict_index_is_clust(index)); + dict_index_set_online_status(index, ONLINE_INDEX_ABORTED); + row_log_free(index->online_log); +} + +/******************************************************//** +Try to log an operation to a secondary index that is +(or was) being created. +@retval true if the operation was logged or can be ignored +@retval false if online index creation is not taking place */ +UNIV_INLINE +bool +row_log_online_op_try( +/*==================*/ + dict_index_t* index, /*!< in/out: index, S or X latched */ + const dtuple_t* tuple, /*!< in: index tuple */ + trx_id_t trx_id) /*!< in: transaction ID for insert, + or 0 for delete */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_SHARED) + || rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_COMPLETE: + /* This is a normal index. Do not log anything. + The caller must perform the operation on the + index tree directly. 
*/ + return(false); + case ONLINE_INDEX_CREATION: + /* The index is being created online. Log the + operation. */ + row_log_online_op(index, tuple, trx_id); + break; + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + /* The index was created online, but the operation was + aborted. Do not log the operation and tell the caller + to skip the operation. */ + break; + } + + return(true); +} diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h index c4e2f5ddf41..f464e46ae5b 100644 --- a/storage/innobase/include/row0merge.h +++ b/storage/innobase/include/row0merge.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2005, 2010, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2005, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -40,15 +40,17 @@ Created 13/06/2005 Jan Lindstrom #include "lock0types.h" #include "srv0srv.h" +// Forward declaration +struct ib_sequence_t; + /** @brief Block size for I/O operations in merge sort. The minimum is UNIV_PAGE_SIZE, or page_get_free_space_of_empty() rounded to a power of 2. When not creating a PRIMARY KEY that contains column prefixes, this -can be set as small as UNIV_PAGE_SIZE / 2. See the comment above -ut_ad(data_size < sizeof(row_merge_block_t)). */ -typedef byte row_merge_block_t; +can be set as small as UNIV_PAGE_SIZE / 2. */ +typedef byte row_merge_block_t; /** @brief Secondary buffer for I/O operations of merge records. @@ -64,114 +66,146 @@ The format is the same as a record in ROW_FORMAT=COMPACT with the exception that the REC_N_NEW_EXTRA_BYTES are omitted. */ typedef byte mrec_t; +/** Merge record in row_merge_buf_t */ +struct mtuple_t { + dfield_t* fields; /*!< data fields */ +}; + /** Buffer for sorting in main memory. 
*/ -struct row_merge_buf_struct { +struct row_merge_buf_t { mem_heap_t* heap; /*!< memory heap where allocated */ dict_index_t* index; /*!< the index the tuples belong to */ ulint total_size; /*!< total amount of data bytes */ ulint n_tuples; /*!< number of data tuples */ ulint max_tuples; /*!< maximum number of data tuples */ - const dfield_t**tuples; /*!< array of pointers to - arrays of fields that form - the data tuples */ - const dfield_t**tmp_tuples; /*!< temporary copy of tuples, + mtuple_t* tuples; /*!< array of data tuples */ + mtuple_t* tmp_tuples; /*!< temporary copy of tuples, for sorting */ }; -/** Buffer for sorting in main memory. */ -typedef struct row_merge_buf_struct row_merge_buf_t; - /** Information about temporary files used in merge sort */ -struct merge_file_struct { +struct merge_file_t { int fd; /*!< file descriptor */ ulint offset; /*!< file offset (end of file) */ ib_uint64_t n_rec; /*!< number of records in the file */ }; -/** Information about temporary files used in merge sort */ -typedef struct merge_file_struct merge_file_t; - /** Index field definition */ -struct merge_index_field_struct { +struct index_field_t { + ulint col_no; /*!< column offset */ ulint prefix_len; /*!< column prefix length, or 0 if indexing the whole column */ - const char* field_name; /*!< field name */ }; -/** Index field definition */ -typedef struct merge_index_field_struct merge_index_field_t; - /** Definition of an index being created */ -struct merge_index_def_struct { - const char* name; /*!< index name */ - ulint ind_type; /*!< 0, DICT_UNIQUE, - or DICT_CLUSTERED */ - ulint n_fields; /*!< number of fields - in index */ - merge_index_field_t* fields; /*!< field definitions */ +struct index_def_t { + const char* name; /*!< index name */ + ulint ind_type; /*!< 0, DICT_UNIQUE, + or DICT_CLUSTERED */ + ulint key_number; /*!< MySQL key number, + or ULINT_UNDEFINED if none */ + ulint n_fields; /*!< number of fields in index */ + index_field_t* fields; /*!< 
field definitions */ }; -/** Definition of an index being created */ -typedef struct merge_index_def_struct merge_index_def_t; - /** Structure for reporting duplicate records. */ -struct row_merge_dup_struct { - const dict_index_t* index; /*!< index being sorted */ - struct TABLE* table; /*!< MySQL table object */ - ulint n_dup; /*!< number of duplicates */ +struct row_merge_dup_t { + dict_index_t* index; /*!< index being sorted */ + struct TABLE* table; /*!< MySQL table object */ + const ulint* col_map;/*!< mapping of column numbers + in table to the rebuilt table + (index->table), or NULL if not + rebuilding table */ + ulint n_dup; /*!< number of duplicates */ }; -/** Structure for reporting duplicate records. */ -typedef struct row_merge_dup_struct row_merge_dup_t; - +/*************************************************************//** +Report a duplicate key. */ +UNIV_INTERN +void +row_merge_dup_report( +/*=================*/ + row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */ + const dfield_t* entry) /*!< in: duplicate index entry */ + __attribute__((nonnull)); /*********************************************************************//** Sets an exclusive lock on a table, for the duration of creating indexes. @return error code or DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t row_merge_lock_table( /*=================*/ trx_t* trx, /*!< in/out: transaction */ dict_table_t* table, /*!< in: table to lock */ - enum lock_mode mode); /*!< in: LOCK_X or LOCK_S */ + enum lock_mode mode) /*!< in: LOCK_X or LOCK_S */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** -Drop an index from the InnoDB system tables. The data dictionary must -have been locked exclusively by the caller, because the transaction -will not be committed. */ +Drop indexes that were created before an error occurred. 
+The data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. */ UNIV_INTERN void -row_merge_drop_index( -/*=================*/ - dict_index_t* index, /*!< in: index to be removed */ - dict_table_t* table, /*!< in: table */ - trx_t* trx); /*!< in: transaction handle */ +row_merge_drop_indexes_dict( +/*========================*/ + trx_t* trx, /*!< in/out: dictionary transaction */ + table_id_t table_id)/*!< in: table identifier */ + __attribute__((nonnull)); /*********************************************************************//** -Drop those indexes which were created before an error occurred when -building an index. The data dictionary must have been locked -exclusively by the caller, because the transaction will not be -committed. */ +Drop those indexes which were created before an error occurred. +The data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. */ UNIV_INTERN void row_merge_drop_indexes( /*===================*/ - trx_t* trx, /*!< in: transaction */ - dict_table_t* table, /*!< in: table containing the indexes */ - dict_index_t** index, /*!< in: indexes to drop */ - ulint num_created); /*!< in: number of elements in - index[] */ + trx_t* trx, /*!< in/out: transaction */ + dict_table_t* table, /*!< in/out: table containing the indexes */ + ibool locked) /*!< in: TRUE=table locked, + FALSE=may need to do a lazy drop */ + __attribute__((nonnull)); /*********************************************************************//** Drop all partially created indexes during crash recovery. */ UNIV_INTERN void row_merge_drop_temp_indexes(void); /*=============================*/ + +/*********************************************************************//** +Creates temporary merge files, and if UNIV_PFS_IO defined, register +the file descriptor with Performance Schema. 
+@return File descriptor */ +UNIV_INTERN +int +row_merge_file_create_low(void) +/*===========================*/ + __attribute__((warn_unused_result)); +/*********************************************************************//** +Destroy a merge file. And de-register the file from Performance Schema +if UNIV_PFS_IO is defined. */ +UNIV_INTERN +void +row_merge_file_destroy_low( +/*=======================*/ + int fd); /*!< in: merge file descriptor */ + +/*********************************************************************//** +Provide a new pathname for a table that is being renamed if it belongs to +a file-per-table tablespace. The caller is responsible for freeing the +memory allocated for the return value. +@return new pathname of tablespace file, or NULL if space = 0 */ +UNIV_INTERN +char* +row_make_new_pathname( +/*==================*/ + dict_table_t* table, /*!< in: table to be renamed */ + const char* new_name); /*!< in: new name */ /*********************************************************************//** Rename the tables in the data dictionary. The data dictionary must have been locked exclusively by the caller, because the transaction will not be committed. @return error code or DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t row_merge_rename_tables( /*====================*/ dict_table_t* old_table, /*!< in/out: old table, renamed to @@ -179,32 +213,35 @@ row_merge_rename_tables( dict_table_t* new_table, /*!< in/out: new table, renamed to old_table->name */ const char* tmp_name, /*!< in: new name for old_table */ - trx_t* trx); /*!< in: transaction handle */ + trx_t* trx) /*!< in: transaction handle */ + __attribute__((nonnull, warn_unused_result)); + /*********************************************************************//** -Create a temporary table for creating a primary key, using the definition -of an existing table. -@return table, or NULL on error */ +Rename an index in the dictionary that was created. 
The data +dictionary must have been locked exclusively by the caller, because +the transaction will not be committed. +@return DB_SUCCESS if all OK */ UNIV_INTERN -dict_table_t* -row_merge_create_temporary_table( -/*=============================*/ - const char* table_name, /*!< in: new table name */ - const merge_index_def_t*index_def, /*!< in: the index definition - of the primary key */ - const dict_table_t* table, /*!< in: old table definition */ - trx_t* trx); /*!< in/out: transaction - (sets error_state) */ +dberr_t +row_merge_rename_index_to_add( +/*==========================*/ + trx_t* trx, /*!< in/out: transaction */ + table_id_t table_id, /*!< in: table identifier */ + index_id_t index_id) /*!< in: index identifier */ + __attribute__((nonnull)); /*********************************************************************//** -Rename the temporary indexes in the dictionary to permanent ones. The -data dictionary must have been locked exclusively by the caller, -because the transaction will not be committed. +Rename an index in the dictionary that is to be dropped. The data +dictionary must have been locked exclusively by the caller, because +the transaction will not be committed. @return DB_SUCCESS if all OK */ UNIV_INTERN -ulint -row_merge_rename_indexes( -/*=====================*/ +dberr_t +row_merge_rename_index_to_drop( +/*===========================*/ trx_t* trx, /*!< in/out: transaction */ - dict_table_t* table); /*!< in/out: table with new indexes */ + table_id_t table_id, /*!< in: table identifier */ + index_id_t index_id) /*!< in: index identifier */ + __attribute__((nonnull)); /*********************************************************************//** Create the index and load in to the dictionary. 
@return index, or NULL on error */ @@ -214,7 +251,7 @@ row_merge_create_index( /*===================*/ trx_t* trx, /*!< in/out: trx (sets error_state) */ dict_table_t* table, /*!< in: the index is on this table */ - const merge_index_def_t*index_def); + const index_def_t* index_def); /*!< in: the index definition */ /*********************************************************************//** Check if a transaction can use an index. @@ -226,22 +263,25 @@ row_merge_is_index_usable( const trx_t* trx, /*!< in: transaction */ const dict_index_t* index); /*!< in: index to check */ /*********************************************************************//** -If there are views that refer to the old table name then we "attach" to -the new instance of the table else we drop it immediately. +Drop a table. The caller must have ensured that the background stats +thread is not processing the table. This can be done by calling +dict_stats_wait_bg_to_stop_using_tables() after locking the dictionary and +before calling this function. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t row_merge_drop_table( /*=================*/ trx_t* trx, /*!< in: transaction */ - dict_table_t* table); /*!< in: table instance to drop */ + dict_table_t* table) /*!< in: table instance to drop */ + __attribute__((nonnull)); /*********************************************************************//** Build indexes on a table by reading a clustered index, creating a temporary file containing index entries, merge sorting these index entries and inserting sorted index entries to indexes. 
@return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t row_merge_build_indexes( /*====================*/ trx_t* trx, /*!< in: transaction */ @@ -250,11 +290,24 @@ row_merge_build_indexes( dict_table_t* new_table, /*!< in: table where indexes are created; identical to old_table unless creating a PRIMARY KEY */ + bool online, /*!< in: true if creating indexes + online */ dict_index_t** indexes, /*!< in: indexes to be created */ + const ulint* key_numbers, /*!< in: MySQL key numbers */ ulint n_indexes, /*!< in: size of indexes[] */ - struct TABLE* table); /*!< in/out: MySQL table, for + struct TABLE* table, /*!< in/out: MySQL table, for reporting erroneous key value if applicable */ + const dtuple_t* add_cols, /*!< in: default values of + added columns, or NULL */ + const ulint* col_map, /*!< in: mapping of old column + numbers to new ones, or NULL + if old_table == new_table */ + ulint add_autoinc, /*!< in: number of added + AUTO_INCREMENT column, or + ULINT_UNDEFINED if none is added */ + ib_sequence_t& sequence) /*!< in/out: autoinc sequence */ + __attribute__((nonnull(1,2,3,5,6,8), warn_unused_result)); /********************************************************************//** Write a buffer to a block. */ UNIV_INTERN @@ -263,15 +316,18 @@ row_merge_buf_write( /*================*/ const row_merge_buf_t* buf, /*!< in: sorted buffer */ const merge_file_t* of, /*!< in: output file */ - row_merge_block_t* block); /*!< out: buffer for writing to file */ + row_merge_block_t* block) /*!< out: buffer for writing to file */ + __attribute__((nonnull)); /********************************************************************//** Sort a buffer. 
*/ UNIV_INTERN void row_merge_buf_sort( /*===============*/ - row_merge_buf_t* buf, /*!< in/out: sort buffer */ - row_merge_dup_t* dup); /*!< in/out: for reporting duplicates */ + row_merge_buf_t* buf, /*!< in/out: sort buffer */ + row_merge_dup_t* dup) /*!< in/out: reporter of duplicates + (NULL if non-unique index) */ + __attribute__((nonnull(1))); /********************************************************************//** Write a merge block to the file system. @return TRUE if request was successful, FALSE if fail */ @@ -290,30 +346,32 @@ UNIV_INTERN row_merge_buf_t* row_merge_buf_empty( /*================*/ - row_merge_buf_t* buf); /*!< in,own: sort buffer */ + row_merge_buf_t* buf) /*!< in,own: sort buffer */ + __attribute__((warn_unused_result, nonnull)); /*********************************************************************//** -Create a merge file. */ +Create a merge file. +@return file descriptor, or -1 on failure */ UNIV_INTERN -void +int row_merge_file_create( /*==================*/ - merge_file_t* merge_file); /*!< out: merge file structure */ + merge_file_t* merge_file) /*!< out: merge file structure */ + __attribute__((nonnull)); /*********************************************************************//** Merge disk files. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t row_merge_sort( /*===========*/ trx_t* trx, /*!< in: transaction */ - const dict_index_t* index, /*!< in: index being created */ + const row_merge_dup_t* dup, /*!< in: descriptor of + index being created */ merge_file_t* file, /*!< in/out: file containing index entries */ row_merge_block_t* block, /*!< in/out: 3 buffers */ - int* tmpfd, /*!< in/out: temporary file handle */ - struct TABLE* table); /*!< in/out: MySQL table, for - reporting erroneous key value - if applicable */ + int* tmpfd) /*!< in/out: temporary file handle */ + __attribute__((nonnull)); /*********************************************************************//** Allocate a sort buffer. 
@return own: sort buffer */ @@ -321,37 +379,24 @@ UNIV_INTERN row_merge_buf_t* row_merge_buf_create( /*=================*/ - dict_index_t* index); /*!< in: secondary index */ + dict_index_t* index) /*!< in: secondary index */ + __attribute__((warn_unused_result, nonnull, malloc)); /*********************************************************************//** Deallocate a sort buffer. */ UNIV_INTERN void row_merge_buf_free( /*===============*/ - row_merge_buf_t* buf); /*!< in,own: sort buffer, to be freed */ + row_merge_buf_t* buf) /*!< in,own: sort buffer to be freed */ + __attribute__((nonnull)); /*********************************************************************//** Destroy a merge file. */ UNIV_INTERN void row_merge_file_destroy( /*===================*/ - merge_file_t* merge_file); /*!< out: merge file structure */ -/*********************************************************************//** -Compare two merge records. -@return 1, 0, -1 if mrec1 is greater, equal, less, respectively, than mrec2 */ -UNIV_INTERN -int -row_merge_cmp( -/*==========*/ - const mrec_t* mrec1, /*!< in: first merge - record to be compared */ - const mrec_t* mrec2, /*!< in: second merge - record to be compared */ - const ulint* offsets1, /*!< in: first record offsets */ - const ulint* offsets2, /*!< in: second record offsets */ - const dict_index_t* index, /*!< in: index */ - ibool* null_eq); /*!< out: set to TRUE if - found matching null values */ + merge_file_t* merge_file) /*!< in/out: merge file structure */ + __attribute__((nonnull)); /********************************************************************//** Read a merge block from the file system. @return TRUE if request was successful, FALSE if fail */ @@ -367,7 +412,7 @@ row_merge_read( /********************************************************************//** Read a merge record. 
@return pointer to next record, or NULL on I/O error or end of list */ -UNIV_INTERN __attribute__((nonnull)) +UNIV_INTERN const byte* row_merge_read_rec( /*===============*/ @@ -380,5 +425,6 @@ row_merge_read_rec( const mrec_t** mrec, /*!< out: pointer to merge record, or NULL on end of list (non-NULL on I/O error) */ - ulint* offsets);/*!< out: offsets of mrec */ + ulint* offsets)/*!< out: offsets of mrec */ + __attribute__((nonnull, warn_unused_result)); #endif /* row0merge.h */ diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h index 17a29e38ec7..1e0f3b30f8c 100644 --- a/storage/innobase/include/row0mysql.h +++ b/storage/innobase/include/row0mysql.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2000, 2010, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2000, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -36,9 +36,12 @@ Created 9/17/2000 Heikki Tuuri #include "btr0pcur.h" #include "trx0types.h" +// Forward declaration +struct SysIndexCallback; + extern ibool row_rollback_on_timeout; -typedef struct row_prebuilt_struct row_prebuilt_t; +struct row_prebuilt_t; /*******************************************************************//** Frees the blob heap in prebuilt when no longer needed. */ @@ -152,18 +155,19 @@ row_mysql_store_col_in_innobase_format( ulint comp); /*!< in: nonzero=compact format */ /****************************************************************//** Handles user errors and lock waits detected by the database engine. 
-@return TRUE if it was a lock wait and we should continue running the +@return true if it was a lock wait and we should continue running the query thread */ UNIV_INTERN -ibool +bool row_mysql_handle_errors( /*====================*/ - ulint* new_err,/*!< out: possible new error encountered in + dberr_t* new_err,/*!< out: possible new error encountered in rollback, or the old error which was during the function entry */ trx_t* trx, /*!< in: transaction */ - que_thr_t* thr, /*!< in: query thread */ - trx_savept_t* savept);/*!< in: savepoint */ + que_thr_t* thr, /*!< in: query thread, or NULL */ + trx_savept_t* savept) /*!< in: savepoint, or NULL */ + __attribute__((nonnull(1,2))); /********************************************************************//** Create a prebuilt struct for a MySQL table handle. @return own: a prebuilt struct */ @@ -200,16 +204,17 @@ It is not compatible with another AUTO_INC or exclusive lock on the table. @return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_lock_table_autoinc_for_mysql( /*=============================*/ - row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in the MySQL + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in the MySQL table handle */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Sets a table lock on the table mentioned in prebuilt. @return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_lock_table_for_mysql( /*=====================*/ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct in the MySQL @@ -218,19 +223,20 @@ row_lock_table_for_mysql( if prebuilt->table should be locked as prebuilt->select_lock_type */ - ulint mode); /*!< in: lock mode of table + ulint mode) /*!< in: lock mode of table (ignored if table==NULL) */ - + __attribute__((nonnull(1))); /*********************************************************************//** Does an insert for MySQL. 
@return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_insert_for_mysql( /*=================*/ byte* mysql_rec, /*!< in: row in the MySQL format */ - row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL handle */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Builds a dummy query graph used in selects. */ UNIV_INTERN @@ -263,13 +269,14 @@ row_table_got_default_clust_index( Does an update or delete of a row for MySQL. @return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_update_for_mysql( /*=================*/ byte* mysql_rec, /*!< in: the row to be updated, in the MySQL format */ - row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL handle */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** This can only be used when srv_locks_unsafe_for_binlog is TRUE or this session is using a READ COMMITTED or READ UNCOMMITTED isolation level. @@ -278,19 +285,31 @@ initialized prebuilt->new_rec_locks to store the information which new record locks really were set. This function removes a newly set clustered index record lock under prebuilt->pcur or prebuilt->clust_pcur. Thus, this implements a 'mini-rollback' that -releases the latest clustered index record lock we set. -@return error code or DB_SUCCESS */ +releases the latest clustered index record lock we set. */ UNIV_INTERN -int +void row_unlock_for_mysql( /*=================*/ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct in MySQL handle */ - ibool has_latches_on_recs);/*!< in: TRUE if called + ibool has_latches_on_recs)/*!< in: TRUE if called so that we have the latches on the records under pcur and clust_pcur, and we do not need to reposition the cursors. 
*/ + __attribute__((nonnull)); +/*********************************************************************//** +Checks if a table name contains the string "/#sql" which denotes temporary +tables in MySQL. +@return true if temporary table */ +UNIV_INTERN +bool +row_is_mysql_tmp_table_name( +/*========================*/ + const char* name) __attribute__((warn_unused_result)); + /*!< in: table name in the form + 'database/tablename' */ + /*********************************************************************//** Creates an query graph node of 'update' type to be used in the MySQL interface. @@ -305,13 +324,14 @@ row_create_update_node_for_mysql( Does a cascaded delete or set null in a foreign key operation. @return error code or DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t row_update_cascade_for_mysql( /*=========================*/ que_thr_t* thr, /*!< in: query thread */ upd_node_t* node, /*!< in: update node used in the cascade or set null operation */ - dict_table_t* table); /*!< in: table where we do the operation */ + dict_table_t* table) /*!< in: table where we do the operation */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Locks the data dictionary exclusively for performing a table create or other data dictionary modification operation. */ @@ -355,33 +375,38 @@ Creates a table for MySQL. If the name of the table ends in one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor", "innodb_table_monitor", then this will also start the printing of monitor output by the master thread. If the table name ends in "innodb_mem_validate", -InnoDB will try to invoke mem_validate(). +InnoDB will try to invoke mem_validate(). On failure the transaction will +be rolled back. 
@return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_create_table_for_mysql( /*=======================*/ - dict_table_t* table, /*!< in, own: table definition - (will be freed) */ - trx_t* trx); /*!< in: transaction handle */ + dict_table_t* table, /*!< in, own: table definition + (will be freed, or on DB_SUCCESS + added to the data dictionary cache) */ + trx_t* trx, /*!< in/out: transaction */ + bool commit) /*!< in: if true, commit the transaction */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Does an index creation operation for MySQL. TODO: currently failure to create an index results in dropping the whole table! This is no problem currently as all indexes must be created at the same time as the table. @return error number or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_create_index_for_mysql( /*=======================*/ dict_index_t* index, /*!< in, own: index definition (will be freed) */ trx_t* trx, /*!< in: transaction handle */ - const ulint* field_lengths); /*!< in: if not NULL, must contain + const ulint* field_lengths) /*!< in: if not NULL, must contain dict_index_get_n_fields(index) actual field lengths for the index columns, which are then checked for not being too large. */ + __attribute__((nonnull(1,2), warn_unused_result)); /*********************************************************************//** Scans a table create SQL string and adds to the data dictionary the foreign key constraints declared in the string. This function @@ -391,7 +416,7 @@ bot participating tables. The indexes are allowed to contain more fields than mentioned in the constraint. 
@return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_table_add_foreign_constraints( /*==============================*/ trx_t* trx, /*!< in: transaction */ @@ -404,10 +429,10 @@ row_table_add_foreign_constraints( const char* name, /*!< in: table full name in the normalized form database_name/table_name */ - ibool reject_fks); /*!< in: if TRUE, fail with error + ibool reject_fks) /*!< in: if TRUE, fail with error code DB_CANNOT_ADD_CONSTRAINT if any foreign keys are found. */ - + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** The master thread in srv0srv.cc calls this regularly to drop tables which we must drop in background after queries to them have ended. Such lazy @@ -426,14 +451,28 @@ ulint row_get_background_drop_list_len_low(void); /*======================================*/ /*********************************************************************//** +Sets an exclusive lock on a table. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_mysql_lock_table( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + dict_table_t* table, /*!< in: table to lock */ + enum lock_mode mode, /*!< in: LOCK_X or LOCK_S */ + const char* op_info) /*!< in: string for trx->op_info */ + __attribute__((nonnull, warn_unused_result)); + +/*********************************************************************//** Truncates a table for MySQL. @return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_truncate_table_for_mysql( /*=========================*/ dict_table_t* table, /*!< in: table handle */ - trx_t* trx); /*!< in: transaction handle */ + trx_t* trx) /*!< in: transaction handle */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Drops a table for MySQL. 
If the name of the dropped table ends in one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor", @@ -443,12 +482,16 @@ by the transaction, the transaction will be committed. Otherwise, the data dictionary will remain locked. @return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_drop_table_for_mysql( /*=====================*/ const char* name, /*!< in: table name */ - trx_t* trx, /*!< in: transaction handle */ - ibool drop_db);/*!< in: TRUE=dropping whole database */ + trx_t* trx, /*!< in: dictionary transaction handle */ + bool drop_db,/*!< in: true=dropping whole database */ + bool nonatomic = true) + /*!< in: whether it is permitted + to release and reacquire dict_operation_lock */ + __attribute__((nonnull)); /*********************************************************************//** Drop all temporary tables during crash recovery. */ UNIV_INTERN @@ -462,66 +505,70 @@ means that this function deletes the .ibd file and assigns a new table id for the table. Also the flag table->ibd_file_missing is set TRUE. @return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_discard_tablespace_for_mysql( /*=============================*/ const char* name, /*!< in: table name */ - trx_t* trx); /*!< in: transaction handle */ + trx_t* trx) /*!< in: transaction handle */ + __attribute__((nonnull, warn_unused_result)); /*****************************************************************//** Imports a tablespace. The space id in the .ibd file must match the space id of the table in the data dictionary. 
@return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_import_tablespace_for_mysql( /*============================*/ - const char* name, /*!< in: table name */ - trx_t* trx); /*!< in: transaction handle */ + dict_table_t* table, /*!< in/out: table */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Drops a database for MySQL. @return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_drop_database_for_mysql( /*========================*/ const char* name, /*!< in: database name which ends to '/' */ - trx_t* trx); /*!< in: transaction handle */ + trx_t* trx) /*!< in: transaction handle */ + __attribute__((nonnull)); /*********************************************************************//** Renames a table for MySQL. @return error code or DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t row_rename_table_for_mysql( /*=======================*/ const char* old_name, /*!< in: old table name */ const char* new_name, /*!< in: new table name */ - trx_t* trx, /*!< in: transaction handle */ - ibool commit); /*!< in: if TRUE then commit trx */ + trx_t* trx, /*!< in/out: transaction */ + bool commit) /*!< in: whether to commit trx */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Checks that the index contains entries in an ascending order, unique constraint is not broken, and calculates the number of index entries in the read view of the current transaction. 
-@return DB_SUCCESS if ok */ +@return true if ok */ UNIV_INTERN -ulint +bool row_check_index_for_mysql( /*======================*/ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct in MySQL handle */ const dict_index_t* index, /*!< in: index */ - ulint* n_rows); /*!< out: number of entries + ulint* n_rows) /*!< out: number of entries seen in the consistent read */ - + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Determines if a table is a magic monitor table. -@return TRUE if monitor table */ +@return true if monitor table */ UNIV_INTERN -ibool +bool row_is_magic_monitor_table( /*=======================*/ - const char* table_name); /*!< in: name of the table, in the + const char* table_name) /*!< in: name of the table, in the form database/table_name */ - + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Initialize this module */ UNIV_INTERN @@ -536,13 +583,24 @@ void row_mysql_close(void); /*=================*/ +/*********************************************************************//** +Reassigns the table identifier of a table. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_mysql_table_id_reassign( +/*========================*/ + dict_table_t* table, /*!< in/out: table */ + trx_t* trx, /*!< in/out: transaction */ + table_id_t* new_id) /*!< out: new table id */ + __attribute__((nonnull, warn_unused_result)); + /* A struct describing a place for an individual column in the MySQL row format which is presented to the table handler in ha_innobase. This template struct is used to speed up row transformations between Innobase and MySQL. 
*/ -typedef struct mysql_row_templ_struct mysql_row_templ_t; -struct mysql_row_templ_struct { +struct mysql_row_templ_t { ulint col_no; /*!< column number of the column */ ulint rec_field_no; /*!< field number of the column in an Innobase record in the current index; @@ -597,7 +655,7 @@ struct mysql_row_templ_struct { /** A struct for (sometimes lazily) prebuilt structures in an Innobase table handle used within MySQL; these are used to save CPU time. */ -struct row_prebuilt_struct { +struct row_prebuilt_t { ulint magic_n; /*!< this magic number is set to ROW_PREBUILT_ALLOCATED when created, or ROW_PREBUILT_FREED when the @@ -682,8 +740,11 @@ struct row_prebuilt_struct { columns in the table */ upd_node_t* upd_node; /*!< Innobase SQL update node used to perform updates and deletes */ + trx_id_t trx_id; /*!< The table->def_trx_id when + ins_graph was built */ que_fork_t* ins_graph; /*!< Innobase SQL query graph used - in inserts */ + in inserts. Will be rebuilt on + trx_id or n_indexes mismatch. */ que_fork_t* upd_graph; /*!< Innobase SQL query graph used in updates or deletes */ btr_pcur_t pcur; /*!< persistent cursor used in selects @@ -780,7 +841,7 @@ struct row_prebuilt_struct { to this heap */ mem_heap_t* old_vers_heap; /*!< memory heap where a previous version is built in consistent read */ - fts_result_t* result; /* The result of an FTS query */ + bool in_fts_query; /*!< Whether we are in a FTS query */ /*----------------------*/ ulonglong autoinc_last_value; /*!< last value of AUTO-INC interval */ @@ -791,7 +852,7 @@ struct row_prebuilt_struct { ulonglong autoinc_offset; /*!< The offset passed to get_auto_increment() by MySQL. Required to calculate the next value */ - ulint autoinc_error; /*!< The actual error code encountered + dberr_t autoinc_error; /*!< The actual error code encountered while trying to init or read the autoinc value from the table. 
We store it here so that we can return @@ -806,6 +867,20 @@ struct row_prebuilt_struct { /*----------------------*/ ulint magic_n2; /*!< this should be the same as magic_n */ + /*----------------------*/ + unsigned innodb_api:1; /*!< whether this is a InnoDB API + query */ + const rec_t* innodb_api_rec; /*!< InnoDB API search result */ +}; + +/** Callback for row_mysql_sys_index_iterate() */ +struct SysIndexCallback { + virtual ~SysIndexCallback() { } + + /** Callback method + @param mtr - current mini transaction + @param pcur - persistent cursor. */ + virtual void operator()(mtr_t* mtr, btr_pcur_t* pcur) throw() = 0; }; #define ROW_PREBUILT_FETCH_MAGIC_N 465765687 @@ -829,4 +904,4 @@ struct row_prebuilt_struct { #include "row0mysql.ic" #endif -#endif +#endif /* row0mysql.h */ diff --git a/storage/innobase/include/row0purge.h b/storage/innobase/include/row0purge.h index 740771fa3eb..93dcf9cf49b 100644 --- a/storage/innobase/include/row0purge.h +++ b/storage/innobase/include/row0purge.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -46,7 +46,8 @@ row_purge_node_create( /*==================*/ que_thr_t* parent, /*!< in: parent node, i.e., a thr node */ - mem_heap_t* heap); /*!< in: memory heap where created */ + mem_heap_t* heap) /*!< in: memory heap where created */ + __attribute__((nonnull, warn_unused_result)); /***********************************************************//** Determines if it is possible to remove a secondary index entry. Removal is possible if the secondary index entry does not refer to any @@ -56,19 +57,20 @@ is newer than the purge view. 
NOTE: This function should only be called by the purge thread, only while holding a latch on the leaf page of the secondary index entry (or keeping the buffer pool watch on the page). It is possible that -this function first returns TRUE and then FALSE, if a user transaction +this function first returns true and then false, if a user transaction inserts a record that the secondary index entry would refer to. However, in that case, the user transaction would also re-insert the secondary index entry after purge has removed it and released the leaf page latch. -@return TRUE if the secondary index record can be purged */ +@return true if the secondary index record can be purged */ UNIV_INTERN -ibool +bool row_purge_poss_sec( /*===============*/ purge_node_t* node, /*!< in/out: row purge node */ dict_index_t* index, /*!< in: secondary index */ - const dtuple_t* entry); /*!< in: secondary index entry */ + const dtuple_t* entry) /*!< in: secondary index entry */ + __attribute__((nonnull, warn_unused_result)); /*************************************************************** Does the purge operation for a single undo log record. This is a high-level function used in an SQL execution graph. @@ -77,11 +79,12 @@ UNIV_INTERN que_thr_t* row_purge_step( /*===========*/ - que_thr_t* thr); /*!< in: query thread */ + que_thr_t* thr) /*!< in: query thread */ + __attribute__((nonnull, warn_unused_result)); /* Purge node structure */ -struct purge_node_struct{ +struct purge_node_t{ que_common_t common; /*!< node type: QUE_NODE_PURGE */ /*----------------------*/ /* Local storage for this graph node */ diff --git a/storage/innobase/include/row0quiesce.h b/storage/innobase/include/row0quiesce.h new file mode 100644 index 00000000000..1d6d11291b8 --- /dev/null +++ b/storage/innobase/include/row0quiesce.h @@ -0,0 +1,74 @@ +/***************************************************************************** + +Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0quiesce.h + +Header file for tablespace quiesce functions. + +Created 2012-02-08 by Sunny Bains +*******************************************************/ + +#ifndef row0quiesce_h +#define row0quiesce_h + +#include "univ.i" +#include "dict0types.h" + +struct trx_t; + +/** The version number of the export meta-data text file. */ +#define IB_EXPORT_CFG_VERSION_V1 0x1UL + +/*********************************************************************//** +Quiesce the tablespace that the table resides in. */ +UNIV_INTERN +void +row_quiesce_table_start( +/*====================*/ + dict_table_t* table, /*!< in: quiesce this table */ + trx_t* trx) /*!< in/out: transaction/session */ + __attribute__((nonnull)); + +/*********************************************************************//** +Set a table's quiesce state. +@return DB_SUCCESS or errro code. 
*/ +UNIV_INTERN +dberr_t +row_quiesce_set_state( +/*==================*/ + dict_table_t* table, /*!< in: quiesce this table */ + ib_quiesce_t state, /*!< in: quiesce state to set */ + trx_t* trx) /*!< in/out: transaction */ + __attribute__((nonnull, warn_unused_result)); + +/*********************************************************************//** +Cleanup after table quiesce. */ +UNIV_INTERN +void +row_quiesce_table_complete( +/*=======================*/ + dict_table_t* table, /*!< in: quiesce this table */ + trx_t* trx) /*!< in/out: transaction/session */ + __attribute__((nonnull)); + +#ifndef UNIV_NONINL +#include "row0quiesce.ic" +#endif + +#endif /* row0quiesce_h */ diff --git a/storage/innobase/include/row0quiesce.ic b/storage/innobase/include/row0quiesce.ic new file mode 100644 index 00000000000..f570a6aed05 --- /dev/null +++ b/storage/innobase/include/row0quiesce.ic @@ -0,0 +1,26 @@ +/***************************************************************************** + +Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0quiesce.ic + +Quiesce a tablespace. 
+ +Created 2012-02-08 Sunny Bains +*******************************************************/ + diff --git a/storage/innobase/include/row0row.h b/storage/innobase/include/row0row.h index cf253ab2347..a4e5e0dd2fa 100644 --- a/storage/innobase/include/row0row.h +++ b/storage/innobase/include/row0row.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -73,20 +73,41 @@ row_get_rec_roll_ptr( /*****************************************************************//** When an insert or purge to a table is performed, this function builds the entry to be inserted into or purged from an index on the table. +@return index entry which should be inserted or purged +@retval NULL if the externally stored columns in the clustered index record +are unavailable and ext != NULL, or row is missing some needed columns. */ +UNIV_INTERN +dtuple_t* +row_build_index_entry_low( +/*======================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory for the index entry + is allocated */ + __attribute__((warn_unused_result, nonnull(1,3,4))); +/*****************************************************************//** +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. 
@return index entry which should be inserted or purged, or NULL if the externally stored columns in the clustered index record are unavailable and ext != NULL */ -UNIV_INTERN +UNIV_INLINE dtuple_t* row_build_index_entry( /*==================*/ - const dtuple_t* row, /*!< in: row which should be - inserted or purged */ - row_ext_t* ext, /*!< in: externally stored column prefixes, - or NULL */ - dict_index_t* index, /*!< in: index on the table */ - mem_heap_t* heap); /*!< in: memory heap from which the memory for - the index entry is allocated */ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory for the index entry + is allocated */ + __attribute__((warn_unused_result, nonnull(1,3,4))); /*******************************************************************//** An inverse function to row_build_index_entry. Builds a row from a record in a clustered index. @@ -124,11 +145,17 @@ row_build( consulted instead; the user columns in this table should be the same columns as in index->table */ + const dtuple_t* add_cols, + /*!< in: default values of + added columns, or NULL */ + const ulint* col_map,/*!< in: mapping of old column + numbers to new ones, or NULL */ row_ext_t** ext, /*!< out, own: cache of externally stored column prefixes, or NULL */ - mem_heap_t* heap); /*!< in: memory heap from which + mem_heap_t* heap) /*!< in: memory heap from which the memory needed is allocated */ + __attribute__((nonnull(2,3,9))); /*******************************************************************//** Converts an index record to a typed data tuple. 
@return index entry built; does not set info_bits, and the data fields @@ -142,37 +169,25 @@ row_rec_to_index_entry_low( const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ ulint* n_ext, /*!< out: number of externally stored columns */ - mem_heap_t* heap); /*!< in: memory heap from which + mem_heap_t* heap) /*!< in: memory heap from which the memory needed is allocated */ + __attribute__((nonnull, warn_unused_result)); /*******************************************************************//** Converts an index record to a typed data tuple. NOTE that externally stored (often big) fields are NOT copied to heap. -@return own: index entry built; see the NOTE below! */ +@return own: index entry built */ UNIV_INTERN dtuple_t* row_rec_to_index_entry( /*===================*/ - ulint type, /*!< in: ROW_COPY_DATA, or - ROW_COPY_POINTERS: the former - copies also the data fields to - heap as the latter only places - pointers to data fields on the - index page */ - const rec_t* rec, /*!< in: record in the index; - NOTE: in the case - ROW_COPY_POINTERS the data - fields in the row will point - directly into this record, - therefore, the buffer page of - this record must be at least - s-latched and the latch held - as long as the dtuple is used! */ + const rec_t* rec, /*!< in: record in the index */ const dict_index_t* index, /*!< in: index */ - ulint* offsets,/*!< in/out: rec_get_offsets(rec) */ + const ulint* offsets,/*!< in/out: rec_get_offsets(rec) */ ulint* n_ext, /*!< out: number of externally stored columns */ - mem_heap_t* heap); /*!< in: memory heap from which + mem_heap_t* heap) /*!< in: memory heap from which the memory needed is allocated */ + __attribute__((nonnull, warn_unused_result)); /*******************************************************************//** Builds from a secondary index record a row reference with which we can search the clustered index record. 
@@ -193,8 +208,9 @@ row_build_row_ref( the buffer page of this record must be at least s-latched and the latch held as long as the row reference is used! */ - mem_heap_t* heap); /*!< in: memory heap from which the memory + mem_heap_t* heap) /*!< in: memory heap from which the memory needed is allocated */ + __attribute__((nonnull, warn_unused_result)); /*******************************************************************//** Builds from a secondary index record a row reference with which we can search the clustered index record. */ @@ -215,7 +231,8 @@ row_build_row_ref_in_tuple( const dict_index_t* index, /*!< in: secondary index */ ulint* offsets,/*!< in: rec_get_offsets(rec, index) or NULL */ - trx_t* trx); /*!< in: transaction */ + trx_t* trx) /*!< in: transaction or NULL */ + __attribute__((nonnull(1,2,3))); /*******************************************************************//** Builds from a secondary index record a row reference with which we can search the clustered index record. */ @@ -245,7 +262,8 @@ row_search_on_row_ref( ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */ const dict_table_t* table, /*!< in: table */ const dtuple_t* ref, /*!< in: row reference */ - mtr_t* mtr); /*!< in/out: mtr */ + mtr_t* mtr) /*!< in/out: mtr */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Fetches the clustered index record for a secondary index record. The latches on the secondary index record are preserved. @@ -258,7 +276,8 @@ row_get_clust_rec( const rec_t* rec, /*!< in: record in a secondary index */ dict_index_t* index, /*!< in: secondary index */ dict_index_t** clust_index,/*!< out: clustered index */ - mtr_t* mtr); /*!< in: mtr */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull, warn_unused_result)); /** Result of row_search_index_entry */ enum row_search_result { @@ -285,8 +304,8 @@ row_search_index_entry( ulint mode, /*!< in: BTR_MODIFY_LEAF, ... 
*/ btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must be closed by the caller */ - mtr_t* mtr); /*!< in: mtr */ - + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull, warn_unused_result)); #define ROW_COPY_DATA 1 #define ROW_COPY_POINTERS 2 @@ -313,8 +332,9 @@ row_raw_format( in bytes */ const dict_field_t* dict_field, /*!< in: index field */ char* buf, /*!< out: output buffer */ - ulint buf_size); /*!< in: output buffer size + ulint buf_size) /*!< in: output buffer size in bytes */ + __attribute__((nonnull, warn_unused_result)); #ifndef UNIV_NONINL #include "row0row.ic" diff --git a/storage/innobase/include/row0row.ic b/storage/innobase/include/row0row.ic index 8e9f3460519..ac62422be1f 100644 --- a/storage/innobase/include/row0row.ic +++ b/storage/innobase/include/row0row.ic @@ -104,6 +104,33 @@ row_get_rec_roll_ptr( return(trx_read_roll_ptr(rec + offset + DATA_TRX_ID_LEN)); } +/*****************************************************************//** +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. 
+@return index entry which should be inserted or purged, or NULL if the +externally stored columns in the clustered index record are +unavailable and ext != NULL */ +UNIV_INLINE +dtuple_t* +row_build_index_entry( +/*==================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory for the index entry + is allocated */ +{ + dtuple_t* entry; + + ut_ad(dtuple_check_typed(row)); + entry = row_build_index_entry_low(row, ext, index, heap); + ut_ad(!entry || dtuple_check_typed(entry)); + return(entry); +} + /*******************************************************************//** Builds from a secondary index record a row reference with which we can search the clustered index record. */ diff --git a/storage/innobase/include/row0sel.h b/storage/innobase/include/row0sel.h index fa3c93b6b9a..c8be80f89d9 100644 --- a/storage/innobase/include/row0sel.h +++ b/storage/innobase/include/row0sel.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1997, 2010, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -148,7 +148,7 @@ position and fetch next or fetch prev must not be tried to the cursor! 
@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK, DB_LOCK_TABLE_FULL, or DB_TOO_BIG_RECORD */ UNIV_INTERN -ulint +dberr_t row_search_for_mysql( /*=================*/ byte* buf, /*!< in/out: buffer for the fetched @@ -163,11 +163,12 @@ row_search_for_mysql( 'mode' */ ulint match_mode, /*!< in: 0 or ROW_SEL_EXACT or ROW_SEL_EXACT_PREFIX */ - ulint direction); /*!< in: 0 or ROW_SEL_NEXT or + ulint direction) /*!< in: 0 or ROW_SEL_NEXT or ROW_SEL_PREV; NOTE: if this is != 0, then prebuilt must have a pcur with stored position! In opening of a cursor 'direction' should be 0. */ + __attribute__((nonnull, warn_unused_result)); /*******************************************************************//** Checks if MySQL at the moment is allowed for this table to retrieve a consistent read result, or store it to the query cache. @@ -179,28 +180,20 @@ row_search_check_if_query_cache_permitted( trx_t* trx, /*!< in: transaction object */ const char* norm_name); /*!< in: concatenation of database name, '/' char, table name */ -void -row_create_key( -/*===========*/ - dtuple_t* tuple, /* in: tuple where to build; - NOTE: we assume that the type info - in the tuple is already according - to index! */ - dict_index_t* index, /* in: index of the key value */ - doc_id_t* doc_id); /* in: doc id to lookup.*/ /*******************************************************************//** Read the max AUTOINC value from an index. 
@return DB_SUCCESS if all OK else error code */ UNIV_INTERN -ulint +dberr_t row_search_max_autoinc( /*===================*/ dict_index_t* index, /*!< in: index to search */ const char* col_name, /*!< in: autoinc column name */ - ib_uint64_t* value); /*!< out: AUTOINC value read */ + ib_uint64_t* value) /*!< out: AUTOINC value read */ + __attribute__((nonnull, warn_unused_result)); /** A structure for caching column values for prefetched rows */ -struct sel_buf_struct{ +struct sel_buf_t{ byte* data; /*!< data, or NULL; if not NULL, this field has allocated memory which must be explicitly freed; can be != NULL even when len is @@ -213,7 +206,7 @@ struct sel_buf_struct{ }; /** Query plan */ -struct plan_struct{ +struct plan_t{ dict_table_t* table; /*!< table struct in the dictionary cache */ dict_index_t* index; /*!< table index used in the search */ @@ -299,7 +292,7 @@ enum sel_node_state { }; /** Select statement node */ -struct sel_node_struct{ +struct sel_node_t{ que_common_t common; /*!< node type: QUE_NODE_SELECT */ enum sel_node_state state; /*!< node state */ @@ -352,7 +345,7 @@ struct sel_node_struct{ }; /** Fetch statement node */ -struct fetch_node_struct{ +struct fetch_node_t{ que_common_t common; /*!< type: QUE_NODE_FETCH */ sel_node_t* cursor_def; /*!< cursor definition */ sym_node_t* into_list; /*!< variables to set */ @@ -379,7 +372,7 @@ enum open_node_op { }; /** Open or close cursor statement node */ -struct open_node_struct{ +struct open_node_t{ que_common_t common; /*!< type: QUE_NODE_OPEN */ enum open_node_op op_type; /*!< operation type: open or @@ -388,7 +381,7 @@ struct open_node_struct{ }; /** Row printf statement node */ -struct row_printf_node_struct{ +struct row_printf_node_t{ que_common_t common; /*!< type: QUE_NODE_ROW_PRINTF */ sel_node_t* sel_node; /*!< select */ }; diff --git a/storage/innobase/include/row0types.h b/storage/innobase/include/row0types.h index 463651b43b8..52c89cb01fa 100644 --- a/storage/innobase/include/row0types.h +++ 
b/storage/innobase/include/row0types.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2010, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -26,32 +26,28 @@ Created 12/27/1996 Heikki Tuuri #ifndef row0types_h #define row0types_h -typedef struct plan_struct plan_t; +struct plan_t; -typedef struct upd_struct upd_t; +struct upd_t; +struct upd_field_t; +struct upd_node_t; +struct del_node_t; +struct ins_node_t; +struct sel_node_t; +struct open_node_t; +struct fetch_node_t; -typedef struct upd_field_struct upd_field_t; +struct row_printf_node_t; +struct sel_buf_t; -typedef struct upd_node_struct upd_node_t; +struct undo_node_t; -typedef struct del_node_struct del_node_t; +struct purge_node_t; -typedef struct ins_node_struct ins_node_t; +struct row_ext_t; -typedef struct sel_node_struct sel_node_t; - -typedef struct open_node_struct open_node_t; - -typedef struct fetch_node_struct fetch_node_t; - -typedef struct row_printf_node_struct row_printf_node_t; -typedef struct sel_buf_struct sel_buf_t; - -typedef struct undo_node_struct undo_node_t; - -typedef struct purge_node_struct purge_node_t; - -typedef struct row_ext_struct row_ext_t; +/** Buffer for logging modifications during online index creation */ +struct row_log_t; /* MySQL data types */ struct TABLE; diff --git a/storage/innobase/include/row0uins.h b/storage/innobase/include/row0uins.h index 5f3a7212ee1..ebf4881208a 100644 --- a/storage/innobase/include/row0uins.h +++ b/storage/innobase/include/row0uins.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. 
+Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -42,11 +42,11 @@ if it figures out that an index record will be removed in the purge anyway, it will remove it in the rollback. @return DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t row_undo_ins( /*=========*/ - undo_node_t* node); /*!< in: row undo node */ - + undo_node_t* node) /*!< in: row undo node */ + __attribute__((nonnull, warn_unused_result)); #ifndef UNIV_NONINL #include "row0uins.ic" #endif diff --git a/storage/innobase/include/row0umod.h b/storage/innobase/include/row0umod.h index 84831e59d90..f89d5a334fc 100644 --- a/storage/innobase/include/row0umod.h +++ b/storage/innobase/include/row0umod.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -38,12 +38,12 @@ Created 2/27/1997 Heikki Tuuri Undoes a modify operation on a row of a table. 
@return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t row_undo_mod( /*=========*/ undo_node_t* node, /*!< in: row undo node */ - que_thr_t* thr); /*!< in: query thread */ - + que_thr_t* thr) /*!< in: query thread */ + __attribute__((nonnull, warn_unused_result)); #ifndef UNIV_NONINL #include "row0umod.ic" diff --git a/storage/innobase/include/row0undo.h b/storage/innobase/include/row0undo.h index 47f9afdc74a..5dddfb4eae1 100644 --- a/storage/innobase/include/row0undo.h +++ b/storage/innobase/include/row0undo.h @@ -95,7 +95,7 @@ enum undo_exec { }; /** Undo node structure */ -struct undo_node_struct{ +struct undo_node_t{ que_common_t common; /*!< node type: QUE_NODE_UNDO */ enum undo_exec state; /*!< node execution state */ trx_t* trx; /*!< trx for which undo is done */ diff --git a/storage/innobase/include/row0upd.h b/storage/innobase/include/row0upd.h index a7687bb1ded..27dedeb65a7 100644 --- a/storage/innobase/include/row0upd.h +++ b/storage/innobase/include/row0upd.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -101,7 +101,7 @@ byte* row_upd_write_sys_vals_to_log( /*==========================*/ dict_index_t* index, /*!< in: clustered index */ - trx_t* trx, /*!< in: transaction */ + trx_id_t trx_id, /*!< in: transaction id */ roll_ptr_t roll_ptr,/*!< in: roll ptr of the undo log record */ byte* log_ptr,/*!< pointer to a buffer of size > 20 opened in mlog */ @@ -118,8 +118,9 @@ row_upd_rec_sys_fields( uncompressed part will be updated, or NULL */ dict_index_t* index, /*!< in: clustered index */ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ - trx_t* trx, /*!< in: transaction */ - roll_ptr_t roll_ptr);/*!< in: roll ptr of the undo log record */ + const trx_t* trx, /*!< in: transaction */ + roll_ptr_t roll_ptr);/*!< in: roll ptr of the undo log record, + can be 0 during IMPORT */ /*********************************************************************//** Sets the trx id or roll ptr field of a clustered index entry. */ UNIV_INTERN @@ -165,6 +166,15 @@ row_upd_changes_field_size_or_external( dict_index_t* index, /*!< in: index */ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ const upd_t* update);/*!< in: update vector */ +/***********************************************************//** +Returns true if row update contains disowned external fields. +@return true if the update contains disowned external fields. 
*/ +UNIV_INTERN +bool +row_upd_changes_disowned_external( +/*==============================*/ + const upd_t* update) /*!< in: update vector */ + __attribute__((nonnull, warn_unused_result)); #endif /* !UNIV_HOTBACKUP */ /***********************************************************//** Replaces the new column values stored in the update vector to the @@ -192,11 +202,12 @@ UNIV_INTERN upd_t* row_upd_build_sec_rec_difference_binary( /*====================================*/ + const rec_t* rec, /*!< in: secondary index record */ dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ const dtuple_t* entry, /*!< in: entry to insert */ - const rec_t* rec, /*!< in: secondary index record */ - trx_t* trx, /*!< in: transaction */ - mem_heap_t* heap); /*!< in: memory heap from which allocated */ + mem_heap_t* heap) /*!< in: memory heap from which allocated */ + __attribute__((warn_unused_result, nonnull)); /***************************************************************//** Builds an update vector from those fields, excluding the roll ptr and trx id fields, which in an index entry differ from a record that has @@ -204,14 +215,19 @@ the equal ordering fields. NOTE: we compare the fields as binary strings! 
@return own: update vector of differing fields, excluding roll ptr and trx id */ UNIV_INTERN -upd_t* +const upd_t* row_upd_build_difference_binary( /*============================*/ dict_index_t* index, /*!< in: clustered index */ const dtuple_t* entry, /*!< in: entry to insert */ const rec_t* rec, /*!< in: clustered index record */ - trx_t* trx, /*!< in: transaction */ - mem_heap_t* heap); /*!< in: memory heap from which allocated */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index), or NULL */ + bool no_sys, /*!< in: skip the system columns + DB_TRX_ID and DB_ROLL_PTR */ + trx_t* trx, /*!< in: transaction (for diagnostics), + or NULL */ + mem_heap_t* heap) /*!< in: memory heap from which allocated */ + __attribute__((nonnull(1,2,3,7), warn_unused_result)); /***********************************************************//** Replaces the new column values stored in the update vector to the index entry given. */ @@ -315,25 +331,14 @@ row_upd_changes_fts_column( upd_field_t* upd_field); /*!< in: field to check */ /***********************************************************//** Checks if an FTS Doc ID column is affected by an UPDATE. -@return TRUE if Doc ID column is affected */ +@return whether Doc ID column is affected */ UNIV_INTERN -ulint +bool row_upd_changes_doc_id( /*===================*/ dict_table_t* table, /*!< in: table */ - upd_field_t* upd_field); /*!< in: field to check */ -/***********************************************************//** -Checks if an update vector changes the table's FTS-indexed columns. -NOTE: must not be called for tables which do not have an FTS-index. -Also, the vector returned must be explicitly freed as it's allocated -using the ut_malloc() allocator. 
-@return vector of FTS indexes that were affected by the update else NULL */ -UNIV_INTERN -ib_vector_t* -row_upd_changes_fts_columns( -/*========================*/ - dict_table_t* table, /*!< in: table */ - upd_t* update); /*!< in: update vector for the row */ + upd_field_t* upd_field) /*!< in: field to check */ + __attribute__((nonnull, warn_unused_result)); /***********************************************************//** Checks if an update vector changes an ordering field of an index record. This function is fast if the update vector is short or the number of ordering @@ -397,7 +402,7 @@ row_upd_index_parse( /* Update vector field */ -struct upd_field_struct{ +struct upd_field_t{ unsigned field_no:16; /*!< field number in an index, usually the clustered index, but in updating a secondary index record in btr0cur.cc @@ -416,7 +421,7 @@ struct upd_field_struct{ }; /* Update vector structure */ -struct upd_struct{ +struct upd_t{ ulint info_bits; /*!< new value of info bits to record; default is 0 */ ulint n_fields; /*!< number of update fields */ @@ -427,7 +432,7 @@ struct upd_struct{ /* Update node structure which also implements the delete operation of a row */ -struct upd_node_struct{ +struct upd_node_t{ que_common_t common; /*!< node type: QUE_NODE_UPDATE */ ibool is_delete;/* TRUE if delete, FALSE if update */ ibool searched_update; diff --git a/storage/innobase/include/row0upd.ic b/storage/innobase/include/row0upd.ic index d054662c080..618a77fa4bf 100644 --- a/storage/innobase/include/row0upd.ic +++ b/storage/innobase/include/row0upd.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -46,7 +46,6 @@ upd_create( update = (upd_t*) mem_heap_zalloc(heap, sizeof(upd_t)); - update->info_bits = 0; update->n_fields = n; update->fields = (upd_field_t*) mem_heap_zalloc(heap, sizeof(upd_field_t) * n); @@ -111,6 +110,7 @@ upd_field_set_field_no( fprintf(stderr, "\n" "InnoDB: but index only has %lu fields\n", (ulong) dict_index_get_n_fields(index)); + ut_ad(0); } dict_col_copy_type(dict_index_get_nth_col(index, field_no), @@ -152,8 +152,9 @@ row_upd_rec_sys_fields( uncompressed part will be updated, or NULL */ dict_index_t* index, /*!< in: clustered index */ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ - trx_t* trx, /*!< in: transaction */ - roll_ptr_t roll_ptr)/*!< in: roll ptr of the undo log record */ + const trx_t* trx, /*!< in: transaction */ + roll_ptr_t roll_ptr)/*!< in: roll ptr of the undo log record, + can be 0 during IMPORT */ { ut_ad(dict_index_is_clust(index)); ut_ad(rec_offs_validate(rec, index, offsets)); @@ -172,8 +173,14 @@ row_upd_rec_sys_fields( #if DATA_TRX_ID + 1 != DATA_ROLL_PTR # error "DATA_TRX_ID + 1 != DATA_ROLL_PTR" #endif - ut_ad(lock_check_trx_id_sanity(trx_read_trx_id(rec + offset), - rec, index, offsets)); + /* During IMPORT the trx id in the record can be in the + future, if the .ibd file is being imported from another + instance. During IMPORT roll_ptr will be 0. 
*/ + ut_ad(roll_ptr == 0 + || lock_check_trx_id_sanity( + trx_read_trx_id(rec + offset), + rec, index, offsets)); + trx_write_trx_id(rec + offset, trx->id); trx_write_roll_ptr(rec + offset + DATA_TRX_ID_LEN, roll_ptr); } diff --git a/storage/innobase/include/row0vers.h b/storage/innobase/include/row0vers.h index d9e3471b3dc..1df5b4d3e98 100644 --- a/storage/innobase/include/row0vers.h +++ b/storage/innobase/include/row0vers.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -87,7 +87,7 @@ read should see. We assume that the trx id stored in rec is such that the consistent read should not see rec in its present version. @return DB_SUCCESS or DB_MISSING_HISTORY */ UNIV_INTERN -ulint +dberr_t row_vers_build_for_consistent_read( /*===============================*/ const rec_t* rec, /*!< in: record in a clustered index; the @@ -106,16 +106,17 @@ row_vers_build_for_consistent_read( *old_vers is allocated; memory for possible intermediate versions is allocated and freed locally within the function */ - rec_t** old_vers);/*!< out, own: old version, or NULL if the - record does not exist in the view, that is, + rec_t** old_vers)/*!< out, own: old version, or NULL + if the history is missing or the record + does not exist in the view, that is, it was freshly inserted afterwards */ + __attribute__((nonnull(1,2,3,4,5,6,7))); /*****************************************************************//** Constructs the last committed version of a clustered index record, -which should be seen by a semi-consistent read. -@return DB_SUCCESS or DB_MISSING_HISTORY */ +which should be seen by a semi-consistent read. 
*/ UNIV_INTERN -ulint +void row_vers_build_for_semi_consistent_read( /*====================================*/ const rec_t* rec, /*!< in: record in a clustered index; the @@ -132,9 +133,10 @@ row_vers_build_for_semi_consistent_read( *old_vers is allocated; memory for possible intermediate versions is allocated and freed locally within the function */ - const rec_t** old_vers);/*!< out: rec, old version, or NULL if the + const rec_t** old_vers)/*!< out: rec, old version, or NULL if the record does not exist in the view, that is, it was freshly inserted afterwards */ + __attribute__((nonnull(1,2,3,4,5))); #ifndef UNIV_NONINL diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h index 5e47f82f416..48d4b94dcae 100644 --- a/storage/innobase/include/srv0mon.h +++ b/storage/innobase/include/srv0mon.h @@ -1,6 +1,7 @@ /*********************************************************************** -Copyright (c) 2010, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2010, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the @@ -55,7 +56,7 @@ fill in counter information as described in "monitor_info_t" and create the internal counter ID in "monitor_id_t". */ /** Structure containing the actual values of a monitor counter. 
*/ -struct monitor_value_struct { +struct monitor_value_t { ib_time_t mon_start_time; /*!< Start time of monitoring */ ib_time_t mon_stop_time; /*!< Stop time of monitoring */ ib_time_t mon_reset_time; /*!< Time counter resetted */ @@ -70,11 +71,9 @@ struct monitor_value_struct { monitor_running_t mon_status; /* whether monitor still running */ }; -typedef struct monitor_value_struct monitor_value_t; - /** Follwoing defines are possible values for "monitor_type" field in "struct monitor_info" */ -enum monitor_type_value { +enum monitor_type_t { MONITOR_NONE = 0, /*!< No monitoring */ MONITOR_MODULE = 1, /*!< This is a monitor module type, not a counter */ @@ -97,8 +96,6 @@ enum monitor_type_value { metrics table */ }; -typedef enum monitor_type_value monitor_type_t; - /** Counter minimum value is initialized to be max value of mon_type_t (ib_int64_t) */ #define MIN_RESERVED ((mon_type_t) (IB_ULONGLONG_MAX >> 1)) @@ -117,7 +114,7 @@ name shall start with MONITOR_OVLD Please refer to "innodb_counter_info" in srv/srv0mon.cc for detail information for each monitor counter */ -enum monitor_id_value { +enum monitor_id_t { /* This is to identify the default value set by the metrics control global variables */ MONITOR_DEFAULT_START = 0, @@ -154,14 +151,15 @@ enum monitor_id_value { MONITOR_OVLD_BUF_POOL_READS, MONITOR_OVLD_BUF_POOL_READ_REQUESTS, MONITOR_OVLD_BUF_POOL_WRITE_REQUEST, - MONITOR_PAGE_INFLUSH, MONITOR_OVLD_BUF_POOL_WAIT_FREE, MONITOR_OVLD_BUF_POOL_READ_AHEAD, MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED, MONITOR_OVLD_BUF_POOL_PAGE_TOTAL, MONITOR_OVLD_BUF_POOL_PAGE_MISC, MONITOR_OVLD_BUF_POOL_PAGES_DATA, + MONITOR_OVLD_BUF_POOL_BYTES_DATA, MONITOR_OVLD_BUF_POOL_PAGES_DIRTY, + MONITOR_OVLD_BUF_POOL_BYTES_DIRTY, MONITOR_OVLD_BUF_POOL_PAGES_FREE, MONITOR_OVLD_PAGE_CREATED, MONITOR_OVLD_PAGES_WRITTEN, @@ -177,15 +175,15 @@ enum monitor_id_value { MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE, MONITOR_FLUSH_NEIGHBOR_COUNT, MONITOR_FLUSH_NEIGHBOR_PAGES, - 
MONITOR_FLUSH_MAX_DIRTY_TOTAL_PAGE, - MONITOR_FLUSH_MAX_DIRTY_COUNT, - MONITOR_FLUSH_MAX_DIRTY_PAGES, + MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, + MONITOR_FLUSH_AVG_PAGE_RATE, + MONITOR_FLUSH_LSN_AVG_RATE, + MONITOR_FLUSH_PCT_FOR_DIRTY, + MONITOR_FLUSH_PCT_FOR_LSN, + MONITOR_FLUSH_SYNC_WAITS, MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, MONITOR_FLUSH_ADAPTIVE_COUNT, MONITOR_FLUSH_ADAPTIVE_PAGES, - MONITOR_FLUSH_ASYNC_TOTAL_PAGE, - MONITOR_FLUSH_ASYNC_COUNT, - MONITOR_FLUSH_ASYNC_PAGES, MONITOR_FLUSH_SYNC_TOTAL_PAGE, MONITOR_FLUSH_SYNC_COUNT, MONITOR_FLUSH_SYNC_PAGES, @@ -303,6 +301,8 @@ enum monitor_id_value { MONITOR_MODULE_PAGE, MONITOR_PAGE_COMPRESS, MONITOR_PAGE_DECOMPRESS, + MONITOR_PAD_INCREMENTS, + MONITOR_PAD_DECREMENTS, /* Index related counters */ MONITOR_MODULE_INDEX, @@ -367,7 +367,10 @@ enum monitor_id_value { /* Data DDL related counters */ MONITOR_MODULE_DDL_STATS, + MONITOR_BACKGROUND_DROP_INDEX, MONITOR_BACKGROUND_DROP_TABLE, + MONITOR_ONLINE_CREATE_INDEX, + MONITOR_PENDING_ALTER_TABLE, MONITOR_MODULE_ICP, MONITOR_ICP_ATTEMPTS, @@ -383,8 +386,6 @@ enum monitor_id_value { NUM_MONITOR }; -typedef enum monitor_id_value monitor_id_t; - /** This informs the monitor control system to turn on/off and reset monitor counters through wild card match */ #define MONITOR_WILDCARD_MATCH (NUM_MONITOR + 1) @@ -394,7 +395,7 @@ on/off and reset monitor counters through wild card match */ /** struct monitor_info describes the basic/static information about each monitor counter. */ -struct monitor_info_struct { +struct monitor_info_t { const char* monitor_name; /*!< Monitor name */ const char* monitor_module; /*!< Sub Module the monitor belongs to */ @@ -408,12 +409,10 @@ struct monitor_info_struct { monitor_id_t */ }; -typedef struct monitor_info_struct monitor_info_t; - /** Following are the "set_option" values allowed for srv_mon_process_existing_counter() and srv_mon_process_existing_counter() functions. To turn on/off/reset the monitor counters. 
*/ -enum mon_set_option { +enum mon_option_t { MONITOR_TURN_ON = 1, /*!< Turn on the counter */ MONITOR_TURN_OFF, /*!< Turn off the counter */ MONITOR_RESET_VALUE, /*!< Reset current values */ @@ -423,8 +422,6 @@ function */ }; -typedef enum mon_set_option mon_option_t; - /** Number of bit in a ulint datatype */ #define NUM_BITS_ULINT (sizeof(ulint) * CHAR_BIT) @@ -533,8 +530,37 @@ on the counters */ } \ } -#ifdef HAVE_ATOMIC_BUILTINS +/** Increment a monitor counter under mutex protection. +Use MONITOR_INC if appropriate mutex protection already exists. +@param monitor monitor to be incremented by 1 +@param mutex mutex to acquire and release */ +# define MONITOR_MUTEX_INC(mutex, monitor) \ + ut_ad(!mutex_own(mutex)); \ + if (MONITOR_IS_ON(monitor)) { \ + mutex_enter(mutex); \ + if (++MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \ + MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor); \ + } \ + mutex_exit(mutex); \ + } +/** Decrement a monitor counter under mutex protection. +Use MONITOR_DEC if appropriate mutex protection already exists. +@param monitor monitor to be decremented by 1 +@param mutex mutex to acquire and release */ +# define MONITOR_MUTEX_DEC(mutex, monitor) \ + ut_ad(!mutex_own(mutex)); \ + if (MONITOR_IS_ON(monitor)) { \ + mutex_enter(mutex); \ + if (--MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \ + MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor); \ + } \ + mutex_exit(mutex); \ + } +#if defined HAVE_ATOMIC_BUILTINS_64 +/** Atomically increment a monitor counter. +Use MONITOR_INC if appropriate mutex protection exists. +@param monitor monitor to be incremented by 1 */ # define MONITOR_ATOMIC_INC(monitor) \ if (MONITOR_IS_ON(monitor)) { \ ib_uint64_t value; \ @@ -547,10 +573,13 @@ on the counters */ } \ } +/** Atomically decrement a monitor counter. +Use MONITOR_DEC if appropriate mutex protection exists. 
+@param monitor monitor to be decremented by 1 */ # define MONITOR_ATOMIC_DEC(monitor) \ if (MONITOR_IS_ON(monitor)) { \ ib_uint64_t value; \ - value = os_atomic_decrement_ulint( \ + value = os_atomic_decrement_uint64( \ (ib_uint64_t*) &MONITOR_VALUE(monitor), 1); \ /* Note: This is not 100% accurate because of the \ inherent race, we ignore it due to performance. */ \ @@ -558,7 +587,34 @@ on the counters */ MONITOR_MIN_VALUE(monitor) = value; \ } \ } -#endif /* HAVE_ATOMIC_BUILTINS */ +# define srv_mon_create() ((void) 0) +# define srv_mon_free() ((void) 0) +#else /* HAVE_ATOMIC_BUILTINS_64 */ +/** Mutex protecting atomic operations on platforms that lack +built-in operations for atomic memory access */ +extern ib_mutex_t monitor_mutex; +/****************************************************************//** +Initialize the monitor subsystem. */ +UNIV_INTERN +void +srv_mon_create(void); +/*================*/ +/****************************************************************//** +Close the monitor subsystem. */ +UNIV_INTERN +void +srv_mon_free(void); +/*==============*/ + +/** Atomically increment a monitor counter. +Use MONITOR_INC if appropriate mutex protection exists. +@param monitor monitor to be incremented by 1 */ +# define MONITOR_ATOMIC_INC(monitor) MONITOR_MUTEX_INC(&monitor_mutex, monitor) +/** Atomically decrement a monitor counter. +Use MONITOR_DEC if appropriate mutex protection exists. 
+@param monitor monitor to be decremented by 1 */ +# define MONITOR_ATOMIC_DEC(monitor) MONITOR_MUTEX_DEC(&monitor_mutex, monitor) +#endif /* HAVE_ATOMIC_BUILTINS_64 */ #define MONITOR_DEC(monitor) \ if (MONITOR_IS_ON(monitor)) { \ @@ -568,7 +624,17 @@ on the counters */ } \ } +#ifdef UNIV_DEBUG_VALGRIND +# define MONITOR_CHECK_DEFINED(value) do { \ + mon_type_t m = value; \ + UNIV_MEM_ASSERT_RW(&m, sizeof m); \ +} while (0) +#else /* UNIV_DEBUG_VALGRIND */ +# define MONITOR_CHECK_DEFINED(value) (void) 0 +#endif /* UNIV_DEBUG_VALGRIND */ + #define MONITOR_INC_VALUE(monitor, value) \ + MONITOR_CHECK_DEFINED(value); \ if (MONITOR_IS_ON(monitor)) { \ MONITOR_VALUE(monitor) += (mon_type_t) (value); \ if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \ @@ -577,6 +643,7 @@ on the counters */ } #define MONITOR_DEC_VALUE(monitor, value) \ + MONITOR_CHECK_DEFINED(value); \ if (MONITOR_IS_ON(monitor)) { \ ut_ad(MONITOR_VALUE(monitor) >= (mon_type_t) (value)); \ MONITOR_VALUE(monitor) -= (mon_type_t) (value); \ @@ -605,6 +672,7 @@ could already be checked as a module group */ /** Directly set a monitor counter's value */ #define MONITOR_SET(monitor, value) \ + MONITOR_CHECK_DEFINED(value); \ if (MONITOR_IS_ON(monitor)) { \ MONITOR_VALUE(monitor) = (mon_type_t) (value); \ if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \ @@ -617,9 +685,10 @@ could already be checked as a module group */ /** Add time difference between now and input "value" (in seconds) to the monitor counter -@monitor monitor to update for the time difference -@value the start time value */ +@param monitor monitor to update for the time difference +@param value the start time value */ #define MONITOR_INC_TIME_IN_MICRO_SECS(monitor, value) \ + MONITOR_CHECK_DEFINED(value); \ if (MONITOR_IS_ON(monitor)) { \ ullint old_time = (value); \ value = ut_time_us(NULL); \ @@ -629,15 +698,16 @@ monitor counter /** This macro updates 3 counters in one call. 
However, it only checks the main/first monitor counter 'monitor', to see it is on or off to decide whether to do the update. -@monitor the main monitor counter to update. It accounts for +@param monitor the main monitor counter to update. It accounts for the accumulative value for the counter. -@monitor_n_calls counter that counts number of times this macro is +@param monitor_n_calls counter that counts number of times this macro is called -@monitor_per_call counter that records the current and max value of +@param monitor_per_call counter that records the current and max value of each incremental value -@value incremental value to record this time */ +@param value incremental value to record this time */ #define MONITOR_INC_VALUE_CUMULATIVE( \ monitor, monitor_n_calls, monitor_per_call, value) \ + MONITOR_CHECK_DEFINED(value); \ if (MONITOR_IS_ON(monitor)) { \ MONITOR_VALUE(monitor_n_calls)++; \ MONITOR_VALUE(monitor_per_call) = (mon_type_t) (value); \ @@ -655,6 +725,7 @@ whether to do the update. /** Directly set a monitor counter's value, and if the value is monotonically increasing, only max value needs to be updated */ #define MONITOR_SET_UPD_MAX_ONLY(monitor, value) \ + MONITOR_CHECK_DEFINED(value); \ if (MONITOR_IS_ON(monitor)) { \ MONITOR_VALUE(monitor) = (mon_type_t) (value); \ if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \ @@ -665,6 +736,7 @@ is monotonically increasing, only max value needs to be updated */ /** Some values such as log sequence number are monotonically increasing number, do not need to record max/min values */ #define MONITOR_SET_SIMPLE(monitor, value) \ + MONITOR_CHECK_DEFINED(value); \ if (MONITOR_IS_ON(monitor)) { \ MONITOR_VALUE(monitor) = (mon_type_t) (value); \ } @@ -693,9 +765,11 @@ consolidate information from existing system status variables. 
*/ /** Save the passed-in value to mon_start_value field of monitor counters */ -#define MONITOR_SAVE_START(monitor, value) \ +#define MONITOR_SAVE_START(monitor, value) do { \ + MONITOR_CHECK_DEFINED(value); \ (MONITOR_START_VALUE(monitor) = \ - (mon_type_t) (value) - MONITOR_VALUE_RESET(monitor)) + (mon_type_t) (value) - MONITOR_VALUE_RESET(monitor)); \ + } while (0) /** Save the passed-in value to mon_last_value field of monitor counters */ diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 99cff251e3c..201f19c0cd8 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -50,22 +50,91 @@ Created 10/10/1995 Heikki Tuuri #include "trx0types.h" #include "srv0conc.h" #include "buf0checksum.h" +#include "ut0counter.h" + +/* Global counters used inside InnoDB. */ +struct srv_stats_t { + typedef ib_counter_t<lsn_t, 1, single_indexer_t> lsn_ctr_1_t; + typedef ib_counter_t<ulint, 1, single_indexer_t> ulint_ctr_1_t; + typedef ib_counter_t<lint, 1, single_indexer_t> lint_ctr_1_t; + typedef ib_counter_t<ulint, 64> ulint_ctr_64_t; + typedef ib_counter_t<ib_int64_t, 1, single_indexer_t> ib_int64_ctr_1_t; + + /** Count the amount of data written in total (in bytes) */ + ulint_ctr_1_t data_written; + + /** Number of the log write requests done */ + ulint_ctr_1_t log_write_requests; + + /** Number of physical writes to the log performed */ + ulint_ctr_1_t log_writes; + + /** Amount of data written to the log files in bytes */ + lsn_ctr_1_t os_log_written; + + /** Number of writes being done to the log files */ + lint_ctr_1_t os_log_pending_writes; + + /** We increase this counter, when we don't have enough + space in the log buffer and have to flush it */ + ulint_ctr_1_t log_waits; + + /** Count the number of times the doublewrite buffer was flushed */ + ulint_ctr_1_t dblwr_writes; + + /** Store the number of pages that have been flushed to the + doublewrite buffer */ + ulint_ctr_1_t dblwr_pages_written; 
+ + /** Store the number of write requests issued */ + ulint_ctr_1_t buf_pool_write_requests; + + /** Store the number of times when we had to wait for a free page + in the buffer pool. It happens when the buffer pool is full and we + need to make a flush, in order to be able to read or create a page. */ + ulint_ctr_1_t buf_pool_wait_free; + + /** Count the number of pages that were written from buffer + pool to the disk */ + ulint_ctr_1_t buf_pool_flushed; + + /** Number of buffer pool reads that led to the reading of + a disk page */ + ulint_ctr_1_t buf_pool_reads; + + /** Number of data read in total (in bytes) */ + ulint_ctr_1_t data_read; + + /** Wait time of database locks */ + ib_int64_ctr_1_t n_lock_wait_time; + + /** Number of database lock waits */ + ulint_ctr_1_t n_lock_wait_count; + + /** Number of threads currently waiting on database locks */ + lint_ctr_1_t n_lock_wait_current_count; + + /** Number of rows read. */ + ulint_ctr_64_t n_rows_read; + + /** Number of rows updated */ + ulint_ctr_64_t n_rows_updated; + + /** Number of rows deleted */ + ulint_ctr_64_t n_rows_deleted; + + /** Number of rows inserted */ + ulint_ctr_64_t n_rows_inserted; +}; extern const char* srv_main_thread_op_info; /** Prefix used by MySQL to indicate pre-5.1 table name encoding */ extern const char srv_mysql50_table_name_prefix[10]; -/* When this event is set the lock timeout and InnoDB monitor -thread starts running */ -extern os_event_t srv_lock_timeout_thread_event; - /* The monitor thread waits on this event. */ extern os_event_t srv_monitor_event; -/* The lock timeout thread waits on this event. */ -extern os_event_t srv_timeout_event; - /* The error monitor thread waits on this event. 
*/ extern os_event_t srv_error_event; @@ -89,20 +158,20 @@ at a time */ #define SRV_AUTO_EXTEND_INCREMENT \ (srv_auto_extend_increment * ((1024 * 1024) / UNIV_PAGE_SIZE)) -/* Mutex for locking srv_monitor_file */ -extern mutex_t srv_monitor_file_mutex; +/* Mutex for locking srv_monitor_file. Not created if srv_read_only_mode */ +extern ib_mutex_t srv_monitor_file_mutex; /* Temporary file for innodb monitor output */ extern FILE* srv_monitor_file; -/* Mutex for locking srv_dict_tmpfile. +/* Mutex for locking srv_dict_tmpfile. Only created if !srv_read_only_mode. This mutex has a very high rank; threads reserving it should not be holding any InnoDB latches. */ -extern mutex_t srv_dict_tmpfile_mutex; +extern ib_mutex_t srv_dict_tmpfile_mutex; /* Temporary file for output from the data dictionary */ extern FILE* srv_dict_tmpfile; -/* Mutex for locking srv_misc_tmpfile. +/* Mutex for locking srv_misc_tmpfile. Only created if !srv_read_only_mode. This mutex has a very low rank; threads reserving it should not acquire any further latches or sleep before releasing this one. */ -extern mutex_t srv_misc_tmpfile_mutex; +extern ib_mutex_t srv_misc_tmpfile_mutex; /* Temporary file for miscellanous diagnostic output */ extern FILE* srv_misc_tmpfile; @@ -114,6 +183,10 @@ extern char* srv_data_home; extern char* srv_arch_dir; #endif /* UNIV_LOG_ARCHIVE */ +/** Set if InnoDB must operate in read-only mode. We don't do any +recovery and open all tables in RO mode instead of RW mode. We don't +sync the max trx id to disk either. 
*/ +extern my_bool srv_read_only_mode; /** store to its own file each table created by an user; data dictionary tables are in the system tablespace 0 */ extern my_bool srv_file_per_table; @@ -134,8 +207,10 @@ extern ulint srv_max_file_format_at_startup; on duplicate key checking and foreign key checking */ extern ibool srv_locks_unsafe_for_binlog; -/* Variable specifying the FTS parallel sort buffer size */ +/** Sort buffer size in index creation */ extern ulong srv_sort_buf_size; +/** Maximum modification log file size for online index creation */ +extern unsigned long long srv_online_max_size; /* If this flag is TRUE, then we will use the native aio of the OS (provided we compiled Innobase with it in), otherwise we will @@ -153,6 +228,9 @@ extern char* srv_undo_dir; /** Number of undo tablespaces to use. */ extern ulong srv_undo_tablespaces; +/** The number of UNDO tablespaces that are open and ready to use. */ +extern ulint srv_undo_tablespaces_open; + /* The number of undo segments to use */ extern ulong srv_undo_logs; @@ -163,17 +241,20 @@ extern ulint* srv_data_file_is_raw_partition; extern ibool srv_auto_extend_last_data_file; extern ulint srv_last_file_size_max; -extern char** srv_log_group_home_dirs; +extern char* srv_log_group_home_dir; #ifndef UNIV_HOTBACKUP extern ulong srv_auto_extend_increment; extern ibool srv_created_new_raw; -extern ulint srv_n_log_groups; -extern ulint srv_n_log_files; +/** Maximum number of srv_n_log_files, or innodb_log_files_in_group */ +#define SRV_N_LOG_FILES_MAX 100 +extern ulong srv_n_log_files; extern ib_uint64_t srv_log_file_size; +extern ib_uint64_t srv_log_file_size_requested; extern ulint srv_log_buffer_size; extern ulong srv_flush_log_at_trx_commit; +extern uint srv_flush_log_at_timeout; extern char srv_adaptive_flushing; /* If this flag is TRUE, then we will load the indexes' (and tables') metadata @@ -195,7 +276,7 @@ extern ulong srv_n_page_hash_locks; /*!< number of locks to protect buf_pool->page_hash */ extern 
ulong srv_LRU_scan_depth; /*!< Scan depth for LRU flush batch */ -extern my_bool srv_flush_neighbors; /*!< whether or not to flush +extern ulong srv_flush_neighbors; /*!< whether or not to flush neighbors of a block */ extern ulint srv_buf_pool_old_size; /*!< previously requested size */ extern ulint srv_buf_pool_curr_size; /*!< current size in bytes */ @@ -210,6 +291,12 @@ extern ulint srv_n_write_io_threads; /* Number of IO operations per second the server can do */ extern ulong srv_io_capacity; + +/* We use this dummy default value at startup for max_io_capacity. +The real value is set based on the value of io_capacity. */ +#define SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT (~0UL) +#define SRV_MAX_IO_CAPACITY_LIMIT (~0UL) +extern ulong srv_max_io_capacity; /* Returns the number of IO operations that is X percent of the capacity. PCT_IO(5) -> returns the number of IO operations that is 5% of the max where max is srv_io_capacity. */ @@ -232,9 +319,16 @@ extern ulint srv_win_file_flush_method; extern ulint srv_max_n_open_files; -extern ulint srv_max_dirty_pages_pct; +extern ulong srv_max_dirty_pages_pct; +extern ulong srv_max_dirty_pages_pct_lwm; + +extern ulong srv_adaptive_flushing_lwm; +extern ulong srv_flushing_avg_loops; -extern ulint srv_force_recovery; +extern ulong srv_force_recovery; +#ifndef DBUG_OFF +extern ulong srv_force_recovery_crash; +#endif /* !DBUG_OFF */ extern ulint srv_fast_shutdown; /*!< If this is 1, do not do a purge and index buffer merge. 
@@ -246,7 +340,9 @@ extern ulint srv_fast_shutdown; /*!< If this is 1, do not do a extern ibool srv_innodb_status; extern unsigned long long srv_stats_transient_sample_pages; +extern my_bool srv_stats_persistent; extern unsigned long long srv_stats_persistent_sample_pages; +extern my_bool srv_stats_auto_recalc; extern ibool srv_use_doublewrite_buf; extern ulong srv_doublewrite_batch_size; @@ -259,11 +355,6 @@ extern ulong srv_max_purge_lag_delay; extern ulong srv_replication_delay; /*-------------------------------------------*/ -extern ulint srv_n_rows_inserted; -extern ulint srv_n_rows_updated; -extern ulint srv_n_rows_deleted; -extern ulint srv_n_rows_read; - extern ibool srv_print_innodb_monitor; extern ibool srv_print_innodb_lock_monitor; extern ibool srv_print_innodb_tablespace_monitor; @@ -274,21 +365,21 @@ extern ibool srv_print_verbose_log; "tables instead, see " REFMAN "innodb-i_s-tables.html" extern ibool srv_print_innodb_table_monitor; -extern ibool srv_lock_timeout_active; extern ibool srv_monitor_active; extern ibool srv_error_monitor_active; /* TRUE during the lifetime of the buffer pool dump/load thread */ extern ibool srv_buf_dump_thread_active; +/* TRUE during the lifetime of the stats thread */ +extern ibool srv_dict_stats_thread_active; + extern ulong srv_n_spin_wait_rounds; extern ulong srv_n_free_tickets_to_enter; extern ulong srv_thread_sleep_delay; extern ulong srv_spin_wait_delay; extern ibool srv_priority_boost; -extern ulint srv_n_lock_wait_count; - extern ulint srv_truncated_status_writes; extern ulint srv_available_undo_logs; @@ -309,12 +400,21 @@ extern ibool srv_print_latch_waits; # define srv_print_latch_waits FALSE #endif /* UNIV_DEBUG */ +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG +extern my_bool srv_ibuf_disable_background_merge; +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + +#ifdef UNIV_DEBUG +extern my_bool srv_purge_view_update_only_debug; +#endif /* UNIV_DEBUG */ + extern ulint srv_fatal_semaphore_wait_threshold; 
+#define SRV_SEMAPHORE_WAIT_EXTENSION 7200 extern ulint srv_dml_needed_delay; #ifndef HAVE_ATOMIC_BUILTINS /** Mutex protecting some server global variables. */ -extern mutex_t server_mutex; +extern ib_mutex_t server_mutex; #endif /* !HAVE_ATOMIC_BUILTINS */ #define SRV_MAX_N_IO_THREADS 130 @@ -324,22 +424,6 @@ i/o handler thread */ extern const char* srv_io_thread_op_info[]; extern const char* srv_io_thread_function[]; -/* the number of the log write requests done */ -extern ulint srv_log_write_requests; - -/* the number of physical writes to the log performed */ -extern ulint srv_log_writes; - -/* amount of data written to the log files in bytes */ -extern lsn_t srv_os_log_written; - -/* amount of writes being done to the log files */ -extern ulint srv_os_log_pending_writes; - -/* we increase this counter, when there we don't have enough space in the -log buffer and have to flush it */ -extern ulint srv_log_waits; - /* the number of purge threads to use from the worker pool (currently 0 or 1) */ extern ulong srv_n_purge_threads; @@ -349,50 +433,16 @@ extern ulong srv_purge_batch_size; /* the number of sync wait arrays */ extern ulong srv_sync_array_size; -/* variable that counts amount of data read in total (in bytes) */ -extern ulint srv_data_read; - -/* here we count the amount of data written in total (in bytes) */ -extern ulint srv_data_written; - -/* this variable counts the amount of times, when the doublewrite buffer -was flushed */ -extern ulint srv_dblwr_writes; - -/* here we store the number of pages that have been flushed to the -doublewrite buffer */ -extern ulint srv_dblwr_pages_written; - -/* in this variable we store the number of write requests issued */ -extern ulint srv_buf_pool_write_requests; - -/* here we store the number of times when we had to wait for a free page -in the buffer pool. It happens when the buffer pool is full and we need -to make a flush, in order to be able to read or create a page. 
*/ -extern ulint srv_buf_pool_wait_free; - -/* variable to count the number of pages that were written from the -buffer pool to disk */ -extern ulint srv_buf_pool_flushed; - -/** Number of buffer pool reads that led to the -reading of a disk page */ -extern ulint srv_buf_pool_reads; - /* print all user-level transactions deadlocks to mysqld stderr */ extern my_bool srv_print_all_deadlocks; -/** Status variables to be passed to MySQL */ -typedef struct export_var_struct export_struc; - -/** Thread slot in the thread table */ -typedef struct srv_slot_struct srv_slot_t; - -/** Thread table is an array of slots */ -typedef srv_slot_t srv_table_t; +extern my_bool srv_cmp_per_index_enabled; /** Status variables to be passed to MySQL */ -extern export_struc export_vars; +extern struct export_var_t export_vars; + +/** Global counters */ +extern srv_stats_t srv_stats; # ifdef UNIV_PFS_THREAD /* Keys to register InnoDB threads with performance schema */ @@ -404,19 +454,20 @@ extern mysql_pfs_key_t srv_error_monitor_thread_key; extern mysql_pfs_key_t srv_monitor_thread_key; extern mysql_pfs_key_t srv_master_thread_key; extern mysql_pfs_key_t srv_purge_thread_key; +extern mysql_pfs_key_t recv_writer_thread_key; /* This macro register the current thread and its key with performance schema */ # define pfs_register_thread(key) \ do { \ - struct PSI_thread* psi = PSI_CALL(new_thread)(key, NULL, 0);\ - PSI_CALL(set_thread)(psi); \ + struct PSI_thread* psi = PSI_THREAD_CALL(new_thread)(key, NULL, 0);\ + PSI_THREAD_CALL(set_thread)(psi); \ } while (0) /* This macro delist the current thread from performance schema */ # define pfs_delete_thread() \ do { \ - PSI_CALL(delete_current_thread)(); \ + PSI_THREAD_CALL(delete_current_thread)(); \ } while (0) # endif /* UNIV_PFS_THREAD */ @@ -439,8 +490,19 @@ enum { when writing data files, but do flush after writing to log files */ SRV_UNIX_NOSYNC, /*!< do not flush after writing */ - SRV_UNIX_O_DIRECT /*!< invoke os_file_set_nocache() on - 
data files */ + SRV_UNIX_O_DIRECT, /*!< invoke os_file_set_nocache() on + data files. This implies using + non-buffered IO but still using fsync, + the reason for which is that some FS + do not flush meta-data when + unbuffered IO happens */ + SRV_UNIX_O_DIRECT_NO_FSYNC + /*!< do not use fsync() when using + direct IO i.e.: it can be set to avoid + the fsync() call that we make when + using SRV_UNIX_O_DIRECT. However, in + this case user/DBA should be sure about + the integrity of the meta-data */ }; /** Alternatives for file i/o in Windows */ @@ -499,10 +561,9 @@ enum srv_thread_type { }; /*********************************************************************//** -Boots Innobase server. -@return DB_SUCCESS or error code */ +Boots Innobase server. */ UNIV_INTERN -ulint +void srv_boot(void); /*==========*/ /*********************************************************************//** @@ -533,6 +594,12 @@ srv_set_io_thread_op_info( ulint i, /*!< in: the 'segment' of the i/o thread */ const char* str); /*!< in: constant char string describing the state */ +/*********************************************************************//** +Resets the info describing an i/o thread current state. */ +UNIV_INTERN +void +srv_reset_io_thread_op_info(); +/*=========================*/ /*******************************************************************//** Tells the purge thread that there has been activity in the database and wakes up the purge thread if it is suspended (not sleeping). 
Note @@ -714,7 +781,7 @@ srv_purge_wakeup(void); /*==================*/ /** Status variables to be passed to MySQL */ -struct export_var_struct{ +struct export_var_t{ ulint innodb_data_pending_reads; /*!< Pending reads */ ulint innodb_data_pending_writes; /*!< Pending writes */ ulint innodb_data_pending_fsyncs; /*!< Pending fsyncs */ @@ -727,7 +794,9 @@ struct export_var_struct{ char innodb_buffer_pool_load_status[512];/*!< Buf pool load status */ ulint innodb_buffer_pool_pages_total; /*!< Buffer pool size */ ulint innodb_buffer_pool_pages_data; /*!< Data pages */ + ulint innodb_buffer_pool_bytes_data; /*!< File bytes used */ ulint innodb_buffer_pool_pages_dirty; /*!< Dirty data pages */ + ulint innodb_buffer_pool_bytes_dirty; /*!< File bytes modified */ ulint innodb_buffer_pool_pages_misc; /*!< Miscellanous pages */ ulint innodb_buffer_pool_pages_free; /*!< Free pages */ #ifdef UNIV_DEBUG @@ -771,10 +840,15 @@ struct export_var_struct{ ulint innodb_num_open_files; /*!< fil_n_file_opened */ ulint innodb_truncated_status_writes; /*!< srv_truncated_status_writes */ ulint innodb_available_undo_logs; /*!< srv_available_undo_logs */ +#ifdef UNIV_DEBUG + ulint innodb_purge_trx_id_age; /*!< rw_max_trx_id - purged trx_id */ + ulint innodb_purge_view_trx_id_age; /*!< rw_max_trx_id + - purged view's min trx_id */ +#endif /* UNIV_DEBUG */ }; /** Thread slot in the thread table. */ -struct srv_slot_struct{ +struct srv_slot_t{ srv_thread_type type; /*!< thread type: user, utility etc. 
*/ ibool in_use; /*!< TRUE if this slot @@ -803,6 +877,7 @@ struct srv_slot_struct{ # define srv_use_native_aio FALSE # define srv_force_recovery 0UL # define srv_set_io_thread_op_info(t,info) ((void) 0) +# define srv_reset_io_thread_op_info() ((void) 0) # define srv_is_being_started 0 # define srv_win_file_flush_method SRV_WIN_IO_UNBUFFERED # define srv_unix_file_flush_method SRV_UNIX_O_DSYNC diff --git a/storage/innobase/include/srv0start.h b/storage/innobase/include/srv0start.h index 9d948675011..e136f30f96a 100644 --- a/storage/innobase/include/srv0start.h +++ b/storage/innobase/include/srv0start.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -83,24 +83,50 @@ Starts Innobase and creates a new database if database files are not found and the user wants. @return DB_SUCCESS or error code */ UNIV_INTERN -int +dberr_t innobase_start_or_create_for_mysql(void); /*====================================*/ /****************************************************************//** Shuts down the Innobase database. @return DB_SUCCESS or error code */ UNIV_INTERN -int +dberr_t innobase_shutdown_for_mysql(void); /******************************************************************** Signal all per-table background threads to shutdown, and wait for them to do so. */ - +UNIV_INTERN void srv_shutdown_table_bg_threads(void); - /*=============================*/ + +/*************************************************************//** +Copy the file path component of the physical file to parameter. It will +copy up to and including the terminating path separator. 
+@return number of bytes copied or ULINT_UNDEFINED if destination buffer + is smaller than the path to be copied. */ +UNIV_INTERN +ulint +srv_path_copy( +/*==========*/ + char* dest, /*!< out: destination buffer */ + ulint dest_len, /*!< in: max bytes to copy */ + const char* basedir, /*!< in: base directory */ + const char* table_name) /*!< in: source table name */ + __attribute__((nonnull, warn_unused_result)); + +/*****************************************************************//** +Get the meta-data filename from the table name. */ +UNIV_INTERN +void +srv_get_meta_data_filename( +/*======================*/ + dict_table_t* table, /*!< in: table */ + char* filename, /*!< out: filename */ + ulint max_len) /*!< in: filename max length */ + __attribute__((nonnull)); + /** Log sequence number at shutdown */ extern lsn_t srv_shutdown_lsn; /** Log sequence number immediately after startup */ diff --git a/storage/innobase/include/sync0arr.h b/storage/innobase/include/sync0arr.h index 56f9ff78c49..bb4d1037a62 100644 --- a/storage/innobase/include/sync0arr.h +++ b/storage/innobase/include/sync0arr.h @@ -32,9 +32,9 @@ Created 9/5/1995 Heikki Tuuri #include "os0thread.h" /** Synchronization wait array cell */ -typedef struct sync_cell_struct sync_cell_t; +struct sync_cell_t; /** Synchronization wait array */ -typedef struct sync_array_struct sync_array_t; +struct sync_array_t; /******************************************************************//** Reserves a wait array cell for waiting for an object. 
diff --git a/storage/innobase/include/sync0rw.h b/storage/innobase/include/sync0rw.h index b0c21d0c76b..c268098d1ea 100644 --- a/storage/innobase/include/sync0rw.h +++ b/storage/innobase/include/sync0rw.h @@ -36,6 +36,7 @@ Created 9/11/1995 Heikki Tuuri #include "univ.i" #ifndef UNIV_HOTBACKUP #include "ut0lst.h" +#include "ut0counter.h" #include "sync0sync.h" #include "os0sync.h" @@ -44,6 +45,43 @@ in MySQL: */ #undef rw_lock_t #endif /* !UNIV_HOTBACKUP */ +/** Counters for RW locks. */ +struct rw_lock_stats_t { + typedef ib_counter_t<ib_int64_t, IB_N_SLOTS> ib_int64_counter_t; + + /** number of spin waits on rw-latches, + resulted during shared (read) locks */ + ib_int64_counter_t rw_s_spin_wait_count; + + /** number of spin loop rounds on rw-latches, + resulted during shared (read) locks */ + ib_int64_counter_t rw_s_spin_round_count; + + /** number of OS waits on rw-latches, + resulted during shared (read) locks */ + ib_int64_counter_t rw_s_os_wait_count; + + /** number of unlocks (that unlock shared locks), + set only when UNIV_SYNC_PERF_STAT is defined */ + ib_int64_counter_t rw_s_exit_count; + + /** number of spin waits on rw-latches, + resulted during exclusive (write) locks */ + ib_int64_counter_t rw_x_spin_wait_count; + + /** number of spin loop rounds on rw-latches, + resulted during exclusive (write) locks */ + ib_int64_counter_t rw_x_spin_round_count; + + /** number of OS waits on rw-latches, + resulted during exclusive (write) locks */ + ib_int64_counter_t rw_x_os_wait_count; + + /** number of unlocks (that unlock exclusive locks), + set only when UNIV_SYNC_PERF_STAT is defined */ + ib_int64_counter_t rw_x_exit_count; +}; + /* Latch types; these are used also in btr0btr.h: keep the numerical values smaller than 30 and the order of the numerical values like below! */ #define RW_S_LATCH 1 @@ -57,22 +95,22 @@ of concurrent read locks before the rw_lock breaks. 
The current value of 0x00100000 allows 1,048,575 concurrent readers and 2047 recursive writers.*/ #define X_LOCK_DECR 0x00100000 -typedef struct rw_lock_struct rw_lock_t; +struct rw_lock_t; #ifdef UNIV_SYNC_DEBUG -typedef struct rw_lock_debug_struct rw_lock_debug_t; +struct rw_lock_debug_t; #endif /* UNIV_SYNC_DEBUG */ typedef UT_LIST_BASE_NODE_T(rw_lock_t) rw_lock_list_t; extern rw_lock_list_t rw_lock_list; -extern mutex_t rw_lock_list_mutex; +extern ib_mutex_t rw_lock_list_mutex; #ifdef UNIV_SYNC_DEBUG /* The global mutex which protects debug info lists of all rw-locks. To modify the debug info list of an rw-lock, this mutex has to be acquired in addition to the mutex protecting the lock. */ -extern mutex_t rw_lock_debug_mutex; +extern ib_mutex_t rw_lock_debug_mutex; extern os_event_t rw_lock_debug_event; /*!< If deadlock detection does not get immediately the mutex it may wait for this event */ @@ -80,30 +118,8 @@ extern ibool rw_lock_debug_waiters; /*!< This is set to TRUE, if there may be waiters for the event */ #endif /* UNIV_SYNC_DEBUG */ -/** number of spin waits on rw-latches, -resulted during exclusive (write) locks */ -extern ib_int64_t rw_s_spin_wait_count; -/** number of spin loop rounds on rw-latches, -resulted during exclusive (write) locks */ -extern ib_int64_t rw_s_spin_round_count; -/** number of unlocks (that unlock shared locks), -set only when UNIV_SYNC_PERF_STAT is defined */ -extern ib_int64_t rw_s_exit_count; -/** number of OS waits on rw-latches, -resulted during shared (read) locks */ -extern ib_int64_t rw_s_os_wait_count; -/** number of spin waits on rw-latches, -resulted during shared (read) locks */ -extern ib_int64_t rw_x_spin_wait_count; -/** number of spin loop rounds on rw-latches, -resulted during shared (read) locks */ -extern ib_int64_t rw_x_spin_round_count; -/** number of OS waits on rw-latches, -resulted during exclusive (write) locks */ -extern ib_int64_t rw_x_os_wait_count; -/** number of unlocks (that unlock exclusive 
locks), -set only when UNIV_SYNC_PERF_STAT is defined */ -extern ib_int64_t rw_x_exit_count; +/** Counters for RW locks. */ +extern rw_lock_stats_t rw_lock_stats; #ifdef UNIV_PFS_RWLOCK /* Following are rwlock keys used to register with MySQL @@ -121,10 +137,10 @@ extern mysql_pfs_key_t checkpoint_lock_key; extern mysql_pfs_key_t fil_space_latch_key; extern mysql_pfs_key_t fts_cache_rw_lock_key; extern mysql_pfs_key_t fts_cache_init_rw_lock_key; -extern mysql_pfs_key_t index_tree_rw_lock_key; extern mysql_pfs_key_t trx_i_s_cache_lock_key; extern mysql_pfs_key_t trx_purge_latch_key; extern mysql_pfs_key_t index_tree_rw_lock_key; +extern mysql_pfs_key_t index_online_log_key; extern mysql_pfs_key_t dict_table_stats_latch_key; extern mysql_pfs_key_t trx_sys_rw_lock_key; extern mysql_pfs_key_t hash_table_rw_lock_key; @@ -159,6 +175,9 @@ unlocking, not the corresponding function. */ # define rw_lock_s_lock(M) \ rw_lock_s_lock_func((M), 0, __FILE__, __LINE__) +# define rw_lock_s_lock_inline(M, P, F, L) \ + rw_lock_s_lock_func((M), (P), (F), (L)) + # define rw_lock_s_lock_gen(M, P) \ rw_lock_s_lock_func((M), (P), __FILE__, __LINE__) @@ -175,12 +194,18 @@ unlocking, not the corresponding function. */ # define rw_lock_x_lock(M) \ rw_lock_x_lock_func((M), 0, __FILE__, __LINE__) +# define rw_lock_x_lock_inline(M, P, F, L) \ + rw_lock_x_lock_func((M), (P), (F), (L)) + # define rw_lock_x_lock_gen(M, P) \ rw_lock_x_lock_func((M), (P), __FILE__, __LINE__) # define rw_lock_x_lock_nowait(M) \ rw_lock_x_lock_func_nowait((M), __FILE__, __LINE__) +# define rw_lock_x_lock_func_nowait_inline(M, F, L) \ + rw_lock_x_lock_func_nowait((M), (F), (L)) + # ifdef UNIV_SYNC_DEBUG # define rw_lock_x_unlock_gen(L, P) rw_lock_x_unlock_func(P, L) # else @@ -212,6 +237,9 @@ unlocking, not the corresponding function. 
*/ # define rw_lock_s_lock(M) \ pfs_rw_lock_s_lock_func((M), 0, __FILE__, __LINE__) +# define rw_lock_s_lock_inline(M, P, F, L) \ + pfs_rw_lock_s_lock_func((M), (P), (F), (L)) + # define rw_lock_s_lock_gen(M, P) \ pfs_rw_lock_s_lock_func((M), (P), __FILE__, __LINE__) @@ -227,12 +255,18 @@ unlocking, not the corresponding function. */ # define rw_lock_x_lock(M) \ pfs_rw_lock_x_lock_func((M), 0, __FILE__, __LINE__) +# define rw_lock_x_lock_inline(M, P, F, L) \ + pfs_rw_lock_x_lock_func((M), (P), (F), (L)) + # define rw_lock_x_lock_gen(M, P) \ pfs_rw_lock_x_lock_func((M), (P), __FILE__, __LINE__) # define rw_lock_x_lock_nowait(M) \ pfs_rw_lock_x_lock_func_nowait((M), __FILE__, __LINE__) +# define rw_lock_x_lock_func_nowait_inline(M, F, L) \ + pfs_rw_lock_x_lock_func_nowait((M), (F), (L)) + # ifdef UNIV_SYNC_DEBUG # define rw_lock_x_unlock_gen(L, P) pfs_rw_lock_x_unlock_func(P, L) # else @@ -367,30 +401,6 @@ rw_lock_x_unlock_func( been passed to another thread to unlock */ #endif rw_lock_t* lock); /*!< in/out: rw-lock */ - - -/******************************************************************//** -Low-level function which locks an rw-lock in s-mode when we know that it -is possible and none else is currently accessing the rw-lock structure. -Then we can do the locking without reserving the mutex. */ -UNIV_INLINE -void -rw_lock_s_lock_direct( -/*==================*/ - rw_lock_t* lock, /*!< in/out: rw-lock */ - const char* file_name, /*!< in: file name where requested */ - ulint line); /*!< in: line where lock requested */ -/******************************************************************//** -Low-level function which locks an rw-lock in x-mode when we know that it -is not locked and none else is currently accessing the rw-lock structure. -Then we can do the locking without reserving the mutex. 
*/ -UNIV_INLINE -void -rw_lock_x_lock_direct( -/*==================*/ - rw_lock_t* lock, /*!< in/out: rw-lock */ - const char* file_name, /*!< in: file name where requested */ - ulint line); /*!< in: line where lock requested */ /******************************************************************//** This function is used in the insert buffer to move the ownership of an x-latch on a buffer frame to the current thread. The x-latch was set by @@ -558,7 +568,7 @@ shared locks are allowed. To prevent starving of a writer blocked by readers, a writer may queue for x-lock by decrementing lock_word: no new readers will be let in while the thread waits for readers to exit. */ -struct rw_lock_struct { +struct rw_lock_t { volatile lint lock_word; /*!< Holds the state of the lock. */ volatile ulint waiters;/*!< 1: there are waiters */ @@ -583,7 +593,7 @@ struct rw_lock_struct { /*!< Event for next-writer to wait on. A thread must decrement lock_word before waiting. */ #ifndef INNODB_RW_LOCKS_USE_ATOMICS - mutex_t mutex; /*!< The mutex protecting rw_lock_struct */ + ib_mutex_t mutex; /*!< The mutex protecting rw_lock_t */ #endif /* INNODB_RW_LOCKS_USE_ATOMICS */ UT_LIST_NODE_T(rw_lock_t) list; @@ -615,7 +625,7 @@ struct rw_lock_struct { unsigned last_x_line:14; /*!< Line number where last time x-locked */ #ifdef UNIV_DEBUG ulint magic_n; /*!< RW_LOCK_MAGIC_N */ -/** Value of rw_lock_struct::magic_n */ +/** Value of rw_lock_t::magic_n */ #define RW_LOCK_MAGIC_N 22643 #endif /* UNIV_DEBUG */ }; @@ -623,7 +633,7 @@ struct rw_lock_struct { #ifdef UNIV_SYNC_DEBUG /** The structure for storing debug info of an rw-lock. All access to this structure must be protected by rw_lock_debug_mutex_enter(). 
*/ -struct rw_lock_debug_struct { +struct rw_lock_debug_t { os_thread_id_t thread_id; /*!< The thread id of the thread which locked the rw-lock */ diff --git a/storage/innobase/include/sync0rw.ic b/storage/innobase/include/sync0rw.ic index eab89e2619e..8786ad84643 100644 --- a/storage/innobase/include/sync0rw.ic +++ b/storage/innobase/include/sync0rw.ic @@ -90,7 +90,7 @@ rw_lock_set_waiter_flag( rw_lock_t* lock) /*!< in/out: rw-lock */ { #ifdef INNODB_RW_LOCKS_USE_ATOMICS - os_compare_and_swap_ulint(&lock->waiters, 0, 1); + (void) os_compare_and_swap_ulint(&lock->waiters, 0, 1); #else /* INNODB_RW_LOCKS_USE_ATOMICS */ lock->waiters = 1; #endif /* INNODB_RW_LOCKS_USE_ATOMICS */ @@ -107,7 +107,7 @@ rw_lock_reset_waiter_flag( rw_lock_t* lock) /*!< in/out: rw-lock */ { #ifdef INNODB_RW_LOCKS_USE_ATOMICS - os_compare_and_swap_ulint(&lock->waiters, 1, 0); + (void) os_compare_and_swap_ulint(&lock->waiters, 1, 0); #else /* INNODB_RW_LOCKS_USE_ATOMICS */ lock->waiters = 0; #endif /* INNODB_RW_LOCKS_USE_ATOMICS */ @@ -128,7 +128,7 @@ rw_lock_get_writer( /* return NOT_LOCKED in s-lock state, like the writer member of the old lock implementation. */ return(RW_LOCK_NOT_LOCKED); - } else if (((-lock_word) % X_LOCK_DECR) == 0) { + } else if ((lock_word == 0) || (lock_word <= -X_LOCK_DECR)) { return(RW_LOCK_EX); } else { ut_ad(lock_word > -X_LOCK_DECR); @@ -158,7 +158,7 @@ rw_lock_get_reader_count( #ifndef INNODB_RW_LOCKS_USE_ATOMICS UNIV_INLINE -mutex_t* +ib_mutex_t* rw_lock_get_mutex( /*==============*/ rw_lock_t* lock) @@ -178,11 +178,10 @@ rw_lock_get_x_lock_count( const rw_lock_t* lock) /*!< in: rw-lock */ { lint lock_copy = lock->lock_word; - /* If there is a reader, lock_word is not divisible by X_LOCK_DECR */ - if (lock_copy > 0 || (-lock_copy) % X_LOCK_DECR != 0) { + if ((lock_copy != 0) && (lock_copy > -X_LOCK_DECR)) { return(0); } - return(((-lock_copy) / X_LOCK_DECR) + 1); + return((lock_copy == 0) ? 
1 : (2 - (lock_copy + X_LOCK_DECR))); } /******************************************************************//** @@ -325,58 +324,6 @@ rw_lock_s_lock_low( } /******************************************************************//** -Low-level function which locks an rw-lock in s-mode when we know that it -is possible and none else is currently accessing the rw-lock structure. -Then we can do the locking without reserving the mutex. */ -UNIV_INLINE -void -rw_lock_s_lock_direct( -/*==================*/ - rw_lock_t* lock, /*!< in/out: rw-lock */ - const char* file_name, /*!< in: file name where requested */ - ulint line) /*!< in: line where lock requested */ -{ - ut_ad(lock->lock_word == X_LOCK_DECR); - - /* Indicate there is a new reader by decrementing lock_word */ - lock->lock_word--; - - lock->last_s_file_name = file_name; - lock->last_s_line = line; - -#ifdef UNIV_SYNC_DEBUG - rw_lock_add_debug_info(lock, 0, RW_LOCK_SHARED, file_name, line); -#endif -} - -/******************************************************************//** -Low-level function which locks an rw-lock in x-mode when we know that it -is not locked and none else is currently accessing the rw-lock structure. -Then we can do the locking without reserving the mutex. */ -UNIV_INLINE -void -rw_lock_x_lock_direct( -/*==================*/ - rw_lock_t* lock, /*!< in/out: rw-lock */ - const char* file_name, /*!< in: file name where requested */ - ulint line) /*!< in: line where lock requested */ -{ - ut_ad(rw_lock_validate(lock)); - ut_ad(lock->lock_word == X_LOCK_DECR); - - lock->lock_word -= X_LOCK_DECR; - lock->writer_thread = os_thread_get_curr_id(); - lock->recursive = TRUE; - - lock->last_x_file_name = file_name; - lock->last_x_line = line; - -#ifdef UNIV_SYNC_DEBUG - rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line); -#endif -} - -/******************************************************************//** NOTE! Use the corresponding macro, not directly this function! 
Lock an rw-lock in shared mode for the current thread. If the rw-lock is locked in exclusive mode, or there is an exclusive lock request waiting, the @@ -458,10 +405,11 @@ rw_lock_x_lock_func_nowait( /* Relock: this lock_word modification is safe since no other threads can modify (lock, unlock, or reserve) lock_word while there is an exclusive writer and this is the writer thread. */ - lock->lock_word -= X_LOCK_DECR; - - /* Recursive x-locks must be multiples of X_LOCK_DECR. */ - ut_ad(((-lock->lock_word) % X_LOCK_DECR) == 0); + if (lock->lock_word == 0) { + lock->lock_word = -X_LOCK_DECR; + } else { + lock->lock_word--; + } /* Watch for too many recursive locks */ ut_ad(lock->lock_word < 0); @@ -494,7 +442,9 @@ rw_lock_s_unlock_func( #endif rw_lock_t* lock) /*!< in/out: rw-lock */ { - ut_ad((lock->lock_word % X_LOCK_DECR) != 0); + ut_ad(lock->lock_word > -X_LOCK_DECR); + ut_ad(lock->lock_word != 0); + ut_ad(lock->lock_word < X_LOCK_DECR); #ifdef UNIV_SYNC_DEBUG rw_lock_remove_debug_info(lock, pass, RW_LOCK_SHARED); @@ -530,7 +480,7 @@ rw_lock_x_unlock_func( #endif rw_lock_t* lock) /*!< in/out: rw-lock */ { - ut_ad((lock->lock_word % X_LOCK_DECR) == 0); + ut_ad(lock->lock_word == 0 || lock->lock_word <= -X_LOCK_DECR); /* lock->recursive flag also indicates if lock->writer_thread is valid or stale. If we are the last of the recursive callers @@ -541,15 +491,23 @@ rw_lock_x_unlock_func( if (lock->lock_word == 0) { /* Last caller in a possible recursive chain. 
*/ lock->recursive = FALSE; - UNIV_MEM_INVALID(&lock->writer_thread, - sizeof lock->writer_thread); } #ifdef UNIV_SYNC_DEBUG rw_lock_remove_debug_info(lock, pass, RW_LOCK_EX); #endif - if (rw_lock_lock_word_incr(lock, X_LOCK_DECR) == X_LOCK_DECR) { + ulint x_lock_incr; + if (lock->lock_word == 0) { + x_lock_incr = X_LOCK_DECR; + } else if (lock->lock_word == -X_LOCK_DECR) { + x_lock_incr = X_LOCK_DECR; + } else { + ut_ad(lock->lock_word < -X_LOCK_DECR); + x_lock_incr = 1; + } + + if (rw_lock_lock_word_incr(lock, x_lock_incr) == X_LOCK_DECR) { /* Lock is now free. May have to signal read/write waiters. We do not need to signal wait_ex waiters, since they cannot exist when there is a writer. */ @@ -590,7 +548,7 @@ pfs_rw_lock_create_func( ulint cline) /*!< in: file line where created */ { /* Initialize the rwlock for performance schema */ - lock->pfs_psi = PSI_CALL(init_rwlock)(key, lock); + lock->pfs_psi = PSI_RWLOCK_CALL(init_rwlock)(key, lock); /* The actual function to initialize an rwlock */ rw_lock_create_func(lock, @@ -623,13 +581,13 @@ pfs_rw_lock_x_lock_func( PSI_rwlock_locker_state state; /* Record the entry of rw x lock request in performance schema */ - locker = PSI_CALL(start_rwlock_wrwait)( + locker = PSI_RWLOCK_CALL(start_rwlock_wrwait)( &state, lock->pfs_psi, PSI_RWLOCK_WRITELOCK, file_name, line); rw_lock_x_lock_func(lock, pass, file_name, line); if (locker != NULL) - PSI_CALL(end_rwlock_wrwait)(locker, 0); + PSI_RWLOCK_CALL(end_rwlock_wrwait)(locker, 0); } else { @@ -659,13 +617,13 @@ pfs_rw_lock_x_lock_func_nowait( PSI_rwlock_locker_state state; /* Record the entry of rw x lock request in performance schema */ - locker = PSI_CALL(start_rwlock_wrwait)( + locker = PSI_RWLOCK_CALL(start_rwlock_wrwait)( &state, lock->pfs_psi, PSI_RWLOCK_WRITELOCK, file_name, line); ret = rw_lock_x_lock_func_nowait(lock, file_name, line); if (locker != NULL) - PSI_CALL(end_rwlock_wrwait)(locker, ret); + PSI_RWLOCK_CALL(end_rwlock_wrwait)(locker, ret); } else { @@ -686,7 
+644,7 @@ pfs_rw_lock_free_func( { if (lock->pfs_psi != NULL) { - PSI_CALL(destroy_rwlock)(lock->pfs_psi); + PSI_RWLOCK_CALL(destroy_rwlock)(lock->pfs_psi); lock->pfs_psi = NULL; } @@ -714,13 +672,13 @@ pfs_rw_lock_s_lock_func( PSI_rwlock_locker_state state; /* Instrumented to inform we are aquiring a shared rwlock */ - locker = PSI_CALL(start_rwlock_rdwait)( + locker = PSI_RWLOCK_CALL(start_rwlock_rdwait)( &state, lock->pfs_psi, PSI_RWLOCK_READLOCK, file_name, line); rw_lock_s_lock_func(lock, pass, file_name, line); if (locker != NULL) - PSI_CALL(end_rwlock_rdwait)(locker, 0); + PSI_RWLOCK_CALL(end_rwlock_rdwait)(locker, 0); } else { @@ -753,13 +711,13 @@ pfs_rw_lock_s_lock_low( PSI_rwlock_locker_state state; /* Instrumented to inform we are aquiring a shared rwlock */ - locker = PSI_CALL(start_rwlock_rdwait)( + locker = PSI_RWLOCK_CALL(start_rwlock_rdwait)( &state, lock->pfs_psi, PSI_RWLOCK_READLOCK, file_name, line); ret = rw_lock_s_lock_low(lock, pass, file_name, line); if (locker != NULL) - PSI_CALL(end_rwlock_rdwait)(locker, ret); + PSI_RWLOCK_CALL(end_rwlock_rdwait)(locker, ret); } else { @@ -786,7 +744,7 @@ pfs_rw_lock_x_unlock_func( { /* Inform performance schema we are unlocking the lock */ if (lock->pfs_psi != NULL) - PSI_CALL(unlock_rwlock)(lock->pfs_psi); + PSI_RWLOCK_CALL(unlock_rwlock)(lock->pfs_psi); rw_lock_x_unlock_func( #ifdef UNIV_SYNC_DEBUG @@ -812,7 +770,7 @@ pfs_rw_lock_s_unlock_func( { /* Inform performance schema we are unlocking the lock */ if (lock->pfs_psi != NULL) - PSI_CALL(unlock_rwlock)(lock->pfs_psi); + PSI_RWLOCK_CALL(unlock_rwlock)(lock->pfs_psi); rw_lock_s_unlock_func( #ifdef UNIV_SYNC_DEBUG diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h index 1adcf938903..9950a6fbf6b 100644 --- a/storage/innobase/include/sync0sync.h +++ b/storage/innobase/include/sync0sync.h @@ -1,7 +1,8 @@ /***************************************************************************** -Copyright (c) 1995, 2011, Oracle 
and/or its affiliates. All Rights Reserved. +Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. +Copyright (c) 2012, Facebook Inc. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -95,6 +96,7 @@ extern mysql_pfs_key_t mem_pool_mutex_key; extern mysql_pfs_key_t mutex_list_mutex_key; extern mysql_pfs_key_t purge_sys_bh_mutex_key; extern mysql_pfs_key_t recv_sys_mutex_key; +extern mysql_pfs_key_t recv_writer_mutex_key; extern mysql_pfs_key_t rseg_mutex_key; # ifdef UNIV_SYNC_DEBUG extern mysql_pfs_key_t rw_lock_debug_mutex_key; @@ -120,9 +122,13 @@ extern mysql_pfs_key_t srv_sys_tasks_mutex_key; #ifndef HAVE_ATOMIC_BUILTINS extern mysql_pfs_key_t srv_conc_mutex_key; #endif /* !HAVE_ATOMIC_BUILTINS */ +#ifndef HAVE_ATOMIC_BUILTINS_64 +extern mysql_pfs_key_t monitor_mutex_key; +#endif /* !HAVE_ATOMIC_BUILTINS_64 */ extern mysql_pfs_key_t event_os_mutex_key; extern mysql_pfs_key_t ut_list_mutex_key; extern mysql_pfs_key_t os_mutex_key; +extern mysql_pfs_key_t zip_pad_mutex_key; #endif /* UNIV_PFS_MUTEX */ /******************************************************************//** @@ -223,7 +229,7 @@ UNIV_INTERN void mutex_create_func( /*==============*/ - mutex_t* mutex, /*!< in: pointer to memory */ + ib_mutex_t* mutex, /*!< in: pointer to memory */ #ifdef UNIV_DEBUG const char* cmutex_name, /*!< in: mutex name */ # ifdef UNIV_SYNC_DEBUG @@ -242,7 +248,7 @@ UNIV_INTERN void mutex_free_func( /*============*/ - mutex_t* mutex); /*!< in: mutex */ + ib_mutex_t* mutex); /*!< in: mutex */ /**************************************************************//** NOTE! The following macro should be used in mutex locking, not the corresponding function. 
*/ @@ -259,7 +265,7 @@ UNIV_INLINE void mutex_enter_func( /*=============*/ - mutex_t* mutex, /*!< in: pointer to mutex */ + ib_mutex_t* mutex, /*!< in: pointer to mutex */ const char* file_name, /*!< in: file name where locked */ ulint line); /*!< in: line where locked */ /********************************************************************//** @@ -271,7 +277,7 @@ UNIV_INTERN ulint mutex_enter_nowait_func( /*====================*/ - mutex_t* mutex, /*!< in: pointer to mutex */ + ib_mutex_t* mutex, /*!< in: pointer to mutex */ const char* file_name, /*!< in: file name where mutex requested */ ulint line); /*!< in: line where requested */ @@ -282,7 +288,7 @@ UNIV_INLINE void mutex_exit_func( /*============*/ - mutex_t* mutex); /*!< in: pointer to mutex */ + ib_mutex_t* mutex); /*!< in: pointer to mutex */ #ifdef UNIV_PFS_MUTEX @@ -297,7 +303,7 @@ void pfs_mutex_create_func( /*==================*/ PSI_mutex_key key, /*!< in: Performance Schema key */ - mutex_t* mutex, /*!< in: pointer to memory */ + ib_mutex_t* mutex, /*!< in: pointer to memory */ # ifdef UNIV_DEBUG const char* cmutex_name, /*!< in: mutex name */ # ifdef UNIV_SYNC_DEBUG @@ -315,7 +321,7 @@ UNIV_INLINE void pfs_mutex_enter_func( /*=================*/ - mutex_t* mutex, /*!< in: pointer to mutex */ + ib_mutex_t* mutex, /*!< in: pointer to mutex */ const char* file_name, /*!< in: file name where locked */ ulint line); /*!< in: line where locked */ /********************************************************************//** @@ -328,7 +334,7 @@ UNIV_INLINE ulint pfs_mutex_enter_nowait_func( /*========================*/ - mutex_t* mutex, /*!< in: pointer to mutex */ + ib_mutex_t* mutex, /*!< in: pointer to mutex */ const char* file_name, /*!< in: file name where mutex requested */ ulint line); /*!< in: line where requested */ @@ -341,7 +347,7 @@ UNIV_INLINE void pfs_mutex_exit_func( /*================*/ - mutex_t* mutex); /*!< in: pointer to mutex */ + ib_mutex_t* mutex); /*!< in: pointer to mutex */ 
/******************************************************************//** NOTE! Please use the corresponding macro mutex_free(), not directly @@ -352,7 +358,7 @@ UNIV_INLINE void pfs_mutex_free_func( /*================*/ - mutex_t* mutex); /*!< in: mutex */ + ib_mutex_t* mutex); /*!< in: mutex */ #endif /* UNIV_PFS_MUTEX */ @@ -390,7 +396,7 @@ UNIV_INTERN ibool mutex_validate( /*===========*/ - const mutex_t* mutex); /*!< in: mutex */ + const ib_mutex_t* mutex); /*!< in: mutex */ /******************************************************************//** Checks that the current thread owns the mutex. Works only in the debug version. @@ -399,7 +405,7 @@ UNIV_INTERN ibool mutex_own( /*======*/ - const mutex_t* mutex) /*!< in: mutex */ + const ib_mutex_t* mutex) /*!< in: mutex */ __attribute__((warn_unused_result)); #endif /* UNIV_DEBUG */ #ifdef UNIV_SYNC_DEBUG @@ -470,7 +476,7 @@ UNIV_INTERN void mutex_get_debug_info( /*=================*/ - mutex_t* mutex, /*!< in: mutex */ + ib_mutex_t* mutex, /*!< in: mutex */ const char** file_name, /*!< out: file where requested */ ulint* line, /*!< out: line where requested */ os_thread_id_t* thread_id); /*!< out: id of the thread which owns @@ -490,7 +496,7 @@ UNIV_INLINE lock_word_t mutex_get_lock_word( /*================*/ - const mutex_t* mutex); /*!< in: mutex */ + const ib_mutex_t* mutex); /*!< in: mutex */ #ifdef UNIV_SYNC_DEBUG /******************************************************************//** NOT to be used outside this module except in debugging! Gets the waiters @@ -500,7 +506,7 @@ UNIV_INLINE ulint mutex_get_waiters( /*==============*/ - const mutex_t* mutex); /*!< in: mutex */ + const ib_mutex_t* mutex); /*!< in: mutex */ #endif /* UNIV_SYNC_DEBUG */ /* @@ -662,6 +668,7 @@ or row lock! 
*/ #define SYNC_FTS_CACHE 1005 /* FTS cache rwlock */ #define SYNC_DICT 1000 #define SYNC_DICT_AUTOINC_MUTEX 999 +#define SYNC_STATS_AUTO_RECALC 997 #define SYNC_DICT_HEADER 995 #define SYNC_IBUF_HEADER 914 #define SYNC_IBUF_PESS_INSERT_MUTEX 912 @@ -679,14 +686,16 @@ or row lock! */ #define SYNC_EXTERN_STORAGE 500 #define SYNC_FSP 400 #define SYNC_FSP_PAGE 395 -/*------------------------------------- Insert buffer headers */ +/*------------------------------------- Change buffer headers */ #define SYNC_IBUF_MUTEX 370 /* ibuf_mutex */ -/*------------------------------------- Insert buffer tree */ +/*------------------------------------- Change buffer tree */ #define SYNC_IBUF_INDEX_TREE 360 #define SYNC_IBUF_TREE_NODE_NEW 359 #define SYNC_IBUF_TREE_NODE 358 #define SYNC_IBUF_BITMAP_MUTEX 351 #define SYNC_IBUF_BITMAP 350 +/*------------------------------------- Change log for online create index */ +#define SYNC_INDEX_ONLINE_LOG 340 /*------------------------------------- MySQL query cache mutex */ /*------------------------------------- MySQL binlog mutex */ /*-------------------------------*/ @@ -733,7 +742,7 @@ Do not use its fields directly! The structure used in the spin lock implementation of a mutual exclusion semaphore. */ /** InnoDB mutex */ -struct mutex_struct { +struct ib_mutex_t { os_event_t event; /*!< Used by sync0arr.cc for the wait queue */ volatile lock_word_t lock_word; /*!< lock_word is the target of the atomic test-and-set instruction when @@ -748,7 +757,7 @@ struct mutex_struct { may be) threads waiting in the global wait array for this mutex to be released. Otherwise, this is 0. */ - UT_LIST_NODE_T(mutex_t) list; /*!< All allocated mutexes are put into + UT_LIST_NODE_T(ib_mutex_t) list; /*!< All allocated mutexes are put into a list. Pointers to the next and prev. 
*/ #ifdef UNIV_SYNC_DEBUG const char* file_name; /*!< File where the mutex was locked */ @@ -757,23 +766,17 @@ struct mutex_struct { #endif /* UNIV_SYNC_DEBUG */ const char* cfile_name;/*!< File name where mutex created */ ulint cline; /*!< Line where created */ + ulong count_os_wait; /*!< count of os_wait */ #ifdef UNIV_DEBUG + +/** Value of mutex_t::magic_n */ +# define MUTEX_MAGIC_N 979585UL + os_thread_id_t thread_id; /*!< The thread id of the thread which locked the mutex. */ ulint magic_n; /*!< MUTEX_MAGIC_N */ -/** Value of mutex_struct::magic_n */ -# define MUTEX_MAGIC_N (ulint)979585 -#endif /* UNIV_DEBUG */ - ulong count_os_wait; /*!< count of os_wait */ -#ifdef UNIV_DEBUG - ulong count_using; /*!< count of times mutex used */ - ulong count_spin_loop; /*!< count of spin loops */ - ulong count_spin_rounds;/*!< count of spin rounds */ - ulong count_os_yield; /*!< count of os_wait */ - ulonglong lspent_time; /*!< mutex os_wait timer msec */ - ulonglong lmax_spent_time;/*!< mutex os_wait timer msec */ const char* cmutex_name; /*!< mutex name */ - ulint mutex_type; /*!< 0=usual mutex, 1=rw_lock mutex */ + ulint ib_mutex_type; /*!< 0=usual mutex, 1=rw_lock mutex */ #endif /* UNIV_DEBUG */ #ifdef UNIV_PFS_MUTEX struct PSI_mutex* pfs_psi; /*!< The performance schema @@ -799,12 +802,12 @@ extern ibool sync_order_checks_on; extern ibool sync_initialized; /** Global list of database mutexes (not OS mutexes) created. */ -typedef UT_LIST_BASE_NODE_T(mutex_t) ut_list_base_node_t; +typedef UT_LIST_BASE_NODE_T(ib_mutex_t) ut_list_base_node_t; /** Global list of database mutexes (not OS mutexes) created. 
*/ extern ut_list_base_node_t mutex_list; /** Mutex protecting the mutex_list variable */ -extern mutex_t mutex_list_mutex; +extern ib_mutex_t mutex_list_mutex; #ifndef HAVE_ATOMIC_BUILTINS /**********************************************************//** @@ -813,7 +816,7 @@ UNIV_INLINE void os_atomic_dec_ulint_func( /*=====================*/ - mutex_t* mutex, /*!< in: mutex guarding the + ib_mutex_t* mutex, /*!< in: mutex guarding the decrement */ volatile ulint* var, /*!< in/out: variable to decrement */ @@ -824,7 +827,7 @@ UNIV_INLINE void os_atomic_inc_ulint_func( /*=====================*/ - mutex_t* mutex, /*!< in: mutex guarding the + ib_mutex_t* mutex, /*!< in: mutex guarding the increment */ volatile ulint* var, /*!< in/out: variable to increment */ diff --git a/storage/innobase/include/sync0sync.ic b/storage/innobase/include/sync0sync.ic index 746e73ebee7..ad77ad6d5a4 100644 --- a/storage/innobase/include/sync0sync.ic +++ b/storage/innobase/include/sync0sync.ic @@ -36,7 +36,7 @@ UNIV_INTERN void mutex_set_waiters( /*==============*/ - mutex_t* mutex, /*!< in: mutex */ + ib_mutex_t* mutex, /*!< in: mutex */ ulint n); /*!< in: value to set */ /******************************************************************//** Reserves a mutex for the current thread. 
If the mutex is reserved, the @@ -46,7 +46,7 @@ UNIV_INTERN void mutex_spin_wait( /*============*/ - mutex_t* mutex, /*!< in: pointer to mutex */ + ib_mutex_t* mutex, /*!< in: pointer to mutex */ const char* file_name, /*!< in: file name where mutex requested */ ulint line); /*!< in: line where requested */ @@ -57,7 +57,7 @@ UNIV_INTERN void mutex_set_debug_info( /*=================*/ - mutex_t* mutex, /*!< in: mutex */ + ib_mutex_t* mutex, /*!< in: mutex */ const char* file_name, /*!< in: file where requested */ ulint line); /*!< in: line where requested */ #endif /* UNIV_SYNC_DEBUG */ @@ -67,7 +67,7 @@ UNIV_INTERN void mutex_signal_object( /*================*/ - mutex_t* mutex); /*!< in: mutex */ + ib_mutex_t* mutex); /*!< in: mutex */ /******************************************************************//** Performs an atomic test-and-set instruction to the lock_word field of a @@ -75,9 +75,9 @@ mutex. @return the previous value of lock_word: 0 or 1 */ UNIV_INLINE byte -mutex_test_and_set( +ib_mutex_test_and_set( /*===============*/ - mutex_t* mutex) /*!< in: mutex */ + ib_mutex_t* mutex) /*!< in: mutex */ { #if defined(HAVE_ATOMIC_BUILTINS) return(os_atomic_test_and_set_byte(&mutex->lock_word, 1)); @@ -105,7 +105,7 @@ UNIV_INLINE void mutex_reset_lock_word( /*==================*/ - mutex_t* mutex) /*!< in: mutex */ + ib_mutex_t* mutex) /*!< in: mutex */ { #if defined(HAVE_ATOMIC_BUILTINS) /* In theory __sync_lock_release should be used to release the lock. 
@@ -125,7 +125,7 @@ UNIV_INLINE lock_word_t mutex_get_lock_word( /*================*/ - const mutex_t* mutex) /*!< in: mutex */ + const ib_mutex_t* mutex) /*!< in: mutex */ { ut_ad(mutex); @@ -139,7 +139,7 @@ UNIV_INLINE ulint mutex_get_waiters( /*==============*/ - const mutex_t* mutex) /*!< in: mutex */ + const ib_mutex_t* mutex) /*!< in: mutex */ { const volatile ulint* ptr; /*!< declared volatile to ensure that the value is read from memory */ @@ -158,7 +158,7 @@ UNIV_INLINE void mutex_exit_func( /*============*/ - mutex_t* mutex) /*!< in: pointer to mutex */ + ib_mutex_t* mutex) /*!< in: pointer to mutex */ { ut_ad(mutex_own(mutex)); @@ -199,7 +199,7 @@ UNIV_INLINE void mutex_enter_func( /*=============*/ - mutex_t* mutex, /*!< in: pointer to mutex */ + ib_mutex_t* mutex, /*!< in: pointer to mutex */ const char* file_name, /*!< in: file name where locked */ ulint line) /*!< in: line where locked */ { @@ -209,9 +209,7 @@ mutex_enter_func( /* Note that we do not peek at the value of lock_word before trying the atomic test_and_set; we could peek, and possibly save time. 
*/ - ut_d(mutex->count_using++); - - if (!mutex_test_and_set(mutex)) { + if (!ib_mutex_test_and_set(mutex)) { ut_d(mutex->thread_id = os_thread_get_curr_id()); #ifdef UNIV_SYNC_DEBUG mutex_set_debug_info(mutex, file_name, line); @@ -232,28 +230,28 @@ UNIV_INLINE void pfs_mutex_enter_func( /*=================*/ - mutex_t* mutex, /*!< in: pointer to mutex */ + ib_mutex_t* mutex, /*!< in: pointer to mutex */ const char* file_name, /*!< in: file name where locked */ ulint line) /*!< in: line where locked */ { - if (mutex->pfs_psi != NULL) - { + if (mutex->pfs_psi != NULL) { PSI_mutex_locker* locker; PSI_mutex_locker_state state; - locker = PSI_CALL(start_mutex_wait)(&state, mutex->pfs_psi, + locker = PSI_MUTEX_CALL(start_mutex_wait)( + &state, mutex->pfs_psi, PSI_MUTEX_LOCK, file_name, line); mutex_enter_func(mutex, file_name, line); - if (locker != NULL) - PSI_CALL(end_mutex_wait)(locker, 0); - } - else - { + if (locker != NULL) { + PSI_MUTEX_CALL(end_mutex_wait)(locker, 0); + } + } else { mutex_enter_func(mutex, file_name, line); } } + /********************************************************************//** NOTE! Please use the corresponding macro mutex_enter_nowait(), not directly this function! 
@@ -264,33 +262,33 @@ UNIV_INLINE ulint pfs_mutex_enter_nowait_func( /*========================*/ - mutex_t* mutex, /*!< in: pointer to mutex */ + ib_mutex_t* mutex, /*!< in: pointer to mutex */ const char* file_name, /*!< in: file name where mutex requested */ ulint line) /*!< in: line where requested */ { - ulint ret; + ulint ret; - if (mutex->pfs_psi != NULL) - { + if (mutex->pfs_psi != NULL) { PSI_mutex_locker* locker; PSI_mutex_locker_state state; - locker = PSI_CALL(start_mutex_wait)(&state, mutex->pfs_psi, + locker = PSI_MUTEX_CALL(start_mutex_wait)( + &state, mutex->pfs_psi, PSI_MUTEX_TRYLOCK, file_name, line); ret = mutex_enter_nowait_func(mutex, file_name, line); - if (locker != NULL) - PSI_CALL(end_mutex_wait)(locker, (int) ret); - } - else - { + if (locker != NULL) { + PSI_MUTEX_CALL(end_mutex_wait)(locker, (int) ret); + } + } else { ret = mutex_enter_nowait_func(mutex, file_name, line); } return(ret); } + /******************************************************************//** NOTE! Please use the corresponding macro mutex_exit(), not directly this function! 
@@ -300,10 +298,11 @@ UNIV_INLINE void pfs_mutex_exit_func( /*================*/ - mutex_t* mutex) /*!< in: pointer to mutex */ + ib_mutex_t* mutex) /*!< in: pointer to mutex */ { - if (mutex->pfs_psi != NULL) - PSI_CALL(unlock_mutex)(mutex->pfs_psi); + if (mutex->pfs_psi != NULL) { + PSI_MUTEX_CALL(unlock_mutex)(mutex->pfs_psi); + } mutex_exit_func(mutex); } @@ -319,7 +318,7 @@ void pfs_mutex_create_func( /*==================*/ mysql_pfs_key_t key, /*!< in: Performance Schema key */ - mutex_t* mutex, /*!< in: pointer to memory */ + ib_mutex_t* mutex, /*!< in: pointer to memory */ # ifdef UNIV_DEBUG const char* cmutex_name, /*!< in: mutex name */ # ifdef UNIV_SYNC_DEBUG @@ -329,7 +328,7 @@ pfs_mutex_create_func( const char* cfile_name, /*!< in: file name where created */ ulint cline) /*!< in: file line where created */ { - mutex->pfs_psi = PSI_CALL(init_mutex)(key, mutex); + mutex->pfs_psi = PSI_MUTEX_CALL(init_mutex)(key, mutex); mutex_create_func(mutex, # ifdef UNIV_DEBUG @@ -341,6 +340,7 @@ pfs_mutex_create_func( cfile_name, cline); } + /******************************************************************//** NOTE! Please use the corresponding macro mutex_free(), not directly this function! 
@@ -350,11 +350,10 @@ UNIV_INLINE void pfs_mutex_free_func( /*================*/ - mutex_t* mutex) /*!< in: mutex */ + ib_mutex_t* mutex) /*!< in: mutex */ { - if (mutex->pfs_psi != NULL) - { - PSI_CALL(destroy_mutex)(mutex->pfs_psi); + if (mutex->pfs_psi != NULL) { + PSI_MUTEX_CALL(destroy_mutex)(mutex->pfs_psi); mutex->pfs_psi = NULL; } @@ -370,7 +369,7 @@ UNIV_INLINE void os_atomic_dec_ulint_func( /*=====================*/ - mutex_t* mutex, /*!< in: mutex guarding the dec */ + ib_mutex_t* mutex, /*!< in: mutex guarding the dec */ volatile ulint* var, /*!< in/out: variable to decrement */ ulint delta) /*!< in: delta to decrement */ { @@ -391,7 +390,7 @@ UNIV_INLINE void os_atomic_inc_ulint_func( /*=====================*/ - mutex_t* mutex, /*!< in: mutex guarding the increment */ + ib_mutex_t* mutex, /*!< in: mutex guarding the increment */ volatile ulint* var, /*!< in/out: variable to increment */ ulint delta) /*!< in: delta to increment */ { diff --git a/storage/innobase/include/sync0types.h b/storage/innobase/include/sync0types.h index 679cf6a9074..0d143004a7a 100644 --- a/storage/innobase/include/sync0types.h +++ b/storage/innobase/include/sync0types.h @@ -26,9 +26,6 @@ Created 9/5/1995 Heikki Tuuri #ifndef sync0types_h #define sync0types_h -/** Rename mutex_t to avoid name space collision on some systems */ -#define mutex_t ib_mutex_t -/** InnoDB mutex */ -typedef struct mutex_struct mutex_t; +struct ib_mutex_t; #endif diff --git a/storage/innobase/include/trx0i_s.h b/storage/innobase/include/trx0i_s.h index c286fc4d9ae..662971a7841 100644 --- a/storage/innobase/include/trx0i_s.h +++ b/storage/innobase/include/trx0i_s.h @@ -79,25 +79,21 @@ do { \ } while (0) /** A row of INFORMATION_SCHEMA.innodb_locks */ -typedef struct i_s_locks_row_struct i_s_locks_row_t; -/** A row of INFORMATION_SCHEMA.innodb_trx */ -typedef struct i_s_trx_row_struct i_s_trx_row_t; -/** A row of INFORMATION_SCHEMA.innodb_lock_waits */ -typedef struct i_s_lock_waits_row_struct 
i_s_lock_waits_row_t; +struct i_s_locks_row_t; /** Objects of trx_i_s_cache_t::locks_hash */ -typedef struct i_s_hash_chain_struct i_s_hash_chain_t; +struct i_s_hash_chain_t; /** Objects of this type are added to the hash table trx_i_s_cache_t::locks_hash */ -struct i_s_hash_chain_struct { +struct i_s_hash_chain_t { i_s_locks_row_t* value; /*!< row of INFORMATION_SCHEMA.innodb_locks*/ i_s_hash_chain_t* next; /*!< next item in the hash chain */ }; /** This structure represents INFORMATION_SCHEMA.innodb_locks row */ -struct i_s_locks_row_struct { +struct i_s_locks_row_t { trx_id_t lock_trx_id; /*!< transaction identifier */ const char* lock_mode; /*!< lock mode from lock_get_mode_str() */ @@ -128,16 +124,16 @@ struct i_s_locks_row_struct { }; /** This structure represents INFORMATION_SCHEMA.innodb_trx row */ -struct i_s_trx_row_struct { +struct i_s_trx_row_t { trx_id_t trx_id; /*!< transaction identifier */ const char* trx_state; /*!< transaction state from trx_get_que_state_str() */ - ib_time_t trx_started; /*!< trx_struct::start_time */ + ib_time_t trx_started; /*!< trx_t::start_time */ const i_s_locks_row_t* requested_lock_row; /*!< pointer to a row in innodb_locks if trx is waiting, or NULL */ - ib_time_t trx_wait_started; /*!< trx_struct::wait_started */ + ib_time_t trx_wait_started; /*!< trx_t::wait_started */ ullint trx_weight; /*!< TRX_WEIGHT() */ ulint trx_mysql_thread_id; /*!< thd_get_thread_id() */ const char* trx_query; /*!< MySQL statement being @@ -145,36 +141,34 @@ struct i_s_trx_row_struct { struct charset_info_st* trx_query_cs; /*!< charset encode the MySQL statement */ - const char* trx_operation_state; /*!< trx_struct::op_info */ + const char* trx_operation_state; /*!< trx_t::op_info */ ulint trx_tables_in_use;/*!< n_mysql_tables_in_use in - trx_struct */ + trx_t */ ulint trx_tables_locked; /*!< mysql_n_tables_locked in - trx_struct */ + trx_t */ ulint trx_lock_structs;/*!< list len of trx_locks in - trx_struct */ + trx_t */ ulint 
trx_lock_memory_bytes; /*!< mem_heap_get_size( trx->lock_heap) */ ulint trx_rows_locked;/*!< lock_number_of_rows_locked() */ - ullint trx_rows_modified;/*!< trx_struct::undo_no */ + ullint trx_rows_modified;/*!< trx_t::undo_no */ ulint trx_concurrency_tickets; /*!< n_tickets_to_enter_innodb in - trx_struct */ + trx_t */ const char* trx_isolation_level; - /*!< isolation_level in trx_struct*/ + /*!< isolation_level in trx_t */ ibool trx_unique_checks; - /*!< check_unique_secondary in - trx_struct*/ + /*!< check_unique_secondary in trx_t*/ ibool trx_foreign_key_checks; - /*!< check_foreigns in trx_struct */ + /*!< check_foreigns in trx_t */ const char* trx_foreign_key_error; - /*!< detailed_error in trx_struct */ + /*!< detailed_error in trx_t */ ibool trx_has_search_latch; - /*!< has_search_latch in trx_struct */ + /*!< has_search_latch in trx_t */ ulint trx_search_latch_timeout; - /*!< search_latch_timeout in - trx_struct */ + /*!< search_latch_timeout in trx_t */ ulint trx_is_read_only; /*!< trx_t::read_only */ ulint trx_is_autocommit_non_locking; @@ -183,13 +177,13 @@ struct i_s_trx_row_struct { }; /** This structure represents INFORMATION_SCHEMA.innodb_lock_waits row */ -struct i_s_lock_waits_row_struct { +struct i_s_lock_waits_row_t { const i_s_locks_row_t* requested_lock_row; /*!< requested lock */ const i_s_locks_row_t* blocking_lock_row; /*!< blocking lock */ }; /** Cache of INFORMATION_SCHEMA table data */ -typedef struct trx_i_s_cache_struct trx_i_s_cache_t; +struct trx_i_s_cache_t; /** Auxiliary enum used by functions that need to select one of the INFORMATION_SCHEMA tables */ diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h index 0199083467c..1e13c883800 100644 --- a/storage/innobase/include/trx0purge.h +++ b/storage/innobase/include/trx0purge.h @@ -108,7 +108,8 @@ enum purge_state_t { PURGE_STATE_INIT, /*!< Purge instance created */ PURGE_STATE_RUN, /*!< Purge should be running */ PURGE_STATE_STOP, /*!< Purge 
should be stopped */ - PURGE_STATE_EXIT /*!< Purge has been shutdown */ + PURGE_STATE_EXIT, /*!< Purge has been shutdown */ + PURGE_STATE_DISABLED /*!< Purge was never started */ }; /*******************************************************************//** @@ -121,16 +122,16 @@ trx_purge_state(void); /** This is the purge pointer/iterator. We need both the undo no and the transaction no up to which purge has parsed and applied the records. */ -typedef struct purge_iter_struct { +struct purge_iter_t { trx_id_t trx_no; /*!< Purge has advanced past all transactions whose number is less than this */ undo_no_t undo_no; /*!< Purge has advanced past all records whose undo number is less than this */ -} purge_iter_t; +}; /** The control structure used in the purge operation */ -struct trx_purge_struct{ +struct trx_purge_t{ sess_t* sess; /*!< System session running the purge query */ trx_t* trx; /*!< System transaction running the @@ -146,7 +147,8 @@ struct trx_purge_struct{ protects state and running */ os_event_t event; /*!< State signal event */ ulint n_stop; /*!< Counter to track number stops */ - bool running; /*!< true, if purge is active */ + volatile bool running; /*!< true, if purge is active, + we check this without the latch too */ volatile purge_state_t state; /*!< Purge coordinator thread states, we check this in several places without holding the latch. */ @@ -171,6 +173,10 @@ struct trx_purge_struct{ purge_iter_t limit; /* The 'purge pointer' which advances during a purge, and which is used in history list truncation */ +#ifdef UNIV_DEBUG + purge_iter_t done; /* Indicate 'purge pointer' which have + purged already accurately. */ +#endif /* UNIV_DEBUG */ /*-----------------------------*/ ibool next_stored; /*!< TRUE if the info of the next record to purge is stored below: if yes, then @@ -196,17 +202,15 @@ struct trx_purge_struct{ ib_bh_t* ib_bh; /*!< Binary min-heap, ordered on rseg_queue_t::trx_no. 
It is protected by the bh_mutex */ - mutex_t bh_mutex; /*!< Mutex protecting ib_bh */ + ib_mutex_t bh_mutex; /*!< Mutex protecting ib_bh */ }; /** Info required to purge a record */ -struct trx_purge_rec_struct { +struct trx_purge_rec_t { trx_undo_rec_t* undo_rec; /*!< Record to purge */ roll_ptr_t roll_ptr; /*!< File pointr to UNDO record */ }; -typedef struct trx_purge_rec_struct trx_purge_rec_t; - #ifndef UNIV_NONINL #include "trx0purge.ic" #endif diff --git a/storage/innobase/include/trx0rec.h b/storage/innobase/include/trx0rec.h index c9fae45dad4..cd1ecc096fd 100644 --- a/storage/innobase/include/trx0rec.h +++ b/storage/innobase/include/trx0rec.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -105,10 +105,11 @@ trx_undo_rec_get_pars( TRX_UNDO_INSERT_REC, ... */ ulint* cmpl_info, /*!< out: compiler info, relevant only for update type records */ - ibool* updated_extern, /*!< out: TRUE if we updated an + bool* updated_extern, /*!< out: true if we updated an externally stored fild */ undo_no_t* undo_no, /*!< out: undo log record number */ - table_id_t* table_id); /*!< out: table id */ + table_id_t* table_id) /*!< out: table id */ + __attribute__((nonnull, warn_unused_result)); /*******************************************************************//** Builds a row reference from an undo log record. @return pointer to remaining part of undo record */ @@ -178,8 +179,9 @@ trx_undo_update_rec_get_update( needed is allocated */ upd_t** upd); /*!< out, own: update vector */ /*******************************************************************//** -Builds a partial row from an update undo log record. 
It contains the -columns which occur as ordering in any index of the table. +Builds a partial row from an update undo log record, for purge. +It contains the columns which occur as ordering in any index of the table. +Any missing columns are indicated by col->mtype == DATA_MISSING. @return pointer to remaining part of undo record */ UNIV_INTERN byte* @@ -197,8 +199,9 @@ trx_undo_rec_get_partial_row( ibool ignore_prefix, /*!< in: flag to indicate if we expect blob prefixes in undo. Used only in the assertion. */ - mem_heap_t* heap); /*!< in: memory heap from which the memory + mem_heap_t* heap) /*!< in: memory heap from which the memory needed is allocated */ + __attribute__((nonnull, warn_unused_result)); /***********************************************************************//** Writes information to an undo log about an insert, update, or a delete marking of a clustered index record. This information is used in a rollback of the @@ -206,7 +209,7 @@ transaction and in consistent reads that must look to the history of this transaction. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t trx_undo_report_row_operation( /*==========================*/ ulint flags, /*!< in: if BTR_NO_UNDO_LOG_FLAG bit is @@ -225,10 +228,12 @@ trx_undo_report_row_operation( const rec_t* rec, /*!< in: case of an update or delete marking, the record in the clustered index, otherwise NULL */ - roll_ptr_t* roll_ptr); /*!< out: rollback pointer to the + const ulint* offsets, /*!< in: rec_get_offsets(rec) */ + roll_ptr_t* roll_ptr) /*!< out: rollback pointer to the inserted undo log record, 0 if BTR_NO_UNDO_LOG flag was specified */ + __attribute__((nonnull(3,4,10), warn_unused_result)); /******************************************************************//** Copies an undo record to heap. This function can be called if we know that the undo log record exists. 
@@ -238,16 +243,17 @@ trx_undo_rec_t* trx_undo_get_undo_rec_low( /*======================*/ roll_ptr_t roll_ptr, /*!< in: roll pointer to record */ - mem_heap_t* heap); /*!< in: memory heap where copied */ + mem_heap_t* heap) /*!< in: memory heap where copied */ + __attribute__((nonnull, warn_unused_result)); /*******************************************************************//** Build a previous version of a clustered index record. The caller must -hold a latch on the index page of the clustered index record, to -guarantee that the stack of versions is locked all the way down to the -purge_sys->view. -@return DB_SUCCESS, or DB_MISSING_HISTORY if the previous version is -earlier than purge_view, which means that it may have been removed */ +hold a latch on the index page of the clustered index record. +@retval true if previous version was built, or if it was an insert +or the table has been rebuilt +@retval false if the previous version is earlier than purge_view, +which means that it may have been removed */ UNIV_INTERN -ulint +bool trx_undo_prev_version_build( /*========================*/ const rec_t* index_rec,/*!< in: clustered index record in the @@ -256,12 +262,13 @@ trx_undo_prev_version_build( index_rec page and purge_view */ const rec_t* rec, /*!< in: version of a clustered index record */ dict_index_t* index, /*!< in: clustered index */ - ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */ mem_heap_t* heap, /*!< in: memory heap from which the memory needed is allocated */ - rec_t** old_vers);/*!< out, own: previous version, or NULL if + rec_t** old_vers)/*!< out, own: previous version, or NULL if rec is the first inserted version, or if history data has been deleted */ + __attribute__((nonnull)); #endif /* !UNIV_HOTBACKUP */ /***********************************************************//** Parses a redo log record of adding an undo log record. 
diff --git a/storage/innobase/include/trx0rec.ic b/storage/innobase/include/trx0rec.ic index 847c26f03a8..08704f6b821 100644 --- a/storage/innobase/include/trx0rec.ic +++ b/storage/innobase/include/trx0rec.ic @@ -90,7 +90,7 @@ trx_undo_rec_get_offset( /*====================*/ undo_no_t undo_no) /*!< in: undo no read from node */ { - return (3 + mach_ull_get_much_compressed_size(undo_no)); + return(3 + mach_ull_get_much_compressed_size(undo_no)); } /***********************************************************************//** diff --git a/storage/innobase/include/trx0roll.h b/storage/innobase/include/trx0roll.h index 3b724e03830..9d020a10725 100644 --- a/storage/innobase/include/trx0roll.h +++ b/storage/innobase/include/trx0roll.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -146,29 +146,32 @@ trx_rollback_step( Rollback a transaction used in MySQL. @return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t trx_rollback_for_mysql( /*===================*/ - trx_t* trx); /*!< in/out: transaction */ + trx_t* trx) /*!< in/out: transaction */ + __attribute__((nonnull)); /*******************************************************************//** Rollback the latest SQL statement for MySQL. @return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t trx_rollback_last_sql_stat_for_mysql( /*=================================*/ - trx_t* trx); /*!< in/out: transaction */ + trx_t* trx) /*!< in/out: transaction */ + __attribute__((nonnull)); /*******************************************************************//** Rollback a transaction to a given savepoint or do a complete rollback. 
@return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t trx_rollback_to_savepoint( /*======================*/ trx_t* trx, /*!< in: transaction handle */ - trx_savept_t* savept);/*!< in: pointer to savepoint undo number, if + trx_savept_t* savept) /*!< in: pointer to savepoint undo number, if partial rollback requested, or NULL for complete rollback */ + __attribute__((nonnull(1))); /*******************************************************************//** Rolls back a transaction back to a named savepoint. Modifications after the savepoint are undone but InnoDB does NOT release the corresponding locks @@ -179,17 +182,18 @@ were set after this savepoint are deleted. @return if no savepoint of the name found then DB_NO_SAVEPOINT, otherwise DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t trx_rollback_to_savepoint_for_mysql( /*================================*/ trx_t* trx, /*!< in: transaction handle */ const char* savepoint_name, /*!< in: savepoint name */ - ib_int64_t* mysql_binlog_cache_pos);/*!< out: the MySQL binlog cache + ib_int64_t* mysql_binlog_cache_pos) /*!< out: the MySQL binlog cache position corresponding to this savepoint; MySQL needs this information to remove the binlog entries of the queries executed after the savepoint */ + __attribute__((nonnull, warn_unused_result)); /*******************************************************************//** Creates a named savepoint. If the transaction is not yet started, starts it. If there is already a savepoint of the same name, this call erases that old @@ -197,28 +201,28 @@ savepoint and replaces it with a new. Savepoints are deleted in a transaction commit or rollback. 
@return always DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t trx_savepoint_for_mysql( /*====================*/ trx_t* trx, /*!< in: transaction handle */ const char* savepoint_name, /*!< in: savepoint name */ - ib_int64_t binlog_cache_pos); /*!< in: MySQL binlog cache + ib_int64_t binlog_cache_pos) /*!< in: MySQL binlog cache position corresponding to this connection at the time of the savepoint */ - + __attribute__((nonnull)); /*******************************************************************//** Releases a named savepoint. Savepoints which were set after this savepoint are deleted. @return if no savepoint of the name found then DB_NO_SAVEPOINT, otherwise DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t trx_release_savepoint_for_mysql( /*============================*/ trx_t* trx, /*!< in: transaction handle */ - const char* savepoint_name); /*!< in: savepoint name */ - + const char* savepoint_name) /*!< in: savepoint name */ + __attribute__((nonnull, warn_unused_result)); /*******************************************************************//** Frees savepoint structs starting from savep. 
*/ UNIV_INTERN @@ -230,8 +234,8 @@ trx_roll_savepoints_free( if this is NULL, free all savepoints of trx */ -/** A cell of trx_undo_arr_struct; used during a rollback and a purge */ -struct trx_undo_inf_struct{ +/** A cell of trx_undo_arr_t; used during a rollback and a purge */ +struct trx_undo_inf_t{ ibool in_use; /*!< true if cell is being used */ trx_id_t trx_no; /*!< transaction number: not defined during a rollback */ @@ -241,7 +245,7 @@ struct trx_undo_inf_struct{ /** During a rollback and a purge, undo numbers of undo records currently being processed are stored in this array */ -struct trx_undo_arr_struct{ +struct trx_undo_arr_t{ ulint n_cells; /*!< number of cells in the array */ ulint n_used; /*!< number of cells in use */ trx_undo_inf_t* infos; /*!< the array of undo infos */ @@ -258,7 +262,7 @@ enum roll_node_state { }; /** Rollback command node in a query graph */ -struct roll_node_struct{ +struct roll_node_t{ que_common_t common; /*!< node type: QUE_NODE_ROLLBACK */ enum roll_node_state state; /*!< node execution state */ ibool partial;/*!< TRUE if we want a partial @@ -270,7 +274,7 @@ struct roll_node_struct{ }; /** A savepoint set with SQL's "SAVEPOINT savepoint_id" command */ -struct trx_named_savept_struct{ +struct trx_named_savept_t{ char* name; /*!< savepoint name */ trx_savept_t savept; /*!< the undo number corresponding to the savepoint */ diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h index 66e5449cf57..185b05876b4 100644 --- a/storage/innobase/include/trx0rseg.h +++ b/storage/innobase/include/trx0rseg.h @@ -151,11 +151,11 @@ trx_rseg_get_n_undo_tablespaces( #define TRX_RSEG_MAX_N_TRXS (TRX_RSEG_N_SLOTS / 2) /* The rollback segment memory object */ -struct trx_rseg_struct{ +struct trx_rseg_t{ /*--------------------------------------------------------*/ ulint id; /*!< rollback segment id == the index of its slot in the trx system file copy */ - mutex_t mutex; /*!< mutex protecting the fields in this + 
ib_mutex_t mutex; /*!< mutex protecting the fields in this struct except id, which is constant */ ulint space; /*!< space where the rollback segment is header is placed */ @@ -192,13 +192,11 @@ struct trx_rseg_struct{ }; /** For prioritising the rollback segments for purge. */ -struct rseg_queue_struct { +struct rseg_queue_t { trx_id_t trx_no; /*!< trx_rseg_t::last_trx_no */ trx_rseg_t* rseg; /*!< Rollback segment */ }; -typedef struct rseg_queue_struct rseg_queue_t; - /* Undo log segment slot in a rollback segment header */ /*-------------------------------------------------------------*/ #define TRX_RSEG_SLOT_PAGE_NO 0 /* Page number of the header page of diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h index b1aa3d2224c..70f214d1ac7 100644 --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -426,7 +426,7 @@ trx_sys_file_format_max_get(void); Check for the max file format tag stored on disk. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t trx_sys_file_format_max_check( /*==========================*/ ulint max_format_id); /*!< in: the max format id to check */ @@ -600,18 +600,28 @@ identifier is added to this 64-bit constant. */ #ifndef UNIV_HOTBACKUP /** The transaction system central memory data structure. 
*/ -struct trx_sys_struct{ +struct trx_sys_t{ - mutex_t mutex; /*!< mutex protecting most fields in + ib_mutex_t mutex; /*!< mutex protecting most fields in this structure except when noted otherwise */ - ulint n_mysql_trx; /*!< Number of transactions currently - allocated for MySQL */ ulint n_prepared_trx; /*!< Number of transactions currently in the XA PREPARED state */ + ulint n_prepared_recovered_trx; /*!< Number of transactions + currently in XA PREPARED state that are + also recovered. Such transactions cannot + be added during runtime. They can only + occur after recovery if mysqld crashed + while there were XA PREPARED + transactions. We disable query cache + if such transactions exist. */ trx_id_t max_trx_id; /*!< The smallest number not yet assigned as a transaction id or transaction number */ +#ifdef UNIV_DEBUG + trx_id_t rw_max_trx_id; /*!< Max trx id of read-write transactions + which exist or existed */ +#endif trx_list_t rw_trx_list; /*!< List of active and committed in memory read-write transactions, sorted on trx id, biggest first. Recovered diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index 3e6cfc7d0da..bb84c1806f2 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -121,20 +121,69 @@ UNIV_INTERN void trx_lists_init_at_db_start(void); /*============================*/ + +#ifdef UNIV_DEBUG +#define trx_start_if_not_started_xa(t) \ + { \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_if_not_started_xa_low((t)); \ + } +#else +#define trx_start_if_not_started_xa(t) \ + trx_start_if_not_started_xa_low((t)) +#endif /* UNIV_DEBUG */ + /*************************************************************//** Starts the transaction if it is not yet started. */ UNIV_INTERN void -trx_start_if_not_started_xa( -/*========================*/ +trx_start_if_not_started_xa_low( +/*============================*/ trx_t* trx); /*!< in: transaction */ /*************************************************************//** Starts the transaction if it is not yet started. */ UNIV_INTERN void -trx_start_if_not_started( -/*=====================*/ +trx_start_if_not_started_low( +/*=========================*/ trx_t* trx); /*!< in: transaction */ + +#ifdef UNIV_DEBUG +#define trx_start_if_not_started(t) \ + { \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_if_not_started_low((t)); \ + } +#else +#define trx_start_if_not_started(t) \ + trx_start_if_not_started_low((t)) +#endif /* UNIV_DEBUG */ + +/*************************************************************//** +Starts the transaction for a DDL operation. 
*/ +UNIV_INTERN +void +trx_start_for_ddl_low( +/*==================*/ + trx_t* trx, /*!< in/out: transaction */ + trx_dict_op_t op) /*!< in: dictionary operation type */ + __attribute__((nonnull)); + +#ifdef UNIV_DEBUG +#define trx_start_for_ddl(t, o) \ + { \ + ut_ad((t)->start_file == 0); \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_for_ddl_low((t), (o)); \ + } +#else +#define trx_start_for_ddl(t, o) \ + trx_start_for_ddl_low((t), (o)) +#endif /* UNIV_DEBUG */ + /****************************************************************//** Commits a transaction. */ UNIV_INTERN @@ -155,7 +204,7 @@ trx_cleanup_at_db_startup( Does the transaction commit for MySQL. @return DB_SUCCESS or error number */ UNIV_INTERN -ulint +dberr_t trx_commit_for_mysql( /*=================*/ trx_t* trx); /*!< in/out: transaction */ @@ -189,13 +238,13 @@ trx_get_trx_by_xid( const XID* xid); /*!< in: X/Open XA transaction identifier */ /**********************************************************************//** If required, flushes the log to disk if we called trx_commit_for_mysql() -with trx->flush_log_later == TRUE. -@return 0 or error number */ +with trx->flush_log_later == TRUE. */ UNIV_INTERN -ulint +void trx_commit_complete_for_mysql( /*==========================*/ - trx_t* trx); /*!< in: trx handle */ + trx_t* trx) /*!< in/out: transaction */ + __attribute__((nonnull)); /**********************************************************************//** Marks the latest SQL statement ended. 
*/ UNIV_INTERN @@ -251,9 +300,9 @@ trx_print_low( ulint max_query_len, /*!< in: max query length to print, or 0 to use the default max length */ - ulint n_lock_rec, + ulint n_rec_locks, /*!< in: lock_number_of_rows_locked(&trx->lock) */ - ulint n_lock_struct, + ulint n_trx_locks, /*!< in: length of trx->lock.trx_locks */ ulint heap_size) /*!< in: mem_heap_get_size(trx->lock.lock_heap) */ @@ -286,26 +335,11 @@ trx_print( or 0 to use the default max length */ __attribute__((nonnull)); -/** Type of data dictionary operation */ -typedef enum trx_dict_op { - /** The transaction is not modifying the data dictionary. */ - TRX_DICT_OP_NONE = 0, - /** The transaction is creating a table or an index, or - dropping a table. The table must be dropped in crash - recovery. This and TRX_DICT_OP_NONE are the only possible - operation modes in crash recovery. */ - TRX_DICT_OP_TABLE = 1, - /** The transaction is creating or dropping an index in an - existing table. In crash recovery, the data dictionary - must be locked, but the table must not be dropped. */ - TRX_DICT_OP_INDEX = 2 -} trx_dict_op_t; - /**********************************************************************//** Determine if a transaction is a dictionary operation. @return dictionary operation mode */ UNIV_INLINE -enum trx_dict_op +enum trx_dict_op_t trx_get_dict_operation( /*===================*/ const trx_t* trx) /*!< in: transaction */ @@ -317,7 +351,7 @@ void trx_set_dict_operation( /*===================*/ trx_t* trx, /*!< in/out: transaction */ - enum trx_dict_op op); /*!< in: operation, not + enum trx_dict_op_t op); /*!< in: operation, not TRX_DICT_OP_NONE */ #ifndef UNIV_HOTBACKUP @@ -359,7 +393,7 @@ UNIV_INTERN ibool trx_is_interrupted( /*===============*/ - trx_t* trx); /*!< in: transaction */ + const trx_t* trx); /*!< in: transaction */ /**********************************************************************//** Determines if the currently running transaction is in strict mode. 
@return TRUE if strict */ @@ -405,6 +439,15 @@ trx_get_que_state_str( /*==================*/ const trx_t* trx); /*!< in: transaction */ +/****************************************************************//** +Assign a read-only transaction a rollback-segment, if it is attempting +to write to a TEMPORARY table. */ +UNIV_INTERN +void +trx_assign_rseg( +/*============*/ + trx_t* trx); /*!< A read-only transaction that + needs to be assigned a RBS. */ /*******************************************************************//** Transactions that aren't started by the MySQL server don't set the trx_t::mysql_thd field. For such transactions we set the lock @@ -450,7 +493,6 @@ non-locking select */ ut_ad(!trx_is_autocommit_non_locking((t))); \ switch ((t)->state) { \ case TRX_STATE_PREPARED: \ - ut_a(!(t)->read_only); \ /* fall through */ \ case TRX_STATE_ACTIVE: \ case TRX_STATE_COMMITTED_IN_MEMORY: \ @@ -463,7 +505,7 @@ non-locking select */ #ifdef UNIV_DEBUG /*******************************************************************//** -Assert that an autocommit non-locking slect cannot be in the +Assert that an autocommit non-locking select cannot be in the ro_trx_list nor the rw_trx_list and that it is a read-only transaction. The tranasction must be in the mysql_trx_list. */ # define assert_trx_nonlocking_or_in_list(t) \ @@ -511,7 +553,7 @@ code and no mutex is required when the query thread is no longer waiting. */ /** The locks and state of an active transaction. Protected by lock_sys->mutex, trx->mutex or both. */ -struct trx_lock_struct { +struct trx_lock_t { ulint n_active_thrs; /*!< number of active query threads */ trx_que_t que_state; /*!< valid when trx->state @@ -620,10 +662,10 @@ lock_rec_convert_impl_to_expl()) will access transactions associated to other connections. The locks of transactions are protected by lock_sys->mutex and sometimes by trx->mutex. 
*/ -struct trx_struct{ +struct trx_t{ ulint magic_n; - mutex_t mutex; /*!< Mutex protecting the fields + ib_mutex_t mutex; /*!< Mutex protecting the fields state and lock (except some fields of lock, which are protected by lock_sys->mutex) */ @@ -657,8 +699,7 @@ struct trx_struct{ Latching and various transaction lists membership rules: - XA (2PC) transactions are always treated as read-write and - non-autocommit. + XA (2PC) transactions are always treated as non-autocommit. Transitions to ACTIVE or NOT_STARTED occur when !in_rw_trx_list and !in_ro_trx_list (no trx_sys->mutex needed). @@ -793,9 +834,9 @@ struct trx_struct{ transaction branch */ lsn_t commit_lsn; /*!< lsn at the time of the commit */ table_id_t table_id; /*!< Table to drop iff dict_operation - is TRUE, or 0. */ + == TRX_DICT_OP_TABLE, or 0. */ /*------------------------------*/ - void* mysql_thd; /*!< MySQL thread handle corresponding + THD* mysql_thd; /*!< MySQL thread handle corresponding to this trx, or NULL */ const char* mysql_log_file_name; /*!< if MySQL binlog is used, this field @@ -838,7 +879,7 @@ struct trx_struct{ trx_sys->mysql_trx_list */ #endif /* UNIV_DEBUG */ /*------------------------------*/ - enum db_err error_state; /*!< 0 if no error, otherwise error + dberr_t error_state; /*!< 0 if no error, otherwise error number; NOTE That ONLY the thread doing the transaction is allowed to set this field: this is NOT protected @@ -873,7 +914,7 @@ struct trx_struct{ trx_savepoints; /*!< savepoints set with SAVEPOINT ..., oldest first */ /*------------------------------*/ - mutex_t undo_mutex; /*!< mutex protecting the fields in this + ib_mutex_t undo_mutex; /*!< mutex protecting the fields in this section (down to undo_no_arr), EXCEPT last_sql_stat_start, which can be accessed only when we know that there @@ -929,12 +970,24 @@ struct trx_struct{ ulint will_lock; /*!< Will acquire some locks. Increment each time we determine that a lock will be acquired by the MySQL layer. 
*/ + bool ddl; /*!< true if it is a transaction that + is being started for a DDL operation */ /*------------------------------*/ - fts_trx_t* fts_trx; /* FTS information, or NULL if + fts_trx_t* fts_trx; /*!< FTS information, or NULL if transaction hasn't modified tables with FTS indexes (yet). */ doc_id_t fts_next_doc_id;/* The document id used for updates */ /*------------------------------*/ + ulint flush_tables; /*!< if "covering" the FLUSH TABLES", + count of tables being flushed. */ + + /*------------------------------*/ +#ifdef UNIV_DEBUG + ulint start_line; /*!< Track where it was started from */ + const char* start_file; /*!< Filename where it was started */ +#endif /* UNIV_DEBUG */ + + /*------------------------------*/ char detailed_error[256]; /*!< detailed error message for last error, or empty. */ }; @@ -1003,7 +1056,7 @@ enum commit_node_state { }; /** Commit command node in a query graph */ -struct commit_node_struct{ +struct commit_node_t{ que_common_t common; /*!< node type: QUE_NODE_COMMIT */ enum commit_node_state state; /*!< node execution state */ diff --git a/storage/innobase/include/trx0trx.ic b/storage/innobase/include/trx0trx.ic index ceeb121ab70..69ee17ea98b 100644 --- a/storage/innobase/include/trx0trx.ic +++ b/storage/innobase/include/trx0trx.ic @@ -44,7 +44,7 @@ trx_state_eq( #ifdef UNIV_DEBUG switch (trx->state) { case TRX_STATE_PREPARED: - assert_trx_in_rw_list(trx); + ut_ad(!trx_is_autocommit_non_locking(trx)); return(trx->state == state); case TRX_STATE_ACTIVE: @@ -108,12 +108,12 @@ trx_get_que_state_str( Determine if a transaction is a dictionary operation. 
@return dictionary operation mode */ UNIV_INLINE -enum trx_dict_op +enum trx_dict_op_t trx_get_dict_operation( /*===================*/ const trx_t* trx) /*!< in: transaction */ { - enum trx_dict_op op = (enum trx_dict_op) trx->dict_operation; + trx_dict_op_t op = static_cast<trx_dict_op_t>(trx->dict_operation); #ifdef UNIV_DEBUG switch (op) { @@ -124,7 +124,7 @@ trx_get_dict_operation( } ut_error; #endif /* UNIV_DEBUG */ - return((enum trx_dict_op) op); + return(op); } /**********************************************************************//** Flag a transaction a dictionary operation. */ @@ -133,11 +133,11 @@ void trx_set_dict_operation( /*===================*/ trx_t* trx, /*!< in/out: transaction */ - enum trx_dict_op op) /*!< in: operation, not + enum trx_dict_op_t op) /*!< in: operation, not TRX_DICT_OP_NONE */ { #ifdef UNIV_DEBUG - enum trx_dict_op old_op = trx_get_dict_operation(trx); + enum trx_dict_op_t old_op = trx_get_dict_operation(trx); switch (op) { case TRX_DICT_OP_NONE: @@ -159,6 +159,7 @@ trx_set_dict_operation( ok: #endif /* UNIV_DEBUG */ + trx->ddl = true; trx->dict_operation = op; } diff --git a/storage/innobase/include/trx0types.h b/storage/innobase/include/trx0types.h index 650d5878e64..4f515cb5248 100644 --- a/storage/innobase/include/trx0types.h +++ b/storage/innobase/include/trx0types.h @@ -36,7 +36,7 @@ the terminating NUL character. */ #define TRX_ID_MAX_LEN 17 /** Transaction execution states when trx->state == TRX_STATE_ACTIVE */ -enum trx_que_enum { +enum trx_que_t { TRX_QUE_RUNNING, /*!< transaction is running */ TRX_QUE_LOCK_WAIT, /*!< transaction is waiting for a lock */ @@ -45,43 +45,54 @@ enum trx_que_enum { }; /** Transaction states (trx_t::state) */ -enum trx_state_enum { +enum trx_state_t { TRX_STATE_NOT_STARTED, TRX_STATE_ACTIVE, TRX_STATE_PREPARED, /* Support for 2PC/XA */ TRX_STATE_COMMITTED_IN_MEMORY }; +/** Type of data dictionary operation */ +enum trx_dict_op_t { + /** The transaction is not modifying the data dictionary. 
*/ + TRX_DICT_OP_NONE = 0, + /** The transaction is creating a table or an index, or + dropping a table. The table must be dropped in crash + recovery. This and TRX_DICT_OP_NONE are the only possible + operation modes in crash recovery. */ + TRX_DICT_OP_TABLE = 1, + /** The transaction is creating or dropping an index in an + existing table. In crash recovery, the data dictionary + must be locked, but the table must not be dropped. */ + TRX_DICT_OP_INDEX = 2 +}; + /** Memory objects */ /* @{ */ /** Transaction */ -typedef struct trx_struct trx_t; +struct trx_t; /** The locks and state of an active transaction */ -typedef struct trx_lock_struct trx_lock_t; +struct trx_lock_t; /** Transaction system */ -typedef struct trx_sys_struct trx_sys_t; +struct trx_sys_t; /** Signal */ -typedef struct trx_sig_struct trx_sig_t; +struct trx_sig_t; /** Rollback segment */ -typedef struct trx_rseg_struct trx_rseg_t; +struct trx_rseg_t; /** Transaction undo log */ -typedef struct trx_undo_struct trx_undo_t; +struct trx_undo_t; /** Array of undo numbers of undo records being rolled back or purged */ -typedef struct trx_undo_arr_struct trx_undo_arr_t; +struct trx_undo_arr_t; /** A cell of trx_undo_arr_t */ -typedef struct trx_undo_inf_struct trx_undo_inf_t; +struct trx_undo_inf_t; /** The control structure used in the purge operation */ -typedef struct trx_purge_struct trx_purge_t; +struct trx_purge_t; /** Rollback command node in a query graph */ -typedef struct roll_node_struct roll_node_t; +struct roll_node_t; /** Commit command node in a query graph */ -typedef struct commit_node_struct commit_node_t; +struct commit_node_t; /** SAVEPOINT command node in a query graph */ -typedef struct trx_named_savept_struct trx_named_savept_t; -/** Transaction concurrency state */ -typedef enum trx_state_enum trx_state_t; -/** Transaction query thread state */ -typedef enum trx_que_enum trx_que_t; +struct trx_named_savept_t; /* @} */ /** Rollback contexts */ @@ -109,9 +120,7 @@ typedef ib_id_t 
roll_ptr_t; typedef ib_id_t undo_no_t; /** Transaction savepoint */ -typedef struct trx_savept_struct trx_savept_t; -/** Transaction savepoint */ -struct trx_savept_struct{ +struct trx_savept_t{ undo_no_t least_undo_no; /*!< least undo number to undo */ }; diff --git a/storage/innobase/include/trx0undo.h b/storage/innobase/include/trx0undo.h index ed2ce66bbb6..4021d71c68a 100644 --- a/storage/innobase/include/trx0undo.h +++ b/storage/innobase/include/trx0undo.h @@ -65,6 +65,15 @@ ibool trx_undo_roll_ptr_is_insert( /*========================*/ roll_ptr_t roll_ptr); /*!< in: roll pointer */ +/***********************************************************************//** +Returns true if the record is of the insert type. +@return true if the record was freshly inserted (not updated). */ +UNIV_INLINE +bool +trx_undo_trx_id_is_insert( +/*======================*/ + const byte* trx_id) /*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */ + __attribute__((nonnull, pure, warn_unused_result)); #endif /* !UNIV_HOTBACKUP */ /*****************************************************************//** Writes a roll ptr to an index page. In case that the size changes in @@ -285,11 +294,12 @@ undo log reused. are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE DB_READ_ONLY DB_OUT_OF_MEMORY */ UNIV_INTERN -ulint +dberr_t trx_undo_assign_undo( /*=================*/ trx_t* trx, /*!< in: transaction */ - ulint type); /*!< in: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */ + ulint type) /*!< in: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */ + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Sets the state of the undo log segment at a transaction finish. 
@return undo log segment header page, x-latched */ @@ -404,7 +414,7 @@ trx_undo_mem_free( /** Transaction undo log memory object; this is protected by the undo_mutex in the corresponding transaction object */ -struct trx_undo_struct{ +struct trx_undo_t{ /*-----------------------------*/ ulint id; /*!< undo log slot number within the rollback segment */ diff --git a/storage/innobase/include/trx0undo.ic b/storage/innobase/include/trx0undo.ic index 4b38e63297c..577759d6c3d 100644 --- a/storage/innobase/include/trx0undo.ic +++ b/storage/innobase/include/trx0undo.ic @@ -101,6 +101,21 @@ trx_undo_roll_ptr_is_insert( ut_ad(roll_ptr < (1ULL << 56)); return((ibool) (roll_ptr >> 55)); } + +/***********************************************************************//** +Returns true if the record is of the insert type. +@return true if the record was freshly inserted (not updated). */ +UNIV_INLINE +bool +trx_undo_trx_id_is_insert( +/*======================*/ + const byte* trx_id) /*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */ +{ +#if DATA_TRX_ID + 1 != DATA_ROLL_PTR +# error +#endif + return(static_cast<bool>(trx_id[DATA_TRX_ID_LEN] >> 7)); +} #endif /* !UNIV_HOTBACKUP */ /*****************************************************************//** diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i index 60eb1fede91..fbb62e8de01 100644 --- a/storage/innobase/include/univ.i +++ b/storage/innobase/include/univ.i @@ -380,11 +380,16 @@ This number varies depending on UNIV_PAGE_SIZE. */ /** Maximum number of parallel threads in a parallelized operation */ #define UNIV_MAX_PARALLELISM 32 -/** The maximum length of a table name. This is the MySQL limit and is -defined in mysql_com.h like NAME_CHAR_LEN*SYSTEM_CHARSET_MBMAXLEN, the -number does not include a terminating '\0'. 
InnoDB probably can handle -longer names internally */ -#define MAX_TABLE_NAME_LEN 192 +/** This is the "mbmaxlen" for my_charset_filename (defined in +strings/ctype-utf8.c), which is used to encode File and Database names. */ +#define FILENAME_CHARSET_MAXNAMLEN 5 + +/** The maximum length of an encode table name in bytes. The max +table and database names are NAME_CHAR_LEN (64) characters. After the +encoding, the max length would be NAME_CHAR_LEN (64) * +FILENAME_CHARSET_MAXNAMLEN (5) = 320 bytes. The number does not include a +terminating '\0'. InnoDB can handle longer names internally */ +#define MAX_TABLE_NAME_LEN 320 /** The maximum length of a database name. Like MAX_TABLE_NAME_LEN this is the MySQL's NAME_LEN, see check_and_convert_db_name(). */ @@ -398,6 +403,16 @@ database name and table name. In addition, 14 bytes is added for: #define MAX_FULL_NAME_LEN \ (MAX_TABLE_NAME_LEN + MAX_DATABASE_NAME_LEN + 14) +/** The maximum length in bytes that a database name can occupy when stored in +UTF8, including the terminating '\0', see dict_fs2utf8(). You must include +mysql_com.h if you are to use this macro. */ +#define MAX_DB_UTF8_LEN (NAME_LEN + 1) + +/** The maximum length in bytes that a table name can occupy when stored in +UTF8, including the terminating '\0', see dict_fs2utf8(). You must include +mysql_com.h if you are to use this macro. */ +#define MAX_TABLE_UTF8_LEN (NAME_LEN + sizeof(srv_mysql50_table_name_prefix)) + /* UNIVERSAL TYPE DEFINITIONS ========================== @@ -417,6 +432,7 @@ macro ULINTPF. 
*/ # define UINT32PF "%I32u" # define INT64PF "%I64d" # define UINT64PF "%I64u" +# define UINT64PFx "%016I64u" typedef __int64 ib_int64_t; typedef unsigned __int64 ib_uint64_t; typedef unsigned __int32 ib_uint32_t; @@ -425,6 +441,7 @@ typedef unsigned __int32 ib_uint32_t; # define UINT32PF "%"PRIu32 # define INT64PF "%"PRId64 # define UINT64PF "%"PRIu64 +# define UINT64PFx "%016"PRIx64 typedef int64_t ib_int64_t; typedef uint64_t ib_uint64_t; typedef uint32_t ib_uint32_t; @@ -489,6 +506,8 @@ headers may define 'bool' differently. Do not assume that 'bool' is a ulint! */ #endif +#define UNIV_NOTHROW + /** The following number as the length of a logical field means that the field has the SQL NULL as its value. NOTE that because we assume that the length of a field is a 32-bit integer when we store it, for example, to an undo log @@ -588,15 +607,23 @@ typedef void* os_thread_ret_t; # define UNIV_MEM_ALLOC(addr, size) VALGRIND_MAKE_MEM_UNDEFINED(addr, size) # define UNIV_MEM_DESC(addr, size) VALGRIND_CREATE_BLOCK(addr, size, #addr) # define UNIV_MEM_UNDESC(b) VALGRIND_DISCARD(b) -# define UNIV_MEM_ASSERT_RW(addr, size) do { \ +# define UNIV_MEM_ASSERT_RW_LOW(addr, size, should_abort) do { \ const void* _p = (const void*) (ulint) \ VALGRIND_CHECK_MEM_IS_DEFINED(addr, size); \ - if (UNIV_LIKELY_NULL(_p)) \ + if (UNIV_LIKELY_NULL(_p)) { \ fprintf(stderr, "%s:%d: %p[%u] undefined at %ld\n", \ __FILE__, __LINE__, \ (const void*) (addr), (unsigned) (size), (long) \ (((const char*) _p) - ((const char*) (addr)))); \ - } while (0) + if (should_abort) { \ + ut_error; \ + } \ + } \ +} while (0) +# define UNIV_MEM_ASSERT_RW(addr, size) \ + UNIV_MEM_ASSERT_RW_LOW(addr, size, false) +# define UNIV_MEM_ASSERT_RW_ABORT(addr, size) \ + UNIV_MEM_ASSERT_RW_LOW(addr, size, true) # define UNIV_MEM_ASSERT_W(addr, size) do { \ const void* _p = (const void*) (ulint) \ VALGRIND_CHECK_MEM_IS_ADDRESSABLE(addr, size); \ @@ -613,7 +640,9 @@ typedef void* os_thread_ret_t; # define 
UNIV_MEM_ALLOC(addr, size) do {} while(0) # define UNIV_MEM_DESC(addr, size) do {} while(0) # define UNIV_MEM_UNDESC(b) do {} while(0) +# define UNIV_MEM_ASSERT_RW_LOW(addr, size, should_abort) do {} while(0) # define UNIV_MEM_ASSERT_RW(addr, size) do {} while(0) +# define UNIV_MEM_ASSERT_RW_ABORT(addr, size) do {} while(0) # define UNIV_MEM_ASSERT_W(addr, size) do {} while(0) #endif #define UNIV_MEM_ASSERT_AND_FREE(addr, size) do { \ diff --git a/storage/innobase/include/usr0sess.h b/storage/innobase/include/usr0sess.h index 4a0710c5060..b5c80b97b43 100644 --- a/storage/innobase/include/usr0sess.h +++ b/storage/innobase/include/usr0sess.h @@ -53,7 +53,7 @@ sess_close( /* The session handle. This data structure is only used by purge and is not really necessary. We should get rid of it. */ -struct sess_struct{ +struct sess_t{ ulint state; /*!< state of the session */ trx_t* trx; /*!< transaction object permanently assigned for the session: the diff --git a/storage/innobase/include/usr0types.h b/storage/innobase/include/usr0types.h index 403ad0223a8..6ba937cacc8 100644 --- a/storage/innobase/include/usr0types.h +++ b/storage/innobase/include/usr0types.h @@ -26,6 +26,6 @@ Created 6/25/1996 Heikki Tuuri #ifndef usr0types_h #define usr0types_h -typedef struct sess_struct sess_t; +struct sess_t; #endif diff --git a/storage/innobase/include/ut0bh.h b/storage/innobase/include/ut0bh.h index 4c029e256a9..84ea6dd915a 100644 --- a/storage/innobase/include/ut0bh.h +++ b/storage/innobase/include/ut0bh.h @@ -31,7 +31,7 @@ Created 2010-05-28 by Sunny Bains /** Comparison function for objects in the binary heap. */ typedef int (*ib_bh_cmp_t)(const void* p1, const void* p2); -typedef struct ib_bh_struct ib_bh_t; +struct ib_bh_t; /**********************************************************************//** Get the number of elements in the binary heap. 
@@ -138,7 +138,7 @@ ib_bh_pop( ib_bh_t* ib_bh); /*!< in/out: instance */ /** Binary heap data structure */ -struct ib_bh_struct { +struct ib_bh_t { ulint max_elems; /*!< max elements allowed */ ulint n_elems; /*!< current size */ ulint sizeof_elem; /*!< sizeof element */ diff --git a/storage/innobase/include/ut0counter.h b/storage/innobase/include/ut0counter.h new file mode 100644 index 00000000000..fe0f36dfff2 --- /dev/null +++ b/storage/innobase/include/ut0counter.h @@ -0,0 +1,203 @@ +/***************************************************************************** + +Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ut0counter.h + +Counter utility class + +Created 2012/04/12 by Sunny Bains +*******************************************************/ + +#ifndef UT0COUNTER_H +#define UT0COUNTER_H + +#include "univ.i" +#include <string.h> +#include "os0thread.h" + +/** CPU cache line size */ +#define CACHE_LINE_SIZE 64 + +/** Default number of slots to use in ib_counter_t */ +#define IB_N_SLOTS 64 + +/** Get the offset into the counter array. */ +template <typename Type, int N> +struct generic_indexer_t { + /** Default constructor/destructor should be OK. 
*/ + + /** @return offset within m_counter */ + size_t offset(size_t index) const UNIV_NOTHROW { + return(((index % N) + 1) * (CACHE_LINE_SIZE / sizeof(Type))); + } +}; + +#ifdef HAVE_SCHED_GETCPU +#include <utmpx.h> +/** Use the cpu id to index into the counter array. If it fails then +use the thread id. */ +template <typename Type, int N> +struct get_sched_indexer_t : public generic_indexer_t<Type, N> { + /** Default constructor/destructor should be OK. */ + + /* @return result from sched_getcpu(), the thread id if it fails. */ + size_t get_rnd_index() const UNIV_NOTHROW { + + size_t cpu = sched_getcpu(); + if (cpu == -1) { + cpu = (lint) os_thread_get_curr_id(); + } + + return(cpu); + } +}; +#endif /* HAVE_SCHED_GETCPU */ + +/** Use the thread id to index into the counter array. */ +template <typename Type, int N> +struct thread_id_indexer_t : public generic_indexer_t<Type, N> { + /** Default constructor/destructor should are OK. */ + + /* @return a random number, currently we use the thread id. Where + thread id is represented as a pointer, it may not work as + effectively. */ + size_t get_rnd_index() const UNIV_NOTHROW { + return((lint) os_thread_get_curr_id()); + } +}; + +/** For counters wher N=1 */ +template <typename Type, int N=1> +struct single_indexer_t { + /** Default constructor/destructor should are OK. */ + + /** @return offset within m_counter */ + size_t offset(size_t index) const UNIV_NOTHROW { + ut_ad(N == 1); + return((CACHE_LINE_SIZE / sizeof(Type))); + } + + /* @return 1 */ + size_t get_rnd_index() const UNIV_NOTHROW { + ut_ad(N == 1); + return(1); + } +}; + +/** Class for using fuzzy counters. The counter is not protected by any +mutex and the results are not guaranteed to be 100% accurate but close +enough. 
Creates an array of counters and separates each element by the +CACHE_LINE_SIZE bytes */ +template < + typename Type, + int N = IB_N_SLOTS, + template<typename, int> class Indexer = thread_id_indexer_t> +class ib_counter_t { +public: + ib_counter_t() { memset(m_counter, 0x0, sizeof(m_counter)); } + + ~ib_counter_t() + { + ut_ad(validate()); + } + + bool validate() UNIV_NOTHROW { +#ifdef UNIV_DEBUG + size_t n = (CACHE_LINE_SIZE / sizeof(Type)); + + /* Check that we aren't writing outside our defined bounds. */ + for (size_t i = 0; i < UT_ARR_SIZE(m_counter); i += n) { + for (size_t j = 1; j < n - 1; ++j) { + ut_ad(m_counter[i + j] == 0); + } + } +#endif /* UNIV_DEBUG */ + return(true); + } + + /** If you can't use a good index id. Increment by 1. */ + void inc() UNIV_NOTHROW { add(1); } + + /** If you can't use a good index id. + * @param n - is the amount to increment */ + void add(Type n) UNIV_NOTHROW { + size_t i = m_policy.offset(m_policy.get_rnd_index()); + + ut_ad(i < UT_ARR_SIZE(m_counter)); + + m_counter[i] += n; + } + + /** Use this if you can use a unique indentifier, saves a + call to get_rnd_index(). + @param i - index into a slot + @param n - amount to increment */ + void add(size_t index, Type n) UNIV_NOTHROW { + size_t i = m_policy.offset(index); + + ut_ad(i < UT_ARR_SIZE(m_counter)); + + m_counter[i] += n; + } + + /** If you can't use a good index id. Decrement by 1. */ + void dec() UNIV_NOTHROW { sub(1); } + + /** If you can't use a good index id. + * @param - n is the amount to decrement */ + void sub(Type n) UNIV_NOTHROW { + size_t i = m_policy.offset(m_policy.get_rnd_index()); + + ut_ad(i < UT_ARR_SIZE(m_counter)); + + m_counter[i] -= n; + } + + /** Use this if you can use a unique indentifier, saves a + call to get_rnd_index(). 
+ @param i - index into a slot + @param n - amount to decrement */ + void sub(size_t index, Type n) UNIV_NOTHROW { + size_t i = m_policy.offset(index); + + ut_ad(i < UT_ARR_SIZE(m_counter)); + + m_counter[i] -= n; + } + + /* @return total value - not 100% accurate, since it is not atomic. */ + operator Type() const UNIV_NOTHROW { + Type total = 0; + + for (size_t i = 0; i < N; ++i) { + total += m_counter[m_policy.offset(i)]; + } + + return(total); + } + +private: + /** Indexer into the array */ + Indexer<Type, N>m_policy; + + /** Slot 0 is unused. */ + Type m_counter[(N + 1) * (CACHE_LINE_SIZE / sizeof(Type))]; +}; + +#endif /* UT0COUNTER_H */ diff --git a/storage/innobase/include/ut0crc32.h b/storage/innobase/include/ut0crc32.h index 456648001aa..86217692764 100644 --- a/storage/innobase/include/ut0crc32.h +++ b/storage/innobase/include/ut0crc32.h @@ -45,4 +45,7 @@ or 0x1EDC6F41 without the high-order bit) */ typedef ib_uint32_t (*ib_ut_crc32_t)(const byte* ptr, ulint len); extern ib_ut_crc32_t ut_crc32; + +extern bool ut_crc32_sse2_enabled; + #endif /* ut0crc32_h */ diff --git a/storage/innobase/include/ut0dbg.h b/storage/innobase/include/ut0dbg.h index e9ad62fb81b..0f2da165da7 100644 --- a/storage/innobase/include/ut0dbg.h +++ b/storage/innobase/include/ut0dbg.h @@ -145,10 +145,10 @@ ut_dbg_stop_thread( #include <sys/resource.h> /** structure used for recording usage statistics */ -typedef struct speedo_struct { +struct speedo_t { struct rusage ru; /*!< getrusage() result */ struct timeval tv; /*!< gettimeofday() result */ -} speedo_t; +}; /*******************************************************************//** Resets a speedo (records the current time in it). */ diff --git a/storage/innobase/include/ut0list.h b/storage/innobase/include/ut0list.h index 57d6bdc33a6..29fc8669ce4 100644 --- a/storage/innobase/include/ut0list.h +++ b/storage/innobase/include/ut0list.h @@ -48,9 +48,8 @@ automatically freeing the list node when the item's heap is freed. 
#include "mem0mem.h" -typedef struct ib_list_struct ib_list_t; -typedef struct ib_list_node_struct ib_list_node_t; -typedef struct ib_list_helper_struct ib_list_helper_t; +struct ib_list_t; +struct ib_list_node_t; /****************************************************************//** Create a new list using mem_alloc. Lists created with this function must be @@ -152,7 +151,7 @@ ib_list_is_empty( const ib_list_t* list); /* in: list */ /* List. */ -struct ib_list_struct { +struct ib_list_t { ib_list_node_t* first; /*!< first node */ ib_list_node_t* last; /*!< last node */ ibool is_heap_list; /*!< TRUE if this list was @@ -160,7 +159,7 @@ struct ib_list_struct { }; /* A list node. */ -struct ib_list_node_struct { +struct ib_list_node_t { ib_list_node_t* prev; /*!< previous node */ ib_list_node_t* next; /*!< next node */ void* data; /*!< user data */ @@ -169,7 +168,7 @@ struct ib_list_node_struct { /* Quite often, the only additional piece of data you need is the per-item memory heap, so we have this generic struct available to use in those cases. */ -struct ib_list_helper_struct { +struct ib_list_helper_t { mem_heap_t* heap; /*!< memory heap */ void* data; /*!< user data */ }; diff --git a/storage/innobase/include/ut0lst.h b/storage/innobase/include/ut0lst.h index 51c89f15a77..b53e7ade4c1 100644 --- a/storage/innobase/include/ut0lst.h +++ b/storage/innobase/include/ut0lst.h @@ -65,8 +65,7 @@ The name of the field in the node struct should be the name given to the list. @param TYPE the list node type name */ /* Example: -typedef struct LRU_node_struct LRU_node_t; -struct LRU_node_struct { +struct LRU_node_t { UT_LIST_NODE_T(LRU_node_t) LRU_list; ... 
} diff --git a/storage/innobase/include/ut0rbt.h b/storage/innobase/include/ut0rbt.h index e8a4430e76b..e0593e99bde 100644 --- a/storage/innobase/include/ut0rbt.h +++ b/storage/innobase/include/ut0rbt.h @@ -44,25 +44,19 @@ Created 2007-03-20 Sunny Bains #define FALSE 0 #endif -/* Red black tree typedefs */ -typedef struct ib_rbt_struct ib_rbt_t; -typedef struct ib_rbt_node_struct ib_rbt_node_t; -/* FIXME: Iterator is a better name than _bound_ */ -typedef struct ib_rbt_bound_struct ib_rbt_bound_t; +struct ib_rbt_node_t; typedef void (*ib_rbt_print_node)(const ib_rbt_node_t* node); typedef int (*ib_rbt_compare)(const void* p1, const void* p2); typedef int (*ib_rbt_arg_compare)(const void*, const void* p1, const void* p2); /** Red black tree color types */ -enum ib_rbt_color_enum { +enum ib_rbt_color_t { IB_RBT_RED, IB_RBT_BLACK }; -typedef enum ib_rbt_color_enum ib_rbt_color_t; - /** Red black tree node */ -struct ib_rbt_node_struct { +struct ib_rbt_node_t { ib_rbt_color_t color; /* color of this node */ ib_rbt_node_t* left; /* points left child */ @@ -73,7 +67,7 @@ struct ib_rbt_node_struct { }; /** Red black tree instance.*/ -struct ib_rbt_struct { +struct ib_rbt_t { ib_rbt_node_t* nil; /* Black colored node that is used as a sentinel. This is pre-allocated too.*/ @@ -89,12 +83,12 @@ struct ib_rbt_struct { compare_with_arg; /* Fn. 
to use for comparison with argument */ ulint sizeof_value; /* Sizeof the item in bytes */ - const void* cmp_arg; /* Compare func argument */ + void* cmp_arg; /* Compare func argument */ }; /** The result of searching for a key in the tree, this is useful for a speedy lookup and insert if key doesn't exist.*/ -struct ib_rbt_bound_struct { +struct ib_rbt_bound_t { const ib_rbt_node_t* last; /* Last node visited */ @@ -142,7 +136,7 @@ rbt_create_arg_cmp( size_t sizeof_value, /*!< in: size in bytes */ ib_rbt_arg_compare compare, /*!< in: comparator */ - const void* cmp_arg); /*!< in: compare fn arg */ + void* cmp_arg); /*!< in: compare fn arg */ /**********************************************************************//** Delete a node from the red black tree, identified by key */ UNIV_INTERN diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h index 35b8a580e68..1260e0381bf 100644 --- a/storage/innobase/include/ut0ut.h +++ b/storage/innobase/include/ut0ut.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -65,16 +65,16 @@ typedef time_t ib_time_t; # elif defined(HAVE_FAKE_PAUSE_INSTRUCTION) # define UT_RELAX_CPU() __asm__ __volatile__ ("rep; nop") -# elif defined(HAVE_ATOMIC_BUILTINS) -# define UT_RELAX_CPU() do { \ - volatile lint volatile_var; \ - os_compare_and_swap_lint(&volatile_var, 0, 1); \ - } while (0) # elif defined(HAVE_WINDOWS_ATOMICS) /* In the Win32 API, the x86 PAUSE instruction is executed by calling the YieldProcessor macro defined in WinNT.h. It is a CPU architecture- independent way by using YieldProcessor. 
*/ # define UT_RELAX_CPU() YieldProcessor() +# elif defined(HAVE_ATOMIC_BUILTINS) +# define UT_RELAX_CPU() do { \ + volatile lint volatile_var; \ + os_compare_and_swap_lint(&volatile_var, 0, 1); \ + } while (0) # else # define UT_RELAX_CPU() ((void)0) /* avoid warning for an empty statement */ # endif @@ -345,7 +345,7 @@ ut_print_filename( #ifndef UNIV_HOTBACKUP /* Forward declaration of transaction handle */ -struct trx_struct; +struct trx_t; /**********************************************************************//** Outputs a fixed-length string, quoted as an SQL identifier. @@ -357,7 +357,7 @@ void ut_print_name( /*==========*/ FILE* f, /*!< in: output stream */ - struct trx_struct*trx, /*!< in: transaction */ + const trx_t* trx, /*!< in: transaction */ ibool table_id,/*!< in: TRUE=print a table name, FALSE=print other identifier */ const char* name); /*!< in: name to print */ @@ -372,13 +372,31 @@ void ut_print_namel( /*===========*/ FILE* f, /*!< in: output stream */ - struct trx_struct*trx, /*!< in: transaction (NULL=no quotes) */ + const trx_t* trx, /*!< in: transaction (NULL=no quotes) */ ibool table_id,/*!< in: TRUE=print a table name, FALSE=print other identifier */ const char* name, /*!< in: name to print */ ulint namelen);/*!< in: length of name */ /**********************************************************************//** +Formats a table or index name, quoted as an SQL identifier. If the name +contains a slash '/', the result will contain two identifiers separated by +a period (.), as in SQL database_name.identifier. 
+@return pointer to 'formatted' */ +UNIV_INTERN +char* +ut_format_name( +/*===========*/ + const char* name, /*!< in: table or index name, must be + '\0'-terminated */ + ibool is_table, /*!< in: if TRUE then 'name' is a table + name */ + char* formatted, /*!< out: formatted result, will be + '\0'-terminated */ + ulint formatted_size);/*!< out: no more than this number of + bytes will be written to 'formatted' */ + +/**********************************************************************//** Catenate files. */ UNIV_INTERN void @@ -442,7 +460,7 @@ UNIV_INTERN const char* ut_strerr( /*======*/ - enum db_err num); /*!< in: error number */ + dberr_t num); /*!< in: error number */ /**************************************************************** Sort function for ulint arrays. */ diff --git a/storage/innobase/include/ut0vec.h b/storage/innobase/include/ut0vec.h index f2a5aba8116..432fb348a09 100644 --- a/storage/innobase/include/ut0vec.h +++ b/storage/innobase/include/ut0vec.h @@ -29,8 +29,8 @@ Created 4/6/2006 Osku Salerma #include "univ.i" #include "mem0mem.h" -typedef struct ib_alloc_struct ib_alloc_t; -typedef struct ib_vector_struct ib_vector_t; +struct ib_alloc_t; +struct ib_vector_t; typedef void* (*ib_mem_alloc_t)( /* out: Pointer to allocated memory */ @@ -64,7 +64,7 @@ freeing it when done with the vector. /******************************************************************** Create a new vector with the given initial size. */ - +UNIV_INTERN ib_vector_t* ib_vector_create( /*=============*/ @@ -124,7 +124,7 @@ ib_vector_size( /******************************************************************** Increase the size of the vector. */ - +UNIV_INTERN void ib_vector_resize( /*=============*/ @@ -311,7 +311,7 @@ ib_ut_allocator_free( ib_alloc_t* ib_ut_alloc); /* in: alloc instace to free */ /* Allocator used by ib_vector_t. 
*/ -struct ib_alloc_struct { +struct ib_alloc_t { ib_mem_alloc_t mem_malloc; /* For allocating memory */ ib_mem_free_t mem_release; /* For freeing memory */ ib_mem_resize_t mem_resize; /* For resizing memory */ @@ -320,7 +320,7 @@ struct ib_alloc_struct { }; /* See comment at beginning of file. */ -struct ib_vector_struct { +struct ib_vector_t { ib_alloc_t* allocator; /* Allocator, because one size doesn't fit all */ void* data; /* data elements */ diff --git a/storage/innobase/include/ut0vec.ic b/storage/innobase/include/ut0vec.ic index 1255caee2d9..f41a85e1d1d 100644 --- a/storage/innobase/include/ut0vec.ic +++ b/storage/innobase/include/ut0vec.ic @@ -346,9 +346,10 @@ ib_vector_remove( ib_vector_t* vec, /*!< in: vector */ const void* elem) /*!< in: value to remove */ { - void* current; + void* current = NULL; void* next; ulint i; + ulint old_used_count = vec->used; for (i = 0; i < vec->used; i++) { current = ib_vector_get(vec, i); @@ -359,14 +360,14 @@ ib_vector_remove( } next = ib_vector_get(vec, i + 1); - memcpy(current, next, vec->sizeof_value - * (vec->used - i - 1)); + memmove(current, next, vec->sizeof_value + * (vec->used - i - 1)); + --vec->used; + break; } } - --vec->used; - - return(current); + return((old_used_count != vec->used) ? current : NULL); } /******************************************************************** diff --git a/storage/innobase/include/ut0wqueue.h b/storage/innobase/include/ut0wqueue.h index ed4e65e4dc6..33385ddf2d4 100644 --- a/storage/innobase/include/ut0wqueue.h +++ b/storage/innobase/include/ut0wqueue.h @@ -37,7 +37,7 @@ processing. #include "os0sync.h" #include "sync0types.h" -typedef struct ib_wqueue_struct ib_wqueue_t; +struct ib_wqueue_t; /****************************************************************//** Create a new work queue. @@ -96,8 +96,8 @@ ib_wqueue_timedwait( ib_time_t wait_in_usecs); /* in: wait time in micro seconds */ /* Work queue. 
*/ -struct ib_wqueue_struct { - mutex_t mutex; /*!< mutex protecting everything */ +struct ib_wqueue_t { + ib_mutex_t mutex; /*!< mutex protecting everything */ ib_list_t* items; /*!< work item list */ os_event_t event; /*!< event we use to signal additions to list */ }; diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 476b305ca70..1152152cc77 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -37,15 +37,17 @@ Created 5/7/1996 Heikki Tuuri #include "usr0sess.h" #include "trx0purge.h" #include "dict0mem.h" +#include "dict0boot.h" #include "trx0sys.h" #include "pars0pars.h" /* pars_complete_graph_for_exec() */ #include "que0que.h" /* que_node_get_parent() */ #include "row0mysql.h" /* row_mysql_handle_errors() */ -#include "row0sel.h" /* sel_node_create(), sel_node_struct */ +#include "row0sel.h" /* sel_node_create(), sel_node_t */ #include "row0types.h" /* sel_node_t */ #include "srv0mon.h" #include "ut0vec.h" #include "btr0btr.h" +#include "dict0boot.h" /* Restricts the length of search we will do in the waits-for graph of transactions */ @@ -345,10 +347,7 @@ static const byte lock_strength_matrix[5][5] = { }; /** Deadlock check context. */ -typedef struct lock_deadlock_ctx_struct lock_deadlock_ctx_t; - -/** Deadlock check context. */ -struct lock_deadlock_ctx_struct { +struct lock_deadlock_ctx_t { const trx_t* start; /*!< Joining transaction that is requesting a lock in an incompatible mode */ @@ -366,10 +365,8 @@ struct lock_deadlock_ctx_struct { was aborted */ }; -typedef struct lock_stack_struct lock_stack_t; - /** DFS visited node information used during deadlock checking. 
*/ -struct lock_stack_struct { +struct lock_stack_t { const lock_t* lock; /*!< Current lock */ const lock_t* wait_lock; /*!< Waiting for lock */ unsigned heap_no:16; /*!< heap number if rec lock */ @@ -415,9 +412,10 @@ lock_rec_validate_page( /* The lock system */ UNIV_INTERN lock_sys_t* lock_sys = NULL; -/* We store info on the latest deadlock error to this buffer. InnoDB +/** We store info on the latest deadlock error to this buffer. InnoDB Monitor will then fetch it and print */ UNIV_INTERN ibool lock_deadlock_found = FALSE; +/** Only created if !srv_read_only_mode */ static FILE* lock_latest_err_file; /********************************************************************//** @@ -502,7 +500,7 @@ lock_check_trx_id_sanity( dict_index_t* index, /*!< in: index */ const ulint* offsets) /*!< in: rec_get_offsets(rec, index) */ { - ibool is_ok; + bool is_ok; trx_id_t max_trx_id; ut_ad(rec_offs_validate(rec, index, offsets)); @@ -520,10 +518,10 @@ lock_check_trx_id_sanity( /*********************************************************************//** Checks that a record is seen in a consistent read. -@return TRUE if sees, or FALSE if an earlier version of the record +@return true if sees, or false if an earlier version of the record should be retrieved */ UNIV_INTERN -ibool +bool lock_clust_rec_cons_read_sees( /*==========================*/ const rec_t* rec, /*!< in: user record which should be read or @@ -550,14 +548,14 @@ lock_clust_rec_cons_read_sees( Checks that a non-clustered index record is seen in a consistent read. NOTE that a non-clustered index page contains so little information on -its modifications that also in the case FALSE, the present version of +its modifications that also in the case false, the present version of rec may be the right, but we must check this from the clustered index record. 
-@return TRUE if certainly sees, or FALSE if an earlier version of the +@return true if certainly sees, or false if an earlier version of the clustered index record might be needed */ UNIV_INTERN -ulint +bool lock_sec_rec_cons_read_sees( /*========================*/ const rec_t* rec, /*!< in: user record which @@ -574,7 +572,7 @@ lock_sec_rec_cons_read_sees( if (recv_recovery_is_on()) { - return(FALSE); + return(false); } max_trx_id = page_get_max_trx_id(page_align(rec)); @@ -593,12 +591,6 @@ lock_sys_create( { ulint lock_sys_sz; - srv_n_lock_wait_count = 0; - srv_n_lock_wait_time = 0; - srv_n_lock_max_wait_time = 0; - srv_lock_timeout_active = FALSE; - srv_n_lock_wait_current_count = 0; - lock_sys_sz = sizeof(*lock_sys) + OS_THREAD_MAX_N * sizeof(srv_slot_t); @@ -618,12 +610,14 @@ lock_sys_create( mutex_create(lock_sys_wait_mutex_key, &lock_sys->wait_mutex, SYNC_LOCK_WAIT_SYS); - lock_sys->rec_hash = hash_create(n_cells); + lock_sys->timeout_event = os_event_create(); - lock_latest_err_file = os_file_create_tmpfile(); - ut_a(lock_latest_err_file); + lock_sys->rec_hash = hash_create(n_cells); - srv_timeout_event = os_event_create(NULL); + if (!srv_read_only_mode) { + lock_latest_err_file = os_file_create_tmpfile(); + ut_a(lock_latest_err_file); + } } /*********************************************************************//** @@ -858,13 +852,16 @@ lock_reset_lock_and_trx_wait( /*=========================*/ lock_t* lock) /*!< in/out: record lock */ { - ut_ad(lock->trx->lock.wait_lock == lock); ut_ad(lock_get_wait(lock)); ut_ad(lock_mutex_own()); /* Reset the back pointer in trx to this waiting lock request */ - - lock->trx->lock.wait_lock = NULL; + if (!(lock->type_mode & LOCK_CONV_BY_OTHER)) { + ut_ad(lock->trx->lock.wait_lock == lock); + lock->trx->lock.wait_lock = NULL; + } else { + ut_ad(lock_get_type_low(lock) == LOCK_REC); + } lock->type_mode &= ~LOCK_WAIT; } @@ -1476,7 +1473,7 @@ Checks if a transaction has a GRANTED explicit lock on rec stronger or equal to 
precise_mode. @return lock or NULL */ UNIV_INLINE -const lock_t* +lock_t* lock_rec_has_expl( /*==============*/ ulint precise_mode,/*!< in: LOCK_S or LOCK_X @@ -1489,7 +1486,7 @@ lock_rec_has_expl( ulint heap_no,/*!< in: heap number of the record */ const trx_t* trx) /*!< in: transaction */ { - const lock_t* lock; + lock_t* lock; ut_ad(lock_mutex_own()); ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S @@ -1498,14 +1495,14 @@ lock_rec_has_expl( for (lock = lock_rec_get_first(block, heap_no); lock != NULL; - lock = lock_rec_get_next_const(heap_no, lock)) { + lock = lock_rec_get_next(heap_no, lock)) { if (lock->trx == trx + && !lock_is_wait_not_by_other(lock->type_mode) && lock_mode_stronger_or_eq( lock_get_mode(lock), static_cast<enum lock_mode>( precise_mode & LOCK_MODE_MASK)) - && !lock_get_wait(lock) && (!lock_rec_get_rec_not_gap(lock) || (precise_mode & LOCK_REC_NOT_GAP) || heap_no == PAGE_HEAP_NO_SUPREMUM) @@ -1756,6 +1753,7 @@ lock_rec_create( ut_ad(lock_mutex_own()); ut_ad(caller_owns_trx_mutex == trx_mutex_own(trx)); + ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); /* Non-locking autocommit read-only transactions should not set any locks. */ @@ -1813,7 +1811,7 @@ lock_rec_create( } ut_ad(trx_mutex_own(trx)); - if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) { + if (lock_is_wait_not_by_other(type_mode)) { lock_set_lock_and_trx_wait(lock, trx); } @@ -1838,7 +1836,7 @@ DB_SUCCESS_LOCKED_REC; DB_SUCCESS_LOCKED_REC means that there was a deadlock, but another transaction was chosen as a victim, and we got the lock immediately: no need to wait then */ static -enum db_err +dberr_t lock_rec_enqueue_waiting( /*=====================*/ ulint type_mode,/*!< in: lock mode this @@ -1853,14 +1851,16 @@ lock_rec_enqueue_waiting( const buf_block_t* block, /*!< in: buffer block containing the record */ ulint heap_no,/*!< in: heap number of the record */ + lock_t* lock, /*!< in: lock object; NULL if a new + one should be created. 
*/ dict_index_t* index, /*!< in: index of record */ que_thr_t* thr) /*!< in: query thread */ { trx_t* trx; - lock_t* lock; trx_id_t victim_trx_id; ut_ad(lock_mutex_own()); + ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); trx = thr_get_trx(thr); @@ -1893,10 +1893,20 @@ lock_rec_enqueue_waiting( ut_ad(0); } - /* Enqueue the lock request that will wait to be granted, note that - we already own the trx mutex. */ - lock = lock_rec_create( - type_mode | LOCK_WAIT, block, heap_no, index, trx, TRUE); + if (lock == NULL) { + /* Enqueue the lock request that will wait + to be granted, note that we already own + the trx mutex. */ + lock = lock_rec_create( + type_mode | LOCK_WAIT, block, heap_no, + index, trx, TRUE); + } else { + ut_ad(lock->type_mode & LOCK_WAIT); + ut_ad(lock->type_mode & LOCK_CONV_BY_OTHER); + + lock->type_mode &= ~LOCK_CONV_BY_OTHER; + lock_set_lock_and_trx_wait(lock, trx); + } /* Release the mutex to obey the latching order. This is safe, because lock_deadlock_check_and_resolve() @@ -1979,6 +1989,7 @@ lock_rec_add_to_queue( ut_ad(lock_mutex_own()); ut_ad(caller_owns_trx_mutex == trx_mutex_own(trx)); + ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); #ifdef UNIV_DEBUG switch (type_mode & LOCK_MODE_MASK) { case LOCK_X: @@ -2100,6 +2111,7 @@ lock_rec_lock_fast( ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP || mode - (LOCK_MODE_MASK & mode) == 0 || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP); + ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); lock = lock_rec_get_first_on_page(block); @@ -2146,7 +2158,7 @@ lock, or in the case of a page supremum record, a gap type lock. 
@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ static -enum db_err +dberr_t lock_rec_lock_slow( /*===============*/ ibool impl, /*!< in: if TRUE, no lock is set @@ -2163,7 +2175,8 @@ lock_rec_lock_slow( que_thr_t* thr) /*!< in: query thread */ { trx_t* trx; - enum db_err err = DB_SUCCESS; + lock_t* lock; + dberr_t err = DB_SUCCESS; ut_ad(lock_mutex_own()); ut_ad((LOCK_MODE_MASK & mode) != LOCK_S @@ -2175,12 +2188,33 @@ lock_rec_lock_slow( ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP || mode - (LOCK_MODE_MASK & mode) == 0 || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP); + ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); trx = thr_get_trx(thr); trx_mutex_enter(trx); - if (lock_rec_has_expl(mode, block, heap_no, trx)) { + lock = lock_rec_has_expl(mode, block, heap_no, trx); + if (lock) { + if (lock->type_mode & LOCK_CONV_BY_OTHER) { + /* This lock or lock waiting was created by the other + transaction, not by the transaction (trx) itself. + So, the transaction (trx) should treat it collectly + according as whether granted or not. */ + + if (lock->type_mode & LOCK_WAIT) { + /* This lock request was not granted yet. + Should wait for granted. */ + + goto enqueue_waiting; + } else { + /* This lock request was already granted. + Just clearing the flag. */ + + lock->type_mode &= ~LOCK_CONV_BY_OTHER; + } + } + /* The trx already has a strong enough lock on rec: do nothing */ @@ -2193,8 +2227,10 @@ lock_rec_lock_slow( have a lock strong enough already granted on the record, we have to wait. */ + ut_ad(lock == NULL); +enqueue_waiting: err = lock_rec_enqueue_waiting( - mode, block, heap_no, index, thr); + mode, block, heap_no, lock, index, thr); } else if (!impl) { /* Set the requested lock on the record, note that @@ -2220,7 +2256,7 @@ of a page supremum record, a gap type lock. 
@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ static -enum db_err +dberr_t lock_rec_lock( /*==========*/ ibool impl, /*!< in: if TRUE, no lock is set @@ -2246,6 +2282,7 @@ lock_rec_lock( ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP || mode - (LOCK_MODE_MASK & mode) == 0); + ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); /* We try a simplified and faster subroutine for the most common cases */ @@ -2348,7 +2385,8 @@ lock_grant( TRX_QUE_LOCK_WAIT state, and there is no need to end the lock wait for it */ - if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) { + if (!(lock->type_mode & LOCK_CONV_BY_OTHER) + && lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) { que_thr_t* thr; thr = que_thr_end_lock_wait(lock->trx); @@ -2375,6 +2413,7 @@ lock_rec_cancel( ut_ad(lock_mutex_own()); ut_ad(lock_get_type_low(lock) == LOCK_REC); + ut_ad(!(lock->type_mode & LOCK_CONV_BY_OTHER)); /* Reset the bit (there can be only one set bit) in the lock bitmap */ lock_rec_reset_nth_bit(lock, lock_rec_find_set_bit(lock)); @@ -2541,8 +2580,12 @@ lock_rec_reset_and_release_wait( lock != NULL; lock = lock_rec_get_next(heap_no, lock)) { - if (lock_get_wait(lock)) { + if (lock_is_wait_not_by_other(lock->type_mode)) { lock_rec_cancel(lock); + } else if (lock_get_wait(lock)) { + /* just reset LOCK_WAIT */ + lock_rec_reset_nth_bit(lock, heap_no); + lock_reset_lock_and_trx_wait(lock); } else { lock_rec_reset_nth_bit(lock, heap_no); } @@ -3439,11 +3482,13 @@ lock_deadlock_start_print() /*=======================*/ { ut_ad(lock_mutex_own()); + ut_ad(!srv_read_only_mode); rewind(lock_latest_err_file); ut_print_timestamp(lock_latest_err_file); if (srv_print_all_deadlocks) { + ut_print_timestamp(stderr); fprintf(stderr, "InnoDB: transactions deadlock detected, " "dumping detailed information.\n"); ut_print_timestamp(stderr); @@ -3458,10 +3503,12 @@ lock_deadlock_fputs( 
/*================*/ const char* msg) /*!< in: message to print */ { - fputs(msg, lock_latest_err_file); + if (!srv_read_only_mode) { + fputs(msg, lock_latest_err_file); - if (srv_print_all_deadlocks) { - fputs(msg, stderr); + if (srv_print_all_deadlocks) { + fputs(msg, stderr); + } } } @@ -3475,24 +3522,21 @@ lock_deadlock_trx_print( ulint max_query_len) /*!< in: max query length to print, or 0 to use the default max length */ { - ulint n_lock_rec; - ulint n_lock_struct; - ulint heap_size; - ut_ad(lock_mutex_own()); + ut_ad(!srv_read_only_mode); - n_lock_rec = lock_number_of_rows_locked(&trx->lock); - n_lock_struct = UT_LIST_GET_LEN(trx->lock.trx_locks); - heap_size = mem_heap_get_size(trx->lock.lock_heap); + ulint n_rec_locks = lock_number_of_rows_locked(&trx->lock); + ulint n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks); + ulint heap_size = mem_heap_get_size(trx->lock.lock_heap); mutex_enter(&trx_sys->mutex); trx_print_low(lock_latest_err_file, trx, max_query_len, - n_lock_rec, n_lock_struct, heap_size); + n_rec_locks, n_trx_locks, heap_size); if (srv_print_all_deadlocks) { trx_print_low(stderr, trx, max_query_len, - n_lock_rec, n_lock_struct, heap_size); + n_rec_locks, n_trx_locks, heap_size); } mutex_exit(&trx_sys->mutex); @@ -3507,6 +3551,7 @@ lock_deadlock_lock_print( const lock_t* lock) /*!< in: record or table type lock */ { ut_ad(lock_mutex_own()); + ut_ad(!srv_read_only_mode); if (lock_get_type_low(lock) == LOCK_REC) { lock_rec_print(lock_latest_err_file, lock); @@ -3629,6 +3674,7 @@ lock_deadlock_notify( deadlock */ { ut_ad(lock_mutex_own()); + ut_ad(!srv_read_only_mode); lock_deadlock_start_print(); @@ -3648,9 +3694,15 @@ lock_deadlock_notify( lock_deadlock_lock_print(lock); - lock_deadlock_fputs("*** (2) WAITING FOR THIS LOCK TO BE GRANTED:\n"); + /* It is possible that the joining transaction was granted its + lock when we rolled back some other waiting transaction. 
*/ + + if (ctx->start->lock.wait_lock != 0) { + lock_deadlock_fputs( + "*** (2) WAITING FOR THIS LOCK TO BE GRANTED:\n"); - lock_deadlock_lock_print(ctx->start->lock.wait_lock); + lock_deadlock_lock_print(ctx->start->lock.wait_lock); + } #ifdef UNIV_DEBUG if (lock_print_waits) { @@ -3669,6 +3721,7 @@ lock_deadlock_select_victim( const lock_deadlock_ctx_t* ctx) /*!< in: deadlock context */ { ut_ad(lock_mutex_own()); + ut_ad(ctx->start->lock.wait_lock != 0); ut_ad(ctx->wait_lock->trx != ctx->start); if (trx_weight_ge(ctx->wait_lock->trx, ctx->start)) { @@ -3694,8 +3747,10 @@ lock_deadlock_check( { ut_ad(lock_mutex_own()); - /* If it is the joining transaction wait lock. */ - if (lock == ctx->start->lock.wait_lock) { + /* If it is the joining transaction wait lock or the joining + transaction was granted its lock due to deadlock detection. */ + if (lock == ctx->start->lock.wait_lock + || ctx->start->lock.wait_lock == NULL) { ; /* Skip */ } else if (lock == ctx->wait_lock) { @@ -3776,7 +3831,8 @@ lock_deadlock_push( } /********************************************************************//** -Looks iteratively for a deadlock. +Looks iteratively for a deadlock. Note: the joining transaction may +have been granted its lock by the deadlock checks. @return 0 if no deadlock else the victim transaction id.*/ static trx_id_t @@ -3811,7 +3867,9 @@ lock_deadlock_search( /* Found a cycle. 
*/ - lock_deadlock_notify(ctx, lock); + if (!srv_read_only_mode) { + lock_deadlock_notify(ctx, lock); + } return(lock_deadlock_select_victim(ctx)->id); @@ -3882,6 +3940,7 @@ lock_deadlock_joining_trx_print( const lock_t* lock) /*!< in: lock trx wants */ { ut_ad(lock_mutex_own()); + ut_ad(!srv_read_only_mode); /* If the lock search exceeds the max step or the max depth, the current trx will be @@ -3968,7 +4027,9 @@ lock_deadlock_check_and_resolve( ut_a(trx == ctx.start); ut_a(victim_trx_id == trx->id); - lock_deadlock_joining_trx_print(trx, lock); + if (!srv_read_only_mode) { + lock_deadlock_joining_trx_print(trx, lock); + } MONITOR_INC(MONITOR_DEADLOCK); @@ -4017,6 +4078,7 @@ lock_table_create( ut_ad(table && trx); ut_ad(lock_mutex_own()); ut_ad(trx_mutex_own(trx)); + ut_ad(!(type_mode & LOCK_CONV_BY_OTHER)); /* Non-locking autocommit read-only transactions should not set any locks. */ @@ -4203,7 +4265,7 @@ DB_SUCCESS; DB_SUCCESS means that there was a deadlock, but another transaction was chosen as a victim, and we got the lock immediately: no need to wait then */ static -ulint +dberr_t lock_table_enqueue_waiting( /*=======================*/ ulint mode, /*!< in: lock mode this transaction is @@ -4333,7 +4395,7 @@ Locks the specified database table in the mode given. If the lock cannot be granted immediately, the query thread is put to wait. @return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ UNIV_INTERN -ulint +dberr_t lock_table( /*=======*/ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is set, @@ -4344,7 +4406,7 @@ lock_table( que_thr_t* thr) /*!< in: query thread */ { trx_t* trx; - ulint err; + dberr_t err; const lock_t* wait_for; ut_ad(table && thr); @@ -4570,11 +4632,38 @@ lock_release( lock = UT_LIST_GET_LAST(trx->lock.trx_locks)) { if (lock_get_type_low(lock) == LOCK_REC) { - lock_rec_dequeue_from_page(lock); +#ifdef UNIV_DEBUG + /* Check if the transcation locked a record + in a system table in X mode. 
It should have set + the dict_op code correctly if it did. */ + if (lock->index->table->id < DICT_HDR_FIRST_ID + && lock_get_mode(lock) == LOCK_X) { + + ut_ad(lock_get_mode(lock) != LOCK_IX); + ut_ad(trx->dict_operation != TRX_DICT_OP_NONE); + } +#endif /* UNIV_DEBUG */ + + lock_rec_dequeue_from_page(lock); } else { + dict_table_t* table; + + table = lock->un_member.tab_lock.table; +#ifdef UNIV_DEBUG ut_ad(lock_get_type_low(lock) & LOCK_TABLE); + /* Check if the transcation locked a system table + in IX mode. It should have set the dict_op code + correctly if it did. */ + if (table->id < DICT_HDR_FIRST_ID + && (lock_get_mode(lock) == LOCK_X + || lock_get_mode(lock) == LOCK_IX)) { + + ut_ad(trx->dict_operation != TRX_DICT_OP_NONE); + } +#endif /* UNIV_DEBUG */ + if (lock_get_mode(lock) != LOCK_IS && trx->undo_no != 0) { @@ -4582,8 +4671,7 @@ lock_release( block the use of the MySQL query cache for all currently active transactions. */ - lock->un_member.tab_lock.table - ->query_cache_inv_trx_id = max_trx_id; + table->query_cache_inv_trx_id = max_trx_id; } lock_table_dequeue(lock); @@ -5059,7 +5147,9 @@ lock_print_info_summary( "LATEST DETECTED DEADLOCK\n" "------------------------\n", file); - ut_copy_file(file, lock_latest_err_file); + if (!srv_read_only_mode) { + ut_copy_file(file, lock_latest_err_file); + } } fputs("------------\n" @@ -5085,6 +5175,10 @@ lock_print_info_summary( /* Should never be in this state while the system is running. */ ut_error; + case PURGE_STATE_DISABLED: + fprintf(file, "disabled"); + break; + case PURGE_STATE_RUN: fprintf(file, "running"); /* Check if it is waiting for more data to arrive. 
*/ @@ -5418,6 +5512,8 @@ lock_rec_queue_validate( ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets)); ut_ad(lock_mutex_own() == locked_lock_trx_sys); + ut_ad(!index || dict_index_is_clust(index) + || !dict_index_is_online_ddl(index)); heap_no = page_rec_get_heap_no(rec); @@ -5694,20 +5790,26 @@ lock_rec_block_validate( If the lock exists in lock_rec_validate_page() we assert !block->page.file_page_was_freed. */ + buf_block_t* block; mtr_t mtr; - mtr_start(&mtr); + /* Make sure that the tablespace is not deleted while we are + trying to access the page. */ + if (!fil_inc_pending_ops(space)) { + mtr_start(&mtr); + block = buf_page_get_gen( + space, fil_space_get_zip_size(space), + page_no, RW_X_LATCH, NULL, + BUF_GET_POSSIBLY_FREED, + __FILE__, __LINE__, &mtr); - buf_block_t* block = buf_page_get_gen( - space, fil_space_get_zip_size(space), - page_no, RW_X_LATCH, NULL, - BUF_GET_POSSIBLY_FREED, - __FILE__, __LINE__, &mtr); + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); - buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + ut_ad(lock_rec_validate_page(block)); + mtr_commit(&mtr); - ut_ad(lock_rec_validate_page(block)); - mtr_commit(&mtr); + fil_decr_pending_ops(space); + } } /*********************************************************************//** @@ -5765,7 +5867,7 @@ the query thread to the lock wait state and inserts a waiting request for a gap x-lock to the lock queue. 
@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ UNIV_INTERN -ulint +dberr_t lock_rec_insert_check_and_lock( /*===========================*/ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is @@ -5783,10 +5885,13 @@ lock_rec_insert_check_and_lock( const rec_t* next_rec; trx_t* trx; lock_t* lock; - ulint err; + dberr_t err; ulint next_rec_heap_no; ut_ad(block->frame == page_align(rec)); + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); if (flags & BTR_NO_LOCKING_FLAG) { @@ -5803,11 +5908,9 @@ lock_rec_insert_check_and_lock( to hold trx->mutex here. */ /* When inserting a record into an index, the table must be at - least IX-locked or we must be building an index, in which case - the table must be at least S-locked. */ - ut_ad(lock_table_has(trx, index->table, LOCK_IX) - || (*index->name == TEMP_INDEX_PREFIX - && lock_table_has(trx, index->table, LOCK_S))); + least IX-locked. When we are building an index, we would pass + BTR_NO_LOCKING_FLAG and skip the locking altogether. */ + ut_ad(lock_table_has(trx, index->table, LOCK_IX)); lock = lock_rec_get_first(block, next_rec_heap_no); @@ -5850,7 +5953,7 @@ lock_rec_insert_check_and_lock( err = lock_rec_enqueue_waiting( LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION, - block, next_rec_heap_no, index, thr); + block, next_rec_heap_no, NULL, index, thr); trx_mutex_exit(trx); } else { @@ -5871,6 +5974,9 @@ lock_rec_insert_check_and_lock( page_update_max_trx_id(block, buf_block_get_page_zip(block), trx->id, mtr); + default: + /* We only care about the two return values. */ + break; } #ifdef UNIV_DEBUG @@ -5920,6 +6026,7 @@ lock_rec_convert_impl_to_expl( this transaction. The transaction may have been committed a long time ago. 
*/ } else { + ut_ad(!dict_index_is_online_ddl(index)); trx_id = lock_sec_rec_some_has_impl(rec, index, offsets); /* The transaction can be committed before the trx_is_active(trx_id, NULL) check below, because we are not @@ -5943,10 +6050,26 @@ lock_rec_convert_impl_to_expl( if (impl_trx != NULL && !lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, block, heap_no, impl_trx)) { + ulint type_mode = (LOCK_REC | LOCK_X + | LOCK_REC_NOT_GAP); + + /* If the delete-marked record was locked already, + we should reserve lock waiting for impl_trx as + implicit lock. Because cannot lock at this moment.*/ + + if (rec_get_deleted_flag(rec, rec_offs_comp(offsets)) + && lock_rec_other_has_conflicting( + static_cast<enum lock_mode> + (LOCK_X | LOCK_REC_NOT_GAP), block, + heap_no, impl_trx)) { + + type_mode |= (LOCK_WAIT + | LOCK_CONV_BY_OTHER); + } lock_rec_add_to_queue( - LOCK_REC | LOCK_X | LOCK_REC_NOT_GAP, - block, heap_no, index, impl_trx, FALSE); + type_mode, block, heap_no, index, + impl_trx, FALSE); } lock_mutex_exit(); @@ -5962,7 +6085,7 @@ lock wait state and inserts a waiting request for a record x-lock to the lock queue. @return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ UNIV_INTERN -ulint +dberr_t lock_clust_rec_modify_check_and_lock( /*=================================*/ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG @@ -5974,7 +6097,7 @@ lock_clust_rec_modify_check_and_lock( const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ que_thr_t* thr) /*!< in: query thread */ { - ulint err; + dberr_t err; ulint heap_no; ut_ad(rec_offs_validate(rec, index, offsets)); @@ -6020,7 +6143,7 @@ Checks if locks of other transactions prevent an immediate modify (delete mark or delete unmark) of a secondary index record. 
@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ UNIV_INTERN -ulint +dberr_t lock_sec_rec_modify_check_and_lock( /*===============================*/ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG @@ -6032,13 +6155,15 @@ lock_sec_rec_modify_check_and_lock( clustered index record first: see the comment below */ dict_index_t* index, /*!< in: secondary index */ - que_thr_t* thr, /*!< in: query thread */ + que_thr_t* thr, /*!< in: query thread + (can be NULL if BTR_NO_LOCKING_FLAG) */ mtr_t* mtr) /*!< in/out: mini-transaction */ { - ulint err; + dberr_t err; ulint heap_no; ut_ad(!dict_index_is_clust(index)); + ut_ad(!dict_index_is_online_ddl(index) || (flags & BTR_CREATE_FLAG)); ut_ad(block->frame == page_align(rec)); if (flags & BTR_NO_LOCKING_FLAG) { @@ -6103,7 +6228,7 @@ secondary index record. @return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ UNIV_INTERN -enum db_err +dberr_t lock_sec_rec_read_check_and_lock( /*=============================*/ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG @@ -6124,10 +6249,11 @@ lock_sec_rec_read_check_and_lock( LOCK_REC_NOT_GAP */ que_thr_t* thr) /*!< in: query thread */ { - enum db_err err; - ulint heap_no; + dberr_t err; + ulint heap_no; ut_ad(!dict_index_is_clust(index)); + ut_ad(!dict_index_is_online_ddl(index)); ut_ad(block->frame == page_align(rec)); ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec)); ut_ad(rec_offs_validate(rec, index, offsets)); @@ -6180,7 +6306,7 @@ lock on the record. 
@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ UNIV_INTERN -enum db_err +dberr_t lock_clust_rec_read_check_and_lock( /*===============================*/ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG @@ -6201,8 +6327,8 @@ lock_clust_rec_read_check_and_lock( LOCK_REC_NOT_GAP */ que_thr_t* thr) /*!< in: query thread */ { - enum db_err err; - ulint heap_no; + dberr_t err; + ulint heap_no; ut_ad(dict_index_is_clust(index)); ut_ad(block->frame == page_align(rec)); @@ -6230,7 +6356,8 @@ lock_clust_rec_read_check_and_lock( ut_ad(mode != LOCK_S || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); - err = lock_rec_lock(FALSE, mode | gap_mode, block, heap_no, index, thr); + err = lock_rec_lock(FALSE, mode | gap_mode, + block, heap_no, index, thr); MONITOR_INC(MONITOR_NUM_RECLOCK_REQ); @@ -6251,7 +6378,7 @@ lock_clust_rec_read_check_and_lock() that does not require the parameter "offsets". @return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ UNIV_INTERN -ulint +dberr_t lock_clust_rec_read_check_and_lock_alt( /*===================================*/ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG @@ -6274,7 +6401,7 @@ lock_clust_rec_read_check_and_lock_alt( mem_heap_t* tmp_heap = NULL; ulint offsets_[REC_OFFS_NORMAL_SIZE]; ulint* offsets = offsets_; - ulint err; + dberr_t err; rec_offs_init(offsets_); offsets = rec_get_offsets(rec, index, offsets, @@ -6469,6 +6596,8 @@ lock_get_table( { switch (lock_get_type_low(lock)) { case LOCK_REC: + ut_ad(dict_index_is_clust(lock->index) + || !dict_index_is_online_ddl(lock->index)); return(lock->index->table); case LOCK_TABLE: return(lock->un_member.tab_lock.table); @@ -6521,6 +6650,8 @@ lock_rec_get_index( const lock_t* lock) /*!< in: lock */ { ut_a(lock_get_type_low(lock) == LOCK_REC); + ut_ad(dict_index_is_clust(lock->index) + || !dict_index_is_online_ddl(lock->index)); return(lock->index); } @@ -6536,6 +6667,8 @@ lock_rec_get_index_name( const lock_t* lock) /*!< 
in: lock */ { ut_a(lock_get_type_low(lock) == LOCK_REC); + ut_ad(dict_index_is_clust(lock->index) + || !dict_index_is_online_ddl(lock->index)); return(lock->index->name); } @@ -6581,6 +6714,7 @@ lock_cancel_waiting_and_release( ut_ad(lock_mutex_own()); ut_ad(trx_mutex_own(lock->trx)); + ut_ad(!(lock->type_mode & LOCK_CONV_BY_OTHER)); lock->trx->lock.cancel = TRUE; @@ -6656,10 +6790,14 @@ lock_trx_release_locks( { assert_trx_in_list(trx); - if (UNIV_UNLIKELY(trx_state_eq(trx, TRX_STATE_PREPARED))) { + if (trx_state_eq(trx, TRX_STATE_PREPARED)) { mutex_enter(&trx_sys->mutex); ut_a(trx_sys->n_prepared_trx > 0); trx_sys->n_prepared_trx--; + if (trx->is_recovered) { + ut_a(trx_sys->n_prepared_recovered_trx > 0); + trx_sys->n_prepared_recovered_trx--; + } mutex_exit(&trx_sys->mutex); } else { ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); @@ -6714,12 +6852,12 @@ was selected as a deadlock victim, or if it has to wait then cancel the wait lock. @return DB_DEADLOCK, DB_LOCK_WAIT or DB_SUCCESS */ UNIV_INTERN -enum db_err +dberr_t lock_trx_handle_wait( /*=================*/ trx_t* trx) /*!< in/out: trx lock state */ { - enum db_err err; + dberr_t err; lock_mutex_enter(); @@ -6800,6 +6938,8 @@ lock_table_locks_lookup( ut_a(lock->trx == trx); if (lock_get_type_low(lock) == LOCK_REC) { + ut_ad(!dict_index_is_online_ddl(lock->index) + || dict_index_is_clust(lock->index)); if (lock->index->table == table) { return(lock); } @@ -6828,18 +6968,89 @@ lock_table_has_locks( lock_mutex_enter(); + has_locks = UT_LIST_GET_LEN(table->locks) > 0 || table->n_rec_locks > 0; + #ifdef UNIV_DEBUG - mutex_enter(&trx_sys->mutex); + if (!has_locks) { + mutex_enter(&trx_sys->mutex); - ut_ad(lock_table_locks_lookup(table, &trx_sys->rw_trx_list) == NULL); - ut_ad(lock_table_locks_lookup(table, &trx_sys->ro_trx_list) == NULL); + ut_ad(!lock_table_locks_lookup(table, &trx_sys->rw_trx_list)); + ut_ad(!lock_table_locks_lookup(table, &trx_sys->ro_trx_list)); - mutex_exit(&trx_sys->mutex); + 
mutex_exit(&trx_sys->mutex); + } #endif /* UNIV_DEBUG */ - has_locks = UT_LIST_GET_LEN(table->locks) > 0 || table->n_rec_locks > 0; - lock_mutex_exit(); return(has_locks); } + +#ifdef UNIV_DEBUG +/*******************************************************************//** +Check if the transaction holds any locks on the sys tables +or its records. +@return the strongest lock found on any sys table or 0 for none */ +UNIV_INTERN +const lock_t* +lock_trx_has_sys_table_locks( +/*=========================*/ + const trx_t* trx) /*!< in: transaction to check */ +{ + lint i; + const lock_t* strongest_lock = 0; + lock_mode strongest = LOCK_NONE; + + lock_mutex_enter(); + + /* Find a valid mode. Note: ib_vector_size() can be 0. */ + for (i = ib_vector_size(trx->lock.table_locks) - 1; i >= 0; --i) { + const lock_t* lock; + + lock = *static_cast<const lock_t**>( + ib_vector_get(trx->lock.table_locks, i)); + + if (lock != NULL + && dict_is_sys_table(lock->un_member.tab_lock.table->id)) { + + strongest = lock_get_mode(lock); + ut_ad(strongest != LOCK_NONE); + strongest_lock = lock; + break; + } + } + + if (strongest == LOCK_NONE) { + lock_mutex_exit(); + return(NULL); + } + + for (/* No op */; i >= 0; --i) { + const lock_t* lock; + + lock = *static_cast<const lock_t**>( + ib_vector_get(trx->lock.table_locks, i)); + + if (lock == NULL) { + continue; + } + + ut_ad(trx == lock->trx); + ut_ad(lock_get_type_low(lock) & LOCK_TABLE); + ut_ad(lock->un_member.tab_lock.table != NULL); + + lock_mode mode = lock_get_mode(lock); + + if (dict_is_sys_table(lock->un_member.tab_lock.table->id) + && lock_mode_stronger_or_eq(mode, strongest)) { + + strongest = mode; + strongest_lock = lock; + } + } + + lock_mutex_exit(); + + return(strongest_lock); +} +#endif /* UNIV_DEBUG */ diff --git a/storage/innobase/lock/lock0wait.cc b/storage/innobase/lock/lock0wait.cc index 99059f19813..fc355d8bb6d 100644 --- a/storage/innobase/lock/lock0wait.cc +++ b/storage/innobase/lock/lock0wait.cc @@ -33,14 +33,6 @@ 
Created 25/5/2010 Sunny Bains #include "ha_prototypes.h" #include "lock0priv.h" -UNIV_INTERN ibool srv_lock_timeout_active = FALSE; -UNIV_INTERN ulint srv_n_lock_wait_count = 0; -UNIV_INTERN ulint srv_n_lock_wait_current_count = 0; -UNIV_INTERN ib_int64_t srv_n_lock_wait_time = 0; -UNIV_INTERN ulint srv_n_lock_max_wait_time = 0; - -UNIV_INTERN os_event_t srv_timeout_event; - /*********************************************************************//** Print the contents of the lock_sys_t::waiting_threads array. */ static @@ -156,7 +148,7 @@ lock_wait_table_reserve_slot( slot->thr->slot = slot; if (slot->event == NULL) { - slot->event = os_event_create(NULL); + slot->event = os_event_create(); ut_a(slot->event); } @@ -257,8 +249,8 @@ lock_wait_suspend_thread( slot = lock_wait_table_reserve_slot(thr, lock_wait_timeout); if (thr->lock_state == QUE_THR_LOCK_ROW) { - srv_n_lock_wait_count++; - srv_n_lock_wait_current_count++; + srv_stats.n_lock_wait_count.inc(); + srv_stats.n_lock_wait_current_count.inc(); if (ut_usectime(&sec, &ms) == -1) { start_time = -1; @@ -269,7 +261,7 @@ lock_wait_suspend_thread( /* Wake the lock timeout monitor thread, if it is suspended */ - os_event_set(srv_timeout_event); + os_event_set(lock_sys->timeout_event); lock_wait_mutex_exit(); trx_mutex_exit(trx); @@ -282,6 +274,8 @@ lock_wait_suspend_thread( case RW_S_LATCH: /* Release foreign key check latch */ row_mysql_unfreeze_data_dictionary(trx); + + DEBUG_SYNC_C("lock_wait_release_s_latch_before_sleep"); break; default: /* There should never be a lock wait when the @@ -341,14 +335,16 @@ lock_wait_suspend_thread( diff_time = (ulint) (finish_time - start_time); - srv_n_lock_wait_current_count--; - srv_n_lock_wait_time = srv_n_lock_wait_time + diff_time; + srv_stats.n_lock_wait_current_count.dec(); + srv_stats.n_lock_wait_time.add(diff_time); - if (diff_time > srv_n_lock_max_wait_time && - /* only update the variable if we successfully - retrieved the start and finish times. See Bug#36819. 
*/ - start_time != -1 && finish_time != -1) { - srv_n_lock_max_wait_time = diff_time; + /* Only update the variable if we successfully + retrieved the start and finish times. See Bug#36819. */ + if (diff_time > lock_sys->n_lock_max_wait_time + && start_time != -1 + && finish_time != -1) { + + lock_sys->n_lock_max_wait_time = diff_time; } } @@ -463,11 +459,15 @@ DECLARE_THREAD(lock_wait_timeout_thread)( os_thread_create */ { ib_int64_t sig_count = 0; + os_event_t event = lock_sys->timeout_event; + + ut_ad(!srv_read_only_mode); #ifdef UNIV_PFS_THREAD pfs_register_thread(srv_lock_timeout_thread_key); -#endif - srv_lock_timeout_active = TRUE; +#endif /* UNIV_PFS_THREAD */ + + lock_sys->timeout_thread_active = true; do { srv_slot_t* slot; @@ -475,7 +475,8 @@ DECLARE_THREAD(lock_wait_timeout_thread)( /* When someone is waiting for a lock, we wake up every second and check if a timeout has passed for a lock wait */ - os_event_wait_time_low(srv_timeout_event, 1000000, sig_count); + os_event_wait_time_low(event, 1000000, sig_count); + sig_count = os_event_reset(event); if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) { break; @@ -500,13 +501,13 @@ DECLARE_THREAD(lock_wait_timeout_thread)( } } - sig_count = os_event_reset(srv_timeout_event); + sig_count = os_event_reset(event); lock_wait_mutex_exit(); } while (srv_shutdown_state < SRV_SHUTDOWN_CLEANUP); - srv_lock_timeout_active = FALSE; + lock_sys->timeout_thread_active = false; /* We count the number of threads in os_thread_exit(). A created thread should always use that to exit and not use return() to exit. */ diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index 5e4a9dcf515..b6909f4771a 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. 
+Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Google Inc. Portions of this file contain modifications contributed and copyrighted by @@ -37,7 +37,6 @@ Created 12/9/1995 Heikki Tuuri #endif #ifndef UNIV_HOTBACKUP -#include "ha_prototypes.h" #include "mem0mem.h" #include "buf0buf.h" #include "buf0flu.h" @@ -49,6 +48,7 @@ Created 12/9/1995 Heikki Tuuri #include "srv0start.h" #include "trx0sys.h" #include "trx0trx.h" +#include "ha_prototypes.h" #include "srv0mon.h" /* @@ -223,7 +223,7 @@ loop: log_buffer_flush_to_disk(); - srv_log_waits++; + srv_stats.log_waits.inc(); ut_ad(++count < 50); @@ -328,7 +328,7 @@ part_loop: goto part_loop; } - srv_log_write_requests++; + srv_stats.log_write_requests.inc(); } /************************************************************//** @@ -748,9 +748,6 @@ log_init(void) log_sys->lsn = LOG_START_LSN; - MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE, - log_sys->lsn - log_sys->last_checkpoint_lsn); - ut_a(LOG_BUFFER_SIZE >= 16 * OS_FILE_LOG_BLOCK_SIZE); ut_a(LOG_BUFFER_SIZE >= 4 * UNIV_PAGE_SIZE); @@ -784,11 +781,11 @@ log_init(void) log_sys->n_pending_writes = 0; - log_sys->no_flush_event = os_event_create(NULL); + log_sys->no_flush_event = os_event_create(); os_event_set(log_sys->no_flush_event); - log_sys->one_flushed_event = os_event_create(NULL); + log_sys->one_flushed_event = os_event_create(); os_event_set(log_sys->one_flushed_event); @@ -796,7 +793,6 @@ log_init(void) log_sys->next_checkpoint_no = 0; log_sys->last_checkpoint_lsn = log_sys->lsn; - MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE, 0); log_sys->n_pending_checkpoint_writes = 0; @@ -832,7 +828,7 @@ log_init(void) /* memset(log_sys->archive_buf, '\0', LOG_ARCHIVE_BUF_SIZE); */ - log_sys->archiving_on = os_event_create(NULL); + log_sys->archiving_on = os_event_create(); #endif /* UNIV_LOG_ARCHIVE */ /*----------------------------*/ @@ -1163,7 +1159,7 @@ log_group_file_header_flush( MONITOR_INC(MONITOR_LOG_IO); - 
srv_os_log_pending_writes++; + srv_stats.os_log_pending_writes.inc(); fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->space_id, 0, (ulint) (dest_offset / UNIV_PAGE_SIZE), @@ -1171,7 +1167,7 @@ log_group_file_header_flush( OS_FILE_LOG_BLOCK_SIZE, buf, group); - srv_os_log_pending_writes--; + srv_stats.os_log_pending_writes.dec(); } } @@ -1238,8 +1234,9 @@ loop: log_group_file_header_flush(group, (ulint) (next_offset / group->file_size), start_lsn); - srv_os_log_written += OS_FILE_LOG_BLOCK_SIZE; - srv_log_writes++; + srv_stats.os_log_written.add(OS_FILE_LOG_BLOCK_SIZE); + + srv_stats.log_writes.inc(); } if ((next_offset % group->file_size) + len > group->file_size) { @@ -1289,7 +1286,7 @@ loop: MONITOR_INC(MONITOR_LOG_IO); - srv_os_log_pending_writes++; + srv_stats.os_log_pending_writes.inc(); ut_a(next_offset / UNIV_PAGE_SIZE <= ULINT_MAX); @@ -1298,10 +1295,10 @@ loop: (ulint) (next_offset % UNIV_PAGE_SIZE), write_len, buf, group); - srv_os_log_pending_writes--; + srv_stats.os_log_pending_writes.dec(); - srv_os_log_written += write_len; - srv_log_writes++; + srv_stats.os_log_written.add(write_len); + srv_stats.log_writes.inc(); } if (write_len < len) { @@ -1345,6 +1342,8 @@ log_write_up_to( ib_uint64_t write_lsn; ib_uint64_t flush_lsn; + ut_ad(!srv_read_only_mode); + if (recv_no_ibuf_operations) { /* Recovery is running and no operations on the log files are allowed yet (the variable name .._no_ibuf_.. is misleading) */ @@ -1560,6 +1559,7 @@ log_buffer_flush_to_disk(void) { lsn_t lsn; + ut_ad(!srv_read_only_mode); mutex_enter(&(log_sys->mutex)); lsn = log_sys->lsn; @@ -1626,15 +1626,16 @@ log_flush_margin(void) Advances the smallest lsn for which there are unflushed dirty blocks in the buffer pool. NOTE: this function may only be called if the calling thread owns no synchronization objects! 
-@return FALSE if there was a flush batch of the same type running, +@return false if there was a flush batch of the same type running, which means that we could not start this flush batch */ static -ibool +bool log_preflush_pool_modified_pages( /*=============================*/ lsn_t new_oldest) /*!< in: try to advance oldest_modified_lsn at least to this lsn */ { + bool success; ulint n_pages; if (recv_recovery_on) { @@ -1650,13 +1651,12 @@ log_preflush_pool_modified_pages( recv_apply_hashed_log_recs(TRUE); } - n_pages = buf_flush_list(ULINT_MAX, new_oldest); + success = buf_flush_list(ULINT_MAX, new_oldest, &n_pages); buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); - if (n_pages == ULINT_UNDEFINED) { - - return(FALSE); + if (!success) { + MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS); } MONITOR_INC_VALUE_CUMULATIVE( @@ -1665,7 +1665,7 @@ log_preflush_pool_modified_pages( MONITOR_FLUSH_SYNC_PAGES, n_pages); - return(TRUE); + return(success); } /******************************************************//** @@ -1765,6 +1765,7 @@ log_group_checkpoint( byte* buf; ulint i; + ut_ad(!srv_read_only_mode); ut_ad(mutex_own(&(log_sys->mutex))); #if LOG_CHECKPOINT_SIZE > OS_FILE_LOG_BLOCK_SIZE # error "LOG_CHECKPOINT_SIZE > OS_FILE_LOG_BLOCK_SIZE" @@ -1952,12 +1953,13 @@ log_groups_write_checkpoint_info(void) ut_ad(mutex_own(&(log_sys->mutex))); - group = UT_LIST_GET_FIRST(log_sys->log_groups); - - while (group) { - log_group_checkpoint(group); + if (!srv_read_only_mode) { + for (group = UT_LIST_GET_FIRST(log_sys->log_groups); + group; + group = UT_LIST_GET_NEXT(log_groups, group)) { - group = UT_LIST_GET_NEXT(log_groups, group); + log_group_checkpoint(group); + } } } @@ -1982,6 +1984,8 @@ log_checkpoint( { lsn_t oldest_lsn; + ut_ad(!srv_read_only_mode); + if (recv_recovery_is_on()) { recv_apply_hashed_log_recs(TRUE); } @@ -2088,38 +2092,6 @@ log_make_checkpoint_at( } /****************************************************************//** -Checks if an asynchronous flushing of dirty 
pages is required in the -background. This function is only called from the page cleaner thread. -@return lsn to which the flushing should happen or LSN_MAX -if flushing is not required */ -UNIV_INTERN -lsn_t -log_async_flush_lsn(void) -/*=====================*/ -{ - lsn_t age; - lsn_t oldest_lsn; - lsn_t new_lsn = LSN_MAX; - - mutex_enter(&log_sys->mutex); - - oldest_lsn = log_buf_pool_get_oldest_modification(); - - ut_a(log_sys->lsn >= oldest_lsn); - age = log_sys->lsn - oldest_lsn; - - if (age > log_sys->max_modified_age_async) { - /* An asynchronous preflush is required */ - ut_a(log_sys->lsn >= log_sys->max_modified_age_async); - new_lsn = log_sys->lsn - log_sys->max_modified_age_async; - } - - mutex_exit(&log_sys->mutex); - - return(new_lsn); -} - -/****************************************************************//** Tries to establish a big enough margin of free space in the log groups, such that a new log entry can be catenated without an immediate need for a checkpoint. NOTE: this function may only be called if the calling thread @@ -2136,7 +2108,7 @@ log_checkpoint_margin(void) lsn_t oldest_lsn; ibool checkpoint_sync; ibool do_checkpoint; - ibool success; + bool success; loop: checkpoint_sync = FALSE; do_checkpoint = FALSE; @@ -3131,10 +3103,8 @@ logs_empty_and_mark_files_at_shutdown(void) const char* thread_name; ibool server_busy; - if (srv_print_verbose_log) { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Starting shutdown...\n"); - } + ib_logf(IB_LOG_LEVEL_INFO, "Starting shutdown..."); + /* Wait until the master thread and all other operations are idle: our algorithm only works if the server is idle at shutdown */ @@ -3155,9 +3125,8 @@ loop: threads check will be done later. 
*/ if (srv_print_verbose_log && count > 600) { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Waiting for %s to exit\n", - thread_name); + ib_logf(IB_LOG_LEVEL_INFO, + "Waiting for %s to exit", thread_name); count = 0; } @@ -3174,9 +3143,8 @@ loop: if (total_trx > 0) { if (srv_print_verbose_log && count > 600) { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Waiting for %lu " - "active transactions to finish\n", + ib_logf(IB_LOG_LEVEL_INFO, + "Waiting for %lu active transactions to finish", (ulong) total_trx); count = 0; @@ -3221,9 +3189,9 @@ loop: break; } - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Waiting for %s " - "to be suspended\n", thread_type); + ib_logf(IB_LOG_LEVEL_INFO, + "Waiting for %s to be suspended", + thread_type); count = 0; } @@ -3239,10 +3207,9 @@ loop: ++count; os_thread_sleep(100000); if (srv_print_verbose_log && count > 600) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Waiting for page_cleaner to " - "finish flushing of buffer pool\n"); + ib_logf(IB_LOG_LEVEL_INFO, + "Waiting for page_cleaner to " + "finish flushing of buffer pool"); count = 0; } } @@ -3257,10 +3224,9 @@ loop: if (server_busy) { if (srv_print_verbose_log && count > 600) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Pending checkpoint_writes: %lu\n" - " InnoDB: Pending log flush writes: %lu\n", + ib_logf(IB_LOG_LEVEL_INFO, + "Pending checkpoint_writes: %lu. 
" + "Pending log flush writes: %lu", (ulong) log_sys->n_pending_checkpoint_writes, (ulong) log_sys->n_pending_writes); count = 0; @@ -3272,9 +3238,8 @@ loop: if (pending_io) { if (srv_print_verbose_log && count > 600) { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Waiting for %lu buffer page " - "I/Os to complete\n", + ib_logf(IB_LOG_LEVEL_INFO, + "Waiting for %lu buffer page I/Os to complete", (ulong) pending_io); count = 0; } @@ -3286,41 +3251,50 @@ loop: log_archive_all(); #endif /* UNIV_LOG_ARCHIVE */ if (srv_fast_shutdown == 2) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: MySQL has requested a very fast shutdown" - " without flushing " - "the InnoDB buffer pool to data files." - " At the next mysqld startup " - "InnoDB will do a crash recovery!\n"); - - /* In this fastest shutdown we do not flush the buffer pool: - it is essentially a 'crash' of the InnoDB server. Make sure - that the log is all flushed to disk, so that we can recover - all committed transactions in a crash recovery. We must not - write the lsn stamps to the data files, since at a startup - InnoDB deduces from the stamps if the previous shutdown was - clean. */ - - log_buffer_flush_to_disk(); - - /* Check that the background threads stay suspended */ - thread_name = srv_any_background_threads_are_active(); - if (thread_name != NULL) { - fprintf(stderr, - "InnoDB: Warning: background thread %s" - " woke up during shutdown\n", thread_name); - goto loop; + if (!srv_read_only_mode) { + ib_logf(IB_LOG_LEVEL_INFO, + "MySQL has requested a very fast shutdown " + "without flushing the InnoDB buffer pool to " + "data files. At the next mysqld startup " + "InnoDB will do a crash recovery!"); + + /* In this fastest shutdown we do not flush the + buffer pool: + + it is essentially a 'crash' of the InnoDB server. + Make sure that the log is all flushed to disk, so + that we can recover all committed transactions in + a crash recovery. 
We must not write the lsn stamps + to the data files, since at a startup InnoDB deduces + from the stamps if the previous shutdown was clean. */ + + log_buffer_flush_to_disk(); + + /* Check that the background threads stay suspended */ + thread_name = srv_any_background_threads_are_active(); + + if (thread_name != NULL) { + ib_logf(IB_LOG_LEVEL_WARN, + "Background thread %s woke up " + "during shutdown", thread_name); + goto loop; + } } srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE; + fil_close_all_files(); + thread_name = srv_any_background_threads_are_active(); + ut_a(!thread_name); + return; } - log_make_checkpoint_at(LSN_MAX, TRUE); + if (!srv_read_only_mode) { + log_make_checkpoint_at(LSN_MAX, TRUE); + } mutex_enter(&log_sys->mutex); @@ -3356,15 +3330,17 @@ loop: /* Check that the background threads stay suspended */ thread_name = srv_any_background_threads_are_active(); if (thread_name != NULL) { - fprintf(stderr, - "InnoDB: Warning: background thread %s" - " woke up during shutdown\n", thread_name); + ib_logf(IB_LOG_LEVEL_WARN, + "Background thread %s woke up during shutdown", + thread_name); goto loop; } - fil_flush_file_spaces(FIL_TABLESPACE); - fil_flush_file_spaces(FIL_LOG); + if (!srv_read_only_mode) { + fil_flush_file_spaces(FIL_TABLESPACE); + fil_flush_file_spaces(FIL_LOG); + } /* The call fil_write_flushed_lsn_to_data_files() will pass the buffer pool: therefore it is essential that the buffer pool has been @@ -3374,9 +3350,8 @@ loop: if (!buf_all_freed()) { if (srv_print_verbose_log && count > 600) { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Waiting for dirty buffer " - "pages to be flushed\n"); + ib_logf(IB_LOG_LEVEL_INFO, + "Waiting for dirty buffer pages to be flushed"); count = 0; } @@ -3386,31 +3361,38 @@ loop: srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE; /* Make some checks that the server really is quiet */ - ut_a(srv_get_active_thread_type() == SRV_NONE); + srv_thread_type type = srv_get_active_thread_type(); + ut_a(type == 
SRV_NONE); + + bool freed = buf_all_freed(); + ut_a(freed); - ut_a(buf_all_freed()); ut_a(lsn == log_sys->lsn); if (lsn < srv_start_lsn) { - fprintf(stderr, - "InnoDB: Error: log sequence number" - " at shutdown " LSN_PF "\n" - "InnoDB: is lower than at startup " LSN_PF "!\n", + ib_logf(IB_LOG_LEVEL_ERROR, + "Log sequence number at shutdown " LSN_PF " " + "is lower than at startup " LSN_PF "!", lsn, srv_start_lsn); } srv_shutdown_lsn = lsn; - fil_write_flushed_lsn_to_data_files(lsn, arch_log_no); + if (!srv_read_only_mode) { + fil_write_flushed_lsn_to_data_files(lsn, arch_log_no); - fil_flush_file_spaces(FIL_TABLESPACE); + fil_flush_file_spaces(FIL_TABLESPACE); + } fil_close_all_files(); /* Make some checks that the server really is quiet */ - ut_a(srv_get_active_thread_type() == SRV_NONE); + type = srv_get_active_thread_type(); + ut_a(type == SRV_NONE); + + freed = buf_all_freed(); + ut_a(freed); - ut_a(buf_all_freed()); ut_a(lsn == log_sys->lsn); } @@ -3544,7 +3526,7 @@ log_refresh_stats(void) log_sys->last_printout_time = time(NULL); } -/********************************************************************** +/********************************************************//** Closes a log group. */ static void @@ -3574,12 +3556,12 @@ log_group_close( mem_free(group); } -/********************************************************** -Shutdown the log system but do not release all the memory. */ +/********************************************************//** +Closes all log groups. */ UNIV_INTERN void -log_shutdown(void) -/*==============*/ +log_group_close_all(void) +/*=====================*/ { log_group_t* group; @@ -3593,6 +3575,16 @@ log_shutdown(void) log_group_close(prev_group); } +} + +/********************************************************//** +Shutdown the log system but do not release all the memory. 
*/ +UNIV_INTERN +void +log_shutdown(void) +/*==============*/ +{ + log_group_close_all(); mem_free(log_sys->buf_ptr); log_sys->buf_ptr = NULL; @@ -3610,7 +3602,7 @@ log_shutdown(void) #ifdef UNIV_LOG_ARCHIVE rw_lock_free(&log_sys->archive_lock); - os_event_create(log_sys->archiving_on); + os_event_create(); #endif /* UNIV_LOG_ARCHIVE */ #ifdef UNIV_LOG_DEBUG @@ -3620,7 +3612,7 @@ log_shutdown(void) recv_sys_close(); } -/********************************************************** +/********************************************************//** Free the log system data structures. */ UNIV_INTERN void diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index f914fc2676c..8cefa9e4b70 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -1,6 +1,7 @@ /***************************************************************************** -Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -42,8 +43,6 @@ Created 9/20/1997 Heikki Tuuri #include "trx0undo.h" #include "trx0rec.h" #include "fil0fil.h" -#include "buf0dblwr.h" -#include "srv0mon.h" #ifndef UNIV_HOTBACKUP # include "buf0rea.h" # include "srv0srv.h" @@ -158,6 +157,20 @@ UNIV_INTERN mysql_pfs_key_t trx_rollback_clean_thread_key; UNIV_INTERN mysql_pfs_key_t recv_sys_mutex_key; #endif /* UNIV_PFS_MUTEX */ +#ifndef UNIV_HOTBACKUP +# ifdef UNIV_PFS_THREAD +UNIV_INTERN mysql_pfs_key_t recv_writer_thread_key; +# endif /* UNIV_PFS_THREAD */ + +# ifdef UNIV_PFS_MUTEX +UNIV_INTERN mysql_pfs_key_t recv_writer_mutex_key; +# endif /* UNIV_PFS_MUTEX */ + +/** Flag indicating if recv_writer thread is active. 
*/ +UNIV_INTERN bool recv_writer_thread_active = false; +UNIV_INTERN os_thread_t recv_writer_thread_handle = 0; +#endif /* !UNIV_HOTBACKUP */ + /* prototypes */ #ifndef UNIV_HOTBACKUP @@ -186,6 +199,11 @@ recv_sys_create(void) mutex_create(recv_sys_mutex_key, &recv_sys->mutex, SYNC_RECV); +#ifndef UNIV_HOTBACKUP + mutex_create(recv_writer_mutex_key, &recv_sys->writer_mutex, + SYNC_LEVEL_VARYING); +#endif /* !UNIV_HOTBACKUP */ + recv_sys->heap = NULL; recv_sys->addr_hash = NULL; } @@ -214,6 +232,11 @@ recv_sys_close(void) mem_free(recv_sys->last_block_buf_start); } +#ifndef UNIV_HOTBACKUP + ut_ad(!recv_writer_thread_active); + mutex_free(&recv_sys->writer_mutex); +#endif /* !UNIV_HOTBACKUP */ + mutex_free(&recv_sys->mutex); mem_free(recv_sys); @@ -290,6 +313,58 @@ recv_sys_var_init(void) recv_max_page_lsn = 0; } + +/******************************************************************//** +recv_writer thread tasked with flushing dirty pages from the buffer +pools. +@return a dummy parameter */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(recv_writer_thread)( +/*===============================*/ + void* arg __attribute__((unused))) + /*!< in: a dummy parameter required by + os_thread_create */ +{ + ut_ad(!srv_read_only_mode); + +#ifdef UNIV_PFS_THREAD + pfs_register_thread(recv_writer_thread_key); +#endif /* UNIV_PFS_THREAD */ + +#ifdef UNIV_DEBUG_THREAD_CREATION + fprintf(stderr, "InnoDB: recv_writer thread running, id %lu\n", + os_thread_pf(os_thread_get_curr_id())); +#endif /* UNIV_DEBUG_THREAD_CREATION */ + + recv_writer_thread_active = true; + + while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { + + os_thread_sleep(100000); + + mutex_enter(&recv_sys->writer_mutex); + + if (!recv_recovery_on) { + mutex_exit(&recv_sys->writer_mutex); + break; + } + + /* Flush pages from end of LRU if required */ + buf_flush_LRU_tail(); + + mutex_exit(&recv_sys->writer_mutex); + } + + recv_writer_thread_active = false; + + /* We count the number of threads in 
os_thread_exit(). + A created thread should always use that to exit and not + use return() to exit. */ + os_thread_exit(NULL); + + OS_THREAD_DUMMY_RETURN; +} #endif /* !UNIV_HOTBACKUP */ /************************************************************ @@ -310,9 +385,7 @@ recv_sys_init( flush_list during recovery process. As this initialization is done while holding the buffer pool mutex we perform it before acquiring recv_sys->mutex. */ -#ifndef UNIV_HOTBACKUP buf_flush_init_flush_rbt(); -#endif /* !UNIV_HOTBACKUP */ mutex_enter(&(recv_sys->mutex)); @@ -406,6 +479,7 @@ recv_sys_debug_free(void) } # endif /* UNIV_LOG_DEBUG */ +# ifdef UNIV_LOG_ARCHIVE /********************************************************//** Truncates possible corrupted or extra records from a log group. */ static @@ -427,7 +501,6 @@ recv_truncate_group( lsn_t finish_lsn1; lsn_t finish_lsn2; lsn_t finish_lsn; - ulint i; if (archived_lsn == LSN_MAX) { /* Checkpoint was taken in the NOARCHIVELOG mode */ @@ -455,11 +528,7 @@ recv_truncate_group( ut_a(RECV_SCAN_SIZE <= log_sys->buf_size); - /* Write the log buffer full of zeros */ - for (i = 0; i < RECV_SCAN_SIZE; i++) { - - *(log_sys->buf + i) = '\0'; - } + memset(log_sys->buf, 0, RECV_SCAN_SIZE); start_lsn = ut_uint64_align_down(recovered_lsn, OS_FILE_LOG_BLOCK_SIZE); @@ -499,11 +568,7 @@ recv_truncate_group( return; } - /* Write the log buffer full of zeros */ - for (i = 0; i < RECV_SCAN_SIZE; i++) { - - *(log_sys->buf + i) = '\0'; - } + memset(log_sys->buf, 0, RECV_SCAN_SIZE); start_lsn = end_lsn; } @@ -560,6 +625,7 @@ recv_copy_group( start_lsn = end_lsn; } } +# endif /* UNIV_LOG_ARCHIVE */ /********************************************************//** Copies a log segment from the most up-to-date log group to the other log @@ -570,10 +636,12 @@ static void recv_synchronize_groups( /*====================*/ - log_group_t* up_to_date_group) /*!< in: the most up-to-date +#ifdef UNIV_LOG_ARCHIVE + log_group_t* up_to_date_group /*!< in: the most 
up-to-date log group */ +#endif + ) { - log_group_t* group; lsn_t start_lsn; lsn_t end_lsn; lsn_t recovered_lsn; @@ -590,11 +658,17 @@ recv_synchronize_groups( ut_a(start_lsn != end_lsn); log_group_read_log_seg(LOG_RECOVER, recv_sys->last_block, - up_to_date_group, start_lsn, end_lsn); - - group = UT_LIST_GET_FIRST(log_sys->log_groups); +#ifdef UNIV_LOG_ARCHIVE + up_to_date_group, +#else /* UNIV_LOG_ARCHIVE */ + UT_LIST_GET_FIRST(log_sys->log_groups), +#endif /* UNIV_LOG_ARCHIVE */ + start_lsn, end_lsn); - while (group) { + for (log_group_t* group = UT_LIST_GET_FIRST(log_sys->log_groups); + group; + group = UT_LIST_GET_NEXT(log_groups, group)) { +#ifdef UNIV_LOG_ARCHIVE if (group != up_to_date_group) { /* Copy log data if needed */ @@ -602,13 +676,11 @@ recv_synchronize_groups( recv_copy_group(group, up_to_date_group, recovered_lsn); } - +#endif /* UNIV_LOG_ARCHIVE */ /* Update the fields in the group struct to correspond to recovered_lsn */ log_group_set_fields(group, recovered_lsn); - - group = UT_LIST_GET_NEXT(log_groups, group); } /* Copy the checkpoint info to the groups; remember that we have @@ -661,8 +733,8 @@ recv_check_cp_is_consistent( /********************************************************//** Looks for the maximum consistent checkpoint from the log groups. 
@return error code or DB_SUCCESS */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t recv_find_max_checkpoint( /*=====================*/ log_group_t** max_group, /*!< out: max group */ @@ -958,8 +1030,11 @@ recv_parse_or_apply_log_rec_body( not NULL, then the log record is applied to the page, and the log record should be complete then */ - mtr_t* mtr) /*!< in: mtr or NULL; should be non-NULL + mtr_t* mtr, /*!< in: mtr or NULL; should be non-NULL if and only if block is non-NULL */ + ulint space_id) + /*!< in: tablespace id obtained by + parsing initial log record */ { dict_index_t* index = NULL; page_t* page; @@ -1151,18 +1226,22 @@ recv_parse_or_apply_log_rec_body( ptr, end_ptr, block, index, mtr); } break; - case MLOG_PAGE_REORGANIZE: case MLOG_COMP_PAGE_REORGANIZE: + case MLOG_PAGE_REORGANIZE: + case MLOG_COMP_PAGE_REORGANIZE: + case MLOG_ZIP_PAGE_REORGANIZE: ut_ad(!page || page_type == FIL_PAGE_INDEX); if (NULL != (ptr = mlog_parse_index( ptr, end_ptr, - type == MLOG_COMP_PAGE_REORGANIZE, + type != MLOG_PAGE_REORGANIZE, &index))) { ut_a(!page || (ibool)!!page_is_comp(page) == dict_table_is_comp(index->table)); - ptr = btr_parse_page_reorganize(ptr, end_ptr, index, - block, mtr); + ptr = btr_parse_page_reorganize( + ptr, end_ptr, index, + type == MLOG_ZIP_PAGE_REORGANIZE, + block, mtr); } break; case MLOG_PAGE_CREATE: case MLOG_COMP_PAGE_CREATE: @@ -1231,8 +1310,11 @@ recv_parse_or_apply_log_rec_body( ut_ad(!page || page_type != FIL_PAGE_TYPE_ALLOCATED); ptr = mlog_parse_string(ptr, end_ptr, page, page_zip); break; - case MLOG_FILE_CREATE: case MLOG_FILE_RENAME: + ptr = fil_op_log_parse_or_replay(ptr, end_ptr, type, + space_id, 0); + break; + case MLOG_FILE_CREATE: case MLOG_FILE_DELETE: case MLOG_FILE_CREATE2: ptr = fil_op_log_parse_or_replay(ptr, end_ptr, type, 0, 0); @@ -1257,6 +1339,16 @@ recv_parse_or_apply_log_rec_body( ptr = page_zip_parse_compress(ptr, end_ptr, page, page_zip); break; + case MLOG_ZIP_PAGE_COMPRESS_NO_DATA: + 
if (NULL != (ptr = mlog_parse_index( + ptr, end_ptr, TRUE, &index))) { + + ut_a(!page || ((ibool)!!page_is_comp(page) + == dict_table_is_comp(index->table))); + ptr = page_zip_parse_compress_no_data( + ptr, end_ptr, page, page_zip, index); + } + break; default: ptr = NULL; recv_sys->found_corrupt_log = TRUE; @@ -1611,7 +1703,8 @@ recv_recover_page_func( recv_parse_or_apply_log_rec_body(recv->type, buf, buf + recv->len, - block, &mtr); + block, &mtr, + recv_addr->space); end_lsn = recv->start_lsn + recv->len; mach_write_to_8(FIL_PAGE_LSN + page, end_lsn); @@ -1740,7 +1833,6 @@ recv_apply_hashed_log_recs( { recv_addr_t* recv_addr; ulint i; - ulint n_pages; ibool has_printed = FALSE; mtr_t mtr; loop: @@ -1778,11 +1870,11 @@ loop: if (recv_addr->state == RECV_NOT_PROCESSED) { if (!has_printed) { - ut_print_timestamp(stderr); - fputs(" InnoDB: Starting an" - " apply batch of log records" - " to the database...\n" - "InnoDB: Progress in percents: ", + ib_logf(IB_LOG_LEVEL_INFO, + "Starting an apply batch" + " of log records" + " to the database..."); + fputs("InnoDB: Progress in percent: ", stderr); has_printed = TRUE; } @@ -1839,6 +1931,8 @@ loop: } if (!allow_ibuf) { + bool success; + /* Flush all the file pages to disk and invalidate them in the buffer pool */ @@ -1846,13 +1940,24 @@ loop: mutex_exit(&(recv_sys->mutex)); mutex_exit(&(log_sys->mutex)); - n_pages = buf_flush_list(ULINT_MAX, LSN_MAX); - ut_a(n_pages != ULINT_UNDEFINED); + /* Stop the recv_writer thread from issuing any LRU + flush batches. */ + mutex_enter(&recv_sys->writer_mutex); + + /* Wait for any currently run batch to end. */ + buf_flush_wait_LRU_batch_end(); + + success = buf_flush_list(ULINT_MAX, LSN_MAX, NULL); + + ut_a(success); buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); buf_pool_invalidate(); + /* Allow batches from recv_writer thread. 
*/ + mutex_exit(&recv_sys->writer_mutex); + mutex_enter(&(log_sys->mutex)); mutex_enter(&(recv_sys->mutex)); ut_d(recv_no_log_write = FALSE); @@ -1892,9 +1997,10 @@ recv_apply_log_recs_for_backup(void) block = back_block1; - fputs("InnoDB: Starting an apply batch of log records" - " to the database...\n" - "InnoDB: Progress in percents: ", stderr); + ib_logf(IB_LOG_LEVEL_INFO, + "Starting an apply batch of log records to the database..."); + + fputs("InnoDB: Progress in percent: ", stderr); n_hash_cells = hash_get_n_cells(recv_sys->addr_hash); @@ -2079,7 +2185,7 @@ recv_parse_log_rec( #endif /* UNIV_LOG_LSN_DEBUG */ new_ptr = recv_parse_or_apply_log_rec_body(*type, new_ptr, end_ptr, - NULL, NULL); + NULL, NULL, *space); if (UNIV_UNLIKELY(new_ptr == NULL)) { return(0); @@ -2686,11 +2792,21 @@ recv_scan_log_recs( if (recv_log_scan_is_startup_type && !recv_needed_recovery) { - fprintf(stderr, - "InnoDB: Log scan progressed" - " past the checkpoint lsn " LSN_PF "\n", - recv_sys->scanned_lsn); - recv_init_crash_recovery(); + if (!srv_read_only_mode) { + ib_logf(IB_LOG_LEVEL_INFO, + "Log scan progressed past the " + "checkpoint lsn " LSN_PF "", + recv_sys->scanned_lsn); + + recv_init_crash_recovery(); + } else { + + ib_logf(IB_LOG_LEVEL_WARN, + "Recovery skipped, " + "--innodb-read-only set!"); + + return(TRUE); + } } #endif /* !UNIV_HOTBACKUP */ @@ -2838,20 +2954,15 @@ void recv_init_crash_recovery(void) /*==========================*/ { + ut_ad(!srv_read_only_mode); ut_a(!recv_needed_recovery); recv_needed_recovery = TRUE; - ut_print_timestamp(stderr); - - fprintf(stderr, - " InnoDB: Database was not" - " shut down normally!\n" - "InnoDB: Starting crash recovery.\n"); - - fprintf(stderr, - "InnoDB: Reading tablespace information" - " from the .ibd files...\n"); + ib_logf(IB_LOG_LEVEL_INFO, "Database was not shutdown normally!"); + ib_logf(IB_LOG_LEVEL_INFO, "Starting crash recovery."); + ib_logf(IB_LOG_LEVEL_INFO, + "Reading tablespace information from the .ibd 
files..."); fil_load_single_table_tablespaces(); @@ -2862,11 +2973,12 @@ recv_init_crash_recovery(void) if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) { - fprintf(stderr, - "InnoDB: Restoring possible" - " half-written data pages from" - " the doublewrite\n" - "InnoDB: buffer...\n"); + ib_logf(IB_LOG_LEVEL_INFO, + "Restoring possible half-written data pages "); + + ib_logf(IB_LOG_LEVEL_INFO, + "from the doublewrite buffer..."); + buf_dblwr_init_or_restore_pages(TRUE); } } @@ -2878,7 +2990,7 @@ recv_recovery_from_checkpoint_finish should be called later to complete the recovery and free the resources used in it. @return error code or DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t recv_recovery_from_checkpoint_start_func( /*=====================================*/ #ifdef UNIV_LOG_ARCHIVE @@ -2890,19 +3002,18 @@ recv_recovery_from_checkpoint_start_func( { log_group_t* group; log_group_t* max_cp_group; - log_group_t* up_to_date_group; ulint max_cp_field; lsn_t checkpoint_lsn; ib_uint64_t checkpoint_no; - lsn_t old_scanned_lsn; lsn_t group_scanned_lsn = 0; lsn_t contiguous_lsn; #ifdef UNIV_LOG_ARCHIVE + log_group_t* up_to_date_group; lsn_t archived_lsn; #endif /* UNIV_LOG_ARCHIVE */ byte* buf; byte log_hdr_buf[LOG_FILE_HDR_SIZE]; - ulint err; + dberr_t err; #ifdef UNIV_LOG_ARCHIVE ut_ad(type != LOG_CHECKPOINT || limit_lsn == LSN_MAX); @@ -2923,10 +3034,10 @@ recv_recovery_from_checkpoint_start_func( } if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) { - fprintf(stderr, - "InnoDB: The user has set SRV_FORCE_NO_LOG_REDO on\n"); - fprintf(stderr, - "InnoDB: Skipping log redo\n"); + + ib_logf(IB_LOG_LEVEL_INFO, + "The user has set SRV_FORCE_NO_LOG_REDO on, " + "skipping log redo"); return(DB_SUCCESS); } @@ -2967,17 +3078,24 @@ recv_recovery_from_checkpoint_start_func( if (0 == ut_memcmp(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, (byte*)"ibbackup", (sizeof "ibbackup") - 1)) { + + if (srv_read_only_mode) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot restore from ibbackup, 
InnoDB running " + "in read-only mode!"); + + return(DB_ERROR); + } + /* This log file was created by ibbackup --restore: print a note to the user about it */ - fprintf(stderr, - "InnoDB: The log file was created by" - " ibbackup --apply-log at\n" - "InnoDB: %s\n", + ib_logf(IB_LOG_LEVEL_INFO, + "The log file was created by ibbackup --apply-log " + "at %s. The following crash recovery is part of a " + "normal restore.", log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP); - fprintf(stderr, - "InnoDB: NOTE: the following crash recovery" - " is part of a normal restore.\n"); /* Wipe over the label now */ @@ -3017,9 +3135,9 @@ recv_recovery_from_checkpoint_start_func( contiguous_lsn = ut_uint64_align_down(recv_sys->scanned_lsn, OS_FILE_LOG_BLOCK_SIZE); +#ifdef UNIV_LOG_ARCHIVE if (TYPE_CHECKPOINT) { up_to_date_group = max_cp_group; -#ifdef UNIV_LOG_ARCHIVE } else { ulint capacity; @@ -3055,8 +3173,8 @@ recv_recovery_from_checkpoint_start_func( group->scanned_lsn = group_scanned_lsn; up_to_date_group = group; -#endif /* UNIV_LOG_ARCHIVE */ } +#endif /* UNIV_LOG_ARCHIVE */ ut_ad(RECV_SCAN_SIZE <= log_sys->buf_size); @@ -3071,19 +3189,21 @@ recv_recovery_from_checkpoint_start_func( /* Set the flag to publish that we are doing startup scan. 
*/ recv_log_scan_is_startup_type = TYPE_CHECKPOINT; while (group) { - old_scanned_lsn = recv_sys->scanned_lsn; +#ifdef UNIV_LOG_ARCHIVE + lsn_t old_scanned_lsn = recv_sys->scanned_lsn; +#endif /* UNIV_LOG_ARCHIVE */ recv_group_scan_log_recs(group, &contiguous_lsn, &group_scanned_lsn); group->scanned_lsn = group_scanned_lsn; +#ifdef UNIV_LOG_ARCHIVE if (old_scanned_lsn < group_scanned_lsn) { /* We found a more up-to-date group */ up_to_date_group = group; } -#ifdef UNIV_LOG_ARCHIVE if ((type == LOG_ARCHIVE) && (group == recv_sys->archive_group)) { group = UT_LIST_GET_NEXT(log_groups, group); @@ -3104,70 +3224,73 @@ recv_recovery_from_checkpoint_start_func( || checkpoint_lsn != min_flushed_lsn) { if (checkpoint_lsn < max_flushed_lsn) { - fprintf(stderr, - "InnoDB: #########################" - "#################################\n" - "InnoDB: " - "WARNING!\n" - "InnoDB: The log sequence number" - " in ibdata files is higher\n" - "InnoDB: than the log sequence number" - " in the ib_logfiles! Are you sure\n" - "InnoDB: you are using the right" - " ib_logfiles to start up" - " the database?\n" - "InnoDB: Log sequence number in" - " ib_logfiles is " LSN_PF ", log\n" - "InnoDB: sequence numbers stamped" - " to ibdata file headers are between\n" - "InnoDB: " LSN_PF " and " LSN_PF ".\n" - "InnoDB: #########################" - "#################################\n", + + ib_logf(IB_LOG_LEVEL_WARN, + "The log sequence number " + "in the ibdata files is higher " + "than the log sequence number " + "in the ib_logfiles! Are you sure " + "you are using the right " + "ib_logfiles to start up the database. 
" + "Log sequence number in the " + "ib_logfiles is " LSN_PF ", log" + "sequence numbers stamped " + "to ibdata file headers are between " + "" LSN_PF " and " LSN_PF ".", checkpoint_lsn, min_flushed_lsn, max_flushed_lsn); } if (!recv_needed_recovery) { - fprintf(stderr, - "InnoDB: The log sequence number" - " in ibdata files does not match\n" - "InnoDB: the log sequence number" - " in the ib_logfiles!\n"); - recv_init_crash_recovery(); + ib_logf(IB_LOG_LEVEL_INFO, + "The log sequence numbers " + LSN_PF " and " LSN_PF + " in ibdata files do not match" + " the log sequence number " + LSN_PF + " in the ib_logfiles!", + min_flushed_lsn, + max_flushed_lsn, + checkpoint_lsn); + + if (!srv_read_only_mode) { + recv_init_crash_recovery(); + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Can't initiate database " + "recovery, running " + "in read-only-mode."); + return(DB_READ_ONLY); + } } } - if (!recv_needed_recovery) { - /* Init the doublewrite buffer memory structure */ - buf_dblwr_init_or_restore_pages(FALSE); + if (!srv_read_only_mode) { + if (recv_needed_recovery) { + /* Spawn the background thread to + flush dirty pages from the buffer + pools. */ + recv_writer_thread_handle = + os_thread_create( + recv_writer_thread, 0, 0); + } else { + /* Init the doublewrite buffer memory + structure */ + buf_dblwr_init_or_restore_pages(FALSE); + } } } /* We currently have only one log group */ - if (group_scanned_lsn < checkpoint_lsn) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: ERROR: We were only able to scan the log" - " up to\n" - "InnoDB: " LSN_PF ", but a checkpoint was at " - LSN_PF ".\n" - "InnoDB: It is possible that" - " the database is now corrupt!\n", - group_scanned_lsn, - checkpoint_lsn); - } - - if (group_scanned_lsn < recv_max_page_lsn) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: ERROR: We were only able to scan the log" - " up to " LSN_PF "\n" - "InnoDB: but a database page a had an lsn " LSN_PF "." 
- " It is possible that the\n" - "InnoDB: database is now corrupt!\n", - group_scanned_lsn, - recv_max_page_lsn); + if (group_scanned_lsn < checkpoint_lsn + || group_scanned_lsn < recv_max_page_lsn) { + ib_logf(IB_LOG_LEVEL_ERROR, + "We scanned the log up to " + LSN_PF ". A checkpoint was at " LSN_PF + " and the maximum LSN on a database page was " LSN_PF + ". It is possible that the database is now corrupt!", + group_scanned_lsn, checkpoint_lsn, recv_max_page_lsn); } if (recv_sys->recovered_lsn < checkpoint_lsn) { @@ -3179,7 +3302,10 @@ recv_recovery_from_checkpoint_start_func( return(DB_SUCCESS); } - ut_error; + /* No harm in trying to do RO access. */ + if (!srv_read_only_mode) { + ut_error; + } return(DB_ERROR); } @@ -3192,9 +3318,11 @@ recv_recovery_from_checkpoint_start_func( #ifdef UNIV_LOG_ARCHIVE log_sys->archived_lsn = archived_lsn; -#endif /* UNIV_LOG_ARCHIVE */ recv_synchronize_groups(up_to_date_group); +#else /* UNIV_LOG_ARCHIVE */ + recv_synchronize_groups(); +#endif /* UNIV_LOG_ARCHIVE */ if (!recv_needed_recovery) { ut_a(checkpoint_lsn == recv_sys->recovered_lsn); @@ -3225,13 +3353,13 @@ recv_recovery_from_checkpoint_start_func( } #endif /* UNIV_LOG_ARCHIVE */ - mutex_enter(&(recv_sys->mutex)); + mutex_enter(&recv_sys->mutex); recv_sys->apply_log_recs = TRUE; - mutex_exit(&(recv_sys->mutex)); + mutex_exit(&recv_sys->mutex); - mutex_exit(&(log_sys->mutex)); + mutex_exit(&log_sys->mutex); recv_lsn_checks_on = TRUE; @@ -3287,10 +3415,40 @@ recv_recovery_from_checkpoint_finish(void) "InnoDB: a backup!\n"); } - /* Free the resources of the recovery system */ + /* Make sure that the recv_writer thread is done. This is + required because it grabs various mutexes and we want to + ensure that when we enable sync_order_checks there is no + mutex currently held by any thread. 
*/ + mutex_enter(&recv_sys->writer_mutex); + /* Free the resources of the recovery system */ recv_recovery_on = FALSE; + /* By acquring the mutex we ensure that the recv_writer thread + won't trigger any more LRU batchtes. Now wait for currently + in progress batches to finish. */ + buf_flush_wait_LRU_batch_end(); + + mutex_exit(&recv_sys->writer_mutex); + + ulint count = 0; + while (recv_writer_thread_active) { + ++count; + os_thread_sleep(100000); + if (srv_print_verbose_log && count > 600) { + ib_logf(IB_LOG_LEVEL_INFO, + "Waiting for recv_writer to " + "finish flushing of buffer pool"); + count = 0; + } + } + +#ifdef __WIN__ + if (recv_writer_thread_handle) { + CloseHandle(recv_writer_thread_handle); + } +#endif /* __WIN__ */ + #ifndef UNIV_LOG_DEBUG recv_sys_debug_free(); #endif @@ -3310,20 +3468,22 @@ void recv_recovery_rollback_active(void) /*===============================*/ { - int i; - #ifdef UNIV_SYNC_DEBUG /* Wait for a while so that created threads have time to suspend themselves before we switch the latching order checks on */ os_thread_sleep(1000000); + ut_ad(!recv_writer_thread_active); + /* Switch latching order checks on in sync0sync.cc */ sync_order_checks_on = TRUE; #endif /* We can't start any (DDL) transactions if UNDO logging has been disabled, additionally disable ROLLBACK of recovered user transactions. */ - if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) { + if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO + && !srv_read_only_mode) { + /* Drop partially created indexes. */ row_merge_drop_temp_indexes(); /* Drop temporary tables. 
*/ @@ -3338,7 +3498,7 @@ recv_recovery_rollback_active(void) /* Rollback the uncommitted transactions which have no user session */ - os_thread_create(trx_rollback_or_clean_all_recovered, &i, NULL); + os_thread_create(trx_rollback_or_clean_all_recovered, 0, 0); } } @@ -3348,18 +3508,18 @@ UNIV_INTERN void recv_reset_logs( /*============*/ - lsn_t lsn, /*!< in: reset to this lsn - rounded up to be divisible by - OS_FILE_LOG_BLOCK_SIZE, after - which we add - LOG_BLOCK_HDR_SIZE */ #ifdef UNIV_LOG_ARCHIVE ulint arch_log_no, /*!< in: next archived log file number */ -#endif /* UNIV_LOG_ARCHIVE */ - ibool new_logs_created)/*!< in: TRUE if resetting logs + ibool new_logs_created,/*!< in: TRUE if resetting logs is done at the log creation; FALSE if it is done after archive recovery */ +#endif /* UNIV_LOG_ARCHIVE */ + lsn_t lsn) /*!< in: reset to this lsn + rounded up to be divisible by + OS_FILE_LOG_BLOCK_SIZE, after + which we add + LOG_BLOCK_HDR_SIZE */ { log_group_t* group; @@ -3375,12 +3535,12 @@ recv_reset_logs( #ifdef UNIV_LOG_ARCHIVE group->archived_file_no = arch_log_no; group->archived_offset = 0; -#endif /* UNIV_LOG_ARCHIVE */ if (!new_logs_created) { recv_truncate_group(group, group->lsn, group->lsn, group->lsn, group->lsn); } +#endif /* UNIV_LOG_ARCHIVE */ group = UT_LIST_GET_NEXT(log_groups, group); } @@ -3805,7 +3965,7 @@ recv_recovery_from_archive_start( recv_apply_hashed_log_recs(FALSE); - recv_reset_logs(recv_sys->recovered_lsn, 0, FALSE); + recv_reset_logs(0, FALSE, recv_sys->recovered_lsn); } mutex_exit(&(log_sys->mutex)); diff --git a/storage/innobase/mem/mem0dbg.cc b/storage/innobase/mem/mem0dbg.cc index 83e14ad6071..308c2979551 100644 --- a/storage/innobase/mem/mem0dbg.cc +++ b/storage/innobase/mem/mem0dbg.cc @@ -30,7 +30,7 @@ Created 6/9/1994 Heikki Tuuri /* The mutex which protects in the debug version the hash table containing the list of live memory heaps, and also the global variables below. 
*/ -UNIV_INTERN mutex_t mem_hash_mutex; +UNIV_INTERN ib_mutex_t mem_hash_mutex; #ifdef UNIV_PFS_MUTEX /* Key to register mem_hash_mutex with performance schema */ @@ -58,8 +58,7 @@ static ibool mem_hash_initialized = FALSE; /* The node of the list containing currently allocated memory heaps */ -typedef struct mem_hash_node_struct mem_hash_node_t; -struct mem_hash_node_struct { +struct mem_hash_node_t { UT_LIST_NODE_T(mem_hash_node_t) list; /*!< hash list node */ mem_heap_t* heap; /*!< memory heap */ diff --git a/storage/innobase/mem/mem0pool.cc b/storage/innobase/mem/mem0pool.cc index 2135926a26f..fe9a84d21fa 100644 --- a/storage/innobase/mem/mem0pool.cc +++ b/storage/innobase/mem/mem0pool.cc @@ -100,12 +100,12 @@ pool, and after that its locks will grow into the buffer pool. */ /** Data structure for a memory pool. The space is allocated using the buddy algorithm, where free list i contains areas of size 2 to power i. */ -struct mem_pool_struct{ +struct mem_pool_t{ byte* buf; /*!< memory pool */ ulint size; /*!< memory common pool size */ ulint reserved; /*!< amount of currently allocated memory */ - mutex_t mutex; /*!< mutex protecting this struct */ + ib_mutex_t mutex; /*!< mutex protecting this struct */ UT_LIST_BASE_NODE_T(mem_area_t) free_list[64]; /*!< lists of free memory areas: an area is put to the list whose number @@ -116,7 +116,7 @@ struct mem_pool_struct{ UNIV_INTERN mem_pool_t* mem_comm_pool = NULL; #ifdef UNIV_PFS_MUTEX -/* Key to register mutex in mem_pool_struct with performance schema */ +/* Key to register mutex in mem_pool_t with performance schema */ UNIV_INTERN mysql_pfs_key_t mem_pool_mutex_key; #endif /* UNIV_PFS_MUTEX */ diff --git a/storage/innobase/mtr/mtr0log.cc b/storage/innobase/mtr/mtr0log.cc index d549de8802e..5335cb4c9ef 100644 --- a/storage/innobase/mtr/mtr0log.cc +++ b/storage/innobase/mtr/mtr0log.cc @@ -240,8 +240,8 @@ mlog_parse_nbytes( } /********************************************************//** -Writes 1 - 4 bytes to a 
file page buffered in the buffer pool. -Writes the corresponding log record to the mini-transaction log. */ +Writes 1, 2 or 4 bytes to a file page. Writes the corresponding log +record to the mini-transaction log if mtr is not NULL. */ UNIV_INTERN void mlog_write_ulint( @@ -251,8 +251,6 @@ mlog_write_ulint( byte type, /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */ mtr_t* mtr) /*!< in: mini-transaction handle */ { - byte* log_ptr; - switch (type) { case MLOG_1BYTE: mach_write_to_1(ptr, val); @@ -267,27 +265,29 @@ mlog_write_ulint( ut_error; } - log_ptr = mlog_open(mtr, 11 + 2 + 5); + if (mtr != 0) { + byte* log_ptr = mlog_open(mtr, 11 + 2 + 5); - /* If no logging is requested, we may return now */ - if (log_ptr == NULL) { + /* If no logging is requested, we may return now */ - return; - } + if (log_ptr != 0) { - log_ptr = mlog_write_initial_log_record_fast(ptr, type, log_ptr, mtr); + log_ptr = mlog_write_initial_log_record_fast( + ptr, type, log_ptr, mtr); - mach_write_to_2(log_ptr, page_offset(ptr)); - log_ptr += 2; + mach_write_to_2(log_ptr, page_offset(ptr)); + log_ptr += 2; - log_ptr += mach_write_compressed(log_ptr, val); + log_ptr += mach_write_compressed(log_ptr, val); - mlog_close(mtr, log_ptr); + mlog_close(mtr, log_ptr); + } + } } /********************************************************//** -Writes 8 bytes to a file page buffered in the buffer pool. -Writes the corresponding log record to the mini-transaction log. */ +Writes 8 bytes to a file page. 
Writes the corresponding log +record to the mini-transaction log, only if mtr is not NULL */ UNIV_INTERN void mlog_write_ull( @@ -296,29 +296,25 @@ mlog_write_ull( ib_uint64_t val, /*!< in: value to write */ mtr_t* mtr) /*!< in: mini-transaction handle */ { - byte* log_ptr; - - ut_ad(ptr && mtr); - mach_write_to_8(ptr, val); - log_ptr = mlog_open(mtr, 11 + 2 + 9); - - /* If no logging is requested, we may return now */ - if (log_ptr == NULL) { + if (mtr != 0) { + byte* log_ptr = mlog_open(mtr, 11 + 2 + 9); - return; - } + /* If no logging is requested, we may return now */ + if (log_ptr != 0) { - log_ptr = mlog_write_initial_log_record_fast(ptr, MLOG_8BYTES, - log_ptr, mtr); + log_ptr = mlog_write_initial_log_record_fast( + ptr, MLOG_8BYTES, log_ptr, mtr); - mach_write_to_2(log_ptr, page_offset(ptr)); - log_ptr += 2; + mach_write_to_2(log_ptr, page_offset(ptr)); + log_ptr += 2; - log_ptr += mach_ull_write_compressed(log_ptr, val); + log_ptr += mach_ull_write_compressed(log_ptr, val); - mlog_close(mtr, log_ptr); + mlog_close(mtr, log_ptr); + } + } } #ifndef UNIV_HOTBACKUP @@ -439,12 +435,13 @@ UNIV_INTERN byte* mlog_open_and_write_index( /*======================*/ - mtr_t* mtr, /*!< in: mtr */ - const byte* rec, /*!< in: index record or page */ - dict_index_t* index, /*!< in: record descriptor */ - byte type, /*!< in: log item type */ - ulint size) /*!< in: requested buffer size in bytes - (if 0, calls mlog_close() and returns NULL) */ + mtr_t* mtr, /*!< in: mtr */ + const byte* rec, /*!< in: index record or page */ + const dict_index_t* index, /*!< in: record descriptor */ + byte type, /*!< in: log item type */ + ulint size) /*!< in: requested buffer size in bytes + (if 0, calls mlog_close() and + returns NULL) */ { byte* log_ptr; const byte* log_start; diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index 4832e8c7710..10b4686b720 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -142,9 +142,9 @@ 
mtr_memo_slot_note_modification( mtr_t* mtr, /*!< in: mtr */ mtr_memo_slot_t* slot) /*!< in: memo slot */ { - ut_ad(mtr); - ut_ad(mtr->magic_n == MTR_MAGIC_N); ut_ad(mtr->modifications); + ut_ad(!srv_read_only_mode); + ut_ad(mtr->magic_n == MTR_MAGIC_N); if (slot->object != NULL && slot->type == MTR_MEMO_PAGE_X_FIX) { buf_block_t* block = (buf_block_t*) slot->object; @@ -170,7 +170,7 @@ mtr_memo_note_modifications( dyn_array_t* memo; ulint offset; - ut_ad(mtr); + ut_ad(!srv_read_only_mode); ut_ad(mtr->magic_n == MTR_MAGIC_N); ut_ad(mtr->state == MTR_COMMITTING); /* Currently only used in commit */ @@ -191,19 +191,51 @@ mtr_memo_note_modifications( } /************************************************************//** +Append the dirty pages to the flush list. */ +static +void +mtr_add_dirtied_pages_to_flush_list( +/*================================*/ + mtr_t* mtr) /*!< in/out: mtr */ +{ + ut_ad(!srv_read_only_mode); + + /* No need to acquire log_flush_order_mutex if this mtr has + not dirtied a clean page. log_flush_order_mutex is used to + ensure ordered insertions in the flush_list. We need to + insert in the flush_list iff the page in question was clean + before modifications. */ + if (mtr->made_dirty) { + log_flush_order_mutex_enter(); + } + + /* It is now safe to release the log mutex because the + flush_order mutex will ensure that we are the first one + to insert into the flush list. */ + log_release(); + + if (mtr->modifications) { + mtr_memo_note_modifications(mtr); + } + + if (mtr->made_dirty) { + log_flush_order_mutex_exit(); + } +} + +/************************************************************//** Writes the contents of a mini-transaction log, if any, to the database log. 
*/ static void mtr_log_reserve_and_write( /*======================*/ - mtr_t* mtr) /*!< in: mtr */ + mtr_t* mtr) /*!< in/out: mtr */ { dyn_array_t* mlog; - dyn_block_t* block; ulint data_size; byte* first_data; - ut_ad(mtr); + ut_ad(!srv_read_only_mode); mlog = &(mtr->log); @@ -217,14 +249,21 @@ mtr_log_reserve_and_write( } if (mlog->heap == NULL) { + ulint len; + + len = mtr->log_mode != MTR_LOG_NO_REDO + ? dyn_block_get_used(mlog) : 0; + mtr->end_lsn = log_reserve_and_write_fast( - first_data, dyn_block_get_used(mlog), - &mtr->start_lsn); + first_data, len, &mtr->start_lsn); + if (mtr->end_lsn) { /* Success. We have the log mutex. Add pages to flush list and exit */ - goto func_exit; + mtr_add_dirtied_pages_to_flush_list(mtr); + + return; } } @@ -235,43 +274,24 @@ mtr_log_reserve_and_write( if (mtr->log_mode == MTR_LOG_ALL) { - block = mlog; + for (dyn_block_t* block = mlog; + block != 0; + block = dyn_array_get_next_block(mlog, block)) { - while (block != NULL) { - log_write_low(dyn_block_get_data(block), - dyn_block_get_used(block)); - block = dyn_array_get_next_block(mlog, block); + log_write_low( + dyn_block_get_data(block), + dyn_block_get_used(block)); } + } else { - ut_ad(mtr->log_mode == MTR_LOG_NONE); + ut_ad(mtr->log_mode == MTR_LOG_NONE + || mtr->log_mode == MTR_LOG_NO_REDO); /* Do nothing */ } mtr->end_lsn = log_close(); -func_exit: - - /* No need to acquire log_flush_order_mutex if this mtr has - not dirtied a clean page. log_flush_order_mutex is used to - ensure ordered insertions in the flush_list. We need to - insert in the flush_list iff the page in question was clean - before modifications. */ - if (mtr->made_dirty) { - log_flush_order_mutex_enter(); - } - - /* It is now safe to release the log mutex because the - flush_order mutex will ensure that we are the first one - to insert into the flush list. 
*/ - log_release(); - - if (mtr->modifications) { - mtr_memo_note_modifications(mtr); - } - - if (mtr->made_dirty) { - log_flush_order_mutex_exit(); - } + mtr_add_dirtied_pages_to_flush_list(mtr); } #endif /* !UNIV_HOTBACKUP */ @@ -294,6 +314,7 @@ mtr_commit( ut_ad(!recv_no_log_write); if (mtr->modifications && mtr->n_log_recs) { + ut_ad(!srv_read_only_mode); mtr_log_reserve_and_write(mtr); } @@ -376,14 +397,8 @@ mtr_read_ulint( ut_ad(mtr->state == MTR_ACTIVE); ut_ad(mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_S_FIX) || mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_X_FIX)); - if (type == MLOG_1BYTE) { - return(mach_read_from_1(ptr)); - } else if (type == MLOG_2BYTES) { - return(mach_read_from_2(ptr)); - } else { - ut_ad(type == MLOG_4BYTES); - return(mach_read_from_4(ptr)); - } + + return(mach_read_ulint(ptr, type)); } #ifdef UNIV_DEBUG diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 62cde1cf728..5f0dc0d3667 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -1,6 +1,6 @@ /*********************************************************************** -Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Percona Inc. 
Portions of this file contain modifications contributed and copyrighted @@ -60,24 +60,29 @@ Created 10/21/1995 Heikki Tuuri #include <libaio.h> #endif +/** Insert buffer segment id */ +static const ulint IO_IBUF_SEGMENT = 0; + +/** Log segment id */ +static const ulint IO_LOG_SEGMENT = 1; + /* This specifies the file permissions InnoDB uses when it creates files in Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to my_umask */ #ifndef __WIN__ /** Umask for creating files */ -UNIV_INTERN ulint os_innodb_umask - = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; +UNIV_INTERN ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; #else /** Umask for creating files */ -UNIV_INTERN ulint os_innodb_umask = 0; -#endif +UNIV_INTERN ulint os_innodb_umask = 0; +#endif /* __WIN__ */ #ifndef UNIV_HOTBACKUP /* We use these mutexes to protect lseek + file i/o operation, if the OS does not provide an atomic pread or pwrite, or similar */ #define OS_FILE_N_SEEK_MUTEXES 16 -UNIV_INTERN os_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES]; +UNIV_INTERN os_ib_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES]; /* In simulated aio, merge at most this many consecutive i/os */ #define OS_AIO_MERGE_N_CONSECUTIVE 64 @@ -147,10 +152,7 @@ UNIV_INTERN mysql_pfs_key_t innodb_file_temp_key; #endif /* UNIV_PFS_IO */ /** The asynchronous i/o array slot structure */ -typedef struct os_aio_slot_struct os_aio_slot_t; - -/** The asynchronous i/o array slot structure */ -struct os_aio_slot_struct{ +struct os_aio_slot_t{ ibool is_read; /*!< TRUE if a read operation */ ulint pos; /*!< index of the slot in the aio array */ @@ -182,15 +184,12 @@ struct os_aio_slot_struct{ struct iocb control; /* Linux control block for aio */ int n_bytes; /* bytes written/read. 
*/ int ret; /* AIO return code */ -#endif +#endif /* WIN_ASYNC_IO */ }; /** The asynchronous i/o array structure */ -typedef struct os_aio_array_struct os_aio_array_t; - -/** The asynchronous i/o array structure */ -struct os_aio_array_struct{ - os_mutex_t mutex; /*!< the mutex protecting the aio array */ +struct os_aio_array_t{ + os_ib_mutex_t mutex; /*!< the mutex protecting the aio array */ os_event_t not_full; /*!< The event which is set to the signaled state when there is space in @@ -223,7 +222,7 @@ struct os_aio_array_struct{ order. This can be used in WaitForMultipleObjects; used only in Windows */ -#endif +#endif /* __WIN__ */ #if defined(LINUX_NATIVE_AIO) io_context_t* aio_ctx; @@ -235,7 +234,7 @@ struct os_aio_array_struct{ There is one such event for each possible pending IO. The size of the array is equal to n_slots. */ -#endif +#endif /* LINUX_NATIV_AIO */ }; #if defined(LINUX_NATIVE_AIO) @@ -283,7 +282,7 @@ UNIV_INTERN ibool os_has_said_disk_full = FALSE; #if !defined(UNIV_HOTBACKUP) \ && (!defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8) /** The mutex protecting the following counts of pending I/O operations */ -static os_mutex_t os_file_count_mutex; +static os_ib_mutex_t os_file_count_mutex; #endif /* !UNIV_HOTBACKUP && (!HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8) */ /** Number of pending os_file_pread() operations */ @@ -336,7 +335,7 @@ ulint os_get_os_version(void) /*===================*/ { - OSVERSIONINFO os_info; + OSVERSIONINFO os_info; os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO); @@ -350,15 +349,15 @@ os_get_os_version(void) switch (os_info.dwMajorVersion) { case 3: case 4: - return OS_WINNT; + return(OS_WINNT); case 5: - return (os_info.dwMinorVersion == 0) ? OS_WIN2000 - : OS_WINXP; + return (os_info.dwMinorVersion == 0) + ? OS_WIN2000 : OS_WINXP; case 6: - return (os_info.dwMinorVersion == 0) ? OS_WINVISTA - : OS_WIN7; + return (os_info.dwMinorVersion == 0) + ? 
OS_WINVISTA : OS_WIN7; default: - return OS_WIN7; + return(OS_WIN7); } } else { ut_error; @@ -377,16 +376,17 @@ static ulint os_file_get_last_error_low( /*=======================*/ - ibool report_all_errors, /*!< in: TRUE if we want an error + bool report_all_errors, /*!< in: TRUE if we want an error message printed of all errors */ - ibool on_error_silent) /*!< in: TRUE then don't print any + bool on_error_silent) /*!< in: TRUE then don't print any diagnostic to the log */ { - ulint err; - #ifdef __WIN__ - err = (ulint) GetLastError(); + ulint err = (ulint) GetLastError(); + if (err == ERROR_SUCCESS) { + return(0); + } if (report_all_errors || (!on_error_silent @@ -469,15 +469,18 @@ os_file_get_last_error_low( return(100 + err); } #else - err = (ulint) errno; + int err = errno; + if (err == 0) { + return(0); + } if (report_all_errors || (err != ENOSPC && err != EEXIST && !on_error_silent)) { ut_print_timestamp(stderr); fprintf(stderr, - " InnoDB: Operating system error number %lu" - " in a file operation.\n", (ulong) err); + " InnoDB: Operating system error number %d" + " in a file operation.\n", err); if (err == ENOENT) { fprintf(stderr, @@ -497,11 +500,11 @@ os_file_get_last_error_low( " the access rights to\n" "InnoDB: the directory.\n"); } else { - if (strerror((int) err) != NULL) { + if (strerror(err) != NULL) { fprintf(stderr, - "InnoDB: Error number %lu" + "InnoDB: Error number %d" " means '%s'.\n", - err, strerror((int) err)); + err, strerror(err)); } @@ -552,10 +555,10 @@ UNIV_INTERN ulint os_file_get_last_error( /*===================*/ - ibool report_all_errors) /*!< in: TRUE if we want an error + bool report_all_errors) /*!< in: TRUE if we want an error message printed of all errors */ { - return(os_file_get_last_error_low(report_all_errors, FALSE)); + return(os_file_get_last_error_low(report_all_errors, false)); } /****************************************************************//** @@ -577,7 +580,7 @@ os_file_handle_error_cond_exit( { ulint err; - err 
= os_file_get_last_error_low(FALSE, on_error_silent); + err = os_file_get_last_error_low(false, on_error_silent); switch (err) { case OS_FILE_DISK_FULL: @@ -645,7 +648,8 @@ os_file_handle_error_cond_exit( ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: File operation call: " - "'%s'.\n", operation); + "'%s' returned OS error " ULINTPF ".\n", + operation, err); } if (should_exit) { @@ -654,7 +658,9 @@ os_file_handle_error_cond_exit( "operation.\n"); fflush(stderr); - ut_error; + + ut_ad(0); /* Report call stack, etc only in debug code. */ + exit(1); } } @@ -712,19 +718,23 @@ os_file_lock( const char* name) /*!< in: file name */ { struct flock lk; + + ut_ad(!srv_read_only_mode); + lk.l_type = F_WRLCK; lk.l_whence = SEEK_SET; lk.l_start = lk.l_len = 0; + if (fcntl(fd, F_SETLK, &lk) == -1) { - fprintf(stderr, - "InnoDB: Unable to lock %s, error: %d\n", name, errno); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unable to lock %s, error: %d", name, errno); if (errno == EAGAIN || errno == EACCES) { - fprintf(stderr, - "InnoDB: Check that you do not already have" - " another mysqld process\n" - "InnoDB: using the same InnoDB data" - " or log files.\n"); + ib_logf(IB_LOG_LEVEL_INFO, + "Check that you do not already have " + "another mysqld process using the " + "same InnoDB data or log files."); } return(-1); @@ -742,13 +752,11 @@ void os_io_init_simple(void) /*===================*/ { - ulint i; - #if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8 os_file_count_mutex = os_mutex_create(); #endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8 */ - for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) { + for (ulint i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) { os_file_seek_mutexes[i] = os_mutex_create(); } } @@ -765,6 +773,8 @@ os_file_create_tmpfile(void) FILE* file = NULL; int fd = innobase_mysql_tmpfile(); + ut_ad(!srv_read_only_mode); + if (fd >= 0) { file = fdopen(fd, "w+b"); } @@ -840,7 +850,7 @@ os_file_opendir( } return(dir); -#endif +#endif /* __WIN__ */ } 
/***********************************************************************//** @@ -874,7 +884,7 @@ os_file_closedir( } return(ret); -#endif +#endif /* __WIN__ */ } /***********************************************************************//** @@ -1054,10 +1064,12 @@ next_file: } /*****************************************************************//** -This function attempts to create a directory named pathname. The new directory -gets default permissions. On Unix the permissions are (0770 & ~umask). If the -directory exists already, nothing is done and the call succeeds, unless the -fail_if_exists arguments is true. +This function attempts to create a directory named pathname. The new +directory gets default permissions. On Unix the permissions are +(0770 & ~umask). If the directory exists already, nothing is done and +the call succeeds, unless the fail_if_exists arguments is true. +If another error occurs, such as a permission error, this does not crash, +but reports the error and returns FALSE. 
@return TRUE if call succeeds, FALSE on error */ UNIV_INTERN ibool @@ -1075,13 +1087,14 @@ os_file_create_directory( if (!(rcode != 0 || (GetLastError() == ERROR_ALREADY_EXISTS && !fail_if_exists))) { - /* failure */ - os_file_handle_error(pathname, "CreateDirectory"); + + os_file_handle_error_no_exit( + pathname, "CreateDirectory", FALSE); return(FALSE); } - return (TRUE); + return(TRUE); #else int rcode; @@ -1089,13 +1102,13 @@ os_file_create_directory( if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) { /* failure */ - os_file_handle_error(pathname, "mkdir"); + os_file_handle_error_no_exit(pathname, "mkdir", FALSE); return(FALSE); } return (TRUE); -#endif +#endif /* __WIN__ */ } /****************************************************************//** @@ -1115,129 +1128,180 @@ os_file_create_simple_func( OS_FILE_READ_WRITE */ ibool* success)/*!< out: TRUE if succeed, FALSE if error */ { -#ifdef __WIN__ os_file_t file; - DWORD create_flag; + ibool retry; + +#ifdef __WIN__ DWORD access; + DWORD create_flag; DWORD attributes = 0; - ibool retry; ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); -try_again: - ut_a(name); if (create_mode == OS_FILE_OPEN) { + + create_flag = OPEN_EXISTING; + + } else if (srv_read_only_mode) { + create_flag = OPEN_EXISTING; + } else if (create_mode == OS_FILE_CREATE) { + create_flag = CREATE_NEW; + } else if (create_mode == OS_FILE_CREATE_PATH) { - /* create subdirs along the path if needed */ + + ut_a(!srv_read_only_mode); + + /* Create subdirs along the path if needed */ *success = os_file_create_subdirs_if_needed(name); + if (!*success) { - ut_error; + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unable to create subdirectories '%s'", + name); + + return((os_file_t) -1); } + create_flag = CREATE_NEW; create_mode = OS_FILE_CREATE; + } else { - create_flag = 0; - ut_error; + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + return((os_file_t) 
-1); } if (access_type == OS_FILE_READ_ONLY) { access = GENERIC_READ; + } else if (srv_read_only_mode) { + + ib_logf(IB_LOG_LEVEL_INFO, + "read only mode set. Unable to " + "open file '%s' in RW mode, trying RO mode", name); + + access = GENERIC_READ; + } else if (access_type == OS_FILE_READ_WRITE) { access = GENERIC_READ | GENERIC_WRITE; } else { - access = 0; - ut_error; + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file access type (%lu) for file '%s'", + access_type, name); + + return((os_file_t) -1); } - file = CreateFile((LPCTSTR) name, - access, - FILE_SHARE_READ | FILE_SHARE_WRITE, - /* file can be read and written also - by other processes */ - NULL, /* default security attributes */ - create_flag, - attributes, - NULL); /*!< no template file */ + do { + /* Use default security attributes and no template file. */ - if (file == INVALID_HANDLE_VALUE) { - *success = FALSE; + file = CreateFile( + (LPCTSTR) name, access, FILE_SHARE_READ, NULL, + create_flag, attributes, NULL); + + if (file == INVALID_HANDLE_VALUE) { + + *success = FALSE; - retry = os_file_handle_error(name, - create_mode == OS_FILE_OPEN ? - "open" : "create"); - if (retry) { - goto try_again; + retry = os_file_handle_error( + name, create_mode == OS_FILE_OPEN ? 
+ "open" : "create"); + + } else { + *success = TRUE; + retry = false; } - } else { - *success = TRUE; - } - return(file); + } while (retry); + #else /* __WIN__ */ - os_file_t file; int create_flag; - ibool retry; ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); -try_again: - ut_a(name); - if (create_mode == OS_FILE_OPEN) { + if (access_type == OS_FILE_READ_ONLY) { create_flag = O_RDONLY; + } else if (srv_read_only_mode) { + create_flag = O_RDONLY; } else { create_flag = O_RDWR; } + + } else if (srv_read_only_mode) { + + create_flag = O_RDONLY; + } else if (create_mode == OS_FILE_CREATE) { + create_flag = O_RDWR | O_CREAT | O_EXCL; + } else if (create_mode == OS_FILE_CREATE_PATH) { - /* create subdirs along the path if needed */ + + /* Create subdirs along the path if needed */ + *success = os_file_create_subdirs_if_needed(name); + if (!*success) { - return (-1); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unable to create subdirectories '%s'", + name); + + return((os_file_t) -1); } + create_flag = O_RDWR | O_CREAT | O_EXCL; create_mode = OS_FILE_CREATE; } else { - create_flag = 0; - ut_error; - } - if (create_mode == OS_FILE_CREATE) { - file = open(name, create_flag, S_IRUSR | S_IWUSR - | S_IRGRP | S_IWGRP); - } else { - file = open(name, create_flag); + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + return((os_file_t) -1); } - if (file == -1) { - *success = FALSE; + do { + file = ::open(name, create_flag, os_innodb_umask); + + if (file == -1) { + *success = FALSE; - retry = os_file_handle_error(name, - create_mode == OS_FILE_OPEN ? - "open" : "create"); - if (retry) { - goto try_again; + retry = os_file_handle_error( + name, + create_mode == OS_FILE_OPEN + ? 
"open" : "create"); + } else { + *success = TRUE; + retry = false; } + + } while (retry); + #ifdef USE_FILE_LOCK - } else if (access_type == OS_FILE_READ_WRITE - && os_file_lock(file, name)) { + if (!srv_read_only_mode + && *success + && access_type == OS_FILE_READ_WRITE + && os_file_lock(file, name)) { + *success = FALSE; close(file); file = -1; -#endif - } else { - *success = TRUE; } +#endif /* USE_FILE_LOCK */ - return(file); #endif /* __WIN__ */ + + return(file); } /****************************************************************//** @@ -1259,12 +1323,13 @@ os_file_create_simple_no_error_handling_func( used by a backup program reading the file */ ibool* success)/*!< out: TRUE if succeed, FALSE if error */ { -#ifdef __WIN__ os_file_t file; - DWORD create_flag; + +#ifdef __WIN__ DWORD access; + DWORD create_flag; DWORD attributes = 0; - DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE; + DWORD share_mode = FILE_SHARE_READ; ut_a(name); @@ -1273,46 +1338,53 @@ os_file_create_simple_no_error_handling_func( if (create_mode == OS_FILE_OPEN) { create_flag = OPEN_EXISTING; + } else if (srv_read_only_mode) { + create_flag = OPEN_EXISTING; } else if (create_mode == OS_FILE_CREATE) { create_flag = CREATE_NEW; } else { - create_flag = 0; - ut_error; + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + return((os_file_t) -1); } if (access_type == OS_FILE_READ_ONLY) { access = GENERIC_READ; + } else if (srv_read_only_mode) { + access = GENERIC_READ; } else if (access_type == OS_FILE_READ_WRITE) { access = GENERIC_READ | GENERIC_WRITE; } else if (access_type == OS_FILE_READ_ALLOW_DELETE) { + + ut_a(!srv_read_only_mode); + access = GENERIC_READ; - share_mode = FILE_SHARE_DELETE | FILE_SHARE_READ - | FILE_SHARE_WRITE; /*!< A backup program has to give - mysqld the maximum freedom to - do what it likes with the - file */ + + /*!< A backup program has to give mysqld the maximum + freedom to do what it likes with the file 
*/ + + share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE; } else { - access = 0; - ut_error; + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file access type (%lu) for file '%s'", + access_type, name); + + return((os_file_t) -1); } file = CreateFile((LPCTSTR) name, access, share_mode, - NULL, /* default security attributes */ + NULL, // Security attributes create_flag, attributes, - NULL); /*!< no template file */ - - if (file == INVALID_HANDLE_VALUE) { - *success = FALSE; - } else { - *success = TRUE; - } + NULL); // No template file - return(file); + *success = (file != INVALID_HANDLE_VALUE); #else /* __WIN__ */ - os_file_t file; int create_flag; ut_a(name); @@ -1321,40 +1393,59 @@ os_file_create_simple_no_error_handling_func( ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); if (create_mode == OS_FILE_OPEN) { + if (access_type == OS_FILE_READ_ONLY) { + + create_flag = O_RDONLY; + + } else if (srv_read_only_mode) { + create_flag = O_RDONLY; + } else { + + ut_a(access_type == OS_FILE_READ_WRITE + || access_type == OS_FILE_READ_ALLOW_DELETE); + create_flag = O_RDWR; } + + } else if (srv_read_only_mode) { + + create_flag = O_RDONLY; + } else if (create_mode == OS_FILE_CREATE) { + create_flag = O_RDWR | O_CREAT | O_EXCL; - } else { - create_flag = 0; - ut_error; - } - if (create_mode == OS_FILE_CREATE) { - file = open(name, create_flag, S_IRUSR | S_IWUSR - | S_IRGRP | S_IWGRP); } else { - file = open(name, create_flag); + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + return((os_file_t) -1); } - if (file == -1) { - *success = FALSE; + file = ::open(name, create_flag, os_innodb_umask); + + *success = file == -1 ? 
FALSE : TRUE; + #ifdef USE_FILE_LOCK - } else if (access_type == OS_FILE_READ_WRITE - && os_file_lock(file, name)) { + if (!srv_read_only_mode + && *success + && access_type == OS_FILE_READ_WRITE + && os_file_lock(file, name)) { + *success = FALSE; close(file); file = -1; -#endif - } else { - *success = TRUE; + } +#endif /* USE_FILE_LOCK */ - return(file); #endif /* __WIN__ */ + + return(file); } /****************************************************************//** @@ -1364,42 +1455,41 @@ void os_file_set_nocache( /*================*/ int fd /*!< in: file descriptor to alter */ - __attribute__((unused)), - const char* file_name /*!< in: used in the diagnostic message */ - __attribute__((unused)), + __attribute__((unused)), + const char* file_name /*!< in: used in the diagnostic + message */ + __attribute__((unused)), const char* operation_name __attribute__((unused))) - /*!< in: "open" or "create"; used in the - diagnostic message */ + /*!< in: "open" or "create"; used + in the diagnostic message */ { /* some versions of Solaris may not have DIRECTIO_ON */ #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) if (directio(fd, DIRECTIO_ON) == -1) { - int errno_save; - errno_save = (int) errno; - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Failed to set DIRECTIO_ON " - "on file %s: %s: %s, continuing anyway\n", + int errno_save = errno; + + ib_logf(IB_LOG_LEVEL_ERROR, + "Failed to set DIRECTIO_ON on file %s: %s: %s, " + "continuing anyway.", file_name, operation_name, strerror(errno_save)); } #elif defined(O_DIRECT) if (fcntl(fd, F_SETFL, O_DIRECT) == -1) { - int errno_save; - errno_save = (int) errno; - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Failed to set O_DIRECT " - "on file %s: %s: %s, continuing anyway\n", + int errno_save = errno; + + ib_logf(IB_LOG_LEVEL_ERROR, + "Failed to set O_DIRECT on file %s: %s: %s, " + "continuing anyway", file_name, operation_name, strerror(errno_save)); + if (errno_save == EINVAL) { - 
ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: O_DIRECT is known to result in " - "'Invalid argument' on Linux on tmpfs, " - "see MySQL Bug#26662\n"); + ib_logf(IB_LOG_LEVEL_ERROR, + "O_DIRECT is known to result in 'Invalid " + "argument' on Linux on tmpfs, see MySQL " + "Bug#26662"); } } -#endif +#endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */ } /****************************************************************//** @@ -1425,138 +1515,155 @@ os_file_create_func( ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ ibool* success)/*!< out: TRUE if succeed, FALSE if error */ { + os_file_t file; + ibool retry; ibool on_error_no_exit; ibool on_error_silent; #ifdef __WIN__ - os_file_t file; - DWORD share_mode = FILE_SHARE_READ; + DBUG_EXECUTE_IF( + "ib_create_table_fail_disk_full", + *success = FALSE; + SetLastError(ERROR_DISK_FULL); + return((os_file_t) -1); + ); +#else /* __WIN__ */ + DBUG_EXECUTE_IF( + "ib_create_table_fail_disk_full", + *success = FALSE; + errno = ENOSPC; + return((os_file_t) -1); + ); +#endif /* __WIN__ */ + +#ifdef __WIN__ DWORD create_flag; - DWORD attributes; - ibool retry; + DWORD share_mode = FILE_SHARE_READ; on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT ? TRUE : FALSE; + on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT ? TRUE : FALSE; create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT; create_mode &= ~OS_FILE_ON_ERROR_SILENT; + if (create_mode == OS_FILE_OPEN_RAW) { - DBUG_EXECUTE_IF( - "ib_create_table_fail_disk_full", - *success = FALSE; - SetLastError(ERROR_DISK_FULL); - return((os_file_t) -1); - ); -try_again: - ut_a(name); + ut_a(!srv_read_only_mode); - if (create_mode == OS_FILE_OPEN_RAW) { create_flag = OPEN_EXISTING; - share_mode = FILE_SHARE_WRITE; + + /* On Windows Physical devices require admin privileges and + have to have the write-share mode set. See the remarks + section for the CreateFile() function documentation in MSDN. 
*/ + + share_mode |= FILE_SHARE_WRITE; + } else if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RETRY) { + create_flag = OPEN_EXISTING; + + } else if (srv_read_only_mode) { + + create_flag = OPEN_EXISTING; + } else if (create_mode == OS_FILE_CREATE) { + create_flag = CREATE_NEW; + } else if (create_mode == OS_FILE_OVERWRITE) { + create_flag = CREATE_ALWAYS; + } else { - create_flag = 0; - ut_error; + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + return((os_file_t) -1); } + DWORD attributes = 0; + +#ifdef UNIV_HOTBACKUP + attributes |= FILE_FLAG_NO_BUFFERING; +#else if (purpose == OS_FILE_AIO) { + +#ifdef WIN_ASYNC_IO /* If specified, use asynchronous (overlapped) io and no buffering of writes in the OS */ - attributes = 0; -#ifdef WIN_ASYNC_IO + if (srv_use_native_aio) { - attributes = attributes | FILE_FLAG_OVERLAPPED; + attributes |= FILE_FLAG_OVERLAPPED; } -#endif -#ifdef UNIV_NON_BUFFERED_IO -# ifndef UNIV_HOTBACKUP - if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) { - /* Do not use unbuffered i/o to log files because - value 2 denotes that we do not flush the log at every - commit, but only once per second */ - } else if (srv_win_file_flush_method - == SRV_WIN_IO_UNBUFFERED) { - attributes = attributes | FILE_FLAG_NO_BUFFERING; - } -# else /* !UNIV_HOTBACKUP */ - attributes = attributes | FILE_FLAG_NO_BUFFERING; -# endif /* !UNIV_HOTBACKUP */ -#endif /* UNIV_NON_BUFFERED_IO */ +#endif /* WIN_ASYNC_IO */ + } else if (purpose == OS_FILE_NORMAL) { - attributes = 0; + /* Use default setting. 
*/ + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown purpose flag (%lu) while opening file '%s'", + purpose, name); + + return((os_file_t)(-1)); + } + #ifdef UNIV_NON_BUFFERED_IO -# ifndef UNIV_HOTBACKUP - if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) { - /* Do not use unbuffered i/o to log files because - value 2 denotes that we do not flush the log at every - commit, but only once per second */ - } else if (srv_win_file_flush_method - == SRV_WIN_IO_UNBUFFERED) { - attributes = attributes | FILE_FLAG_NO_BUFFERING; - } -# else /* !UNIV_HOTBACKUP */ - attributes = attributes | FILE_FLAG_NO_BUFFERING; -# endif /* !UNIV_HOTBACKUP */ + // TODO: Create a bug, this looks wrong. The flush log + // parameter is dynamic. + if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) { + + /* Do not use unbuffered i/o for the log files because + value 2 denotes that we do not flush the log at every + commit, but only once per second */ + + } else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) { + + attributes |= FILE_FLAG_NO_BUFFERING; + } #endif /* UNIV_NON_BUFFERED_IO */ - } else { - attributes = 0; - ut_error; + +#endif /* UNIV_HOTBACKUP */ + DWORD access = GENERIC_READ; + + if (!srv_read_only_mode) { + access |= GENERIC_WRITE; } - file = CreateFile((LPCTSTR) name, - GENERIC_READ | GENERIC_WRITE, /* read and write - access */ - share_mode, /* File can be read also by other - processes; we must give the read - permission because of ibbackup. We do - not give the write permission to - others because if one would succeed to - start 2 instances of mysqld on the - SAME files, that could cause severe - database corruption! When opening - raw disk partitions, Microsoft manuals - say that we must give also the write - permission. */ - NULL, /* default security attributes */ - create_flag, - attributes, - NULL); /*!< no template file */ + do { + /* Use default security attributes and no template file. 
*/ + file = CreateFile( + (LPCTSTR) name, access, share_mode, NULL, + create_flag, attributes, NULL); - if (file == INVALID_HANDLE_VALUE) { - const char* operation; + if (file == INVALID_HANDLE_VALUE) { + const char* operation; - operation = create_mode == OS_FILE_CREATE ? "create" : "open"; + operation = (create_mode == OS_FILE_CREATE + && !srv_read_only_mode) + ? "create" : "open"; - *success = FALSE; + *success = FALSE; - if (on_error_no_exit) { - retry = os_file_handle_error_no_exit( - name, operation, on_error_silent); + if (on_error_no_exit) { + retry = os_file_handle_error_no_exit( + name, operation, on_error_silent); + } else { + retry = os_file_handle_error(name, operation); + } } else { - retry = os_file_handle_error(name, operation); + *success = TRUE; + retry = FALSE; } - if (retry) { - goto try_again; - } - } else { - *success = TRUE; - } + } while (retry); - return(file); #else /* __WIN__ */ - os_file_t file; int create_flag; - ibool retry; const char* mode_str = NULL; on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT @@ -1567,28 +1674,36 @@ try_again: create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT; create_mode &= ~OS_FILE_ON_ERROR_SILENT; - DBUG_EXECUTE_IF( - "ib_create_table_fail_disk_full", - *success = FALSE; - errno = ENOSPC; - return((os_file_t) -1); - ); -try_again: - ut_a(name); - - if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RAW + if (create_mode == OS_FILE_OPEN + || create_mode == OS_FILE_OPEN_RAW || create_mode == OS_FILE_OPEN_RETRY) { + + mode_str = "OPEN"; + + create_flag = srv_read_only_mode ? 
O_RDONLY : O_RDWR; + + } else if (srv_read_only_mode) { + mode_str = "OPEN"; - create_flag = O_RDWR; + + create_flag = O_RDONLY; + } else if (create_mode == OS_FILE_CREATE) { + mode_str = "CREATE"; create_flag = O_RDWR | O_CREAT | O_EXCL; + } else if (create_mode == OS_FILE_OVERWRITE) { + mode_str = "OVERWRITE"; create_flag = O_RDWR | O_CREAT | O_TRUNC; + } else { - create_flag = 0; - ut_error; + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + return((os_file_t) -1); } ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE); @@ -1598,69 +1713,75 @@ try_again: /* We let O_SYNC only affect log files; note that we map O_DSYNC to O_SYNC because the datasync options seemed to corrupt files in 2001 in both Linux and Solaris */ - if (type == OS_LOG_FILE - && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) { -# if 0 - fprintf(stderr, "Using O_SYNC for file %s\n", name); -# endif + if (!srv_read_only_mode + && type == OS_LOG_FILE + && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) { - create_flag = create_flag | O_SYNC; + create_flag |= O_SYNC; } #endif /* O_SYNC */ - file = open(name, create_flag, os_innodb_umask); - - if (file == -1) { - const char* operation; + do { + file = ::open(name, create_flag, os_innodb_umask); - operation = create_mode == OS_FILE_CREATE ? "create" : "open"; + if (file == -1) { + const char* operation; - *success = FALSE; + operation = (create_mode == OS_FILE_CREATE + && !srv_read_only_mode) + ? 
"create" : "open"; - if (on_error_no_exit) { - retry = os_file_handle_error_no_exit( - name, operation, on_error_silent); - } else { - retry = os_file_handle_error(name, operation); - } + *success = FALSE; - if (retry) { - goto try_again; + if (on_error_no_exit) { + retry = os_file_handle_error_no_exit( + name, operation, on_error_silent); + } else { + retry = os_file_handle_error(name, operation); + } } else { - return(file /* -1 */); + *success = TRUE; + retry = false; } - } - /* else */ - *success = TRUE; + } while (retry); /* We disable OS caching (O_DIRECT) only on data files */ - if (type != OS_LOG_FILE - && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) { + + if (!srv_read_only_mode + && *success + && type != OS_LOG_FILE + && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT + || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) { os_file_set_nocache(file, name, mode_str); } #ifdef USE_FILE_LOCK - if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) { + if (!srv_read_only_mode + && *success + && create_mode != OS_FILE_OPEN_RAW + && os_file_lock(file, name)) { if (create_mode == OS_FILE_OPEN_RETRY) { - int i; - ut_print_timestamp(stderr); - fputs(" InnoDB: Retrying to lock" - " the first data file\n", - stderr); - for (i = 0; i < 100; i++) { + + ut_a(!srv_read_only_mode); + + ib_logf(IB_LOG_LEVEL_INFO, + "Retrying to lock the first data file"); + + for (int i = 0; i < 100; i++) { os_thread_sleep(1000000); + if (!os_file_lock(file, name)) { *success = TRUE; return(file); } } - ut_print_timestamp(stderr); - fputs(" InnoDB: Unable to open the first data file\n", - stderr); + + ib_logf(IB_LOG_LEVEL_INFO, + "Unable to open the first data file"); } *success = FALSE; @@ -1669,22 +1790,23 @@ try_again: } #endif /* USE_FILE_LOCK */ - return(file); #endif /* __WIN__ */ + + return(file); } /***********************************************************************//** Deletes a file if it exists. The file has to be closed before calling this. 
@return TRUE if success */ UNIV_INTERN -ibool +bool os_file_delete_if_exists( /*=====================*/ const char* name) /*!< in: file path as a null-terminated string */ { #ifdef __WIN__ - BOOL ret; + bool ret; ulint count = 0; loop: /* In Windows, deleting an .ibd file may fail if ibbackup is copying @@ -1693,31 +1815,30 @@ loop: ret = DeleteFile((LPCTSTR) name); if (ret) { - return(TRUE); + return(true); } - if (GetLastError() == ERROR_FILE_NOT_FOUND) { + DWORD lasterr = GetLastError(); + if (lasterr == ERROR_FILE_NOT_FOUND + || lasterr == ERROR_PATH_NOT_FOUND) { /* the file does not exist, this not an error */ - return(TRUE); + return(true); } count++; if (count > 100 && 0 == (count % 10)) { - fprintf(stderr, - "InnoDB: Warning: cannot delete file %s\n" - "InnoDB: Are you running ibbackup" - " to back up the file?\n", name); + os_file_get_last_error(true); /* print error information */ - os_file_get_last_error(TRUE); /* print error information */ + ib_logf(IB_LOG_LEVEL_WARN, "Delete of file %s failed.", name); } os_thread_sleep(1000000); /* sleep for a second */ if (count > 2000) { - return(FALSE); + return(false); } goto loop; @@ -1729,18 +1850,18 @@ loop: if (ret != 0 && errno != ENOENT) { os_file_handle_error_no_exit(name, "delete", FALSE); - return(FALSE); + return(false); } - return(TRUE); -#endif + return(true); +#endif /* __WIN__ */ } /***********************************************************************//** Deletes a file. The file has to be closed before calling this. 
@return TRUE if success */ UNIV_INTERN -ibool +bool os_file_delete( /*===========*/ const char* name) /*!< in: file path as a null-terminated @@ -1756,32 +1877,32 @@ loop: ret = DeleteFile((LPCTSTR) name); if (ret) { - return(TRUE); + return(true); } if (GetLastError() == ERROR_FILE_NOT_FOUND) { /* If the file does not exist, we classify this as a 'mild' error and return */ - return(FALSE); + return(false); } count++; if (count > 100 && 0 == (count % 10)) { + os_file_get_last_error(true); /* print error information */ + fprintf(stderr, "InnoDB: Warning: cannot delete file %s\n" "InnoDB: Are you running ibbackup" " to back up the file?\n", name); - - os_file_get_last_error(TRUE); /* print error information */ } os_thread_sleep(1000000); /* sleep for a second */ if (count > 2000) { - return(FALSE); + return(false); } goto loop; @@ -1793,10 +1914,10 @@ loop: if (ret != 0) { os_file_handle_error_no_exit(name, "delete", FALSE); - return(FALSE); + return(false); } - return(TRUE); + return(true); #endif } @@ -1813,6 +1934,19 @@ os_file_rename_func( string */ const char* newpath)/*!< in: new file path */ { +#ifdef UNIV_DEBUG + os_file_type_t type; + ibool exists; + + /* New path must not exist. */ + ut_ad(os_file_status(newpath, &exists, &type)); + ut_ad(!exists); + + /* Old path must exist. 
*/ + ut_ad(os_file_status(oldpath, &exists, &type)); + ut_ad(exists); +#endif /* UNIV_DEBUG */ + #ifdef __WIN__ BOOL ret; @@ -1837,7 +1971,7 @@ os_file_rename_func( } return(TRUE); -#endif +#endif /* __WIN__ */ } /***********************************************************************//** @@ -1877,7 +2011,7 @@ os_file_close_func( } return(TRUE); -#endif +#endif /* __WIN__ */ } #ifdef UNIV_HOTBACKUP @@ -1913,7 +2047,7 @@ os_file_close_no_error_handling( } return(TRUE); -#endif +#endif /* __WIN__ */ } #endif /* UNIV_HOTBACKUP */ @@ -1942,7 +2076,7 @@ os_file_get_size( return(offset); #else return((os_offset_t) lseek(file, 0, SEEK_END)); -#endif +#endif /* __WIN__ */ } /***********************************************************************//** @@ -2175,10 +2309,7 @@ os_file_flush_func( return(TRUE); } - ut_print_timestamp(stderr); - - fprintf(stderr, - " InnoDB: Error: the OS said file flush did not succeed\n"); + ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed"); os_file_handle_error(NULL, "flush"); @@ -2215,9 +2346,9 @@ os_file_pread( offs = (off_t) offset; if (sizeof(off_t) <= 4) { - if (UNIV_UNLIKELY(offset != (os_offset_t) offs)) { - fprintf(stderr, - "InnoDB: Error: file read at offset > 4 GB\n"); + if (offset != (os_offset_t) offs) { + ib_logf(IB_LOG_LEVEL_ERROR, + "File read at offset > 4 GB"); } } @@ -2318,16 +2449,16 @@ os_file_pwrite( off_t offs; ut_ad(n); + ut_ad(!srv_read_only_mode); /* If off_t is > 4 bytes in size, then we assume we can pass a 64-bit address */ offs = (off_t) offset; if (sizeof(off_t) <= 4) { - if (UNIV_UNLIKELY(offset != (os_offset_t) offs)) { - fprintf(stderr, - "InnoDB: Error: file write" - " at offset > 4 GB\n"); + if (offset != (os_offset_t) offs) { + ib_logf(IB_LOG_LEVEL_ERROR, + "File write at offset > 4 GB."); } } @@ -2402,7 +2533,7 @@ func_exit: return(ret); } -#endif +#endif /* !UNIV_HOTBACKUP */ } #endif @@ -2503,11 +2634,9 @@ try_again: return(TRUE); } - fprintf(stderr, - "InnoDB: Error: tried to read 
"ULINTPF" bytes at offset " - UINT64PF"\n" - "InnoDB: Was only able to read %ld.\n", - n, offset, (lint) ret); + ib_logf(IB_LOG_LEVEL_ERROR, + "Tried to read "ULINTPF" bytes at offset " UINT64PF". " + "Was only able to read %ld.", n, offset, (lint) ret); #endif /* __WIN__ */ #ifdef __WIN__ error_handling: @@ -2525,7 +2654,7 @@ error_handling: (ulong) GetLastError() #else (ulong) errno -#endif +#endif /* __WIN__ */ ); fflush(stderr); @@ -2683,6 +2812,8 @@ os_file_write_func( os_offset_t offset, /*!< in: file offset where to write */ ulint n) /*!< in: number of bytes to write */ { + ut_ad(!srv_read_only_mode); + #ifdef __WIN__ BOOL ret; DWORD len; @@ -2842,8 +2973,8 @@ retry: (ulint) errno); if (strerror(errno) != NULL) { fprintf(stderr, - "InnoDB: Error number %lu means '%s'.\n", - (ulint) errno, strerror(errno)); + "InnoDB: Error number %d means '%s'.\n", + errno, strerror(errno)); } fprintf(stderr, @@ -2866,15 +2997,15 @@ UNIV_INTERN ibool os_file_status( /*===========*/ - const char* path, /*!< in: pathname of the file */ + const char* path, /*!< in: pathname of the file */ ibool* exists, /*!< out: TRUE if file exists */ os_file_type_t* type) /*!< out: type of the file (if it exists) */ { #ifdef __WIN__ int ret; - struct _stat statinfo; + struct _stat64 statinfo; - ret = _stat(path, &statinfo); + ret = _stat64(path, &statinfo); if (ret && (errno == ENOENT || errno == ENOTDIR)) { /* file does not exist */ *exists = FALSE; @@ -2933,47 +3064,73 @@ os_file_status( /*******************************************************************//** This function returns information about the specified file -@return TRUE if stat information found */ +@return DB_SUCCESS if all OK */ UNIV_INTERN -ibool +dberr_t os_file_get_status( /*===============*/ const char* path, /*!< in: pathname of the file */ - os_file_stat_t* stat_info) /*!< information of a file in a + os_file_stat_t* stat_info, /*!< information of a file in a directory */ + bool check_rw_perm) /*!< in: for testing whether 
the + file can be opened in RW mode */ { -#ifdef __WIN__ int ret; - struct _stat statinfo; - ret = _stat(path, &statinfo); +#ifdef __WIN__ + struct _stat64 statinfo; + + ret = _stat64(path, &statinfo); + if (ret && (errno == ENOENT || errno == ENOTDIR)) { /* file does not exist */ - return(FALSE); + return(DB_NOT_FOUND); + } else if (ret) { /* file exists, but stat call failed */ os_file_handle_error_no_exit(path, "stat", FALSE); - return(FALSE); - } - if (_S_IFDIR & statinfo.st_mode) { + return(DB_FAIL); + + } else if (_S_IFDIR & statinfo.st_mode) { stat_info->type = OS_FILE_TYPE_DIR; } else if (_S_IFREG & statinfo.st_mode) { + + DWORD access = GENERIC_READ; + + if (!srv_read_only_mode) { + access |= GENERIC_WRITE; + } + stat_info->type = OS_FILE_TYPE_FILE; + + /* Check if we can open it in read-only mode. */ + + if (check_rw_perm) { + HANDLE fh; + + fh = CreateFile( + (LPCTSTR) path, // File to open + access, + 0, // No sharing + NULL, // Default security + OPEN_EXISTING, // Existing file only + FILE_ATTRIBUTE_NORMAL, // Normal file + NULL); // No attr. 
template + + if (fh == INVALID_HANDLE_VALUE) { + stat_info->rw_perm = false; + } else { + stat_info->rw_perm = true; + CloseHandle(fh); + } + } } else { stat_info->type = OS_FILE_TYPE_UNKNOWN; } - - stat_info->ctime = statinfo.st_ctime; - stat_info->atime = statinfo.st_atime; - stat_info->mtime = statinfo.st_mtime; - stat_info->size = statinfo.st_size; - - return(TRUE); #else - int ret; struct stat statinfo; ret = stat(path, &statinfo); @@ -2981,32 +3138,49 @@ os_file_get_status( if (ret && (errno == ENOENT || errno == ENOTDIR)) { /* file does not exist */ - return(FALSE); + return(DB_NOT_FOUND); + } else if (ret) { /* file exists, but stat call failed */ os_file_handle_error_no_exit(path, "stat", FALSE); - return(FALSE); - } + return(DB_FAIL); - if (S_ISDIR(statinfo.st_mode)) { + } else if (S_ISDIR(statinfo.st_mode)) { stat_info->type = OS_FILE_TYPE_DIR; } else if (S_ISLNK(statinfo.st_mode)) { stat_info->type = OS_FILE_TYPE_LINK; } else if (S_ISREG(statinfo.st_mode)) { stat_info->type = OS_FILE_TYPE_FILE; + + if (check_rw_perm) { + int fh; + int access; + + access = !srv_read_only_mode ? O_RDWR : O_RDONLY; + + fh = ::open(path, access, os_innodb_umask); + + if (fh == -1) { + stat_info->rw_perm = false; + } else { + stat_info->rw_perm = true; + close(fh); + } + } } else { stat_info->type = OS_FILE_TYPE_UNKNOWN; } +#endif /* _WIN_ */ + stat_info->ctime = statinfo.st_ctime; stat_info->atime = statinfo.st_atime; stat_info->mtime = statinfo.st_mtime; - stat_info->size = statinfo.st_size; + stat_info->size = statinfo.st_size; - return(TRUE); -#endif + return(DB_SUCCESS); } /* path name separator character */ @@ -3017,6 +3191,153 @@ os_file_get_status( #endif /****************************************************************//** +This function returns a new path name after replacing the basename +in an old path with a new basename. The old_path is a full path +name including the extension. The tablename is in the normal +form "databasename/tablename". 
The new base name is found after +the forward slash. Both input strings are null terminated. + +This function allocates memory to be returned. It is the callers +responsibility to free the return value after it is no longer needed. + +@return own: new full pathname */ +UNIV_INTERN +char* +os_file_make_new_pathname( +/*======================*/ + const char* old_path, /*!< in: pathname */ + const char* tablename) /*!< in: contains new base name */ +{ + ulint dir_len; + char* last_slash; + char* base_name; + char* new_path; + ulint new_path_len; + + /* Split the tablename into its database and table name components. + They are separated by a '/'. */ + last_slash = strrchr((char*) tablename, '/'); + base_name = last_slash ? last_slash + 1 : (char*) tablename; + + /* Find the offset of the last slash. We will strip off the + old basename.ibd which starts after that slash. */ + last_slash = strrchr((char*) old_path, OS_FILE_PATH_SEPARATOR); + dir_len = last_slash ? last_slash - old_path : strlen(old_path); + + /* allocate a new path and move the old directory path to it. */ + new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd"; + new_path = static_cast<char*>(mem_alloc(new_path_len)); + memcpy(new_path, old_path, dir_len); + + ut_snprintf(new_path + dir_len, + new_path_len - dir_len, + "%c%s.ibd", + OS_FILE_PATH_SEPARATOR, + base_name); + + return(new_path); +} + +/****************************************************************//** +This function returns a remote path name by combining a data directory +path provided in a DATA DIRECTORY clause with the tablename which is +in the form 'database/tablename'. It strips the file basename (which +is the tablename) found after the last directory in the path provided. +The full filepath created will include the database name as a directory +under the path provided. The filename is the tablename with the '.ibd' +extension. All input and output strings are null-terminated. + +This function allocates memory to be returned. 
It is the callers +responsibility to free the return value after it is no longer needed. + +@return own: A full pathname; data_dir_path/databasename/tablename.ibd */ +UNIV_INTERN +char* +os_file_make_remote_pathname( +/*=========================*/ + const char* data_dir_path, /*!< in: pathname */ + const char* tablename, /*!< in: tablename */ + const char* extention) /*!< in: file extention; ibd,cfg */ +{ + ulint data_dir_len; + char* last_slash; + char* new_path; + ulint new_path_len; + + ut_ad(extention && strlen(extention) == 3); + + /* Find the offset of the last slash. We will strip off the + old basename or tablename which starts after that slash. */ + last_slash = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR); + data_dir_len = last_slash ? last_slash - data_dir_path : strlen(data_dir_path); + + /* allocate a new path and move the old directory path to it. */ + new_path_len = data_dir_len + strlen(tablename) + + sizeof "/." + strlen(extention); + new_path = static_cast<char*>(mem_alloc(new_path_len)); + memcpy(new_path, data_dir_path, data_dir_len); + ut_snprintf(new_path + data_dir_len, + new_path_len - data_dir_len, + "%c%s.%s", + OS_FILE_PATH_SEPARATOR, + tablename, + extention); + + srv_normalize_path_for_win(new_path); + + return(new_path); +} + +/****************************************************************//** +This function reduces a null-terminated full remote path name into +the path that is sent by MySQL for DATA DIRECTORY clause. It replaces +the 'databasename/tablename.ibd' found at the end of the path with just +'tablename'. + +Since the result is always smaller than the path sent in, no new memory +is allocated. The caller should allocate memory for the path sent in. +This function manipulates that path in place. + +If the path format is not as expected, just return. The result is used +to inform a SHOW CREATE TABLE command. 
*/ +UNIV_INTERN +void +os_file_make_data_dir_path( +/*========================*/ + char* data_dir_path) /*!< in/out: full path/data_dir_path */ +{ + char* ptr; + char* tablename; + ulint tablename_len; + + /* Replace the period before the extension with a null byte. */ + ptr = strrchr((char*) data_dir_path, '.'); + if (!ptr) { + return; + } + ptr[0] = '\0'; + + /* The tablename starts after the last slash. */ + ptr = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR); + if (!ptr) { + return; + } + ptr[0] = '\0'; + tablename = ptr + 1; + + /* The databasename starts after the next to last slash. */ + ptr = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR); + if (!ptr) { + return; + } + tablename_len = ut_strlen(tablename); + + ut_memmove(++ptr, tablename, tablename_len); + + ptr[tablename_len] = '\0'; +} + +/****************************************************************//** The function os_file_dirname returns a directory component of a null-terminated pathname string. In the usual case, dirname returns the string up to, but not including, the final '/', and basename @@ -3080,11 +3401,18 @@ os_file_create_subdirs_if_needed( /*=============================*/ const char* path) /*!< in: path name */ { - char* subdir; - ibool success, subdir_exists; - os_file_type_t type; + if (srv_read_only_mode) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "read only mode set. 
Can't create subdirectories '%s'", + path); + + return(FALSE); + + } + + char* subdir = os_file_dirname(path); - subdir = os_file_dirname(path); if (strlen(subdir) == 1 && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) { /* subdir is root or cwd, nothing to do */ @@ -3094,15 +3422,21 @@ os_file_create_subdirs_if_needed( } /* Test if subdir exists */ - success = os_file_status(subdir, &subdir_exists, &type); + os_file_type_t type; + ibool subdir_exists; + ibool success = os_file_status(subdir, &subdir_exists, &type); + if (success && !subdir_exists) { + /* subdir does not exist, create it */ success = os_file_create_subdirs_if_needed(subdir); + if (!success) { mem_free(subdir); return(FALSE); } + success = os_file_create_directory(subdir, FALSE); } @@ -3124,7 +3458,7 @@ os_aio_array_get_nth_slot( { ut_a(index < array->n_slots); - return((array->slots) + index); + return(&array->slots[index]); } #if defined(LINUX_NATIVE_AIO) @@ -3226,43 +3560,74 @@ os_aio_native_aio_supported(void) /*=============================*/ { int fd; - byte* buf; - byte* ptr; - struct io_event io_event; io_context_t io_ctx; - struct iocb iocb; - struct iocb* p_iocb; - int err; + char name[1000]; if (!os_aio_linux_create_io_ctx(1, &io_ctx)) { /* The platform does not support native aio. */ return(FALSE); - } + } else if (!srv_read_only_mode) { + /* Now check if tmpdir supports native aio ops. */ + fd = innobase_mysql_tmpfile(); - /* Now check if tmpdir supports native aio ops. 
*/ - fd = innobase_mysql_tmpfile(); + if (fd < 0) { + ib_logf(IB_LOG_LEVEL_WARN, + "Unable to create temp file to check " + "native AIO support."); - if (fd < 0) { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error: unable to create " - "temp file to check native AIO support.\n"); + return(FALSE); + } + } else { - return(FALSE); + srv_normalize_path_for_win(srv_log_group_home_dir); + + ulint dirnamelen = strlen(srv_log_group_home_dir); + ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile"); + memcpy(name, srv_log_group_home_dir, dirnamelen); + + /* Add a path separator if needed. */ + if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) { + name[dirnamelen++] = SRV_PATH_SEPARATOR; + } + + strcpy(name + dirnamelen, "ib_logfile0"); + + fd = ::open(name, O_RDONLY); + + if (fd == -1) { + + ib_logf(IB_LOG_LEVEL_WARN, + "Unable to open \"%s\" to check " + "native AIO read support.", name); + + return(FALSE); + } } + struct io_event io_event; + memset(&io_event, 0x0, sizeof(io_event)); - buf = static_cast<byte*>(ut_malloc(UNIV_PAGE_SIZE * 2)); - ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE)); + byte* buf = static_cast<byte*>(ut_malloc(UNIV_PAGE_SIZE * 2)); + byte* ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE)); + + struct iocb iocb; /* Suppress valgrind warning. */ memset(buf, 0x00, UNIV_PAGE_SIZE * 2); - memset(&iocb, 0x0, sizeof(iocb)); - p_iocb = &iocb; - io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0); - err = io_submit(io_ctx, 1, &p_iocb); + struct iocb* p_iocb = &iocb; + + if (!srv_read_only_mode) { + io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0); + } else { + ut_a(UNIV_PAGE_SIZE >= 512); + io_prep_pread(p_iocb, fd, ptr, 512, 0); + } + + int err = io_submit(io_ctx, 1, &p_iocb); + if (err >= 1) { /* Now collect the submitted IO request. 
*/ err = io_getevents(io_ctx, 1, 1, &io_event, NULL); @@ -3277,22 +3642,18 @@ os_aio_native_aio_supported(void) case -EINVAL: case -ENOSYS: - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Error: Linux Native AIO is not" - " supported on tmpdir.\n" - "InnoDB: You can either move tmpdir to a" - " file system that supports native AIO\n" - "InnoDB: or you can set" - " innodb_use_native_aio to FALSE to avoid" - " this message.\n"); + ib_logf(IB_LOG_LEVEL_ERROR, + "Linux Native AIO not supported. You can either " + "move %s to a file system that supports native " + "AIO or you can set innodb_use_native_aio to " + "FALSE to avoid this message.", + srv_read_only_mode ? name : "tmpdir"); /* fall through. */ default: - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Error: Linux Native AIO check" - " on tmpdir returned error[%d]\n", -err); + ib_logf(IB_LOG_LEVEL_ERROR, + "Linux Native AIO check on %s returned error[%d]", + srv_read_only_mode ? name : "tmpdir", -err); } return(FALSE); @@ -3314,34 +3675,33 @@ os_aio_array_create( ulint n_segments) /*!< in: number of segments in the aio array */ { os_aio_array_t* array; - ulint i; - os_aio_slot_t* slot; #ifdef WIN_ASYNC_IO OVERLAPPED* over; #elif defined(LINUX_NATIVE_AIO) struct io_event* io_event = NULL; -#endif +#endif /* WIN_ASYNC_IO */ ut_a(n > 0); ut_a(n_segments > 0); - array = static_cast<os_aio_array_t*>(ut_malloc(sizeof(os_aio_array_t))); + array = static_cast<os_aio_array_t*>(ut_malloc(sizeof(*array))); + memset(array, 0x0, sizeof(*array)); - array->mutex = os_mutex_create(); - array->not_full = os_event_create(NULL); - array->is_empty = os_event_create(NULL); + array->mutex = os_mutex_create(); + array->not_full = os_event_create(); + array->is_empty = os_event_create(); os_event_set(array->is_empty); - array->n_slots = n; - array->n_segments = n_segments; - array->n_reserved = 0; - array->cur_seg = 0; + array->n_slots = n; + array->n_segments = n_segments; array->slots = 
static_cast<os_aio_slot_t*>( - ut_malloc(n * sizeof(os_aio_slot_t))); + ut_malloc(n * sizeof(*array->slots))); + + memset(array->slots, 0x0, sizeof(n * sizeof(*array->slots))); #ifdef __WIN__ array->handles = static_cast<HANDLE*>(ut_malloc(n * sizeof(HANDLE))); -#endif +#endif /* __WIN__ */ #if defined(LINUX_NATIVE_AIO) array->aio_ctx = NULL; @@ -3359,16 +3719,27 @@ os_aio_array_create( array->aio_ctx = static_cast<io_context**>( ut_malloc(n_segments * sizeof(*array->aio_ctx))); - for (i = 0; i < n_segments; ++i) { + for (ulint i = 0; i < n_segments; ++i) { if (!os_aio_linux_create_io_ctx(n/n_segments, &array->aio_ctx[i])) { /* If something bad happened during aio setup - we should call it a day and return right away. - We don't care about any leaks because a failure - to initialize the io subsystem means that the - server (or atleast the innodb storage engine) - is not going to startup. */ - return(NULL); + we disable linux native aio. + The disadvantage will be a small memory leak + at shutdown but that's ok compared to a crash + or a not working server. + This frequently happens when running the test suite + with many threads on a system with low fs.aio-max-nr! + */ + + fprintf(stderr, + " InnoDB: Warning: Linux Native AIO disabled " + "because os_aio_linux_create_io_ctx() " + "failed. 
To get rid of this warning you can " + "try increasing system " + "fs.aio-max-nr to 1048576 or larger or " + "setting innodb_use_native_aio = 0 in my.cnf\n"); + srv_use_native_aio = FALSE; + goto skip_native_aio; } } @@ -3381,7 +3752,9 @@ os_aio_array_create( skip_native_aio: #endif /* LINUX_NATIVE_AIO */ - for (i = 0; i < n; i++) { + for (ulint i = 0; i < n; i++) { + os_aio_slot_t* slot; + slot = os_aio_array_get_nth_slot(array, i); slot->pos = i; @@ -3389,18 +3762,17 @@ skip_native_aio: #ifdef WIN_ASYNC_IO slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL); - over = &(slot->control); + over = &slot->control; over->hEvent = slot->handle; - *((array->handles) + i) = over->hEvent; + array->handles[i] = over->hEvent; #elif defined(LINUX_NATIVE_AIO) - memset(&slot->control, 0x0, sizeof(slot->control)); slot->n_bytes = 0; slot->ret = 0; -#endif +#endif /* WIN_ASYNC_IO */ } return(array); @@ -3412,7 +3784,7 @@ static void os_aio_array_free( /*==============*/ - os_aio_array_t* array) /*!< in, own: array to free */ + os_aio_array_t*& array) /*!< in, own: array to free */ { #ifdef WIN_ASYNC_IO ulint i; @@ -3439,6 +3811,8 @@ os_aio_array_free( ut_free(array->slots); ut_free(array); + + array = 0; } /*********************************************************************** @@ -3459,93 +3833,100 @@ os_aio_init( ulint n_slots_sync) /*<! 
in: number of slots in the sync aio array */ { - ulint i; - ulint n_segments = 2 + n_read_segs + n_write_segs; - - ut_ad(n_segments >= 4); - os_io_init_simple(); #if defined(LINUX_NATIVE_AIO) /* Check if native aio is supported on this system and tmpfs */ - if (srv_use_native_aio - && !os_aio_native_aio_supported()) { + if (srv_use_native_aio && !os_aio_native_aio_supported()) { + + ib_logf(IB_LOG_LEVEL_WARN, "Linux Native AIO disabled."); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Warning: Linux Native AIO" - " disabled.\n"); srv_use_native_aio = FALSE; } #endif /* LINUX_NATIVE_AIO */ - for (i = 0; i < n_segments; i++) { - srv_set_io_thread_op_info(i, "not started yet"); - } - + srv_reset_io_thread_op_info(); - /* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */ + os_aio_read_array = os_aio_array_create( + n_read_segs * n_per_seg, n_read_segs); - os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1); - if (os_aio_ibuf_array == NULL) { - goto err_exit; + if (os_aio_read_array == NULL) { + return(FALSE); } - srv_io_thread_function[0] = "insert buffer thread"; + ulint start = (srv_read_only_mode) ? 0 : 2; + ulint n_segs = n_read_segs + start; - os_aio_log_array = os_aio_array_create(n_per_seg, 1); - if (os_aio_log_array == NULL) { - goto err_exit; + /* 0 is the ibuf segment and 1 is the insert buffer segment. 
*/ + for (ulint i = start; i < n_segs; ++i) { + ut_a(i < SRV_MAX_N_IO_THREADS); + srv_io_thread_function[i] = "read thread"; } - srv_io_thread_function[1] = "log thread"; + ulint n_segments = n_read_segs; - os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg, - n_read_segs); - if (os_aio_read_array == NULL) { - goto err_exit; - } + if (!srv_read_only_mode) { - for (i = 2; i < 2 + n_read_segs; i++) { - ut_a(i < SRV_MAX_N_IO_THREADS); - srv_io_thread_function[i] = "read thread"; - } + os_aio_log_array = os_aio_array_create(n_per_seg, 1); - os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg, - n_write_segs); - if (os_aio_write_array == NULL) { - goto err_exit; - } + if (os_aio_log_array == NULL) { + return(FALSE); + } - for (i = 2 + n_read_segs; i < n_segments; i++) { - ut_a(i < SRV_MAX_N_IO_THREADS); - srv_io_thread_function[i] = "write thread"; + ++n_segments; + + srv_io_thread_function[1] = "log thread"; + + os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1); + + if (os_aio_ibuf_array == NULL) { + return(FALSE); + } + + ++n_segments; + + srv_io_thread_function[0] = "insert buffer thread"; + + os_aio_write_array = os_aio_array_create( + n_write_segs * n_per_seg, n_write_segs); + + if (os_aio_write_array == NULL) { + return(FALSE); + } + + n_segments += n_write_segs; + + for (ulint i = start + n_read_segs; i < n_segments; ++i) { + ut_a(i < SRV_MAX_N_IO_THREADS); + srv_io_thread_function[i] = "write thread"; + } + + ut_ad(n_segments >= 4); + } else { + ut_ad(n_segments > 0); } os_aio_sync_array = os_aio_array_create(n_slots_sync, 1); + if (os_aio_sync_array == NULL) { - goto err_exit; + return(FALSE); } - os_aio_n_segments = n_segments; os_aio_validate(); - os_aio_segment_wait_events = static_cast<os_event_struct_t**>( - ut_malloc(n_segments * sizeof(void*))); + os_aio_segment_wait_events = static_cast<os_event_t*>( + ut_malloc(n_segments * sizeof *os_aio_segment_wait_events)); - for (i = 0; i < n_segments; i++) { - 
os_aio_segment_wait_events[i] = os_event_create(NULL); + for (ulint i = 0; i < n_segments; ++i) { + os_aio_segment_wait_events[i] = os_event_create(); } - os_last_printout = time(NULL); + os_last_printout = ut_time(); return(TRUE); -err_exit: - return(FALSE); - } /*********************************************************************** @@ -3555,20 +3936,25 @@ void os_aio_free(void) /*=============*/ { - ulint i; + if (os_aio_ibuf_array != 0) { + os_aio_array_free(os_aio_ibuf_array); + } + + if (os_aio_log_array != 0) { + os_aio_array_free(os_aio_log_array); + } + + if (os_aio_write_array != 0) { + os_aio_array_free(os_aio_write_array); + } + + if (os_aio_sync_array != 0) { + os_aio_array_free(os_aio_sync_array); + } - os_aio_array_free(os_aio_ibuf_array); - os_aio_ibuf_array = NULL; - os_aio_array_free(os_aio_log_array); - os_aio_log_array = NULL; os_aio_array_free(os_aio_read_array); - os_aio_read_array = NULL; - os_aio_array_free(os_aio_write_array); - os_aio_write_array = NULL; - os_aio_array_free(os_aio_sync_array); - os_aio_sync_array = NULL; - for (i = 0; i < os_aio_n_segments; i++) { + for (ulint i = 0; i < os_aio_n_segments; i++) { os_event_free(os_aio_segment_wait_events[i]); } @@ -3604,14 +3990,20 @@ void os_aio_wake_all_threads_at_shutdown(void) /*=====================================*/ { - ulint i; - #ifdef WIN_ASYNC_IO /* This code wakes up all ai/o threads in Windows native aio */ os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array); - os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array); - os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array); - os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array); + if (os_aio_write_array != 0) { + os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array); + } + + if (os_aio_ibuf_array != 0) { + os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array); + } + + if (os_aio_log_array != 0) { + os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array); + } #elif defined(LINUX_NATIVE_AIO) @@ -3623,12 +4015,14 
@@ os_aio_wake_all_threads_at_shutdown(void) if (srv_use_native_aio) { return; } + /* Fall through to simulated AIO handler wakeup if we are not using native AIO. */ -#endif +#endif /* !WIN_ASYNC_AIO */ + /* This loop wakes up all simulated ai/o threads */ - for (i = 0; i < os_aio_n_segments; i++) { + for (ulint i = 0; i < os_aio_n_segments; i++) { os_event_set(os_aio_segment_wait_events[i]); } @@ -3642,6 +4036,7 @@ void os_aio_wait_until_no_pending_writes(void) /*=====================================*/ { + ut_ad(!srv_read_only_mode); os_event_wait(os_aio_write_array->is_empty); } @@ -3660,10 +4055,14 @@ os_aio_get_segment_no_from_slot( ulint seg_len; if (array == os_aio_ibuf_array) { - segment = 0; + ut_ad(!srv_read_only_mode); + + segment = IO_IBUF_SEGMENT; } else if (array == os_aio_log_array) { - segment = 1; + ut_ad(!srv_read_only_mode); + + segment = IO_LOG_SEGMENT; } else if (array == os_aio_read_array) { seg_len = os_aio_read_array->n_slots @@ -3671,7 +4070,9 @@ os_aio_get_segment_no_from_slot( segment = 2 + slot->pos / seg_len; } else { + ut_ad(!srv_read_only_mode); ut_a(array == os_aio_write_array); + seg_len = os_aio_write_array->n_slots / os_aio_write_array->n_segments; @@ -3692,15 +4093,19 @@ os_aio_get_array_and_local_segment( os_aio_array_t** array, /*!< out: aio wait array */ ulint global_segment)/*!< in: global segment number */ { - ulint segment; + ulint segment; ut_a(global_segment < os_aio_n_segments); - if (global_segment == 0) { + if (srv_read_only_mode) { + *array = os_aio_read_array; + + return(global_segment); + } else if (global_segment == IO_IBUF_SEGMENT) { *array = os_aio_ibuf_array; segment = 0; - } else if (global_segment == 1) { + } else if (global_segment == IO_LOG_SEGMENT) { *array = os_aio_log_array; segment = 0; @@ -3748,7 +4153,7 @@ os_aio_array_reserve_slot( struct iocb* iocb; off_t aio_offset; -#endif +#endif /* WIN_ASYNC_IO */ ulint i; ulint counter; ulint slots_per_seg; @@ -3756,7 +4161,7 @@ os_aio_array_reserve_slot( #ifdef 
WIN_ASYNC_IO ut_a((len & 0xFFFFFFFFUL) == len); -#endif +#endif /* WIN_ASYNC_IO */ /* No need of a mutex. Only reading constant fields */ slots_per_seg = array->n_slots / array->n_segments; @@ -3789,9 +4194,11 @@ loop: local segment and do a full scan of the array. We are guaranteed to find a slot in full scan. */ for (i = local_seg * slots_per_seg, counter = 0; - counter < array->n_slots; i++, counter++) { + counter < array->n_slots; + i++, counter++) { i %= array->n_slots; + slot = os_aio_array_get_nth_slot(array, i); if (slot->reserved == FALSE) { @@ -3815,7 +4222,7 @@ found: } slot->reserved = TRUE; - slot->reservation_time = time(NULL); + slot->reservation_time = ut_time(); slot->message1 = message1; slot->message2 = message2; slot->file = file; @@ -3827,7 +4234,7 @@ found: slot->io_already_done = FALSE; #ifdef WIN_ASYNC_IO - control = &(slot->control); + control = &slot->control; control->Offset = (DWORD) offset & 0xFFFFFFFF; control->OffsetHigh = (DWORD) (offset >> 32); ResetEvent(slot->handle); @@ -3858,7 +4265,6 @@ found: iocb->data = (void*) slot; slot->n_bytes = 0; slot->ret = 0; - /*fprintf(stderr, "Filled up Linux native iocb.\n");*/ skip_native_aio: #endif /* LINUX_NATIVE_AIO */ @@ -3876,9 +4282,6 @@ os_aio_array_free_slot( os_aio_array_t* array, /*!< in: aio array */ os_aio_slot_t* slot) /*!< in: pointer to slot */ { - ut_ad(array); - ut_ad(slot); - os_mutex_enter(array->mutex); ut_ad(slot->reserved); @@ -3927,36 +4330,42 @@ os_aio_simulated_wake_handler_thread( arrays */ { os_aio_array_t* array; - os_aio_slot_t* slot; ulint segment; - ulint n; - ulint i; ut_ad(!srv_use_native_aio); segment = os_aio_get_array_and_local_segment(&array, global_segment); - n = array->n_slots / array->n_segments; + ulint n = array->n_slots / array->n_segments; + + segment *= n; /* Look through n slots after the segment * n'th slot */ os_mutex_enter(array->mutex); - for (i = 0; i < n; i++) { - slot = os_aio_array_get_nth_slot(array, i + segment * n); + for (ulint i = 0; i 
< n; ++i) { + const os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot(array, segment + i); if (slot->reserved) { + /* Found an i/o request */ - break; + os_mutex_exit(array->mutex); + + os_event_t event; + + event = os_aio_segment_wait_events[global_segment]; + + os_event_set(event); + + return; } } os_mutex_exit(array->mutex); - - if (i < n) { - os_event_set(os_aio_segment_wait_events[global_segment]); - } } /**********************************************************************//** @@ -3966,8 +4375,6 @@ void os_aio_simulated_wake_handler_threads(void) /*=======================================*/ { - ulint i; - if (srv_use_native_aio) { /* We do not use simulated aio: do nothing */ @@ -3976,7 +4383,7 @@ os_aio_simulated_wake_handler_threads(void) os_aio_recommend_sleep_for_read_threads = FALSE; - for (i = 0; i < os_aio_n_segments; i++) { + for (ulint i = 0; i < os_aio_n_segments; i++) { os_aio_simulated_wake_handler_thread(i); } } @@ -3998,7 +4405,6 @@ background threads too eagerly to allow for coalescing during readahead requests. */ #ifdef __WIN__ os_aio_array_t* array; - ulint g; if (srv_use_native_aio) { /* We do not use simulated aio: do nothing */ @@ -4008,12 +4414,12 @@ readahead requests. 
*/ os_aio_recommend_sleep_for_read_threads = TRUE; - for (g = 0; g < os_aio_n_segments; g++) { - os_aio_get_array_and_local_segment(&array, g); + for (ulint i = 0; i < os_aio_n_segments; i++) { + os_aio_get_array_and_local_segment(&array, i); if (array == os_aio_read_array) { - os_event_reset(os_aio_segment_wait_events[g]); + os_event_reset(os_aio_segment_wait_events[i]); } } #endif /* __WIN__ */ @@ -4111,11 +4517,10 @@ os_aio_func( ibool retval; BOOL ret = TRUE; DWORD len = (DWORD) n; - struct fil_node_struct * dummy_mess1; + struct fil_node_t* dummy_mess1; void* dummy_mess2; ulint dummy_type; #endif /* WIN_ASYNC_IO */ - ibool retry; ulint wake_later; ut_ad(file); @@ -4153,6 +4558,7 @@ os_aio_func( return(os_file_read_func(file, buf, offset, n)); } + ut_ad(!srv_read_only_mode); ut_a(type == OS_FILE_WRITE); return(os_file_write_func(name, file, buf, offset, n)); @@ -4161,9 +4567,12 @@ os_aio_func( try_again: switch (mode) { case OS_AIO_NORMAL: - array = (type == OS_FILE_READ) - ? os_aio_read_array - : os_aio_write_array; + if (type == OS_FILE_READ) { + array = os_aio_read_array; + } else { + ut_ad(!srv_read_only_mode); + array = os_aio_write_array; + } break; case OS_AIO_IBUF: ut_ad(type == OS_FILE_READ); @@ -4172,14 +4581,21 @@ try_again: wake_later = FALSE; - array = os_aio_ibuf_array; + if (srv_read_only_mode) { + array = os_aio_read_array; + } else { + array = os_aio_ibuf_array; + } break; case OS_AIO_LOG: - array = os_aio_log_array; + if (srv_read_only_mode) { + array = os_aio_read_array; + } else { + array = os_aio_log_array; + } break; case OS_AIO_SYNC: array = os_aio_sync_array; - #if defined(LINUX_NATIVE_AIO) /* In Linux native AIO we don't use sync IO array. 
*/ ut_a(!srv_use_native_aio); @@ -4204,7 +4620,7 @@ try_again: if (!os_aio_linux_dispatch(array, slot)) { goto err_exit; } -#endif +#endif /* WIN_ASYNC_IO */ } else { if (!wake_later) { os_aio_simulated_wake_handler_thread( @@ -4213,6 +4629,7 @@ try_again: } } } else if (type == OS_FILE_WRITE) { + ut_ad(!srv_read_only_mode); if (srv_use_native_aio) { os_n_file_writes++; #ifdef WIN_ASYNC_IO @@ -4223,7 +4640,7 @@ try_again: if (!os_aio_linux_dispatch(array, slot)) { goto err_exit; } -#endif +#endif /* WIN_ASYNC_IO */ } else { if (!wake_later) { os_aio_simulated_wake_handler_thread( @@ -4247,11 +4664,10 @@ try_again: we must use the same wait mechanism as for async i/o */ - retval = os_aio_windows_handle(ULINT_UNDEFINED, - slot->pos, - &dummy_mess1, - &dummy_mess2, - &dummy_type); + retval = os_aio_windows_handle( + ULINT_UNDEFINED, slot->pos, + &dummy_mess1, &dummy_mess2, + &dummy_type); return(retval); } @@ -4270,10 +4686,8 @@ err_exit: #endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */ os_aio_array_free_slot(array, slot); - retry = os_file_handle_error(name, - type == OS_FILE_READ - ? "aio read" : "aio write"); - if (retry) { + if (os_file_handle_error( + name,type == OS_FILE_READ ? 
"aio read" : "aio write")) { goto try_again; } @@ -4323,8 +4737,8 @@ os_aio_windows_handle( BOOL retry = FALSE; if (segment == ULINT_UNDEFINED) { - array = os_aio_sync_array; segment = 0; + array = os_aio_sync_array; } else { segment = os_aio_get_array_and_local_segment(&array, segment); } @@ -4338,16 +4752,21 @@ os_aio_windows_handle( n = array->n_slots / array->n_segments; if (array == os_aio_sync_array) { + WaitForSingleObject( os_aio_array_get_nth_slot(array, pos)->handle, INFINITE); + i = pos; + } else { - srv_set_io_thread_op_info(orig_seg, "wait Windows aio"); - i = WaitForMultipleObjects((DWORD) n, - array->handles + segment * n, - FALSE, - INFINITE); + if (orig_seg != ULINT_UNDEFINED) { + srv_set_io_thread_op_info(orig_seg, "wait Windows aio"); + } + + i = WaitForMultipleObjects( + (DWORD) n, array->handles + segment * n, + FALSE, INFINITE); } os_mutex_enter(array->mutex); @@ -4367,8 +4786,8 @@ os_aio_windows_handle( ut_a(slot->reserved); if (orig_seg != ULINT_UNDEFINED) { - srv_set_io_thread_op_info(orig_seg, - "get windows aio return value"); + srv_set_io_thread_op_info( + orig_seg, "get windows aio return value"); } ret = GetOverlappedResult(slot->file, &(slot->control), &len, TRUE); @@ -4671,7 +5090,7 @@ found: *type = slot->type; - if ((slot->ret == 0) && (slot->n_bytes == (long) slot->len)) { + if (slot->ret == 0 && slot->n_bytes == (long) slot->len) { ret = TRUE; } else { @@ -4720,8 +5139,6 @@ os_aio_simulated_handle( { os_aio_array_t* array; ulint segment; - os_aio_slot_t* slot; - os_aio_slot_t* slot2; os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE]; ulint n_consecutive; ulint total_len; @@ -4734,7 +5151,7 @@ os_aio_simulated_handle( ibool ret; ibool any_reserved; ulint n; - ulint i; + os_aio_slot_t* aio_slot; /* Fix compiler warning */ *consecutive_ios = NULL; @@ -4772,7 +5189,9 @@ restart: os_mutex_enter(array->mutex); - for (i = 0; i < n; i++) { + for (ulint i = 0; i < n; i++) { + os_aio_slot_t* slot; + slot = 
os_aio_array_get_nth_slot(array, i + segment * n); if (!slot->reserved) { @@ -4786,8 +5205,8 @@ restart: (ulong) i); } + aio_slot = slot; ret = TRUE; - goto slot_io_done; } else { any_reserved = TRUE; @@ -4797,9 +5216,7 @@ restart: /* There is no completed request. If there is no pending request at all, and the system is being shut down, exit. */ - if (UNIV_UNLIKELY - (!any_reserved - && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) { + if (!any_reserved && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { os_mutex_exit(array->mutex); *message1 = NULL; *message2 = NULL; @@ -4815,12 +5232,15 @@ restart: biggest_age = 0; lowest_offset = IB_UINT64_MAX; - for (i = 0; i < n; i++) { + for (ulint i = 0; i < n; i++) { + os_aio_slot_t* slot; + slot = os_aio_array_get_nth_slot(array, i + segment * n); if (slot->reserved) { - age = (ulint) difftime(time(NULL), - slot->reservation_time); + + age = (ulint) difftime( + ut_time(), slot->reservation_time); if ((age >= 2 && age > biggest_age) || (age >= 2 && age == biggest_age @@ -4844,9 +5264,11 @@ restart: lowest_offset = IB_UINT64_MAX; - for (i = 0; i < n; i++) { - slot = os_aio_array_get_nth_slot(array, - i + segment * n); + for (ulint i = 0; i < n; i++) { + os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot( + array, i + segment * n); if (slot->reserved && slot->offset < lowest_offset) { @@ -4872,25 +5294,28 @@ restart: ut_ad(n_consecutive != 0); ut_ad(consecutive_ios[0] != NULL); - slot = consecutive_ios[0]; + aio_slot = consecutive_ios[0]; /* Check if there are several consecutive blocks to read or write */ consecutive_loop: - for (i = 0; i < n; i++) { - slot2 = os_aio_array_get_nth_slot(array, i + segment * n); + for (ulint i = 0; i < n; i++) { + os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot(array, i + segment * n); - if (slot2->reserved && slot2 != slot - && slot2->offset == slot->offset + slot->len - && slot2->type == slot->type - && slot2->file == slot->file) { + if (slot->reserved + && slot != 
aio_slot + && slot->offset == slot->offset + aio_slot->len + && slot->type == aio_slot->type + && slot->file == aio_slot->file) { /* Found a consecutive i/o request */ - consecutive_ios[n_consecutive] = slot2; + consecutive_ios[n_consecutive] = slot; n_consecutive++; - slot = slot2; + aio_slot = slot; if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) { @@ -4908,15 +5333,15 @@ consecutive_loop: i/o */ total_len = 0; - slot = consecutive_ios[0]; + aio_slot = consecutive_ios[0]; - for (i = 0; i < n_consecutive; i++) { + for (ulint i = 0; i < n_consecutive; i++) { total_len += consecutive_ios[i]->len; } if (n_consecutive == 1) { /* We can use the buffer of the i/o request */ - combined_buf = slot->buf; + combined_buf = aio_slot->buf; combined_buf2 = NULL; } else { combined_buf2 = static_cast<byte*>( @@ -4934,50 +5359,41 @@ consecutive_loop: os_mutex_exit(array->mutex); - if (slot->type == OS_FILE_WRITE && n_consecutive > 1) { + if (aio_slot->type == OS_FILE_WRITE && n_consecutive > 1) { /* Copy the buffers to the combined buffer */ offs = 0; - for (i = 0; i < n_consecutive; i++) { + for (ulint i = 0; i < n_consecutive; i++) { ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf, consecutive_ios[i]->len); + offs += consecutive_ios[i]->len; } } srv_set_io_thread_op_info(global_segment, "doing file i/o"); - if (os_aio_print_debug) { - fprintf(stderr, - "InnoDB: doing i/o of type %lu at offset " UINT64PF - ", length %lu\n", - (ulong) slot->type, slot->offset, (ulong) total_len); - } - /* Do the i/o with ordinary, synchronous i/o functions: */ - if (slot->type == OS_FILE_WRITE) { - ret = os_file_write(slot->name, slot->file, combined_buf, - slot->offset, total_len); + if (aio_slot->type == OS_FILE_WRITE) { + ut_ad(!srv_read_only_mode); + ret = os_file_write( + aio_slot->name, aio_slot->file, combined_buf, + aio_slot->offset, total_len); } else { - ret = os_file_read(slot->file, combined_buf, - slot->offset, total_len); + ret = os_file_read( + aio_slot->file, combined_buf, + 
aio_slot->offset, total_len); } ut_a(ret); srv_set_io_thread_op_info(global_segment, "file i/o done"); -#if 0 - fprintf(stderr, - "aio: %lu consecutive %lu:th segment, first offs %lu blocks\n", - n_consecutive, global_segment, slot->offset / UNIV_PAGE_SIZE); -#endif - - if (slot->type == OS_FILE_READ && n_consecutive > 1) { + if (aio_slot->type == OS_FILE_READ && n_consecutive > 1) { /* Copy the combined buffer to individual buffers */ offs = 0; - for (i = 0; i < n_consecutive; i++) { + for (ulint i = 0; i < n_consecutive; i++) { ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs, consecutive_ios[i]->len); @@ -4993,7 +5409,7 @@ consecutive_loop: /* Mark the i/os done in slots */ - for (i = 0; i < n_consecutive; i++) { + for (ulint i = 0; i < n_consecutive; i++) { consecutive_ios[i]->io_already_done = TRUE; } @@ -5003,16 +5419,16 @@ consecutive_loop: slot_io_done: - ut_a(slot->reserved); + ut_a(aio_slot->reserved); - *message1 = slot->message1; - *message2 = slot->message2; + *message1 = aio_slot->message1; + *message2 = aio_slot->message2; - *type = slot->type; + *type = aio_slot->type; os_mutex_exit(array->mutex); - os_aio_array_free_slot(array, slot); + os_aio_array_free_slot(array, aio_slot); return(ret); @@ -5031,30 +5447,20 @@ recommended_sleep: os_event_wait(os_aio_segment_wait_events[global_segment]); - if (os_aio_print_debug) { - fprintf(stderr, - "InnoDB: i/o handler thread for i/o" - " segment %lu wakes up\n", - (ulong) global_segment); - } - goto restart; } /**********************************************************************//** Validates the consistency of an aio array. 
-@return TRUE if ok */ +@return true if ok */ static -ibool +bool os_aio_array_validate( /*==================*/ os_aio_array_t* array) /*!< in: aio wait array */ { - os_aio_slot_t* slot; - ulint n_reserved = 0; ulint i; - - ut_a(array); + ulint n_reserved = 0; os_mutex_enter(array->mutex); @@ -5062,6 +5468,8 @@ os_aio_array_validate( ut_a(array->n_segments > 0); for (i = 0; i < array->n_slots; i++) { + os_aio_slot_t* slot; + slot = os_aio_array_get_nth_slot(array, i); if (slot->reserved) { @@ -5074,7 +5482,7 @@ os_aio_array_validate( os_mutex_exit(array->mutex); - return(TRUE); + return(true); } /**********************************************************************//** @@ -5086,10 +5494,22 @@ os_aio_validate(void) /*=================*/ { os_aio_array_validate(os_aio_read_array); - os_aio_array_validate(os_aio_write_array); - os_aio_array_validate(os_aio_ibuf_array); - os_aio_array_validate(os_aio_log_array); - os_aio_array_validate(os_aio_sync_array); + + if (os_aio_write_array != 0) { + os_aio_array_validate(os_aio_write_array); + } + + if (os_aio_ibuf_array != 0) { + os_aio_array_validate(os_aio_ibuf_array); + } + + if (os_aio_log_array != 0) { + os_aio_array_validate(os_aio_log_array); + } + + if (os_aio_sync_array != 0) { + os_aio_array_validate(os_aio_sync_array); + } return(TRUE); } @@ -5129,65 +5549,36 @@ os_aio_print_segment_info( } /**********************************************************************//** -Prints info of the aio arrays. */ +Prints info about the aio array. 
*/ UNIV_INTERN void -os_aio_print( -/*=========*/ - FILE* file) /*!< in: file where to print */ +os_aio_print_array( +/*==============*/ + FILE* file, /*!< in: file where to print */ + os_aio_array_t* array) /*!< in: aio array to print */ { - os_aio_array_t* array; - os_aio_slot_t* slot; - ulint n_reserved; - ulint n_res_seg[SRV_MAX_N_IO_THREADS]; - time_t current_time; - double time_elapsed; - double avg_bytes_read; - ulint i; - - for (i = 0; i < srv_n_file_io_threads; i++) { - fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i, - srv_io_thread_op_info[i], - srv_io_thread_function[i]); - -#ifndef __WIN__ - if (os_aio_segment_wait_events[i]->is_set) { - fprintf(file, " ev set"); - } -#endif - - fprintf(file, "\n"); - } - - fputs("Pending normal aio reads:", file); - - array = os_aio_read_array; -loop: - ut_a(array); + ulint n_reserved = 0; + ulint n_res_seg[SRV_MAX_N_IO_THREADS]; os_mutex_enter(array->mutex); ut_a(array->n_slots > 0); ut_a(array->n_segments > 0); - n_reserved = 0; - memset(n_res_seg, 0x0, sizeof(n_res_seg)); - for (i = 0; i < array->n_slots; i++) { - ulint seg_no; + for (ulint i = 0; i < array->n_slots; ++i) { + os_aio_slot_t* slot; + ulint seg_no; slot = os_aio_array_get_nth_slot(array, i); seg_no = (i * array->n_segments) / array->n_slots; + if (slot->reserved) { - n_reserved++; - n_res_seg[seg_no]++; -#if 0 - fprintf(stderr, "Reserved slot, messages %p %p\n", - (void*) slot->message1, - (void*) slot->message2); -#endif + ++n_reserved; + ++n_res_seg[seg_no]; + ut_a(slot->len > 0); } } @@ -5199,38 +5590,61 @@ loop: os_aio_print_segment_info(file, n_res_seg, array); os_mutex_exit(array->mutex); +} - if (array == os_aio_read_array) { - fputs(", aio writes:", file); +/**********************************************************************//** +Prints info of the aio arrays. 
*/ +UNIV_INTERN +void +os_aio_print( +/*=========*/ + FILE* file) /*!< in: file where to print */ +{ + time_t current_time; + double time_elapsed; + double avg_bytes_read; - array = os_aio_write_array; + for (ulint i = 0; i < srv_n_file_io_threads; ++i) { + fprintf(file, "I/O thread %lu state: %s (%s)", + (ulong) i, + srv_io_thread_op_info[i], + srv_io_thread_function[i]); - goto loop; +#ifndef __WIN__ + if (os_aio_segment_wait_events[i]->is_set) { + fprintf(file, " ev set"); + } +#endif /* __WIN__ */ + + fprintf(file, "\n"); } - if (array == os_aio_write_array) { - fputs(",\n ibuf aio reads:", file); - array = os_aio_ibuf_array; + fputs("Pending normal aio reads:", file); - goto loop; + os_aio_print_array(file, os_aio_read_array); + + if (os_aio_write_array != 0) { + fputs(", aio writes:", file); + os_aio_print_array(file, os_aio_write_array); } - if (array == os_aio_ibuf_array) { - fputs(", log i/o's:", file); - array = os_aio_log_array; + if (os_aio_ibuf_array != 0) { + fputs(",\n ibuf aio reads:", file); + os_aio_print_array(file, os_aio_ibuf_array); + } - goto loop; + if (os_aio_log_array != 0) { + fputs(", log i/o's:", file); + os_aio_print_array(file, os_aio_log_array); } - if (array == os_aio_log_array) { + if (os_aio_sync_array != 0) { fputs(", sync i/o's:", file); - array = os_aio_sync_array; - - goto loop; + os_aio_print_array(file, os_aio_sync_array); } putc('\n', file); - current_time = time(NULL); + current_time = ut_time(); time_elapsed = 0.001 + difftime(current_time, os_last_printout); fprintf(file, @@ -5238,7 +5652,8 @@ loop: "%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n", (ulong) fil_n_pending_log_flushes, (ulong) fil_n_pending_tablespace_flushes, - (ulong) os_n_file_reads, (ulong) os_n_file_writes, + (ulong) os_n_file_reads, + (ulong) os_n_file_writes, (ulong) os_n_fsyncs); if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) { @@ -5310,21 +5725,29 @@ os_aio_all_slots_free(void) os_mutex_exit(array->mutex); - array = 
os_aio_write_array; + if (!srv_read_only_mode) { + ut_a(os_aio_write_array == 0); - os_mutex_enter(array->mutex); + array = os_aio_write_array; - n_res += array->n_reserved; + os_mutex_enter(array->mutex); - os_mutex_exit(array->mutex); + n_res += array->n_reserved; - array = os_aio_ibuf_array; + os_mutex_exit(array->mutex); - os_mutex_enter(array->mutex); + ut_a(os_aio_ibuf_array == 0); - n_res += array->n_reserved; + array = os_aio_ibuf_array; - os_mutex_exit(array->mutex); + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + } + + ut_a(os_aio_log_array == 0); array = os_aio_log_array; diff --git a/storage/innobase/os/os0sync.cc b/storage/innobase/os/os0sync.cc index c2e2e7e477f..392dbe0d7a7 100644 --- a/storage/innobase/os/os0sync.cc +++ b/storage/innobase/os/os0sync.cc @@ -38,7 +38,7 @@ Created 9/6/1995 Heikki Tuuri #include "srv0srv.h" /* Type definition for an operating system mutex struct */ -struct os_mutex_struct{ +struct os_mutex_t{ os_event_t event; /*!< Used by sync0arr.cc for queing threads */ void* handle; /*!< OS handle to mutex */ ulint count; /*!< we use this counter to check @@ -47,12 +47,12 @@ struct os_mutex_struct{ do not assume that the OS mutex supports recursive locking, though NT seems to do that */ - UT_LIST_NODE_T(os_mutex_str_t) os_mutex_list; + UT_LIST_NODE_T(os_mutex_t) os_mutex_list; /* list of all 'slow' OS mutexes created */ }; /** Mutex protecting counts and the lists of OS mutexes and events */ -UNIV_INTERN os_mutex_t os_sync_mutex; +UNIV_INTERN os_ib_mutex_t os_sync_mutex; /** TRUE if os_sync_mutex has been initialized */ static ibool os_sync_mutex_inited = FALSE; /** TRUE when os_sync_free() is being executed */ @@ -63,10 +63,10 @@ os_thread_exit */ UNIV_INTERN ulint os_thread_count = 0; /** The list of all events created */ -static UT_LIST_BASE_NODE_T(os_event_struct_t) os_event_list; +static UT_LIST_BASE_NODE_T(os_event) os_event_list; /** The list of all OS 'slow' mutexes */ 
-static UT_LIST_BASE_NODE_T(os_mutex_str_t) os_mutex_list; +static UT_LIST_BASE_NODE_T(os_mutex_t) os_mutex_list; UNIV_INTERN ulint os_event_count = 0; UNIV_INTERN ulint os_mutex_count = 0; @@ -329,7 +329,7 @@ os_sync_free(void) /*==============*/ { os_event_t event; - os_mutex_t mutex; + os_ib_mutex_t mutex; os_sync_free_called = TRUE; event = UT_LIST_GET_FIRST(os_event_list); @@ -365,10 +365,8 @@ must be reset explicitly by calling sync_os_reset_event. @return the event handle */ UNIV_INTERN os_event_t -os_event_create( -/*============*/ - const char* name) /*!< in: the name of the event, if NULL - the event is created without a name */ +os_event_create(void) +/*==================*/ { os_event_t event; @@ -377,10 +375,7 @@ os_event_create( event = static_cast<os_event_t>(ut_malloc(sizeof(*event))); - event->handle = CreateEvent(NULL, - TRUE, - FALSE, - (LPCTSTR) name); + event->handle = CreateEvent(NULL, TRUE, FALSE, NULL); if (!event->handle) { fprintf(stderr, "InnoDB: Could not create a Windows event" @@ -390,10 +385,7 @@ os_event_create( } else /* Windows with condition variables */ #endif { - UT_NOT_USED(name); - - event = static_cast<os_event_struct_t*>( - ut_malloc(sizeof(struct os_event_struct))); + event = static_cast<os_event_t>(ut_malloc(sizeof *event)); #ifndef PFS_SKIP_EVENT_MUTEX os_fast_mutex_init(event_os_mutex_key, &event->os_mutex); @@ -739,27 +731,26 @@ os_event_wait_time_low( /*********************************************************//** Creates an operating system mutex semaphore. Because these are slow, the -mutex semaphore of InnoDB itself (mutex_t) should be used where possible. +mutex semaphore of InnoDB itself (ib_mutex_t) should be used where possible. 
@return the mutex handle */ UNIV_INTERN -os_mutex_t +os_ib_mutex_t os_mutex_create(void) /*=================*/ { os_fast_mutex_t* mutex; - os_mutex_t mutex_str; + os_ib_mutex_t mutex_str; mutex = static_cast<os_fast_mutex_t*>( ut_malloc(sizeof(os_fast_mutex_t))); os_fast_mutex_init(os_mutex_key, mutex); - mutex_str = static_cast<os_mutex_t>( - ut_malloc(sizeof(os_mutex_str_t))); + mutex_str = static_cast<os_ib_mutex_t>(ut_malloc(sizeof *mutex_str)); mutex_str->handle = mutex; mutex_str->count = 0; - mutex_str->event = os_event_create(NULL); + mutex_str->event = os_event_create(); if (UNIV_LIKELY(os_sync_mutex_inited)) { /* When creating os_sync_mutex itself we cannot reserve it */ @@ -783,7 +774,7 @@ UNIV_INTERN void os_mutex_enter( /*===========*/ - os_mutex_t mutex) /*!< in: mutex to acquire */ + os_ib_mutex_t mutex) /*!< in: mutex to acquire */ { os_fast_mutex_lock(static_cast<os_fast_mutex_t*>(mutex->handle)); @@ -798,7 +789,7 @@ UNIV_INTERN void os_mutex_exit( /*==========*/ - os_mutex_t mutex) /*!< in: mutex to release */ + os_ib_mutex_t mutex) /*!< in: mutex to release */ { ut_a(mutex); @@ -814,7 +805,7 @@ UNIV_INTERN void os_mutex_free( /*==========*/ - os_mutex_t mutex) /*!< in: mutex to free */ + os_ib_mutex_t mutex) /*!< in: mutex to free */ { ut_a(mutex); diff --git a/storage/innobase/os/os0thread.cc b/storage/innobase/os/os0thread.cc index 48ee61e9402..9cc09a847b1 100644 --- a/storage/innobase/os/os0thread.cc +++ b/storage/innobase/os/os0thread.cc @@ -132,8 +132,10 @@ os_thread_create_func( if (thread_id) { *thread_id = win_thread_id; } - - return(thread); + if (thread) { + CloseHandle(thread); + } + return((os_thread_t)win_thread_id); #else int ret; os_thread_t pthread; diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc index 9046338f377..f416d38cc35 100644 --- a/storage/innobase/page/page0cur.cc +++ b/storage/innobase/page/page0cur.cc @@ -1,6 +1,7 @@ 
/***************************************************************************** -Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -973,6 +974,9 @@ page_cur_insert_rec_low( page = page_align(current_rec); ut_ad(dict_table_is_comp(index->table) == (ibool) !!page_is_comp(page)); + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID) + == index->id || recv_recovery_is_on() || mtr->inside_ibuf); ut_ad(!page_rec_is_supremum(current_rec)); @@ -1007,8 +1011,8 @@ page_cur_insert_rec_low( rec_offs_init(foffsets_); - foffsets = rec_get_offsets(free_rec, index, foffsets, - ULINT_UNDEFINED, &heap); + foffsets = rec_get_offsets( + free_rec, index, foffsets, ULINT_UNDEFINED, &heap); if (rec_offs_size(foffsets) < rec_size) { if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); @@ -1167,14 +1171,27 @@ page_cur_insert_rec_zip_reorg( buf_block_t* block, /*!< in: buffer block */ dict_index_t* index, /*!< in: record descriptor */ rec_t* rec, /*!< in: inserted record */ + ulint rec_size,/*!< in: size of the inserted record */ page_t* page, /*!< in: uncompressed page */ page_zip_des_t* page_zip,/*!< in: compressed page */ mtr_t* mtr) /*!< in: mini-transaction, or NULL */ { ulint pos; + /* Make a local copy as the values can change dynamically. */ + bool log_compressed = page_log_compressed_pages; + ulint level = page_compression_level; + /* Recompress or reorganize and recompress the page. */ - if (page_zip_compress(page_zip, page, index, mtr)) { + if (page_zip_compress(page_zip, page, index, level, + log_compressed ? 
mtr : NULL)) { + if (!log_compressed) { + page_cur_insert_rec_write_log( + rec, rec_size, *current_rec, index, mtr); + page_zip_compress_write_log_no_data( + level, page, index, mtr); + } + return(rec); } @@ -1246,6 +1263,9 @@ page_cur_insert_rec_zip( page = page_align(*current_rec); ut_ad(dict_table_is_comp(index->table)); ut_ad(page_is_comp(page)); + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID) + == index->id || mtr->inside_ibuf || recv_recovery_is_on()); ut_ad(!page_rec_is_supremum(*current_rec)); #ifdef UNIV_ZIP_DEBUG @@ -1281,10 +1301,27 @@ page_cur_insert_rec_zip( index, rec, offsets, NULL); - if (UNIV_LIKELY(insert_rec != NULL)) { + /* If recovery is on, this implies that the compression + of the page was successful during runtime. Had that not + been the case or had the redo logging of compressed + pages been enabled during runtime then we'd have seen + a MLOG_ZIP_PAGE_COMPRESS redo record. Therefore, we + know that we don't need to reorganize the page. We, + however, do need to recompress the page. That will + happen when the next redo record is read which must + be of type MLOG_ZIP_PAGE_COMPRESS_NO_DATA and it must + contain a valid compression level value. + This implies that during recovery from this point till + the next redo is applied the uncompressed and + compressed versions are not identical and + page_zip_validate will fail but that is OK because + we call page_zip_validate only after processing + all changes to a page under a single mtr during + recovery. 
*/ + if (insert_rec != NULL && !recv_recovery_is_on()) { insert_rec = page_cur_insert_rec_zip_reorg( current_rec, block, index, insert_rec, - page, page_zip, mtr); + rec_size, page, page_zip, mtr); #ifdef UNIV_DEBUG if (insert_rec) { rec_offs_make_valid( @@ -1781,9 +1818,9 @@ UNIV_INLINE void page_cur_delete_rec_write_log( /*==========================*/ - rec_t* rec, /*!< in: record to be deleted */ - dict_index_t* index, /*!< in: record descriptor */ - mtr_t* mtr) /*!< in: mini-transaction handle */ + rec_t* rec, /*!< in: record to be deleted */ + const dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mini-transaction handle */ { byte* log_ptr; @@ -1865,10 +1902,11 @@ UNIV_INTERN void page_cur_delete_rec( /*================*/ - page_cur_t* cursor, /*!< in/out: a page cursor */ - dict_index_t* index, /*!< in: record descriptor */ - const ulint* offsets,/*!< in: rec_get_offsets(cursor->rec, index) */ - mtr_t* mtr) /*!< in: mini-transaction handle */ + page_cur_t* cursor, /*!< in/out: a page cursor */ + const dict_index_t* index, /*!< in: record descriptor */ + const ulint* offsets,/*!< in: rec_get_offsets( + cursor->rec, index) */ + mtr_t* mtr) /*!< in: mini-transaction handle */ { page_dir_slot_t* cur_dir_slot; page_dir_slot_t* prev_slot; @@ -1881,8 +1919,6 @@ page_cur_delete_rec( ulint cur_n_owned; rec_t* rec; - ut_ad(cursor && mtr); - page = page_cur_get_page(cursor); page_zip = page_cur_get_page_zip(cursor); @@ -1897,17 +1933,23 @@ page_cur_delete_rec( current_rec = cursor->rec; ut_ad(rec_offs_validate(current_rec, index, offsets)); ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID) + == index->id || mtr->inside_ibuf || recv_recovery_is_on()); /* The record must not be the supremum or infimum record. 
*/ ut_ad(page_rec_is_user_rec(current_rec)); /* Save to local variables some data associated with current_rec */ cur_slot_no = page_dir_find_owner_slot(current_rec); + ut_ad(cur_slot_no > 0); cur_dir_slot = page_dir_get_nth_slot(page, cur_slot_no); cur_n_owned = page_dir_slot_get_n_owned(cur_dir_slot); /* 0. Write the log record */ - page_cur_delete_rec_write_log(current_rec, index, mtr); + if (mtr != 0) { + page_cur_delete_rec_write_log(current_rec, index, mtr); + } /* 1. Reset the last insert info in the page header and increment the modify clock for the frame */ @@ -1915,9 +1957,13 @@ page_cur_delete_rec( page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL); /* The page gets invalid for optimistic searches: increment the - frame modify clock */ + frame modify clock only if there is an mini-transaction covering + the change. During IMPORT we allocate local blocks that are not + part of the buffer pool. */ - buf_block_modify_clock_inc(page_cur_get_block(cursor)); + if (mtr != 0) { + buf_block_modify_clock_inc(page_cur_get_block(cursor)); + } /* 2. Find the next and the previous record. Note that the cursor is left at the next record. */ @@ -1961,14 +2007,15 @@ page_cur_delete_rec( page_dir_slot_set_n_owned(cur_dir_slot, page_zip, cur_n_owned - 1); /* 6. Free the memory occupied by the record */ - btr_blob_dbg_remove_rec(current_rec, index, offsets, "delete"); + btr_blob_dbg_remove_rec(current_rec, const_cast<dict_index_t*>(index), + offsets, "delete"); page_mem_free(page, page_zip, current_rec, index, offsets); /* 7. Now we have decremented the number of owned records of the slot. If the number drops below PAGE_DIR_SLOT_MIN_N_OWNED, we balance the slots. 
*/ - if (UNIV_UNLIKELY(cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED)) { + if (cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) { page_dir_balance_slot(page, page_zip, cur_slot_no); } diff --git a/storage/innobase/page/page0page.cc b/storage/innobase/page/page0page.cc index deef6935f08..6b7b8424856 100644 --- a/storage/innobase/page/page0page.cc +++ b/storage/innobase/page/page0page.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -512,7 +513,8 @@ page_create_zip( page = page_create_low(block, TRUE); mach_write_to_2(page + PAGE_HEADER + PAGE_LEVEL, level); - if (UNIV_UNLIKELY(!page_zip_compress(page_zip, page, index, mtr))) { + if (!page_zip_compress(page_zip, page, index, + page_compression_level, mtr)) { /* The compression of a newly created page should always succeed. */ ut_error; @@ -658,7 +660,11 @@ page_copy_rec_list_end( if (new_page_zip) { mtr_set_log_mode(mtr, log_mode); - if (!page_zip_compress(new_page_zip, new_page, index, mtr)) { + if (!page_zip_compress(new_page_zip, + new_page, + index, + page_compression_level, + mtr)) { /* Before trying to reorganize the page, store the number of preceding records on the page. */ ulint ret_pos @@ -781,8 +787,9 @@ page_copy_rec_list_start( DBUG_EXECUTE_IF("page_copy_rec_list_start_compress_fail", goto zip_reorganize;); - if (UNIV_UNLIKELY - (!page_zip_compress(new_page_zip, new_page, index, mtr))) { + if (!page_zip_compress(new_page_zip, new_page, index, + page_compression_level, mtr)) { + ulint ret_pos; #ifndef DBUG_OFF zip_reorganize: @@ -793,8 +800,8 @@ zip_reorganize: /* Before copying, "ret" was the predecessor of the predefined supremum record. 
If it was the predefined infimum record, then it would - still be the infimum. Thus, the assertion - ut_a(ret_pos > 0) would fail here. */ + still be the infimum, and we would have + ret_pos == 0. */ if (UNIV_UNLIKELY (!page_zip_reorganize(new_block, index, mtr))) { @@ -1049,6 +1056,7 @@ page_delete_rec_list_end( n_owned = rec_get_n_owned_new(rec2) - count; slot_index = page_dir_find_owner_slot(rec2); + ut_ad(slot_index > 0); slot = page_dir_get_nth_slot(page, slot_index); } else { rec_t* rec2 = rec; @@ -1064,6 +1072,7 @@ page_delete_rec_list_end( n_owned = rec_get_n_owned_old(rec2) - count; slot_index = page_dir_find_owner_slot(rec2); + ut_ad(slot_index > 0); slot = page_dir_get_nth_slot(page, slot_index); } @@ -1470,6 +1479,10 @@ page_rec_get_nth_const( ulint n_owned; const rec_t* rec; + if (nth == 0) { + return(page_get_infimum_rec(page)); + } + ut_ad(nth < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1)); for (i = 0;; i++) { @@ -2313,6 +2326,20 @@ page_validate( } } + if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page) + && page_get_n_recs(page) > 0) { + trx_id_t max_trx_id = page_get_max_trx_id(page); + trx_id_t sys_max_trx_id = trx_sys_get_max_trx_id(); + + if (max_trx_id == 0 || max_trx_id > sys_max_trx_id) { + ib_logf(IB_LOG_LEVEL_ERROR, + "PAGE_MAX_TRX_ID out of bounds: " + TRX_ID_FMT ", " TRX_ID_FMT, + max_trx_id, sys_max_trx_id); + goto func_exit2; + } + } + heap = mem_heap_create(UNIV_PAGE_SIZE + 200); /* The following buffer is used to check that the @@ -2602,3 +2629,60 @@ page_find_rec_with_heap_no( } } #endif /* !UNIV_HOTBACKUP */ + +/*******************************************************//** +Removes the record from a leaf page. This function does not log +any changes. It is used by the IMPORT tablespace functions. +The cursor is moved to the next record after the deleted one. 
+@return true if success, i.e., the page did not become too empty */ +UNIV_INTERN +bool +page_delete_rec( +/*============*/ + const dict_index_t* index, /*!< in: The index that the record + belongs to */ + page_cur_t* pcur, /*!< in/out: page cursor on record + to delete */ + page_zip_des_t* page_zip,/*!< in: compressed page descriptor */ + const ulint* offsets)/*!< in: offsets for record */ +{ + bool no_compress_needed; + buf_block_t* block = pcur->block; + page_t* page = buf_block_get_frame(block); + + ut_ad(page_is_leaf(page)); + + if (!rec_offs_any_extern(offsets) + && ((page_get_data_size(page) - rec_offs_size(offsets) + < BTR_CUR_PAGE_COMPRESS_LIMIT) + || (mach_read_from_4(page + FIL_PAGE_NEXT) == FIL_NULL + && mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL) + || (page_get_n_recs(page) < 2))) { + + ulint root_page_no = dict_index_get_page(index); + + /* The page fillfactor will drop below a predefined + minimum value, OR the level in the B-tree contains just + one page, OR the page will become empty: we recommend + compression if this is not the root page. */ + + no_compress_needed = page_get_page_no(page) == root_page_no; + } else { + no_compress_needed = true; + } + + if (no_compress_needed) { +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + page_cur_delete_rec(pcur, index, offsets, 0); + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + } + + return(no_compress_needed); +} + diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc index 35a8f458fb2..dee37580002 100644 --- a/storage/innobase/page/page0zip.cc +++ b/storage/innobase/page/page0zip.cc @@ -1,6 +1,7 @@ /***************************************************************************** -Copyright (c) 2005, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2005, 2012, Oracle and/or its affiliates. All Rights Reserved. 
+Copyright (c) 2012, Facebook Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -23,6 +24,9 @@ Compressed page interface Created June 2005 by Marko Makela *******************************************************/ +#include <map> +using namespace std; + #define THIS_MODULE #include "page0zip.h" #ifdef UNIV_NONINL @@ -54,9 +58,23 @@ Created June 2005 by Marko Makela #ifndef UNIV_HOTBACKUP /** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */ -UNIV_INTERN page_zip_stat_t page_zip_stat[PAGE_ZIP_SSIZE_MAX]; +UNIV_INTERN page_zip_stat_t page_zip_stat[PAGE_ZIP_SSIZE_MAX]; +/** Statistics on compression, indexed by index->id */ +UNIV_INTERN page_zip_stat_per_index_t page_zip_stat_per_index; +/** Mutex protecting page_zip_stat_per_index */ +UNIV_INTERN ib_mutex_t page_zip_stat_per_index_mutex; +#ifdef HAVE_PSI_INTERFACE +UNIV_INTERN mysql_pfs_key_t page_zip_stat_per_index_mutex_key; +#endif /* HAVE_PSI_INTERFACE */ #endif /* !UNIV_HOTBACKUP */ +/* Compression level to be used by zlib. Settable by user. */ +UNIV_INTERN ulint page_compression_level = 6; + +/* Whether or not to log compressed page images to avoid possible +compression algorithm changes in zlib. */ +UNIV_INTERN bool page_log_compressed_pages = true; + /* Please refer to ../include/page0zip.ic for a description of the compressed page format. 
*/ @@ -386,7 +404,7 @@ page_zip_get_n_prev_extern( compressed page */ const rec_t* rec, /*!< in: compact physical record on a B-tree leaf page */ - dict_index_t* index) /*!< in: record descriptor */ + const dict_index_t* index) /*!< in: record descriptor */ { const page_t* page = page_align(rec); ulint n_ext = 0; @@ -1181,6 +1199,7 @@ page_zip_compress( m_start, m_end, m_nonempty */ const page_t* page, /*!< in: uncompressed page */ dict_index_t* index, /*!< in: index of the B-tree node */ + ulint level, /*!< in: commpression level */ mtr_t* mtr) /*!< in: mini-transaction, or NULL */ { z_stream c_stream; @@ -1194,7 +1213,6 @@ page_zip_compress( const rec_t** recs; /*!< dense page directory, sorted by address */ mem_heap_t* heap; ulint trx_id_col; - ulint* offsets = NULL; ulint n_blobs = 0; byte* storage;/* storage of uncompressed columns */ #ifndef UNIV_HOTBACKUP @@ -1203,6 +1221,10 @@ page_zip_compress( #ifdef PAGE_ZIP_COMPRESS_DBG FILE* logfile = NULL; #endif + /* A local copy of srv_cmp_per_index_enabled to avoid reading that + variable multiple times in this function since it can be changed at + anytime. 
*/ + my_bool cmp_per_index_enabled = srv_cmp_per_index_enabled; ut_a(page_is_comp(page)); ut_a(fil_page_get_type(page) == FIL_PAGE_INDEX); @@ -1265,6 +1287,11 @@ page_zip_compress( #endif /* PAGE_ZIP_COMPRESS_DBG */ #ifndef UNIV_HOTBACKUP page_zip_stat[page_zip->ssize - 1].compressed++; + if (cmp_per_index_enabled) { + mutex_enter(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[index->id].compressed++; + mutex_exit(&page_zip_stat_per_index_mutex); + } #endif /* !UNIV_HOTBACKUP */ if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE @@ -1276,7 +1303,8 @@ page_zip_compress( MONITOR_INC(MONITOR_PAGE_COMPRESS); heap = mem_heap_create(page_zip_get_size(page_zip) - + n_fields * (2 + sizeof *offsets) + + n_fields * (2 + sizeof(ulint)) + + REC_OFFS_HEADER_SIZE + n_dense * ((sizeof *recs) - PAGE_ZIP_DIR_SLOT_SIZE) + UNIV_PAGE_SIZE * 4 @@ -1295,7 +1323,7 @@ page_zip_compress( /* Compress the data payload. */ page_zip_set_alloc(&c_stream, heap); - err = deflateInit2(&c_stream, Z_DEFAULT_COMPRESSION, + err = deflateInit2(&c_stream, level, Z_DEFLATED, UNIV_PAGE_SIZE_SHIFT, MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY); ut_a(err == Z_OK); @@ -1408,8 +1436,19 @@ err_exit: } #endif /* PAGE_ZIP_COMPRESS_DBG */ #ifndef UNIV_HOTBACKUP + if (page_is_leaf(page)) { + dict_index_zip_failure(index); + } + + ullint time_diff = ut_time_us(NULL) - usec; page_zip_stat[page_zip->ssize - 1].compressed_usec - += ut_time_us(NULL) - usec; + += time_diff; + if (cmp_per_index_enabled) { + mutex_enter(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[index->id].compressed_usec + += time_diff; + mutex_exit(&page_zip_stat_per_index_mutex); + } #endif /* !UNIV_HOTBACKUP */ return(FALSE); } @@ -1469,11 +1508,18 @@ err_exit: } #endif /* PAGE_ZIP_COMPRESS_DBG */ #ifndef UNIV_HOTBACKUP - { - page_zip_stat_t* zip_stat - = &page_zip_stat[page_zip->ssize - 1]; - zip_stat->compressed_ok++; - zip_stat->compressed_usec += ut_time_us(NULL) - usec; + ullint time_diff = ut_time_us(NULL) - usec; + 
page_zip_stat[page_zip->ssize - 1].compressed_ok++; + page_zip_stat[page_zip->ssize - 1].compressed_usec += time_diff; + if (cmp_per_index_enabled) { + mutex_enter(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[index->id].compressed_ok++; + page_zip_stat_per_index[index->id].compressed_usec += time_diff; + mutex_exit(&page_zip_stat_per_index_mutex); + } + + if (page_is_leaf(page)) { + dict_index_zip_success(index); } #endif /* !UNIV_HOTBACKUP */ @@ -1518,6 +1564,7 @@ page_zip_fields_free( { if (index) { dict_table_t* table = index->table; + os_fast_mutex_free(&index->zip_pad.mutex); mem_heap_free(index->heap); mutex_free(&(table->autoinc_mutex)); ut_free(table->name); @@ -3075,11 +3122,17 @@ err_exit: page_zip_fields_free(index); mem_heap_free(heap); #ifndef UNIV_HOTBACKUP - { - page_zip_stat_t* zip_stat - = &page_zip_stat[page_zip->ssize - 1]; - zip_stat->decompressed++; - zip_stat->decompressed_usec += ut_time_us(NULL) - usec; + ullint time_diff = ut_time_us(NULL) - usec; + page_zip_stat[page_zip->ssize - 1].decompressed++; + page_zip_stat[page_zip->ssize - 1].decompressed_usec += time_diff; + + index_id_t index_id = btr_page_get_index_id(page); + + if (srv_cmp_per_index_enabled) { + mutex_enter(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[index_id].decompressed++; + page_zip_stat_per_index[index_id].decompressed_usec += time_diff; + mutex_exit(&page_zip_stat_per_index_mutex); } #endif /* !UNIV_HOTBACKUP */ @@ -3177,7 +3230,7 @@ page_zip_validate_low( UNIV_MEM_ASSERT_RW() checks fail. The v-bits of page[], page_zip->data[] or page_zip could be viewed at temp_page[] or temp_page_zip in a debugger when running valgrind --db-attach. */ - VALGRIND_GET_VBITS(page, temp_page, UNIV_PAGE_SIZE); + (void) VALGRIND_GET_VBITS(page, temp_page, UNIV_PAGE_SIZE); UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE); # if UNIV_WORD_SIZE == 4 VALGRIND_GET_VBITS(page_zip, &temp_page_zip, sizeof temp_page_zip); @@ -3186,8 +3239,8 @@ page_zip_validate_low( pad bytes. 
*/ UNIV_MEM_ASSERT_RW(page_zip, sizeof *page_zip); # endif - VALGRIND_GET_VBITS(page_zip->data, temp_page, - page_zip_get_size(page_zip)); + (void) VALGRIND_GET_VBITS(page_zip->data, temp_page, + page_zip_get_size(page_zip)); UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); #endif /* UNIV_DEBUG_VALGRIND */ @@ -3295,7 +3348,7 @@ page_zip_validate_low( "record list: 0x%02x!=0x%02x\n", (unsigned) page_offset(rec), (unsigned) page_offset(trec))); - valid = FALSE; + valid = FALSE; break; } @@ -4042,10 +4095,10 @@ static void page_zip_clear_rec( /*===============*/ - page_zip_des_t* page_zip,/*!< in/out: compressed page */ - byte* rec, /*!< in: record to clear */ - dict_index_t* index, /*!< in: index of rec */ - const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + byte* rec, /*!< in: record to clear */ + const dict_index_t* index, /*!< in: index of rec */ + const ulint* offsets) /*!< in: rec_get_offsets(rec, index) */ { ulint heap_no; page_t* page = page_align(rec); @@ -4256,11 +4309,12 @@ UNIV_INTERN void page_zip_dir_delete( /*================*/ - page_zip_des_t* page_zip,/*!< in/out: compressed page */ - byte* rec, /*!< in: record to delete */ - dict_index_t* index, /*!< in: index of rec */ - const ulint* offsets,/*!< in: rec_get_offsets(rec) */ - const byte* free) /*!< in: previous start of the free list */ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + byte* rec, /*!< in: deleted record */ + const dict_index_t* index, /*!< in: index of rec */ + const ulint* offsets, /*!< in: rec_get_offsets(rec) */ + const byte* free) /*!< in: previous start of + the free list */ { byte* slot_rec; byte* slot_free; @@ -4576,7 +4630,8 @@ page_zip_reorganize( /* Restore logging. 
*/ mtr_set_log_mode(mtr, log_mode); - if (!page_zip_compress(page_zip, page, index, mtr)) { + if (!page_zip_compress(page_zip, page, index, + page_compression_level, mtr)) { #ifndef UNIV_HOTBACKUP buf_block_free(temp_block); diff --git a/storage/innobase/pars/lexyy.cc b/storage/innobase/pars/lexyy.cc index 9de8ea51efd..48ab04e1eff 100644 --- a/storage/innobase/pars/lexyy.cc +++ b/storage/innobase/pars/lexyy.cc @@ -35,7 +35,7 @@ #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, - * if you want the limit (max/min) macros for int types. + * if you want the limit (max/min) macros for int types. */ #ifndef __STDC_LIMIT_MACROS #define __STDC_LIMIT_MACROS 1 @@ -55,7 +55,6 @@ typedef int flex_int32_t; typedef unsigned char flex_uint8_t; typedef unsigned short int flex_uint16_t; typedef unsigned int flex_uint32_t; -#endif /* ! C99 */ /* Limits of integral types. */ #ifndef INT8_MIN @@ -86,6 +85,8 @@ typedef unsigned int flex_uint32_t; #define UINT32_MAX (4294967295U) #endif +#endif /* ! C99 */ + #endif /* ! FLEXINT_H */ #ifdef __cplusplus @@ -142,7 +143,15 @@ typedef unsigned int flex_uint32_t; /* Size of default input buffer. */ #ifndef YY_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k. + * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. + * Ditto for the __ia64__ case accordingly. + */ +#define YY_BUF_SIZE 32768 +#else #define YY_BUF_SIZE 16384 +#endif /* __ia64__ */ #endif /* The state buf must be large enough to hold one state per character in the main buffer. @@ -276,7 +285,7 @@ static yy_size_t yy_n_chars; /* number of characters read into yy_ch_buf */ yy_size_t yyleng; /* Points to current character in buffer. 
*/ -static char *yy_c_buf_p = (char*) 0; +static char *yy_c_buf_p = (char *) 0; static int yy_init = 0; /* whether we need to initialize */ static int yy_start = 0; /* start state number */ @@ -338,7 +347,7 @@ void yyfree (void * ); typedef unsigned char YY_CHAR; -FILE *yyin = (FILE*) 0, *yyout = (FILE*) 0; +FILE *yyin = (FILE *) 0, *yyout = (FILE *) 0; typedef int yy_state_type; @@ -373,7 +382,7 @@ struct yy_trans_info flex_int32_t yy_verify; flex_int32_t yy_nxt; }; -static yyconst flex_int16_t yy_accept[424] = +static yyconst flex_int16_t yy_accept[425] = { 0, 0, 0, 119, 119, 0, 0, 0, 0, 125, 123, 122, 122, 8, 123, 114, 5, 103, 109, 112, 110, @@ -382,46 +391,46 @@ static yyconst flex_int16_t yy_accept[424] = 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 115, 116, 119, 120, 6, 7, 9, 10, 122, 4, 98, 118, 2, 1, 3, 99, 100, 102, 101, 0, - 96, 96, 96, 96, 96, 96, 44, 96, 96, 96, + 96, 0, 96, 96, 96, 96, 96, 44, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, - 96, 96, 96, 28, 17, 25, 96, 96, 96, 96, + 96, 96, 96, 96, 28, 17, 25, 96, 96, 96, - 96, 96, 54, 63, 96, 14, 96, 96, 96, 96, - 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, - 96, 96, 96, 96, 119, 120, 120, 121, 6, 7, - 9, 10, 2, 0, 97, 13, 45, 96, 96, 96, - 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, - 96, 96, 96, 96, 96, 96, 27, 96, 96, 96, - 41, 96, 96, 96, 96, 21, 96, 96, 96, 96, - 96, 15, 96, 96, 96, 18, 96, 96, 96, 96, - 96, 82, 96, 96, 96, 51, 96, 12, 96, 36, + 96, 96, 96, 54, 63, 96, 14, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, - - 96, 0, 97, 96, 96, 96, 96, 20, 96, 24, + 96, 96, 96, 96, 96, 119, 120, 120, 121, 6, + 7, 9, 10, 2, 0, 97, 13, 45, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, - 96, 46, 96, 96, 30, 96, 89, 96, 96, 39, - 96, 96, 96, 96, 96, 48, 96, 94, 91, 32, - 93, 96, 11, 66, 96, 96, 96, 42, 96, 96, - 96, 96, 96, 96, 96, 96, 96, 96, 29, 96, - 96, 96, 96, 96, 96, 96, 96, 96, 87, 0, - 96, 26, 96, 96, 96, 68, 96, 96, 96, 96, - 37, 96, 96, 96, 96, 96, 96, 96, 31, 67, - 23, 96, 59, 96, 77, 96, 96, 96, 
43, 96, - - 96, 96, 96, 96, 96, 96, 96, 92, 96, 96, - 56, 96, 96, 96, 96, 96, 96, 96, 40, 33, - 0, 81, 95, 19, 96, 96, 85, 96, 76, 55, - 96, 65, 96, 52, 96, 96, 96, 47, 96, 78, - 96, 80, 96, 96, 34, 96, 96, 96, 35, 74, - 96, 96, 96, 96, 60, 96, 50, 49, 96, 96, - 96, 57, 53, 64, 96, 96, 96, 22, 96, 96, - 75, 83, 96, 96, 79, 96, 70, 96, 96, 96, - 96, 96, 38, 96, 90, 69, 96, 86, 96, 96, - 96, 88, 96, 96, 61, 96, 16, 96, 72, 71, - - 96, 58, 96, 84, 96, 96, 96, 96, 96, 96, - 96, 96, 96, 96, 73, 96, 96, 96, 96, 96, - 96, 62, 0 + 96, 96, 96, 96, 96, 96, 96, 27, 96, 96, + 96, 41, 96, 96, 96, 96, 21, 96, 96, 96, + 96, 96, 15, 96, 96, 96, 18, 96, 96, 96, + 96, 96, 82, 96, 96, 96, 51, 96, 12, 96, + 36, 96, 96, 96, 96, 96, 96, 96, 96, 96, + + 96, 96, 0, 97, 96, 96, 96, 96, 20, 96, + 24, 96, 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 46, 96, 96, 30, 96, 89, 96, 96, + 39, 96, 96, 96, 96, 96, 48, 96, 94, 91, + 32, 93, 96, 11, 66, 96, 96, 96, 42, 96, + 96, 96, 96, 96, 96, 96, 96, 96, 96, 29, + 96, 96, 96, 96, 96, 96, 96, 96, 96, 87, + 0, 96, 26, 96, 96, 96, 68, 96, 96, 96, + 96, 37, 96, 96, 96, 96, 96, 96, 96, 31, + 67, 23, 96, 59, 96, 77, 96, 96, 96, 43, + + 96, 96, 96, 96, 96, 96, 96, 96, 92, 96, + 96, 56, 96, 96, 96, 96, 96, 96, 96, 40, + 33, 0, 81, 95, 19, 96, 96, 85, 96, 76, + 55, 96, 65, 96, 52, 96, 96, 96, 47, 96, + 78, 96, 80, 96, 96, 34, 96, 96, 96, 35, + 74, 96, 96, 96, 96, 60, 96, 50, 49, 96, + 96, 96, 57, 53, 64, 96, 96, 96, 22, 96, + 96, 75, 83, 96, 96, 79, 96, 70, 96, 96, + 96, 96, 96, 38, 96, 90, 69, 96, 86, 96, + 96, 96, 88, 96, 96, 61, 96, 16, 96, 72, + + 71, 96, 58, 96, 84, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 73, 96, 96, 96, 96, + 96, 96, 62, 0 } ; static yyconst flex_int32_t yy_ec[256] = @@ -432,14 +441,14 @@ static yyconst flex_int32_t yy_ec[256] = 1, 2, 1, 4, 5, 6, 7, 1, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 18, 19, 20, - 21, 22, 23, 1, 24, 25, 26, 27, 28, 29, - 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, - 40, 41, 42, 43, 
44, 45, 46, 47, 48, 49, - 1, 1, 1, 1, 50, 1, 33, 33, 33, 33, - - 33, 33, 33, 33, 33, 33, 33, 51, 33, 33, - 33, 33, 52, 33, 53, 33, 33, 33, 33, 33, - 33, 33, 54, 1, 55, 1, 1, 1, 1, 1, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 1, 1, 1, 1, 51, 1, 34, 34, 34, 34, + + 34, 34, 34, 34, 34, 34, 34, 52, 34, 34, + 34, 34, 53, 34, 54, 34, 34, 34, 34, 34, + 34, 34, 55, 1, 56, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -456,328 +465,438 @@ static yyconst flex_int32_t yy_ec[256] = 1, 1, 1, 1, 1 } ; -static yyconst flex_int32_t yy_meta[56] = +static yyconst flex_int32_t yy_meta[57] = { 0, 1, 1, 1, 2, 3, 1, 1, 4, 1, 1, 5, 1, 1, 1, 1, 6, 7, 1, 1, 1, - 8, 1, 1, 9, 9, 9, 9, 9, 9, 9, + 8, 1, 1, 6, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 1, 1 + 9, 9, 9, 9, 1, 1 } ; -static yyconst flex_int16_t yy_base[436] = +static yyconst flex_int16_t yy_base[438] = { 0, - 0, 0, 849, 848, 850, 849, 852, 851, 854, 861, - 54, 56, 861, 0, 861, 861, 861, 861, 861, 861, - 861, 861, 838, 841, 45, 830, 861, 42, 861, 829, - 861, 45, 49, 54, 58, 56, 72, 833, 83, 86, - 63, 67, 90, 53, 105, 107, 106, 120, 51, 101, - 861, 861, 0, 55, 0, 840, 0, 843, 106, 0, - 861, 861, 829, 61, 824, 861, 861, 861, 861, 839, - 827, 88, 124, 130, 132, 125, 826, 129, 133, 136, - 52, 138, 148, 140, 142, 145, 149, 152, 151, 159, - 162, 169, 165, 825, 172, 824, 173, 170, 175, 179, - - 176, 177, 823, 822, 180, 182, 184, 200, 201, 195, - 189, 202, 204, 207, 205, 210, 218, 220, 213, 215, - 223, 230, 238, 217, 0, 240, 244, 861, 0, 829, - 0, 832, 818, 781, 0, 817, 816, 233, 237, 243, - 248, 251, 246, 252, 255, 257, 258, 262, 264, 263, - 265, 267, 266, 269, 273, 270, 815, 274, 275, 287, - 814, 290, 292, 291, 293, 294, 297, 300, 304, 298, - 307, 313, 308, 309, 317, 813, 314, 315, 323, 318, - 324, 328, 331, 332, 333, 812, 336, 811, 338, 810, - 
340, 339, 342, 344, 343, 341, 347, 346, 348, 349, - - 359, 773, 0, 356, 369, 370, 360, 808, 371, 807, - 372, 375, 376, 378, 379, 380, 382, 383, 388, 393, - 394, 806, 396, 397, 805, 398, 804, 399, 400, 803, - 403, 404, 408, 413, 405, 802, 415, 801, 800, 799, - 798, 406, 797, 796, 416, 417, 420, 795, 422, 418, - 423, 425, 424, 426, 439, 429, 437, 440, 794, 446, - 450, 453, 454, 455, 457, 458, 459, 460, 793, 757, - 461, 791, 463, 464, 466, 790, 467, 468, 473, 474, - 789, 475, 476, 477, 478, 480, 485, 486, 788, 787, - 786, 489, 785, 491, 784, 498, 493, 494, 783, 499, - - 504, 509, 511, 513, 516, 514, 517, 782, 520, 519, - 781, 521, 523, 527, 525, 528, 526, 529, 780, 779, - 780, 776, 773, 530, 533, 535, 772, 534, 771, 770, - 541, 769, 550, 760, 543, 548, 551, 753, 552, 736, - 554, 730, 556, 557, 723, 558, 566, 563, 693, 692, - 569, 572, 565, 578, 691, 574, 690, 689, 567, 585, - 588, 688, 687, 685, 571, 589, 591, 683, 592, 593, - 681, 680, 595, 596, 679, 597, 678, 599, 604, 602, - 605, 608, 676, 606, 675, 674, 609, 673, 607, 610, - 614, 670, 620, 623, 668, 628, 667, 630, 665, 664, - - 625, 663, 629, 112, 627, 626, 631, 632, 647, 633, - 636, 637, 644, 650, 110, 652, 659, 657, 660, 661, - 662, 57, 861, 710, 719, 728, 731, 734, 738, 747, - 756, 765, 774, 781, 784 + 0, 0, 293, 287, 284, 281, 272, 256, 254, 1357, + 55, 57, 1357, 0, 1357, 1357, 1357, 1357, 1357, 1357, + 1357, 1357, 238, 227, 46, 205, 1357, 43, 1357, 203, + 1357, 46, 50, 56, 52, 66, 64, 51, 81, 92, + 91, 94, 96, 111, 113, 116, 130, 134, 53, 143, + 1357, 1357, 0, 106, 0, 212, 0, 210, 141, 0, + 1357, 1357, 192, 56, 173, 1357, 1357, 1357, 1357, 168, + 140, 150, 152, 154, 155, 161, 167, 171, 177, 172, + 184, 174, 188, 189, 191, 194, 203, 212, 215, 217, + 219, 221, 226, 228, 231, 240, 233, 235, 246, 251, + + 258, 253, 255, 256, 269, 271, 278, 272, 285, 283, + 287, 289, 296, 305, 298, 315, 319, 321, 322, 326, + 332, 333, 342, 339, 343, 0, 112, 173, 1357, 0, + 155, 0, 156, 132, 93, 0, 355, 357, 358, 360, + 364, 367, 
374, 370, 379, 380, 389, 383, 390, 392, + 395, 408, 411, 409, 415, 418, 425, 427, 429, 436, + 431, 441, 446, 448, 450, 452, 453, 462, 471, 464, + 473, 474, 478, 485, 488, 490, 491, 494, 500, 501, + 504, 506, 507, 517, 518, 519, 520, 521, 522, 523, + 533, 536, 538, 543, 549, 554, 555, 561, 556, 566, + + 567, 576, 60, 0, 573, 578, 580, 582, 583, 593, + 589, 596, 598, 603, 605, 607, 610, 617, 619, 621, + 622, 628, 633, 634, 635, 639, 640, 649, 650, 652, + 653, 655, 659, 664, 668, 669, 665, 671, 674, 678, + 681, 685, 687, 688, 692, 697, 698, 701, 703, 704, + 707, 708, 717, 713, 728, 730, 724, 740, 734, 745, + 746, 750, 751, 756, 757, 760, 761, 762, 771, 773, + 42, 778, 782, 783, 787, 789, 792, 794, 793, 804, + 805, 808, 809, 810, 819, 823, 826, 828, 829, 830, + 835, 840, 844, 846, 847, 856, 857, 858, 859, 860, + + 863, 872, 873, 878, 879, 882, 885, 889, 894, 895, + 896, 898, 905, 910, 908, 912, 914, 915, 926, 930, + 931, 73, 932, 933, 935, 937, 942, 944, 946, 947, + 948, 949, 951, 958, 961, 965, 967, 972, 978, 979, + 981, 984, 983, 985, 994, 988, 999, 1000, 1001, 1004, + 1013, 1015, 1022, 1016, 1019, 1026, 1032, 1033, 1035, 1036, + 1038, 1039, 1048, 1049, 1050, 1051, 1053, 1054, 1060, 1063, + 1065, 1066, 1069, 1070, 1072, 1082, 1084, 1085, 1087, 1096, + 1097, 1098, 1099, 1101, 1113, 1114, 1115, 1116, 1117, 1118, + 1119, 1128, 1130, 1131, 1134, 1133, 1135, 1137, 1150, 1151, + + 1153, 1155, 1157, 1162, 1160, 1167, 1172, 1173, 1174, 1176, + 1185, 1190, 1183, 1187, 1189, 1199, 1204, 1206, 1208, 1210, + 1215, 1220, 1222, 1357, 1269, 1278, 1287, 1290, 1293, 1297, + 1306, 1315, 1324, 1333, 1340, 1344, 1347 } ; -static yyconst flex_int16_t yy_def[436] = +static yyconst flex_int16_t yy_def[438] = { 0, - 423, 1, 424, 424, 425, 425, 426, 426, 423, 423, - 423, 423, 423, 427, 423, 423, 423, 423, 423, 423, - 423, 423, 423, 423, 423, 428, 423, 423, 423, 423, - 423, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 423, 423, 430, 431, 
432, 423, 433, 423, 423, 427, - 423, 423, 423, 423, 428, 423, 423, 423, 423, 434, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 430, 431, 431, 423, 432, 423, - 433, 423, 423, 423, 435, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - - 429, 423, 435, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 423, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 423, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 429, 429, 429, 429, 429, 429, 429, 429, - 429, 429, 0, 423, 423, 423, 423, 423, 423, 423, - 423, 423, 423, 423, 423 + 424, 1, 
425, 425, 426, 426, 427, 427, 424, 424, + 424, 424, 424, 428, 424, 424, 424, 424, 424, 424, + 424, 424, 424, 424, 424, 429, 424, 424, 424, 424, + 424, 430, 430, 430, 430, 430, 34, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 424, 424, 431, 432, 433, 424, 434, 424, 424, 428, + 424, 424, 424, 424, 429, 424, 424, 424, 424, 435, + 430, 436, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 431, 432, 432, 424, 433, + 424, 434, 424, 424, 424, 437, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + + 430, 430, 424, 437, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 424, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 424, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 
430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 0, 424, 424, 424, 424, 424, 424, + 424, 424, 424, 424, 424, 424, 424 } ; -static yyconst flex_int16_t yy_nxt[917] = +static yyconst flex_int16_t yy_nxt[1414] = { 0, 10, 11, 12, 13, 10, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, - 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, - 38, 39, 38, 38, 40, 41, 42, 43, 44, 38, - 45, 46, 47, 48, 49, 50, 38, 38, 38, 38, - 38, 38, 38, 51, 52, 59, 59, 59, 59, 63, - 70, 64, 67, 68, 70, 127, 70, 70, 70, 70, - 128, 70, 70, 70, 122, 63, 74, 64, 70, 149, - 75, 72, 70, 76, 78, 83, 73, 70, 79, 84, - 86, 80, 87, 108, 81, 85, 77, 82, 70, 89, - - 100, 70, 88, 70, 101, 70, 90, 59, 59, 91, - 102, 94, 92, 97, 136, 93, 70, 98, 103, 95, - 70, 70, 70, 99, 96, 70, 104, 70, 105, 117, - 106, 123, 109, 107, 112, 70, 118, 113, 124, 70, - 70, 110, 111, 119, 70, 70, 114, 70, 70, 137, - 115, 70, 143, 70, 116, 70, 120, 70, 121, 139, - 70, 140, 142, 70, 70, 138, 70, 70, 141, 155, - 144, 146, 147, 151, 70, 157, 145, 70, 150, 148, - 70, 154, 152, 158, 70, 70, 156, 70, 70, 153, - 70, 70, 70, 159, 70, 70, 160, 70, 164, 70, - - 169, 163, 161, 168, 70, 171, 162, 174, 175, 167, - 70, 173, 170, 165, 166, 70, 70, 70, 172, 70, - 70, 182, 70, 183, 179, 70, 176, 187, 70, 189, - 70, 177, 70, 70, 184, 70, 185, 178, 70, 180, - 190, 188, 192, 181, 186, 70, 195, 193, 70, 197, - 423, 191, 70, 70, 127, 423, 196, 201, 70, 128, - 204, 70, 194, 70, 198, 199, 70, 70, 205, 200, - 70, 207, 70, 70, 206, 208, 209, 70, 70, 70, - 70, 70, 70, 215, 70, 70, 210, 217, 70, 70, - 70, 222, 213, 211, 221, 214, 212, 225, 216, 220, - - 228, 226, 70, 218, 219, 70, 70, 70, 70, 70, - 229, 223, 70, 70, 224, 70, 227, 231, 232, 70, - 233, 235, 70, 70, 70, 230, 237, 238, 70, 70, - 70, 236, 70, 70, 241, 234, 240, 239, 70, 70, - 247, 242, 243, 70, 245, 244, 70, 70, 70, 248, - 246, 70, 249, 70, 
70, 70, 70, 70, 70, 70, - 254, 70, 70, 70, 70, 252, 257, 250, 260, 261, - 265, 70, 264, 258, 70, 70, 255, 251, 259, 256, - 262, 253, 263, 268, 70, 70, 70, 70, 267, 266, - 70, 70, 269, 70, 70, 70, 271, 70, 70, 276, - - 274, 279, 280, 70, 275, 272, 273, 278, 70, 70, - 283, 70, 70, 70, 70, 70, 285, 277, 70, 70, - 70, 70, 281, 70, 282, 284, 289, 287, 70, 290, - 70, 70, 70, 70, 296, 70, 286, 70, 70, 70, - 70, 70, 291, 298, 70, 292, 288, 301, 294, 305, - 293, 307, 70, 295, 70, 70, 299, 297, 303, 300, - 310, 70, 306, 302, 304, 70, 308, 311, 70, 70, - 70, 309, 70, 70, 70, 70, 70, 312, 70, 70, - 313, 70, 70, 70, 316, 318, 319, 320, 70, 70, - 70, 70, 70, 70, 326, 70, 314, 315, 328, 317, - - 70, 70, 330, 322, 70, 323, 70, 334, 70, 70, - 327, 324, 331, 70, 70, 325, 329, 332, 333, 70, - 337, 335, 336, 340, 70, 339, 70, 342, 70, 70, - 343, 70, 70, 338, 70, 70, 70, 341, 70, 347, - 70, 70, 70, 70, 70, 70, 353, 345, 70, 70, - 70, 344, 355, 357, 348, 346, 70, 352, 70, 349, - 350, 351, 354, 70, 356, 70, 70, 70, 365, 70, - 358, 70, 70, 70, 360, 361, 362, 364, 70, 359, - 70, 70, 70, 363, 70, 366, 70, 70, 367, 70, - 369, 373, 368, 70, 374, 376, 375, 371, 372, 370, - - 70, 379, 378, 70, 70, 377, 70, 70, 70, 380, - 70, 70, 70, 383, 70, 382, 381, 70, 386, 70, - 70, 70, 70, 70, 70, 70, 391, 385, 388, 70, - 392, 384, 389, 387, 395, 70, 397, 390, 70, 393, - 70, 70, 70, 70, 70, 70, 70, 70, 70, 398, - 402, 70, 70, 394, 400, 396, 403, 399, 404, 70, - 406, 405, 70, 413, 412, 70, 409, 70, 408, 401, - 407, 411, 70, 414, 70, 70, 70, 70, 70, 70, - 70, 410, 70, 70, 415, 70, 418, 417, 70, 70, - 70, 70, 419, 70, 70, 70, 70, 420, 70, 416, - - 70, 421, 70, 70, 70, 70, 70, 70, 70, 422, - 53, 53, 53, 53, 53, 53, 53, 53, 53, 55, - 55, 55, 55, 55, 55, 55, 55, 55, 57, 57, - 57, 57, 57, 57, 57, 57, 57, 60, 70, 60, - 65, 65, 65, 71, 71, 70, 71, 125, 125, 125, - 125, 70, 125, 125, 125, 125, 126, 126, 126, 126, - 126, 126, 126, 126, 126, 129, 129, 129, 70, 129, - 129, 129, 129, 129, 131, 70, 131, 131, 131, 131, - 
131, 131, 131, 135, 70, 70, 70, 70, 70, 135, - 203, 70, 203, 135, 70, 70, 70, 70, 70, 70, - - 70, 70, 70, 70, 70, 70, 70, 321, 70, 70, - 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, - 70, 70, 70, 70, 270, 70, 70, 70, 70, 70, - 70, 70, 70, 202, 133, 132, 130, 70, 70, 70, - 70, 70, 70, 134, 423, 133, 132, 130, 70, 69, - 66, 62, 61, 423, 58, 58, 56, 56, 54, 54, - 9, 423, 423, 423, 423, 423, 423, 423, 423, 423, - 423, 423, 423, 423, 423, 423, 423, 423, 423, 423, - 423, 423, 423, 423, 423, 423, 423, 423, 423, 423, - 423, 423, 423, 423, 423, 423, 423, 423, 423, 423, - - 423, 423, 423, 423, 423, 423, 423, 423, 423, 423, - 423, 423, 423, 423, 423, 423 + 29, 30, 31, 10, 32, 33, 34, 35, 36, 37, + 38, 38, 39, 38, 38, 40, 41, 42, 43, 44, + 38, 45, 46, 47, 48, 49, 50, 38, 38, 38, + 38, 38, 38, 38, 51, 52, 59, 59, 59, 59, + 63, 70, 64, 67, 68, 70, 70, 70, 70, 72, + 63, 70, 64, 72, 72, 72, 72, 123, 75, 72, + 84, 70, 76, 73, 85, 77, 136, 79, 74, 72, + 86, 80, 90, 322, 81, 71, 70, 82, 78, 91, + + 83, 87, 92, 88, 72, 93, 70, 70, 94, 70, + 95, 70, 271, 89, 72, 72, 128, 72, 96, 72, + 98, 129, 424, 97, 99, 104, 70, 424, 70, 101, + 100, 70, 102, 105, 72, 106, 72, 107, 103, 72, + 108, 110, 59, 59, 113, 70, 203, 114, 134, 70, + 111, 112, 109, 72, 118, 70, 115, 72, 70, 133, + 116, 119, 131, 72, 117, 70, 72, 70, 120, 70, + 70, 121, 135, 122, 124, 72, 70, 72, 72, 137, + 138, 125, 70, 128, 72, 140, 70, 70, 129, 70, + 72, 141, 70, 424, 72, 72, 139, 72, 142, 70, + + 72, 144, 150, 70, 70, 143, 70, 72, 134, 70, + 145, 72, 72, 133, 72, 152, 146, 72, 70, 131, + 147, 148, 156, 69, 153, 66, 72, 70, 149, 151, + 70, 154, 70, 155, 70, 72, 70, 62, 72, 158, + 72, 70, 72, 70, 72, 157, 70, 159, 70, 72, + 70, 72, 61, 424, 72, 70, 72, 161, 72, 58, + 160, 70, 162, 72, 163, 164, 70, 165, 70, 72, + 70, 70, 168, 70, 72, 58, 72, 170, 72, 72, + 169, 72, 166, 167, 70, 172, 70, 70, 56, 171, + 174, 56, 72, 70, 72, 72, 173, 54, 70, 175, + + 70, 72, 70, 54, 70, 176, 72, 180, 72, 424, + 72, 70, 72, 70, 183, 177, 424, 178, 
424, 72, + 70, 72, 181, 179, 184, 424, 182, 424, 72, 188, + 70, 186, 424, 189, 70, 185, 70, 70, 72, 187, + 190, 70, 72, 424, 72, 72, 193, 70, 70, 72, + 194, 191, 424, 424, 70, 72, 72, 70, 70, 424, + 198, 192, 72, 424, 196, 72, 72, 200, 424, 424, + 70, 201, 70, 70, 197, 70, 195, 199, 72, 70, + 72, 72, 70, 72, 202, 70, 205, 72, 424, 70, + 72, 208, 206, 72, 70, 70, 207, 72, 70, 209, + + 210, 424, 72, 72, 70, 70, 72, 70, 424, 216, + 70, 211, 72, 72, 424, 72, 218, 424, 72, 424, + 424, 212, 213, 70, 70, 214, 70, 217, 215, 424, + 70, 72, 72, 70, 72, 223, 219, 220, 72, 222, + 70, 72, 70, 221, 70, 424, 70, 424, 72, 424, + 72, 70, 72, 226, 72, 230, 70, 227, 224, 72, + 225, 70, 229, 70, 72, 70, 424, 70, 70, 72, + 424, 72, 228, 72, 232, 72, 72, 70, 233, 70, + 234, 236, 231, 424, 424, 72, 70, 72, 70, 70, + 424, 237, 238, 70, 72, 235, 72, 72, 240, 239, + + 70, 72, 242, 70, 424, 70, 70, 243, 72, 70, + 424, 72, 241, 72, 72, 70, 70, 72, 246, 70, + 244, 70, 70, 72, 72, 245, 248, 72, 249, 72, + 72, 247, 70, 70, 70, 70, 70, 70, 70, 250, + 72, 72, 72, 72, 72, 72, 72, 255, 70, 424, + 251, 70, 253, 70, 424, 424, 72, 252, 70, 72, + 424, 72, 256, 258, 70, 257, 72, 424, 254, 70, + 70, 70, 72, 259, 261, 262, 70, 72, 72, 72, + 260, 70, 70, 424, 72, 266, 263, 265, 70, 72, + 72, 70, 424, 70, 264, 70, 72, 70, 70, 72, + + 267, 72, 269, 72, 70, 72, 72, 268, 70, 424, + 270, 70, 72, 70, 272, 273, 72, 274, 70, 72, + 70, 72, 70, 275, 277, 70, 72, 276, 72, 280, + 72, 281, 70, 72, 70, 279, 70, 70, 424, 424, + 72, 278, 72, 70, 72, 72, 286, 284, 70, 70, + 70, 72, 424, 282, 70, 70, 72, 72, 72, 285, + 283, 424, 72, 72, 70, 70, 288, 70, 70, 290, + 70, 287, 72, 72, 70, 72, 72, 424, 72, 70, + 70, 291, 72, 70, 70, 289, 70, 72, 72, 70, + 424, 72, 72, 70, 72, 292, 70, 72, 293, 297, + + 70, 72, 70, 70, 72, 295, 294, 70, 72, 296, + 72, 72, 70, 70, 298, 72, 70, 424, 70, 70, + 72, 72, 70, 70, 72, 299, 72, 72, 70, 302, + 72, 72, 70, 424, 424, 424, 72, 424, 300, 70, + 72, 301, 306, 70, 424, 70, 303, 72, 304, 70, + 305, 
72, 307, 72, 308, 70, 424, 72, 309, 424, + 70, 70, 312, 72, 311, 70, 70, 310, 72, 72, + 424, 70, 70, 72, 72, 70, 70, 70, 313, 72, + 72, 314, 424, 72, 72, 72, 70, 317, 70, 319, + 320, 424, 424, 70, 72, 315, 72, 70, 70, 321, + + 316, 72, 70, 318, 70, 72, 72, 70, 70, 70, + 72, 424, 72, 424, 424, 72, 72, 72, 424, 70, + 70, 323, 327, 70, 70, 70, 324, 72, 72, 424, + 329, 72, 72, 72, 70, 325, 328, 331, 70, 326, + 424, 70, 72, 70, 70, 70, 72, 332, 330, 72, + 70, 72, 72, 72, 335, 70, 424, 424, 72, 70, + 333, 70, 70, 72, 334, 336, 337, 72, 424, 72, + 72, 70, 70, 70, 70, 70, 338, 424, 70, 72, + 72, 72, 72, 72, 424, 340, 72, 70, 70, 341, + 339, 424, 343, 70, 70, 72, 72, 70, 424, 344, + + 70, 72, 72, 342, 70, 72, 348, 424, 72, 70, + 70, 70, 72, 70, 424, 346, 345, 72, 72, 72, + 70, 72, 347, 70, 424, 70, 349, 70, 72, 70, + 70, 72, 350, 72, 354, 72, 351, 72, 72, 352, + 356, 70, 353, 358, 355, 70, 70, 70, 70, 72, + 70, 357, 70, 72, 72, 72, 72, 70, 72, 70, + 72, 70, 70, 70, 70, 72, 70, 72, 359, 72, + 72, 72, 72, 70, 72, 424, 70, 424, 424, 361, + 70, 72, 70, 362, 72, 360, 365, 70, 72, 363, + 72, 366, 364, 70, 70, 72, 70, 424, 70, 70, + + 70, 72, 72, 70, 72, 367, 72, 72, 72, 70, + 368, 72, 424, 424, 70, 70, 70, 72, 424, 70, + 369, 370, 72, 72, 72, 424, 374, 72, 70, 371, + 70, 70, 424, 375, 70, 372, 72, 70, 72, 72, + 373, 70, 72, 376, 379, 72, 377, 70, 70, 72, + 70, 70, 424, 70, 70, 72, 72, 378, 72, 72, + 380, 72, 72, 70, 70, 70, 70, 383, 70, 70, + 382, 72, 72, 72, 72, 70, 72, 72, 70, 381, + 70, 70, 424, 72, 70, 70, 72, 70, 72, 72, + 387, 386, 72, 72, 384, 72, 385, 70, 424, 70, + + 70, 424, 70, 424, 389, 72, 388, 72, 72, 390, + 72, 70, 70, 70, 70, 392, 70, 424, 424, 72, + 72, 72, 72, 393, 72, 391, 396, 424, 70, 70, + 70, 70, 70, 70, 70, 394, 72, 72, 72, 72, + 72, 72, 72, 70, 398, 70, 70, 395, 70, 70, + 70, 72, 70, 72, 72, 424, 72, 72, 72, 424, + 72, 399, 403, 397, 404, 70, 70, 400, 70, 401, + 70, 424, 70, 72, 72, 70, 72, 70, 72, 405, + 72, 402, 70, 72, 424, 72, 424, 70, 70, 70, + 72, 
70, 406, 424, 407, 72, 72, 72, 70, 72, + + 70, 412, 70, 424, 70, 70, 72, 424, 72, 410, + 72, 408, 72, 72, 70, 409, 424, 413, 414, 70, + 415, 70, 72, 70, 411, 70, 424, 72, 416, 72, + 70, 72, 424, 72, 419, 70, 424, 70, 72, 417, + 418, 424, 424, 72, 420, 72, 424, 424, 421, 424, + 424, 424, 424, 424, 424, 424, 422, 424, 424, 424, + 424, 424, 424, 424, 424, 424, 424, 424, 423, 53, + 53, 53, 53, 53, 53, 53, 53, 53, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 60, 424, 60, 65, + + 65, 65, 71, 71, 424, 71, 126, 126, 126, 126, + 424, 126, 126, 126, 126, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 130, 130, 130, 424, 130, 130, + 130, 130, 130, 132, 424, 132, 132, 132, 132, 132, + 132, 132, 136, 424, 424, 424, 424, 424, 136, 72, + 72, 424, 72, 204, 424, 204, 9, 424, 424, 424, + 424, 424, 424, 424, 424, 424, 424, 424, 424, 424, + 424, 424, 424, 424, 424, 424, 424, 424, 424, 424, + 424, 424, 424, 424, 424, 424, 424, 424, 424, 424, + 424, 424, 424, 424, 424, 424, 424, 424, 424, 424, + + 424, 424, 424, 424, 424, 424, 424, 424, 424, 424, + 424, 424, 424 } ; -static yyconst flex_int16_t yy_chk[917] = +static yyconst flex_int16_t yy_chk[1414] = { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 11, 11, 12, 12, 25, - 32, 25, 28, 28, 33, 54, 49, 81, 44, 34, - 54, 36, 422, 35, 49, 64, 33, 64, 41, 81, - 33, 32, 42, 33, 34, 35, 32, 37, 34, 35, - 36, 34, 36, 44, 34, 35, 33, 34, 39, 37, - - 41, 40, 36, 72, 42, 43, 37, 59, 59, 37, - 42, 39, 37, 40, 72, 37, 50, 40, 43, 39, - 45, 47, 46, 40, 39, 415, 43, 404, 43, 47, - 43, 50, 45, 43, 46, 48, 47, 46, 50, 73, - 76, 45, 45, 47, 78, 74, 46, 75, 79, 73, - 46, 80, 78, 82, 46, 84, 48, 85, 48, 74, - 86, 75, 76, 83, 87, 73, 89, 88, 75, 85, - 79, 80, 80, 83, 90, 87, 79, 91, 82, 80, - 93, 84, 83, 88, 92, 98, 86, 95, 97, 83, - 99, 101, 102, 89, 100, 105, 90, 106, 95, 107, - - 99, 93, 91, 98, 111, 100, 
92, 105, 106, 97, - 110, 102, 99, 95, 95, 108, 109, 112, 101, 113, - 115, 110, 114, 111, 109, 116, 107, 113, 119, 115, - 120, 108, 124, 117, 111, 118, 112, 108, 121, 109, - 115, 114, 117, 109, 112, 122, 120, 118, 138, 121, - 126, 116, 139, 123, 127, 126, 120, 124, 140, 127, - 138, 143, 119, 141, 122, 123, 142, 144, 139, 123, - 145, 141, 146, 147, 140, 142, 142, 148, 150, 149, - 151, 153, 152, 147, 154, 156, 143, 149, 155, 158, - 159, 153, 146, 144, 152, 146, 145, 156, 148, 151, - - 159, 156, 160, 150, 150, 162, 164, 163, 165, 166, - 160, 154, 167, 170, 155, 168, 158, 163, 164, 169, - 165, 166, 171, 173, 174, 162, 167, 168, 172, 177, - 178, 166, 175, 180, 171, 165, 170, 169, 179, 181, - 178, 172, 173, 182, 175, 174, 183, 184, 185, 179, - 177, 187, 180, 189, 192, 191, 196, 193, 195, 194, - 185, 198, 197, 199, 200, 183, 191, 181, 194, 194, - 197, 204, 196, 192, 201, 207, 187, 182, 193, 189, - 194, 184, 195, 200, 205, 206, 209, 211, 199, 198, - 212, 213, 201, 214, 215, 216, 204, 217, 218, 211, - - 207, 214, 215, 219, 209, 205, 206, 213, 220, 221, - 218, 223, 224, 226, 228, 229, 220, 212, 231, 232, - 235, 242, 216, 233, 217, 219, 226, 223, 234, 228, - 237, 245, 246, 250, 235, 247, 221, 249, 251, 253, - 252, 254, 229, 242, 256, 231, 224, 247, 233, 252, - 232, 254, 257, 234, 255, 258, 245, 237, 250, 246, - 257, 260, 253, 249, 251, 261, 255, 258, 262, 263, - 264, 256, 265, 266, 267, 268, 271, 260, 273, 274, - 261, 275, 277, 278, 264, 266, 267, 268, 279, 280, - 282, 283, 284, 285, 277, 286, 262, 263, 279, 265, - - 287, 288, 282, 271, 292, 273, 294, 286, 297, 298, - 278, 274, 283, 296, 300, 275, 280, 284, 285, 301, - 292, 287, 288, 297, 302, 296, 303, 300, 304, 306, - 301, 305, 307, 294, 310, 309, 312, 298, 313, 305, - 315, 317, 314, 316, 318, 324, 313, 303, 325, 328, - 326, 302, 315, 317, 306, 304, 331, 312, 335, 307, - 309, 310, 314, 336, 316, 333, 337, 339, 335, 341, - 318, 343, 344, 346, 325, 326, 328, 333, 348, 324, - 353, 347, 359, 331, 351, 336, 365, 352, 337, 356, - 
341, 347, 339, 354, 348, 352, 351, 344, 346, 343, - - 360, 356, 354, 361, 366, 353, 367, 369, 370, 359, - 373, 374, 376, 365, 378, 361, 360, 380, 369, 379, - 381, 384, 389, 382, 387, 390, 378, 367, 373, 391, - 379, 366, 374, 370, 382, 393, 387, 376, 394, 380, - 401, 406, 405, 396, 403, 398, 407, 408, 410, 389, - 394, 411, 412, 381, 391, 384, 396, 390, 398, 413, - 403, 401, 409, 411, 410, 414, 407, 416, 406, 393, - 405, 409, 418, 412, 417, 419, 420, 421, 402, 400, - 399, 408, 397, 395, 413, 392, 417, 416, 388, 386, - 385, 383, 418, 377, 375, 372, 371, 419, 368, 414, - - 364, 420, 363, 362, 358, 357, 355, 350, 349, 421, - 424, 424, 424, 424, 424, 424, 424, 424, 424, 425, + 1, 1, 1, 1, 1, 1, 11, 11, 12, 12, + 25, 32, 25, 28, 28, 33, 38, 35, 49, 32, + 64, 34, 64, 33, 38, 35, 49, 49, 33, 34, + 35, 36, 33, 32, 35, 33, 322, 34, 32, 36, + 35, 34, 37, 271, 34, 37, 39, 34, 33, 37, + + 34, 36, 37, 36, 39, 37, 41, 40, 37, 42, + 39, 43, 203, 36, 41, 40, 54, 42, 39, 43, + 40, 54, 127, 39, 40, 43, 44, 127, 45, 41, + 40, 46, 42, 43, 44, 43, 45, 43, 42, 46, + 43, 45, 59, 59, 46, 47, 135, 46, 134, 48, + 45, 45, 44, 47, 47, 71, 46, 48, 50, 133, + 46, 47, 131, 71, 46, 72, 50, 73, 47, 74, + 75, 48, 70, 48, 50, 73, 76, 74, 75, 73, + 74, 50, 77, 128, 76, 75, 78, 80, 128, 82, + 77, 76, 79, 65, 78, 80, 74, 82, 76, 81, + + 79, 79, 82, 83, 84, 77, 85, 81, 63, 86, + 80, 83, 84, 58, 85, 84, 80, 86, 87, 56, + 81, 81, 86, 30, 84, 26, 87, 88, 81, 83, + 89, 84, 90, 85, 91, 88, 92, 24, 89, 88, + 90, 93, 91, 94, 92, 87, 95, 89, 97, 93, + 98, 94, 23, 9, 95, 96, 97, 91, 98, 8, + 90, 99, 92, 96, 93, 94, 100, 96, 102, 99, + 103, 104, 98, 101, 100, 7, 102, 100, 103, 104, + 99, 101, 96, 96, 105, 101, 106, 108, 6, 100, + 103, 5, 105, 107, 106, 108, 102, 4, 110, 106, + + 109, 107, 111, 3, 112, 107, 110, 110, 109, 0, + 111, 113, 112, 115, 111, 108, 0, 109, 0, 113, + 114, 115, 110, 109, 112, 0, 110, 0, 114, 114, + 116, 113, 0, 115, 117, 112, 118, 119, 116, 113, + 116, 120, 117, 0, 118, 119, 118, 121, 122, 
120, + 119, 116, 0, 0, 124, 121, 122, 123, 125, 0, + 122, 117, 124, 0, 121, 123, 125, 124, 0, 0, + 137, 124, 138, 139, 121, 140, 120, 123, 137, 141, + 138, 139, 142, 140, 125, 144, 139, 141, 0, 143, + 142, 142, 140, 144, 145, 146, 141, 143, 148, 143, + + 143, 0, 145, 146, 147, 149, 148, 150, 0, 148, + 151, 144, 147, 149, 0, 150, 150, 0, 151, 0, + 0, 145, 146, 152, 154, 147, 153, 149, 147, 0, + 155, 152, 154, 156, 153, 154, 151, 151, 155, 153, + 157, 156, 158, 152, 159, 0, 161, 0, 157, 0, + 158, 160, 159, 157, 161, 161, 162, 157, 155, 160, + 156, 163, 160, 164, 162, 165, 0, 166, 167, 163, + 0, 164, 159, 165, 164, 166, 167, 168, 165, 170, + 166, 167, 163, 0, 0, 168, 169, 170, 171, 172, + 0, 167, 168, 173, 169, 166, 171, 172, 170, 169, + + 174, 173, 172, 175, 0, 176, 177, 173, 174, 178, + 0, 175, 171, 176, 177, 179, 180, 178, 176, 181, + 174, 182, 183, 179, 180, 175, 179, 181, 180, 182, + 183, 178, 184, 185, 186, 187, 188, 189, 190, 181, + 184, 185, 186, 187, 188, 189, 190, 186, 191, 0, + 182, 192, 184, 193, 0, 0, 191, 183, 194, 192, + 0, 193, 188, 192, 195, 190, 194, 0, 185, 196, + 197, 199, 195, 193, 195, 195, 198, 196, 197, 199, + 194, 200, 201, 0, 198, 198, 195, 197, 205, 200, + 201, 202, 0, 206, 196, 207, 205, 208, 209, 202, + + 199, 206, 201, 207, 211, 208, 209, 200, 210, 0, + 202, 212, 211, 213, 205, 206, 210, 207, 214, 212, + 215, 213, 216, 208, 212, 217, 214, 210, 215, 215, + 216, 216, 218, 217, 219, 214, 220, 221, 0, 0, + 218, 213, 219, 222, 220, 221, 221, 219, 223, 224, + 225, 222, 0, 217, 226, 227, 223, 224, 225, 220, + 218, 0, 226, 227, 228, 229, 224, 230, 231, 227, + 232, 222, 228, 229, 233, 230, 231, 0, 232, 234, + 237, 229, 233, 235, 236, 225, 238, 234, 237, 239, + 0, 235, 236, 240, 238, 230, 241, 239, 232, 236, + + 242, 240, 243, 244, 241, 234, 233, 245, 242, 235, + 243, 244, 246, 247, 238, 245, 248, 0, 249, 250, + 246, 247, 251, 252, 248, 243, 249, 250, 254, 248, + 251, 252, 253, 0, 0, 0, 254, 0, 246, 257, + 253, 247, 253, 255, 0, 256, 250, 257, 251, 
259, + 252, 255, 254, 256, 255, 258, 0, 259, 256, 0, + 260, 261, 259, 258, 258, 262, 263, 257, 260, 261, + 0, 264, 265, 262, 263, 266, 267, 268, 261, 264, + 265, 262, 0, 266, 267, 268, 269, 265, 270, 267, + 268, 0, 0, 272, 269, 263, 270, 273, 274, 269, + + 264, 272, 275, 266, 276, 273, 274, 277, 279, 278, + 275, 0, 276, 0, 0, 277, 279, 278, 0, 280, + 281, 272, 278, 282, 283, 284, 274, 280, 281, 0, + 280, 282, 283, 284, 285, 275, 279, 283, 286, 276, + 0, 287, 285, 288, 289, 290, 286, 284, 281, 287, + 291, 288, 289, 290, 287, 292, 0, 0, 291, 293, + 285, 294, 295, 292, 286, 288, 289, 293, 0, 294, + 295, 296, 297, 298, 299, 300, 293, 0, 301, 296, + 297, 298, 299, 300, 0, 297, 301, 302, 303, 298, + 295, 0, 301, 304, 305, 302, 303, 306, 0, 302, + + 307, 304, 305, 299, 308, 306, 306, 0, 307, 309, + 310, 311, 308, 312, 0, 304, 303, 309, 310, 311, + 313, 312, 305, 315, 0, 314, 307, 316, 313, 317, + 318, 315, 308, 314, 314, 316, 310, 317, 318, 311, + 316, 319, 313, 318, 315, 320, 321, 323, 324, 319, + 325, 317, 326, 320, 321, 323, 324, 327, 325, 328, + 326, 329, 330, 331, 332, 327, 333, 328, 319, 329, + 330, 331, 332, 334, 333, 0, 335, 0, 0, 326, + 336, 334, 337, 327, 335, 325, 334, 338, 336, 329, + 337, 336, 332, 339, 340, 338, 341, 0, 343, 342, + + 344, 339, 340, 346, 341, 337, 343, 342, 344, 345, + 338, 346, 0, 0, 347, 348, 349, 345, 0, 350, + 340, 342, 347, 348, 349, 0, 348, 350, 351, 344, + 352, 354, 0, 349, 355, 345, 351, 353, 352, 354, + 347, 356, 355, 352, 355, 353, 353, 357, 358, 356, + 359, 360, 0, 361, 362, 357, 358, 354, 359, 360, + 357, 361, 362, 363, 364, 365, 366, 362, 367, 368, + 361, 363, 364, 365, 366, 369, 367, 368, 370, 360, + 371, 372, 0, 369, 373, 374, 370, 375, 371, 372, + 370, 368, 373, 374, 366, 375, 367, 376, 0, 377, + + 378, 0, 379, 0, 374, 376, 371, 377, 378, 375, + 379, 380, 381, 382, 383, 379, 384, 0, 0, 380, + 381, 382, 383, 380, 384, 377, 383, 0, 385, 386, + 387, 388, 389, 390, 391, 381, 385, 386, 387, 388, + 389, 390, 391, 392, 388, 393, 394, 
382, 396, 395, + 397, 392, 398, 393, 394, 0, 396, 395, 397, 0, + 398, 390, 395, 385, 397, 399, 400, 391, 401, 392, + 402, 0, 403, 399, 400, 405, 401, 404, 402, 399, + 403, 394, 406, 405, 0, 404, 0, 407, 408, 409, + 406, 410, 402, 0, 404, 407, 408, 409, 413, 410, + + 411, 410, 414, 0, 415, 412, 413, 0, 411, 408, + 414, 406, 415, 412, 416, 407, 0, 411, 412, 417, + 413, 418, 416, 419, 409, 420, 0, 417, 414, 418, + 421, 419, 0, 420, 418, 422, 0, 423, 421, 415, + 417, 0, 0, 422, 419, 423, 0, 0, 420, 0, + 0, 0, 0, 0, 0, 0, 421, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 422, 425, 425, 425, 425, 425, 425, 425, 425, 425, 426, 426, - 426, 426, 426, 426, 426, 426, 426, 427, 345, 427, - 428, 428, 428, 429, 429, 342, 429, 430, 430, 430, - 430, 340, 430, 430, 430, 430, 431, 431, 431, 431, - 431, 431, 431, 431, 431, 432, 432, 432, 338, 432, - 432, 432, 432, 432, 433, 334, 433, 433, 433, 433, - 433, 433, 433, 434, 332, 330, 329, 327, 323, 434, - 435, 322, 435, 321, 320, 319, 311, 308, 299, 295, - - 293, 291, 290, 289, 281, 276, 272, 270, 269, 259, - 248, 244, 243, 241, 240, 239, 238, 236, 230, 227, - 225, 222, 210, 208, 202, 190, 188, 186, 176, 161, - 157, 137, 136, 134, 133, 132, 130, 104, 103, 96, - 94, 77, 71, 70, 65, 63, 58, 56, 38, 30, - 26, 24, 23, 9, 8, 7, 6, 5, 4, 3, - 423, 423, 423, 423, 423, 423, 423, 423, 423, 423, - 423, 423, 423, 423, 423, 423, 423, 423, 423, 423, - 423, 423, 423, 423, 423, 423, 423, 423, 423, 423, - 423, 423, 423, 423, 423, 423, 423, 423, 423, 423, - - 423, 423, 423, 423, 423, 423, 423, 423, 423, 423, - 423, 423, 423, 423, 423, 423 + 426, 426, 426, 426, 426, 426, 426, 427, 427, 427, + 427, 427, 427, 427, 427, 427, 428, 0, 428, 429, + + 429, 429, 430, 430, 0, 430, 431, 431, 431, 431, + 0, 431, 431, 431, 431, 432, 432, 432, 432, 432, + 432, 432, 432, 432, 433, 433, 433, 0, 433, 433, + 433, 433, 433, 434, 0, 434, 434, 434, 434, 434, + 434, 434, 435, 0, 0, 0, 0, 0, 435, 436, + 436, 0, 436, 437, 0, 437, 424, 424, 424, 424, + 424, 424, 424, 424, 424, 424, 424, 
424, 424, 424, + 424, 424, 424, 424, 424, 424, 424, 424, 424, 424, + 424, 424, 424, 424, 424, 424, 424, 424, 424, 424, + 424, 424, 424, 424, 424, 424, 424, 424, 424, 424, + + 424, 424, 424, 424, 424, 424, 424, 424, 424, 424, + 424, 424, 424 } ; static yy_state_type yy_last_accepting_state; @@ -847,6 +966,7 @@ Created 12/14/1997 Heikki Tuuri #define realloc(P, A) ut_realloc(P, A) #define exit(A) ut_error +/* Note: We cast &result to int* from yysize_t* */ #define YY_INPUT(buf, result, max_size) \ (result = pars_get_lex_chars(buf, max_size)) @@ -883,7 +1003,7 @@ string_append( -#line 887 "lexyy.cc" +#line 1006 "lexyy.cc" #define INITIAL 0 #define comment 1 @@ -965,7 +1085,12 @@ static int input (void ); /* Amount of stuff to slurp up with each read. */ #ifndef YY_READ_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k */ +#define YY_READ_BUF_SIZE 16384 +#else #define YY_READ_BUF_SIZE 8192 +#endif /* __ia64__ */ #endif /* Copy whatever the last rule matched to the standard output. */ @@ -973,7 +1098,7 @@ static int input (void ); /* This used to be an fputs(), but since the string might contain NUL's, * we now use fwrite(). */ -#define ECHO fwrite( yytext, yyleng, 1, yyout ) +#define ECHO do { if (fwrite( yytext, yyleng, 1, yyout )) {} } while (0) #endif /* Gets input and stuffs it into "buf". 
number of characters read, or YY_NULL, @@ -984,7 +1109,7 @@ static int input (void ); if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \ { \ int c = '*'; \ - yy_size_t n; \ + size_t n; \ for ( n = 0; n < max_size && \ (c = getc( yyin )) != EOF && c != '\n'; ++n ) \ buf[n] = (char) c; \ @@ -1069,7 +1194,7 @@ YY_DECL #line 112 "pars0lex.l" -#line 1073 "lexyy.cc" +#line 1197 "lexyy.cc" if ( !(yy_init) ) { @@ -1122,13 +1247,13 @@ yy_match: while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) { yy_current_state = (int) yy_def[yy_current_state]; - if ( yy_current_state >= 424 ) + if ( yy_current_state >= 425 ) yy_c = yy_meta[(unsigned int) yy_c]; } yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; ++yy_cp; } - while ( yy_current_state != 423 ); + while ( yy_current_state != 424 ); yy_cp = (yy_last_accepting_cpos); yy_current_state = (yy_last_accepting_state); @@ -2109,7 +2234,7 @@ YY_RULE_SETUP #line 691 "pars0lex.l" YY_FATAL_ERROR( "flex scanner jammed" ); YY_BREAK -#line 2113 "lexyy.cc" +#line 2237 "lexyy.cc" case YY_STATE_EOF(INITIAL): case YY_STATE_EOF(comment): case YY_STATE_EOF(quoted): @@ -2299,7 +2424,7 @@ static int yy_get_next_buffer (void) else { - yy_size_t num_to_read = + int num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1; while ( num_to_read <= 0 ) @@ -2313,16 +2438,16 @@ static int yy_get_next_buffer (void) if ( b->yy_is_our_buffer ) { - yy_size_t new_size = b->yy_buf_size * 2; + int new_size = b->yy_buf_size * 2; if ( new_size <= 0 ) b->yy_buf_size += b->yy_buf_size / 8; else b->yy_buf_size *= 2; - b->yy_ch_buf = (char*) + b->yy_ch_buf = (char *) /* Include room in for 2 EOB chars. */ - yyrealloc((void*) b->yy_ch_buf,b->yy_buf_size + 2 ); + yyrealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2 ); } else /* Can't grow it, we don't own it. */ @@ -2344,7 +2469,7 @@ static int yy_get_next_buffer (void) /* Read in more data. 
*/ YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]), - (yy_n_chars), num_to_read ); + (yy_n_chars), (size_t) num_to_read ); YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); } @@ -2371,7 +2496,7 @@ static int yy_get_next_buffer (void) if ((yy_size_t) ((yy_n_chars) + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) { /* Extend the array by 50%, plus the number we really need. */ yy_size_t new_size = (yy_n_chars) + number_to_move + ((yy_n_chars) >> 1); - YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char*) yyrealloc((void*) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size ); + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) yyrealloc((void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size ); if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" ); } @@ -2387,7 +2512,7 @@ static int yy_get_next_buffer (void) /* yy_get_previous_state - get the state just before the EOB char was reached */ - static yy_state_type yy_get_previous_state (void) + yy_state_type yy_get_previous_state (void) { register yy_state_type yy_current_state; register char *yy_cp; @@ -2405,7 +2530,7 @@ static int yy_get_next_buffer (void) while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) { yy_current_state = (int) yy_def[yy_current_state]; - if ( yy_current_state >= 424 ) + if ( yy_current_state >= 425 ) yy_c = yy_meta[(unsigned int) yy_c]; } yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; @@ -2419,7 +2544,7 @@ static int yy_get_next_buffer (void) * synopsis * next_state = yy_try_NUL_trans( current_state ); */ - static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state ) + static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state ) { register int yy_is_jam; register char *yy_cp = (yy_c_buf_p); @@ -2433,11 +2558,11 @@ static int yy_get_next_buffer (void) while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) { yy_current_state = (int) 
yy_def[yy_current_state]; - if ( yy_current_state >= 424 ) + if ( yy_current_state >= 425 ) yy_c = yy_meta[(unsigned int) yy_c]; } yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; - yy_is_jam = (yy_current_state == 423); + yy_is_jam = (yy_current_state == 424); return yy_is_jam ? 0 : yy_current_state; } @@ -2466,7 +2591,7 @@ static int yy_get_next_buffer (void) else { /* need more input */ - yy_size_t offset = (yy_c_buf_p) - (yytext_ptr); + int offset = (int)((yy_c_buf_p) - (yytext_ptr)); ++(yy_c_buf_p); switch ( yy_get_next_buffer( ) ) @@ -2490,7 +2615,7 @@ static int yy_get_next_buffer (void) case EOB_ACT_END_OF_FILE: { if ( yywrap( ) ) - return 0; + return EOF; if ( ! (yy_did_buffer_switch_on_eof) ) YY_NEW_FILE; @@ -2508,7 +2633,7 @@ static int yy_get_next_buffer (void) } } - c = *(unsigned char*) (yy_c_buf_p); /* cast for 8-bit char's */ + c = *(unsigned char *) (yy_c_buf_p); /* cast for 8-bit char's */ *(yy_c_buf_p) = '\0'; /* preserve yytext */ (yy_hold_char) = *++(yy_c_buf_p); @@ -2518,7 +2643,7 @@ static int yy_get_next_buffer (void) /** Immediately switch to a different input stream. * @param input_file A readable stream. - * + * * @note This function does not reset the start condition to @c INITIAL . */ void yyrestart (FILE * input_file ) @@ -2536,7 +2661,7 @@ static int yy_get_next_buffer (void) /** Switch to a different input buffer. * @param new_buffer The new input buffer. - * + * */ __attribute__((unused)) static void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ) { @@ -2580,7 +2705,7 @@ static void yy_load_buffer_state (void) /** Allocate and initialize an input buffer state. * @param file A readable stream. * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE. - * + * * @return the allocated buffer state. 
*/ static YY_BUFFER_STATE yy_create_buffer (FILE * file, int size ) @@ -2596,7 +2721,7 @@ static void yy_load_buffer_state (void) /* yy_ch_buf has to be 2 characters longer than the size given because * we need to put in 2 end-of-buffer characters. */ - b->yy_ch_buf = (char*) yyalloc(b->yy_buf_size + 2 ); + b->yy_ch_buf = (char *) yyalloc(b->yy_buf_size + 2 ); if ( ! b->yy_ch_buf ) YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); @@ -2609,9 +2734,9 @@ static void yy_load_buffer_state (void) /** Destroy the buffer. * @param b a buffer created with yy_create_buffer() - * + * */ - void yy_delete_buffer (YY_BUFFER_STATE b ) + void yy_delete_buffer (YY_BUFFER_STATE b ) { if ( ! b ) @@ -2621,20 +2746,20 @@ static void yy_load_buffer_state (void) YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; if ( b->yy_is_our_buffer ) - yyfree((void*) b->yy_ch_buf ); + yyfree((void *) b->yy_ch_buf ); - yyfree((void*) b ); + yyfree((void *) b ); } /* Initializes or reinitializes a buffer. * This function is sometimes called more than once on the same buffer, * such as during a yyrestart() or at EOF. */ - static void yy_init_buffer (YY_BUFFER_STATE b, FILE * file ) + static void yy_init_buffer (YY_BUFFER_STATE b, FILE * file ) { int oerrno = errno; - + yy_flush_buffer(b ); b->yy_input_file = file; @@ -2650,13 +2775,13 @@ static void yy_load_buffer_state (void) } b->yy_is_interactive = 0; - + errno = oerrno; } /** Discard all buffered characters. On the next scan, YY_INPUT will be called. * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER. - * + * */ void yy_flush_buffer (YY_BUFFER_STATE b ) { @@ -2685,7 +2810,7 @@ static void yy_load_buffer_state (void) * the current state. This function will allocate the stack * if necessary. * @param new_buffer The new state. 
- * + * */ void yypush_buffer_state (YY_BUFFER_STATE new_buffer ) { @@ -2715,7 +2840,7 @@ void yypush_buffer_state (YY_BUFFER_STATE new_buffer ) /** Removes and deletes the top of the stack, if present. * The next element becomes the new top. - * + * */ void yypop_buffer_state (void) { @@ -2738,8 +2863,8 @@ void yypop_buffer_state (void) */ static void yyensure_buffer_stack (void) { - yy_size_t num_to_alloc; - + int num_to_alloc; + if (!(yy_buffer_stack)) { /* First allocation is just for 2 elements, since we don't know if this @@ -2747,7 +2872,7 @@ static void yyensure_buffer_stack (void) * immediate realloc on the next call. */ num_to_alloc = 1; - (yy_buffer_stack) = (struct yy_buffer_state**) yyalloc + (yy_buffer_stack) = (struct yy_buffer_state**)yyalloc (num_to_alloc * sizeof(struct yy_buffer_state*) ); if ( ! (yy_buffer_stack) ) @@ -2766,7 +2891,7 @@ static void yyensure_buffer_stack (void) int grow_size = 8 /* arbitrary grow size */; num_to_alloc = (yy_buffer_stack_max) + grow_size; - (yy_buffer_stack) = (struct yy_buffer_state**) yyrealloc + (yy_buffer_stack) = (struct yy_buffer_state**)yyrealloc ((yy_buffer_stack), num_to_alloc * sizeof(struct yy_buffer_state*) ); @@ -2809,7 +2934,7 @@ static void yy_fatal_error (yyconst char* msg ) /* Accessor methods (get/set functions) to struct members. */ /** Get the current line number. - * + * */ int yyget_lineno (void) { @@ -2818,7 +2943,7 @@ int yyget_lineno (void) } /** Get the input stream. - * + * */ FILE *yyget_in (void) { @@ -2826,7 +2951,7 @@ FILE *yyget_in (void) } /** Get the output stream. - * + * */ FILE *yyget_out (void) { @@ -2834,7 +2959,7 @@ FILE *yyget_out (void) } /** Get the length of the current token. - * + * */ yy_size_t yyget_leng (void) { @@ -2842,7 +2967,7 @@ yy_size_t yyget_leng (void) } /** Get the current token. - * + * */ char *yyget_text (void) @@ -2852,18 +2977,18 @@ char *yyget_text (void) /** Set the current line number. 
* @param line_number - * + * */ void yyset_lineno (int line_number ) { - + yylineno = line_number; } /** Set the input stream. This does not discard the current * input buffer. * @param in_str A readable stream. - * + * * @see yy_switch_to_buffer */ void yyset_in (FILE * in_str ) @@ -2895,7 +3020,7 @@ static int yy_init_globals (void) (yy_buffer_stack) = 0; (yy_buffer_stack_top) = 0; (yy_buffer_stack_max) = 0; - (yy_c_buf_p) = (char*) 0; + (yy_c_buf_p) = (char *) 0; (yy_init) = 0; (yy_start) = 0; @@ -2904,8 +3029,8 @@ static int yy_init_globals (void) yyin = stdin; yyout = stdout; #else - yyin = (FILE*) 0; - yyout = (FILE*) 0; + yyin = (FILE *) 0; + yyout = (FILE *) 0; #endif /* For future reference: Set errno on error, since we are called by @@ -2917,7 +3042,7 @@ static int yy_init_globals (void) /* yylex_destroy is for both reentrant and non-reentrant scanners. */ __attribute__((unused)) static int yylex_destroy (void) { - + /* Pop the buffer stack, destroying each element. */ while(YY_CURRENT_BUFFER){ yy_delete_buffer(YY_CURRENT_BUFFER ); @@ -2962,24 +3087,24 @@ static int yy_flex_strlen (yyconst char * s ) void *yyalloc (yy_size_t size ) { - return (void*) malloc( size ); + return (void *) malloc( size ); } void *yyrealloc (void * ptr, yy_size_t size ) { - /* The cast to (char*) in the following accommodates both + /* The cast to (char *) in the following accommodates both * implementations that use char* generic pointers, and those * that use void* generic pointers. It works with the latter * because both ANSI C and C++ allow castless assignment from * any pointer type to void*, and deal with argument conversions * as though doing an assignment. 
*/ - return (void*) realloc( (char*) ptr, size ); + return (void *) realloc( (char *) ptr, size ); } void yyfree (void * ptr ) { - free( (char*) ptr ); /* see yyrealloc() for (char*) cast */ + free( (char*) ptr ); /* see yyrealloc() for (char *) cast */ } #define YYTABLES_NAME "yytables" diff --git a/storage/innobase/pars/pars0lex.l b/storage/innobase/pars/pars0lex.l index 2446e40cde8..83c3af4b6c5 100644 --- a/storage/innobase/pars/pars0lex.l +++ b/storage/innobase/pars/pars0lex.l @@ -102,7 +102,7 @@ string_append( DIGIT [0-9] ID [a-z_A-Z][a-z_A-Z0-9]* -TABLE_NAME [a-z_A-Z][a-z_A-Z0-9]*\/(#sql-|[a-z_A-Z])[a-z_A-Z0-9]* +TABLE_NAME [a-z_A-Z][@a-z_A-Z0-9]*\/(#sql-|[a-z_A-Z])[a-z_A-Z0-9]* BOUND_LIT \:[a-z_A-Z0-9]+ BOUND_ID \$[a-z_A-Z0-9]+ diff --git a/storage/innobase/pars/pars0opt.cc b/storage/innobase/pars/pars0opt.cc index e5f347eedd6..cbed2b39eeb 100644 --- a/storage/innobase/pars/pars0opt.cc +++ b/storage/innobase/pars/pars0opt.cc @@ -345,7 +345,7 @@ opt_calc_index_goodness( /* At least for now we don't support using FTS indexes for queries done through InnoDB's own SQL parser. */ - if (index->type == DICT_FTS) { + if (dict_index_is_online_ddl(index) || (index->type & DICT_FTS)) { return(0); } @@ -400,7 +400,7 @@ opt_calc_index_goodness( } } - /* We have to test for goodness here, as last_op may note be set */ + /* We have to test for goodness here, as last_op may not be set */ if (goodness && dict_index_is_clust(index)) { goodness++; diff --git a/storage/innobase/pars/pars0pars.cc b/storage/innobase/pars/pars0pars.cc index a4ab85adc36..f82610e62d0 100644 --- a/storage/innobase/pars/pars0pars.cc +++ b/storage/innobase/pars/pars0pars.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -859,7 +859,8 @@ pars_retrieve_table_def( sym_node->resolved = TRUE; sym_node->token_type = SYM_TABLE_REF_COUNTED; - sym_node->table = dict_table_open_on_name(sym_node->name, TRUE); + sym_node->table = dict_table_open_on_name( + sym_node->name, TRUE, FALSE, DICT_ERR_IGNORE_NONE); ut_a(sym_node->table != NULL); } @@ -1115,8 +1116,8 @@ pars_function_declaration( sym_node->token_type = SYM_FUNCTION; /* Check that the function exists. */ - ut_a(pars_info_get_user_func(pars_sym_tab_global->info, - sym_node->name)); + ut_a(pars_info_lookup_user_func( + pars_sym_tab_global->info, sym_node->name)); return(sym_node); } @@ -1782,8 +1783,9 @@ pars_fetch_statement( } else { pars_resolve_exp_variables_and_types(NULL, user_func); - node->func = pars_info_get_user_func(pars_sym_tab_global->info, - user_func->name); + node->func = pars_info_lookup_user_func( + pars_sym_tab_global->info, user_func->name); + ut_a(node->func); node->into_list = NULL; @@ -1941,9 +1943,23 @@ pars_create_table( const dtype_t* dtype; ulint n_cols; ulint flags = 0; + ulint flags2 = 0; if (compact != NULL) { + + /* System tables currently only use the REDUNDANT row + format therefore the check for srv_file_per_table should be + safe for now. */ + flags |= DICT_TF_COMPACT; + + /* FIXME: Ideally this should be part of the SQL syntax + or use some other mechanism. We want to reduce dependency + on global variables. There is an inherent race here but + that has always existed around this variable. */ + if (srv_file_per_table) { + flags2 |= DICT_TF2_USE_TABLESPACE; + } } if (block_size != NULL) { @@ -1974,10 +1990,8 @@ pars_create_table( n_cols = que_node_list_get_len(column_defs); - /* As the InnoDB SQL parser is for internal use only, - for creating some system tables, this function will only - create tables in the old (not compact) record format. 
*/ - table = dict_mem_table_create(table_sym->name, 0, n_cols, flags, 0); + table = dict_mem_table_create( + table_sym->name, 0, n_cols, flags, flags2); #ifdef UNIV_DEBUG if (not_fit_in_memory != NULL) { @@ -1998,7 +2012,7 @@ pars_create_table( column = static_cast<sym_node_t*>(que_node_get_next(column)); } - node = tab_create_graph_create(table, pars_sym_tab_global->heap); + node = tab_create_graph_create(table, pars_sym_tab_global->heap, true); table_sym->resolved = TRUE; table_sym->token_type = SYM_TABLE; @@ -2052,7 +2066,7 @@ pars_create_index( column = static_cast<sym_node_t*>(que_node_get_next(column)); } - node = ind_create_graph_create(index, pars_sym_tab_global->heap); + node = ind_create_graph_create(index, pars_sym_tab_global->heap, true); table_sym->resolved = TRUE; table_sym->token_type = SYM_TABLE; @@ -2251,7 +2265,7 @@ que_thr_t* pars_complete_graph_for_exec( /*=========================*/ que_node_t* node, /*!< in: root node for an incomplete - query graph */ + query graph, or NULL for dummy graph */ trx_t* trx, /*!< in: transaction handle */ mem_heap_t* heap) /*!< in: memory heap from which allocated */ { @@ -2265,7 +2279,9 @@ pars_complete_graph_for_exec( thr->child = node; - que_node_set_parent(node, thr); + if (node) { + que_node_set_parent(node, thr); + } trx->graph = NULL; @@ -2478,7 +2494,7 @@ pars_info_bind_int8_literal( const char* name, /* in: name */ const ib_uint64_t* val) /* in: value */ { - pars_bound_lit_t* pbl; + pars_bound_lit_t* pbl; pbl = pars_info_lookup_bound_lit(info, name); @@ -2519,6 +2535,33 @@ pars_info_add_ull_literal( } /****************************************************************//** +If the literal value already exists then it rebinds otherwise it +creates a new entry. 
*/ +UNIV_INTERN +void +pars_info_bind_ull_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const ib_uint64_t* val) /*!< in: value */ +{ + pars_bound_lit_t* pbl; + + pbl = pars_info_lookup_bound_lit(info, name); + + if (!pbl) { + pars_info_add_literal( + info, name, val, sizeof(*val), DATA_FIXBINARY, 0); + } else { + + pbl->address = val; + pbl->length = sizeof(*val); + + sym_tab_rebind_lit(pbl->node, val, sizeof(*val)); + } +} + +/****************************************************************//** Add user function. */ UNIV_INTERN void @@ -2605,19 +2648,6 @@ pars_info_get_bound_id( } /****************************************************************//** -Get user function with the given name. -@return user func, or NULL if not found */ -UNIV_INTERN -pars_user_func_t* -pars_info_get_user_func( -/*====================*/ - pars_info_t* info, /*!< in: info struct */ - const char* name) /*!< in: function name to find*/ -{ - return(pars_info_lookup_user_func(info, name)); -} - -/****************************************************************//** Get bound literal with the given name. @return bound literal, or NULL if not found */ UNIV_INTERN diff --git a/storage/innobase/pars/pars0sym.cc b/storage/innobase/pars/pars0sym.cc index c71ad8a6b39..b01a69cb33a 100644 --- a/storage/innobase/pars/pars0sym.cc +++ b/storage/innobase/pars/pars0sym.cc @@ -84,7 +84,7 @@ sym_tab_free_private( if (sym->token_type == SYM_TABLE_REF_COUNTED) { - dict_table_close(sym->table, TRUE); + dict_table_close(sym->table, TRUE, FALSE); sym->table = NULL; sym->resolved = FALSE; diff --git a/storage/innobase/que/que0que.cc b/storage/innobase/que/que0que.cc index c023723685c..fb185959d56 100644 --- a/storage/innobase/que/que0que.cc +++ b/storage/innobase/que/que0que.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. 
All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1248,7 +1248,7 @@ loop: Evaluate the given SQL. @return error code or DB_SUCCESS */ UNIV_INTERN -enum db_err +dberr_t que_eval_sql( /*=========*/ pars_info_t* info, /*!< in: info struct, or NULL */ diff --git a/storage/innobase/read/read0read.cc b/storage/innobase/read/read0read.cc index 02d78d657c6..14dc9ee5e7f 100644 --- a/storage/innobase/read/read0read.cc +++ b/storage/innobase/read/read0read.cc @@ -174,59 +174,6 @@ The order does not matter. No new transactions can be created and no running transaction can commit or rollback (or free views). */ -#ifdef UNIV_DEBUG -/*********************************************************************//** -Validates a read view object. */ -static -ibool -read_view_validate( -/*===============*/ - const read_view_t* view) /*!< in: view to validate */ -{ - ulint i; - - ut_ad(mutex_own(&trx_sys->mutex)); - - /* Check that the view->trx_ids array is in descending order. */ - for (i = 1; i < view->n_trx_ids; ++i) { - - ut_a(view->trx_ids[i] < view->trx_ids[i - 1]); - } - - return(TRUE); -} - -/** Functor to validate the view list. */ -struct Check { - - Check() : m_prev_view(0) { } - - void operator()(const read_view_t* view) - { - ut_a(m_prev_view == NULL - || m_prev_view->low_limit_no >= view->low_limit_no); - - m_prev_view = view; - } - - const read_view_t* m_prev_view; -}; - -/*********************************************************************//** -Validates a read view list. 
*/ -static -ibool -read_view_list_validate(void) -/*=========================*/ -{ - ut_ad(mutex_own(&trx_sys->mutex)); - - ut_list_map(trx_sys->view_list, &read_view_t::view_list, Check()); - - return(TRUE); -} -#endif - /*********************************************************************//** Creates a read view object. @return own: read view struct */ @@ -530,25 +477,6 @@ read_view_purge_open( } /*********************************************************************//** -Remove a read view from the trx_sys->view_list. */ -UNIV_INTERN -void -read_view_remove( -/*=============*/ - read_view_t* view) /*!< in: read view */ -{ - mutex_enter(&trx_sys->mutex); - - ut_ad(read_view_validate(view)); - - UT_LIST_REMOVE(view_list, trx_sys->view_list, view); - - ut_ad(read_view_list_validate()); - - mutex_exit(&trx_sys->mutex); -} - -/*********************************************************************//** Closes a consistent read view for MySQL. This function is called at an SQL statement end if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */ UNIV_INTERN @@ -559,7 +487,7 @@ read_view_close_for_mysql( { ut_a(trx->global_read_view); - read_view_remove(trx->global_read_view); + read_view_remove(trx->global_read_view, false); mem_heap_empty(trx->global_read_view_heap); @@ -692,7 +620,7 @@ read_cursor_view_close_for_mysql( belong to this transaction */ trx->n_mysql_tables_in_use += curview->n_mysql_tables_in_use; - read_view_remove(curview->read_view); + read_view_remove(curview->read_view, false); trx->read_view = trx->global_read_view; diff --git a/storage/innobase/rem/rem0cmp.cc b/storage/innobase/rem/rem0cmp.cc index 19f5633953a..db0fdf3ee21 100644 --- a/storage/innobase/rem/rem0cmp.cc +++ b/storage/innobase/rem/rem0cmp.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1994, 2012, Oracle and/or its affiliates. 
All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -30,6 +30,7 @@ Created 7/1/1994 Heikki Tuuri #endif #include "ha_prototypes.h" +#include "handler0alter.h" #include "srv0srv.h" /* ALPHABETICAL ORDER @@ -69,10 +70,12 @@ cmp_debug_dtuple_rec_with_match( has an equal number or more fields than dtuple */ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ - ulint* matched_fields);/*!< in/out: number of already + ulint n_cmp, /*!< in: number of fields to compare */ + ulint* matched_fields)/*!< in/out: number of already completely matched fields; when function returns, contains the value for current comparison */ + __attribute__((nonnull, warn_unused_result)); #endif /* UNIV_DEBUG */ /*************************************************************//** This function is used to compare two data fields for which the data type @@ -621,14 +624,15 @@ respectively, when only the common first fields are compared, or until the first externally stored field in rec */ UNIV_INTERN int -cmp_dtuple_rec_with_match( -/*======================*/ +cmp_dtuple_rec_with_match_low( +/*==========================*/ const dtuple_t* dtuple, /*!< in: data tuple */ const rec_t* rec, /*!< in: physical record which differs from dtuple in some of the common fields, or which has an equal number or more fields than dtuple */ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n_cmp, /*!< in: number of fields to compare */ ulint* matched_fields, /*!< in/out: number of already completely matched fields; when function returns, contains the value for current comparison */ @@ -652,7 +656,7 @@ cmp_dtuple_rec_with_match( ulint cur_field; /* current field number */ ulint cur_bytes; /* number of already matched bytes in current field */ - int ret = 3333; /* return value */ + int ret; /* return value */ ut_ad(dtuple && rec && matched_fields && 
matched_bytes); ut_ad(dtuple_check_typed(dtuple)); @@ -661,7 +665,9 @@ cmp_dtuple_rec_with_match( cur_field = *matched_fields; cur_bytes = *matched_bytes; - ut_ad(cur_field <= dtuple_get_n_fields_cmp(dtuple)); + ut_ad(n_cmp > 0); + ut_ad(n_cmp <= dtuple_get_n_fields(dtuple)); + ut_ad(cur_field <= n_cmp); ut_ad(cur_field <= rec_offs_n_fields(offsets)); if (cur_bytes == 0 && cur_field == 0) { @@ -681,7 +687,7 @@ cmp_dtuple_rec_with_match( /* Match fields in a loop; stop if we run out of fields in dtuple or find an externally stored field */ - while (cur_field < dtuple_get_n_fields_cmp(dtuple)) { + while (cur_field < n_cmp) { ulint mtype; ulint prtype; @@ -838,7 +844,7 @@ next_field: order_resolved: ut_ad((ret >= - 1) && (ret <= 1)); ut_ad(ret == cmp_debug_dtuple_rec_with_match(dtuple, rec, offsets, - matched_fields)); + n_cmp, matched_fields)); ut_ad(*matched_fields == cur_field); /* In the debug version, the above cmp_debug_... sets *matched_fields to a value */ @@ -909,156 +915,181 @@ cmp_dtuple_is_prefix_of_rec( } /*************************************************************//** -Compare two physical records that contain the same number of columns, -none of which are stored externally. -@return 1, 0, -1 if rec1 is greater, equal, less, respectively, than rec2 */ -UNIV_INTERN +Compare two physical record fields. +@retval 1 if rec1 field is greater than rec2 +@retval -1 if rec1 field is less than rec2 +@retval 0 if rec1 field equals to rec2 */ +static __attribute__((nonnull, warn_unused_result)) int -cmp_rec_rec_simple( -/*===============*/ +cmp_rec_rec_simple_field( +/*=====================*/ const rec_t* rec1, /*!< in: physical record */ const rec_t* rec2, /*!< in: physical record */ const ulint* offsets1,/*!< in: rec_get_offsets(rec1, ...) */ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, ...) 
*/ const dict_index_t* index, /*!< in: data dictionary index */ - ibool* null_eq)/*!< out: set to TRUE if - found matching null values */ + ulint n) /*!< in: field to compare */ { - ulint rec1_f_len; /*!< length of current field in rec1 */ - const byte* rec1_b_ptr; /*!< pointer to the current byte - in rec1 field */ - ulint rec1_byte; /*!< value of current byte to be - compared in rec1 */ - ulint rec2_f_len; /*!< length of current field in rec2 */ - const byte* rec2_b_ptr; /*!< pointer to the current byte - in rec2 field */ - ulint rec2_byte; /*!< value of current byte to be - compared in rec2 */ - ulint cur_field; /*!< current field number */ - ulint n_uniq; - - n_uniq = dict_index_get_n_unique(index); - ut_ad(rec_offs_n_fields(offsets1) >= n_uniq); - ut_ad(rec_offs_n_fields(offsets2) >= n_uniq); - - ut_ad(rec_offs_comp(offsets1) == rec_offs_comp(offsets2)); + const byte* rec1_b_ptr; + const byte* rec2_b_ptr; + ulint rec1_f_len; + ulint rec2_f_len; + const dict_col_t* col = dict_index_get_nth_col(index, n); - for (cur_field = 0; cur_field < n_uniq; cur_field++) { + ut_ad(!rec_offs_nth_extern(offsets1, n)); + ut_ad(!rec_offs_nth_extern(offsets2, n)); - ulint cur_bytes; - ulint mtype; - ulint prtype; - - { - const dict_col_t* col - = dict_index_get_nth_col(index, cur_field); + rec1_b_ptr = rec_get_nth_field(rec1, offsets1, n, &rec1_f_len); + rec2_b_ptr = rec_get_nth_field(rec2, offsets2, n, &rec2_f_len); - mtype = col->mtype; - prtype = col->prtype; + if (rec1_f_len == UNIV_SQL_NULL || rec2_f_len == UNIV_SQL_NULL) { + if (rec1_f_len == rec2_f_len) { + return(0); } + /* We define the SQL null to be the smallest possible + value of a field in the alphabetical order */ + return(rec1_f_len == UNIV_SQL_NULL ? 
-1 : 1); + } - ut_ad(!rec_offs_nth_extern(offsets1, cur_field)); - ut_ad(!rec_offs_nth_extern(offsets2, cur_field)); - - rec1_b_ptr = rec_get_nth_field(rec1, offsets1, - cur_field, &rec1_f_len); - rec2_b_ptr = rec_get_nth_field(rec2, offsets2, - cur_field, &rec2_f_len); + if (col->mtype >= DATA_FLOAT + || (col->mtype == DATA_BLOB + && !(col->prtype & DATA_BINARY_TYPE) + && dtype_get_charset_coll(col->prtype) + != DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) { + return(cmp_whole_field(col->mtype, col->prtype, + rec1_b_ptr, (unsigned) rec1_f_len, + rec2_b_ptr, (unsigned) rec2_f_len)); + } - if (rec1_f_len == UNIV_SQL_NULL - || rec2_f_len == UNIV_SQL_NULL) { + /* Compare the fields */ + for (ulint cur_bytes = 0;; cur_bytes++, rec1_b_ptr++, rec2_b_ptr++) { + ulint rec1_byte; + ulint rec2_byte; - if (rec1_f_len == rec2_f_len) { - if (null_eq) { - *null_eq = TRUE; - } + if (rec2_f_len <= cur_bytes) { + if (rec1_f_len <= cur_bytes) { + return(0); + } - goto next_field; + rec2_byte = dtype_get_pad_char( + col->mtype, col->prtype); - } else if (rec2_f_len == UNIV_SQL_NULL) { + if (rec2_byte == ULINT_UNDEFINED) { + return(1); + } + } else { + rec2_byte = *rec2_b_ptr; + } - /* We define the SQL null to be the - smallest possible value of a field - in the alphabetical order */ + if (rec1_f_len <= cur_bytes) { + rec1_byte = dtype_get_pad_char( + col->mtype, col->prtype); - return(1); - } else { + if (rec1_byte == ULINT_UNDEFINED) { return(-1); } + } else { + rec1_byte = *rec1_b_ptr; } - if (mtype >= DATA_FLOAT - || (mtype == DATA_BLOB - && 0 == (prtype & DATA_BINARY_TYPE) - && dtype_get_charset_coll(prtype) - != DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) { - int ret = cmp_whole_field(mtype, prtype, - rec1_b_ptr, - (unsigned) rec1_f_len, - rec2_b_ptr, - (unsigned) rec2_f_len); - if (ret) { - return(ret); - } + if (rec1_byte == rec2_byte) { + /* If the bytes are equal, they will remain such + even after the collation transformation below */ + continue; + } - goto next_field; + if 
(col->mtype <= DATA_CHAR + || (col->mtype == DATA_BLOB + && !(col->prtype & DATA_BINARY_TYPE))) { + + rec1_byte = cmp_collate(rec1_byte); + rec2_byte = cmp_collate(rec2_byte); } - /* Compare the fields */ - for (cur_bytes = 0;; cur_bytes++, rec1_b_ptr++, rec2_b_ptr++) { - if (rec2_f_len <= cur_bytes) { + if (rec1_byte < rec2_byte) { + return(-1); + } else if (rec1_byte > rec2_byte) { + return(1); + } + } +} - if (rec1_f_len <= cur_bytes) { +/*************************************************************//** +Compare two physical records that contain the same number of columns, +none of which are stored externally. +@retval 1 if rec1 (including non-ordering columns) is greater than rec2 +@retval -1 if rec1 (including non-ordering columns) is less than rec2 +@retval 0 if rec1 is a duplicate of rec2 */ +UNIV_INTERN +int +cmp_rec_rec_simple( +/*===============*/ + const rec_t* rec1, /*!< in: physical record */ + const rec_t* rec2, /*!< in: physical record */ + const ulint* offsets1,/*!< in: rec_get_offsets(rec1, ...) */ + const ulint* offsets2,/*!< in: rec_get_offsets(rec2, ...) 
*/ + const dict_index_t* index, /*!< in: data dictionary index */ + struct TABLE* table) /*!< in: MySQL table, for reporting + duplicate key value if applicable, + or NULL */ +{ + ulint n; + ulint n_uniq = dict_index_get_n_unique(index); + bool null_eq = false; - goto next_field; - } + ut_ad(rec_offs_n_fields(offsets1) >= n_uniq); + ut_ad(rec_offs_n_fields(offsets2) == rec_offs_n_fields(offsets2)); - rec2_byte = dtype_get_pad_char(mtype, prtype); + ut_ad(rec_offs_comp(offsets1) == rec_offs_comp(offsets2)); - if (rec2_byte == ULINT_UNDEFINED) { - return(1); - } - } else { - rec2_byte = *rec2_b_ptr; - } + for (n = 0; n < n_uniq; n++) { + int cmp = cmp_rec_rec_simple_field( + rec1, rec2, offsets1, offsets2, index, n); - if (rec1_f_len <= cur_bytes) { - rec1_byte = dtype_get_pad_char(mtype, prtype); + if (cmp) { + return(cmp); + } - if (rec1_byte == ULINT_UNDEFINED) { - return(-1); - } - } else { - rec1_byte = *rec1_b_ptr; - } + /* If the fields are internally equal, they must both + be NULL or non-NULL. */ + ut_ad(rec_offs_nth_sql_null(offsets1, n) + == rec_offs_nth_sql_null(offsets2, n)); - if (rec1_byte == rec2_byte) { - /* If the bytes are equal, they will remain - such even after the collation transformation - below */ + if (rec_offs_nth_sql_null(offsets1, n)) { + ut_ad(!(dict_index_get_nth_col(index, n)->prtype + & DATA_NOT_NULL)); + null_eq = true; + } + } - continue; - } + /* If we ran out of fields, the ordering columns of rec1 were + equal to rec2. Issue a duplicate key error if needed. */ - if (mtype <= DATA_CHAR - || (mtype == DATA_BLOB - && !(prtype & DATA_BINARY_TYPE))) { + if (!null_eq && table && dict_index_is_unique(index)) { + /* Report erroneous row using new version of table. */ + innobase_rec_to_mysql(table, rec1, index, offsets1); + return(0); + } - rec1_byte = cmp_collate(rec1_byte); - rec2_byte = cmp_collate(rec2_byte); - } + /* Else, keep comparing so that we have the full internal + order. 
*/ + for (; n < dict_index_get_n_fields(index); n++) { + int cmp = cmp_rec_rec_simple_field( + rec1, rec2, offsets1, offsets2, index, n); - if (rec1_byte < rec2_byte) { - return(-1); - } else if (rec1_byte > rec2_byte) { - return(1); - } + if (cmp) { + return(cmp); } -next_field: - continue; + + /* If the fields are internally equal, they must both + be NULL or non-NULL. */ + ut_ad(rec_offs_nth_sql_null(offsets1, n) + == rec_offs_nth_sql_null(offsets2, n)); } - /* If we ran out of fields, rec1 was equal to rec2. */ + /* This should never be reached. Internally, an index must + never contain duplicate entries. */ + ut_ad(0); return(0); } @@ -1327,6 +1358,7 @@ cmp_debug_dtuple_rec_with_match( has an equal number or more fields than dtuple */ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n_cmp, /*!< in: number of fields to compare */ ulint* matched_fields) /*!< in/out: number of already completely matched fields; when function returns, contains the value for current @@ -1339,14 +1371,16 @@ cmp_debug_dtuple_rec_with_match( field data */ ulint rec_f_len; /* length of current field in rec */ const byte* rec_f_data; /* pointer to the current rec field */ - int ret = 3333; /* return value */ + int ret; /* return value */ ulint cur_field; /* current field number */ ut_ad(dtuple && rec && matched_fields); ut_ad(dtuple_check_typed(dtuple)); ut_ad(rec_offs_validate(rec, NULL, offsets)); - ut_ad(*matched_fields <= dtuple_get_n_fields_cmp(dtuple)); + ut_ad(n_cmp > 0); + ut_ad(n_cmp <= dtuple_get_n_fields(dtuple)); + ut_ad(*matched_fields <= n_cmp); ut_ad(*matched_fields <= rec_offs_n_fields(offsets)); cur_field = *matched_fields; @@ -1372,7 +1406,7 @@ cmp_debug_dtuple_rec_with_match( /* Match fields in a loop; stop if we run out of fields in dtuple */ - while (cur_field < dtuple_get_n_fields_cmp(dtuple)) { + while (cur_field < n_cmp) { ulint mtype; ulint prtype; diff --git a/storage/innobase/rem/rem0rec.cc b/storage/innobase/rem/rem0rec.cc index 
5a864f122a3..3a5d2f579c3 100644 --- a/storage/innobase/rem/rem0rec.cc +++ b/storage/innobase/rem/rem0rec.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -29,6 +29,7 @@ Created 5/30/1994 Heikki Tuuri #include "rem0rec.ic" #endif +#include "page0page.h" #include "mtr0mtr.h" #include "mtr0log.h" #include "fts0fts.h" @@ -162,13 +163,12 @@ UNIV_INTERN ulint rec_get_n_extern_new( /*=================*/ - const rec_t* rec, /*!< in: compact physical record */ - dict_index_t* index, /*!< in: record descriptor */ - ulint n) /*!< in: number of columns to scan */ + const rec_t* rec, /*!< in: compact physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint n) /*!< in: number of columns to scan */ { const byte* nulls; const byte* lens; - dict_field_t* field; ulint null_mask; ulint n_extern; ulint i; @@ -189,10 +189,13 @@ rec_get_n_extern_new( /* read the lengths of fields 0..n */ do { - ulint len; + const dict_field_t* field + = dict_index_get_nth_field(index, i); + const dict_col_t* col + = dict_field_get_col(field); + ulint len; - field = dict_index_get_nth_field(index, i); - if (!(dict_field_get_col(field)->prtype & DATA_NOT_NULL)) { + if (!(col->prtype & DATA_NOT_NULL)) { /* nullable field => read the null flag */ if (UNIV_UNLIKELY(!(byte) null_mask)) { @@ -210,8 +213,6 @@ rec_get_n_extern_new( if (UNIV_UNLIKELY(!field->fixed_len)) { /* Variable-length field: read the length */ - const dict_col_t* col - = dict_field_get_col(field); len = *lens--; /* If the maximum length of the field is up to 255 bytes, the actual length is always @@ -240,16 +241,15 @@ rec_get_n_extern_new( Determine the 
offset to each field in a leaf-page record in ROW_FORMAT=COMPACT. This is a special case of rec_init_offsets() and rec_get_offsets_func(). */ -UNIV_INTERN +UNIV_INLINE __attribute__((nonnull)) void rec_init_offsets_comp_ordinary( /*===========================*/ const rec_t* rec, /*!< in: physical record in ROW_FORMAT=COMPACT */ - ulint extra, /*!< in: number of bytes to reserve - between the record header and - the data payload - (usually REC_N_NEW_EXTRA_BYTES) */ + bool temp, /*!< in: whether to use the + format for temporary files in + index creation */ const dict_index_t* index, /*!< in: record descriptor */ ulint* offsets)/*!< in/out: array of offsets; in: n=rec_offs_n_fields(offsets) */ @@ -257,28 +257,40 @@ rec_init_offsets_comp_ordinary( ulint i = 0; ulint offs = 0; ulint any_ext = 0; - const byte* nulls = rec - (extra + 1); - const byte* lens = nulls - - UT_BITS_IN_BYTES(index->n_nullable); - dict_field_t* field; + ulint n_null = index->n_nullable; + const byte* nulls = temp + ? rec - 1 + : rec - (1 + REC_N_NEW_EXTRA_BYTES); + const byte* lens = nulls - UT_BITS_IN_BYTES(n_null); ulint null_mask = 1; #ifdef UNIV_DEBUG - /* We cannot invoke rec_offs_make_valid() here, because it can hold - that extra != REC_N_NEW_EXTRA_BYTES. Similarly, rec_offs_validate() - will fail in that case, because it invokes rec_get_status(). */ + /* We cannot invoke rec_offs_make_valid() here if temp=true. + Similarly, rec_offs_validate() will fail in that case, because + it invokes rec_get_status(). */ offsets[2] = (ulint) rec; offsets[3] = (ulint) index; #endif /* UNIV_DEBUG */ + ut_ad(temp || dict_table_is_comp(index->table)); + + if (temp && dict_table_is_comp(index->table)) { + /* No need to do adjust fixed_len=0. We only need to + adjust it for ROW_FORMAT=REDUNDANT. 
*/ + temp = false; + } + /* read the lengths of fields 0..n */ do { - ulint len; + const dict_field_t* field + = dict_index_get_nth_field(index, i); + const dict_col_t* col + = dict_field_get_col(field); + ulint len; - field = dict_index_get_nth_field(index, i); - if (!(dict_field_get_col(field)->prtype - & DATA_NOT_NULL)) { + if (!(col->prtype & DATA_NOT_NULL)) { /* nullable field => read the null flag */ + ut_ad(n_null--); if (UNIV_UNLIKELY(!(byte) null_mask)) { nulls--; @@ -297,10 +309,9 @@ rec_init_offsets_comp_ordinary( null_mask <<= 1; } - if (UNIV_UNLIKELY(!field->fixed_len)) { + if (!field->fixed_len + || (temp && !dict_col_get_fixed_size(col, temp))) { /* Variable-length field: read the length */ - const dict_col_t* col - = dict_field_get_col(field); len = *lens--; /* If the maximum length of the field is up to 255 bytes, the actual length is always @@ -394,9 +405,8 @@ rec_init_offsets( = dict_index_get_n_unique_in_tree(index); break; case REC_STATUS_ORDINARY: - rec_init_offsets_comp_ordinary(rec, - REC_N_NEW_EXTRA_BYTES, - index, offsets); + rec_init_offsets_comp_ordinary( + rec, false, index, offsets); return; } @@ -774,34 +784,45 @@ rec_get_nth_field_offs_old( /**********************************************************//** Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT. 
@return total size */ -UNIV_INTERN +UNIV_INLINE __attribute__((warn_unused_result, nonnull(1,2))) ulint -rec_get_converted_size_comp_prefix( -/*===============================*/ +rec_get_converted_size_comp_prefix_low( +/*===================================*/ const dict_index_t* index, /*!< in: record descriptor; dict_table_is_comp() is assumed to hold, even if it does not */ const dfield_t* fields, /*!< in: array of data fields */ ulint n_fields,/*!< in: number of data fields */ - ulint* extra) /*!< out: extra size */ + ulint* extra, /*!< out: extra size */ + bool temp) /*!< in: whether this is a + temporary file record */ { ulint extra_size; ulint data_size; ulint i; - ut_ad(index); - ut_ad(fields); + ulint n_null = index->n_nullable; ut_ad(n_fields > 0); ut_ad(n_fields <= dict_index_get_n_fields(index)); + ut_ad(!temp || extra); - extra_size = REC_N_NEW_EXTRA_BYTES - + UT_BITS_IN_BYTES(index->n_nullable); + extra_size = temp + ? UT_BITS_IN_BYTES(n_null) + : REC_N_NEW_EXTRA_BYTES + + UT_BITS_IN_BYTES(n_null); data_size = 0; + if (temp && dict_table_is_comp(index->table)) { + /* No need to do adjust fixed_len=0. We only need to + adjust it for ROW_FORMAT=REDUNDANT. */ + temp = false; + } + /* read the lengths of fields 0..n */ for (i = 0; i < n_fields; i++) { const dict_field_t* field; ulint len; + ulint fixed_len; const dict_col_t* col; field = dict_index_get_nth_field(index, i); @@ -810,6 +831,8 @@ rec_get_converted_size_comp_prefix( ut_ad(dict_col_type_assert_equal(col, dfield_get_type(&fields[i]))); + /* All NULLable fields must be included in the n_null count. */ + ut_ad((col->prtype & DATA_NOT_NULL) || n_null--); if (dfield_is_null(&fields[i])) { /* No length is stored for NULL fields. 
*/ @@ -820,6 +843,11 @@ rec_get_converted_size_comp_prefix( ut_ad(len <= col->len || col->mtype == DATA_BLOB || (col->len == 0 && col->mtype == DATA_VARCHAR)); + fixed_len = field->fixed_len; + if (temp && fixed_len + && !dict_col_get_fixed_size(col, temp)) { + fixed_len = 0; + } /* If the maximum length of a variable-length field is up to 255 bytes, the actual length is always stored in one byte. If the maximum length is more than 255 @@ -827,11 +855,20 @@ rec_get_converted_size_comp_prefix( 0..127. The length will be encoded in two bytes when it is 128 or more, or when the field is stored externally. */ - if (field->fixed_len) { - ut_ad(len == field->fixed_len); + if (fixed_len) { +#ifdef UNIV_DEBUG + ulint mbminlen = DATA_MBMINLEN(col->mbminmaxlen); + ulint mbmaxlen = DATA_MBMAXLEN(col->mbminmaxlen); + + ut_ad(len <= fixed_len); + + ut_ad(!mbmaxlen || len >= mbminlen + * (fixed_len / mbmaxlen)); + /* dict_index_add_col() should guarantee this */ ut_ad(!field->prefix_len - || field->fixed_len == field->prefix_len); + || fixed_len == field->prefix_len); +#endif /* UNIV_DEBUG */ } else if (dfield_is_ext(&fields[i])) { ut_ad(col->len >= 256 || col->mtype == DATA_BLOB); extra_size += 2; @@ -848,7 +885,7 @@ rec_get_converted_size_comp_prefix( data_size += len; } - if (UNIV_LIKELY_NULL(extra)) { + if (extra) { *extra = extra_size; } @@ -856,6 +893,23 @@ rec_get_converted_size_comp_prefix( } /**********************************************************//** +Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT. 
+@return total size */ +UNIV_INTERN +ulint +rec_get_converted_size_comp_prefix( +/*===============================*/ + const dict_index_t* index, /*!< in: record descriptor */ + const dfield_t* fields, /*!< in: array of data fields */ + ulint n_fields,/*!< in: number of data fields */ + ulint* extra) /*!< out: extra size */ +{ + ut_ad(dict_table_is_comp(index->table)); + return(rec_get_converted_size_comp_prefix_low( + index, fields, n_fields, extra, false)); +} + +/**********************************************************//** Determines the size of a data tuple in ROW_FORMAT=COMPACT. @return total size */ UNIV_INTERN @@ -872,8 +926,6 @@ rec_get_converted_size_comp( ulint* extra) /*!< out: extra size */ { ulint size; - ut_ad(index); - ut_ad(fields); ut_ad(n_fields > 0); switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) { @@ -899,8 +951,8 @@ rec_get_converted_size_comp( return(ULINT_UNDEFINED); } - return(size + rec_get_converted_size_comp_prefix(index, fields, - n_fields, extra)); + return(size + rec_get_converted_size_comp_prefix_low( + index, fields, n_fields, extra, false)); } /***********************************************************//** @@ -1077,19 +1129,18 @@ rec_convert_dtuple_to_rec_old( /*********************************************************//** Builds a ROW_FORMAT=COMPACT record out of a data tuple. 
*/ -UNIV_INTERN +UNIV_INLINE __attribute__((nonnull)) void rec_convert_dtuple_to_rec_comp( /*===========================*/ rec_t* rec, /*!< in: origin of record */ - ulint extra, /*!< in: number of bytes to - reserve between the record - header and the data payload - (normally REC_N_NEW_EXTRA_BYTES) */ const dict_index_t* index, /*!< in: record descriptor */ - ulint status, /*!< in: status bits of the record */ const dfield_t* fields, /*!< in: array of data fields */ - ulint n_fields)/*!< in: number of data fields */ + ulint n_fields,/*!< in: number of data fields */ + ulint status, /*!< in: status bits of the record */ + bool temp) /*!< in: whether to use the + format for temporary files in + index creation */ { const dfield_t* field; const dtype_t* type; @@ -1101,32 +1152,48 @@ rec_convert_dtuple_to_rec_comp( ulint n_node_ptr_field; ulint fixed_len; ulint null_mask = 1; - ut_ad(extra == 0 || dict_table_is_comp(index->table)); - ut_ad(extra == 0 || extra == REC_N_NEW_EXTRA_BYTES); + ulint n_null; + + ut_ad(temp || dict_table_is_comp(index->table)); ut_ad(n_fields > 0); - switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) { - case REC_STATUS_ORDINARY: + if (temp) { + ut_ad(status == REC_STATUS_ORDINARY); ut_ad(n_fields <= dict_index_get_n_fields(index)); n_node_ptr_field = ULINT_UNDEFINED; - break; - case REC_STATUS_NODE_PTR: - ut_ad(n_fields == dict_index_get_n_unique_in_tree(index) + 1); - n_node_ptr_field = n_fields - 1; - break; - case REC_STATUS_INFIMUM: - case REC_STATUS_SUPREMUM: - ut_ad(n_fields == 1); - n_node_ptr_field = ULINT_UNDEFINED; - break; - default: - ut_error; - return; + nulls = rec - 1; + if (dict_table_is_comp(index->table)) { + /* No need to do adjust fixed_len=0. We only + need to adjust it for ROW_FORMAT=REDUNDANT. 
*/ + temp = false; + } + } else { + nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); + + switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) { + case REC_STATUS_ORDINARY: + ut_ad(n_fields <= dict_index_get_n_fields(index)); + n_node_ptr_field = ULINT_UNDEFINED; + break; + case REC_STATUS_NODE_PTR: + ut_ad(n_fields + == dict_index_get_n_unique_in_tree(index) + 1); + n_node_ptr_field = n_fields - 1; + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + ut_ad(n_fields == 1); + n_node_ptr_field = ULINT_UNDEFINED; + break; + default: + ut_error; + return; + } } end = rec; - nulls = rec - (extra + 1); - lens = nulls - UT_BITS_IN_BYTES(index->n_nullable); + n_null = index->n_nullable; + lens = nulls - UT_BITS_IN_BYTES(n_null); /* clear the SQL-null flags */ memset(lens + 1, 0, nulls - lens); @@ -1148,7 +1215,7 @@ rec_convert_dtuple_to_rec_comp( if (!(dtype_get_prtype(type) & DATA_NOT_NULL)) { /* nullable field */ - ut_ad(index->n_nullable > 0); + ut_ad(n_null--); if (UNIV_UNLIKELY(!(byte) null_mask)) { nulls--; @@ -1171,6 +1238,10 @@ rec_convert_dtuple_to_rec_comp( ifield = dict_index_get_nth_field(index, i); fixed_len = ifield->fixed_len; + if (temp && fixed_len + && !dict_col_get_fixed_size(ifield->col, temp)) { + fixed_len = 0; + } /* If the maximum length of a variable-length field is up to 255 bytes, the actual length is always stored in one byte. If the maximum length is more than 255 @@ -1178,8 +1249,17 @@ rec_convert_dtuple_to_rec_comp( 0..127. The length will be encoded in two bytes when it is 128 or more, or when the field is stored externally. 
*/ if (fixed_len) { - ut_ad(len == fixed_len); +#ifdef UNIV_DEBUG + ulint mbminlen = DATA_MBMINLEN( + ifield->col->mbminmaxlen); + ulint mbmaxlen = DATA_MBMAXLEN( + ifield->col->mbminmaxlen); + + ut_ad(len <= fixed_len); + ut_ad(!mbmaxlen || len >= mbminlen + * (fixed_len / mbmaxlen)); ut_ad(!dfield_is_ext(field)); +#endif /* UNIV_DEBUG */ } else if (dfield_is_ext(field)) { ut_ad(ifield->col->len >= 256 || ifield->col->mtype == DATA_BLOB); @@ -1227,14 +1307,12 @@ rec_convert_dtuple_to_rec_new( rec_t* rec; status = dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK; - rec_get_converted_size_comp(index, status, - dtuple->fields, dtuple->n_fields, - &extra_size); + rec_get_converted_size_comp( + index, status, dtuple->fields, dtuple->n_fields, &extra_size); rec = buf + extra_size; rec_convert_dtuple_to_rec_comp( - rec, REC_N_NEW_EXTRA_BYTES, index, status, - dtuple->fields, dtuple->n_fields); + rec, index, dtuple->fields, dtuple->n_fields, status, false); /* Set the info bits of the record */ rec_set_info_and_status_bits(rec, dtuple_get_info_bits(dtuple)); @@ -1296,6 +1374,54 @@ rec_convert_dtuple_to_rec( return(rec); } +#ifndef UNIV_HOTBACKUP +/**********************************************************//** +Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT. +@return total size */ +UNIV_INTERN +ulint +rec_get_converted_size_temp( +/*========================*/ + const dict_index_t* index, /*!< in: record descriptor */ + const dfield_t* fields, /*!< in: array of data fields */ + ulint n_fields,/*!< in: number of data fields */ + ulint* extra) /*!< out: extra size */ +{ + return(rec_get_converted_size_comp_prefix_low( + index, fields, n_fields, extra, true)); +} + +/******************************************************//** +Determine the offset to each field in temporary file. 
+@see rec_convert_dtuple_to_temp() */ +UNIV_INTERN +void +rec_init_offsets_temp( +/*==================*/ + const rec_t* rec, /*!< in: temporary file record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint* offsets)/*!< in/out: array of offsets; + in: n=rec_offs_n_fields(offsets) */ +{ + rec_init_offsets_comp_ordinary(rec, true, index, offsets); +} + +/*********************************************************//** +Builds a temporary file record out of a data tuple. +@see rec_init_offsets_temp() */ +UNIV_INTERN +void +rec_convert_dtuple_to_temp( +/*=======================*/ + rec_t* rec, /*!< out: record */ + const dict_index_t* index, /*!< in: record descriptor */ + const dfield_t* fields, /*!< in: array of data fields */ + ulint n_fields) /*!< in: number of fields */ +{ + rec_convert_dtuple_to_rec_comp(rec, index, fields, n_fields, + REC_STATUS_ORDINARY, true); +} + /**************************************************************//** Copies the first n fields of a physical record to a data tuple. The fields are copied to the memory heap. */ @@ -1506,6 +1632,7 @@ rec_copy_prefix_to_buf( return(*buf + (rec - (lens + 1))); } +#endif /* UNIV_HOTBACKUP */ /***************************************************************//** Validates the consistency of an old-style physical record. @@ -1782,4 +1909,47 @@ rec_print( } } } + +# ifdef UNIV_DEBUG +/************************************************************//** +Reads the DB_TRX_ID of a clustered index record. 
+@return the value of DB_TRX_ID */ +UNIV_INTERN +trx_id_t +rec_get_trx_id( +/*===========*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index) /*!< in: clustered index */ +{ + const page_t* page + = page_align(rec); + ulint trx_id_col + = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + const byte* trx_id; + ulint len; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID) + == index->id); + ut_ad(dict_index_is_clust(index)); + ut_ad(trx_id_col > 0); + ut_ad(trx_id_col != ULINT_UNDEFINED); + + offsets = rec_get_offsets(rec, index, offsets, trx_id_col + 1, &heap); + + trx_id = rec_get_nth_field(rec, offsets, trx_id_col, &len); + + ut_ad(len == DATA_TRX_ID_LEN); + + if (heap) { + mem_heap_free(heap); + } + + return(trx_read_trx_id(trx_id)); +} +# endif /* UNIV_DEBUG */ #endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/row/row0ext.cc b/storage/innobase/row/row0ext.cc index 8d4da9f034b..f084fa09c5a 100644 --- a/storage/innobase/row/row0ext.cc +++ b/storage/innobase/row/row0ext.cc @@ -95,6 +95,8 @@ row_ext_create( row_ext_t* ret; + ut_ad(n_ext > 0); + ret = static_cast<row_ext_t*>( mem_heap_alloc(heap, (sizeof *ret) + (n_ext - 1) * sizeof ret->len)); diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc index 50b681361d8..9a6af50e09d 100644 --- a/storage/innobase/row/row0ftsort.cc +++ b/storage/innobase/row/row0ftsort.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2010, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2010, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -23,6 +23,7 @@ Create Full Text Index with (parallel) merge sort Created 10/13/2010 Jimmy Yang *******************************************************/ +#include "dict0dict.h" /* dict_table_stats_lock() */ #include "row0merge.h" #include "pars0pars.h" #include "row0ftsort.h" @@ -47,9 +48,6 @@ Created 10/13/2010 Jimmy Yang /** Parallel sort degree */ UNIV_INTERN ulong fts_sort_pll_degree = 2; -/** Parallel sort buffer size */ -UNIV_INTERN ulong srv_sort_buf_size = 1048576; - /*********************************************************************//** Create a temporary "fts sort index" used to merge sort the tokenized doc string. The index has three "fields": @@ -124,7 +122,7 @@ row_merge_create_fts_sort_index( if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) { /* If Doc ID column is being added by this create index, then just check the number of rows in the table */ - if (table->stat_n_rows < MAX_DOC_ID_OPT_VAL) { + if (dict_table_get_n_rows(table) < MAX_DOC_ID_OPT_VAL) { *opt_doc_id_size = TRUE; } } else { @@ -173,10 +171,10 @@ ibool row_fts_psort_info_init( /*====================*/ trx_t* trx, /*!< in: transaction */ - struct TABLE* table, /*!< in: MySQL table object */ + row_merge_dup_t* dup, /*!< in,own: descriptor of + FTS index being created */ const dict_table_t* new_table,/*!< in: table on which indexes are created */ - dict_index_t* index, /*!< in: FTS index to be created */ ibool opt_doc_id_size, /*!< in: whether to use 4 bytes instead of 8 bytes integer to @@ -192,7 +190,6 @@ row_fts_psort_info_init( fts_psort_t* psort_info = NULL; fts_psort_t* merge_info = NULL; ulint block_size; - os_event_t sort_event; ibool ret = TRUE; block_size = 3 * srv_sort_buf_size; @@ -201,28 +198,28 @@ row_fts_psort_info_init( fts_sort_pll_degree * sizeof *psort_info)); if (!psort_info) { - return FALSE; + 
ut_free(dup); + return(FALSE); } - sort_event = os_event_create(NULL); - /* Common Info for all sort threads */ common_info = static_cast<fts_psort_common_t*>( mem_alloc(sizeof *common_info)); - common_info->table = table; + if (!common_info) { + ut_free(dup); + mem_free(psort_info); + return(FALSE); + } + + common_info->dup = dup; common_info->new_table = (dict_table_t*) new_table; common_info->trx = trx; - common_info->sort_index = index; common_info->all_info = psort_info; - common_info->sort_event = sort_event; + common_info->sort_event = os_event_create(); + common_info->merge_event = os_event_create(); common_info->opt_doc_id_size = opt_doc_id_size; - if (!common_info) { - mem_free(psort_info); - return FALSE; - } - /* There will be FTS_NUM_AUX_INDEX number of "sort buckets" for each parallel sort thread. Each "sort bucket" holds records for a particular "FTS index partition" */ @@ -242,9 +239,12 @@ row_fts_psort_info_init( } psort_info[j].merge_buf[i] = row_merge_buf_create( - index); + dup->index); - row_merge_file_create(psort_info[j].merge_file[i]); + if (row_merge_file_create(psort_info[j].merge_file[i]) + < 0) { + goto func_exit; + } /* Need to align memory for O_DIRECT write */ psort_info[j].block_alloc[i] = @@ -314,6 +314,9 @@ row_fts_psort_info_destroy( } } + os_event_free(merge_info[0].psort_common->sort_event); + os_event_free(merge_info[0].psort_common->merge_event); + ut_free(merge_info[0].psort_common->dup); mem_free(merge_info[0].psort_common); mem_free(psort_info); } @@ -433,12 +436,11 @@ row_merge_fts_doc_tokenize( ut_a(t_ctx->buf_used < FTS_NUM_AUX_INDEX); idx = t_ctx->buf_used; - buf->tuples[buf->n_tuples + n_tuple[idx]] = field = - static_cast<dfield_t*>(mem_heap_alloc( - buf->heap, - FTS_NUM_FIELDS_SORT * sizeof *field)); + mtuple_t* mtuple = &buf->tuples[buf->n_tuples + n_tuple[idx]]; - ut_a(field); + field = mtuple->fields = static_cast<dfield_t*>( + mem_heap_alloc(buf->heap, + FTS_NUM_FIELDS_SORT * sizeof *field)); /* The first field 
is the tokenized word */ dfield_set_data(field, t_str.f_str, t_str.f_len); @@ -522,6 +524,10 @@ row_merge_fts_doc_tokenize( /* Update the data length and the number of new word tuples added in this round of tokenization */ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) { + /* The computation of total_size below assumes that no + delete-mark flags will be stored and that all fields + are NOT NULL and fixed-length. */ + sort_buf[i]->total_size += data_size[i]; sort_buf[i]->n_tuples += n_tuple[i]; @@ -560,7 +566,7 @@ fts_parallel_tokenization( ulint mycount[FTS_NUM_AUX_INDEX]; ib_uint64_t total_rec = 0; ulint num_doc_processed = 0; - doc_id_t last_doc_id; + doc_id_t last_doc_id = 0; ulint zip_size; mem_heap_t* blob_heap = NULL; fts_doc_t doc; @@ -581,10 +587,10 @@ fts_parallel_tokenization( memset(mycount, 0, FTS_NUM_AUX_INDEX * sizeof(int)); doc.charset = fts_index_get_charset( - psort_info->psort_common->sort_index); + psort_info->psort_common->dup->index); idx_field = dict_index_get_nth_field( - psort_info->psort_common->sort_index, 0); + psort_info->psort_common->dup->index, 0); word_dtype.prtype = idx_field->col->prtype; word_dtype.mbminmaxlen = idx_field->col->mbminmaxlen; word_dtype.mtype = (strcmp(doc.charset->name, "latin1_swedish_ci") == 0) @@ -742,7 +748,12 @@ loop: } if (doc_item) { - prev_doc_item = doc_item; + prev_doc_item = doc_item; + + if (last_doc_id != doc_item->doc_id) { + t_ctx.init_pos = 0; + } + retried = 0; } else if (psort_info->state == FTS_PARENT_COMPLETE) { retried++; @@ -751,16 +762,51 @@ loop: goto loop; exit: + /* Do a final sort of the last (or latest) batch of records + in block memory. 
Flush them to temp file if records cannot + be hold in one block memory */ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) { if (t_ctx.rows_added[i]) { row_merge_buf_sort(buf[i], NULL); row_merge_buf_write( - buf[i], (const merge_file_t*) merge_file[i], - block[i]); - row_merge_write(merge_file[i]->fd, - merge_file[i]->offset++, block[i]); + buf[i], merge_file[i], block[i]); + + /* Write to temp file, only if records have + been flushed to temp file before (offset > 0): + The pseudo code for sort is following: + + while (there are rows) { + tokenize rows, put result in block[] + if (block[] runs out) { + sort rows; + write to temp file with + row_merge_write(); + offset++; + } + } + + # write out the last batch + if (offset > 0) { + row_merge_write(); + offset++; + } else { + # no need to write anything + offset stay as 0 + } + + so if merge_file[i]->offset is 0 when we come to + here as the last batch, this means rows have + never flush to temp file, it can be held all in + memory */ + if (merge_file[i]->offset != 0) { + row_merge_write(merge_file[i]->fd, + merge_file[i]->offset++, + block[i]); + + UNIV_MEM_INVALID(block[i][0], + srv_sort_buf_size); + } - UNIV_MEM_INVALID(block[i][0], srv_sort_buf_size); buf[i] = row_merge_buf_empty(buf[i]); t_ctx.rows_added[i] = 0; } @@ -776,16 +822,19 @@ exit: continue; } - tmpfd[i] = innobase_mysql_tmpfile(); + tmpfd[i] = row_merge_file_create_low(); + if (tmpfd[i] < 0) { + goto func_exit; + } + row_merge_sort(psort_info->psort_common->trx, - psort_info->psort_common->sort_index, - merge_file[i], - (row_merge_block_t*) block[i], &tmpfd[i], - psort_info->psort_common->table); + psort_info->psort_common->dup, + merge_file[i], block[i], &tmpfd[i]); total_rec += merge_file[i]->n_rec; close(tmpfd[i]); } +func_exit: if (fts_enable_diag_print) { DEBUG_FTS_SORT_PRINT(" InnoDB_FTS: complete merge sort\n"); } @@ -794,8 +843,14 @@ exit: psort_info->child_status = FTS_CHILD_COMPLETE; os_event_set(psort_info->psort_common->sort_event); + 
psort_info->child_status = FTS_CHILD_EXITING; + +#ifdef __WIN__ + CloseHandle(psort_info->thread_hdl); +#endif /*__WIN__ */ os_thread_exit(NULL); + OS_THREAD_DUMMY_RETURN; } @@ -812,8 +867,9 @@ row_fts_start_psort( for (i = 0; i < fts_sort_pll_degree; i++) { psort_info[i].psort_id = i; - os_thread_create(fts_parallel_tokenization, - (void*) &psort_info[i], &thd_id); + psort_info[i].thread_hdl = os_thread_create( + fts_parallel_tokenization, + (void*) &psort_info[i], &thd_id); } } @@ -833,14 +889,20 @@ fts_parallel_merge( id = psort_info->psort_id; - row_fts_merge_insert(psort_info->psort_common->sort_index, + row_fts_merge_insert(psort_info->psort_common->dup->index, psort_info->psort_common->new_table, psort_info->psort_common->all_info, id); psort_info->child_status = FTS_CHILD_COMPLETE; - os_event_set(psort_info->psort_common->sort_event); + os_event_set(psort_info->psort_common->merge_event); + psort_info->child_status = FTS_CHILD_EXITING; + +#ifdef __WIN__ + CloseHandle(psort_info->thread_hdl); +#endif /*__WIN__ */ os_thread_exit(NULL); + OS_THREAD_DUMMY_RETURN; } @@ -860,16 +922,16 @@ row_fts_start_parallel_merge( merge_info[i].psort_id = i; merge_info[i].child_status = 0; - os_thread_create(fts_parallel_merge, - (void*) &merge_info[i], &thd_id); + merge_info[i].thread_hdl = os_thread_create( + fts_parallel_merge, (void*) &merge_info[i], &thd_id); } } /********************************************************************//** Insert processed FTS data to auxillary index tables. 
@return DB_SUCCESS if insertion runs fine */ -UNIV_INTERN -ulint +static __attribute__((nonnull)) +dberr_t row_merge_write_fts_word( /*=====================*/ trx_t* trx, /*!< in: transaction */ @@ -880,15 +942,15 @@ row_merge_write_fts_word( CHARSET_INFO* charset) /*!< in: charset */ { ulint selected; - ulint ret = DB_SUCCESS; + dberr_t ret = DB_SUCCESS; selected = fts_select_index( charset, word->text.f_str, word->text.f_len); fts_table->suffix = fts_get_suffix(selected); /* Pop out each fts_node in word->nodes write them to auxiliary table */ - while(ib_vector_size(word->nodes) > 0) { - ulint error; + while (ib_vector_size(word->nodes) > 0) { + dberr_t error; fts_node_t* fts_node; fts_node = static_cast<fts_node_t*>(ib_vector_pop(word->nodes)); @@ -900,8 +962,8 @@ row_merge_write_fts_word( if (error != DB_SUCCESS) { fprintf(stderr, "InnoDB: failed to write" " word %s to FTS auxiliary index" - " table, error (%lu) \n", - word->text.f_str, error); + " table, error (%s) \n", + word->text.f_str, ut_strerr(error)); ret = error; } @@ -1064,7 +1126,6 @@ row_fts_sel_tree_propagate( int child_left; int child_right; int selected; - ibool null_eq = FALSE; /* Find which parent this value will be propagated to */ parent = (propogated - 1) / 2; @@ -1083,10 +1144,10 @@ row_fts_sel_tree_propagate( } else if (child_right == -1 || mrec[child_right] == NULL) { selected = child_left; - } else if (row_merge_cmp(mrec[child_left], mrec[child_right], - offsets[child_left], - offsets[child_right], - index, &null_eq) < 0) { + } else if (cmp_rec_rec_simple(mrec[child_left], mrec[child_right], + offsets[child_left], + offsets[child_right], + index, NULL) < 0) { selected = child_left; } else { selected = child_right; @@ -1143,8 +1204,6 @@ row_fts_build_sel_tree_level( num_item = (1 << level); for (i = 0; i < num_item; i++) { - ibool null_eq = FALSE; - child_left = sel_tree[(start + i) * 2 + 1]; child_right = sel_tree[(start + i) * 2 + 2]; @@ -1174,14 +1233,12 @@ 
row_fts_build_sel_tree_level( } /* Select the smaller one to set parent pointer */ - if (row_merge_cmp(mrec[child_left], mrec[child_right], - offsets[child_left], - offsets[child_right], - index, &null_eq) < 0) { - sel_tree[start + i] = child_left; - } else { - sel_tree[start + i] = child_right; - } + int cmp = cmp_rec_rec_simple( + mrec[child_left], mrec[child_right], + offsets[child_left], offsets[child_right], + index, NULL); + + sel_tree[start + i] = cmp < 0 ? child_left : child_right; } } @@ -1231,7 +1288,7 @@ Read sorted file containing index data tuples and insert these data tuples to the index @return DB_SUCCESS or error number */ UNIV_INTERN -ulint +dberr_t row_fts_merge_insert( /*=================*/ dict_index_t* index, /*!< in: index */ @@ -1243,7 +1300,7 @@ row_fts_merge_insert( const byte** b; mem_heap_t* tuple_heap; mem_heap_t* heap; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; ulint* foffs; ulint** offsets; fts_tokenizer_word_t new_word; @@ -1317,7 +1374,7 @@ row_fts_merge_insert( count_diag += (int) psort_info[i].merge_file[id]->n_rec; } - if (fts_enable_diag_print) { + if (fts_enable_diag_print) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB_FTS: to inserted %lu records\n", (ulong) count_diag); @@ -1349,8 +1406,13 @@ row_fts_merge_insert( /* No Rows to read */ mrec[i] = b[i] = NULL; } else { - if (!row_merge_read(fd[i], foffs[i], - (row_merge_block_t*) block[i])) { + /* Read from temp file only if it has been + written to. 
Otherwise, block memory holds + all the sorted records */ + if (psort_info[i].merge_file[id]->offset > 0 + && (!row_merge_read( + fd[i], foffs[i], + (row_merge_block_t*) block[i]))) { error = DB_CORRUPTION; goto exit; } @@ -1386,14 +1448,14 @@ row_fts_merge_insert( } for (i = min_rec + 1; i < fts_sort_pll_degree; i++) { - ibool null_eq = FALSE; if (!mrec[i]) { continue; } - if (row_merge_cmp(mrec[i], mrec[min_rec], - offsets[i], offsets[min_rec], - index, &null_eq) < 0) { + if (cmp_rec_rec_simple( + mrec[i], mrec[min_rec], + offsets[i], offsets[min_rec], + index, NULL) < 0) { min_rec = i; } } diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc new file mode 100644 index 00000000000..f5eb31191a5 --- /dev/null +++ b/storage/innobase/row/row0import.cc @@ -0,0 +1,3806 @@ +/***************************************************************************** + +Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0import.cc +Import a tablespace to a running instance. + +Created 2012-02-08 by Sunny Bains. 
+*******************************************************/ + +#include "row0import.h" + +#ifdef UNIV_NONINL +#include "row0import.ic" +#endif + +#include "btr0pcur.h" +#include "que0que.h" +#include "dict0boot.h" +#include "ibuf0ibuf.h" +#include "pars0pars.h" +#include "row0upd.h" +#include "row0sel.h" +#include "row0mysql.h" +#include "srv0start.h" +#include "row0quiesce.h" + +#include <vector> + +/** The size of the buffer to use for IO. Note: os_file_read() doesn't expect +reads to fail. If you set the buffer size to be greater than a multiple of the +file size then it will assert. TODO: Fix this limitation of the IO functions. +@param n - page size of the tablespace. +@retval number of pages */ +#define IO_BUFFER_SIZE(n) ((1024 * 1024) / n) + +/** For gathering stats on records during phase I */ +struct row_stats_t { + ulint m_n_deleted; /*!< Number of deleted records + found in the index */ + + ulint m_n_purged; /*!< Number of records purged + optimisatically */ + + ulint m_n_rows; /*!< Number of rows */ + + ulint m_n_purge_failed; /*!< Number of deleted rows + that could not be purged */ +}; + +/** Index information required by IMPORT. 
*/ +struct row_index_t { + index_id_t m_id; /*!< Index id of the table + in the exporting server */ + byte* m_name; /*!< Index name */ + + ulint m_space; /*!< Space where it is placed */ + + ulint m_page_no; /*!< Root page number */ + + ulint m_type; /*!< Index type */ + + ulint m_trx_id_offset; /*!< Relevant only for clustered + indexes, offset of transaction + id system column */ + + ulint m_n_user_defined_cols; /*!< User defined columns */ + + ulint m_n_uniq; /*!< Number of columns that can + uniquely identify the row */ + + ulint m_n_nullable; /*!< Number of nullable + columns */ + + ulint m_n_fields; /*!< Total number of fields */ + + dict_field_t* m_fields; /*!< Index fields */ + + const dict_index_t* + m_srv_index; /*!< Index instance in the + importing server */ + + row_stats_t m_stats; /*!< Statistics gathered during + the import phase */ + +}; + +/** Meta data required by IMPORT. */ +struct row_import { + row_import() UNIV_NOTHROW + : + m_table(), + m_version(), + m_hostname(), + m_table_name(), + m_autoinc(), + m_page_size(), + m_flags(), + m_n_cols(), + m_cols(), + m_col_names(), + m_n_indexes(), + m_indexes(), + m_missing(true) { } + + ~row_import() UNIV_NOTHROW; + + /** + Find the index entry in in the indexes array. + @param name - index name + @return instance if found else 0. */ + row_index_t* get_index(const char* name) const UNIV_NOTHROW; + + /** + Get the number of rows in the index. + @param name - index name + @return number of rows (doesn't include delete marked rows). */ + ulint get_n_rows(const char* name) const UNIV_NOTHROW; + + /** + Find the ordinal value of the column name in the cfg table columns. + @param name - of column to look for. + @return ULINT_UNDEFINED if not found. */ + ulint find_col(const char* name) const UNIV_NOTHROW; + + /** + Find the index field entry in in the cfg indexes fields. + @name - of the index to look for + @return instance if found else 0. 
*/ + const dict_field_t* find_field( + const row_index_t* cfg_index, + const char* name) const UNIV_NOTHROW; + + /** + Get the number of rows for which purge failed during the convert phase. + @param name - index name + @return number of rows for which purge failed. */ + ulint get_n_purge_failed(const char* name) const UNIV_NOTHROW; + + /** + Check if the index is clean. ie. no delete-marked records + @param name - index name + @return true if index needs to be purged. */ + bool requires_purge(const char* name) const UNIV_NOTHROW + { + return(get_n_purge_failed(name) > 0); + } + + /** + Set the index root <space, pageno> using the index name */ + void set_root_by_name() UNIV_NOTHROW; + + /** + Set the index root <space, pageno> using a heuristic + @return DB_SUCCESS or error code */ + dberr_t set_root_by_heuristic() UNIV_NOTHROW; + + /** Check if the index schema that was read from the .cfg file + matches the in memory index definition. + Note: It will update row_import_t::m_srv_index to map the meta-data + read from the .cfg file to the server index instance. + @return DB_SUCCESS or error code. */ + dberr_t match_index_columns( + THD* thd, + const dict_index_t* index) UNIV_NOTHROW; + + /** + Check if the table schema that was read from the .cfg file matches the + in memory table definition. + @param thd - MySQL session variable + @return DB_SUCCESS or error code. */ + dberr_t match_table_columns( + THD* thd) UNIV_NOTHROW; + + /** + Check if the table (and index) schema that was read from the .cfg file + matches the in memory table definition. + @param thd - MySQL session variable + @return DB_SUCCESS or error code. 
*/ + dberr_t match_schema( + THD* thd) UNIV_NOTHROW; + + dict_table_t* m_table; /*!< Table instance */ + + ulint m_version; /*!< Version of config file */ + + byte* m_hostname; /*!< Hostname where the + tablespace was exported */ + byte* m_table_name; /*!< Exporting instance table + name */ + + ib_uint64_t m_autoinc; /*!< Next autoinc value */ + + ulint m_page_size; /*!< Tablespace page size */ + + ulint m_flags; /*!< Table flags */ + + ulint m_n_cols; /*!< Number of columns in the + meta-data file */ + + dict_col_t* m_cols; /*!< Column data */ + + byte** m_col_names; /*!< Column names, we store the + column naems separately becuase + there is no field to store the + value in dict_col_t */ + + ulint m_n_indexes; /*!< Number of indexes, + including clustered index */ + + row_index_t* m_indexes; /*!< Index meta data */ + + bool m_missing; /*!< true if a .cfg file was + found and was readable */ +}; + +/** Use the page cursor to iterate over records in a block. */ +class RecIterator { +public: + /** + Default constructor */ + RecIterator() UNIV_NOTHROW + { + memset(&m_cur, 0x0, sizeof(m_cur)); + } + + /** + Position the cursor on the first user record. */ + void open(buf_block_t* block) UNIV_NOTHROW + { + page_cur_set_before_first(block, &m_cur); + + if (!end()) { + next(); + } + } + + /** + Move to the next record. */ + void next() UNIV_NOTHROW + { + page_cur_move_to_next(&m_cur); + } + + /** + @return the current record */ + rec_t* current() UNIV_NOTHROW + { + ut_ad(!end()); + return(page_cur_get_rec(&m_cur)); + } + + /** + @return true if cursor is at the end */ + bool end() UNIV_NOTHROW + { + return(page_cur_is_after_last(&m_cur) == TRUE); + } + + /** Remove the current record + @return true on success */ + bool remove( + const dict_index_t* index, + page_zip_des_t* page_zip, + ulint* offsets) UNIV_NOTHROW + { + /* We can't end up with an empty page unless it is root. 
*/ + if (page_get_n_recs(m_cur.block->frame) <= 1) { + return(false); + } + + return(page_delete_rec(index, &m_cur, page_zip, offsets)); + } + +private: + page_cur_t m_cur; +}; + +/** Class that purges delete marked reocords from indexes, both secondary +and cluster. It does a pessimistic delete. This should only be done if we +couldn't purge the delete marked reocrds during Phase I. */ +class IndexPurge { +public: + /** Constructor + @param trx - the user transaction covering the import tablespace + @param index - to be imported + @param space_id - space id of the tablespace */ + IndexPurge( + trx_t* trx, + dict_index_t* index) UNIV_NOTHROW + : + m_trx(trx), + m_index(index), + m_n_rows(0) + { + ib_logf(IB_LOG_LEVEL_INFO, + "Phase II - Purge records from index %s", + index->name); + } + + /** Descructor */ + ~IndexPurge() UNIV_NOTHROW { } + + /** Purge delete marked records. + @return DB_SUCCESS or error code. */ + dberr_t garbage_collect() UNIV_NOTHROW; + + /** The number of records that are not delete marked. + @return total records in the index after purge */ + ulint get_n_rows() const UNIV_NOTHROW + { + return(m_n_rows); + } + +private: + /** + Begin import, position the cursor on the first record. */ + void open() UNIV_NOTHROW; + + /** + Close the persistent curosr and commit the mini-transaction. */ + void close() UNIV_NOTHROW; + + /** + Position the cursor on the next record. + @return DB_SUCCESS or error code */ + dberr_t next() UNIV_NOTHROW; + + /** + Store the persistent cursor position and reopen the + B-tree cursor in BTR_MODIFY_TREE mode, because the + tree structure may be changed during a pessimistic delete. */ + void purge_pessimistic_delete() UNIV_NOTHROW; + + /** + Purge delete-marked records. + @param offsets - current row offsets. 
*/ + void purge() UNIV_NOTHROW; + +protected: + // Disable copying + IndexPurge(); + IndexPurge(const IndexPurge&); + IndexPurge &operator=(const IndexPurge&); + +private: + trx_t* m_trx; /*!< User transaction */ + mtr_t m_mtr; /*!< Mini-transaction */ + btr_pcur_t m_pcur; /*!< Persistent cursor */ + dict_index_t* m_index; /*!< Index to be processed */ + ulint m_n_rows; /*!< Records in index */ +}; + +/** Functor that is called for each physical page that is read from the +tablespace file. */ +class AbstractCallback : public PageCallback { +public: + /** Constructor + @param trx - covering transaction */ + AbstractCallback(trx_t* trx) + : + m_trx(trx), + m_space(ULINT_UNDEFINED), + m_xdes(), + m_xdes_page_no(ULINT_UNDEFINED), + m_space_flags(ULINT_UNDEFINED), + m_table_flags(ULINT_UNDEFINED) UNIV_NOTHROW { } + + /** + Free any extent descriptor instance */ + virtual ~AbstractCallback() + { + delete [] m_xdes; + } + + /** Determine the page size to use for traversing the tablespace + @param file_size - size of the tablespace file in bytes + @param block - contents of the first page in the tablespace file. + @retval DB_SUCCESS or error code. */ + virtual dberr_t init( + os_offset_t file_size, + const buf_block_t* block) UNIV_NOTHROW; + + /** @return true if compressed table. */ + bool is_compressed_table() const UNIV_NOTHROW + { + return(get_zip_size() > 0); + } + +protected: + /** + Get the data page depending on the table type, compressed or not. + @param block - block read from disk + @retval the buffer frame */ + buf_frame_t* get_frame(buf_block_t* block) const UNIV_NOTHROW + { + if (is_compressed_table()) { + return(block->page.zip.data); + } + + return(buf_block_get_frame(block)); + } + + /** Check for session interrupt. If required we could + even flush to disk here every N pages. 
+ @retval DB_SUCCESS or error code */ + dberr_t periodic_check() UNIV_NOTHROW + { + if (trx_is_interrupted(m_trx)) { + return(DB_INTERRUPTED); + } + + return(DB_SUCCESS); + } + + /** + Get the physical offset of the extent descriptor within the page. + @param page_no - page number of the extent descriptor + @param page - contents of the page containing the extent descriptor. + @return the start of the xdes array in a page */ + const xdes_t* xdes( + ulint page_no, + const page_t* page) const UNIV_NOTHROW + { + ulint offset; + + offset = xdes_calc_descriptor_index(get_zip_size(), page_no); + + return(page + XDES_ARR_OFFSET + XDES_SIZE * offset); + } + + /** + Set the current page directory (xdes). If the extent descriptor is + marked as free then free the current extent descriptor and set it to + 0. This implies that all pages that are covered by this extent + descriptor are also freed. + + @param page_no - offset of page within the file + @param page - page contents + @return DB_SUCCESS or error code. */ + dberr_t set_current_xdes( + ulint page_no, + const page_t* page) UNIV_NOTHROW + { + m_xdes_page_no = page_no; + + delete[] m_xdes; + + m_xdes = 0; + + ulint state; + const xdes_t* xdesc = page + XDES_ARR_OFFSET; + + state = mach_read_ulint(xdesc + XDES_STATE, MLOG_4BYTES); + + if (state != XDES_FREE) { + + m_xdes = new(std::nothrow) xdes_t[m_page_size]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_13", + delete [] m_xdes; m_xdes = 0;); + + if (m_xdes == 0) { + return(DB_OUT_OF_MEMORY); + } + + memcpy(m_xdes, page, m_page_size); + } + + return(DB_SUCCESS); + } + + /** + @return true if it is a root page */ + bool is_root_page(const page_t* page) const UNIV_NOTHROW + { + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + + return(mach_read_from_4(page + FIL_PAGE_NEXT) == FIL_NULL + && mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL); + } + + /** + Check if the page is marked as free in the extent descriptor. 
+ @param page_no - page number to check in the extent descriptor. + @return true if the page is marked as free */ + bool is_free(ulint page_no) const UNIV_NOTHROW + { + ut_a(xdes_calc_descriptor_page(get_zip_size(), page_no) + == m_xdes_page_no); + + if (m_xdes != 0) { + const xdes_t* xdesc = xdes(page_no, m_xdes); + ulint pos = page_no % FSP_EXTENT_SIZE; + + return(xdes_get_bit(xdesc, XDES_FREE_BIT, pos)); + } + + /* If the current xdes was free, the page must be free. */ + return(true); + } + +protected: + /** Covering transaction. */ + trx_t* m_trx; + + /** Space id of the file being iterated over. */ + ulint m_space; + + /** Minimum page number for which the free list has not been + initialized: the pages >= this limit are, by definition, free; + note that in a single-table tablespace where size < 64 pages, + this number is 64, i.e., we have initialized the space about + the first extent, but have not physically allocted those pages + to the file. @see FSP_LIMIT. */ + ulint m_free_limit; + + /** Current size of the space in pages */ + ulint m_size; + + /** Current extent descriptor page */ + xdes_t* m_xdes; + + /** Physical page offset in the file of the extent descriptor */ + ulint m_xdes_page_no; + + /** Flags value read from the header page */ + ulint m_space_flags; + + /** Derived from m_space_flags and row format type, the row format + type is determined from the page header. */ + ulint m_table_flags; +}; + +/** Determine the page size to use for traversing the tablespace +@param file_size - size of the tablespace file in bytes +@param block - contents of the first page in the tablespace file. +@retval DB_SUCCESS or error code. */ +dberr_t +AbstractCallback::init( + os_offset_t file_size, + const buf_block_t* block) UNIV_NOTHROW +{ + const page_t* page = block->frame; + + m_space_flags = fsp_header_get_flags(page); + + /* Since we don't know whether it is a compressed table + or not, the data is always read into the block->frame. 
*/ + + dberr_t err = set_zip_size(block->frame); + + if (err != DB_SUCCESS) { + return(DB_CORRUPTION); + } + + /* Set the page size used to traverse the tablespace. */ + + m_page_size = (is_compressed_table()) + ? get_zip_size() : fsp_flags_get_page_size(m_space_flags); + + if (m_page_size == 0) { + ib_logf(IB_LOG_LEVEL_ERROR, "Page size is 0"); + return(DB_CORRUPTION); + } else if (!is_compressed_table() && m_page_size != UNIV_PAGE_SIZE) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Page size %lu of ibd file is not the same " + "as the server page size %lu", + m_page_size, UNIV_PAGE_SIZE); + + return(DB_CORRUPTION); + + } else if ((file_size % m_page_size)) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "File size " UINT64PF " is not a multiple " + "of the page size %lu", + (ib_uint64_t) file_size, (ulong) m_page_size); + + return(DB_CORRUPTION); + } + + ut_a(m_space == ULINT_UNDEFINED); + + m_size = mach_read_from_4(page + FSP_SIZE); + m_free_limit = mach_read_from_4(page + FSP_FREE_LIMIT); + m_space = mach_read_from_4(page + FSP_HEADER_OFFSET + FSP_SPACE_ID); + + if ((err = set_current_xdes(0, page)) != DB_SUCCESS) { + return(err); + } + + return(DB_SUCCESS); +} + +/** +Try and determine the index root pages by checking if the next/prev +pointers are both FIL_NULL. We need to ensure that skip deleted pages. */ +struct FetchIndexRootPages : public AbstractCallback { + + /** Index information gathered from the .ibd file. 
*/ + struct Index { + + Index(index_id_t id, ulint page_no) + : + m_id(id), + m_page_no(page_no) { } + + index_id_t m_id; /*!< Index id */ + ulint m_page_no; /*!< Root page number */ + }; + + typedef std::vector<Index> Indexes; + + /** Constructor + @param trx - covering (user) transaction + @param table - table definition in server .*/ + FetchIndexRootPages(const dict_table_t* table, trx_t* trx) + : + AbstractCallback(trx), + m_table(table) UNIV_NOTHROW { } + + /** Destructor */ + virtual ~FetchIndexRootPages() UNIV_NOTHROW { } + + /** + @retval the space id of the tablespace being iterated over */ + virtual ulint get_space_id() const UNIV_NOTHROW + { + return(m_space); + } + + /** + Check if the .ibd file row format is the same as the table's. + @param ibd_table_flags - determined from space and page. + @return DB_SUCCESS or error code. */ + dberr_t check_row_format(ulint ibd_table_flags) UNIV_NOTHROW + { + dberr_t err; + rec_format_t ibd_rec_format; + rec_format_t table_rec_format; + + if (!dict_tf_is_valid(ibd_table_flags)) { + + ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + ".ibd file has invlad table flags: %lx", + ibd_table_flags); + + return(DB_CORRUPTION); + } + + ibd_rec_format = dict_tf_get_rec_format(ibd_table_flags); + table_rec_format = dict_tf_get_rec_format(m_table->flags); + + if (table_rec_format != ibd_rec_format) { + + ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Table has %s row format, .ibd " + "file has %s row format.", + dict_tf_to_row_format_string(m_table->flags), + dict_tf_to_row_format_string(ibd_table_flags)); + + err = DB_CORRUPTION; + } else { + err = DB_SUCCESS; + } + + return(err); + } + + /** + Called for each block as it is read from the file. + @param offset - physical offset in the file + @param block - block to convert, it is not from the buffer pool. + @retval DB_SUCCESS or error code. 
*/ + virtual dberr_t operator() ( + os_offset_t offset, + buf_block_t* block) UNIV_NOTHROW; + + /** Update the import configuration that will be used to import + the tablespace. */ + dberr_t build_row_import(row_import* cfg) const UNIV_NOTHROW; + + /** Table definition in server. */ + const dict_table_t* m_table; + + /** Index information */ + Indexes m_indexes; +}; + +/** +Called for each block as it is read from the file. Check index pages to +determine the exact row format. We can't get that from the tablespace +header flags alone. + +@param offset - physical offset in the file +@param block - block to convert, it is not from the buffer pool. +@retval DB_SUCCESS or error code. */ +dberr_t +FetchIndexRootPages::operator() ( + os_offset_t offset, + buf_block_t* block) UNIV_NOTHROW +{ + dberr_t err; + + if ((err = periodic_check()) != DB_SUCCESS) { + return(err); + } + + const page_t* page = get_frame(block); + + ulint page_type = fil_page_get_type(page); + + if (block->page.offset * m_page_size != offset) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Page offset doesn't match file offset: " + "page offset: %lu, file offset: %lu", + (ulint) block->page.offset, + (ulint) (offset / m_page_size)); + + err = DB_CORRUPTION; + } else if (page_type == FIL_PAGE_TYPE_XDES) { + err = set_current_xdes(block->page.offset, page); + } else if (page_type == FIL_PAGE_INDEX + && !is_free(block->page.offset) + && is_root_page(page)) { + + index_id_t id = btr_page_get_index_id(page); + ulint page_no = buf_block_get_page_no(block); + + m_indexes.push_back(Index(id, page_no)); + + if (m_indexes.size() == 1) { + + m_table_flags = dict_sys_tables_type_to_tf( + m_space_flags, + page_is_comp(page) ? DICT_N_COLS_COMPACT : 0); + + err = check_row_format(m_table_flags); + } + } + + return(err); +} + +/** +Update the import configuration that will be used to import the tablespace. 
+@return error code or DB_SUCCESS */ +dberr_t +FetchIndexRootPages::build_row_import(row_import* cfg) const UNIV_NOTHROW +{ + Indexes::const_iterator end = m_indexes.end(); + + ut_a(cfg->m_table == m_table); + cfg->m_page_size = m_page_size; + cfg->m_n_indexes = m_indexes.size(); + + if (cfg->m_n_indexes == 0) { + + ib_logf(IB_LOG_LEVEL_ERROR, "No B+Tree found in tablespace"); + + return(DB_CORRUPTION); + } + + cfg->m_indexes = new(std::nothrow) row_index_t[cfg->m_n_indexes]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_11", + delete [] cfg->m_indexes; cfg->m_indexes = 0;); + + if (cfg->m_indexes == 0) { + return(DB_OUT_OF_MEMORY); + } + + memset(cfg->m_indexes, 0x0, sizeof(*cfg->m_indexes) * cfg->m_n_indexes); + + row_index_t* cfg_index = cfg->m_indexes; + + for (Indexes::const_iterator it = m_indexes.begin(); + it != end; + ++it, ++cfg_index) { + + char name[BUFSIZ]; + + ut_snprintf(name, sizeof(name), "index" IB_ID_FMT, it->m_id); + + ulint len = strlen(name) + 1; + + cfg_index->m_name = new(std::nothrow) byte[len]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_12", + delete [] cfg_index->m_name; + cfg_index->m_name = 0;); + + if (cfg_index->m_name == 0) { + return(DB_OUT_OF_MEMORY); + } + + memcpy(cfg_index->m_name, name, len); + + cfg_index->m_id = it->m_id; + + cfg_index->m_space = m_space; + + cfg_index->m_page_no = it->m_page_no; + } + + return(DB_SUCCESS); +} + +/* Functor that is called for each physical page that is read from the +tablespace file. + + 1. Check each page for corruption. + + 2. Update the space id and LSN on every page + * For the header page + - Validate the flags + - Update the LSN + + 3. On Btree pages + * Set the index id + * Update the max trx id + * In a cluster index, update the system columns + * In a cluster index, update the BLOB ptr, set the space id + * Purge delete marked records, but only if they can be easily + removed from the page + * Keep a counter of number of rows, ie. 
non-delete-marked rows + * Keep a counter of number of delete marked rows + * Keep a counter of number of purge failure + * If a page is stamped with an index id that isn't in the .cfg file + we assume it is deleted and the page can be ignored. + + 4. Set the page state to dirty so that it will be written to disk. +*/ +class PageConverter : public AbstractCallback { +public: + /** Constructor + * @param cfg - config of table being imported. + * @param trx - transaction covering the import */ + PageConverter(row_import* cfg, trx_t* trx) UNIV_NOTHROW; + + virtual ~PageConverter() UNIV_NOTHROW + { + if (m_heap != 0) { + mem_heap_free(m_heap); + } + } + + /** + @retval the server space id of the tablespace being iterated over */ + virtual ulint get_space_id() const UNIV_NOTHROW + { + return(m_cfg->m_table->space); + } + + /** + Called for each block as it is read from the file. + @param offset - physical offset in the file + @param block - block to convert, it is not from the buffer pool. + @retval DB_SUCCESS or error code. */ + virtual dberr_t operator() ( + os_offset_t offset, + buf_block_t* block) UNIV_NOTHROW; +private: + + /** Status returned by PageConverter::validate() */ + enum import_page_status_t { + IMPORT_PAGE_STATUS_OK, /*!< Page is OK */ + IMPORT_PAGE_STATUS_ALL_ZERO, /*!< Page is all zeros */ + IMPORT_PAGE_STATUS_CORRUPTED /*!< Page is corrupted */ + }; + + /** + Update the page, set the space id, max trx id and index id. + @param block - block read from file + @param page_type - type of the page + @retval DB_SUCCESS or error code */ + dberr_t update_page( + buf_block_t* block, + ulint& page_type) UNIV_NOTHROW; + +#if defined UNIV_DEBUG + /** + @return true error condition is enabled. */ + bool trigger_corruption() UNIV_NOTHROW + { + return(false); + } + #else +#define trigger_corruption() (false) +#endif /* UNIV_DEBUG */ + + /** + Update the space, index id, trx id. 
+ @param block - block to convert + @return DB_SUCCESS or error code */ + dberr_t update_index_page(buf_block_t* block) UNIV_NOTHROW; + + /** Update the BLOB refrences and write UNDO log entries for + rows that can't be purged optimistically. + @param block - block to update + @retval DB_SUCCESS or error code */ + dberr_t update_records(buf_block_t* block) UNIV_NOTHROW; + + /** + Validate the page, check for corruption. + @param offset - physical offset within file. + @param page - page read from file. + @return 0 on success, 1 if all zero, 2 if corrupted */ + import_page_status_t validate( + os_offset_t offset, + buf_block_t* page) UNIV_NOTHROW; + + /** + Validate the space flags and update tablespace header page. + @param block - block read from file, not from the buffer pool. + @retval DB_SUCCESS or error code */ + dberr_t update_header(buf_block_t* block) UNIV_NOTHROW; + + /** + Adjust the BLOB reference for a single column that is externally stored + @param rec - record to update + @param offsets - column offsets for the record + @param i - column ordinal value + @return DB_SUCCESS or error code */ + dberr_t adjust_cluster_index_blob_column( + rec_t* rec, + const ulint* offsets, + ulint i) UNIV_NOTHROW; + + /** + Adjusts the BLOB reference in the clustered index row for all + externally stored columns. + @param rec - record to update + @param offsets - column offsets for the record + @return DB_SUCCESS or error code */ + dberr_t adjust_cluster_index_blob_columns( + rec_t* rec, + const ulint* offsets) UNIV_NOTHROW; + + /** + In the clustered index, adjist the BLOB pointers as needed. + Also update the BLOB reference, write the new space id. + @param rec - record to update + @param offsets - column offsets for the record + @return DB_SUCCESS or error code */ + dberr_t adjust_cluster_index_blob_ref( + rec_t* rec, + const ulint* offsets) UNIV_NOTHROW; + + /** + Purge delete-marked records, only if it is possible to do + so without re-organising the B+tree. 
+ @param offsets - current row offsets. + @retval true if purged */ + bool purge(const ulint* offsets) UNIV_NOTHROW; + + /** + Adjust the BLOB references and sys fields for the current record. + @param index - the index being converted + @param rec - record to update + @param offsets - column offsets for the record + @param deleted - true if row is delete marked + @return DB_SUCCESS or error code. */ + dberr_t adjust_cluster_record( + const dict_index_t* index, + rec_t* rec, + const ulint* offsets, + bool deleted) UNIV_NOTHROW; + + /** + Find an index with the matching id. + @return row_index_t* instance or 0 */ + row_index_t* find_index(index_id_t id) UNIV_NOTHROW + { + row_index_t* index = &m_cfg->m_indexes[0]; + + for (ulint i = 0; i < m_cfg->m_n_indexes; ++i, ++index) { + if (id == index->m_id) { + return(index); + } + } + + return(0); + + } +private: + /** Config for table that is being imported. */ + row_import* m_cfg; + + /** Current index whose pages are being imported */ + row_index_t* m_index; + + /** Current system LSN */ + lsn_t m_current_lsn; + + /** Alias for m_page_zip, only set for compressed pages. */ + page_zip_des_t* m_page_zip_ptr; + + /** Iterator over records in a block */ + RecIterator m_rec_iter; + + /** Record offset */ + ulint m_offsets_[REC_OFFS_NORMAL_SIZE]; + + /** Pointer to m_offsets_ */ + ulint* m_offsets; + + /** Memory heap for the record offsets */ + mem_heap_t* m_heap; + + /** Cluster index instance */ + dict_index_t* m_cluster_index; +}; + +/** +row_import destructor. 
*/ +row_import::~row_import() UNIV_NOTHROW +{ + for (ulint i = 0; m_indexes != 0 && i < m_n_indexes; ++i) { + delete [] m_indexes[i].m_name; + + if (m_indexes[i].m_fields == 0) { + continue; + } + + dict_field_t* fields = m_indexes[i].m_fields; + ulint n_fields = m_indexes[i].m_n_fields; + + for (ulint j = 0; j < n_fields; ++j) { + delete [] fields[j].name; + } + + delete [] fields; + } + + for (ulint i = 0; m_col_names != 0 && i < m_n_cols; ++i) { + delete [] m_col_names[i]; + } + + delete [] m_cols; + delete [] m_indexes; + delete [] m_col_names; + delete [] m_table_name; + delete [] m_hostname; +} + +/** +Find the index entry in in the indexes array. +@param name - index name +@return instance if found else 0. */ +row_index_t* +row_import::get_index( + const char* name) const UNIV_NOTHROW +{ + for (ulint i = 0; i < m_n_indexes; ++i) { + const char* index_name; + row_index_t* index = &m_indexes[i]; + + index_name = reinterpret_cast<const char*>(index->m_name); + + if (strcmp(index_name, name) == 0) { + + return(index); + } + } + + return(0); +} + +/** +Get the number of rows in the index. +@param name - index name +@return number of rows (doesn't include delete marked rows). */ +ulint +row_import::get_n_rows( + const char* name) const UNIV_NOTHROW +{ + const row_index_t* index = get_index(name); + + ut_a(name != 0); + + return(index->m_stats.m_n_rows); +} + +/** +Get the number of rows for which purge failed uding the convert phase. +@param name - index name +@return number of rows for which purge failed. */ +ulint +row_import::get_n_purge_failed( + const char* name) const UNIV_NOTHROW +{ + const row_index_t* index = get_index(name); + + ut_a(name != 0); + + return(index->m_stats.m_n_purge_failed); +} + +/** +Find the ordinal value of the column name in the cfg table columns. +@param name - of column to look for. +@return ULINT_UNDEFINED if not found. 
*/ +ulint +row_import::find_col( + const char* name) const UNIV_NOTHROW +{ + for (ulint i = 0; i < m_n_cols; ++i) { + const char* col_name; + + col_name = reinterpret_cast<const char*>(m_col_names[i]); + + if (strcmp(col_name, name) == 0) { + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/** +Find the index field entry in in the cfg indexes fields. +@name - of the index to look for +@return instance if found else 0. */ +const dict_field_t* +row_import::find_field( + const row_index_t* cfg_index, + const char* name) const UNIV_NOTHROW +{ + const dict_field_t* field = cfg_index->m_fields; + + for (ulint i = 0; i < cfg_index->m_n_fields; ++i, ++field) { + const char* field_name; + + field_name = reinterpret_cast<const char*>(field->name); + + if (strcmp(field_name, name) == 0) { + return(field); + } + } + + return(0); +} + +/** +Check if the index schema that was read from the .cfg file matches the +in memory index definition. +@return DB_SUCCESS or error code. */ +dberr_t +row_import::match_index_columns( + THD* thd, + const dict_index_t* index) UNIV_NOTHROW +{ + row_index_t* cfg_index; + dberr_t err = DB_SUCCESS; + + cfg_index = get_index(index->name); + + if (cfg_index == 0) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Index %s not found in tablespace meta-data file.", + index->name); + + return(DB_ERROR); + } + + cfg_index->m_srv_index = index; + + const dict_field_t* field = index->fields; + + for (ulint i = 0; i < index->n_fields; ++i, ++field) { + + const dict_field_t* cfg_field; + + cfg_field = find_field(cfg_index, field->name); + + if (cfg_field == 0) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Index %s field %s not found in tablespace " + "meta-data file.", + index->name, field->name); + + err = DB_ERROR; + } else { + + if (cfg_field->prefix_len != field->prefix_len) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Index %s field %s prefix len %lu " + "doesn't match meta-data file 
value " + "%lu", + index->name, field->name, + (ulong) field->prefix_len, + (ulong) cfg_field->prefix_len); + + err = DB_ERROR; + } + + if (cfg_field->fixed_len != field->fixed_len) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Index %s field %s fixed len %lu " + "doesn't match meta-data file value " + "%lu", + index->name, field->name, + (ulong) field->fixed_len, + (ulong) cfg_field->fixed_len); + + err = DB_ERROR; + } + } + } + + return(err); +} + +/** +Check if the table schema that was read from the .cfg file matches the +in memory table definition. +@param thd - MySQL session variable +@return DB_SUCCESS or error code. */ +dberr_t +row_import::match_table_columns( + THD* thd) UNIV_NOTHROW +{ + dberr_t err = DB_SUCCESS; + const dict_col_t* col = m_table->cols; + + for (ulint i = 0; i < m_table->n_cols; ++i, ++col) { + + const char* col_name; + ulint cfg_col_index; + + col_name = dict_table_get_col_name( + m_table, dict_col_get_no(col)); + + cfg_col_index = find_col(col_name); + + if (cfg_col_index == ULINT_UNDEFINED) { + + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s not found in tablespace.", + col_name); + + err = DB_ERROR; + } else if (cfg_col_index != col->ind) { + + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s ordinal value mismatch, it's at " + "%lu in the table and %lu in the tablespace " + "meta-data file", + col_name, + (ulong) col->ind, (ulong) cfg_col_index); + + err = DB_ERROR; + } else { + const dict_col_t* cfg_col; + + cfg_col = &m_cols[cfg_col_index]; + ut_a(cfg_col->ind == cfg_col_index); + + if (cfg_col->prtype != col->prtype) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s precise type mismatch.", + col_name); + err = DB_ERROR; + } + + if (cfg_col->mtype != col->mtype) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s main type mismatch.", + col_name); + err = DB_ERROR; + } + + if (cfg_col->len != 
col->len) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s length mismatch.", + col_name); + err = DB_ERROR; + } + + if (cfg_col->mbminmaxlen != col->mbminmaxlen) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s multi-byte len mismatch.", + col_name); + err = DB_ERROR; + } + + if (cfg_col->ind != col->ind) { + err = DB_ERROR; + } + + if (cfg_col->ord_part != col->ord_part) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s ordering mismatch.", + col_name); + err = DB_ERROR; + } + + if (cfg_col->max_prefix != col->max_prefix) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s max prefix mismatch.", + col_name); + err = DB_ERROR; + } + } + } + + return(err); +} + +/** +Check if the table (and index) schema that was read from the .cfg file +matches the in memory table definition. +@param thd - MySQL session variable +@return DB_SUCCESS or error code. */ +dberr_t +row_import::match_schema( + THD* thd) UNIV_NOTHROW +{ + /* Do some simple checks. */ + + if (m_flags != m_table->flags) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH, + "Table flags don't match, server table has 0x%lx " + "and the meta-data file has 0x%lx", + (ulong) m_table->n_cols, (ulong) m_flags); + + return(DB_ERROR); + } else if (m_table->n_cols != m_n_cols) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH, + "Number of columns don't match, table has %lu " + "columns but the tablespace meta-data file has " + "%lu columns", + (ulong) m_table->n_cols, (ulong) m_n_cols); + + return(DB_ERROR); + } else if (UT_LIST_GET_LEN(m_table->indexes) != m_n_indexes) { + + /* If the number of indexes don't match then it is better + to abort the IMPORT. It is easy for the user to create a + table matching the IMPORT definition. 
*/ + + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH, + "Number of indexes don't match, table has %lu " + "indexes but the tablespace meta-data file has " + "%lu indexes", + (ulong) UT_LIST_GET_LEN(m_table->indexes), + (ulong) m_n_indexes); + + return(DB_ERROR); + } + + dberr_t err = match_table_columns(thd); + + if (err != DB_SUCCESS) { + return(err); + } + + /* Check if the index definitions match. */ + + const dict_index_t* index; + + for (index = UT_LIST_GET_FIRST(m_table->indexes); + index != 0; + index = UT_LIST_GET_NEXT(indexes, index)) { + + dberr_t index_err; + + index_err = match_index_columns(thd, index); + + if (index_err != DB_SUCCESS) { + err = index_err; + } + } + + return(err); +} + +/** +Set the index root <space, pageno>, using index name. */ +void +row_import::set_root_by_name() UNIV_NOTHROW +{ + row_index_t* cfg_index = m_indexes; + + for (ulint i = 0; i < m_n_indexes; ++i, ++cfg_index) { + dict_index_t* index; + + const char* index_name; + + index_name = reinterpret_cast<const char*>(cfg_index->m_name); + + index = dict_table_get_index_on_name(m_table, index_name); + + /* We've already checked that it exists. */ + ut_a(index != 0); + + /* Set the root page number and space id. */ + index->space = m_table->space; + index->page = cfg_index->m_page_no; + } +} + +/** +Set the index root <space, pageno>, using a heuristic. 
+@return DB_SUCCESS or error code */ +dberr_t +row_import::set_root_by_heuristic() UNIV_NOTHROW +{ + row_index_t* cfg_index = m_indexes; + + ut_a(m_n_indexes > 0); + + // TODO: For now use brute force, based on ordinality + + if (UT_LIST_GET_LEN(m_table->indexes) != m_n_indexes) { + + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), m_table->name, FALSE); + + ib_logf(IB_LOG_LEVEL_WARN, + "Table %s should have %lu indexes but the tablespace " + "has %lu indexes", + table_name, + UT_LIST_GET_LEN(m_table->indexes), + m_n_indexes); + } + + dict_mutex_enter_for_mysql(); + + ulint i = 0; + dberr_t err = DB_SUCCESS; + + for (dict_index_t* index = UT_LIST_GET_FIRST(m_table->indexes); + index != 0; + index = UT_LIST_GET_NEXT(indexes, index)) { + + if (index->type & DICT_FTS) { + index->type |= DICT_CORRUPT; + ib_logf(IB_LOG_LEVEL_WARN, + "Skipping FTS index: %s", index->name); + } else if (i < m_n_indexes) { + + delete [] cfg_index[i].m_name; + + ulint len = strlen(index->name) + 1; + + cfg_index[i].m_name = new(std::nothrow) byte[len]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_14", + delete[] cfg_index[i].m_name; + cfg_index[i].m_name = 0;); + + if (cfg_index[i].m_name == 0) { + err = DB_OUT_OF_MEMORY; + break; + } + + memcpy(cfg_index[i].m_name, index->name, len); + + cfg_index[i].m_srv_index = index; + + index->space = m_table->space; + index->page = cfg_index[i].m_page_no; + + ++i; + } + } + + dict_mutex_exit_for_mysql(); + + return(err); +} + +/** +Purge delete marked records. +@return DB_SUCCESS or error code. */ +dberr_t +IndexPurge::garbage_collect() UNIV_NOTHROW +{ + dberr_t err; + ibool comp = dict_table_is_comp(m_index->table); + + /* Open the persistent cursor and start the mini-transaction. 
*/ + + open(); + + while ((err = next()) == DB_SUCCESS) { + + rec_t* rec = btr_pcur_get_rec(&m_pcur); + ibool deleted = rec_get_deleted_flag(rec, comp); + + if (!deleted) { + ++m_n_rows; + } else { + purge(); + } + } + + /* Close the persistent cursor and commit the mini-transaction. */ + + close(); + + return(err == DB_END_OF_INDEX ? DB_SUCCESS : err); +} + +/** +Begin import, position the cursor on the first record. */ +void +IndexPurge::open() UNIV_NOTHROW +{ + mtr_start(&m_mtr); + + mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO); + + btr_pcur_open_at_index_side( + true, m_index, BTR_MODIFY_LEAF, &m_pcur, true, 0, &m_mtr); +} + +/** +Close the persistent curosr and commit the mini-transaction. */ +void +IndexPurge::close() UNIV_NOTHROW +{ + btr_pcur_close(&m_pcur); + mtr_commit(&m_mtr); +} + +/** +Position the cursor on the next record. +@return DB_SUCCESS or error code */ +dberr_t +IndexPurge::next() UNIV_NOTHROW +{ + btr_pcur_move_to_next_on_page(&m_pcur); + + /* When switching pages, commit the mini-transaction + in order to release the latch on the old page. */ + + if (!btr_pcur_is_after_last_on_page(&m_pcur)) { + return(DB_SUCCESS); + } else if (trx_is_interrupted(m_trx)) { + /* Check after every page because the check + is expensive. */ + return(DB_INTERRUPTED); + } + + btr_pcur_store_position(&m_pcur, &m_mtr); + + mtr_commit(&m_mtr); + + mtr_start(&m_mtr); + + mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO); + + btr_pcur_restore_position(BTR_MODIFY_LEAF, &m_pcur, &m_mtr); + + if (!btr_pcur_move_to_next_user_rec(&m_pcur, &m_mtr)) { + + return(DB_END_OF_INDEX); + } + + return(DB_SUCCESS); +} + +/** +Store the persistent cursor position and reopen the +B-tree cursor in BTR_MODIFY_TREE mode, because the +tree structure may be changed during a pessimistic delete. 
*/ +void +IndexPurge::purge_pessimistic_delete() UNIV_NOTHROW +{ + dberr_t err; + + btr_pcur_restore_position(BTR_MODIFY_TREE, &m_pcur, &m_mtr); + + ut_ad(rec_get_deleted_flag( + btr_pcur_get_rec(&m_pcur), + dict_table_is_comp(m_index->table))); + + btr_cur_pessimistic_delete( + &err, FALSE, btr_pcur_get_btr_cur(&m_pcur), 0, RB_NONE, &m_mtr); + + ut_a(err == DB_SUCCESS); + + /* Reopen the B-tree cursor in BTR_MODIFY_LEAF mode */ + mtr_commit(&m_mtr); +} + +/** +Purge delete-marked records. */ +void +IndexPurge::purge() UNIV_NOTHROW +{ + btr_pcur_store_position(&m_pcur, &m_mtr); + + purge_pessimistic_delete(); + + mtr_start(&m_mtr); + + mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO); + + btr_pcur_restore_position(BTR_MODIFY_LEAF, &m_pcur, &m_mtr); +} + +/** +Constructor +* @param cfg - config of table being imported. +* @param trx - transaction covering the import */ +PageConverter::PageConverter( + row_import* cfg, + trx_t* trx) + : + AbstractCallback(trx), + m_cfg(cfg), + m_page_zip_ptr(0), + m_heap(0) UNIV_NOTHROW +{ + m_index = m_cfg->m_indexes; + + m_current_lsn = log_get_lsn(); + ut_a(m_current_lsn > 0); + + m_offsets = m_offsets_; + rec_offs_init(m_offsets_); + + m_cluster_index = dict_table_get_first_index(m_cfg->m_table); +} + +/** +Adjust the BLOB reference for a single column that is externally stored +@param rec - record to update +@param offsets - column offsets for the record +@param i - column ordinal value +@return DB_SUCCESS or error code */ +dberr_t +PageConverter::adjust_cluster_index_blob_column( + rec_t* rec, + const ulint* offsets, + ulint i) UNIV_NOTHROW +{ + ulint len; + byte* field; + + field = rec_get_nth_field(rec, offsets, i, &len); + + DBUG_EXECUTE_IF("ib_import_trigger_corruption_2", + len = BTR_EXTERN_FIELD_REF_SIZE - 1;); + + if (len < BTR_EXTERN_FIELD_REF_SIZE) { + + char index_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + index_name, sizeof(index_name), + m_cluster_index->name, TRUE); + + ib_errf(m_trx->mysql_thd, 
IB_LOG_LEVEL_ERROR, + ER_INNODB_INDEX_CORRUPT, + "Externally stored column(%lu) has a reference " + "length of %lu in the cluster index %s", + (ulong) i, (ulong) len, index_name); + + return(DB_CORRUPTION); + } + + field += BTR_EXTERN_SPACE_ID - BTR_EXTERN_FIELD_REF_SIZE + len; + + if (is_compressed_table()) { + mach_write_to_4(field, get_space_id()); + + page_zip_write_blob_ptr( + m_page_zip_ptr, rec, m_cluster_index, offsets, i, 0); + } else { + mlog_write_ulint(field, get_space_id(), MLOG_4BYTES, 0); + } + + return(DB_SUCCESS); +} + +/** +Adjusts the BLOB reference in the clustered index row for all externally +stored columns. +@param rec - record to update +@param offsets - column offsets for the record +@return DB_SUCCESS or error code */ +dberr_t +PageConverter::adjust_cluster_index_blob_columns( + rec_t* rec, + const ulint* offsets) UNIV_NOTHROW +{ + ut_ad(rec_offs_any_extern(offsets)); + + /* Adjust the space_id in the BLOB pointers. */ + + for (ulint i = 0; i < rec_offs_n_fields(offsets); ++i) { + + /* Only if the column is stored "externally". */ + + if (rec_offs_nth_extern(offsets, i)) { + dberr_t err; + + err = adjust_cluster_index_blob_column(rec, offsets, i); + + if (err != DB_SUCCESS) { + return(err); + } + } + } + + return(DB_SUCCESS); +} + +/** +In the clustered index, adjust BLOB pointers as needed. Also update the +BLOB reference, write the new space id. +@param rec - record to update +@param offsets - column offsets for the record +@return DB_SUCCESS or error code */ +dberr_t +PageConverter::adjust_cluster_index_blob_ref( + rec_t* rec, + const ulint* offsets) UNIV_NOTHROW +{ + if (rec_offs_any_extern(offsets)) { + dberr_t err; + + err = adjust_cluster_index_blob_columns(rec, offsets); + + if (err != DB_SUCCESS) { + return(err); + } + } + + return(DB_SUCCESS); +} + +/** +Purge delete-marked records, only if it is possible to do so without +re-organising the B+tree. +@param offsets - current row offsets. 
+@return true if purge succeeded */ +bool +PageConverter::purge(const ulint* offsets) UNIV_NOTHROW +{ + const dict_index_t* index = m_index->m_srv_index; + + /* We can't have a page that is empty and not root. */ + if (m_rec_iter.remove(index, m_page_zip_ptr, m_offsets)) { + + ++m_index->m_stats.m_n_purged; + + return(true); + } else { + ++m_index->m_stats.m_n_purge_failed; + } + + return(false); +} + +/** +Adjust the BLOB references and sys fields for the current record. +@param rec - record to update +@param offsets - column offsets for the record +@param deleted - true if row is delete marked +@return DB_SUCCESS or error code. */ +dberr_t +PageConverter::adjust_cluster_record( + const dict_index_t* index, + rec_t* rec, + const ulint* offsets, + bool deleted) UNIV_NOTHROW +{ + dberr_t err; + + if ((err = adjust_cluster_index_blob_ref(rec, offsets)) == DB_SUCCESS) { + + /* Reset DB_TRX_ID and DB_ROLL_PTR. Normally, these fields + are only written in conjunction with other changes to the + record. */ + + row_upd_rec_sys_fields( + rec, m_page_zip_ptr, m_cluster_index, m_offsets, + m_trx, 0); + } + + return(err); +} + +/** +Update the BLOB refrences and write UNDO log entries for +rows that can't be purged optimistically. +@param block - block to update +@retval DB_SUCCESS or error code */ +dberr_t +PageConverter::update_records( + buf_block_t* block) UNIV_NOTHROW +{ + ibool comp = dict_table_is_comp(m_cfg->m_table); + bool clust_index = m_index->m_srv_index == m_cluster_index; + + /* This will also position the cursor on the first user record. */ + + m_rec_iter.open(block); + + while (!m_rec_iter.end()) { + + rec_t* rec = m_rec_iter.current(); + + /* FIXME: Move out of the loop */ + + if (rec_get_status(rec) == REC_STATUS_NODE_PTR) { + break; + } + + ibool deleted = rec_get_deleted_flag(rec, comp); + + /* For the clustered index we have to adjust the BLOB + reference and the system fields irrespective of the + delete marked flag. 
The adjustment of delete marked + cluster records is required for purge to work later. */ + + if (deleted || clust_index) { + m_offsets = rec_get_offsets( + rec, m_index->m_srv_index, m_offsets, + ULINT_UNDEFINED, &m_heap); + } + + if (clust_index) { + + dberr_t err = adjust_cluster_record( + m_index->m_srv_index, rec, m_offsets, + deleted); + + if (err != DB_SUCCESS) { + return(err); + } + } + + /* If it is a delete marked record then try an + optimistic delete. */ + + if (deleted) { + /* A successful purge will move the cursor to the + next record. */ + + if (!purge(m_offsets)) { + m_rec_iter.next(); + } + + ++m_index->m_stats.m_n_deleted; + } else { + ++m_index->m_stats.m_n_rows; + m_rec_iter.next(); + } + } + + return(DB_SUCCESS); +} + +/** +Update the space, index id, trx id. +@return DB_SUCCESS or error code */ +dberr_t +PageConverter::update_index_page( + buf_block_t* block) UNIV_NOTHROW +{ + index_id_t id; + buf_frame_t* page = block->frame; + + if (is_free(buf_block_get_page_no(block))) { + return(DB_SUCCESS); + } else if ((id = btr_page_get_index_id(page)) != m_index->m_id) { + + row_index_t* index = find_index(id); + + if (index == 0) { + m_index = 0; + return(DB_CORRUPTION); + } + + /* Update current index */ + m_index = index; + } + + /* If the .cfg file is missing and there is an index mismatch + then ignore the error. */ + if (m_cfg->m_missing && (m_index == 0 || m_index->m_srv_index == 0)) { + return(DB_SUCCESS); + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(!is_compressed_table() + || page_zip_validate(m_page_zip_ptr, page, m_index->m_srv_index)); +#endif /* UNIV_ZIP_DEBUG */ + + /* This has to be written to uncompressed index header. Set it to + the current index id. */ + btr_page_set_index_id( + page, m_page_zip_ptr, m_index->m_srv_index->id, 0); + + page_set_max_trx_id(block, m_page_zip_ptr, m_trx->id, 0); + + if (page_get_n_recs(block->frame) == 0) { + + /* Only a root page can be empty. 
*/ + if (!is_root_page(block->frame)) { + // TODO: We should relax this and skip secondary + // indexes. Mark them as corrupt because they can + // always be rebuilt. + return(DB_CORRUPTION); + } + + return(DB_SUCCESS); + } + + return(update_records(block)); +} + +/** +Validate the space flags and update tablespace header page. +@param block - block read from file, not from the buffer pool. +@retval DB_SUCCESS or error code */ +dberr_t +PageConverter::update_header( + buf_block_t* block) UNIV_NOTHROW +{ + /* Check for valid header */ + switch(fsp_header_get_space_id(get_frame(block))) { + case 0: + return(DB_CORRUPTION); + case ULINT_UNDEFINED: + ib_logf(IB_LOG_LEVEL_WARN, + "Space id check in the header failed " + "- ignored"); + } + + ulint space_flags = fsp_header_get_flags(get_frame(block)); + + if (!fsp_flags_is_valid(space_flags)) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unsupported tablespace format %lu", + (ulong) space_flags); + + return(DB_UNSUPPORTED); + } + + mach_write_to_8( + get_frame(block) + FIL_PAGE_FILE_FLUSH_LSN, m_current_lsn); + + /* Write space_id to the tablespace header, page 0. */ + mach_write_to_4( + get_frame(block) + FSP_HEADER_OFFSET + FSP_SPACE_ID, + get_space_id()); + + /* This is on every page in the tablespace. */ + mach_write_to_4( + get_frame(block) + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, + get_space_id()); + + return(DB_SUCCESS); +} + +/** +Update the page, set the space id, max trx id and index id. +@param block - block read from file +@retval DB_SUCCESS or error code */ +dberr_t +PageConverter::update_page( + buf_block_t* block, + ulint& page_type) UNIV_NOTHROW +{ + dberr_t err = DB_SUCCESS; + + switch (page_type = fil_page_get_type(get_frame(block))) { + case FIL_PAGE_TYPE_FSP_HDR: + /* Work directly on the uncompressed page headers. 
*/ + ut_a(buf_block_get_page_no(block) == 0); + return(update_header(block)); + + case FIL_PAGE_INDEX: + /* We need to decompress the contents into block->frame + before we can do any thing with Btree pages. */ + + if (is_compressed_table() && !buf_zip_decompress(block, TRUE)) { + return(DB_CORRUPTION); + } + + /* This is on every page in the tablespace. */ + mach_write_to_4( + get_frame(block) + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, get_space_id()); + + /* Only update the Btree nodes. */ + return(update_index_page(block)); + + case FIL_PAGE_TYPE_SYS: + /* This is page 0 in the system tablespace. */ + return(DB_CORRUPTION); + + case FIL_PAGE_TYPE_XDES: + err = set_current_xdes( + buf_block_get_page_no(block), get_frame(block)); + case FIL_PAGE_INODE: + case FIL_PAGE_TYPE_TRX_SYS: + case FIL_PAGE_IBUF_FREE_LIST: + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_IBUF_BITMAP: + case FIL_PAGE_TYPE_BLOB: + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + + /* Work directly on the uncompressed page headers. */ + /* This is on every page in the tablespace. */ + mach_write_to_4( + get_frame(block) + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, get_space_id()); + + return(err); + } + + ib_logf(IB_LOG_LEVEL_WARN, "Unknown page type (%lu)", page_type); + + return(DB_CORRUPTION); +} + +/** +Validate the page +@param offset - physical offset within file. +@param page - page read from file. +@return status */ +PageConverter::import_page_status_t +PageConverter::validate( + os_offset_t offset, + buf_block_t* block) UNIV_NOTHROW +{ + buf_frame_t* page = get_frame(block); + + /* Check that the page number corresponds to the offset in + the file. Flag as corrupt if it doesn't. 
Disable the check + for LSN in buf_page_is_corrupted() */ + + if (buf_page_is_corrupted(false, page, get_zip_size()) + || (page_get_page_no(page) != offset / m_page_size + && page_get_page_no(page) != 0)) { + + return(IMPORT_PAGE_STATUS_CORRUPTED); + + } else if (offset > 0 && page_get_page_no(page) == 0) { + const byte* b = page; + const byte* e = b + m_page_size; + + /* If the page number is zero and offset > 0 then + the entire page MUST consist of zeroes. If not then + we flag it as corrupt. */ + + while (b != e) { + + if (*b++ && !trigger_corruption()) { + return(IMPORT_PAGE_STATUS_CORRUPTED); + } + } + + /* The page is all zero: do nothing. */ + return(IMPORT_PAGE_STATUS_ALL_ZERO); + } + + return(IMPORT_PAGE_STATUS_OK); +} + +/** +Called for every page in the tablespace. If the page was not +updated then its state must be set to BUF_PAGE_NOT_USED. +@param offset - physical offset within the file +@param block - block read from file, note it is not from the buffer pool +@retval DB_SUCCESS or error code. */ +dberr_t +PageConverter::operator() ( + os_offset_t offset, + buf_block_t* block) UNIV_NOTHROW +{ + ulint page_type; + dberr_t err = DB_SUCCESS; + + if ((err = periodic_check()) != DB_SUCCESS) { + return(err); + } + + if (is_compressed_table()) { + m_page_zip_ptr = &block->page.zip; + } else { + ut_ad(m_page_zip_ptr == 0); + } + + switch(validate(offset, block)) { + case IMPORT_PAGE_STATUS_OK: + + /* We have to decompress the compressed pages before + we can work on them */ + + if ((err = update_page(block, page_type)) != DB_SUCCESS) { + return(err); + } + + /* Note: For compressed pages this function will write to the + zip descriptor and for uncompressed pages it will write to + page (ie. the block->frame). Therefore the caller should write + out the descriptor contents and not block->frame for compressed + pages. */ + + if (!is_compressed_table() || page_type == FIL_PAGE_INDEX) { + + buf_flush_init_for_writing( + !is_compressed_table() + ? 
block->frame : block->page.zip.data, + !is_compressed_table() ? 0 : m_page_zip_ptr, + m_current_lsn); + } else { + /* Calculate and update the checksum of non-btree + pages for compressed tables explicitly here. */ + + buf_flush_update_zip_checksum( + get_frame(block), get_zip_size(), + m_current_lsn); + } + + break; + + case IMPORT_PAGE_STATUS_ALL_ZERO: + /* The page is all zero: leave it as is. */ + break; + + case IMPORT_PAGE_STATUS_CORRUPTED: + + ib_logf(IB_LOG_LEVEL_WARN, + "%s: Page %lu at offset " UINT64PF " looks corrupted.", + m_filepath, (ulong) (offset / m_page_size), offset); + + return(DB_CORRUPTION); + } + + return(err); +} + +/*****************************************************************//** +Clean up after import tablespace failure, this function will acquire +the dictionary latches on behalf of the transaction if the transaction +hasn't already acquired them. */ +static __attribute__((nonnull)) +void +row_import_discard_changes( +/*=======================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */ + trx_t* trx, /*!< in/out: transaction for import */ + dberr_t err) /*!< in: error code */ +{ + dict_table_t* table = prebuilt->table; + + ut_a(err != DB_SUCCESS); + + prebuilt->trx->error_info = NULL; + + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), + prebuilt->table->name, FALSE); + + ib_logf(IB_LOG_LEVEL_INFO, + "Discarding tablespace of table %s: %s", + table_name, ut_strerr(err)); + + if (trx->dict_operation_lock_mode != RW_X_LATCH) { + ut_a(trx->dict_operation_lock_mode == 0); + row_mysql_lock_data_dictionary(trx); + } + + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + + /* Since we update the index root page numbers on disk after + we've done a successful import. The table will not be loadable. + However, we need to ensure that the in memory root page numbers + are reset to "NULL". 
*/ + + for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index != 0; + index = UT_LIST_GET_NEXT(indexes, index)) { + + index->page = FIL_NULL; + index->space = FIL_NULL; + } + + table->ibd_file_missing = TRUE; + + fil_close_tablespace(trx, table->space); +} + +/*****************************************************************//** +Clean up after import tablespace. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_cleanup( +/*===============*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */ + trx_t* trx, /*!< in/out: transaction for import */ + dberr_t err) /*!< in: error code */ +{ + ut_a(prebuilt->trx != trx); + + if (err != DB_SUCCESS) { + row_import_discard_changes(prebuilt, trx, err); + } + + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + + DBUG_EXECUTE_IF("ib_import_before_commit_crash", DBUG_SUICIDE();); + + trx_commit_for_mysql(trx); + + row_mysql_unlock_data_dictionary(trx); + + trx_free_for_mysql(trx); + + prebuilt->trx->op_info = ""; + + DBUG_EXECUTE_IF("ib_import_before_checkpoint_crash", DBUG_SUICIDE();); + + log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE); + + return(err); +} + +/*****************************************************************//** +Report error during tablespace import. 
*/ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_error( +/*=============*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */ + trx_t* trx, /*!< in/out: transaction for import */ + dberr_t err) /*!< in: error code */ +{ + if (!trx_is_interrupted(trx)) { + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), + prebuilt->table->name, FALSE); + + ib_senderrf( + trx->mysql_thd, IB_LOG_LEVEL_WARN, + ER_INNODB_IMPORT_ERROR, + table_name, (ulong) err, ut_strerr(err)); + } + + return(row_import_cleanup(prebuilt, trx, err)); +} + +/*****************************************************************//** +Adjust the root page index node and leaf node segment headers, update +with the new space id. For all the table's secondary indexes. +@return error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_adjust_root_pages_of_secondary_indexes( +/*==============================================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from + handler */ + trx_t* trx, /*!< in: transaction used for + the import */ + dict_table_t* table, /*!< in: table the indexes + belong to */ + const row_import& cfg) /*!< Import context */ +{ + dict_index_t* index; + ulint n_rows_in_table; + dberr_t err = DB_SUCCESS; + + /* Skip the clustered index. */ + index = dict_table_get_first_index(table); + + n_rows_in_table = cfg.get_n_rows(index->name); + + DBUG_EXECUTE_IF("ib_import_sec_rec_count_mismatch_failure", + n_rows_in_table++;); + + /* Adjust the root pages of the secondary indexes only. 
*/ + while ((index = dict_table_get_next_index(index)) != NULL) { + char index_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + index_name, sizeof(index_name), index->name, TRUE); + + ut_a(!dict_index_is_clust(index)); + + if (!(index->type & DICT_CORRUPT) + && index->space != FIL_NULL + && index->page != FIL_NULL) { + + /* Update the Btree segment headers for index node and + leaf nodes in the root page. Set the new space id. */ + + err = btr_root_adjust_on_import(index); + } else { + ib_logf(IB_LOG_LEVEL_WARN, + "Skip adjustment of root pages for " + "index %s.", index->name); + + err = DB_CORRUPTION; + } + + if (err != DB_SUCCESS) { + + if (index->type & DICT_CLUSTERED) { + break; + } + + ib_errf(trx->mysql_thd, + IB_LOG_LEVEL_WARN, + ER_INNODB_INDEX_CORRUPT, + "Index '%s' not found or corrupt, " + "you should recreate this index.", + index_name); + + /* Do not bail out, so that the data + can be recovered. */ + + err = DB_SUCCESS; + index->type |= DICT_CORRUPT; + continue; + } + + /* If we failed to purge any records in the index then + do it the hard way. + + TODO: We can do this in the first pass by generating UNDO log + records for the failed rows. */ + + if (!cfg.requires_purge(index->name)) { + continue; + } + + IndexPurge purge(trx, index); + + trx->op_info = "secondary: purge delete marked records"; + + err = purge.garbage_collect(); + + trx->op_info = ""; + + if (err != DB_SUCCESS) { + break; + } else if (purge.get_n_rows() != n_rows_in_table) { + + ib_errf(trx->mysql_thd, + IB_LOG_LEVEL_WARN, + ER_INNODB_INDEX_CORRUPT, + "Index '%s' contains %lu entries, " + "should be %lu, you should recreate " + "this index.", index_name, + (ulong) purge.get_n_rows(), + (ulong) n_rows_in_table); + + index->type |= DICT_CORRUPT; + + /* Do not bail out, so that the data + can be recovered. 
*/ + + err = DB_SUCCESS; + } + } + + return(err); +} + +/*****************************************************************//** +Ensure that dict_sys->row_id exceeds SELECT MAX(DB_ROW_ID). +@return error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_set_sys_max_row_id( +/*==========================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from + handler */ + const dict_table_t* table) /*!< in: table to import */ +{ + dberr_t err; + const rec_t* rec; + mtr_t mtr; + btr_pcur_t pcur; + row_id_t row_id = 0; + dict_index_t* index; + + index = dict_table_get_first_index(table); + ut_a(dict_index_is_clust(index)); + + mtr_start(&mtr); + + mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); + + btr_pcur_open_at_index_side( + false, // High end + index, + BTR_SEARCH_LEAF, + &pcur, + true, // Init cursor + 0, // Leaf level + &mtr); + + btr_pcur_move_to_prev_on_page(&pcur); + rec = btr_pcur_get_rec(&pcur); + + /* Check for empty table. */ + if (!page_rec_is_infimum(rec)) { + ulint len; + const byte* field; + mem_heap_t* heap = NULL; + ulint offsets_[1 + REC_OFFS_HEADER_SIZE]; + ulint* offsets; + + rec_offs_init(offsets_); + + offsets = rec_get_offsets( + rec, index, offsets_, ULINT_UNDEFINED, &heap); + + field = rec_get_nth_field( + rec, offsets, + dict_index_get_sys_col_pos(index, DATA_ROW_ID), + &len); + + if (len == DATA_ROW_ID_LEN) { + row_id = mach_read_from_6(field); + err = DB_SUCCESS; + } else { + err = DB_CORRUPTION; + } + + if (heap != NULL) { + mem_heap_free(heap); + } + } else { + /* The table is empty. 
*/ + err = DB_SUCCESS; + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + DBUG_EXECUTE_IF("ib_import_set_max_rowid_failure", + err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + char index_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + index_name, sizeof(index_name), index->name, TRUE); + + ib_errf(prebuilt->trx->mysql_thd, + IB_LOG_LEVEL_WARN, + ER_INNODB_INDEX_CORRUPT, + "Index '%s' corruption detected, invalid DB_ROW_ID " + "in index.", index_name); + + return(err); + + } else if (row_id > 0) { + + /* Update the system row id if the imported index row id is + greater than the max system row id. */ + + mutex_enter(&dict_sys->mutex); + + if (row_id >= dict_sys->row_id) { + dict_sys->row_id = row_id + 1; + dict_hdr_flush_row_id(); + } + + mutex_exit(&dict_sys->mutex); + } + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Read the a string from the meta data file. +@return DB_SUCCESS or error code. */ +static +dberr_t +row_import_cfg_read_string( +/*=======================*/ + FILE* file, /*!< in/out: File to read from */ + byte* ptr, /*!< out: string to read */ + ulint max_len) /*!< in: maximum length of the output + buffer in bytes */ +{ + DBUG_EXECUTE_IF("ib_import_string_read_error", + errno = EINVAL; return(DB_IO_ERROR);); + + ulint len = 0; + + while (!feof(file)) { + int ch = fgetc(file); + + if (ch == EOF) { + break; + } else if (ch != 0) { + if (len < max_len) { + ptr[len++] = ch; + } else { + break; + } + /* max_len includes the NUL byte */ + } else if (len != max_len - 1) { + break; + } else { + ptr[len] = 0; + return(DB_SUCCESS); + } + } + + errno = EINVAL; + + return(DB_IO_ERROR); +} + +/*********************************************************************//** +Write the meta data (index user fields) config file. +@return DB_SUCCESS or error code. 
*/ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_cfg_read_index_fields( +/*=============================*/ + FILE* file, /*!< in: file to write to */ + THD* thd, /*!< in/out: session */ + row_index_t* index, /*!< Index being read in */ + row_import* cfg) /*!< in/out: meta-data read */ +{ + byte row[sizeof(ib_uint32_t) * 3]; + ulint n_fields = index->m_n_fields; + + index->m_fields = new(std::nothrow) dict_field_t[n_fields]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_4", + delete [] index->m_fields; index->m_fields = 0;); + + if (index->m_fields == 0) { + return(DB_OUT_OF_MEMORY); + } + + dict_field_t* field = index->m_fields; + + memset(field, 0x0, sizeof(*field) * n_fields); + + for (ulint i = 0; i < n_fields; ++i, ++field) { + byte* ptr = row; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_1", + (void) fseek(file, 0L, SEEK_END);); + + if (fread(row, 1, sizeof(row), file) != sizeof(row)) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading index fields."); + + return(DB_IO_ERROR); + } + + field->prefix_len = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + field->fixed_len = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + /* Include the NUL byte in the length. 
*/ + ulint len = mach_read_from_4(ptr); + + byte* name = new(std::nothrow) byte[len]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_5", delete [] name; name = 0;); + + if (name == 0) { + return(DB_OUT_OF_MEMORY); + } + + field->name = reinterpret_cast<const char*>(name); + + dberr_t err = row_import_cfg_read_string(file, name, len); + + if (err != DB_SUCCESS) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while parsing table name."); + + return(err); + } + } + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Read the index names and root page numbers of the indexes and set the values. +Row format [root_page_no, len of str, str ... ] +@return DB_SUCCESS or error code. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_read_index_data( +/*=======================*/ + FILE* file, /*!< in: File to read from */ + THD* thd, /*!< in: session */ + row_import* cfg) /*!< in/out: meta-data read */ +{ + byte* ptr; + row_index_t* cfg_index; + byte row[sizeof(index_id_t) + sizeof(ib_uint32_t) * 9]; + + /* FIXME: What is the max value? */ + ut_a(cfg->m_n_indexes > 0); + ut_a(cfg->m_n_indexes < 1024); + + cfg->m_indexes = new(std::nothrow) row_index_t[cfg->m_n_indexes]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_6", + delete [] cfg->m_indexes; cfg->m_indexes = 0;); + + if (cfg->m_indexes == 0) { + return(DB_OUT_OF_MEMORY); + } + + memset(cfg->m_indexes, 0x0, sizeof(*cfg->m_indexes) * cfg->m_n_indexes); + + cfg_index = cfg->m_indexes; + + for (ulint i = 0; i < cfg->m_n_indexes; ++i, ++cfg_index) { + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_2", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the index data. 
*/ + size_t n_bytes = fread(row, 1, sizeof(row), file); + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error", + (void) fseek(file, 0L, SEEK_END);); + + if (n_bytes != sizeof(row)) { + char msg[BUFSIZ]; + + ut_snprintf(msg, sizeof(msg), + "while reading index meta-data, expected " + "to read %lu bytes but read only %lu " + "bytes", + (ulong) sizeof(row), (ulong) n_bytes); + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), msg); + + ib_logf(IB_LOG_LEVEL_ERROR, "IO Error: %s", msg); + + return(DB_IO_ERROR); + } + + ptr = row; + + cfg_index->m_id = mach_read_from_8(ptr); + ptr += sizeof(index_id_t); + + cfg_index->m_space = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_page_no = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_type = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_trx_id_offset = mach_read_from_4(ptr); + if (cfg_index->m_trx_id_offset != mach_read_from_4(ptr)) { + ut_ad(0); + /* Overflow. Pretend that the clustered index + has a variable-length PRIMARY KEY. */ + cfg_index->m_trx_id_offset = 0; + } + ptr += sizeof(ib_uint32_t); + + cfg_index->m_n_user_defined_cols = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_n_uniq = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_n_nullable = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_n_fields = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + /* The NUL byte is included in the name length. 
*/ + ulint len = mach_read_from_4(ptr); + + if (len > OS_FILE_MAX_PATH) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_INNODB_INDEX_CORRUPT, + "Index name length (%lu) is too long, " + "the meta-data is corrupt", len); + + return(DB_CORRUPTION); + } + + cfg_index->m_name = new(std::nothrow) byte[len]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_7", + delete [] cfg_index->m_name; + cfg_index->m_name = 0;); + + if (cfg_index->m_name == 0) { + return(DB_OUT_OF_MEMORY); + } + + dberr_t err; + + err = row_import_cfg_read_string(file, cfg_index->m_name, len); + + if (err != DB_SUCCESS) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while parsing index name."); + + return(err); + } + + err = row_import_cfg_read_index_fields( + file, thd, cfg_index, cfg); + + if (err != DB_SUCCESS) { + return(err); + } + + } + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Set the index root page number for v1 format. +@return DB_SUCCESS or error code. */ +static +dberr_t +row_import_read_indexes( +/*====================*/ + FILE* file, /*!< in: File to read from */ + THD* thd, /*!< in: session */ + row_import* cfg) /*!< in/out: meta-data read */ +{ + byte row[sizeof(ib_uint32_t)]; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_3", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the number of indexes. */ + if (fread(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading number of indexes."); + + return(DB_IO_ERROR); + } + + cfg->m_n_indexes = mach_read_from_4(row); + + if (cfg->m_n_indexes == 0) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + "Number of indexes in meta-data file is 0"); + + return(DB_CORRUPTION); + + } else if (cfg->m_n_indexes > 1024) { + // FIXME: What is the upper limit? 
*/ + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + "Number of indexes in meta-data file is too high: %lu", + (ulong) cfg->m_n_indexes); + cfg->m_n_indexes = 0; + + return(DB_CORRUPTION); + } + + return(row_import_read_index_data(file, thd, cfg)); +} + +/*********************************************************************//** +Read the meta data (table columns) config file. Deserialise the contents of +dict_col_t structure, along with the column name. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_read_columns( +/*====================*/ + FILE* file, /*!< in: file to write to */ + THD* thd, /*!< in/out: session */ + row_import* cfg) /*!< in/out: meta-data read */ +{ + dict_col_t* col; + byte row[sizeof(ib_uint32_t) * 8]; + + /* FIXME: What should the upper limit be? */ + ut_a(cfg->m_n_cols > 0); + ut_a(cfg->m_n_cols < 1024); + + cfg->m_cols = new(std::nothrow) dict_col_t[cfg->m_n_cols]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_8", + delete [] cfg->m_cols; cfg->m_cols = 0;); + + if (cfg->m_cols == 0) { + return(DB_OUT_OF_MEMORY); + } + + cfg->m_col_names = new(std::nothrow) byte* [cfg->m_n_cols]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_9", + delete [] cfg->m_col_names; cfg->m_col_names = 0;); + + if (cfg->m_col_names == 0) { + return(DB_OUT_OF_MEMORY); + } + + memset(cfg->m_cols, 0x0, sizeof(cfg->m_cols) * cfg->m_n_cols); + memset(cfg->m_col_names, 0x0, sizeof(cfg->m_col_names) * cfg->m_n_cols); + + col = cfg->m_cols; + + for (ulint i = 0; i < cfg->m_n_cols; ++i, ++col) { + byte* ptr = row; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_4", + (void) fseek(file, 0L, SEEK_END);); + + if (fread(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading table column meta-data."); + + return(DB_IO_ERROR); + } + + col->prtype = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + col->mtype = 
mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + col->len = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + col->mbminmaxlen = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + col->ind = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + col->ord_part = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + col->max_prefix = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + /* Read in the column name as [len, byte array]. The len + includes the NUL byte. */ + + ulint len = mach_read_from_4(ptr); + + /* FIXME: What is the maximum column name length? */ + if (len == 0 || len > 128) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_IO_READ_ERROR, + "Column name length %lu, is invalid", + (ulong) len); + + return(DB_CORRUPTION); + } + + cfg->m_col_names[i] = new(std::nothrow) byte[len]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_10", + delete [] cfg->m_col_names[i]; + cfg->m_col_names[i] = 0;); + + if (cfg->m_col_names[i] == 0) { + return(DB_OUT_OF_MEMORY); + } + + dberr_t err; + + err = row_import_cfg_read_string( + file, cfg->m_col_names[i], len); + + if (err != DB_SUCCESS) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while parsing table column name."); + + return(err); + } + } + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Read the contents of the <tablespace>.cfg file. +@return DB_SUCCESS or error code. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_read_v1( +/*===============*/ + FILE* file, /*!< in: File to read from */ + THD* thd, /*!< in: session */ + row_import* cfg) /*!< out: meta data */ +{ + byte value[sizeof(ib_uint32_t)]; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_5", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the hostname where the tablespace was exported. 
*/ + if (fread(value, 1, sizeof(value), file) != sizeof(value)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading meta-data export hostname length."); + + return(DB_IO_ERROR); + } + + ulint len = mach_read_from_4(value); + + /* NUL byte is part of name length. */ + cfg->m_hostname = new(std::nothrow) byte[len]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_1", + delete [] cfg->m_hostname; cfg->m_hostname = 0;); + + if (cfg->m_hostname == 0) { + return(DB_OUT_OF_MEMORY); + } + + dberr_t err = row_import_cfg_read_string(file, cfg->m_hostname, len); + + if (err != DB_SUCCESS) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while parsing export hostname."); + + return(err); + } + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_6", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the table name of tablespace that was exported. */ + if (fread(value, 1, sizeof(value), file) != sizeof(value)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading meta-data table name length."); + + return(DB_IO_ERROR); + } + + len = mach_read_from_4(value); + + /* NUL byte is part of name length. 
*/ + cfg->m_table_name = new(std::nothrow) byte[len]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_2", + delete [] cfg->m_table_name; cfg->m_table_name = 0;); + + if (cfg->m_table_name == 0) { + return(DB_OUT_OF_MEMORY); + } + + err = row_import_cfg_read_string(file, cfg->m_table_name, len); + + if (err != DB_SUCCESS) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while parsing table name."); + + return(err); + } + + ib_logf(IB_LOG_LEVEL_INFO, + "Importing tablespace for table '%s' that was exported " + "from host '%s'", cfg->m_table_name, cfg->m_hostname); + + byte row[sizeof(ib_uint32_t) * 3]; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_7", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the autoinc value. */ + if (fread(row, 1, sizeof(ib_uint64_t), file) != sizeof(ib_uint64_t)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading autoinc value."); + + return(DB_IO_ERROR); + } + + cfg->m_autoinc = mach_read_from_8(row); + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_8", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the tablespace page size. */ + if (fread(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading meta-data header."); + + return(DB_IO_ERROR); + } + + byte* ptr = row; + + cfg->m_page_size = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + if (cfg->m_page_size != UNIV_PAGE_SIZE) { + + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH, + "Tablespace to be imported has a different " + "page size than this server. 
Server page size " + "is %lu, whereas tablespace page size is %lu", + UNIV_PAGE_SIZE, (ulong) cfg->m_page_size); + + return(DB_ERROR); + } + + cfg->m_flags = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg->m_n_cols = mach_read_from_4(ptr); + + if (!dict_tf_is_valid(cfg->m_flags)) { + + return(DB_CORRUPTION); + + } else if ((err = row_import_read_columns(file, thd, cfg)) + != DB_SUCCESS) { + + return(err); + + } else if ((err = row_import_read_indexes(file, thd, cfg)) + != DB_SUCCESS) { + + return(err); + } + + ut_a(err == DB_SUCCESS); + return(err); +} + +/** +Read the contents of the <tablespace>.cfg file. +@return DB_SUCCESS or error code. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_read_meta_data( +/*======================*/ + dict_table_t* table, /*!< in: table */ + FILE* file, /*!< in: File to read from */ + THD* thd, /*!< in: session */ + row_import& cfg) /*!< out: contents of the .cfg file */ +{ + byte row[sizeof(ib_uint32_t)]; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_9", + (void) fseek(file, 0L, SEEK_END);); + + if (fread(&row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading meta-data version."); + + return(DB_IO_ERROR); + } + + cfg.m_version = mach_read_from_4(row); + + /* Check the version number. */ + switch (cfg.m_version) { + case IB_EXPORT_CFG_VERSION_V1: + + return(row_import_read_v1(file, thd, &cfg)); + default: + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + "Unsupported meta-data version number (%lu), " + "file ignored", (ulong) cfg.m_version); + } + + return(DB_ERROR); +} + +/** +Read the contents of the <tablename>.cfg file. +@return DB_SUCCESS or error code. 
*/ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_read_cfg( +/*================*/ + dict_table_t* table, /*!< in: table */ + THD* thd, /*!< in: session */ + row_import& cfg) /*!< out: contents of the .cfg file */ +{ + dberr_t err; + char name[OS_FILE_MAX_PATH]; + + cfg.m_table = table; + + srv_get_meta_data_filename(table, name, sizeof(name)); + + FILE* file = fopen(name, "rb"); + + if (file == NULL) { + char msg[BUFSIZ]; + + ut_snprintf(msg, sizeof(msg), + "Error opening '%s', will attempt to import " + "without schema verification", name); + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_READ_ERROR, + errno, strerror(errno), msg); + + cfg.m_missing = true; + + err = DB_FAIL; + } else { + + cfg.m_missing = false; + + err = row_import_read_meta_data(table, file, thd, cfg); + fclose(file); + } + + return(err); +} + +/*****************************************************************//** +Update the <space, root page> of a table's indexes from the values +in the data dictionary. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +row_import_update_index_root( +/*=========================*/ + trx_t* trx, /*!< in/out: transaction that + covers the update */ + const dict_table_t* table, /*!< in: Table for which we want + to set the root page_no */ + bool reset, /*!< in: if true then set to + FIL_NUL */ + bool dict_locked) /*!< in: Set to true if the + caller already owns the + dict_sys_t:: mutex. 
*/ + +{ + const dict_index_t* index; + que_t* graph = 0; + dberr_t err = DB_SUCCESS; + + static const char sql[] = { + "PROCEDURE UPDATE_INDEX_ROOT() IS\n" + "BEGIN\n" + "UPDATE SYS_INDEXES\n" + "SET SPACE = :space,\n" + " PAGE_NO = :page,\n" + " TYPE = :type\n" + "WHERE TABLE_ID = :table_id AND ID = :index_id;\n" + "END;\n"}; + + if (!dict_locked) { + mutex_enter(&dict_sys->mutex); + } + + for (index = dict_table_get_first_index(table); + index != 0; + index = dict_table_get_next_index(index)) { + + pars_info_t* info; + ib_uint32_t page; + ib_uint32_t space; + ib_uint32_t type; + index_id_t index_id; + table_id_t table_id; + + info = (graph != 0) ? graph->info : pars_info_create(); + + mach_write_to_4( + reinterpret_cast<byte*>(&type), + index->type); + + mach_write_to_4( + reinterpret_cast<byte*>(&page), + reset ? FIL_NULL : index->page); + + mach_write_to_4( + reinterpret_cast<byte*>(&space), + reset ? FIL_NULL : index->space); + + mach_write_to_8( + reinterpret_cast<byte*>(&index_id), + index->id); + + mach_write_to_8( + reinterpret_cast<byte*>(&table_id), + table->id); + + /* If we set the corrupt bit during the IMPORT phase then + we need to update the system tables. 
*/ + pars_info_bind_int4_literal(info, "type", &type); + pars_info_bind_int4_literal(info, "space", &space); + pars_info_bind_int4_literal(info, "page", &page); + pars_info_bind_ull_literal(info, "index_id", &index_id); + pars_info_bind_ull_literal(info, "table_id", &table_id); + + if (graph == 0) { + graph = pars_sql(info, sql); + ut_a(graph); + graph->trx = trx; + } + + que_thr_t* thr; + + graph->fork_type = QUE_FORK_MYSQL_INTERFACE; + + ut_a(thr = que_fork_start_command(graph)); + + que_run_threads(thr); + + DBUG_EXECUTE_IF("ib_import_internal_error", + trx->error_state = DB_ERROR;); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + char index_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + index_name, sizeof(index_name), + index->name, TRUE); + + ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_INTERNAL_ERROR, + "While updating the <space, root page " + "number> of index %s - %s", + index_name, ut_strerr(err)); + + break; + } + } + + que_graph_free(graph); + + if (!dict_locked) { + mutex_exit(&dict_sys->mutex); + } + + return(err); +} + +/** Callback arg for row_import_set_discarded. */ +struct discard_t { + ib_uint32_t flags2; /*!< Value read from column */ + bool state; /*!< New state of the flag */ + ulint n_recs; /*!< Number of recs processed */ +}; + +/******************************************************************//** +Fetch callback that sets or unsets the DISCARDED tablespace flag in +SYS_TABLES. The flags is stored in MIX_LEN column. 
+@return FALSE if all OK */ +static +ibool +row_import_set_discarded( +/*=====================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: bool set/unset flag */ +{ + sel_node_t* node = static_cast<sel_node_t*>(row); + discard_t* discard = static_cast<discard_t*>(user_arg); + dfield_t* dfield = que_node_get_val(node->select_list); + dtype_t* type = dfield_get_type(dfield); + ulint len = dfield_get_len(dfield); + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == sizeof(ib_uint32_t)); + + ulint flags2 = mach_read_from_4( + static_cast<byte*>(dfield_get_data(dfield))); + + if (discard->state) { + flags2 |= DICT_TF2_DISCARDED; + } else { + flags2 &= ~DICT_TF2_DISCARDED; + } + + mach_write_to_4(reinterpret_cast<byte*>(&discard->flags2), flags2); + + ++discard->n_recs; + + /* There should be at most one matching record. */ + ut_a(discard->n_recs == 1); + + return(FALSE); +} + +/*****************************************************************//** +Update the DICT_TF2_DISCARDED flag in SYS_TABLES. +@return DB_SUCCESS or error code. */ +UNIV_INTERN +dberr_t +row_import_update_discarded_flag( +/*=============================*/ + trx_t* trx, /*!< in/out: transaction that + covers the update */ + table_id_t table_id, /*!< in: Table for which we want + to set the root table->flags2 */ + bool discarded, /*!< in: set MIX_LEN column bit + to discarded, if true */ + bool dict_locked) /*!< in: set to true if the + caller already owns the + dict_sys_t:: mutex. 
*/ + +{ + pars_info_t* info; + discard_t discard; + + static const char sql[] = + "PROCEDURE UPDATE_DISCARDED_FLAG() IS\n" + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS\n" + " SELECT MIX_LEN " + " FROM SYS_TABLES " + " WHERE ID = :table_id FOR UPDATE;" + "\n" + "BEGIN\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "UPDATE SYS_TABLES" + " SET MIX_LEN = :flags2" + " WHERE ID = :table_id;\n" + "CLOSE c;\n" + "END;\n"; + + discard.n_recs = 0; + discard.state = discarded; + discard.flags2 = ULINT32_UNDEFINED; + + info = pars_info_create(); + + pars_info_add_ull_literal(info, "table_id", table_id); + pars_info_bind_int4_literal(info, "flags2", &discard.flags2); + + pars_info_bind_function( + info, "my_func", row_import_set_discarded, &discard); + + dberr_t err = que_eval_sql(info, sql, !dict_locked, trx); + + ut_a(discard.n_recs == 1); + ut_a(discard.flags2 != ULINT32_UNDEFINED); + + return(err); +} + +/*****************************************************************//** +Imports a tablespace. The space id in the .ibd file must match the space id +of the table in the data dictionary. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_import_for_mysql( +/*=================*/ + dict_table_t* table, /*!< in/out: table */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL */ +{ + dberr_t err; + trx_t* trx; + ib_uint64_t autoinc = 0; + char table_name[MAX_FULL_NAME_LEN + 1]; + char* filepath = NULL; + + ut_ad(!srv_read_only_mode); + + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); + + ut_a(table->space); + ut_ad(prebuilt->trx); + ut_a(table->ibd_file_missing); + + trx_start_if_not_started(prebuilt->trx); + + trx = trx_allocate_for_mysql(); + + /* So that the table is not DROPped during recovery. 
*/ + trx_set_dict_operation(trx, TRX_DICT_OP_INDEX); + + trx_start_if_not_started(trx); + + /* So that we can send error messages to the user. */ + trx->mysql_thd = prebuilt->trx->mysql_thd; + + /* Ensure that the table will be dropped by trx_rollback_active() + in case of a crash. */ + + trx->table_id = table->id; + + /* Assign an undo segment for the transaction, so that the + transaction will be recovered after a crash. */ + + mutex_enter(&trx->undo_mutex); + + err = trx_undo_assign_undo(trx, TRX_UNDO_UPDATE); + + mutex_exit(&trx->undo_mutex); + + DBUG_EXECUTE_IF("ib_import_undo_assign_failure", + err = DB_TOO_MANY_CONCURRENT_TRXS;); + + if (err != DB_SUCCESS) { + + return(row_import_cleanup(prebuilt, trx, err)); + + } else if (trx->update_undo == 0) { + + err = DB_TOO_MANY_CONCURRENT_TRXS; + return(row_import_cleanup(prebuilt, trx, err)); + } + + prebuilt->trx->op_info = "read meta-data file"; + + /* Prevent DDL operations while we are checking. */ + + rw_lock_s_lock_func(&dict_operation_lock, 0, __FILE__, __LINE__); + + row_import cfg; + + memset(&cfg, 0x0, sizeof(cfg)); + + err = row_import_read_cfg(table, trx->mysql_thd, cfg); + + /* Check if the table column definitions match the contents + of the config file. */ + + if (err == DB_SUCCESS) { + + /* We have a schema file, try and match it with the our + data dictionary. */ + + err = cfg.match_schema(trx->mysql_thd); + + /* Update index->page and SYS_INDEXES.PAGE_NO to match the + B-tree root page numbers in the tablespace. Use the index + name from the .cfg file to find match. 
*/ + + if (err == DB_SUCCESS) { + cfg.set_root_by_name(); + autoinc = cfg.m_autoinc; + } + + rw_lock_s_unlock_gen(&dict_operation_lock, 0); + + DBUG_EXECUTE_IF("ib_import_set_index_root_failure", + err = DB_TOO_MANY_CONCURRENT_TRXS;); + + } else if (cfg.m_missing) { + + rw_lock_s_unlock_gen(&dict_operation_lock, 0); + + /* We don't have a schema file, we will have to discover + the index root pages from the .ibd file and skip the schema + matching step. */ + + ut_a(err == DB_FAIL); + + cfg.m_page_size = UNIV_PAGE_SIZE; + + FetchIndexRootPages fetchIndexRootPages(table, trx); + + err = fil_tablespace_iterate( + table, IO_BUFFER_SIZE(cfg.m_page_size), + fetchIndexRootPages); + + if (err == DB_SUCCESS) { + + err = fetchIndexRootPages.build_row_import(&cfg); + + /* Update index->page and SYS_INDEXES.PAGE_NO + to match the B-tree root page numbers in the + tablespace. */ + + if (err == DB_SUCCESS) { + err = cfg.set_root_by_heuristic(); + } + } + + } else { + rw_lock_s_unlock_gen(&dict_operation_lock, 0); + } + + if (err != DB_SUCCESS) { + return(row_import_error(prebuilt, trx, err)); + } + + prebuilt->trx->op_info = "importing tablespace"; + + ib_logf(IB_LOG_LEVEL_INFO, "Phase I - Update all pages"); + + /* Iterate over all the pages and do the sanity checking and + the conversion required to import the tablespace. */ + + PageConverter converter(&cfg, trx); + + /* Set the IO buffer size in pages. 
*/ + + err = fil_tablespace_iterate( + table, IO_BUFFER_SIZE(cfg.m_page_size), converter); + + DBUG_EXECUTE_IF("ib_import_reset_space_and_lsn_failure", + err = DB_TOO_MANY_CONCURRENT_TRXS;); + + if (err != DB_SUCCESS) { + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); + + ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_INTERNAL_ERROR, + "Cannot reset LSNs in table '%s' : %s", + table_name, ut_strerr(err)); + + return(row_import_cleanup(prebuilt, trx, err)); + } + + row_mysql_lock_data_dictionary(trx); + + /* If the table is stored in a remote tablespace, we need to + determine that filepath from the link file and system tables. + Find the space ID in SYS_TABLES since this is an ALTER TABLE. */ + if (DICT_TF_HAS_DATA_DIR(table->flags)) { + dict_get_and_save_data_dir_path(table, true); + ut_a(table->data_dir_path); + + filepath = os_file_make_remote_pathname( + table->data_dir_path, table->name, "ibd"); + } else { + filepath = fil_make_ibd_name(table->name, false); + } + ut_a(filepath); + + /* Open the tablespace so that we can access via the buffer pool. + We set the 2nd param (fix_dict = true) here because we already + have an x-lock on dict_operation_lock and dict_sys->mutex. 
*/ + + err = fil_open_single_table_tablespace( + true, true, table->space, + dict_tf_to_fsp_flags(table->flags), + table->name, filepath); + + DBUG_EXECUTE_IF("ib_import_open_tablespace_failure", + err = DB_TABLESPACE_NOT_FOUND;); + + if (err != DB_SUCCESS) { + row_mysql_unlock_data_dictionary(trx); + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_FILE_NOT_FOUND, + filepath, err, ut_strerr(err)); + + mem_free(filepath); + + return(row_import_cleanup(prebuilt, trx, err)); + } + + row_mysql_unlock_data_dictionary(trx); + + mem_free(filepath); + + err = ibuf_check_bitmap_on_import(trx, table->space); + + DBUG_EXECUTE_IF("ib_import_check_bitmap_failure", err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + return(row_import_cleanup(prebuilt, trx, err)); + } + + /* The first index must always be the clustered index. */ + + dict_index_t* index = dict_table_get_first_index(table); + + if (!dict_index_is_clust(index)) { + return(row_import_error(prebuilt, trx, DB_CORRUPTION)); + } + + /* Update the Btree segment headers for index node and + leaf nodes in the root page. Set the new space id. */ + + err = btr_root_adjust_on_import(index); + + DBUG_EXECUTE_IF("ib_import_cluster_root_adjust_failure", + err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + return(row_import_error(prebuilt, trx, err)); + } + + if (err != DB_SUCCESS) { + return(row_import_error(prebuilt, trx, err)); + } else if (cfg.requires_purge(index->name)) { + + /* Purge any delete-marked records that couldn't be + purged during the page conversion phase from the + cluster index. */ + + IndexPurge purge(trx, index); + + trx->op_info = "cluster: purging delete marked records"; + + err = purge.garbage_collect(); + + trx->op_info = ""; + } + + DBUG_EXECUTE_IF("ib_import_cluster_failure", err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + return(row_import_error(prebuilt, trx, err)); + } + + /* For secondary indexes, purge any records that couldn't be purged + during the page conversion phase. 
*/ + + err = row_import_adjust_root_pages_of_secondary_indexes( + prebuilt, trx, table, cfg); + + DBUG_EXECUTE_IF("ib_import_sec_root_adjust_failure", + err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + return(row_import_error(prebuilt, trx, err)); + } + + /* Ensure that the next available DB_ROW_ID is not smaller than + any DB_ROW_ID stored in the table. */ + + if (prebuilt->clust_index_was_generated) { + + err = row_import_set_sys_max_row_id(prebuilt, table); + + if (err != DB_SUCCESS) { + return(row_import_error(prebuilt, trx, err)); + } + } + + ib_logf(IB_LOG_LEVEL_INFO, "Phase III - Flush changes to disk"); + + /* Ensure that all pages dirtied during the IMPORT make it to disk. + The only dirty pages generated should be from the pessimistic purge + of delete marked records that couldn't be purged in Phase I. */ + + buf_LRU_flush_or_remove_pages( + prebuilt->table->space, BUF_REMOVE_FLUSH_WRITE, trx); + + if (trx_is_interrupted(trx)) { + ib_logf(IB_LOG_LEVEL_INFO, "Phase III - Flush interrupted"); + return(row_import_error(prebuilt, trx, DB_INTERRUPTED)); + } else { + ib_logf(IB_LOG_LEVEL_INFO, "Phase IV - Flush complete"); + } + + /* The dictionary latches will be released in in row_import_cleanup() + after the transaction commit, for both success and error. */ + + row_mysql_lock_data_dictionary(trx); + + /* Update the root pages of the table's indexes. */ + err = row_import_update_index_root(trx, table, false, true); + + if (err != DB_SUCCESS) { + return(row_import_error(prebuilt, trx, err)); + } + + /* Update the table's discarded flag, unset it. 
*/ + err = row_import_update_discarded_flag(trx, table->id, false, true); + + if (err != DB_SUCCESS) { + return(row_import_error(prebuilt, trx, err)); + } + + table->ibd_file_missing = false; + table->flags2 &= ~DICT_TF2_DISCARDED; + + if (autoinc != 0) { + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); + + ib_logf(IB_LOG_LEVEL_INFO, "%s autoinc value set to " IB_ID_FMT, + table_name, autoinc); + + dict_table_autoinc_lock(table); + dict_table_autoinc_initialize(table, autoinc); + dict_table_autoinc_unlock(table); + } + + ut_a(err == DB_SUCCESS); + + return(row_import_cleanup(prebuilt, trx, err)); +} + diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc index e79518e24de..c1c27152831 100644 --- a/storage/innobase/row/row0ins.cc +++ b/storage/innobase/row/row0ins.cc @@ -23,11 +23,8 @@ Insert into a table Created 4/20/1996 Heikki Tuuri *******************************************************/ -#include "m_string.h" /* for my_sys.h */ #include "row0ins.h" -#define DEBUG_SYNC_C_IF_THD(A,B) DEBUG_SYNC(A,B) - #ifdef UNIV_NONINL #include "row0ins.ic" #endif @@ -35,6 +32,7 @@ Created 4/20/1996 Heikki Tuuri #include "ha_prototypes.h" #include "dict0dict.h" #include "dict0boot.h" +#include "trx0rec.h" #include "trx0undo.h" #include "btr0btr.h" #include "btr0cur.h" @@ -43,6 +41,7 @@ Created 4/20/1996 Heikki Tuuri #include "row0upd.h" #include "row0sel.h" #include "row0row.h" +#include "row0log.h" #include "rem0cmp.h" #include "lock0lock.h" #include "log0log.h" @@ -52,6 +51,7 @@ Created 4/20/1996 Heikki Tuuri #include "buf0lru.h" #include "fts0fts.h" #include "fts0types.h" +#include "m_string.h" /************************************************************************* IMPORTANT NOTE: Any operation that generates redo MUST check that there @@ -101,7 +101,7 @@ ins_node_create( /***********************************************************//** Creates an entry template for each index 
of a table. */ -UNIV_INTERN +static void ins_node_create_entry_list( /*=======================*/ @@ -222,68 +222,92 @@ Does an insert operation by updating a delete-marked existing record in the index. This situation can occur if the delete-marked record is kept in the index for consistent reads. @return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_ins_sec_index_entry_by_modify( /*==============================*/ + ulint flags, /*!< in: undo logging and locking flags */ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, depending on whether mtr holds just a leaf latch or also a tree latch */ btr_cur_t* cursor, /*!< in: B-tree cursor */ + ulint** offsets,/*!< in/out: offsets on cursor->page_cur.rec */ + mem_heap_t* offsets_heap, + /*!< in/out: memory heap that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ const dtuple_t* entry, /*!< in: index entry to insert */ que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr) /*!< in: mtr; must be committed before latching any further pages */ { big_rec_t* dummy_big_rec; - mem_heap_t* heap; upd_t* update; rec_t* rec; - ulint err; + dberr_t err; rec = btr_cur_get_rec(cursor); ut_ad(!dict_index_is_clust(cursor->index)); - ut_ad(rec_get_deleted_flag(rec, - dict_table_is_comp(cursor->index->table))); + ut_ad(rec_offs_validate(rec, cursor->index, *offsets)); + ut_ad(!entry->info_bits); /* We know that in the alphabetical ordering, entry and rec are identified. But in their binary form there may be differences if there are char fields in them. Therefore we have to calculate the difference. */ - heap = mem_heap_create(1024); - update = row_upd_build_sec_rec_difference_binary( - cursor->index, entry, rec, thr_get_trx(thr), heap); + rec, cursor->index, *offsets, entry, heap); + + if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) { + /* We should never insert in place of a record that + has not been delete-marked. 
The only exception is when + online CREATE INDEX copied the changes that we already + made to the clustered index, and completed the + secondary index creation before we got here. In this + case, the change would already be there. The CREATE + INDEX should be waiting for a MySQL meta-data lock + upgrade at least until this INSERT or UPDATE + returns. After that point, the TEMP_INDEX_PREFIX + would be dropped from the index name in + commit_inplace_alter_table(). */ + ut_a(update->n_fields == 0); + ut_a(*cursor->index->name == TEMP_INDEX_PREFIX); + ut_ad(!dict_index_is_online_ddl(cursor->index)); + return(DB_SUCCESS); + } + if (mode == BTR_MODIFY_LEAF) { /* Try an optimistic updating of the record, keeping changes within the page */ - err = btr_cur_optimistic_update(BTR_KEEP_SYS_FLAG, cursor, - update, 0, thr, mtr); + /* TODO: pass only *offsets */ + err = btr_cur_optimistic_update( + flags | BTR_KEEP_SYS_FLAG, cursor, + offsets, &offsets_heap, update, 0, thr, + thr_get_trx(thr)->id, mtr); switch (err) { case DB_OVERFLOW: case DB_UNDERFLOW: case DB_ZIP_OVERFLOW: err = DB_FAIL; + default: + break; } } else { ut_a(mode == BTR_MODIFY_TREE); if (buf_LRU_buf_pool_running_out()) { - err = DB_LOCK_TABLE_FULL; - - goto func_exit; + return(DB_LOCK_TABLE_FULL); } - err = btr_cur_pessimistic_update(BTR_KEEP_SYS_FLAG, cursor, - &heap, &dummy_big_rec, update, - 0, thr, mtr); + err = btr_cur_pessimistic_update( + flags | BTR_KEEP_SYS_FLAG, cursor, + offsets, &offsets_heap, + heap, &dummy_big_rec, update, 0, + thr, thr_get_trx(thr)->id, mtr); ut_ad(!dummy_big_rec); } -func_exit: - mem_heap_free(heap); return(err); } @@ -293,15 +317,20 @@ Does an insert operation by delete unmarking and updating a delete marked existing record in the index. This situation can occur if the delete marked record is kept in the index for consistent reads. 
@return DB_SUCCESS, DB_FAIL, or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_ins_clust_index_entry_by_modify( /*================================*/ + ulint flags, /*!< in: undo logging and locking flags */ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, depending on whether mtr holds just a leaf latch or also a tree latch */ btr_cur_t* cursor, /*!< in: B-tree cursor */ - mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** offsets_heap, + /*!< in/out: pointer to memory heap that can + be emptied, or NULL */ + mem_heap_t* heap, /*!< in/out: memory heap */ big_rec_t** big_rec,/*!< out: possible big rec vector of fields which have to be stored externally by the caller */ @@ -310,9 +339,9 @@ row_ins_clust_index_entry_by_modify( mtr_t* mtr) /*!< in: mtr; must be committed before latching any further pages */ { - rec_t* rec; - upd_t* update; - ulint err; + const rec_t* rec; + const upd_t* update; + dberr_t err; ut_ad(dict_index_is_clust(cursor->index)); @@ -323,38 +352,40 @@ row_ins_clust_index_entry_by_modify( ut_ad(rec_get_deleted_flag(rec, dict_table_is_comp(cursor->index->table))); - if (!*heap) { - *heap = mem_heap_create(1024); - } - /* Build an update vector containing all the fields to be modified; NOTE that this vector may NOT contain system columns trx_id or roll_ptr */ - update = row_upd_build_difference_binary(cursor->index, entry, rec, - thr_get_trx(thr), *heap); - if (mode == BTR_MODIFY_LEAF) { + update = row_upd_build_difference_binary( + cursor->index, entry, rec, NULL, true, + thr_get_trx(thr), heap); + if (mode != BTR_MODIFY_TREE) { + ut_ad((mode & ~BTR_ALREADY_S_LATCHED) == BTR_MODIFY_LEAF); + /* Try optimistic updating of the record, keeping changes within the page */ - err = btr_cur_optimistic_update(0, cursor, update, 0, thr, - mtr); + err = btr_cur_optimistic_update( + flags, cursor, offsets, 
offsets_heap, update, 0, thr, + thr_get_trx(thr)->id, mtr); switch (err) { case DB_OVERFLOW: case DB_UNDERFLOW: case DB_ZIP_OVERFLOW: err = DB_FAIL; + default: + break; } } else { - ut_a(mode == BTR_MODIFY_TREE); if (buf_LRU_buf_pool_running_out()) { return(DB_LOCK_TABLE_FULL); } err = btr_cur_pessimistic_update( - BTR_KEEP_POS_FLAG, cursor, heap, big_rec, update, - 0, thr, mtr); + flags | BTR_KEEP_POS_FLAG, + cursor, offsets, offsets_heap, heap, + big_rec, update, 0, thr, thr_get_trx(thr)->id, mtr); } return(err); @@ -394,7 +425,7 @@ row_ins_cascade_ancestor_updates_table( Returns the number of ancestor UPDATE or DELETE nodes of a cascaded update/delete node. @return number of ancestors */ -static +static __attribute__((nonnull, warn_unused_result)) ulint row_ins_cascade_n_ancestors( /*========================*/ @@ -420,7 +451,7 @@ a cascaded update. can also be 0 if no foreign key fields changed; the returned value is ULINT_UNDEFINED if the column type in the child table is too short to fit the new value in the parent table: that means the update fails */ -static +static __attribute__((nonnull, warn_unused_result)) ulint row_ins_cascade_calc_update_vec( /*============================*/ @@ -691,6 +722,8 @@ row_ins_set_detailed( trx_t* trx, /*!< in: transaction */ dict_foreign_t* foreign) /*!< in: foreign key constraint */ { + ut_ad(!srv_read_only_mode); + mutex_enter(&srv_misc_tmpfile_mutex); rewind(srv_misc_tmpfile); @@ -717,13 +750,17 @@ row_ins_foreign_trx_print( /*======================*/ trx_t* trx) /*!< in: transaction */ { - ulint n_lock_rec; - ulint n_lock_struct; + ulint n_rec_locks; + ulint n_trx_locks; ulint heap_size; + if (srv_read_only_mode) { + return; + } + lock_mutex_enter(); - n_lock_rec = lock_number_of_rows_locked(&trx->lock); - n_lock_struct = UT_LIST_GET_LEN(trx->lock.trx_locks); + n_rec_locks = lock_number_of_rows_locked(&trx->lock); + n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks); heap_size = mem_heap_get_size(trx->lock.lock_heap); 
lock_mutex_exit(); @@ -735,7 +772,7 @@ row_ins_foreign_trx_print( fputs(" Transaction:\n", dict_foreign_err_file); trx_print_low(dict_foreign_err_file, trx, 600, - n_lock_rec, n_lock_struct, heap_size); + n_rec_locks, n_trx_locks, heap_size); mutex_exit(&trx_sys->mutex); @@ -759,6 +796,10 @@ row_ins_foreign_report_err( const dtuple_t* entry) /*!< in: index entry in the parent table */ { + if (srv_read_only_mode) { + return; + } + FILE* ef = dict_foreign_err_file; trx_t* trx = thr_get_trx(thr); @@ -810,6 +851,10 @@ row_ins_foreign_report_add_err( const dtuple_t* entry) /*!< in: index entry to insert in the child table */ { + if (srv_read_only_mode) { + return; + } + FILE* ef = dict_foreign_err_file; row_ins_set_detailed(trx, foreign); @@ -879,8 +924,8 @@ Perform referential actions or checks when a parent row is deleted or updated and the constraint had an ON DELETE or ON UPDATE condition which was not RESTRICT. @return DB_SUCCESS, DB_LOCK_WAIT, or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_ins_foreign_check_on_constraint( /*================================*/ que_thr_t* thr, /*!< in: query thread whose run_node @@ -906,7 +951,7 @@ row_ins_foreign_check_on_constraint( const buf_block_t* clust_block; upd_t* update; ulint n_to_update; - ulint err; + dberr_t err; ulint i; trx_t* trx; mem_heap_t* tmp_heap = NULL; @@ -1242,6 +1287,9 @@ row_ins_foreign_check_on_constraint( release the latch. */ row_mysql_unfreeze_data_dictionary(thr_get_trx(thr)); + + DEBUG_SYNC_C("innodb_dml_cascade_dict_unfreeze"); + row_mysql_freeze_data_dictionary(thr_get_trx(thr)); mtr_start(mtr); @@ -1284,7 +1332,7 @@ Sets a shared lock on a record. Used in locking possible duplicate key records and also in checking foreign key constraints. 
@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */ static -enum db_err +dberr_t row_ins_set_shared_rec_lock( /*========================*/ ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or @@ -1295,7 +1343,7 @@ row_ins_set_shared_rec_lock( const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ que_thr_t* thr) /*!< in: query thread */ { - enum db_err err; + dberr_t err; ut_ad(rec_offs_validate(rec, index, offsets)); @@ -1315,7 +1363,7 @@ Sets a exclusive lock on a record. Used in locking possible duplicate key records @return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */ static -enum db_err +dberr_t row_ins_set_exclusive_rec_lock( /*===========================*/ ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or @@ -1326,7 +1374,7 @@ row_ins_set_exclusive_rec_lock( const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ que_thr_t* thr) /*!< in: query thread */ { - enum db_err err; + dberr_t err; ut_ad(rec_offs_validate(rec, index, offsets)); @@ -1347,7 +1395,7 @@ which lock either the success or the failure of the constraint. NOTE that the caller must have a shared latch on dict_operation_lock. 
@return DB_SUCCESS, DB_NO_REFERENCED_ROW, or DB_ROW_IS_REFERENCED */ UNIV_INTERN -ulint +dberr_t row_ins_check_foreign_constraint( /*=============================*/ ibool check_ref,/*!< in: TRUE if we want to check that @@ -1361,7 +1409,7 @@ row_ins_check_foreign_constraint( dtuple_t* entry, /*!< in: index entry for index */ que_thr_t* thr) /*!< in: query thread */ { - ulint err; + dberr_t err; upd_node_t* upd_node; dict_table_t* check_table; dict_index_t* check_index; @@ -1433,9 +1481,11 @@ run_again: check_index = foreign->foreign_index; } - if (check_table == NULL || check_table->ibd_file_missing + if (check_table == NULL + || check_table->ibd_file_missing || check_index == NULL) { - if (check_ref) { + + if (!srv_read_only_mode && check_ref) { FILE* ef = dict_foreign_err_file; row_ins_set_detailed(trx, foreign); @@ -1611,6 +1661,8 @@ run_again: } else { err = DB_SUCCESS; } + default: + break; } goto end_scan; @@ -1635,18 +1687,43 @@ end_scan: do_possible_lock_wait: if (err == DB_LOCK_WAIT) { - trx->error_state = static_cast<enum db_err>(err); + bool verified = false; + + trx->error_state = err; que_thr_stop_for_mysql(thr); lock_wait_suspend_thread(thr); - if (trx->error_state == DB_SUCCESS) { + if (check_table->to_be_dropped) { + /* The table is being dropped. 
We shall timeout + this operation */ + err = DB_LOCK_WAIT_TIMEOUT; + goto exit_func; + } - goto run_again; + /* We had temporarily released dict_operation_lock in + above lock sleep wait, now we have the lock again, and + we will need to re-check whether the foreign key has been + dropped */ + for (const dict_foreign_t* check_foreign = UT_LIST_GET_FIRST( + table->referenced_list); + check_foreign; + check_foreign = UT_LIST_GET_NEXT( + referenced_list, check_foreign)) { + if (check_foreign == foreign) { + verified = true; + break; + } } - err = trx->error_state; + if (!verified) { + err = DB_DICT_CHANGED; + } else if (trx->error_state == DB_SUCCESS) { + goto run_again; + } else { + err = trx->error_state; + } } exit_func: @@ -1663,8 +1740,8 @@ Otherwise does searches to the indexes of referenced tables and sets shared locks which lock either the success or the failure of a constraint. @return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_ins_check_foreign_constraints( /*==============================*/ dict_table_t* table, /*!< in: table */ @@ -1673,7 +1750,7 @@ row_ins_check_foreign_constraints( que_thr_t* thr) /*!< in: query thread */ { dict_foreign_t* foreign; - ulint err; + dberr_t err; trx_t* trx; ibool got_s_lock = FALSE; @@ -1681,14 +1758,21 @@ row_ins_check_foreign_constraints( foreign = UT_LIST_GET_FIRST(table->foreign_list); + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, + "foreign_constraint_check_for_ins"); + while (foreign) { if (foreign->foreign_index == index) { dict_table_t* ref_table = NULL; + dict_table_t* foreign_table = foreign->foreign_table; + dict_table_t* referenced_table + = foreign->referenced_table; - if (foreign->referenced_table == NULL) { + if (referenced_table == NULL) { ref_table = dict_table_open_on_name( - foreign->referenced_table_name_lookup, FALSE); + foreign->referenced_table_name_lookup, + FALSE, FALSE, DICT_ERR_IGNORE_NONE); } if (0 == 
trx->dict_operation_lock_mode) { @@ -1697,9 +1781,9 @@ row_ins_check_foreign_constraints( row_mysql_freeze_data_dictionary(trx); } - if (foreign->referenced_table) { + if (referenced_table) { os_inc_counter(dict_sys->mutex, - foreign->foreign_table + foreign_table ->n_foreign_key_checks_running); } @@ -1711,9 +1795,12 @@ row_ins_check_foreign_constraints( err = row_ins_check_foreign_constraint( TRUE, foreign, table, entry, thr); - if (foreign->referenced_table) { + DBUG_EXECUTE_IF("row_ins_dict_change_err", + err = DB_DICT_CHANGED;); + + if (referenced_table) { os_dec_counter(dict_sys->mutex, - foreign->foreign_table + foreign_table ->n_foreign_key_checks_running); } @@ -1722,7 +1809,7 @@ row_ins_check_foreign_constraints( } if (ref_table != NULL) { - dict_table_close(ref_table, FALSE); + dict_table_close(ref_table, FALSE, FALSE); } if (err != DB_SUCCESS) { @@ -1778,8 +1865,7 @@ row_ins_dupl_error_with_rec( if (!dict_index_is_clust(index)) { for (i = 0; i < n_unique; i++) { - if (UNIV_SQL_NULL == dfield_get_len( - dtuple_get_nth_field(entry, i))) { + if (dfield_is_null(dtuple_get_nth_field(entry, i))) { return(FALSE); } @@ -1794,26 +1880,30 @@ Scans a unique non-clustered index at a given index entry to determine whether a uniqueness violation has occurred for the key value of the entry. Set shared locks on possible duplicate records. 
@return DB_SUCCESS, DB_DUPLICATE_KEY, or DB_LOCK_WAIT */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_ins_scan_sec_index_for_duplicate( /*=================================*/ + ulint flags, /*!< in: undo logging and locking flags */ dict_index_t* index, /*!< in: non-clustered unique index */ dtuple_t* entry, /*!< in: index entry */ - que_thr_t* thr) /*!< in: query thread */ + que_thr_t* thr, /*!< in: query thread */ + bool s_latch,/*!< in: whether index->lock is being held */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mem_heap_t* offsets_heap) + /*!< in/out: memory heap that can be emptied */ { ulint n_unique; - ulint i; int cmp; ulint n_fields_cmp; btr_pcur_t pcur; - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; ulint allow_duplicates; - mtr_t mtr; - mem_heap_t* heap = NULL; - ulint offsets_[REC_OFFS_NORMAL_SIZE]; - ulint* offsets = offsets_; - rec_offs_init(offsets_); + ulint* offsets = NULL; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(s_latch == rw_lock_own(&index->lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ n_unique = dict_index_get_n_unique(index); @@ -1821,7 +1911,7 @@ row_ins_scan_sec_index_for_duplicate( n_unique first fields is NULL, a unique key violation cannot occur, since we define NULL != NULL in this case */ - for (i = 0; i < n_unique; i++) { + for (ulint i = 0; i < n_unique; i++) { if (UNIV_SQL_NULL == dfield_get_len( dtuple_get_nth_field(entry, i))) { @@ -1829,15 +1919,17 @@ row_ins_scan_sec_index_for_duplicate( } } - mtr_start(&mtr); - /* Store old value on n_fields_cmp */ n_fields_cmp = dtuple_get_n_fields_cmp(entry); - dtuple_set_n_fields_cmp(entry, dict_index_get_n_unique(index)); + dtuple_set_n_fields_cmp(entry, n_unique); - btr_pcur_open(index, entry, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur, &mtr); + btr_pcur_open(index, entry, PAGE_CUR_GE, + s_latch + ? 
BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED + : BTR_SEARCH_LEAF, + &pcur, mtr); allow_duplicates = thr_get_trx(thr)->duplicates; @@ -1853,9 +1945,12 @@ row_ins_scan_sec_index_for_duplicate( } offsets = rec_get_offsets(rec, index, offsets, - ULINT_UNDEFINED, &heap); + ULINT_UNDEFINED, &offsets_heap); - if (allow_duplicates) { + if (flags & BTR_NO_LOCKING_FLAG) { + /* Set no locks when applying log + in online table rebuild. */ + } else if (allow_duplicates) { /* If the SQL-query will update or replace duplicate key we will take X-lock for @@ -1901,37 +1996,115 @@ row_ins_scan_sec_index_for_duplicate( ut_a(cmp < 0); goto end_scan; } - } while (btr_pcur_move_to_next(&pcur, &mtr)); + } while (btr_pcur_move_to_next(&pcur, mtr)); end_scan: - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } - mtr_commit(&mtr); - /* Restore old value */ dtuple_set_n_fields_cmp(entry, n_fields_cmp); return(err); } +/** Checks for a duplicate when the table is being rebuilt online. +@retval DB_SUCCESS when no duplicate is detected +@retval DB_SUCCESS_LOCKED_REC when rec is an exact match of entry or +a newer version of entry (the entry should not be inserted) +@retval DB_DUPLICATE_KEY when entry is a duplicate of rec */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_ins_duplicate_online( +/*=====================*/ + ulint n_uniq, /*!< in: offset of DB_TRX_ID */ + const dtuple_t* entry, /*!< in: entry that is being inserted */ + const rec_t* rec, /*!< in: clustered index record */ + ulint* offsets)/*!< in/out: rec_get_offsets(rec) */ +{ + ulint fields = 0; + ulint bytes = 0; + + /* During rebuild, there should not be any delete-marked rows + in the new table. */ + ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets))); + ut_ad(dtuple_get_n_fields_cmp(entry) == n_uniq); + + /* Compare the PRIMARY KEY fields and the + DB_TRX_ID, DB_ROLL_PTR. 
*/ + cmp_dtuple_rec_with_match_low( + entry, rec, offsets, n_uniq + 2, &fields, &bytes); + + if (fields < n_uniq) { + /* Not a duplicate. */ + return(DB_SUCCESS); + } + + if (fields == n_uniq + 2) { + /* rec is an exact match of entry. */ + ut_ad(bytes == 0); + return(DB_SUCCESS_LOCKED_REC); + } + + return(DB_DUPLICATE_KEY); +} + +/** Checks for a duplicate when the table is being rebuilt online. +@retval DB_SUCCESS when no duplicate is detected +@retval DB_SUCCESS_LOCKED_REC when rec is an exact match of entry or +a newer version of entry (the entry should not be inserted) +@retval DB_DUPLICATE_KEY when entry is a duplicate of rec */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_ins_duplicate_error_in_clust_online( +/*====================================*/ + ulint n_uniq, /*!< in: offset of DB_TRX_ID */ + const dtuple_t* entry, /*!< in: entry that is being inserted */ + const btr_cur_t*cursor, /*!< in: cursor on insert position */ + ulint** offsets,/*!< in/out: rec_get_offsets(rec) */ + mem_heap_t** heap) /*!< in/out: heap for offsets */ +{ + dberr_t err = DB_SUCCESS; + const rec_t* rec = btr_cur_get_rec(cursor); + + if (cursor->low_match >= n_uniq && !page_rec_is_infimum(rec)) { + *offsets = rec_get_offsets(rec, cursor->index, *offsets, + ULINT_UNDEFINED, heap); + err = row_ins_duplicate_online(n_uniq, entry, rec, *offsets); + if (err != DB_SUCCESS) { + return(err); + } + } + + rec = page_rec_get_next_const(btr_cur_get_rec(cursor)); + + if (cursor->up_match >= n_uniq && !page_rec_is_supremum(rec)) { + *offsets = rec_get_offsets(rec, cursor->index, *offsets, + ULINT_UNDEFINED, heap); + err = row_ins_duplicate_online(n_uniq, entry, rec, *offsets); + } + + return(err); +} + /***************************************************************//** Checks if a unique key violation error would occur at an index entry insert. Sets shared locks on possible duplicate records. Works only for a clustered index! 
-@return DB_SUCCESS if no error, DB_DUPLICATE_KEY if error, -DB_LOCK_WAIT if we have to wait for a lock on a possible duplicate -record */ -static -ulint +@retval DB_SUCCESS if no error +@retval DB_DUPLICATE_KEY if error, +@retval DB_LOCK_WAIT if we have to wait for a lock on a possible duplicate +record +@retval DB_SUCCESS_LOCKED_REC if an exact match of the record was found +in online table rebuild (flags & (BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG)) */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_ins_duplicate_error_in_clust( /*=============================*/ + ulint flags, /*!< in: undo logging and locking flags */ btr_cur_t* cursor, /*!< in: B-tree cursor */ const dtuple_t* entry, /*!< in: entry to insert */ que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr) /*!< in: mtr */ { - ulint err; + dberr_t err; rec_t* rec; ulint n_unique; trx_t* trx = thr_get_trx(thr); @@ -1942,8 +2115,7 @@ row_ins_duplicate_error_in_clust( UT_NOT_USED(mtr); - ut_a(dict_index_is_clust(cursor->index)); - ut_ad(dict_index_is_unique(cursor->index)); + ut_ad(dict_index_is_clust(cursor->index)); /* NOTE: For unique non-clustered indexes there may be any number of delete marked records with the same value for the non-clustered @@ -2002,6 +2174,7 @@ row_ins_duplicate_error_in_clust( if (row_ins_dupl_error_with_rec( rec, entry, cursor->index, offsets)) { +duplicate: trx->error_info = cursor->index; err = DB_DUPLICATE_KEY; goto func_exit; @@ -2046,14 +2219,12 @@ row_ins_duplicate_error_in_clust( if (row_ins_dupl_error_with_rec( rec, entry, cursor->index, offsets)) { - trx->error_info = cursor->index; - err = DB_DUPLICATE_KEY; - goto func_exit; + goto duplicate; } } - ut_a(!dict_index_is_clust(cursor->index)); /* This should never happen */ + ut_error; } err = DB_SUCCESS; @@ -2081,12 +2252,12 @@ row_ins_must_modify_rec( /*====================*/ const btr_cur_t* cursor) /*!< in: B-tree cursor */ { - /* NOTE: (compare to the note in row_ins_duplicate_error) Because node - 
pointers on upper levels of the B-tree may match more to entry than - to actual user records on the leaf level, we have to check if the - candidate record is actually a user record. In a clustered index - node pointers contain index->n_unique first fields, and in the case - of a secondary index, all fields of the index. */ + /* NOTE: (compare to the note in row_ins_duplicate_error_in_clust) + Because node pointers on upper levels of the B-tree may match more + to entry than to actual user records on the leaf level, we + have to check if the candidate record is actually a user record. + A clustered index node pointer contains index->n_unique first fields, + and a secondary index node pointer contains all index fields. */ return(cursor->low_match >= dict_index_get_n_unique_in_tree(cursor->index) @@ -2094,56 +2265,359 @@ row_ins_must_modify_rec( } /***************************************************************//** -Tries to insert an index entry to an index. If the index is clustered -and a record with the same unique key is found, the other record is -necessarily marked deleted by a committed transaction, or a unique key -violation error occurs. The delete marked record is then updated to an -existing record, and we must write an undo log record on the delete -marked record. If the index is secondary, and a record with exactly the -same fields is found, the other record is necessarily marked deleted. -It is then unmarked. Otherwise, the entry is just inserted to the index. -@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL if pessimistic retry needed, -or error code */ -static -ulint -row_ins_index_entry_low( -/*====================*/ +Tries to insert an entry into a clustered index, ignoring foreign key +constraints. If a record with the same unique key is found, the other +record is necessarily marked deleted by a committed transaction, or a +unique key violation error occurs. 
The delete marked record is then +updated to an existing record, and we must write an undo log record on +the delete marked record. +@retval DB_SUCCESS on success +@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG) +@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed +@return error code */ +UNIV_INTERN +dberr_t +row_ins_clust_index_entry_low( +/*==========================*/ + ulint flags, /*!< in: undo logging and locking flags */ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, depending on whether we wish optimistic or pessimistic descent down the index tree */ - dict_index_t* index, /*!< in: index */ + dict_index_t* index, /*!< in: clustered index */ + ulint n_uniq, /*!< in: 0 or index->n_uniq */ dtuple_t* entry, /*!< in/out: index entry to insert */ ulint n_ext, /*!< in: number of externally stored columns */ que_thr_t* thr) /*!< in: query thread */ { btr_cur_t cursor; - ulint search_mode; - ibool modify = FALSE; - rec_t* insert_rec; - rec_t* rec; - ulint* offsets; - ulint err; - ulint n_unique; - big_rec_t* big_rec = NULL; + ulint* offsets = NULL; + dberr_t err; + big_rec_t* big_rec = NULL; mtr_t mtr; - mem_heap_t* heap = NULL; + mem_heap_t* offsets_heap = NULL; - log_free_check(); + ut_ad(dict_index_is_clust(index)); + ut_ad(!dict_index_is_unique(index) + || n_uniq == dict_index_get_n_unique(index)); + ut_ad(!n_uniq || n_uniq == dict_index_get_n_unique(index)); mtr_start(&mtr); + if (mode == BTR_MODIFY_LEAF && dict_index_is_online_ddl(index)) { + mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; + mtr_s_lock(dict_index_get_lock(index), &mtr); + } + cursor.thr = thr; /* Note that we use PAGE_CUR_LE as the search mode, because then the function will return in both low_match and up_match of the cursor sensible values */ - if (dict_index_is_clust(index)) { - search_mode = mode; - } else if (!(thr_get_trx(thr)->check_unique_secondary)) { - search_mode = mode | BTR_INSERT | BTR_IGNORE_SEC_UNIQUE; + btr_cur_search_to_nth_level(index, 0, 
entry, PAGE_CUR_LE, mode, + &cursor, 0, __FILE__, __LINE__, &mtr); + +#ifdef UNIV_DEBUG + { + page_t* page = btr_cur_get_page(&cursor); + rec_t* first_rec = page_rec_get_next( + page_get_infimum_rec(page)); + + ut_ad(page_rec_is_supremum(first_rec) + || rec_get_n_fields(first_rec, index) + == dtuple_get_n_fields(entry)); + } +#endif + + if (n_uniq && (cursor.up_match >= n_uniq + || cursor.low_match >= n_uniq)) { + + if (flags + == (BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG)) { + /* Set no locks when applying log + in online table rebuild. Only check for duplicates. */ + err = row_ins_duplicate_error_in_clust_online( + n_uniq, entry, &cursor, + &offsets, &offsets_heap); + + switch (err) { + case DB_SUCCESS: + break; + default: + ut_ad(0); + /* fall through */ + case DB_SUCCESS_LOCKED_REC: + case DB_DUPLICATE_KEY: + thr_get_trx(thr)->error_info = cursor.index; + } + } else { + /* Note that the following may return also + DB_LOCK_WAIT */ + + err = row_ins_duplicate_error_in_clust( + flags, &cursor, entry, thr, &mtr); + } + + if (err != DB_SUCCESS) { +err_exit: + mtr_commit(&mtr); + goto func_exit; + } + } + + if (row_ins_must_modify_rec(&cursor)) { + /* There is already an index entry with a long enough common + prefix, we must convert the insert into a modify of an + existing record */ + mem_heap_t* entry_heap = mem_heap_create(1024); + + err = row_ins_clust_index_entry_by_modify( + flags, mode, &cursor, &offsets, &offsets_heap, + entry_heap, &big_rec, entry, thr, &mtr); + + rec_t* rec = btr_cur_get_rec(&cursor); + + if (big_rec) { + ut_a(err == DB_SUCCESS); + /* Write out the externally stored + columns while still x-latching + index->lock and block->lock. Allocate + pages for big_rec in the mtr that + modified the B-tree, but be sure to skip + any pages that were freed in mtr. We will + write out the big_rec pages before + committing the B-tree mini-transaction. 
If + the system crashes so that crash recovery + will not replay the mtr_commit(&mtr), the + big_rec pages will be left orphaned until + the pages are allocated for something else. + + TODO: If the allocation extends the + tablespace, it will not be redo + logged, in either mini-transaction. + Tablespace extension should be + redo-logged in the big_rec + mini-transaction, so that recovery + will not fail when the big_rec was + written to the extended portion of the + file, in case the file was somehow + truncated in the crash. */ + + DEBUG_SYNC_C_IF_THD( + thr_get_trx(thr)->mysql_thd, + "before_row_ins_upd_extern"); + err = btr_store_big_rec_extern_fields( + index, btr_cur_get_block(&cursor), + rec, offsets, big_rec, &mtr, + BTR_STORE_INSERT_UPDATE); + DEBUG_SYNC_C_IF_THD( + thr_get_trx(thr)->mysql_thd, + "after_row_ins_upd_extern"); + /* If writing big_rec fails (for + example, because of DB_OUT_OF_FILE_SPACE), + the record will be corrupted. Even if + we did not update any externally + stored columns, our update could cause + the record to grow so that a + non-updated column was selected for + external storage. This non-update + would not have been written to the + undo log, and thus the record cannot + be rolled back. + + However, because we have not executed + mtr_commit(mtr) yet, the update will + not be replayed in crash recovery, and + the following assertion failure will + effectively "roll back" the operation. 
*/ + ut_a(err == DB_SUCCESS); + dtuple_big_rec_free(big_rec); + } + + if (err == DB_SUCCESS && dict_index_is_online_ddl(index)) { + row_log_table_insert(rec, index, offsets); + } + + mtr_commit(&mtr); + mem_heap_free(entry_heap); } else { - search_mode = mode | BTR_INSERT; + rec_t* insert_rec; + + if (mode != BTR_MODIFY_TREE) { + ut_ad((mode & ~BTR_ALREADY_S_LATCHED) + == BTR_MODIFY_LEAF); + err = btr_cur_optimistic_insert( + flags, &cursor, &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + n_ext, thr, &mtr); + } else { + if (buf_LRU_buf_pool_running_out()) { + + err = DB_LOCK_TABLE_FULL; + goto err_exit; + } + + err = btr_cur_optimistic_insert( + flags, &cursor, + &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + n_ext, thr, &mtr); + + if (err == DB_FAIL) { + err = btr_cur_pessimistic_insert( + flags, &cursor, + &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + n_ext, thr, &mtr); + } + } + + if (UNIV_LIKELY_NULL(big_rec)) { + mtr_commit(&mtr); + + /* Online table rebuild could read (and + ignore) the incomplete record at this point. + If online rebuild is in progress, the + row_ins_index_entry_big_rec() will write log. */ + + DBUG_EXECUTE_IF( + "row_ins_extern_checkpoint", + log_make_checkpoint_at( + IB_ULONGLONG_MAX, TRUE);); + err = row_ins_index_entry_big_rec( + entry, big_rec, offsets, &offsets_heap, index, + thr_get_trx(thr)->mysql_thd, + __FILE__, __LINE__); + dtuple_convert_back_big_rec(index, entry, big_rec); + } else { + if (err == DB_SUCCESS + && dict_index_is_online_ddl(index)) { + row_log_table_insert( + insert_rec, index, offsets); + } + + mtr_commit(&mtr); + } + } + +func_exit: + if (offsets_heap) { + mem_heap_free(offsets_heap); + } + + return(err); +} + +/***************************************************************//** +Starts a mini-transaction and checks if the index will be dropped. 
+@return true if the index is to be dropped */ +static __attribute__((nonnull, warn_unused_result)) +bool +row_ins_sec_mtr_start_and_check_if_aborted( +/*=======================================*/ + mtr_t* mtr, /*!< out: mini-transaction */ + dict_index_t* index, /*!< in/out: secondary index */ + bool check, /*!< in: whether to check */ + ulint search_mode) + /*!< in: flags */ +{ + ut_ad(!dict_index_is_clust(index)); + + mtr_start(mtr); + + if (!check) { + return(false); + } + + if (search_mode & BTR_ALREADY_S_LATCHED) { + mtr_s_lock(dict_index_get_lock(index), mtr); + } else { + mtr_x_lock(dict_index_get_lock(index), mtr); + } + + switch (index->online_status) { + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + ut_ad(*index->name == TEMP_INDEX_PREFIX); + return(true); + case ONLINE_INDEX_COMPLETE: + return(false); + case ONLINE_INDEX_CREATION: + break; + } + + ut_error; + return(true); +} + +/***************************************************************//** +Tries to insert an entry into a secondary index. If a record with exactly the +same fields is found, the other record is necessarily marked deleted. +It is then unmarked. Otherwise, the entry is just inserted to the index. 
+@retval DB_SUCCESS on success +@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG) +@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed +@return error code */ +UNIV_INTERN +dberr_t +row_ins_sec_index_entry_low( +/*========================*/ + ulint flags, /*!< in: undo logging and locking flags */ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: secondary index */ + mem_heap_t* offsets_heap, + /*!< in/out: memory heap that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + trx_id_t trx_id, /*!< in: PAGE_MAX_TRX_ID during + row_log_table_apply(), or 0 */ + que_thr_t* thr) /*!< in: query thread */ +{ + btr_cur_t cursor; + ulint search_mode = mode | BTR_INSERT; + dberr_t err = DB_SUCCESS; + ulint n_unique; + mtr_t mtr; + ulint* offsets = NULL; + + ut_ad(!dict_index_is_clust(index)); + ut_ad(mode == BTR_MODIFY_LEAF || mode == BTR_MODIFY_TREE); + + cursor.thr = thr; + ut_ad(thr_get_trx(thr)->id); + mtr_start(&mtr); + + /* Ensure that we acquire index->lock when inserting into an + index with index->online_status == ONLINE_INDEX_COMPLETE, but + could still be subject to rollback_inplace_alter_table(). + This prevents a concurrent change of index->online_status. + The memory object cannot be freed as long as we have an open + reference to the table, or index->table->n_ref_count > 0. 
*/ + const bool check = *index->name == TEMP_INDEX_PREFIX; + if (check) { + DEBUG_SYNC_C("row_ins_sec_index_enter"); + if (mode == BTR_MODIFY_LEAF) { + search_mode |= BTR_ALREADY_S_LATCHED; + mtr_s_lock(dict_index_get_lock(index), &mtr); + } else { + mtr_x_lock(dict_index_get_lock(index), &mtr); + } + + if (row_log_online_op_try( + index, entry, thr_get_trx(thr)->id)) { + goto func_exit; + } + } + + /* Note that we use PAGE_CUR_LE as the search mode, because then + the function will return in both low_match and up_match of the + cursor sensible values */ + + if (!thr_get_trx(thr)->check_unique_secondary) { + search_mode |= BTR_IGNORE_SEC_UNIQUE; } btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, @@ -2151,13 +2625,8 @@ row_ins_index_entry_low( &cursor, 0, __FILE__, __LINE__, &mtr); if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) { - /* The insertion was made to the insert buffer already during - the search: we are done */ - - ut_ad(search_mode & BTR_INSERT); - err = DB_SUCCESS; - - goto function_exit; + /* The insert was buffered during the search: we are done */ + goto func_exit; } #ifdef UNIV_DEBUG @@ -2174,213 +2643,250 @@ row_ins_index_entry_low( n_unique = dict_index_get_n_unique(index); - if (dict_index_is_unique(index) && (cursor.up_match >= n_unique - || cursor.low_match >= n_unique)) { + if (dict_index_is_unique(index) + && (cursor.low_match >= n_unique || cursor.up_match >= n_unique)) { + mtr_commit(&mtr); - if (dict_index_is_clust(index)) { - /* Note that the following may return also - DB_LOCK_WAIT */ + DEBUG_SYNC_C("row_ins_sec_index_unique"); - err = row_ins_duplicate_error_in_clust( - &cursor, entry, thr, &mtr); - if (err != DB_SUCCESS) { + if (row_ins_sec_mtr_start_and_check_if_aborted( + &mtr, index, check, search_mode)) { + goto func_exit; + } - goto function_exit; - } - } else { - mtr_commit(&mtr); - err = row_ins_scan_sec_index_for_duplicate( - index, entry, thr); - mtr_start(&mtr); + err = row_ins_scan_sec_index_for_duplicate( + flags, 
index, entry, thr, check, &mtr, offsets_heap); - if (err != DB_SUCCESS) { - goto function_exit; + mtr_commit(&mtr); + + switch (err) { + case DB_SUCCESS: + break; + case DB_DUPLICATE_KEY: + if (*index->name == TEMP_INDEX_PREFIX) { + ut_ad(!thr_get_trx(thr) + ->dict_operation_lock_mode); + mutex_enter(&dict_sys->mutex); + dict_set_corrupted_index_cache_only( + index, index->table); + mutex_exit(&dict_sys->mutex); + /* Do not return any error to the + caller. The duplicate will be reported + by ALTER TABLE or CREATE UNIQUE INDEX. + Unfortunately we cannot report the + duplicate key value to the DDL thread, + because the altered_table object is + private to its call stack. */ + err = DB_SUCCESS; } + /* fall through */ + default: + return(err); + } - /* We did not find a duplicate and we have now - locked with s-locks the necessary records to - prevent any insertion of a duplicate by another - transaction. Let us now reposition the cursor and - continue the insertion. */ - - btr_cur_search_to_nth_level(index, 0, entry, - PAGE_CUR_LE, - mode | BTR_INSERT, - &cursor, 0, - __FILE__, __LINE__, &mtr); + if (row_ins_sec_mtr_start_and_check_if_aborted( + &mtr, index, check, search_mode)) { + goto func_exit; } - } - modify = row_ins_must_modify_rec(&cursor); + /* We did not find a duplicate and we have now + locked with s-locks the necessary records to + prevent any insertion of a duplicate by another + transaction. Let us now reposition the cursor and + continue the insertion. 
*/ + + btr_cur_search_to_nth_level( + index, 0, entry, PAGE_CUR_LE, + search_mode & ~(BTR_INSERT | BTR_IGNORE_SEC_UNIQUE), + &cursor, 0, __FILE__, __LINE__, &mtr); + } - if (modify) { + if (row_ins_must_modify_rec(&cursor)) { /* There is already an index entry with a long enough common prefix, we must convert the insert into a modify of an existing record */ + offsets = rec_get_offsets( + btr_cur_get_rec(&cursor), index, offsets, + ULINT_UNDEFINED, &offsets_heap); - if (dict_index_is_clust(index)) { - err = row_ins_clust_index_entry_by_modify( - mode, &cursor, &heap, &big_rec, entry, - thr, &mtr); - - if (big_rec) { - ut_a(err == DB_SUCCESS); - /* Write out the externally stored - columns while still x-latching - index->lock and block->lock. Allocate - pages for big_rec in the mtr that - modified the B-tree, but be sure to skip - any pages that were freed in mtr. We will - write out the big_rec pages before - committing the B-tree mini-transaction. If - the system crashes so that crash recovery - will not replay the mtr_commit(&mtr), the - big_rec pages will be left orphaned until - the pages are allocated for something else. - - TODO: If the allocation extends the - tablespace, it will not be redo - logged, in either mini-transaction. - Tablespace extension should be - redo-logged in the big_rec - mini-transaction, so that recovery - will not fail when the big_rec was - written to the extended portion of the - file, in case the file was somehow - truncated in the crash. 
*/ - - rec = btr_cur_get_rec(&cursor); - offsets = rec_get_offsets( - rec, index, NULL, - ULINT_UNDEFINED, &heap); - - DEBUG_SYNC_C_IF_THD((THD*) - thr_get_trx(thr)->mysql_thd, - "before_row_ins_upd_extern"); - err = btr_store_big_rec_extern_fields( - index, btr_cur_get_block(&cursor), - rec, offsets, big_rec, &mtr, - BTR_STORE_INSERT_UPDATE); - DEBUG_SYNC_C_IF_THD((THD*) - thr_get_trx(thr)->mysql_thd, - "after_row_ins_upd_extern"); - /* If writing big_rec fails (for - example, because of DB_OUT_OF_FILE_SPACE), - the record will be corrupted. Even if - we did not update any externally - stored columns, our update could cause - the record to grow so that a - non-updated column was selected for - external storage. This non-update - would not have been written to the - undo log, and thus the record cannot - be rolled back. - - However, because we have not executed - mtr_commit(mtr) yet, the update will - not be replayed in crash recovery, and - the following assertion failure will - effectively "roll back" the operation. 
*/ - ut_a(err == DB_SUCCESS); - goto stored_big_rec; - } - } else { - ut_ad(!n_ext); - err = row_ins_sec_index_entry_by_modify( - mode, &cursor, entry, thr, &mtr); - } + err = row_ins_sec_index_entry_by_modify( + flags, mode, &cursor, &offsets, + offsets_heap, heap, entry, thr, &mtr); } else { + rec_t* insert_rec; + big_rec_t* big_rec; + if (mode == BTR_MODIFY_LEAF) { err = btr_cur_optimistic_insert( - 0, &cursor, entry, &insert_rec, &big_rec, - n_ext, thr, &mtr); + flags, &cursor, &offsets, &offsets_heap, + entry, &insert_rec, + &big_rec, 0, thr, &mtr); } else { - ut_a(mode == BTR_MODIFY_TREE); + ut_ad(mode == BTR_MODIFY_TREE); if (buf_LRU_buf_pool_running_out()) { err = DB_LOCK_TABLE_FULL; - - goto function_exit; + goto func_exit; } err = btr_cur_optimistic_insert( - 0, &cursor, entry, &insert_rec, &big_rec, - n_ext, thr, &mtr); - + flags, &cursor, + &offsets, &offsets_heap, + entry, &insert_rec, + &big_rec, 0, thr, &mtr); if (err == DB_FAIL) { err = btr_cur_pessimistic_insert( - 0, &cursor, entry, &insert_rec, - &big_rec, n_ext, thr, &mtr); + flags, &cursor, + &offsets, &offsets_heap, + entry, &insert_rec, + &big_rec, 0, thr, &mtr); } } + + if (err == DB_SUCCESS && trx_id) { + page_update_max_trx_id( + btr_cur_get_block(&cursor), + btr_cur_get_page_zip(&cursor), + trx_id, &mtr); + } + + ut_ad(!big_rec); } -function_exit: +func_exit: mtr_commit(&mtr); + return(err); +} - if (UNIV_LIKELY_NULL(big_rec)) { - DBUG_EXECUTE_IF( - "row_ins_extern_checkpoint", - log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);); - - mtr_start(&mtr); - - DEBUG_SYNC_C_IF_THD((THD*) - thr_get_trx(thr)->mysql_thd, - "before_row_ins_extern_latch"); - btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, - BTR_MODIFY_TREE, &cursor, 0, - __FILE__, __LINE__, &mtr); - rec = btr_cur_get_rec(&cursor); - offsets = rec_get_offsets(rec, index, NULL, - ULINT_UNDEFINED, &heap); - - DEBUG_SYNC_C_IF_THD((THD*) - thr_get_trx(thr)->mysql_thd, - "before_row_ins_extern"); - err = 
btr_store_big_rec_extern_fields( - index, btr_cur_get_block(&cursor), - rec, offsets, big_rec, &mtr, BTR_STORE_INSERT); - DEBUG_SYNC_C_IF_THD((THD*) - thr_get_trx(thr)->mysql_thd, - "after_row_ins_extern"); - -stored_big_rec: - if (modify) { - dtuple_big_rec_free(big_rec); - } else { - dtuple_convert_back_big_rec(index, entry, big_rec); +/***************************************************************//** +Tries to insert the externally stored fields (off-page columns) +of a clustered index entry. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +UNIV_INTERN +dberr_t +row_ins_index_entry_big_rec_func( +/*=============================*/ + const dtuple_t* entry, /*!< in/out: index entry to insert */ + const big_rec_t* big_rec,/*!< in: externally stored fields */ + ulint* offsets,/*!< in/out: rec offsets */ + mem_heap_t** heap, /*!< in/out: memory heap */ + dict_index_t* index, /*!< in: index */ + const char* file, /*!< in: file name of caller */ +#ifndef DBUG_OFF + const void* thd, /*!< in: connection, or NULL */ +#endif /* DBUG_OFF */ + ulint line) /*!< in: line number of caller */ +{ + mtr_t mtr; + btr_cur_t cursor; + rec_t* rec; + dberr_t error; + + ut_ad(dict_index_is_clust(index)); + + DEBUG_SYNC_C_IF_THD(thd, "before_row_ins_extern_latch"); + + mtr_start(&mtr); + btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, + BTR_MODIFY_TREE, &cursor, 0, + file, line, &mtr); + rec = btr_cur_get_rec(&cursor); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, heap); + + DEBUG_SYNC_C_IF_THD(thd, "before_row_ins_extern"); + error = btr_store_big_rec_extern_fields( + index, btr_cur_get_block(&cursor), + rec, offsets, big_rec, &mtr, BTR_STORE_INSERT); + DEBUG_SYNC_C_IF_THD(thd, "after_row_ins_extern"); + + if (error == DB_SUCCESS + && dict_index_is_online_ddl(index)) { + row_log_table_insert(rec, index, offsets); + } + + mtr_commit(&mtr); + + return(error); +} + +/***************************************************************//** +Inserts an entry into a 
clustered index. Tries first optimistic, +then pessimistic descent down the tree. If the entry matches enough +to a delete marked record, performs the insert by updating or delete +unmarking the delete marked record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +UNIV_INTERN +dberr_t +row_ins_clust_index_entry( +/*======================*/ + dict_index_t* index, /*!< in: clustered index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + ulint n_ext) /*!< in: number of externally stored columns */ +{ + dberr_t err; + ulint n_uniq; + + if (UT_LIST_GET_FIRST(index->table->foreign_list)) { + err = row_ins_check_foreign_constraints( + index->table, index, entry, thr); + if (err != DB_SUCCESS) { + + return(err); } + } - mtr_commit(&mtr); + n_uniq = dict_index_is_unique(index) ? index->n_uniq : 0; + + /* Try first optimistic descent to the B-tree */ + + log_free_check(); + + err = row_ins_clust_index_entry_low( + 0, BTR_MODIFY_LEAF, index, n_uniq, entry, n_ext, thr); + +#ifdef UNIV_DEBUG + /* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC(). + Once it is fixed, remove the 'ifdef', 'if' and this comment. */ + if (!thr_get_trx(thr)->ddl) { + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, + "after_row_ins_clust_index_entry_leaf"); } +#endif /* UNIV_DEBUG */ - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); + if (err != DB_FAIL) { + DEBUG_SYNC_C("row_ins_clust_index_entry_leaf_after"); + return(err); } - return(err); + + /* Try then pessimistic descent to the B-tree */ + + log_free_check(); + + return(row_ins_clust_index_entry_low( + 0, BTR_MODIFY_TREE, index, n_uniq, entry, n_ext, thr)); } /***************************************************************//** -Inserts an index entry to index. Tries first optimistic, then pessimistic -descent down the tree. 
If the entry matches enough to a delete marked record, -performs the insert by updating or delete unmarking the delete marked -record. +Inserts an entry into a secondary index. Tries first optimistic, +then pessimistic descent down the tree. If the entry matches enough +to a delete marked record, performs the insert by updating or delete +unmarking the delete marked record. @return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ UNIV_INTERN -ulint -row_ins_index_entry( -/*================*/ - dict_index_t* index, /*!< in: index */ +dberr_t +row_ins_sec_index_entry( +/*====================*/ + dict_index_t* index, /*!< in: secondary index */ dtuple_t* entry, /*!< in/out: index entry to insert */ - ulint n_ext, /*!< in: number of externally stored columns */ - ibool foreign,/*!< in: TRUE=check foreign key constraints - (foreign=FALSE only during CREATE INDEX) */ que_thr_t* thr) /*!< in: query thread */ { - ulint err; + dberr_t err; + mem_heap_t* offsets_heap; + mem_heap_t* heap; - if (foreign && UT_LIST_GET_FIRST(index->table->foreign_list)) { + if (UT_LIST_GET_FIRST(index->table->foreign_list)) { err = row_ins_check_foreign_constraints(index->table, index, entry, thr); if (err != DB_SUCCESS) { @@ -2389,26 +2895,59 @@ row_ins_index_entry( } } + ut_ad(thr_get_trx(thr)->id); + + offsets_heap = mem_heap_create(1024); + heap = mem_heap_create(1024); + /* Try first optimistic descent to the B-tree */ - err = row_ins_index_entry_low(BTR_MODIFY_LEAF, index, entry, - n_ext, thr); - if (err != DB_FAIL) { + log_free_check(); - return(err); - } + err = row_ins_sec_index_entry_low( + 0, BTR_MODIFY_LEAF, index, offsets_heap, heap, entry, 0, thr); + if (err == DB_FAIL) { + mem_heap_empty(heap); - /* Try then pessimistic descent to the B-tree */ + /* Try then pessimistic descent to the B-tree */ + + log_free_check(); - err = row_ins_index_entry_low(BTR_MODIFY_TREE, index, entry, - n_ext, thr); + err = row_ins_sec_index_entry_low( + 0, BTR_MODIFY_TREE, index, 
+ offsets_heap, heap, entry, 0, thr); + } + + mem_heap_free(heap); + mem_heap_free(offsets_heap); return(err); } +/***************************************************************//** +Inserts an index entry to index. Tries first optimistic, then pessimistic +descent down the tree. If the entry matches enough to a delete marked record, +performs the insert by updating or delete unmarking the delete marked +record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +static +dberr_t +row_ins_index_entry( +/*================*/ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr) /*!< in: query thread */ +{ + if (dict_index_is_clust(index)) { + return(row_ins_clust_index_entry(index, entry, thr, 0)); + } else { + return(row_ins_sec_index_entry(index, entry, thr)); + } +} + /***********************************************************//** Sets the values of the dtuple fields in entry from the values of appropriate columns in row. */ -static +static __attribute__((nonnull)) void row_ins_index_entry_set_vals( /*=========================*/ @@ -2419,8 +2958,6 @@ row_ins_index_entry_set_vals( ulint n_fields; ulint i; - ut_ad(entry && row); - n_fields = dtuple_get_n_fields(entry); for (i = 0; i < n_fields; i++) { @@ -2463,14 +3000,14 @@ row_ins_index_entry_set_vals( Inserts a single index entry to the table. 
@return DB_SUCCESS if operation successfully completed, else error code or DB_LOCK_WAIT */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_ins_index_entry_step( /*=====================*/ ins_node_t* node, /*!< in: row insert node */ que_thr_t* thr) /*!< in: query thread */ { - ulint err; + dberr_t err; ut_ad(dtuple_check_typed(node->row)); @@ -2478,7 +3015,16 @@ row_ins_index_entry_step( ut_ad(dtuple_check_typed(node->entry)); - err = row_ins_index_entry(node->index, node->entry, 0, TRUE, thr); + err = row_ins_index_entry(node->index, node->entry, thr); + +#ifdef UNIV_DEBUG + /* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC(). + Once it is fixed, remove the 'ifdef', 'if' and this comment. */ + if (!thr_get_trx(thr)->ddl) { + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, + "after_row_ins_index_entry_step"); + } +#endif /* UNIV_DEBUG */ return(err); } @@ -2577,16 +3123,14 @@ row_ins_get_row_from_select( Inserts a row to a table. @return DB_SUCCESS if operation successfully completed, else error code or DB_LOCK_WAIT */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_ins( /*====*/ ins_node_t* node, /*!< in: row insert node */ que_thr_t* thr) /*!< in: query thread */ { - ulint err; - - ut_ad(node && thr); + dberr_t err; if (node->state == INS_NODE_ALLOC_ROW_ID) { @@ -2622,6 +3166,10 @@ row_ins( node->index = dict_table_get_next_index(node->index); node->entry = UT_LIST_GET_NEXT(tuple_list, node->entry); + DBUG_EXECUTE_IF( + "row_ins_skip_sec", + node->index = NULL; node->entry = NULL; break;); + /* Skip corrupted secondary index and its entry */ while (node->index && dict_index_is_corrupted(node->index)) { @@ -2651,7 +3199,7 @@ row_ins_step( que_node_t* parent; sel_node_t* sel_node; trx_t* trx; - ulint err; + dberr_t err; ut_ad(thr); @@ -2684,6 +3232,8 @@ row_ins_step( if (node->state == INS_NODE_SET_IX_LOCK) { + node->state = INS_NODE_ALLOC_ROW_ID; + /* It may be that the current session 
has not yet started its transaction, or it has been committed: */ @@ -2695,6 +3245,9 @@ row_ins_step( err = lock_table(0, node->table, LOCK_IX, thr); + DBUG_EXECUTE_IF("ib_row_ins_ix_lock_wait", + err = DB_LOCK_WAIT;); + if (err != DB_SUCCESS) { goto error_handling; @@ -2702,8 +3255,6 @@ row_ins_step( node->trx_id = trx->id; same_trx: - node->state = INS_NODE_ALLOC_ROW_ID; - if (node->ins_type == INS_SEARCHED) { /* Reset the cursor */ sel_node->state = SEL_NODE_OPEN; @@ -2732,7 +3283,7 @@ same_trx: err = row_ins(node, thr); error_handling: - trx->error_state = static_cast<enum db_err>(err); + trx->error_state = err; if (err != DB_SUCCESS) { /* err == DB_LOCK_WAIT or SQL error detected */ diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc new file mode 100644 index 00000000000..b373b70ab7a --- /dev/null +++ b/storage/innobase/row/row0log.cc @@ -0,0 +1,3219 @@ +/***************************************************************************** + +Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0log.cc +Modification log for online index creation and online table rebuild + +Created 2011-05-26 Marko Makela +*******************************************************/ + +#include "row0log.h" + +#ifdef UNIV_NONINL +#include "row0log.ic" +#endif + +#include "row0row.h" +#include "row0ins.h" +#include "row0upd.h" +#include "row0merge.h" +#include "row0ext.h" +#include "data0data.h" +#include "que0que.h" +#include "handler0alter.h" + +#include<set> + +/** Table row modification operations during online table rebuild. +Delete-marked records are not copied to the rebuilt table. */ +enum row_tab_op { + /** Insert a record */ + ROW_T_INSERT = 0x41, + /** Update a record in place */ + ROW_T_UPDATE, + /** Delete (purge) a record */ + ROW_T_DELETE +}; + +/** Index record modification operations during online index creation */ +enum row_op { + /** Insert a record */ + ROW_OP_INSERT = 0x61, + /** Delete a record */ + ROW_OP_DELETE +}; + +#ifdef UNIV_DEBUG +/** Write information about the applied record to the error log */ +# define ROW_LOG_APPLY_PRINT +#endif /* UNIV_DEBUG */ + +#ifdef ROW_LOG_APPLY_PRINT +/** When set, write information about the applied record to the error log */ +static bool row_log_apply_print; +#endif /* ROW_LOG_APPLY_PRINT */ + +/** Size of the modification log entry header, in bytes */ +#define ROW_LOG_HEADER_SIZE 2/*op, extra_size*/ + +/** Log block for modifications during online index creation */ +struct row_log_buf_t { + byte* block; /*!< file block buffer */ + mrec_buf_t buf; /*!< buffer for accessing a record + that spans two blocks */ + ulint blocks; /*!< current position in blocks */ + 
ulint bytes; /*!< current position within buf */ +}; + +/** Set of transactions that rolled back inserts of BLOBs during +online table rebuild */ +typedef std::set<trx_id_t> trx_id_set; + +/** @brief Buffer for logging modifications during online index creation + +All modifications to an index that is being created will be logged by +row_log_online_op() to this buffer. + +All modifications to a table that is being rebuilt will be logged by +row_log_table_delete(), row_log_table_update(), row_log_table_insert() +to this buffer. + +When head.blocks == tail.blocks, the reader will access tail.block +directly. When also head.bytes == tail.bytes, both counts will be +reset to 0 and the file will be truncated. */ +struct row_log_t { + int fd; /*!< file descriptor */ + ib_mutex_t mutex; /*!< mutex protecting trx_log, error, + max_trx and tail */ + trx_id_set* trx_rb; /*!< set of transactions that rolled back + inserts of BLOBs during online table rebuild; + protected by mutex */ + dict_table_t* table; /*!< table that is being rebuilt, + or NULL when this is a secondary + index that is being created online */ + bool same_pk;/*!< whether the definition of the PRIMARY KEY + has remained the same */ + const dtuple_t* add_cols; + /*!< default values of added columns, or NULL */ + const ulint* col_map;/*!< mapping of old column numbers to + new ones, or NULL if !table */ + dberr_t error; /*!< error that occurred during online + table rebuild */ + trx_id_t max_trx;/*!< biggest observed trx_id in + row_log_online_op(); + protected by mutex and index->lock S-latch, + or by index->lock X-latch only */ + row_log_buf_t tail; /*!< writer context; + protected by mutex and index->lock S-latch, + or by index->lock X-latch only */ + row_log_buf_t head; /*!< reader context; protected by MDL only; + modifiable by row_log_apply_ops() */ + ulint size; /*!< allocated size */ +}; + +/******************************************************//** +Logs an operation to a secondary index that is (or 
was) being created. */ +UNIV_INTERN +void +row_log_online_op( +/*==============*/ + dict_index_t* index, /*!< in/out: index, S or X latched */ + const dtuple_t* tuple, /*!< in: index tuple */ + trx_id_t trx_id) /*!< in: transaction ID for insert, + or 0 for delete */ +{ + byte* b; + ulint extra_size; + ulint size; + ulint mrec_size; + ulint avail_size; + row_log_t* log; + + ut_ad(dtuple_validate(tuple)); + ut_ad(dtuple_get_n_fields(tuple) == dict_index_get_n_fields(index)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_SHARED) + || rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + if (dict_index_is_corrupted(index)) { + return; + } + + ut_ad(dict_index_is_online_ddl(index)); + + /* Compute the size of the record. This differs from + row_merge_buf_encode(), because here we do not encode + extra_size+1 (and reserve 0 as the end-of-chunk marker). */ + + size = rec_get_converted_size_temp( + index, tuple->fields, tuple->n_fields, &extra_size); + ut_ad(size >= extra_size); + ut_ad(size <= sizeof log->tail.buf); + + mrec_size = ROW_LOG_HEADER_SIZE + + (extra_size >= 0x80) + size + + (trx_id ? 
DATA_TRX_ID_LEN : 0); + + log = index->online_log; + mutex_enter(&log->mutex); + + if (trx_id > log->max_trx) { + log->max_trx = trx_id; + } + + UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf); + + ut_ad(log->tail.bytes < srv_sort_buf_size); + avail_size = srv_sort_buf_size - log->tail.bytes; + + if (mrec_size > avail_size) { + b = log->tail.buf; + } else { + b = log->tail.block + log->tail.bytes; + } + + if (trx_id != 0) { + *b++ = ROW_OP_INSERT; + trx_write_trx_id(b, trx_id); + b += DATA_TRX_ID_LEN; + } else { + *b++ = ROW_OP_DELETE; + } + + if (extra_size < 0x80) { + *b++ = (byte) extra_size; + } else { + ut_ad(extra_size < 0x8000); + *b++ = (byte) (0x80 | (extra_size >> 8)); + *b++ = (byte) extra_size; + } + + rec_convert_dtuple_to_temp( + b + extra_size, index, tuple->fields, tuple->n_fields); + b += size; + + if (mrec_size >= avail_size) { + const os_offset_t byte_offset + = (os_offset_t) log->tail.blocks + * srv_sort_buf_size; + ibool ret; + + if (byte_offset + srv_sort_buf_size >= srv_online_max_size) { + goto write_failed; + } + + if (mrec_size == avail_size) { + ut_ad(b == &log->tail.block[srv_sort_buf_size]); + } else { + ut_ad(b == log->tail.buf + mrec_size); + memcpy(log->tail.block + log->tail.bytes, + log->tail.buf, avail_size); + } + UNIV_MEM_ASSERT_RW(log->tail.block, srv_sort_buf_size); + ret = os_file_write( + "(modification log)", + OS_FILE_FROM_FD(log->fd), + log->tail.block, byte_offset, srv_sort_buf_size); + log->tail.blocks++; + if (!ret) { +write_failed: + /* We set the flag directly instead of invoking + dict_set_corrupted_index_cache_only(index) here, + because the index is not "public" yet. 
*/ + index->type |= DICT_CORRUPT; + } + UNIV_MEM_INVALID(log->tail.block, srv_sort_buf_size); + memcpy(log->tail.block, log->tail.buf + avail_size, + mrec_size - avail_size); + log->tail.bytes = mrec_size - avail_size; + } else { + log->tail.bytes += mrec_size; + ut_ad(b == log->tail.block + log->tail.bytes); + } + + UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf); + mutex_exit(&log->mutex); +} + +/******************************************************//** +Gets the error status of the online index rebuild log. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +row_log_table_get_error( +/*====================*/ + const dict_index_t* index) /*!< in: clustered index of a table + that is being rebuilt online */ +{ + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_online_ddl(index)); + return(index->online_log->error); +} + +/******************************************************//** +Starts logging an operation to a table that is being rebuilt. +@return pointer to log, or NULL if no logging is necessary */ +static __attribute__((nonnull, warn_unused_result)) +byte* +row_log_table_open( +/*===============*/ + row_log_t* log, /*!< in/out: online rebuild log */ + ulint size, /*!< in: size of log record */ + ulint* avail) /*!< out: available size for log record */ +{ + mutex_enter(&log->mutex); + + UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf); + + if (log->error != DB_SUCCESS) { + mutex_exit(&log->mutex); + return(NULL); + } + + ut_ad(log->tail.bytes < srv_sort_buf_size); + *avail = srv_sort_buf_size - log->tail.bytes; + + if (size > *avail) { + return(log->tail.buf); + } else { + return(log->tail.block + log->tail.bytes); + } +} + +/******************************************************//** +Stops logging an operation to a table that is being rebuilt. 
*/ +static __attribute__((nonnull)) +void +row_log_table_close_func( +/*=====================*/ + row_log_t* log, /*!< in/out: online rebuild log */ +#ifdef UNIV_DEBUG + const byte* b, /*!< in: end of log record */ +#endif /* UNIV_DEBUG */ + ulint size, /*!< in: size of log record */ + ulint avail) /*!< in: available size for log record */ +{ + ut_ad(mutex_own(&log->mutex)); + + if (size >= avail) { + const os_offset_t byte_offset + = (os_offset_t) log->tail.blocks + * srv_sort_buf_size; + ibool ret; + + if (byte_offset + srv_sort_buf_size >= srv_online_max_size) { + goto write_failed; + } + + if (size == avail) { + ut_ad(b == &log->tail.block[srv_sort_buf_size]); + } else { + ut_ad(b == log->tail.buf + size); + memcpy(log->tail.block + log->tail.bytes, + log->tail.buf, avail); + } + UNIV_MEM_ASSERT_RW(log->tail.block, srv_sort_buf_size); + ret = os_file_write( + "(modification log)", + OS_FILE_FROM_FD(log->fd), + log->tail.block, byte_offset, srv_sort_buf_size); + log->tail.blocks++; + if (!ret) { +write_failed: + log->error = DB_ONLINE_LOG_TOO_BIG; + } + UNIV_MEM_INVALID(log->tail.block, srv_sort_buf_size); + memcpy(log->tail.block, log->tail.buf + avail, size - avail); + log->tail.bytes = size - avail; + } else { + log->tail.bytes += size; + ut_ad(b == log->tail.block + log->tail.bytes); + } + + UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf); + mutex_exit(&log->mutex); +} + +#ifdef UNIV_DEBUG +# define row_log_table_close(log, b, size, avail) \ + row_log_table_close_func(log, b, size, avail) +#else /* UNIV_DEBUG */ +# define row_log_table_close(log, b, size, avail) \ + row_log_table_close_func(log, size, avail) +#endif /* UNIV_DEBUG */ + +/******************************************************//** +Logs a delete operation to a table that is being rebuilt. +This will be merged in row_log_table_apply_delete(). 
*/ +UNIV_INTERN +void +row_log_table_delete( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */ + trx_id_t trx_id) /*!< in: DB_TRX_ID of the record before + it was deleted */ +{ + ulint old_pk_extra_size; + ulint old_pk_size; + ulint ext_size = 0; + ulint mrec_size; + ulint avail_size; + mem_heap_t* heap = NULL; + const dtuple_t* old_pk; + row_ext_t* ext; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index)); + ut_ad(rec_offs_size(offsets) <= sizeof index->online_log->tail.buf); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED) + || rw_lock_own(&index->lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + if (dict_index_is_corrupted(index) + || !dict_index_is_online_ddl(index) + || index->online_log->error != DB_SUCCESS) { + return; + } + + dict_table_t* new_table = index->online_log->table; + dict_index_t* new_index = dict_table_get_first_index(new_table); + + ut_ad(dict_index_is_clust(new_index)); + ut_ad(!dict_index_is_online_ddl(new_index)); + + /* Create the tuple PRIMARY KEY, DB_TRX_ID in the new_table. */ + if (index->online_log->same_pk) { + byte* db_trx_id; + dtuple_t* tuple; + ut_ad(new_index->n_uniq == index->n_uniq); + + /* The PRIMARY KEY and DB_TRX_ID are in the first + fields of the record. 
*/ + heap = mem_heap_create( + DATA_TRX_ID_LEN + + DTUPLE_EST_ALLOC(new_index->n_uniq + 1)); + old_pk = tuple = dtuple_create(heap, new_index->n_uniq + 1); + dict_index_copy_types(tuple, new_index, tuple->n_fields); + dtuple_set_n_fields_cmp(tuple, new_index->n_uniq); + + for (ulint i = 0; i < new_index->n_uniq; i++) { + ulint len; + const void* field = rec_get_nth_field( + rec, offsets, i, &len); + dfield_t* dfield = dtuple_get_nth_field( + tuple, i); + ut_ad(len != UNIV_SQL_NULL); + ut_ad(!rec_offs_nth_extern(offsets, i)); + dfield_set_data(dfield, field, len); + } + + db_trx_id = static_cast<byte*>( + mem_heap_alloc(heap, DATA_TRX_ID_LEN)); + trx_write_trx_id(db_trx_id, trx_id); + + dfield_set_data(dtuple_get_nth_field(tuple, new_index->n_uniq), + db_trx_id, DATA_TRX_ID_LEN); + } else { + /* The PRIMARY KEY has changed. Translate the tuple. */ + dfield_t* dfield; + + old_pk = row_log_table_get_pk(rec, index, offsets, &heap); + + if (!old_pk) { + ut_ad(index->online_log->error != DB_SUCCESS); + return; + } + + /* Remove DB_ROLL_PTR. */ + ut_ad(dtuple_get_n_fields_cmp(old_pk) + == dict_index_get_n_unique(new_index)); + ut_ad(dtuple_get_n_fields(old_pk) + == dict_index_get_n_unique(new_index) + 2); + const_cast<ulint&>(old_pk->n_fields)--; + + /* Overwrite DB_TRX_ID with the old trx_id. 
*/ + dfield = dtuple_get_nth_field(old_pk, new_index->n_uniq); + ut_ad(dfield_get_type(dfield)->mtype == DATA_SYS); + ut_ad(dfield_get_type(dfield)->prtype + == (DATA_NOT_NULL | DATA_TRX_ID)); + ut_ad(dfield_get_len(dfield) == DATA_TRX_ID_LEN); + trx_write_trx_id(static_cast<byte*>(dfield->data), trx_id); + } + + ut_ad(dtuple_get_n_fields(old_pk) > 1); + ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 1)->len); + old_pk_size = rec_get_converted_size_temp( + new_index, old_pk->fields, old_pk->n_fields, + &old_pk_extra_size); + ut_ad(old_pk_extra_size < 0x100); + + mrec_size = 4 + old_pk_size; + + /* If the row is marked as rollback, we will need to + log the enough prefix of the BLOB unless both the + old and new table are in COMPACT or REDUNDANT format */ + if ((dict_table_get_format(index->table) >= UNIV_FORMAT_B + || dict_table_get_format(new_table) >= UNIV_FORMAT_B) + && row_log_table_is_rollback(index, trx_id)) { + if (rec_offs_any_extern(offsets)) { + /* Build a cache of those off-page column + prefixes that are referenced by secondary + indexes. It can be that none of the off-page + columns are needed. 
*/ + row_build(ROW_COPY_DATA, index, rec, + offsets, NULL, NULL, NULL, &ext, heap); + if (ext) { + /* Log the row_ext_t, ext->ext and ext->buf */ + ext_size = ext->n_ext * ext->max_len + + sizeof(*ext) + + ext->n_ext * sizeof(ulint) + + (ext->n_ext - 1) * sizeof ext->len; + mrec_size += ext_size; + } + } + } + + if (byte* b = row_log_table_open(index->online_log, + mrec_size, &avail_size)) { + *b++ = ROW_T_DELETE; + *b++ = static_cast<byte>(old_pk_extra_size); + + /* Log the size of external prefix we saved */ + mach_write_to_2(b, ext_size); + b += 2; + + rec_convert_dtuple_to_temp( + b + old_pk_extra_size, new_index, + old_pk->fields, old_pk->n_fields); + + b += old_pk_size; + + if (ext_size) { + ulint cur_ext_size = sizeof(*ext) + + (ext->n_ext - 1) * sizeof ext->len; + + memcpy(b, ext, cur_ext_size); + b += cur_ext_size; + + /* Check if we need to col_map to adjust the column + number. If columns were added/removed/reordered, + adjust the column number. */ + if (const ulint* col_map = + index->online_log->col_map) { + for (ulint i = 0; i < ext->n_ext; i++) { + const_cast<ulint&>(ext->ext[i]) = + col_map[ext->ext[i]]; + } + } + + memcpy(b, ext->ext, ext->n_ext * sizeof(*ext->ext)); + b += ext->n_ext * sizeof(*ext->ext); + + ext_size -= cur_ext_size + + ext->n_ext * sizeof(*ext->ext); + memcpy(b, ext->buf, ext_size); + b += ext_size; + } + + row_log_table_close( + index->online_log, b, mrec_size, avail_size); + } + + mem_heap_free(heap); +} + +/******************************************************//** +Logs an insert or update to a table that is being rebuilt. 
*/ +static __attribute__((nonnull(1,2,3))) +void +row_log_table_low_redundant( +/*========================*/ + const rec_t* rec, /*!< in: clustered index leaf + page record in ROW_FORMAT=REDUNDANT, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */ + bool insert, /*!< in: true if insert, + false if update */ + const dtuple_t* old_pk, /*!< in: old PRIMARY KEY value + (if !insert and a PRIMARY KEY + is being created) */ + const dict_index_t* new_index) + /*!< in: clustered index of the + new table, not latched */ +{ + ulint old_pk_size; + ulint old_pk_extra_size; + ulint size; + ulint extra_size; + ulint mrec_size; + ulint avail_size; + mem_heap_t* heap = NULL; + dtuple_t* tuple; + + ut_ad(!page_is_comp(page_align(rec))); + ut_ad(dict_index_get_n_fields(index) == rec_get_n_fields_old(rec)); + + heap = mem_heap_create(DTUPLE_EST_ALLOC(index->n_fields)); + tuple = dtuple_create(heap, index->n_fields); + dict_index_copy_types(tuple, index, index->n_fields); + dtuple_set_n_fields_cmp(tuple, dict_index_get_n_unique(index)); + + if (rec_get_1byte_offs_flag(rec)) { + for (ulint i = 0; i < index->n_fields; i++) { + dfield_t* dfield; + ulint len; + const void* field; + + dfield = dtuple_get_nth_field(tuple, i); + field = rec_get_nth_field_old(rec, i, &len); + + dfield_set_data(dfield, field, len); + } + } else { + for (ulint i = 0; i < index->n_fields; i++) { + dfield_t* dfield; + ulint len; + const void* field; + + dfield = dtuple_get_nth_field(tuple, i); + field = rec_get_nth_field_old(rec, i, &len); + + dfield_set_data(dfield, field, len); + + if (rec_2_is_field_extern(rec, i)) { + dfield_set_ext(dfield); + } + } + } + + size = rec_get_converted_size_temp( + index, tuple->fields, tuple->n_fields, &extra_size); + + mrec_size = ROW_LOG_HEADER_SIZE + size + (extra_size >= 0x80); + + if (insert || index->online_log->same_pk) { + ut_ad(!old_pk); + old_pk_extra_size = 
old_pk_size = 0; + } else { + ut_ad(old_pk); + ut_ad(old_pk->n_fields == 2 + old_pk->n_fields_cmp); + ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 2)->len); + ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 1)->len); + + old_pk_size = rec_get_converted_size_temp( + new_index, old_pk->fields, old_pk->n_fields, + &old_pk_extra_size); + ut_ad(old_pk_extra_size < 0x100); + mrec_size += 1/*old_pk_extra_size*/ + old_pk_size; + } + + if (byte* b = row_log_table_open(index->online_log, + mrec_size, &avail_size)) { + *b++ = insert ? ROW_T_INSERT : ROW_T_UPDATE; + + if (old_pk_size) { + *b++ = static_cast<byte>(old_pk_extra_size); + + rec_convert_dtuple_to_temp( + b + old_pk_extra_size, new_index, + old_pk->fields, old_pk->n_fields); + b += old_pk_size; + } + + if (extra_size < 0x80) { + *b++ = static_cast<byte>(extra_size); + } else { + ut_ad(extra_size < 0x8000); + *b++ = static_cast<byte>(0x80 | (extra_size >> 8)); + *b++ = static_cast<byte>(extra_size); + } + + rec_convert_dtuple_to_temp( + b + extra_size, index, tuple->fields, tuple->n_fields); + b += size; + + row_log_table_close( + index->online_log, b, mrec_size, avail_size); + } + + mem_heap_free(heap); +} + +/******************************************************//** +Logs an insert or update to a table that is being rebuilt. 
*/ +static __attribute__((nonnull(1,2,3))) +void +row_log_table_low( +/*==============*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */ + bool insert, /*!< in: true if insert, false if update */ + const dtuple_t* old_pk) /*!< in: old PRIMARY KEY value (if !insert + and a PRIMARY KEY is being created) */ +{ + ulint omit_size; + ulint old_pk_size; + ulint old_pk_extra_size; + ulint extra_size; + ulint mrec_size; + ulint avail_size; + const dict_index_t* new_index = dict_table_get_first_index( + index->online_log->table); + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_clust(new_index)); + ut_ad(!dict_index_is_online_ddl(new_index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index)); + ut_ad(rec_offs_size(offsets) <= sizeof index->online_log->tail.buf); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED) + || rw_lock_own(&index->lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX); + ut_ad(page_is_leaf(page_align(rec))); + ut_ad(!page_is_comp(page_align(rec)) == !rec_offs_comp(offsets)); + + if (dict_index_is_corrupted(index) + || !dict_index_is_online_ddl(index) + || index->online_log->error != DB_SUCCESS) { + return; + } + + if (!rec_offs_comp(offsets)) { + row_log_table_low_redundant( + rec, index, offsets, insert, old_pk, new_index); + return; + } + + ut_ad(page_is_comp(page_align(rec))); + ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY); + + omit_size = REC_N_NEW_EXTRA_BYTES; + + extra_size = rec_offs_extra_size(offsets) - omit_size; + + mrec_size = rec_offs_size(offsets) - omit_size + + ROW_LOG_HEADER_SIZE + (extra_size >= 0x80); + + if (insert || index->online_log->same_pk) { + ut_ad(!old_pk); + old_pk_extra_size = 
old_pk_size = 0; + } else { + ut_ad(old_pk); + ut_ad(old_pk->n_fields == 2 + old_pk->n_fields_cmp); + ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 2)->len); + ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 1)->len); + + old_pk_size = rec_get_converted_size_temp( + new_index, old_pk->fields, old_pk->n_fields, + &old_pk_extra_size); + ut_ad(old_pk_extra_size < 0x100); + mrec_size += 1/*old_pk_extra_size*/ + old_pk_size; + } + + if (byte* b = row_log_table_open(index->online_log, + mrec_size, &avail_size)) { + *b++ = insert ? ROW_T_INSERT : ROW_T_UPDATE; + + if (old_pk_size) { + *b++ = static_cast<byte>(old_pk_extra_size); + + rec_convert_dtuple_to_temp( + b + old_pk_extra_size, new_index, + old_pk->fields, old_pk->n_fields); + b += old_pk_size; + } + + if (extra_size < 0x80) { + *b++ = static_cast<byte>(extra_size); + } else { + ut_ad(extra_size < 0x8000); + *b++ = static_cast<byte>(0x80 | (extra_size >> 8)); + *b++ = static_cast<byte>(extra_size); + } + + memcpy(b, rec - rec_offs_extra_size(offsets), extra_size); + b += extra_size; + memcpy(b, rec, rec_offs_data_size(offsets)); + b += rec_offs_data_size(offsets); + + row_log_table_close( + index->online_log, b, mrec_size, avail_size); + } +} + +/******************************************************//** +Logs an update to a table that is being rebuilt. +This will be merged in row_log_table_apply_update(). 
*/ +UNIV_INTERN +void +row_log_table_update( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */ + const dtuple_t* old_pk) /*!< in: row_log_table_get_pk() + before the update */ +{ + row_log_table_low(rec, index, offsets, false, old_pk); +} + +/******************************************************//** +Constructs the old PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR +of a table that is being rebuilt. +@return tuple of PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in the rebuilt table, +or NULL if the PRIMARY KEY definition does not change */ +UNIV_INTERN +const dtuple_t* +row_log_table_get_pk( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */ + mem_heap_t** heap) /*!< in/out: memory heap where allocated */ +{ + dtuple_t* tuple = NULL; + row_log_t* log = index->online_log; + + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_online_ddl(index)); + ut_ad(!offsets || rec_offs_validate(rec, index, offsets)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED) + || rw_lock_own(&index->lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + ut_ad(log); + ut_ad(log->table); + + if (log->same_pk) { + /* The PRIMARY KEY columns are unchanged. */ + return(NULL); + } + + mutex_enter(&log->mutex); + + /* log->error is protected by log->mutex. 
*/ + if (log->error == DB_SUCCESS) { + dict_table_t* new_table = log->table; + dict_index_t* new_index + = dict_table_get_first_index(new_table); + const ulint new_n_uniq + = dict_index_get_n_unique(new_index); + + if (!*heap) { + ulint size = 0; + + if (!offsets) { + size += (1 + REC_OFFS_HEADER_SIZE + + index->n_fields) + * sizeof *offsets; + } + + for (ulint i = 0; i < new_n_uniq; i++) { + size += dict_col_get_min_size( + dict_index_get_nth_col(new_index, i)); + } + + *heap = mem_heap_create( + DTUPLE_EST_ALLOC(new_n_uniq + 2) + size); + } + + if (!offsets) { + offsets = rec_get_offsets(rec, index, NULL, + ULINT_UNDEFINED, heap); + } + + tuple = dtuple_create(*heap, new_n_uniq + 2); + dict_index_copy_types(tuple, new_index, tuple->n_fields); + dtuple_set_n_fields_cmp(tuple, new_n_uniq); + + for (ulint new_i = 0; new_i < new_n_uniq; new_i++) { + dict_field_t* ifield; + dfield_t* dfield; + const dict_col_t* new_col; + const dict_col_t* col; + ulint col_no; + ulint i; + ulint len; + const byte* field; + + ifield = dict_index_get_nth_field(new_index, new_i); + dfield = dtuple_get_nth_field(tuple, new_i); + new_col = dict_field_get_col(ifield); + col_no = new_col->ind; + + for (ulint old_i = 0; old_i < index->table->n_cols; + old_i++) { + if (col_no == log->col_map[old_i]) { + col_no = old_i; + goto copy_col; + } + } + + /* No matching column was found in the old + table, so this must be an added column. + Copy the default value. 
*/ + ut_ad(log->add_cols); + dfield_copy(dfield, + dtuple_get_nth_field( + log->add_cols, col_no)); + continue; + +copy_col: + col = dict_table_get_nth_col(index->table, col_no); + + i = dict_col_get_clust_pos(col, index); + + if (i == ULINT_UNDEFINED) { + ut_ad(0); + log->error = DB_CORRUPTION; + tuple = NULL; + goto func_exit; + } + + field = rec_get_nth_field(rec, offsets, i, &len); + + if (len == UNIV_SQL_NULL) { + log->error = DB_INVALID_NULL; + tuple = NULL; + goto func_exit; + } + + if (rec_offs_nth_extern(offsets, i)) { + ulint field_len = ifield->prefix_len; + byte* blob_field; + const ulint max_len = + DICT_MAX_FIELD_LEN_BY_FORMAT( + new_table); + + if (!field_len) { + field_len = ifield->fixed_len; + if (!field_len) { + field_len = max_len + 1; + } + } + + blob_field = static_cast<byte*>( + mem_heap_alloc(*heap, field_len)); + + len = btr_copy_externally_stored_field_prefix( + blob_field, field_len, + dict_table_zip_size(index->table), + field, len); + if (len == max_len + 1) { + log->error = DB_TOO_BIG_INDEX_COL; + tuple = NULL; + goto func_exit; + } + + dfield_set_data(dfield, blob_field, len); + } else { + if (ifield->prefix_len + && ifield->prefix_len < len) { + len = ifield->prefix_len; + } + + dfield_set_data( + dfield, + mem_heap_dup(*heap, field, len), len); + } + } + + const byte* trx_roll = rec + + row_get_trx_id_offset(index, offsets); + + dfield_set_data(dtuple_get_nth_field(tuple, new_n_uniq), + trx_roll, DATA_TRX_ID_LEN); + dfield_set_data(dtuple_get_nth_field(tuple, new_n_uniq + 1), + trx_roll + DATA_TRX_ID_LEN, DATA_ROLL_PTR_LEN); + } + +func_exit: + mutex_exit(&log->mutex); + return(tuple); +} + +/******************************************************//** +Logs an insert to a table that is being rebuilt. +This will be merged in row_log_table_apply_insert(). 
*/ +UNIV_INTERN +void +row_log_table_insert( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets)/*!< in: rec_get_offsets(rec,index) */ +{ + row_log_table_low(rec, index, offsets, true, NULL); +} + +/******************************************************//** +Notes that a transaction is being rolled back. */ +UNIV_INTERN +void +row_log_table_rollback( +/*===================*/ + dict_index_t* index, /*!< in/out: clustered index */ + trx_id_t trx_id) /*!< in: transaction being rolled back */ +{ + ut_ad(dict_index_is_clust(index)); +#ifdef UNIV_DEBUG + ibool corrupt = FALSE; + ut_ad(trx_rw_is_active(trx_id, &corrupt)); + ut_ad(!corrupt); +#endif /* UNIV_DEBUG */ + + /* Protect transitions of index->online_status and access to + index->online_log. */ + rw_lock_s_lock(&index->lock); + + if (dict_index_is_online_ddl(index)) { + ut_ad(index->online_log); + ut_ad(index->online_log->table); + mutex_enter(&index->online_log->mutex); + trx_id_set* trxs = index->online_log->trx_rb; + + if (!trxs) { + index->online_log->trx_rb = trxs = new trx_id_set(); + } + + trxs->insert(trx_id); + + mutex_exit(&index->online_log->mutex); + } + + rw_lock_s_unlock(&index->lock); +} + +/******************************************************//** +Check if a transaction rollback has been initiated. 
+@return true if inserts of this transaction were rolled back */ +UNIV_INTERN +bool +row_log_table_is_rollback( +/*======================*/ + const dict_index_t* index, /*!< in: clustered index */ + trx_id_t trx_id) /*!< in: transaction id */ +{ + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_online_ddl(index)); + ut_ad(index->online_log); + + if (const trx_id_set* trxs = index->online_log->trx_rb) { + mutex_enter(&index->online_log->mutex); + bool is_rollback = trxs->find(trx_id) != trxs->end(); + mutex_exit(&index->online_log->mutex); + + return(is_rollback); + } + + return(false); +} + +/******************************************************//** +Converts a log record to a table row. +@return converted row, or NULL if the conversion fails +or the transaction has been rolled back */ +static __attribute__((nonnull, warn_unused_result)) +const dtuple_t* +row_log_table_apply_convert_mrec( +/*=============================*/ + const mrec_t* mrec, /*!< in: merge record */ + dict_index_t* index, /*!< in: index of mrec */ + const ulint* offsets, /*!< in: offsets of mrec */ + const row_log_t* log, /*!< in: rebuild context */ + mem_heap_t* heap, /*!< in/out: memory heap */ + trx_id_t trx_id, /*!< in: DB_TRX_ID of mrec */ + dberr_t* error) /*!< out: DB_SUCCESS or + reason of failure */ +{ + dtuple_t* row; + +#ifdef UNIV_SYNC_DEBUG + /* This prevents BLOBs from being freed, in case an insert + transaction rollback starts after row_log_table_is_rollback(). */ + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + if (row_log_table_is_rollback(index, trx_id)) { + row = NULL; + goto func_exit; + } + + /* This is based on row_build(). 
*/ + if (log->add_cols) { + row = dtuple_copy(log->add_cols, heap); + /* dict_table_copy_types() would set the fields to NULL */ + for (ulint i = 0; i < dict_table_get_n_cols(log->table); i++) { + dict_col_copy_type( + dict_table_get_nth_col(log->table, i), + dfield_get_type(dtuple_get_nth_field(row, i))); + } + } else { + row = dtuple_create(heap, dict_table_get_n_cols(log->table)); + dict_table_copy_types(row, log->table); + } + + for (ulint i = 0; i < rec_offs_n_fields(offsets); i++) { + const dict_field_t* ind_field + = dict_index_get_nth_field(index, i); + + if (ind_field->prefix_len) { + /* Column prefixes can only occur in key + fields, which cannot be stored externally. For + a column prefix, there should also be the full + field in the clustered index tuple. The row + tuple comprises full fields, not prefixes. */ + ut_ad(!rec_offs_nth_extern(offsets, i)); + continue; + } + + const dict_col_t* col + = dict_field_get_col(ind_field); + ulint col_no + = log->col_map[dict_col_get_no(col)]; + + if (col_no == ULINT_UNDEFINED) { + /* dropped column */ + continue; + } + + dfield_t* dfield + = dtuple_get_nth_field(row, col_no); + ulint len; + const void* data; + + if (rec_offs_nth_extern(offsets, i)) { + ut_ad(rec_offs_any_extern(offsets)); + data = btr_rec_copy_externally_stored_field( + mrec, offsets, + dict_table_zip_size(index->table), + i, &len, heap); + ut_a(data); + } else { + data = rec_get_nth_field(mrec, offsets, i, &len); + } + + dfield_set_data(dfield, data, len); + + /* See if any columns were changed to NULL or NOT NULL. */ + const dict_col_t* new_col + = dict_table_get_nth_col(log->table, col_no); + ut_ad(new_col->mtype == col->mtype); + + /* Assert that prtype matches except for nullability. 
*/ + ut_ad(!((new_col->prtype ^ col->prtype) & ~DATA_NOT_NULL)); + ut_ad(!((new_col->prtype ^ dfield_get_type(dfield)->prtype) + & ~DATA_NOT_NULL)); + + if (new_col->prtype == col->prtype) { + continue; + } + + if ((new_col->prtype & DATA_NOT_NULL) + && dfield_is_null(dfield)) { + /* We got a NULL value for a NOT NULL column. */ + *error = DB_INVALID_NULL; + return(NULL); + } + + /* Adjust the DATA_NOT_NULL flag in the parsed row. */ + dfield_get_type(dfield)->prtype = new_col->prtype; + + ut_ad(dict_col_type_assert_equal(new_col, + dfield_get_type(dfield))); + } + +func_exit: + *error = DB_SUCCESS; + return(row); +} + +/******************************************************//** +Replays an insert operation on a table that was rebuilt. +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_log_table_apply_insert_low( +/*===========================*/ + que_thr_t* thr, /*!< in: query graph */ + const dtuple_t* row, /*!< in: table row + in the old table definition */ + trx_id_t trx_id, /*!< in: trx_id of the row */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + row_merge_dup_t* dup) /*!< in/out: for reporting + duplicate key errors */ +{ + dberr_t error; + dtuple_t* entry; + const row_log_t*log = dup->index->online_log; + dict_index_t* index = dict_table_get_first_index(log->table); + + ut_ad(dtuple_validate(row)); + ut_ad(trx_id); + +#ifdef ROW_LOG_APPLY_PRINT + if (row_log_apply_print) { + fprintf(stderr, "table apply insert " + IB_ID_FMT " " IB_ID_FMT "\n", + index->table->id, index->id); + dtuple_print(stderr, row); + } +#endif /* ROW_LOG_APPLY_PRINT */ + + static const ulint flags + = (BTR_CREATE_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG); + + entry = row_build_index_entry(row, NULL, index, heap); + + error = row_ins_clust_index_entry_low( + flags, BTR_MODIFY_TREE, index, index->n_uniq, entry, 0, thr); 
+ + switch (error) { + case DB_SUCCESS: + break; + case DB_SUCCESS_LOCKED_REC: + /* The row had already been copied to the table. */ + return(DB_SUCCESS); + default: + return(error); + } + + do { + if (!(index = dict_table_get_next_index(index))) { + break; + } + + if (index->type & DICT_FTS) { + continue; + } + + entry = row_build_index_entry(row, NULL, index, heap); + error = row_ins_sec_index_entry_low( + flags, BTR_MODIFY_TREE, + index, offsets_heap, heap, entry, trx_id, thr); + } while (error == DB_SUCCESS); + + return(error); +} + +/******************************************************//** +Replays an insert operation on a table that was rebuilt. +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_log_table_apply_insert( +/*=======================*/ + que_thr_t* thr, /*!< in: query graph */ + const mrec_t* mrec, /*!< in: record to insert */ + const ulint* offsets, /*!< in: offsets of mrec */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + row_merge_dup_t* dup, /*!< in/out: for reporting + duplicate key errors */ + trx_id_t trx_id) /*!< in: DB_TRX_ID of mrec */ +{ + const row_log_t*log = dup->index->online_log; + dberr_t error; + const dtuple_t* row = row_log_table_apply_convert_mrec( + mrec, dup->index, offsets, log, heap, trx_id, &error); + + ut_ad(error == DB_SUCCESS || !row); + /* Handling of duplicate key error requires storing + of offending key in a record buffer. */ + ut_ad(error != DB_DUPLICATE_KEY); + + if (error != DB_SUCCESS) + return(error); + + if (row) { + error = row_log_table_apply_insert_low( + thr, row, trx_id, offsets_heap, heap, dup); + if (error != DB_SUCCESS) { + /* Report the erroneous row using the new + version of the table. 
*/ + innobase_row_to_mysql(dup->table, log->table, row); + } + } + return(error); +} + +/******************************************************//** +Deletes a record from a table that is being rebuilt. +@return DB_SUCCESS or error code */ +static __attribute__((nonnull(1, 2, 4, 5), warn_unused_result)) +dberr_t +row_log_table_apply_delete_low( +/*===========================*/ + btr_pcur_t* pcur, /*!< in/out: B-tree cursor, + will be trashed */ + const ulint* offsets, /*!< in: offsets on pcur */ + const row_ext_t* save_ext, /*!< in: saved external field + info, or NULL */ + mem_heap_t* heap, /*!< in/out: memory heap */ + mtr_t* mtr) /*!< in/out: mini-transaction, + will be committed */ +{ + dberr_t error; + row_ext_t* ext; + dtuple_t* row; + dict_index_t* index = btr_pcur_get_btr_cur(pcur)->index; + + ut_ad(dict_index_is_clust(index)); + +#ifdef ROW_LOG_APPLY_PRINT + if (row_log_apply_print) { + fprintf(stderr, "table apply delete " + IB_ID_FMT " " IB_ID_FMT "\n", + index->table->id, index->id); + rec_print_new(stderr, btr_pcur_get_rec(pcur), offsets); + } +#endif /* ROW_LOG_APPLY_PRINT */ + if (dict_table_get_next_index(index)) { + /* Build a row template for purging secondary index entries. */ + row = row_build( + ROW_COPY_DATA, index, btr_pcur_get_rec(pcur), + offsets, NULL, NULL, NULL, + save_ext ? 
NULL : &ext, heap); + if (!save_ext) { + save_ext = ext; + } + } else { + row = NULL; + } + + btr_cur_pessimistic_delete(&error, FALSE, btr_pcur_get_btr_cur(pcur), + BTR_CREATE_FLAG, RB_NONE, mtr); + mtr_commit(mtr); + + if (error != DB_SUCCESS) { + return(error); + } + + while ((index = dict_table_get_next_index(index)) != NULL) { + if (index->type & DICT_FTS) { + continue; + } + + const dtuple_t* entry = row_build_index_entry( + row, save_ext, index, heap); + mtr_start(mtr); + btr_pcur_open(index, entry, PAGE_CUR_LE, + BTR_MODIFY_TREE, pcur, mtr); +#ifdef UNIV_DEBUG + switch (btr_pcur_get_btr_cur(pcur)->flag) { + case BTR_CUR_DELETE_REF: + case BTR_CUR_DEL_MARK_IBUF: + case BTR_CUR_DELETE_IBUF: + case BTR_CUR_INSERT_TO_IBUF: + /* We did not request buffering. */ + break; + case BTR_CUR_HASH: + case BTR_CUR_HASH_FAIL: + case BTR_CUR_BINARY: + goto flag_ok; + } + ut_ad(0); +flag_ok: +#endif /* UNIV_DEBUG */ + + if (page_rec_is_infimum(btr_pcur_get_rec(pcur)) + || btr_pcur_get_low_match(pcur) < index->n_uniq) { + /* All secondary index entries should be + found, because new_table is being modified by + this thread only, and all indexes should be + updated in sync. */ + mtr_commit(mtr); + return(DB_INDEX_CORRUPT); + } + + btr_cur_pessimistic_delete(&error, FALSE, + btr_pcur_get_btr_cur(pcur), + BTR_CREATE_FLAG, RB_NONE, mtr); + mtr_commit(mtr); + } + + return(error); +} + +/******************************************************//** +Replays a delete operation on a table that was rebuilt. 
+@return DB_SUCCESS or error code */ +static __attribute__((nonnull(1, 3, 4, 5, 6, 7), warn_unused_result)) +dberr_t +row_log_table_apply_delete( +/*=======================*/ + que_thr_t* thr, /*!< in: query graph */ + ulint trx_id_col, /*!< in: position of + DB_TRX_ID in the new + clustered index */ + const mrec_t* mrec, /*!< in: merge record */ + const ulint* moffsets, /*!< in: offsets of mrec */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + dict_table_t* new_table, /*!< in: rebuilt table */ + const row_ext_t* save_ext) /*!< in: saved external field + info, or NULL */ +{ + dict_index_t* index = dict_table_get_first_index(new_table); + dtuple_t* old_pk; + mtr_t mtr; + btr_pcur_t pcur; + ulint* offsets; + + ut_ad(rec_offs_n_fields(moffsets) + == dict_index_get_n_unique(index) + 1); + ut_ad(!rec_offs_any_extern(moffsets)); + + /* Convert the row to a search tuple. */ + old_pk = dtuple_create(heap, index->n_uniq + 1); + dict_index_copy_types(old_pk, index, old_pk->n_fields); + dtuple_set_n_fields_cmp(old_pk, index->n_uniq); + + for (ulint i = 0; i <= index->n_uniq; i++) { + ulint len; + const void* field; + field = rec_get_nth_field(mrec, moffsets, i, &len); + ut_ad(len != UNIV_SQL_NULL); + dfield_set_data(dtuple_get_nth_field(old_pk, i), + field, len); + } + + mtr_start(&mtr); + btr_pcur_open(index, old_pk, PAGE_CUR_LE, + BTR_MODIFY_TREE, &pcur, &mtr); +#ifdef UNIV_DEBUG + switch (btr_pcur_get_btr_cur(&pcur)->flag) { + case BTR_CUR_DELETE_REF: + case BTR_CUR_DEL_MARK_IBUF: + case BTR_CUR_DELETE_IBUF: + case BTR_CUR_INSERT_TO_IBUF: + /* We did not request buffering. */ + break; + case BTR_CUR_HASH: + case BTR_CUR_HASH_FAIL: + case BTR_CUR_BINARY: + goto flag_ok; + } + ut_ad(0); +flag_ok: +#endif /* UNIV_DEBUG */ + + if (page_rec_is_infimum(btr_pcur_get_rec(&pcur)) + || btr_pcur_get_low_match(&pcur) < index->n_uniq) { +all_done: + mtr_commit(&mtr); + /* The record was not found. 
All done. */ + return(DB_SUCCESS); + } + + offsets = rec_get_offsets(btr_pcur_get_rec(&pcur), index, NULL, + ULINT_UNDEFINED, &offsets_heap); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(btr_pcur_get_rec(&pcur), offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + + /* Only remove the record if DB_TRX_ID matches what was + buffered. */ + + { + ulint len; + const void* mrec_trx_id + = rec_get_nth_field(mrec, moffsets, trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + const void* rec_trx_id + = rec_get_nth_field(btr_pcur_get_rec(&pcur), offsets, + trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + if (memcmp(mrec_trx_id, rec_trx_id, DATA_TRX_ID_LEN)) { + goto all_done; + } + } + + return(row_log_table_apply_delete_low(&pcur, offsets, save_ext, + heap, &mtr)); +} + +/******************************************************//** +Replays an update operation on a table that was rebuilt. +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_log_table_apply_update( +/*=======================*/ + que_thr_t* thr, /*!< in: query graph */ + ulint trx_id_col, /*!< in: position of + DB_TRX_ID in the + old clustered index */ + ulint new_trx_id_col, /*!< in: position of + DB_TRX_ID in the new + clustered index */ + const mrec_t* mrec, /*!< in: new value */ + const ulint* offsets, /*!< in: offsets of mrec */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + row_merge_dup_t* dup, /*!< in/out: for reporting + duplicate key errors */ + trx_id_t trx_id, /*!< in: DB_TRX_ID of mrec */ + const dtuple_t* old_pk) /*!< in: PRIMARY KEY and + DB_TRX_ID,DB_ROLL_PTR + of the old value, + or PRIMARY KEY if same_pk */ +{ + const row_log_t*log = dup->index->online_log; + const dtuple_t* row; + dict_index_t* index = dict_table_get_first_index(log->table); + mtr_t mtr; + btr_pcur_t pcur; + dberr_t error; + + 
ut_ad(dtuple_get_n_fields_cmp(old_pk) + == dict_index_get_n_unique(index)); + ut_ad(dtuple_get_n_fields(old_pk) + == dict_index_get_n_unique(index) + + (dup->index->online_log->same_pk ? 0 : 2)); + + row = row_log_table_apply_convert_mrec( + mrec, dup->index, offsets, log, heap, trx_id, &error); + + ut_ad(error == DB_SUCCESS || !row); + /* Handling of duplicate key error requires storing + of offending key in a record buffer. */ + ut_ad(error != DB_DUPLICATE_KEY); + + if (!row) { + return(error); + } + + mtr_start(&mtr); + btr_pcur_open(index, old_pk, PAGE_CUR_LE, + BTR_MODIFY_TREE, &pcur, &mtr); +#ifdef UNIV_DEBUG + switch (btr_pcur_get_btr_cur(&pcur)->flag) { + case BTR_CUR_DELETE_REF: + case BTR_CUR_DEL_MARK_IBUF: + case BTR_CUR_DELETE_IBUF: + case BTR_CUR_INSERT_TO_IBUF: + ut_ad(0);/* We did not request buffering. */ + case BTR_CUR_HASH: + case BTR_CUR_HASH_FAIL: + case BTR_CUR_BINARY: + break; + } +#endif /* UNIV_DEBUG */ + + if (page_rec_is_infimum(btr_pcur_get_rec(&pcur)) + || btr_pcur_get_low_match(&pcur) < index->n_uniq) { + mtr_commit(&mtr); +insert: + ut_ad(mtr.state == MTR_COMMITTED); + /* The row was not found. Insert it. */ + error = row_log_table_apply_insert_low( + thr, row, trx_id, offsets_heap, heap, dup); + if (error != DB_SUCCESS) { +err_exit: + /* Report the erroneous row using the new + version of the table. */ + innobase_row_to_mysql(dup->table, log->table, row); + } + + return(error); + } + + /* Update the record. */ + ulint* cur_offsets = rec_get_offsets( + btr_pcur_get_rec(&pcur), + index, NULL, ULINT_UNDEFINED, &offsets_heap); + + dtuple_t* entry = row_build_index_entry( + row, NULL, index, heap); + const upd_t* update = row_upd_build_difference_binary( + index, entry, btr_pcur_get_rec(&pcur), cur_offsets, + false, NULL, heap); + + error = DB_SUCCESS; + + if (!update->n_fields) { + /* Nothing to do. 
*/ + goto func_exit; + } + + if (rec_offs_any_extern(cur_offsets)) { + /* If the record contains any externally stored + columns, perform the update by delete and insert, + because we will not write any undo log that would + allow purge to free any orphaned externally stored + columns. */ +delete_insert: + error = row_log_table_apply_delete_low( + &pcur, cur_offsets, NULL, heap, &mtr); + ut_ad(mtr.state == MTR_COMMITTED); + + if (error != DB_SUCCESS) { + goto err_exit; + } + + goto insert; + } + + if (upd_get_nth_field(update, 0)->field_no < new_trx_id_col) { + if (dup->index->online_log->same_pk) { + /* The ROW_T_UPDATE log record should only be + written when the PRIMARY KEY fields of the + record did not change in the old table. We + can only get a change of PRIMARY KEY columns + in the rebuilt table if the PRIMARY KEY was + redefined (!same_pk). */ + ut_ad(0); + error = DB_CORRUPTION; + goto func_exit; + } + + /* The PRIMARY KEY columns have changed. + Delete the record with the old PRIMARY KEY value, + provided that it carries the same + DB_TRX_ID,DB_ROLL_PTR. Then, insert the new row. */ + ulint len; + const byte* cur_trx_roll = rec_get_nth_field( + mrec, offsets, trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + const dfield_t* new_trx_roll = dtuple_get_nth_field( + old_pk, new_trx_id_col); + /* We assume that DB_TRX_ID,DB_ROLL_PTR are stored + in one contiguous block. */ + ut_ad(rec_get_nth_field(mrec, offsets, trx_id_col + 1, &len) + == cur_trx_roll + DATA_TRX_ID_LEN); + ut_ad(len == DATA_ROLL_PTR_LEN); + ut_ad(new_trx_roll->len == DATA_TRX_ID_LEN); + ut_ad(dtuple_get_nth_field(old_pk, new_trx_id_col + 1) + -> len == DATA_ROLL_PTR_LEN); + ut_ad(static_cast<const byte*>( + dtuple_get_nth_field(old_pk, new_trx_id_col + 1) + ->data) + == static_cast<const byte*>(new_trx_roll->data) + + DATA_TRX_ID_LEN); + + if (!memcmp(cur_trx_roll, new_trx_roll->data, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) { + /* The old row exists. Remove it. 
*/ + goto delete_insert; + } + + /* Unless we called row_log_table_apply_delete_low(), + this will likely cause a duplicate key error. */ + mtr_commit(&mtr); + goto insert; + } + + dtuple_t* old_row; + row_ext_t* old_ext; + + if (dict_table_get_next_index(index)) { + /* Construct the row corresponding to the old value of + the record. */ + old_row = row_build( + ROW_COPY_DATA, index, btr_pcur_get_rec(&pcur), + cur_offsets, NULL, NULL, NULL, &old_ext, heap); + ut_ad(old_row); +#ifdef ROW_LOG_APPLY_PRINT + if (row_log_apply_print) { + fprintf(stderr, "table apply update " + IB_ID_FMT " " IB_ID_FMT "\n", + index->table->id, index->id); + dtuple_print(stderr, old_row); + dtuple_print(stderr, row); + } +#endif /* ROW_LOG_APPLY_PRINT */ + } else { + old_row = NULL; + old_ext = NULL; + } + + big_rec_t* big_rec; + + error = btr_cur_pessimistic_update( + BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG + | BTR_KEEP_POS_FLAG, + btr_pcur_get_btr_cur(&pcur), + &cur_offsets, &offsets_heap, heap, &big_rec, + update, 0, NULL, 0, &mtr); + + if (big_rec) { + if (error == DB_SUCCESS) { + error = btr_store_big_rec_extern_fields( + index, btr_pcur_get_block(&pcur), + btr_pcur_get_rec(&pcur), cur_offsets, + big_rec, &mtr, BTR_STORE_UPDATE); + } + + dtuple_big_rec_free(big_rec); + } + + while ((index = dict_table_get_next_index(index)) != NULL) { + if (error != DB_SUCCESS) { + break; + } + + if (index->type & DICT_FTS) { + continue; + } + + if (!row_upd_changes_ord_field_binary( + index, update, thr, old_row, NULL)) { + continue; + } + + mtr_commit(&mtr); + + entry = row_build_index_entry(old_row, old_ext, index, heap); + if (!entry) { + ut_ad(0); + return(DB_CORRUPTION); + } + + mtr_start(&mtr); + + if (ROW_FOUND != row_search_index_entry( + index, entry, BTR_MODIFY_TREE, &pcur, &mtr)) { + ut_ad(0); + error = DB_CORRUPTION; + break; + } + + btr_cur_pessimistic_delete( + &error, FALSE, btr_pcur_get_btr_cur(&pcur), + BTR_CREATE_FLAG, RB_NONE, &mtr); + + 
if (error != DB_SUCCESS) { + break; + } + + mtr_commit(&mtr); + + entry = row_build_index_entry(row, NULL, index, heap); + error = row_ins_sec_index_entry_low( + BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG, + BTR_MODIFY_TREE, index, offsets_heap, heap, + entry, trx_id, thr); + + mtr_start(&mtr); + } + +func_exit: + mtr_commit(&mtr); + if (error != DB_SUCCESS) { + goto err_exit; + } + + return(error); +} + +/******************************************************//** +Applies an operation to a table that was rebuilt. +@return NULL on failure (mrec corruption) or when out of data; +pointer to next record on success */ +static __attribute__((nonnull, warn_unused_result)) +const mrec_t* +row_log_table_apply_op( +/*===================*/ + que_thr_t* thr, /*!< in: query graph */ + ulint trx_id_col, /*!< in: position of + DB_TRX_ID in old index */ + ulint new_trx_id_col, /*!< in: position of + DB_TRX_ID in new index */ + row_merge_dup_t* dup, /*!< in/out: for reporting + duplicate key errors */ + dberr_t* error, /*!< out: DB_SUCCESS + or error code */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + const mrec_t* mrec, /*!< in: merge record */ + const mrec_t* mrec_end, /*!< in: end of buffer */ + ulint* offsets) /*!< in/out: work area + for parsing mrec */ +{ + const row_log_t*log = dup->index->online_log; + dict_index_t* new_index = dict_table_get_first_index(log->table); + ulint extra_size; + const mrec_t* next_mrec; + dtuple_t* old_pk; + row_ext_t* ext; + ulint ext_size; + + ut_ad(dict_index_is_clust(dup->index)); + ut_ad(dup->index->table != log->table); + + *error = DB_SUCCESS; + + /* 3 = 1 (op type) + 1 (ext_size) + at least 1 byte payload */ + if (mrec + 3 >= mrec_end) { + return(NULL); + } + + switch (*mrec++) { + default: + ut_ad(0); + *error = DB_CORRUPTION; + return(NULL); + case ROW_T_INSERT: + extra_size = *mrec++; + + if (extra_size >= 0x80) { 
+ /* Read another byte of extra_size. */ + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *mrec++; + } + + mrec += extra_size; + + if (mrec > mrec_end) { + return(NULL); + } + + rec_offs_set_n_fields(offsets, dup->index->n_fields); + rec_init_offsets_temp(mrec, dup->index, offsets); + + next_mrec = mrec + rec_offs_data_size(offsets); + + if (next_mrec > mrec_end) { + return(NULL); + } else { + ulint len; + const byte* db_trx_id + = rec_get_nth_field( + mrec, offsets, trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + *error = row_log_table_apply_insert( + thr, mrec, offsets, offsets_heap, + heap, dup, trx_read_trx_id(db_trx_id)); + } + break; + + case ROW_T_DELETE: + /* 1 (extra_size) + 2 (ext_size) + at least 1 (payload) */ + if (mrec + 4 >= mrec_end) { + return(NULL); + } + + extra_size = *mrec++; + ext_size = mach_read_from_2(mrec); + mrec += 2; + ut_ad(mrec < mrec_end); + + /* We assume extra_size < 0x100 for the PRIMARY KEY prefix. + For fixed-length PRIMARY key columns, it is 0. 
*/ + mrec += extra_size; + + rec_offs_set_n_fields(offsets, new_index->n_uniq + 1); + rec_init_offsets_temp(mrec, new_index, offsets); + next_mrec = mrec + rec_offs_data_size(offsets) + ext_size; + if (next_mrec > mrec_end) { + return(NULL); + } + + /* If there are external fields, retrieve those logged + prefix info and reconstruct the row_ext_t */ + if (ext_size) { + /* We use memcpy to avoid unaligned + access on some non-x86 platforms.*/ + ext = static_cast<row_ext_t*>( + mem_heap_dup(heap, + mrec + rec_offs_data_size(offsets), + ext_size)); + + byte* ext_start = reinterpret_cast<byte*>(ext); + + ulint ext_len = sizeof(*ext) + + (ext->n_ext - 1) * sizeof ext->len; + + ext->ext = reinterpret_cast<ulint*>(ext_start + ext_len); + ext_len += ext->n_ext * sizeof(*ext->ext); + + ext->buf = static_cast<byte*>(ext_start + ext_len); + } else { + ext = NULL; + } + + *error = row_log_table_apply_delete( + thr, new_trx_id_col, + mrec, offsets, offsets_heap, heap, + log->table, ext); + break; + + case ROW_T_UPDATE: + /* Logically, the log entry consists of the + (PRIMARY KEY,DB_TRX_ID) of the old value (converted + to the new primary key definition) followed by + the new value in the old table definition. If the + definition of the columns belonging to PRIMARY KEY + is not changed, the log will only contain + DB_TRX_ID,new_row. */ + + if (dup->index->online_log->same_pk) { + ut_ad(new_index->n_uniq == dup->index->n_uniq); + + extra_size = *mrec++; + + if (extra_size >= 0x80) { + /* Read another byte of extra_size. 
*/ + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *mrec++; + } + + mrec += extra_size; + + if (mrec > mrec_end) { + return(NULL); + } + + rec_offs_set_n_fields(offsets, dup->index->n_fields); + rec_init_offsets_temp(mrec, dup->index, offsets); + + next_mrec = mrec + rec_offs_data_size(offsets); + + if (next_mrec > mrec_end) { + return(NULL); + } + + old_pk = dtuple_create(heap, new_index->n_uniq); + dict_index_copy_types( + old_pk, new_index, old_pk->n_fields); + + /* Copy the PRIMARY KEY fields from mrec to old_pk. */ + for (ulint i = 0; i < new_index->n_uniq; i++) { + const void* field; + ulint len; + dfield_t* dfield; + + ut_ad(!rec_offs_nth_extern(offsets, i)); + + field = rec_get_nth_field( + mrec, offsets, i, &len); + ut_ad(len != UNIV_SQL_NULL); + + dfield = dtuple_get_nth_field(old_pk, i); + dfield_set_data(dfield, field, len); + } + } else { + /* We assume extra_size < 0x100 + for the PRIMARY KEY prefix. */ + mrec += *mrec + 1; + + if (mrec > mrec_end) { + return(NULL); + } + + /* Get offsets for PRIMARY KEY, + DB_TRX_ID, DB_ROLL_PTR. */ + rec_offs_set_n_fields(offsets, new_index->n_uniq + 2); + rec_init_offsets_temp(mrec, new_index, offsets); + + next_mrec = mrec + rec_offs_data_size(offsets); + if (next_mrec + 2 > mrec_end) { + return(NULL); + } + + /* Copy the PRIMARY KEY fields and + DB_TRX_ID, DB_ROLL_PTR from mrec to old_pk. */ + old_pk = dtuple_create(heap, new_index->n_uniq + 2); + dict_index_copy_types(old_pk, new_index, + old_pk->n_fields); + + for (ulint i = 0; + i < dict_index_get_n_unique(new_index) + 2; + i++) { + const void* field; + ulint len; + dfield_t* dfield; + + ut_ad(!rec_offs_nth_extern(offsets, i)); + + field = rec_get_nth_field( + mrec, offsets, i, &len); + ut_ad(len != UNIV_SQL_NULL); + + dfield = dtuple_get_nth_field(old_pk, i); + dfield_set_data(dfield, field, len); + } + + mrec = next_mrec; + + /* Fetch the new value of the row as it was + in the old table definition. 
*/ + extra_size = *mrec++; + + if (extra_size >= 0x80) { + /* Read another byte of extra_size. */ + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *mrec++; + } + + mrec += extra_size; + + if (mrec > mrec_end) { + return(NULL); + } + + rec_offs_set_n_fields(offsets, dup->index->n_fields); + rec_init_offsets_temp(mrec, dup->index, offsets); + + next_mrec = mrec + rec_offs_data_size(offsets); + + if (next_mrec > mrec_end) { + return(NULL); + } + } + + ut_ad(next_mrec <= mrec_end); + dtuple_set_n_fields_cmp(old_pk, new_index->n_uniq); + + { + ulint len; + const byte* db_trx_id + = rec_get_nth_field( + mrec, offsets, trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + *error = row_log_table_apply_update( + thr, trx_id_col, new_trx_id_col, + mrec, offsets, offsets_heap, + heap, dup, trx_read_trx_id(db_trx_id), old_pk); + } + + break; + } + + mem_heap_empty(offsets_heap); + mem_heap_empty(heap); + return(next_mrec); +} + +/******************************************************//** +Applies operations to a table was rebuilt. 
+@return DB_SUCCESS, or error code on failure */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_log_table_apply_ops( +/*====================*/ + que_thr_t* thr, /*!< in: query graph */ + row_merge_dup_t*dup) /*!< in/out: for reporting duplicate key + errors */ +{ + dberr_t error; + const mrec_t* mrec = NULL; + const mrec_t* next_mrec; + const mrec_t* mrec_end = NULL; /* silence bogus warning */ + const mrec_t* next_mrec_end; + mem_heap_t* heap; + mem_heap_t* offsets_heap; + ulint* offsets; + bool has_index_lock; + dict_index_t* index = const_cast<dict_index_t*>( + dup->index); + dict_table_t* new_table = index->online_log->table; + dict_index_t* new_index = dict_table_get_first_index( + new_table); + const ulint i = 1 + REC_OFFS_HEADER_SIZE + + ut_max(dict_index_get_n_fields(index), + dict_index_get_n_unique(new_index) + 2); + const ulint trx_id_col = dict_col_get_clust_pos( + dict_table_get_sys_col(index->table, DATA_TRX_ID), index); + const ulint new_trx_id_col = dict_col_get_clust_pos( + dict_table_get_sys_col(new_table, DATA_TRX_ID), new_index); + trx_t* trx = thr_get_trx(thr); + + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_online_ddl(index)); + ut_ad(trx->mysql_thd); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(!dict_index_is_online_ddl(new_index)); + ut_ad(trx_id_col > 0); + ut_ad(trx_id_col != ULINT_UNDEFINED); + ut_ad(new_trx_id_col > 0); + ut_ad(new_trx_id_col != ULINT_UNDEFINED); + + UNIV_MEM_INVALID(&mrec_end, sizeof mrec_end); + + offsets = static_cast<ulint*>(ut_malloc(i * sizeof *offsets)); + offsets[0] = i; + offsets[1] = dict_index_get_n_fields(index); + + heap = mem_heap_create(UNIV_PAGE_SIZE); + offsets_heap = mem_heap_create(UNIV_PAGE_SIZE); + has_index_lock = true; + +next_block: + ut_ad(has_index_lock); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + 
ut_ad(index->online_log->head.bytes == 0); + + if (trx_is_interrupted(trx)) { + goto interrupted; + } + + if (dict_index_is_corrupted(index)) { + error = DB_INDEX_CORRUPT; + goto func_exit; + } + + ut_ad(dict_index_is_online_ddl(index)); + + error = index->online_log->error; + + if (error != DB_SUCCESS) { + goto func_exit; + } + + if (UNIV_UNLIKELY(index->online_log->head.blocks + > index->online_log->tail.blocks)) { +unexpected_eof: + fprintf(stderr, "InnoDB: unexpected end of temporary file" + " for table %s\n", index->table_name); +corruption: + error = DB_CORRUPTION; + goto func_exit; + } + + if (index->online_log->head.blocks + == index->online_log->tail.blocks) { + if (index->online_log->head.blocks) { +#ifdef HAVE_FTRUNCATE + /* Truncate the file in order to save space. */ + ftruncate(index->online_log->fd, 0); +#endif /* HAVE_FTRUNCATE */ + index->online_log->head.blocks + = index->online_log->tail.blocks = 0; + } + + next_mrec = index->online_log->tail.block; + next_mrec_end = next_mrec + index->online_log->tail.bytes; + + if (next_mrec_end == next_mrec) { + /* End of log reached. 
*/ +all_done: + ut_ad(has_index_lock); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->tail.blocks == 0); + index->online_log->head.bytes = 0; + index->online_log->tail.bytes = 0; + error = DB_SUCCESS; + goto func_exit; + } + } else { + os_offset_t ofs; + ibool success; + + ofs = (os_offset_t) index->online_log->head.blocks + * srv_sort_buf_size; + + ut_ad(has_index_lock); + has_index_lock = false; + rw_lock_x_unlock(dict_index_get_lock(index)); + + log_free_check(); + + ut_ad(dict_index_is_online_ddl(index)); + + success = os_file_read_no_error_handling( + OS_FILE_FROM_FD(index->online_log->fd), + index->online_log->head.block, ofs, + srv_sort_buf_size); + + if (!success) { + fprintf(stderr, "InnoDB: unable to read temporary file" + " for table %s\n", index->table_name); + goto corruption; + } + +#ifdef POSIX_FADV_DONTNEED + /* Each block is read exactly once. Free up the file cache. */ + posix_fadvise(index->online_log->fd, + ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED); +#endif /* POSIX_FADV_DONTNEED */ +#ifdef FALLOC_FL_PUNCH_HOLE + /* Try to deallocate the space for the file on disk. + This should work on ext4 on Linux 2.6.39 and later, + and be ignored when the operation is unsupported. */ + fallocate(index->online_log->fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + ofs, srv_buf_size); +#endif /* FALLOC_FL_PUNCH_HOLE */ + + next_mrec = index->online_log->head.block; + next_mrec_end = next_mrec + srv_sort_buf_size; + } + + /* This read is not protected by index->online_log->mutex for + performance reasons. We will eventually notice any error that + was flagged by a DML thread. */ + error = index->online_log->error; + + if (error != DB_SUCCESS) { + goto func_exit; + } + + if (mrec) { + /* A partial record was read from the previous block. + Copy the temporary buffer full, as we do not know the + length of the record. Parse subsequent records from + the bigger buffer index->online_log->head.block + or index->online_log->tail.block. 
*/ + + ut_ad(mrec == index->online_log->head.buf); + ut_ad(mrec_end > mrec); + ut_ad(mrec_end < (&index->online_log->head.buf)[1]); + + memcpy((mrec_t*) mrec_end, next_mrec, + (&index->online_log->head.buf)[1] - mrec_end); + mrec = row_log_table_apply_op( + thr, trx_id_col, new_trx_id_col, + dup, &error, offsets_heap, heap, + index->online_log->head.buf, + (&index->online_log->head.buf)[1], offsets); + if (error != DB_SUCCESS) { + goto func_exit; + } else if (UNIV_UNLIKELY(mrec == NULL)) { + /* The record was not reassembled properly. */ + goto corruption; + } + /* The record was previously found out to be + truncated. Now that the parse buffer was extended, + it should proceed beyond the old end of the buffer. */ + ut_a(mrec > mrec_end); + + index->online_log->head.bytes = mrec - mrec_end; + next_mrec += index->online_log->head.bytes; + } + + ut_ad(next_mrec <= next_mrec_end); + /* The following loop must not be parsing the temporary + buffer, but head.block or tail.block. */ + + /* mrec!=NULL means that the next record starts from the + middle of the block */ + ut_ad((mrec == NULL) == (index->online_log->head.bytes == 0)); + +#ifdef UNIV_DEBUG + if (next_mrec_end == index->online_log->head.block + + srv_sort_buf_size) { + /* If tail.bytes == 0, next_mrec_end can also be at + the end of tail.block. 
*/ + if (index->online_log->tail.bytes == 0) { + ut_ad(next_mrec == next_mrec_end); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->head.bytes == 0); + } else { + ut_ad(next_mrec == index->online_log->head.block + + index->online_log->head.bytes); + ut_ad(index->online_log->tail.blocks + > index->online_log->head.blocks); + } + } else if (next_mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes) { + ut_ad(next_mrec == index->online_log->tail.block + + index->online_log->head.bytes); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->head.bytes + <= index->online_log->tail.bytes); + } else { + ut_error; + } +#endif /* UNIV_DEBUG */ + + mrec_end = next_mrec_end; + + while (!trx_is_interrupted(trx)) { + mrec = next_mrec; + ut_ad(mrec < mrec_end); + + if (!has_index_lock) { + /* We are applying operations from a different + block than the one that is being written to. + We do not hold index->lock in order to + allow other threads to concurrently buffer + modifications. */ + ut_ad(mrec >= index->online_log->head.block); + ut_ad(mrec_end == index->online_log->head.block + + srv_sort_buf_size); + ut_ad(index->online_log->head.bytes + < srv_sort_buf_size); + + /* Take the opportunity to do a redo log + checkpoint if needed. */ + log_free_check(); + } else { + /* We are applying operations from the last block. + Do not allow other threads to buffer anything, + so that we can finally catch up and synchronize. */ + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes); + ut_ad(mrec >= index->online_log->tail.block); + } + + /* This read is not protected by index->online_log->mutex + for performance reasons. We will eventually notice any + error that was flagged by a DML thread. 
*/ + error = index->online_log->error; + + if (error != DB_SUCCESS) { + goto func_exit; + } + + next_mrec = row_log_table_apply_op( + thr, trx_id_col, new_trx_id_col, + dup, &error, offsets_heap, heap, + mrec, mrec_end, offsets); + + if (error != DB_SUCCESS) { + goto func_exit; + } else if (next_mrec == next_mrec_end) { + /* The record happened to end on a block boundary. + Do we have more blocks left? */ + if (has_index_lock) { + /* The index will be locked while + applying the last block. */ + goto all_done; + } + + mrec = NULL; +process_next_block: + rw_lock_x_lock(dict_index_get_lock(index)); + has_index_lock = true; + + index->online_log->head.bytes = 0; + index->online_log->head.blocks++; + goto next_block; + } else if (next_mrec != NULL) { + ut_ad(next_mrec < next_mrec_end); + index->online_log->head.bytes += next_mrec - mrec; + } else if (has_index_lock) { + /* When mrec is within tail.block, it should + be a complete record, because we are holding + index->lock and thus excluding the writer. */ + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes); + ut_ad(0); + goto unexpected_eof; + } else { + memcpy(index->online_log->head.buf, mrec, + mrec_end - mrec); + mrec_end += index->online_log->head.buf - mrec; + mrec = index->online_log->head.buf; + goto process_next_block; + } + } + +interrupted: + error = DB_INTERRUPTED; +func_exit: + if (!has_index_lock) { + rw_lock_x_lock(dict_index_get_lock(index)); + } + + mem_heap_free(offsets_heap); + mem_heap_free(heap); + ut_free(offsets); + return(error); +} + +/******************************************************//** +Apply the row_log_table log to a table upon completing rebuild. 
+@return DB_SUCCESS, or error code on failure */ +UNIV_INTERN +dberr_t +row_log_table_apply( +/*================*/ + que_thr_t* thr, /*!< in: query graph */ + dict_table_t* old_table, + /*!< in: old table */ + struct TABLE* table) /*!< in/out: MySQL table + (for reporting duplicates) */ +{ + dberr_t error; + dict_index_t* clust_index; + + thr_get_trx(thr)->error_key_num = 0; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + clust_index = dict_table_get_first_index(old_table); + + rw_lock_x_lock(dict_index_get_lock(clust_index)); + + if (!clust_index->online_log) { + ut_ad(dict_index_get_online_status(clust_index) + == ONLINE_INDEX_COMPLETE); + /* This function should not be called unless + rebuilding a table online. Build in some fault + tolerance. */ + ut_ad(0); + error = DB_ERROR; + } else { + row_merge_dup_t dup = { + clust_index, table, + clust_index->online_log->col_map, 0 + }; + + error = row_log_table_apply_ops(thr, &dup); + } + + rw_lock_x_unlock(dict_index_get_lock(clust_index)); + return(error); +} + +/******************************************************//** +Allocate the row log for an index and flag the index +for online creation. 
+@retval true if success, false if not */ +UNIV_INTERN +bool +row_log_allocate( +/*=============*/ + dict_index_t* index, /*!< in/out: index */ + dict_table_t* table, /*!< in/out: new table being rebuilt, + or NULL when creating a secondary index */ + bool same_pk,/*!< in: whether the definition of the + PRIMARY KEY has remained the same */ + const dtuple_t* add_cols, + /*!< in: default values of + added columns, or NULL */ + const ulint* col_map)/*!< in: mapping of old column + numbers to new ones, or NULL if !table */ +{ + byte* buf; + row_log_t* log; + ulint size; + + ut_ad(!dict_index_is_online_ddl(index)); + ut_ad(dict_index_is_clust(index) == !!table); + ut_ad(!table || index->table != table); + ut_ad(same_pk || table); + ut_ad(!table || col_map); + ut_ad(!add_cols || col_map); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + size = 2 * srv_sort_buf_size + sizeof *log; + buf = (byte*) os_mem_alloc_large(&size); + if (!buf) { + return(false); + } + + log = (row_log_t*) &buf[2 * srv_sort_buf_size]; + log->size = size; + log->fd = row_merge_file_create_low(); + if (log->fd < 0) { + os_mem_free_large(buf, size); + return(false); + } + mutex_create(index_online_log_key, &log->mutex, + SYNC_INDEX_ONLINE_LOG); + log->trx_rb = NULL; + log->table = table; + log->same_pk = same_pk; + log->add_cols = add_cols; + log->col_map = col_map; + log->error = DB_SUCCESS; + log->max_trx = 0; + log->head.block = buf; + log->tail.block = buf + srv_sort_buf_size; + log->tail.blocks = log->tail.bytes = 0; + log->head.blocks = log->head.bytes = 0; + dict_index_set_online_status(index, ONLINE_INDEX_CREATION); + index->online_log = log; + + /* While we might be holding an exclusive data dictionary lock + here, in row_log_abort_sec() we will not always be holding it. Use + atomic operations in both cases. 
*/ + MONITOR_ATOMIC_INC(MONITOR_ONLINE_CREATE_INDEX); + + return(true); +} + +/******************************************************//** +Free the row log for an index that was being created online. */ +UNIV_INTERN +void +row_log_free( +/*=========*/ + row_log_t*& log) /*!< in,own: row log */ +{ + MONITOR_ATOMIC_DEC(MONITOR_ONLINE_CREATE_INDEX); + + delete log->trx_rb; + row_merge_file_destroy_low(log->fd); + mutex_free(&log->mutex); + os_mem_free_large(log->head.block, log->size); + log = 0; +} + +/******************************************************//** +Get the latest transaction ID that has invoked row_log_online_op() +during online creation. +@return latest transaction ID, or 0 if nothing was logged */ +UNIV_INTERN +trx_id_t +row_log_get_max_trx( +/*================*/ + dict_index_t* index) /*!< in: index, must be locked */ +{ + ut_ad(dict_index_get_online_status(index) == ONLINE_INDEX_CREATION); +#ifdef UNIV_SYNC_DEBUG + ut_ad((rw_lock_own(dict_index_get_lock(index), RW_LOCK_SHARED) + && mutex_own(&index->online_log->mutex)) + || rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + return(index->online_log->max_trx); +} + +/******************************************************//** +Applies an operation to a secondary index that was being created. 
*/ +static __attribute__((nonnull)) +void +row_log_apply_op_low( +/*=================*/ + dict_index_t* index, /*!< in/out: index */ + row_merge_dup_t*dup, /*!< in/out: for reporting + duplicate key errors */ + dberr_t* error, /*!< out: DB_SUCCESS or error code */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap for + allocating offsets; can be emptied */ + bool has_index_lock, /*!< in: true if holding index->lock + in exclusive mode */ + enum row_op op, /*!< in: operation being applied */ + trx_id_t trx_id, /*!< in: transaction identifier */ + const dtuple_t* entry) /*!< in: row */ +{ + mtr_t mtr; + btr_cur_t cursor; + ulint* offsets = NULL; + + ut_ad(!dict_index_is_clust(index)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX) + == has_index_lock); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(!dict_index_is_corrupted(index)); + ut_ad(trx_id != 0 || op == ROW_OP_DELETE); + + mtr_start(&mtr); + + /* We perform the pessimistic variant of the operations if we + already hold index->lock exclusively. First, search the + record. The operation may already have been performed, + depending on when the row in the clustered index was + scanned. */ + btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, + has_index_lock + ? BTR_MODIFY_TREE + : BTR_MODIFY_LEAF, + &cursor, 0, __FILE__, __LINE__, + &mtr); + + ut_ad(dict_index_get_n_unique(index) > 0); + /* This test is somewhat similar to row_ins_must_modify_rec(), + but not identical for unique secondary indexes. */ + if (cursor.low_match >= dict_index_get_n_unique(index) + && !page_rec_is_infimum(btr_cur_get_rec(&cursor))) { + /* We have a matching record. 
*/ + bool exists = (cursor.low_match + == dict_index_get_n_fields(index)); +#ifdef UNIV_DEBUG + rec_t* rec = btr_cur_get_rec(&cursor); + ut_ad(page_rec_is_user_rec(rec)); + ut_ad(!rec_get_deleted_flag(rec, page_rec_is_comp(rec))); +#endif /* UNIV_DEBUG */ + + ut_ad(exists || dict_index_is_unique(index)); + + switch (op) { + case ROW_OP_DELETE: + if (!exists) { + /* The record was already deleted. */ + goto func_exit; + } + + if (btr_cur_optimistic_delete( + &cursor, BTR_CREATE_FLAG, &mtr)) { + *error = DB_SUCCESS; + break; + } + + if (!has_index_lock) { + /* This needs a pessimistic operation. + Lock the index tree exclusively. */ + mtr_commit(&mtr); + mtr_start(&mtr); + btr_cur_search_to_nth_level( + index, 0, entry, PAGE_CUR_LE, + BTR_MODIFY_TREE, &cursor, 0, + __FILE__, __LINE__, &mtr); + + /* No other thread than the current one + is allowed to modify the index tree. + Thus, the record should still exist. */ + ut_ad(cursor.low_match + >= dict_index_get_n_fields(index)); + ut_ad(page_rec_is_user_rec( + btr_cur_get_rec(&cursor))); + } + + /* As there are no externally stored fields in + a secondary index record, the parameter + rb_ctx = RB_NONE will be ignored. */ + + btr_cur_pessimistic_delete( + error, FALSE, &cursor, + BTR_CREATE_FLAG, RB_NONE, &mtr); + break; + case ROW_OP_INSERT: + if (exists) { + /* The record already exists. There + is nothing to be inserted. */ + goto func_exit; + } + + if (dtuple_contains_null(entry)) { + /* The UNIQUE KEY columns match, but + there is a NULL value in the key, and + NULL!=NULL. */ + goto insert_the_rec; + } + + /* Duplicate key error */ + ut_ad(dict_index_is_unique(index)); + row_merge_dup_report(dup, entry->fields); + goto func_exit; + } + } else { + switch (op) { + rec_t* rec; + big_rec_t* big_rec; + case ROW_OP_DELETE: + /* The record does not exist. 
*/ + goto func_exit; + case ROW_OP_INSERT: + if (dict_index_is_unique(index) + && (cursor.up_match + >= dict_index_get_n_unique(index) + || cursor.low_match + >= dict_index_get_n_unique(index)) + && (!index->n_nullable + || !dtuple_contains_null(entry))) { + /* Duplicate key */ + row_merge_dup_report(dup, entry->fields); + goto func_exit; + } +insert_the_rec: + /* Insert the record. As we are inserting into + a secondary index, there cannot be externally + stored columns (!big_rec). */ + *error = btr_cur_optimistic_insert( + BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG, + &cursor, &offsets, &offsets_heap, + const_cast<dtuple_t*>(entry), + &rec, &big_rec, 0, NULL, &mtr); + ut_ad(!big_rec); + if (*error != DB_FAIL) { + break; + } + + if (!has_index_lock) { + /* This needs a pessimistic operation. + Lock the index tree exclusively. */ + mtr_commit(&mtr); + mtr_start(&mtr); + btr_cur_search_to_nth_level( + index, 0, entry, PAGE_CUR_LE, + BTR_MODIFY_TREE, &cursor, 0, + __FILE__, __LINE__, &mtr); + } + + /* We already determined that the + record did not exist. No other thread + than the current one is allowed to + modify the index tree. Thus, the + record should still not exist. */ + + *error = btr_cur_pessimistic_insert( + BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG, + &cursor, &offsets, &offsets_heap, + const_cast<dtuple_t*>(entry), + &rec, &big_rec, + 0, NULL, &mtr); + ut_ad(!big_rec); + break; + } + mem_heap_empty(offsets_heap); + } + + if (*error == DB_SUCCESS && trx_id) { + page_update_max_trx_id(btr_cur_get_block(&cursor), + btr_cur_get_page_zip(&cursor), + trx_id, &mtr); + } + +func_exit: + mtr_commit(&mtr); +} + +/******************************************************//** +Applies an operation to a secondary index that was being created. 
+@return NULL on failure (mrec corruption) or when out of data; +pointer to next record on success */ +static __attribute__((nonnull, warn_unused_result)) +const mrec_t* +row_log_apply_op( +/*=============*/ + dict_index_t* index, /*!< in/out: index */ + row_merge_dup_t*dup, /*!< in/out: for reporting + duplicate key errors */ + dberr_t* error, /*!< out: DB_SUCCESS or error code */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap for + allocating offsets; can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap for + allocating data tuples */ + bool has_index_lock, /*!< in: true if holding index->lock + in exclusive mode */ + const mrec_t* mrec, /*!< in: merge record */ + const mrec_t* mrec_end, /*!< in: end of buffer */ + ulint* offsets) /*!< in/out: work area for + rec_init_offsets_temp() */ + +{ + enum row_op op; + ulint extra_size; + ulint data_size; + ulint n_ext; + dtuple_t* entry; + trx_id_t trx_id; + + /* Online index creation is only used for secondary indexes. */ + ut_ad(!dict_index_is_clust(index)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX) + == has_index_lock); +#endif /* UNIV_SYNC_DEBUG */ + + if (dict_index_is_corrupted(index)) { + *error = DB_INDEX_CORRUPT; + return(NULL); + } + + *error = DB_SUCCESS; + + if (mrec + ROW_LOG_HEADER_SIZE >= mrec_end) { + return(NULL); + } + + switch (*mrec) { + case ROW_OP_INSERT: + if (ROW_LOG_HEADER_SIZE + DATA_TRX_ID_LEN + mrec >= mrec_end) { + return(NULL); + } + + op = static_cast<enum row_op>(*mrec++); + trx_id = trx_read_trx_id(mrec); + mrec += DATA_TRX_ID_LEN; + break; + case ROW_OP_DELETE: + op = static_cast<enum row_op>(*mrec++); + trx_id = 0; + break; + default: +corrupted: + ut_ad(0); + *error = DB_CORRUPTION; + return(NULL); + } + + extra_size = *mrec++; + + ut_ad(mrec < mrec_end); + + if (extra_size >= 0x80) { + /* Read another byte of extra_size. 
*/ + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *mrec++; + } + + mrec += extra_size; + + if (mrec > mrec_end) { + return(NULL); + } + + rec_init_offsets_temp(mrec, index, offsets); + + if (rec_offs_any_extern(offsets)) { + /* There should never be any externally stored fields + in a secondary index, which is what online index + creation is used for. Therefore, the log file must be + corrupted. */ + goto corrupted; + } + + data_size = rec_offs_data_size(offsets); + + mrec += data_size; + + if (mrec > mrec_end) { + return(NULL); + } + + entry = row_rec_to_index_entry_low( + mrec - data_size, index, offsets, &n_ext, heap); + /* Online index creation is only implemented for secondary + indexes, which never contain off-page columns. */ + ut_ad(n_ext == 0); +#ifdef ROW_LOG_APPLY_PRINT + if (row_log_apply_print) { + fprintf(stderr, "apply " IB_ID_FMT " " TRX_ID_FMT " %u %u ", + index->id, trx_id, + unsigned (op), unsigned (has_index_lock)); + for (const byte* m = mrec - data_size; m < mrec; m++) { + fprintf(stderr, "%02x", *m); + } + putc('\n', stderr); + } +#endif /* ROW_LOG_APPLY_PRINT */ + row_log_apply_op_low(index, dup, error, offsets_heap, + has_index_lock, op, trx_id, entry); + return(mrec); +} + +/******************************************************//** +Applies operations to a secondary index that was being created. 
+@return DB_SUCCESS, or error code on failure */ +static __attribute__((nonnull)) +dberr_t +row_log_apply_ops( +/*==============*/ + trx_t* trx, /*!< in: transaction (for checking if + the operation was interrupted) */ + dict_index_t* index, /*!< in/out: index */ + row_merge_dup_t*dup) /*!< in/out: for reporting duplicate key + errors */ +{ + dberr_t error; + const mrec_t* mrec = NULL; + const mrec_t* next_mrec; + const mrec_t* mrec_end= NULL; /* silence bogus warning */ + const mrec_t* next_mrec_end; + mem_heap_t* offsets_heap; + mem_heap_t* heap; + ulint* offsets; + bool has_index_lock; + const ulint i = 1 + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index); + + ut_ad(dict_index_is_online_ddl(index)); + ut_ad(*index->name == TEMP_INDEX_PREFIX); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(index->online_log); + UNIV_MEM_INVALID(&mrec_end, sizeof mrec_end); + + offsets = static_cast<ulint*>(ut_malloc(i * sizeof *offsets)); + offsets[0] = i; + offsets[1] = dict_index_get_n_fields(index); + + offsets_heap = mem_heap_create(UNIV_PAGE_SIZE); + heap = mem_heap_create(UNIV_PAGE_SIZE); + has_index_lock = true; + +next_block: + ut_ad(has_index_lock); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(index->online_log->head.bytes == 0); + + if (trx_is_interrupted(trx)) { + goto interrupted; + } + + if (dict_index_is_corrupted(index)) { + error = DB_INDEX_CORRUPT; + goto func_exit; + } + + if (UNIV_UNLIKELY(index->online_log->head.blocks + > index->online_log->tail.blocks)) { +unexpected_eof: + fprintf(stderr, "InnoDB: unexpected end of temporary file" + " for index %s\n", index->name + 1); +corruption: + error = DB_CORRUPTION; + goto func_exit; + } + + if (index->online_log->head.blocks + == index->online_log->tail.blocks) { + if (index->online_log->head.blocks) { +#ifdef HAVE_FTRUNCATE + /* Truncate the file in order 
to save space. */ + ftruncate(index->online_log->fd, 0); +#endif /* HAVE_FTRUNCATE */ + index->online_log->head.blocks + = index->online_log->tail.blocks = 0; + } + + next_mrec = index->online_log->tail.block; + next_mrec_end = next_mrec + index->online_log->tail.bytes; + + if (next_mrec_end == next_mrec) { + /* End of log reached. */ +all_done: + ut_ad(has_index_lock); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->tail.blocks == 0); + error = DB_SUCCESS; + goto func_exit; + } + } else { + os_offset_t ofs; + ibool success; + + ofs = (os_offset_t) index->online_log->head.blocks + * srv_sort_buf_size; + + ut_ad(has_index_lock); + has_index_lock = false; + rw_lock_x_unlock(dict_index_get_lock(index)); + + log_free_check(); + + success = os_file_read_no_error_handling( + OS_FILE_FROM_FD(index->online_log->fd), + index->online_log->head.block, ofs, + srv_sort_buf_size); + + if (!success) { + fprintf(stderr, "InnoDB: unable to read temporary file" + " for index %s\n", index->name + 1); + goto corruption; + } + +#ifdef POSIX_FADV_DONTNEED + /* Each block is read exactly once. Free up the file cache. */ + posix_fadvise(index->online_log->fd, + ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED); +#endif /* POSIX_FADV_DONTNEED */ +#ifdef FALLOC_FL_PUNCH_HOLE + /* Try to deallocate the space for the file on disk. + This should work on ext4 on Linux 2.6.39 and later, + and be ignored when the operation is unsupported. */ + fallocate(index->online_log->fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + ofs, srv_sort_buf_size); +#endif /* FALLOC_FL_PUNCH_HOLE */ + + next_mrec = index->online_log->head.block; + next_mrec_end = next_mrec + srv_sort_buf_size; + } + + if (mrec) { + /* A partial record was read from the previous block. + Copy the temporary buffer full, as we do not know the + length of the record. Parse subsequent records from + the bigger buffer index->online_log->head.block + or index->online_log->tail.block. 
*/ + + ut_ad(mrec == index->online_log->head.buf); + ut_ad(mrec_end > mrec); + ut_ad(mrec_end < (&index->online_log->head.buf)[1]); + + memcpy((mrec_t*) mrec_end, next_mrec, + (&index->online_log->head.buf)[1] - mrec_end); + mrec = row_log_apply_op( + index, dup, &error, offsets_heap, heap, + has_index_lock, index->online_log->head.buf, + (&index->online_log->head.buf)[1], offsets); + if (error != DB_SUCCESS) { + goto func_exit; + } else if (UNIV_UNLIKELY(mrec == NULL)) { + /* The record was not reassembled properly. */ + goto corruption; + } + /* The record was previously found out to be + truncated. Now that the parse buffer was extended, + it should proceed beyond the old end of the buffer. */ + ut_a(mrec > mrec_end); + + index->online_log->head.bytes = mrec - mrec_end; + next_mrec += index->online_log->head.bytes; + } + + ut_ad(next_mrec <= next_mrec_end); + /* The following loop must not be parsing the temporary + buffer, but head.block or tail.block. */ + + /* mrec!=NULL means that the next record starts from the + middle of the block */ + ut_ad((mrec == NULL) == (index->online_log->head.bytes == 0)); + +#ifdef UNIV_DEBUG + if (next_mrec_end == index->online_log->head.block + + srv_sort_buf_size) { + /* If tail.bytes == 0, next_mrec_end can also be at + the end of tail.block. 
*/ + if (index->online_log->tail.bytes == 0) { + ut_ad(next_mrec == next_mrec_end); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->head.bytes == 0); + } else { + ut_ad(next_mrec == index->online_log->head.block + + index->online_log->head.bytes); + ut_ad(index->online_log->tail.blocks + > index->online_log->head.blocks); + } + } else if (next_mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes) { + ut_ad(next_mrec == index->online_log->tail.block + + index->online_log->head.bytes); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->head.bytes + <= index->online_log->tail.bytes); + } else { + ut_error; + } +#endif /* UNIV_DEBUG */ + + mrec_end = next_mrec_end; + + while (!trx_is_interrupted(trx)) { + mrec = next_mrec; + ut_ad(mrec < mrec_end); + + if (!has_index_lock) { + /* We are applying operations from a different + block than the one that is being written to. + We do not hold index->lock in order to + allow other threads to concurrently buffer + modifications. */ + ut_ad(mrec >= index->online_log->head.block); + ut_ad(mrec_end == index->online_log->head.block + + srv_sort_buf_size); + ut_ad(index->online_log->head.bytes + < srv_sort_buf_size); + + /* Take the opportunity to do a redo log + checkpoint if needed. */ + log_free_check(); + } else { + /* We are applying operations from the last block. + Do not allow other threads to buffer anything, + so that we can finally catch up and synchronize. 
*/ + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes); + ut_ad(mrec >= index->online_log->tail.block); + } + + next_mrec = row_log_apply_op( + index, dup, &error, offsets_heap, heap, + has_index_lock, mrec, mrec_end, offsets); + + if (error != DB_SUCCESS) { + goto func_exit; + } else if (next_mrec == next_mrec_end) { + /* The record happened to end on a block boundary. + Do we have more blocks left? */ + if (has_index_lock) { + /* The index will be locked while + applying the last block. */ + goto all_done; + } + + mrec = NULL; +process_next_block: + rw_lock_x_lock(dict_index_get_lock(index)); + has_index_lock = true; + + index->online_log->head.bytes = 0; + index->online_log->head.blocks++; + goto next_block; + } else if (next_mrec != NULL) { + ut_ad(next_mrec < next_mrec_end); + index->online_log->head.bytes += next_mrec - mrec; + } else if (has_index_lock) { + /* When mrec is within tail.block, it should + be a complete record, because we are holding + index->lock and thus excluding the writer. */ + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes); + ut_ad(0); + goto unexpected_eof; + } else { + memcpy(index->online_log->head.buf, mrec, + mrec_end - mrec); + mrec_end += index->online_log->head.buf - mrec; + mrec = index->online_log->head.buf; + goto process_next_block; + } + } + +interrupted: + error = DB_INTERRUPTED; +func_exit: + if (!has_index_lock) { + rw_lock_x_lock(dict_index_get_lock(index)); + } + + switch (error) { + case DB_SUCCESS: + break; + case DB_INDEX_CORRUPT: + if (((os_offset_t) index->online_log->tail.blocks + 1) + * srv_sort_buf_size >= srv_online_max_size) { + /* The log file grew too big. 
*/ + error = DB_ONLINE_LOG_TOO_BIG; + } + /* fall through */ + default: + /* We set the flag directly instead of invoking + dict_set_corrupted_index_cache_only(index) here, + because the index is not "public" yet. */ + index->type |= DICT_CORRUPT; + } + + mem_heap_free(heap); + mem_heap_free(offsets_heap); + ut_free(offsets); + return(error); +} + +/******************************************************//** +Apply the row log to the index upon completing index creation. +@return DB_SUCCESS, or error code on failure */ +UNIV_INTERN +dberr_t +row_log_apply( +/*==========*/ + trx_t* trx, /*!< in: transaction (for checking if + the operation was interrupted) */ + dict_index_t* index, /*!< in/out: secondary index */ + struct TABLE* table) /*!< in/out: MySQL table + (for reporting duplicates) */ +{ + dberr_t error; + row_log_t* log; + row_merge_dup_t dup = { index, table, NULL, 0 }; + + ut_ad(dict_index_is_online_ddl(index)); + ut_ad(!dict_index_is_clust(index)); + + log_free_check(); + + rw_lock_x_lock(dict_index_get_lock(index)); + + if (!dict_table_is_corrupted(index->table)) { + error = row_log_apply_ops(trx, index, &dup); + } else { + error = DB_SUCCESS; + } + + if (error != DB_SUCCESS || dup.n_dup) { + ut_a(!dict_table_is_discarded(index->table)); + /* We set the flag directly instead of invoking + dict_set_corrupted_index_cache_only(index) here, + because the index is not "public" yet. */ + index->type |= DICT_CORRUPT; + index->table->drop_aborted = TRUE; + + if (error == DB_SUCCESS) { + error = DB_DUPLICATE_KEY; + } + + dict_index_set_online_status(index, ONLINE_INDEX_ABORTED); + } else { + dict_index_set_online_status(index, ONLINE_INDEX_COMPLETE); + } + + log = index->online_log; + index->online_log = NULL; + /* We could remove the TEMP_INDEX_PREFIX and update the data + dictionary to say that this index is complete, if we had + access to the .frm file here. 
If the server crashes before + all requested indexes have been created, this completed index + will be dropped. */ + rw_lock_x_unlock(dict_index_get_lock(index)); + + row_log_free(log); + + return(error); +} diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc index cf662cb1f88..a509e2c5ca8 100644 --- a/storage/innobase/row/row0merge.cc +++ b/storage/innobase/row/row0merge.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2005, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -26,40 +26,18 @@ Completed by Sunny Bains and Marko Makela #include "row0merge.h" #include "row0ext.h" -#include "row0row.h" -#include "row0upd.h" +#include "row0log.h" #include "row0ins.h" #include "row0sel.h" -#include "dict0dict.h" -#include "dict0mem.h" -#include "dict0boot.h" #include "dict0crea.h" -#include "dict0load.h" -#include "btr0btr.h" -#include "mach0data.h" -#include "trx0rseg.h" -#include "trx0trx.h" -#include "trx0roll.h" -#include "trx0undo.h" #include "trx0purge.h" -#include "trx0rec.h" -#include "que0que.h" -#include "rem0cmp.h" -#include "read0read.h" -#include "os0file.h" #include "lock0lock.h" -#include "data0data.h" -#include "data0type.h" -#include "que0que.h" #include "pars0pars.h" -#include "mem0mem.h" -#include "log0log.h" #include "ut0sort.h" -#include "handler0alter.h" -#include "fts0fts.h" -#include "fts0types.h" -#include "fts0priv.h" #include "row0ftsort.h" +#include "row0import.h" +#include "handler0alter.h" +#include "ha_prototypes.h" /* Ignore posix_fadvise() on those platforms where it does not exist */ #if defined __WIN__ @@ -69,8 +47,6 @@ Completed by Sunny Bains and Marko Makela #ifdef UNIV_DEBUG /** Set these in order ot 
enable debug printout. */ /* @{ */ -/** Log the outcome of each row_merge_cmp() call, comparing records. */ -static ibool row_merge_print_cmp; /** Log each record read from temporary file. */ static ibool row_merge_print_read; /** Log each record write to temporary file. */ @@ -86,39 +62,23 @@ static ibool row_merge_print_block_write; #endif /* UNIV_DEBUG */ /* Whether to disable file system cache */ -UNIV_INTERN char srv_disable_sort_file_cache; - -/********************************************************************//** -Read sorted file containing index data tuples and insert these data -tuples to the index -@return DB_SUCCESS or error number */ -static -ulint -row_merge_insert_index_tuples( -/*==========================*/ - trx_t* trx, /*!< in: transaction */ - dict_index_t* index, /*!< in: index */ - dict_table_t* table, /*!< in: new table */ - ulint zip_size,/*!< in: compressed page size of - the old table, or 0 if uncompressed */ - int fd, /*!< in: file descriptor */ - row_merge_block_t* block); /*!< in/out: file buffer */ +UNIV_INTERN char srv_disable_sort_file_cache; #ifdef UNIV_DEBUG /******************************************************//** Display a merge tuple. */ -static +static __attribute__((nonnull)) void row_merge_tuple_print( /*==================*/ FILE* f, /*!< in: output stream */ - const dfield_t* entry, /*!< in: tuple to print */ + const mtuple_t* entry, /*!< in: tuple to print */ ulint n_fields)/*!< in: number of fields in the tuple */ { ulint j; for (j = 0; j < n_fields; j++) { - const dfield_t* field = &entry[j]; + const dfield_t* field = &entry->fields[j]; if (dfield_is_null(field)) { fputs("\n NULL;", f); @@ -141,16 +101,54 @@ row_merge_tuple_print( #endif /* UNIV_DEBUG */ /******************************************************//** +Encode an index record. 
*/ +static __attribute__((nonnull)) +void +row_merge_buf_encode( +/*=================*/ + byte** b, /*!< in/out: pointer to + current end of output buffer */ + const dict_index_t* index, /*!< in: index */ + const mtuple_t* entry, /*!< in: index fields + of the record to encode */ + ulint n_fields) /*!< in: number of fields + in the entry */ +{ + ulint size; + ulint extra_size; + + size = rec_get_converted_size_temp( + index, entry->fields, n_fields, &extra_size); + ut_ad(size >= extra_size); + + /* Encode extra_size + 1 */ + if (extra_size + 1 < 0x80) { + *(*b)++ = (byte) (extra_size + 1); + } else { + ut_ad((extra_size + 1) < 0x8000); + *(*b)++ = (byte) (0x80 | ((extra_size + 1) >> 8)); + *(*b)++ = (byte) (extra_size + 1); + } + + rec_convert_dtuple_to_temp(*b + extra_size, index, + entry->fields, n_fields); + + *b += size; +} + +/******************************************************//** Allocate a sort buffer. @return own: sort buffer */ -static +static __attribute__((malloc, nonnull)) row_merge_buf_t* row_merge_buf_create_low( /*=====================*/ mem_heap_t* heap, /*!< in: heap where allocated */ dict_index_t* index, /*!< in: secondary index */ - ulint max_tuples, /*!< in: maximum number of data tuples */ - ulint buf_size) /*!< in: size of the buffer, in bytes */ + ulint max_tuples, /*!< in: maximum number of + data tuples */ + ulint buf_size) /*!< in: size of the buffer, + in bytes */ { row_merge_buf_t* buf; @@ -162,7 +160,7 @@ row_merge_buf_create_low( buf->heap = heap; buf->index = index; buf->max_tuples = max_tuples; - buf->tuples = static_cast<const dfield_t**>( + buf->tuples = static_cast<mtuple_t*>( ut_malloc(2 * max_tuples * sizeof *buf->tuples)); buf->tmp_tuples = buf->tuples + max_tuples; @@ -204,13 +202,11 @@ row_merge_buf_empty( /*================*/ row_merge_buf_t* buf) /*!< in,own: sort buffer */ { - ulint buf_size; + ulint buf_size = sizeof *buf; ulint max_tuples = buf->max_tuples; mem_heap_t* heap = buf->heap; dict_index_t* index = 
buf->index; - void* tuple = buf->tuples; - - buf_size = (sizeof *buf);; + mtuple_t* tuples = buf->tuples; mem_heap_empty(heap); @@ -218,7 +214,7 @@ row_merge_buf_empty( buf->heap = heap; buf->index = index; buf->max_tuples = max_tuples; - buf->tuples = static_cast<const dfield_t**>(tuple); + buf->tuples = tuples; buf->tmp_tuples = buf->tuples + max_tuples; return(buf); @@ -230,7 +226,7 @@ UNIV_INTERN void row_merge_buf_free( /*===============*/ - row_merge_buf_t* buf) /*!< in,own: sort buffer, to be freed */ + row_merge_buf_t* buf) /*!< in,own: sort buffer to be freed */ { ut_free(buf->tuples); mem_heap_free(buf->heap); @@ -244,19 +240,18 @@ ulint row_merge_buf_add( /*==============*/ row_merge_buf_t* buf, /*!< in/out: sort buffer */ - dict_index_t* fts_index,/*!< fts index to be - created */ + dict_index_t* fts_index,/*!< in: fts index to be created */ + const dict_table_t* old_table,/*!< in: original table */ fts_psort_t* psort_info, /*!< in: parallel sort info */ - const dtuple_t* row, /*!< in: row in clustered index */ + const dtuple_t* row, /*!< in: table row */ const row_ext_t* ext, /*!< in: cache of externally stored column prefixes, or NULL */ doc_id_t* doc_id) /*!< in/out: Doc ID if we are creating FTS index */ - { ulint i; const dict_index_t* index; - dfield_t* entry; + mtuple_t* entry; dfield_t* field; const dict_field_t* ifield; ulint n_fields; @@ -267,9 +262,13 @@ row_merge_buf_add( ulint n_row_added = 0; if (buf->n_tuples >= buf->max_tuples) { - return(FALSE); + return(0); } + DBUG_EXECUTE_IF( + "ib_row_merge_buf_add_two", + if (buf->n_tuples >= 2) return(0);); + UNIV_PREFETCH_R(row->fields); /* If we are building FTS index, buf->index points to @@ -279,11 +278,9 @@ row_merge_buf_add( n_fields = dict_index_get_n_fields(index); - entry = static_cast<dfield_t*>( - mem_heap_alloc(buf->heap, n_fields * sizeof *entry)); - - buf->tuples[buf->n_tuples] = entry; - field = entry; + entry = &buf->tuples[buf->n_tuples]; + field = entry->fields = 
static_cast<dfield_t*>( + mem_heap_alloc(buf->heap, n_fields * sizeof *entry->fields)); data_size = 0; extra_size = UT_BITS_IN_BYTES(index->n_nullable); @@ -294,31 +291,15 @@ row_merge_buf_add( ulint len; const dict_col_t* col; ulint col_no; + ulint fixed_len; const dfield_t* row_field; - ibool col_adjusted; col = ifield->col; col_no = dict_col_get_no(col); - col_adjusted = FALSE; - - /* If we are creating a FTS index, a new Doc - ID column is being added, so we need to adjust - any column number positioned after this Doc ID */ - if (*doc_id > 0 - && DICT_TF2_FLAG_IS_SET(index->table, - DICT_TF2_FTS_ADD_DOC_ID) - && col_no > index->table->fts->doc_col) { - - ut_ad(index->table->fts); - - col_no--; - col_adjusted = TRUE; - } /* Process the Doc ID column */ if (*doc_id > 0 - && col_no == index->table->fts->doc_col - && !col_adjusted) { + && col_no == index->table->fts->doc_col) { fts_write_doc_id((byte*) &write_doc_id, *doc_id); /* Note: field->data now points to a value on the @@ -435,9 +416,30 @@ row_merge_buf_add( ut_ad(len <= col->len || col->mtype == DATA_BLOB); - if (ifield->fixed_len) { - ut_ad(len == ifield->fixed_len); + fixed_len = ifield->fixed_len; + if (fixed_len && !dict_table_is_comp(index->table) + && DATA_MBMINLEN(col->mbminmaxlen) + != DATA_MBMAXLEN(col->mbminmaxlen)) { + /* CHAR in ROW_FORMAT=REDUNDANT is always + fixed-length, but in the temporary file it is + variable-length for variable-length character + sets. 
*/ + fixed_len = 0; + } + + if (fixed_len) { +#ifdef UNIV_DEBUG + ulint mbminlen = DATA_MBMINLEN(col->mbminmaxlen); + ulint mbmaxlen = DATA_MBMAXLEN(col->mbminmaxlen); + + /* len should be between size calcualted base on + mbmaxlen and mbminlen */ + ut_ad(len <= fixed_len); + ut_ad(!mbmaxlen || len >= mbminlen + * (fixed_len / mbmaxlen)); + ut_ad(!dfield_is_ext(field)); +#endif /* UNIV_DEBUG */ } else if (dfield_is_ext(field)) { extra_size += 2; } else if (len < 128 @@ -464,12 +466,11 @@ row_merge_buf_add( ulint size; ulint extra; - size = rec_get_converted_size_comp(index, - REC_STATUS_ORDINARY, - entry, n_fields, &extra); + size = rec_get_converted_size_temp( + index, entry->fields, n_fields, &extra); - ut_ad(data_size + extra_size + REC_N_NEW_EXTRA_BYTES == size); - ut_ad(extra_size + REC_N_NEW_EXTRA_BYTES == extra); + ut_ad(data_size + extra_size == size); + ut_ad(extra_size == extra); } #endif /* UNIV_DEBUG */ @@ -479,12 +480,6 @@ row_merge_buf_add( of extra_size. */ data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80); - /* The following assertion may fail if row_merge_block_t is - declared very small and a PRIMARY KEY is being created with - many prefix columns. In that case, the record may exceed the - page_zip_rec_needs_ext() limit. However, no further columns - will be moved to external storage until the record is inserted - to the clustered index B-tree. */ ut_ad(data_size < srv_sort_buf_size); /* Reserve one byte for the end marker of row_merge_block_t. */ @@ -496,7 +491,7 @@ row_merge_buf_add( buf->n_tuples++; n_row_added++; - field = entry; + field = entry->fields; /* Copy the data fields. */ @@ -509,118 +504,120 @@ row_merge_buf_add( /*************************************************************//** Report a duplicate key. 
*/ -static +UNIV_INTERN void row_merge_dup_report( /*=================*/ row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */ const dfield_t* entry) /*!< in: duplicate index entry */ { - mrec_buf_t* buf; - const dtuple_t* tuple; - dtuple_t tuple_store; - const rec_t* rec; - const dict_index_t* index = dup->index; - ulint n_fields= dict_index_get_n_fields(index); - mem_heap_t* heap; - ulint* offsets; - ulint n_ext; - - if (dup->n_dup++) { + if (!dup->n_dup++) { /* Only report the first duplicate record, but count all duplicate records. */ - return; + innobase_fields_to_mysql(dup->table, dup->index, entry); } - - /* Convert the tuple to a record and then to MySQL format. */ - heap = mem_heap_create((1 + REC_OFFS_HEADER_SIZE + n_fields) - * sizeof *offsets - + sizeof *buf); - - buf = static_cast<mrec_buf_t*>(mem_heap_alloc(heap, sizeof *buf)); - - tuple = dtuple_from_fields(&tuple_store, entry, n_fields); - n_ext = dict_index_is_clust(index) ? dtuple_get_n_ext(tuple) : 0; - - rec = rec_convert_dtuple_to_rec(*buf, index, tuple, n_ext); - offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); - - innobase_rec_to_mysql(dup->table, rec, index, offsets); - - mem_heap_free(heap); } /*************************************************************//** Compare two tuples. 
@return 1, 0, -1 if a is greater, equal, less, respectively, than b */ -static +static __attribute__((warn_unused_result)) int row_merge_tuple_cmp( /*================*/ + ulint n_uniq, /*!< in: number of unique fields */ ulint n_field,/*!< in: number of fields */ - const dfield_t* a, /*!< in: first tuple to be compared */ - const dfield_t* b, /*!< in: second tuple to be compared */ - row_merge_dup_t* dup) /*!< in/out: for reporting duplicates */ + const mtuple_t& a, /*!< in: first tuple to be compared */ + const mtuple_t& b, /*!< in: second tuple to be compared */ + row_merge_dup_t* dup) /*!< in/out: for reporting duplicates, + NULL if non-unique index */ { int cmp; - const dfield_t* field = a; + const dfield_t* af = a.fields; + const dfield_t* bf = b.fields; + ulint n = n_uniq; + + ut_ad(n_uniq > 0); + ut_ad(n_uniq <= n_field); /* Compare the fields of the tuples until a difference is found or we run out of fields to compare. If !cmp at the end, the tuples are equal. */ do { - cmp = cmp_dfield_dfield(a++, b++); - } while (!cmp && --n_field); + cmp = cmp_dfield_dfield(af++, bf++); + } while (!cmp && --n); + + if (cmp) { + return(cmp); + } - if (UNIV_UNLIKELY(!cmp) && UNIV_LIKELY_NULL(dup)) { + if (dup) { /* Report a duplicate value error if the tuples are logically equal. NULL columns are logically inequal, although they are equal in the sorting order. Find out if any of the fields are NULL. */ - for (b = field; b != a; b++) { - if (dfield_is_null(b)) { - - goto func_exit; + for (const dfield_t* df = a.fields; df != af; df++) { + if (dfield_is_null(df)) { + goto no_report; } } - row_merge_dup_report(dup, field); + row_merge_dup_report(dup, a.fields); } -func_exit: +no_report: + /* The n_uniq fields were equal, but we compare all fields so + that we will get the same (internal) order as in the B-tree. 
*/ + for (n = n_field - n_uniq + 1; --n; ) { + cmp = cmp_dfield_dfield(af++, bf++); + if (cmp) { + return(cmp); + } + } + + /* This should never be reached, except in a secondary index + when creating a secondary index and a PRIMARY KEY, and there + is a duplicate in the PRIMARY KEY that has not been detected + yet. Internally, an index must never contain duplicates. */ return(cmp); } /** Wrapper for row_merge_tuple_sort() to inject some more context to UT_SORT_FUNCTION_BODY(). -@param a array of tuples that being sorted -@param b aux (work area), same size as tuples[] -@param c lower bound of the sorting area, inclusive -@param d upper bound of the sorting area, inclusive */ -#define row_merge_tuple_sort_ctx(a,b,c,d) \ - row_merge_tuple_sort(n_field, dup, a, b, c, d) +@param tuples array of tuples that being sorted +@param aux work area, same size as tuples[] +@param low lower bound of the sorting area, inclusive +@param high upper bound of the sorting area, inclusive */ +#define row_merge_tuple_sort_ctx(tuples, aux, low, high) \ + row_merge_tuple_sort(n_uniq, n_field, dup, tuples, aux, low, high) /** Wrapper for row_merge_tuple_cmp() to inject some more context to UT_SORT_FUNCTION_BODY(). @param a first tuple to be compared @param b second tuple to be compared @return 1, 0, -1 if a is greater, equal, less, respectively, than b */ -#define row_merge_tuple_cmp_ctx(a,b) row_merge_tuple_cmp(n_field, a, b, dup) +#define row_merge_tuple_cmp_ctx(a,b) \ + row_merge_tuple_cmp(n_uniq, n_field, a, b, dup) /**********************************************************************//** Merge sort the tuple buffer in main memory. 
*/ -static +static __attribute__((nonnull(4,5))) void row_merge_tuple_sort( /*=================*/ + ulint n_uniq, /*!< in: number of unique fields */ ulint n_field,/*!< in: number of fields */ - row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */ - const dfield_t** tuples, /*!< in/out: tuples */ - const dfield_t** aux, /*!< in/out: work area */ + row_merge_dup_t* dup, /*!< in/out: reporter of duplicates + (NULL if non-unique index) */ + mtuple_t* tuples, /*!< in/out: tuples */ + mtuple_t* aux, /*!< in/out: work area */ ulint low, /*!< in: lower bound of the sorting area, inclusive */ ulint high) /*!< in: upper bound of the sorting area, exclusive */ { + ut_ad(n_field > 0); + ut_ad(n_uniq <= n_field); + UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx, tuples, aux, low, high, row_merge_tuple_cmp_ctx); } @@ -632,9 +629,12 @@ void row_merge_buf_sort( /*===============*/ row_merge_buf_t* buf, /*!< in/out: sort buffer */ - row_merge_dup_t* dup) /*!< in/out: for reporting duplicates */ + row_merge_dup_t* dup) /*!< in/out: reporter of duplicates + (NULL if non-unique index) */ { - row_merge_tuple_sort(dict_index_get_n_unique(buf->index), dup, + row_merge_tuple_sort(dict_index_get_n_unique(buf->index), + dict_index_get_n_fields(buf->index), + dup, buf->tuples, buf->tmp_tuples, 0, buf->n_tuples); } @@ -653,39 +653,11 @@ row_merge_buf_write( ulint n_fields= dict_index_get_n_fields(index); byte* b = &block[0]; - ulint i; - - for (i = 0; i < buf->n_tuples; i++) { - ulint size; - ulint extra_size; - const dfield_t* entry = buf->tuples[i]; - - size = rec_get_converted_size_comp(index, - REC_STATUS_ORDINARY, - entry, n_fields, - &extra_size); - ut_ad(size >= extra_size); - ut_ad(extra_size >= REC_N_NEW_EXTRA_BYTES); - extra_size -= REC_N_NEW_EXTRA_BYTES; - size -= REC_N_NEW_EXTRA_BYTES; - - /* Encode extra_size + 1 */ - if (extra_size + 1 < 0x80) { - *b++ = (byte) (extra_size + 1); - } else { - ut_ad((extra_size + 1) < 0x8000); - *b++ = (byte) (0x80 | ((extra_size + 1) 
>> 8)); - *b++ = (byte) (extra_size + 1); - } - - ut_ad(b + size < &block[srv_sort_buf_size]); - - rec_convert_dtuple_to_rec_comp(b + extra_size, 0, index, - REC_STATUS_ORDINARY, - entry, n_fields); - - b += size; + for (ulint i = 0; i < buf->n_tuples; i++) { + const mtuple_t* entry = &buf->tuples[i]; + row_merge_buf_encode(&b, index, entry, n_fields); + ut_ad(b < &block[srv_sort_buf_size]); #ifdef UNIV_DEBUG if (row_merge_print_write) { fprintf(stderr, "row_merge_buf_write %p,%d,%lu %lu", @@ -744,36 +716,6 @@ row_merge_heap_create( return(heap); } -/**********************************************************************//** -Search an index object by name and column names. If several indexes match, -return the index with the max id. -@return matching index, NULL if not found */ -static -dict_index_t* -row_merge_dict_table_get_index( -/*===========================*/ - dict_table_t* table, /*!< in: table */ - const merge_index_def_t*index_def) /*!< in: index definition */ -{ - ulint i; - dict_index_t* index; - const char** column_names; - - column_names = static_cast<const char**>( - mem_alloc(index_def->n_fields * sizeof *column_names)); - - for (i = 0; i < index_def->n_fields; ++i) { - column_names[i] = index_def->fields[i].field_name; - } - - index = dict_table_get_index_by_max_id( - table, index_def->name, column_names, index_def->n_fields); - - mem_free((void*) column_names); - - return(index); -} - /********************************************************************//** Read a merge block from the file system. 
@return TRUE if request was successful, FALSE if fail */ @@ -790,6 +732,8 @@ row_merge_read( os_offset_t ofs = ((os_offset_t) offset) * srv_sort_buf_size; ibool success; + DBUG_EXECUTE_IF("row_merge_read_failure", return(FALSE);); + #ifdef UNIV_DEBUG if (row_merge_print_block_read) { fprintf(stderr, "row_merge_read fd=%d ofs=%lu\n", @@ -837,6 +781,8 @@ row_merge_write( os_offset_t ofs = buf_len * (os_offset_t) offset; ibool ret; + DBUG_EXECUTE_IF("row_merge_write_failure", return(FALSE);); + ret = os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf, ofs, buf_len); #ifdef UNIV_DEBUG @@ -858,7 +804,7 @@ row_merge_write( /********************************************************************//** Read a merge record. @return pointer to next record, or NULL on I/O error or end of list */ -UNIV_INTERN __attribute__((nonnull)) +UNIV_INTERN const byte* row_merge_read_rec( /*===============*/ @@ -934,7 +880,7 @@ err_exit: case. */ avail_size = &block[srv_sort_buf_size] - b; - + ut_ad(avail_size < sizeof *buf); memcpy(*buf, b, avail_size); if (!row_merge_read(fd, ++(*foffs), block)) { @@ -951,7 +897,7 @@ err_exit: *mrec = *buf + extra_size; - rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets); + rec_init_offsets_temp(*mrec, index, offsets); data_size = rec_offs_data_size(offsets); @@ -970,7 +916,7 @@ err_exit: *mrec = b + extra_size; - rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets); + rec_init_offsets_temp(*mrec, index, offsets); data_size = rec_offs_data_size(offsets); ut_ad(extra_size + data_size < sizeof *buf); @@ -1174,46 +1120,12 @@ row_merge_write_eof( return(&block[0]); } -/*************************************************************//** -Compare two merge records. 
-@return 1, 0, -1 if mrec1 is greater, equal, less, respectively, than mrec2 */ -UNIV_INTERN -int -row_merge_cmp( -/*==========*/ - const mrec_t* mrec1, /*!< in: first merge - record to be compared */ - const mrec_t* mrec2, /*!< in: second merge - record to be compared */ - const ulint* offsets1, /*!< in: first record offsets */ - const ulint* offsets2, /*!< in: second record offsets */ - const dict_index_t* index, /*!< in: index */ - ibool* null_eq) /*!< out: set to TRUE if - found matching null values */ -{ - int cmp; - - cmp = cmp_rec_rec_simple(mrec1, mrec2, offsets1, offsets2, index, - null_eq); - -#ifdef UNIV_DEBUG - if (row_merge_print_cmp) { - fputs("row_merge_cmp1 ", stderr); - rec_print_comp(stderr, mrec1, offsets1); - fputs("\nrow_merge_cmp2 ", stderr); - rec_print_comp(stderr, mrec2, offsets2); - fprintf(stderr, "\nrow_merge_cmp=%d\n", cmp); - } -#endif /* UNIV_DEBUG */ - - return(cmp); -} /********************************************************************//** Reads clustered index of the table and create temporary files containing the index entries for the indexes to be built. 
@return DB_SUCCESS or error */ -static __attribute__((nonnull)) -ulint +static __attribute__((nonnull(1,2,3,4,6,9,10,16), warn_unused_result)) +dberr_t row_merge_read_clustered_index( /*===========================*/ trx_t* trx, /*!< in: transaction */ @@ -1224,23 +1136,40 @@ row_merge_read_clustered_index( const dict_table_t* new_table,/*!< in: table where indexes are created; identical to old_table unless creating a PRIMARY KEY */ + bool online, /*!< in: true if creating indexes + online */ dict_index_t** index, /*!< in: indexes to be created */ dict_index_t* fts_sort_idx, - /*!< in: indexes to be created */ - fts_psort_t* psort_info, /*!< in: parallel sort info */ + /*!< in: full-text index to be created, + or NULL */ + fts_psort_t* psort_info, + /*!< in: parallel sort info for + fts_sort_idx creation, or NULL */ merge_file_t* files, /*!< in: temporary files */ + const ulint* key_numbers, + /*!< in: MySQL key numbers to create */ ulint n_index,/*!< in: number of indexes to create */ + const dtuple_t* add_cols, + /*!< in: default values of + added columns, or NULL */ + const ulint* col_map,/*!< in: mapping of old column + numbers to new ones, or NULL + if old_table == new_table */ + ulint add_autoinc, + /*!< in: number of added + AUTO_INCREMENT column, or + ULINT_UNDEFINED if none is added */ + ib_sequence_t& sequence,/*!< in/out: autoinc sequence */ row_merge_block_t* block) /*!< in/out: file buffer */ { dict_index_t* clust_index; /* Clustered index */ mem_heap_t* row_heap; /* Heap memory to create - clustered index records */ + clustered index tuples */ row_merge_buf_t** merge_buf; /* Temporary list for records*/ - btr_pcur_t pcur; /* Persistent cursor on the - clustered index */ + btr_pcur_t pcur; /* Cursor on the clustered + index */ mtr_t mtr; /* Mini transaction */ - ulint err = DB_SUCCESS;/* Return code */ - ulint i; + dberr_t err = DB_SUCCESS;/* Return code */ ulint n_nonnull = 0; /* number of columns changed to NOT NULL */ ulint* nonnull = NULL; /* NOT 
NULL columns */ @@ -1252,13 +1181,10 @@ row_merge_read_clustered_index( ibool fts_pll_sort = FALSE; ib_int64_t sig_count = 0; - trx->op_info = "reading clustered index"; + ut_ad((old_table == new_table) == !col_map); + ut_ad(!add_cols || col_map); - ut_ad(trx); - ut_ad(old_table); - ut_ad(new_table); - ut_ad(index); - ut_ad(files); + trx->op_info = "reading clustered index"; #ifdef FTS_INTERNAL_DIAG_PRINT DEBUG_FTS_SORT_PRINT("FTS_SORT: Start Create Index\n"); @@ -1269,8 +1195,7 @@ row_merge_read_clustered_index( merge_buf = static_cast<row_merge_buf_t**>( mem_alloc(n_index * sizeof *merge_buf)); - - for (i = 0; i < n_index; i++) { + for (ulint i = 0; i < n_index; i++) { if (index[i]->type & DICT_FTS) { /* We are building a FT index, make sure @@ -1282,14 +1207,14 @@ row_merge_read_clustered_index( merge_buf[i] = row_merge_buf_create(fts_sort_idx); add_doc_id = DICT_TF2_FLAG_IS_SET( - old_table, DICT_TF2_FTS_ADD_DOC_ID); + new_table, DICT_TF2_FTS_ADD_DOC_ID); /* If Doc ID does not exist in the table itself, fetch the first FTS Doc ID */ if (add_doc_id) { fts_get_next_doc_id( (dict_table_t*) new_table, - &doc_id); + &doc_id); ut_ad(doc_id > 0); } @@ -1310,35 +1235,34 @@ row_merge_read_clustered_index( clust_index = dict_table_get_first_index(old_table); btr_pcur_open_at_index_side( - TRUE, clust_index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr); - - if (UNIV_UNLIKELY(old_table != new_table)) { - ulint n_cols = dict_table_get_n_cols(old_table); + true, clust_index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr); - /* A primary key will be created. Identify the - columns that were flagged NOT NULL in the new table, - so that we can quickly check that the records in the - (old) clustered index do not violate the added NOT - NULL constraints. */ - - if (!fts_sort_idx) { - ut_a(n_cols == dict_table_get_n_cols(new_table)); - } + if (old_table != new_table) { + /* The table is being rebuilt. 
Identify the columns + that were flagged NOT NULL in the new table, so that + we can quickly check that the records in the old table + do not violate the added NOT NULL constraints. */ nonnull = static_cast<ulint*>( - mem_alloc(n_cols * sizeof *nonnull)); + mem_alloc(dict_table_get_n_cols(new_table) + * sizeof *nonnull)); - for (i = 0; i < n_cols; i++) { + for (ulint i = 0; i < dict_table_get_n_cols(old_table); i++) { if (dict_table_get_nth_col(old_table, i)->prtype & DATA_NOT_NULL) { + continue; + } + + const ulint j = col_map[i]; + if (j == ULINT_UNDEFINED) { + /* The column was dropped. */ continue; } - if (dict_table_get_nth_col(new_table, i)->prtype + if (dict_table_get_nth_col(new_table, j)->prtype & DATA_NOT_NULL) { - - nonnull[n_nonnull++] = i; + nonnull[n_nonnull++] = j; } } @@ -1354,81 +1278,221 @@ row_merge_read_clustered_index( for (;;) { const rec_t* rec; ulint* offsets; - dtuple_t* row = NULL; + const dtuple_t* row; row_ext_t* ext; - ibool has_next = TRUE; - - btr_pcur_move_to_next_on_page(&pcur); + page_cur_t* cur = btr_pcur_get_page_cur(&pcur); - /* When switching pages, commit the mini-transaction - in order to release the latch on the old page. */ + page_cur_move_to_next(cur); - if (btr_pcur_is_after_last_on_page(&pcur)) { + if (page_cur_is_after_last(cur)) { if (UNIV_UNLIKELY(trx_is_interrupted(trx))) { err = DB_INTERRUPTED; trx->error_key_num = 0; goto func_exit; } - /* Store the cursor position on the last user - record on the page. */ - btr_pcur_move_to_prev_on_page(&pcur); - /* Leaf pages must never be empty, unless - this is the only page in the index tree. */ - ut_ad(btr_pcur_is_on_user_rec(&pcur) - || buf_block_get_page_no( - btr_pcur_get_block(&pcur)) - == clust_index->page); - - btr_pcur_store_position(&pcur, &mtr); - mtr_commit(&mtr); - mtr_start(&mtr); - /* Restore position on the record, or its - predecessor if the record was purged - meanwhile. 
*/ - btr_pcur_restore_position(BTR_SEARCH_LEAF, - &pcur, &mtr); - /* Move to the successor of the original record. */ - has_next = btr_pcur_move_to_next_user_rec(&pcur, &mtr); + if (online && old_table != new_table) { + err = row_log_table_get_error(clust_index); + if (err != DB_SUCCESS) { + trx->error_key_num = 0; + goto func_exit; + } + } +#ifdef DBUG_OFF +# define dbug_run_purge false +#else /* DBUG_OFF */ + bool dbug_run_purge = false; +#endif /* DBUG_OFF */ + DBUG_EXECUTE_IF( + "ib_purge_on_create_index_page_switch", + dbug_run_purge = true;); + + if (dbug_run_purge + || rw_lock_get_waiters( + dict_index_get_lock(clust_index))) { + /* There are waiters on the clustered + index tree lock, likely the purge + thread. Store and restore the cursor + position, and yield so that scanning a + large table will not starve other + threads. */ + + /* Store the cursor position on the last user + record on the page. */ + btr_pcur_move_to_prev_on_page(&pcur); + /* Leaf pages must never be empty, unless + this is the only page in the index tree. */ + ut_ad(btr_pcur_is_on_user_rec(&pcur) + || buf_block_get_page_no( + btr_pcur_get_block(&pcur)) + == clust_index->page); + + btr_pcur_store_position(&pcur, &mtr); + mtr_commit(&mtr); + + if (dbug_run_purge) { + /* This is for testing + purposes only (see + DBUG_EXECUTE_IF above). We + signal the purge thread and + hope that the purge batch will + complete before we execute + btr_pcur_restore_position(). */ + trx_purge_run(); + os_thread_sleep(1000000); + } + + /* Give the waiters a chance to proceed. */ + os_thread_yield(); + + mtr_start(&mtr); + /* Restore position on the record, or its + predecessor if the record was purged + meanwhile. */ + btr_pcur_restore_position( + BTR_SEARCH_LEAF, &pcur, &mtr); + /* Move to the successor of the + original record. 
*/ + if (!btr_pcur_move_to_next_user_rec( + &pcur, &mtr)) { +end_of_index: + row = NULL; + mtr_commit(&mtr); + mem_heap_free(row_heap); + if (nonnull) { + mem_free(nonnull); + } + goto write_buffers; + } + } else { + ulint next_page_no; + buf_block_t* block; + + next_page_no = btr_page_get_next( + page_cur_get_page(cur), &mtr); + + if (next_page_no == FIL_NULL) { + goto end_of_index; + } + + block = page_cur_get_block(cur); + block = btr_block_get( + buf_block_get_space(block), + buf_block_get_zip_size(block), + next_page_no, BTR_SEARCH_LEAF, + clust_index, &mtr); + + btr_leaf_page_release(page_cur_get_block(cur), + BTR_SEARCH_LEAF, &mtr); + page_cur_set_before_first(block, cur); + page_cur_move_to_next(cur); + + ut_ad(!page_cur_is_after_last(cur)); + } } - if (UNIV_LIKELY(has_next)) { - rec = btr_pcur_get_rec(&pcur); - offsets = rec_get_offsets(rec, clust_index, NULL, - ULINT_UNDEFINED, &row_heap); + rec = page_cur_get_rec(cur); + + offsets = rec_get_offsets(rec, clust_index, NULL, + ULINT_UNDEFINED, &row_heap); + + if (online && new_table != old_table) { + /* When rebuilding the table online, perform a + REPEATABLE READ, so that row_log_table_apply() + will not see a newer state of the table when + applying the log. This is mainly to prevent + false duplicate key errors, because the log + will identify records by the PRIMARY KEY. */ + ut_ad(trx->read_view); + + if (!read_view_sees_trx_id( + trx->read_view, + row_get_rec_trx_id( + rec, clust_index, offsets))) { + rec_t* old_vers; + + row_vers_build_for_consistent_read( + rec, &mtr, clust_index, &offsets, + trx->read_view, &row_heap, + row_heap, &old_vers); + + rec = old_vers; + + if (!rec) { + continue; + } + } - /* Skip delete marked records. */ if (rec_get_deleted_flag( - rec, dict_table_is_comp(old_table))) { + rec, + dict_table_is_comp(old_table))) { + /* This record was deleted in the latest + committed version, or it was deleted and + then reinserted-by-update before purge + kicked in. Skip it. 
*/ continue; } - srv_n_rows_inserted++; + ut_ad(!rec_offs_any_null_extern(rec, offsets)); + } else if (rec_get_deleted_flag( + rec, dict_table_is_comp(old_table))) { + /* Skip delete-marked records. + + Skipping delete-marked records will make the + created indexes unuseable for transactions + whose read views were created before the index + creation completed, but preserving the history + would make it tricky to detect duplicate + keys. */ + continue; + } else if (UNIV_LIKELY_NULL(rec_offs_any_null_extern( + rec, offsets))) { + /* This is essentially a READ UNCOMMITTED to + fetch the most recent version of the record. */ +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + trx_id_t trx_id; + ulint trx_id_offset; + + /* It is possible that the record was + just inserted and the off-page columns + have not yet been written. We will + ignore the record if this is the case, + because it should be covered by the + index->info.online log in that case. */ + + trx_id_offset = clust_index->trx_id_offset; + if (!trx_id_offset) { + trx_id_offset = row_get_trx_id_offset( + clust_index, offsets); + } - /* Build a row based on the clustered index. */ + trx_id = trx_read_trx_id(rec + trx_id_offset); + ut_a(trx_rw_is_active(trx_id, NULL)); + ut_a(trx_undo_trx_id_is_insert(rec + trx_id_offset)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ - row = row_build(ROW_COPY_POINTERS, clust_index, - rec, offsets, - new_table, &ext, row_heap); + /* When !online, we are holding an X-lock on + old_table, preventing any inserts. */ + ut_ad(online); + continue; + } - if (UNIV_LIKELY_NULL(nonnull)) { - for (i = 0; i < n_nonnull; i++) { - dfield_t* field - = &row->fields[nonnull[i]]; - dtype_t* field_type - = dfield_get_type(field); + /* Build a row based on the clustered index. 
*/ - ut_a(!(field_type->prtype - & DATA_NOT_NULL)); + row = row_build(ROW_COPY_POINTERS, clust_index, + rec, offsets, new_table, + add_cols, col_map, &ext, row_heap); + ut_ad(row); - if (dfield_is_null(field)) { - err = DB_PRIMARY_KEY_IS_NULL; - trx->error_key_num = 0; - goto func_exit; - } + for (ulint i = 0; i < n_nonnull; i++) { + const dfield_t* field = &row->fields[nonnull[i]]; - field_type->prtype |= DATA_NOT_NULL; - } + ut_ad(dfield_get_type(field)->prtype & DATA_NOT_NULL); + + if (dfield_is_null(field)) { + err = DB_INVALID_NULL; + trx->error_key_num = 0; + goto func_exit; } } @@ -1439,19 +1503,72 @@ row_merge_read_clustered_index( doc_id = 0; } + if (add_autoinc != ULINT_UNDEFINED) { + + ut_ad(add_autoinc + < dict_table_get_n_user_cols(new_table)); + + const dfield_t* dfield; + + dfield = dtuple_get_nth_field(row, add_autoinc); + if (dfield_is_null(dfield)) { + goto write_buffers; + } + + const dtype_t* dtype = dfield_get_type(dfield); + byte* b = static_cast<byte*>(dfield_get_data(dfield)); + + if (sequence.eof()) { + err = DB_ERROR; + trx->error_key_num = 0; + + ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_AUTOINC_READ_FAILED, "[NULL]"); + + goto func_exit; + } + + ulonglong value = sequence++; + + switch (dtype_get_mtype(dtype)) { + case DATA_INT: { + ibool usign; + ulint len = dfield_get_len(dfield); + + usign = dtype_get_prtype(dtype) & DATA_UNSIGNED; + mach_write_ulonglong(b, value, len, usign); + + break; + } + + case DATA_FLOAT: + mach_float_write( + b, static_cast<float>(value)); + break; + + case DATA_DOUBLE: + mach_double_write( + b, static_cast<double>(value)); + break; + + default: + ut_ad(0); + } + } + +write_buffers: /* Build all entries for all the indexes to be created in a single scan of the clustered index. 
*/ - for (i = 0; i < n_index; i++) { + for (ulint i = 0; i < n_index; i++) { row_merge_buf_t* buf = merge_buf[i]; merge_file_t* file = &files[i]; - const dict_index_t* index = buf->index; ulint rows_added = 0; if (UNIV_LIKELY (row && (rows_added = row_merge_buf_add( - buf, fts_index, psort_info, - row, ext, &doc_id)))) { + buf, fts_index, old_table, + psort_info, row, ext, &doc_id)))) { /* If we are creating FTS index, a single row can generate more @@ -1464,35 +1581,60 @@ row_merge_read_clustered_index( continue; } - if ((!row || !doc_id) - && index->type & DICT_FTS) { + if ((buf->index->type & DICT_FTS) + && (!row || !doc_id)) { continue; } /* The buffer must be sufficiently large - to hold at least one record. */ - ut_ad(buf->n_tuples || !has_next); + to hold at least one record. It may only + be empty when we reach the end of the + clustered index. row_merge_buf_add() + must not have been called in this loop. */ + ut_ad(buf->n_tuples || row == NULL); /* We have enough data tuples to form a block. Sort them and write to disk. */ if (buf->n_tuples) { - if (dict_index_is_unique(index)) { - row_merge_dup_t dup; - dup.index = buf->index; - dup.table = table; - dup.n_dup = 0; + if (dict_index_is_unique(buf->index)) { + row_merge_dup_t dup = { + buf->index, table, col_map, 0}; row_merge_buf_sort(buf, &dup); if (dup.n_dup) { err = DB_DUPLICATE_KEY; - trx->error_key_num = i; - goto func_exit; + trx->error_key_num + = key_numbers[i]; + break; } } else { row_merge_buf_sort(buf, NULL); } + } else if (online && new_table == old_table) { + /* Note the newest transaction that + modified this index when the scan was + completed. We prevent older readers + from accessing this index, to ensure + read consistency. 
*/ + + trx_id_t max_trx_id; + + ut_a(row == NULL); + rw_lock_x_lock( + dict_index_get_lock(buf->index)); + ut_a(dict_index_get_online_status(buf->index) + == ONLINE_INDEX_CREATION); + + max_trx_id = row_log_get_max_trx(buf->index); + + if (max_trx_id > buf->index->trx_id) { + buf->index->trx_id = max_trx_id; + } + + rw_lock_x_unlock( + dict_index_get_lock(buf->index)); } row_merge_buf_write(buf, file, block); @@ -1501,7 +1643,7 @@ row_merge_read_clustered_index( block)) { err = DB_OUT_OF_FILE_SPACE; trx->error_key_num = i; - goto func_exit; + break; } UNIV_MEM_INVALID(&block[0], srv_sort_buf_size); @@ -1514,14 +1656,11 @@ row_merge_read_clustered_index( if (UNIV_UNLIKELY (!(rows_added = row_merge_buf_add( - buf, fts_index, psort_info, row, - ext, &doc_id)))) { + buf, fts_index, old_table, + psort_info, row, ext, + &doc_id)))) { /* An empty buffer should have enough - room for at least one record. - TODO: for FTS index building, we'll - need to prepared for coping with very - large text/blob data in a single row - that could fill up the merge file */ + room for at least one record. 
*/ ut_error; } @@ -1529,27 +1668,40 @@ row_merge_read_clustered_index( } } - mem_heap_empty(row_heap); + if (row == NULL) { + goto all_done; + } - if (UNIV_UNLIKELY(!has_next)) { + if (err != DB_SUCCESS) { goto func_exit; } + + mem_heap_empty(row_heap); } func_exit: + mtr_commit(&mtr); + mem_heap_free(row_heap); + + if (nonnull) { + mem_free(nonnull); + } + +all_done: #ifdef FTS_INTERNAL_DIAG_PRINT DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Scan Table\n"); #endif if (fts_pll_sort) { - for (i = 0; i < fts_sort_pll_degree; i++) { + for (ulint i = 0; i < fts_sort_pll_degree; i++) { psort_info[i].state = FTS_PARENT_COMPLETE; } wait_again: os_event_wait_time_low(fts_parallel_sort_event, 1000000, sig_count); - for (i = 0; i < fts_sort_pll_degree; i++) { - if (psort_info[i].child_status != FTS_CHILD_COMPLETE) { + for (ulint i = 0; i < fts_sort_pll_degree; i++) { + if (psort_info[i].child_status != FTS_CHILD_COMPLETE + && psort_info[i].child_status != FTS_CHILD_EXITING) { sig_count = os_event_reset( fts_parallel_sort_event); goto wait_again; @@ -1560,17 +1712,7 @@ wait_again: #ifdef FTS_INTERNAL_DIAG_PRINT DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Tokenization\n"); #endif - - btr_pcur_close(&pcur); - mtr_commit(&mtr); - mem_heap_free(row_heap); - - if (UNIV_LIKELY_NULL(nonnull)) { - mem_free(nonnull); - } - - - for (i = 0; i < n_index; i++) { + for (ulint i = 0; i < n_index; i++) { row_merge_buf_free(merge_buf[i]); } @@ -1578,10 +1720,13 @@ wait_again: mem_free(merge_buf); + btr_pcur_close(&pcur); + /* Update the next Doc ID we used. Table should be locked, so no concurrent DML */ if (max_doc_id) { - fts_update_next_doc_id(new_table, old_table->name, max_doc_id); + fts_update_next_doc_id( + 0, new_table, old_table->name, max_doc_id); } trx->op_info = ""; @@ -1590,24 +1735,20 @@ wait_again: } /** Write a record via buffer 2 and read the next record to buffer N. 
-@param M FTS merge info structure -@param N index into array of merge info structure -@param INDEX the FTS index */ - - -/** Write a record via buffer 2 and read the next record to buffer N. @param N number of the buffer (0 or 1) +@param INDEX record descriptor @param AT_END statement to execute at end of input */ -#define ROW_MERGE_WRITE_GET_NEXT(N, AT_END) \ +#define ROW_MERGE_WRITE_GET_NEXT(N, INDEX, AT_END) \ do { \ - b2 = row_merge_write_rec(&block[2 * srv_sort_buf_size], &buf[2], b2, \ + b2 = row_merge_write_rec(&block[2 * srv_sort_buf_size], \ + &buf[2], b2, \ of->fd, &of->offset, \ mrec##N, offsets##N); \ if (UNIV_UNLIKELY(!b2 || ++of->n_rec > file->n_rec)) { \ goto corrupt; \ } \ - b##N = row_merge_read_rec(&block[N * srv_sort_buf_size], &buf[N], \ - b##N, index, \ + b##N = row_merge_read_rec(&block[N * srv_sort_buf_size],\ + &buf[N], b##N, INDEX, \ file->fd, foffs##N, \ &mrec##N, offsets##N); \ if (UNIV_UNLIKELY(!b##N)) { \ @@ -1621,11 +1762,12 @@ wait_again: /*************************************************************//** Merge two blocks of records on disk and write a bigger block. 
@return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_merge_blocks( /*=============*/ - const dict_index_t* index, /*!< in: index being created */ + const row_merge_dup_t* dup, /*!< in: descriptor of + index being created */ const merge_file_t* file, /*!< in: file containing index entries */ row_merge_block_t* block, /*!< in/out: 3 buffers */ @@ -1633,20 +1775,18 @@ row_merge_blocks( source list in the file */ ulint* foffs1, /*!< in/out: offset of second source list in the file */ - merge_file_t* of, /*!< in/out: output file */ - struct TABLE* table) /*!< in/out: MySQL table, for - reporting erroneous key value - if applicable */ + merge_file_t* of) /*!< in/out: output file */ { mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */ mrec_buf_t* buf; /*!< buffer for handling split mrec in block[] */ const byte* b0; /*!< pointer to block[0] */ - const byte* b1; /*!< pointer to block[1] */ - byte* b2; /*!< pointer to block[2] */ + const byte* b1; /*!< pointer to block[srv_sort_buf_size] */ + byte* b2; /*!< pointer to block[2 * srv_sort_buf_size] */ const mrec_t* mrec0; /*!< merge rec, points to block[0] or buf[0] */ - const mrec_t* mrec1; /*!< merge rec, points to block[1] or buf[1] */ + const mrec_t* mrec1; /*!< merge rec, points to + block[srv_sort_buf_size] or buf[1] */ ulint* offsets0;/* offsets of mrec0 */ ulint* offsets1;/* offsets of mrec1 */ @@ -1661,7 +1801,7 @@ row_merge_blocks( } #endif /* UNIV_DEBUG */ - heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1); + heap = row_merge_heap_create(dup->index, &buf, &offsets0, &offsets1); /* Write a record and read the next record. Split the output file in two halves, which can be merged on the following pass. 
*/ @@ -1677,10 +1817,13 @@ corrupt: b1 = &block[srv_sort_buf_size]; b2 = &block[2 * srv_sort_buf_size]; - b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd, - foffs0, &mrec0, offsets0); - b1 = row_merge_read_rec(&block[srv_sort_buf_size], &buf[srv_sort_buf_size], b1, index, file->fd, - foffs1, &mrec1, offsets1); + b0 = row_merge_read_rec( + &block[0], &buf[0], b0, dup->index, + file->fd, foffs0, &mrec0, offsets0); + b1 = row_merge_read_rec( + &block[srv_sort_buf_size], + &buf[srv_sort_buf_size], b1, dup->index, + file->fd, foffs1, &mrec1, offsets1); if (UNIV_UNLIKELY(!b0 && mrec0) || UNIV_UNLIKELY(!b1 && mrec1)) { @@ -1688,56 +1831,49 @@ corrupt: } while (mrec0 && mrec1) { - ibool null_eq = FALSE; - switch (row_merge_cmp(mrec0, mrec1, - offsets0, offsets1, index, - &null_eq)) { + switch (cmp_rec_rec_simple( + mrec0, mrec1, offsets0, offsets1, + dup->index, dup->table)) { case 0: - if (UNIV_UNLIKELY - (dict_index_is_unique(index) && !null_eq)) { - innobase_rec_to_mysql(table, mrec0, - index, offsets0); - mem_heap_free(heap); - return(DB_DUPLICATE_KEY); - } - /* fall through */ + mem_heap_free(heap); + return(DB_DUPLICATE_KEY); case -1: - ROW_MERGE_WRITE_GET_NEXT(0, goto merged); + ROW_MERGE_WRITE_GET_NEXT(0, dup->index, goto merged); break; case 1: - ROW_MERGE_WRITE_GET_NEXT(1, goto merged); + ROW_MERGE_WRITE_GET_NEXT(1, dup->index, goto merged); break; default: ut_error; } - } merged: if (mrec0) { /* append all mrec0 to output */ for (;;) { - ROW_MERGE_WRITE_GET_NEXT(0, goto done0); + ROW_MERGE_WRITE_GET_NEXT(0, dup->index, goto done0); } } done0: if (mrec1) { /* append all mrec1 to output */ for (;;) { - ROW_MERGE_WRITE_GET_NEXT(1, goto done1); + ROW_MERGE_WRITE_GET_NEXT(1, dup->index, goto done1); } } done1: mem_heap_free(heap); - b2 = row_merge_write_eof(&block[2 * srv_sort_buf_size], b2, of->fd, &of->offset); + b2 = row_merge_write_eof(&block[2 * srv_sort_buf_size], + b2, of->fd, &of->offset); return(b2 ? 
DB_SUCCESS : DB_CORRUPTION); } /*************************************************************//** Copy a block of index entries. @return TRUE on success, FALSE on failure */ -static __attribute__((nonnull)) +static __attribute__((nonnull, warn_unused_result)) ibool row_merge_blocks_copy( /*==================*/ @@ -1752,7 +1888,7 @@ row_merge_blocks_copy( mrec_buf_t* buf; /*!< buffer for handling split mrec in block[] */ const byte* b0; /*!< pointer to block[0] */ - byte* b2; /*!< pointer to block[2] */ + byte* b2; /*!< pointer to block[2 * srv_sort_buf_size] */ const mrec_t* mrec0; /*!< merge rec, points to block[0] */ ulint* offsets0;/* offsets of mrec0 */ ulint* offsets1;/* dummy offsets */ @@ -1782,8 +1918,8 @@ corrupt: b2 = &block[2 * srv_sort_buf_size]; - b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd, - foffs0, &mrec0, offsets0); + b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, + file->fd, foffs0, &mrec0, offsets0); if (UNIV_UNLIKELY(!b0 && mrec0)) { goto corrupt; @@ -1792,7 +1928,7 @@ corrupt: if (mrec0) { /* append all mrec0 to output */ for (;;) { - ROW_MERGE_WRITE_GET_NEXT(0, goto done0); + ROW_MERGE_WRITE_GET_NEXT(0, index, goto done0); } } done0: @@ -1802,7 +1938,8 @@ done0: (*foffs0)++; mem_heap_free(heap); - return(row_merge_write_eof(&block[2 * srv_sort_buf_size], b2, of->fd, &of->offset) + return(row_merge_write_eof(&block[2 * srv_sort_buf_size], + b2, of->fd, &of->offset) != NULL); } @@ -1810,18 +1947,16 @@ done0: Merge disk files. 
@return DB_SUCCESS or error code */ static __attribute__((nonnull)) -ulint +dberr_t row_merge( /*======*/ trx_t* trx, /*!< in: transaction */ - const dict_index_t* index, /*!< in: index being created */ + const row_merge_dup_t* dup, /*!< in: descriptor of + index being created */ merge_file_t* file, /*!< in/out: file containing index entries */ row_merge_block_t* block, /*!< in/out: 3 buffers */ int* tmpfd, /*!< in/out: temporary file handle */ - struct TABLE* table, /*!< in/out: MySQL table, for - reporting erroneous key value - if applicable */ ulint* num_run,/*!< in/out: Number of runs remain to be merged */ ulint* run_offset) /*!< in/out: Array contains the @@ -1830,7 +1965,7 @@ row_merge( { ulint foffs0; /*!< first input offset */ ulint foffs1; /*!< second input offset */ - ulint error; /*!< error code */ + dberr_t error; /*!< error code */ merge_file_t of; /*!< output file */ const ulint ihalf = run_offset[*num_run / 2]; /*!< half the input file */ @@ -1861,15 +1996,15 @@ row_merge( for (; foffs0 < ihalf && foffs1 < file->offset; foffs0++, foffs1++) { - if (UNIV_UNLIKELY(trx_is_interrupted(trx))) { + if (trx_is_interrupted(trx)) { return(DB_INTERRUPTED); } /* Remember the offset number for this run */ run_offset[n_run++] = of.offset; - error = row_merge_blocks(index, file, block, - &foffs0, &foffs1, &of, table); + error = row_merge_blocks(dup, file, block, + &foffs0, &foffs1, &of); if (error != DB_SUCCESS) { return(error); @@ -1887,7 +2022,8 @@ row_merge( /* Remember the offset number for this run */ run_offset[n_run++] = of.offset; - if (!row_merge_blocks_copy(index, file, block, &foffs0, &of)) { + if (!row_merge_blocks_copy(dup->index, file, block, + &foffs0, &of)) { return(DB_CORRUPTION); } } @@ -1895,14 +2031,15 @@ row_merge( ut_ad(foffs0 == ihalf); while (foffs1 < file->offset) { - if (UNIV_UNLIKELY(trx_is_interrupted(trx))) { + if (trx_is_interrupted(trx)) { return(DB_INTERRUPTED); } /* Remember the offset number for this run */ run_offset[n_run++] = 
of.offset; - if (!row_merge_blocks_copy(index, file, block, &foffs1, &of)) { + if (!row_merge_blocks_copy(dup->index, file, block, + &foffs1, &of)) { return(DB_CORRUPTION); } } @@ -1940,23 +2077,21 @@ row_merge( Merge disk files. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t row_merge_sort( /*===========*/ trx_t* trx, /*!< in: transaction */ - const dict_index_t* index, /*!< in: index being created */ + const row_merge_dup_t* dup, /*!< in: descriptor of + index being created */ merge_file_t* file, /*!< in/out: file containing index entries */ row_merge_block_t* block, /*!< in/out: 3 buffers */ - int* tmpfd, /*!< in/out: temporary file handle */ - struct TABLE* table) /*!< in/out: MySQL table, for - reporting erroneous key value - if applicable */ + int* tmpfd) /*!< in/out: temporary file handle */ { - ulint half = file->offset / 2; - ulint num_runs; - ulint* run_offset; - ulint error = DB_SUCCESS; + const ulint half = file->offset / 2; + ulint num_runs; + ulint* run_offset; + dberr_t error = DB_SUCCESS; /* Record the number of merge runs we need to perform */ num_runs = file->offset; @@ -1979,14 +2114,14 @@ row_merge_sort( /* Merge the runs until we have one big run */ do { - error = row_merge(trx, index, file, block, tmpfd, - table, &num_runs, run_offset); - - UNIV_MEM_ASSERT_RW(run_offset, num_runs * sizeof *run_offset); + error = row_merge(trx, dup, file, block, tmpfd, + &num_runs, run_offset); if (error != DB_SUCCESS) { break; } + + UNIV_MEM_ASSERT_RW(run_offset, num_runs * sizeof *run_offset); } while (num_runs > 1); mem_free(run_offset); @@ -1995,8 +2130,25 @@ row_merge_sort( } /*************************************************************//** +Set blob fields empty */ +static __attribute__((nonnull)) +void +row_merge_set_blob_empty( +/*=====================*/ + dtuple_t* tuple) /*!< in/out: data tuple */ +{ + for (ulint i = 0; i < dtuple_get_n_fields(tuple); i++) { + dfield_t* field = dtuple_get_nth_field(tuple, i); + + if 
(dfield_is_ext(field)) { + dfield_set_data(field, NULL, 0); + } + } +} + +/*************************************************************//** Copy externally stored columns to the data tuple. */ -static +static __attribute__((nonnull)) void row_merge_copy_blobs( /*=================*/ @@ -2006,10 +2158,9 @@ row_merge_copy_blobs( dtuple_t* tuple, /*!< in/out: data tuple */ mem_heap_t* heap) /*!< in/out: memory heap */ { - ulint i; - ulint n_fields = dtuple_get_n_fields(tuple); + ut_ad(rec_offs_any_extern(offsets)); - for (i = 0; i < n_fields; i++) { + for (ulint i = 0; i < dtuple_get_n_fields(tuple); i++) { ulint len; const void* data; dfield_t* field = dtuple_get_nth_field(tuple, i); @@ -2020,11 +2171,12 @@ row_merge_copy_blobs( ut_ad(!dfield_is_null(field)); - /* The table is locked during index creation. - Therefore, externally stored columns cannot possibly - be freed between the time the BLOB pointers are read - (row_merge_read_clustered_index()) and dereferenced - (below). */ + /* During the creation of a PRIMARY KEY, the table is + X-locked, and we skip copying records that have been + marked for deletion. Therefore, externally stored + columns cannot possibly be freed between the time the + BLOB pointers are read (row_merge_read_clustered_index()) + and dereferenced (below). 
*/ data = btr_rec_copy_externally_stored_field( mrec, offsets, zip_size, i, &len, heap); /* Because we have locked the table, any records @@ -2041,54 +2193,38 @@ row_merge_copy_blobs( Read sorted file containing index data tuples and insert these data tuples to the index @return DB_SUCCESS or error number */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_merge_insert_index_tuples( /*==========================*/ - trx_t* trx, /*!< in: transaction */ + trx_id_t trx_id, /*!< in: transaction identifier */ dict_index_t* index, /*!< in: index */ - dict_table_t* table, /*!< in: new table */ - ulint zip_size,/*!< in: compressed page size of - the old table, or 0 if uncompressed */ + const dict_table_t* old_table,/*!< in: old table */ int fd, /*!< in: file descriptor */ row_merge_block_t* block) /*!< in/out: file buffer */ { const byte* b; - que_thr_t* thr; - ins_node_t* node; + mem_heap_t* heap; mem_heap_t* tuple_heap; - mem_heap_t* graph_heap; - ulint error = DB_SUCCESS; + mem_heap_t* ins_heap; + dberr_t error = DB_SUCCESS; ulint foffs = 0; ulint* offsets; + mrec_buf_t* buf; - ut_ad(trx); - ut_ad(index); - ut_ad(table); - + ut_ad(!srv_read_only_mode); ut_ad(!(index->type & DICT_FTS)); - - /* We use the insert query graph as the dummy graph - needed in the row module call */ - - trx->op_info = "inserting index entries"; - - graph_heap = mem_heap_create(500 + sizeof(mrec_buf_t)); - node = ins_node_create(INS_DIRECT, table, graph_heap); - - thr = pars_complete_graph_for_exec(node, trx, graph_heap); - - que_thr_move_to_run_state_for_mysql(thr, trx); + ut_ad(trx_id); tuple_heap = mem_heap_create(1000); { ulint i = 1 + REC_OFFS_HEADER_SIZE + dict_index_get_n_fields(index); - + heap = mem_heap_create(sizeof *buf + i * sizeof *offsets); + ins_heap = mem_heap_create(sizeof *buf + i * sizeof *offsets); offsets = static_cast<ulint*>( - mem_heap_alloc(graph_heap, i * sizeof *offsets)); - + mem_heap_alloc(heap, i * sizeof *offsets)); offsets[0] = i; 
offsets[1] = dict_index_get_n_fields(index); } @@ -2098,15 +2234,17 @@ row_merge_insert_index_tuples( if (!row_merge_read(fd, foffs, block)) { error = DB_CORRUPTION; } else { - mrec_buf_t* buf; - buf = static_cast<mrec_buf_t*>( - mem_heap_alloc(graph_heap, sizeof *buf)); + mem_heap_alloc(heap, sizeof *buf)); for (;;) { const mrec_t* mrec; dtuple_t* dtuple; ulint n_ext; + big_rec_t* big_rec; + rec_t* rec; + btr_cur_t cursor; + mtr_t mtr; b = row_merge_read_rec(block, buf, b, index, fd, &foffs, &mrec, offsets); @@ -2118,55 +2256,164 @@ row_merge_insert_index_tuples( break; } + dict_index_t* old_index + = dict_table_get_first_index(old_table); + + if (dict_index_is_clust(index) + && dict_index_is_online_ddl(old_index)) { + error = row_log_table_get_error(old_index); + if (error != DB_SUCCESS) { + break; + } + } + dtuple = row_rec_to_index_entry_low( mrec, index, offsets, &n_ext, tuple_heap); - if (UNIV_UNLIKELY(n_ext)) { - row_merge_copy_blobs(mrec, offsets, zip_size, - dtuple, tuple_heap); - } + if (!n_ext) { + /* There are no externally stored columns. */ + } else if (!dict_index_is_online_ddl(old_index)) { + ut_ad(dict_index_is_clust(index)); + /* Modifications to the table are + blocked while we are not rebuilding it + or creating indexes. Off-page columns + can be fetched safely. */ + row_merge_copy_blobs( + mrec, offsets, + dict_table_zip_size(old_table), + dtuple, tuple_heap); + } else { + ut_ad(dict_index_is_clust(index)); - node->row = dtuple; - node->table = table; - node->trx_id = trx->id; + ulint offset = index->trx_id_offset; - ut_ad(dtuple_validate(dtuple)); + if (!offset) { + offset = row_get_trx_id_offset( + index, offsets); + } - do { - thr->run_node = thr; - thr->prev_node = thr->common.parent; + /* Copy the off-page columns while + holding old_index->lock, so + that they cannot be freed by + a rollback of a fresh insert. 
*/ + rw_lock_s_lock(&old_index->lock); + + if (row_log_table_is_rollback( + old_index, + trx_read_trx_id(mrec + offset))) { + /* The row and BLOB could + already be freed. They + will be deleted by + row_undo_ins_remove_clust_rec + when rolling back a fresh + insert. So, no need to retrieve + the off-page column. */ + row_merge_set_blob_empty( + dtuple); + } else { + row_merge_copy_blobs( + mrec, offsets, + dict_table_zip_size(old_table), + dtuple, tuple_heap); + } - error = row_ins_index_entry(index, dtuple, - 0, FALSE, thr); + rw_lock_s_unlock(&old_index->lock); + } - if (UNIV_LIKELY(error == DB_SUCCESS)) { + ut_ad(dtuple_validate(dtuple)); + log_free_check(); - goto next_rec; - } + mtr_start(&mtr); + /* Insert after the last user record. */ + btr_cur_open_at_index_side( + false, index, BTR_MODIFY_LEAF, + &cursor, 0, &mtr); + page_cur_position( + page_rec_get_prev(btr_cur_get_rec(&cursor)), + btr_cur_get_block(&cursor), + btr_cur_get_page_cur(&cursor)); + cursor.flag = BTR_CUR_BINARY; +#ifdef UNIV_DEBUG + /* Check that the records are inserted in order. 
*/ + rec = btr_cur_get_rec(&cursor); + + if (!page_rec_is_infimum(rec)) { + ulint* rec_offsets = rec_get_offsets( + rec, index, offsets, + ULINT_UNDEFINED, &tuple_heap); + ut_ad(cmp_dtuple_rec(dtuple, rec, rec_offsets) + > 0); + } +#endif /* UNIV_DEBUG */ + ulint* ins_offsets = NULL; + + error = btr_cur_optimistic_insert( + BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG | BTR_CREATE_FLAG, + &cursor, &ins_offsets, &ins_heap, + dtuple, &rec, &big_rec, 0, NULL, &mtr); + + if (error == DB_FAIL) { + ut_ad(!big_rec); + mtr_commit(&mtr); + mtr_start(&mtr); + btr_cur_open_at_index_side( + false, index, BTR_MODIFY_TREE, + &cursor, 0, &mtr); + page_cur_position( + page_rec_get_prev(btr_cur_get_rec( + &cursor)), + btr_cur_get_block(&cursor), + btr_cur_get_page_cur(&cursor)); + + error = btr_cur_pessimistic_insert( + BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG | BTR_CREATE_FLAG, + &cursor, &ins_offsets, &ins_heap, + dtuple, &rec, &big_rec, 0, NULL, &mtr); + } + + if (!dict_index_is_clust(index)) { + page_update_max_trx_id( + btr_cur_get_block(&cursor), + btr_cur_get_page_zip(&cursor), + trx_id, &mtr); + } - thr->lock_state = QUE_THR_LOCK_ROW; + mtr_commit(&mtr); - trx->error_state = static_cast<enum db_err>( - error); + if (UNIV_LIKELY_NULL(big_rec)) { + /* If the system crashes at this + point, the clustered index record will + contain a null BLOB pointer. This + should not matter, because the copied + table will be dropped on crash + recovery anyway. 
*/ + + ut_ad(dict_index_is_clust(index)); + ut_ad(error == DB_SUCCESS); + error = row_ins_index_entry_big_rec( + dtuple, big_rec, + ins_offsets, &ins_heap, + index, NULL, __FILE__, __LINE__); + dtuple_convert_back_big_rec( + index, dtuple, big_rec); + } - que_thr_stop_for_mysql(thr); - thr->lock_state = QUE_THR_LOCK_NOLOCK; - } while (row_mysql_handle_errors(&error, trx, - thr, NULL)); + if (error != DB_SUCCESS) { + goto err_exit; + } - goto err_exit; -next_rec: mem_heap_empty(tuple_heap); + mem_heap_empty(ins_heap); } } - que_thr_stop_for_mysql_no_error(thr, trx); err_exit: - que_graph_free(thr->graph); - - trx->op_info = ""; - mem_heap_free(tuple_heap); + mem_heap_free(ins_heap); + mem_heap_free(heap); return(error); } @@ -2175,7 +2422,7 @@ err_exit: Sets an exclusive lock on a table, for the duration of creating indexes. @return error code or DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t row_merge_lock_table( /*=================*/ trx_t* trx, /*!< in/out: transaction */ @@ -2184,10 +2431,10 @@ row_merge_lock_table( { mem_heap_t* heap; que_thr_t* thr; - ulint err; + dberr_t err; sel_node_t* node; - ut_ad(trx); + ut_ad(!srv_read_only_mode); ut_ad(mode == LOCK_X || mode == LOCK_S); heap = mem_heap_create(512); @@ -2213,7 +2460,7 @@ run_again: err = lock_table(0, table, mode, thr); - trx->error_state =static_cast<enum db_err>( err); + trx->error_state = err; if (UNIV_LIKELY(err == DB_SUCCESS)) { que_thr_stop_for_mysql_no_error(thr, trx); @@ -2221,7 +2468,7 @@ run_again: que_thr_stop_for_mysql(thr); if (err != DB_QUE_THR_SUSPENDED) { - ibool was_lock_wait; + bool was_lock_wait; was_lock_wait = row_mysql_handle_errors( &err, trx, thr, NULL); @@ -2255,105 +2502,312 @@ run_again: } /*********************************************************************//** -Drop an index from the InnoDB system tables. The data dictionary must -have been locked exclusively by the caller, because the transaction -will not be committed. 
*/ -UNIV_INTERN +Drop an index that was created before an error occurred. +The data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. */ +static void -row_merge_drop_index( -/*=================*/ - dict_index_t* index, /*!< in: index to be removed */ - dict_table_t* table, /*!< in: table */ - trx_t* trx) /*!< in: transaction handle */ +row_merge_drop_index_dict( +/*======================*/ + trx_t* trx, /*!< in/out: dictionary transaction */ + index_id_t index_id)/*!< in: index identifier */ { - db_err err; - pars_info_t* info = pars_info_create(); - - /* We use the private SQL parser of Innobase to generate the - query graphs needed in deleting the dictionary data from system - tables in Innobase. Deleting a row from SYS_INDEXES table also - frees the file segments of the B-tree associated with the index. */ - static const char sql[] = "PROCEDURE DROP_INDEX_PROC () IS\n" "BEGIN\n" - /* Rename the index, so that it will be dropped by - row_merge_drop_temp_indexes() at crash recovery - if the server crashes before this trx is committed. */ - "UPDATE SYS_INDEXES SET NAME=CONCAT('" - TEMP_INDEX_PREFIX_STR "', NAME) WHERE ID = :indexid;\n" - "COMMIT WORK;\n" - /* Drop the field definitions of the index. */ - "DELETE FROM SYS_FIELDS WHERE INDEX_ID = :indexid;\n" - /* Drop the index definition and the B-tree. 
*/ - "DELETE FROM SYS_INDEXES WHERE ID = :indexid;\n" + "DELETE FROM SYS_FIELDS WHERE INDEX_ID=:indexid;\n" + "DELETE FROM SYS_INDEXES WHERE ID=:indexid;\n" "END;\n"; + dberr_t error; + pars_info_t* info; - ut_ad(index && table && trx); + ut_ad(!srv_read_only_mode); + ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ - pars_info_add_ull_literal(info, "indexid", index->id); + info = pars_info_create(); + pars_info_add_ull_literal(info, "indexid", index_id); + trx->op_info = "dropping index from dictionary"; + error = que_eval_sql(info, sql, FALSE, trx); - trx_start_if_not_started_xa(trx); - trx->op_info = "dropping index"; + if (error != DB_SUCCESS) { + /* Even though we ensure that DDL transactions are WAIT + and DEADLOCK free, we could encounter other errors e.g., + DB_TOO_MANY_CONCURRENT_TRXS. */ + trx->error_state = DB_SUCCESS; - ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error: row_merge_drop_index_dict " + "failed with error code: %u.\n", (unsigned) error); + } - err = static_cast<db_err>(que_eval_sql(info, sql, FALSE, trx)); + trx->op_info = ""; +} - DBUG_EXECUTE_IF( - "ib_drop_index_too_many_concurrent_trxs", - err = DB_TOO_MANY_CONCURRENT_TRXS; - trx->error_state = err;); +/*********************************************************************//** +Drop indexes that were created before an error occurred. +The data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. 
*/ +UNIV_INTERN +void +row_merge_drop_indexes_dict( +/*========================*/ + trx_t* trx, /*!< in/out: dictionary transaction */ + table_id_t table_id)/*!< in: table identifier */ +{ + static const char sql[] = + "PROCEDURE DROP_INDEXES_PROC () IS\n" + "ixid CHAR;\n" + "found INT;\n" - if (err == DB_SUCCESS) { + "DECLARE CURSOR index_cur IS\n" + " SELECT ID FROM SYS_INDEXES\n" + " WHERE TABLE_ID=:tableid AND\n" + " SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n" + "FOR UPDATE;\n" - /* If it is FTS index, drop from table->fts and also drop - its auxiliary tables */ - if (index->type & DICT_FTS) { - ut_a(table->fts); - fts_drop_index(table, index, trx); - } + "BEGIN\n" + "found := 1;\n" + "OPEN index_cur;\n" + "WHILE found = 1 LOOP\n" + " FETCH index_cur INTO ixid;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE\n" + " DELETE FROM SYS_FIELDS WHERE INDEX_ID=ixid;\n" + " DELETE FROM SYS_INDEXES WHERE CURRENT OF index_cur;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE index_cur;\n" - /* Replace this index with another equivalent index for all - foreign key constraints on this table where this index is - used */ + "END;\n"; + dberr_t error; + pars_info_t* info; - dict_table_replace_index_in_foreign_list(table, index, trx); - dict_index_remove_from_cache(table, index); + ut_ad(!srv_read_only_mode); + ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ - } else { + /* It is possible that table->n_ref_count > 1 when + locked=TRUE. In this case, all code that should have an open + handle to the table be waiting for the next statement to execute, + or waiting for a meta-data lock. + + A concurrent purge will be prevented by dict_operation_lock. 
*/ + + info = pars_info_create(); + pars_info_add_ull_literal(info, "tableid", table_id); + trx->op_info = "dropping indexes"; + error = que_eval_sql(info, sql, FALSE, trx); + + if (error != DB_SUCCESS) { /* Even though we ensure that DDL transactions are WAIT and DEADLOCK free, we could encounter other errors e.g., - DB_TOO_MANY_TRANSACTIONS. */ + DB_TOO_MANY_CONCURRENT_TRXS. */ trx->error_state = DB_SUCCESS; ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error: row_merge_drop_index failed " - "with error code: %lu.\n", (ulint) err); + fprintf(stderr, " InnoDB: Error: row_merge_drop_indexes_dict " + "failed with error code: %u.\n", (unsigned) error); } trx->op_info = ""; } /*********************************************************************//** -Drop those indexes which were created before an error occurred when -building an index. The data dictionary must have been locked -exclusively by the caller, because the transaction will not be -committed. */ +Drop indexes that were created before an error occurred. +The data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. 
*/ UNIV_INTERN void row_merge_drop_indexes( /*===================*/ - trx_t* trx, /*!< in: transaction */ - dict_table_t* table, /*!< in: table containing the indexes */ - dict_index_t** index, /*!< in: indexes to drop */ - ulint num_created) /*!< in: number of elements in index[] */ + trx_t* trx, /*!< in/out: dictionary transaction */ + dict_table_t* table, /*!< in/out: table containing the indexes */ + ibool locked) /*!< in: TRUE=table locked, + FALSE=may need to do a lazy drop */ { - ulint key_num; + dict_index_t* index; + dict_index_t* next_index; + + ut_ad(!srv_read_only_mode); + ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + index = dict_table_get_first_index(table); + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_get_online_status(index) == ONLINE_INDEX_COMPLETE); + + /* the caller should have an open handle to the table */ + ut_ad(table->n_ref_count >= 1); + + /* It is possible that table->n_ref_count > 1 when + locked=TRUE. In this case, all code that should have an open + handle to the table be waiting for the next statement to execute, + or waiting for a meta-data lock. + + A concurrent purge will be prevented by dict_operation_lock. */ + + if (!locked && table->n_ref_count > 1) { + /* We will have to drop the indexes later, when the + table is guaranteed to be no longer in use. Mark the + indexes as incomplete and corrupted, so that other + threads will stop using them. Let dict_table_close() + or crash recovery or the next invocation of + prepare_inplace_alter_table() take care of dropping + the indexes. 
*/ + + while ((index = dict_table_get_next_index(index)) != NULL) { + ut_ad(!dict_index_is_clust(index)); + + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_ABORTED_DROPPED: + continue; + case ONLINE_INDEX_COMPLETE: + if (*index->name != TEMP_INDEX_PREFIX) { + /* Do nothing to already + published indexes. */ + } else if (index->type & DICT_FTS) { + /* Drop a completed FULLTEXT + index, due to a timeout during + MDL upgrade for + commit_inplace_alter_table(). + Because only concurrent reads + are allowed (and they are not + seeing this index yet) we + are safe to drop the index. */ + dict_index_t* prev = UT_LIST_GET_PREV( + indexes, index); + /* At least there should be + the clustered index before + this one. */ + ut_ad(prev); + ut_a(table->fts); + fts_drop_index(table, index, trx); + /* Since + INNOBASE_SHARE::idx_trans_tbl + is shared between all open + ha_innobase handles to this + table, no thread should be + accessing this dict_index_t + object. Also, we should be + holding LOCK=SHARED MDL on the + table even after the MDL + upgrade timeout. */ + + /* We can remove a DICT_FTS + index from the cache, because + we do not allow ADD FULLTEXT INDEX + with LOCK=NONE. If we allowed that, + we should exclude FTS entries from + prebuilt->ins_node->entry_list + in ins_node_create_entry_list(). 
*/ + dict_index_remove_from_cache( + table, index); + index = prev; + } else { + rw_lock_x_lock( + dict_index_get_lock(index)); + dict_index_set_online_status( + index, ONLINE_INDEX_ABORTED); + index->type |= DICT_CORRUPT; + table->drop_aborted = TRUE; + goto drop_aborted; + } + continue; + case ONLINE_INDEX_CREATION: + rw_lock_x_lock(dict_index_get_lock(index)); + ut_ad(*index->name == TEMP_INDEX_PREFIX); + row_log_abort_sec(index); + drop_aborted: + rw_lock_x_unlock(dict_index_get_lock(index)); + + DEBUG_SYNC_C("merge_drop_index_after_abort"); + /* covered by dict_sys->mutex */ + MONITOR_INC(MONITOR_BACKGROUND_DROP_INDEX); + /* fall through */ + case ONLINE_INDEX_ABORTED: + /* Drop the index tree from the + data dictionary and free it from + the tablespace, but keep the object + in the data dictionary cache. */ + row_merge_drop_index_dict(trx, index->id); + rw_lock_x_lock(dict_index_get_lock(index)); + dict_index_set_online_status( + index, ONLINE_INDEX_ABORTED_DROPPED); + rw_lock_x_unlock(dict_index_get_lock(index)); + table->drop_aborted = TRUE; + continue; + } + ut_error; + } - for (key_num = 0; key_num < num_created; key_num++) { - row_merge_drop_index(index[key_num], table, trx); + return; } + + row_merge_drop_indexes_dict(trx, table->id); + + /* Invalidate all row_prebuilt_t::ins_graph that are referring + to this table. That is, force row_get_prebuilt_insert_row() to + rebuild prebuilt->ins_node->entry_list). 
*/ + ut_ad(table->def_trx_id <= trx->id); + table->def_trx_id = trx->id; + + next_index = dict_table_get_next_index(index); + + while ((index = next_index) != NULL) { + /* read the next pointer before freeing the index */ + next_index = dict_table_get_next_index(index); + + ut_ad(!dict_index_is_clust(index)); + + if (*index->name == TEMP_INDEX_PREFIX) { + /* If it is FTS index, drop from table->fts + and also drop its auxiliary tables */ + if (index->type & DICT_FTS) { + ut_a(table->fts); + fts_drop_index(table, index, trx); + } + + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_CREATION: + /* This state should only be possible + when prepare_inplace_alter_table() fails + after invoking row_merge_create_index(). + In inplace_alter_table(), + row_merge_build_indexes() + should never leave the index in this state. + It would invoke row_log_abort_sec() on + failure. */ + case ONLINE_INDEX_COMPLETE: + /* In these cases, we are able to drop + the index straight. The DROP INDEX was + never deferred. 
*/ + break; + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + /* covered by dict_sys->mutex */ + MONITOR_DEC(MONITOR_BACKGROUND_DROP_INDEX); + } + + dict_index_remove_from_cache(table, index); + } + } + + table->drop_aborted = FALSE; + ut_d(dict_table_check_for_dup_indexes(table, CHECK_ALL_COMPLETE)); } /*********************************************************************//** @@ -2363,9 +2817,32 @@ void row_merge_drop_temp_indexes(void) /*=============================*/ { - trx_t* trx; - btr_pcur_t pcur; - mtr_t mtr; + static const char sql[] = + "PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n" + "ixid CHAR;\n" + "found INT;\n" + + "DECLARE CURSOR index_cur IS\n" + " SELECT ID FROM SYS_INDEXES\n" + " WHERE SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n" + "FOR UPDATE;\n" + + "BEGIN\n" + "found := 1;\n" + "OPEN index_cur;\n" + "WHILE found = 1 LOOP\n" + " FETCH index_cur INTO ixid;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE\n" + " DELETE FROM SYS_FIELDS WHERE INDEX_ID=ixid;\n" + " DELETE FROM SYS_INDEXES WHERE CURRENT OF index_cur;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE index_cur;\n" + "END;\n"; + trx_t* trx; + dberr_t error; /* Load the table definitions that contain partially defined indexes, so that the data dictionary information can be checked @@ -2373,75 +2850,26 @@ row_merge_drop_temp_indexes(void) trx = trx_allocate_for_background(); trx->op_info = "dropping partially created indexes"; row_mysql_lock_data_dictionary(trx); + /* Ensure that this transaction will be rolled back and locks + will be released, if the server gets killed before the commit + gets written to the redo log. 
*/ + trx_set_dict_operation(trx, TRX_DICT_OP_INDEX); - mtr_start(&mtr); - - btr_pcur_open_at_index_side( - TRUE, - dict_table_get_first_index(dict_sys->sys_indexes), - BTR_SEARCH_LEAF, &pcur, TRUE, &mtr); - - for (;;) { - const rec_t* rec; - const byte* field; - ulint len; - table_id_t table_id; - dict_table_t* table; - - btr_pcur_move_to_next_user_rec(&pcur, &mtr); - - if (!btr_pcur_is_on_user_rec(&pcur)) { - break; - } - - rec = btr_pcur_get_rec(&pcur); - field = rec_get_nth_field_old( - rec, DICT_FLD__SYS_INDEXES__NAME, &len); - if (len == UNIV_SQL_NULL || len == 0 - || (char) *field != TEMP_INDEX_PREFIX) { - continue; - } - - /* This is a temporary index. */ - - field = rec_get_nth_field_old( - rec, DICT_FLD__SYS_INDEXES__TABLE_ID, &len); - if (len != 8) { - /* Corrupted TABLE_ID */ - continue; - } - - table_id = mach_read_from_8(field); - - btr_pcur_store_position(&pcur, &mtr); - btr_pcur_commit_specify_mtr(&pcur, &mtr); - - table = dict_table_open_on_id(table_id, TRUE); + trx->op_info = "dropping indexes"; + error = que_eval_sql(NULL, sql, FALSE, trx); - if (table) { - dict_index_t* index; - dict_index_t* next_index; - - for (index = dict_table_get_first_index(table); - index; index = next_index) { - - next_index = dict_table_get_next_index(index); - - if (*index->name == TEMP_INDEX_PREFIX) { - row_merge_drop_index(index, table, trx); - trx_commit_for_mysql(trx); - } - } - - dict_table_close(table, TRUE); - } + if (error != DB_SUCCESS) { + /* Even though we ensure that DDL transactions are WAIT + and DEADLOCK free, we could encounter other errors e.g., + DB_TOO_MANY_CONCURRENT_TRXS. 
*/ + trx->error_state = DB_SUCCESS; - mtr_start(&mtr); - btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr); + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error: row_merge_drop_temp_indexes " + "failed with error code: %u.\n", (unsigned) error); } - btr_pcur_close(&pcur); - mtr_commit(&mtr); + trx_commit_for_mysql(trx); row_mysql_unlock_data_dictionary(trx); trx_free_for_background(trx); } @@ -2449,8 +2877,8 @@ row_merge_drop_temp_indexes(void) /*********************************************************************//** Creates temporary merge files, and if UNIV_PFS_IO defined, register the file descriptor with Performance Schema. -@return File descriptor */ -UNIV_INLINE +@return file descriptor, or -1 on failure */ +UNIV_INTERN int row_merge_file_create_low(void) /*===========================*/ @@ -2469,31 +2897,43 @@ row_merge_file_create_low(void) #endif fd = innobase_mysql_tmpfile(); #ifdef UNIV_PFS_IO - register_pfs_file_open_end(locker, fd); + register_pfs_file_open_end(locker, fd); #endif + + if (fd < 0) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot create temporary merge file"); + return -1; + } return(fd); } /*********************************************************************//** -Create a merge file. */ +Create a merge file. +@return file descriptor, or -1 on failure */ UNIV_INTERN -void +int row_merge_file_create( /*==================*/ merge_file_t* merge_file) /*!< out: merge file structure */ { merge_file->fd = row_merge_file_create_low(); - if (srv_disable_sort_file_cache) { - os_file_set_nocache(merge_file->fd, "row0merge.c", "sort"); - } merge_file->offset = 0; merge_file->n_rec = 0; + + if (merge_file->fd >= 0) { + if (srv_disable_sort_file_cache) { + os_file_set_nocache(merge_file->fd, + "row0merge.cc", "sort"); + } + } + return(merge_file->fd); } /*********************************************************************//** Destroy a merge file. And de-register the file from Performance Schema if UNIV_PFS_IO is defined. 
*/ -UNIV_INLINE +UNIV_INTERN void row_merge_file_destroy_low( /*=======================*/ @@ -2506,7 +2946,9 @@ row_merge_file_destroy_low( fd, 0, PSI_FILE_CLOSE, __FILE__, __LINE__); #endif - close(fd); + if (fd >= 0) { + close(fd); + } #ifdef UNIV_PFS_IO register_pfs_file_io_end(locker, 0); #endif @@ -2517,8 +2959,10 @@ UNIV_INTERN void row_merge_file_destroy( /*===================*/ - merge_file_t* merge_file) /*!< out: merge file structure */ + merge_file_t* merge_file) /*!< in/out: merge file structure */ { + ut_ad(!srv_read_only_mode); + if (merge_file->fd != -1) { row_merge_file_destroy_low(merge_file->fd); merge_file->fd = -1; @@ -2526,173 +2970,109 @@ row_merge_file_destroy( } /*********************************************************************//** -Determine the precise type of a column that is added to a tem -if a column must be constrained NOT NULL. -@return col->prtype, possibly ORed with DATA_NOT_NULL */ -UNIV_INLINE -ulint -row_merge_col_prtype( -/*=================*/ - const dict_col_t* col, /*!< in: column */ - const char* col_name, /*!< in: name of the column */ - const merge_index_def_t*index_def) /*!< in: the index definition - of the primary key */ -{ - ulint prtype = col->prtype; - ulint i; - - ut_ad(index_def->ind_type & DICT_CLUSTERED); - - if (prtype & DATA_NOT_NULL) { - - return(prtype); - } - - /* All columns that are included - in the PRIMARY KEY must be NOT NULL. */ - - for (i = 0; i < index_def->n_fields; i++) { - if (!strcmp(col_name, index_def->fields[i].field_name)) { - return(prtype | DATA_NOT_NULL); - } - } - - return(prtype); -} - -/*********************************************************************//** -Create a temporary table for creating a primary key, using the definition -of an existing table. -@return table, or NULL on error */ +Rename an index in the dictionary that was created. The data +dictionary must have been locked exclusively by the caller, because +the transaction will not be committed. 
+@return DB_SUCCESS if all OK */ UNIV_INTERN -dict_table_t* -row_merge_create_temporary_table( -/*=============================*/ - const char* table_name, /*!< in: new table name */ - const merge_index_def_t*index_def, /*!< in: the index definition - of the primary key */ - const dict_table_t* table, /*!< in: old table definition */ - trx_t* trx) /*!< in/out: transaction - (sets error_state) */ +dberr_t +row_merge_rename_index_to_add( +/*==========================*/ + trx_t* trx, /*!< in/out: transaction */ + table_id_t table_id, /*!< in: table identifier */ + index_id_t index_id) /*!< in: index identifier */ { - ulint i; - dict_table_t* new_table = NULL; - ulint n_cols = dict_table_get_n_user_cols(table); - ulint error; - mem_heap_t* heap = mem_heap_create(1000); - ulint num_col; - - ut_ad(table_name); - ut_ad(index_def); - ut_ad(table); - ut_ad(mutex_own(&dict_sys->mutex)); - - num_col = DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID) - ? n_cols + 1 - : n_cols; - - new_table = dict_mem_table_create( - table_name, 0, num_col, table->flags, table->flags2); - - for (i = 0; i < n_cols; i++) { - const dict_col_t* col; - const char* col_name; + dberr_t err = DB_SUCCESS; + pars_info_t* info = pars_info_create(); - col = dict_table_get_nth_col(table, i); - col_name = dict_table_get_col_name(table, i); + /* We use the private SQL parser of Innobase to generate the + query graphs needed in renaming indexes. 
*/ - dict_mem_table_add_col(new_table, heap, col_name, col->mtype, - row_merge_col_prtype(col, col_name, - index_def), - col->len); - } + static const char rename_index[] = + "PROCEDURE RENAME_INDEX_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n" + "WHERE TABLE_ID = :tableid AND ID = :indexid;\n" + "END;\n"; - /* Add the FTS doc_id hidden column */ - if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) { - fts_add_doc_id_column(new_table); - new_table->fts->doc_col = n_cols; - } + ut_ad(trx); + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); - error = row_create_table_for_mysql(new_table, trx); - mem_heap_free(heap); + trx->op_info = "renaming index to add"; - if (error != DB_SUCCESS) { - trx->error_state = static_cast<enum db_err>(error); - new_table = NULL; - } else { - dict_table_t* temp_table; + pars_info_add_ull_literal(info, "tableid", table_id); + pars_info_add_ull_literal(info, "indexid", index_id); - /* We need to bump up the table ref count and before we can - use it we need to open the table. */ + err = que_eval_sql(info, rename_index, FALSE, trx); - temp_table = dict_table_open_on_name_no_stats( - new_table->name, TRUE, DICT_ERR_IGNORE_NONE); + if (err != DB_SUCCESS) { + /* Even though we ensure that DDL transactions are WAIT + and DEADLOCK free, we could encounter other errors e.g., + DB_TOO_MANY_CONCURRENT_TRXS. */ + trx->error_state = DB_SUCCESS; - ut_a(new_table == temp_table); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: row_merge_rename_index_to_add " + "failed with error code: %u.\n", (unsigned) err); } - return(new_table); + trx->op_info = ""; + + return(err); } /*********************************************************************//** -Rename the temporary indexes in the dictionary to permanent ones. The -data dictionary must have been locked exclusively by the caller, -because the transaction will not be committed. 
+Rename an index in the dictionary that is to be dropped. The data +dictionary must have been locked exclusively by the caller, because +the transaction will not be committed. @return DB_SUCCESS if all OK */ UNIV_INTERN -ulint -row_merge_rename_indexes( -/*=====================*/ +dberr_t +row_merge_rename_index_to_drop( +/*===========================*/ trx_t* trx, /*!< in/out: transaction */ - dict_table_t* table) /*!< in/out: table with new indexes */ + table_id_t table_id, /*!< in: table identifier */ + index_id_t index_id) /*!< in: index identifier */ { - db_err err = DB_SUCCESS; + dberr_t err; pars_info_t* info = pars_info_create(); + ut_ad(!srv_read_only_mode); + /* We use the private SQL parser of Innobase to generate the query graphs needed in renaming indexes. */ - static const char* sql = - "PROCEDURE RENAME_INDEXES_PROC () IS\n" + static const char rename_index[] = + "PROCEDURE RENAME_INDEX_PROC () IS\n" "BEGIN\n" - "UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n" - "WHERE TABLE_ID = :tableid AND SUBSTR(NAME,0,1)='" - TEMP_INDEX_PREFIX_STR "';\n" + "UPDATE SYS_INDEXES SET NAME=CONCAT('" + TEMP_INDEX_PREFIX_STR "',NAME)\n" + "WHERE TABLE_ID = :tableid AND ID = :indexid;\n" "END;\n"; - ut_ad(table); ut_ad(trx); ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); - trx->op_info = "renaming indexes"; + trx->op_info = "renaming index to drop"; - pars_info_add_ull_literal(info, "tableid", table->id); + pars_info_add_ull_literal(info, "tableid", table_id); + pars_info_add_ull_literal(info, "indexid", index_id); - err = static_cast<db_err>(que_eval_sql(info, sql, FALSE, trx)); + err = que_eval_sql(info, rename_index, FALSE, trx); - DBUG_EXECUTE_IF( - "ib_rename_indexes_too_many_concurrent_trxs", - err = DB_TOO_MANY_CONCURRENT_TRXS; - trx->error_state = static_cast<db_err>(err);); - - if (err == DB_SUCCESS) { - dict_index_t* index = dict_table_get_first_index(table); - do { - if (*index->name 
== TEMP_INDEX_PREFIX) { - index->name++; - } - index = dict_table_get_next_index(index); - } while (index); - } else { + if (err != DB_SUCCESS) { /* Even though we ensure that DDL transactions are WAIT and DEADLOCK free, we could encounter other errors e.g., - DB_TOO_MANY_TRANSACTIONS. */ - + DB_TOO_MANY_CONCURRENT_TRXS. */ trx->error_state = DB_SUCCESS; ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error: row_merge_rename_indexes " - "failed with error code: %lu.\n", (ulint) err); + fprintf(stderr, + " InnoDB: Error: row_merge_rename_index_to_drop " + "failed with error code: %u.\n", (unsigned) err); } trx->op_info = ""; @@ -2701,12 +3081,39 @@ row_merge_rename_indexes( } /*********************************************************************//** +Provide a new pathname for a table that is being renamed if it belongs to +a file-per-table tablespace. The caller is responsible for freeing the +memory allocated for the return value. +@return new pathname of tablespace file, or NULL if space = 0 */ +UNIV_INTERN +char* +row_make_new_pathname( +/*==================*/ + dict_table_t* table, /*!< in: table to be renamed */ + const char* new_name) /*!< in: new name */ +{ + char* new_path; + char* old_path; + + ut_ad(table->space != TRX_SYS_SPACE); + + old_path = fil_space_get_first_path(table->space); + ut_a(old_path); + + new_path = os_file_make_new_pathname(old_path, new_name); + + mem_free(old_path); + + return(new_path); +} + +/*********************************************************************//** Rename the tables in the data dictionary. The data dictionary must have been locked exclusively by the caller, because the transaction will not be committed. 
@return error code or DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t row_merge_rename_tables( /*====================*/ dict_table_t* old_table, /*!< in/out: old table, renamed to @@ -2716,28 +3123,32 @@ row_merge_rename_tables( const char* tmp_name, /*!< in: new name for old_table */ trx_t* trx) /*!< in: transaction handle */ { - ulint err = DB_ERROR; + dberr_t err = DB_ERROR; pars_info_t* info; char old_name[MAX_FULL_NAME_LEN + 1]; + ut_ad(!srv_read_only_mode); ut_ad(old_table != new_table); ut_ad(mutex_own(&dict_sys->mutex)); - ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_TABLE); /* store the old/current name to an automatic variable */ if (strlen(old_table->name) + 1 <= sizeof(old_name)) { memcpy(old_name, old_table->name, strlen(old_table->name) + 1); } else { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: too long table name: '%s', " - "max length is %d\n", old_table->name, - MAX_FULL_NAME_LEN); + ib_logf(IB_LOG_LEVEL_ERROR, + "Too long table name: '%s', max length is %d", + old_table->name, MAX_FULL_NAME_LEN); ut_error; } trx->op_info = "renaming tables"; + DBUG_EXECUTE_IF( + "ib_rebuild_cannot_rename", + err = DB_ERROR; goto err_exit;); + /* We use the private SQL parser of Innobase to generate the query graphs needed in updating the dictionary data in system tables. */ @@ -2756,21 +3167,124 @@ row_merge_rename_tables( " WHERE NAME = :new_name;\n" "END;\n", FALSE, trx); - if (err != DB_SUCCESS) { + /* Update SYS_TABLESPACES and SYS_DATAFILES if the old + table is in a non-system tablespace where space > 0. */ + if (err == DB_SUCCESS + && old_table->space != TRX_SYS_SPACE + && !old_table->ibd_file_missing) { + /* Make pathname to update SYS_DATAFILES. 
*/ + char* tmp_path = row_make_new_pathname(old_table, tmp_name); + + info = pars_info_create(); + + pars_info_add_str_literal(info, "tmp_name", tmp_name); + pars_info_add_str_literal(info, "tmp_path", tmp_path); + pars_info_add_int4_literal(info, "old_space", + (lint) old_table->space); + + err = que_eval_sql(info, + "PROCEDURE RENAME_OLD_SPACE () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLESPACES" + " SET NAME = :tmp_name\n" + " WHERE SPACE = :old_space;\n" + "UPDATE SYS_DATAFILES" + " SET PATH = :tmp_path\n" + " WHERE SPACE = :old_space;\n" + "END;\n", FALSE, trx); + + mem_free(tmp_path); + } + + /* Update SYS_TABLESPACES and SYS_DATAFILES if the new + table is in a non-system tablespace where space > 0. */ + if (err == DB_SUCCESS && new_table->space != TRX_SYS_SPACE) { + /* Make pathname to update SYS_DATAFILES. */ + char* old_path = row_make_new_pathname(new_table, old_name); + + info = pars_info_create(); + + pars_info_add_str_literal(info, "old_name", old_name); + pars_info_add_str_literal(info, "old_path", old_path); + pars_info_add_int4_literal(info, "new_space", + (lint) new_table->space); + + err = que_eval_sql(info, + "PROCEDURE RENAME_NEW_SPACE () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLESPACES" + " SET NAME = :old_name\n" + " WHERE SPACE = :new_space;\n" + "UPDATE SYS_DATAFILES" + " SET PATH = :old_path\n" + " WHERE SPACE = :new_space;\n" + "END;\n", FALSE, trx); + + mem_free(old_path); + } + if (err != DB_SUCCESS) { goto err_exit; } + /* Generate the redo logs for file operations */ + fil_mtr_rename_log(old_table->space, old_name, + new_table->space, new_table->name, tmp_name); + + /* What if the redo logs are flushed to disk here? This is + tested with following crash point */ + DBUG_EXECUTE_IF("bug14669848_precommit", log_buffer_flush_to_disk(); + DBUG_SUICIDE();); + + /* File operations cannot be rolled back. 
So, before proceeding + with file operations, commit the dictionary changes.*/ + trx_commit_for_mysql(trx); + + /* If server crashes here, the dictionary in InnoDB and MySQL + will differ. The .ibd files and the .frm files must be swapped + manually by the administrator. No loss of data. */ + DBUG_EXECUTE_IF("bug14669848", DBUG_SUICIDE();); + + /* Ensure that the redo logs are flushed to disk. The config + innodb_flush_log_at_trx_commit must not affect this. */ + log_buffer_flush_to_disk(); + /* The following calls will also rename the .ibd data files if the tables are stored in a single-table tablespace */ - if (!dict_table_rename_in_cache(old_table, tmp_name, FALSE) - || !dict_table_rename_in_cache(new_table, old_name, FALSE)) { + err = dict_table_rename_in_cache(old_table, tmp_name, FALSE); - err = DB_ERROR; - goto err_exit; + if (err == DB_SUCCESS) { + + ut_ad(dict_table_is_discarded(old_table) + == dict_table_is_discarded(new_table)); + + err = dict_table_rename_in_cache(new_table, old_name, FALSE); + + if (err != DB_SUCCESS) { + + if (dict_table_rename_in_cache( + old_table, old_name, FALSE) + != DB_SUCCESS) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot undo the rename in cache " + "from %s to %s", old_name, tmp_name); + } + + goto err_exit; + } + + if (dict_table_is_discarded(new_table)) { + + err = row_import_update_discarded_flag( + trx, new_table->id, true, true); + } } + DBUG_EXECUTE_IF("ib_rebuild_cannot_load_fk", + err = DB_ERROR; goto err_exit;); + err = dict_load_foreigns(old_name, FALSE, TRUE); if (err != DB_SUCCESS) { @@ -2788,8 +3302,8 @@ err_exit: /*********************************************************************//** Create and execute a query graph for creating an index. 
@return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_merge_create_index_graph( /*=========================*/ trx_t* trx, /*!< in: trx */ @@ -2799,7 +3313,7 @@ row_merge_create_index_graph( ind_node_t* node; /*!< Index creation node */ mem_heap_t* heap; /*!< Memory heap */ que_thr_t* thr; /*!< Query thread */ - ulint err; + dberr_t err; ut_ad(trx); ut_ad(table); @@ -2808,7 +3322,7 @@ row_merge_create_index_graph( heap = mem_heap_create(512); index->table = table; - node = ind_create_graph_create(index, heap); + node = ind_create_graph_create(index, heap, false); thr = pars_complete_graph_for_exec(node, trx, heap); ut_a(thr == que_fork_start_command( @@ -2832,14 +3346,16 @@ row_merge_create_index( /*===================*/ trx_t* trx, /*!< in/out: trx (sets error_state) */ dict_table_t* table, /*!< in: the index is on this table */ - const merge_index_def_t*index_def) + const index_def_t* index_def) /*!< in: the index definition */ { dict_index_t* index; - ulint err; + dberr_t err; ulint n_fields = index_def->n_fields; ulint i; + ut_ad(!srv_read_only_mode); + /* Create the index prototype, using the passed in def, this is not a persistent operation. We pass 0 as the space id, and determine at a lower level the space id where to store the table. */ @@ -2850,10 +3366,11 @@ row_merge_create_index( ut_a(index); for (i = 0; i < n_fields; i++) { - merge_index_field_t* ifield = &index_def->fields[i]; + index_field_t* ifield = &index_def->fields[i]; - dict_mem_index_add_field(index, ifield->field_name, - ifield->prefix_len); + dict_mem_index_add_field( + index, dict_table_get_col_name(table, ifield->col_no), + ifield->prefix_len); } /* Add the index to SYS_INDEXES, using the index prototype. 
*/ @@ -2861,15 +3378,14 @@ row_merge_create_index( if (err == DB_SUCCESS) { - index = row_merge_dict_table_get_index( - table, index_def); + index = dict_table_get_index_on_name(table, index_def->name); ut_a(index); /* Note the id of the transaction that created this index, we use it to restrict readers from accessing this index, to ensure read consistency. */ - index->trx_id = trx->id; + ut_ad(index->trx_id == trx->id); } else { index = NULL; } @@ -2886,35 +3402,46 @@ row_merge_is_index_usable( const trx_t* trx, /*!< in: transaction */ const dict_index_t* index) /*!< in: index to check */ { + if (!dict_index_is_clust(index) + && dict_index_is_online_ddl(index)) { + /* Indexes that are being created are not useable. */ + return(FALSE); + } + return(!dict_index_is_corrupted(index) - && (!trx->read_view - || read_view_sees_trx_id(trx->read_view, index->trx_id))); + && (dict_table_is_temporary(index->table) + || !trx->read_view + || read_view_sees_trx_id(trx->read_view, index->trx_id))); } /*********************************************************************//** -Drop the old table. +Drop a table. The caller must have ensured that the background stats +thread is not processing the table. This can be done by calling +dict_stats_wait_bg_to_stop_using_tables() after locking the dictionary and +before calling this function. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t row_merge_drop_table( /*=================*/ trx_t* trx, /*!< in: transaction */ dict_table_t* table) /*!< in: table to drop */ { + ut_ad(!srv_read_only_mode); + /* There must be no open transactions on the table. 
*/ ut_a(table->n_ref_count == 0); - return(row_drop_table_for_mysql(table->name, trx, FALSE)); + return(row_drop_table_for_mysql(table->name, trx, false, false)); } - /*********************************************************************//** Build indexes on a table by reading a clustered index, creating a temporary file containing index entries, merge sorting these index entries and inserting sorted index entries to indexes. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t row_merge_build_indexes( /*====================*/ trx_t* trx, /*!< in: transaction */ @@ -2923,45 +3450,62 @@ row_merge_build_indexes( dict_table_t* new_table, /*!< in: table where indexes are created; identical to old_table unless creating a PRIMARY KEY */ + bool online, /*!< in: true if creating indexes + online */ dict_index_t** indexes, /*!< in: indexes to be created */ + const ulint* key_numbers, /*!< in: MySQL key numbers */ ulint n_indexes, /*!< in: size of indexes[] */ - struct TABLE* table) /*!< in/out: MySQL table, for + struct TABLE* table, /*!< in/out: MySQL table, for reporting erroneous key value if applicable */ + const dtuple_t* add_cols, /*!< in: default values of + added columns, or NULL */ + const ulint* col_map, /*!< in: mapping of old column + numbers to new ones, or NULL + if old_table == new_table */ + ulint add_autoinc, /*!< in: number of added + AUTO_INCREMENT column, or + ULINT_UNDEFINED if none is added */ + ib_sequence_t& sequence) /*!< in: autoinc instance if + add_autoinc != ULINT_UNDEFINED */ { merge_file_t* merge_files; row_merge_block_t* block; ulint block_size; ulint i; ulint j; - ulint error; + dberr_t error; int tmpfd; dict_index_t* fts_sort_idx = NULL; fts_psort_t* psort_info = NULL; fts_psort_t* merge_info = NULL; ib_int64_t sig_count = 0; - ut_ad(trx); - ut_ad(old_table); - ut_ad(new_table); - ut_ad(indexes); - ut_ad(n_indexes); - - trx_start_if_not_started_xa(trx); + ut_ad(!srv_read_only_mode); + ut_ad((old_table == new_table) == !col_map); 
+ ut_ad(!add_cols || col_map); /* Allocate memory for merge file data structure and initialize fields */ - merge_files = static_cast<merge_file_t*>( - mem_alloc(n_indexes * sizeof *merge_files)); - block_size = 3 * srv_sort_buf_size; block = static_cast<row_merge_block_t*>( os_mem_alloc_large(&block_size)); - for (i = 0; i < n_indexes; i++) { + if (block == NULL) { + return(DB_OUT_OF_MEMORY); + } + + trx_start_if_not_started_xa(trx); - row_merge_file_create(&merge_files[i]); + merge_files = static_cast<merge_file_t*>( + mem_alloc(n_indexes * sizeof *merge_files)); + + for (i = 0; i < n_indexes; i++) { + if (row_merge_file_create(&merge_files[i]) < 0) { + error = DB_OUT_OF_MEMORY; + goto func_exit; + } if (indexes[i]->type & DICT_FTS) { ibool opt_doc_id_size = FALSE; @@ -2971,17 +3515,28 @@ row_merge_build_indexes( we need to build a "fts sort index" indexing on above three 'fields' */ fts_sort_idx = row_merge_create_fts_sort_index( - indexes[i], old_table, - &opt_doc_id_size); - - row_fts_psort_info_init(trx, table, new_table, - fts_sort_idx, opt_doc_id_size, - &psort_info, &merge_info); + indexes[i], old_table, &opt_doc_id_size); + + row_merge_dup_t* dup = static_cast<row_merge_dup_t*>( + ut_malloc(sizeof *dup)); + dup->index = fts_sort_idx; + dup->table = table; + dup->col_map = col_map; + dup->n_dup = 0; + + row_fts_psort_info_init( + trx, dup, new_table, opt_doc_id_size, + &psort_info, &merge_info); } } tmpfd = row_merge_file_create_low(); + if (tmpfd < 0) { + error = DB_OUT_OF_MEMORY; + goto func_exit; + } + /* Reset the MySQL row buffer that is used when reporting duplicate keys. 
*/ innobase_rec_reset(table); @@ -2990,31 +3545,61 @@ row_merge_build_indexes( secondary index entries for merge sort */ error = row_merge_read_clustered_index( - trx, table, old_table, new_table, indexes, - fts_sort_idx, psort_info, merge_files, n_indexes, block); + trx, table, old_table, new_table, online, indexes, + fts_sort_idx, psort_info, merge_files, key_numbers, + n_indexes, add_cols, col_map, + add_autoinc, sequence, block); if (error != DB_SUCCESS) { goto func_exit; } + DEBUG_SYNC_C("row_merge_after_scan"); + /* Now we have files containing index entries ready for sorting and inserting. */ for (i = 0; i < n_indexes; i++) { - dict_index_t* sort_idx; - - sort_idx = (indexes[i]->type & DICT_FTS) - ? fts_sort_idx - : indexes[i]; + dict_index_t* sort_idx = indexes[i]; if (indexes[i]->type & DICT_FTS) { os_event_t fts_parallel_merge_event; + bool all_exit = false; + ulint trial_count = 0; + + sort_idx = fts_sort_idx; + + /* Now all children should complete, wait + a bit until they all finish using event */ + while (!all_exit && trial_count < 10000) { + all_exit = true; + + for (j = 0; j < fts_sort_pll_degree; + j++) { + if (psort_info[j].child_status + != FTS_CHILD_EXITING) { + all_exit = false; + os_thread_sleep(1000); + break; + } + } + trial_count++; + } + + if (!all_exit) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Not all child sort threads exited" + " when creating FTS index '%s'", + indexes[i]->name); + } fts_parallel_merge_event - = merge_info[0].psort_common->sort_event; + = merge_info[0].psort_common->merge_event; if (FTS_PLL_MERGE) { + trial_count = 0; + all_exit = false; os_event_reset(fts_parallel_merge_event); row_fts_start_parallel_merge(merge_info); wait_again: @@ -3024,33 +3609,64 @@ wait_again: for (j = 0; j < FTS_NUM_AUX_INDEX; j++) { if (merge_info[j].child_status - != FTS_CHILD_COMPLETE) { + != FTS_CHILD_COMPLETE + && merge_info[j].child_status + != FTS_CHILD_EXITING) { sig_count = os_event_reset( fts_parallel_merge_event); goto wait_again; } } + + 
/* Now all children should complete, wait + a bit until they all finish using event */ + while (!all_exit && trial_count < 10000) { + all_exit = true; + + for (j = 0; j < FTS_NUM_AUX_INDEX; + j++) { + if (merge_info[j].child_status + != FTS_CHILD_EXITING) { + all_exit = false; + os_thread_sleep(1000); + break; + } + } + trial_count++; + } + + if (!all_exit) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Not all child merge threads" + " exited when creating FTS" + " index '%s'", + indexes[i]->name); + } } else { + /* This cannot report duplicates; an + assertion would fail in that case. */ error = row_fts_merge_insert( sort_idx, new_table, psort_info, 0); } +#ifdef FTS_INTERNAL_DIAG_PRINT + DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Insert\n"); +#endif } else { - error = row_merge_sort(trx, sort_idx, &merge_files[i], - block, &tmpfd, table); + row_merge_dup_t dup = { + sort_idx, table, col_map, 0}; + + error = row_merge_sort( + trx, &dup, &merge_files[i], + block, &tmpfd); if (error == DB_SUCCESS) { error = row_merge_insert_index_tuples( - trx, sort_idx, new_table, - dict_table_zip_size(old_table), + trx->id, sort_idx, old_table, merge_files[i].fd, block); } - -#ifdef FTS_INTERNAL_DIAG_PRINT - DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Insert\n"); -#endif } /* Close the temporary file to free up space. */ @@ -3058,10 +3674,20 @@ wait_again: if (indexes[i]->type & DICT_FTS) { row_fts_psort_info_destroy(psort_info, merge_info); + } else if (error != DB_SUCCESS || !online) { + /* Do not apply any online log. 
*/ + } else if (old_table != new_table) { + ut_ad(!sort_idx->online_log); + ut_ad(sort_idx->online_status + == ONLINE_INDEX_COMPLETE); + } else { + DEBUG_SYNC_C("row_log_apply_before"); + error = row_log_apply(trx, sort_idx, table); + DEBUG_SYNC_C("row_log_apply_after"); } if (error != DB_SUCCESS) { - trx->error_key_num = i; + trx->error_key_num = key_numbers[i]; goto func_exit; } @@ -3082,7 +3708,7 @@ func_exit: DBUG_EXECUTE_IF( "ib_build_indexes_too_many_concurrent_trxs", error = DB_TOO_MANY_CONCURRENT_TRXS; - trx->error_state = static_cast<db_err>(error);); + trx->error_state = error;); row_merge_file_destroy_low(tmpfd); @@ -3097,5 +3723,45 @@ func_exit: mem_free(merge_files); os_mem_free_large(block, block_size); + DICT_TF2_FLAG_UNSET(new_table, DICT_TF2_FTS_ADD_DOC_ID); + + if (online && old_table == new_table && error != DB_SUCCESS) { + /* On error, flag all online secondary index creation + as aborted. */ + for (i = 0; i < n_indexes; i++) { + ut_ad(!(indexes[i]->type & DICT_FTS)); + ut_ad(*indexes[i]->name == TEMP_INDEX_PREFIX); + ut_ad(!dict_index_is_clust(indexes[i])); + + /* Completed indexes should be dropped as + well, and indexes whose creation was aborted + should be dropped from the persistent + storage. However, at this point we can only + set some flags in the not-yet-published + indexes. These indexes will be dropped later + in row_merge_drop_indexes(), called by + rollback_inplace_alter_table(). 
*/ + + switch (dict_index_get_online_status(indexes[i])) { + case ONLINE_INDEX_COMPLETE: + break; + case ONLINE_INDEX_CREATION: + rw_lock_x_lock( + dict_index_get_lock(indexes[i])); + row_log_abort_sec(indexes[i]); + indexes[i]->type |= DICT_CORRUPT; + rw_lock_x_unlock( + dict_index_get_lock(indexes[i])); + new_table->drop_aborted = TRUE; + /* fall through */ + case ONLINE_INDEX_ABORTED_DROPPED: + case ONLINE_INDEX_ABORTED: + MONITOR_MUTEX_INC( + &dict_sys->mutex, + MONITOR_BACKGROUND_DROP_INDEX); + } + } + } + return(error); } diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc index f1811a664c2..f748bb4f60f 100644 --- a/storage/innobase/row/row0mysql.cc +++ b/storage/innobase/row/row0mysql.cc @@ -30,6 +30,9 @@ Created 9/17/2000 Heikki Tuuri #include "row0mysql.ic" #endif +#include <debug_sync.h> +#include <my_dbug.h> + #include "row0ins.h" #include "row0merge.h" #include "row0sel.h" @@ -42,6 +45,7 @@ Created 9/17/2000 Heikki Tuuri #include "dict0load.h" #include "dict0boot.h" #include "dict0stats.h" +#include "dict0stats_bg.h" #include "trx0roll.h" #include "trx0purge.h" #include "trx0rec.h" @@ -54,16 +58,16 @@ Created 9/17/2000 Heikki Tuuri #include "ibuf0ibuf.h" #include "fts0fts.h" #include "fts0types.h" -#include "srv0mon.h" +#include "srv0start.h" +#include "row0import.h" +#include "m_string.h" +#include "my_sys.h" /** Provide optional 4.x backwards compatibility for 5.0 and above */ UNIV_INTERN ibool row_rollback_on_timeout = FALSE; /** Chain node of the list of tables to drop in the background. */ -typedef struct row_mysql_drop_struct row_mysql_drop_t; - -/** Chain node of the list of tables to drop in the background. */ -struct row_mysql_drop_struct{ +struct row_mysql_drop_t{ char* table_name; /*!< table name */ UT_LIST_NODE_T(row_mysql_drop_t)row_mysql_drop_list; /*!< list chain node */ @@ -82,7 +86,7 @@ more. Protected by row_drop_list_mutex. 
*/ static UT_LIST_BASE_NODE_T(row_mysql_drop_t) row_mysql_drop_list; /** Mutex protecting the background table drop list. */ -static mutex_t row_drop_list_mutex; +static ib_mutex_t row_drop_list_mutex; /** Flag: has row_mysql_drop_list been initialized? */ static ibool row_mysql_drop_list_inited = FALSE; @@ -570,21 +574,21 @@ next_column: /****************************************************************//** Handles user errors and lock waits detected by the database engine. -@return TRUE if it was a lock wait and we should continue running the +@return true if it was a lock wait and we should continue running the query thread and in that case the thr is ALREADY in the running state. */ UNIV_INTERN -ibool +bool row_mysql_handle_errors( /*====================*/ - ulint* new_err,/*!< out: possible new error encountered in + dberr_t* new_err,/*!< out: possible new error encountered in lock wait, or if no new error, the value of trx->error_state at the entry of this function */ trx_t* trx, /*!< in: transaction */ - que_thr_t* thr, /*!< in: query thread */ - trx_savept_t* savept) /*!< in: savepoint or NULL */ + que_thr_t* thr, /*!< in: query thread, or NULL */ + trx_savept_t* savept) /*!< in: savepoint, or NULL */ { - ulint err; + dberr_t err; handle_new_error: err = trx->error_state; @@ -612,6 +616,7 @@ handle_new_error: case DB_READ_ONLY: case DB_FTS_INVALID_DOCID: case DB_INTERRUPTED: + case DB_DICT_CHANGED: if (savept) { /* Roll back the latest, possibly incomplete insertion or update */ @@ -631,7 +636,7 @@ handle_new_error: *new_err = err; - return(TRUE); + return(true); case DB_DEADLOCK: case DB_LOCK_TABLE_FULL: @@ -648,6 +653,7 @@ handle_new_error: " a new data file to\n" "InnoDB: my.cnf and restart the database.\n", stderr); + ut_ad(0); exit(1); case DB_CORRUPTION: @@ -686,7 +692,7 @@ handle_new_error: trx->error_state = DB_SUCCESS; - return(FALSE); + return(false); } /********************************************************************//** @@ -774,7 +780,7 @@ 
row_create_prebuilt( prebuilt->clust_ref = ref; - prebuilt->autoinc_error = 0; + prebuilt->autoinc_error = DB_SUCCESS; prebuilt->autoinc_offset = 0; /* Default to 1, we will set the actual value later in @@ -883,7 +889,7 @@ row_prebuilt_free( mem_free(base); } - dict_table_close(prebuilt->table, dict_locked); + dict_table_close(prebuilt->table, dict_locked, TRUE); mem_heap_free(prebuilt->heap); } @@ -950,44 +956,62 @@ row_get_prebuilt_insert_row( row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL handle */ { - ins_node_t* node; - dtuple_t* row; - dict_table_t* table = prebuilt->table; + dict_table_t* table = prebuilt->table; ut_ad(prebuilt && table && prebuilt->trx); - if (prebuilt->ins_node == NULL) { - - /* Not called before for this handle: create an insert node - and query graph to the prebuilt struct */ + if (prebuilt->ins_node != 0) { - node = ins_node_create(INS_DIRECT, table, prebuilt->heap); + /* Check if indexes have been dropped or added and we + may need to rebuild the row insert template. 
*/ - prebuilt->ins_node = node; + if (prebuilt->trx_id == table->def_trx_id + && UT_LIST_GET_LEN(prebuilt->ins_node->entry_list) + == UT_LIST_GET_LEN(table->indexes)) { - if (prebuilt->ins_upd_rec_buff == NULL) { - prebuilt->ins_upd_rec_buff = static_cast<byte*>( - mem_heap_alloc( - prebuilt->heap, - prebuilt->mysql_row_len)); + return(prebuilt->ins_node->row); } - row = dtuple_create(prebuilt->heap, - dict_table_get_n_cols(table)); + ut_ad(prebuilt->trx_id < table->def_trx_id); - dict_table_copy_types(row, table); + que_graph_free_recursive(prebuilt->ins_graph); - ins_node_set_new_row(node, row); + prebuilt->ins_graph = 0; + } - prebuilt->ins_graph = static_cast<que_fork_t*>( - que_node_get_parent( - pars_complete_graph_for_exec( - node, - prebuilt->trx, prebuilt->heap))); + /* Create an insert node and query graph to the prebuilt struct */ - prebuilt->ins_graph->state = QUE_FORK_ACTIVE; + ins_node_t* node; + + node = ins_node_create(INS_DIRECT, table, prebuilt->heap); + + prebuilt->ins_node = node; + + if (prebuilt->ins_upd_rec_buff == 0) { + prebuilt->ins_upd_rec_buff = static_cast<byte*>( + mem_heap_alloc( + prebuilt->heap, + prebuilt->mysql_row_len)); } + dtuple_t* row; + + row = dtuple_create(prebuilt->heap, dict_table_get_n_cols(table)); + + dict_table_copy_types(row, table); + + ins_node_set_new_row(node, row); + + prebuilt->ins_graph = static_cast<que_fork_t*>( + que_node_get_parent( + pars_complete_graph_for_exec( + node, + prebuilt->trx, prebuilt->heap))); + + prebuilt->ins_graph->state = QUE_FORK_ACTIVE; + + prebuilt->trx_id = table->def_trx_id; + return(prebuilt->ins_node->row); } @@ -1000,23 +1024,41 @@ row_update_statistics_if_needed( /*============================*/ dict_table_t* table) /*!< in: table */ { - ulint counter; + ib_uint64_t counter; + ib_uint64_t n_rows; + + if (!table->stat_initialized) { + DBUG_EXECUTE_IF( + "test_upd_stats_if_needed_not_inited", + fprintf(stderr, "test_upd_stats_if_needed_not_inited " + "was executed\n"); + ); + 
return; + } - counter = table->stat_modified_counter; + counter = table->stat_modified_counter++; + n_rows = dict_table_get_n_rows(table); - table->stat_modified_counter = counter + 1; + if (dict_stats_is_persistent_enabled(table)) { + if (counter > n_rows / 10 /* 10% */ + && dict_stats_auto_recalc_is_enabled(table)) { + + dict_stats_recalc_pool_add(table); + table->stat_modified_counter = 0; + } + return; + } /* Calculate new statistics if 1 / 16 of table has been modified - since the last time a statistics batch was run, or if - stat_modified_counter > 2 000 000 000 (to avoid wrap-around). + since the last time a statistics batch was run. We calculate statistics at most every 16th round, since we may have a counter table which is very small and updated very often. */ - if (counter > 2000000000 - || ((ib_int64_t) counter > 16 + table->stat_n_rows / 16)) { + if (counter > 16 + n_rows / 16 /* 6.25% */) { ut_ad(!mutex_own(&dict_sys->mutex)); - dict_stats_update(table, DICT_STATS_FETCH, FALSE); + /* this will reset table->stat_modified_counter to 0 */ + dict_stats_update(table, DICT_STATS_RECALC_TRANSIENT); } } @@ -1028,7 +1070,7 @@ It is not compatible with another AUTO_INC or exclusive lock on the table. 
@return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_lock_table_autoinc_for_mysql( /*=============================*/ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in the MySQL @@ -1038,7 +1080,7 @@ row_lock_table_autoinc_for_mysql( ins_node_t* node = prebuilt->ins_node; const dict_table_t* table = prebuilt->table; que_thr_t* thr; - ulint err; + dberr_t err; ibool was_lock_wait; ut_ad(trx); @@ -1053,10 +1095,8 @@ row_lock_table_autoinc_for_mysql( trx->op_info = "setting auto-inc lock"; - if (node == NULL) { - row_get_prebuilt_insert_row(prebuilt); - node = prebuilt->ins_node; - } + row_get_prebuilt_insert_row(prebuilt); + node = prebuilt->ins_node; /* We use the insert query graph as the dummy graph needed in the lock module call */ @@ -1076,7 +1116,7 @@ run_again: err = lock_table(0, prebuilt->table, LOCK_AUTO_INC, thr); - trx->error_state = static_cast<enum db_err>(err); + trx->error_state = err; if (err != DB_SUCCESS) { que_thr_stop_for_mysql(thr); @@ -1089,21 +1129,21 @@ run_again: trx->op_info = ""; - return((int) err); + return(err); } que_thr_stop_for_mysql_no_error(thr, trx); trx->op_info = ""; - return((int) err); + return(err); } /*********************************************************************//** Sets a table lock on the table mentioned in prebuilt. 
@return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_lock_table_for_mysql( /*=====================*/ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct in the MySQL @@ -1117,7 +1157,7 @@ row_lock_table_for_mysql( { trx_t* trx = prebuilt->trx; que_thr_t* thr; - ulint err; + dberr_t err; ibool was_lock_wait; ut_ad(trx); @@ -1157,7 +1197,7 @@ run_again: thr); } - trx->error_state = static_cast<enum db_err>(err); + trx->error_state = err; if (err != DB_SUCCESS) { que_thr_stop_for_mysql(thr); @@ -1170,21 +1210,21 @@ run_again: trx->op_info = ""; - return((int) err); + return(err); } que_thr_stop_for_mysql_no_error(thr, trx); trx->op_info = ""; - return((int) err); + return(err); } /*********************************************************************//** Does an insert for MySQL. @return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_insert_for_mysql( /*=================*/ byte* mysql_rec, /*!< in: row in the MySQL format */ @@ -1193,7 +1233,7 @@ row_insert_for_mysql( { trx_savept_t savept; que_thr_t* thr; - ulint err; + dberr_t err; ibool was_lock_wait; trx_t* trx = prebuilt->trx; ins_node_t* node = prebuilt->ins_node; @@ -1201,24 +1241,23 @@ row_insert_for_mysql( ut_ad(trx); - if (table->ibd_file_missing) { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error:\n" - "InnoDB: MySQL is trying to use a table handle" - " but the .ibd file for\n" - "InnoDB: table %s does not exist.\n" - "InnoDB: Have you deleted the .ibd file" - " from the database directory under\n" - "InnoDB: the MySQL datadir, or have you" - " used DISCARD TABLESPACE?\n" - "InnoDB: Look from\n" - "InnoDB: " REFMAN "innodb-troubleshooting.html\n" - "InnoDB: how you can resolve the problem.\n", + if (dict_table_is_discarded(prebuilt->table)) { + ib_logf(IB_LOG_LEVEL_ERROR, + "The table %s doesn't have a corresponding " + "tablespace, it was discarded.", prebuilt->table->name); - return(DB_ERROR); - } - if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) { + 
return(DB_TABLESPACE_DELETED); + + } else if (prebuilt->table->ibd_file_missing) { + + ib_logf(IB_LOG_LEVEL_ERROR, + ".ibd file is missing for table %s", + prebuilt->table->name); + + return(DB_TABLESPACE_NOT_FOUND); + + } else if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) { fprintf(stderr, "InnoDB: Error: trying to free a corrupt\n" "InnoDB: table handle. Magic n %lu, table name ", @@ -1229,9 +1268,7 @@ row_insert_for_mysql( mem_analyze_corruption(prebuilt); ut_error; - } - - if (UNIV_UNLIKELY(srv_created_new_raw || srv_force_recovery)) { + } else if (srv_created_new_raw || srv_force_recovery) { fputs("InnoDB: A new raw disk partition was initialized or\n" "InnoDB: innodb_force_recovery is on: we do not allow\n" "InnoDB: database modifications by the user. Shut down\n" @@ -1249,10 +1286,8 @@ row_insert_for_mysql( trx_start_if_not_started_xa(trx); - if (node == NULL) { - row_get_prebuilt_insert_row(prebuilt); - node = prebuilt->ins_node; - } + row_get_prebuilt_insert_row(prebuilt); + node = prebuilt->ins_node; row_mysql_convert_row_to_innobase(node->row, prebuilt, mysql_rec); @@ -1290,12 +1325,14 @@ error_exit: thr->lock_state = QUE_THR_LOCK_NOLOCK; if (was_lock_wait) { + ut_ad(node->state == INS_NODE_INSERT_ENTRIES + || node->state == INS_NODE_ALLOC_ROW_ID); goto run_again; } trx->op_info = ""; - return((int) err); + return(err); } if (dict_table_has_fts_index(table)) { @@ -1353,19 +1390,18 @@ error_exit: que_thr_stop_for_mysql_no_error(thr, trx); - table->stat_n_rows++; + srv_stats.n_rows_inserted.add((size_t)trx->id, 1); - srv_n_rows_inserted++; - - if (prebuilt->table->stat_n_rows == 0) { - /* Avoid wrap-over */ - table->stat_n_rows--; - } + /* Not protected by dict_table_stats_lock() for performance + reasons, we would rather get garbage in stat_n_rows (which is + just an estimate anyway) than protecting the following code + with a latch. 
*/ + dict_table_n_rows_inc(table); row_update_statistics_if_needed(table); trx->op_info = ""; - return((int) err); + return(err); } /*********************************************************************//** @@ -1490,7 +1526,7 @@ row_fts_do_update( Handles FTS matters for an update or a delete. NOTE: should not be called if the table does not have an FTS index. .*/ static -ulint +dberr_t row_fts_update_or_delete( /*=====================*/ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL @@ -1530,16 +1566,18 @@ void init_fts_doc_id_for_ref( /*====================*/ dict_table_t* table, /*!< in: table */ - ulint depth) /*!< in: recusive call depth */ + ulint* depth) /*!< in: recusive call depth */ { dict_foreign_t* foreign; foreign = UT_LIST_GET_FIRST(table->referenced_list); - depth++; + table->fk_max_recusive_level = 0; + + (*depth)++; /* Limit on tables involved in cascading delete/update */ - if (depth > FK_MAX_CASCADE_DEL) { + if (*depth > FK_MAX_CASCADE_DEL) { return; } @@ -1563,7 +1601,7 @@ init_fts_doc_id_for_ref( Does an update or delete of a row for MySQL. 
@return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_update_for_mysql( /*=================*/ byte* mysql_rec, /*!< in: the row to be updated, in @@ -1572,7 +1610,7 @@ row_update_for_mysql( handle */ { trx_savept_t savept; - ulint err; + dberr_t err; que_thr_t* thr; ibool was_lock_wait; dict_index_t* clust_index; @@ -1580,6 +1618,7 @@ row_update_for_mysql( upd_node_t* node; dict_table_t* table = prebuilt->table; trx_t* trx = prebuilt->trx; + ulint fk_depth = 0; ut_ad(prebuilt && trx); UT_NOT_USED(mysql_rec); @@ -1626,14 +1665,26 @@ row_update_for_mysql( return(DB_ERROR); } + DEBUG_SYNC_C("innodb_row_update_for_mysql_begin"); + trx->op_info = "updating or deleting"; row_mysql_delay_if_needed(); - init_fts_doc_id_for_ref(table, 0); - trx_start_if_not_started_xa(trx); + if (dict_table_is_referenced_by_foreign_key(table)) { + /* Share lock the data dictionary to prevent any + table dictionary (for foreign constraint) change. + This is similar to row_ins_check_foreign_constraint + check protect by the dictionary lock as well. 
+ In the future, this can be removed once the Foreign + key MDL is implemented */ + row_mysql_freeze_data_dictionary(trx); + init_fts_doc_id_for_ref(table, &fk_depth); + row_mysql_unfreeze_data_dictionary(trx); + } + node = prebuilt->upd_node; clust_index = dict_table_get_first_index(table); @@ -1683,10 +1734,13 @@ run_again: trx->error_state = DB_SUCCESS; trx->op_info = ""; - return((int) err); + return(err); } thr->lock_state= QUE_THR_LOCK_ROW; + + DEBUG_SYNC(trx->mysql_thd, "row_update_for_mysql_error"); + was_lock_wait = row_mysql_handle_errors(&err, trx, thr, &savept); thr->lock_state= QUE_THR_LOCK_NOLOCK; @@ -1697,7 +1751,7 @@ run_again: trx->op_info = ""; - return((int) err); + return(err); } que_thr_stop_for_mysql_no_error(thr, trx); @@ -1707,18 +1761,20 @@ run_again: err = row_fts_update_or_delete(prebuilt); if (err != DB_SUCCESS) { trx->op_info = ""; - return((int) err); + return(err); } } if (node->is_delete) { - if (prebuilt->table->stat_n_rows > 0) { - prebuilt->table->stat_n_rows--; - } + /* Not protected by dict_table_stats_lock() for performance + reasons, we would rather get garbage in stat_n_rows (which is + just an estimate anyway) than protecting the following code + with a latch. */ + dict_table_n_rows_dec(prebuilt->table); - srv_n_rows_deleted++; + srv_stats.n_rows_deleted.add((size_t)trx->id, 1); } else { - srv_n_rows_updated++; + srv_stats.n_rows_updated.add((size_t)trx->id, 1); } /* We update table statistics only if it is a DELETE or UPDATE @@ -1730,7 +1786,7 @@ run_again: trx->op_info = ""; - return((int) err); + return(err); } /*********************************************************************//** @@ -1744,7 +1800,7 @@ prebuilt->clust_pcur. Thus, this implements a 'mini-rollback' that releases the latest clustered index record lock we set. 
@return error code or DB_SUCCESS */ UNIV_INTERN -int +void row_unlock_for_mysql( /*=================*/ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct in MySQL @@ -1770,8 +1826,7 @@ row_unlock_for_mysql( "InnoDB: innodb_locks_unsafe_for_binlog is FALSE and\n" "InnoDB: this session is not using" " READ COMMITTED isolation level.\n"); - - return(DB_SUCCESS); + return; } trx->op_info = "unlock_row"; @@ -1863,15 +1918,13 @@ no_unlock: } trx->op_info = ""; - - return(DB_SUCCESS); } /**********************************************************************//** Does a cascaded delete or set null in a foreign key operation. @return error code or DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t row_update_cascade_for_mysql( /*=========================*/ que_thr_t* thr, /*!< in: query thread */ @@ -1879,7 +1932,7 @@ row_update_cascade_for_mysql( or set null operation */ dict_table_t* table) /*!< in: table where we do the operation */ { - ulint err; + dberr_t err; trx_t* trx; trx = thr_get_trx(thr); @@ -1890,12 +1943,14 @@ row_update_cascade_for_mysql( thr->fk_cascade_depth++; if (thr->fk_cascade_depth > FK_MAX_CASCADE_DEL) { - return (DB_FOREIGN_EXCEED_MAX_CASCADE); + return(DB_FOREIGN_EXCEED_MAX_CASCADE); } run_again: thr->run_node = node; thr->prev_node = node; + DEBUG_SYNC_C("foreign_constraint_update_cascade"); + row_upd_step(thr); /* The recursive call for cascading update/delete happens @@ -1937,13 +1992,15 @@ run_again: } if (node->is_delete) { - if (table->stat_n_rows > 0) { - table->stat_n_rows--; - } + /* Not protected by dict_table_stats_lock() for performance + reasons, we would rather get garbage in stat_n_rows (which is + just an estimate anyway) than protecting the following code + with a latch. 
*/ + dict_table_n_rows_dec(table); - srv_n_rows_deleted++; + srv_stats.n_rows_deleted.add((size_t)trx->id, 1); } else { - srv_n_rows_updated++; + srv_stats.n_rows_updated.add((size_t)trx->id, 1); } row_update_statistics_if_needed(table); @@ -1981,7 +2038,7 @@ row_mysql_freeze_data_dictionary_func( { ut_a(trx->dict_operation_lock_mode == 0); - rw_lock_s_lock_func(&dict_operation_lock, 0, file, line); + rw_lock_s_lock_inline(&dict_operation_lock, 0, file, line); trx->dict_operation_lock_mode = RW_S_LATCH; } @@ -1994,6 +2051,8 @@ row_mysql_unfreeze_data_dictionary( /*===============================*/ trx_t* trx) /*!< in/out: transaction */ { + ut_ad(lock_trx_has_sys_table_locks(trx) == NULL); + ut_a(trx->dict_operation_lock_mode == RW_S_LATCH); rw_lock_s_unlock(&dict_operation_lock); @@ -2018,7 +2077,7 @@ row_mysql_lock_data_dictionary_func( /* Serialize data dictionary operations with dictionary mutex: no deadlocks or lock waits can occur then in these operations */ - rw_lock_x_lock_func(&dict_operation_lock, 0, file, line); + rw_lock_x_lock_inline(&dict_operation_lock, 0, file, line); trx->dict_operation_lock_mode = RW_X_LATCH; mutex_enter(&(dict_sys->mutex)); @@ -2032,6 +2091,8 @@ row_mysql_unlock_data_dictionary( /*=============================*/ trx_t* trx) /*!< in/out: transaction */ { + ut_ad(lock_trx_has_sys_table_locks(trx) == NULL); + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); /* Serialize data dictionary operations with dictionary mutex: @@ -2052,19 +2113,21 @@ InnoDB will try to invoke mem_validate(). On failure the transaction will be rolled back and the 'table' object will be freed. 
@return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_create_table_for_mysql( /*=======================*/ dict_table_t* table, /*!< in, own: table definition - (will be freed) */ - trx_t* trx) /*!< in: transaction handle */ + (will be freed, or on DB_SUCCESS + added to the data dictionary cache) */ + trx_t* trx, /*!< in/out: transaction */ + bool commit) /*!< in: if true, commit the transaction */ { tab_node_t* node; mem_heap_t* heap; que_thr_t* thr; const char* table_name; ulint table_name_len; - ulint err; + dberr_t err; #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); @@ -2072,6 +2135,11 @@ row_create_table_for_mysql( ut_ad(mutex_own(&(dict_sys->mutex))); ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + DBUG_EXECUTE_IF( + "ib_create_table_fail_at_start_of_row_create_table_for_mysql", + goto err_exit; + ); + if (srv_created_new_raw) { fputs("InnoDB: A new raw disk partition was initialized:\n" "InnoDB: we do not allow database modifications" @@ -2080,7 +2148,10 @@ row_create_table_for_mysql( " is replaced with raw.\n", stderr); err_exit: dict_mem_table_free(table); - trx_commit_for_mysql(trx); + + if (commit) { + trx_commit_for_mysql(trx); + } return(DB_ERROR); } @@ -2117,23 +2188,23 @@ err_exit: /* The lock timeout monitor thread also takes care of InnoDB monitor prints */ - os_event_set(srv_timeout_event); + os_event_set(lock_sys->timeout_event); } else if (STR_EQ(table_name, table_name_len, S_innodb_lock_monitor)) { srv_print_innodb_monitor = TRUE; srv_print_innodb_lock_monitor = TRUE; - os_event_set(srv_timeout_event); + os_event_set(lock_sys->timeout_event); } else if (STR_EQ(table_name, table_name_len, S_innodb_tablespace_monitor)) { srv_print_innodb_tablespace_monitor = TRUE; - os_event_set(srv_timeout_event); + os_event_set(lock_sys->timeout_event); } else if (STR_EQ(table_name, table_name_len, S_innodb_table_monitor)) { srv_print_innodb_table_monitor = TRUE; - os_event_set(srv_timeout_event); + 
os_event_set(lock_sys->timeout_event); #ifdef UNIV_MEM_DEBUG } else if (STR_EQ(table_name, table_name_len, S_innodb_mem_validate)) { @@ -2152,12 +2223,21 @@ err_exit: #endif /* UNIV_MEM_DEBUG */ } - heap = mem_heap_create(512); - trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + switch (trx_get_dict_operation(trx)) { + case TRX_DICT_OP_NONE: + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + case TRX_DICT_OP_TABLE: + break; + case TRX_DICT_OP_INDEX: + /* If the transaction was previously flagged as + TRX_DICT_OP_INDEX, we should be creating auxiliary + tables for full-text indexes. */ + ut_ad(strstr(table->name, "/FTS_") != NULL); + } - node = tab_create_graph_create(table, heap); + node = tab_create_graph_create(table, heap, commit); thr = pars_complete_graph_for_exec(node, trx, heap); @@ -2168,6 +2248,29 @@ err_exit: err = trx->error_state; + if (table->space != TRX_SYS_SPACE) { + ut_a(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_USE_TABLESPACE)); + + /* Update SYS_TABLESPACES and SYS_DATAFILES if a new + tablespace was created. */ + if (err == DB_SUCCESS) { + char* path; + path = fil_space_get_first_path(table->space); + + err = dict_create_add_tablespace_to_dictionary( + table->space, table->name, + fil_space_get_flags(table->space), + path, trx, commit); + + mem_free(path); + } + + if (err != DB_SUCCESS) { + /* We must delete the link file. */ + fil_delete_link_file(table->name); + } + } + switch (err) { case DB_SUCCESS: break; @@ -2181,8 +2284,8 @@ err_exit: ut_print_name(stderr, trx, TRUE, table->name); fputs(" because tablespace full\n", stderr); - if (dict_table_open_on_name_no_stats( - table->name, FALSE, DICT_ERR_IGNORE_NONE)) { + if (dict_table_open_on_name(table->name, TRUE, FALSE, + DICT_ERR_IGNORE_NONE)) { /* Make things easy for the drop table code. 
*/ @@ -2190,10 +2293,13 @@ err_exit: dict_table_move_from_lru_to_non_lru(table); } - dict_table_close(table, FALSE); + dict_table_close(table, TRUE, FALSE); row_drop_table_for_mysql(table->name, trx, FALSE); - trx_commit_for_mysql(trx); + + if (commit) { + trx_commit_for_mysql(trx); + } } else { dict_mem_table_free(table); } @@ -2203,7 +2309,12 @@ err_exit: case DB_TOO_MANY_CONCURRENT_TRXS: /* We already have .ibd file here. it should be deleted. */ - if (table->space && !fil_delete_tablespace(table->space)) { + if (table->space + && fil_delete_tablespace( + table->space, + BUF_REMOVE_FLUSH_NO_WRITE) + != DB_SUCCESS) { + ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Error: not able to" @@ -2215,10 +2326,8 @@ err_exit: /* fall through */ case DB_DUPLICATE_KEY: + case DB_TABLESPACE_EXISTS: default: - /* We may also get err == DB_ERROR if the .ibd file for the - table already exists */ - trx->error_state = DB_SUCCESS; trx_rollback_to_savepoint(trx, NULL); dict_mem_table_free(table); @@ -2229,7 +2338,7 @@ err_exit: trx->op_info = ""; - return((int) err); + return(err); } /*********************************************************************//** @@ -2238,7 +2347,7 @@ to create an index results in dropping the whole table! This is no problem currently as all indexes must be created at the same time as the table. 
@return error number or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_create_index_for_mysql( /*=======================*/ dict_index_t* index, /*!< in, own: index definition @@ -2254,13 +2363,13 @@ row_create_index_for_mysql( ind_node_t* node; mem_heap_t* heap; que_thr_t* thr; - ulint err; + dberr_t err; ulint i; ulint len; char* table_name; char* index_name; dict_table_t* table; - ibool is_fts = FALSE; + ibool is_fts; #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); @@ -2277,8 +2386,8 @@ row_create_index_for_mysql( is_fts = (index->type == DICT_FTS); - table = dict_table_open_on_name_no_stats(table_name, TRUE, - DICT_ERR_IGNORE_NONE); + table = dict_table_open_on_name(table_name, TRUE, TRUE, + DICT_ERR_IGNORE_NONE); trx_start_if_not_started_xa(trx); @@ -2292,6 +2401,11 @@ row_create_index_for_mysql( len = ut_max(len, field_lengths[i]); } + DBUG_EXECUTE_IF( + "ib_create_table_fail_at_create_index", + len = DICT_MAX_FIELD_LEN_BY_FORMAT(table) + 1; + ); + /* Column or prefix length exceeds maximum column length */ if (len > (ulint) DICT_MAX_FIELD_LEN_BY_FORMAT(table)) { err = DB_TOO_BIG_INDEX_COL; @@ -2308,7 +2422,7 @@ row_create_index_for_mysql( /* Note that the space id where we store the index is inherited from the table in dict_build_index_def_step() in dict0crea.cc. */ - node = ind_create_graph_create(index, heap); + node = ind_create_graph_create(index, heap, true); thr = pars_complete_graph_for_exec(node, trx, heap); @@ -2332,7 +2446,7 @@ row_create_index_for_mysql( } error_handling: - dict_table_close(table, TRUE); + dict_table_close(table, TRUE, FALSE); if (err != DB_SUCCESS) { /* We have special error handling here */ @@ -2353,7 +2467,7 @@ error_handling: mem_free(table_name); mem_free(index_name); - return((int) err); + return(err); } /*********************************************************************//** @@ -2366,7 +2480,7 @@ fields than mentioned in the constraint. 
Check also that foreign key constraints which reference this table are ok. @return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_table_add_foreign_constraints( /*==============================*/ trx_t* trx, /*!< in: transaction */ @@ -2383,7 +2497,7 @@ row_table_add_foreign_constraints( code DB_CANNOT_ADD_CONSTRAINT if any foreign keys are found. */ { - ulint err; + dberr_t err; ut_ad(mutex_own(&(dict_sys->mutex))); #ifdef UNIV_SYNC_DEBUG @@ -2399,6 +2513,12 @@ row_table_add_foreign_constraints( err = dict_create_foreign_constraints(trx, sql_string, sql_length, name, reject_fks); + + DBUG_EXECUTE_IF("ib_table_add_foreign_fail", + err = DB_DUPLICATE_KEY;); + + DEBUG_SYNC_C("table_add_foreign_constraints"); + if (err == DB_SUCCESS) { /* Check that also referencing constraints are ok */ err = dict_load_foreigns(name, FALSE, TRUE); @@ -2418,7 +2538,7 @@ row_table_add_foreign_constraints( trx->error_state = DB_SUCCESS; } - return((int) err); + return(err); } /*********************************************************************//** @@ -2430,12 +2550,12 @@ as a background operation, which is taken care of by the master thread in srv0srv.cc. 
@return error code or DB_SUCCESS */ static -int +dberr_t row_drop_table_for_mysql_in_background( /*===================================*/ const char* name) /*!< in: table name */ { - ulint error; + dberr_t error; trx_t* trx; trx = trx_allocate_for_background(); @@ -2464,7 +2584,7 @@ row_drop_table_for_mysql_in_background( trx_free_for_background(trx); - return((int) error); + return(error); } /*********************************************************************//** @@ -2498,8 +2618,8 @@ loop: return(n_tables + n_tables_dropped); } - table = dict_table_open_on_name_no_stats(drop->table_name, FALSE, - DICT_ERR_IGNORE_NONE); + table = dict_table_open_on_name(drop->table_name, FALSE, FALSE, + DICT_ERR_IGNORE_NONE); if (table == NULL) { /* If for some reason the table has already been dropped @@ -2510,7 +2630,7 @@ loop: ut_a(!table->can_be_evicted); - dict_table_close(table, FALSE); + dict_table_close(table, FALSE, FALSE); if (DB_SUCCESS != row_drop_table_for_mysql_in_background( drop->table_name)) { @@ -2617,356 +2737,429 @@ row_add_table_to_background_drop_list( } /*********************************************************************//** -Discards the tablespace of a table which stored in an .ibd file. Discarding -means that this function deletes the .ibd file and assigns a new table id for -the table. Also the flag table->ibd_file_missing is set TRUE. +Reassigns the table identifier of a table. 
@return error code or DB_SUCCESS */ UNIV_INTERN -int -row_discard_tablespace_for_mysql( -/*=============================*/ - const char* name, /*!< in: table name */ - trx_t* trx) /*!< in: transaction handle */ +dberr_t +row_mysql_table_id_reassign( +/*========================*/ + dict_table_t* table, /*!< in/out: table */ + trx_t* trx, /*!< in/out: transaction */ + table_id_t* new_id) /*!< out: new table id */ { - dict_foreign_t* foreign; - table_id_t new_id; - dict_table_t* table; - ibool success; - ulint err; - pars_info_t* info = NULL; + dberr_t err; + pars_info_t* info = pars_info_create(); - /* How do we prevent crashes caused by ongoing operations on - the table? Old operations could try to access non-existent - pages. + dict_hdr_get_new_id(new_id, NULL, NULL); - 1) SQL queries, INSERT, SELECT, ...: we must get an exclusive - MySQL table lock on the table before we can do DISCARD - TABLESPACE. Then there are no running queries on the table. + /* Remove all locks except the table-level S and X locks. */ + lock_remove_all_on_table(table, FALSE); - 2) Purge and rollback: we assign a new table id for the - table. Since purge and rollback look for the table based on - the table id, they see the table as 'dropped' and discard - their operations. + pars_info_add_ull_literal(info, "old_id", table->id); + pars_info_add_ull_literal(info, "new_id", *new_id); + + err = que_eval_sql( + info, + "PROCEDURE RENUMBER_TABLE_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES SET ID = :new_id\n" + " WHERE ID = :old_id;\n" + "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n" + " WHERE TABLE_ID = :old_id;\n" + "UPDATE SYS_INDEXES SET TABLE_ID = :new_id\n" + " WHERE TABLE_ID = :old_id;\n" + "END;\n", FALSE, trx); - 3) Insert buffer: we remove all entries for the tablespace in - the insert buffer tree; as long as the tablespace mem object - does not exist, ongoing insert buffer page merges are - discarded in buf0rea.cc. 
If we recreate the tablespace mem - object with IMPORT TABLESPACE later, then the tablespace will - have the same id, but the tablespace_version field in the mem - object is different, and ongoing old insert buffer page merges - get discarded. + return(err); +} - 4) Linear readahead and random readahead: we use the same - method as in 3) to discard ongoing operations. +/*********************************************************************//** +Setup the pre-requisites for DISCARD TABLESPACE. It will start the transaction, +acquire the data dictionary lock in X mode and open the table. +@return table instance or 0 if not found. */ +static +dict_table_t* +row_discard_tablespace_begin( +/*=========================*/ + const char* name, /*!< in: table name */ + trx_t* trx) /*!< in: transaction handle */ +{ + trx->op_info = "discarding tablespace"; - 5) FOREIGN KEY operations: if - table->n_foreign_key_checks_running > 0, we do not allow the - discard. We also reserve the data dictionary latch. 
*/ + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); - trx->op_info = "discarding tablespace"; trx_start_if_not_started_xa(trx); /* Serialize data dictionary operations with dictionary mutex: - no deadlocks can occur then in these operations */ + this is to avoid deadlocks during data dictionary operations */ row_mysql_lock_data_dictionary(trx); - table = dict_table_open_on_name_no_stats(name, TRUE, - DICT_ERR_IGNORE_NONE); - - if (!table) { - err = DB_TABLE_NOT_FOUND; - - goto funct_exit; - } + dict_table_t* table; - if (table->space == 0) { - ut_print_timestamp(stderr); - fputs(" InnoDB: Error: table ", stderr); - ut_print_name(stderr, trx, TRUE, name); - fputs("\n" - "InnoDB: is in the system tablespace 0" - " which cannot be discarded\n", stderr); - err = DB_ERROR; + table = dict_table_open_on_name( + name, TRUE, FALSE, DICT_ERR_IGNORE_NONE); - goto funct_exit; + if (table) { + dict_stats_wait_bg_to_stop_using_tables(table, NULL, trx); + ut_a(table->space != TRX_SYS_SPACE); + ut_a(table->n_foreign_key_checks_running == 0); } - if (table->n_foreign_key_checks_running > 0) { - - ut_print_timestamp(stderr); - fputs(" InnoDB: You are trying to DISCARD table ", stderr); - ut_print_name(stderr, trx, TRUE, table->name); - fputs("\n" - "InnoDB: though there is a foreign key check" - " running on it.\n" - "InnoDB: Cannot discard the table.\n", - stderr); - - err = DB_ERROR; + return(table); +} - goto funct_exit; - } +/*********************************************************************//** +Do the foreign key constraint checks. +@return DB_SUCCESS or error code. 
*/ +static +dberr_t +row_discard_tablespace_foreign_key_checks( +/*======================================*/ + const trx_t* trx, /*!< in: transaction handle */ + const dict_table_t* table) /*!< in: table to be discarded */ +{ + const dict_foreign_t* foreign; /* Check if the table is referenced by foreign key constraints from some other table (not the table itself) */ - foreign = UT_LIST_GET_FIRST(table->referenced_list); + for (foreign = UT_LIST_GET_FIRST(table->referenced_list); + foreign && foreign->foreign_table == table; + foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) { - while (foreign && foreign->foreign_table == table) { - foreign = UT_LIST_GET_NEXT(referenced_list, foreign); } - if (foreign && trx->check_foreigns) { + if (!srv_read_only_mode && foreign && trx->check_foreigns) { FILE* ef = dict_foreign_err_file; /* We only allow discarding a referenced table if FOREIGN_KEY_CHECKS is set to 0 */ - err = DB_CANNOT_DROP_CONSTRAINT; - mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); fputs(" Cannot DISCARD table ", ef); - ut_print_name(stderr, trx, TRUE, name); + ut_print_name(stderr, trx, TRUE, table->name); fputs("\n" "because it is referenced by ", ef); ut_print_name(stderr, trx, TRUE, foreign->foreign_table_name); putc('\n', ef); + mutex_exit(&dict_foreign_err_mutex); - goto funct_exit; + return(DB_CANNOT_DROP_CONSTRAINT); } - dict_hdr_get_new_id(&new_id, NULL, NULL); + return(DB_SUCCESS); +} - /* Remove all locks except the table-level S and X locks. */ - lock_remove_all_on_table(table, FALSE); +/*********************************************************************//** +Cleanup after the DISCARD TABLESPACE operation. +@return error code. 
*/ +static +dberr_t +row_discard_tablespace_end( +/*=======================*/ + trx_t* trx, /*!< in/out: transaction handle */ + dict_table_t* table, /*!< in/out: table to be discarded */ + dberr_t err) /*!< in: error code */ +{ + if (table != 0) { + dict_table_close(table, TRUE, FALSE); + } - info = pars_info_create(); + DBUG_EXECUTE_IF("ib_discard_before_commit_crash", + log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE); + DBUG_SUICIDE();); - pars_info_add_str_literal(info, "table_name", name); - pars_info_add_ull_literal(info, "new_id", new_id); + trx_commit_for_mysql(trx); - err = que_eval_sql(info, - "PROCEDURE DISCARD_TABLESPACE_PROC () IS\n" - "old_id CHAR;\n" - "BEGIN\n" - "SELECT ID INTO old_id\n" - "FROM SYS_TABLES\n" - "WHERE NAME = :table_name\n" - "LOCK IN SHARE MODE;\n" - "IF (SQL % NOTFOUND) THEN\n" - " COMMIT WORK;\n" - " RETURN;\n" - "END IF;\n" - "UPDATE SYS_TABLES SET ID = :new_id\n" - " WHERE ID = old_id;\n" - "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n" - " WHERE TABLE_ID = old_id;\n" - "UPDATE SYS_INDEXES SET TABLE_ID = :new_id\n" - " WHERE TABLE_ID = old_id;\n" - "COMMIT WORK;\n" - "END;\n" - , FALSE, trx); + DBUG_EXECUTE_IF("ib_discard_after_commit_crash", + log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE); + DBUG_SUICIDE();); + + row_mysql_unlock_data_dictionary(trx); + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************//** +Do the DISCARD TABLESPACE operation. +@return DB_SUCCESS or error code. */ +static +dberr_t +row_discard_tablespace( +/*===================*/ + trx_t* trx, /*!< in/out: transaction handle */ + dict_table_t* table) /*!< in/out: table to be discarded */ +{ + dberr_t err; + + /* How do we prevent crashes caused by ongoing operations on + the table? Old operations could try to access non-existent + pages. MySQL will block all DML on the table using MDL and a + DISCARD will not start unless all existing operations on the + table to be discarded are completed. 
+ + 1) Acquire the data dictionary latch in X mode. To prevent any + internal operations that MySQL is not aware off and also for + the internal SQL parser. + + 2) Purge and rollback: we assign a new table id for the + table. Since purge and rollback look for the table based on + the table id, they see the table as 'dropped' and discard + their operations. + + 3) Insert buffer: we remove all entries for the tablespace in + the insert buffer tree. + + 4) FOREIGN KEY operations: if table->n_foreign_key_checks_running > 0, + we do not allow the discard. */ + + /* Play safe and remove all insert buffer entries, though we should + have removed them already when DISCARD TABLESPACE was called */ + + ibuf_delete_for_discarded_space(table->space); + + table_id_t new_id; + + /* Set the TABLESPACE DISCARD flag in the table definition on disk. */ + + err = row_import_update_discarded_flag(trx, table->id, true, true); if (err != DB_SUCCESS) { - trx->error_state = DB_SUCCESS; - trx_rollback_to_savepoint(trx, NULL); - trx->error_state = DB_SUCCESS; - } else { - dict_table_change_id_in_cache(table, new_id); + return(err); + } - success = fil_discard_tablespace(table->space); + /* Update the index root pages in the system tables, on disk */ - if (!success) { - trx->error_state = DB_SUCCESS; - trx_rollback_to_savepoint(trx, NULL); - trx->error_state = DB_SUCCESS; + err = row_import_update_index_root(trx, table, true, true); - err = DB_ERROR; - } else { - /* Set the flag which tells that now it is legal to - IMPORT a tablespace for this table */ - table->tablespace_discarded = TRUE; - table->ibd_file_missing = TRUE; - } + if (err != DB_SUCCESS) { + return(err); } -funct_exit: + /* Drop all the FTS auxiliary tables. 
*/ + if (dict_table_has_fts_index(table) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) { - if (table != NULL) { - dict_table_close(table, TRUE); + fts_drop_tables(trx, table); } - trx_commit_for_mysql(trx); + /* Assign a new space ID to the table definition so that purge + can ignore the changes. Update the system table on disk. */ - row_mysql_unlock_data_dictionary(trx); + err = row_mysql_table_id_reassign(table, trx, &new_id); - trx->op_info = ""; + if (err != DB_SUCCESS) { + return(err); + } - return((int) err); + /* Discard the physical file that is used for the tablespace. */ + + err = fil_discard_tablespace(table->space); + + switch(err) { + case DB_SUCCESS: + case DB_IO_ERROR: + case DB_TABLESPACE_NOT_FOUND: + /* All persistent operations successful, update the + data dictionary memory cache. */ + + table->ibd_file_missing = TRUE; + + table->flags2 |= DICT_TF2_DISCARDED; + + dict_table_change_id_in_cache(table, new_id); + + /* Reset the root page numbers. */ + + for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index != 0; + index = UT_LIST_GET_NEXT(indexes, index)) { + + index->page = FIL_NULL; + index->space = FIL_NULL; + } + + /* If the tablespace did not already exist or we couldn't + write to it, we treat that as a successful DISCARD. It is + unusable anyway. */ + + err = DB_SUCCESS; + break; + + default: + /* We need to rollback the disk changes, something failed. */ + + trx->error_state = DB_SUCCESS; + + trx_rollback_to_savepoint(trx, NULL); + + trx->error_state = DB_SUCCESS; + } + + return(err); } -/*****************************************************************//** -Imports a tablespace. The space id in the .ibd file must match the space id -of the table in the data dictionary. +/*********************************************************************//** +Discards the tablespace of a table which stored in an .ibd file. Discarding +means that this function renames the .ibd file and assigns a new table id for +the table. 
Also the flag table->ibd_file_missing is set to TRUE. @return error code or DB_SUCCESS */ UNIV_INTERN -int -row_import_tablespace_for_mysql( -/*============================*/ +dberr_t +row_discard_tablespace_for_mysql( +/*=============================*/ const char* name, /*!< in: table name */ trx_t* trx) /*!< in: transaction handle */ { + dberr_t err; dict_table_t* table; - ibool success; - lsn_t current_lsn; - ulint err = DB_SUCCESS; - trx_start_if_not_started_xa(trx); + /* Open the table and start the transaction if not started. */ - trx->op_info = "importing tablespace"; + table = row_discard_tablespace_begin(name, trx); - current_lsn = log_get_lsn(); + if (table == 0) { + err = DB_TABLE_NOT_FOUND; + } else if (table->space == TRX_SYS_SPACE) { + char table_name[MAX_FULL_NAME_LEN + 1]; - /* It is possible, though very improbable, that the lsn's in the - tablespace to be imported have risen above the current system lsn, if - a lengthy purge, ibuf merge, or rollback was performed on a backup - taken with ibbackup. If that is the case, reset page lsn's in the - file. We assume that mysqld was shut down after it performed these - cleanup operations on the .ibd file, so that it stamped the latest lsn - to the FIL_PAGE_FILE_FLUSH_LSN in the first page of the .ibd file. + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); - TODO: reset also the trx id's in clustered index records and write - a new space id to each data page. That would allow us to import clean - .ibd files from another MySQL installation. */ + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_IN_SYSTEM_TABLESPACE, table_name); - success = fil_reset_too_high_lsns(name, current_lsn); + err = DB_ERROR; - if (!success) { - ut_print_timestamp(stderr); - fputs(" InnoDB: Error: cannot reset lsn's in table ", stderr); - ut_print_name(stderr, trx, TRUE, name); - fputs("\n" - "InnoDB: in ALTER TABLE ... 
IMPORT TABLESPACE\n", - stderr); + } else if (table->n_foreign_key_checks_running > 0) { + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_DISCARD_FK_CHECKS_RUNNING, table_name); err = DB_ERROR; - row_mysql_lock_data_dictionary(trx); - table = NULL; + } else { + /* Do foreign key constraint checks. */ - goto funct_exit; - } + err = row_discard_tablespace_foreign_key_checks(trx, table); - /* Serialize data dictionary operations with dictionary mutex: - no deadlocks can occur then in these operations */ + if (err == DB_SUCCESS) { + err = row_discard_tablespace(trx, table); + } + } - row_mysql_lock_data_dictionary(trx); + return(row_discard_tablespace_end(trx, table, err)); +} - table = dict_table_open_on_name_no_stats(name, TRUE, - DICT_ERR_IGNORE_NONE); +/*********************************************************************//** +Sets an exclusive lock on a table. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_mysql_lock_table( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + dict_table_t* table, /*!< in: table to lock */ + enum lock_mode mode, /*!< in: LOCK_X or LOCK_S */ + const char* op_info) /*!< in: string for trx->op_info */ +{ + mem_heap_t* heap; + que_thr_t* thr; + dberr_t err; + sel_node_t* node; - if (!table) { - ut_print_timestamp(stderr); - fputs(" InnoDB: table ", stderr); - ut_print_name(stderr, trx, TRUE, name); - fputs("\n" - "InnoDB: does not exist in the InnoDB data dictionary\n" - "InnoDB: in ALTER TABLE ... 
IMPORT TABLESPACE\n", - stderr); + ut_ad(trx); + ut_ad(mode == LOCK_X || mode == LOCK_S); - err = DB_TABLE_NOT_FOUND; + heap = mem_heap_create(512); - goto funct_exit; - } + trx->op_info = op_info; - if (table->space == 0) { - ut_print_timestamp(stderr); - fputs(" InnoDB: Error: table ", stderr); - ut_print_name(stderr, trx, TRUE, name); - fputs("\n" - "InnoDB: is in the system tablespace 0" - " which cannot be imported\n", stderr); - err = DB_ERROR; + node = sel_node_create(heap); + thr = pars_complete_graph_for_exec(node, trx, heap); + thr->graph->state = QUE_FORK_ACTIVE; - goto funct_exit; - } + /* We use the select query graph as the dummy graph needed + in the lock module call */ - if (!table->tablespace_discarded) { - ut_print_timestamp(stderr); - fputs(" InnoDB: Error: you are trying to" - " IMPORT a tablespace\n" - "InnoDB: ", stderr); - ut_print_name(stderr, trx, TRUE, name); - fputs(", though you have not called DISCARD on it yet\n" - "InnoDB: during the lifetime of the mysqld process!\n", - stderr); + thr = que_fork_get_first_thr( + static_cast<que_fork_t*>(que_node_get_parent(thr))); - err = DB_ERROR; + que_thr_move_to_run_state_for_mysql(thr, trx); - goto funct_exit; - } +run_again: + thr->run_node = thr; + thr->prev_node = thr->common.parent; - /* Play safe and remove all insert buffer entries, though we should - have removed them already when DISCARD TABLESPACE was called */ + err = lock_table(0, table, mode, thr); - ibuf_delete_for_discarded_space(table->space); + trx->error_state = err; - success = fil_open_single_table_tablespace( - TRUE, table->space, - dict_tf_to_fsp_flags(table->flags), - table->name); - if (success) { - table->ibd_file_missing = FALSE; - table->tablespace_discarded = FALSE; + if (err == DB_SUCCESS) { + que_thr_stop_for_mysql_no_error(thr, trx); } else { - if (table->ibd_file_missing) { - ut_print_timestamp(stderr); - fputs(" InnoDB: cannot find or open in the" - " database directory the .ibd file of\n" - "InnoDB: table ", 
stderr); - ut_print_name(stderr, trx, TRUE, name); - fputs("\n" - "InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n", - stderr); - } + que_thr_stop_for_mysql(thr); - err = DB_ERROR; - } + if (err != DB_QUE_THR_SUSPENDED) { + ibool was_lock_wait; -funct_exit: + was_lock_wait = row_mysql_handle_errors( + &err, trx, thr, NULL); - if (table != NULL) { - dict_table_close(table, TRUE); - } + if (was_lock_wait) { + goto run_again; + } + } else { + que_thr_t* run_thr; + que_node_t* parent; - trx_commit_for_mysql(trx); + parent = que_node_get_parent(thr); - row_mysql_unlock_data_dictionary(trx); + run_thr = que_fork_start_command( + static_cast<que_fork_t*>(parent)); + + ut_a(run_thr == thr); + + /* There was a lock wait but the thread was not + in a ready to run or running state. */ + trx->error_state = DB_LOCK_WAIT; + goto run_again; + } + } + + que_graph_free(thr->graph); trx->op_info = ""; - return((int) err); + return(err); } /*********************************************************************//** Truncates a table for MySQL. @return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_truncate_table_for_mysql( /*=========================*/ dict_table_t* table, /*!< in: table handle */ trx_t* trx) /*!< in: transaction handle */ { dict_foreign_t* foreign; - ulint err; + dberr_t err; mem_heap_t* heap; byte* buf; dtuple_t* tuple; @@ -2978,17 +3171,15 @@ row_truncate_table_for_mysql( ulint recreate_space = 0; pars_info_t* info = NULL; ibool has_internal_doc_id; + ulint old_space = table->space; /* How do we prevent crashes caused by ongoing operations on the table? Old operations could try to access non-existent pages. 1) SQL queries, INSERT, SELECT, ...: we must get an exclusive - MySQL table lock on the table before we can do TRUNCATE - TABLE. Then there are no running queries on the table. This is - guaranteed, because in ha_innobase::store_lock(), we do not - weaken the TL_WRITE lock requested by MySQL when executing - SQLCOM_TRUNCATE. 
+ InnoDB table lock on the table before we can do TRUNCATE + TABLE. Then there are no running queries on the table. 2) Purge and rollback: we assign a new table id for the table. Since purge and rollback look for the table based on @@ -3031,9 +3222,15 @@ row_truncate_table_for_mysql( return(DB_ERROR); } - trx->op_info = "truncating table"; + if (dict_table_is_discarded(table)) { + return(DB_TABLESPACE_DELETED); + } else if (table->ibd_file_missing) { + return(DB_TABLESPACE_NOT_FOUND); + } - trx_start_if_not_started_xa(trx); + trx_start_for_ddl(trx, TRX_DICT_OP_TABLE); + + trx->op_info = "truncating table"; /* Serialize data dictionary operations with dictionary mutex: no deadlocks can occur then in these operations */ @@ -3049,16 +3246,22 @@ row_truncate_table_for_mysql( ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ + dict_stats_wait_bg_to_stop_using_tables(table, NULL, trx); + /* Check if the table is referenced by foreign key constraints from some other table (not the table itself) */ - foreign = UT_LIST_GET_FIRST(table->referenced_list); + for (foreign = UT_LIST_GET_FIRST(table->referenced_list); + foreign != 0 && foreign->foreign_table == table; + foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) { - while (foreign && foreign->foreign_table == table) { - foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + /* Do nothing. */ } - if (foreign && trx->check_foreigns) { + if (!srv_read_only_mode + && foreign + && trx->check_foreigns) { + FILE* ef = dict_foreign_err_file; /* We only allow truncating a referenced table if @@ -3099,19 +3302,41 @@ row_truncate_table_for_mysql( goto funct_exit; } - /* Remove all locks except the table-level S and X locks. */ + /* Remove all locks except the table-level X lock. */ lock_remove_all_on_table(table, FALSE); + /* Ensure that the table will be dropped by + trx_rollback_active() in case of a crash. 
*/ + trx->table_id = table->id; + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + + /* Assign an undo segment for the transaction, so that the + transaction will be recovered after a crash. */ + + mutex_enter(&trx->undo_mutex); + + err = trx_undo_assign_undo(trx, TRX_UNDO_UPDATE); + + mutex_exit(&trx->undo_mutex); + + if (err != DB_SUCCESS) { + + goto funct_exit; + } if (table->space && !table->dir_path_of_temp_table) { /* Discard and create the single-table tablespace. */ ulint space = table->space; ulint flags = fil_space_get_flags(space); + ut_a(!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_TEMPORARY)); + + dict_get_and_save_data_dir_path(table, true); + if (flags != ULINT_UNDEFINED - && fil_discard_tablespace(space)) { + && fil_discard_tablespace(space) == DB_SUCCESS) { dict_index_t* index; @@ -3124,15 +3349,18 @@ row_truncate_table_for_mysql( if (space == ULINT_UNDEFINED || fil_create_new_single_table_tablespace( - space, table->name, FALSE, + space, table->name, + table->data_dir_path, flags, table->flags2, - FIL_IBD_FILE_INITIAL_SIZE) != DB_SUCCESS) { + FIL_IBD_FILE_INITIAL_SIZE) + != DB_SUCCESS) { dict_table_x_unlock_indexes(table); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: TRUNCATE TABLE %s failed to" - " create a new tablespace\n", + + ib_logf(IB_LOG_LEVEL_ERROR, + "TRUNCATE TABLE %s failed to " + "create a new tablespace", table->name); + table->ibd_file_missing = 1; err = DB_ERROR; goto funct_exit; @@ -3240,7 +3468,6 @@ next_rec: mtr_commit(&mtr); mem_heap_free(heap); - /* Done with index truncation, release index tree locks, subsequent work relates to table level metadata change */ dict_table_x_unlock_indexes(table); @@ -3259,21 +3486,21 @@ next_rec: fts_table.name = table->name; fts_table.id = new_id; - err = fts_create_common_tables(trx, &fts_table, table->name, - TRUE); + err = fts_create_common_tables( + trx, &fts_table, table->name, TRUE); - if (err == DB_SUCCESS) { - for (i = 0; i < ib_vector_size(table->fts->indexes); - i++) { - 
dict_index_t* fts_index; + for (i = 0; + i < ib_vector_size(table->fts->indexes) + && err == DB_SUCCESS; + i++) { - fts_index = static_cast<dict_index_t*>( - ib_vector_getp( - table->fts->indexes, i)); + dict_index_t* fts_index; - fts_create_index_tables_low( - trx, fts_index, table->name, new_id); - } + fts_index = static_cast<dict_index_t*>( + ib_vector_getp(table->fts->indexes, i)); + + err = fts_create_index_tables_low( + trx, fts_index, table->name, new_id); } if (err != DB_SUCCESS) { @@ -3287,34 +3514,64 @@ next_rec: fputs("\n", stderr); goto funct_exit; + } else { + ut_ad(trx->state != TRX_STATE_NOT_STARTED); } } info = pars_info_create(); - pars_info_add_int4_literal(info, "space", (lint) table->space); + pars_info_add_int4_literal(info, "new_space", (lint) table->space); pars_info_add_ull_literal(info, "old_id", table->id); pars_info_add_ull_literal(info, "new_id", new_id); err = que_eval_sql(info, - "PROCEDURE RENUMBER_TABLESPACE_PROC () IS\n" + "PROCEDURE RENUMBER_TABLE_ID_PROC () IS\n" "BEGIN\n" "UPDATE SYS_TABLES" - " SET ID = :new_id, SPACE = :space\n" + " SET ID = :new_id, SPACE = :new_space\n" " WHERE ID = :old_id;\n" "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n" " WHERE TABLE_ID = :old_id;\n" "UPDATE SYS_INDEXES" - " SET TABLE_ID = :new_id, SPACE = :space\n" + " SET TABLE_ID = :new_id, SPACE = :new_space\n" " WHERE TABLE_ID = :old_id;\n" - "COMMIT WORK;\n" "END;\n" , FALSE, trx); + if (err == DB_SUCCESS && old_space != table->space) { + info = pars_info_create(); + + pars_info_add_int4_literal(info, "old_space", (lint) old_space); + + pars_info_add_int4_literal( + info, "new_space", (lint) table->space); + + err = que_eval_sql(info, + "PROCEDURE RENUMBER_TABLESPACE_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLESPACES" + " SET SPACE = :new_space\n" + " WHERE SPACE = :old_space;\n" + "UPDATE SYS_DATAFILES" + " SET SPACE = :new_space" + " WHERE SPACE = :old_space;\n" + "END;\n" + , FALSE, trx); + } + DBUG_EXECUTE_IF("ib_ddl_crash_before_fts_truncate", 
err = DB_ERROR;); + if (err != DB_SUCCESS) { trx->error_state = DB_SUCCESS; trx_rollback_to_savepoint(trx, NULL); trx->error_state = DB_SUCCESS; + + /* Update system table failed. Table in memory metadata + could be in an inconsistent state, mark the in-memory + table->corrupted to be true. In the long run, this should + be fixed by atomic truncate table */ + table->corrupted = true; + ut_print_timestamp(stderr); fputs(" InnoDB: Unable to assign a new identifier to table ", stderr); @@ -3323,30 +3580,40 @@ next_rec: "InnoDB: after truncating it. Background processes" " may corrupt the table!\n", stderr); - /* Fail to update the table id, so drop the new + /* Failed to update the table id, so drop the new FTS auxiliary tables */ if (has_internal_doc_id) { - dict_table_t fts_table; + ut_ad(trx->state == TRX_STATE_NOT_STARTED); + + table_id_t id = table->id; - fts_table.name = table->name; - fts_table.id = new_id; + table->id = new_id; - fts_drop_tables(trx, &fts_table); + fts_drop_tables(trx, table); + + table->id = id; + + ut_ad(trx->state != TRX_STATE_NOT_STARTED); } err = DB_ERROR; } else { /* Drop the old FTS index */ if (has_internal_doc_id) { + ut_ad(trx->state != TRX_STATE_NOT_STARTED); fts_drop_tables(trx, table); + ut_ad(trx->state != TRX_STATE_NOT_STARTED); } + DBUG_EXECUTE_IF("ib_truncate_crash_after_fts_drop", + DBUG_SUICIDE();); + dict_table_change_id_in_cache(table, new_id); /* Reset the Doc ID in cache to 0 */ if (has_internal_doc_id && table->fts->cache) { table->fts->fts_status |= TABLE_DICT_LOCKED; - fts_update_next_doc_id(table, NULL, 0); + fts_update_next_doc_id(trx, table, NULL, 0); fts_cache_clear(table->fts->cache, TRUE); fts_cache_init(table->fts->cache); table->fts->fts_status &= ~TABLE_DICT_LOCKED; @@ -3364,16 +3631,13 @@ funct_exit: row_mysql_unlock_data_dictionary(trx); - /* We are supposed to recalc and save the stats only - on ANALYZE, but it also makes sense to do so on TRUNCATE */ - dict_stats_update(table, 
DICT_STATS_RECALC_PERSISTENT_SILENT, - FALSE); + dict_stats_update(table, DICT_STATS_EMPTY_TABLE); trx->op_info = ""; srv_wake_master_thread(); - return((int) err); + return(err); } /*********************************************************************//** @@ -3385,23 +3649,29 @@ by the transaction, the transaction will be committed. Otherwise, the data dictionary will remain locked. @return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_drop_table_for_mysql( /*=====================*/ const char* name, /*!< in: table name */ trx_t* trx, /*!< in: transaction handle */ - ibool drop_db)/*!< in: TRUE=dropping whole database */ + bool drop_db,/*!< in: true=dropping whole database */ + bool nonatomic) + /*!< in: whether it is permitted + to release and reacquire dict_operation_lock */ { + dberr_t err; dict_foreign_t* foreign; dict_table_t* table; - dict_index_t* index; + ibool print_msg; ulint space_id; - ulint err; - const char* table_name; + char* filepath = NULL; + const char* tablename_minus_db; + char* tablename = NULL; + bool ibd_file_missing; ulint namelen; - ibool locked_dictionary = FALSE; - ibool fts_bg_thread_exited = FALSE; + bool locked_dictionary = false; pars_info_t* info = NULL; + mem_heap_t* heap = NULL; ut_a(name != NULL); @@ -3419,19 +3689,19 @@ row_drop_table_for_mysql( Certain table names starting with 'innodb_' have their special meaning regardless of the database name. Thus, we need to ignore the database name prefix in the comparisons. */ - table_name = strchr(name, '/'); + tablename_minus_db = strchr(name, '/'); - if (table_name) { - table_name++; + if (tablename_minus_db) { + tablename_minus_db++; } else { /* Ancillary FTS tables don't have '/' characters. 
*/ - table_name = name; + tablename_minus_db = name; } - namelen = strlen(table_name) + 1; + namelen = strlen(tablename_minus_db) + 1; if (namelen == sizeof S_innodb_monitor - && !memcmp(table_name, S_innodb_monitor, + && !memcmp(tablename_minus_db, S_innodb_monitor, sizeof S_innodb_monitor)) { /* Table name equals "innodb_monitor": @@ -3440,17 +3710,17 @@ row_drop_table_for_mysql( srv_print_innodb_monitor = FALSE; srv_print_innodb_lock_monitor = FALSE; } else if (namelen == sizeof S_innodb_lock_monitor - && !memcmp(table_name, S_innodb_lock_monitor, + && !memcmp(tablename_minus_db, S_innodb_lock_monitor, sizeof S_innodb_lock_monitor)) { srv_print_innodb_monitor = FALSE; srv_print_innodb_lock_monitor = FALSE; } else if (namelen == sizeof S_innodb_tablespace_monitor - && !memcmp(table_name, S_innodb_tablespace_monitor, + && !memcmp(tablename_minus_db, S_innodb_tablespace_monitor, sizeof S_innodb_tablespace_monitor)) { srv_print_innodb_tablespace_monitor = FALSE; } else if (namelen == sizeof S_innodb_table_monitor - && !memcmp(table_name, S_innodb_table_monitor, + && !memcmp(tablename_minus_db, S_innodb_table_monitor, sizeof S_innodb_table_monitor)) { srv_print_innodb_table_monitor = FALSE; @@ -3461,7 +3731,10 @@ row_drop_table_for_mysql( trx->op_info = "dropping table"; - trx_start_if_not_started(trx); + /* This function is called recursively via fts_drop_tables(). */ + if (trx->state == TRX_STATE_NOT_STARTED) { + trx_start_for_ddl(trx, TRX_DICT_OP_TABLE); + } if (trx->dict_operation_lock_mode != RW_X_LATCH) { /* Prevent foreign key checks etc. 
while we are dropping the @@ -3469,17 +3742,17 @@ row_drop_table_for_mysql( row_mysql_lock_data_dictionary(trx); - locked_dictionary = TRUE; + locked_dictionary = true; + nonatomic = true; } -retry: ut_ad(mutex_own(&(dict_sys->mutex))); #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ - table = dict_table_open_on_name_no_stats( - name, TRUE, + table = dict_table_open_on_name( + name, TRUE, FALSE, static_cast<dict_err_ignore_t>( DICT_ERR_IGNORE_INDEX_ROOT | DICT_ERR_IGNORE_CORRUPT)); @@ -3502,34 +3775,53 @@ retry: goto funct_exit; } - if (table->fts) { - fts_t* fts = table->fts; + /* Turn on this drop bit before we could release the dictionary + latch */ + table->to_be_dropped = true; - /* It is possible that background 'Add' thread fts_add_thread() - just gets called and the fts_optimize_thread() - is processing deleted records. There could be undetected - deadlock between threads synchronization and dict_sys_mutex - since fts_parse_sql() requires dict_sys->mutex. Ask the - background thread to exit before proceeds to drop table to - avoid undetected deadlocks */ - row_mysql_unlock_data_dictionary(trx); + if (nonatomic) { + /* This trx did not acquire any locks on dictionary + table records yet. Thus it is safe to release and + reacquire the data dictionary latches. */ + if (table->fts) { + ut_ad(!table->fts->add_wq); + ut_ad(lock_trx_has_sys_table_locks(trx) == 0); - if (fts->add_wq && (!fts_bg_thread_exited)) { - /* Wait for any background threads accessing the table - to exit. */ - mutex_enter(&fts->bg_threads_mutex); - fts->fts_status |= BG_THREAD_STOP; + row_mysql_unlock_data_dictionary(trx); + fts_optimize_remove_table(table); + row_mysql_lock_data_dictionary(trx); + } - dict_table_wait_for_bg_threads_to_exit(table, 250000); + /* Do not bother to deal with persistent stats for temp + tables since we know temp tables do not use persistent + stats. 
*/ + if (!dict_table_is_temporary(table)) { + dict_stats_wait_bg_to_stop_using_tables( + table, NULL, trx); + } + } - mutex_exit(&fts->bg_threads_mutex); + /* make sure background stats thread is not running on the table */ + ut_ad(!(table->stats_bg_flag & BG_STAT_IN_PROGRESS)); - row_mysql_lock_data_dictionary(trx); - fts_bg_thread_exited = TRUE; - goto retry; - } else { - fts_optimize_remove_table(table); - row_mysql_lock_data_dictionary(trx); + /* Delete the link file if used. */ + if (DICT_TF_HAS_DATA_DIR(table->flags)) { + fil_delete_link_file(name); + } + + if (!dict_table_is_temporary(table)) { + + dict_stats_recalc_pool_del(table); + + /* Remove stats for this table and all of its indexes from the + persistent storage if it exists and if there are stats for this + table in there. This function creates its own trx and commits + it. */ + char errstr[1024]; + err = dict_stats_drop_table(name, errstr, sizeof(errstr)); + + if (err != DB_SUCCESS) { + ib_logf(IB_LOG_LEVEL_WARN, "%s", errstr); } } @@ -3540,7 +3832,7 @@ retry: dict_table_move_from_lru_to_non_lru(table); } - dict_table_close(table, TRUE); + dict_table_close(table, TRUE, FALSE); /* Check if the table is referenced by foreign key constraints from some other table (not the table itself) */ @@ -3552,7 +3844,9 @@ check_next_foreign: foreign = UT_LIST_GET_NEXT(referenced_list, foreign); } - if (foreign && trx->check_foreigns + if (!srv_read_only_mode + && foreign + && trx->check_foreigns && !(drop_db && dict_tables_have_same_db( name, foreign->foreign_table_name_lookup))) { FILE* ef = dict_foreign_err_file; @@ -3589,16 +3883,16 @@ check_next_foreign: if (table->n_foreign_key_checks_running > 0) { - const char* table_name = table->name; + const char* save_tablename = table->name; ibool added; - added = row_add_table_to_background_drop_list(table_name); + added = row_add_table_to_background_drop_list(save_tablename); if (added) { ut_print_timestamp(stderr); fputs(" InnoDB: You are trying to drop table ", 
stderr); - ut_print_name(stderr, trx, TRUE, table_name); + ut_print_name(stderr, trx, TRUE, save_tablename); fputs("\n" "InnoDB: though there is a" " foreign key check running on it.\n" @@ -3663,23 +3957,54 @@ check_next_foreign: goto funct_exit; } + /* The "to_be_dropped" marks table that is to be dropped, but + has not been dropped, instead, was put in the background drop + list due to being used by concurrent DML operations. Clear it + here since there are no longer any concurrent activities on it, + and it is free to be dropped */ + table->to_be_dropped = false; + /* If we get this far then the table to be dropped must not have any table or record locks on it. */ ut_a(!lock_table_has_locks(table)); - trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); - trx->table_id = table->id; + switch (trx_get_dict_operation(trx)) { + case TRX_DICT_OP_NONE: + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + trx->table_id = table->id; + case TRX_DICT_OP_TABLE: + break; + case TRX_DICT_OP_INDEX: + /* If the transaction was previously flagged as + TRX_DICT_OP_INDEX, we should be dropping auxiliary + tables for full-text indexes. */ + ut_ad(strstr(table->name, "/FTS_") != NULL); + } /* Mark all indexes unavailable in the data dictionary cache before starting to drop the table. */ - for (index = dict_table_get_first_index(table); + unsigned* page_no; + unsigned* page_nos; + heap = mem_heap_create( + 200 + UT_LIST_GET_LEN(table->indexes) * sizeof *page_nos); + tablename = mem_heap_strdup(heap, name); + + page_no = page_nos = static_cast<unsigned*>( + mem_heap_alloc( + heap, + UT_LIST_GET_LEN(table->indexes) * sizeof *page_no)); + + for (dict_index_t* index = dict_table_get_first_index(table); index != NULL; index = dict_table_get_next_index(index)) { rw_lock_x_lock(dict_index_get_lock(index)); - ut_ad(!index->to_be_dropped); - index->to_be_dropped = TRUE; + /* Save the page numbers so that we can restore them + if the operation fails. 
*/ + *page_no++ = index->page; + /* Mark the index unusable. */ + index->page = FIL_NULL; rw_lock_x_unlock(dict_index_get_lock(index)); } @@ -3698,6 +4023,7 @@ check_next_foreign: "table_id CHAR;\n" "index_id CHAR;\n" "foreign_id CHAR;\n" + "space_id INT;\n" "found INT;\n" "DECLARE CURSOR cur_fk IS\n" @@ -3720,6 +4046,12 @@ check_next_foreign: "IF (SQL % NOTFOUND) THEN\n" " RETURN;\n" "END IF;\n" + "SELECT SPACE INTO space_id\n" + "FROM SYS_TABLES\n" + "WHERE NAME = :table_name;\n" + "IF (SQL % NOTFOUND) THEN\n" + " RETURN;\n" + "END IF;\n" "found := 1;\n" "SELECT ID INTO sys_foreign_id\n" "FROM SYS_TABLES\n" @@ -3762,56 +4094,90 @@ check_next_foreign: " END IF;\n" "END LOOP;\n" "CLOSE cur_idx;\n" + "DELETE FROM SYS_TABLESPACES\n" + "WHERE SPACE = space_id;\n" + "DELETE FROM SYS_DATAFILES\n" + "WHERE SPACE = space_id;\n" "DELETE FROM SYS_COLUMNS\n" "WHERE TABLE_ID = table_id;\n" "DELETE FROM SYS_TABLES\n" - "WHERE ID = table_id;\n" + "WHERE NAME = :table_name;\n" "END;\n" , FALSE, trx); switch (err) { - ibool is_temp; - mem_heap_t* heap; + ibool is_temp; case DB_SUCCESS: - - heap = mem_heap_create(200); - /* Clone the name, in case it has been allocated from table->heap, which will be freed by dict_table_remove_from_cache(table) below. */ - name = mem_heap_strdup(heap, name); space_id = table->space; + ibd_file_missing = table->ibd_file_missing; - is_temp = table->flags2 & DICT_TF2_TEMPORARY; + is_temp = DICT_TF2_FLAG_IS_SET(table, DICT_TF2_TEMPORARY); + + /* If there is a temp path then the temp flag is set. + However, during recovery, we might have a temp flag but + not know the temp path */ ut_a(table->dir_path_of_temp_table == NULL || is_temp); + if (dict_table_is_discarded(table) + || table->ibd_file_missing) { + /* Do not attempt to drop known-to-be-missing + tablespaces. */ + space_id = 0; + } + + /* We do not allow temporary tables with a remote path. 
*/ + ut_a(!(is_temp && DICT_TF_HAS_DATA_DIR(table->flags))); + + if (space_id && DICT_TF_HAS_DATA_DIR(table->flags)) { + dict_get_and_save_data_dir_path(table, true); + ut_a(table->data_dir_path); + + filepath = os_file_make_remote_pathname( + table->data_dir_path, table->name, "ibd"); + } else if (table->dir_path_of_temp_table) { + filepath = fil_make_ibd_name( + table->dir_path_of_temp_table, true); + } else { + filepath = fil_make_ibd_name(tablename, false); + } if (dict_table_has_fts_index(table) || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) { ut_ad(table->n_ref_count == 0); + ut_ad(trx->state != TRX_STATE_NOT_STARTED); err = fts_drop_tables(trx, table); if (err != DB_SUCCESS) { ut_print_timestamp(stderr); - fprintf(stderr," InnoDB: Error: (%lu) not " + fprintf(stderr," InnoDB: Error: (%s) not " "able to remove ancillary FTS tables " - "for table ", err); - ut_print_name(stderr, trx, TRUE, name); + "for table ", ut_strerr(err)); + ut_print_name(stderr, trx, TRUE, tablename); fputs("\n", stderr); goto funct_exit; } + } + /* The table->fts flag can be set on the table for which + the cluster index is being rebuilt. Such table might not have + DICT_TF2_FTS flag set. 
So keep this out of above + dict_table_has_fts_index condition */ + if (table->fts) { fts_free(table); } dict_table_remove_from_cache(table); - if (dict_load_table(name, TRUE, DICT_ERR_IGNORE_NONE) != NULL) { + if (dict_load_table(tablename, TRUE, + DICT_ERR_IGNORE_NONE) != NULL) { ut_print_timestamp(stderr); fputs(" InnoDB: Error: not able to remove table ", stderr); - ut_print_name(stderr, trx, TRUE, name); + ut_print_name(stderr, trx, TRUE, tablename); fputs(" from the dictionary cache!\n", stderr); err = DB_ERROR; } @@ -3819,23 +4185,46 @@ check_next_foreign: /* Do not drop possible .ibd tablespace if something went wrong: we do not want to delete valuable data of the user */ - if (err == DB_SUCCESS && space_id > 0) { - if (!fil_space_for_table_exists_in_mem( - space_id, name, FALSE, !is_temp)) { + /* Don't spam the log if we can't find the tablespace of + a temp table or if the tablesace has been discarded. */ + print_msg = !(is_temp || ibd_file_missing); + + if (err == DB_SUCCESS && space_id > TRX_SYS_SPACE) { + if (!is_temp + && !fil_space_for_table_exists_in_mem( + space_id, tablename, FALSE, + print_msg, false, NULL, 0)) { + /* This might happen if we are dropping a + discarded tablespace */ err = DB_SUCCESS; + if (print_msg) { + char msg_tablename[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + msg_tablename, sizeof(tablename), + tablename, FALSE); + + ib_logf(IB_LOG_LEVEL_INFO, + "Removed the table %s from " + "InnoDB's data dictionary", + msg_tablename); + } + + /* Force a delete of any discarded + or temporary files. 
*/ + + fil_delete_file(filepath); + + } else if (fil_delete_tablespace( + space_id, + BUF_REMOVE_FLUSH_NO_WRITE) + != DB_SUCCESS) { fprintf(stderr, "InnoDB: We removed now the InnoDB" " internal data dictionary entry\n" "InnoDB: of table "); - ut_print_name(stderr, trx, TRUE, name); - fprintf(stderr, ".\n"); - } else if (!fil_delete_tablespace(space_id)) { - fprintf(stderr, - "InnoDB: We removed now the InnoDB" - " internal data dictionary entry\n" - "InnoDB: of table "); - ut_print_name(stderr, trx, TRUE, name); + ut_print_name(stderr, trx, TRUE, tablename); fprintf(stderr, ".\n"); ut_print_timestamp(stderr); @@ -3843,13 +4232,12 @@ check_next_foreign: " InnoDB: Error: not able to" " delete tablespace %lu of table ", (ulong) space_id); - ut_print_name(stderr, trx, TRUE, name); + ut_print_name(stderr, trx, TRUE, tablename); fputs("!\n", stderr); err = DB_ERROR; } } - mem_heap_free(heap); break; case DB_OUT_OF_FILE_SPACE: @@ -3874,7 +4262,7 @@ check_next_foreign: fprintf(stderr, "InnoDB: unknown error code %lu" " while dropping table:", (ulong) err); - ut_print_name(stderr, trx, TRUE, name); + ut_print_name(stderr, trx, TRUE, tablename); fprintf(stderr, ".\n"); trx->error_state = DB_SUCCESS; @@ -3884,16 +4272,25 @@ check_next_foreign: /* Mark all indexes available in the data dictionary cache again. 
*/ - for (index = dict_table_get_first_index(table); + page_no = page_nos; + + for (dict_index_t* index = dict_table_get_first_index(table); index != NULL; index = dict_table_get_next_index(index)) { rw_lock_x_lock(dict_index_get_lock(index)); - index->to_be_dropped = FALSE; + ut_a(index->page == FIL_NULL); + index->page = *page_no++; rw_lock_x_unlock(dict_index_get_lock(index)); } } funct_exit: + if (heap) { + mem_heap_free(heap); + } + if (filepath) { + mem_free(filepath); + } if (locked_dictionary) { trx_commit_for_mysql(trx); @@ -3905,7 +4302,7 @@ funct_exit: srv_wake_master_thread(); - return((int) err); + return(err); } /*********************************************************************//** @@ -3929,9 +4326,9 @@ row_mysql_drop_temp_tables(void) mtr_start(&mtr); btr_pcur_open_at_index_side( - TRUE, + true, dict_table_get_first_index(dict_sys->sys_tables), - BTR_SEARCH_LEAF, &pcur, TRUE, &mtr); + BTR_SEARCH_LEAF, &pcur, true, 0, &mtr); for (;;) { const rec_t* rec; @@ -3950,6 +4347,8 @@ row_mysql_drop_temp_tables(void) ROW_FORMAT=REDUNDANT. */ rec = btr_pcur_get_rec(&pcur); field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__NAME, &len); + field = rec_get_nth_field_old( rec, DICT_FLD__SYS_TABLES__N_COLS, &len); if (len != 4 || !(mach_read_from_4(field) & DICT_N_COLS_COMPACT)) { @@ -4003,15 +4402,15 @@ row_mysql_drop_temp_tables(void) Drop all foreign keys in a database, see Bug#18942. Called at the end of row_drop_database_for_mysql(). @return error code or DB_SUCCESS */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t drop_all_foreign_keys_in_db( /*========================*/ const char* name, /*!< in: database name which ends to '/' */ trx_t* trx) /*!< in: transaction handle */ { pars_info_t* pinfo; - ulint err; + dberr_t err; ut_a(name[strlen(name) - 1] == '/'); @@ -4063,22 +4462,24 @@ drop_all_foreign_keys_in_db( Drops a database for MySQL. 
@return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_drop_database_for_mysql( /*========================*/ const char* name, /*!< in: database name which ends to '/' */ trx_t* trx) /*!< in: transaction handle */ { - dict_table_t* table; - char* table_name; - int err = DB_SUCCESS; - ulint namelen = strlen(name); + dict_table_t* table; + char* table_name; + dberr_t err = DB_SUCCESS; + ulint namelen = strlen(name); ut_a(name != NULL); ut_a(name[namelen - 1] == '/'); trx->op_info = "dropping database"; + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + trx_start_if_not_started_xa(trx); loop: row_mysql_lock_data_dictionary(trx); @@ -4086,11 +4487,29 @@ loop: while ((table_name = dict_get_first_table_name_in_db(name))) { ut_a(memcmp(table_name, name, namelen) == 0); - table = dict_table_open_on_name_no_stats(table_name, TRUE, - DICT_ERR_IGNORE_NONE); + table = dict_table_open_on_name( + table_name, TRUE, FALSE, static_cast<dict_err_ignore_t>( + DICT_ERR_IGNORE_INDEX_ROOT + | DICT_ERR_IGNORE_CORRUPT)); - ut_a(table); - ut_a(!table->can_be_evicted); + if (!table) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot load table %s from InnoDB internal " + "data dictionary during drop database", + table_name); + mem_free(table_name); + err = DB_TABLE_NOT_FOUND; + break; + + } + + if (row_is_mysql_tmp_table_name(table->name)) { + /* There could be an orphan temp table left from + interupted alter table rebuild operation */ + dict_table_close(table, TRUE, FALSE); + } else { + ut_a(!table->can_be_evicted || table->ibd_file_missing); + } /* Wait until MySQL does not have any queries running on the table */ @@ -4121,8 +4540,8 @@ loop: if (err != DB_SUCCESS) { fputs("InnoDB: DROP DATABASE ", stderr); ut_print_name(stderr, trx, TRUE, name); - fprintf(stderr, " failed with error %lu for table ", - (ulint) err); + fprintf(stderr, " failed with error (%s) for table ", + ut_strerr(err)); ut_print_name(stderr, trx, TRUE, table_name); putc('\n', stderr); mem_free(table_name); @@ 
-4135,7 +4554,7 @@ loop: if (err == DB_SUCCESS) { /* after dropping all tables try to drop all leftover foreign keys in case orphaned ones exist */ - err = (int) drop_all_foreign_keys_in_db(name, trx); + err = drop_all_foreign_keys_in_db(name, trx); if (err != DB_SUCCESS) { fputs("InnoDB: DROP DATABASE ", stderr); @@ -4157,9 +4576,9 @@ loop: /*********************************************************************//** Checks if a table name contains the string "/#sql" which denotes temporary tables in MySQL. -@return TRUE if temporary table */ -static -ibool +@return true if temporary table */ +UNIV_INTERN __attribute__((warn_unused_result)) +bool row_is_mysql_tmp_table_name( /*========================*/ const char* name) /*!< in: table name in the form @@ -4172,8 +4591,8 @@ row_is_mysql_tmp_table_name( /****************************************************************//** Delete a single constraint. @return error code or DB_SUCCESS */ -static -int +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_delete_constraint_low( /*======================*/ const char* id, /*!< in: constraint id */ @@ -4183,7 +4602,7 @@ row_delete_constraint_low( pars_info_add_str_literal(info, "id", id); - return((int) que_eval_sql(info, + return(que_eval_sql(info, "PROCEDURE DELETE_CONSTRAINT () IS\n" "BEGIN\n" "DELETE FROM SYS_FOREIGN_COLS WHERE ID = :id;\n" @@ -4195,8 +4614,8 @@ row_delete_constraint_low( /****************************************************************//** Delete a single constraint. @return error code or DB_SUCCESS */ -static -int +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_delete_constraint( /*==================*/ const char* id, /*!< in: constraint id */ @@ -4205,7 +4624,7 @@ row_delete_constraint( mem_heap_t* heap, /*!< in: memory heap */ trx_t* trx) /*!< in: transaction handle */ { - ulint err; + dberr_t err; /* New format constraints have ids <databasename>/<constraintname>. 
*/ err = row_delete_constraint_low( @@ -4222,29 +4641,30 @@ row_delete_constraint( err = row_delete_constraint_low(id, trx); } - return((int) err); + return(err); } /*********************************************************************//** Renames a table for MySQL. @return error code or DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t row_rename_table_for_mysql( /*=======================*/ const char* old_name, /*!< in: old table name */ const char* new_name, /*!< in: new table name */ - trx_t* trx, /*!< in: transaction handle */ - ibool commit) /*!< in: if TRUE then commit trx */ + trx_t* trx, /*!< in/out: transaction */ + bool commit) /*!< in: whether to commit trx */ { dict_table_t* table = NULL; ibool dict_locked = FALSE; - ulint err = DB_ERROR; + dberr_t err = DB_ERROR; mem_heap_t* heap = NULL; const char** constraints_to_drop = NULL; ulint n_constraints_to_drop = 0; ibool old_is_tmp, new_is_tmp; pars_info_t* info = NULL; + int retry; ut_a(old_name != NULL); ut_a(new_name != NULL); @@ -4279,8 +4699,8 @@ row_rename_table_for_mysql( dict_locked = trx->dict_operation_lock_mode == RW_X_LATCH; - table = dict_table_open_on_name_no_stats(old_name, dict_locked, - DICT_ERR_IGNORE_NONE); + table = dict_table_open_on_name(old_name, dict_locked, FALSE, + DICT_ERR_IGNORE_NONE); if (!table) { err = DB_TABLE_NOT_FOUND; @@ -4299,18 +4719,19 @@ row_rename_table_for_mysql( "InnoDB: " REFMAN "innodb-troubleshooting.html\n", stderr); goto funct_exit; - } else if (table->ibd_file_missing) { + + } else if (table->ibd_file_missing + && !dict_table_is_discarded(table)) { + err = DB_TABLE_NOT_FOUND; - ut_print_timestamp(stderr); - fputs(" InnoDB: Error: table ", stderr); - ut_print_name(stderr, trx, TRUE, old_name); - fputs(" does not have an .ibd file" - " in the database directory.\n" - "InnoDB: You can look for further help from\n" - "InnoDB: " REFMAN "innodb-troubleshooting.html\n", - stderr); + ib_logf(IB_LOG_LEVEL_ERROR, + "Table %s does not have an .ibd file in the database " + 
"directory. See " REFMAN "innodb-troubleshooting.html", + old_name); + goto funct_exit; + } else if (new_is_tmp) { /* MySQL is doing an ALTER TABLE command and it renames the original table to a temporary table name. We want to preserve @@ -4329,27 +4750,75 @@ row_rename_table_for_mysql( } } + /* Is a foreign key check running on this table? */ + for (retry = 0; retry < 100 + && table->n_foreign_key_checks_running > 0; ++retry) { + row_mysql_unlock_data_dictionary(trx); + os_thread_yield(); + row_mysql_lock_data_dictionary(trx); + } + + if (table->n_foreign_key_checks_running > 0) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: in ALTER TABLE ", stderr); + ut_print_name(stderr, trx, TRUE, old_name); + fprintf(stderr, "\n" + "InnoDB: a FOREIGN KEY check is running.\n" + "InnoDB: Cannot rename table.\n"); + err = DB_TABLE_IN_FK_CHECK; + goto funct_exit; + } + /* We use the private SQL parser of Innobase to generate the query graphs needed in updating the dictionary data from system tables. */ info = pars_info_create(); pars_info_add_str_literal(info, "new_table_name", new_name); - pars_info_add_str_literal(info, "old_table_name", old_name); err = que_eval_sql(info, "PROCEDURE RENAME_TABLE () IS\n" "BEGIN\n" - "UPDATE SYS_TABLES SET NAME = :new_table_name\n" + "UPDATE SYS_TABLES" + " SET NAME = :new_table_name\n" " WHERE NAME = :old_table_name;\n" "END;\n" , FALSE, trx); - if (err != DB_SUCCESS) { + /* SYS_TABLESPACES and SYS_DATAFILES track non-system tablespaces + which have space IDs > 0. */ + if (err == DB_SUCCESS + && table->space != TRX_SYS_SPACE + && !table->ibd_file_missing) { + /* Make a new pathname to update SYS_DATAFILES. 
*/ + char* new_path = row_make_new_pathname(table, new_name); + + info = pars_info_create(); + pars_info_add_str_literal(info, "new_table_name", new_name); + pars_info_add_str_literal(info, "new_path_name", new_path); + pars_info_add_int4_literal(info, "space_id", table->space); + + err = que_eval_sql(info, + "PROCEDURE RENAME_SPACE () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLESPACES" + " SET NAME = :new_table_name\n" + " WHERE SPACE = :space_id;\n" + "UPDATE SYS_DATAFILES" + " SET PATH = :new_path_name\n" + " WHERE SPACE = :space_id;\n" + "END;\n" + , FALSE, trx); + + mem_free(new_path); + } + if (err != DB_SUCCESS) { goto end; - } else if (!new_is_tmp) { + } + + if (!new_is_tmp) { /* Rename all constraints. */ info = pars_info_create(); @@ -4486,12 +4955,12 @@ end: /* The following call will also rename the .ibd data file if the table is stored in a single-table tablespace */ - if (!dict_table_rename_in_cache(table, new_name, - !new_is_tmp)) { + err = dict_table_rename_in_cache( + table, new_name, !new_is_tmp); + if (err != DB_SUCCESS) { trx->error_state = DB_SUCCESS; trx_rollback_to_savepoint(trx, NULL); trx->error_state = DB_SUCCESS; - err = DB_ERROR; goto funct_exit; } @@ -4527,8 +4996,8 @@ end: stderr); } - ut_a(dict_table_rename_in_cache(table, - old_name, FALSE)); + ut_a(DB_SUCCESS == dict_table_rename_in_cache( + table, old_name, FALSE)); trx->error_state = DB_SUCCESS; trx_rollback_to_savepoint(trx, NULL); trx->error_state = DB_SUCCESS; @@ -4538,7 +5007,7 @@ end: funct_exit: if (table != NULL) { - dict_table_close(table, dict_locked); + dict_table_close(table, dict_locked, FALSE); } if (commit) { @@ -4558,9 +5027,9 @@ funct_exit: Checks that the index contains entries in an ascending order, unique constraint is not broken, and calculates the number of index entries in the read view of the current transaction. 
-@return TRUE if ok */ +@return true if ok */ UNIV_INTERN -ibool +bool row_check_index_for_mysql( /*======================*/ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct @@ -4575,7 +5044,7 @@ row_check_index_for_mysql( byte* buf; ulint ret; rec_t* rec; - ibool is_ok = TRUE; + bool is_ok = true; int cmp; ibool contains_null; ulint i; @@ -4588,10 +5057,20 @@ row_check_index_for_mysql( *n_rows = 0; - /* Full Text index are implemented by auxiliary tables, - not the B-tree */ - if (index->type & DICT_FTS) { - return(TRUE); + if (dict_index_is_clust(index)) { + /* The clustered index of a table is always available. + During online ALTER TABLE that rebuilds the table, the + clustered index in the old table will have + index->online_log pointing to the new table. All + indexes of the old table will remain valid and the new + table will be unaccessible to MySQL until the + completion of the ALTER TABLE. */ + } else if (dict_index_is_online_ddl(index) + || (index->type & DICT_FTS)) { + /* Full Text index are implemented by auxiliary tables, + not the B-tree. We also skip secondary indexes that are + being created online. */ + return(true); } buf = static_cast<byte*>(mem_alloc(UNIV_PAGE_SIZE)); @@ -4672,7 +5151,7 @@ not_ok: "InnoDB: record ", stderr); rec_print_new(stderr, rec, offsets); putc('\n', stderr); - is_ok = FALSE; + is_ok = false; } else if (dict_index_is_unique(index) && !contains_null && matched_fields @@ -4702,9 +5181,8 @@ not_ok: mem_heap_empty(heap); - prev_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, - index, offsets, - &n_ext, heap); + prev_entry = row_rec_to_index_entry( + rec, index, offsets, &n_ext, heap); if (UNIV_LIKELY_NULL(tmp_heap)) { mem_heap_free(tmp_heap); @@ -4718,9 +5196,9 @@ not_ok: /*********************************************************************//** Determines if a table is a magic monitor table. 
-@return TRUE if monitor table */ +@return true if monitor table */ UNIV_INTERN -ibool +bool row_is_magic_monitor_table( /*=======================*/ const char* table_name) /*!< in: name of the table, in the @@ -4751,7 +5229,7 @@ row_mysql_init(void) { mutex_create( row_drop_list_mutex_key, - &row_drop_list_mutex, SYNC_NO_ORDER_CHECK); + &row_drop_list_mutex, SYNC_NO_ORDER_CHECK); UT_LIST_INIT(row_mysql_drop_list); diff --git a/storage/innobase/row/row0purge.cc b/storage/innobase/row/row0purge.cc index ab28b396920..ee603be453a 100644 --- a/storage/innobase/row/row0purge.cc +++ b/storage/innobase/row/row0purge.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -42,8 +42,10 @@ Created 3/14/1997 Heikki Tuuri #include "row0upd.h" #include "row0vers.h" #include "row0mysql.h" +#include "row0log.h" #include "log0log.h" #include "srv0mon.h" +#include "srv0start.h" /************************************************************************* IMPORTANT NOTE: Any operation that generates redo MUST check that there @@ -110,119 +112,134 @@ row_purge_reposition_pcur( return(node->found_clust); } +/** Status of row_purge_remove_clust() */ +enum row_purge_status { + ROW_PURGE_DONE, /*!< The row has been removed. */ + ROW_PURGE_FAIL, /*!< The purge was not successful. */ + ROW_PURGE_SUSPEND/*!< Cannot purge now, due to online rebuild. */ +}; + /***********************************************************//** Removes a delete marked clustered index record if possible. 
-@return TRUE if success, or if not found, or if modified after the -delete marking */ -static -ibool +@retval ROW_PURGE_DONE if the row was not found, or it was successfully removed +@retval ROW_PURGE_FAIL if the row was modified after the delete marking +@retval ROW_PURGE_SUSPEND if the row refers to an off-page column and +an online ALTER TABLE (table rebuild) is in progress. */ +static __attribute__((nonnull, warn_unused_result)) +enum row_purge_status row_purge_remove_clust_if_poss_low( /*===============================*/ - purge_node_t* node, /*!< in: row purge node */ + purge_node_t* node, /*!< in/out: row purge node */ ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ { - dict_index_t* index; - btr_pcur_t* pcur; - btr_cur_t* btr_cur; - ibool success; - ulint err; - mtr_t mtr; - rec_t* rec; - mem_heap_t* heap = NULL; - ulint offsets_[REC_OFFS_NORMAL_SIZE]; + dict_index_t* index; + enum row_purge_status status = ROW_PURGE_DONE; + mtr_t mtr; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint* offsets; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; rec_offs_init(offsets_); - index = dict_table_get_first_index(node->table); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ - pcur = &node->pcur; - btr_cur = btr_pcur_get_btr_cur(pcur); + index = dict_table_get_first_index(node->table); log_free_check(); mtr_start(&mtr); - success = row_purge_reposition_pcur(mode, node, &mtr); - - if (!success) { - /* The record is already removed */ - - btr_pcur_commit_specify_mtr(pcur, &mtr); - - return(TRUE); + if (!row_purge_reposition_pcur(mode, node, &mtr)) { + /* The record was already removed. 
*/ + goto func_exit; } - rec = btr_pcur_get_rec(pcur); + rec = btr_pcur_get_rec(&node->pcur); - if (node->roll_ptr != row_get_rec_roll_ptr( - rec, index, rec_get_offsets(rec, index, offsets_, - ULINT_UNDEFINED, &heap))) { - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } - /* Someone else has modified the record later: do not remove */ - btr_pcur_commit_specify_mtr(pcur, &mtr); + offsets = rec_get_offsets( + rec, index, offsets_, ULINT_UNDEFINED, &heap); - return(TRUE); + if (node->roll_ptr != row_get_rec_roll_ptr(rec, index, offsets)) { + /* Someone else has modified the record later: do not remove */ + goto func_exit; } - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); + if (dict_index_get_online_status(index) == ONLINE_INDEX_CREATION + && rec_offs_any_extern(offsets)) { + status = ROW_PURGE_SUSPEND; + goto func_exit; } if (mode == BTR_MODIFY_LEAF) { - success = btr_cur_optimistic_delete(btr_cur, &mtr); + status = btr_cur_optimistic_delete( + btr_pcur_get_btr_cur(&node->pcur), 0, &mtr) + ? ROW_PURGE_DONE : ROW_PURGE_FAIL; } else { + dberr_t err; ut_ad(mode == BTR_MODIFY_TREE); - btr_cur_pessimistic_delete(&err, FALSE, btr_cur, - RB_NONE, &mtr); + btr_cur_pessimistic_delete( + &err, FALSE, btr_pcur_get_btr_cur(&node->pcur), 0, + RB_NONE, &mtr); - if (err == DB_SUCCESS) { - success = TRUE; - } else if (err == DB_OUT_OF_FILE_SPACE) { - success = FALSE; - } else { + switch (err) { + case DB_SUCCESS: + break; + case DB_OUT_OF_FILE_SPACE: + status = ROW_PURGE_FAIL; + break; + default: ut_error; } } - btr_pcur_commit_specify_mtr(pcur, &mtr); +func_exit: + if (heap) { + mem_heap_free(heap); + } - return(success); + btr_pcur_commit_specify_mtr(&node->pcur, &mtr); + + return(status); } /***********************************************************//** Removes a clustered index record if it has not been modified after the delete -marking. */ -static -void +marking. 
+@retval true if the row was not found, or it was successfully removed +@retval false the purge needs to be suspended, either because of +running out of file space or because the row refers to an off-page +column and an online ALTER TABLE (table rebuild) is in progress. */ +static __attribute__((nonnull, warn_unused_result)) +bool row_purge_remove_clust_if_poss( /*===========================*/ - purge_node_t* node) /*!< in: row purge node */ + purge_node_t* node) /*!< in/out: row purge node */ { - ibool success; - ulint n_tries = 0; - - /* fputs("Purge: Removing clustered record\n", stderr); */ - - success = row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_LEAF); - if (success) { - - return; + switch (row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_LEAF)) { + case ROW_PURGE_DONE: + return(true); + case ROW_PURGE_SUSPEND: + return(false); + case ROW_PURGE_FAIL: + break; } -retry: - success = row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_TREE); - /* The delete operation may fail if we have little - file space left: TODO: easiest to crash the database - and restart with more file space */ - if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { - n_tries++; - - os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); - - goto retry; + for (ulint n_tries = 0; + n_tries < BTR_CUR_RETRY_DELETE_N_TIMES; + n_tries++) { + switch (row_purge_remove_clust_if_poss_low( + node, BTR_MODIFY_TREE)) { + case ROW_PURGE_DONE: + return(true); + case ROW_PURGE_SUSPEND: + return(false); + case ROW_PURGE_FAIL: + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + } } - ut_a(success); + return(false); } /***********************************************************//** @@ -234,21 +251,21 @@ is newer than the purge view. NOTE: This function should only be called by the purge thread, only while holding a latch on the leaf page of the secondary index entry (or keeping the buffer pool watch on the page). 
It is possible that -this function first returns TRUE and then FALSE, if a user transaction +this function first returns true and then false, if a user transaction inserts a record that the secondary index entry would refer to. However, in that case, the user transaction would also re-insert the secondary index entry after purge has removed it and released the leaf page latch. -@return TRUE if the secondary index record can be purged */ +@return true if the secondary index record can be purged */ UNIV_INTERN -ibool +bool row_purge_poss_sec( /*===============*/ purge_node_t* node, /*!< in/out: row purge node */ dict_index_t* index, /*!< in: secondary index */ const dtuple_t* entry) /*!< in: secondary index entry */ { - ibool can_delete; + bool can_delete; mtr_t mtr; ut_ad(!dict_index_is_clust(index)); @@ -268,7 +285,7 @@ row_purge_poss_sec( Removes a secondary index entry if possible, by modifying the index tree. Does not try to buffer the delete. @return TRUE if success or if not found */ -static +static __attribute__((nonnull, warn_unused_result)) ibool row_purge_remove_sec_if_poss_tree( /*==============================*/ @@ -279,13 +296,35 @@ row_purge_remove_sec_if_poss_tree( btr_pcur_t pcur; btr_cur_t* btr_cur; ibool success = TRUE; - ulint err; + dberr_t err; mtr_t mtr; enum row_search_result search_result; log_free_check(); mtr_start(&mtr); + if (*index->name == TEMP_INDEX_PREFIX) { + /* The index->online_status may change if the + index->name starts with TEMP_INDEX_PREFIX (meaning + that the index is or was being created online). It is + protected by index->lock. */ + mtr_x_lock(dict_index_get_lock(index), &mtr); + + if (dict_index_is_online_ddl(index)) { + /* Online secondary index creation will not + copy any delete-marked records. Therefore + there is nothing to be purged. We must also + skip the purge when a completed index is + dropped by rollback_inplace_alter_table(). 
*/ + goto func_exit_no_pcur; + } + } else { + /* For secondary indexes, + index->online_status==ONLINE_INDEX_CREATION unless + index->name starts with TEMP_INDEX_PREFIX. */ + ut_ad(!dict_index_is_online_ddl(index)); + } + search_result = row_search_index_entry(index, entry, BTR_MODIFY_TREE, &pcur, &mtr); @@ -327,7 +366,7 @@ row_purge_remove_sec_if_poss_tree( & rec_get_info_bits(btr_cur_get_rec(btr_cur), dict_table_is_comp(index->table))); - btr_cur_pessimistic_delete(&err, FALSE, btr_cur, + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, RB_NONE, &mtr); switch (UNIV_EXPECT(err, DB_SUCCESS)) { case DB_SUCCESS: @@ -342,6 +381,7 @@ row_purge_remove_sec_if_poss_tree( func_exit: btr_pcur_close(&pcur); +func_exit_no_pcur: mtr_commit(&mtr); return(success); @@ -350,9 +390,10 @@ func_exit: /*************************************************************** Removes a secondary index entry without modifying the index tree, if possible. -@return TRUE if success or if not found */ -static -ibool +@retval true if success or if not found +@retval false if row_purge_remove_sec_if_poss_tree() should be invoked */ +static __attribute__((nonnull, warn_unused_result)) +bool row_purge_remove_sec_if_poss_leaf( /*==============================*/ purge_node_t* node, /*!< in: row purge node */ @@ -361,12 +402,40 @@ row_purge_remove_sec_if_poss_leaf( { mtr_t mtr; btr_pcur_t pcur; + ulint mode; enum row_search_result search_result; + bool success = true; log_free_check(); mtr_start(&mtr); + if (*index->name == TEMP_INDEX_PREFIX) { + /* The index->online_status may change if the + index->name starts with TEMP_INDEX_PREFIX (meaning + that the index is or was being created online). It is + protected by index->lock. */ + mtr_s_lock(dict_index_get_lock(index), &mtr); + + if (dict_index_is_online_ddl(index)) { + /* Online secondary index creation will not + copy any delete-marked records. Therefore + there is nothing to be purged. 
We must also + skip the purge when a completed index is + dropped by rollback_inplace_alter_table(). */ + goto func_exit_no_pcur; + } + + mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED | BTR_DELETE; + } else { + /* For secondary indexes, + index->online_status==ONLINE_INDEX_CREATION unless + index->name starts with TEMP_INDEX_PREFIX. */ + ut_ad(!dict_index_is_online_ddl(index)); + + mode = BTR_MODIFY_LEAF | BTR_DELETE; + } + /* Set the purge node for the call to row_purge_poss_sec(). */ pcur.btr_cur.purge_node = node; /* Set the query thread, so that ibuf_insert_low() will be @@ -374,10 +443,9 @@ row_purge_remove_sec_if_poss_leaf( pcur.btr_cur.thr = static_cast<que_thr_t*>(que_node_get_parent(node)); search_result = row_search_index_entry( - index, entry, BTR_MODIFY_LEAF | BTR_DELETE, &pcur, &mtr); + index, entry, mode, &pcur, &mtr); switch (search_result) { - ibool success; case ROW_FOUND: /* Before attempting to purge a record, check if it is safe to do so. */ @@ -390,11 +458,10 @@ row_purge_remove_sec_if_poss_leaf( btr_cur_get_rec(btr_cur), dict_table_is_comp(index->table))); - if (!btr_cur_optimistic_delete(btr_cur, &mtr)) { + if (!btr_cur_optimistic_delete(btr_cur, 0, &mtr)) { /* The index entry could not be deleted. */ - success = FALSE; - goto func_exit; + success = false; } } /* fall through (the index entry is still needed, @@ -405,9 +472,8 @@ row_purge_remove_sec_if_poss_leaf( /* The deletion was buffered. */ case ROW_NOT_FOUND: /* The index entry does not exist, nothing to do. */ - success = TRUE; - func_exit: btr_pcur_close(&pcur); + func_exit_no_pcur: mtr_commit(&mtr); return(success); } @@ -418,19 +484,26 @@ row_purge_remove_sec_if_poss_leaf( /***********************************************************//** Removes a secondary index entry if possible. 
*/ -UNIV_INLINE +UNIV_INLINE __attribute__((nonnull(1,2))) void row_purge_remove_sec_if_poss( /*=========================*/ purge_node_t* node, /*!< in: row purge node */ dict_index_t* index, /*!< in: index */ - dtuple_t* entry) /*!< in: index entry */ + const dtuple_t* entry) /*!< in: index entry */ { ibool success; ulint n_tries = 0; /* fputs("Purge: Removing secondary record\n", stderr); */ + if (!entry) { + /* The node->row must have lacked some fields of this + index. This is possible when the undo log record was + written before this index was created. */ + return; + } + if (row_purge_remove_sec_if_poss_leaf(node, index, entry)) { return; @@ -454,18 +527,18 @@ retry: } /***********************************************************//** -Purges a delete marking of a record. */ -static -void +Purges a delete marking of a record. +@retval true if the row was not found, or it was successfully removed +@retval false the purge needs to be suspended, either because of +running out of file space or because the row refers to an off-page +column and an online ALTER TABLE (table rebuild) is in progress. 
*/ +static __attribute__((nonnull, warn_unused_result)) +bool row_purge_del_mark( /*===============*/ - purge_node_t* node) /*!< in: row purge node */ + purge_node_t* node) /*!< in/out: row purge node */ { mem_heap_t* heap; - dtuple_t* entry; - dict_index_t* index; - - ut_ad(node); heap = mem_heap_create(1024); @@ -477,13 +550,11 @@ row_purge_del_mark( break; } - index = node->index; - if (node->index->type != DICT_FTS) { - /* Build the index entry */ - entry = row_build_index_entry(node->row, NULL, index, heap); - ut_a(entry); - row_purge_remove_sec_if_poss(node, index, entry); + dtuple_t* entry = row_build_index_entry_low( + node->row, NULL, node->index, heap); + row_purge_remove_sec_if_poss(node, node->index, entry); + mem_heap_empty(heap); } node->index = dict_table_get_next_index(node->index); @@ -491,14 +562,15 @@ row_purge_del_mark( mem_heap_free(heap); - row_purge_remove_clust_if_poss(node); + return(row_purge_remove_clust_if_poss(node)); } /***********************************************************//** Purges an update of an existing record. Also purges an update of a delete -marked record if that record contained an externally stored field. */ -static -void +marked record if that record contained an externally stored field. 
+@return true if purged, false if skipped */ +static __attribute__((nonnull, warn_unused_result)) +bool row_purge_upd_exist_or_extern_func( /*===============================*/ #ifdef UNIV_DEBUG @@ -508,16 +580,24 @@ row_purge_upd_exist_or_extern_func( trx_undo_rec_t* undo_rec) /*!< in: record to purge */ { mem_heap_t* heap; - dtuple_t* entry; - dict_index_t* index; - ibool is_insert; - ulint rseg_id; - ulint page_no; - ulint offset; - ulint i; - mtr_t mtr; - ut_ad(node); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + if (dict_index_get_online_status(dict_table_get_first_index( + node->table)) + == ONLINE_INDEX_CREATION) { + for (ulint i = 0; i < upd_get_n_fields(node->update); i++) { + + const upd_field_t* ufield + = upd_get_nth_field(node->update, i); + + if (dfield_is_ext(&ufield->new_val)) { + return(false); + } + } + } if (node->rec_type == TRX_UNDO_UPD_DEL_REC || (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { @@ -534,15 +614,13 @@ row_purge_upd_exist_or_extern_func( break; } - index = node->index; - if (row_upd_changes_ord_field_binary(node->index, node->update, thr, NULL, NULL)) { /* Build the older version of the index entry */ - entry = row_build_index_entry(node->row, NULL, - index, heap); - ut_a(entry); - row_purge_remove_sec_if_poss(node, index, entry); + dtuple_t* entry = row_build_index_entry_low( + node->row, NULL, node->index, heap); + row_purge_remove_sec_if_poss(node, node->index, entry); + mem_heap_empty(heap); } node->index = dict_table_get_next_index(node->index); @@ -552,7 +630,7 @@ row_purge_upd_exist_or_extern_func( skip_secondaries: /* Free possible externally stored fields */ - for (i = 0; i < upd_get_n_fields(node->update); i++) { + for (ulint i = 0; i < upd_get_n_fields(node->update); i++) { const upd_field_t* ufield = upd_get_nth_field(node->update, i); @@ -562,6 +640,12 @@ skip_secondaries: buf_block_t* block; ulint internal_offset; byte* data_field; + dict_index_t* 
index; + ibool is_insert; + ulint rseg_id; + ulint page_no; + ulint offset; + mtr_t mtr; /* We use the fact that new_val points to undo_rec and get thus the offset of @@ -590,9 +674,17 @@ skip_secondaries: index tree */ index = dict_table_get_first_index(node->table); - mtr_x_lock(dict_index_get_lock(index), &mtr); - +#ifdef UNIV_DEBUG + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_CREATION: + case ONLINE_INDEX_ABORTED_DROPPED: + ut_ad(0); + case ONLINE_INDEX_COMPLETE: + case ONLINE_INDEX_ABORTED: + break; + } +#endif /* UNIV_DEBUG */ /* NOTE: we must also acquire an X-latch to the root page of the tree. We will need it when we free pages from the tree. If the tree is of height 1, @@ -622,6 +714,8 @@ skip_secondaries: mtr_commit(&mtr); } } + + return(true); } #ifdef UNIV_DEBUG @@ -634,14 +728,14 @@ skip_secondaries: /***********************************************************//** Parses the row reference and other info in a modify undo log record. -@return TRUE if purge operation required */ +@return true if purge operation required */ static -ibool +bool row_purge_parse_undo_rec( /*=====================*/ purge_node_t* node, /*!< in: row undo node */ trx_undo_rec_t* undo_rec, /*!< in: record to purge */ - ibool* updated_extern, /*!< out: TRUE if an externally + bool* updated_extern, /*!< out: true if an externally stored field was updated */ que_thr_t* thr) /*!< in: query thread */ { @@ -665,40 +759,29 @@ row_purge_parse_undo_rec( if (type == TRX_UNDO_UPD_DEL_REC && !*updated_extern) { - return(FALSE); + return(false); } ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, &info_bits); node->table = NULL; - if (type == TRX_UNDO_UPD_EXIST_REC - && node->cmpl_info & UPD_NODE_NO_ORD_CHANGE - && !(*updated_extern)) { - - /* Purge requires no changes to indexes: we may return */ - - return(FALSE); - } - /* Prevent DROP TABLE etc. 
from running when we are doing the purge for this row */ - rw_lock_s_lock_func(&dict_operation_lock, 0, __FILE__, __LINE__); + rw_lock_s_lock_inline(&dict_operation_lock, 0, __FILE__, __LINE__); - node->table = dict_table_open_on_id(table_id, FALSE); + node->table = dict_table_open_on_id(table_id, FALSE, FALSE); if (node->table == NULL) { -err_exit: /* The table has been dropped: no need to do purge */ - rw_lock_s_unlock_gen(&dict_operation_lock, 0); - return(FALSE); + goto err_exit; } if (node->table->ibd_file_missing) { /* We skip purge of missing .ibd files */ - dict_table_close(node->table, FALSE); + dict_table_close(node->table, FALSE, FALSE); node->table = NULL; @@ -708,12 +791,22 @@ err_exit: clust_index = dict_table_get_first_index(node->table); if (clust_index == NULL) { + /* The table was corrupt in the data dictionary. + dict_set_corrupted() works on an index, and + we do not have an index to call it with. */ +close_exit: + dict_table_close(node->table, FALSE, FALSE); +err_exit: + rw_lock_s_unlock(&dict_operation_lock); + return(false); + } - dict_table_close(node->table, FALSE); - - /* The table was corrupt in the data dictionary */ + if (type == TRX_UNDO_UPD_EXIST_REC + && (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) + && !*updated_extern) { - goto err_exit; + /* Purge requires no changes to indexes: we may return */ + goto close_exit; } ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref), @@ -734,13 +827,14 @@ err_exit: node->heap); } - return(TRUE); + return(true); } /***********************************************************//** -Purges the parsed record. */ -static -void +Purges the parsed record. 
+@return true if purged, false if skipped */ +static __attribute__((nonnull, warn_unused_result)) +bool row_purge_record_func( /*==================*/ purge_node_t* node, /*!< in: row purge node */ @@ -748,10 +842,11 @@ row_purge_record_func( #ifdef UNIV_DEBUG const que_thr_t*thr, /*!< in: query thread */ #endif /* UNIV_DEBUG */ - ibool updated_extern) /*!< in: TRUE if external columns + bool updated_extern) /*!< in: whether external columns were updated */ { dict_index_t* clust_index; + bool purged = true; clust_index = dict_table_get_first_index(node->table); @@ -759,7 +854,10 @@ row_purge_record_func( switch (node->rec_type) { case TRX_UNDO_DEL_MARK_REC: - row_purge_del_mark(node); + purged = row_purge_del_mark(node); + if (!purged) { + break; + } MONITOR_INC(MONITOR_N_DEL_ROW_PURGE); break; default: @@ -768,20 +866,25 @@ row_purge_record_func( } /* fall through */ case TRX_UNDO_UPD_EXIST_REC: - row_purge_upd_exist_or_extern(thr, node, undo_rec); + purged = row_purge_upd_exist_or_extern(thr, node, undo_rec); + if (!purged) { + break; + } MONITOR_INC(MONITOR_N_UPD_EXIST_EXTERN); break; } if (node->found_clust) { btr_pcur_close(&node->pcur); + node->found_clust = FALSE; } if (node->table != NULL) { - dict_table_close(node->table, FALSE); + dict_table_close(node->table, FALSE, FALSE); node->table = NULL; } + return(purged); } #ifdef UNIV_DEBUG @@ -804,18 +907,24 @@ row_purge( trx_undo_rec_t* undo_rec, /*!< in: record to purge */ que_thr_t* thr) /*!< in: query thread */ { - ut_ad(node); - ut_ad(thr); - if (undo_rec != &trx_purge_dummy_rec) { - ibool updated_extern; + bool updated_extern; - if (row_purge_parse_undo_rec( - node, undo_rec, &updated_extern, thr)) { + while (row_purge_parse_undo_rec( + node, undo_rec, &updated_extern, thr)) { - row_purge_record(node, undo_rec, thr, updated_extern); + bool purged = row_purge_record( + node, undo_rec, thr, updated_extern); + + rw_lock_s_unlock(&dict_operation_lock); + + if (purged + || srv_shutdown_state != 
SRV_SHUTDOWN_NONE) { + return; + } - rw_lock_s_unlock_gen(&dict_operation_lock, 0); + /* Retry the purge in a second. */ + os_thread_sleep(1000000); } } } diff --git a/storage/innobase/row/row0quiesce.cc b/storage/innobase/row/row0quiesce.cc new file mode 100644 index 00000000000..72e0bf43d77 --- /dev/null +++ b/storage/innobase/row/row0quiesce.cc @@ -0,0 +1,702 @@ +/***************************************************************************** + +Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0quiesce.cc +Quiesce a tablespace. + +Created 2012-02-08 by Sunny Bains. +*******************************************************/ + +#include "row0quiesce.h" +#include "row0mysql.h" + +#ifdef UNIV_NONINL +#include "row0quiesce.ic" +#endif + +#include "ibuf0ibuf.h" +#include "srv0start.h" +#include "trx0purge.h" + +/*********************************************************************//** +Write the meta data (index user fields) config file. +@return DB_SUCCESS or error code. 
*/ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_quiesce_write_index_fields( +/*===========================*/ + const dict_index_t* index, /*!< in: write the meta data for + this index */ + FILE* file, /*!< in: file to write to */ + THD* thd) /*!< in/out: session */ +{ + byte row[sizeof(ib_uint32_t) * 2]; + + for (ulint i = 0; i < index->n_fields; ++i) { + byte* ptr = row; + const dict_field_t* field = &index->fields[i]; + + mach_write_to_4(ptr, field->prefix_len); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, field->fixed_len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_9", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing index fields."); + + return(DB_IO_ERROR); + } + + /* Include the NUL byte in the length. */ + ib_uint32_t len = strlen(field->name) + 1; + ut_a(len > 1); + + mach_write_to_4(row, len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_10", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(len), file) != sizeof(len) + || fwrite(field->name, 1, len, file) != len) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing index column."); + + return(DB_IO_ERROR); + } + } + + return(DB_SUCCESS); +} + +/*********************************************************************//** +Write the meta data config file index information. +@return DB_SUCCESS or error code. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_quiesce_write_indexes( +/*======================*/ + const dict_table_t* table, /*!< in: write the meta data for + this table */ + FILE* file, /*!< in: file to write to */ + THD* thd) /*!< in/out: session */ +{ + { + byte row[sizeof(ib_uint32_t)]; + + /* Write the number of indexes in the table. 
*/ + mach_write_to_4(row, UT_LIST_GET_LEN(table->indexes)); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_11", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing index count."); + + return(DB_IO_ERROR); + } + } + + dberr_t err = DB_SUCCESS; + + /* Write the index meta data. */ + for (const dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index != 0 && err == DB_SUCCESS; + index = UT_LIST_GET_NEXT(indexes, index)) { + + byte* ptr; + byte row[sizeof(index_id_t) + + sizeof(ib_uint32_t) * 8]; + + ptr = row; + + ut_ad(sizeof(index_id_t) == 8); + mach_write_to_8(ptr, index->id); + ptr += sizeof(index_id_t); + + mach_write_to_4(ptr, index->space); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, index->page); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, index->type); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, index->trx_id_offset); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, index->n_user_defined_cols); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, index->n_uniq); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, index->n_nullable); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, index->n_fields); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_12", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing index meta-data."); + + return(DB_IO_ERROR); + } + + /* Write the length of the index name. + NUL byte is included in the length. 
*/ + ib_uint32_t len = strlen(index->name) + 1; + ut_a(len > 1); + + mach_write_to_4(row, len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_1", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(len), file) != sizeof(len) + || fwrite(index->name, 1, len, file) != len) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing index name."); + + return(DB_IO_ERROR); + } + + err = row_quiesce_write_index_fields(index, file, thd); + } + + return(err); +} + +/*********************************************************************//** +Write the meta data (table columns) config file. Serialise the contents of +dict_col_t structure, along with the column name. All fields are serialized +as ib_uint32_t. +@return DB_SUCCESS or error code. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_quiesce_write_table( +/*====================*/ + const dict_table_t* table, /*!< in: write the meta data for + this table */ + FILE* file, /*!< in: file to write to */ + THD* thd) /*!< in/out: session */ +{ + dict_col_t* col; + byte row[sizeof(ib_uint32_t) * 7]; + + col = table->cols; + + for (ulint i = 0; i < table->n_cols; ++i, ++col) { + byte* ptr = row; + + mach_write_to_4(ptr, col->prtype); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->mtype); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->len); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->mbminmaxlen); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->ind); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->ord_part); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->max_prefix); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_2", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing table column data."); + + return(DB_IO_ERROR); + } + + /* Write out the 
column name as [len, byte array]. The len + includes the NUL byte. */ + ib_uint32_t len; + const char* col_name; + + col_name = dict_table_get_col_name(table, dict_col_get_no(col)); + + /* Include the NUL byte in the length. */ + len = strlen(col_name) + 1; + ut_a(len > 1); + + mach_write_to_4(row, len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_3", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(len), file) != sizeof(len) + || fwrite(col_name, 1, len, file) != len) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing column name."); + + return(DB_IO_ERROR); + } + } + + return(DB_SUCCESS); +} + +/*********************************************************************//** +Write the meta data config file header. +@return DB_SUCCESS or error code. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_quiesce_write_header( +/*=====================*/ + const dict_table_t* table, /*!< in: write the meta data for + this table */ + FILE* file, /*!< in: file to write to */ + THD* thd) /*!< in/out: session */ +{ + byte value[sizeof(ib_uint32_t)]; + + /* Write the meta-data version number. */ + mach_write_to_4(value, IB_EXPORT_CFG_VERSION_V1); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_4", close(fileno(file));); + + if (fwrite(&value, 1, sizeof(value), file) != sizeof(value)) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing meta-data version number."); + + return(DB_IO_ERROR); + } + + /* Write the server hostname. */ + ib_uint32_t len; + const char* hostname = server_get_hostname(); + + /* Play it safe and check for NULL. */ + if (hostname == 0) { + static const char NullHostname[] = "Hostname unknown"; + + ib_logf(IB_LOG_LEVEL_WARN, + "Unable to determine server hostname."); + + hostname = NullHostname; + } + + /* The server hostname includes the NUL byte. 
*/ + len = strlen(hostname) + 1; + mach_write_to_4(value, len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_5", close(fileno(file));); + + if (fwrite(&value, 1, sizeof(value), file) != sizeof(value) + || fwrite(hostname, 1, len, file) != len) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing hostname."); + + return(DB_IO_ERROR); + } + + /* The table name includes the NUL byte. */ + ut_a(table->name != 0); + len = strlen(table->name) + 1; + + /* Write the table name. */ + mach_write_to_4(value, len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_6", close(fileno(file));); + + if (fwrite(&value, 1, sizeof(value), file) != sizeof(value) + || fwrite(table->name, 1, len, file) != len) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing table name."); + + return(DB_IO_ERROR); + } + + byte row[sizeof(ib_uint32_t) * 3]; + + /* Write the next autoinc value. */ + mach_write_to_8(row, table->autoinc); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_7", close(fileno(file));); + + if (fwrite(row, 1, sizeof(ib_uint64_t), file) != sizeof(ib_uint64_t)) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing table autoinc value."); + + return(DB_IO_ERROR); + } + + byte* ptr = row; + + /* Write the system page size. */ + mach_write_to_4(ptr, UNIV_PAGE_SIZE); + ptr += sizeof(ib_uint32_t); + + /* Write the table->flags. */ + mach_write_to_4(ptr, table->flags); + ptr += sizeof(ib_uint32_t); + + /* Write the number of columns in the table. 
*/ + mach_write_to_4(ptr, table->n_cols); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_8", close(fileno(file));); + + if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing table meta-data."); + + return(DB_IO_ERROR); + } + + return(DB_SUCCESS); +} + +/*********************************************************************//** +Write the table meta data after quiesce. +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_quiesce_write_cfg( +/*==================*/ + dict_table_t* table, /*!< in: write the meta data for + this table */ + THD* thd) /*!< in/out: session */ +{ + dberr_t err; + char name[OS_FILE_MAX_PATH]; + + srv_get_meta_data_filename(table, name, sizeof(name)); + + ib_logf(IB_LOG_LEVEL_INFO, "Writing table metadata to '%s'", name); + + FILE* file = fopen(name, "w+b"); + + if (file == NULL) { + ib_errf(thd, IB_LOG_LEVEL_WARN, ER_CANT_CREATE_FILE, + name, errno, strerror(errno)); + + err = DB_IO_ERROR; + } else { + err = row_quiesce_write_header(table, file, thd); + + if (err == DB_SUCCESS) { + err = row_quiesce_write_table(table, file, thd); + } + + if (err == DB_SUCCESS) { + err = row_quiesce_write_indexes(table, file, thd); + } + + if (fflush(file) != 0) { + + char msg[BUFSIZ]; + + ut_snprintf(msg, sizeof(msg), "%s flush() failed", + name); + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), msg); + } + + if (fclose(file) != 0) { + char msg[BUFSIZ]; + + ut_snprintf(msg, sizeof(msg), "%s flose() failed", + name); + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), msg); + } + } + + return(err); +} + +/*********************************************************************//** +Check whether a table has an FTS index defined on it. 
+@return true if an FTS index exists on the table */ +static +bool +row_quiesce_table_has_fts_index( +/*============================*/ + const dict_table_t* table) /*!< in: quiesce this table */ +{ + bool exists = false; + + dict_mutex_enter_for_mysql(); + + for (const dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index != 0; + index = UT_LIST_GET_NEXT(indexes, index)) { + + if (index->type & DICT_FTS) { + exists = true; + break; + } + } + + dict_mutex_exit_for_mysql(); + + return(exists); +} + +/*********************************************************************//** +Quiesce the tablespace that the table resides in. */ +UNIV_INTERN +void +row_quiesce_table_start( +/*====================*/ + dict_table_t* table, /*!< in: quiesce this table */ + trx_t* trx) /*!< in/out: transaction/session */ +{ + ut_a(trx->mysql_thd != 0); + ut_a(srv_n_purge_threads > 0); + ut_ad(!srv_read_only_mode); + + char table_name[MAX_FULL_NAME_LEN + 1]; + + ut_a(trx->mysql_thd != 0); + + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); + + ib_logf(IB_LOG_LEVEL_INFO, + "Sync to disk of '%s' started.", table_name); + + if (trx_purge_state() != PURGE_STATE_DISABLED) { + trx_purge_stop(); + } + + ut_a(table->id > 0); + + ulint count = 0; + + while (ibuf_contract_in_background(table->id, TRUE) != 0) { + if (!(++count % 20)) { + ib_logf(IB_LOG_LEVEL_INFO, + "Merging change buffer entries for '%s'", + table_name); + } + } + + if (!trx_is_interrupted(trx)) { + buf_LRU_flush_or_remove_pages( + table->space, BUF_REMOVE_FLUSH_WRITE, trx); + + if (trx_is_interrupted(trx)) { + + ib_logf(IB_LOG_LEVEL_WARN, "Quiesce aborted!"); + + } else if (row_quiesce_write_cfg(table, trx->mysql_thd) + != DB_SUCCESS) { + + ib_logf(IB_LOG_LEVEL_WARN, + "There was an error writing to the " + "meta data file"); + } else { + ib_logf(IB_LOG_LEVEL_INFO, + "Table '%s' flushed to disk", table_name); + } + } else { + ib_logf(IB_LOG_LEVEL_WARN, "Quiesce aborted!"); + } + + dberr_t err = 
row_quiesce_set_state(table, QUIESCE_COMPLETE, trx); + ut_a(err == DB_SUCCESS); +} + +/*********************************************************************//** +Cleanup after table quiesce. */ +UNIV_INTERN +void +row_quiesce_table_complete( +/*=======================*/ + dict_table_t* table, /*!< in: quiesce this table */ + trx_t* trx) /*!< in/out: transaction/session */ +{ + ulint count = 0; + char table_name[MAX_FULL_NAME_LEN + 1]; + + ut_a(trx->mysql_thd != 0); + + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); + + /* We need to wait for the operation to complete if the + transaction has been killed. */ + + while (table->quiesce != QUIESCE_COMPLETE) { + + /* Print a warning after every minute. */ + if (!(count % 60)) { + ib_logf(IB_LOG_LEVEL_WARN, + "Waiting for quiesce of '%s' to complete", + table_name); + } + + /* Sleep for a second. */ + os_thread_sleep(1000000); + + ++count; + } + + /* Remove the .cfg file now that the user has resumed + normal operations. Otherwise it will cause problems when + the user tries to drop the database (remove directory). */ + char cfg_name[OS_FILE_MAX_PATH]; + + srv_get_meta_data_filename(table, cfg_name, sizeof(cfg_name)); + + os_file_delete_if_exists(cfg_name); + + ib_logf(IB_LOG_LEVEL_INFO, + "Deleting the meta-data file '%s'", cfg_name); + + if (trx_purge_state() != PURGE_STATE_DISABLED) { + trx_purge_run(); + } + + dberr_t err = row_quiesce_set_state(table, QUIESCE_NONE, trx); + ut_a(err == DB_SUCCESS); +} + +/*********************************************************************//** +Set a table's quiesce state. +@return DB_SUCCESS or error code. 
*/ +UNIV_INTERN +dberr_t +row_quiesce_set_state( +/*==================*/ + dict_table_t* table, /*!< in: quiesce this table */ + ib_quiesce_t state, /*!< in: quiesce state to set */ + trx_t* trx) /*!< in/out: transaction */ +{ + ut_a(srv_n_purge_threads > 0); + + if (srv_read_only_mode) { + + ib_senderrf(trx->mysql_thd, + IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + + return(DB_UNSUPPORTED); + + } else if (table->space == TRX_SYS_SPACE) { + + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN, + ER_TABLE_IN_SYSTEM_TABLESPACE, table_name); + + return(DB_UNSUPPORTED); + } else if (row_quiesce_table_has_fts_index(table)) { + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN, + ER_NOT_SUPPORTED_YET, + "FLUSH TABLES on tables that have an FTS index. " + "FTS auxiliary tables will not be flushed."); + + } else if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) { + /* If this flag is set then the table may not have any active + FTS indexes but it will still have the auxiliary tables. */ + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN, + ER_NOT_SUPPORTED_YET, + "FLUSH TABLES on a table that had an FTS index, " + "created on a hidden column, the " + "auxiliary tables haven't been dropped as yet. 
" + "FTS auxiliary tables will not be flushed."); + } + + row_mysql_lock_data_dictionary(trx); + + dict_table_x_lock_indexes(table); + + switch (state) { + case QUIESCE_START: + ut_a(table->quiesce == QUIESCE_NONE); + break; + + case QUIESCE_COMPLETE: + ut_a(table->quiesce == QUIESCE_START); + break; + + case QUIESCE_NONE: + ut_a(table->quiesce == QUIESCE_COMPLETE); + break; + } + + table->quiesce = state; + + dict_table_x_unlock_indexes(table); + + row_mysql_unlock_data_dictionary(trx); + + return(DB_SUCCESS); +} + diff --git a/storage/innobase/row/row0row.cc b/storage/innobase/row/row0row.cc index 8c703b1e06c..be786f954fb 100644 --- a/storage/innobase/row/row0row.cc +++ b/storage/innobase/row/row0row.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -50,28 +50,26 @@ Created 4/20/1996 Heikki Tuuri /*****************************************************************//** When an insert or purge to a table is performed, this function builds the entry to be inserted into or purged from an index on the table. -@return index entry which should be inserted or purged, or NULL if the -externally stored columns in the clustered index record are -unavailable and ext != NULL */ +@return index entry which should be inserted or purged +@retval NULL if the externally stored columns in the clustered index record +are unavailable and ext != NULL, or row is missing some needed columns. 
*/ UNIV_INTERN dtuple_t* -row_build_index_entry( -/*==================*/ - const dtuple_t* row, /*!< in: row which should be - inserted or purged */ - row_ext_t* ext, /*!< in: externally stored column prefixes, - or NULL */ - dict_index_t* index, /*!< in: index on the table */ - mem_heap_t* heap) /*!< in: memory heap from which the memory for - the index entry is allocated */ +row_build_index_entry_low( +/*======================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory for the index entry + is allocated */ { dtuple_t* entry; ulint entry_len; ulint i; - ut_ad(row && index && heap); - ut_ad(dtuple_check_typed(row)); - entry_len = dict_index_get_n_fields(index); entry = dtuple_create(heap, entry_len); @@ -96,8 +94,19 @@ row_build_index_entry( = dtuple_get_nth_field(entry, i); const dfield_t* dfield2 = dtuple_get_nth_field(row, col_no); - ulint len - = dfield_get_len(dfield2); + ulint len; + +#if DATA_MISSING != 0 +# error "DATA_MISSING != 0" +#endif + if (UNIV_UNLIKELY(dfield_get_type(dfield2)->mtype + == DATA_MISSING)) { + /* The field has not been initialized in the row. + This should be from trx_undo_rec_get_partial_row(). 
*/ + return(NULL); + } + + len = dfield_get_len(dfield2); dfield_copy(dfield, dfield2); @@ -171,8 +180,6 @@ row_build_index_entry( } } - ut_ad(dtuple_check_typed(entry)); - return(entry); } @@ -211,21 +218,23 @@ row_build( of an index, or NULL if index->table should be consulted instead */ + const dtuple_t* add_cols, + /*!< in: default values of + added columns, or NULL */ + const ulint* col_map,/*!< in: mapping of old column + numbers to new ones, or NULL */ row_ext_t** ext, /*!< out, own: cache of externally stored column prefixes, or NULL */ mem_heap_t* heap) /*!< in: memory heap from which the memory needed is allocated */ { + const byte* copy; dtuple_t* row; - const dict_table_t* table; - ulint n_fields; ulint n_ext_cols; ulint* ext_cols = NULL; /* remove warning */ ulint len; - ulint row_len; byte* buf; - ulint i; ulint j; mem_heap_t* tmp_heap = NULL; ulint offsets_[REC_OFFS_NORMAL_SIZE]; @@ -234,6 +243,7 @@ row_build( ut_ad(index && rec && heap); ut_ad(dict_index_is_clust(index)); ut_ad(!mutex_own(&trx_sys->mutex)); + ut_ad(!col_map || col_table); if (!offsets) { offsets = rec_get_offsets(rec, index, offsets_, @@ -260,55 +270,84 @@ row_build( buf = static_cast<byte*>( mem_heap_alloc(heap, rec_offs_size(offsets))); - rec = rec_copy(buf, rec, offsets); - /* Avoid a debug assertion in rec_offs_validate(). */ - rec_offs_make_valid(rec, index, (ulint*) offsets); + copy = rec_copy(buf, rec, offsets); + } else { + copy = rec; } - table = index->table; - row_len = dict_table_get_n_cols(table); - - row = dtuple_create(heap, row_len); - - dict_table_copy_types(row, table); - - dtuple_set_info_bits(row, rec_get_info_bits( - rec, dict_table_is_comp(table))); - - n_fields = rec_offs_n_fields(offsets); n_ext_cols = rec_offs_n_extern(offsets); if (n_ext_cols) { ext_cols = static_cast<ulint*>( mem_heap_alloc(heap, n_ext_cols * sizeof *ext_cols)); } - for (i = j = 0; i < n_fields; i++) { - dict_field_t* ind_field + /* Avoid a debug assertion in rec_offs_validate(). 
*/ + rec_offs_make_valid(copy, index, const_cast<ulint*>(offsets)); + + if (!col_table) { + ut_ad(!col_map); + ut_ad(!add_cols); + col_table = index->table; + } + + if (add_cols) { + ut_ad(col_map); + row = dtuple_copy(add_cols, heap); + /* dict_table_copy_types() would set the fields to NULL */ + for (ulint i = 0; i < dict_table_get_n_cols(col_table); i++) { + dict_col_copy_type( + dict_table_get_nth_col(col_table, i), + dfield_get_type(dtuple_get_nth_field(row, i))); + } + } else { + row = dtuple_create(heap, dict_table_get_n_cols(col_table)); + dict_table_copy_types(row, col_table); + } + + dtuple_set_info_bits(row, rec_get_info_bits( + copy, rec_offs_comp(offsets))); + + j = 0; + + for (ulint i = 0; i < rec_offs_n_fields(offsets); i++) { + const dict_field_t* ind_field = dict_index_get_nth_field(index, i); + + if (ind_field->prefix_len) { + /* Column prefixes can only occur in key + fields, which cannot be stored externally. For + a column prefix, there should also be the full + field in the clustered index tuple. The row + tuple comprises full fields, not prefixes. 
*/ + ut_ad(!rec_offs_nth_extern(offsets, i)); + continue; + } + const dict_col_t* col = dict_field_get_col(ind_field); ulint col_no = dict_col_get_no(col); - dfield_t* dfield - = dtuple_get_nth_field(row, col_no); - - if (ind_field->prefix_len == 0) { - const byte* field = rec_get_nth_field( - rec, offsets, i, &len); + if (col_map) { + col_no = col_map[col_no]; - dfield_set_data(dfield, field, len); + if (col_no == ULINT_UNDEFINED) { + /* dropped column */ + continue; + } } + dfield_t* dfield = dtuple_get_nth_field(row, col_no); + + const byte* field = rec_get_nth_field( + copy, offsets, i, &len); + + dfield_set_data(dfield, field, len); + if (rec_offs_nth_extern(offsets, i)) { dfield_set_ext(dfield); - if (UNIV_LIKELY_NULL(col_table)) { - ut_a(col_no - < dict_table_get_n_cols(col_table)); - col = dict_table_get_nth_col( - col_table, col_no); - } + col = dict_table_get_nth_col(col_table, col_no); if (col->ord_part) { /* We will have to fetch prefixes of @@ -319,14 +358,20 @@ row_build( } } + rec_offs_make_valid(rec, index, const_cast<ulint*>(offsets)); + ut_ad(dtuple_check_typed(row)); if (!ext) { /* REDUNDANT and COMPACT formats store a local 768-byte prefix of each externally stored - column. No cache is needed. */ - ut_ad(dict_table_get_format(index->table) - < UNIV_FORMAT_B); + column. No cache is needed. + + During online table rebuild, + row_log_table_apply_delete_low() + may use a cache that was set up by + row_log_table_delete(). */ + } else if (j) { *ext = row_ext_create(j, ext_cols, index->table->flags, row, heap); @@ -402,28 +447,14 @@ row_rec_to_index_entry_low( /*******************************************************************//** Converts an index record to a typed data tuple. NOTE that externally stored (often big) fields are NOT copied to heap. -@return own: index entry built; see the NOTE below! 
*/ +@return own: index entry built */ UNIV_INTERN dtuple_t* row_rec_to_index_entry( /*===================*/ - ulint type, /*!< in: ROW_COPY_DATA, or - ROW_COPY_POINTERS: the former - copies also the data fields to - heap as the latter only places - pointers to data fields on the - index page */ - const rec_t* rec, /*!< in: record in the index; - NOTE: in the case - ROW_COPY_POINTERS the data - fields in the row will point - directly into this record, - therefore, the buffer page of - this record must be at least - s-latched and the latch held - as long as the dtuple is used! */ + const rec_t* rec, /*!< in: record in the index */ const dict_index_t* index, /*!< in: index */ - ulint* offsets,/*!< in/out: rec_get_offsets(rec) */ + const ulint* offsets,/*!< in: rec_get_offsets(rec) */ ulint* n_ext, /*!< out: number of externally stored columns */ mem_heap_t* heap) /*!< in: memory heap from which @@ -431,25 +462,21 @@ row_rec_to_index_entry( { dtuple_t* entry; byte* buf; + const rec_t* copy_rec; ut_ad(rec && heap && index); ut_ad(rec_offs_validate(rec, index, offsets)); - if (type == ROW_COPY_DATA) { - /* Take a copy of rec to heap */ - buf = static_cast<byte*>( - mem_heap_alloc(heap, rec_offs_size(offsets))); + /* Take a copy of rec to heap */ + buf = static_cast<byte*>( + mem_heap_alloc(heap, rec_offs_size(offsets))); - rec = rec_copy(buf, rec, offsets); - /* Avoid a debug assertion in rec_offs_validate(). 
*/ - rec_offs_make_valid(rec, index, offsets); -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG - } else { - ut_a(!rec_offs_any_null_extern(rec, offsets)); -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ - } + copy_rec = rec_copy(buf, rec, offsets); - entry = row_rec_to_index_entry_low(rec, index, offsets, n_ext, heap); + rec_offs_make_valid(copy_rec, index, const_cast<ulint*>(offsets)); + entry = row_rec_to_index_entry_low( + copy_rec, index, offsets, n_ext, heap); + rec_offs_make_valid(rec, index, const_cast<ulint*>(offsets)); dtuple_set_info_bits(entry, rec_get_info_bits(rec, rec_offs_comp(offsets))); diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc index 96884e89511..bfda669d97a 100644 --- a/storage/innobase/row/row0sel.cc +++ b/storage/innobase/row/row0sel.cc @@ -57,7 +57,6 @@ Created 12/19/1997 Heikki Tuuri #include "read0read.h" #include "buf0lru.h" #include "ha_prototypes.h" -#include "srv0mon.h" #include "my_compare.h" /* enum icp_result */ @@ -673,8 +672,8 @@ sel_enqueue_prefetched_row( /*********************************************************************//** Builds a previous version of a clustered index record for a consistent read @return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_sel_build_prev_vers( /*====================*/ read_view_t* read_view, /*!< in: read view */ @@ -691,7 +690,7 @@ row_sel_build_prev_vers( afterwards */ mtr_t* mtr) /*!< in: mtr */ { - ulint err; + dberr_t err; if (*old_vers_heap) { mem_heap_empty(*old_vers_heap); @@ -707,10 +706,9 @@ row_sel_build_prev_vers( /*********************************************************************//** Builds the last committed version of a clustered index record for a -semi-consistent read. -@return DB_SUCCESS or error code */ -static -ulint +semi-consistent read. 
*/ +static __attribute__((nonnull)) +void row_sel_build_committed_vers_for_mysql( /*===================================*/ dict_index_t* clust_index, /*!< in: clustered index */ @@ -726,18 +724,16 @@ row_sel_build_committed_vers_for_mysql( afterwards */ mtr_t* mtr) /*!< in: mtr */ { - ulint err; - if (prebuilt->old_vers_heap) { mem_heap_empty(prebuilt->old_vers_heap); } else { - prebuilt->old_vers_heap = mem_heap_create(200); + prebuilt->old_vers_heap = mem_heap_create( + rec_offs_size(*offsets)); } - err = row_vers_build_for_semi_consistent_read( + row_vers_build_for_semi_consistent_read( rec, mtr, clust_index, offsets, offset_heap, prebuilt->old_vers_heap, old_vers); - return(err); } /*********************************************************************//** @@ -809,8 +805,8 @@ row_sel_test_other_conds( Retrieves the clustered index record corresponding to a record in a non-clustered index. Does the necessary locking. @return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_sel_get_clust_rec( /*==================*/ sel_node_t* node, /*!< in: select_node */ @@ -828,7 +824,7 @@ row_sel_get_clust_rec( dict_index_t* index; rec_t* clust_rec; rec_t* old_vers; - ulint err; + dberr_t err; mem_heap_t* heap = NULL; ulint offsets_[REC_OFFS_NORMAL_SIZE]; ulint* offsets = offsets_; @@ -982,7 +978,7 @@ err_exit: Sets a lock on a record. 
@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */ UNIV_INLINE -enum db_err +dberr_t sel_set_rec_lock( /*=============*/ const buf_block_t* block, /*!< in: buffer block of rec */ @@ -995,7 +991,7 @@ sel_set_rec_lock( que_thr_t* thr) /*!< in: query thread */ { trx_t* trx; - enum db_err err; + dberr_t err; trx = thr_get_trx(thr); @@ -1084,7 +1080,7 @@ row_sel_open_pcur( (FALSE: no init) */ btr_pcur_open_at_index_side(plan->asc, index, BTR_SEARCH_LEAF, - &(plan->pcur), FALSE, mtr); + &(plan->pcur), false, 0, mtr); } ut_ad(plan->n_rows_prefetched == 0); @@ -1313,8 +1309,8 @@ func_exit: /*********************************************************************//** Performs a select step. @return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_sel( /*====*/ sel_node_t* node, /*!< in: select node */ @@ -1347,7 +1343,7 @@ row_sel( &mtr must be committed before we move to the next non-clustered record */ ulint found_flag; - ulint err; + dberr_t err; mem_heap_t* heap = NULL; ulint offsets_[REC_OFFS_NORMAL_SIZE]; ulint* offsets = offsets_; @@ -2083,11 +2079,9 @@ row_sel_step( table_node = static_cast<sym_node_t*>( que_node_get_next(table_node))) { - enum db_err err; - - err = static_cast<enum db_err>(lock_table( + dberr_t err = lock_table( 0, table_node->table, i_lock_mode, - thr)); + thr); if (err != DB_SUCCESS) { trx_t* trx; @@ -2120,7 +2114,7 @@ row_sel_step( } } - enum db_err err = static_cast<enum db_err>(row_sel(node, thr)); + dberr_t err = row_sel(node, thr); /* NOTE! 
if queries are parallelized, the following assignment may have problems; the assignment should be made only if thr is the @@ -2305,42 +2299,6 @@ row_printf_step( return(thr); } -/******************************************************************** -Creates a key in Innobase dtuple format.*/ - -void -row_create_key( -/*===========*/ - dtuple_t* tuple, /* in: tuple where to build; - NOTE: we assume that the type info - in the tuple is already according - to index! */ - dict_index_t* index, /* in: index of the key value */ - doc_id_t* doc_id) /* in: doc id to search. */ -{ - dtype_t type; - dict_field_t* field; - doc_id_t temp_doc_id; - dfield_t* dfield = dtuple_get_nth_field(tuple, 0); - - ut_a(dict_index_get_n_unique(index) == 1); - - /* Permit us to access any field in the tuple (ULINT_MAX): */ - dtuple_set_n_fields(tuple, ULINT_MAX); - - field = dict_index_get_nth_field(index, 0); - dict_col_copy_type(field->col, &type); - ut_a(dtype_get_mtype(&type) == DATA_INT); - - /* Convert to storage byte order */ - mach_write_to_8((byte*) &temp_doc_id, *doc_id); - *doc_id = temp_doc_id; - - ut_a(sizeof(*doc_id) == field->fixed_len); - dfield_set_data(dfield, doc_id, field->fixed_len); - - dtuple_set_n_fields(tuple, 1); -} /****************************************************************//** Converts a key value stored in MySQL format to an Innobase dtuple. 
The last field of the key value may be just a prefix of a fixed length field: hence @@ -2536,6 +2494,7 @@ row_sel_convert_mysql_key_to_innobase( dfield_set_len(dfield, len - (ulint) (key_ptr - key_end)); } + ut_ad(0); } n_fields++; @@ -3008,8 +2967,8 @@ row_sel_store_mysql_rec( /*********************************************************************//** Builds a previous version of a clustered index record for a consistent read @return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_sel_build_prev_vers_for_mysql( /*==============================*/ read_view_t* read_view, /*!< in: read view */ @@ -3026,7 +2985,7 @@ row_sel_build_prev_vers_for_mysql( afterwards */ mtr_t* mtr) /*!< in: mtr */ { - ulint err; + dberr_t err; if (prebuilt->old_vers_heap) { mem_heap_empty(prebuilt->old_vers_heap); @@ -3045,8 +3004,8 @@ Retrieves the clustered index record corresponding to a record in a non-clustered index. Does the necessary locking. Used in the MySQL interface. 
@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */ -static -enum db_err +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_sel_get_clust_rec_for_mysql( /*============================*/ row_prebuilt_t* prebuilt,/*!< in: prebuilt struct in the handle */ @@ -3073,7 +3032,7 @@ row_sel_get_clust_rec_for_mysql( dict_index_t* clust_index; const rec_t* clust_rec; rec_t* old_vers; - enum db_err err; + dberr_t err; trx_t* trx; *out_rec = NULL; @@ -3172,17 +3131,13 @@ row_sel_get_clust_rec_for_mysql( clust_rec, clust_index, *offsets, trx->read_view)) { - ulint db_err; - /* The following call returns 'offsets' associated with 'old_vers' */ - db_err = row_sel_build_prev_vers_for_mysql( + err = row_sel_build_prev_vers_for_mysql( trx->read_view, clust_index, prebuilt, clust_rec, offsets, offset_heap, &old_vers, mtr); - err = static_cast<enum db_err>(db_err); - if (err != DB_SUCCESS || old_vers == NULL) { goto err_exit; @@ -3226,7 +3181,10 @@ row_sel_get_clust_rec_for_mysql( func_exit: *out_rec = clust_rec; - if (prebuilt->select_lock_type != LOCK_NONE) { + /* Store the current position if select_lock_type is not + LOCK_NONE or if we are scanning using InnoDB APIs */ + if (prebuilt->select_lock_type != LOCK_NONE + || prebuilt->innodb_api) { /* We may use the cursor in update or in unlock_row(): store its position */ @@ -3633,7 +3591,7 @@ row_search_idx_cond_check( return(result); case ICP_ERROR: case ICP_ABORTED_BY_USER: - return(result); + return(result); } ut_error; @@ -3649,7 +3607,7 @@ position and fetch next or fetch prev must not be tried to the cursor! 
@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK, DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */ UNIV_INTERN -ulint +dberr_t row_search_for_mysql( /*=================*/ byte* buf, /*!< in/out: buffer for the fetched @@ -3678,9 +3636,9 @@ row_search_for_mysql( dict_index_t* clust_index; que_thr_t* thr; const rec_t* rec; - const rec_t* result_rec; + const rec_t* result_rec = NULL; const rec_t* clust_rec; - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; ibool unique_search = FALSE; ibool mtr_has_extra_clust_latch = FALSE; ibool moves_up = FALSE; @@ -3701,48 +3659,41 @@ row_search_for_mysql( ulint offsets_[REC_OFFS_NORMAL_SIZE]; ulint* offsets = offsets_; ibool table_lock_waited = FALSE; + byte* next_buf = 0; rec_offs_init(offsets_); ut_ad(index && pcur && search_tuple); - if (UNIV_UNLIKELY(prebuilt->table->ibd_file_missing)) { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error:\n" - "InnoDB: MySQL is trying to use a table handle" - " but the .ibd file for\n" - "InnoDB: table %s does not exist.\n" - "InnoDB: Have you deleted the .ibd file" - " from the database directory under\n" - "InnoDB: the MySQL datadir, or have you used" - " DISCARD TABLESPACE?\n" - "InnoDB: Look from\n" - "InnoDB: " REFMAN "innodb-troubleshooting.html\n" - "InnoDB: how you can resolve the problem.\n", - prebuilt->table->name); + /* We don't support FTS queries from the HANDLER interfaces, because + we implemented FTS as reversed inverted index with auxiliary tables. + So anything related to traditional index query would not apply to + it. 
*/ + if (index->type & DICT_FTS) { + return(DB_END_OF_INDEX); + } #ifdef UNIV_SYNC_DEBUG - ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); + ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); #endif /* UNIV_SYNC_DEBUG */ - return(DB_ERROR); - } - if (UNIV_UNLIKELY(!prebuilt->index_usable)) { + if (dict_table_is_discarded(prebuilt->table)) { + + return(DB_TABLESPACE_DELETED); + + } else if (prebuilt->table->ibd_file_missing) { + + return(DB_TABLESPACE_NOT_FOUND); + + } else if (!prebuilt->index_usable) { -#ifdef UNIV_SYNC_DEBUG - ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); -#endif /* UNIV_SYNC_DEBUG */ return(DB_MISSING_HISTORY); - } - if (dict_index_is_corrupted(index)) { -#ifdef UNIV_SYNC_DEBUG - ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); -#endif /* UNIV_SYNC_DEBUG */ + } else if (dict_index_is_corrupted(index)) { + return(DB_CORRUPTION); - } - if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) { + } else if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) { fprintf(stderr, "InnoDB: Error: trying to free a corrupt\n" "InnoDB: table handle. 
Magic n %lu, table name ", @@ -3846,7 +3797,6 @@ row_search_for_mysql( prebuilt->n_rows_fetched++; - srv_n_rows_read++; err = DB_SUCCESS; goto func_exit; } @@ -3925,7 +3875,8 @@ row_search_for_mysql( && dict_index_is_clust(index) && !prebuilt->templ_contains_blob && !prebuilt->used_in_HANDLER - && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) { + && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8) + && !prebuilt->innodb_api) { mode = PAGE_CUR_GE; @@ -3973,8 +3924,8 @@ row_search_for_mysql( rec, offsets)) { case ICP_NO_MATCH: case ICP_OUT_OF_RANGE: - case ICP_ERROR: case ICP_ABORTED_BY_USER: + case ICP_ERROR: goto shortcut_mismatch; case ICP_MATCH: goto shortcut_match; @@ -4005,8 +3956,6 @@ row_search_for_mysql( /* ut_print_name(stderr, index->name); fputs(" shortcut\n", stderr); */ - srv_n_rows_read++; - err = DB_SUCCESS; goto release_search_latch_if_needed; @@ -4179,12 +4128,12 @@ wait_table_again: /* Try to place a gap lock on the next index record to prevent phantoms in ORDER BY ... DESC queries */ - const rec_t* next = page_rec_get_next_const(rec); + const rec_t* next_rec = page_rec_get_next_const(rec); - offsets = rec_get_offsets(next, index, offsets, + offsets = rec_get_offsets(next_rec, index, offsets, ULINT_UNDEFINED, &heap); err = sel_set_rec_lock(btr_pcur_get_block(pcur), - next, index, offsets, + next_rec, index, offsets, prebuilt->select_lock_type, LOCK_GAP, thr); @@ -4197,16 +4146,10 @@ wait_table_again: goto lock_wait_or_error; } } - } else { - if (mode == PAGE_CUR_G) { - btr_pcur_open_at_index_side( - TRUE, index, BTR_SEARCH_LEAF, pcur, FALSE, - &mtr); - } else if (mode == PAGE_CUR_L) { - btr_pcur_open_at_index_side( - FALSE, index, BTR_SEARCH_LEAF, pcur, FALSE, - &mtr); - } + } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_L) { + btr_pcur_open_at_index_side( + mode == PAGE_CUR_G, index, BTR_SEARCH_LEAF, + pcur, false, 0, &mtr); } rec_loop: @@ -4348,6 +4291,9 @@ wrong_offs: /* Calculate the 'offsets' associated with 'rec' */ + 
ut_ad(fil_page_get_type(btr_pcur_get_page(pcur)) == FIL_PAGE_INDEX); + ut_ad(btr_page_get_index_id(btr_pcur_get_page(pcur)) == index->id); + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); if (UNIV_UNLIKELY(srv_force_recovery > 0)) { @@ -4539,15 +4485,10 @@ no_gap_lock: /* The following call returns 'offsets' associated with 'old_vers' */ - err = row_sel_build_committed_vers_for_mysql( + row_sel_build_committed_vers_for_mysql( clust_index, prebuilt, rec, &offsets, &heap, &old_vers, &mtr); - if (err != DB_SUCCESS) { - - goto lock_wait_or_error; - } - /* Check whether it was a deadlock or not, if not a deadlock and the transaction had to wait then release the lock it is waiting on. */ @@ -4649,8 +4590,8 @@ no_gap_lock: case ICP_NO_MATCH: goto next_rec; case ICP_OUT_OF_RANGE: - case ICP_ERROR: case ICP_ABORTED_BY_USER: + case ICP_ERROR: err = DB_RECORD_NOT_FOUND; goto idx_cond_failed; case ICP_MATCH: @@ -4690,12 +4631,15 @@ locks_ok: delete marked record and the record following it. For now this is applicable only to clustered indexes while - doing a unique search. There is scope for further optimization + doing a unique search except for HANDLER queries because + HANDLER allows NEXT and PREV even in unique search on + clustered index. There is scope for further optimization applicable to unique secondary indexes. 
Current behaviour is to widen the scope of a lock on an already delete marked record if the same record is deleted twice by the same transaction */ if (index == clust_index && unique_search - && !prebuilt->used_in_HANDLER) { + && !prebuilt->used_in_HANDLER) { + err = DB_RECORD_NOT_FOUND; goto normal_return; @@ -4712,8 +4656,8 @@ locks_ok: } goto next_rec; case ICP_OUT_OF_RANGE: - case ICP_ERROR: case ICP_ABORTED_BY_USER: + case ICP_ERROR: err = DB_RECORD_NOT_FOUND; goto idx_cond_failed; case ICP_MATCH: @@ -4831,9 +4775,10 @@ requires_clust_rec: && !prebuilt->templ_contains_blob && !prebuilt->clust_index_was_generated && !prebuilt->used_in_HANDLER + && !prebuilt->innodb_api && prebuilt->template_type != ROW_MYSQL_DUMMY_TEMPLATE - && !prebuilt->result) { + && !prebuilt->in_fts_query) { /* Inside an update, for example, we do not cache rows, since we may use the cursor position to do the actual @@ -4849,29 +4794,58 @@ requires_clust_rec: /* We only convert from InnoDB row format to MySQL row format when ICP is disabled. */ - if (!prebuilt->idx_cond - && !row_sel_store_mysql_rec( - row_sel_fetch_last_buf(prebuilt), - prebuilt, result_rec, - result_rec != rec, - result_rec != rec ? clust_index : index, - offsets)) { - - /* Only fresh inserts may contain incomplete - externally stored columns. Pretend that such - records do not exist. Such records may only be - accessed at the READ UNCOMMITTED isolation - level or when rolling back a recovered - transaction. Rollback happens at a lower - level, not here. */ - goto next_rec; - } + if (!prebuilt->idx_cond) { - row_sel_enqueue_cache_row_for_mysql(buf, prebuilt); + /* We use next_buf to track the allocation of buffers + where we store and enqueue the buffers for our + pre-fetch optimisation. + + If next_buf == 0 then we store the converted record + directly into the MySQL record buffer (buf). If it is + != 0 then we allocate a pre-fetch buffer and store the + converted record there. 
+ + If the conversion fails and the MySQL record buffer + was not written to then we reset next_buf so that + we can re-use the MySQL record buffer in the next + iteration. */ + + next_buf = next_buf + ? row_sel_fetch_last_buf(prebuilt) : buf; + + if (!row_sel_store_mysql_rec( + next_buf, prebuilt, result_rec, + result_rec != rec, + result_rec != rec ? clust_index : index, + offsets)) { + + if (next_buf == buf) { + ut_a(prebuilt->n_fetch_cached == 0); + next_buf = 0; + } + + /* Only fresh inserts may contain incomplete + externally stored columns. Pretend that such + records do not exist. Such records may only be + accessed at the READ UNCOMMITTED isolation + level or when rolling back a recovered + transaction. Rollback happens at a lower + level, not here. */ + goto next_rec; + } + + if (next_buf != buf) { + row_sel_enqueue_cache_row_for_mysql( + next_buf, prebuilt); + } + } else { + row_sel_enqueue_cache_row_for_mysql(buf, prebuilt); + } if (prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE) { goto next_rec; } + } else { if (UNIV_UNLIKELY (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE)) { @@ -4892,7 +4866,7 @@ requires_clust_rec: rec_offs_size(offsets)); mach_write_to_4(buf, rec_offs_extra_size(offsets) + 4); - } else if (!prebuilt->idx_cond) { + } else if (!prebuilt->idx_cond && !prebuilt->innodb_api) { /* The record was not yet converted to MySQL format. 
*/ if (!row_sel_store_mysql_rec( buf, prebuilt, result_rec, @@ -4935,11 +4909,16 @@ idx_cond_failed: || !dict_index_is_clust(index) || direction != 0 || prebuilt->select_lock_type != LOCK_NONE - || prebuilt->used_in_HANDLER) { + || prebuilt->used_in_HANDLER + || prebuilt->innodb_api) { /* Inside an update always store the cursor position */ btr_pcur_store_position(pcur, &mtr); + + if (prebuilt->innodb_api) { + prebuilt->innodb_api_rec = result_rec; + } } goto normal_return; @@ -5032,7 +5011,7 @@ lock_table_wait: mtr_commit(&mtr); mtr_has_extra_clust_latch = FALSE; - trx->error_state = static_cast<enum db_err>(err); + trx->error_state = err; /* The following is a patch for MySQL */ @@ -5101,8 +5080,23 @@ normal_return: mtr_commit(&mtr); - if (prebuilt->n_fetch_cached > 0) { - row_sel_dequeue_cached_row_for_mysql(buf, prebuilt); + if (prebuilt->idx_cond != 0) { + + /* When ICP is active we don't write to the MySQL buffer + directly, only to buffers that are enqueued in the pre-fetch + queue. We need to dequeue the first buffer and copy the contents + to the record buffer that was passed in by MySQL. */ + + if (prebuilt->n_fetch_cached > 0) { + row_sel_dequeue_cached_row_for_mysql(buf, prebuilt); + err = DB_SUCCESS; + } + + } else if (next_buf != 0) { + + /* We may or may not have enqueued some buffers to the + pre-fetch queue, but we definitely wrote to the record + buffer passed to use by MySQL. 
*/ err = DB_SUCCESS; } @@ -5112,9 +5106,6 @@ normal_return: dict_index_name_print(stderr, index); fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */ #endif /* UNIV_SEARCH_DEBUG */ - if (err == DB_SUCCESS) { - srv_n_rows_read++; - } func_exit: trx->op_info = ""; @@ -5139,6 +5130,9 @@ func_exit: #ifdef UNIV_SYNC_DEBUG ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); #endif /* UNIV_SYNC_DEBUG */ + + DEBUG_SYNC_C("innodb_row_search_for_mysql_exit"); + return(err); } @@ -5157,7 +5151,22 @@ row_search_check_if_query_cache_permitted( dict_table_t* table; ibool ret = FALSE; - table = dict_table_open_on_name(norm_name, FALSE); + /* Disable query cache altogether for all tables if recovered XA + transactions in prepared state exist. This is because we do not + restore the table locks for those transactions and we may wrongly + set ret=TRUE above if "lock_table_get_n_locks(table) == 0". See + "Bug#14658648 XA ROLLBACK (DISTRIBUTED DATABASE) NOT WORKING WITH + QUERY CACHE ENABLED". + Read trx_sys->n_prepared_recovered_trx without mutex protection, + not possible to end up with a torn read since n_prepared_recovered_trx + is word size. */ + if (trx_sys->n_prepared_recovered_trx > 0) { + + return(FALSE); + } + + table = dict_table_open_on_name(norm_name, FALSE, FALSE, + DICT_ERR_IGNORE_NONE); if (table == NULL) { @@ -5191,7 +5200,7 @@ row_search_check_if_query_cache_permitted( } } - dict_table_close(table, FALSE); + dict_table_close(table, FALSE, FALSE); return(ret); } @@ -5229,8 +5238,6 @@ row_search_autoinc_read_column( data = rec_get_nth_field(rec, offsets, col_no, &len); - ut_a(len != UNIV_SQL_NULL); - switch (mtype) { case DATA_INT: ut_a(len <= sizeof value); @@ -5289,7 +5296,7 @@ Read the max AUTOINC value from an index. 
@return DB_SUCCESS if all OK else error code, DB_RECORD_NOT_FOUND if column name can't be found in index */ UNIV_INTERN -ulint +dberr_t row_search_max_autoinc( /*===================*/ dict_index_t* index, /*!< in: index to search */ @@ -5299,7 +5306,7 @@ row_search_max_autoinc( ulint i; ulint n_cols; dict_field_t* dfield = NULL; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; n_cols = dict_index_get_n_ordering_defined_by_user(index); @@ -5321,10 +5328,9 @@ row_search_max_autoinc( mtr_start(&mtr); - /* Open at the high/right end (FALSE), and INIT - cursor (TRUE) */ + /* Open at the high/right end (false), and init cursor */ btr_pcur_open_at_index_side( - FALSE, index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr); + false, index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr); if (page_get_n_recs(btr_pcur_get_page(&pcur)) > 0) { const rec_t* rec; diff --git a/storage/innobase/row/row0uins.cc b/storage/innobase/row/row0uins.cc index 78fd4ad5199..25b2b6b62ce 100644 --- a/storage/innobase/row/row0uins.cc +++ b/storage/innobase/row/row0uins.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1997, 2010, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -38,6 +38,7 @@ Created 2/25/1997 Heikki Tuuri #include "mach0data.h" #include "row0undo.h" #include "row0vers.h" +#include "row0log.h" #include "trx0trx.h" #include "trx0rec.h" #include "row0row.h" @@ -60,25 +61,64 @@ introduced where a call to log_free_check() is bypassed. */ Removes a clustered index record. The pcur in node was positioned on the record, now it is detached. 
@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_ins_remove_clust_rec( /*==========================*/ undo_node_t* node) /*!< in: undo node */ { btr_cur_t* btr_cur; ibool success; - ulint err; - ulint n_tries = 0; + dberr_t err; + ulint n_tries = 0; mtr_t mtr; + dict_index_t* index = node->pcur.btr_cur.index; + bool online; + + ut_ad(dict_index_is_clust(index)); mtr_start(&mtr); - success = btr_pcur_restore_position(BTR_MODIFY_LEAF, &(node->pcur), - &mtr); + /* This is similar to row_undo_mod_clust(). Even though we + call row_log_table_rollback() elsewhere, the DDL thread may + already have copied this row to the sort buffers or to the new + table. We must log the removal, so that the row will be + correctly purged. However, we can log the removal out of sync + with the B-tree modification. */ + + online = dict_index_is_online_ddl(index); + if (online) { + ut_ad(node->trx->dict_operation_lock_mode + != RW_X_LATCH); + ut_ad(node->table->id != DICT_INDEXES_ID); + mtr_s_lock(dict_index_get_lock(index), &mtr); + } + + success = btr_pcur_restore_position( + online + ? 
BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED + : BTR_MODIFY_LEAF, &node->pcur, &mtr); ut_a(success); + btr_cur = btr_pcur_get_btr_cur(&node->pcur); + + ut_ad(rec_get_trx_id(btr_cur_get_rec(btr_cur), btr_cur->index) + == node->trx->id); + + if (online && dict_index_is_online_ddl(index)) { + const rec_t* rec = btr_cur_get_rec(btr_cur); + mem_heap_t* heap = NULL; + const ulint* offsets = rec_get_offsets( + rec, index, NULL, ULINT_UNDEFINED, &heap); + row_log_table_delete( + rec, index, offsets, + trx_read_trx_id(row_get_trx_id_offset(index, offsets) + + rec)); + mem_heap_free(heap); + } + if (node->table->id == DICT_INDEXES_ID) { + ut_ad(!online); ut_ad(node->trx->dict_operation_lock_mode == RW_X_LATCH); /* Drop the index tree associated with the row in @@ -90,14 +130,12 @@ row_undo_ins_remove_clust_rec( mtr_start(&mtr); - success = btr_pcur_restore_position(BTR_MODIFY_LEAF, - &(node->pcur), &mtr); + success = btr_pcur_restore_position( + BTR_MODIFY_LEAF, &node->pcur, &mtr); ut_a(success); } - btr_cur = btr_pcur_get_btr_cur(&(node->pcur)); - - if (btr_cur_optimistic_delete(btr_cur, &mtr)) { + if (btr_cur_optimistic_delete(btr_cur, 0, &mtr)) { err = DB_SUCCESS; goto func_exit; } @@ -111,7 +149,7 @@ retry: &(node->pcur), &mtr); ut_a(success); - btr_cur_pessimistic_delete(&err, FALSE, btr_cur, + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, trx_is_recv(node->trx) ? RB_RECOVERY : RB_NORMAL, &mtr); @@ -142,8 +180,8 @@ func_exit: /***************************************************************//** Removes a secondary index entry if found. 
@return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_ins_remove_sec_low( /*========================*/ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, @@ -154,22 +192,31 @@ row_undo_ins_remove_sec_low( { btr_pcur_t pcur; btr_cur_t* btr_cur; - ulint err; + dberr_t err = DB_SUCCESS; mtr_t mtr; enum row_search_result search_result; + log_free_check(); + mtr_start(&mtr); - btr_cur = btr_pcur_get_btr_cur(&pcur); + if (mode == BTR_MODIFY_LEAF) { + mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; + mtr_s_lock(dict_index_get_lock(index), &mtr); + } else { + ut_ad(mode == BTR_MODIFY_TREE); + mtr_x_lock(dict_index_get_lock(index), &mtr); + } - ut_ad(mode == BTR_MODIFY_TREE || mode == BTR_MODIFY_LEAF); + if (row_log_online_op_try(index, entry, 0)) { + goto func_exit_no_pcur; + } search_result = row_search_index_entry(index, entry, mode, &pcur, &mtr); switch (search_result) { case ROW_NOT_FOUND: - err = DB_SUCCESS; goto func_exit; case ROW_FOUND: break; @@ -181,23 +228,24 @@ row_undo_ins_remove_sec_low( ut_error; } - if (mode == BTR_MODIFY_LEAF) { - err = btr_cur_optimistic_delete(btr_cur, &mtr) + btr_cur = btr_pcur_get_btr_cur(&pcur); + + if (mode != BTR_MODIFY_TREE) { + err = btr_cur_optimistic_delete(btr_cur, 0, &mtr) ? DB_SUCCESS : DB_FAIL; } else { - ut_ad(mode == BTR_MODIFY_TREE); - /* No need to distinguish RB_RECOVERY here, because we are deleting a secondary index record: the distinction between RB_NORMAL and RB_RECOVERY only matters when deleting a record that contains externally stored columns. */ ut_ad(!dict_index_is_clust(index)); - btr_cur_pessimistic_delete(&err, FALSE, btr_cur, + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, RB_NORMAL, &mtr); } func_exit: btr_pcur_close(&pcur); +func_exit_no_pcur: mtr_commit(&mtr); return(err); @@ -207,14 +255,14 @@ func_exit: Removes a secondary index entry from the index if found. 
Tries first optimistic, then pessimistic descent down the tree. @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_ins_remove_sec( /*====================*/ dict_index_t* index, /*!< in: index */ dtuple_t* entry) /*!< in: index entry to insert */ { - ulint err; + dberr_t err; ulint n_tries = 0; /* Try first optimistic descent to the B-tree */ @@ -261,7 +309,7 @@ row_undo_ins_parse_undo_rec( table_id_t table_id; ulint type; ulint dummy; - ibool dummy_extern; + bool dummy_extern; ut_ad(node); @@ -271,12 +319,13 @@ row_undo_ins_parse_undo_rec( node->rec_type = type; node->update = NULL; - node->table = dict_table_open_on_id(table_id, dict_locked); + node->table = dict_table_open_on_id(table_id, dict_locked, FALSE); /* Skip the UNDO if we can't find the table or the .ibd file. */ if (UNIV_UNLIKELY(node->table == NULL)) { } else if (UNIV_UNLIKELY(node->table->ibd_file_missing)) { - dict_table_close(node->table, dict_locked); +close_table: + dict_table_close(node->table, dict_locked, FALSE); node->table = NULL; } else { clust_index = dict_table_get_first_index(node->table); @@ -286,10 +335,7 @@ row_undo_ins_parse_undo_rec( ptr, clust_index, &node->ref, node->heap); if (!row_undo_search_clust_to_pcur(node)) { - - dict_table_close(node->table, dict_locked); - - node->table = NULL; + goto close_table; } } else { @@ -299,10 +345,7 @@ row_undo_ins_parse_undo_rec( node->table->name); fprintf(stderr, " has no indexes, " "ignoring the table\n"); - - dict_table_close(node->table, dict_locked); - - node->table = NULL; + goto close_table; } } } @@ -310,27 +353,32 @@ row_undo_ins_parse_undo_rec( /***************************************************************//** Removes secondary index records. 
@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_ins_remove_sec_rec( /*========================*/ undo_node_t* node) /*!< in/out: row undo node */ { - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; + dict_index_t* index = node->index; mem_heap_t* heap; heap = mem_heap_create(1024); - while (node->index != NULL) { + while (index != NULL) { dtuple_t* entry; - if (node->index->type & DICT_FTS) { - dict_table_next_uncorrupted_index(node->index); + if (index->type & DICT_FTS) { + dict_table_next_uncorrupted_index(index); continue; } - entry = row_build_index_entry(node->row, node->ext, - node->index, heap); + /* An insert undo record TRX_UNDO_INSERT_REC will + always contain all fields of the index. It does not + matter if any indexes were created afterwards; all + index entries can be reconstructed from the row. */ + entry = row_build_index_entry( + node->row, node->ext, index, heap); if (UNIV_UNLIKELY(!entry)) { /* The database must have crashed after inserting a clustered index record but before @@ -343,9 +391,7 @@ row_undo_ins_remove_sec_rec( transactions. */ ut_a(trx_is_recv(node->trx)); } else { - log_free_check(); - - err = row_undo_ins_remove_sec(node->index, entry); + err = row_undo_ins_remove_sec(index, entry); if (UNIV_UNLIKELY(err != DB_SUCCESS)) { goto func_exit; @@ -353,10 +399,11 @@ row_undo_ins_remove_sec_rec( } mem_heap_empty(heap); - dict_table_next_uncorrupted_index(node->index); + dict_table_next_uncorrupted_index(index); } func_exit: + node->index = index; mem_heap_free(heap); return(err); } @@ -369,15 +416,14 @@ if it figures out that an index record will be removed in the purge anyway, it will remove it in the rollback. 
@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ UNIV_INTERN -ulint +dberr_t row_undo_ins( /*=========*/ undo_node_t* node) /*!< in: row undo node */ { - ulint err; - ibool dict_locked; + dberr_t err; + ibool dict_locked; - ut_ad(node); ut_ad(node->state == UNDO_NODE_INSERT); dict_locked = node->trx->dict_operation_lock_mode == RW_X_LATCH; @@ -392,24 +438,46 @@ row_undo_ins( /* Iterate over all the indexes and undo the insert.*/ + node->index = dict_table_get_first_index(node->table); + ut_ad(dict_index_is_clust(node->index)); + + if (dict_index_is_online_ddl(node->index)) { + /* Note that we are rolling back this transaction, so + that all inserts and updates with this DB_TRX_ID can + be skipped. */ + row_log_table_rollback(node->index, node->trx->id); + } + /* Skip the clustered index (the first index) */ - node->index = dict_table_get_next_index( - dict_table_get_first_index(node->table)); + node->index = dict_table_get_next_index(node->index); dict_table_skip_corrupt_index(node->index); err = row_undo_ins_remove_sec_rec(node); - if (UNIV_UNLIKELY(err != DB_SUCCESS)) { - goto func_exit; - } + if (err == DB_SUCCESS) { - log_free_check(); + log_free_check(); - err = row_undo_ins_remove_clust_rec(node); + if (node->table->id == DICT_INDEXES_ID) { -func_exit: - dict_table_close(node->table, dict_locked); + if (!dict_locked) { + mutex_enter(&dict_sys->mutex); + } + } + + // FIXME: We need to update the dict_index_t::space and + // page number fields too. 
+ err = row_undo_ins_remove_clust_rec(node); + + if (node->table->id == DICT_INDEXES_ID + && !dict_locked) { + + mutex_exit(&dict_sys->mutex); + } + } + + dict_table_close(node->table, dict_locked, FALSE); node->table = NULL; diff --git a/storage/innobase/row/row0umod.cc b/storage/innobase/row/row0umod.cc index 4869909f5a6..c1a4ba76052 100644 --- a/storage/innobase/row/row0umod.cc +++ b/storage/innobase/row/row0umod.cc @@ -37,6 +37,7 @@ Created 2/27/1997 Heikki Tuuri #include "mach0data.h" #include "row0undo.h" #include "row0vers.h" +#include "row0log.h" #include "trx0trx.h" #include "trx0rec.h" #include "row0row.h" @@ -71,11 +72,20 @@ introduced where a call to log_free_check() is bypassed. */ /***********************************************************//** Undoes a modify in a clustered index record. @return DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_mod_clust_low( /*===================*/ undo_node_t* node, /*!< in: row undo node */ + ulint** offsets,/*!< out: rec_get_offsets() on the record */ + mem_heap_t** offsets_heap, + /*!< in/out: memory heap that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + const dtuple_t**rebuilt_old_pk, + /*!< out: row_log_table_get_pk() + before the update, or NULL if + the table is not being rebuilt online or + the PRIMARY KEY definition does not change */ que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr, /*!< in: mtr; must be committed before latching any further pages */ @@ -83,12 +93,12 @@ row_undo_mod_clust_low( { btr_pcur_t* pcur; btr_cur_t* btr_cur; - ulint err; + dberr_t err; #ifdef UNIV_DEBUG ibool success; #endif /* UNIV_DEBUG */ - pcur = &(node->pcur); + pcur = &node->pcur; btr_cur = btr_pcur_get_btr_cur(pcur); #ifdef UNIV_DEBUG @@ -97,31 +107,40 @@ row_undo_mod_clust_low( btr_pcur_restore_position(mode, pcur, mtr); ut_ad(success); + ut_ad(rec_get_trx_id(btr_cur_get_rec(btr_cur), + 
btr_cur_get_index(btr_cur)) + == thr_get_trx(thr)->id); + + if (mode != BTR_MODIFY_LEAF + && dict_index_is_online_ddl(btr_cur_get_index(btr_cur))) { + *rebuilt_old_pk = row_log_table_get_pk( + btr_cur_get_rec(btr_cur), + btr_cur_get_index(btr_cur), NULL, &heap); + } else { + *rebuilt_old_pk = NULL; + } - if (mode == BTR_MODIFY_LEAF) { + if (mode != BTR_MODIFY_TREE) { + ut_ad((mode & ~BTR_ALREADY_S_LATCHED) == BTR_MODIFY_LEAF); - err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG - | BTR_NO_UNDO_LOG_FLAG - | BTR_KEEP_SYS_FLAG, - btr_cur, node->update, - node->cmpl_info, thr, mtr); + err = btr_cur_optimistic_update( + BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG, + btr_cur, offsets, offsets_heap, + node->update, node->cmpl_info, + thr, thr_get_trx(thr)->id, mtr); } else { - mem_heap_t* heap = NULL; big_rec_t* dummy_big_rec; - ut_ad(mode == BTR_MODIFY_TREE); - err = btr_cur_pessimistic_update( BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG, - btr_cur, &heap, &dummy_big_rec, node->update, - node->cmpl_info, thr, mtr); + btr_cur, offsets, offsets_heap, heap, + &dummy_big_rec, node->update, + node->cmpl_info, thr, thr_get_trx(thr)->id, mtr); ut_a(!dummy_big_rec); - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } } return(err); @@ -134,8 +153,8 @@ delete-marked record and there no longer exist transactions that would see the delete-marked record. In other words, we roll back the insert by purging the record. 
@return DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_mod_remove_clust_low( /*==========================*/ undo_node_t* node, /*!< in: row undo node */ @@ -144,7 +163,7 @@ row_undo_mod_remove_clust_low( ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ { btr_cur_t* btr_cur; - ulint err; + dberr_t err; ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC); @@ -159,8 +178,14 @@ row_undo_mod_remove_clust_low( btr_cur = btr_pcur_get_btr_cur(&node->pcur); + /* We are about to remove an old, delete-marked version of the + record that may have been delete-marked by a different transaction + than the rolling-back one. */ + ut_ad(rec_get_deleted_flag(btr_cur_get_rec(btr_cur), + dict_table_is_comp(node->table))); + if (mode == BTR_MODIFY_LEAF) { - err = btr_cur_optimistic_delete(btr_cur, mtr) + err = btr_cur_optimistic_delete(btr_cur, 0, mtr) ? DB_SUCCESS : DB_FAIL; } else { @@ -169,7 +194,7 @@ row_undo_mod_remove_clust_low( /* This operation is analogous to purge, we can free also inherited externally stored fields */ - btr_cur_pessimistic_delete(&err, FALSE, btr_cur, + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, thr_is_recv(thr) ? RB_RECOVERY_PURGE_REC : RB_NONE, mtr); @@ -186,8 +211,8 @@ row_undo_mod_remove_clust_low( Undoes a modify in a clustered index record. Sets also the node state for the next round of undo. 
@return DB_SUCCESS or error code: we may run out of file space */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_mod_clust( /*===============*/ undo_node_t* node, /*!< in: row undo node */ @@ -195,21 +220,42 @@ row_undo_mod_clust( { btr_pcur_t* pcur; mtr_t mtr; - ulint err; + dberr_t err; + dict_index_t* index; + bool online; - ut_ad(node && thr); + ut_ad(thr_get_trx(thr) == node->trx); + ut_ad(node->trx->dict_operation_lock_mode); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED) + || rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ log_free_check(); + pcur = &node->pcur; + index = btr_cur_get_index(btr_pcur_get_btr_cur(pcur)); + mtr_start(&mtr); - pcur = &(node->pcur); + online = dict_index_is_online_ddl(index); + if (online) { + ut_ad(node->trx->dict_operation_lock_mode != RW_X_LATCH); + mtr_s_lock(dict_index_get_lock(index), &mtr); + } - mtr_start(&mtr); + mem_heap_t* heap = mem_heap_create(1024); + mem_heap_t* offsets_heap = NULL; + ulint* offsets = NULL; + const dtuple_t* rebuilt_old_pk; /* Try optimistic processing of the record, keeping changes within the index page */ - err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_LEAF); + err = row_undo_mod_clust_low(node, &offsets, &offsets_heap, + heap, &rebuilt_old_pk, + thr, &mtr, online + ? BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED + : BTR_MODIFY_LEAF); if (err != DB_SUCCESS) { btr_pcur_commit_specify_mtr(pcur, &mtr); @@ -219,7 +265,40 @@ row_undo_mod_clust( mtr_start(&mtr); - err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_TREE); + err = row_undo_mod_clust_low( + node, &offsets, &offsets_heap, heap, &rebuilt_old_pk, + thr, &mtr, BTR_MODIFY_TREE); + ut_ad(err == DB_SUCCESS || err == DB_OUT_OF_FILE_SPACE); + } + + /* Online rebuild cannot be initiated while we are holding + dict_operation_lock and index->lock. (It can be aborted.) 
*/ + ut_ad(online || !dict_index_is_online_ddl(index)); + + if (err == DB_SUCCESS && online) { +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED) + || rw_lock_own(&index->lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + switch (node->rec_type) { + case TRX_UNDO_DEL_MARK_REC: + row_log_table_insert( + btr_pcur_get_rec(pcur), index, offsets); + break; + case TRX_UNDO_UPD_EXIST_REC: + row_log_table_update( + btr_pcur_get_rec(pcur), index, offsets, + rebuilt_old_pk); + break; + case TRX_UNDO_UPD_DEL_REC: + row_log_table_delete( + btr_pcur_get_rec(pcur), index, offsets, + node->trx->id); + break; + default: + ut_ad(0); + break; + } } btr_pcur_commit_specify_mtr(pcur, &mtr); @@ -228,8 +307,11 @@ row_undo_mod_clust( mtr_start(&mtr); - err = row_undo_mod_remove_clust_low(node, thr, &mtr, - BTR_MODIFY_LEAF); + /* It is not necessary to call row_log_table, + because the record is delete-marked and would thus + be omitted from the rebuilt copy of the table. */ + err = row_undo_mod_remove_clust_low( + node, thr, &mtr, BTR_MODIFY_LEAF); if (err != DB_SUCCESS) { btr_pcur_commit_specify_mtr(pcur, &mtr); @@ -240,6 +322,9 @@ row_undo_mod_clust( err = row_undo_mod_remove_clust_low(node, thr, &mtr, BTR_MODIFY_TREE); + + ut_ad(err == DB_SUCCESS + || err == DB_OUT_OF_FILE_SPACE); } btr_pcur_commit_specify_mtr(pcur, &mtr); @@ -249,14 +334,18 @@ row_undo_mod_clust( trx_undo_rec_release(node->trx, node->undo_no); + if (offsets_heap) { + mem_heap_free(offsets_heap); + } + mem_heap_free(heap); return(err); } /***********************************************************//** Delete marks or removes a secondary index entry if found. 
@return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_mod_del_mark_or_remove_sec_low( /*====================================*/ undo_node_t* node, /*!< in: row undo node */ @@ -270,7 +359,7 @@ row_undo_mod_del_mark_or_remove_sec_low( btr_cur_t* btr_cur; ibool success; ibool old_has; - ulint err; + dberr_t err = DB_SUCCESS; mtr_t mtr; mtr_t mtr_vers; enum row_search_result search_result; @@ -278,9 +367,30 @@ row_undo_mod_del_mark_or_remove_sec_low( log_free_check(); mtr_start(&mtr); - btr_cur = btr_pcur_get_btr_cur(&pcur); + if (*index->name == TEMP_INDEX_PREFIX) { + /* The index->online_status may change if the + index->name starts with TEMP_INDEX_PREFIX (meaning + that the index is or was being created online). It is + protected by index->lock. */ + if (mode == BTR_MODIFY_LEAF) { + mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; + mtr_s_lock(dict_index_get_lock(index), &mtr); + } else { + ut_ad(mode == BTR_MODIFY_TREE); + mtr_x_lock(dict_index_get_lock(index), &mtr); + } + + if (row_log_online_op_try(index, entry, 0)) { + goto func_exit_no_pcur; + } + } else { + /* For secondary indexes, + index->online_status==ONLINE_INDEX_CREATION unless + index->name starts with TEMP_INDEX_PREFIX. */ + ut_ad(!dict_index_is_online_ddl(index)); + } - ut_ad(mode == BTR_MODIFY_TREE || mode == BTR_MODIFY_LEAF); + btr_cur = btr_pcur_get_btr_cur(&pcur); search_result = row_search_index_entry(index, entry, mode, &pcur, &mtr); @@ -296,8 +406,6 @@ row_undo_mod_del_mark_or_remove_sec_low( In normal processing, if an update ends in a deadlock before it has inserted all updated secondary index records, then the undo will not find those records. 
*/ - - err = DB_SUCCESS; goto func_exit; case ROW_FOUND: break; @@ -329,16 +437,14 @@ row_undo_mod_del_mark_or_remove_sec_low( } else { /* Remove the index record */ - if (mode == BTR_MODIFY_LEAF) { - success = btr_cur_optimistic_delete(btr_cur, &mtr); + if (mode != BTR_MODIFY_TREE) { + success = btr_cur_optimistic_delete(btr_cur, 0, &mtr); if (success) { err = DB_SUCCESS; } else { err = DB_FAIL; } } else { - ut_ad(mode == BTR_MODIFY_TREE); - /* No need to distinguish RB_RECOVERY_PURGE here, because we are deleting a secondary index record: the distinction between RB_NORMAL and @@ -346,7 +452,7 @@ row_undo_mod_del_mark_or_remove_sec_low( record that contains externally stored columns. */ ut_ad(!dict_index_is_clust(index)); - btr_cur_pessimistic_delete(&err, FALSE, btr_cur, + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, RB_NORMAL, &mtr); /* The delete operation may fail if we have little @@ -359,6 +465,7 @@ row_undo_mod_del_mark_or_remove_sec_low( func_exit: btr_pcur_close(&pcur); +func_exit_no_pcur: mtr_commit(&mtr); return(err); @@ -373,8 +480,8 @@ not cause problems because in row0sel.cc, in queries we always retrieve the clustered index record or an earlier version of it, if the secondary index record through which we do the search is delete-marked. @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_mod_del_mark_or_remove_sec( /*================================*/ undo_node_t* node, /*!< in: row undo node */ @@ -382,7 +489,7 @@ row_undo_mod_del_mark_or_remove_sec( dict_index_t* index, /*!< in: index */ dtuple_t* entry) /*!< in: index entry */ { - ulint err; + dberr_t err; err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index, entry, BTR_MODIFY_LEAF); @@ -401,42 +508,67 @@ Delete unmarks a secondary index entry which must be found. It might not be delete-marked at the moment, but it does not harm to unmark it anyway. 
We also need to update the fields of the secondary index record if we updated its fields but alphabetically they stayed the same, e.g., 'abc' -> 'aBc'. -@return DB_FAIL or DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ -static -ulint +@retval DB_SUCCESS on success +@retval DB_FAIL if BTR_MODIFY_TREE should be tried +@retval DB_OUT_OF_FILE_SPACE when running out of tablespace +@retval DB_DUPLICATE_KEY if the value was missing + and an insert would lead to a duplicate exists */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_mod_del_unmark_sec_and_undo_update( /*========================================*/ ulint mode, /*!< in: search mode: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ que_thr_t* thr, /*!< in: query thread */ dict_index_t* index, /*!< in: index */ - const dtuple_t* entry) /*!< in: index entry */ + dtuple_t* entry) /*!< in: index entry */ { - mem_heap_t* heap; btr_pcur_t pcur; - btr_cur_t* btr_cur; + btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur); upd_t* update; - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; big_rec_t* dummy_big_rec; mtr_t mtr; trx_t* trx = thr_get_trx(thr); + const ulint flags + = BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG; enum row_search_result search_result; - /* Ignore indexes that are being created. */ - if (UNIV_UNLIKELY(*index->name == TEMP_INDEX_PREFIX)) { - - return(DB_SUCCESS); - } + ut_ad(trx->id); log_free_check(); mtr_start(&mtr); - ut_ad(mode == BTR_MODIFY_TREE || mode == BTR_MODIFY_LEAF); + if (*index->name == TEMP_INDEX_PREFIX) { + /* The index->online_status may change if the + index->name starts with TEMP_INDEX_PREFIX (meaning + that the index is or was being created online). It is + protected by index->lock. 
*/ + if (mode == BTR_MODIFY_LEAF) { + mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; + mtr_s_lock(dict_index_get_lock(index), &mtr); + } else { + ut_ad(mode == BTR_MODIFY_TREE); + mtr_x_lock(dict_index_get_lock(index), &mtr); + } + + if (row_log_online_op_try(index, entry, trx->id)) { + goto func_exit_no_pcur; + } + } else { + /* For secondary indexes, + index->online_status==ONLINE_INDEX_CREATION unless + index->name starts with TEMP_INDEX_PREFIX. */ + ut_ad(!dict_index_is_online_ddl(index)); + } search_result = row_search_index_entry(index, entry, mode, &pcur, &mtr); switch (search_result) { + mem_heap_t* heap; + mem_heap_t* offsets_heap; + ulint* offsets; case ROW_BUFFERED: case ROW_NOT_DELETED_REF: /* These are invalid outcomes, because the mode passed @@ -444,80 +576,183 @@ row_undo_mod_del_unmark_sec_and_undo_update( flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */ ut_error; case ROW_NOT_FOUND: - fputs("InnoDB: error in sec index entry del undo in\n" - "InnoDB: ", stderr); - dict_index_name_print(stderr, trx, index); - fputs("\n" - "InnoDB: tuple ", stderr); - dtuple_print(stderr, entry); - fputs("\n" - "InnoDB: record ", stderr); - rec_print(stderr, btr_pcur_get_rec(&pcur), index); - putc('\n', stderr); - trx_print(stderr, trx, 0); - fputs("\n" - "InnoDB: Submit a detailed bug report" - " to http://bugs.mysql.com\n", stderr); - ut_ad(0); + if (*index->name != TEMP_INDEX_PREFIX) { + /* During online secondary index creation, it + is possible that MySQL is waiting for a + meta-data lock upgrade before invoking + ha_innobase::commit_inplace_alter_table() + while this ROLLBACK is executing. InnoDB has + finished building the index, but it does not + yet exist in MySQL. In this case, we suppress + the printout to the error log. 
*/ + fputs("InnoDB: error in sec index entry del undo in\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, index); + fputs("\n" + "InnoDB: tuple ", stderr); + dtuple_print(stderr, entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print(stderr, btr_pcur_get_rec(&pcur), index); + putc('\n', stderr); + trx_print(stderr, trx, 0); + fputs("\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", stderr); + + ib_logf(IB_LOG_LEVEL_WARN, + "record in index %s was not found" + " on rollback, trying to insert", + index->name); + } + + if (btr_cur->up_match >= dict_index_get_n_unique(index) + || btr_cur->low_match >= dict_index_get_n_unique(index)) { + if (*index->name != TEMP_INDEX_PREFIX) { + ib_logf(IB_LOG_LEVEL_WARN, + "record in index %s was not found on" + " rollback, and a duplicate exists", + index->name); + } + err = DB_DUPLICATE_KEY; + break; + } + + /* Insert the missing record that we were trying to + delete-unmark. */ + big_rec_t* big_rec; + rec_t* insert_rec; + offsets = NULL; + offsets_heap = NULL; + + err = btr_cur_optimistic_insert( + flags, btr_cur, &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + 0, thr, &mtr); + ut_ad(!big_rec); + + if (err == DB_FAIL && mode == BTR_MODIFY_TREE) { + err = btr_cur_pessimistic_insert( + flags, btr_cur, + &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + 0, thr, &mtr); + /* There are no off-page columns in + secondary indexes. 
*/ + ut_ad(!big_rec); + } + + if (err == DB_SUCCESS) { + page_update_max_trx_id( + btr_cur_get_block(btr_cur), + btr_cur_get_page_zip(btr_cur), + trx->id, &mtr); + } + + if (offsets_heap) { + mem_heap_free(offsets_heap); + } + break; case ROW_FOUND: - btr_cur = btr_pcur_get_btr_cur(&pcur); - err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG, - btr_cur, FALSE, thr, &mtr); + err = btr_cur_del_mark_set_sec_rec( + BTR_NO_LOCKING_FLAG, + btr_cur, FALSE, thr, &mtr); ut_a(err == DB_SUCCESS); - heap = mem_heap_create(100); - + heap = mem_heap_create( + sizeof(upd_t) + + dtuple_get_n_fields(entry) * sizeof(upd_field_t)); + offsets_heap = NULL; + offsets = rec_get_offsets( + btr_cur_get_rec(btr_cur), + index, NULL, ULINT_UNDEFINED, &offsets_heap); update = row_upd_build_sec_rec_difference_binary( - index, entry, btr_cur_get_rec(btr_cur), trx, heap); + btr_cur_get_rec(btr_cur), index, offsets, entry, heap); if (upd_get_n_fields(update) == 0) { /* Do nothing */ - } else if (mode == BTR_MODIFY_LEAF) { + } else if (mode != BTR_MODIFY_TREE) { /* Try an optimistic updating of the record, keeping changes within the page */ + /* TODO: pass offsets, not &offsets */ err = btr_cur_optimistic_update( - BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG, - btr_cur, update, 0, thr, &mtr); + flags, btr_cur, &offsets, &offsets_heap, + update, 0, thr, thr_get_trx(thr)->id, &mtr); switch (err) { case DB_OVERFLOW: case DB_UNDERFLOW: case DB_ZIP_OVERFLOW: err = DB_FAIL; + default: + break; } } else { - ut_a(mode == BTR_MODIFY_TREE); err = btr_cur_pessimistic_update( - BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG, - btr_cur, &heap, &dummy_big_rec, - update, 0, thr, &mtr); + flags, btr_cur, &offsets, &offsets_heap, + heap, &dummy_big_rec, + update, 0, thr, thr_get_trx(thr)->id, &mtr); ut_a(!dummy_big_rec); } mem_heap_free(heap); + mem_heap_free(offsets_heap); } btr_pcur_close(&pcur); +func_exit_no_pcur: mtr_commit(&mtr); return(err); } /***********************************************************//** +Flags 
a secondary index corrupted. */ +static __attribute__((nonnull)) +void +row_undo_mod_sec_flag_corrupted( +/*============================*/ + trx_t* trx, /*!< in/out: transaction */ + dict_index_t* index) /*!< in: secondary index */ +{ + ut_ad(!dict_index_is_clust(index)); + + switch (trx->dict_operation_lock_mode) { + case RW_S_LATCH: + /* Because row_undo() is holding an S-latch + on the data dictionary during normal rollback, + we can only mark the index corrupted in the + data dictionary cache. TODO: fix this somehow.*/ + mutex_enter(&dict_sys->mutex); + dict_set_corrupted_index_cache_only(index, index->table); + mutex_exit(&dict_sys->mutex); + break; + default: + ut_ad(0); + /* fall through */ + case RW_X_LATCH: + /* This should be the rollback of a data dictionary + transaction. */ + dict_set_corrupted(index, trx, "rollback"); + } +} + +/***********************************************************//** Undoes a modify in secondary indexes when undo record type is UPD_DEL. @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_mod_upd_del_sec( /*=====================*/ undo_node_t* node, /*!< in: row undo node */ que_thr_t* thr) /*!< in: query thread */ { mem_heap_t* heap; - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC); + ut_ad(!node->undo_row); heap = mem_heap_create(1024); @@ -530,6 +765,13 @@ row_undo_mod_upd_del_sec( continue; } + /* During online index creation, + HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE should + guarantee that any active transaction has not modified + indexed columns such that col->ord_part was 0 at the + time when the undo log record was written. When we get + to roll back an undo log entry TRX_UNDO_DEL_MARK_REC, + it should always cover all affected indexes. 
*/ entry = row_build_index_entry( node->row, node->ext, index, heap); @@ -566,15 +808,17 @@ row_undo_mod_upd_del_sec( /***********************************************************//** Undoes a modify in secondary indexes when undo record type is DEL_MARK. @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_mod_del_mark_sec( /*======================*/ undo_node_t* node, /*!< in: row undo node */ que_thr_t* thr) /*!< in: query thread */ { mem_heap_t* heap; - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; + + ut_ad(!node->undo_row); heap = mem_heap_create(1024); @@ -587,6 +831,13 @@ row_undo_mod_del_mark_sec( continue; } + /* During online index creation, + HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE should + guarantee that any active transaction has not modified + indexed columns such that col->ord_part was 0 at the + time when the undo log record was written. When we get + to roll back an undo log entry TRX_UNDO_DEL_MARK_REC, + it should always cover all affected indexes. */ entry = row_build_index_entry( node->row, node->ext, index, heap); @@ -599,8 +850,17 @@ row_undo_mod_del_mark_sec( BTR_MODIFY_TREE, thr, index, entry); } - if (UNIV_UNLIKELY(err != DB_SUCCESS)) { - + if (err == DB_DUPLICATE_KEY) { + row_undo_mod_sec_flag_corrupted( + thr_get_trx(thr), index); + err = DB_SUCCESS; + /* Do not return any error to the caller. The + duplicate will be reported by ALTER TABLE or + CREATE UNIQUE INDEX. Unfortunately we cannot + report the duplicate key value to the DDL + thread, because the altered_table object is + private to its call stack. */ + } else if (err != DB_SUCCESS) { break; } @@ -616,18 +876,18 @@ row_undo_mod_del_mark_sec( /***********************************************************//** Undoes a modify in secondary indexes when undo record type is UPD_EXIST. 
@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_mod_upd_exist_sec( /*=======================*/ undo_node_t* node, /*!< in: row undo node */ que_thr_t* thr) /*!< in: query thread */ { mem_heap_t* heap; - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; if (node->index == NULL - || (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + || ((node->cmpl_info & UPD_NODE_NO_ORD_CHANGE))) { /* No change in secondary indexes */ return(err); @@ -713,7 +973,11 @@ row_undo_mod_upd_exist_sec( BTR_MODIFY_TREE, thr, index, entry); } - if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + if (err == DB_DUPLICATE_KEY) { + row_undo_mod_sec_flag_corrupted( + thr_get_trx(thr), index); + err = DB_SUCCESS; + } else if (err != DB_SUCCESS) { break; } @@ -728,12 +992,11 @@ row_undo_mod_upd_exist_sec( /***********************************************************//** Parses the row reference and other info in a modify undo log record. */ -static +static __attribute__((nonnull)) void row_undo_mod_parse_undo_rec( /*========================*/ undo_node_t* node, /*!< in: row undo node */ - que_thr_t* thr, /*!< in: query thread */ ibool dict_locked) /*!< in: TRUE if own dict_sys->mutex */ { dict_index_t* clust_index; @@ -745,16 +1008,13 @@ row_undo_mod_parse_undo_rec( ulint info_bits; ulint type; ulint cmpl_info; - ibool dummy_extern; - trx_t* trx; + bool dummy_extern; - ut_ad(node && thr); - trx = thr_get_trx(thr); ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info, &dummy_extern, &undo_no, &table_id); node->rec_type = type; - node->table = dict_table_open_on_id(table_id, dict_locked); + node->table = dict_table_open_on_id(table_id, dict_locked, FALSE); /* TODO: other fixes associated with DROP TABLE + rollback in the same table by another user */ @@ -765,7 +1025,7 @@ row_undo_mod_parse_undo_rec( } if (node->table->ibd_file_missing) { - dict_table_close(node->table, dict_locked); + dict_table_close(node->table, 
dict_locked, FALSE); /* We skip undo operations to missing .ibd files */ node->table = NULL; @@ -782,14 +1042,14 @@ row_undo_mod_parse_undo_rec( node->heap); trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id, - roll_ptr, info_bits, trx, + roll_ptr, info_bits, node->trx, node->heap, &(node->update)); node->new_trx_id = trx_id; node->cmpl_info = cmpl_info; if (!row_undo_search_clust_to_pcur(node)) { - dict_table_close(node->table, dict_locked); + dict_table_close(node->table, dict_locked, FALSE); node->table = NULL; } @@ -799,21 +1059,23 @@ row_undo_mod_parse_undo_rec( Undoes a modify operation on a row of a table. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t row_undo_mod( /*=========*/ undo_node_t* node, /*!< in: row undo node */ que_thr_t* thr) /*!< in: query thread */ { - ulint err; - ibool dict_locked; + dberr_t err; + ibool dict_locked; ut_ad(node && thr); ut_ad(node->state == UNDO_NODE_MODIFY); dict_locked = thr_get_trx(thr)->dict_operation_lock_mode == RW_X_LATCH; - row_undo_mod_parse_undo_rec(node, thr, dict_locked); + ut_ad(thr_get_trx(thr) == node->trx); + + row_undo_mod_parse_undo_rec(node, dict_locked); if (node->table == NULL) { /* It is already undone, or will be undone by another query @@ -825,8 +1087,18 @@ row_undo_mod( return(DB_SUCCESS); } - node->index = dict_table_get_next_index( - dict_table_get_first_index(node->table)); + node->index = dict_table_get_first_index(node->table); + ut_ad(dict_index_is_clust(node->index)); + + if (dict_index_is_online_ddl(node->index)) { + /* Note that we are rolling back this transaction, so + that all inserts and updates with this DB_TRX_ID can + be skipped. 
*/ + row_log_table_rollback(node->index, node->trx->id); + } + + /* Skip the clustered index (the first index) */ + node->index = dict_table_get_next_index(node->index); /* Skip all corrupted secondary index */ dict_table_skip_corrupt_index(node->index); @@ -851,7 +1123,7 @@ row_undo_mod( err = row_undo_mod_clust(node, thr); } - dict_table_close(node->table, dict_locked); + dict_table_close(node->table, dict_locked, FALSE); node->table = NULL; diff --git a/storage/innobase/row/row0undo.cc b/storage/innobase/row/row0undo.cc index 757d3544ba4..9977a1e8f04 100644 --- a/storage/innobase/row/row0undo.cc +++ b/storage/innobase/row/row0undo.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -216,8 +216,9 @@ row_undo_search_clust_to_pcur( } node->row = row_build(ROW_COPY_DATA, clust_index, rec, - offsets, NULL, ext, node->heap); - if (node->update) { + offsets, NULL, + NULL, NULL, ext, node->heap); + if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) { node->undo_row = dtuple_copy(node->row, node->heap); row_upd_replace(node->undo_row, &node->undo_ext, clust_index, node->update, node->heap); @@ -244,14 +245,14 @@ Fetches an undo log record and does the undo for the recorded operation. If none left, or a partial rollback completed, returns control to the parent node, which is always a query thread node. 
@return DB_SUCCESS if operation successfully completed, else error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo( /*=====*/ undo_node_t* node, /*!< in: row undo node */ que_thr_t* thr) /*!< in: query thread */ { - ulint err; + dberr_t err; trx_t* trx; roll_ptr_t roll_ptr; ibool locked_data_dict; @@ -332,7 +333,7 @@ row_undo_step( /*==========*/ que_thr_t* thr) /*!< in: query thread */ { - ulint err; + dberr_t err; undo_node_t* node; trx_t* trx; @@ -348,17 +349,17 @@ row_undo_step( err = row_undo(node, thr); - trx->error_state = static_cast<enum db_err>(err); + trx->error_state = err; if (err != DB_SUCCESS) { /* SQL error detected */ - fprintf(stderr, "InnoDB: Fatal error %lu in rollback.\n", - (ulong) err); + fprintf(stderr, "InnoDB: Fatal error (%s) in rollback.\n", + ut_strerr(err)); if (err == DB_OUT_OF_FILE_SPACE) { fprintf(stderr, - "InnoDB: Error 13 means out of tablespace.\n" + "InnoDB: Out of tablespace.\n" "InnoDB: Consider increasing" " your tablespace.\n"); diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc index 28faa59add8..f97c0c3c82b 100644 --- a/storage/innobase/row/row0upd.cc +++ b/storage/innobase/row/row0upd.cc @@ -23,14 +23,13 @@ Update of a row Created 12/27/1996 Heikki Tuuri *******************************************************/ -#include "m_string.h" /* for my_sys.h */ -#include "my_sys.h" /* DEBUG_SYNC_C */ #include "row0upd.h" #ifdef UNIV_NONINL #include "row0upd.ic" #endif +#include "ha_prototypes.h" #include "dict0dict.h" #include "trx0undo.h" #include "rem0rec.h" @@ -43,8 +42,9 @@ Created 12/27/1996 Heikki Tuuri #include "que0que.h" #include "row0ext.h" #include "row0ins.h" -#include "row0sel.h" +#include "row0log.h" #include "row0row.h" +#include "row0sel.h" #include "rem0cmp.h" #include "lock0lock.h" #include "log0log.h" @@ -178,8 +178,8 @@ NOTE that this function will temporarily commit mtr and lose the pcur position! 
@return DB_SUCCESS or an error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_upd_check_references_constraints( /*=================================*/ upd_node_t* node, /*!< in: row update node */ @@ -197,7 +197,7 @@ row_upd_check_references_constraints( trx_t* trx; const rec_t* rec; ulint n_ext; - ulint err; + dberr_t err; ibool got_s_lock = FALSE; if (UT_LIST_GET_FIRST(table->referenced_list) == NULL) { @@ -212,11 +212,12 @@ row_upd_check_references_constraints( heap = mem_heap_create(500); - entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets, - &n_ext, heap); + entry = row_rec_to_index_entry(rec, index, offsets, &n_ext, heap); mtr_commit(mtr); + DEBUG_SYNC_C("foreign_constraint_check_for_update"); + mtr_start(mtr); if (trx->dict_operation_lock_mode == 0) { @@ -225,6 +226,7 @@ row_upd_check_references_constraints( row_mysql_freeze_data_dictionary(trx); } +run_again: foreign = UT_LIST_GET_FIRST(table->referenced_list); while (foreign) { @@ -238,18 +240,20 @@ row_upd_check_references_constraints( || row_upd_changes_first_fields_binary( entry, index, node->update, foreign->n_fields))) { + dict_table_t* foreign_table = foreign->foreign_table; dict_table_t* ref_table = NULL; - if (foreign->foreign_table == NULL) { + if (foreign_table == NULL) { ref_table = dict_table_open_on_name( - foreign->foreign_table_name_lookup, FALSE); + foreign->foreign_table_name_lookup, + FALSE, FALSE, DICT_ERR_IGNORE_NONE); } - if (foreign->foreign_table) { + if (foreign_table) { os_inc_counter(dict_sys->mutex, - foreign->foreign_table + foreign_table ->n_foreign_key_checks_running); } @@ -261,18 +265,20 @@ row_upd_check_references_constraints( err = row_ins_check_foreign_constraint( FALSE, foreign, table, entry, thr); - if (foreign->foreign_table) { + if (foreign_table) { os_dec_counter(dict_sys->mutex, - foreign->foreign_table + foreign_table ->n_foreign_key_checks_running); } if (ref_table != NULL) { - dict_table_close(ref_table, 
FALSE); + dict_table_close(ref_table, FALSE, FALSE); } - if (err != DB_SUCCESS) { - + /* Some table foreign key dropped, try again */ + if (err == DB_DICT_CHANGED) { + goto run_again; + } else if (err != DB_SUCCESS) { goto func_exit; } } @@ -289,6 +295,8 @@ func_exit: mem_heap_free(heap); + DEBUG_SYNC_C("foreign_constraint_check_for_update_done"); + return(err); } @@ -465,6 +473,47 @@ row_upd_changes_field_size_or_external( return(FALSE); } + +/***********************************************************//** +Returns true if row update contains disowned external fields. +@return true if the update contains disowned external fields. */ +UNIV_INTERN +bool +row_upd_changes_disowned_external( +/*==============================*/ + const upd_t* update) /*!< in: update vector */ +{ + const upd_field_t* upd_field; + const dfield_t* new_val; + ulint new_len; + ulint n_fields; + ulint i; + + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + const byte* field_ref; + + upd_field = upd_get_nth_field(update, i); + new_val = &(upd_field->new_val); + new_len = dfield_get_len(new_val); + + if (!dfield_is_ext(new_val)) { + continue; + } + + ut_ad(new_len >= BTR_EXTERN_FIELD_REF_SIZE); + + field_ref = static_cast<const byte*>(dfield_get_data(new_val)) + + new_len - BTR_EXTERN_FIELD_REF_SIZE; + + if (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG) { + return(true); + } + } + + return(false); +} #endif /* !UNIV_HOTBACKUP */ /***********************************************************//** @@ -560,7 +609,7 @@ byte* row_upd_write_sys_vals_to_log( /*==========================*/ dict_index_t* index, /*!< in: clustered index */ - trx_t* trx, /*!< in: transaction */ + trx_id_t trx_id, /*!< in: transaction id */ roll_ptr_t roll_ptr,/*!< in: roll ptr of the undo log record */ byte* log_ptr,/*!< pointer to a buffer of size > 20 opened in mlog */ @@ -576,7 +625,7 @@ row_upd_write_sys_vals_to_log( trx_write_roll_ptr(log_ptr, roll_ptr); log_ptr += DATA_ROLL_PTR_LEN; - 
log_ptr += mach_ull_write_compressed(log_ptr, trx->id); + log_ptr += mach_ull_write_compressed(log_ptr, trx_id); return(log_ptr); } @@ -779,10 +828,10 @@ UNIV_INTERN upd_t* row_upd_build_sec_rec_difference_binary( /*====================================*/ + const rec_t* rec, /*!< in: secondary index record */ dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ const dtuple_t* entry, /*!< in: entry to insert */ - const rec_t* rec, /*!< in: secondary index record */ - trx_t* trx, /*!< in: transaction */ mem_heap_t* heap) /*!< in: memory heap from which allocated */ { upd_field_t* upd_field; @@ -792,18 +841,16 @@ row_upd_build_sec_rec_difference_binary( upd_t* update; ulint n_diff; ulint i; - ulint offsets_[REC_OFFS_SMALL_SIZE]; - const ulint* offsets; - rec_offs_init(offsets_); /* This function is used only for a secondary index */ ut_a(!dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_n_fields(offsets) == dtuple_get_n_fields(entry)); + ut_ad(!rec_offs_any_extern(offsets)); update = upd_create(dtuple_get_n_fields(entry), heap); n_diff = 0; - offsets = rec_get_offsets(rec, index, offsets_, - ULINT_UNDEFINED, &heap); for (i = 0; i < dtuple_get_n_fields(entry); i++) { @@ -828,7 +875,7 @@ row_upd_build_sec_rec_difference_binary( dfield_copy(&(upd_field->new_val), dfield); - upd_field_set_field_no(upd_field, i, index, trx); + upd_field_set_field_no(upd_field, i, index, NULL); n_diff++; } @@ -846,12 +893,15 @@ the equal ordering fields. NOTE: we compare the fields as binary strings! 
@return own: update vector of differing fields, excluding roll ptr and trx id */ UNIV_INTERN -upd_t* +const upd_t* row_upd_build_difference_binary( /*============================*/ dict_index_t* index, /*!< in: clustered index */ const dtuple_t* entry, /*!< in: entry to insert */ const rec_t* rec, /*!< in: clustered index record */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index), or NULL */ + bool no_sys, /*!< in: skip the system columns + DB_TRX_ID and DB_ROLL_PTR */ trx_t* trx, /*!< in: transaction */ mem_heap_t* heap) /*!< in: memory heap from which allocated */ { @@ -861,11 +911,9 @@ row_upd_build_difference_binary( ulint len; upd_t* update; ulint n_diff; - ulint roll_ptr_pos; ulint trx_id_pos; ulint i; ulint offsets_[REC_OFFS_NORMAL_SIZE]; - const ulint* offsets; rec_offs_init(offsets_); /* This function is used only for a clustered index */ @@ -875,11 +923,16 @@ row_upd_build_difference_binary( n_diff = 0; - roll_ptr_pos = dict_index_get_sys_col_pos(index, DATA_ROLL_PTR); trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + ut_ad(dict_index_get_sys_col_pos(index, DATA_ROLL_PTR) + == trx_id_pos + 1); - offsets = rec_get_offsets(rec, index, offsets_, - ULINT_UNDEFINED, &heap); + if (!offsets) { + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + } else { + ut_ad(rec_offs_validate(rec, index, offsets)); + } for (i = 0; i < dtuple_get_n_fields(entry); i++) { @@ -890,9 +943,9 @@ row_upd_build_difference_binary( /* NOTE: we compare the fields as binary strings! 
(No collation) */ - if (i == trx_id_pos || i == roll_ptr_pos) { + if (no_sys && (i == trx_id_pos || i == trx_id_pos + 1)) { - goto skip_compare; + continue; } if (!dfield_is_ext(dfield) @@ -907,8 +960,6 @@ row_upd_build_difference_binary( n_diff++; } -skip_compare: - ; } update->n_fields = n_diff; @@ -1386,9 +1437,9 @@ row_upd_changes_some_index_ord_field_binary( /***********************************************************//** Checks if an FTS Doc ID column is affected by an UPDATE. -@return TRUE if the Doc ID column is changed */ +@return whether the Doc ID column is changed */ UNIV_INTERN -ulint +bool row_upd_changes_doc_id( /*===================*/ dict_table_t* table, /*!< in: table */ @@ -1431,61 +1482,6 @@ row_upd_changes_fts_column( } /***********************************************************//** -Checks if an update vector changes the table's FTS-indexed columns. -NOTE: must not be called for tables which do not have an FTS-index. -Also, the vector returned must be explicitly freed as it's allocated -using the ut_malloc() allocator. -@return vector of FTS indexes that were affected by the update */ -UNIV_INTERN -ib_vector_t* -row_upd_changes_fts_columns( -/*========================*/ - dict_table_t* table, /*!< in: table */ - upd_t* update) /*!< in: update vector for the row */ -{ - ulint i; - ulint offset; - fts_t* fts = table->fts; - ib_vector_t* updated_fts_indexes = NULL; - - for (i = 0; i < upd_get_n_fields(update); ++i) { - upd_field_t* upd_field = upd_get_nth_field(update, i); - - offset = row_upd_changes_fts_column(table, upd_field); - - if (offset != ULINT_UNDEFINED) { - - dict_index_t* index; - - /* TODO: Investigate if we can check whether the - existing set of affected indexes matches the new - affected set. If matched then we don't need to - do the extra malloc()/free(). */ - - /* This vector is created from the ut_malloc() - allocator because we only want to keep one instance - around not matter how many times this row is - updated. 
The old entry should be deleted when - we update the FTS row info with this new vector. */ - if (updated_fts_indexes == NULL) { - ib_alloc_t* ut_alloc; - - ut_alloc = ib_ut_allocator_create(); - - updated_fts_indexes = ib_vector_create( - ut_alloc, sizeof(dict_index_t*), 2); - } - - index = static_cast<dict_index_t*>( - ib_vector_getp(fts->indexes, offset)); - ib_vector_push(updated_fts_indexes, &index); - } - } - - return(updated_fts_indexes); -} - -/***********************************************************//** Checks if an update vector changes some of the first ordering fields of an index record. This is only used in foreign key checks and we can assume that index does not contain column prefixes. @@ -1633,7 +1629,7 @@ row_upd_store_row( } node->row = row_build(ROW_COPY_DATA, clust_index, rec, offsets, - NULL, ext, node->heap); + NULL, NULL, NULL, ext, node->heap); if (node->is_delete) { node->upd_row = NULL; node->upd_ext = NULL; @@ -1652,8 +1648,8 @@ row_upd_store_row( Updates a secondary index entry of a row. @return DB_SUCCESS if operation successfully completed, else error code or DB_LOCK_WAIT */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_upd_sec_index_entry( /*====================*/ upd_node_t* node, /*!< in: row update node */ @@ -1667,11 +1663,13 @@ row_upd_sec_index_entry( dict_index_t* index; btr_cur_t* btr_cur; ibool referenced; - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; trx_t* trx = thr_get_trx(thr); - ulint mode = BTR_MODIFY_LEAF; + ulint mode; enum row_search_result search_result; + ut_ad(trx->id); + index = node->index; referenced = row_upd_index_is_referenced(index, trx); @@ -1682,19 +1680,74 @@ row_upd_sec_index_entry( entry = row_build_index_entry(node->row, node->ext, index, heap); ut_a(entry); + log_free_check(); + +#ifdef UNIV_DEBUG + /* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC(). + Once it is fixed, remove the 'ifdef', 'if' and this comment. 
*/ + if (!trx->ddl) { + DEBUG_SYNC_C_IF_THD(trx->mysql_thd, + "before_row_upd_sec_index_entry"); + } +#endif /* UNIV_DEBUG */ + mtr_start(&mtr); + if (*index->name == TEMP_INDEX_PREFIX) { + /* The index->online_status may change if the + index->name starts with TEMP_INDEX_PREFIX (meaning + that the index is or was being created online). It is + protected by index->lock. */ + + mtr_s_lock(dict_index_get_lock(index), &mtr); + + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_COMPLETE: + /* This is a normal index. Do not log anything. + Perform the update on the index tree directly. */ + break; + case ONLINE_INDEX_CREATION: + /* Log a DELETE and optionally INSERT. */ + row_log_online_op(index, entry, 0); + + if (!node->is_delete) { + mem_heap_empty(heap); + entry = row_build_index_entry( + node->upd_row, node->upd_ext, + index, heap); + ut_a(entry); + row_log_online_op(index, entry, trx->id); + } + /* fall through */ + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + mtr_commit(&mtr); + goto func_exit; + } + + /* We can only buffer delete-mark operations if there + are no foreign key constraints referring to the index. */ + mode = referenced + ? BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED + : BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED + | BTR_DELETE_MARK; + } else { + /* For secondary indexes, + index->online_status==ONLINE_INDEX_CREATION unless + index->name starts with TEMP_INDEX_PREFIX. */ + ut_ad(!dict_index_is_online_ddl(index)); + + /* We can only buffer delete-mark operations if there + are no foreign key constraints referring to the index. */ + mode = referenced + ? BTR_MODIFY_LEAF + : BTR_MODIFY_LEAF | BTR_DELETE_MARK; + } + /* Set the query thread, so that ibuf_insert_low() will be able to invoke thd_get_trx(). */ btr_pcur_get_btr_cur(&pcur)->thr = thr; - /* We can only try to use the insert/delete buffer to buffer - delete-mark operations if the index we're modifying has no foreign - key constraints referring to it. 
*/ - if (!referenced) { - mode |= BTR_DELETE_MARK; - } - search_result = row_search_index_entry(index, entry, mode, &pcur, &mtr); @@ -1711,6 +1764,20 @@ row_upd_sec_index_entry( break; case ROW_NOT_FOUND: + if (*index->name == TEMP_INDEX_PREFIX) { + /* When online CREATE INDEX copied the update + that we already made to the clustered index, + and completed the secondary index creation + before we got here, the old secondary index + record would not exist. The CREATE INDEX + should be waiting for a MySQL meta-data lock + upgrade at least until this UPDATE + returns. After that point, the + TEMP_INDEX_PREFIX would be dropped from the + index name in commit_inplace_alter_table(). */ + break; + } + fputs("InnoDB: error in sec index entry update in\n" "InnoDB: ", stderr); dict_index_name_print(stderr, trx, index); @@ -1730,11 +1797,9 @@ row_upd_sec_index_entry( case ROW_FOUND: /* Delete mark the old index record; it can already be delete marked if we return after a lock wait in - row_ins_index_entry below */ - + row_ins_sec_index_entry() below */ if (!rec_get_deleted_flag( - rec, dict_table_is_comp(index->table))) { - + rec, dict_table_is_comp(index->table))) { err = btr_cur_del_mark_set_sec_rec( 0, btr_cur, TRUE, thr, &mtr); @@ -1764,13 +1829,15 @@ row_upd_sec_index_entry( goto func_exit; } + mem_heap_empty(heap); + /* Build a new index entry */ entry = row_build_index_entry(node->upd_row, node->upd_ext, index, heap); ut_a(entry); /* Insert new index entry */ - err = row_ins_index_entry(index, entry, 0, TRUE, thr); + err = row_ins_sec_index_entry(index, entry, thr); func_exit: mem_heap_free(heap); @@ -1783,8 +1850,8 @@ Updates the secondary index record if it is changed in the row update or deletes it if this is a delete. 
@return DB_SUCCESS if operation successfully completed, else error code or DB_LOCK_WAIT */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_upd_sec_step( /*=============*/ upd_node_t* node, /*!< in: row update node */ @@ -1897,8 +1964,8 @@ fields of the clustered index record change. This should be quite rare in database applications. @return DB_SUCCESS if operation successfully completed, else error code or DB_LOCK_WAIT */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_upd_clust_rec_by_insert( /*========================*/ upd_node_t* node, /*!< in/out: row update node */ @@ -1914,7 +1981,7 @@ row_upd_clust_rec_by_insert( trx_t* trx; dict_table_t* table; dtuple_t* entry; - ulint err; + dberr_t err; ibool change_ownership = FALSE; rec_t* rec; ulint* offsets = NULL; @@ -1939,7 +2006,7 @@ row_upd_clust_rec_by_insert( default: ut_error; case UPD_NODE_INSERT_BLOB: - /* A lock wait occurred in row_ins_index_entry() in + /* A lock wait occurred in row_ins_clust_index_entry() in the previous invocation of this function. Mark the off-page columns in the entry inherited. */ @@ -1948,7 +2015,7 @@ row_upd_clust_rec_by_insert( ut_a(change_ownership); /* fall through */ case UPD_NODE_INSERT_CLUSTERED: - /* A lock wait occurred in row_ins_index_entry() in + /* A lock wait occurred in row_ins_clust_index_entry() in the previous invocation of this function. */ break; case UPD_NODE_UPDATE_CLUSTERED: @@ -1961,8 +2028,8 @@ row_upd_clust_rec_by_insert( ut_ad(page_rec_is_user_rec(rec)); err = btr_cur_del_mark_set_clust_rec( - BTR_NO_LOCKING_FLAG, btr_cur_get_block(btr_cur), - rec, index, offsets, TRUE, thr, mtr); + btr_cur_get_block(btr_cur), rec, index, offsets, + thr, mtr); if (err != DB_SUCCESS) { err_exit: mtr_commit(mtr); @@ -1999,9 +2066,9 @@ err_exit: mtr_commit(mtr); - err = row_ins_index_entry(index, entry, - node->upd_ext ? 
node->upd_ext->n_ext : 0, - TRUE, thr); + err = row_ins_clust_index_entry( + index, entry, thr, + node->upd_ext ? node->upd_ext->n_ext : 0); node->state = change_ownership ? UPD_NODE_INSERT_BLOB : UPD_NODE_INSERT_CLUSTERED; @@ -2027,11 +2094,17 @@ err_exit: offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); ut_ad(page_rec_is_user_rec(rec)); + ut_ad(rec_get_deleted_flag(rec, rec_offs_comp(offsets))); btr_cur_disown_inherited_fields( btr_cur_get_page_zip(btr_cur), rec, index, offsets, node->update, mtr); + /* It is not necessary to call row_log_table for + this, because during online table rebuild, purge will + not free any BLOBs in the table, whether or not they + are owned by the clustered index record. */ + mtr_commit(mtr); } @@ -2045,20 +2118,24 @@ Updates a clustered index record of a row when the ordering fields do not change. @return DB_SUCCESS if operation successfully completed, else error code or DB_LOCK_WAIT */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_upd_clust_rec( /*==============*/ upd_node_t* node, /*!< in: row update node */ dict_index_t* index, /*!< in: clustered index */ + ulint* offsets,/*!< in: rec_get_offsets() on node->pcur */ + mem_heap_t** offsets_heap, + /*!< in/out: memory heap, can be emptied */ que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr) /*!< in: mtr; gets committed here */ { - mem_heap_t* heap = NULL; - big_rec_t* big_rec = NULL; + mem_heap_t* heap = NULL; + big_rec_t* big_rec = NULL; btr_pcur_t* pcur; btr_cur_t* btr_cur; - ulint err; + dberr_t err; + const dtuple_t* rebuilt_old_pk = NULL; ut_ad(node); ut_ad(dict_index_is_clust(index)); @@ -2066,33 +2143,48 @@ row_upd_clust_rec( pcur = node->pcur; btr_cur = btr_pcur_get_btr_cur(pcur); - ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), + ut_ad(btr_cur_get_index(btr_cur) == index); + ut_ad(!rec_get_deleted_flag(btr_cur_get_rec(btr_cur), dict_table_is_comp(index->table))); + 
ut_ad(rec_offs_validate(btr_cur_get_rec(btr_cur), index, offsets)); + + if (dict_index_is_online_ddl(index)) { + rebuilt_old_pk = row_log_table_get_pk( + btr_cur_get_rec(btr_cur), index, offsets, &heap); + } /* Try optimistic updating of the record, keeping changes within the page; we do not check locks because we assume the x-lock on the record to update */ if (node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE) { - err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG, - btr_cur, node->update, - node->cmpl_info, thr, mtr); + err = btr_cur_update_in_place( + BTR_NO_LOCKING_FLAG, btr_cur, + offsets, node->update, + node->cmpl_info, thr, thr_get_trx(thr)->id, mtr); } else { - err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG, - btr_cur, node->update, - node->cmpl_info, thr, mtr); + err = btr_cur_optimistic_update( + BTR_NO_LOCKING_FLAG, btr_cur, + &offsets, offsets_heap, node->update, + node->cmpl_info, thr, thr_get_trx(thr)->id, mtr); + } + + if (err == DB_SUCCESS && dict_index_is_online_ddl(index)) { + row_log_table_update(btr_cur_get_rec(btr_cur), + index, offsets, rebuilt_old_pk); } mtr_commit(mtr); if (UNIV_LIKELY(err == DB_SUCCESS)) { - return(DB_SUCCESS); + goto func_exit; } if (buf_LRU_buf_pool_running_out()) { - return(DB_LOCK_TABLE_FULL); + err = DB_LOCK_TABLE_FULL; + goto func_exit; } /* We may have to modify the tree structure: do a pessimistic descent down the index tree */ @@ -2110,14 +2202,16 @@ row_upd_clust_rec( ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), dict_table_is_comp(index->table))); + if (!heap) { + heap = mem_heap_create(1024); + } + err = btr_cur_pessimistic_update( BTR_NO_LOCKING_FLAG | BTR_KEEP_POS_FLAG, btr_cur, - &heap, &big_rec, node->update, node->cmpl_info, thr, mtr); + &offsets, offsets_heap, heap, &big_rec, + node->update, node->cmpl_info, + thr, thr_get_trx(thr)->id, mtr); if (big_rec) { - ulint offsets_[REC_OFFS_NORMAL_SIZE]; - rec_t* rec; - rec_offs_init(offsets_); - ut_a(err == DB_SUCCESS); /* Write out the externally stored 
columns while still x-latching @@ -2140,12 +2234,10 @@ row_upd_clust_rec( portion of the file, in case the file was somehow truncated in the crash. */ - rec = btr_cur_get_rec(btr_cur); DEBUG_SYNC_C("before_row_upd_extern"); err = btr_store_big_rec_extern_fields( - index, btr_cur_get_block(btr_cur), rec, - rec_get_offsets(rec, index, offsets_, - ULINT_UNDEFINED, &heap), + index, btr_cur_get_block(btr_cur), + btr_cur_get_rec(btr_cur), offsets, big_rec, mtr, BTR_STORE_UPDATE); DEBUG_SYNC_C("after_row_upd_extern"); /* If writing big_rec fails (for example, because of @@ -2164,9 +2256,14 @@ row_upd_clust_rec( ut_a(err == DB_SUCCESS); } - mtr_commit(mtr); + if (err == DB_SUCCESS && dict_index_is_online_ddl(index)) { + row_log_table_update(btr_cur_get_rec(btr_cur), + index, offsets, rebuilt_old_pk); + } - if (UNIV_LIKELY_NULL(heap)) { + mtr_commit(mtr); +func_exit: + if (heap) { mem_heap_free(heap); } @@ -2180,8 +2277,8 @@ row_upd_clust_rec( /***********************************************************//** Delete marks a clustered index record. @return DB_SUCCESS if operation successfully completed, else error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_upd_del_mark_clust_rec( /*=======================*/ upd_node_t* node, /*!< in: row update node */ @@ -2196,7 +2293,7 @@ row_upd_del_mark_clust_rec( { btr_pcur_t* pcur; btr_cur_t* btr_cur; - ulint err; + dberr_t err; ut_ad(node); ut_ad(dict_index_is_clust(index)); @@ -2214,8 +2311,8 @@ row_upd_del_mark_clust_rec( locks, because we assume that we have an x-lock on the record */ err = btr_cur_del_mark_set_clust_rec( - BTR_NO_LOCKING_FLAG, btr_cur_get_block(btr_cur), - btr_cur_get_rec(btr_cur), index, offsets, TRUE, thr, mtr); + btr_cur_get_block(btr_cur), btr_cur_get_rec(btr_cur), + index, offsets, thr, mtr); if (err == DB_SUCCESS && referenced) { /* NOTE that the following call loses the position of pcur ! 
*/ @@ -2232,8 +2329,8 @@ row_upd_del_mark_clust_rec( Updates the clustered index record. @return DB_SUCCESS if operation successfully completed, DB_LOCK_WAIT in case of a lock wait, else error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_upd_clust_step( /*===============*/ upd_node_t* node, /*!< in: row update node */ @@ -2242,11 +2339,10 @@ row_upd_clust_step( dict_index_t* index; btr_pcur_t* pcur; ibool success; - ulint err; - mtr_t* mtr; - mtr_t mtr_buf; + dberr_t err; + mtr_t mtr; rec_t* rec; - mem_heap_t* heap = NULL; + mem_heap_t* heap = NULL; ulint offsets_[REC_OFFS_NORMAL_SIZE]; ulint* offsets; ibool referenced; @@ -2259,9 +2355,8 @@ row_upd_clust_step( pcur = node->pcur; /* We have to restore the cursor to its position */ - mtr = &mtr_buf; - mtr_start(mtr); + mtr_start(&mtr); /* If the restoration does not succeed, then the same transaction has deleted the record on which the cursor was, @@ -2273,12 +2368,32 @@ row_upd_clust_step( ut_a(pcur->rel_pos == BTR_PCUR_ON); - success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr); + ulint mode; + +#ifdef UNIV_DEBUG + /* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC(). + Once it is fixed, remove the 'ifdef', 'if' and this comment. 
*/ + if (!thr_get_trx(thr)->ddl) { + DEBUG_SYNC_C_IF_THD( + thr_get_trx(thr)->mysql_thd, + "innodb_row_upd_clust_step_enter"); + } +#endif /* UNIV_DEBUG */ + + if (dict_index_is_online_ddl(index)) { + ut_ad(node->table->id != DICT_INDEXES_ID); + mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; + mtr_s_lock(dict_index_get_lock(index), &mtr); + } else { + mode = BTR_MODIFY_LEAF; + } + + success = btr_pcur_restore_position(mode, pcur, &mtr); if (!success) { err = DB_RECORD_NOT_FOUND; - mtr_commit(mtr); + mtr_commit(&mtr); return(err); } @@ -2289,18 +2404,20 @@ row_upd_clust_step( if (node->is_delete && node->table->id == DICT_INDEXES_ID) { - dict_drop_index_tree(btr_pcur_get_rec(pcur), mtr); + ut_ad(!dict_index_is_online_ddl(index)); - mtr_commit(mtr); + dict_drop_index_tree(btr_pcur_get_rec(pcur), &mtr); - mtr_start(mtr); + mtr_commit(&mtr); + + mtr_start(&mtr); success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, - mtr); + &mtr); if (!success) { err = DB_ERROR; - mtr_commit(mtr); + mtr_commit(&mtr); return(err); } @@ -2315,7 +2432,7 @@ row_upd_clust_step( 0, btr_pcur_get_block(pcur), rec, index, offsets, thr); if (err != DB_SUCCESS) { - mtr_commit(mtr); + mtr_commit(&mtr); goto exit_func; } } @@ -2324,17 +2441,14 @@ row_upd_clust_step( if (node->is_delete) { err = row_upd_del_mark_clust_rec( - node, index, offsets, thr, referenced, mtr); + node, index, offsets, thr, referenced, &mtr); if (err == DB_SUCCESS) { node->state = UPD_NODE_UPDATE_ALL_SEC; node->index = dict_table_get_next_index(index); } -exit_func: - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } - return(err); + + goto exit_func; } /* If the update is made for MySQL, we already have the update vector @@ -2348,13 +2462,11 @@ exit_func: row_upd_eval_new_vals(node->update); } - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } - if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) { - return(row_upd_clust_rec(node, index, thr, mtr)); + err = row_upd_clust_rec( + node, index, offsets, &heap, 
thr, &mtr); + goto exit_func; } row_upd_store_row(node); @@ -2374,20 +2486,21 @@ exit_func: externally! */ err = row_upd_clust_rec_by_insert( - node, index, thr, referenced, mtr); + node, index, thr, referenced, &mtr); if (err != DB_SUCCESS) { - return(err); + goto exit_func; } node->state = UPD_NODE_UPDATE_ALL_SEC; } else { - err = row_upd_clust_rec(node, index, thr, mtr); + err = row_upd_clust_rec( + node, index, offsets, &heap, thr, &mtr); if (err != DB_SUCCESS) { - return(err); + goto exit_func; } node->state = UPD_NODE_UPDATE_SOME_SEC; @@ -2395,6 +2508,10 @@ exit_func: node->index = dict_table_get_next_index(index); +exit_func: + if (heap) { + mem_heap_free(heap); + } return(err); } @@ -2404,14 +2521,14 @@ to this node, we assume that we have a persistent cursor which was on a record, and the position of the cursor is stored in the cursor. @return DB_SUCCESS if operation successfully completed, else error code or DB_LOCK_WAIT */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_upd( /*====*/ upd_node_t* node, /*!< in: row update node */ que_thr_t* thr) /*!< in: query thread */ { - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; ut_ad(node && thr); @@ -2449,6 +2566,17 @@ row_upd( return(DB_SUCCESS); } +#ifdef UNIV_DEBUG + /* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC(). + Once it is fixed, remove the 'ifdef', 'if' and this comment. 
*/ + if (!thr_get_trx(thr)->ddl) { + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, + "after_row_upd_clust"); + } +#endif /* UNIV_DEBUG */ + + DBUG_EXECUTE_IF("row_upd_skip_sec", node->index = NULL;); + do { /* Skip corrupted index */ dict_table_skip_corrupt_index(node->index); @@ -2458,7 +2586,6 @@ row_upd( } if (node->index->type != DICT_FTS) { - log_free_check(); err = row_upd_sec_step(node, thr); if (err != DB_SUCCESS) { @@ -2500,7 +2627,7 @@ row_upd_step( upd_node_t* node; sel_node_t* sel_node; que_node_t* parent; - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; trx_t* trx; ut_ad(thr); @@ -2579,7 +2706,7 @@ row_upd_step( err = row_upd(node, thr); error_handling: - trx->error_state = static_cast<enum db_err>(err); + trx->error_state = err; if (err != DB_SUCCESS) { return(NULL); diff --git a/storage/innobase/row/row0vers.cc b/storage/innobase/row/row0vers.cc index 0aad8675ff8..2c3191928fd 100644 --- a/storage/innobase/row/row0vers.cc +++ b/storage/innobase/row/row0vers.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -114,7 +114,6 @@ row_vers_impl_x_locked_low( on rec. */ for (version = clust_rec;; version = prev_version) { - ulint err; row_ext_t* ext; const dtuple_t* row; dtuple_t* entry; @@ -128,24 +127,22 @@ row_vers_impl_x_locked_low( heap = mem_heap_create(1024); - err = trx_undo_prev_version_build( + trx_undo_prev_version_build( clust_rec, mtr, version, clust_index, clust_offsets, heap, &prev_version); - /* Free version and clust_offsets. */ + /* Free version and clust_offsets. 
*/ mem_heap_free(old_heap); if (prev_version == NULL) { - /* clust_rec must be a fresh insert, because + /* clust_rec should be a fresh insert, because no previous version was found or the transaction has committed. The caller has to recheck as the synopsis of this function states, whether trx_id is active or not. */ - ut_a(err == DB_SUCCESS || err == DB_MISSING_HISTORY); - break; } @@ -155,15 +152,16 @@ row_vers_impl_x_locked_low( vers_del = rec_get_deleted_flag(prev_version, comp); - prev_trx_id = row_get_rec_trx_id( - prev_version, clust_index, clust_offsets); + prev_trx_id = row_get_rec_trx_id(prev_version, clust_index, + clust_offsets); /* The stack of versions is locked by mtr. Thus, it is safe to fetch the prefixes for externally stored columns. */ row = row_build(ROW_COPY_POINTERS, clust_index, prev_version, - clust_offsets, NULL, &ext, heap); + clust_offsets, + NULL, NULL, NULL, &ext, heap); entry = row_build_index_entry(row, ext, index, heap); @@ -183,8 +181,6 @@ row_vers_impl_x_locked_low( There is no guarantee that the transaction is still active. */ - ut_ad(err == DB_SUCCESS); - /* We check if entry and rec are identified in the alphabetical ordering */ @@ -355,7 +351,6 @@ row_vers_old_has_index_entry( mem_heap_t* heap2; const dtuple_t* row; const dtuple_t* entry; - ulint err; ulint comp; ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX) @@ -383,7 +378,8 @@ row_vers_old_has_index_entry( Thus, it is safe to fetch the prefixes for externally stored columns. 
*/ row = row_build(ROW_COPY_POINTERS, clust_index, - rec, clust_offsets, NULL, &ext, heap); + rec, clust_offsets, + NULL, NULL, NULL, &ext, heap); entry = row_build_index_entry(row, ext, index, heap); /* If entry == NULL, the record contains unset BLOB @@ -420,12 +416,12 @@ row_vers_old_has_index_entry( for (;;) { heap2 = heap; heap = mem_heap_create(1024); - err = trx_undo_prev_version_build(rec, mtr, version, - clust_index, clust_offsets, - heap, &prev_version); + trx_undo_prev_version_build(rec, mtr, version, + clust_index, clust_offsets, + heap, &prev_version); mem_heap_free(heap2); /* free version and clust_offsets */ - if (err != DB_SUCCESS || !prev_version) { + if (!prev_version) { /* Versions end here */ mem_heap_free(heap); @@ -444,7 +440,7 @@ row_vers_old_has_index_entry( externally stored columns. */ row = row_build(ROW_COPY_POINTERS, clust_index, prev_version, clust_offsets, - NULL, &ext, heap); + NULL, NULL, NULL, &ext, heap); entry = row_build_index_entry(row, ext, index, heap); /* If entry == NULL, the record contains unset @@ -477,7 +473,7 @@ read should see. We assume that the trx id stored in rec is such that the consistent read should not see rec in its present version. 
@return DB_SUCCESS or DB_MISSING_HISTORY */ UNIV_INTERN -ulint +dberr_t row_vers_build_for_consistent_read( /*===============================*/ const rec_t* rec, /*!< in: record in a clustered index; the @@ -495,8 +491,9 @@ row_vers_build_for_consistent_read( *old_vers is allocated; memory for possible intermediate versions is allocated and freed locally within the function */ - rec_t** old_vers)/*!< out, own: old version, or NULL if the - record does not exist in the view, that is, + rec_t** old_vers)/*!< out, own: old version, or NULL + if the history is missing or the record + does not exist in the view, that is, it was freshly inserted afterwards */ { const rec_t* version; @@ -504,7 +501,7 @@ row_vers_build_for_consistent_read( trx_id_t trx_id; mem_heap_t* heap = NULL; byte* buf; - ulint err; + dberr_t err; ut_ad(dict_index_is_clust(index)); ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX) @@ -558,27 +555,21 @@ row_vers_build_for_consistent_read( rec_offs_make_valid(*old_vers, index, *offsets); err = DB_SUCCESS; - break; } } err = trx_undo_prev_version_build(rec, mtr, version, index, *offsets, heap, - &prev_version); + &prev_version) + ? DB_SUCCESS : DB_MISSING_HISTORY; if (heap2) { mem_heap_free(heap2); /* free version */ } - if (err != DB_SUCCESS) { - break; - } - if (prev_version == NULL) { /* It was a freshly inserted version */ *old_vers = NULL; - err = DB_SUCCESS; - break; } @@ -602,8 +593,6 @@ row_vers_build_for_consistent_read( *old_vers = rec_copy(buf, prev_version, *offsets); rec_offs_make_valid(*old_vers, index, *offsets); - err = DB_SUCCESS; - break; } @@ -617,10 +606,9 @@ row_vers_build_for_consistent_read( /*****************************************************************//** Constructs the last committed version of a clustered index record, -which should be seen by a semi-consistent read. -@return DB_SUCCESS or DB_MISSING_HISTORY */ +which should be seen by a semi-consistent read. 
*/ UNIV_INTERN -ulint +void row_vers_build_for_semi_consistent_read( /*====================================*/ const rec_t* rec, /*!< in: record in a clustered index; the @@ -644,7 +632,6 @@ row_vers_build_for_semi_consistent_read( const rec_t* version; mem_heap_t* heap = NULL; byte* buf; - ulint err; trx_id_t rec_trx_id = 0; ut_ad(dict_index_is_clust(index)); @@ -683,7 +670,7 @@ row_vers_build_for_semi_consistent_read( mutex_exit(&trx_sys->mutex); if (!version_trx) { - +committed_version_trx: /* We found a version that belongs to a committed transaction: return it. */ @@ -693,7 +680,6 @@ row_vers_build_for_semi_consistent_read( if (rec == version) { *old_vers = rec; - err = DB_SUCCESS; break; } @@ -721,30 +707,30 @@ row_vers_build_for_semi_consistent_read( *old_vers = rec_copy(buf, version, *offsets); rec_offs_make_valid(*old_vers, index, *offsets); - err = DB_SUCCESS; - break; } + DEBUG_SYNC_C("after_row_vers_check_trx_active"); + heap2 = heap; heap = mem_heap_create(1024); - err = trx_undo_prev_version_build(rec, mtr, version, index, - *offsets, heap, - &prev_version); - if (heap2) { - mem_heap_free(heap2); /* free version */ + if (!trx_undo_prev_version_build(rec, mtr, version, index, + *offsets, heap, + &prev_version)) { + mem_heap_free(heap); + heap = heap2; + heap2 = NULL; + goto committed_version_trx; } - if (UNIV_UNLIKELY(err != DB_SUCCESS)) { - break; + if (heap2) { + mem_heap_free(heap2); /* free version */ } if (prev_version == NULL) { /* It was a freshly inserted version */ *old_vers = NULL; - err = DB_SUCCESS; - break; } @@ -759,6 +745,4 @@ row_vers_build_for_semi_consistent_read( if (heap) { mem_heap_free(heap); } - - return(err); } diff --git a/storage/innobase/srv/srv0conc.cc b/storage/innobase/srv/srv0conc.cc index d5c949f3a06..820700a95a8 100644 --- a/storage/innobase/srv/srv0conc.cc +++ b/storage/innobase/srv/srv0conc.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2011, Oracle 
and/or its affiliates. All Rights Reserved. +Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -40,7 +40,6 @@ Created 2011/04/18 Sunny Bains #include "srv0srv.h" #include "sync0sync.h" #include "trx0trx.h" -#include "ha_prototypes.h" #include "mysql/plugin.h" @@ -73,13 +72,11 @@ UNIV_INTERN ulong srv_thread_concurrency = 0; /** This mutex protects srv_conc data structures */ static os_fast_mutex_t srv_conc_mutex; -/** Slot for a thread waiting in the concurrency control queue. */ -typedef struct srv_conc_slot_struct srv_conc_slot_t; - /** Concurrency list node */ -typedef UT_LIST_NODE_T(srv_conc_slot_t) srv_conc_node_t; +typedef UT_LIST_NODE_T(struct srv_conc_slot_t) srv_conc_node_t; -struct srv_conc_slot_struct{ +/** Slot for a thread waiting in the concurrency control queue. */ +struct srv_conc_slot_t{ os_event_t event; /*!< event to wait */ ibool reserved; /*!< TRUE if slot reserved */ @@ -106,10 +103,8 @@ UNIV_INTERN mysql_pfs_key_t srv_conc_mutex_key; #endif /* !HAVE_ATOMIC_BUILTINS */ -typedef struct srv_conc_struct srv_conc_t; - /** Variables tracking the active and waiting threads. */ -struct srv_conc_struct { +struct srv_conc_t { char pad[64 - (sizeof(ulint) + sizeof(lint))]; /** Number of transactions that have declared_to_be_inside_innodb set. 
@@ -148,7 +143,7 @@ srv_conc_init(void) for (i = 0; i < OS_THREAD_MAX_N; i++) { srv_conc_slot_t* conc_slot = &srv_conc_slots[i]; - conc_slot->event = os_event_create(NULL); + conc_slot->event = os_event_create(); ut_a(conc_slot->event); } #endif /* !HAVE_ATOMIC_BUILTINS */ @@ -224,9 +219,7 @@ srv_conc_enter_innodb_with_atomics( (void) os_atomic_decrement_lint( &srv_conc.n_waiting, 1); - thd_wait_end( - static_cast<THD*>( - trx->mysql_thd)); + thd_wait_end(trx->mysql_thd); } if (srv_adaptive_max_sleep_delay > 0) { @@ -262,9 +255,7 @@ srv_conc_enter_innodb_with_atomics( trx_search_latch_release_if_reserved(trx); } - thd_wait_begin( - static_cast<THD*>(trx->mysql_thd), - THD_WAIT_USER_LOCK); + thd_wait_begin(trx->mysql_thd, THD_WAIT_USER_LOCK); notified_mysql = TRUE; } @@ -477,10 +468,10 @@ retry: #endif /* UNIV_SYNC_DEBUG */ trx->op_info = "waiting in InnoDB queue"; - thd_wait_begin(static_cast<THD*>(trx->mysql_thd), THD_WAIT_USER_LOCK); + thd_wait_begin(trx->mysql_thd, THD_WAIT_USER_LOCK); os_event_wait(slot->event); - thd_wait_end(static_cast<THD*>(trx->mysql_thd)); + thd_wait_end(trx->mysql_thd); trx->op_info = ""; diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc index 9c6e56bcb9d..3b3da2f070f 100644 --- a/storage/innobase/srv/srv0mon.cc +++ b/storage/innobase/srv/srv0mon.cc @@ -1,6 +1,7 @@ /***************************************************************************** -Copyright (c) 2010, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2010, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -33,7 +34,6 @@ Created 12/9/2009 Jimmy Yang #include "trx0rseg.h" #include "lock0lock.h" #include "ibuf0ibuf.h" -#include "btr0cur.h" #ifdef UNIV_NONINL #include "srv0mon.ic" #endif @@ -215,11 +215,6 @@ static monitor_info_t innodb_counter_info[] = MONITOR_EXISTING | MONITOR_DEFAULT_ON), MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_WRITE_REQUEST}, - {"buffer_pool_pages_in_flush", "buffer", - "Number of pages in flush list", - MONITOR_NONE, - MONITOR_DEFAULT_START, MONITOR_PAGE_INFLUSH}, - {"buffer_pool_wait_free", "buffer", "Number of times waited for free buffer" " (innodb_buffer_pool_wait_free)", @@ -259,12 +254,24 @@ static monitor_info_t innodb_counter_info[] = MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON), MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGES_DATA}, + {"buffer_pool_bytes_data", "buffer", + "Buffer bytes containing data (innodb_buffer_pool_bytes_data)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_BYTES_DATA}, + {"buffer_pool_pages_dirty", "buffer", "Buffer pages currently dirty (innodb_buffer_pool_pages_dirty)", static_cast<monitor_type_t>( MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON), MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGES_DIRTY}, + {"buffer_pool_bytes_dirty", "buffer", + "Buffer bytes currently dirty (innodb_buffer_pool_bytes_dirty)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_BYTES_DIRTY}, + {"buffer_pool_pages_free", "buffer", "Buffer pages currently free (innodb_buffer_pool_pages_free)", static_cast<monitor_type_t>( @@ -350,25 +357,40 @@ static monitor_info_t innodb_counter_info[] = MONITOR_SET_MEMBER, 
MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE, MONITOR_FLUSH_NEIGHBOR_PAGES}, - /* Cumulative counter for flush batches because of max_dirty */ - {"buffer_flush_max_dirty_total_pages", "buffer", - "Total pages flushed as part of max_dirty batches", - MONITOR_SET_OWNER, MONITOR_FLUSH_MAX_DIRTY_COUNT, - MONITOR_FLUSH_MAX_DIRTY_TOTAL_PAGE}, + {"buffer_flush_n_to_flush_requested", "buffer", + "Number of pages requested for flushing.", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_FLUSH_N_TO_FLUSH_REQUESTED}, + + {"buffer_flush_avg_page_rate", "buffer", + "Average number of pages at which flushing is happening", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_FLUSH_AVG_PAGE_RATE}, + + {"buffer_flush_lsn_avg_rate", "buffer", + "Average redo generation rate", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_FLUSH_LSN_AVG_RATE}, + + {"buffer_flush_pct_for_dirty", "buffer", + "Percent of IO capacity used to avoid max dirty page limit", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_FLUSH_PCT_FOR_DIRTY}, - {"buffer_flush_max_dirty", "buffer", - "Number of max_dirty batches", - MONITOR_SET_MEMBER, MONITOR_FLUSH_MAX_DIRTY_TOTAL_PAGE, - MONITOR_FLUSH_MAX_DIRTY_COUNT}, + {"buffer_flush_pct_for_lsn", "buffer", + "Percent of IO capacity used to avoid reusable redo space limit", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_FLUSH_PCT_FOR_LSN}, + + {"buffer_flush_sync_waits", "buffer", + "Number of times a wait happens due to sync flushing", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_FLUSH_SYNC_WAITS}, - {"buffer_flush_max_dirty_pages", "buffer", - "Pages queued as a max_dirty batch", - MONITOR_SET_MEMBER, MONITOR_FLUSH_MAX_DIRTY_TOTAL_PAGE, - MONITOR_FLUSH_MAX_DIRTY_PAGES}, - /* Cumulative counter for flush batches because of adaptive */ + /* Cumulative counter for flush batches for adaptive flushing */ {"buffer_flush_adaptive_total_pages", "buffer", - "Total pages flushed as part of adaptive batches", + "Total pages flushed as part of adaptive flushing", MONITOR_SET_OWNER, 
MONITOR_FLUSH_ADAPTIVE_COUNT, MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE}, @@ -382,22 +404,6 @@ static monitor_info_t innodb_counter_info[] = MONITOR_SET_MEMBER, MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, MONITOR_FLUSH_ADAPTIVE_PAGES}, - /* Cumulative counter for flush batches because of async */ - {"buffer_flush_async_total_pages", "buffer", - "Total pages flushed as part of async batches", - MONITOR_SET_OWNER, MONITOR_FLUSH_ASYNC_COUNT, - MONITOR_FLUSH_ASYNC_TOTAL_PAGE}, - - {"buffer_flush_async", "buffer", - "Number of async batches", - MONITOR_SET_MEMBER, MONITOR_FLUSH_ASYNC_TOTAL_PAGE, - MONITOR_FLUSH_ASYNC_COUNT}, - - {"buffer_flush_async_pages", "buffer", - "Pages queued as an async batch", - MONITOR_SET_MEMBER, MONITOR_FLUSH_ASYNC_TOTAL_PAGE, - MONITOR_FLUSH_ASYNC_PAGES}, - /* Cumulative counter for flush batches because of sync */ {"buffer_flush_sync_total_pages", "buffer", "Total pages flushed as part of sync batches", @@ -859,6 +865,16 @@ static monitor_info_t innodb_counter_info[] = MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_PAGE_DECOMPRESS}, + {"compression_pad_increments", "compression", + "Number of times padding is incremented to avoid compression failures", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_PAD_INCREMENTS}, + + {"compression_pad_decrements", "compression", + "Number of times padding is decremented due to good compressibility", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_PAD_DECREMENTS}, + /* ========== Counters for Index ========== */ {"module_index", "index", "Index Manager", MONITOR_MODULE, @@ -1130,11 +1146,26 @@ static monitor_info_t innodb_counter_info[] = MONITOR_MODULE, MONITOR_DEFAULT_START, MONITOR_MODULE_DDL_STATS}, + {"ddl_background_drop_indexes", "ddl", + "Number of indexes waiting to be dropped after failed index creation", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_BACKGROUND_DROP_INDEX}, + {"ddl_background_drop_tables", "ddl", "Number of tables in background drop table list", MONITOR_NONE, MONITOR_DEFAULT_START, 
MONITOR_BACKGROUND_DROP_TABLE}, + {"ddl_online_create_index", "ddl", + "Number of indexes being created online", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_ONLINE_CREATE_INDEX}, + + {"ddl_pending_alter_table", "ddl", + "Number of ALTER TABLE, CREATE INDEX, DROP INDEX in progress", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_PENDING_ALTER_TABLE}, + /* ===== Counters for ICP (Index Condition Pushdown) Module ===== */ {"module_icp", "icp", "Index Condition Pushdown", MONITOR_MODULE, @@ -1171,6 +1202,34 @@ has been turned on/off. */ UNIV_INTERN ulint monitor_set_tbl[(NUM_MONITOR + NUM_BITS_ULINT - 1) / NUM_BITS_ULINT]; +#ifndef HAVE_ATOMIC_BUILTINS_64 +/** Mutex protecting atomic operations on platforms that lack +built-in operations for atomic memory access */ +ib_mutex_t monitor_mutex; + +/** Key to register monitor_mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t monitor_mutex_key; + +/****************************************************************//** +Initialize the monitor subsystem. */ +UNIV_INTERN +void +srv_mon_create(void) +/*================*/ +{ + mutex_create(monitor_mutex_key, &monitor_mutex, SYNC_ANY_LATCH); +} +/****************************************************************//** +Close the monitor subsystem. */ +UNIV_INTERN +void +srv_mon_free(void) +/*==============*/ +{ + mutex_free(&monitor_mutex); +} +#endif /* !HAVE_ATOMIC_BUILTINS_64 */ + /****************************************************************//** Get a monitor's "monitor_info" by its monitor id (index into the innodb_counter_info array. 
@@ -1359,13 +1418,14 @@ srv_mon_process_existing_counter( mon_option_t set_option) /*!< in: Turn on/off reset the counter */ { - mon_type_t value; - monitor_info_t* monitor_info; - ibool update_min = FALSE; - buf_pool_stat_t stat; - ulint LRU_len; - ulint free_len; - ulint flush_list_len; + mon_type_t value; + monitor_info_t* monitor_info; + ibool update_min = FALSE; + buf_pool_stat_t stat; + buf_pools_list_size_t buf_pools_list_size; + ulint LRU_len; + ulint free_len; + ulint flush_list_len; monitor_info = srv_mon_get_info(monitor_id); @@ -1381,7 +1441,7 @@ srv_mon_process_existing_counter( /* export_vars.innodb_buffer_pool_reads. Num Reads from disk (page not in buffer) */ case MONITOR_OVLD_BUF_POOL_READS: - value = srv_buf_pool_reads; + value = srv_stats.buf_pool_reads; break; /* innodb_buffer_pool_read_requests, the number of logical @@ -1394,12 +1454,12 @@ srv_mon_process_existing_counter( /* innodb_buffer_pool_write_requests, the number of write request */ case MONITOR_OVLD_BUF_POOL_WRITE_REQUEST: - value = srv_buf_pool_write_requests; + value = srv_stats.buf_pool_write_requests; break; /* innodb_buffer_pool_wait_free */ case MONITOR_OVLD_BUF_POOL_WAIT_FREE: - value = srv_buf_pool_wait_free; + value = srv_stats.buf_pool_wait_free; break; /* innodb_buffer_pool_read_ahead */ @@ -1431,12 +1491,25 @@ srv_mon_process_existing_counter( value = LRU_len; break; + /* innodb_buffer_pool_bytes_data */ + case MONITOR_OVLD_BUF_POOL_BYTES_DATA: + buf_get_total_list_size_in_bytes(&buf_pools_list_size); + value = buf_pools_list_size.LRU_bytes + + buf_pools_list_size.unzip_LRU_bytes; + break; + /* innodb_buffer_pool_pages_dirty */ case MONITOR_OVLD_BUF_POOL_PAGES_DIRTY: buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len); value = flush_list_len; break; + /* innodb_buffer_pool_bytes_dirty */ + case MONITOR_OVLD_BUF_POOL_BYTES_DIRTY: + buf_get_total_list_size_in_bytes(&buf_pools_list_size); + value = buf_pools_list_size.flush_list_bytes; + break; + /* 
innodb_buffer_pool_pages_free */ case MONITOR_OVLD_BUF_POOL_PAGES_FREE: buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len); @@ -1463,12 +1536,12 @@ srv_mon_process_existing_counter( /* innodb_data_reads, the total number of data reads */ case MONITOR_OVLD_BYTE_READ: - value = srv_data_read; + value = srv_stats.data_read; break; /* innodb_data_writes, the total number of data writes. */ case MONITOR_OVLD_BYTE_WRITTEN: - value = srv_data_written; + value = srv_stats.data_written; break; /* innodb_data_reads, the total number of data reads. */ @@ -1488,7 +1561,7 @@ srv_mon_process_existing_counter( /* innodb_os_log_written */ case MONITOR_OVLD_OS_LOG_WRITTEN: - value = (mon_type_t) srv_os_log_written; + value = (mon_type_t) srv_stats.os_log_written; break; /* innodb_os_log_fsyncs */ @@ -1504,33 +1577,33 @@ srv_mon_process_existing_counter( /* innodb_os_log_pending_writes */ case MONITOR_OVLD_OS_LOG_PENDING_WRITES: - value = srv_os_log_pending_writes; + value = srv_stats.os_log_pending_writes; update_min = TRUE; break; /* innodb_log_waits */ case MONITOR_OVLD_LOG_WAITS: - value = srv_log_waits; + value = srv_stats.log_waits; break; /* innodb_log_write_requests */ case MONITOR_OVLD_LOG_WRITE_REQUEST: - value = srv_log_write_requests; + value = srv_stats.log_write_requests; break; /* innodb_log_writes */ case MONITOR_OVLD_LOG_WRITES: - value = srv_log_writes; + value = srv_stats.log_writes; break; /* innodb_dblwr_writes */ case MONITOR_OVLD_SRV_DBLWR_WRITES: - value = srv_dblwr_writes; + value = srv_stats.dblwr_writes; break; /* innodb_dblwr_pages_written */ case MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN: - value = srv_dblwr_pages_written; + value = srv_stats.dblwr_pages_written; break; /* innodb_page_size */ @@ -1539,27 +1612,27 @@ srv_mon_process_existing_counter( break; case MONITOR_OVLD_RWLOCK_S_SPIN_WAITS: - value = rw_s_spin_wait_count; + value = rw_lock_stats.rw_s_spin_wait_count; break; case MONITOR_OVLD_RWLOCK_X_SPIN_WAITS: - value = rw_x_os_wait_count; + 
value = rw_lock_stats.rw_x_os_wait_count; break; case MONITOR_OVLD_RWLOCK_S_SPIN_ROUNDS: - value = rw_s_spin_round_count; + value = rw_lock_stats.rw_s_spin_round_count; break; case MONITOR_OVLD_RWLOCK_X_SPIN_ROUNDS: - value = rw_x_spin_round_count; + value = rw_lock_stats.rw_x_spin_round_count; break; case MONITOR_OVLD_RWLOCK_S_OS_WAITS: - value = rw_s_os_wait_count; + value = rw_lock_stats.rw_s_os_wait_count; break; case MONITOR_OVLD_RWLOCK_X_OS_WAITS: - value = rw_x_os_wait_count; + value = rw_lock_stats.rw_x_os_wait_count; break; case MONITOR_OVLD_BUFFER_POOL_SIZE: @@ -1568,44 +1641,44 @@ srv_mon_process_existing_counter( /* innodb_rows_read */ case MONITOR_OLVD_ROW_READ: - value = srv_n_rows_read; + value = srv_stats.n_rows_read; break; /* innodb_rows_inserted */ case MONITOR_OLVD_ROW_INSERTED: - value = srv_n_rows_inserted; + value = srv_stats.n_rows_inserted; break; /* innodb_rows_deleted */ case MONITOR_OLVD_ROW_DELETED: - value = srv_n_rows_deleted; + value = srv_stats.n_rows_deleted; break; /* innodb_rows_updated */ case MONITOR_OLVD_ROW_UPDTATED: - value = srv_n_rows_updated; + value = srv_stats.n_rows_updated; break; /* innodb_row_lock_current_waits */ case MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT: - value = srv_n_lock_wait_current_count; + value = srv_stats.n_lock_wait_current_count; break; /* innodb_row_lock_time */ case MONITOR_OVLD_LOCK_WAIT_TIME: - value = srv_n_lock_wait_time / 1000; + value = srv_stats.n_lock_wait_time / 1000; break; /* innodb_row_lock_time_max */ case MONITOR_OVLD_LOCK_MAX_WAIT_TIME: - value = srv_n_lock_max_wait_time / 1000; + value = lock_sys->n_lock_max_wait_time / 1000; break; /* innodb_row_lock_time_avg */ case MONITOR_OVLD_LOCK_AVG_WAIT_TIME: - if (srv_n_lock_wait_count > 0) { - value = srv_n_lock_wait_time / 1000 - / srv_n_lock_wait_count; + if (srv_stats.n_lock_wait_count > 0) { + value = srv_stats.n_lock_wait_time / 1000 + / srv_stats.n_lock_wait_count; } else { value = 0; } @@ -1613,7 +1686,7 @@ 
srv_mon_process_existing_counter( /* innodb_row_lock_waits */ case MONITOR_OVLD_ROW_LOCK_WAIT: - value = srv_n_lock_wait_count; + value = srv_stats.n_lock_wait_count; break; case MONITOR_RSEG_HISTORY_LEN: diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index e64cc006f02..5c0ca903417 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -59,6 +59,7 @@ Created 10/8/1995 Heikki Tuuri #include "btr0sea.h" #include "dict0load.h" #include "dict0boot.h" +#include "dict0stats_bg.h" /* dict_stats_event */ #include "srv0start.h" #include "row0mysql.h" #include "ha_prototypes.h" @@ -70,10 +71,6 @@ Created 10/8/1995 Heikki Tuuri #include "mysql/plugin.h" #include "mysql/service_thd_wait.h" -/* The following counter is incremented whenever there is some user activity -in the server */ -UNIV_INTERN ulint srv_activity_count = 0; - /* The following is the maximum allowed duration of a lock wait. */ UNIV_INTERN ulint srv_fatal_semaphore_wait_threshold = 600; @@ -86,6 +83,8 @@ UNIV_INTERN ibool srv_error_monitor_active = FALSE; UNIV_INTERN ibool srv_buf_dump_thread_active = FALSE; +UNIV_INTERN ibool srv_dict_stats_thread_active = FALSE; + UNIV_INTERN const char* srv_main_thread_op_info = ""; /** Prefix used by MySQL to indicate pre-5.1 table name encoding */ @@ -104,6 +103,9 @@ UNIV_INTERN char* srv_undo_dir = NULL; /** The number of tablespaces to use for rollback segments. */ UNIV_INTERN ulong srv_undo_tablespaces = 8; +/** The number of UNDO tablespaces that are open and ready to use. */ +UNIV_INTERN ulint srv_undo_tablespaces_open = 8; + /* The number of rollback segments to use */ UNIV_INTERN ulong srv_undo_logs = 1; @@ -111,6 +113,10 @@ UNIV_INTERN ulong srv_undo_logs = 1; UNIV_INTERN char* srv_arch_dir = NULL; #endif /* UNIV_LOG_ARCHIVE */ +/** Set if InnoDB must operate in read-only mode. We don't do any +recovery and open all tables in RO mode instead of RW mode. We don't +sync the max trx id to disk either. 
*/ +UNIV_INTERN my_bool srv_read_only_mode; /** store to its own file each table created by an user; data dictionary tables are in the system tablespace 0 */ UNIV_INTERN my_bool srv_file_per_table; @@ -128,6 +134,10 @@ UNIV_INTERN ulint srv_max_file_format_at_startup = UNIV_FORMAT_MAX; /** Place locks to records only i.e. do not use next-key locking except on duplicate key checking and foreign key checking */ UNIV_INTERN ibool srv_locks_unsafe_for_binlog = FALSE; +/** Sort buffer size in index creation */ +UNIV_INTERN ulong srv_sort_buf_size = 1048576; +/** Maximum modification log file size for online index creation */ +UNIV_INTERN unsigned long long srv_online_max_size; /* If this flag is TRUE, then we will use the native aio of the OS (provided we compiled Innobase with it in), otherwise we will @@ -170,15 +180,16 @@ the user from forgetting the 'newraw' keyword to my.cnf */ UNIV_INTERN ibool srv_created_new_raw = FALSE; -UNIV_INTERN char** srv_log_group_home_dirs = NULL; +UNIV_INTERN char* srv_log_group_home_dir = NULL; -UNIV_INTERN ulint srv_n_log_groups = ULINT_MAX; -UNIV_INTERN ulint srv_n_log_files = ULINT_MAX; +UNIV_INTERN ulong srv_n_log_files = SRV_N_LOG_FILES_MAX; /* size in database pages */ UNIV_INTERN ib_uint64_t srv_log_file_size = IB_UINT64_MAX; +UNIV_INTERN ib_uint64_t srv_log_file_size_requested; /* size in database pages */ UNIV_INTERN ulint srv_log_buffer_size = ULINT_MAX; UNIV_INTERN ulong srv_flush_log_at_trx_commit = 1; +UNIV_INTERN uint srv_flush_log_at_timeout = 1; UNIV_INTERN ulong srv_page_size = UNIV_PAGE_SIZE_DEF; UNIV_INTERN ulong srv_page_size_shift = UNIV_PAGE_SIZE_SHIFT_DEF; @@ -211,7 +222,7 @@ UNIV_INTERN ulong srv_n_page_hash_locks = 16; /** Scan depth for LRU flush batch i.e.: number of blocks scanned*/ UNIV_INTERN ulong srv_LRU_scan_depth = 1024; /** whether or not to flush neighbors of a block */ -UNIV_INTERN my_bool srv_flush_neighbors = TRUE; +UNIV_INTERN ulong srv_flush_neighbors = 1; /* previously requested size */ 
UNIV_INTERN ulint srv_buf_pool_old_size; /* current size in kilobytes */ @@ -256,7 +267,8 @@ UNIV_INTERN ulint srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; UNIV_INTERN ulint srv_max_n_open_files = 300; /* Number of IO operations per second the server can do */ -UNIV_INTERN ulong srv_io_capacity = 400; +UNIV_INTERN ulong srv_io_capacity = 200; +UNIV_INTERN ulong srv_max_io_capacity = 400; /* The InnoDB main thread tries to keep the ratio of modified pages in the buffer pool to all database pages in the buffer pool smaller than @@ -264,76 +276,49 @@ the following number. But it is not guaranteed that the value stays below that during a time of heavy update/insert activity. */ UNIV_INTERN ulong srv_max_buf_pool_modified_pct = 75; +UNIV_INTERN ulong srv_max_dirty_pages_pct_lwm = 50; + +/* This is the percentage of log capacity at which adaptive flushing, +if enabled, will kick in. */ +UNIV_INTERN ulong srv_adaptive_flushing_lwm = 10; + +/* Number of iterations over which adaptive flushing is averaged. */ +UNIV_INTERN ulong srv_flushing_avg_loops = 30; /* The number of purge threads to use.*/ -UNIV_INTERN ulong srv_n_purge_threads = 1; +UNIV_INTERN ulong srv_n_purge_threads = 1; /* the number of pages to purge in one batch */ -UNIV_INTERN ulong srv_purge_batch_size = 20; - -/* variable counts amount of data read in total (in bytes) */ -UNIV_INTERN ulint srv_data_read = 0; +UNIV_INTERN ulong srv_purge_batch_size = 20; /* Internal setting for "innodb_stats_method". Decides how InnoDB treats NULL value when collecting statistics. By default, it is set to SRV_STATS_NULLS_EQUAL(0), ie. 
all NULL value are treated equal */ -ulong srv_innodb_stats_method = SRV_STATS_NULLS_EQUAL; - -/* here we count the amount of data written in total (in bytes) */ -UNIV_INTERN ulint srv_data_written = 0; - -/* the number of the log write requests done */ -UNIV_INTERN ulint srv_log_write_requests = 0; - -/* the number of physical writes to the log performed */ -UNIV_INTERN ulint srv_log_writes = 0; - -/* amount of data written to the log files in bytes */ -UNIV_INTERN lsn_t srv_os_log_written = 0; +UNIV_INTERN ulong srv_innodb_stats_method = SRV_STATS_NULLS_EQUAL; -/* amount of writes being done to the log files */ -UNIV_INTERN ulint srv_os_log_pending_writes = 0; - -/* we increase this counter, when there we don't have enough space in the -log buffer and have to flush it */ -UNIV_INTERN ulint srv_log_waits = 0; - -/* this variable counts the amount of times, when the doublewrite buffer -was flushed */ -UNIV_INTERN ulint srv_dblwr_writes = 0; - -/* here we store the number of pages that have been flushed to the -doublewrite buffer */ -UNIV_INTERN ulint srv_dblwr_pages_written = 0; - -/* in this variable we store the number of write requests issued */ -UNIV_INTERN ulint srv_buf_pool_write_requests = 0; - -/* here we store the number of times when we had to wait for a free page -in the buffer pool. It happens when the buffer pool is full and we need -to make a flush, in order to be able to read or create a page. */ -UNIV_INTERN ulint srv_buf_pool_wait_free = 0; - -/* variable to count the number of pages that were written from buffer -pool to the disk */ -UNIV_INTERN ulint srv_buf_pool_flushed = 0; - -/** Number of buffer pool reads that led to the -reading of a disk page */ -UNIV_INTERN ulint srv_buf_pool_reads = 0; +UNIV_INTERN srv_stats_t srv_stats; /* structure to pass status variables to MySQL */ -UNIV_INTERN export_struc export_vars; - -/* If the following is != 0 we do not allow inserts etc. 
This protects -the user from forgetting the innodb_force_recovery keyword to my.cnf */ - -UNIV_INTERN ulint srv_force_recovery = 0; +UNIV_INTERN export_var_t export_vars; + +/** Normally 0. When nonzero, skip some phases of crash recovery, +starting from SRV_FORCE_IGNORE_CORRUPT, so that data can be recovered +by SELECT or mysqldump. When this is nonzero, we do not allow any user +modifications to the data. */ +UNIV_INTERN ulong srv_force_recovery; +#ifndef DBUG_OFF +/** Inject a crash at different steps of the recovery process. +This is for testing and debugging only. */ +UNIV_INTERN ulong srv_force_recovery_crash; +#endif /* !DBUG_OFF */ /** Print all user-level transactions deadlocks to mysqld stderr */ UNIV_INTERN my_bool srv_print_all_deadlocks = FALSE; +/** Enable INFORMATION_SCHEMA.innodb_cmp_per_index */ +UNIV_INTERN my_bool srv_cmp_per_index_enabled = FALSE; + /* If the following is set to 1 then we do not run purge and insert buffer merge to completion before shutdown. If it is set to 2, do not even flush the buffer pool to data files at the shutdown: we effectively 'crash' @@ -350,7 +335,9 @@ this many index pages, there are 2 ways to calculate statistics: * quick transient stats, that are used if persistent stats for the given table/index are not found in the innodb database */ UNIV_INTERN unsigned long long srv_stats_transient_sample_pages = 8; +UNIV_INTERN my_bool srv_stats_persistent = TRUE; UNIV_INTERN unsigned long long srv_stats_persistent_sample_pages = 20; +UNIV_INTERN my_bool srv_stats_auto_recalc = TRUE; UNIV_INTERN ibool srv_use_doublewrite_buf = TRUE; @@ -375,11 +362,6 @@ UNIV_INTERN ibool srv_print_log_io = FALSE; UNIV_INTERN ibool srv_print_latch_waits = FALSE; #endif /* UNIV_DEBUG */ -UNIV_INTERN ulint srv_n_rows_inserted = 0; -UNIV_INTERN ulint srv_n_rows_updated = 0; -UNIV_INTERN ulint srv_n_rows_deleted = 0; -UNIV_INTERN ulint srv_n_rows_read = 0; - static ulint srv_n_rows_inserted_old = 0; static ulint srv_n_rows_updated_old = 0; 
static ulint srv_n_rows_deleted_old = 0; @@ -404,58 +386,58 @@ UNIV_INTERN const char* srv_io_thread_function[SRV_MAX_N_IO_THREADS]; UNIV_INTERN time_t srv_last_monitor_time; -UNIV_INTERN mutex_t srv_innodb_monitor_mutex; +UNIV_INTERN ib_mutex_t srv_innodb_monitor_mutex; -/* Mutex for locking srv_monitor_file */ -UNIV_INTERN mutex_t srv_monitor_file_mutex; +/* Mutex for locking srv_monitor_file. Not created if srv_read_only_mode */ +UNIV_INTERN ib_mutex_t srv_monitor_file_mutex; #ifdef UNIV_PFS_MUTEX # ifndef HAVE_ATOMIC_BUILTINS /* Key to register server_mutex with performance schema */ UNIV_INTERN mysql_pfs_key_t server_mutex_key; # endif /* !HAVE_ATOMIC_BUILTINS */ -/* Key to register srv_innodb_monitor_mutex with performance schema */ +/** Key to register srv_innodb_monitor_mutex with performance schema */ UNIV_INTERN mysql_pfs_key_t srv_innodb_monitor_mutex_key; -/* Key to register srv_monitor_file_mutex with performance schema */ +/** Key to register srv_monitor_file_mutex with performance schema */ UNIV_INTERN mysql_pfs_key_t srv_monitor_file_mutex_key; -/* Key to register srv_dict_tmpfile_mutex with performance schema */ +/** Key to register srv_dict_tmpfile_mutex with performance schema */ UNIV_INTERN mysql_pfs_key_t srv_dict_tmpfile_mutex_key; -/* Key to register the mutex with performance schema */ +/** Key to register the mutex with performance schema */ UNIV_INTERN mysql_pfs_key_t srv_misc_tmpfile_mutex_key; -/* Key to register srv_sys_t::mutex with performance schema */ +/** Key to register srv_sys_t::mutex with performance schema */ UNIV_INTERN mysql_pfs_key_t srv_sys_mutex_key; -/* Key to register srv_sys_t::tasks_mutex with performance schema */ +/** Key to register srv_sys_t::tasks_mutex with performance schema */ UNIV_INTERN mysql_pfs_key_t srv_sys_tasks_mutex_key; #endif /* UNIV_PFS_MUTEX */ -/* Temporary file for innodb monitor output */ +/** Temporary file for innodb monitor output */ UNIV_INTERN FILE* srv_monitor_file; -/* Mutex for locking 
srv_dict_tmpfile. +/** Mutex for locking srv_dict_tmpfile. Not created if srv_read_only_mode. This mutex has a very high rank; threads reserving it should not be holding any InnoDB latches. */ -UNIV_INTERN mutex_t srv_dict_tmpfile_mutex; -/* Temporary file for output from the data dictionary */ +UNIV_INTERN ib_mutex_t srv_dict_tmpfile_mutex; +/** Temporary file for output from the data dictionary */ UNIV_INTERN FILE* srv_dict_tmpfile; -/* Mutex for locking srv_misc_tmpfile. +/** Mutex for locking srv_misc_tmpfile. Not created if srv_read_only_mode. This mutex has a very low rank; threads reserving it should not acquire any further latches or sleep before releasing this one. */ -UNIV_INTERN mutex_t srv_misc_tmpfile_mutex; -/* Temporary file for miscellanous diagnostic output */ +UNIV_INTERN ib_mutex_t srv_misc_tmpfile_mutex; +/** Temporary file for miscellanous diagnostic output */ UNIV_INTERN FILE* srv_misc_tmpfile; UNIV_INTERN ulint srv_main_thread_process_no = 0; UNIV_INTERN ulint srv_main_thread_id = 0; -/* The following count work done by srv_master_thread. */ +/* The following counts are used by the srv_master_thread. */ -/* Iterations of the loop bounded by 'srv_active' label. */ -static ulint srv_main_active_loops = 0; -/* Iterations of the loop bounded by the 'srv_idle' label. */ -static ulint srv_main_idle_loops = 0; -/* Iterations of the loop bounded by the 'srv_shutdown' label. */ -static ulint srv_main_shutdown_loops = 0; -/* Log writes involving flush. */ -static ulint srv_log_writes_and_flush = 0; +/** Iterations of the loop bounded by 'srv_active' label. */ +static ulint srv_main_active_loops = 0; +/** Iterations of the loop bounded by the 'srv_idle' label. */ +static ulint srv_main_idle_loops = 0; +/** Iterations of the loop bounded by the 'srv_shutdown' label. */ +static ulint srv_main_shutdown_loops = 0; +/** Log writes involving flush. */ +static ulint srv_log_writes_and_flush = 0; /* This is only ever touched by the master thread. 
It records the time when the last flush of log file has happened. The master @@ -484,7 +466,8 @@ current_time % 5 != 0. */ } while (0) /** Test if the system mutex is owned. */ -#define srv_sys_mutex_own() mutex_own(&srv_sys->mutex) +#define srv_sys_mutex_own() (mutex_own(&srv_sys->mutex) \ + && !srv_read_only_mode) /** Release the system mutex. */ #define srv_sys_mutex_exit() do { \ @@ -492,7 +475,7 @@ current_time % 5 != 0. */ } while (0) #define fetch_lock_wait_timeout(trx) \ - ((trx)->lock.allowed_to_wait \ + ((trx)->lock.allowed_to_wait \ ? thd_lock_wait_timeout((trx)->mysql_thd) \ : 0) @@ -568,35 +551,32 @@ suspending the master thread and utility threads when they have nothing to do. The thread table can be seen as an analogue to the process table in a traditional Unix implementation. */ -/** The server system */ -typedef struct srv_sys_struct srv_sys_t; - /** The server system struct */ -struct srv_sys_struct{ - mutex_t tasks_mutex; /*!< variable protecting the +struct srv_sys_t{ + ib_mutex_t tasks_mutex; /*!< variable protecting the tasks queue */ UT_LIST_BASE_NODE_T(que_thr_t) tasks; /*!< task queue */ - mutex_t mutex; /*!< variable protecting the - + ib_mutex_t mutex; /*!< variable protecting the fields below. */ ulint n_sys_threads; /*!< size of the sys_threads array */ - srv_table_t* sys_threads; /*!< server thread table */ + srv_slot_t* sys_threads; /*!< server thread table */ ulint n_threads_active[SRV_MASTER + 1]; /*!< number of threads active in a thread class */ - ulint activity_count; /*!< For tracking server + srv_stats_t::ulint_ctr_1_t + activity_count; /*!< For tracking server activity */ }; #ifndef HAVE_ATOMIC_BUILTINS /** Mutex protecting some server global variables. 
*/ -UNIV_INTERN mutex_t server_mutex; +UNIV_INTERN ib_mutex_t server_mutex; #endif /* !HAVE_ATOMIC_BUILTINS */ static srv_sys_t* srv_sys = NULL; @@ -656,6 +636,18 @@ srv_set_io_thread_op_info( srv_io_thread_op_info[i] = str; } +/*********************************************************************//** +Resets the info describing an i/o thread current state. */ +UNIV_INTERN +void +srv_reset_io_thread_op_info() +/*=========================*/ +{ + for (ulint i = 0; i < UT_ARR_SIZE(srv_io_thread_op_info); ++i) { + srv_io_thread_op_info[i] = "not started yet"; + } +} + #ifdef UNIV_DEBUG /*********************************************************************//** Validates the type of a thread table slot. @@ -756,6 +748,8 @@ srv_suspend_thread_low( /*===================*/ srv_slot_t* slot) /*!< in/out: thread slot */ { + + ut_ad(!srv_read_only_mode); ut_ad(srv_sys_mutex_own()); ut_ad(slot->in_use); @@ -915,9 +909,8 @@ void srv_init(void) /*==========*/ { - ulint i; - ulint srv_sys_sz; - ulint n_sys_threads; + ulint n_sys_threads = 0; + ulint srv_sys_sz = sizeof(*srv_sys); #ifndef HAVE_ATOMIC_BUILTINS mutex_create(server_mutex_key, &server_mutex, SYNC_ANY_LATCH); @@ -926,38 +919,55 @@ srv_init(void) mutex_create(srv_innodb_monitor_mutex_key, &srv_innodb_monitor_mutex, SYNC_NO_ORDER_CHECK); - /* Number of purge threads + master thread */ - n_sys_threads = srv_n_purge_threads + 1; + if (!srv_read_only_mode) { - srv_sys_sz = sizeof(*srv_sys) + (n_sys_threads * sizeof(srv_slot_t)); + /* Number of purge threads + master thread */ + n_sys_threads = srv_n_purge_threads + 1; + + srv_sys_sz += n_sys_threads * sizeof(*srv_sys->sys_threads); + } srv_sys = static_cast<srv_sys_t*>(mem_zalloc(srv_sys_sz)); - mutex_create(srv_sys_mutex_key, &srv_sys->mutex, SYNC_THREADS); + srv_sys->n_sys_threads = n_sys_threads; - mutex_create(srv_sys_tasks_mutex_key, - &srv_sys->tasks_mutex, SYNC_ANY_LATCH); + if (!srv_read_only_mode) { - srv_sys->n_sys_threads = n_sys_threads; - srv_sys->sys_threads = 
(srv_slot_t*) &srv_sys[1]; + mutex_create(srv_sys_mutex_key, &srv_sys->mutex, SYNC_THREADS); - for (i = 0; i < srv_sys->n_sys_threads; i++) { - srv_slot_t* slot; + mutex_create(srv_sys_tasks_mutex_key, + &srv_sys->tasks_mutex, SYNC_ANY_LATCH); - slot = srv_sys->sys_threads + i; + srv_sys->sys_threads = (srv_slot_t*) &srv_sys[1]; - slot->event = os_event_create(NULL); + for (ulint i = 0; i < srv_sys->n_sys_threads; ++i) { + srv_slot_t* slot = &srv_sys->sys_threads[i]; - ut_a(slot->event); - } + slot->event = os_event_create(); + + ut_a(slot->event); + } + + srv_error_event = os_event_create(); - srv_error_event = os_event_create(NULL); + srv_monitor_event = os_event_create(); - srv_monitor_event = os_event_create(NULL); + srv_buf_dump_event = os_event_create(); - srv_buf_dump_event = os_event_create("buf_dump_event"); + UT_LIST_INIT(srv_sys->tasks); + } + + /* page_zip_stat_per_index_mutex is acquired from: + 1. page_zip_compress() (after SYNC_FSP) + 2. page_zip_decompress() + 3. i_s_cmp_per_index_fill_low() (where SYNC_DICT is acquired) + 4. innodb_cmp_per_index_update(), no other latches + since we do not acquire any other latches while holding this mutex, + it can have very low level. We pick SYNC_ANY_LATCH for it. */ - UT_LIST_INIT(srv_sys->tasks); + mutex_create( + page_zip_stat_per_index_mutex_key, + &page_zip_stat_per_index_mutex, SYNC_ANY_LATCH); /* Create dummy indexes for infimum and supremum records */ @@ -987,8 +997,10 @@ srv_free(void) trx_i_s_cache_free(trx_i_s_cache); - os_event_free(srv_buf_dump_event); - srv_buf_dump_event = NULL; + if (!srv_read_only_mode) { + os_event_free(srv_buf_dump_event); + srv_buf_dump_event = NULL; + } } /*********************************************************************//** @@ -1010,10 +1022,9 @@ srv_general_init(void) } /*********************************************************************//** -Normalizes init parameter values to use units we use inside InnoDB. 
-@return DB_SUCCESS or error code */ +Normalizes init parameter values to use units we use inside InnoDB. */ static -ulint +void srv_normalize_init_values(void) /*===========================*/ { @@ -1035,28 +1046,19 @@ srv_normalize_init_values(void) srv_log_buffer_size = srv_log_buffer_size / UNIV_PAGE_SIZE; srv_lock_table_size = 5 * (srv_buf_pool_size / UNIV_PAGE_SIZE); - - return(DB_SUCCESS); } /*********************************************************************//** -Boots the InnoDB server. -@return DB_SUCCESS or error code */ +Boots the InnoDB server. */ UNIV_INTERN -ulint +void srv_boot(void) /*==========*/ { - ulint err; - /* Transform the init parameter values given by MySQL to use units we use inside InnoDB: */ - err = srv_normalize_init_values(); - - if (err != DB_SUCCESS) { - return(err); - } + srv_normalize_init_values(); /* Initialize synchronization primitives, memory management, and thread local storage */ @@ -1066,8 +1068,7 @@ srv_boot(void) /* Initialize this module */ srv_init(); - - return(DB_SUCCESS); + srv_mon_create(); } /******************************************************************//** @@ -1090,10 +1091,10 @@ srv_refresh_innodb_monitor_stats(void) buf_refresh_io_stats_all(); - srv_n_rows_inserted_old = srv_n_rows_inserted; - srv_n_rows_updated_old = srv_n_rows_updated; - srv_n_rows_deleted_old = srv_n_rows_deleted; - srv_n_rows_read_old = srv_n_rows_read; + srv_n_rows_inserted_old = srv_stats.n_rows_inserted; + srv_n_rows_updated_old = srv_stats.n_rows_updated; + srv_n_rows_deleted_old = srv_stats.n_rows_deleted; + srv_n_rows_read_old = srv_stats.n_rows_read; mutex_exit(&srv_innodb_monitor_mutex); } @@ -1158,7 +1159,7 @@ srv_printf_innodb_monitor( mutex_enter(&dict_foreign_err_mutex); - if (ftell(dict_foreign_err_file) != 0L) { + if (!srv_read_only_mode && ftell(dict_foreign_err_file) != 0L) { fputs("------------------------\n" "LATEST FOREIGN KEY ERROR\n" "------------------------\n", file); @@ -1271,26 +1272,26 @@ 
srv_printf_innodb_monitor( "Number of rows inserted " ULINTPF ", updated " ULINTPF ", deleted " ULINTPF ", read " ULINTPF "\n", - srv_n_rows_inserted, - srv_n_rows_updated, - srv_n_rows_deleted, - srv_n_rows_read); + (ulint) srv_stats.n_rows_inserted, + (ulint) srv_stats.n_rows_updated, + (ulint) srv_stats.n_rows_deleted, + (ulint) srv_stats.n_rows_read); fprintf(file, "%.2f inserts/s, %.2f updates/s," " %.2f deletes/s, %.2f reads/s\n", - (srv_n_rows_inserted - srv_n_rows_inserted_old) + ((ulint) srv_stats.n_rows_inserted - srv_n_rows_inserted_old) / time_elapsed, - (srv_n_rows_updated - srv_n_rows_updated_old) + ((ulint) srv_stats.n_rows_updated - srv_n_rows_updated_old) / time_elapsed, - (srv_n_rows_deleted - srv_n_rows_deleted_old) + ((ulint) srv_stats.n_rows_deleted - srv_n_rows_deleted_old) / time_elapsed, - (srv_n_rows_read - srv_n_rows_read_old) + ((ulint) srv_stats.n_rows_read - srv_n_rows_read_old) / time_elapsed); - srv_n_rows_inserted_old = srv_n_rows_inserted; - srv_n_rows_updated_old = srv_n_rows_updated; - srv_n_rows_deleted_old = srv_n_rows_deleted; - srv_n_rows_read_old = srv_n_rows_read; + srv_n_rows_inserted_old = srv_stats.n_rows_inserted; + srv_n_rows_updated_old = srv_stats.n_rows_updated; + srv_n_rows_deleted_old = srv_stats.n_rows_deleted; + srv_n_rows_read_old = srv_stats.n_rows_read; fputs("----------------------------\n" "END OF INNODB MONITOR OUTPUT\n" @@ -1308,89 +1309,168 @@ void srv_export_innodb_status(void) /*==========================*/ { - buf_pool_stat_t stat; - ulint LRU_len; - ulint free_len; - ulint flush_list_len; + buf_pool_stat_t stat; + buf_pools_list_size_t buf_pools_list_size; + ulint LRU_len; + ulint free_len; + ulint flush_list_len; buf_get_total_stat(&stat); buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len); + buf_get_total_list_size_in_bytes(&buf_pools_list_size); mutex_enter(&srv_innodb_monitor_mutex); - export_vars.innodb_data_pending_reads - = os_n_pending_reads; - export_vars.innodb_data_pending_writes 
- = os_n_pending_writes; - export_vars.innodb_data_pending_fsyncs - = fil_n_pending_log_flushes + export_vars.innodb_data_pending_reads = + os_n_pending_reads; + + export_vars.innodb_data_pending_writes = + os_n_pending_writes; + + export_vars.innodb_data_pending_fsyncs = + fil_n_pending_log_flushes + fil_n_pending_tablespace_flushes; + export_vars.innodb_data_fsyncs = os_n_fsyncs; - export_vars.innodb_data_read = srv_data_read; + + export_vars.innodb_data_read = srv_stats.data_read; + export_vars.innodb_data_reads = os_n_file_reads; + export_vars.innodb_data_writes = os_n_file_writes; - export_vars.innodb_data_written = srv_data_written; + + export_vars.innodb_data_written = srv_stats.data_written; + export_vars.innodb_buffer_pool_read_requests = stat.n_page_gets; - export_vars.innodb_buffer_pool_write_requests - = srv_buf_pool_write_requests; - export_vars.innodb_buffer_pool_wait_free = srv_buf_pool_wait_free; - export_vars.innodb_buffer_pool_pages_flushed = srv_buf_pool_flushed; - export_vars.innodb_buffer_pool_reads = srv_buf_pool_reads; - export_vars.innodb_buffer_pool_read_ahead_rnd - = stat.n_ra_pages_read_rnd; - export_vars.innodb_buffer_pool_read_ahead - = stat.n_ra_pages_read; - export_vars.innodb_buffer_pool_read_ahead_evicted - = stat.n_ra_pages_evicted; + + export_vars.innodb_buffer_pool_write_requests = + srv_stats.buf_pool_write_requests; + + export_vars.innodb_buffer_pool_wait_free = + srv_stats.buf_pool_wait_free; + + export_vars.innodb_buffer_pool_pages_flushed = + srv_stats.buf_pool_flushed; + + export_vars.innodb_buffer_pool_reads = srv_stats.buf_pool_reads; + + export_vars.innodb_buffer_pool_read_ahead_rnd = + stat.n_ra_pages_read_rnd; + + export_vars.innodb_buffer_pool_read_ahead = + stat.n_ra_pages_read; + + export_vars.innodb_buffer_pool_read_ahead_evicted = + stat.n_ra_pages_evicted; + export_vars.innodb_buffer_pool_pages_data = LRU_len; + + export_vars.innodb_buffer_pool_bytes_data = + buf_pools_list_size.LRU_bytes + + 
buf_pools_list_size.unzip_LRU_bytes; + export_vars.innodb_buffer_pool_pages_dirty = flush_list_len; + + export_vars.innodb_buffer_pool_bytes_dirty = + buf_pools_list_size.flush_list_bytes; + export_vars.innodb_buffer_pool_pages_free = free_len; + #ifdef UNIV_DEBUG - export_vars.innodb_buffer_pool_pages_latched - = buf_get_latched_pages_number(); + export_vars.innodb_buffer_pool_pages_latched = + buf_get_latched_pages_number(); #endif /* UNIV_DEBUG */ export_vars.innodb_buffer_pool_pages_total = buf_pool_get_n_pages(); - export_vars.innodb_buffer_pool_pages_misc - = buf_pool_get_n_pages() - LRU_len - free_len; + export_vars.innodb_buffer_pool_pages_misc = + buf_pool_get_n_pages() - LRU_len - free_len; + #ifdef HAVE_ATOMIC_BUILTINS export_vars.innodb_have_atomic_builtins = 1; #else export_vars.innodb_have_atomic_builtins = 0; #endif export_vars.innodb_page_size = UNIV_PAGE_SIZE; - export_vars.innodb_log_waits = srv_log_waits; - export_vars.innodb_os_log_written = srv_os_log_written; + + export_vars.innodb_log_waits = srv_stats.log_waits; + + export_vars.innodb_os_log_written = srv_stats.os_log_written; + export_vars.innodb_os_log_fsyncs = fil_n_log_flushes; + export_vars.innodb_os_log_pending_fsyncs = fil_n_pending_log_flushes; - export_vars.innodb_os_log_pending_writes = srv_os_log_pending_writes; - export_vars.innodb_log_write_requests = srv_log_write_requests; - export_vars.innodb_log_writes = srv_log_writes; - export_vars.innodb_dblwr_pages_written = srv_dblwr_pages_written; - export_vars.innodb_dblwr_writes = srv_dblwr_writes; + + export_vars.innodb_os_log_pending_writes = + srv_stats.os_log_pending_writes; + + export_vars.innodb_log_write_requests = srv_stats.log_write_requests; + + export_vars.innodb_log_writes = srv_stats.log_writes; + + export_vars.innodb_dblwr_pages_written = + srv_stats.dblwr_pages_written; + + export_vars.innodb_dblwr_writes = srv_stats.dblwr_writes; + export_vars.innodb_pages_created = stat.n_pages_created; + 
export_vars.innodb_pages_read = stat.n_pages_read; + export_vars.innodb_pages_written = stat.n_pages_written; - export_vars.innodb_row_lock_waits = srv_n_lock_wait_count; - export_vars.innodb_row_lock_current_waits - = srv_n_lock_wait_current_count; - export_vars.innodb_row_lock_time = srv_n_lock_wait_time / 1000; - if (srv_n_lock_wait_count > 0) { + + export_vars.innodb_row_lock_waits = srv_stats.n_lock_wait_count; + + export_vars.innodb_row_lock_current_waits = + srv_stats.n_lock_wait_current_count; + + export_vars.innodb_row_lock_time = srv_stats.n_lock_wait_time / 1000; + + if (srv_stats.n_lock_wait_count > 0) { + export_vars.innodb_row_lock_time_avg = (ulint) - (srv_n_lock_wait_time / 1000 / srv_n_lock_wait_count); + (srv_stats.n_lock_wait_time + / 1000 / srv_stats.n_lock_wait_count); + } else { export_vars.innodb_row_lock_time_avg = 0; } - export_vars.innodb_row_lock_time_max - = srv_n_lock_max_wait_time / 1000; - export_vars.innodb_rows_read = srv_n_rows_read; - export_vars.innodb_rows_inserted = srv_n_rows_inserted; - export_vars.innodb_rows_updated = srv_n_rows_updated; - export_vars.innodb_rows_deleted = srv_n_rows_deleted; + + export_vars.innodb_row_lock_time_max = + lock_sys->n_lock_max_wait_time / 1000; + + export_vars.innodb_rows_read = srv_stats.n_rows_read; + + export_vars.innodb_rows_inserted = srv_stats.n_rows_inserted; + + export_vars.innodb_rows_updated = srv_stats.n_rows_updated; + + export_vars.innodb_rows_deleted = srv_stats.n_rows_deleted; + export_vars.innodb_num_open_files = fil_n_file_opened; - export_vars.innodb_truncated_status_writes = srv_truncated_status_writes; + + export_vars.innodb_truncated_status_writes = + srv_truncated_status_writes; + export_vars.innodb_available_undo_logs = srv_available_undo_logs; +#ifdef UNIV_DEBUG + if (purge_sys->done.trx_no == 0 + || trx_sys->rw_max_trx_id < purge_sys->done.trx_no - 1) { + export_vars.innodb_purge_trx_id_age = 0; + } else { + export_vars.innodb_purge_trx_id_age = + 
trx_sys->rw_max_trx_id - purge_sys->done.trx_no + 1; + } + + if (!purge_sys->view + || trx_sys->rw_max_trx_id < purge_sys->view->up_limit_id) { + export_vars.innodb_purge_view_trx_id_age = 0; + } else { + export_vars.innodb_purge_view_trx_id_age = + trx_sys->rw_max_trx_id - purge_sys->view->up_limit_id; + } +#endif /* UNIV_DEBUG */ + mutex_exit(&srv_innodb_monitor_mutex); } @@ -1414,14 +1494,16 @@ DECLARE_THREAD(srv_monitor_thread)( ulint mutex_skipped; ibool last_srv_print_monitor; + ut_ad(!srv_read_only_mode); + #ifdef UNIV_DEBUG_THREAD_CREATION fprintf(stderr, "Lock timeout thread starts, id %lu\n", os_thread_pf(os_thread_get_curr_id())); -#endif +#endif /* UNIV_DEBUG_THREAD_CREATION */ #ifdef UNIV_PFS_THREAD pfs_register_thread(srv_monitor_thread_key); -#endif +#endif /* UNIV_PFS_THREAD */ srv_monitor_active = TRUE; UT_NOT_USED(arg); @@ -1470,7 +1552,10 @@ loop: } - if (srv_innodb_status) { + /* We don't create the temp files or associated + mutexes in read-only-mode */ + + if (!srv_read_only_mode && srv_innodb_status) { mutex_enter(&srv_monitor_file_mutex); rewind(srv_monitor_file); if (!srv_printf_innodb_monitor(srv_monitor_file, @@ -1587,16 +1672,18 @@ DECLARE_THREAD(srv_error_monitor_thread)( const void* sema = NULL; const void* old_sema = NULL; + ut_ad(!srv_read_only_mode); + old_lsn = srv_start_lsn; #ifdef UNIV_DEBUG_THREAD_CREATION fprintf(stderr, "Error monitor thread starts, id %lu\n", os_thread_pf(os_thread_get_curr_id())); -#endif +#endif /* UNIV_DEBUG_THREAD_CREATION */ #ifdef UNIV_PFS_THREAD pfs_register_thread(srv_error_monitor_thread_key); -#endif +#endif /* UNIV_PFS_THREAD */ srv_error_monitor_active = TRUE; loop: @@ -1630,9 +1717,6 @@ loop: eviction policy. */ buf_LRU_stat_update(); - /* Update the statistics collected for flush rate policy. */ - buf_flush_stat_update(); - /* In case mutex_exit is not a memory barrier, it is theoretically possible some threads are left waiting though the semaphore is already released. 
Wake up those threads: */ @@ -1690,7 +1774,7 @@ void srv_inc_activity_count(void) /*========================*/ { - ++srv_sys->activity_count; + srv_sys->activity_count.inc(); } /**********************************************************************//** @@ -1703,12 +1787,15 @@ srv_thread_type srv_get_active_thread_type(void) /*============================*/ { - ulint i; srv_thread_type ret = SRV_NONE; + if (srv_read_only_mode) { + return(SRV_NONE); + } + srv_sys_mutex_enter(); - for (i = SRV_WORKER; i <= SRV_MASTER; ++i) { + for (ulint i = SRV_WORKER; i <= SRV_MASTER; ++i) { if (srv_sys->n_threads_active[i] != 0) { ret = static_cast<srv_thread_type>(i); break; @@ -1720,6 +1807,7 @@ srv_get_active_thread_type(void) /* Check only on shutdown. */ if (ret == SRV_NONE && srv_shutdown_state != SRV_SHUTDOWN_NONE + && trx_purge_state() != PURGE_STATE_DISABLED && trx_purge_state() != PURGE_STATE_EXIT) { ret = SRV_PURGE; @@ -1739,20 +1827,25 @@ srv_any_background_threads_are_active(void) { const char* thread_active = NULL; - if (srv_error_monitor_active) { + if (srv_read_only_mode) { + return(NULL); + } else if (srv_error_monitor_active) { thread_active = "srv_error_monitor_thread"; - } else if (srv_lock_timeout_active) { + } else if (lock_sys->timeout_thread_active) { thread_active = "srv_lock_timeout thread"; } else if (srv_monitor_active) { thread_active = "srv_monitor_thread"; } else if (srv_buf_dump_thread_active) { thread_active = "buf_dump_thread"; + } else if (srv_dict_stats_thread_active) { + thread_active = "dict_stats_thread"; } os_event_set(srv_error_event); os_event_set(srv_monitor_event); - os_event_set(srv_timeout_event); os_event_set(srv_buf_dump_event); + os_event_set(lock_sys->timeout_event); + os_event_set(dict_stats_event); return(thread_active); } @@ -1768,6 +1861,10 @@ void srv_active_wake_master_thread(void) /*===============================*/ { + if (srv_read_only_mode) { + return; + } + ut_ad(!srv_sys_mutex_own()); srv_inc_activity_count(); @@ -1869,7 
+1966,8 @@ srv_sync_log_buffer_in_background(void) time_t current_time = time(NULL); srv_main_thread_op_info = "flushing log"; - if (difftime(current_time, srv_last_log_flush_time) >= 1) { + if (difftime(current_time, srv_last_log_flush_time) + >= srv_flush_log_at_timeout) { log_buffer_sync_in_background(TRUE); srv_last_log_flush_time = current_time; srv_log_writes_and_flush++; @@ -1986,7 +2084,7 @@ srv_master_do_active_tasks(void) /* Do an ibuf merge */ srv_main_thread_op_info = "doing insert buffer merge"; counter_time = ut_time_us(NULL); - ibuf_contract_in_background(FALSE); + ibuf_contract_in_background(0, FALSE); MONITOR_INC_TIME_IN_MICRO_SECS( MONITOR_SRV_IBUF_MERGE_MICROSECOND, counter_time); @@ -2078,7 +2176,7 @@ srv_master_do_idle_tasks(void) /* Do an ibuf merge */ counter_time = ut_time_us(NULL); srv_main_thread_op_info = "doing insert buffer merge"; - ibuf_contract_in_background(TRUE); + ibuf_contract_in_background(0, TRUE); MONITOR_INC_TIME_IN_MICRO_SECS( MONITOR_SRV_IBUF_MERGE_MICROSECOND, counter_time); @@ -2125,6 +2223,8 @@ srv_master_do_shutdown_tasks( ulint n_bytes_merged = 0; ulint n_tables_to_drop = 0; + ut_ad(!srv_read_only_mode); + ++srv_main_shutdown_loops; ut_a(srv_shutdown_state > 0); @@ -2152,7 +2252,7 @@ srv_master_do_shutdown_tasks( /* Do an ibuf merge */ srv_main_thread_op_info = "doing insert buffer merge"; - n_bytes_merged = ibuf_contract_in_background(TRUE); + n_bytes_merged = ibuf_contract_in_background(0, TRUE); /* Flush logs if needed */ srv_sync_log_buffer_in_background(); @@ -2200,14 +2300,16 @@ DECLARE_THREAD(srv_master_thread)( ulint old_activity_count = srv_get_activity_count(); ib_time_t last_print_time; + ut_ad(!srv_read_only_mode); + #ifdef UNIV_DEBUG_THREAD_CREATION fprintf(stderr, "Master thread starts, id %lu\n", os_thread_pf(os_thread_get_curr_id())); -#endif +#endif /* UNIV_DEBUG_THREAD_CREATION */ #ifdef UNIV_PFS_THREAD pfs_register_thread(srv_master_thread_key); -#endif +#endif /* UNIV_PFS_THREAD */ 
srv_main_thread_process_no = os_proc_get_number(); srv_main_thread_id = os_thread_pf(os_thread_get_curr_id()); @@ -2300,6 +2402,7 @@ srv_task_execute(void) { que_thr_t* thr = NULL; + ut_ad(!srv_read_only_mode); ut_a(srv_force_recovery < SRV_FORCE_NO_BACKGROUND); mutex_enter(&srv_sys->tasks_mutex); @@ -2338,6 +2441,7 @@ DECLARE_THREAD(srv_worker_thread)( { srv_slot_t* slot; + ut_ad(!srv_read_only_mode); ut_a(srv_force_recovery < SRV_FORCE_NO_BACKGROUND); #ifdef UNIV_DEBUG_THREAD_CREATION @@ -2418,6 +2522,7 @@ srv_do_purge( ulint old_activity_count = srv_get_activity_count(); ut_a(n_threads > 0); + ut_ad(!srv_read_only_mode); /* Purge until there are no more records to purge and there is no change in configuration or server state. If the user has @@ -2464,7 +2569,7 @@ srv_do_purge( n_pages_purged = trx_purge( n_use_threads, srv_purge_batch_size, false); - if (!(count++ % TRX_SYS_N_RSEGS) || n_pages_purged == 0) { + if (!(count++ % TRX_SYS_N_RSEGS)) { /* Force a truncate of the history list. */ trx_purge(1, srv_purge_batch_size, true); } @@ -2487,14 +2592,9 @@ srv_purge_coordinator_suspend( ulint rseg_history_len) /*!< in: history list length before last purge */ { + ut_ad(!srv_read_only_mode); ut_a(slot->type == SRV_PURGE); - rw_lock_x_lock(&purge_sys->latch); - - purge_sys->running = false; - - rw_lock_x_unlock(&purge_sys->latch); - bool stop = false; /** Maximum wait time on the purge event, in micro-seconds. */ @@ -2504,6 +2604,12 @@ srv_purge_coordinator_suspend( ulint ret; ib_int64_t sig_count = srv_suspend_thread(slot); + rw_lock_x_lock(&purge_sys->latch); + + purge_sys->running = false; + + rw_lock_x_unlock(&purge_sys->latch); + /* We don't wait right away on the the non-timed wait because we want to signal the thread that wants to suspend purge. 
*/ @@ -2514,8 +2620,8 @@ srv_purge_coordinator_suspend( ret = os_event_wait_time_low( slot->event, SRV_PURGE_MAX_TIMEOUT, sig_count); } else { - /* We don't want to waste time waiting if the - history list has increased by the time we get here + /* We don't want to waste time waiting, if the + history list increased by the time we got here, unless purge has been stopped. */ ret = 0; } @@ -2582,6 +2688,7 @@ DECLARE_THREAD(srv_purge_coordinator_thread)( srv_slot_t* slot; ulint n_total_purged = ULINT_UNDEFINED; + ut_ad(!srv_read_only_mode); ut_a(srv_n_purge_threads >= 1); ut_a(trx_purge_state() == PURGE_STATE_INIT); ut_a(srv_force_recovery < SRV_FORCE_NO_BACKGROUND); @@ -2689,6 +2796,7 @@ srv_que_task_enqueue_low( /*=====================*/ que_thr_t* thr) /*!< in: query thread */ { + ut_ad(!srv_read_only_mode); mutex_enter(&srv_sys->tasks_mutex); UT_LIST_ADD_LAST(queue, srv_sys->tasks, thr); @@ -2708,6 +2816,8 @@ srv_get_task_queue_length(void) { ulint n_tasks; + ut_ad(!srv_read_only_mode); + mutex_enter(&srv_sys->tasks_mutex); n_tasks = UT_LIST_GET_LEN(srv_sys->tasks); @@ -2724,6 +2834,8 @@ void srv_purge_wakeup(void) /*==================*/ { + ut_ad(!srv_read_only_mode); + if (srv_force_recovery < SRV_FORCE_NO_BACKGROUND) { srv_release_threads(SRV_PURGE, 1); diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 9d1600cff23..efe9f094c0d 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -71,6 +71,7 @@ Created 2/16/1996 Heikki Tuuri # include "buf0rea.h" # include "dict0boot.h" # include "dict0load.h" +# include "dict0stats_bg.h" # include "que0que.h" # include "usr0sess.h" # include "lock0lock.h" @@ -87,9 +88,9 @@ Created 2/16/1996 Heikki Tuuri # include "row0row.h" # include "row0mysql.h" # include "btr0pcur.h" -# include "os0sync.h" /* for INNODB_RW_LOCKS_USE_ATOMICS */ -# include "zlib.h" /* for ZLIB_VERSION */ -# include "buf0dblwr.h" +# include "os0sync.h" +# include "zlib.h" +# include 
"ut0crc32.h" /** Log sequence number immediately after startup */ UNIV_INTERN lsn_t srv_start_lsn; @@ -188,6 +189,63 @@ srv_parse_megabytes( } /*********************************************************************//** +Check if a file can be opened in read-write mode. +@return true if it doesn't exist or can be opened in rw mode. */ +static +bool +srv_file_check_mode( +/*================*/ + const char* name) /*!< in: filename to check */ +{ + os_file_stat_t stat; + + memset(&stat, 0x0, sizeof(stat)); + + dberr_t err = os_file_get_status(name, &stat, true); + + if (err == DB_FAIL) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "os_file_get_status() failed on '%s'. Can't determine " + "file permissions", name); + + return(false); + + } else if (err == DB_SUCCESS) { + + /* Note: stat.rw_perm is only valid of files */ + + if (stat.type == OS_FILE_TYPE_FILE) { + if (!stat.rw_perm) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "%s can't be opened in %s mode", + srv_read_only_mode + ? "read-write" : "read", + name); + + return(false); + } + } else { + /* Not a regular file, bail out. */ + + ib_logf(IB_LOG_LEVEL_ERROR, + "'%s' not a regular file.", name); + + return(false); + } + } else { + + /* This is OK. If the file create fails on RO media, there + is nothing we can do. */ + + ut_a(err == DB_NOT_FOUND); + } + + return(true); +} + +/*********************************************************************//** Reads the data files and their sizes from a character string given in the .cnf file. @return TRUE if ok, FALSE on parse error */ @@ -376,79 +434,6 @@ srv_parse_data_file_paths_and_sizes( } /*********************************************************************//** -Reads log group home directories from a character string given in -the .cnf file. 
-@return TRUE if ok, FALSE on parse error */ -UNIV_INTERN -ibool -srv_parse_log_group_home_dirs( -/*==========================*/ - char* str) /*!< in/out: character string */ -{ - char* input_str; - char* path; - ulint i = 0; - - srv_log_group_home_dirs = NULL; - - input_str = str; - - /* First calculate the number of directories and check syntax: - path;path;... */ - - while (*str != '\0') { - path = str; - - while (*str != ';' && *str != '\0') { - str++; - } - - i++; - - if (*str == ';') { - str++; - } else if (*str != '\0') { - - return(FALSE); - } - } - - if (i != 1) { - /* If innodb_log_group_home_dir was defined it must - contain exactly one path definition under current MySQL */ - - return(FALSE); - } - - srv_log_group_home_dirs = static_cast<char**>( - malloc(i * sizeof *srv_log_group_home_dirs)); - - /* Then store the actual values to our array */ - - str = input_str; - i = 0; - - while (*str != '\0') { - path = str; - - while (*str != ';' && *str != '\0') { - str++; - } - - if (*str == ';') { - *str = '\0'; - str++; - } - - srv_log_group_home_dirs[i] = path; - - i++; - } - - return(TRUE); -} - -/*********************************************************************//** Frees the memory allocated by srv_parse_data_file_paths_and_sizes() and srv_parse_log_group_home_dirs(). */ UNIV_INTERN @@ -462,8 +447,6 @@ srv_free_paths_and_sizes(void) srv_data_file_sizes = NULL; free(srv_data_file_is_raw_partition); srv_data_file_is_raw_partition = NULL; - free(srv_log_group_home_dirs); - srv_log_group_home_dirs = NULL; } #ifndef UNIV_HOTBACKUP @@ -526,175 +509,230 @@ srv_normalize_path_for_win( #ifndef UNIV_HOTBACKUP /*********************************************************************//** -Creates or opens the log files and closes them. +Creates a log file. 
@return DB_SUCCESS or error code */ -static -ulint -open_or_create_log_file( -/*====================*/ - ibool create_new_db, /*!< in: TRUE if we should create a - new database */ - ibool* log_file_created, /*!< out: TRUE if new log file - created */ - ibool log_file_has_been_opened,/*!< in: TRUE if a log file has been - opened before: then it is an error - to try to create another log file */ - ulint k, /*!< in: log group number */ - ulint i) /*!< in: log file number in group */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +create_log_file( +/*============*/ + os_file_t* file, /*!< out: file handle */ + const char* name) /*!< in: log file name */ { ibool ret; - os_offset_t size; - char name[10000]; - ulint dirnamelen; - UT_NOT_USED(create_new_db); + *file = os_file_create( + innodb_file_log_key, name, + OS_FILE_CREATE, OS_FILE_NORMAL, OS_LOG_FILE, &ret); - *log_file_created = FALSE; + ib_logf(IB_LOG_LEVEL_INFO, + "Setting log file %s size to %lu MB", + name, (ulong) srv_log_file_size + >> (20 - UNIV_PAGE_SIZE_SHIFT)); - srv_normalize_path_for_win(srv_log_group_home_dirs[k]); + ret = os_file_set_size(name, *file, + (os_offset_t) srv_log_file_size + << UNIV_PAGE_SIZE_SHIFT); + if (!ret) { + ib_logf(IB_LOG_LEVEL_ERROR, "Error in creating %s", name); + return(DB_ERROR); + } - dirnamelen = strlen(srv_log_group_home_dirs[k]); - ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile"); - memcpy(name, srv_log_group_home_dirs[k], dirnamelen); + ret = os_file_close(*file); + ut_a(ret); - /* Add a path separator if needed. 
*/ - if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) { - name[dirnamelen++] = SRV_PATH_SEPARATOR; + return(DB_SUCCESS); +} + +/** Initial number of the first redo log file */ +#define INIT_LOG_FILE0 (SRV_N_LOG_FILES_MAX + 1) + +#ifdef DBUG_OFF +# define RECOVERY_CRASH(x) do {} while(0) +#else +# define RECOVERY_CRASH(x) do { \ + if (srv_force_recovery_crash == x) { \ + fprintf(stderr, "innodb_force_recovery_crash=%lu\n", \ + srv_force_recovery_crash); \ + fflush(stderr); \ + exit(3); \ + } \ +} while (0) +#endif + +/*********************************************************************//** +Creates all log files. +@return DB_SUCCESS or error code */ +static +dberr_t +create_log_files( +/*=============*/ + char* logfilename, /*!< in/out: buffer for log file name */ + size_t dirnamelen, /*!< in: length of the directory path */ + lsn_t lsn, /*!< in: FIL_PAGE_FILE_FLUSH_LSN value */ + char*& logfile0) /*!< out: name of the first log file */ +{ + if (srv_read_only_mode) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot create log files in read-only mode"); + return(DB_READ_ONLY); } - sprintf(name + dirnamelen, "%s%lu", "ib_logfile", (ulong) i); + /* Remove any old log files. */ + for (unsigned i = 0; i <= INIT_LOG_FILE0; i++) { + sprintf(logfilename + dirnamelen, "ib_logfile%u", i); - files[i] = os_file_create(innodb_file_log_key, name, - OS_FILE_CREATE, OS_FILE_NORMAL, - OS_LOG_FILE, &ret); - if (ret == FALSE) { - if (os_file_get_last_error(FALSE) != OS_FILE_ALREADY_EXISTS -#ifdef UNIV_AIX - /* AIX 5.1 after security patch ML7 may have errno set - to 0 here, which causes our function to return 100; - work around that AIX problem */ - && os_file_get_last_error(FALSE) != 100 + /* Ignore errors about non-existent files or files + that cannot be removed. The create_log_file() will + return an error when the file exists. 
*/ +#ifdef __WIN__ + DeleteFile((LPCTSTR) logfilename); +#else + unlink(logfilename); #endif - ) { - fprintf(stderr, - "InnoDB: Error in creating" - " or opening %s\n", name); + /* Crashing after deleting the first + file should be recoverable. The buffer + pool was clean, and we can simply create + all log files from the scratch. */ + RECOVERY_CRASH(6); + } - return(DB_ERROR); - } + ut_ad(!buf_pool_check_no_pending_io()); - files[i] = os_file_create(innodb_file_log_key, name, - OS_FILE_OPEN, OS_FILE_AIO, - OS_LOG_FILE, &ret); - if (!ret) { - fprintf(stderr, - "InnoDB: Error in opening %s\n", name); + RECOVERY_CRASH(7); - return(DB_ERROR); + for (unsigned i = 0; i < srv_n_log_files; i++) { + sprintf(logfilename + dirnamelen, + "ib_logfile%u", i ? i : INIT_LOG_FILE0); + + dberr_t err = create_log_file(&files[i], logfilename); + + if (err != DB_SUCCESS) { + return(err); } + } - size = os_file_get_size(files[i]); - ut_a(size != (os_offset_t) -1); + RECOVERY_CRASH(8); - if (UNIV_UNLIKELY(size != (os_offset_t) srv_log_file_size - << UNIV_PAGE_SIZE_SHIFT)) { + /* We did not create the first log file initially as + ib_logfile0, so that crash recovery cannot find it until it + has been completed and renamed. 
*/ + sprintf(logfilename + dirnamelen, "ib_logfile%u", INIT_LOG_FILE0); - fprintf(stderr, - "InnoDB: Error: log file %s is" - " of different size "UINT64PF" bytes\n" - "InnoDB: than specified in the .cnf" - " file "UINT64PF" bytes!\n", - name, size, - (os_offset_t) srv_log_file_size - << UNIV_PAGE_SIZE_SHIFT); + fil_space_create( + logfilename, SRV_LOG_SPACE_FIRST_ID, + fsp_flags_set_page_size(0, UNIV_PAGE_SIZE), + FIL_LOG); + ut_a(fil_validate()); - return(DB_ERROR); + logfile0 = fil_node_create( + logfilename, (ulint) srv_log_file_size, + SRV_LOG_SPACE_FIRST_ID, FALSE); + ut_a(logfile0); + + for (unsigned i = 1; i < srv_n_log_files; i++) { + sprintf(logfilename + dirnamelen, "ib_logfile%u", i); + + if (!fil_node_create( + logfilename, + (ulint) srv_log_file_size, + SRV_LOG_SPACE_FIRST_ID, FALSE)) { + ut_error; } - } else { - *log_file_created = TRUE; + } - ut_print_timestamp(stderr); + log_group_init(0, srv_n_log_files, + srv_log_file_size * UNIV_PAGE_SIZE, + SRV_LOG_SPACE_FIRST_ID, + SRV_LOG_SPACE_FIRST_ID + 1); - fprintf(stderr, - " InnoDB: Log file %s did not exist:" - " new to be created\n", - name); - if (log_file_has_been_opened) { + fil_open_log_and_system_tablespace_files(); - return(DB_ERROR); - } + /* Create a log checkpoint. */ + mutex_enter(&log_sys->mutex); + ut_d(recv_no_log_write = FALSE); + recv_reset_logs(lsn); + mutex_exit(&log_sys->mutex); - fprintf(stderr, "InnoDB: Setting log file %s size to %lu MB\n", - name, (ulong) srv_log_file_size - >> (20 - UNIV_PAGE_SIZE_SHIFT)); + return(DB_SUCCESS); +} - fprintf(stderr, - "InnoDB: Database physically writes the file" - " full: wait...\n"); +/*********************************************************************//** +Renames the first log file. 
*/ +static +void +create_log_files_rename( +/*====================*/ + char* logfilename, /*!< in/out: buffer for log file name */ + size_t dirnamelen, /*!< in: length of the directory path */ + lsn_t lsn, /*!< in: FIL_PAGE_FILE_FLUSH_LSN value */ + char* logfile0) /*!< in/out: name of the first log file */ +{ + /* If innodb_flush_method=O_DSYNC, + we need to explicitly flush the log buffers. */ + fil_flush(SRV_LOG_SPACE_FIRST_ID); + /* Close the log files, so that we can rename + the first one. */ + fil_close_log_files(false); - ret = os_file_set_size(name, files[i], - (os_offset_t) srv_log_file_size - << UNIV_PAGE_SIZE_SHIFT); - if (!ret) { - fprintf(stderr, - "InnoDB: Error in creating %s:" - " probably out of disk space\n", - name); + /* Rename the first log file, now that a log + checkpoint has been created. */ + sprintf(logfilename + dirnamelen, "ib_logfile%u", 0); - return(DB_ERROR); - } - } + RECOVERY_CRASH(9); - ret = os_file_close(files[i]); - ut_a(ret); + ib_logf(IB_LOG_LEVEL_INFO, + "Renaming log file %s to %s", logfile0, logfilename); - if (i == 0) { - /* Create in memory the file space object - which is for this log group */ + mutex_enter(&log_sys->mutex); + ut_ad(strlen(logfile0) == 2 + strlen(logfilename)); + ibool success = os_file_rename( + innodb_file_log_key, logfile0, logfilename); + ut_a(success); - fil_space_create(name, - 2 * k + SRV_LOG_SPACE_FIRST_ID, - fsp_flags_set_page_size(0, UNIV_PAGE_SIZE), - FIL_LOG); - } + RECOVERY_CRASH(10); - ut_a(fil_validate()); + /* Replace the first file with ib_logfile0. 
*/ + strcpy(logfile0, logfilename); + mutex_exit(&log_sys->mutex); - /* srv_log_file_size is measured in pages; if page size is 16KB, - then we have a limit of 64TB on 32 bit systems */ - ut_a(srv_log_file_size <= ULINT_MAX); + fil_open_log_and_system_tablespace_files(); - fil_node_create(name, (ulint) srv_log_file_size, - 2 * k + SRV_LOG_SPACE_FIRST_ID, FALSE); -#ifdef UNIV_LOG_ARCHIVE - /* If this is the first log group, create the file space object - for archived logs. - Under MySQL, no archiving ever done. */ + ib_logf(IB_LOG_LEVEL_WARN, "New log files created, LSN=" LSN_PF, lsn); +} - if (k == 0 && i == 0) { - arch_space_id = 2 * k + 1 + SRV_LOG_SPACE_FIRST_ID; +/*********************************************************************//** +Opens a log file. +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +open_log_file( +/*==========*/ + os_file_t* file, /*!< out: file handle */ + const char* name, /*!< in: log file name */ + os_offset_t* size) /*!< out: file size */ +{ + ibool ret; - fil_space_create("arch_log_space", arch_space_id, 0, FIL_LOG); - } else { - arch_space_id = ULINT_UNDEFINED; - } -#endif /* UNIV_LOG_ARCHIVE */ - if (i == 0) { - log_group_init(k, srv_n_log_files, - srv_log_file_size * UNIV_PAGE_SIZE, - 2 * k + SRV_LOG_SPACE_FIRST_ID, - SRV_LOG_SPACE_FIRST_ID + 1); /* dummy arch - space id */ + *file = os_file_create(innodb_file_log_key, name, + OS_FILE_OPEN, OS_FILE_AIO, + OS_LOG_FILE, &ret); + if (!ret) { + ib_logf(IB_LOG_LEVEL_ERROR, "Unable to open '%s'", name); + return(DB_ERROR); } + *size = os_file_get_size(*file); + + ret = os_file_close(*file); + ut_a(ret); return(DB_SUCCESS); } /*********************************************************************//** Creates or opens database data files and closes them. 
@return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t open_or_create_data_files( /*======================*/ ibool* create_new_db, /*!< out: TRUE if new database should be @@ -718,13 +756,16 @@ open_or_create_data_files( ibool one_created = FALSE; os_offset_t size; ulint flags; + ulint space; ulint rounded_size_pages; char name[10000]; if (srv_n_data_files >= 1000) { - fprintf(stderr, "InnoDB: can only have < 1000 data files\n" - "InnoDB: you have defined %lu\n", - (ulong) srv_n_data_files); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Can only have < 1000 data files, you have " + "defined %lu", (ulong) srv_n_data_files); + return(DB_ERROR); } @@ -742,7 +783,9 @@ open_or_create_data_files( ut_a(dirnamelen + strlen(srv_data_file_names[i]) < (sizeof name) - 1); + memcpy(name, srv_data_home, dirnamelen); + /* Add a path separator if needed. */ if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) { name[dirnamelen++] = SRV_PATH_SEPARATOR; @@ -750,46 +793,67 @@ open_or_create_data_files( strcpy(name + dirnamelen, srv_data_file_names[i]); - if (srv_data_file_is_raw_partition[i] == 0) { + /* Note: It will return true if the file doesn' exist. 
*/ + + if (!srv_file_check_mode(name)) { + + return(DB_FAIL); + + } else if (srv_data_file_is_raw_partition[i] == 0) { /* First we try to create the file: if it already exists, ret will get value FALSE */ - files[i] = os_file_create(innodb_file_data_key, - name, OS_FILE_CREATE, - OS_FILE_NORMAL, - OS_DATA_FILE, &ret); + files[i] = os_file_create( + innodb_file_data_key, name, OS_FILE_CREATE, + OS_FILE_NORMAL, OS_DATA_FILE, &ret); + + if (srv_read_only_mode) { + + if (ret) { + goto size_check; + } - if (ret == FALSE && os_file_get_last_error(FALSE) - != OS_FILE_ALREADY_EXISTS + ib_logf(IB_LOG_LEVEL_ERROR, + "Opening %s failed!", name); + + return(DB_ERROR); + + } else if (!ret + && os_file_get_last_error(false) + != OS_FILE_ALREADY_EXISTS #ifdef UNIV_AIX - /* AIX 5.1 after security patch ML7 may have - errno set to 0 here, which causes our function - to return 100; work around that AIX problem */ - && os_file_get_last_error(FALSE) != 100 -#endif + /* AIX 5.1 after security patch ML7 may have + errno set to 0 here, which causes our + function to return 100; work around that + AIX problem */ + && os_file_get_last_error(false) != 100 +#endif /* UNIV_AIX */ ) { - fprintf(stderr, - "InnoDB: Error in creating" - " or opening %s\n", + ib_logf(IB_LOG_LEVEL_ERROR, + "Creating or opening %s failed!", name); return(DB_ERROR); } + } else if (srv_data_file_is_raw_partition[i] == SRV_NEW_RAW) { + + ut_a(!srv_read_only_mode); + /* The partition is opened, not created; then it is written over */ srv_start_raw_disk_in_use = TRUE; srv_created_new_raw = TRUE; - files[i] = os_file_create(innodb_file_data_key, - name, OS_FILE_OPEN_RAW, - OS_FILE_NORMAL, - OS_DATA_FILE, &ret); + files[i] = os_file_create( + innodb_file_data_key, name, OS_FILE_OPEN_RAW, + OS_FILE_NORMAL, OS_DATA_FILE, &ret); + if (!ret) { - fprintf(stderr, - "InnoDB: Error in opening %s\n", name); + ib_logf(IB_LOG_LEVEL_ERROR, + "Error in opening %s", name); return(DB_ERROR); } @@ -805,17 +869,15 @@ 
open_or_create_data_files( /* We open the data file */ if (one_created) { - fprintf(stderr, - "InnoDB: Error: data files can only" - " be added at the end\n"); - fprintf(stderr, - "InnoDB: of a tablespace, but" - " data file %s existed beforehand.\n", + ib_logf(IB_LOG_LEVEL_ERROR, + "Data files can only be added at " + "the end of a tablespace, but " + "data file %s existed beforehand.", name); return(DB_ERROR); } - if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) { + ut_a(!srv_read_only_mode); files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RAW, @@ -833,9 +895,11 @@ open_or_create_data_files( } if (!ret) { - fprintf(stderr, - "InnoDB: Error in opening %s\n", name); - os_file_get_last_error(TRUE); + + os_file_get_last_error(true); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Can't open '%s'", name); return(DB_ERROR); } @@ -845,6 +909,7 @@ open_or_create_data_files( goto skip_size_check; } +size_check: size = os_file_get_size(files[i]); ut_a(size != (os_offset_t) -1); /* Round size downward to megabytes */ @@ -860,16 +925,16 @@ open_or_create_data_files( && srv_last_file_size_max < rounded_size_pages)) { - fprintf(stderr, - "InnoDB: Error: auto-extending" - " data file %s is" - " of a different size\n" - "InnoDB: %lu pages (rounded" - " down to MB) than specified" - " in the .cnf file:\n" - "InnoDB: initial %lu pages," - " max %lu (relevant if" - " non-zero) pages!\n", + ib_logf(IB_LOG_LEVEL_ERROR, + "auto-extending " + "data file %s is " + "of a different size " + "%lu pages (rounded " + "down to MB) than specified " + "in the .cnf file: " + "initial %lu pages, " + "max %lu (relevant if " + "non-zero) pages!", name, (ulong) rounded_size_pages, (ulong) srv_data_file_sizes[i], @@ -884,13 +949,11 @@ open_or_create_data_files( if (rounded_size_pages != srv_data_file_sizes[i]) { - fprintf(stderr, - "InnoDB: Error: data file %s" - " is of a different size\n" - "InnoDB: %lu pages" - " (rounded down to MB)\n" - "InnoDB: than specified" - " in the .cnf file 
%lu pages!\n", + ib_logf(IB_LOG_LEVEL_ERROR, + "Data file %s is of a different " + "size %lu pages (rounded down to MB) " + "than specified in the .cnf file " + "%lu pages!", name, (ulong) rounded_size_pages, (ulong) srv_data_file_sizes[i]); @@ -899,63 +962,65 @@ open_or_create_data_files( } skip_size_check: fil_read_first_page( - files[i], one_opened, &flags, + files[i], one_opened, &flags, &space, #ifdef UNIV_LOG_ARCHIVE min_arch_log_no, max_arch_log_no, #endif /* UNIV_LOG_ARCHIVE */ min_flushed_lsn, max_flushed_lsn); + /* The first file of the system tablespace must + have space ID = TRX_SYS_SPACE. The FSP_SPACE_ID + field in files greater than ibdata1 are unreliable. */ + ut_a(one_opened || space == TRX_SYS_SPACE); + + /* Check the flags for the first system tablespace + file only. */ if (!one_opened && UNIV_PAGE_SIZE != fsp_flags_get_page_size(flags)) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Error: data file %s" - " uses page size %lu,\n", + ib_logf(IB_LOG_LEVEL_ERROR, + "Data file \"%s\" uses page size %lu," + "but the start-up parameter " + "is --innodb-page-size=%lu", name, - fsp_flags_get_page_size(flags)); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: but the start-up parameter" - " is innodb-page-size=%lu\n", + fsp_flags_get_page_size(flags), UNIV_PAGE_SIZE); return(DB_ERROR); } one_opened = TRUE; - } else { + } else if (!srv_read_only_mode) { /* We created the data file and now write it full of zeros */ one_created = TRUE; if (i > 0) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Data file %s did not" - " exist: new to be created\n", + ib_logf(IB_LOG_LEVEL_INFO, + "Data file %s did not" + " exist: new to be created", name); } else { - fprintf(stderr, - "InnoDB: The first specified" - " data file %s did not exist:\n" - "InnoDB: a new database" - " to be created!\n", name); + ib_logf(IB_LOG_LEVEL_INFO, + "The first specified " + "data file %s did not exist: " + "a new database to be created!", + name); + 
*create_new_db = TRUE; } - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Setting file %s size to %lu MB\n", + ib_logf(IB_LOG_LEVEL_INFO, + "Setting file %s size to %lu MB", name, (ulong) (srv_data_file_sizes[i] >> (20 - UNIV_PAGE_SIZE_SHIFT))); - fprintf(stderr, - "InnoDB: Database physically writes the" - " file full: wait...\n"); + ib_logf(IB_LOG_LEVEL_INFO, + "Database physically writes the" + " file full: wait..."); ret = os_file_set_size( name, files[i], @@ -963,9 +1028,10 @@ skip_size_check: << UNIV_PAGE_SIZE_SHIFT); if (!ret) { - fprintf(stderr, - "InnoDB: Error in creating %s:" - " probably out of disk space\n", name); + ib_logf(IB_LOG_LEVEL_ERROR, + "Error in creating %s: " + "probably out of disk space", + name); return(DB_ERROR); } @@ -983,8 +1049,10 @@ skip_size_check: ut_a(fil_validate()); - fil_node_create(name, srv_data_file_sizes[i], 0, - srv_data_file_is_raw_partition[i] != 0); + if (!fil_node_create(name, srv_data_file_sizes[i], 0, + srv_data_file_is_raw_partition[i] != 0)) { + return(DB_ERROR); + } } return(DB_SUCCESS); @@ -994,7 +1062,7 @@ skip_size_check: Create undo tablespace. @return DB_SUCCESS or error code */ static -enum db_err +dberr_t srv_undo_tablespace_create( /*=======================*/ const char* name, /*!< in: tablespace name */ @@ -1002,48 +1070,55 @@ srv_undo_tablespace_create( { os_file_t fh; ibool ret; - enum db_err err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; os_file_create_subdirs_if_needed(name); fh = os_file_create( - innodb_file_data_key, name, OS_FILE_CREATE, + innodb_file_data_key, + name, + srv_read_only_mode ? 
OS_FILE_OPEN : OS_FILE_CREATE, OS_FILE_NORMAL, OS_DATA_FILE, &ret); - if (ret == FALSE - && os_file_get_last_error(FALSE) != OS_FILE_ALREADY_EXISTS + if (srv_read_only_mode && ret) { + ib_logf(IB_LOG_LEVEL_INFO, + "%s opened in read-only mode", name); + } else if (ret == FALSE + && os_file_get_last_error(false) != OS_FILE_ALREADY_EXISTS #ifdef UNIV_AIX - /* AIX 5.1 after security patch ML7 may have - errno set to 0 here, which causes our function - to return 100; work around that AIX problem */ - && os_file_get_last_error(FALSE) != 100 -#endif + /* AIX 5.1 after security patch ML7 may have + errno set to 0 here, which causes our function + to return 100; work around that AIX problem */ + && os_file_get_last_error(false) != 100 +#endif /* UNIV_AIX */ ) { - fprintf(stderr, "InnoDB: Error in creating %s\n", name); + ib_logf(IB_LOG_LEVEL_ERROR, + "Can't create UNDO tablespace %s", name); err = DB_ERROR; } else { + ut_a(!srv_read_only_mode); + /* We created the data file and now write it full of zeros */ - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Data file %s did not" - " exist: new to be created\n", name); + ib_logf(IB_LOG_LEVEL_INFO, + "Data file %s did not exist: new to be created", + name); - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Setting file %s size to %lu MB\n", - name, size >> (20 - UNIV_PAGE_SIZE_SHIFT)); + ib_logf(IB_LOG_LEVEL_INFO, + "Setting file %s size to %lu MB", + name, size >> (20 - UNIV_PAGE_SIZE_SHIFT)); - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Database physically writes the" - " file full: wait...\n"); + ib_logf(IB_LOG_LEVEL_INFO, + "Database physically writes the file full: wait..."); ret = os_file_set_size(name, fh, size << UNIV_PAGE_SIZE_SHIFT); if (!ret) { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error in creating %s:" - " probably out of disk space\n", name); + ib_logf(IB_LOG_LEVEL_INFO, + "Error in creating %s: probably out of " + "disk space", name); err = DB_ERROR; } @@ 
-1058,17 +1133,25 @@ srv_undo_tablespace_create( Open an undo tablespace. @return DB_SUCCESS or error code */ static -enum db_err +dberr_t srv_undo_tablespace_open( /*=====================*/ const char* name, /*!< in: tablespace name */ ulint space) /*!< in: tablespace id */ { os_file_t fh; - enum db_err err; + dberr_t err = DB_ERROR; ibool ret; ulint flags; + if (!srv_file_check_mode(name)) { + ib_logf(IB_LOG_LEVEL_ERROR, + "UNDO tablespaces must be %s!", + srv_read_only_mode ? "writable" : "readable"); + + return(DB_ERROR); + } + fh = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RETRY @@ -1082,7 +1165,6 @@ srv_undo_tablespace_open( if (ret) { os_offset_t size; - os_offset_t n_pages; size = os_file_get_size(fh); ut_a(size != (os_offset_t) -1); @@ -1105,17 +1187,15 @@ srv_undo_tablespace_open( ut_a(fil_validate()); - n_pages = size / UNIV_PAGE_SIZE; + os_offset_t n_pages = size / UNIV_PAGE_SIZE; /* On 64 bit Windows ulint can be 32 bit and os_offset_t is 64 bit. It is OK to cast the n_pages to ulint because the unit has been scaled to pages and they are always 32 bit. */ - fil_node_create(name, (ulint) n_pages, space, FALSE); - - err = DB_SUCCESS; - } else { - err = DB_ERROR; + if (fil_node_create(name, (ulint) n_pages, space, FALSE)) { + err = DB_SUCCESS; + } } return(err); @@ -1125,20 +1205,25 @@ srv_undo_tablespace_open( Opens the configured number of undo tablespaces. 
@return DB_SUCCESS or error code */ static -enum db_err +dberr_t srv_undo_tablespaces_init( /*======================*/ ibool create_new_db, /*!< in: TRUE if new db being created */ - const ulint n_conf_tablespaces) /*!< in: configured undo + const ulint n_conf_tablespaces, /*!< in: configured undo tablespaces */ + ulint* n_opened) /*!< out: number of UNDO + tablespaces successfully + discovered and opened */ { ulint i; - enum db_err err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; ulint prev_space_id = 0; ulint n_undo_tablespaces; ulint undo_tablespace_ids[TRX_SYS_N_RSEGS + 1]; + *n_opened = 0; + ut_a(n_conf_tablespaces <= TRX_SYS_N_RSEGS); memset(undo_tablespace_ids, 0x0, sizeof(undo_tablespace_ids)); @@ -1164,10 +1249,10 @@ srv_undo_tablespaces_init( name, SRV_UNDO_TABLESPACE_SIZE_IN_PAGES); if (err != DB_SUCCESS) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Could not create " - "undo tablespace '%s'.\n", name); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Could not create undo tablespace '%s'.", + name); return(err); } @@ -1217,15 +1302,16 @@ srv_undo_tablespaces_init( err = srv_undo_tablespace_open(name, undo_tablespace_ids[i]); if (err != DB_SUCCESS) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Error opening undo " - "tablespace %s.\n", name); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unable to open undo tablespace '%s'.", name); return(err); } prev_space_id = undo_tablespace_ids[i]; + + ++*n_opened; } /* Open any extra unused undo tablespaces. These must be contiguous. @@ -1248,6 +1334,8 @@ srv_undo_tablespaces_init( } ++n_undo_tablespaces; + + ++*n_opened; } /* If the user says that there are fewer than what we find we @@ -1275,13 +1363,17 @@ srv_undo_tablespaces_init( "value is %lu\n", n_undo_tablespaces); return(err != DB_SUCCESS ? 
err : DB_ERROR); - } - if (n_undo_tablespaces > 0) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Opened %lu undo tablespaces\n", - n_conf_tablespaces); + } else if (n_undo_tablespaces > 0) { + + ib_logf(IB_LOG_LEVEL_INFO, "Opened %lu undo tablespaces", + n_undo_tablespaces); + + if (n_conf_tablespaces == 0) { + ib_logf(IB_LOG_LEVEL_WARN, + "Using the system tablespace for all UNDO " + "logging because innodb_undo_tablespaces=0"); + } } if (create_new_db) { @@ -1303,18 +1395,51 @@ srv_undo_tablespaces_init( } /******************************************************************** +Wait for the purge thread(s) to start up. */ +static +void +srv_start_wait_for_purge_to_start() +/*===============================*/ +{ + /* Wait for the purge coordinator and master thread to startup. */ + + purge_state_t state = trx_purge_state(); + + ut_a(state != PURGE_STATE_DISABLED); + + while (srv_shutdown_state == SRV_SHUTDOWN_NONE + && srv_force_recovery < SRV_FORCE_NO_BACKGROUND + && state == PURGE_STATE_INIT) { + + switch (state = trx_purge_state()) { + case PURGE_STATE_RUN: + case PURGE_STATE_STOP: + break; + + case PURGE_STATE_INIT: + ib_logf(IB_LOG_LEVEL_INFO, + "Waiting for purge to start"); + + os_thread_sleep(50000); + break; + + case PURGE_STATE_EXIT: + case PURGE_STATE_DISABLED: + ut_error; + } + } +} + +/******************************************************************** Starts InnoDB and creates a new database if database files are not found and the user wants. 
@return DB_SUCCESS or error code */ UNIV_INTERN -int +dberr_t innobase_start_or_create_for_mysql(void) /*====================================*/ { ibool create_new_db; - ibool log_file_created; - ibool log_created = FALSE; - ibool log_opened = FALSE; lsn_t min_flushed_lsn; lsn_t max_flushed_lsn; #ifdef UNIV_LOG_ARCHIVE @@ -1324,11 +1449,19 @@ innobase_start_or_create_for_mysql(void) ulint sum_of_new_sizes; ulint sum_of_data_file_sizes; ulint tablespace_size_in_header; - ulint err; - ulint i; + dberr_t err; + unsigned i; + ulint srv_n_log_files_found = srv_n_log_files; ulint io_limit; mtr_t mtr; ib_bh_t* ib_bh; + char logfilename[10000]; + char* logfile0 = NULL; + size_t dirnamelen; + + if (srv_read_only_mode) { + ib_logf(IB_LOG_LEVEL_INFO, "Started in read only mode"); + } #ifdef HAVE_DARWIN_THREADS # ifdef F_FULLFSYNC @@ -1422,31 +1555,34 @@ innobase_start_or_create_for_mysql(void) " InnoDB: !!!!!!!! UNIV_MEM_DEBUG switched on !!!!!!!!!\n"); #endif - if (UNIV_LIKELY(srv_use_sys_malloc)) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: The InnoDB memory heap is disabled\n"); + if (srv_use_sys_malloc) { + ib_logf(IB_LOG_LEVEL_INFO, + "The InnoDB memory heap is disabled"); } #if defined(COMPILER_HINTS_ENABLED) - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Compiler hints enabled.\n"); + ib_logf(IB_LOG_LEVEL_INFO, + " InnoDB: Compiler hints enabled."); #endif /* defined(COMPILER_HINTS_ENABLED) */ - ut_print_timestamp(stderr); - fputs(" InnoDB: " IB_ATOMICS_STARTUP_MSG "\n", stderr); + ib_logf(IB_LOG_LEVEL_INFO, + "" IB_ATOMICS_STARTUP_MSG ""); - ut_print_timestamp(stderr); - fputs(" InnoDB: Compressed tables use zlib " ZLIB_VERSION + ib_logf(IB_LOG_LEVEL_INFO, + "Compressed tables use zlib " ZLIB_VERSION #ifdef UNIV_ZIP_DEBUG " with validation" #endif /* UNIV_ZIP_DEBUG */ - "\n" , stderr); + ); #ifdef UNIV_ZIP_COPY - ut_print_timestamp(stderr); - fputs(" InnoDB: and extra copying\n", stderr); + ib_logf(IB_LOG_LEVEL_INFO, "and extra copying"); 
#endif /* UNIV_ZIP_COPY */ + + ib_logf(IB_LOG_LEVEL_INFO, + "CPU %s crc32 instructions", + ut_crc32_sse2_enabled ? "supports" : "does not support"); + /* Since InnoDB does not currently clean up all its internal data structures in MySQL Embedded Server Library server_end(), we print an error message if someone tries to start up InnoDB a @@ -1505,17 +1641,14 @@ innobase_start_or_create_for_mysql(void) #elif defined(LINUX_NATIVE_AIO) if (srv_use_native_aio) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Using Linux native AIO\n"); + ib_logf(IB_LOG_LEVEL_INFO, "Using Linux native AIO"); } #else /* Currently native AIO is supported only on windows and linux and that also when the support is compiled in. In all other cases, we ignore the setting of innodb_use_native_aio. */ srv_use_native_aio = FALSE; - -#endif +#endif /* __WIN__ */ if (srv_file_flush_method_str == NULL) { /* These are the default options */ @@ -1533,6 +1666,9 @@ innobase_start_or_create_for_mysql(void) } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DIRECT")) { srv_unix_file_flush_method = SRV_UNIX_O_DIRECT; + } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DIRECT_NO_FSYNC")) { + srv_unix_file_flush_method = SRV_UNIX_O_DIRECT_NO_FSYNC; + } else if (0 == ut_strcmp(srv_file_flush_method_str, "littlesync")) { srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC; @@ -1550,12 +1686,10 @@ innobase_start_or_create_for_mysql(void) } else if (0 == ut_strcmp(srv_file_flush_method_str, "async_unbuffered")) { srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; -#endif +#endif /* __WIN__ */ } else { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Unrecognized value %s for" - " innodb_flush_method\n", + ib_logf(IB_LOG_LEVEL_ERROR, + "Unrecognized value %s for innodb_flush_method", srv_file_flush_method_str); return(DB_ERROR); } @@ -1580,74 +1714,93 @@ innobase_start_or_create_for_mysql(void) srv_max_n_threads = 10000; } else { srv_buf_pool_instances = 1; - srv_max_n_threads = 
1000; /* saves several MB of memory, - especially in 64-bit - computers */ + + /* Saves several MB of memory, especially in + 64-bit computers */ + + srv_max_n_threads = 1000; } - err = srv_boot(); + srv_boot(); - if (err != DB_SUCCESS) { + if (!srv_read_only_mode) { - return((int) err); - } + mutex_create(srv_monitor_file_mutex_key, + &srv_monitor_file_mutex, SYNC_NO_ORDER_CHECK); - mutex_create(srv_monitor_file_mutex_key, - &srv_monitor_file_mutex, SYNC_NO_ORDER_CHECK); + if (srv_innodb_status) { - if (srv_innodb_status) { + srv_monitor_file_name = static_cast<char*>( + mem_alloc( + strlen(fil_path_to_mysql_datadir) + + 20 + sizeof "/innodb_status.")); - srv_monitor_file_name = static_cast<char*>( - mem_alloc( - strlen(fil_path_to_mysql_datadir) - + 20 + sizeof "/innodb_status.")); + sprintf(srv_monitor_file_name, "%s/innodb_status.%lu", + fil_path_to_mysql_datadir, + os_proc_get_number()); - sprintf(srv_monitor_file_name, "%s/innodb_status.%lu", - fil_path_to_mysql_datadir, os_proc_get_number()); - srv_monitor_file = fopen(srv_monitor_file_name, "w+"); - if (!srv_monitor_file) { - fprintf(stderr, "InnoDB: unable to create %s: %s\n", - srv_monitor_file_name, strerror(errno)); - return(DB_ERROR); + srv_monitor_file = fopen(srv_monitor_file_name, "w+"); + + if (!srv_monitor_file) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unable to create %s: %s", + srv_monitor_file_name, + strerror(errno)); + + return(DB_ERROR); + } + } else { + srv_monitor_file_name = NULL; + srv_monitor_file = os_file_create_tmpfile(); + + if (!srv_monitor_file) { + return(DB_ERROR); + } } - } else { - srv_monitor_file_name = NULL; - srv_monitor_file = os_file_create_tmpfile(); - if (!srv_monitor_file) { + + mutex_create(srv_dict_tmpfile_mutex_key, + &srv_dict_tmpfile_mutex, SYNC_DICT_OPERATION); + + srv_dict_tmpfile = os_file_create_tmpfile(); + + if (!srv_dict_tmpfile) { return(DB_ERROR); } - } - mutex_create(srv_dict_tmpfile_mutex_key, - &srv_dict_tmpfile_mutex, SYNC_DICT_OPERATION); + 
mutex_create(srv_misc_tmpfile_mutex_key, + &srv_misc_tmpfile_mutex, SYNC_ANY_LATCH); - srv_dict_tmpfile = os_file_create_tmpfile(); - if (!srv_dict_tmpfile) { - return(DB_ERROR); - } + srv_misc_tmpfile = os_file_create_tmpfile(); - mutex_create(srv_misc_tmpfile_mutex_key, - &srv_misc_tmpfile_mutex, SYNC_ANY_LATCH); - - srv_misc_tmpfile = os_file_create_tmpfile(); - if (!srv_misc_tmpfile) { - return(DB_ERROR); + if (!srv_misc_tmpfile) { + return(DB_ERROR); + } } /* If user has set the value of innodb_file_io_threads then we'll emit a message telling the user that this parameter is now deprecated. */ if (srv_n_file_io_threads != 4) { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Warning:" - " innodb_file_io_threads is deprecated." - " Please use innodb_read_io_threads and" - " innodb_write_io_threads instead\n"); + ib_logf(IB_LOG_LEVEL_WARN, + "innodb_file_io_threads is deprecated. Please use " + "innodb_read_io_threads and innodb_write_io_threads " + "instead"); } /* Now overwrite the value on srv_n_file_io_threads */ - srv_n_file_io_threads = 2 + srv_n_read_io_threads - + srv_n_write_io_threads; + srv_n_file_io_threads = srv_n_read_io_threads; + + if (!srv_read_only_mode) { + /* Add the log and ibuf IO threads. */ + srv_n_file_io_threads += 2; + srv_n_file_io_threads += srv_n_write_io_threads; + } else { + ib_logf(IB_LOG_LEVEL_INFO, + "Disabling background IO write threads."); + + srv_n_write_io_threads = 0; + } ut_a(srv_n_file_io_threads <= SRV_MAX_N_IO_THREADS); @@ -1662,56 +1815,59 @@ innobase_start_or_create_for_mysql(void) } # endif /* __WIN__ */ - os_aio_init(io_limit, - srv_n_read_io_threads, - srv_n_write_io_threads, - SRV_MAX_N_PENDING_SYNC_IOS); + if (!os_aio_init(io_limit, + srv_n_read_io_threads, + srv_n_write_io_threads, + SRV_MAX_N_PENDING_SYNC_IOS)) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Fatal : Cannot initialize AIO sub-system"); + + return(DB_ERROR); + } fil_init(srv_file_per_table ? 
50000 : 5000, srv_max_n_open_files); - /* Print time to initialize the buffer pool */ - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Initializing buffer pool, size ="); + double size; + char unit; if (srv_buf_pool_size >= 1024 * 1024 * 1024) { - fprintf(stderr, - " %.1fG\n", - ((double) srv_buf_pool_size) / (1024 * 1024 * 1024)); + size = ((double) srv_buf_pool_size) / (1024 * 1024 * 1024); + unit = 'G'; } else { - fprintf(stderr, - " %.1fM\n", - ((double) srv_buf_pool_size) / (1024 * 1024)); + size = ((double) srv_buf_pool_size) / (1024 * 1024); + unit = 'M'; } - err = buf_pool_init(srv_buf_pool_size, srv_buf_pool_instances); + /* Print time to initialize the buffer pool */ + ib_logf(IB_LOG_LEVEL_INFO, + "Initializing buffer pool, size = %.1f%c", size, unit); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Completed initialization of buffer pool\n"); + err = buf_pool_init(srv_buf_pool_size, srv_buf_pool_instances); if (err != DB_SUCCESS) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Fatal error: cannot allocate memory" - " for the buffer pool\n"); + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot allocate memory for the buffer pool"); return(DB_ERROR); } + ib_logf(IB_LOG_LEVEL_INFO, + "Completed initialization of buffer pool"); + #ifdef UNIV_DEBUG /* We have observed deadlocks with a 5MB buffer pool but the actual lower limit could very well be a little higher. 
*/ if (srv_buf_pool_size <= 5 * 1024 * 1024) { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Warning: Small buffer pool size " - "(%luM), the flst_validate() debug function " - "can cause a deadlock if the buffer pool fills up.\n", + ib_logf(IB_LOG_LEVEL_INFO, + "Small buffer pool size (%luM), the flst_validate() " + "debug function can cause a deadlock if the " + "buffer pool fills up.", srv_buf_pool_size / 1024 / 1024); } -#endif +#endif /* UNIV_DEBUG */ fsp_init(); log_init(); @@ -1720,14 +1876,15 @@ innobase_start_or_create_for_mysql(void) /* Create i/o-handler threads: */ - for (i = 0; i < srv_n_file_io_threads; i++) { + for (ulint i = 0; i < srv_n_file_io_threads; ++i) { + n[i] = i; os_thread_create(io_handler_thread, n + i, thread_ids + i); } #ifdef UNIV_LOG_ARCHIVE - if (0 != ut_strcmp(srv_log_group_home_dirs[0], srv_arch_dir)) { + if (0 != ut_strcmp(srv_log_group_home_dir, srv_arch_dir)) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Error: you must set the log group home dir in my.cnf\n"); ut_print_timestamp(stderr); @@ -1738,16 +1895,14 @@ innobase_start_or_create_for_mysql(void) #endif /* UNIV_LOG_ARCHIVE */ if (srv_n_log_files * srv_log_file_size * UNIV_PAGE_SIZE - >= 549755813888ULL /* 512G */) { + >= 512ULL * 1024ULL * 1024ULL * 1024ULL) { /* log_block_convert_lsn_to_no() limits the returned block number to 1G and given that OS_FILE_LOG_BLOCK_SIZE is 512 bytes, then we have a limit of 512 GB. If that limit is to be raised, then log_block_convert_lsn_to_no() must be modified. */ - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Error: combined size of log files" - " must be < 512 GB\n"); + ib_logf(IB_LOG_LEVEL_ERROR, + "Combined size of log files must be < 512 GB"); return(DB_ERROR); } @@ -1759,7 +1914,6 @@ innobase_start_or_create_for_mysql(void) So next_offset must be < ULINT_MAX * UNIV_PAGE_SIZE. This means that we are limited to ULINT_MAX * UNIV_PAGE_SIZE which is 64 TB on 32 bit systems. 
*/ - ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Error: combined size of log files" " must be < %lu GB\n", @@ -1791,10 +1945,8 @@ innobase_start_or_create_for_mysql(void) } if (sum_of_new_sizes < 10485760 / UNIV_PAGE_SIZE) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Error: tablespace size must be" - " at least 10 MB\n"); + ib_logf(IB_LOG_LEVEL_ERROR, + "Tablespace size must be at least 10 MB"); return(DB_ERROR); } @@ -1805,36 +1957,27 @@ innobase_start_or_create_for_mysql(void) #endif /* UNIV_LOG_ARCHIVE */ &min_flushed_lsn, &max_flushed_lsn, &sum_of_new_sizes); - if (err != DB_SUCCESS) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Could not open or create data files.\n"); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: If you tried to add new data files," - " and it failed here,\n"); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: you should now edit innodb_data_file_path" - " in my.cnf back\n"); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: to what it was, and remove the" - " new ibdata files InnoDB created\n"); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: in this failed attempt. InnoDB only wrote" - " those files full of\n"); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: zeros, but did not yet use them in any way." - " But be careful: do not\n"); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: remove old data files" - " which contain your precious data!\n"); + if (err == DB_FAIL) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "The system tablespace must be writable!"); + + return(DB_ERROR); - return((int) err); + } else if (err != DB_SUCCESS) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Could not open or create the system tablespace. 
If " + "you tried to add new data files to the system " + "tablespace, and it failed here, you should now " + "edit innodb_data_file_path in my.cnf back to what " + "it was, and remove the new ibdata files InnoDB " + "created in this failed attempt. InnoDB only wrote " + "those files full of zeros, but did not yet use " + "them in any way. But be careful: do not remove " + "old data files which contain your precious data!"); + + return(err); } #ifdef UNIV_LOG_ARCHIVE @@ -1842,125 +1985,199 @@ innobase_start_or_create_for_mysql(void) srv_arch_dir = srv_add_path_separator_if_needed(srv_arch_dir); #endif /* UNIV_LOG_ARCHIVE */ - for (i = 0; i < srv_n_log_files; i++) { - err = open_or_create_log_file(create_new_db, &log_file_created, - log_opened, 0, i); - if (err != DB_SUCCESS) { + dirnamelen = strlen(srv_log_group_home_dir); + ut_a(dirnamelen < (sizeof logfilename) - 10 - sizeof "ib_logfile"); + memcpy(logfilename, srv_log_group_home_dir, dirnamelen); - return((int) err); - } + /* Add a path separator if needed. 
*/ + if (dirnamelen && logfilename[dirnamelen - 1] != SRV_PATH_SEPARATOR) { + logfilename[dirnamelen++] = SRV_PATH_SEPARATOR; + } - if (log_file_created) { - log_created = TRUE; - } else { - log_opened = TRUE; + srv_log_file_size_requested = srv_log_file_size; + + if (create_new_db) { + bool success = buf_flush_list(ULINT_MAX, LSN_MAX, NULL); + ut_a(success); + + min_flushed_lsn = max_flushed_lsn = log_get_lsn(); + + buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); + + err = create_log_files(logfilename, dirnamelen, + max_flushed_lsn, logfile0); + + if (err != DB_SUCCESS) { + return(err); } - if ((log_opened && create_new_db) - || (log_opened && log_created)) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Error: all log files must be" - " created at the same time.\n"); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: All log files must be" - " created also in database creation.\n"); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: If you want bigger or smaller" - " log files, shut down the\n"); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: database and make sure there" - " were no errors in shutdown.\n"); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Then delete the existing log files." 
- " Edit the .cnf file\n"); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: and start the database again.\n"); + } else { + for (i = 0; i < SRV_N_LOG_FILES_MAX; i++) { + os_offset_t size; + os_file_stat_t stat_info; + + sprintf(logfilename + dirnamelen, + "ib_logfile%u", i); + + err = os_file_get_status( + logfilename, &stat_info, false); + + if (err == DB_NOT_FOUND) { + if (i == 0) { + if (max_flushed_lsn + != min_flushed_lsn) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot create" + " log files because" + " data files are" + " corrupt or" + " not in sync" + " with each other"); + return(DB_ERROR); + } + + if (max_flushed_lsn < (lsn_t) 1000) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot create" + " log files because" + " data files are" + " corrupt or the" + " database was not" + " shut down cleanly" + " after creating" + " the data files."); + return(DB_ERROR); + } + + err = create_log_files( + logfilename, dirnamelen, + max_flushed_lsn, logfile0); + + if (err != DB_SUCCESS) { + return(err); + } + + create_log_files_rename( + logfilename, dirnamelen, + max_flushed_lsn, logfile0); + + /* Suppress the message about + crash recovery. 
*/ + max_flushed_lsn = min_flushed_lsn + = log_get_lsn(); + goto files_checked; + } else if (i < 2) { + /* must have at least 2 log files */ + ib_logf(IB_LOG_LEVEL_ERROR, + "Only one log file found."); + return(err); + } - return(DB_ERROR); + /* opened all files */ + break; + } + + if (!srv_file_check_mode(logfilename)) { + return(DB_ERROR); + } + + err = open_log_file(&files[i], logfilename, &size); + + if (err != DB_SUCCESS) { + return(err); + } + + ut_a(size != (os_offset_t) -1); + + if (size & ((1 << UNIV_PAGE_SIZE_SHIFT) - 1)) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Log file %s size " + UINT64PF " is not a multiple of" + " innodb_page_size", + logfilename, size); + return(DB_ERROR); + } + + size >>= UNIV_PAGE_SIZE_SHIFT; + + if (i == 0) { + srv_log_file_size = size; + } else if (size != srv_log_file_size) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Log file %s is" + " of different size "UINT64PF" bytes" + " than other log" + " files "UINT64PF" bytes!", + logfilename, + size << UNIV_PAGE_SIZE_SHIFT, + (os_offset_t) srv_log_file_size + << UNIV_PAGE_SIZE_SHIFT); + return(DB_ERROR); + } } - } - /* Open all log files and data files in the system tablespace: we - keep them open until database shutdown */ + srv_n_log_files_found = i; - fil_open_log_and_system_tablespace_files(); + /* Create the in-memory file space objects. */ - err = srv_undo_tablespaces_init(create_new_db, srv_undo_tablespaces); + sprintf(logfilename + dirnamelen, "ib_logfile%u", 0); - /* If the force recovery is set very high then we carry on regardless - of all errors. Basically this is fingers crossed mode. 
*/ + fil_space_create(logfilename, + SRV_LOG_SPACE_FIRST_ID, + fsp_flags_set_page_size(0, UNIV_PAGE_SIZE), + FIL_LOG); - if (err != DB_SUCCESS - && srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) { + ut_a(fil_validate()); - return((int) err); - } + /* srv_log_file_size is measured in pages; if page size is 16KB, + then we have a limit of 64TB on 32 bit systems */ + ut_a(srv_log_file_size <= ULINT_MAX); + + for (unsigned j = 0; j < i; j++) { + sprintf(logfilename + dirnamelen, "ib_logfile%u", j); + + if (!fil_node_create(logfilename, + (ulint) srv_log_file_size, + SRV_LOG_SPACE_FIRST_ID, FALSE)) { + return(DB_ERROR); + } + } - if (log_created && !create_new_db -#ifdef UNIV_LOG_ARCHIVE - && !srv_archive_recovery -#endif /* UNIV_LOG_ARCHIVE */ - ) { - if (max_flushed_lsn != min_flushed_lsn #ifdef UNIV_LOG_ARCHIVE - || max_arch_log_no != min_arch_log_no + /* Create the file space object for archived logs. Under + MySQL, no archiving ever done. */ + fil_space_create("arch_log_space", SRV_LOG_SPACE_FIRST_ID + 1, + 0, FIL_LOG); #endif /* UNIV_LOG_ARCHIVE */ - ) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Cannot initialize created" - " log files because\n"); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: data files were not in sync" - " with each other\n"); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: or the data files are corrupt.\n"); + log_group_init(0, i, srv_log_file_size * UNIV_PAGE_SIZE, + SRV_LOG_SPACE_FIRST_ID, + SRV_LOG_SPACE_FIRST_ID + 1); + } - return(DB_ERROR); - } +files_checked: + /* Open all log files and data files in the system + tablespace: we keep them open until database + shutdown */ - if (max_flushed_lsn < (lsn_t) 1000) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Cannot initialize created" - " log files because\n"); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: data files are corrupt," - " or new data files were\n"); - ut_print_timestamp(stderr); - fprintf(stderr, - " 
InnoDB: created when the database" - " was started previous\n"); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: time but the database" - " was not shut down\n"); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: normally after that.\n"); + fil_open_log_and_system_tablespace_files(); - return(DB_ERROR); - } + err = srv_undo_tablespaces_init( + create_new_db, + srv_undo_tablespaces, + &srv_undo_tablespaces_open); - mutex_enter(&(log_sys->mutex)); + /* If the force recovery is set very high then we carry on regardless + of all errors. Basically this is fingers crossed mode. */ -#ifdef UNIV_LOG_ARCHIVE - /* Do not + 1 arch_log_no because we do not use log - archiving */ - recv_reset_logs(max_flushed_lsn, max_arch_log_no, TRUE); -#else - recv_reset_logs(max_flushed_lsn, TRUE); -#endif /* UNIV_LOG_ARCHIVE */ + if (err != DB_SUCCESS + && srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) { - mutex_exit(&(log_sys->mutex)); + return(err); + } + + /* Initialize objects used by dict stats gathering thread, which + can also be used by recovery if it tries to drop some table */ + if (!srv_read_only_mode) { + dict_stats_thread_init(); } trx_sys_file_format_init(); @@ -1968,6 +2185,9 @@ innobase_start_or_create_for_mysql(void) trx_sys_create(); if (create_new_db) { + + ut_a(!srv_read_only_mode); + mtr_start(&mtr); fsp_header_init(0, sum_of_new_sizes, &mtr); @@ -1987,16 +2207,34 @@ innobase_start_or_create_for_mysql(void) trx_purge_sys_create(srv_n_purge_threads, ib_bh); - dict_create(); + err = dict_create(); + + if (err != DB_SUCCESS) { + return(err); + } srv_startup_is_before_trx_rollback_phase = FALSE; + bool success = buf_flush_list(ULINT_MAX, LSN_MAX, NULL); + ut_a(success); + + min_flushed_lsn = max_flushed_lsn = log_get_lsn(); + + buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); + + /* Stamp the LSN to the data files. 
*/ + fil_write_flushed_lsn_to_data_files(max_flushed_lsn, 0); + + fil_flush_file_spaces(FIL_TABLESPACE); + + create_log_files_rename(logfilename, dirnamelen, + max_flushed_lsn, logfile0); #ifdef UNIV_LOG_ARCHIVE } else if (srv_archive_recovery) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Starting archive" - " recovery from a backup...\n"); + + ib_logf(IB_LOG_LEVEL_INFO, + " Starting archive recovery from a backup..."); + err = recv_recovery_from_archive_start( min_flushed_lsn, srv_archive_recovery_limit_lsn, min_arch_log_no); @@ -2007,7 +2245,11 @@ innobase_start_or_create_for_mysql(void) /* Since ibuf init is in dict_boot, and ibuf is needed in any disk i/o, first call dict_boot */ - dict_boot(); + err = dict_boot(); + + if (err != DB_SUCCESS) { + return(err); + } ib_bh = trx_sys_init_at_db_start(); @@ -2051,10 +2293,10 @@ innobase_start_or_create_for_mysql(void) /* We always try to do a recovery, even if the database had been shut down normally: this is the normal startup path */ - err = recv_recovery_from_checkpoint_start(LOG_CHECKPOINT, - IB_ULONGLONG_MAX, - min_flushed_lsn, - max_flushed_lsn); + err = recv_recovery_from_checkpoint_start( + LOG_CHECKPOINT, IB_ULONGLONG_MAX, + min_flushed_lsn, max_flushed_lsn); + if (err != DB_SUCCESS) { return(DB_ERROR); @@ -2066,7 +2308,11 @@ innobase_start_or_create_for_mysql(void) to access space 0, and the insert buffer at this stage already works for space 0. */ - dict_boot(); + err = dict_boot(); + + if (err != DB_SUCCESS) { + return(err); + } ib_bh = trx_sys_init_at_db_start(); @@ -2079,6 +2325,7 @@ innobase_start_or_create_for_mysql(void) are initialized in trx_sys_init_at_db_start(). */ recv_recovery_from_checkpoint_finish(); + if (srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE) { /* The following call is necessary for the insert buffer to work with multiple tablespaces. 
We must @@ -2100,6 +2347,90 @@ innobase_start_or_create_for_mysql(void) recv_needed_recovery); } + if (!srv_force_recovery + && !recv_sys->found_corrupt_log + && (srv_log_file_size_requested != srv_log_file_size + || srv_n_log_files_found != srv_n_log_files)) { + /* Prepare to replace the redo log files. */ + + if (srv_read_only_mode) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot resize log files " + "in read-only mode."); + return(DB_READ_ONLY); + } + + /* Clean the buffer pool. */ + bool success = buf_flush_list( + ULINT_MAX, LSN_MAX, NULL); + ut_a(success); + + RECOVERY_CRASH(1); + + min_flushed_lsn = max_flushed_lsn = log_get_lsn(); + + ib_logf(IB_LOG_LEVEL_WARN, + "Resizing redo log from %u*%u to %u*%u pages" + ", LSN=" LSN_PF, + (unsigned) i, + (unsigned) srv_log_file_size, + (unsigned) srv_n_log_files, + (unsigned) srv_log_file_size_requested, + max_flushed_lsn); + + buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); + + RECOVERY_CRASH(2); + + /* Flush the old log files. */ + log_buffer_flush_to_disk(); + /* If innodb_flush_method=O_DSYNC, + we need to explicitly flush the log buffers. */ + fil_flush(SRV_LOG_SPACE_FIRST_ID); + + ut_ad(max_flushed_lsn == log_get_lsn()); + + /* Prohibit redo log writes from any other + threads until creating a log checkpoint at the + end of create_log_files(). */ + ut_d(recv_no_log_write = TRUE); + ut_ad(!buf_pool_check_no_pending_io()); + + RECOVERY_CRASH(3); + + /* Stamp the LSN to the data files. */ + fil_write_flushed_lsn_to_data_files( + max_flushed_lsn, 0); + + fil_flush_file_spaces(FIL_TABLESPACE); + + RECOVERY_CRASH(4); + + /* Close and free the redo log files, so that + we can replace them. */ + fil_close_log_files(true); + + RECOVERY_CRASH(5); + + /* Free the old log file space. 
*/ + log_group_close_all(); + + ib_logf(IB_LOG_LEVEL_WARN, + "Starting to delete and rewrite log files."); + + srv_log_file_size = srv_log_file_size_requested; + + err = create_log_files(logfilename, dirnamelen, + max_flushed_lsn, logfile0); + + if (err != DB_SUCCESS) { + return(err); + } + + create_log_files_rename(logfilename, dirnamelen, + max_flushed_lsn, logfile0); + } + srv_startup_is_before_trx_rollback_phase = FALSE; recv_recovery_rollback_active(); @@ -2181,31 +2512,39 @@ innobase_start_or_create_for_mysql(void) if (srv_available_undo_logs == ULINT_UNDEFINED) { /* Can only happen if force recovery is set. */ - ut_a(srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO); + ut_a(srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO + || srv_read_only_mode); srv_undo_logs = ULONG_UNDEFINED; } - /* Create the thread which watches the timeouts for lock waits */ - os_thread_create( - lock_wait_timeout_thread, - NULL, thread_ids + 2 + SRV_MAX_N_IO_THREADS); - - /* Create the thread which warns of long semaphore waits */ - os_thread_create( - srv_error_monitor_thread, - NULL, thread_ids + 3 + SRV_MAX_N_IO_THREADS); + if (!srv_read_only_mode) { + /* Create the thread which watches the timeouts + for lock waits */ + os_thread_create( + lock_wait_timeout_thread, + NULL, thread_ids + 2 + SRV_MAX_N_IO_THREADS); - /* Create the thread which prints InnoDB monitor info */ - os_thread_create( - srv_monitor_thread, - NULL, thread_ids + 4 + SRV_MAX_N_IO_THREADS); + /* Create the thread which warns of long semaphore waits */ + os_thread_create( + srv_error_monitor_thread, + NULL, thread_ids + 3 + SRV_MAX_N_IO_THREADS); - srv_is_being_started = FALSE; + /* Create the thread which prints InnoDB monitor info */ + os_thread_create( + srv_monitor_thread, + NULL, thread_ids + 4 + SRV_MAX_N_IO_THREADS); + } /* Create the SYS_FOREIGN and SYS_FOREIGN_COLS system tables */ err = dict_create_or_check_foreign_constraint_tables(); if (err != DB_SUCCESS) { - return((int)DB_ERROR); + return(err); + } + + /* 
Create the SYS_TABLESPACES system table */ + err = dict_create_or_check_sys_tablespace(); + if (err != DB_SUCCESS) { + return(err); } srv_is_being_started = FALSE; @@ -2215,11 +2554,15 @@ innobase_start_or_create_for_mysql(void) /* Create the master thread which does purge and other utility operations */ - os_thread_create( - srv_master_thread, - NULL, thread_ids + (1 + SRV_MAX_N_IO_THREADS)); + if (!srv_read_only_mode) { + + os_thread_create( + srv_master_thread, + NULL, thread_ids + (1 + SRV_MAX_N_IO_THREADS)); + } - if (srv_force_recovery < SRV_FORCE_NO_BACKGROUND) { + if (!srv_read_only_mode + && srv_force_recovery < SRV_FORCE_NO_BACKGROUND) { os_thread_create( srv_purge_coordinator_thread, @@ -2234,35 +2577,15 @@ innobase_start_or_create_for_mysql(void) srv_worker_thread, NULL, thread_ids + 5 + i + SRV_MAX_N_IO_THREADS); } - } - - os_thread_create(buf_flush_page_cleaner_thread, NULL, NULL); - - /* Wait for the purge coordinator and master thread to startup. */ - - purge_state_t state = trx_purge_state(); - while (srv_shutdown_state == SRV_SHUTDOWN_NONE - && srv_force_recovery < SRV_FORCE_NO_BACKGROUND - && state == PURGE_STATE_INIT) { - - switch (state = trx_purge_state()) { - case PURGE_STATE_RUN: - case PURGE_STATE_STOP: - break; - - case PURGE_STATE_INIT: - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: " - "Waiting for the background threads to " - "start\n"); + srv_start_wait_for_purge_to_start(); - os_thread_sleep(50000); - break; + } else { + purge_sys->state = PURGE_STATE_DISABLED; + } - case PURGE_STATE_EXIT: - ut_error; - } + if (!srv_read_only_mode) { + os_thread_create(buf_flush_page_cleaner_thread, NULL, NULL); } #ifdef UNIV_DEBUG @@ -2276,7 +2599,8 @@ innobase_start_or_create_for_mysql(void) tablespace_size_in_header = fsp_header_get_tablespace_size(); - if (!srv_auto_extend_last_data_file + if (!srv_read_only_mode + && !srv_auto_extend_last_data_file && sum_of_data_file_sizes != tablespace_size_in_header) { ut_print_timestamp(stderr); 
@@ -2319,7 +2643,8 @@ innobase_start_or_create_for_mysql(void) } } - if (srv_auto_extend_last_data_file + if (!srv_read_only_mode + && srv_auto_extend_last_data_file && sum_of_data_file_sizes < tablespace_size_in_header) { ut_print_timestamp(stderr); @@ -2383,23 +2708,17 @@ innobase_start_or_create_for_mysql(void) os_fast_mutex_free(&srv_os_test_mutex); if (srv_print_verbose_log) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: %s started; " - "log sequence number " LSN_PF "\n", + ib_logf(IB_LOG_LEVEL_INFO, + "%s started; log sequence number " LSN_PF "", INNODB_VERSION_STR, srv_start_lsn); } if (srv_force_recovery > 0) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: !!! innodb_force_recovery" - " is set to %lu !!!\n", + ib_logf(IB_LOG_LEVEL_INFO, + "!!! innodb_force_recovery is set to %lu !!!", (ulong) srv_force_recovery); } - fflush(stderr); - if (srv_force_recovery == 0) { /* In the insert buffer we may have even bigger tablespace id's, because we may have dropped those tablespaces, but @@ -2409,16 +2728,20 @@ innobase_start_or_create_for_mysql(void) ibuf_update_max_tablespace_id(); } - /* Create the buffer pool dump/load thread */ - os_thread_create(buf_dump_thread, NULL, NULL); + if (!srv_read_only_mode) { + /* Create the buffer pool dump/load thread */ + os_thread_create(buf_dump_thread, NULL, NULL); - srv_was_started = TRUE; + /* Create the dict stats gathering thread */ + os_thread_create(dict_stats_thread, NULL, NULL); - /* Create the thread that will optimize the FTS sub-system - in a separate background thread. */ - fts_optimize_init(); + /* Create the thread that will optimize the FTS sub-system. */ + fts_optimize_init(); + } - return((int) DB_SUCCESS); + srv_was_started = TRUE; + + return(DB_SUCCESS); } #if 0 @@ -2455,27 +2778,28 @@ srv_fts_close(void) Shuts down the InnoDB database. 
@return DB_SUCCESS or error code */ UNIV_INTERN -int +dberr_t innobase_shutdown_for_mysql(void) /*=============================*/ { ulint i; + if (!srv_was_started) { if (srv_is_being_started) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Warning: shutting down" - " a not properly started\n" - "InnoDB: or created database!\n"); + ib_logf(IB_LOG_LEVEL_WARN, + "Shutting down an improperly started, " + "or created database!"); } return(DB_SUCCESS); } - /* Shutdown the FTS optimize sub system. */ - fts_optimize_start_shutdown(); + if (!srv_read_only_mode) { + /* Shutdown the FTS optimize sub system. */ + fts_optimize_start_shutdown(); - fts_optimize_end(); + fts_optimize_end(); + } /* 1. Flush the buffer pool to disk, write the current lsn to the tablespace header(s), and copy all log data to archive. @@ -2485,18 +2809,12 @@ innobase_shutdown_for_mysql(void) logs_empty_and_mark_files_at_shutdown(); if (srv_conc_get_active_threads() != 0) { - fprintf(stderr, - "InnoDB: Warning: query counter shows %ld queries" - " still\n" - "InnoDB: inside InnoDB at shutdown\n", + ib_logf(IB_LOG_LEVEL_WARN, + "Query counter shows %ld queries still " + "inside InnoDB at shutdown", srv_conc_get_active_threads()); } - /* This functionality will be used by WL#5522. */ - ut_a(trx_purge_state() == PURGE_STATE_RUN - || trx_purge_state() == PURGE_STATE_EXIT - || srv_force_recovery >= SRV_FORCE_NO_BACKGROUND); - /* 2. Make all threads created by InnoDB to exit */ srv_shutdown_state = SRV_SHUTDOWN_EXIT_THREADS; @@ -2509,22 +2827,28 @@ innobase_shutdown_for_mysql(void) /* NOTE: IF YOU CREATE THREADS IN INNODB, YOU MUST EXIT THEM HERE OR EARLIER */ - /* a. Let the lock timeout thread exit */ - os_event_set(srv_timeout_event); + if (!srv_read_only_mode) { + /* a. Let the lock timeout thread exit */ + os_event_set(lock_sys->timeout_event); - /* b. srv error monitor thread exits automatically, no need - to do anything here */ + /* b. 
srv error monitor thread exits automatically, + no need to do anything here */ - /* c. We wake the master thread so that it exits */ - srv_wake_master_thread(); + /* c. We wake the master thread so that it exits */ + srv_wake_master_thread(); - /* d. Wakeup purge threads. */ - srv_purge_wakeup(); + /* d. Wakeup purge threads. */ + srv_purge_wakeup(); + } /* e. Exit the i/o threads */ os_aio_wake_all_threads_at_shutdown(); + /* f. dict_stats_thread is signaled from + logs_empty_and_mark_files_at_shutdown() and should have + already quit or is quitting right now. */ + os_mutex_enter(os_sync_mutex); if (os_thread_count == 0) { @@ -2549,9 +2873,9 @@ innobase_shutdown_for_mysql(void) } if (i == 1000) { - fprintf(stderr, - "InnoDB: Warning: %lu threads created by InnoDB" - " had not exited at shutdown!\n", + ib_logf(IB_LOG_LEVEL_WARN, + "%lu threads created by InnoDB" + " had not exited at shutdown!", (ulong) os_thread_count); } @@ -2563,6 +2887,7 @@ innobase_shutdown_for_mysql(void) mem_free(srv_monitor_file_name); } } + if (srv_dict_tmpfile) { fclose(srv_dict_tmpfile); srv_dict_tmpfile = 0; @@ -2573,6 +2898,10 @@ innobase_shutdown_for_mysql(void) srv_misc_tmpfile = 0; } + if (!srv_read_only_mode) { + dict_stats_thread_deinit(); + } + /* This must be disabled before closing the buffer pool and closing the data dictionary. */ btr_search_disable(); @@ -2583,9 +2912,14 @@ innobase_shutdown_for_mysql(void) trx_sys_file_format_close(); trx_sys_close(); - mutex_free(&srv_monitor_file_mutex); - mutex_free(&srv_dict_tmpfile_mutex); - mutex_free(&srv_misc_tmpfile_mutex); + /* We don't create these mutexes in RO mode because we don't create + the temp files that the cover. 
*/ + if (!srv_read_only_mode) { + mutex_free(&srv_monitor_file_mutex); + mutex_free(&srv_dict_tmpfile_mutex); + mutex_free(&srv_misc_tmpfile_mutex); + } + dict_close(); btr_search_sys_free(); @@ -2594,6 +2928,7 @@ innobase_shutdown_for_mysql(void) os_aio_free(); que_close(); row_mysql_close(); + srv_mon_free(); sync_close(); srv_free(); fil_close(); @@ -2618,11 +2953,10 @@ innobase_shutdown_for_mysql(void) || os_event_count != 0 || os_mutex_count != 0 || os_fast_mutex_count != 0) { - fprintf(stderr, - "InnoDB: Warning: some resources were not" - " cleaned up in shutdown:\n" - "InnoDB: threads %lu, events %lu," - " os_mutexes %lu, os_fast_mutexes %lu\n", + ib_logf(IB_LOG_LEVEL_WARN, + "Some resources were not cleaned up in shutdown: " + "threads %lu, events %lu, os_mutexes %lu, " + "os_fast_mutexes %lu", (ulong) os_thread_count, (ulong) os_event_count, (ulong) os_mutex_count, (ulong) os_fast_mutex_count); } @@ -2632,17 +2966,15 @@ innobase_shutdown_for_mysql(void) } if (srv_print_verbose_log) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Shutdown completed;" - " log sequence number " LSN_PF "\n", + ib_logf(IB_LOG_LEVEL_INFO, + "Shutdown completed; log sequence number " LSN_PF "", srv_shutdown_lsn); } srv_was_started = FALSE; srv_start_has_been_called = FALSE; - return((int) DB_SUCCESS); + return(DB_SUCCESS); } #endif /* !UNIV_HOTBACKUP */ @@ -2650,7 +2982,7 @@ innobase_shutdown_for_mysql(void) /******************************************************************** Signal all per-table background threads to shutdown, and wait for them to do so. */ - +UNIV_INTERN void srv_shutdown_table_bg_threads(void) /*===============================*/ @@ -2723,3 +3055,48 @@ srv_shutdown_table_bg_threads(void) table = next; } } + +/*****************************************************************//** +Get the meta-data filename from the table name. 
*/ +UNIV_INTERN +void +srv_get_meta_data_filename( +/*=======================*/ + dict_table_t* table, /*!< in: table */ + char* filename, /*!< out: filename */ + ulint max_len) /*!< in: filename max length */ +{ + ulint len; + char* path; + char* suffix; + static const ulint suffix_len = strlen(".cfg"); + + if (DICT_TF_HAS_DATA_DIR(table->flags)) { + dict_get_and_save_data_dir_path(table, false); + ut_a(table->data_dir_path); + + path = os_file_make_remote_pathname( + table->data_dir_path, table->name, "cfg"); + } else { + path = fil_make_ibd_name(table->name, false); + } + + ut_a(path); + len = ut_strlen(path); + ut_a(max_len >= len); + + suffix = path + (len - suffix_len); + if (strncmp(suffix, ".cfg", suffix_len) == 0) { + strcpy(filename, path); + } else { + ut_ad(strncmp(suffix, ".ibd", suffix_len) == 0); + + strncpy(filename, path, len - suffix_len); + suffix = filename + (len - suffix_len); + strcpy(suffix, ".cfg"); + } + + mem_free(path); + + srv_normalize_path_for_win(filename); +} diff --git a/storage/innobase/sync/sync0arr.cc b/storage/innobase/sync/sync0arr.cc index b90a5f29589..749258021f7 100644 --- a/storage/innobase/sync/sync0arr.cc +++ b/storage/innobase/sync/sync0arr.cc @@ -39,6 +39,7 @@ Created 9/5/1995 Heikki Tuuri #include "sync0rw.h" #include "os0sync.h" #include "os0file.h" +#include "lock0lock.h" #include "srv0srv.h" #include "ha_prototypes.h" @@ -78,11 +79,11 @@ any waiting threads who have missed the signal. */ /** A cell where an individual thread may wait suspended until a resource is released. The suspending is implemented using an operating system event semaphore. 
*/ -struct sync_cell_struct { +struct sync_cell_t { void* wait_object; /*!< pointer to the object the thread is waiting for; if NULL the cell is free for use */ - mutex_t* old_wait_mutex; /*!< the latest wait mutex in cell */ + ib_mutex_t* old_wait_mutex; /*!< the latest wait mutex in cell */ rw_lock_t* old_wait_rw_lock; /*!< the latest wait rw-lock in cell */ @@ -116,15 +117,15 @@ all changes (set or reset) to the state of the event must be made while owning the mutex. */ /** Synchronization array */ -struct sync_array_struct { +struct sync_array_t { ulint n_reserved; /*!< number of currently reserved cells in the wait array */ ulint n_cells; /*!< number of cells in the wait array */ sync_cell_t* array; /*!< pointer to wait array */ - mutex_t mutex; /*!< possible database mutex + ib_mutex_t mutex; /*!< possible database mutex protecting this data structure */ - os_mutex_t os_mutex; /*!< Possible operating system mutex + os_ib_mutex_t os_mutex; /*!< Possible operating system mutex protecting the data structure. 
As this data structure is used in constructing the database mutex, @@ -293,7 +294,7 @@ sync_cell_get_event( ulint type = cell->request_type; if (type == SYNC_MUTEX) { - return(((mutex_t*) cell->wait_object)->event); + return(((ib_mutex_t*) cell->wait_object)->event); } else if (type == RW_LOCK_WAIT_EX) { return(((rw_lock_t*) cell->wait_object)->wait_ex_event); } else { /* RW_LOCK_SHARED and RW_LOCK_EX wait on the same event */ @@ -434,7 +435,7 @@ sync_array_cell_print( FILE* file, /*!< in: file where to print */ sync_cell_t* cell) /*!< in: sync cell */ { - mutex_t* mutex; + ib_mutex_t* mutex; rw_lock_t* rwlock; ulint type; ulint writer; @@ -600,7 +601,7 @@ sync_array_detect_deadlock( sync_cell_t* cell, /*!< in: cell to search */ ulint depth) /*!< in: recursion depth */ { - mutex_t* mutex; + ib_mutex_t* mutex; rw_lock_t* lock; os_thread_id_t thread; ibool ret; @@ -622,7 +623,7 @@ sync_array_detect_deadlock( if (cell->request_type == SYNC_MUTEX) { - mutex = static_cast<mutex_t*>(cell->wait_object); + mutex = static_cast<ib_mutex_t*>(cell->wait_object); if (mutex_get_lock_word(mutex) != 0) { @@ -736,7 +737,7 @@ sync_arr_cell_can_wake_up( /*======================*/ sync_cell_t* cell) /*!< in: cell to search */ { - mutex_t* mutex; + ib_mutex_t* mutex; rw_lock_t* lock; if (cell->request_type == SYNC_MUTEX) { @@ -902,6 +903,11 @@ sync_array_print_long_waits_low( ibool fatal = FALSE; double longest_diff = 0; + /* For huge tables, skip the check during CHECK TABLE etc... */ + if (fatal_timeout > SRV_SEMAPHORE_WAIT_EXTENSION) { + return(FALSE); + } + #ifdef UNIV_DEBUG_VALGRIND /* Increase the timeouts if running under valgrind because it executes extremely slowly. 
UNIV_DEBUG_VALGRIND does not necessary mean that @@ -1000,7 +1006,7 @@ sync_array_print_long_waits( (ulong) os_file_n_pending_pwrites); srv_print_innodb_monitor = TRUE; - os_event_set(srv_timeout_event); + os_event_set(lock_sys->timeout_event); os_thread_sleep(30000000); diff --git a/storage/innobase/sync/sync0rw.cc b/storage/innobase/sync/sync0rw.cc index dc6c510a3ed..823efecaf6b 100644 --- a/storage/innobase/sync/sync0rw.cc +++ b/storage/innobase/sync/sync0rw.cc @@ -57,11 +57,11 @@ lock_word == 0: Write locked (-lock_word) is the number of readers that hold the lock. lock_word <= -X_LOCK_DECR: Recursively write locked. lock_word has been - decremented by X_LOCK_DECR once for each lock, - so the number of locks is: - ((-lock_word) / X_LOCK_DECR) + 1 -When lock_word <= -X_LOCK_DECR, we also know that lock_word % X_LOCK_DECR == 0: -other values of lock_word are invalid. + decremented by X_LOCK_DECR for the first lock + and the first recursive lock, then by 1 for + each recursive lock thereafter. + So the number of locks is: + (lock_copy == 0) ? 1 : 2 - (lock_copy + X_LOCK_DECR) The lock_word is always read and updated atomically and consistently, so that it always represents the state of the lock, and the state of the lock changes @@ -124,50 +124,21 @@ wait_ex_event: A thread may only wait on the wait_ex_event after it has performed the following actions in order: (1) Decrement lock_word by X_LOCK_DECR. (2) Record counter value of wait_ex_event (os_event_reset, - called from sync_array_reserve_cell). + called from sync_array_reserve_cell). (3) Verify that lock_word < 0. (1) must come first to ensures no other threads become reader - or next writer, and notifies unlocker that signal must be sent. - (2) must come before (3) to ensure the signal is not missed. + or next writer, and notifies unlocker that signal must be sent. + (2) must come before (3) to ensure the signal is not missed. These restrictions force the above ordering. 
Immediately before sending the wake-up signal, we should: Verify lock_word == 0 (waiting thread holds x_lock) */ - -/** number of spin waits on rw-latches, -resulted during shared (read) locks */ -UNIV_INTERN ib_int64_t rw_s_spin_wait_count = 0; -/** number of spin loop rounds on rw-latches, -resulted during shared (read) locks */ -UNIV_INTERN ib_int64_t rw_s_spin_round_count = 0; - -/** number of OS waits on rw-latches, -resulted during shared (read) locks */ -UNIV_INTERN ib_int64_t rw_s_os_wait_count = 0; - -/** number of unlocks (that unlock shared locks), -set only when UNIV_SYNC_PERF_STAT is defined */ -UNIV_INTERN ib_int64_t rw_s_exit_count = 0; - -/** number of spin waits on rw-latches, -resulted during exclusive (write) locks */ -UNIV_INTERN ib_int64_t rw_x_spin_wait_count = 0; -/** number of spin loop rounds on rw-latches, -resulted during exclusive (write) locks */ -UNIV_INTERN ib_int64_t rw_x_spin_round_count = 0; - -/** number of OS waits on rw-latches, -resulted during exclusive (write) locks */ -UNIV_INTERN ib_int64_t rw_x_os_wait_count = 0; - -/** number of unlocks (that unlock exclusive locks), -set only when UNIV_SYNC_PERF_STAT is defined */ -UNIV_INTERN ib_int64_t rw_x_exit_count = 0; +UNIV_INTERN rw_lock_stats_t rw_lock_stats; /* The global list of rw-locks */ UNIV_INTERN rw_lock_list_t rw_lock_list; -UNIV_INTERN mutex_t rw_lock_list_mutex; +UNIV_INTERN ib_mutex_t rw_lock_list_mutex; #ifdef UNIV_PFS_MUTEX UNIV_INTERN mysql_pfs_key_t rw_lock_list_mutex_key; @@ -179,7 +150,7 @@ UNIV_INTERN mysql_pfs_key_t rw_lock_mutex_key; To modify the debug info list of an rw-lock, this mutex has to be acquired in addition to the mutex protecting the lock. 
*/ -UNIV_INTERN mutex_t rw_lock_debug_mutex; +UNIV_INTERN ib_mutex_t rw_lock_debug_mutex; # ifdef UNIV_PFS_MUTEX UNIV_INTERN mysql_pfs_key_t rw_lock_debug_mutex_key; @@ -258,7 +229,7 @@ rw_lock_create_func( lock->mutex.cline = cline; ut_d(lock->mutex.cmutex_name = cmutex_name); - ut_d(lock->mutex.mutex_type = 1); + ut_d(lock->mutex.ib_mutex_type = 1); #else /* INNODB_RW_LOCKS_USE_ATOMICS */ # ifdef UNIV_DEBUG UT_NOT_USED(cmutex_name); @@ -292,8 +263,8 @@ rw_lock_create_func( lock->last_x_file_name = "not yet reserved"; lock->last_s_line = 0; lock->last_x_line = 0; - lock->event = os_event_create(NULL); - lock->wait_ex_event = os_event_create(NULL); + lock->event = os_event_create(); + lock->wait_ex_event = os_event_create(); mutex_enter(&rw_lock_list_mutex); @@ -316,7 +287,7 @@ rw_lock_free_func( rw_lock_t* lock) /*!< in: rw-lock */ { #ifndef INNODB_RW_LOCKS_USE_ATOMICS - mutex_t* mutex; + ib_mutex_t* mutex; #endif /* !INNODB_RW_LOCKS_USE_ATOMICS */ ut_ad(rw_lock_validate(lock)); @@ -364,14 +335,15 @@ rw_lock_validate( ulint waiters; lint lock_word; - ut_a(lock); + ut_ad(lock); waiters = rw_lock_get_waiters(lock); lock_word = lock->lock_word; ut_ad(lock->magic_n == RW_LOCK_MAGIC_N); - ut_a(waiters == 0 || waiters == 1); - ut_a(lock_word > -X_LOCK_DECR ||(-lock_word) % X_LOCK_DECR == 0); + ut_ad(waiters == 0 || waiters == 1); + ut_ad(lock_word > -(2 * X_LOCK_DECR)); + ut_ad(lock_word <= X_LOCK_DECR); return(TRUE); } @@ -395,10 +367,16 @@ rw_lock_s_lock_spin( ulint index; /* index of the reserved wait cell */ ulint i = 0; /* spin round count */ sync_array_t* sync_arr; + size_t counter_index; + + /* We reuse the thread id to index into the counter, cache + it here for efficiency. 
*/ + + counter_index = (size_t) os_thread_get_curr_id(); ut_ad(rw_lock_validate(lock)); - rw_s_spin_wait_count++; /*!< Count calls to this function */ + rw_lock_stats.rw_s_spin_wait_count.add(counter_index, 1); lock_loop: /* Spin waiting for the writer field to become free */ @@ -414,19 +392,9 @@ lock_loop: os_thread_yield(); } - if (srv_print_latch_waits) { - fprintf(stderr, - "Thread %lu spin wait rw-s-lock at %p" - " cfile %s cline %lu rnds %lu\n", - (ulong) os_thread_pf(os_thread_get_curr_id()), - (void*) lock, - innobase_basename(lock->cfile_name), - (ulong) lock->cline, (ulong) i); - } - /* We try once again to obtain the lock */ if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) { - rw_s_spin_round_count += i; + rw_lock_stats.rw_s_spin_round_count.add(counter_index, i); return; /* Success */ } else { @@ -435,7 +403,7 @@ lock_loop: goto lock_loop; } - rw_s_spin_round_count += i; + rw_lock_stats.rw_s_spin_round_count.add(counter_index, i); sync_arr = sync_array_get(); @@ -444,7 +412,7 @@ lock_loop: file_name, line, &index); /* Set waiters before checking lock_word to ensure wake-up - signal is sent. This may lead to some unnecessary signals. */ + signal is sent. This may lead to some unnecessary signals. 
*/ rw_lock_set_waiter_flag(lock); if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) { @@ -452,19 +420,9 @@ lock_loop: return; /* Success */ } - if (srv_print_latch_waits) { - fprintf(stderr, - "Thread %lu OS wait rw-s-lock at %p" - " cfile %s cline %lu\n", - os_thread_pf(os_thread_get_curr_id()), - (void*) lock, - innobase_basename(lock->cfile_name), - (ulong) lock->cline); - } - /* these stats may not be accurate */ lock->count_os_wait++; - rw_s_os_wait_count++; + rw_lock_stats.rw_s_os_wait_count.add(counter_index, 1); sync_array_wait_event(sync_arr, index); @@ -511,6 +469,12 @@ rw_lock_x_lock_wait( ulint index; ulint i = 0; sync_array_t* sync_arr; + size_t counter_index; + + /* We reuse the thread id to index into the counter, cache + it here for efficiency. */ + + counter_index = (size_t) os_thread_get_curr_id(); ut_ad(lock->lock_word <= 0); @@ -524,7 +488,7 @@ rw_lock_x_lock_wait( } /* If there is still a reader, then go to sleep.*/ - rw_x_spin_round_count += i; + rw_lock_stats.rw_x_spin_round_count.add(counter_index, i); sync_arr = sync_array_get(); @@ -539,11 +503,11 @@ rw_lock_x_lock_wait( /* these stats may not be accurate */ lock->count_os_wait++; - rw_x_os_wait_count++; + rw_lock_stats.rw_x_os_wait_count.add(counter_index, 1); - /* Add debug info as it is needed to detect possible - deadlock. We must add info for WAIT_EX thread for - deadlock detection to work properly. */ + /* Add debug info as it is needed to detect possible + deadlock. We must add info for WAIT_EX thread for + deadlock detection to work properly. */ #ifdef UNIV_SYNC_DEBUG rw_lock_add_debug_info(lock, pass, RW_LOCK_WAIT_EX, file_name, line); @@ -551,16 +515,16 @@ rw_lock_x_lock_wait( sync_array_wait_event(sync_arr, index); #ifdef UNIV_SYNC_DEBUG - rw_lock_remove_debug_info(lock, pass, - RW_LOCK_WAIT_EX); + rw_lock_remove_debug_info( + lock, pass, RW_LOCK_WAIT_EX); #endif - /* It is possible to wake when lock_word < 0. 
- We must pass the while-loop check to proceed.*/ + /* It is possible to wake when lock_word < 0. + We must pass the while-loop check to proceed.*/ } else { sync_array_free_cell(sync_arr, index); } } - rw_x_spin_round_count += i; + rw_lock_stats.rw_x_spin_round_count.add(counter_index, i); } /******************************************************************//** @@ -576,8 +540,6 @@ rw_lock_x_lock_low( const char* file_name,/*!< in: file name where lock requested */ ulint line) /*!< in: line where requested */ { - os_thread_id_t curr_thread = os_thread_get_curr_id(); - if (rw_lock_lock_word_decr(lock, X_LOCK_DECR)) { /* lock->recursive also tells us if the writer_thread @@ -587,8 +549,8 @@ rw_lock_x_lock_low( ut_a(!lock->recursive); /* Decrement occurred: we are writer or next-writer. */ - rw_lock_set_writer_id_and_recursion_flag(lock, - pass ? FALSE : TRUE); + rw_lock_set_writer_id_and_recursion_flag( + lock, pass ? FALSE : TRUE); rw_lock_x_lock_wait(lock, #ifdef UNIV_SYNC_DEBUG @@ -597,19 +559,25 @@ rw_lock_x_lock_low( file_name, line); } else { + os_thread_id_t thread_id = os_thread_get_curr_id(); + /* Decrement failed: relock or failed lock */ if (!pass && lock->recursive - && os_thread_eq(lock->writer_thread, curr_thread)) { + && os_thread_eq(lock->writer_thread, thread_id)) { /* Relock */ - lock->lock_word -= X_LOCK_DECR; + if (lock->lock_word == 0) { + lock->lock_word -= X_LOCK_DECR; + } else { + --lock->lock_word; + } + } else { /* Another thread locked before us */ return(FALSE); } } #ifdef UNIV_SYNC_DEBUG - rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, - file_name, line); + rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, file_name, line); #endif lock->last_x_file_name = file_name; lock->last_x_line = (unsigned int) line; @@ -640,6 +608,12 @@ rw_lock_x_lock_func( ulint index; /*!< index of the reserved wait cell */ sync_array_t* sync_arr; ibool spinning = FALSE; + size_t counter_index; + + /* We reuse the thread id to index into the counter, cache + it here 
for efficiency. */ + + counter_index = (size_t) os_thread_get_curr_id(); ut_ad(rw_lock_validate(lock)); #ifdef UNIV_SYNC_DEBUG @@ -651,15 +625,17 @@ rw_lock_x_lock_func( lock_loop: if (rw_lock_x_lock_low(lock, pass, file_name, line)) { - rw_x_spin_round_count += i; + rw_lock_stats.rw_x_spin_round_count.add(counter_index, i); return; /* Locking succeeded */ } else { - if (!spinning) { - spinning = TRUE; - rw_x_spin_wait_count++; + if (!spinning) { + spinning = TRUE; + + rw_lock_stats.rw_x_spin_wait_count.add( + counter_index, 1); } /* Spin waiting for the lock_word to become free */ @@ -679,16 +655,7 @@ lock_loop: } } - rw_x_spin_round_count += i; - - if (srv_print_latch_waits) { - fprintf(stderr, - "Thread %lu spin wait rw-x-lock at %p" - " cfile %s cline %lu rnds %lu\n", - os_thread_pf(os_thread_get_curr_id()), (void*) lock, - innobase_basename(lock->cfile_name), - (ulong) lock->cline, (ulong) i); - } + rw_lock_stats.rw_x_spin_round_count.add(counter_index, i); sync_arr = sync_array_get(); @@ -704,18 +671,9 @@ lock_loop: return; /* Locking succeeded */ } - if (srv_print_latch_waits) { - fprintf(stderr, - "Thread %lu OS wait for rw-x-lock at %p" - " cfile %s cline %lu\n", - os_thread_pf(os_thread_get_curr_id()), (void*) lock, - innobase_basename(lock->cfile_name), - (ulong) lock->cline); - } - /* these stats may not be accurate */ lock->count_os_wait++; - rw_x_os_wait_count++; + rw_lock_stats.rw_x_os_wait_count.add(counter_index, 1); sync_array_wait_event(sync_arr, index); diff --git a/storage/innobase/sync/sync0sync.cc b/storage/innobase/sync/sync0sync.cc index af64d011db2..d6f7325e2a3 100644 --- a/storage/innobase/sync/sync0sync.cc +++ b/storage/innobase/sync/sync0sync.cc @@ -171,25 +171,25 @@ Q.E.D. */ /** The number of iterations in the mutex_spin_wait() spin loop. Intended for performance monitoring. */ -static ib_int64_t mutex_spin_round_count = 0; +static ib_counter_t<ib_int64_t, IB_N_SLOTS> mutex_spin_round_count; /** The number of mutex_spin_wait() calls. 
Intended for performance monitoring. */ -static ib_int64_t mutex_spin_wait_count = 0; +static ib_counter_t<ib_int64_t, IB_N_SLOTS> mutex_spin_wait_count; /** The number of OS waits in mutex_spin_wait(). Intended for performance monitoring. */ -static ib_int64_t mutex_os_wait_count = 0; +static ib_counter_t<ib_int64_t, IB_N_SLOTS> mutex_os_wait_count; /** The number of mutex_exit() calls. Intended for performance monitoring. */ -UNIV_INTERN ib_int64_t mutex_exit_count = 0; +UNIV_INTERN ib_int64_t mutex_exit_count; /** This variable is set to TRUE when sync_init is called */ UNIV_INTERN ibool sync_initialized = FALSE; #ifdef UNIV_SYNC_DEBUG /** An acquired mutex or rw-lock and its level in the latching order */ -typedef struct sync_level_struct sync_level_t; +struct sync_level_t; /** Mutexes or rw-locks held by a thread */ -typedef struct sync_thread_struct sync_thread_t; +struct sync_thread_t; /** The latch levels currently owned by threads are stored in this data structure; the size of this array is OS_THREAD_MAX_N */ @@ -197,7 +197,7 @@ structure; the size of this array is OS_THREAD_MAX_N */ UNIV_INTERN sync_thread_t* sync_thread_level_arrays; /** Mutex protecting sync_thread_level_arrays */ -UNIV_INTERN mutex_t sync_thread_mutex; +UNIV_INTERN ib_mutex_t sync_thread_mutex; # ifdef UNIV_PFS_MUTEX UNIV_INTERN mysql_pfs_key_t sync_thread_mutex_key; @@ -208,7 +208,7 @@ UNIV_INTERN mysql_pfs_key_t sync_thread_mutex_key; UNIV_INTERN ut_list_base_node_t mutex_list; /** Mutex protecting the mutex_list variable */ -UNIV_INTERN mutex_t mutex_list_mutex; +UNIV_INTERN ib_mutex_t mutex_list_mutex; #ifdef UNIV_PFS_MUTEX UNIV_INTERN mysql_pfs_key_t mutex_list_mutex_key; @@ -221,10 +221,8 @@ UNIV_INTERN ibool sync_order_checks_on = FALSE; /** Number of slots reserved for each OS thread in the sync level array */ static const ulint SYNC_THREAD_N_LEVELS = 10000; -typedef struct sync_arr_struct sync_arr_t; - /** Array for tracking sync levels per thread. 
*/ -struct sync_arr_struct { +struct sync_arr_t { ulint in_use; /*!< Number of active cells */ ulint n_elems; /*!< Number of elements in the array */ ulint max_elems; /*!< Maximum elements */ @@ -234,14 +232,14 @@ struct sync_arr_struct { }; /** Mutexes or rw-locks held by a thread */ -struct sync_thread_struct{ +struct sync_thread_t{ os_thread_id_t id; /*!< OS thread id */ sync_arr_t* levels; /*!< level array for this thread; if this is NULL this slot is unused */ }; /** An acquired mutex or rw-lock and its level in the latching order */ -struct sync_level_struct{ +struct sync_level_t{ void* latch; /*!< pointer to a mutex or an rw-lock; NULL means that the slot is empty */ @@ -264,7 +262,7 @@ UNIV_INTERN void mutex_create_func( /*==============*/ - mutex_t* mutex, /*!< in: pointer to memory */ + ib_mutex_t* mutex, /*!< in: pointer to memory */ #ifdef UNIV_DEBUG const char* cmutex_name, /*!< in: mutex name */ # ifdef UNIV_SYNC_DEBUG @@ -280,7 +278,7 @@ mutex_create_func( os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mutex->os_fast_mutex); mutex->lock_word = 0; #endif - mutex->event = os_event_create(NULL); + mutex->event = os_event_create(); mutex_set_waiters(mutex, 0); #ifdef UNIV_DEBUG mutex->magic_n = MUTEX_MAGIC_N; @@ -293,16 +291,6 @@ mutex_create_func( mutex->cfile_name = cfile_name; mutex->cline = cline; mutex->count_os_wait = 0; -#ifdef UNIV_DEBUG - mutex->cmutex_name= cmutex_name; - mutex->count_using= 0; - mutex->mutex_type= 0; - mutex->lspent_time= 0; - mutex->lmax_spent_time= 0; - mutex->count_spin_loop= 0; - mutex->count_spin_rounds= 0; - mutex->count_os_yield= 0; -#endif /* UNIV_DEBUG */ /* Check that lock_word is aligned; this is important on Intel */ ut_ad(((ulint)(&(mutex->lock_word))) % 4 == 0); @@ -337,7 +325,7 @@ UNIV_INTERN void mutex_free_func( /*============*/ - mutex_t* mutex) /*!< in: mutex */ + ib_mutex_t* mutex) /*!< in: mutex */ { ut_ad(mutex_validate(mutex)); ut_a(mutex_get_lock_word(mutex) == 0); @@ -397,7 +385,7 @@ UNIV_INTERN ulint 
mutex_enter_nowait_func( /*====================*/ - mutex_t* mutex, /*!< in: pointer to mutex */ + ib_mutex_t* mutex, /*!< in: pointer to mutex */ const char* file_name __attribute__((unused)), /*!< in: file name where mutex requested */ @@ -406,7 +394,7 @@ mutex_enter_nowait_func( { ut_ad(mutex_validate(mutex)); - if (!mutex_test_and_set(mutex)) { + if (!ib_mutex_test_and_set(mutex)) { ut_d(mutex->thread_id = os_thread_get_curr_id()); #ifdef UNIV_SYNC_DEBUG @@ -427,7 +415,7 @@ UNIV_INTERN ibool mutex_validate( /*===========*/ - const mutex_t* mutex) /*!< in: mutex */ + const ib_mutex_t* mutex) /*!< in: mutex */ { ut_a(mutex); ut_a(mutex->magic_n == MUTEX_MAGIC_N); @@ -443,7 +431,7 @@ UNIV_INTERN ibool mutex_own( /*======*/ - const mutex_t* mutex) /*!< in: mutex */ + const ib_mutex_t* mutex) /*!< in: mutex */ { ut_ad(mutex_validate(mutex)); @@ -458,7 +446,7 @@ UNIV_INTERN void mutex_set_waiters( /*==============*/ - mutex_t* mutex, /*!< in: mutex */ + ib_mutex_t* mutex, /*!< in: mutex */ ulint n) /*!< in: value to set */ { volatile ulint* ptr; /* declared volatile to ensure that @@ -479,7 +467,7 @@ UNIV_INTERN void mutex_spin_wait( /*============*/ - mutex_t* mutex, /*!< in: pointer to mutex */ + ib_mutex_t* mutex, /*!< in: pointer to mutex */ const char* file_name, /*!< in: file name where mutex requested */ ulint line) /*!< in: line where requested */ @@ -487,6 +475,9 @@ mutex_spin_wait( ulint i; /* spin round count */ ulint index; /* index of the reserved wait cell */ sync_array_t* sync_arr; + size_t counter_index; + + counter_index = (size_t) os_thread_get_curr_id(); ut_ad(mutex); @@ -494,7 +485,7 @@ mutex_spin_wait( isn't exact. Moved out of ifdef that follows because we are willing to sacrifice the cost of counting this as the data is valuable. Count the number of calls to mutex_spin_wait. */ - mutex_spin_wait_count++; + mutex_spin_wait_count.add(counter_index, 1); mutex_loop: @@ -507,7 +498,6 @@ mutex_loop: a memory word. 
*/ spin_loop: - ut_d(mutex->count_spin_loop++); while (mutex_get_lock_word(mutex) != 0 && i < SYNC_SPIN_ROUNDS) { if (srv_spin_wait_delay) { @@ -518,26 +508,12 @@ spin_loop: } if (i == SYNC_SPIN_ROUNDS) { -#ifdef UNIV_DEBUG - mutex->count_os_yield++; -#endif /* UNIV_DEBUG */ os_thread_yield(); } -#ifdef UNIV_SRV_PRINT_LATCH_WAITS - fprintf(stderr, - "Thread %lu spin wait mutex at %p" - " cfile %s cline %lu rnds %lu\n", - (ulong) os_thread_pf(os_thread_get_curr_id()), (void*) mutex, - innobase_basename(mutex->cfile_name), - (ulong) mutex->cline, (ulong) i); -#endif - - mutex_spin_round_count += i; - - ut_d(mutex->count_spin_rounds += i); + mutex_spin_round_count.add(counter_index, i); - if (mutex_test_and_set(mutex) == 0) { + if (ib_mutex_test_and_set(mutex) == 0) { /* Succeeded! */ ut_d(mutex->thread_id = os_thread_get_curr_id()); @@ -550,7 +526,7 @@ spin_loop: /* We may end up with a situation where lock_word is 0 but the OS fast mutex is still reserved. On FreeBSD the OS does not seem to schedule a thread which is constantly calling pthread_mutex_trylock - (in mutex_test_and_set implementation). Then we could end up + (in ib_mutex_test_and_set implementation). Then we could end up spinning here indefinitely. The following 'i++' stops this infinite spin. */ @@ -575,7 +551,7 @@ spin_loop: /* Try to reserve still a few times */ for (i = 0; i < 4; i++) { - if (mutex_test_and_set(mutex) == 0) { + if (ib_mutex_test_and_set(mutex) == 0) { /* Succeeded! Free the reserved wait cell */ sync_array_free_cell(sync_arr, index); @@ -585,13 +561,6 @@ spin_loop: mutex_set_debug_info(mutex, file_name, line); #endif -#ifdef UNIV_SRV_PRINT_LATCH_WAITS - fprintf(stderr, "Thread %lu spin wait succeeds at 2:" - " mutex at %p\n", - (ulong) os_thread_pf(os_thread_get_curr_id()), - (void*) mutex); -#endif - return; /* Note that in this case we leave the waiters field @@ -604,19 +573,12 @@ spin_loop: after the change in the wait array and the waiters field was made. 
Now there is no risk of infinite wait on the event. */ -#ifdef UNIV_SRV_PRINT_LATCH_WAITS - fprintf(stderr, - "Thread %lu OS wait mutex at %p cfile %s cline %lu rnds %lu\n", - (ulong) os_thread_pf(os_thread_get_curr_id()), (void*) mutex, - innobase_basename(mutex->cfile_name), - (ulong) mutex->cline, (ulong) i); -#endif - - mutex_os_wait_count++; + mutex_os_wait_count.add(counter_index, 1); mutex->count_os_wait++; sync_array_wait_event(sync_arr, index); + goto mutex_loop; } @@ -626,7 +588,7 @@ UNIV_INTERN void mutex_signal_object( /*================*/ - mutex_t* mutex) /*!< in: mutex */ + ib_mutex_t* mutex) /*!< in: mutex */ { mutex_set_waiters(mutex, 0); @@ -643,7 +605,7 @@ UNIV_INTERN void mutex_set_debug_info( /*=================*/ - mutex_t* mutex, /*!< in: mutex */ + ib_mutex_t* mutex, /*!< in: mutex */ const char* file_name, /*!< in: file where requested */ ulint line) /*!< in: line where requested */ { @@ -662,7 +624,7 @@ UNIV_INTERN void mutex_get_debug_info( /*=================*/ - mutex_t* mutex, /*!< in: mutex */ + ib_mutex_t* mutex, /*!< in: mutex */ const char** file_name, /*!< out: file where requested */ ulint* line, /*!< out: line where requested */ os_thread_id_t* thread_id) /*!< out: id of the thread which owns @@ -683,7 +645,7 @@ mutex_list_print_info( /*==================*/ FILE* file) /*!< in: file where to print */ { - mutex_t* mutex; + ib_mutex_t* mutex; const char* file_name; ulint line; os_thread_id_t thread_id; @@ -726,7 +688,7 @@ ulint mutex_n_reserved(void) /*==================*/ { - mutex_t* mutex; + ib_mutex_t* mutex; ulint count = 0; mutex_enter(&mutex_list_mutex); @@ -825,9 +787,9 @@ sync_print_warning( const sync_level_t* slot) /*!< in: slot for which to print warning */ { - mutex_t* mutex; + ib_mutex_t* mutex; - mutex = static_cast<mutex_t*>(slot->latch); + mutex = static_cast<ib_mutex_t*>(slot->latch); if (mutex->magic_n == MUTEX_MAGIC_N) { fprintf(stderr, @@ -1200,6 +1162,8 @@ sync_thread_add_level( case SYNC_TRX_I_S_RWLOCK: case 
SYNC_TRX_I_S_LAST_READ: case SYNC_IBUF_MUTEX: + case SYNC_INDEX_ONLINE_LOG: + case SYNC_STATS_AUTO_RECALC: if (!sync_thread_levels_g(array, level, TRUE)) { fprintf(stderr, "InnoDB: sync_thread_levels_g(array, %lu)" @@ -1448,7 +1412,7 @@ sync_thread_reset_level( return(TRUE); } - if (((mutex_t*) latch)->magic_n != MUTEX_MAGIC_N) { + if (((ib_mutex_t*) latch)->magic_n != MUTEX_MAGIC_N) { rw_lock_t* rw_lock; rw_lock = (rw_lock_t*) latch; @@ -1511,7 +1475,7 @@ sync_init(void) mutex_create(rw_lock_debug_mutex_key, &rw_lock_debug_mutex, SYNC_NO_ORDER_CHECK); - rw_lock_debug_event = os_event_create(NULL); + rw_lock_debug_event = os_event_create(); rw_lock_debug_waiters = FALSE; #endif /* UNIV_SYNC_DEBUG */ } @@ -1552,7 +1516,7 @@ void sync_close(void) /*===========*/ { - mutex_t* mutex; + ib_mutex_t* mutex; sync_array_close(); @@ -1569,7 +1533,7 @@ sync_close(void) mutex_free(mutex); - mutex = UT_LIST_GET_FIRST(mutex_list); + mutex = UT_LIST_GET_FIRST(mutex_list); } mutex_free(&mutex_list_mutex); @@ -1593,13 +1557,6 @@ sync_print_wait_info( /*=================*/ FILE* file) /*!< in: file where to print */ { -#ifdef UNIV_SYNC_DEBUG - fprintf(file, - "Mutex exits "UINT64PF", " - "rws exits "UINT64PF", rwx exits "UINT64PF"\n", - mutex_exit_count, rw_s_exit_count, rw_x_exit_count); -#endif - fprintf(file, "Mutex spin waits "UINT64PF", rounds "UINT64PF", " "OS waits "UINT64PF"\n" @@ -1607,25 +1564,27 @@ sync_print_wait_info( "OS waits "UINT64PF"\n" "RW-excl spins "UINT64PF", rounds "UINT64PF", " "OS waits "UINT64PF"\n", - mutex_spin_wait_count, - mutex_spin_round_count, - mutex_os_wait_count, - rw_s_spin_wait_count, - rw_s_spin_round_count, - rw_s_os_wait_count, - rw_x_spin_wait_count, - rw_x_spin_round_count, - rw_x_os_wait_count); + (ib_uint64_t) mutex_spin_wait_count, + (ib_uint64_t) mutex_spin_round_count, + (ib_uint64_t) mutex_os_wait_count, + (ib_uint64_t) rw_lock_stats.rw_s_spin_wait_count, + (ib_uint64_t) rw_lock_stats.rw_s_spin_round_count, + (ib_uint64_t) 
rw_lock_stats.rw_s_os_wait_count, + (ib_uint64_t) rw_lock_stats.rw_x_spin_wait_count, + (ib_uint64_t) rw_lock_stats.rw_x_spin_round_count, + (ib_uint64_t) rw_lock_stats.rw_x_os_wait_count); fprintf(file, "Spin rounds per wait: %.2f mutex, %.2f RW-shared, " "%.2f RW-excl\n", (double) mutex_spin_round_count / (mutex_spin_wait_count ? mutex_spin_wait_count : 1), - (double) rw_s_spin_round_count / - (rw_s_spin_wait_count ? rw_s_spin_wait_count : 1), - (double) rw_x_spin_round_count / - (rw_x_spin_wait_count ? rw_x_spin_wait_count : 1)); + (double) rw_lock_stats.rw_s_spin_round_count / + (rw_lock_stats.rw_s_spin_wait_count + ? rw_lock_stats.rw_s_spin_wait_count : 1), + (double) rw_lock_stats.rw_x_spin_round_count / + (rw_lock_stats.rw_x_spin_wait_count + ? rw_lock_stats.rw_x_spin_wait_count : 1)); } /*******************************************************************//** diff --git a/storage/innobase/trx/trx0i_s.cc b/storage/innobase/trx/trx0i_s.cc index cbf90afae0d..f6360562ae7 100644 --- a/storage/innobase/trx/trx0i_s.cc +++ b/storage/innobase/trx/trx0i_s.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2007, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -131,25 +131,25 @@ noop because it will be empty. */ /** Memory for each table in the intermediate buffer is allocated in separate chunks. These chunks are considered to be concatenated to represent one flat array of rows. */ -typedef struct i_s_mem_chunk_struct { +struct i_s_mem_chunk_t { ulint offset; /*!< offset, in number of rows */ ulint rows_allocd; /*!< the size of this chunk, in number of rows */ void* base; /*!< start of the chunk */ -} i_s_mem_chunk_t; +}; /** This represents one table's cache. 
*/ -typedef struct i_s_table_cache_struct { +struct i_s_table_cache_t { ulint rows_used; /*!< number of used rows */ ulint rows_allocd; /*!< number of allocated rows */ ulint row_size; /*!< size of a single row */ i_s_mem_chunk_t chunks[MEM_CHUNKS_IN_TABLE_CACHE]; /*!< array of memory chunks that stores the rows */ -} i_s_table_cache_t; +}; /** This structure describes the intermediate buffer */ -struct trx_i_s_cache_struct { +struct trx_i_s_cache_t { rw_lock_t rw_lock; /*!< read-write lock protecting the rest of this structure */ ullint last_read; /*!< last time the cache was read; @@ -501,8 +501,7 @@ fill_trx_row( goto thd_done; } - row->trx_mysql_thread_id = thd_get_thread_id( - static_cast<const THD*>(trx->mysql_thd)); + row->trx_mysql_thread_id = thd_get_thread_id(trx->mysql_thd); stmt = innobase_get_stmt(trx->mysql_thd, &stmt_len); @@ -1290,7 +1289,10 @@ fetch_data_into_cache_low( for (trx = UT_LIST_GET_FIRST(*trx_list); trx != NULL; - trx = UT_LIST_GET_NEXT(trx_list, trx)) { + trx = + (trx_list == &trx_sys->mysql_trx_list + ? UT_LIST_GET_NEXT(mysql_trx_list, trx) + : UT_LIST_GET_NEXT(trx_list, trx))) { i_s_trx_row_t* trx_row; i_s_locks_row_t* requested_lock_row; diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc index 62c140879aa..f6d8dfc6b40 100644 --- a/storage/innobase/trx/trx0purge.cc +++ b/storage/innobase/trx/trx0purge.cc @@ -69,19 +69,9 @@ UNIV_INTERN mysql_pfs_key_t trx_purge_latch_key; UNIV_INTERN mysql_pfs_key_t purge_sys_bh_mutex_key; #endif /* UNIV_PFS_MUTEX */ -/********************************************************************//** -Fetches the next undo log record from the history list to purge. It must be -released with the corresponding release function. 
-@return copy of an undo log record or pointer to trx_purge_dummy_rec, -if the whole undo log can skipped in purge; NULL if none left */ -static -trx_undo_rec_t* -trx_purge_fetch_next_rec( -/*=====================*/ - roll_ptr_t* roll_ptr, /*!< out: roll pointer to undo record */ - ulint* n_pages_handled,/*!< in/out: number of UNDO log pages - handled */ - mem_heap_t* heap); /*!< in: memory heap where copied */ +#ifdef UNIV_DEBUG +UNIV_INTERN my_bool srv_purge_view_update_only_debug; +#endif /* UNIV_DEBUG */ /****************************************************************//** Builds a purge 'query' graph. The actual purge is performed by executing @@ -129,7 +119,7 @@ trx_purge_sys_create( purge_sys = static_cast<trx_purge_t*>(mem_zalloc(sizeof(*purge_sys))); purge_sys->state = PURGE_STATE_INIT; - purge_sys->event = os_event_create("purge"); + purge_sys->event = os_event_create(); /* Take ownership of ib_bh, we are responsible for freeing it. */ purge_sys->ib_bh = ib_bh; @@ -539,7 +529,6 @@ trx_purge_truncate_history( } } - /***********************************************************************//** Updates the last not yet purged history log info in rseg when we have purged a whole undo log. Advances also purge_sys->purge_trx_no past the purged log. */ @@ -703,7 +692,7 @@ trx_purge_get_rseg_with_min_trx_id( /* We assume in purge of externally stored fields that space id is in the range of UNDO tablespace space ids */ - ut_a(purge_sys->rseg->space <= srv_undo_tablespaces); + ut_a(purge_sys->rseg->space <= srv_undo_tablespaces_open); zip_size = purge_sys->rseg->zip_size; @@ -924,7 +913,7 @@ Fetches the next undo log record from the history list to purge. It must be released with the corresponding release function. 
@return copy of an undo log record or pointer to trx_purge_dummy_rec, if the whole undo log can skipped in purge; NULL if none left */ -static +static __attribute__((warn_unused_result, nonnull)) trx_undo_rec_t* trx_purge_fetch_next_rec( /*=====================*/ @@ -1215,6 +1204,12 @@ trx_purge( rw_lock_x_unlock(&purge_sys->latch); +#ifdef UNIV_DEBUG + if (srv_purge_view_update_only_debug) { + return(0); + } +#endif + /* Fetch the UNDO recs that need to be purged. */ n_pages_handled = trx_purge_attach_undo_recs( n_purge_threads, purge_sys, &purge_sys->limit, batch_size); @@ -1260,6 +1255,14 @@ run_synchronously: ut_a(purge_sys->n_submitted == purge_sys->n_completed); +#ifdef UNIV_DEBUG + if (purge_sys->limit.trx_no == 0) { + purge_sys->done = purge_sys->iter; + } else { + purge_sys->done = purge_sys->limit; + } +#endif /* UNIV_DEBUG */ + if (truncate) { trx_purge_truncate(); } @@ -1305,14 +1308,14 @@ trx_purge_stop(void) ut_a(purge_sys->state != PURGE_STATE_INIT); ut_a(purge_sys->state != PURGE_STATE_EXIT); + ut_a(purge_sys->state != PURGE_STATE_DISABLED); ++purge_sys->n_stop; state = purge_sys->state; if (state == PURGE_STATE_RUN) { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Stopping purge.\n"); + ib_logf(IB_LOG_LEVEL_INFO, "Stopping purge"); /* We need to wakeup the purge thread in case it is suspended, so that it can acknowledge the state change. */ @@ -1329,6 +1332,28 @@ trx_purge_stop(void) /* Wait for purge coordinator to signal that it is suspended. */ os_event_wait_low(purge_sys->event, sig_count); + } else { + bool once = true; + + rw_lock_x_lock(&purge_sys->latch); + + /* Wait for purge to signal that it has actually stopped. 
*/ + while (purge_sys->running) { + + if (once) { + ib_logf(IB_LOG_LEVEL_INFO, + "Waiting for purge to stop"); + once = false; + } + + rw_lock_x_unlock(&purge_sys->latch); + + os_thread_sleep(10000); + + rw_lock_x_lock(&purge_sys->latch); + } + + rw_lock_x_unlock(&purge_sys->latch); } MONITOR_INC_VALUE(MONITOR_PURGE_STOP_COUNT, 1); @@ -1343,8 +1368,16 @@ trx_purge_run(void) { rw_lock_x_lock(&purge_sys->latch); - ut_a(purge_sys->state != PURGE_STATE_INIT); - ut_a(purge_sys->state != PURGE_STATE_EXIT); + switch(purge_sys->state) { + case PURGE_STATE_INIT: + case PURGE_STATE_EXIT: + case PURGE_STATE_DISABLED: + ut_error; + + case PURGE_STATE_RUN: + case PURGE_STATE_STOP: + break; + } if (purge_sys->n_stop > 0) { @@ -1354,8 +1387,7 @@ trx_purge_run(void) if (purge_sys->n_stop == 0) { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Resuming purge.\n"); + ib_logf(IB_LOG_LEVEL_INFO, "Resuming purge"); purge_sys->state = PURGE_STATE_RUN; } diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc index b87eac9362e..a698b37c2a6 100644 --- a/storage/innobase/trx/trx0rec.cc +++ b/storage/innobase/trx/trx0rec.cc @@ -287,7 +287,7 @@ trx_undo_rec_get_pars( TRX_UNDO_INSERT_REC, ... 
*/ ulint* cmpl_info, /*!< out: compiler info, relevant only for update type records */ - ibool* updated_extern, /*!< out: TRUE if we updated an + bool* updated_extern, /*!< out: true if we updated an externally stored fild */ undo_no_t* undo_no, /*!< out: undo log record number */ table_id_t* table_id) /*!< out: table id */ @@ -300,12 +300,8 @@ trx_undo_rec_get_pars( type_cmpl = mach_read_from_1(ptr); ptr++; - if (type_cmpl & TRX_UNDO_UPD_EXTERN) { - *updated_extern = TRUE; - type_cmpl -= TRX_UNDO_UPD_EXTERN; - } else { - *updated_extern = FALSE; - } + *updated_extern = !!(type_cmpl & TRX_UNDO_UPD_EXTERN); + type_cmpl &= ~TRX_UNDO_UPD_EXTERN; *type = type_cmpl & (TRX_UNDO_CMPL_INFO_MULT - 1); *cmpl_info = type_cmpl / TRX_UNDO_CMPL_INFO_MULT; @@ -588,6 +584,7 @@ trx_undo_page_report_modify( /* Store first some general parameters to the undo log */ if (!update) { + ut_ad(!rec_get_deleted_flag(rec, dict_table_is_comp(table))); type_cmpl = TRX_UNDO_DEL_MARK_REC; } else if (rec_get_deleted_flag(rec, dict_table_is_comp(table))) { type_cmpl = TRX_UNDO_UPD_DEL_REC; @@ -1040,8 +1037,9 @@ trx_undo_update_rec_get_update( } /*******************************************************************//** -Builds a partial row from an update undo log record. It contains the -columns which occur as ordering in any index of the table. +Builds a partial row from an update undo log record, for purge. +It contains the columns which occur as ordering in any index of the table. +Any missing columns are indicated by col->mtype == DATA_MISSING. @return pointer to remaining part of undo record */ UNIV_INTERN byte* @@ -1075,7 +1073,12 @@ trx_undo_rec_get_partial_row( *row = dtuple_create(heap, row_len); - dict_table_copy_types(*row, index->table); + /* Mark all columns in the row uninitialized, so that + we can distinguish missing fields from fields that are SQL NULL. 
*/ + for (ulint i = 0; i < row_len; i++) { + dfield_get_type(dtuple_get_nth_field(*row, i)) + ->mtype = DATA_MISSING; + } end_ptr = ptr + mach_read_from_2(ptr); ptr += 2; @@ -1097,7 +1100,9 @@ trx_undo_rec_get_partial_row( ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); dfield = dtuple_get_nth_field(*row, col_no); - + dict_col_copy_type( + dict_table_get_nth_col(index->table, col_no), + dfield_get_type(dfield)); dfield_set_data(dfield, field, len); if (len != UNIV_SQL_NULL @@ -1177,7 +1182,7 @@ transaction and in consistent reads that must look to the history of this transaction. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t trx_undo_report_row_operation( /*==========================*/ ulint flags, /*!< in: if BTR_NO_UNDO_LOG_FLAG bit is @@ -1196,6 +1201,7 @@ trx_undo_report_row_operation( const rec_t* rec, /*!< in: in case of an update or delete marking, the record in the clustered index, otherwise NULL */ + const ulint* offsets, /*!< in: rec_get_offsets(rec) */ roll_ptr_t* roll_ptr) /*!< out: rollback pointer to the inserted undo log record, 0 if BTR_NO_UNDO_LOG @@ -1207,16 +1213,14 @@ trx_undo_report_row_operation( buf_block_t* undo_block; trx_rseg_t* rseg; mtr_t mtr; - ulint err = DB_SUCCESS; - mem_heap_t* heap = NULL; - ulint offsets_[REC_OFFS_NORMAL_SIZE]; - ulint* offsets = offsets_; + dberr_t err = DB_SUCCESS; #ifdef UNIV_DEBUG int loop_count = 0; #endif /* UNIV_DEBUG */ - rec_offs_init(offsets_); + ut_ad(!srv_read_only_mode); ut_a(dict_index_is_clust(index)); + ut_ad(!rec || rec_offs_validate(rec, index, offsets)); if (flags & BTR_NO_UNDO_LOG_FLAG) { @@ -1230,6 +1234,17 @@ trx_undo_report_row_operation( || (clust_entry && !update && !rec)); trx = thr_get_trx(thr); + + /* This table is visible only to the session that created it. */ + if (trx->read_only) { + ut_ad(!srv_read_only_mode); + /* MySQL should block writes to non-temporary tables. 
*/ + ut_a(DICT_TF2_FLAG_IS_SET(index->table, DICT_TF2_TEMPORARY)); + if (trx->rseg == 0) { + trx_assign_rseg(trx); + } + } + rseg = trx->rseg; mtr_start(&mtr); @@ -1272,8 +1287,6 @@ trx_undo_report_row_operation( } ut_ad(err == DB_SUCCESS); - offsets = rec_get_offsets(rec, index, offsets, - ULINT_UNDEFINED, &heap); } page_no = undo->last_page_no; @@ -1352,8 +1365,7 @@ trx_undo_report_row_operation( *roll_ptr = trx_undo_build_roll_ptr( op_type == TRX_UNDO_INSERT_OP, rseg->id, page_no, offset); - err = DB_SUCCESS; - goto func_exit; + return(DB_SUCCESS); } ut_ad(page_no == undo->last_page_no); @@ -1380,10 +1392,6 @@ trx_undo_report_row_operation( err_exit: mutex_exit(&trx->undo_mutex); mtr_commit(&mtr); -func_exit: - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } return(err); } @@ -1428,39 +1436,34 @@ trx_undo_get_undo_rec_low( /******************************************************************//** Copies an undo record to heap. -NOTE: the caller must have latches on the clustered index page and -purge_view. +NOTE: the caller must have latches on the clustered index page. 
-@return DB_SUCCESS, or DB_MISSING_HISTORY if the undo log has been -truncated and we cannot fetch the old version */ -static -ulint +@retval true if the undo log has been +truncated and we cannot fetch the old version +@retval false if the undo log record is available */ +static __attribute__((nonnull, warn_unused_result)) +bool trx_undo_get_undo_rec( /*==================*/ roll_ptr_t roll_ptr, /*!< in: roll pointer to record */ trx_id_t trx_id, /*!< in: id of the trx that generated the roll pointer: it points to an undo log of this transaction */ - trx_undo_rec_t** undo_rec, /*!< out, own: copy of the record */ + trx_undo_rec_t**undo_rec, /*!< out, own: copy of the record */ mem_heap_t* heap) /*!< in: memory heap where copied */ { - ibool missing_history; + bool missing_history; rw_lock_s_lock(&purge_sys->latch); missing_history = read_view_sees_trx_id(purge_sys->view, trx_id); - rw_lock_s_unlock(&purge_sys->latch); - - if (UNIV_UNLIKELY(missing_history)) { - /* It may be that the necessary undo log has already been - deleted */ - - return(DB_MISSING_HISTORY); + if (!missing_history) { + *undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap); } - *undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap); + rw_lock_s_unlock(&purge_sys->latch); - return(DB_SUCCESS); + return(missing_history); } #ifdef UNIV_DEBUG @@ -1471,13 +1474,13 @@ trx_undo_get_undo_rec( /*******************************************************************//** Build a previous version of a clustered index record. The caller must -hold a latch on the index page of the clustered index record, to -guarantee that the stack of versions is locked all the way down to the -purge_sys->view. -@return DB_SUCCESS, or DB_MISSING_HISTORY if the previous version is -earlier than purge_view, which means that it may have been removed */ +hold a latch on the index page of the clustered index record. 
+@retval true if previous version was built, or if it was an insert +or the table has been rebuilt +@retval false if the previous version is earlier than purge_view, +which means that it may have been removed */ UNIV_INTERN -ulint +bool trx_undo_prev_version_build( /*========================*/ const rec_t* index_rec ATTRIB_USED_ONLY_IN_DEBUG, @@ -1488,7 +1491,7 @@ trx_undo_prev_version_build( index_rec page and purge_view */ const rec_t* rec, /*!< in: version of a clustered index record */ dict_index_t* index, /*!< in: clustered index */ - ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */ mem_heap_t* heap, /*!< in: memory heap from which the memory needed is allocated */ rec_t** old_vers)/*!< out, own: previous version, or NULL if @@ -1509,9 +1512,8 @@ trx_undo_prev_version_build( byte* ptr; ulint info_bits; ulint cmpl_info; - ibool dummy_extern; + bool dummy_extern; byte* buf; - ulint err; #ifdef UNIV_SYNC_DEBUG ut_ad(!rw_lock_own(&purge_sys->latch, RW_LOCK_SHARED)); #endif /* UNIV_SYNC_DEBUG */ @@ -1526,28 +1528,28 @@ trx_undo_prev_version_build( *old_vers = NULL; if (trx_undo_roll_ptr_is_insert(roll_ptr)) { - /* The record rec is the first inserted version */ - - return(DB_SUCCESS); + return(true); } rec_trx_id = row_get_rec_trx_id(rec, index, offsets); - err = trx_undo_get_undo_rec(roll_ptr, rec_trx_id, &undo_rec, heap); - - if (UNIV_UNLIKELY(err != DB_SUCCESS)) { - /* The undo record may already have been purged. - This should never happen for user transactions, but - it can happen in purge. */ - ut_ad(err == DB_MISSING_HISTORY); - - return(err); + if (trx_undo_get_undo_rec(roll_ptr, rec_trx_id, &undo_rec, heap)) { + /* The undo record may already have been purged, + during purge or semi-consistent read. 
*/ + return(false); } ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info, &dummy_extern, &undo_no, &table_id); + if (table_id != index->table->id) { + /* The table should have been rebuilt, but purge has + not yet removed the undo log records for the + now-dropped old table (table_id). */ + return(true); + } + ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, &info_bits); @@ -1578,7 +1580,6 @@ trx_undo_prev_version_build( ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id, roll_ptr, info_bits, NULL, heap, &update); - ut_a(table_id == index->table->id); ut_a(ptr); # if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG @@ -1588,14 +1589,46 @@ trx_undo_prev_version_build( if (row_upd_changes_field_size_or_external(index, offsets, update)) { ulint n_ext; + /* We should confirm the existence of disowned external data, + if the previous version record is delete marked. If the trx_id + of the previous record is seen by purge view, we should treat + it as missing history, because the disowned external data + might be purged already. + + The inherited external data (BLOBs) can be freed (purged) + after trx_id was committed, provided that no view was started + before trx_id. If the purge view can see the committed + delete-marked record by trx_id, no transactions need to access + the BLOB. */ + + /* the row_upd_changes_disowned_external(update) call could be + omitted, but the synchronization on purge_sys->latch is likely + more expensive. */ + + if ((update->info_bits & REC_INFO_DELETED_FLAG) + && row_upd_changes_disowned_external(update)) { + bool missing_extern; + + rw_lock_s_lock(&purge_sys->latch); + missing_extern = read_view_sees_trx_id(purge_sys->view, + trx_id); + rw_lock_s_unlock(&purge_sys->latch); + + if (missing_extern) { + /* treat as a fresh insert, not to + cause assertion error at the caller. 
*/ + return(true); + } + } + /* We have to set the appropriate extern storage bits in the old version of the record: the extern bits in rec for those fields that update does NOT update, as well as the bits for those fields that update updates to become externally stored fields. Store the info: */ - entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, - offsets, &n_ext, heap); + entry = row_rec_to_index_entry( + rec, index, offsets, &n_ext, heap); n_ext += btr_push_update_extern_fields(entry, update, heap); /* The page containing the clustered index record corresponding to entry is latched in mtr. Thus the @@ -1618,6 +1651,6 @@ trx_undo_prev_version_build( row_upd_rec_in_place(*old_vers, index, offsets, update, NULL); } - return(DB_SUCCESS); + return(true); } #endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/trx/trx0roll.cc b/storage/innobase/trx/trx0roll.cc index 042b5b87da7..d07e40c506d 100644 --- a/storage/innobase/trx/trx0roll.cc +++ b/storage/innobase/trx/trx0roll.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -133,7 +133,7 @@ trx_rollback_to_savepoint_low( Rollback a transaction to a given savepoint or do a complete rollback. @return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t trx_rollback_to_savepoint( /*======================*/ trx_t* trx, /*!< in: transaction handle */ @@ -157,14 +157,14 @@ trx_rollback_to_savepoint( srv_active_wake_master_thread(); - return((int) trx->error_state); + return(trx->error_state); } /*******************************************************************//** Rollback a transaction used in MySQL. 
@return error code or DB_SUCCESS */ static -enum db_err +dberr_t trx_rollback_for_mysql_low( /*=======================*/ trx_t* trx) /*!< in/out: transaction */ @@ -193,7 +193,7 @@ trx_rollback_for_mysql_low( Rollback a transaction used in MySQL. @return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t trx_rollback_for_mysql( /*===================*/ trx_t* trx) /*!< in/out: transaction */ @@ -214,7 +214,7 @@ trx_rollback_for_mysql( return(trx_rollback_for_mysql_low(trx)); case TRX_STATE_PREPARED: - assert_trx_in_rw_list(trx); + ut_ad(!trx_is_autocommit_non_locking(trx)); return(trx_rollback_for_mysql_low(trx)); case TRX_STATE_COMMITTED_IN_MEMORY: @@ -223,19 +223,19 @@ trx_rollback_for_mysql( } ut_error; - return((int) DB_CORRUPTION); + return(DB_CORRUPTION); } /*******************************************************************//** Rollback the latest SQL statement for MySQL. @return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t trx_rollback_last_sql_stat_for_mysql( /*=================================*/ trx_t* trx) /*!< in/out: transaction */ { - int err; + dberr_t err; /* We are reading trx->state without holding trx_sys->mutex here, because the statement rollback should be invoked for a @@ -344,8 +344,8 @@ the row, these locks are naturally released in the rollback. Savepoints which were set after this savepoint are deleted. @return if no savepoint of the name found then DB_NO_SAVEPOINT, otherwise DB_SUCCESS */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t trx_rollback_to_savepoint_for_mysql_low( /*====================================*/ trx_t* trx, /*!< in/out: transaction */ @@ -358,7 +358,7 @@ trx_rollback_to_savepoint_for_mysql_low( binlog entries of the queries executed after the savepoint */ { - ulint err; + dberr_t err; ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); ut_ad(trx->in_mysql_trx_list); @@ -395,7 +395,7 @@ were set after this savepoint are deleted. 
@return if no savepoint of the name found then DB_NO_SAVEPOINT, otherwise DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t trx_rollback_to_savepoint_for_mysql( /*================================*/ trx_t* trx, /*!< in: transaction handle */ @@ -449,7 +449,7 @@ savepoint and replaces it with a new. Savepoints are deleted in a transaction commit or rollback. @return always DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t trx_savepoint_for_mysql( /*====================*/ trx_t* trx, /*!< in: transaction handle */ @@ -495,7 +495,7 @@ savepoint are left as is. @return if no savepoint of the name found then DB_NO_SAVEPOINT, otherwise DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t trx_release_savepoint_for_mysql( /*============================*/ trx_t* trx, /*!< in: transaction handle */ @@ -623,18 +623,16 @@ trx_rollback_active( if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE && trx->table_id != 0) { - /* If the transaction was for a dictionary operation, we - drop the relevant table, if it still exists */ + /* If the transaction was for a dictionary operation, + we drop the relevant table only if it is not flagged + as DISCARDED. If it still exists. */ - fprintf(stderr, - "InnoDB: Dropping table with id "UINT64PF - " in recovery if it exists\n", - (ib_uint64_t) trx->table_id); + table = dict_table_open_on_id( + trx->table_id, dictionary_locked, FALSE); - table = dict_table_open_on_id(trx->table_id, dictionary_locked); + if (table && !dict_table_is_discarded(table)) { - if (table) { - ulint err; + dberr_t err; /* Ensure that the table doesn't get evicted from the cache, keeps things simple for drop. 
*/ @@ -643,16 +641,17 @@ trx_rollback_active( dict_table_move_from_lru_to_non_lru(table); } - dict_table_close(table, dictionary_locked); + dict_table_close(table, dictionary_locked, FALSE); - fputs("InnoDB: Table found: dropping table ", stderr); - ut_print_name(stderr, trx, TRUE, table->name); - fputs(" in recovery\n", stderr); + ib_logf(IB_LOG_LEVEL_WARN, + "Dropping table '%s', with id " UINT64PF " " + "in recovery", + table->name, trx->table_id); err = row_drop_table_for_mysql(table->name, trx, TRUE); trx_commit_for_mysql(trx); - ut_a(err == (int) DB_SUCCESS); + ut_a(err == DB_SUCCESS); } } @@ -660,9 +659,8 @@ trx_rollback_active( row_mysql_unlock_data_dictionary(trx); } - fprintf(stderr, "\nInnoDB: Rolling back of trx id " TRX_ID_FMT - " completed\n", - trx->id); + ib_logf(IB_LOG_LEVEL_INFO, + "Rollback of trx with id " TRX_ID_FMT " completed", trx->id); mem_heap_free(heap); @@ -808,6 +806,8 @@ DECLARE_THREAD(trx_rollback_or_clean_all_recovered)( /*!< in: a dummy parameter required by os_thread_create */ { + ut_ad(!srv_read_only_mode); + #ifdef UNIV_PFS_THREAD pfs_register_thread(trx_rollback_clean_thread_key); #endif /* UNIV_PFS_THREAD */ diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc index 97fd1f36943..7c2bbc90ad9 100644 --- a/storage/innobase/trx/trx0sys.cc +++ b/storage/innobase/trx/trx0sys.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -43,20 +43,16 @@ Created 3/26/1996 Heikki Tuuri #include "log0recv.h" #include "os0file.h" #include "read0read.h" -#include "buf0dblwr.h" /** The file format tag structure with id and name. 
*/ -struct file_format_struct { +struct file_format_t { ulint id; /*!< id of the file format */ const char* name; /*!< text representation of the file format */ - mutex_t mutex; /*!< covers changes to the above + ib_mutex_t mutex; /*!< covers changes to the above fields */ }; -/** The file format tag */ -typedef struct file_format_struct file_format_t; - /** The transaction system */ UNIV_INTERN trx_sys_t* trx_sys = NULL; @@ -122,12 +118,12 @@ UNIV_INTERN mysql_pfs_key_t file_format_max_mutex_key; UNIV_INTERN mysql_pfs_key_t trx_sys_mutex_key; #endif /* UNIV_PFS_RWLOCK */ +#ifndef UNIV_HOTBACKUP #ifdef UNIV_DEBUG /* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */ uint trx_rseg_n_slots_debug = 0; #endif -#ifndef UNIV_HOTBACKUP /** This is used to track the maximum file format id known to InnoDB. It's updated via SET GLOBAL innodb_file_format_max = 'x' or when we open or create a table. */ @@ -180,13 +176,17 @@ trx_sys_flush_max_trx_id(void) ut_ad(mutex_own(&trx_sys->mutex)); - mtr_start(&mtr); + if (!srv_read_only_mode) { + mtr_start(&mtr); - sys_header = trx_sysf_get(&mtr); + sys_header = trx_sysf_get(&mtr); - mlog_write_ull(sys_header + TRX_SYS_TRX_ID_STORE, - trx_sys->max_trx_id, &mtr); - mtr_commit(&mtr); + mlog_write_ull( + sys_header + TRX_SYS_TRX_ID_STORE, + trx_sys->max_trx_id, &mtr); + + mtr_commit(&mtr); + } } /*****************************************************************//** @@ -524,6 +524,8 @@ trx_sys_init_at_db_start(void) + TRX_SYS_TRX_ID_STORE), TRX_SYS_TRX_ID_WRITE_MARGIN); + ut_d(trx_sys->rw_max_trx_id = trx_sys->max_trx_id); + UT_LIST_INIT(trx_sys->mysql_trx_list); trx_dummy_sess = sess_open(); @@ -701,7 +703,7 @@ Check for the max file format tag stored on disk. Note: If max_format_id is == UNIV_FORMAT_MAX + 1 then we only print a warning. 
@return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t trx_sys_file_format_max_check( /*==========================*/ ulint max_format_id) /*!< in: max format id to check */ @@ -718,21 +720,18 @@ trx_sys_file_format_max_check( format_id = UNIV_FORMAT_MIN; } - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: highest supported file format is %s.\n", + ib_logf(IB_LOG_LEVEL_INFO, + "Highest supported file format is %s.", trx_sys_file_format_id_to_name(UNIV_FORMAT_MAX)); if (format_id > UNIV_FORMAT_MAX) { ut_a(format_id < FILE_FORMAT_NAME_N); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: %s: the system tablespace is in a file " - "format that this version doesn't support - %s\n", - ((max_format_id <= UNIV_FORMAT_MAX) - ? "Error" : "Warning"), + ib_logf(max_format_id <= UNIV_FORMAT_MAX + ? IB_LOG_LEVEL_ERROR : IB_LOG_LEVEL_WARN, + "The system tablespace is in a file " + "format that this version doesn't support - %s.", trx_sys_file_format_id_to_name(format_id)); if (max_format_id <= UNIV_FORMAT_MAX) { @@ -883,7 +882,7 @@ trx_sys_create_rsegs( ut_a(n_spaces < TRX_SYS_N_RSEGS); ut_a(n_rsegs <= TRX_SYS_N_RSEGS); - if (srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO) { + if (srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO || srv_read_only_mode) { return(ULINT_UNDEFINED); } @@ -926,9 +925,8 @@ trx_sys_create_rsegs( } } - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: %lu rollback segment(s) are active.\n", - n_used); + ib_logf(IB_LOG_LEVEL_INFO, + "%lu rollback segment(s) are active.", n_used); return(n_used); } @@ -1000,7 +998,7 @@ trx_sys_read_file_format_id( ); if (!success) { /* The following call prints an error message */ - os_file_get_last_error(TRUE); + os_file_get_last_error(true); ut_print_timestamp(stderr); @@ -1019,7 +1017,7 @@ trx_sys_read_file_format_id( if (!success) { /* The following call prints an error message */ - os_file_get_last_error(TRUE); + os_file_get_last_error(true); ut_print_timestamp(stderr); @@ -1080,7 +1078,7 @@ 
trx_sys_read_pertable_file_format_id( ); if (!success) { /* The following call prints an error message */ - os_file_get_last_error(TRUE); + os_file_get_last_error(true); ut_print_timestamp(stderr); @@ -1099,7 +1097,7 @@ trx_sys_read_pertable_file_format_id( if (!success) { /* The following call prints an error message */ - os_file_get_last_error(TRUE); + os_file_get_last_error(true); ut_print_timestamp(stderr); @@ -1120,11 +1118,11 @@ trx_sys_read_pertable_file_format_id( if (flags == 0) { /* file format is Antelope */ *format_id = 0; - return (TRUE); + return(TRUE); } else if (flags & 1) { /* tablespace flags are ok */ *format_id = (flags / 32) % 128; - return (TRUE); + return(TRUE); } else { /* bad tablespace flags */ return(FALSE); @@ -1143,7 +1141,7 @@ trx_sys_file_format_id_to_name( { if (!(id < FILE_FORMAT_NAME_N)) { /* unknown id */ - return ("Unknown"); + return("Unknown"); } return(file_format_name_map[id]); @@ -1252,7 +1250,7 @@ trx_sys_any_active_transactions(void) mutex_enter(&trx_sys->mutex); total_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list) - + trx_sys->n_mysql_trx; + + UT_LIST_GET_LEN(trx_sys->mysql_trx_list); ut_a(total_trx >= trx_sys->n_prepared_trx); total_trx -= trx_sys->n_prepared_trx; diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc index 80ebe0df2b3..449b970842a 100644 --- a/storage/innobase/trx/trx0trx.cc +++ b/storage/innobase/trx/trx0trx.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -105,6 +105,7 @@ trx_create(void) trx->state = TRX_STATE_NOT_STARTED; + trx->active_commit_ordered = 0; trx->isolation_level = TRX_ISO_REPEATABLE_READ; trx->no = IB_ULONGLONG_MAX; @@ -146,10 +147,6 @@ trx_create(void) trx->lock.table_locks = ib_vector_create( heap_alloc, sizeof(void**), 32); - /* For non-locking selects we avoid calling ut_time() too frequently. - Set the time here for new transactions. */ - trx->start_time = ut_time(); - return(trx); } @@ -184,8 +181,6 @@ trx_allocate_for_mysql(void) mutex_enter(&trx_sys->mutex); - trx_sys->n_mysql_trx++; - ut_d(trx->in_mysql_trx_list = TRUE); UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, trx); @@ -205,6 +200,7 @@ trx_free( ut_a(trx->magic_n == TRX_MAGIC_N); ut_ad(!trx->in_ro_trx_list); ut_ad(!trx->in_rw_trx_list); + ut_ad(!trx->in_mysql_trx_list); mutex_free(&trx->undo_mutex); @@ -233,8 +229,10 @@ trx_free( /* We allocated a dedicated heap for the vector. */ ib_vector_free(trx->autoinc_locks); - /* We allocated a dedicated heap for the vector. */ - ib_vector_free(trx->lock.table_locks); + if (trx->lock.table_locks != NULL) { + /* We allocated a dedicated heap for the vector. 
*/ + ib_vector_free(trx->lock.table_locks); + } mutex_free(&trx->mutex); @@ -249,11 +247,12 @@ trx_free_for_background( /*====================*/ trx_t* trx) /*!< in, own: trx object */ { - if (UNIV_UNLIKELY(trx->declared_to_be_inside_innodb)) { - ut_print_timestamp(stderr); - fputs(" InnoDB: Error: Freeing a trx which is declared" - " to be processing\n" - "InnoDB: inside InnoDB.\n", stderr); + if (trx->declared_to_be_inside_innodb) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Freeing a trx (%p, " TRX_ID_FMT ") which is declared " + "to be processing inside InnoDB", trx, trx->id); + trx_print(stderr, trx, 600); putc('\n', stderr); @@ -262,16 +261,16 @@ trx_free_for_background( srv_conc_force_exit_innodb(trx); } - if (UNIV_UNLIKELY(trx->n_mysql_tables_in_use != 0 - || trx->mysql_n_tables_locked != 0)) { + if (trx->n_mysql_tables_in_use != 0 + || trx->mysql_n_tables_locked != 0) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Error: MySQL is freeing a thd\n" - "InnoDB: though trx->n_mysql_tables_in_use is %lu\n" - "InnoDB: and trx->mysql_n_tables_locked is %lu.\n", + ib_logf(IB_LOG_LEVEL_ERROR, + "MySQL is freeing a thd though " + "trx->n_mysql_tables_in_use is %lu and " + "trx->mysql_n_tables_locked is %lu.", (ulong) trx->n_mysql_tables_in_use, (ulong) trx->mysql_n_tables_locked); + trx_print(stderr, trx, 600); ut_print_buf(stderr, trx, sizeof(trx_t)); putc('\n', stderr); @@ -326,8 +325,6 @@ trx_free_for_mysql( ut_ad(trx_sys_validate_trx_list()); - trx_sys->n_mysql_trx--; - mutex_exit(&trx_sys->mutex); trx_free_for_background(trx); @@ -348,6 +345,9 @@ trx_list_rw_insert_ordered( ut_ad(!trx->read_only); + ut_d(trx->start_file = __FILE__); + ut_d(trx->start_line = __LINE__); + ut_a(srv_is_being_started); ut_ad(!trx->in_ro_trx_list); ut_ad(!trx->in_rw_trx_list); @@ -372,6 +372,7 @@ trx_list_rw_insert_ordered( if (trx2 == NULL) { UT_LIST_ADD_FIRST(trx_list, trx_sys->rw_trx_list, trx); + ut_d(trx_sys->rw_max_trx_id = trx->id); } else { UT_LIST_INSERT_AFTER( 
trx_list, trx_sys->rw_trx_list, trx2, trx); @@ -423,6 +424,7 @@ trx_resurrect_insert( trx->state = TRX_STATE_PREPARED; trx_sys->n_prepared_trx++; + trx_sys->n_prepared_recovered_trx++; } else { fprintf(stderr, "InnoDB: Since innodb_force_recovery" @@ -483,6 +485,7 @@ trx_resurrect_update_in_prepared_state( if (srv_force_recovery == 0) { if (trx_state_eq(trx, TRX_STATE_NOT_STARTED)) { trx_sys->n_prepared_trx++; + trx_sys->n_prepared_recovered_trx++; } else { ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED)); } @@ -620,10 +623,10 @@ trx_lists_init_at_db_start(void) /******************************************************************//** Assigns a rollback segment to a transaction in a round-robin fashion. @return assigned rollback segment instance */ -UNIV_INLINE +static trx_rseg_t* -trx_assign_rseg( -/*============*/ +trx_assign_rseg_low( +/*================*/ ulong max_undo_logs, /*!< in: maximum number of UNDO logs to use */ ulint n_tablespaces) /*!< in: number of rollback tablespaces */ { @@ -631,7 +634,7 @@ trx_assign_rseg( trx_rseg_t* rseg; static ulint latest_rseg = 0; - if (srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO) { + if (srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO || srv_read_only_mode) { ut_a(max_undo_logs == ULONG_UNDEFINED); return(NULL); } @@ -668,6 +671,24 @@ trx_assign_rseg( } /****************************************************************//** +Assign a read-only transaction a rollback-segment, if it is attempting +to write to a TEMPORARY table. */ +UNIV_INTERN +void +trx_assign_rseg( +/*============*/ + trx_t* trx) /*!< A read-only transaction that + needs to be assigned a RBS. */ +{ + ut_a(trx->rseg == 0); + ut_a(trx->read_only); + ut_a(!srv_read_only_mode); + ut_a(!trx_is_autocommit_non_locking(trx)); + + trx->rseg = trx_assign_rseg_low(srv_undo_logs, srv_undo_tablespaces); +} + +/****************************************************************//** Starts a transaction. 
*/ static void @@ -675,10 +696,10 @@ trx_start_low( /*==========*/ trx_t* trx) /*!< in: transaction */ { - static ulint n_start_times; - ut_ad(trx->rseg == NULL); + ut_ad(trx->start_file != 0); + ut_ad(trx->start_line != 0); ut_ad(!trx->is_recovered); ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED)); ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0); @@ -686,7 +707,9 @@ trx_start_low( /* Check whether it is an AUTOCOMMIT SELECT */ trx->auto_commit = thd_trx_is_auto_commit(trx->mysql_thd); - trx->read_only = thd_trx_is_read_only(trx->mysql_thd); + trx->read_only = + (!trx->ddl && thd_trx_is_read_only(trx->mysql_thd)) + || srv_read_only_mode; if (!trx->auto_commit) { ++trx->will_lock; @@ -695,16 +718,10 @@ trx_start_low( } if (!trx->read_only) { - trx->rseg = trx_assign_rseg( + trx->rseg = trx_assign_rseg_low( srv_undo_logs, srv_undo_tablespaces); } - /* Avoid making an unnecessary system call, for non-locking - auto-commit selects we reuse the start_time for every 32 starts. */ - if (!trx_is_autocommit_non_locking(trx) || !(n_start_times++ % 32)) { - trx->start_time = ut_time(); - } - /* The initial value for trx->no: IB_ULONGLONG_MAX is used in read_view_open_now: */ @@ -745,12 +762,15 @@ trx_start_low( ut_ad(!trx_is_autocommit_non_locking(trx)); UT_LIST_ADD_FIRST(trx_list, trx_sys->rw_trx_list, trx); ut_d(trx->in_rw_trx_list = TRUE); + ut_d(trx_sys->rw_max_trx_id = trx->id); } ut_ad(trx_sys_validate_trx_list()); mutex_exit(&trx_sys->mutex); + trx->start_time = ut_time(); + MONITOR_INC(MONITOR_TRX_ACTIVE); } @@ -971,6 +991,52 @@ trx_finalize_for_fts( trx->fts_trx = NULL; } +/**********************************************************************//** +If required, flushes the log to disk based on the value of +innodb_flush_log_at_trx_commit. */ +static +void +trx_flush_log_if_needed_low( +/*========================*/ + lsn_t lsn) /*!< in: lsn up to which logs are to be + flushed. 
*/ +{ + switch (srv_flush_log_at_trx_commit) { + case 0: + /* Do nothing */ + break; + case 1: + case 3: + /* Write the log and optionally flush it to disk */ + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, + srv_unix_file_flush_method != SRV_UNIX_NOSYNC); + break; + case 2: + /* Write the log but do not flush it to disk */ + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + + break; + default: + ut_error; + } +} + +/**********************************************************************//** +If required, flushes the log to disk based on the value of +innodb_flush_log_at_trx_commit. */ +static __attribute__((nonnull)) +void +trx_flush_log_if_needed( +/*====================*/ + lsn_t lsn, /*!< in: lsn up to which logs are to be + flushed. */ + trx_t* trx) /*!< in/out: transaction */ +{ + trx->op_info = "flushing log"; + trx_flush_log_if_needed_low(lsn); + trx->op_info = ""; +} + /****************************************************************//** Commits a transaction. */ UNIV_INTERN @@ -987,7 +1053,7 @@ trx_commit( ut_ad(!trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)); /* undo_no is non-zero if we're doing the final commit. */ - if (trx->fts_trx && (trx->undo_no != 0)) { + if (trx->fts_trx && trx->undo_no != 0) { ulint error; ut_a(!trx_is_autocommit_non_locking(trx)); @@ -1043,6 +1109,8 @@ trx_commit( trx->state = TRX_STATE_NOT_STARTED; + read_view_remove(trx->global_read_view, false); + MONITOR_INC(MONITOR_TRX_NL_RO_COMMIT); } else { lock_trx_release_locks(trx); @@ -1057,7 +1125,6 @@ trx_commit( assert_trx_in_list(trx); if (trx->read_only) { - ut_ad(trx->rseg == NULL); UT_LIST_REMOVE(trx_list, trx_sys->ro_trx_list, trx); ut_d(trx->in_ro_trx_list = FALSE); MONITOR_INC(MONITOR_TRX_RO_COMMIT); @@ -1075,13 +1142,16 @@ trx_commit( trx->state = TRX_STATE_NOT_STARTED; + /* We already own the trx_sys_t::mutex, by doing it here we + avoid a potential context switch later. 
*/ + read_view_remove(trx->global_read_view, true); + ut_ad(trx_sys_validate_trx_list()); mutex_exit(&trx_sys->mutex); } if (trx->global_read_view != NULL) { - read_view_remove(trx->global_read_view); mem_heap_empty(trx->global_read_view_heap); @@ -1129,26 +1199,8 @@ trx_commit( trx->must_flush_log_later = TRUE; } else if (srv_flush_log_at_trx_commit == 0) { /* Do nothing */ - } else if (srv_flush_log_at_trx_commit == 1 || - srv_flush_log_at_trx_commit == 3) { - if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { - /* Write the log but do not flush it to disk */ - - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, - FALSE); - } else { - /* Write the log to the log files AND flush - them to disk */ - - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); - } - } else if (srv_flush_log_at_trx_commit == 2) { - - /* Write the log but do not flush it to disk */ - - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); } else { - ut_error; + trx_flush_log_if_needed(lsn, trx); } trx->commit_lsn = lsn; @@ -1162,6 +1214,14 @@ trx_commit( trx->undo_no = 0; trx->last_sql_stat_start.least_undo_no = 0; + trx->ddl = false; +#ifdef UNIV_DEBUG + ut_ad(trx->start_file != 0); + ut_ad(trx->start_line != 0); + trx->start_file = 0; + trx->start_line = 0; +#endif /* UNIV_DEBUG */ + trx->will_lock = 0; trx->read_only = FALSE; trx->auto_commit = FALSE; @@ -1175,6 +1235,8 @@ trx_commit( ut_ad(!trx->in_ro_trx_list); ut_ad(!trx->in_rw_trx_list); + trx->dict_operation = TRX_DICT_OP_NONE; + trx->error_state = DB_SUCCESS; /* trx->in_mysql_trx_list would hold between @@ -1365,7 +1427,7 @@ trx_commit_step( Does the transaction commit for MySQL. @return DB_SUCCESS or error number */ UNIV_INTERN -ulint +dberr_t trx_commit_for_mysql( /*=================*/ trx_t* trx) /*!< in/out: transaction */ @@ -1389,6 +1451,9 @@ trx_commit_for_mysql( records, generated by the same transaction do not. 
*/ trx->support_xa = thd_supports_xa(trx->mysql_thd); + ut_d(trx->start_file = __FILE__); + ut_d(trx->start_line = __LINE__); + trx_start_low(trx); /* fall through */ case TRX_STATE_ACTIVE: @@ -1407,53 +1472,23 @@ trx_commit_for_mysql( /**********************************************************************//** If required, flushes the log to disk if we called trx_commit_for_mysql() -with trx->flush_log_later == TRUE. -@return 0 or error number */ +with trx->flush_log_later == TRUE. */ UNIV_INTERN -ulint +void trx_commit_complete_for_mysql( /*==========================*/ - trx_t* trx) /*!< in: trx handle */ + trx_t* trx) /*!< in/out: transaction */ { - lsn_t lsn = trx->commit_lsn; - ut_a(trx); - trx->op_info = "flushing log"; - - if (!trx->must_flush_log_later) { - /* Do nothing */ - } else if (srv_flush_log_at_trx_commit == 0) { - /* Do nothing */ - } else if (srv_flush_log_at_trx_commit == 1 && trx->active_commit_ordered) { - /* Do nothing - we already flushed the prepare and binlog write - to disk, so transaction is durable (will be recovered from - binlog if necessary) */ - } else if (srv_flush_log_at_trx_commit == 1 || srv_flush_log_at_trx_commit == 3) { - if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { - /* Write the log but do not flush it to disk */ - - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); - } else { - /* Write the log to the log files AND flush them to - disk */ - - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); - } - } else if (srv_flush_log_at_trx_commit == 2) { - - /* Write the log but do not flush it to disk */ - - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); - } else { - ut_error; + if (!trx->must_flush_log_later + || (srv_flush_log_at_trx_commit == 1 && trx->active_commit_ordered)) { + return; } - trx->must_flush_log_later = FALSE; - - trx->op_info = ""; + trx_flush_log_if_needed(trx->commit_lsn, trx); - return(0); + trx->must_flush_log_later = FALSE; } /**********************************************************************//** 
@@ -1500,9 +1535,9 @@ trx_print_low( ulint max_query_len, /*!< in: max query length to print, or 0 to use the default max length */ - ulint n_lock_rec, + ulint n_rec_locks, /*!< in: lock_number_of_rows_locked(&trx->lock) */ - ulint n_lock_struct, + ulint n_trx_locks, /*!< in: length of trx->lock.trx_locks */ ulint heap_size) /*!< in: mem_heap_get_size(trx->lock.lock_heap) */ @@ -1581,14 +1616,14 @@ state_ok: fprintf(f, "que state %lu ", (ulong) trx->lock.que_state); } - if (n_lock_struct > 0 || heap_size > 400) { + if (n_trx_locks > 0 || heap_size > 400) { newline = TRUE; fprintf(f, "%lu lock struct(s), heap size %lu," " %lu row lock(s)", - (ulong) n_lock_struct, + (ulong) n_trx_locks, (ulong) heap_size, - (ulong) n_lock_rec); + (ulong) n_rec_locks); } if (trx->has_search_latch) { @@ -1644,19 +1679,19 @@ trx_print( ulint max_query_len) /*!< in: max query length to print, or 0 to use the default max length */ { - ulint n_lock_rec; - ulint n_lock_struct; + ulint n_rec_locks; + ulint n_trx_locks; ulint heap_size; lock_mutex_enter(); - n_lock_rec = lock_number_of_rows_locked(&trx->lock); - n_lock_struct = UT_LIST_GET_LEN(trx->lock.trx_locks); + n_rec_locks = lock_number_of_rows_locked(&trx->lock); + n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks); heap_size = mem_heap_get_size(trx->lock.lock_heap); lock_mutex_exit(); mutex_enter(&trx_sys->mutex); trx_print_low(f, trx, max_query_len, - n_lock_rec, n_lock_struct, heap_size); + n_rec_locks, n_trx_locks, heap_size); mutex_exit(&trx_sys->mutex); } @@ -1684,7 +1719,6 @@ trx_assert_started( switch (trx->state) { case TRX_STATE_PREPARED: - assert_trx_in_rw_list(trx); return(TRUE); case TRX_STATE_ACTIVE: @@ -1826,28 +1860,7 @@ trx_prepare( TODO: find out if MySQL holds some mutex when calling this. That would spoil our group prepare algorithm. 
*/ - if (srv_flush_log_at_trx_commit == 0) { - /* Do nothing */ - } else if (srv_flush_log_at_trx_commit == 1 || srv_flush_log_at_trx_commit == 3) { - if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { - /* Write the log but do not flush it to disk */ - - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, - FALSE); - } else { - /* Write the log to the log files AND flush - them to disk */ - - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); - } - } else if (srv_flush_log_at_trx_commit == 2) { - - /* Write the log but do not flush it to disk */ - - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); - } else { - ut_error; - } + trx_flush_log_if_needed(lsn, trx); } } @@ -1859,7 +1872,7 @@ trx_prepare_for_mysql( /*==================*/ trx_t* trx) /*!< in/out: trx handle */ { - trx_start_if_not_started_xa(trx); + trx_start_if_not_started_xa_low(trx); trx->op_info = "preparing"; @@ -1935,12 +1948,12 @@ trx_recover_for_mysql( if (count > 0){ ut_print_timestamp(stderr); fprintf(stderr, - " InnoDB: %lu transactions in prepared state" + " InnoDB: %d transactions in prepared state" " after recovery\n", - (ulong) count); + int (count)); } - return ((int) count); + return(int (count)); } /*******************************************************************//** @@ -2023,8 +2036,8 @@ trx_get_trx_by_xid( Starts the transaction if it is not yet started. */ UNIV_INTERN void -trx_start_if_not_started_xa( -/*========================*/ +trx_start_if_not_started_xa_low( +/*============================*/ trx_t* trx) /*!< in: transaction */ { switch (trx->state) { @@ -2057,8 +2070,8 @@ trx_start_if_not_started_xa( Starts the transaction if it is not yet started. 
*/ UNIV_INTERN void -trx_start_if_not_started( -/*=====================*/ +trx_start_if_not_started_low( +/*=========================*/ trx_t* trx) /*!< in: transaction */ { switch (trx->state) { @@ -2074,3 +2087,45 @@ trx_start_if_not_started( ut_error; } + +/*************************************************************//** +Starts the transaction for a DDL operation. */ +UNIV_INTERN +void +trx_start_for_ddl_low( +/*==================*/ + trx_t* trx, /*!< in/out: transaction */ + trx_dict_op_t op) /*!< in: dictionary operation type */ +{ + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + /* Flag this transaction as a dictionary operation, so that + the data dictionary will be locked in crash recovery. */ + + trx_set_dict_operation(trx, op); + + /* Ensure it is not flagged as an auto-commit-non-locking + transation. */ + trx->will_lock = 1; + + trx->ddl = true; + + trx_start_low(trx); + return; + + case TRX_STATE_ACTIVE: + /* We have this start if not started idiom, therefore we + can't add stronger checks here. */ + trx->ddl = true; + + ut_ad(trx->dict_operation != TRX_DICT_OP_NONE); + ut_ad(trx->will_lock > 0); + return; + case TRX_STATE_PREPARED: + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + } + + ut_error; +} + diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc index 13ad2bb3755..c4480b11366 100644 --- a/storage/innobase/trx/trx0undo.cc +++ b/storage/innobase/trx/trx0undo.cc @@ -413,8 +413,8 @@ trx_undo_page_init( Creates a new undo log segment in file. 
@return DB_SUCCESS if page creation OK possible error codes are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t trx_undo_seg_create( /*================*/ trx_rseg_t* rseg __attribute__((unused)),/*!< in: rollback segment */ @@ -435,7 +435,7 @@ trx_undo_seg_create( trx_usegf_t* seg_hdr; ulint n_reserved; ibool success; - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; ut_ad(mtr && id && rseg_hdr); ut_ad(mutex_own(&(rseg->mutex))); @@ -1468,7 +1468,7 @@ trx_undo_mem_create( if (undo == NULL) { - return NULL; + return(NULL); } undo->id = id; @@ -1551,8 +1551,8 @@ Creates a new undo log. @return DB_SUCCESS if successful in creating the new undo lob object, possible error codes are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE DB_OUT_OF_MEMORY */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t trx_undo_create( /*============*/ trx_t* trx, /*!< in: transaction */ @@ -1571,7 +1571,7 @@ trx_undo_create( ulint offset; ulint id; page_t* undo_page; - ulint err; + dberr_t err; ut_ad(mutex_own(&(rseg->mutex))); @@ -1746,7 +1746,7 @@ undo log reused. 
are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE DB_READ_ONLY DB_OUT_OF_MEMORY */ UNIV_INTERN -ulint +dberr_t trx_undo_assign_undo( /*=================*/ trx_t* trx, /*!< in: transaction */ @@ -1755,7 +1755,7 @@ trx_undo_assign_undo( trx_rseg_t* rseg; trx_undo_t* undo; mtr_t mtr; - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; ut_ad(trx); @@ -1771,11 +1771,17 @@ trx_undo_assign_undo( mutex_enter(&rseg->mutex); + DBUG_EXECUTE_IF( + "ib_create_table_fail_too_many_trx", + err = DB_TOO_MANY_CONCURRENT_TRXS; + goto func_exit; + ); + undo = trx_undo_reuse_cached(trx, rseg, type, trx->id, &trx->xid, &mtr); if (undo == NULL) { err = trx_undo_create(trx, rseg, type, trx->id, &trx->xid, - &undo, &mtr); + &undo, &mtr); if (err != DB_SUCCESS) { goto func_exit; @@ -1800,7 +1806,7 @@ func_exit: mutex_exit(&(rseg->mutex)); mtr_commit(&mtr); - return err; + return(err); } /******************************************************************//** diff --git a/storage/innobase/ut/ut0crc32.cc b/storage/innobase/ut/ut0crc32.cc index 538879dd9e2..695035d6ae8 100644 --- a/storage/innobase/ut/ut0crc32.cc +++ b/storage/innobase/ut/ut0crc32.cc @@ -79,11 +79,11 @@ mysys/my_perf.c, contributed by Facebook under the following license. * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3. 
*/ -#include <string.h> /* memcmp() */ - #include "univ.i" #include "ut0crc32.h" +#include <string.h> + ib_ut_crc32_t ut_crc32; /* Precalculated table used to generate the CRC32 if the CPU does not @@ -92,7 +92,7 @@ static ib_uint32_t ut_crc32_slice8_table[8][256]; static ibool ut_crc32_slice8_table_initialized = FALSE; /* Flag that tells whether the CPU supports CRC32 or not */ -static ibool ut_crc32_sse2_enabled = FALSE; +UNIV_INTERN bool ut_crc32_sse2_enabled = false; /********************************************************************//** Initializes the table that is used to generate the CRC32 if the CPU does @@ -315,8 +315,4 @@ ut_crc32_init() ut_crc32_slice8_table_init(); ut_crc32 = ut_crc32_slice8; } - - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: CPU %s crc32 instructions\n", - ut_crc32_sse2_enabled ? "supports" : "does not support"); } diff --git a/storage/innobase/ut/ut0mem.cc b/storage/innobase/ut/ut0mem.cc index 42ad180d373..2bb5d9ce332 100644 --- a/storage/innobase/ut/ut0mem.cc +++ b/storage/innobase/ut/ut0mem.cc @@ -35,9 +35,6 @@ Created 5/11/1994 Heikki Tuuri #include <stdlib.h> -/** This struct is placed first in every allocated memory block */ -typedef struct ut_mem_block_struct ut_mem_block_t; - /** The total amount of memory currently allocated from the operating system with os_mem_alloc_large() or malloc(). Does not count malloc() if srv_use_sys_malloc is set. Protected by ut_list_mutex. */ @@ -52,14 +49,14 @@ UNIV_INTERN mysql_pfs_key_t ut_list_mutex_key; #endif /** Dynamically allocated memory block */ -struct ut_mem_block_struct{ +struct ut_mem_block_t{ UT_LIST_NODE_T(ut_mem_block_t) mem_block_list; /*!< mem block list node */ ulint size; /*!< size of allocated memory */ ulint magic_n;/*!< magic number (UT_MEM_MAGIC_N) */ }; -/** The value of ut_mem_block_struct::magic_n. Used in detecting +/** The value of ut_mem_block_t::magic_n. Used in detecting memory corruption. 
*/ #define UT_MEM_MAGIC_N 1601650166 diff --git a/storage/innobase/ut/ut0rbt.cc b/storage/innobase/ut/ut0rbt.cc index b21543a679d..e93844af600 100644 --- a/storage/innobase/ut/ut0rbt.cc +++ b/storage/innobase/ut/ut0rbt.cc @@ -773,7 +773,7 @@ rbt_create_arg_cmp( size_t sizeof_value, /*!< in: sizeof data item */ ib_rbt_arg_compare compare, /*!< in: fn to compare items */ - const void* cmp_arg) /*!< in: compare fn arg */ + void* cmp_arg) /*!< in: compare fn arg */ { ib_rbt_t* tree; diff --git a/storage/innobase/ut/ut0ut.cc b/storage/innobase/ut/ut0ut.cc index 2268cfd2493..3c94d96c3ac 100644 --- a/storage/innobase/ut/ut0ut.cc +++ b/storage/innobase/ut/ut0ut.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -28,6 +28,7 @@ Created 5/11/1994 Heikki Tuuri #ifndef UNIV_INNOCHECKSUM #include "ut0sort.h" +#include "os0thread.h" /* thread-ID */ #ifdef UNIV_NONINL #include "ut0ut.ic" @@ -218,18 +219,25 @@ ut_print_timestamp( /*===============*/ FILE* file) /*!< in: file where to print */ { + ulint thread_id = 0; + +#ifndef UNIV_INNOCHECKSUM + thread_id = os_thread_pf(os_thread_get_curr_id()); +#endif + #ifdef __WIN__ SYSTEMTIME cal_tm; GetLocalTime(&cal_tm); - fprintf(file,"%02d%02d%02d %2d:%02d:%02d", - (int) cal_tm.wYear % 100, + fprintf(file, "%d-%02d-%02d %02d:%02d:%02d %lx", + (int) cal_tm.wYear, (int) cal_tm.wMonth, (int) cal_tm.wDay, (int) cal_tm.wHour, (int) cal_tm.wMinute, - (int) cal_tm.wSecond); + (int) cal_tm.wSecond, + thread_id); #else struct tm* cal_tm_ptr; time_t tm; @@ -243,13 +251,14 @@ ut_print_timestamp( time(&tm); cal_tm_ptr = localtime(&tm); #endif - fprintf(file,"%02d%02d%02d %2d:%02d:%02d", - 
cal_tm_ptr->tm_year % 100, + fprintf(file, "%d-%02d-%02d %02d:%02d:%02d %lx", + cal_tm_ptr->tm_year + 1900, cal_tm_ptr->tm_mon + 1, cal_tm_ptr->tm_mday, cal_tm_ptr->tm_hour, cal_tm_ptr->tm_min, - cal_tm_ptr->tm_sec); + cal_tm_ptr->tm_sec, + thread_id); #endif } @@ -515,7 +524,7 @@ void ut_print_name( /*==========*/ FILE* f, /*!< in: output stream */ - trx_t* trx, /*!< in: transaction */ + const trx_t* trx, /*!< in: transaction */ ibool table_id,/*!< in: TRUE=print a table name, FALSE=print other identifier */ const char* name) /*!< in: name to print */ @@ -533,7 +542,7 @@ void ut_print_namel( /*===========*/ FILE* f, /*!< in: output stream */ - trx_t* trx, /*!< in: transaction (NULL=no quotes) */ + const trx_t* trx, /*!< in: transaction (NULL=no quotes) */ ibool table_id,/*!< in: TRUE=print a table name, FALSE=print other identifier */ const char* name, /*!< in: name to print */ @@ -553,6 +562,50 @@ ut_print_namel( } /**********************************************************************//** +Formats a table or index name, quoted as an SQL identifier. If the name +contains a slash '/', the result will contain two identifiers separated by +a period (.), as in SQL database_name.identifier. +@return pointer to 'formatted' */ +UNIV_INTERN +char* +ut_format_name( +/*===========*/ + const char* name, /*!< in: table or index name, must be + '\0'-terminated */ + ibool is_table, /*!< in: if TRUE then 'name' is a table + name */ + char* formatted, /*!< out: formatted result, will be + '\0'-terminated */ + ulint formatted_size) /*!< out: no more than this number of + bytes will be written to 'formatted' */ +{ + switch (formatted_size) { + case 1: + formatted[0] = '\0'; + /* FALL-THROUGH */ + case 0: + return(formatted); + } + + char* end; + + end = innobase_convert_name(formatted, formatted_size, + name, strlen(name), NULL, is_table); + + /* If the space in 'formatted' was completely used, then sacrifice + the last character in order to write '\0' at the end. 
*/ + if ((ulint) (end - formatted) == formatted_size) { + end--; + } + + ut_a((ulint) (end - formatted) < formatted_size); + + *end = '\0'; + + return(formatted); +} + +/**********************************************************************//** Catenate files. */ UNIV_INTERN void @@ -648,7 +701,7 @@ UNIV_INTERN const char* ut_strerr( /*======*/ - enum db_err num) /*!< in: error number */ + dberr_t num) /*!< in: error number */ { switch (num) { case DB_SUCCESS: @@ -703,10 +756,12 @@ ut_strerr( return("Cannot drop constraint"); case DB_NO_SAVEPOINT: return("No such savepoint"); - case DB_TABLESPACE_ALREADY_EXISTS: + case DB_TABLESPACE_EXISTS: return("Tablespace already exists"); case DB_TABLESPACE_DELETED: - return("No such tablespace"); + return("Tablespace deleted or being deleted"); + case DB_TABLESPACE_NOT_FOUND: + return("Tablespace not found"); case DB_LOCK_TABLE_FULL: return("Lock structs have exhausted the buffer pool"); case DB_FOREIGN_DUPLICATE_KEY: @@ -717,8 +772,8 @@ ut_strerr( return("Too many concurrent transactions"); case DB_UNSUPPORTED: return("Unsupported"); - case DB_PRIMARY_KEY_IS_NULL: - return("Primary key is NULL"); + case DB_INVALID_NULL: + return("NULL value encountered in NOT NULL column"); case DB_STATS_DO_NOT_EXIST: return("Persistent statistics do not exist"); case DB_FAIL: @@ -745,6 +800,21 @@ ut_strerr( return("Undo record too big"); case DB_END_OF_INDEX: return("End of index"); + case DB_IO_ERROR: + return("I/O error"); + case DB_TABLE_IN_FK_CHECK: + return("Table is being used in foreign key check"); + case DB_DATA_MISMATCH: + return("data mismatch"); + case DB_SCHEMA_NOT_LOCKED: + return("schema not locked"); + case DB_NOT_FOUND: + return("not found"); + case DB_ONLINE_LOG_TOO_BIG: + return("Log size exceeded during online index creation"); + case DB_DICT_CHANGED: + return("Table dictionary has changed"); + /* do not add default: in order to produce a warning if new code is added to the enum but not added here */ } diff --git 
a/storage/innobase/ut/ut0vec.cc b/storage/innobase/ut/ut0vec.cc index 8ac5d9dc5d3..5842d9f1c0e 100644 --- a/storage/innobase/ut/ut0vec.cc +++ b/storage/innobase/ut/ut0vec.cc @@ -44,12 +44,14 @@ ib_vector_create( ut_a(size > 0); - vec = static_cast<ib_vector_t*>(allocator->mem_malloc(allocator, sizeof(*vec))); + vec = static_cast<ib_vector_t*>( + allocator->mem_malloc(allocator, sizeof(*vec))); vec->used = 0; vec->total = size; vec->allocator = allocator; vec->sizeof_value = sizeof_value; + vec->data = static_cast<void*>( allocator->mem_malloc(allocator, vec->sizeof_value * size)); diff --git a/storage/innobase/ut/ut0wqueue.cc b/storage/innobase/ut/ut0wqueue.cc index 6d410524fe7..d1ba36b3b00 100644 --- a/storage/innobase/ut/ut0wqueue.cc +++ b/storage/innobase/ut/ut0wqueue.cc @@ -40,7 +40,7 @@ ib_wqueue_create(void) mutex_create(PFS_NOT_INSTRUMENTED, &wq->mutex, SYNC_WORK_QUEUE); wq->items = ib_list_create(); - wq->event = os_event_create(NULL); + wq->event = os_event_create(); return(wq); } diff --git a/storage/maria/ha_maria.cc b/storage/maria/ha_maria.cc index 95f37ddb12f..092e1a8a79e 100644 --- a/storage/maria/ha_maria.cc +++ b/storage/maria/ha_maria.cc @@ -512,7 +512,7 @@ static int table2maria(TABLE *table_arg, data_file_type row_type, keydef[i].block_length= pos->block_size; keydef[i].seg= keyseg; keydef[i].keysegs= pos->key_parts; - for (j= 0; j < pos->key_parts; j++) + for (j= 0; j < pos->user_defined_key_parts; j++) { Field *field= pos->key_part[j].field; type= field->key_type(); @@ -574,7 +574,7 @@ static int table2maria(TABLE *table_arg, data_file_type row_type, (uchar*) table_arg->record[0]); } } - keyseg+= pos->key_parts; + keyseg+= pos->user_defined_key_parts; } if (table_arg->found_next_number_field) keydef[share->next_number_index].flag|= HA_AUTO_KEY; @@ -1042,7 +1042,7 @@ ulong ha_maria::index_flags(uint inx, uint part, bool all_parts) const double ha_maria::scan_time() { if (file->s->data_file_type == BLOCK_RECORD) - return 
ulonglong2double(stats.data_file_length - file->s->block_size) / max(file->s->block_size / 2, IO_SIZE) + 2; + return ulonglong2double(stats.data_file_length - file->s->block_size) / MY_MAX(file->s->block_size / 2, IO_SIZE) + 2; return handler::scan_time(); } @@ -2464,18 +2464,18 @@ int ha_maria::info(uint flag) ref_length= maria_info.reflength; share->db_options_in_use= maria_info.options; stats.block_size= maria_block_size; - stats.mrr_length_per_rec= maria_info.reflength + 8; // 8 = max(sizeof(void *)) + stats.mrr_length_per_rec= maria_info.reflength + 8; // 8 = MY_MAX(sizeof(void *)) /* Update share */ share->keys_in_use.set_prefix(share->keys); share->keys_in_use.intersect_extended(maria_info.key_map); share->keys_for_keyread.intersect(share->keys_in_use); share->db_record_offset= maria_info.record_offset; - if (share->key_parts) + if (share->user_defined_key_parts) { ulong *to= table->key_info[0].rec_per_key, *end; double *from= maria_info.rec_per_key; - for (end= to+ share->key_parts ; to < end ; to++, from++) + for (end= to+ share->user_defined_key_parts ; to < end ; to++, from++) *to= (ulong) (*from + 0.5); } diff --git a/storage/maria/ma_bitmap.c b/storage/maria/ma_bitmap.c index e3668d3c8d3..a351447cce3 100644 --- a/storage/maria/ma_bitmap.c +++ b/storage/maria/ma_bitmap.c @@ -319,7 +319,11 @@ my_bool _ma_bitmap_init(MARIA_SHARE *share, File file, my_bool _ma_bitmap_end(MARIA_SHARE *share) { my_bool res; - mysql_mutex_assert_owner(&share->close_lock); + +#ifndef DBUG_OFF + if (! share->internal_table) + mysql_mutex_assert_owner(&share->close_lock); +#endif DBUG_ASSERT(share->bitmap.non_flushable == 0); DBUG_ASSERT(share->bitmap.flush_all_requested == 0); DBUG_ASSERT(share->bitmap.waiting_for_non_flushable == 0 && @@ -1393,7 +1397,7 @@ found: IMPLEMENTATION We will return the smallest area >= size. 
If there is no such block, we will return the biggest area that satisfies - area_size >= min(BLOB_SEGMENT_MIN_SIZE*full_page_size, size) + area_size >= MY_MIN(BLOB_SEGMENT_MIN_SIZE*full_page_size, size) To speed up searches, we will only consider areas that has at least 16 free pages starting on an even boundary. When finding such an area, we will @@ -1501,7 +1505,7 @@ static ulong allocate_full_pages(MARIA_FILE_BITMAP *bitmap, DBUG_RETURN(0); /* No room on page */ /* - Now allocate min(pages_needed, area_size), starting from + Now allocate MY_MIN(pages_needed, area_size), starting from best_start + best_prefix_area_size */ if (best_area_size > pages_needed) diff --git a/storage/maria/ma_blockrec.c b/storage/maria/ma_blockrec.c index 55b9a137050..2fc30b880b4 100644 --- a/storage/maria/ma_blockrec.c +++ b/storage/maria/ma_blockrec.c @@ -1230,7 +1230,7 @@ static my_bool extend_directory(MARIA_HA *info, uchar *buff, uint block_size, } check_directory(buff, block_size, - info ? min(info->s->base.min_block_length, length) : 0, + info ? 
MY_MIN(info->s->base.min_block_length, length) : 0, *empty_space); DBUG_RETURN(0); } @@ -2126,7 +2126,7 @@ static my_bool write_full_pages(MARIA_HA *info, } lsn_store(buff, lsn); buff[PAGE_TYPE_OFFSET]= (uchar) BLOB_PAGE; - copy_length= min(data_size, length); + copy_length= MY_MIN(data_size, length); memcpy(buff + LSN_SIZE + PAGE_TYPE_SIZE, data, copy_length); length-= copy_length; @@ -3504,7 +3504,7 @@ static my_bool allocate_and_write_block_record(MARIA_HA *info, /* page will be pinned & locked by get_head_or_tail_page */ if (get_head_or_tail_page(info, blocks->block, info->buff, - max(row->space_on_head_page, + MY_MAX(row->space_on_head_page, info->s->base.min_block_length), HEAD_PAGE, PAGECACHE_LOCK_WRITE, &row_pos)) @@ -3952,7 +3952,7 @@ static my_bool _ma_update_at_original_place(MARIA_HA *info, */ DBUG_ASSERT(blocks->count > 1 || - max(new_row->total_length, share->base.min_block_length) <= + MY_MAX(new_row->total_length, share->base.min_block_length) <= length_on_head_page); /* Store same amount of data on head page as on original page */ diff --git a/storage/maria/ma_cache.c b/storage/maria/ma_cache.c index 829189baeed..35926d37e03 100644 --- a/storage/maria/ma_cache.c +++ b/storage/maria/ma_cache.c @@ -61,7 +61,7 @@ my_bool _ma_read_cache(MARIA_HA *handler, IO_CACHE *info, uchar *buff, (my_off_t) (info->read_end - info->request_pos)) { in_buff_pos=info->request_pos+(uint) offset; - in_buff_length= min(length,(size_t) (info->read_end-in_buff_pos)); + in_buff_length= MY_MIN(length,(size_t) (info->read_end-in_buff_pos)); memcpy(buff,info->request_pos+(uint) offset,(size_t) in_buff_length); if (!(length-=in_buff_length)) DBUG_RETURN(0); diff --git a/storage/maria/ma_check.c b/storage/maria/ma_check.c index ab9080c40fb..e6907aabe27 100644 --- a/storage/maria/ma_check.c +++ b/storage/maria/ma_check.c @@ -2396,7 +2396,7 @@ static int initialize_variables_for_repair(HA_CHECK *param, else { ulong rec_length; - rec_length= max(share->base.min_pack_length, + 
rec_length= MY_MAX(share->base.min_pack_length, share->base.min_block_length); sort_info->max_records= (ha_rows) (sort_info->filelength / rec_length); } @@ -3600,7 +3600,7 @@ int maria_filecopy(HA_CHECK *param, File to,File from,my_off_t start, ulong buff_length; DBUG_ENTER("maria_filecopy"); - buff_length=(ulong) min(param->write_buffer_length,length); + buff_length=(ulong) MY_MIN(param->write_buffer_length,length); if (!(buff=my_malloc(buff_length,MYF(0)))) { buff=tmp_buff; buff_length=IO_SIZE; @@ -5658,7 +5658,7 @@ word_init_ft_buf: ft_buf->buf=ft_buf->lastkey+a_len; /* 32 is just a safety margin here - (at least max(val_len, sizeof(nod_flag)) should be there). + (at least MY_MAX(val_len, sizeof(nod_flag)) should be there). May be better performance could be achieved if we'd put (sort_info->keyinfo->block_length-32)/XXX instead. @@ -6071,7 +6071,7 @@ int maria_recreate_table(HA_CHECK *param, MARIA_HA **org_info, char *filename) maria_close(*org_info); bzero((char*) &create_info,sizeof(create_info)); - create_info.max_rows=max(max_records,share.base.records); + create_info.max_rows=MY_MAX(max_records,share.base.records); create_info.reloc_rows=share.base.reloc; create_info.old_options=(share.options | (unpack ? 
HA_OPTION_TEMP_COMPRESS_RECORD : 0)); @@ -6494,7 +6494,8 @@ static my_bool create_new_data_handle(MARIA_SORT_PARAM *param, File new_file) DBUG_ENTER("create_new_data_handle"); if (!(sort_info->new_info= maria_open(info->s->open_file_name.str, O_RDWR, - HA_OPEN_COPY | HA_OPEN_FOR_REPAIR))) + HA_OPEN_COPY | HA_OPEN_FOR_REPAIR | + HA_OPEN_INTERNAL_TABLE))) DBUG_RETURN(1); new_info= sort_info->new_info; @@ -6915,7 +6916,7 @@ static TrID max_trid_in_system(void) { TrID id= trnman_get_max_trid(); /* 0 if transac manager not initialized */ /* 'id' may be far bigger, if last shutdown is old */ - return max(id, max_trid_in_control_file); + return MY_MAX(id, max_trid_in_control_file); } diff --git a/storage/maria/ma_checkpoint.c b/storage/maria/ma_checkpoint.c index 304216a76d9..51494300172 100644 --- a/storage/maria/ma_checkpoint.c +++ b/storage/maria/ma_checkpoint.c @@ -563,7 +563,7 @@ pthread_handler_t ma_checkpoint_background(void *arg) DBUG_ASSERT(interval > 0); #ifdef HAVE_PSI_THREAD_INTERFACE - PSI_CALL(set_thread_user_host)(0,0,0,0); + PSI_THREAD_CALL(set_thread_user_host)(0,0,0,0); #endif /* @@ -861,11 +861,11 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon) my_malloc(STATE_COPIES * sizeof(struct st_state_copy), MYF(MY_WME)); dfiles= (PAGECACHE_FILE *)my_realloc((uchar *)dfiles, /* avoid size of 0 for my_realloc */ - max(1, nb) * sizeof(PAGECACHE_FILE), + MY_MAX(1, nb) * sizeof(PAGECACHE_FILE), MYF(MY_WME | MY_ALLOW_ZERO_PTR)); kfiles= (PAGECACHE_FILE *)my_realloc((uchar *)kfiles, /* avoid size of 0 for my_realloc */ - max(1, nb) * sizeof(PAGECACHE_FILE), + MY_MAX(1, nb) * sizeof(PAGECACHE_FILE), MYF(MY_WME | MY_ALLOW_ZERO_PTR)); if (unlikely((state_copies == NULL) || (dfiles == NULL) || (kfiles == NULL))) @@ -898,7 +898,7 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon) Collect and cache a bunch of states. We do this for many states at a time, to not lock/unlock the log's lock too often. 
*/ - uint j, bound= min(nb, i + STATE_COPIES); + uint j, bound= MY_MIN(nb, i + STATE_COPIES); state_copy= state_copies; /* part of the state is protected by log's lock */ translog_lock(); diff --git a/storage/maria/ma_close.c b/storage/maria/ma_close.c index c355f1f1def..dd3a034425a 100644 --- a/storage/maria/ma_close.c +++ b/storage/maria/ma_close.c @@ -27,6 +27,7 @@ int maria_close(register MARIA_HA *info) int error=0,flag; my_bool share_can_be_freed= FALSE; MARIA_SHARE *share= info->s; + my_bool internal_table= share->internal_table; DBUG_ENTER("maria_close"); DBUG_PRINT("enter",("name: '%s' base: 0x%lx reopen: %u locks: %u", share->open_file_name.str, @@ -49,9 +50,9 @@ int maria_close(register MARIA_HA *info) error= my_errno; } - /* Ensure no one can open this file while we are closing it */ - mysql_mutex_lock(&THR_LOCK_maria); + if (!internal_table) + mysql_mutex_lock(&THR_LOCK_maria); if (info->lock_type == F_EXTRA_LCK) info->lock_type=F_UNLCK; /* HA_EXTRA_NO_USER_CHANGE */ @@ -60,8 +61,11 @@ int maria_close(register MARIA_HA *info) if (maria_lock_database(info,F_UNLCK)) error=my_errno; } - mysql_mutex_lock(&share->close_lock); - mysql_mutex_lock(&share->intern_lock); + if (!internal_table) + { + mysql_mutex_lock(&share->close_lock); + mysql_mutex_lock(&share->intern_lock); + } if (share->options & HA_OPTION_READ_ONLY_DATA) { @@ -75,7 +79,8 @@ int maria_close(register MARIA_HA *info) info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); } flag= !--share->reopen; - maria_open_list=list_delete(maria_open_list,&info->open_list); + if (!internal_table) + maria_open_list=list_delete(maria_open_list,&info->open_list); my_free(info->rec_buff); (*share->end)(info); @@ -159,7 +164,8 @@ int maria_close(register MARIA_HA *info) error= my_errno; } thr_lock_delete(&share->lock); - (void) mysql_mutex_destroy(&share->key_del_lock); + mysql_mutex_destroy(&share->key_del_lock); + { int i,keys; keys = share->state.header.keys; @@ -181,9 +187,11 @@ int maria_close(register 
MARIA_HA *info) We have to unlock share->intern_lock then lock it after LOCK_trn_list (trnman_lock()) to avoid dead locks. */ - mysql_mutex_unlock(&share->intern_lock); + if (!internal_table) + mysql_mutex_unlock(&share->intern_lock); _ma_remove_not_visible_states_with_lock(share, TRUE); - mysql_mutex_lock(&share->intern_lock); + if (!internal_table) + mysql_mutex_lock(&share->intern_lock); if (share->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME) { @@ -220,9 +228,12 @@ int maria_close(register MARIA_HA *info) share->state_history= 0; } } - mysql_mutex_unlock(&THR_LOCK_maria); - mysql_mutex_unlock(&share->intern_lock); - mysql_mutex_unlock(&share->close_lock); + if (!internal_table) + { + mysql_mutex_unlock(&THR_LOCK_maria); + mysql_mutex_unlock(&share->intern_lock); + mysql_mutex_unlock(&share->close_lock); + } if (share_can_be_freed) { (void) mysql_mutex_destroy(&share->intern_lock); diff --git a/storage/maria/ma_commit.c b/storage/maria/ma_commit.c index 70bc668a220..46db3ca4ae5 100644 --- a/storage/maria/ma_commit.c +++ b/storage/maria/ma_commit.c @@ -39,11 +39,11 @@ int ma_commit(TRN *trn) /* - if COMMIT record is written before trnman_commit_trn(): if Checkpoint comes in the middle it will see trn is not committed, - then if crash, Recovery might roll back trn (if min(rec_lsn) is after + then if crash, Recovery might roll back trn (if MY_MIN(rec_lsn) is after COMMIT record) and this is not an issue as * transaction's updates were not made visible to other transactions * "commit ok" was not sent to client - Alternatively, Recovery might commit trn (if min(rec_lsn) is before COMMIT + Alternatively, Recovery might commit trn (if MY_MIN(rec_lsn) is before COMMIT record), which is ok too. All in all it means that "trn committed" is not 100% equal to "COMMIT record written". 
- if COMMIT record is written after trnman_commit_trn(): diff --git a/storage/maria/ma_create.c b/storage/maria/ma_create.c index 28c3491730f..152302a5426 100644 --- a/storage/maria/ma_create.c +++ b/storage/maria/ma_create.c @@ -51,6 +51,7 @@ int maria_create(const char *name, enum data_file_type datafile_type, base_pos,long_varchar_count,varchar_length, unique_key_parts,fulltext_keys,offset, not_block_record_extra_length; uint max_field_lengths, extra_header_size, column_nr; + uint internal_table= flags & HA_CREATE_INTERNAL_TABLE; ulong reclength, real_reclength,min_pack_length; char filename[FN_REFLEN], linkname[FN_REFLEN], *linkname_ptr; ulong pack_reclength; @@ -713,7 +714,7 @@ int maria_create(const char *name, enum data_file_type datafile_type, got from MAI file header (see also mariapack.c:save_state) */ share.base.key_reflength= - maria_get_pointer_length(max(ci->key_file_length,tmp),3); + maria_get_pointer_length(MY_MAX(ci->key_file_length,tmp),3); share.base.keys= share.state.header.keys= keys; share.state.header.uniques= uniques; share.state.header.fulltext_keys= fulltext_keys; @@ -780,7 +781,7 @@ int maria_create(const char *name, enum data_file_type datafile_type, share.base.min_block_length= (share.base.pack_reclength+3 < MARIA_EXTEND_BLOCK_LENGTH && ! share.base.blobs) ? - max(share.base.pack_reclength,MARIA_MIN_BLOCK_LENGTH) : + MY_MAX(share.base.pack_reclength,MARIA_MIN_BLOCK_LENGTH) : MARIA_EXTEND_BLOCK_LENGTH; } else if (datafile_type == STATIC_RECORD) @@ -789,7 +790,8 @@ int maria_create(const char *name, enum data_file_type datafile_type, if (! (flags & HA_DONT_TOUCH_DATA)) share.state.create_time= time((time_t*) 0); - mysql_mutex_lock(&THR_LOCK_maria); + if (!internal_table) + mysql_mutex_lock(&THR_LOCK_maria); /* NOTE: For test_if_reopen() we need a real path name. 
Hence we need @@ -854,7 +856,7 @@ int maria_create(const char *name, enum data_file_type datafile_type, NOTE: The filename is compared against unique_file_name of every open table. Hence we need a real path here. */ - if (_ma_test_if_reopen(filename)) + if (!internal_table && _ma_test_if_reopen(filename)) { my_printf_error(HA_ERR_TABLE_EXIST, "Aria table '%s' is in use " "(most likely by a MERGE table). Try FLUSH TABLES.", @@ -1171,7 +1173,8 @@ int maria_create(const char *name, enum data_file_type datafile_type, if (mysql_file_close(dfile,MYF(0))) goto err; } - mysql_mutex_unlock(&THR_LOCK_maria); + if (!internal_table) + mysql_mutex_unlock(&THR_LOCK_maria); res= 0; my_free((char*) rec_per_key_part); errpos=0; @@ -1180,7 +1183,8 @@ int maria_create(const char *name, enum data_file_type datafile_type, DBUG_RETURN(res); err: - mysql_mutex_unlock(&THR_LOCK_maria); + if (!internal_table) + mysql_mutex_unlock(&THR_LOCK_maria); err_no_lock: save_errno=my_errno; diff --git a/storage/maria/ma_delete.c b/storage/maria/ma_delete.c index 5b8d0e01677..50edb216a1c 100644 --- a/storage/maria/ma_delete.c +++ b/storage/maria/ma_delete.c @@ -987,7 +987,7 @@ static int underflow(MARIA_HA *info, MARIA_KEYDEF *keyinfo, */ if (_ma_log_add(anc_page, anc_length, keypos, anc_key_inserted.move_length + - max(anc_key_inserted.changed_length - + MY_MAX(anc_key_inserted.changed_length - anc_key_inserted.move_length, key_deleted.changed_length), anc_key_inserted.move_length - @@ -1229,7 +1229,7 @@ static int underflow(MARIA_HA *info, MARIA_KEYDEF *keyinfo, */ if (_ma_log_add(anc_page, anc_length, keypos, anc_key_inserted.move_length + - max(anc_key_inserted.changed_length - + MY_MAX(anc_key_inserted.changed_length - anc_key_inserted.move_length, key_deleted.changed_length), anc_key_inserted.move_length - @@ -1570,7 +1570,7 @@ my_bool _ma_log_delete(MARIA_PAGE *ma_page, const uchar *key_pos, current_size != share->max_index_block_size) { /* Append data that didn't fit on the page before */ - 
uint length= (min(ma_page->size, share->max_index_block_size) - + uint length= (MY_MIN(ma_page->size, share->max_index_block_size) - current_size); uchar *data= ma_page->buff + current_size; diff --git a/storage/maria/ma_dynrec.c b/storage/maria/ma_dynrec.c index c1c0a8e9729..4bb51d0dcf3 100644 --- a/storage/maria/ma_dynrec.c +++ b/storage/maria/ma_dynrec.c @@ -851,7 +851,7 @@ static my_bool update_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS filepos, uint tmp=MY_ALIGN(reclength - length + 3 + test(reclength >= 65520L),MARIA_DYN_ALIGN_SIZE); /* Don't create a block bigger than MARIA_MAX_BLOCK_LENGTH */ - tmp= min(length+tmp, MARIA_MAX_BLOCK_LENGTH)-length; + tmp= MY_MIN(length+tmp, MARIA_MAX_BLOCK_LENGTH)-length; /* Check if we can extend this block */ if (block_info.filepos + block_info.block_len == info->state->data_file_length && diff --git a/storage/maria/ma_extra.c b/storage/maria/ma_extra.c index 0847f3c729c..66e7b4033c7 100644 --- a/storage/maria/ma_extra.c +++ b/storage/maria/ma_extra.c @@ -105,7 +105,7 @@ int maria_extra(MARIA_HA *info, enum ha_extra_function function, cache_size= (extra_arg ? *(ulong*) extra_arg : my_default_record_cache_size); if (!(init_io_cache(&info->rec_cache, info->dfile.file, - (uint) min(share->state.state.data_file_length+1, + (uint) MY_MIN(share->state.state.data_file_length+1, cache_size), READ_CACHE,0L,(pbool) (info->lock_type != F_UNLCK), MYF(share->write_flag & MY_WAIT_IF_FULL)))) diff --git a/storage/maria/ma_ft_boolean_search.c b/storage/maria/ma_ft_boolean_search.c index c98c4b599fc..eb5813f84f1 100644 --- a/storage/maria/ma_ft_boolean_search.c +++ b/storage/maria/ma_ft_boolean_search.c @@ -46,9 +46,9 @@ three subexpressions (including the top-level one), every one has its own max_docid, updated by its plus word. 
but for the search word6 uses - max(word1.max_docid, word3.max_docid, word5.max_docid), + MY_MAX(word1.max_docid, word3.max_docid, word5.max_docid), while word4 uses, accordingly, - max(word1.max_docid, word3.max_docid). + MY_MAX(word1.max_docid, word3.max_docid). */ #define FT_CORE diff --git a/storage/maria/ma_info.c b/storage/maria/ma_info.c index 341ea147785..912ed0984a3 100644 --- a/storage/maria/ma_info.c +++ b/storage/maria/ma_info.c @@ -31,7 +31,7 @@ MARIA_RECORD_POS maria_position(MARIA_HA *info) uint maria_max_key_length() { uint tmp= (_ma_max_key_length() - 8 - HA_MAX_KEY_SEG*3); - return min(HA_MAX_KEY_LENGTH, tmp); + return MY_MIN(HA_MAX_KEY_LENGTH, tmp); } /* Get information about the table */ diff --git a/storage/maria/ma_key_recover.c b/storage/maria/ma_key_recover.c index 502ac2b8809..ae9427981ea 100644 --- a/storage/maria/ma_key_recover.c +++ b/storage/maria/ma_key_recover.c @@ -506,7 +506,7 @@ my_bool _ma_log_add(MARIA_PAGE *ma_page, move_length)); DBUG_ASSERT(info->s->now_transactional); DBUG_ASSERT(move_length <= (int) changed_length); - DBUG_ASSERT(ma_page->org_size == min(org_page_length, max_page_size)); + DBUG_ASSERT(ma_page->org_size == MY_MIN(org_page_length, max_page_size)); DBUG_ASSERT(ma_page->size == org_page_length + move_length); DBUG_ASSERT(offset <= ma_page->org_size); @@ -618,7 +618,7 @@ my_bool _ma_log_add(MARIA_PAGE *ma_page, DBUG_ASSERT(current_size <= max_page_size && current_size <= ma_page->size); if (current_size != ma_page->size && current_size != max_page_size) { - uint length= min(ma_page->size, max_page_size) - current_size; + uint length= MY_MIN(ma_page->size, max_page_size) - current_size; uchar *data= ma_page->buff + current_size; log_pos[0]= KEY_OP_ADD_SUFFIX; @@ -641,7 +641,7 @@ my_bool _ma_log_add(MARIA_PAGE *ma_page, overflow! 
*/ ma_page->org_size= current_size; - DBUG_ASSERT(ma_page->org_size == min(ma_page->size, max_page_size)); + DBUG_ASSERT(ma_page->org_size == MY_MIN(ma_page->size, max_page_size)); if (translog_write_record(&lsn, LOGREC_REDO_INDEX, info->trn, info, @@ -663,7 +663,7 @@ void _ma_log_key_changes(MARIA_PAGE *ma_page, LEX_CUSTRING *log_array, uint *translog_parts) { MARIA_SHARE *share= ma_page->info->s; - int page_length= min(ma_page->size, share->max_index_block_size); + int page_length= MY_MIN(ma_page->size, share->max_index_block_size); uint org_length; ha_checksum crc; @@ -1111,7 +1111,7 @@ uint _ma_apply_redo_index(MARIA_HA *info, uint2korr(header), uint2korr(header+2))); DBUG_ASSERT(uint2korr(header) == page_length); #ifndef DBUG_OFF - new_page_length= min(uint2korr(header+2), max_page_size); + new_page_length= MY_MIN(uint2korr(header+2), max_page_size); #endif header+= 4; break; @@ -1148,7 +1148,7 @@ uint _ma_apply_redo_index(MARIA_HA *info, from= uint2korr(header); header+= 2; /* "from" is a place in the existing page */ - DBUG_ASSERT(max(from, to) < max_page_size); + DBUG_ASSERT(MY_MAX(from, to) < max_page_size); memcpy(buff + to, buff + from, full_length); } break; diff --git a/storage/maria/ma_loghandler.c b/storage/maria/ma_loghandler.c index 56926c048d8..2a2681c0844 100644 --- a/storage/maria/ma_loghandler.c +++ b/storage/maria/ma_loghandler.c @@ -4808,7 +4808,7 @@ static my_bool translog_advance_pointer(int pages, uint16 last_page_data) } #endif - min_offset= min(buffer_end_offset, file_end_offset); + min_offset= MY_MIN(buffer_end_offset, file_end_offset); /* TODO: check is it ptr or size enough */ log_descriptor.bc.buffer->size+= min_offset; log_descriptor.bc.ptr+= min_offset; @@ -6833,7 +6833,7 @@ translog_variable_length_header(uchar *page, translog_size_t page_offset, page_rest= (uint16) (TRANSLOG_PAGE_SIZE - (src - page)); base_lsn= buff->lsn; - body_len= min(page_rest, buff->record_length); + body_len= MY_MIN(page_rest, buff->record_length); } else { 
@@ -7396,7 +7396,7 @@ translog_size_t translog_read_record(LSN lsn, data->scanner.fixed_horizon)); if (offset < data->read_header) { - uint16 len= min(data->read_header, end) - offset; + uint16 len= MY_MIN(data->read_header, end) - offset; DBUG_PRINT("info", ("enter header offset: %lu length: %lu", (ulong) offset, (ulong) length)); diff --git a/storage/maria/ma_open.c b/storage/maria/ma_open.c index 88422e3dc5f..0543f426af3 100644 --- a/storage/maria/ma_open.c +++ b/storage/maria/ma_open.c @@ -78,6 +78,7 @@ MARIA_HA *_ma_test_if_reopen(const char *filename) mode Mode of table (O_RDONLY | O_RDWR) data_file Filedescriptor of data file to use < 0 if one should open open it. + internal_table <> 0 if this is an internal temporary table RETURN # Maria handler @@ -86,7 +87,8 @@ MARIA_HA *_ma_test_if_reopen(const char *filename) static MARIA_HA *maria_clone_internal(MARIA_SHARE *share, const char *name, - int mode, File data_file) + int mode, File data_file, + uint internal_table) { int save_errno; uint errpos; @@ -159,7 +161,7 @@ static MARIA_HA *maria_clone_internal(MARIA_SHARE *share, const char *name, /* The following should be big enough for all pinning purposes */ if (my_init_dynamic_array(&info.pinned_pages, sizeof(MARIA_PINNED_PAGE), - max(share->base.blobs*2 + 4, + MY_MAX(share->base.blobs*2 + 4, MARIA_MAX_TREE_LEVELS*3), 16, MYF(0))) goto err; @@ -207,9 +209,17 @@ static MARIA_HA *maria_clone_internal(MARIA_SHARE *share, const char *name, if (share->options & HA_OPTION_TMP_TABLE) m_info->lock.type= TL_WRITE; - m_info->open_list.data=(void*) m_info; - maria_open_list=list_add(maria_open_list,&m_info->open_list); - + if (!internal_table) + { + m_info->open_list.data=(void*) m_info; + maria_open_list=list_add(maria_open_list,&m_info->open_list); + } + else + { + /* We don't need to mark internal temporary tables as changed on disk */ + share->internal_table= 1; + share->global_changed= 1; + } DBUG_RETURN(m_info); err: @@ -243,7 +253,7 @@ MARIA_HA 
*maria_clone(MARIA_SHARE *share, int mode) mysql_mutex_lock(&THR_LOCK_maria); new_info= maria_clone_internal(share, NullS, mode, share->data_file_type == BLOCK_RECORD ? - share->bitmap.file.file : -1); + share->bitmap.file.file : -1, 0); mysql_mutex_unlock(&THR_LOCK_maria); return new_info; } @@ -263,6 +273,7 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags) int kfile,open_mode,save_errno; uint i,j,len,errpos,head_length,base_pos,keys, realpath_err, key_parts,unique_key_parts,fulltext_keys,uniques; + uint internal_table= test(open_flags & HA_OPEN_INTERNAL_TABLE); size_t info_length; char name_buff[FN_REFLEN], org_name[FN_REFLEN], index_name[FN_REFLEN], data_name[FN_REFLEN]; @@ -293,10 +304,11 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags) DBUG_RETURN(0); } - mysql_mutex_lock(&THR_LOCK_maria); old_info= 0; + if (!internal_table) + mysql_mutex_lock(&THR_LOCK_maria); if ((open_flags & HA_OPEN_COPY) || - !(old_info=_ma_test_if_reopen(name_buff))) + (internal_table || !(old_info=_ma_test_if_reopen(name_buff)))) { share= &share_buff; bzero((uchar*) &share_buff,sizeof(share_buff)); @@ -592,7 +604,7 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags) { /* Packed key, ensure we don't get overflow in underflow() */ keyinfo->underflow_block_length= - max((int) (share->max_index_block_size - keyinfo->maxlength * 3), + MY_MAX((int) (share->max_index_block_size - keyinfo->maxlength * 3), (int) (share->keypage_header + share->base.key_reflength)); set_if_smaller(keyinfo->underflow_block_length, keyinfo->block_length/3); @@ -780,7 +792,7 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags) /* Need some extra bytes for decode_bytes */ share->base.extra_rec_buff_size+= 7; } - share->base.default_rec_buff_size= max(share->base.pack_reclength + + share->base.default_rec_buff_size= MY_MAX(share->base.pack_reclength + share->base.extra_rec_buff_size, share->base.max_key_length); @@ -981,14 +993,16 @@ MARIA_HA 
*maria_open(const char *name, int mode, uint open_flags) data_file= share->bitmap.file.file; /* Only opened once */ } - if (!(m_info= maria_clone_internal(share, name, mode, data_file))) + if (!(m_info= maria_clone_internal(share, name, mode, data_file, + internal_table))) goto err; if (maria_is_crashed(m_info)) DBUG_PRINT("warning", ("table is crashed: changed: %u", share->state.changed)); - mysql_mutex_unlock(&THR_LOCK_maria); + if (!internal_table) + mysql_mutex_unlock(&THR_LOCK_maria); m_info->open_flags= open_flags; DBUG_PRINT("exit", ("table: %p name: %s",m_info, name)); @@ -1027,7 +1041,8 @@ err: default: break; } - mysql_mutex_unlock(&THR_LOCK_maria); + if (!internal_table) + mysql_mutex_unlock(&THR_LOCK_maria); my_errno= save_errno; DBUG_RETURN (NULL); } /* maria_open */ diff --git a/storage/maria/ma_packrec.c b/storage/maria/ma_packrec.c index c14e69414b5..9b06c0d4f78 100644 --- a/storage/maria/ma_packrec.c +++ b/storage/maria/ma_packrec.c @@ -718,7 +718,7 @@ static uint find_longest_bitstream(uint16 *table, uint16 *end) return OFFSET_TABLE_SIZE; } length2= find_longest_bitstream(next, end) + 1; - length=max(length,length2); + length=MY_MAX(length,length2); } return length; } @@ -1447,7 +1447,7 @@ uint _ma_pack_get_block_info(MARIA_HA *maria, MARIA_BIT_BUFF *bit_buff, info->filepos=filepos+head_length; if (file > 0) { - info->offset=min(info->rec_len, ref_length - head_length); + info->offset=MY_MIN(info->rec_len, ref_length - head_length); memcpy(*rec_buff_p, header + head_length, info->offset); } return 0; diff --git a/storage/maria/ma_recovery.c b/storage/maria/ma_recovery.c index 9ac42f885b5..c896f730d3f 100644 --- a/storage/maria/ma_recovery.c +++ b/storage/maria/ma_recovery.c @@ -3679,7 +3679,7 @@ static void print_redo_phase_progress(TRANSLOG_ADDRESS addr) cur_offset= LSN_OFFSET(addr); local_remainder= (cur_logno == end_logno) ? 
(end_offset - cur_offset) : (((longlong)log_file_size) - cur_offset + - max(end_logno - cur_logno - 1, 0) * ((longlong)log_file_size) + + MY_MAX(end_logno - cur_logno - 1, 0) * ((longlong)log_file_size) + end_offset); if (initial_remainder == (ulonglong)(-1)) initial_remainder= local_remainder; diff --git a/storage/maria/ma_rt_mbr.c b/storage/maria/ma_rt_mbr.c index b3e2b0ceab8..496ace2a84f 100644 --- a/storage/maria/ma_rt_mbr.c +++ b/storage/maria/ma_rt_mbr.c @@ -329,8 +329,8 @@ int maria_rtree_d_mbr(const HA_KEYSEG *keyseg, const uchar *a, bmin= korr_func(b); \ amax= korr_func(a+len); \ bmax= korr_func(b+len); \ - amin= min(amin, bmin); \ - amax= max(amax, bmax); \ + amin= MY_MIN(amin, bmin); \ + amax= MY_MAX(amax, bmax); \ store_func(c, amin); \ store_func(c+len, amax); \ } @@ -342,8 +342,8 @@ int maria_rtree_d_mbr(const HA_KEYSEG *keyseg, const uchar *a, get_func(bmin, b); \ get_func(amax, a+len); \ get_func(bmax, b+len); \ - amin= min(amin, bmin); \ - amax= max(amax, bmax); \ + amin= MY_MIN(amin, bmin); \ + amax= MY_MAX(amax, bmax); \ store_func(c, amin); \ store_func(c+len, amax); \ } @@ -422,8 +422,8 @@ int maria_rtree_combine_rect(const HA_KEYSEG *keyseg, const uchar* a, bmin= korr_func(b); \ amax= korr_func(a+len); \ bmax= korr_func(b+len); \ - amin= max(amin, bmin); \ - amax= min(amax, bmax); \ + amin= MY_MAX(amin, bmin); \ + amax= MY_MIN(amax, bmax); \ if (amin >= amax) \ return 0; \ res *= amax - amin; \ @@ -436,8 +436,8 @@ int maria_rtree_combine_rect(const HA_KEYSEG *keyseg, const uchar* a, get_func(bmin, b); \ get_func(amax, a+len); \ get_func(bmax, b+len); \ - amin= max(amin, bmin); \ - amax= min(amax, bmax); \ + amin= MY_MAX(amin, bmin); \ + amax= MY_MIN(amax, bmax); \ if (amin >= amax) \ return 0; \ res *= amax - amin; \ @@ -513,7 +513,7 @@ double maria_rtree_overlapping_area(HA_KEYSEG *keyseg, uchar* a, uchar* b, amax= korr_func(a+len); \ bmax= korr_func(b+len); \ a_area *= (((double)amax) - ((double)amin)); \ - loc_ab_area *= ((double)max(amax, 
bmax) - (double)min(amin, bmin)); \ + loc_ab_area *= ((double)MY_MAX(amax, bmax) - (double)MY_MIN(amin, bmin)); \ } #define RT_AREA_INC_GET(type, get_func, len)\ @@ -524,7 +524,7 @@ double maria_rtree_overlapping_area(HA_KEYSEG *keyseg, uchar* a, uchar* b, get_func(amax, a+len); \ get_func(bmax, b+len); \ a_area *= (((double)amax) - ((double)amin)); \ - loc_ab_area *= ((double)max(amax, bmax) - (double)min(amin, bmin)); \ + loc_ab_area *= ((double)MY_MAX(amax, bmax) - (double)MY_MIN(amin, bmin)); \ } /* @@ -612,7 +612,7 @@ safe_end: amax= korr_func(a+len); \ bmax= korr_func(b+len); \ a_perim+= (((double)amax) - ((double)amin)); \ - *ab_perim+= ((double)max(amax, bmax) - (double)min(amin, bmin)); \ + *ab_perim+= ((double)MY_MAX(amax, bmax) - (double)MY_MIN(amin, bmin)); \ } #define RT_PERIM_INC_GET(type, get_func, len)\ @@ -623,7 +623,7 @@ safe_end: get_func(amax, a+len); \ get_func(bmax, b+len); \ a_perim+= (((double)amax) - ((double)amin)); \ - *ab_perim+= ((double)max(amax, bmax) - (double)min(amin, bmin)); \ + *ab_perim+= ((double)MY_MAX(amax, bmax) - (double)MY_MIN(amin, bmin)); \ } /* diff --git a/storage/maria/ma_sort.c b/storage/maria/ma_sort.c index 4bc179c3008..72f9c7ceca3 100644 --- a/storage/maria/ma_sort.c +++ b/storage/maria/ma_sort.c @@ -133,7 +133,7 @@ int _ma_create_index_by_sort(MARIA_SORT_PARAM *info, my_bool no_messages, sort_keys= (uchar **) NULL; error= 1; maxbuffer=1; - memavl=max(sortbuff_size,MIN_SORT_MEMORY); + memavl=MY_MAX(sortbuff_size,MIN_SORT_MEMORY); records= info->sort_info->max_records; sort_length= info->key_length; LINT_INIT(keys); @@ -364,7 +364,7 @@ pthread_handler_t _ma_thr_find_all_keys(void *arg) bzero((char*) &sort_param->buffpek,sizeof(sort_param->buffpek)); bzero((char*) &sort_param->unique, sizeof(sort_param->unique)); - memavl= max(sort_param->sortbuff_size, MIN_SORT_MEMORY); + memavl= MY_MAX(sort_param->sortbuff_size, MIN_SORT_MEMORY); idx= (uint)sort_param->sort_info->max_records; sort_length= sort_param->key_length; 
maxbuffer= 1; @@ -857,7 +857,7 @@ static uint read_to_buffer(IO_CACHE *fromfile, BUFFPEK *buffpek, register uint count; uint length; - if ((count=(uint) min((ha_rows) buffpek->max_keys,buffpek->count))) + if ((count=(uint) MY_MIN((ha_rows) buffpek->max_keys,buffpek->count))) { if (mysql_file_pread(fromfile->file, buffpek->base, (length= sort_length*count),buffpek->file_pos,MYF_RW)) @@ -877,7 +877,7 @@ static uint read_to_buffer_varlen(IO_CACHE *fromfile, BUFFPEK *buffpek, uint idx; uchar *buffp; - if ((count=(uint) min((ha_rows) buffpek->max_keys,buffpek->count))) + if ((count=(uint) MY_MIN((ha_rows) buffpek->max_keys,buffpek->count))) { buffp= buffpek->base; diff --git a/storage/maria/ma_test1.c b/storage/maria/ma_test1.c index 945654a0bbe..0147d00d6e0 100644 --- a/storage/maria/ma_test1.c +++ b/storage/maria/ma_test1.c @@ -631,7 +631,7 @@ static void create_record(uchar *record,uint rownr) uint tmp; uchar *ptr;; sprintf((char*) blob_record,"... row: %d", rownr); - strappend((char*) blob_record,max(MAX_REC_LENGTH-rownr,10),' '); + strappend((char*) blob_record,MY_MAX(MAX_REC_LENGTH-rownr,10),' '); tmp=strlen((char*) blob_record); int4store(pos,tmp); ptr=blob_record; diff --git a/storage/maria/ma_test2.c b/storage/maria/ma_test2.c index ea1978b4ee5..242e5e16333 100644 --- a/storage/maria/ma_test2.c +++ b/storage/maria/ma_test2.c @@ -698,7 +698,7 @@ int main(int argc, char *argv[]) goto err2; } - for (i=min(2,keys) ; i-- > 0 ;) + for (i=MY_MIN(2,keys) ; i-- > 0 ;) { if (maria_rsame(file,read_record2,(int) i)) goto err; if (bcmp(read_record,read_record2,reclength) != 0) diff --git a/storage/maria/ma_write.c b/storage/maria/ma_write.c index 5d440a40dc0..24a3f96f42e 100644 --- a/storage/maria/ma_write.c +++ b/storage/maria/ma_write.c @@ -933,7 +933,7 @@ ChangeSet@1.2562, 2008-04-09 07:41:40+02:00, serg@janus.mylan +9 -0 &s_temp)); } DBUG_RETURN(_ma_split_page(info, key, anc_page, - min(org_anc_length, + MY_MIN(org_anc_length, info->s->max_index_block_size), key_pos, 
s_temp.changed_length, t_length, key_buff, insert_last)); @@ -2075,7 +2075,7 @@ static my_bool _ma_log_split(MARIA_PAGE *ma_page, Handle case when split happened directly after the newly inserted key. */ max_key_length= new_length - offset; - extra_length= min(key_length, max_key_length); + extra_length= MY_MIN(key_length, max_key_length); if (offset + move_length > new_length) { /* This is true when move_length includes changes for next packed key */ diff --git a/storage/maria/maria_def.h b/storage/maria/maria_def.h index e983f561bbb..c1ac49a6b35 100644 --- a/storage/maria/maria_def.h +++ b/storage/maria/maria_def.h @@ -465,6 +465,7 @@ typedef struct st_maria_share my_bool changed, /* If changed since lock */ global_changed, /* If changed since open */ not_flushed; + my_bool internal_table; /* Internal tmp table */ my_bool lock_key_trees; /* If we have to lock trees on read */ my_bool non_transactional_concurrent_insert; my_bool delay_key_write; diff --git a/storage/maria/maria_pack.c b/storage/maria/maria_pack.c index 40686995378..2fe5e818db9 100644 --- a/storage/maria/maria_pack.c +++ b/storage/maria/maria_pack.c @@ -1243,7 +1243,7 @@ static void check_counts(HUFF_COUNTS *huff_counts, uint trees, { if (huff_counts->field_length > 2 && huff_counts->empty_fields + (records - huff_counts->empty_fields)* - (1+max_bit(max(huff_counts->max_pre_space, + (1+max_bit(MY_MAX(huff_counts->max_pre_space, huff_counts->max_end_space))) < records * max_bit(huff_counts->field_length)) { @@ -3021,7 +3021,7 @@ static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length, if (mrg->src_file_has_indexes_disabled) { isam_file->s->state.state.key_file_length= - max(isam_file->s->state.state.key_file_length, new_length); + MY_MAX(isam_file->s->state.state.key_file_length, new_length); } state.dellink= HA_OFFSET_ERROR; state.version=(ulong) time((time_t*) 0); diff --git a/storage/maria/trnman.c b/storage/maria/trnman.c index f8959c977f8..38fdb358e53 100644 --- 
a/storage/maria/trnman.c +++ b/storage/maria/trnman.c @@ -877,7 +877,7 @@ TrID trnman_get_min_safe_trid() { TrID trid; mysql_mutex_lock(&LOCK_trn_list); - trid= min(active_list_min.next->min_read_from, + trid= MY_MIN(active_list_min.next->min_read_from, global_trid_generator); mysql_mutex_unlock(&LOCK_trn_list); return trid; diff --git a/storage/maria/unittest/ma_test_all-t b/storage/maria/unittest/ma_test_all-t index e66d269ab93..18b26a7bd45 100755 --- a/storage/maria/unittest/ma_test_all-t +++ b/storage/maria/unittest/ma_test_all-t @@ -650,6 +650,8 @@ sub ok { exit 1; } + # Unlink all files so that we can continue on error + unlink_all_possible_tmp_files(); return 0; } @@ -702,7 +704,7 @@ sub unlink_all_possible_tmp_files() unlink_log_files(); # Unlink tmp files that may have been created when testing the test programs - unlink <$full_tmpdir/*.TMD $full_tmpdir/aria_read_log_test1.txt $full_tmpdir/test1*.MA? $full_tmpdir/ma_test_recovery.output aria_log_control aria_log.00000001 aria_log.00000002 aria_logtest1.MA? test1.MA? test2.MA? test3.MA?>; + unlink <$full_tmpdir/*.TMD $full_tmpdir/aria_read_log_test1.txt $full_tmpdir/test1*.MA? $full_tmpdir/ma_test_recovery.output aria_log_control aria_log.00000001 aria_log.00000002 aria_logtest1.MA? test1.MA? test2.MA? test3.MA? 
*.TMD>; } #### diff --git a/storage/myisam/ha_myisam.cc b/storage/myisam/ha_myisam.cc index f649de8bd5c..3e73bb7c801 100644 --- a/storage/myisam/ha_myisam.cc +++ b/storage/myisam/ha_myisam.cc @@ -247,8 +247,8 @@ int table2myisam(TABLE *table_arg, MI_KEYDEF **keydef_out, pos->algorithm; keydef[i].block_length= pos->block_size; keydef[i].seg= keyseg; - keydef[i].keysegs= pos->key_parts; - for (j= 0; j < pos->key_parts; j++) + keydef[i].keysegs= pos->user_defined_key_parts; + for (j= 0; j < pos->user_defined_key_parts; j++) { Field *field= pos->key_part[j].field; type= field->key_type(); @@ -310,7 +310,7 @@ int table2myisam(TABLE *table_arg, MI_KEYDEF **keydef_out, (uchar*) table_arg->record[0]); } } - keyseg+= pos->key_parts; + keyseg+= pos->user_defined_key_parts; } if (table_arg->found_next_number_field) keydef[share->next_number_index].flag|= HA_AUTO_KEY; @@ -1136,8 +1136,8 @@ int ha_myisam::repair(THD *thd, HA_CHECK ¶m, bool do_optimize) } if (error && file->create_unique_index_by_sort && share->state.dupp_key != MAX_KEY) - print_keydup_error(share->state.dupp_key, - ER(ER_DUP_ENTRY_WITH_KEY_NAME), MYF(0)); + print_keydup_error(table, &table->key_info[share->state.dupp_key], + MYF(0)); } else { @@ -1527,8 +1527,8 @@ void ha_myisam::start_bulk_insert(ha_rows rows, uint flags) { DBUG_ENTER("ha_myisam::start_bulk_insert"); THD *thd= current_thd; - ulong size= min(thd->variables.read_buff_size, - (ulong) (table->s->avg_row_length*rows)); + ulong size= MY_MIN(thd->variables.read_buff_size, + (ulong) (table->s->avg_row_length*rows)); DBUG_PRINT("info",("start_bulk_insert: rows %lu size %lu", (ulong) rows, size)); @@ -1539,36 +1539,33 @@ void ha_myisam::start_bulk_insert(ha_rows rows, uint flags) can_enable_indexes= mi_is_all_keys_active(file->s->state.key_map, file->s->base.keys); - if (!(specialflag & SPECIAL_SAFE_MODE)) + /* + Only disable old index if the table was empty and we are inserting + a lot of rows. 
+ Note that in end_bulk_insert() we may truncate the table if + enable_indexes() failed, thus it's essential that indexes are + disabled ONLY for an empty table. + */ + if (file->state->records == 0 && can_enable_indexes && + (!rows || rows >= MI_MIN_ROWS_TO_DISABLE_INDEXES)) { - /* - Only disable old index if the table was empty and we are inserting - a lot of rows. - Note that in end_bulk_insert() we may truncate the table if - enable_indexes() failed, thus it's essential that indexes are - disabled ONLY for an empty table. - */ - if (file->state->records == 0 && can_enable_indexes && - (!rows || rows >= MI_MIN_ROWS_TO_DISABLE_INDEXES)) + if (file->open_flag & HA_OPEN_INTERNAL_TABLE) { - if (file->open_flag & HA_OPEN_INTERNAL_TABLE) - { - file->update|= HA_STATE_CHANGED; - mi_clear_all_keys_active(file->s->state.key_map); - } - else - { - my_bool all_keys= test(flags & HA_CREATE_UNIQUE_INDEX_BY_SORT); - mi_disable_indexes_for_rebuild(file, rows, all_keys); - } + file->update|= HA_STATE_CHANGED; + mi_clear_all_keys_active(file->s->state.key_map); } else + { + my_bool all_keys= test(flags & HA_CREATE_UNIQUE_INDEX_BY_SORT); + mi_disable_indexes_for_rebuild(file, rows, all_keys); + } + } + else if (!file->bulk_insert && (!rows || rows >= MI_MIN_ROWS_TO_USE_BULK_INSERT)) { mi_init_bulk_insert(file, thd->variables.bulk_insert_buff_size, rows); } - } DBUG_VOID_RETURN; } @@ -1846,7 +1843,7 @@ int ha_myisam::info(uint flag) number of records in the buffer results in a different number of buffer refills and in a different order of records in the result set. 
*/ - stats.mrr_length_per_rec= misam_info.reflength + 8; // 8=max(sizeof(void *)) + stats.mrr_length_per_rec= misam_info.reflength + 8; // 8=MY_MAX(sizeof(void *)) ref_length= misam_info.reflength; share->db_options_in_use= misam_info.options; @@ -1896,8 +1893,6 @@ int ha_myisam::info(uint flag) int ha_myisam::extra(enum ha_extra_function operation) { - if ((specialflag & SPECIAL_SAFE_MODE) && operation == HA_EXTRA_KEYREAD) - return 0; if (operation == HA_EXTRA_MMAP && !opt_myisam_use_mmap) return 0; return mi_extra(file, operation, 0); @@ -1915,8 +1910,6 @@ int ha_myisam::reset(void) int ha_myisam::extra_opt(enum ha_extra_function operation, ulong cache_size) { - if ((specialflag & SPECIAL_SAFE_MODE) && operation == HA_EXTRA_WRITE_CACHE) - return 0; return mi_extra(file, operation, (void*) &cache_size); } diff --git a/storage/myisam/mi_cache.c b/storage/myisam/mi_cache.c index 6e9feaefb2d..3477e67eae5 100644 --- a/storage/myisam/mi_cache.c +++ b/storage/myisam/mi_cache.c @@ -62,7 +62,7 @@ int _mi_read_cache(IO_CACHE *info, uchar *buff, my_off_t pos, uint length, (my_off_t) (info->read_end - info->request_pos)) { in_buff_pos=info->request_pos+(uint) offset; - in_buff_length= min(length, (size_t) (info->read_end-in_buff_pos)); + in_buff_length= MY_MIN(length, (size_t) (info->read_end-in_buff_pos)); memcpy(buff,info->request_pos+(uint) offset,(size_t) in_buff_length); if (!(length-=in_buff_length)) DBUG_RETURN(0); diff --git a/storage/myisam/mi_check.c b/storage/myisam/mi_check.c index 056aff5a72b..61dbbb7a18d 100644 --- a/storage/myisam/mi_check.c +++ b/storage/myisam/mi_check.c @@ -1946,7 +1946,13 @@ int mi_sort_index(HA_CHECK *param, register MI_INFO *info, char * name) key++,keyinfo++) { if (! mi_is_key_active(info->s->state.key_map, key)) + { + /* Since the key is not active, this should not be read, but we + initialize it anyway to silence a Valgrind warn when passing that + chunk of memory to pwrite(). 
*/ + index_pos[key]= HA_OFFSET_ERROR; continue; + } if (share->state.key_root[key] != HA_OFFSET_ERROR) { @@ -2145,7 +2151,7 @@ int filecopy(HA_CHECK *param, File to,File from,my_off_t start, ulong buff_length; DBUG_ENTER("filecopy"); - buff_length=(ulong) min(param->write_buffer_length,length); + buff_length=(ulong) MY_MIN(param->write_buffer_length,length); if (!(buff=my_malloc(buff_length,MYF(0)))) { buff=tmp_buff; buff_length=IO_SIZE; @@ -2303,7 +2309,7 @@ int mi_repair_by_sort(HA_CHECK *param, register MI_INFO *info, MYF(param->malloc_flags)); if (share->data_file_type == DYNAMIC_RECORD) - length=max(share->base.min_pack_length+1,share->base.min_block_length); + length=MY_MAX(share->base.min_pack_length+1,share->base.min_block_length); else if (share->data_file_type == COMPRESSED_RECORD) length=share->base.min_block_length; else @@ -2392,7 +2398,7 @@ int mi_repair_by_sort(HA_CHECK *param, register MI_INFO *info, (see _create_index_by_sort) */ sort_info.max_records= 10 * - max(param->sort_buffer_length, MIN_SORT_BUFFER) / + MY_MAX(param->sort_buffer_length, MIN_SORT_BUFFER) / sort_param.key_length; } @@ -2759,7 +2765,7 @@ int mi_repair_parallel(HA_CHECK *param, register MI_INFO *info, mysql_file_seek(param->read_cache.file, 0L, MY_SEEK_END, MYF(0)); if (share->data_file_type == DYNAMIC_RECORD) - rec_length=max(share->base.min_pack_length+1,share->base.min_block_length); + rec_length=MY_MAX(share->base.min_pack_length+1,share->base.min_block_length); else if (share->data_file_type == COMPRESSED_RECORD) rec_length=share->base.min_block_length; else @@ -3984,7 +3990,7 @@ word_init_ft_buf: ft_buf->buf=ft_buf->lastkey+a_len; /* 32 is just a safety margin here - (at least max(val_len, sizeof(nod_flag)) should be there). + (at least MY_MAX(val_len, sizeof(nod_flag)) should be there). May be better performance could be achieved if we'd put (sort_info->keyinfo->block_length-32)/XXX instead. 
diff --git a/storage/myisam/mi_close.c b/storage/myisam/mi_close.c index e58c2e0f189..f0a82bcef04 100644 --- a/storage/myisam/mi_close.c +++ b/storage/myisam/mi_close.c @@ -31,7 +31,8 @@ int mi_close(register MI_INFO *info) (long) info, (uint) share->reopen, (uint) share->tot_locks)); - mysql_mutex_lock(&THR_LOCK_myisam); + if (info->open_list.data) + mysql_mutex_lock(&THR_LOCK_myisam); if (info->lock_type == F_EXTRA_LCK) info->lock_type=F_UNLCK; /* HA_EXTRA_NO_USER_CHANGE */ @@ -54,7 +55,8 @@ int mi_close(register MI_INFO *info) info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); } flag= !--share->reopen; - myisam_open_list=list_delete(myisam_open_list,&info->open_list); + if (info->open_list.data) + myisam_open_list= list_delete(myisam_open_list, &info->open_list); mysql_mutex_unlock(&share->intern_lock); my_free(mi_get_rec_buff_ptr(info, info->rec_buff)); @@ -111,7 +113,8 @@ int mi_close(register MI_INFO *info) } my_free(info->s); } - mysql_mutex_unlock(&THR_LOCK_myisam); + if (info->open_list.data) + mysql_mutex_unlock(&THR_LOCK_myisam); if (info->ftparser_param) { my_free(info->ftparser_param); diff --git a/storage/myisam/mi_create.c b/storage/myisam/mi_create.c index ad97fba2cbb..cc0cfd0ae3a 100644 --- a/storage/myisam/mi_create.c +++ b/storage/myisam/mi_create.c @@ -43,6 +43,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, base_pos,long_varchar_count,varchar_length, max_key_block_length,unique_key_parts,fulltext_keys,offset; uint aligned_key_start, block_length, res; + uint internal_table= flags & HA_CREATE_INTERNAL_TABLE; ulong reclength, real_reclength,min_pack_length; char filename[FN_REFLEN],linkname[FN_REFLEN], *linkname_ptr; ulong pack_reclength; @@ -446,8 +447,8 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, block_length= (keydef->block_length ? 
my_round_up_to_next_power(keydef->block_length) : myisam_block_size); - block_length= max(block_length, MI_MIN_KEY_BLOCK_LENGTH); - block_length= min(block_length, MI_MAX_KEY_BLOCK_LENGTH); + block_length= MY_MAX(block_length, MI_MIN_KEY_BLOCK_LENGTH); + block_length= MY_MIN(block_length, MI_MAX_KEY_BLOCK_LENGTH); keydef->block_length= (uint16) MI_BLOCK_SIZE(length-real_length_diff, pointer,MI_MAX_KEYPTR_SIZE, @@ -536,7 +537,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, got from MYI file header (see also myisampack.c:save_state) */ share.base.key_reflength= - mi_get_pointer_length(max(ci->key_file_length,tmp),3); + mi_get_pointer_length(MY_MAX(ci->key_file_length,tmp),3); share.base.keys= share.state.header.keys= keys; share.state.header.uniques= uniques; share.state.header.fulltext_keys= fulltext_keys; @@ -569,12 +570,13 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, share.base.min_block_length= (share.base.pack_reclength+3 < MI_EXTEND_BLOCK_LENGTH && ! share.base.blobs) ? - max(share.base.pack_reclength,MI_MIN_BLOCK_LENGTH) : + MY_MAX(share.base.pack_reclength,MI_MIN_BLOCK_LENGTH) : MI_EXTEND_BLOCK_LENGTH; if (! (flags & HA_DONT_TOUCH_DATA)) share.state.create_time= time((time_t*) 0); - mysql_mutex_lock(&THR_LOCK_myisam); + if (!internal_table) + mysql_mutex_lock(&THR_LOCK_myisam); /* NOTE: For test_if_reopen() we need a real path name. Hence we need @@ -631,7 +633,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, NOTE: The filename is compared against unique_file_name of every open table. Hence we need a real path here. */ - if (test_if_reopen(filename)) + if (!internal_table && test_if_reopen(filename)) { my_printf_error(HA_ERR_TABLE_EXIST, "MyISAM table '%s' is in use " "(most likely by a MERGE table). 
Try FLUSH TABLES.", @@ -820,7 +822,8 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, goto err; } errpos=0; - mysql_mutex_unlock(&THR_LOCK_myisam); + if (!internal_table) + mysql_mutex_unlock(&THR_LOCK_myisam); res= 0; if (mysql_file_close(file, MYF(0))) res= my_errno; @@ -828,7 +831,8 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, DBUG_RETURN(res); err: - mysql_mutex_unlock(&THR_LOCK_myisam); + if (!internal_table) + mysql_mutex_unlock(&THR_LOCK_myisam); err_no_lock: save_errno=my_errno; diff --git a/storage/myisam/mi_dynrec.c b/storage/myisam/mi_dynrec.c index 009a2affe0c..021809ed892 100644 --- a/storage/myisam/mi_dynrec.c +++ b/storage/myisam/mi_dynrec.c @@ -118,7 +118,8 @@ int mi_munmap_file(MI_INFO *info) { int ret; DBUG_ENTER("mi_unmap_file"); - if ((ret= my_munmap(info->s->file_map, (size_t) info->s->mmaped_length))) + if ((ret= my_munmap((void*) info->s->file_map, + (size_t) info->s->mmaped_length))) DBUG_RETURN(ret); info->s->file_read= mi_nommap_pread; info->s->file_write= mi_nommap_pwrite; @@ -865,7 +866,7 @@ static int update_dynamic_record(MI_INFO *info, my_off_t filepos, uchar *record, uint tmp=MY_ALIGN(reclength - length + 3 + test(reclength >= 65520L),MI_DYN_ALIGN_SIZE); /* Don't create a block bigger than MI_MAX_BLOCK_LENGTH */ - tmp= min(length+tmp, MI_MAX_BLOCK_LENGTH)-length; + tmp= MY_MIN(length+tmp, MI_MAX_BLOCK_LENGTH)-length; /* Check if we can extend this block */ if (block_info.filepos + block_info.block_len == info->state->data_file_length && @@ -1780,15 +1781,21 @@ int _mi_read_rnd_dynamic_record(MI_INFO *info, uchar *buf, if (b_type & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR | BLOCK_FATAL_ERROR)) { - if ((b_type & (BLOCK_DELETED | BLOCK_SYNC_ERROR)) - && skip_deleted_blocks) - { - filepos=block_info.filepos+block_info.block_len; - block_info.second_read=0; - continue; /* Search after next_record */ - } - if (b_type & (BLOCK_DELETED | BLOCK_SYNC_ERROR)) + if ((b_type & (BLOCK_DELETED | 
BLOCK_SYNC_ERROR))) { + if (skip_deleted_blocks) + { + filepos=block_info.filepos+block_info.block_len; + block_info.second_read=0; + continue; /* Search after next_record */ + } + /* + If we're not on the first block of a record and + the block is marked as deleted or out of sync, + something's gone wrong: the record is damaged. + */ + if (block_of_record != 0) + goto panic; my_errno=HA_ERR_RECORD_DELETED; info->lastpos=block_info.filepos; info->nextpos=block_info.filepos+block_info.block_len; diff --git a/storage/myisam/mi_extra.c b/storage/myisam/mi_extra.c index dab1f66ed6d..f57fba5c2c5 100644 --- a/storage/myisam/mi_extra.c +++ b/storage/myisam/mi_extra.c @@ -100,7 +100,7 @@ int mi_extra(MI_INFO *info, enum ha_extra_function function, void *extra_arg) cache_size= (extra_arg ? *(ulong*) extra_arg : my_default_record_cache_size); if (!(init_io_cache(&info->rec_cache,info->dfile, - (uint) min(info->state->data_file_length+1, + (uint) MY_MIN(info->state->data_file_length+1, cache_size), READ_CACHE,0L,(pbool) (info->lock_type != F_UNLCK), MYF(share->write_flag & MY_WAIT_IF_FULL)))) diff --git a/storage/myisam/mi_open.c b/storage/myisam/mi_open.c index 438057e22df..f8213b1a3a5 100644 --- a/storage/myisam/mi_open.c +++ b/storage/myisam/mi_open.c @@ -14,7 +14,18 @@ along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ -/* open a isam-database */ +/* + open a isam-database + + Internal temporary tables + ------------------------- + Since only single instance of internal temporary table is required by + optimizer, such tables are not registered on myisam_open_list. In effect + it means (a) THR_LOCK_myisam is not held while such table is being created, + opened or closed; (b) no iteration through myisam_open_list while opening a + table. This optimization gives nice scalability benefit in concurrent + environment. MEMORY internal temporary tables are optimized similarly. 
+*/ #include "fulltext.h" #include "sp_defs.h" @@ -74,10 +85,11 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags) int lock_error,kfile,open_mode,save_errno,have_rtree=0, realpath_err; uint i,j,len,errpos,head_length,base_pos,offset,info_length,keys, key_parts,unique_key_parts,base_key_parts,fulltext_keys,uniques; + uint internal_table= open_flags & HA_OPEN_INTERNAL_TABLE; char name_buff[FN_REFLEN], org_name[FN_REFLEN], index_name[FN_REFLEN], data_name[FN_REFLEN]; uchar *UNINIT_VAR(disk_cache), *disk_pos, *end_pos; - MI_INFO info,*UNINIT_VAR(m_info),*old_info; + MI_INFO info,*UNINIT_VAR(m_info),*old_info= NULL; MYISAM_SHARE share_buff,*share; ulong *rec_per_key_part= 0; my_off_t *key_root, *key_del; @@ -99,8 +111,13 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags) DBUG_RETURN (NULL); } - mysql_mutex_lock(&THR_LOCK_myisam); - if (!(old_info=test_if_reopen(name_buff))) + if (!internal_table) + { + mysql_mutex_lock(&THR_LOCK_myisam); + old_info= test_if_reopen(name_buff); + } + + if (!old_info) { share= &share_buff; bzero((uchar*) &share_buff,sizeof(share_buff)); @@ -311,7 +328,7 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags) strmov(share->index_file_name, index_name); strmov(share->data_file_name, data_name); - share->blocksize=min(IO_SIZE,myisam_block_size); + share->blocksize=MY_MIN(IO_SIZE,myisam_block_size); { HA_KEYSEG *pos=share->keyparts; uint32 ftkey_nr= 1; @@ -349,6 +366,12 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags) } else if (pos->type == HA_KEYTYPE_BINARY) pos->charset= &my_charset_bin; + if (!(share->keyinfo[i].flag & HA_SPATIAL) && + pos->start > share->base.reclength) + { + my_errno= HA_ERR_CRASHED; + goto err; + } } if (share->keyinfo[i].flag & HA_SPATIAL) { @@ -491,7 +514,7 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags) share->base.margin_key_file_length=(share->base.max_key_file_length - (keys ? 
MI_INDEX_BLOCK_MARGIN * share->blocksize * keys : 0)); - share->blocksize=min(IO_SIZE,myisam_block_size); + share->blocksize=MY_MIN(IO_SIZE,myisam_block_size); share->data_file_type=STATIC_RECORD; if (share->options & HA_OPTION_COMPRESS_RECORD) { @@ -638,10 +661,13 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags) *m_info=info; thr_lock_data_init(&share->lock,&m_info->lock,(void*) m_info); - m_info->open_list.data=(void*) m_info; - myisam_open_list=list_add(myisam_open_list,&m_info->open_list); - mysql_mutex_unlock(&THR_LOCK_myisam); + if (!internal_table) + { + m_info->open_list.data= (void*) m_info; + myisam_open_list= list_add(myisam_open_list, &m_info->open_list); + mysql_mutex_unlock(&THR_LOCK_myisam); + } bzero(info.buff, share->base.max_key_block_length * 2); my_free(rec_per_key_part); @@ -686,7 +712,8 @@ err: default: break; } - mysql_mutex_unlock(&THR_LOCK_myisam); + if (!internal_table) + mysql_mutex_unlock(&THR_LOCK_myisam); my_errno=save_errno; DBUG_RETURN (NULL); } /* mi_open */ @@ -706,10 +733,10 @@ uchar *mi_alloc_rec_buff(MI_INFO *info, ulong length, uchar **buf) if (length == (ulong) -1) { if (info->s->options & HA_OPTION_COMPRESS_RECORD) - length= max(info->s->base.pack_reclength, info->s->max_pack_length); + length= MY_MAX(info->s->base.pack_reclength, info->s->max_pack_length); else length= info->s->base.pack_reclength; - length= max(length, info->s->base.max_key_length); + length= MY_MAX(length, info->s->base.max_key_length); /* Avoid unnecessary realloc */ if (newptr && length == old_length) return newptr; diff --git a/storage/myisam/mi_packrec.c b/storage/myisam/mi_packrec.c index 7e2403b64c9..c95afe57725 100644 --- a/storage/myisam/mi_packrec.c +++ b/storage/myisam/mi_packrec.c @@ -685,7 +685,7 @@ static uint find_longest_bitstream(uint16 *table, uint16 *end) return OFFSET_TABLE_SIZE; } length2= find_longest_bitstream(next, end) + 1; - length=max(length,length2); + length=MY_MAX(length,length2); } return length; } @@ -1399,7 
+1399,7 @@ uint _mi_pack_get_block_info(MI_INFO *myisam, MI_BIT_BUFF *bit_buff, info->filepos=filepos+head_length; if (file > 0) { - info->offset=min(info->rec_len, ref_length - head_length); + info->offset=MY_MIN(info->rec_len, ref_length - head_length); memcpy(*rec_buff_p, header + head_length, info->offset); } return 0; diff --git a/storage/myisam/mi_search.c b/storage/myisam/mi_search.c index 968cb9624a6..01fa10de7a3 100644 --- a/storage/myisam/mi_search.c +++ b/storage/myisam/mi_search.c @@ -949,9 +949,7 @@ uint _mi_get_binary_pack_key(register MI_KEYDEF *keyinfo, uint nod_flag, ("Found too long binary packed key: %u of %u at 0x%lx", length, keyinfo->maxlength, (long) *page_pos)); DBUG_DUMP("key", *page_pos, 16); - mi_print_error(keyinfo->share, HA_ERR_CRASHED); - my_errno=HA_ERR_CRASHED; - DBUG_RETURN(0); /* Wrong key */ + goto crashed; /* Wrong key */ } /* Key is packed against prev key, take prefix from prev key. */ from= key; @@ -994,6 +992,8 @@ uint _mi_get_binary_pack_key(register MI_KEYDEF *keyinfo, uint nod_flag, if (from == from_end) { from=page; from_end=page_end; } length+= (uint) ((*key++ = *from++)); } + if (length > keyseg->length) + goto crashed; } else length=keyseg->length; @@ -1033,15 +1033,18 @@ uint _mi_get_binary_pack_key(register MI_KEYDEF *keyinfo, uint nod_flag, if (from_end != page_end) { DBUG_PRINT("error",("Error when unpacking key")); - mi_print_error(keyinfo->share, HA_ERR_CRASHED); - my_errno=HA_ERR_CRASHED; - DBUG_RETURN(0); /* Error */ + goto crashed; /* Error */ } /* Copy data pointer and, if appropriate, key block pointer. 
*/ memcpy((uchar*) key,(uchar*) from,(size_t) length); *page_pos= from+length; } DBUG_RETURN((uint) (key-start_key)+keyseg->length); + + crashed: + mi_print_error(keyinfo->share, HA_ERR_CRASHED); + my_errno= HA_ERR_CRASHED; + DBUG_RETURN(0); } diff --git a/storage/myisam/mi_test1.c b/storage/myisam/mi_test1.c index 3b2597eb01e..9e4e1c46891 100644 --- a/storage/myisam/mi_test1.c +++ b/storage/myisam/mi_test1.c @@ -439,7 +439,7 @@ static void create_record(uchar *record,uint rownr) uint tmp; uchar *ptr;; sprintf((char*) blob_record,"... row: %d", rownr); - strappend((char*) blob_record,max(MAX_REC_LENGTH-rownr,10),' '); + strappend((char*) blob_record,MY_MAX(MAX_REC_LENGTH-rownr,10),' '); tmp=strlen((char*) blob_record); int4store(pos,tmp); ptr=blob_record; diff --git a/storage/myisam/mi_test2.c b/storage/myisam/mi_test2.c index 3ec12ef5cca..e53c68874b2 100644 --- a/storage/myisam/mi_test2.c +++ b/storage/myisam/mi_test2.c @@ -597,7 +597,7 @@ int main(int argc, char *argv[]) goto err; bmove(read_record2,read_record,reclength); - for (i=min(2,keys) ; i-- > 0 ;) + for (i=MY_MIN(2,keys) ; i-- > 0 ;) { if (mi_rsame(file,read_record2,(int) i)) goto err; if (memcmp(read_record,read_record2,reclength) != 0) diff --git a/storage/myisam/myisamchk.c b/storage/myisam/myisamchk.c index c8546ee56f5..64ffffc3a1e 100644 --- a/storage/myisam/myisamchk.c +++ b/storage/myisam/myisamchk.c @@ -16,6 +16,7 @@ /* Describe, check and repair of MyISAM tables */ #include "fulltext.h" +#include "my_default.h" #include <m_ctype.h> #include <stdarg.h> #include <my_getopt.h> diff --git a/storage/myisam/myisamlog.c b/storage/myisam/myisamlog.c index 1624213851b..86e1978edaa 100644 --- a/storage/myisam/myisamlog.c +++ b/storage/myisam/myisamlog.c @@ -91,7 +91,7 @@ int main(int argc, char **argv) log_filename=myisam_log_filename; get_options(&argc,&argv); /* Number of MyISAM files we can have open at one time */ - max_files= (my_set_max_open_files(min(max_files,8))-6)/2; + max_files= 
(my_set_max_open_files(MY_MIN(max_files,8))-6)/2; if (update) printf("Trying to %s MyISAM files according to log '%s'\n", (recover ? "recover" : "update"),log_filename); diff --git a/storage/myisam/myisampack.c b/storage/myisam/myisampack.c index 6ce88db87f5..c52bef1e40e 100644 --- a/storage/myisam/myisampack.c +++ b/storage/myisam/myisampack.c @@ -20,6 +20,7 @@ #endif #include "myisamdef.h" +#include "my_default.h" #include <queues.h> #include <my_tree.h> #include "mysys_err.h" @@ -783,7 +784,7 @@ static int create_dest_frm(char *source_table, char *dest_table) */ (void) my_copy(source_name, dest_name, MYF(MY_DONT_OVERWRITE_FILE)); - return 0; + DBUG_RETURN(0); } @@ -1269,7 +1270,7 @@ static void check_counts(HUFF_COUNTS *huff_counts, uint trees, { if (huff_counts->field_length > 2 && huff_counts->empty_fields + (records - huff_counts->empty_fields)* - (1+max_bit(max(huff_counts->max_pre_space, + (1+max_bit(MY_MAX(huff_counts->max_pre_space, huff_counts->max_end_space))) < records * max_bit(huff_counts->field_length)) { @@ -3022,7 +3023,7 @@ static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length, if (mrg->src_file_has_indexes_disabled) { isam_file->s->state.state.key_file_length= - max(isam_file->s->state.state.key_file_length, new_length); + MY_MAX(isam_file->s->state.state.key_file_length, new_length); } state.dellink= HA_OFFSET_ERROR; state.version=(ulong) time((time_t*) 0); diff --git a/storage/myisam/rt_mbr.c b/storage/myisam/rt_mbr.c index deca23bbec7..90569f4a5fc 100644 --- a/storage/myisam/rt_mbr.c +++ b/storage/myisam/rt_mbr.c @@ -325,8 +325,8 @@ int rtree_d_mbr(HA_KEYSEG *keyseg, uchar *a, uint key_length, double *res) bmin = korr_func(b); \ amax = korr_func(a+len); \ bmax = korr_func(b+len); \ - amin = min(amin, bmin); \ - amax = max(amax, bmax); \ + amin = MY_MIN(amin, bmin); \ + amax = MY_MAX(amax, bmax); \ store_func(c, amin); \ store_func(c+len, amax); \ } @@ -338,8 +338,8 @@ int rtree_d_mbr(HA_KEYSEG *keyseg, uchar *a, uint 
key_length, double *res) get_func(bmin, b); \ get_func(amax, a+len); \ get_func(bmax, b+len); \ - amin = min(amin, bmin); \ - amax = max(amax, bmax); \ + amin = MY_MIN(amin, bmin); \ + amax = MY_MAX(amax, bmax); \ store_func(c, amin); \ store_func(c+len, amax); \ } @@ -417,8 +417,8 @@ int rtree_combine_rect(HA_KEYSEG *keyseg, uchar* a, uchar* b, uchar* c, bmin = korr_func(b); \ amax = korr_func(a+len); \ bmax = korr_func(b+len); \ - amin = max(amin, bmin); \ - amax = min(amax, bmax); \ + amin = MY_MAX(amin, bmin); \ + amax = MY_MIN(amax, bmax); \ if (amin >= amax) \ return 0; \ res *= amax - amin; \ @@ -431,8 +431,8 @@ int rtree_combine_rect(HA_KEYSEG *keyseg, uchar* a, uchar* b, uchar* c, get_func(bmin, b); \ get_func(amax, a+len); \ get_func(bmax, b+len); \ - amin = max(amin, bmin); \ - amax = min(amax, bmax); \ + amin = MY_MAX(amin, bmin); \ + amax = MY_MIN(amax, bmax); \ if (amin >= amax) \ return 0; \ res *= amax - amin; \ @@ -508,7 +508,7 @@ double rtree_overlapping_area(HA_KEYSEG *keyseg, uchar* a, uchar* b, amax = korr_func(a+len); \ bmax = korr_func(b+len); \ a_area *= (((double)amax) - ((double)amin)); \ - loc_ab_area *= ((double)max(amax, bmax) - (double)min(amin, bmin)); \ + loc_ab_area *= ((double)MY_MAX(amax, bmax) - (double)MY_MIN(amin, bmin)); \ } #define RT_AREA_INC_GET(type, get_func, len)\ @@ -519,7 +519,7 @@ double rtree_overlapping_area(HA_KEYSEG *keyseg, uchar* a, uchar* b, get_func(amax, a+len); \ get_func(bmax, b+len); \ a_area *= (((double)amax) - ((double)amin)); \ - loc_ab_area *= ((double)max(amax, bmax) - (double)min(amin, bmin)); \ + loc_ab_area *= ((double)MY_MAX(amax, bmax) - (double)MY_MIN(amin, bmin)); \ } /* @@ -604,7 +604,7 @@ safe_end: amax = korr_func(a+len); \ bmax = korr_func(b+len); \ a_perim+= (((double)amax) - ((double)amin)); \ - *ab_perim+= ((double)max(amax, bmax) - (double)min(amin, bmin)); \ + *ab_perim+= ((double)MY_MAX(amax, bmax) - (double)MY_MIN(amin, bmin)); \ } #define RT_PERIM_INC_GET(type, get_func, len)\ @@ 
-615,7 +615,7 @@ safe_end: get_func(amax, a+len); \ get_func(bmax, b+len); \ a_perim+= (((double)amax) - ((double)amin)); \ - *ab_perim+= ((double)max(amax, bmax) - (double)min(amin, bmin)); \ + *ab_perim+= ((double)MY_MAX(amax, bmax) - (double)MY_MIN(amin, bmin)); \ } /* diff --git a/storage/myisam/sort.c b/storage/myisam/sort.c index 4af45ea02e9..6a328f9ef4e 100644 --- a/storage/myisam/sort.c +++ b/storage/myisam/sort.c @@ -130,7 +130,7 @@ int _create_index_by_sort(MI_SORT_PARAM *info,my_bool no_messages, sort_keys= (uchar **) NULL; error= 1; maxbuffer=1; - memavl= max(sortbuff_size, MIN_SORT_BUFFER); + memavl= MY_MAX(sortbuff_size, MIN_SORT_BUFFER); records= info->sort_info->max_records; sort_length= info->key_length; LINT_INIT(keys); @@ -351,7 +351,7 @@ pthread_handler_t thr_find_all_keys(void *arg) bzero((char*) &sort_param->unique, sizeof(sort_param->unique)); sort_keys= (uchar **) NULL; - memavl= max(sort_param->sortbuff_size, MIN_SORT_BUFFER); + memavl= MY_MAX(sort_param->sortbuff_size, MIN_SORT_BUFFER); idx= (uint)sort_param->sort_info->max_records; sort_length= sort_param->key_length; maxbuffer= 1; @@ -824,7 +824,7 @@ static uint read_to_buffer(IO_CACHE *fromfile, BUFFPEK *buffpek, register uint count; uint length; - if ((count=(uint) min((ha_rows) buffpek->max_keys,buffpek->count))) + if ((count=(uint) MY_MIN((ha_rows) buffpek->max_keys,buffpek->count))) { if (mysql_file_pread(fromfile->file, (uchar*) buffpek->base, (length= sort_length*count), @@ -846,7 +846,7 @@ static uint read_to_buffer_varlen(IO_CACHE *fromfile, BUFFPEK *buffpek, uint idx; uchar *buffp; - if ((count=(uint) min((ha_rows) buffpek->max_keys,buffpek->count))) + if ((count=(uint) MY_MIN((ha_rows) buffpek->max_keys,buffpek->count))) { buffp = buffpek->base; diff --git a/storage/myisammrg/ha_myisammrg.cc b/storage/myisammrg/ha_myisammrg.cc index 7bb7990d9a3..0971e9297d5 100644 --- a/storage/myisammrg/ha_myisammrg.cc +++ b/storage/myisammrg/ha_myisammrg.cc @@ -1304,7 +1304,7 @@ int 
ha_myisammrg::info(uint flag) memcpy((char*) table->key_info[0].rec_per_key, (char*) mrg_info.rec_per_key, sizeof(table->key_info[0].rec_per_key[0]) * - min(file->keys, table->s->key_parts)); + MY_MIN(file->keys, table->s->key_parts)); } } if (flag & HA_STATUS_ERRKEY) diff --git a/storage/perfschema/CMakeLists.txt b/storage/perfschema/CMakeLists.txt index 0c9713d45d4..ef644030317 100644 --- a/storage/perfschema/CMakeLists.txt +++ b/storage/perfschema/CMakeLists.txt @@ -118,6 +118,10 @@ table_tiws_by_index_usage.h table_tiws_by_table.h table_tlws_by_table.h table_users.h +cursor_by_thread_connect_attr.h +table_session_connect.h +table_session_connect_attrs.h +table_session_account_connect_attrs.h cursor_by_account.cc cursor_by_host.cc cursor_by_thread.cc @@ -126,6 +130,7 @@ ha_perfschema.cc pfs.cc pfs_account.cc pfs_atomic.cc +pfs_autosize.cc pfs_check.cc pfs_column_values.cc pfs_con_slice.cc @@ -189,6 +194,10 @@ table_tiws_by_index_usage.cc table_tiws_by_table.cc table_tlws_by_table.cc table_users.cc +cursor_by_thread_connect_attr.cc +table_session_connect.cc +table_session_connect_attrs.cc +table_session_account_connect_attrs.cc ) MYSQL_ADD_PLUGIN(perfschema ${PERFSCHEMA_SOURCES} STORAGE_ENGINE DEFAULT STATIC_ONLY) diff --git a/storage/perfschema/cursor_by_thread_connect_attr.cc b/storage/perfschema/cursor_by_thread_connect_attr.cc new file mode 100644 index 00000000000..7a0dd04119d --- /dev/null +++ b/storage/perfschema/cursor_by_thread_connect_attr.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */ + +#include "my_global.h" +#include "cursor_by_thread_connect_attr.h" + +cursor_by_thread_connect_attr::cursor_by_thread_connect_attr( + const PFS_engine_table_share *share) : + PFS_engine_table(share, &m_pos), m_row_exists(false) +{} + +int cursor_by_thread_connect_attr::rnd_next(void) +{ + PFS_thread *thread; + + for (m_pos.set_at(&m_next_pos); + m_pos.has_more_thread(); + m_pos.next_thread()) + { + thread= &thread_array[m_pos.m_index_1]; + + if (thread->m_lock.is_populated()) + { + make_row(thread, m_pos.m_index_2); + if (m_row_exists) + { + m_next_pos.set_after(&m_pos); + return 0; + } + } + } + return HA_ERR_END_OF_FILE; +} + + +int cursor_by_thread_connect_attr::rnd_pos(const void *pos) +{ + PFS_thread *thread; + + set_position(pos); + DBUG_ASSERT(m_pos.m_index_1 < thread_max); + + thread= &thread_array[m_pos.m_index_1]; + if (!thread->m_lock.is_populated()) + return HA_ERR_RECORD_DELETED; + + make_row(thread, m_pos.m_index_2); + if (m_row_exists) + return 0; + + return HA_ERR_RECORD_DELETED; +} + + +void cursor_by_thread_connect_attr::reset_position(void) +{ + m_pos.reset(); + m_next_pos.reset(); +} diff --git a/storage/perfschema/cursor_by_thread_connect_attr.h b/storage/perfschema/cursor_by_thread_connect_attr.h new file mode 100644 index 00000000000..fbce56f208d --- /dev/null +++ b/storage/perfschema/cursor_by_thread_connect_attr.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */ + +#ifndef CURSOR_BY_THREAD_CONNECT_ATTR_H +#define CURSOR_BY_THREAD_CONNECT_ATTR_H + +#include "pfs_column_types.h" +#include "pfs_engine_table.h" +#include "pfs_instr.h" + +/** + \addtogroup Performance_schema_tables + @{ +*/ + +struct pos_connect_attr_by_thread_by_attr +: public PFS_double_index +{ + pos_connect_attr_by_thread_by_attr() + : PFS_double_index(0, 0) + {} + + inline bool has_more_thread(void) + { + return (m_index_1 < thread_max); + } + + inline void next_thread(void) + { + m_index_1++; + m_index_2= 0; + } + + inline void reset(void) + { + m_index_1= 0; + m_index_2= 0; + } +}; + +/** Cursor CURSOR_BY_THREAD_CONNECT_ATTR. */ +class cursor_by_thread_connect_attr : public PFS_engine_table +{ +public: + virtual int rnd_next(); + virtual int rnd_pos(const void *pos); + virtual void reset_position(void); + +protected: + cursor_by_thread_connect_attr(const PFS_engine_table_share *share); + +public: + ~cursor_by_thread_connect_attr() + {} + +protected: + virtual void make_row(PFS_thread *thread, uint ordinal)= 0; + /** True if row exists */ + bool m_row_exists; + +private: + /** Current position. */ + pos_connect_attr_by_thread_by_attr m_pos; + /** Next position. 
*/ + pos_connect_attr_by_thread_by_attr m_next_pos; +}; + +/** @} */ +#endif diff --git a/storage/perfschema/gen_pfs_lex_token.cc b/storage/perfschema/gen_pfs_lex_token.cc index b7470061de1..7581255b284 100644 --- a/storage/perfschema/gen_pfs_lex_token.cc +++ b/storage/perfschema/gen_pfs_lex_token.cc @@ -1,5 +1,5 @@ /* - Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved. + Copyright (c) 2011, 2012, Oracle and/or its affiliates. All rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -243,7 +243,7 @@ void print_tokens() int main(int argc,char **argv) { puts("/*"); - puts(ORACLE_WELCOME_COPYRIGHT_NOTICE("2011, 2012")); + puts(ORACLE_WELCOME_COPYRIGHT_NOTICE("2011")); puts("*/"); printf("/*\n"); diff --git a/storage/perfschema/ha_perfschema.cc b/storage/perfschema/ha_perfschema.cc index 773d822af2b..50bdb043566 100644 --- a/storage/perfschema/ha_perfschema.cc +++ b/storage/perfschema/ha_perfschema.cc @@ -166,6 +166,8 @@ static struct st_mysql_show_var pfs_status_vars[]= (char*) &statement_class_lost, SHOW_LONG}, {"Performance_schema_digest_lost", (char*) &digest_lost, SHOW_LONG}, + {"Performance_schema_session_connect_attrs_lost", + (char*) &session_connect_attrs_lost, SHOW_LONG}, {NullS, NullS, SHOW_LONG} }; @@ -256,12 +258,12 @@ int ha_perfschema::write_row(uchar *buf) int result; DBUG_ENTER("ha_perfschema::write_row"); + if (!pfs_initialized) + DBUG_RETURN(HA_ERR_WRONG_COMMAND); - ha_statistic_increment(&SSV::ha_write_count); DBUG_ASSERT(m_table_share); - + ha_statistic_increment(&SSV::ha_write_count); result= m_table_share->write_row(table, buf, table->field); - DBUG_RETURN(result); } @@ -279,7 +281,9 @@ void ha_perfschema::use_hidden_primary_key(void) int ha_perfschema::update_row(const uchar *old_data, uchar *new_data) { DBUG_ENTER("ha_perfschema::update_row"); - + if (!pfs_initialized) + DBUG_RETURN(HA_ERR_WRONG_COMMAND); + 
DBUG_ASSERT(m_table); ha_statistic_increment(&SSV::ha_update_count); int result= m_table->update_row(table, old_data, new_data, table->field); @@ -289,6 +293,8 @@ int ha_perfschema::update_row(const uchar *old_data, uchar *new_data) int ha_perfschema::delete_row(const uchar *buf) { DBUG_ENTER("ha_perfschema::delete_row"); + if (!pfs_initialized) + DBUG_RETURN(HA_ERR_WRONG_COMMAND); DBUG_ASSERT(m_table); ha_statistic_increment(&SSV::ha_delete_count); @@ -329,6 +335,8 @@ int ha_perfschema::rnd_end(void) int ha_perfschema::rnd_next(uchar *buf) { DBUG_ENTER("ha_perfschema::rnd_next"); + if (!pfs_initialized) + DBUG_RETURN(HA_ERR_END_OF_FILE); DBUG_ASSERT(m_table); ha_statistic_increment(&SSV::ha_read_rnd_next_count); @@ -355,6 +363,8 @@ void ha_perfschema::position(const uchar *record) int ha_perfschema::rnd_pos(uchar *buf, uchar *pos) { DBUG_ENTER("ha_perfschema::rnd_pos"); + if (!pfs_initialized) + DBUG_RETURN(HA_ERR_END_OF_FILE); DBUG_ASSERT(m_table); ha_statistic_increment(&SSV::ha_read_rnd_count); @@ -380,6 +390,8 @@ int ha_perfschema::delete_all_rows(void) int result; DBUG_ENTER("ha_perfschema::delete_all_rows"); + if (!pfs_initialized) + DBUG_RETURN(0); DBUG_ASSERT(m_table_share); if (m_table_share->m_delete_all_rows) diff --git a/storage/perfschema/ha_perfschema.h b/storage/perfschema/ha_perfschema.h index dc465da3758..c2929046f3d 100644 --- a/storage/perfschema/ha_perfschema.h +++ b/storage/perfschema/ha_perfschema.h @@ -72,8 +72,7 @@ public: records. */ return (HA_NO_TRANSACTIONS | HA_REC_NOT_IN_SEQ | HA_NO_AUTO_INCREMENT | - HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE | - HA_PRIMARY_KEY_REQUIRED_FOR_DELETE | HA_HAS_OWN_BINLOGGING); + HA_PRIMARY_KEY_REQUIRED_FOR_DELETE); } /** diff --git a/storage/perfschema/pfs.cc b/storage/perfschema/pfs.cc index d3de38d025c..33b21ee2817 100644 --- a/storage/perfschema/pfs.cc +++ b/storage/perfschema/pfs.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved. 
+/* Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -688,6 +688,7 @@ static inline int mysql_mutex_lock(...) - socket io (MYSQL_SOCKET) - table io - table lock + - idle The flow of data between aggregates tables varies for each instrumentation. @@ -857,24 +858,35 @@ static inline int mysql_mutex_lock(...) @subsection IMPL_WAIT_SOCKET Socket waits @verbatim - socket_locker(T, F) + socket_locker(T, S) | | [1] | - |-> pfs_socket(F) =====>> [A], [B], [C], [D], [E] + |-> pfs_socket(S) =====>> [A], [B], [C], [D], [E] | | [2] | - |-> pfs_socket_class(F.class) =====>> [C], [D] + |-> pfs_socket_class(S.class) =====>> [C], [D] | - |-> pfs_thread(T).event_name(F) =====>> [A] + |-> pfs_thread(T).event_name(S) =====>> [A] | - ... + | [3] + | + 3a |-> pfs_account(U, H).event_name(S) =====>> [F], [G], [H] + . | + . | [4-RESET] + . | + 3b .....+-> pfs_user(U).event_name(S) =====>> [G] + . | + 3c .....+-> pfs_host(H).event_name(S) =====>> [H] @endverbatim Implemented as: - [1] @c start_socket_wait_v1(), @c end_socket_wait_v1(). - [2] @c close_socket_v1() + - [3] @c aggregate_thread_waits() + - [4] @c PFS_account::aggregate_waits() + - [5] @c PFS_host::aggregate_waits() - [A] EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME, @c table_ews_by_thread_by_event_name::make_row() - [B] EVENTS_WAITS_SUMMARY_BY_INSTANCE, @@ -885,37 +897,78 @@ static inline int mysql_mutex_lock(...) 
@c table_socket_summary_by_event_name::make_row() - [E] SOCKET_SUMMARY_BY_INSTANCE, @c table_socket_summary_by_instance::make_row() + - [F] EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME, + @c table_ews_by_account_by_event_name::make_row() + - [G] EVENTS_WAITS_SUMMARY_BY_USER_BY_EVENT_NAME, + @c table_ews_by_user_by_event_name::make_row() + - [H] EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME, + @c table_ews_by_host_by_event_name::make_row() @subsection IMPL_WAIT_TABLE Table waits @verbatim - table_locker(T, Tb) + table_locker(Thread Th, Table Tb, Event = io or lock) | | [1] | - |-> pfs_table(Tb) =====>> [B], [C], [D] - | - | [2] - | - |-> pfs_table_share(Tb.share) =====>> [C], [D] - | - |-> pfs_thread(T).event_name(Tb) =====>> [A] - | - ... +1a |-> pfs_table(Tb) =====>> [A], [B], [C] + | | + | | [2] + | | + | |-> pfs_table_share(Tb.share) =====>> [B], [C] + | | + | | [3] + | | + | |-> global_table_io_stat =====>> [C] + | | + | |-> global_table_lock_stat =====>> [C] + | +1b |-> pfs_thread(Th).event_name(E) =====>> [D], [E], [F], [G] + | | + | | [ 4-RESET] + | | + | |-> pfs_account(U, H).event_name(E) =====>> [E], [F], [G] + | . | + | . | [5-RESET] + | . | + | .....+-> pfs_user(U).event_name(E) =====>> [F] + | . 
| + | .....+-> pfs_host(H).event_name(E) =====>> [G] + | +1c |-> pfs_thread(Th).waits_current(W) =====>> [H] + | +1d |-> pfs_thread(Th).waits_history(W) =====>> [I] + | +1e |-> waits_history_long(W) =====>> [J] @endverbatim Implemented as: - [1] @c start_table_io_wait_v1(), @c end_table_io_wait_v1() - [2] @c close_table_v1() - - [A] EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME, - @c table_ews_by_thread_by_event_name::make_row() - - [B] EVENTS_WAITS_SUMMARY_BY_INSTANCE, + - [3] @c drop_table_share_v1() + - [4] @c TRUNCATE TABLE EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME + - [5] @c TRUNCATE TABLE EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME + - [A] EVENTS_WAITS_SUMMARY_BY_INSTANCE, @c table_events_waits_summary_by_instance::make_table_row() + - [B] OBJECTS_SUMMARY_GLOBAL_BY_TYPE, + @c table_os_global_by_type::make_row() - [C] EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME, @c table_ews_global_by_event_name::make_table_io_row(), @c table_ews_global_by_event_name::make_table_lock_row() - - [D] OBJECTS_SUMMARY_GLOBAL_BY_TYPE, - @c table_os_global_by_type::make_row() + - [D] EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME, + @c table_ews_by_thread_by_event_name::make_row() + - [E] EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME, + @c table_ews_by_user_by_account_name::make_row() + - [F] EVENTS_WAITS_SUMMARY_BY_USER_BY_EVENT_NAME, + @c table_ews_by_user_by_event_name::make_row() + - [G] EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME, + @c table_ews_by_host_by_event_name::make_row() + - [H] EVENTS_WAITS_CURRENT, + @c table_events_waits_common::make_row() + - [I] EVENTS_WAITS_HISTORY, + @c table_events_waits_common::make_row() + - [J] EVENTS_WAITS_HISTORY_LONG, + @c table_events_waits_common::make_row() @section IMPL_STAGE Implementation for stages aggregates @@ -1594,7 +1647,6 @@ static void unbind_table_v1(PSI_table *table) PFS_table *pfs= reinterpret_cast<PFS_table*> (table); if (likely(pfs != NULL)) { - pfs->aggregate(); pfs->m_thread_owner= NULL; } } @@ -1615,12 +1667,6 @@ 
rebind_table_v1(PSI_table_share *share, const void *identity, PSI_table *table) /* The table handle was already instrumented, reuse it for this thread. */ thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS); - if (unlikely(thread == NULL)) - { - destroy_table(pfs); - return NULL; - } - if (unlikely(! pfs->m_share->m_enabled)) { destroy_table(pfs); @@ -1660,8 +1706,6 @@ rebind_table_v1(PSI_table_share *share, const void *identity, PSI_table *table) return NULL; PFS_thread *thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS); - if (unlikely(thread == NULL)) - return NULL; PFS_table *pfs_table= create_table(pfs_table_share, thread, identity); return reinterpret_cast<PSI_table *> (pfs_table); @@ -1681,9 +1725,18 @@ static void close_table_v1(PSI_table *table) } static PSI_socket* -init_socket_v1(PSI_socket_key key, const my_socket *fd) +init_socket_v1(PSI_socket_key key, const my_socket *fd, + const struct sockaddr *addr, socklen_t addr_len) { - INIT_BODY_V1(socket, key, fd); + PFS_socket_class *klass; + PFS_socket *pfs; + klass= find_socket_class(key); + if (unlikely(klass == NULL)) + return NULL; + if (! klass->m_enabled) + return NULL; + pfs= create_socket(klass, fd, addr, addr_len); + return reinterpret_cast<PSI_socket *> (pfs); } static void destroy_socket_v1(PSI_socket *socket) @@ -1731,7 +1784,7 @@ static void create_file_v1(PSI_file_key key, const char *name, File file) } uint len= strlen(name); - PFS_file *pfs_file= find_or_create_file(pfs_thread, klass, name, len); + PFS_file *pfs_file= find_or_create_file(pfs_thread, klass, name, len, true); file_handle_array[index]= pfs_file; } @@ -1835,13 +1888,13 @@ static int spawn_thread_v1(PSI_thread_key key, @sa PSI_v1::new_thread. 
*/ static PSI_thread* -new_thread_v1(PSI_thread_key key, const void *identity, ulong thread_id) +new_thread_v1(PSI_thread_key key, const void *identity, ulonglong processlist_id) { PFS_thread *pfs; PFS_thread_class *klass= find_thread_class(key); if (likely(klass != NULL)) - pfs= create_thread(klass, identity, thread_id); + pfs= create_thread(klass, identity, processlist_id); else pfs= NULL; @@ -1852,12 +1905,12 @@ new_thread_v1(PSI_thread_key key, const void *identity, ulong thread_id) Implementation of the thread instrumentation interface. @sa PSI_v1::set_thread_id. */ -static void set_thread_id_v1(PSI_thread *thread, unsigned long id) +static void set_thread_id_v1(PSI_thread *thread, ulonglong processlist_id) { PFS_thread *pfs= reinterpret_cast<PFS_thread*> (thread); if (unlikely(pfs == NULL)) return; - pfs->m_thread_id= id; + pfs->m_processlist_id= processlist_id; } /** @@ -2045,10 +2098,10 @@ static void set_thread_state_v1(const char* state) { int state_len= state ? strlen(state) : 0; - pfs->m_lock.allocated_to_dirty(); + pfs->m_processlist_lock.allocated_to_dirty(); pfs->m_processlist_state_ptr= state; pfs->m_processlist_state_length= state_len; - pfs->m_lock.dirty_to_allocated(); + pfs->m_processlist_lock.dirty_to_allocated(); } } @@ -2060,12 +2113,14 @@ static void set_thread_info_v1(const char* info, int info_len) { PFS_thread *pfs= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS); + DBUG_ASSERT((info != NULL) || (info_len == 0)); + if (likely(pfs != NULL)) { - pfs->m_lock.allocated_to_dirty(); + pfs->m_processlist_lock.allocated_to_dirty(); pfs->m_processlist_info_ptr= info; pfs->m_processlist_info_length= info_len; - pfs->m_lock.dirty_to_allocated(); + pfs->m_processlist_lock.dirty_to_allocated(); } } @@ -2196,7 +2251,7 @@ start_mutex_wait_v1(PSI_mutex_locker_state *state, Complete shortcut. 
*/ /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (counted) */ - pfs_mutex->m_wait_stat.aggregate_counted(); + pfs_mutex->m_mutex_stat.m_wait_stat.aggregate_counted(); return NULL; } } @@ -2294,7 +2349,7 @@ start_rwlock_wait_v1(PSI_rwlock_locker_state *state, Complete shortcut. */ /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (counted) */ - pfs_rwlock->m_wait_stat.aggregate_counted(); + pfs_rwlock->m_rwlock_stat.m_wait_stat.aggregate_counted(); return NULL; } } @@ -2401,7 +2456,7 @@ start_cond_wait_v1(PSI_cond_locker_state *state, Complete shortcut. */ /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (counted) */ - pfs_cond->m_wait_stat.aggregate_counted(); + pfs_cond->m_cond_stat.m_wait_stat.aggregate_counted(); return NULL; } } @@ -2478,8 +2533,6 @@ start_table_io_wait_v1(PSI_table_locker_state *state, return NULL; PFS_thread *pfs_thread= pfs_table->m_thread_owner; - if (unlikely(pfs_thread == NULL)) - return NULL; DBUG_ASSERT(pfs_thread == my_pthread_getspecific_ptr(PFS_thread*, THR_PFS)); @@ -2489,6 +2542,8 @@ start_table_io_wait_v1(PSI_table_locker_state *state, if (flag_thread_instrumentation) { + if (pfs_thread == NULL) + return NULL; if (! pfs_thread->m_enabled) return NULL; state->m_thread= reinterpret_cast<PSI_thread *> (pfs_thread); @@ -2538,7 +2593,6 @@ start_table_io_wait_v1(PSI_table_locker_state *state, pfs_thread->m_events_waits_current++; } - /* TODO: consider a shortcut here */ } else { @@ -2585,11 +2639,6 @@ start_table_lock_wait_v1(PSI_table_locker_state *state, return NULL; PFS_thread *pfs_thread= pfs_table->m_thread_owner; - if (unlikely(pfs_thread == NULL)) - return NULL; - - DBUG_ASSERT(pfs_thread == - my_pthread_getspecific_ptr(PFS_thread*, THR_PFS)); PFS_TL_LOCK_TYPE lock_type; @@ -2619,6 +2668,8 @@ start_table_lock_wait_v1(PSI_table_locker_state *state, if (flag_thread_instrumentation) { + if (pfs_thread == NULL) + return NULL; if (! 
pfs_thread->m_enabled) return NULL; state->m_thread= reinterpret_cast<PSI_thread *> (pfs_thread); @@ -2668,7 +2719,6 @@ start_table_lock_wait_v1(PSI_table_locker_state *state, pfs_thread->m_events_waits_current++; } - /* TODO: consider a shortcut here */ } else { @@ -2729,11 +2779,6 @@ get_thread_file_name_locker_v1(PSI_file_locker_state *state, if (klass->m_timed) flags|= STATE_FLAG_TIMED; - uint len= strlen(name); - PFS_file *pfs_file= find_or_create_file(pfs_thread, klass, name, len); - if (unlikely(pfs_file == NULL)) - return NULL; - if (flag_events_waits_current) { if (unlikely(pfs_thread->m_events_waits_current >= @@ -2755,9 +2800,9 @@ get_thread_file_name_locker_v1(PSI_file_locker_state *state, wait->m_class= klass; wait->m_timer_start= 0; wait->m_timer_end= 0; - wait->m_object_instance_addr= pfs_file; - wait->m_weak_file= pfs_file; - wait->m_weak_version= pfs_file->get_version(); + wait->m_object_instance_addr= NULL; + wait->m_weak_file= NULL; + wait->m_weak_version= 0; wait->m_event_id= pfs_thread->m_event_id++; wait->m_end_event_id= 0; wait->m_operation= file_operation_map[static_cast<int> (op)]; @@ -2767,7 +2812,9 @@ get_thread_file_name_locker_v1(PSI_file_locker_state *state, } state->m_flags= flags; - state->m_file= reinterpret_cast<PSI_file*> (pfs_file); + state->m_file= NULL; + state->m_name= name; + state->m_class= klass; state->m_operation= op; return reinterpret_cast<PSI_file_locker*> (state); } @@ -2788,6 +2835,7 @@ get_thread_file_stream_locker_v1(PSI_file_locker_state *state, if (unlikely(pfs_file == NULL)) return NULL; DBUG_ASSERT(pfs_file->m_class != NULL); + PFS_file_class *klass= pfs_file->m_class; if (! 
pfs_file->m_enabled) return NULL; @@ -2825,7 +2873,7 @@ get_thread_file_stream_locker_v1(PSI_file_locker_state *state, wait->m_nesting_event_type= parent_event->m_event_type; wait->m_thread= pfs_thread; - wait->m_class= pfs_file->m_class; + wait->m_class= klass; wait->m_timer_start= 0; wait->m_timer_end= 0; wait->m_object_instance_addr= pfs_file; @@ -2856,6 +2904,8 @@ get_thread_file_stream_locker_v1(PSI_file_locker_state *state, state->m_flags= flags; state->m_file= reinterpret_cast<PSI_file*> (pfs_file); state->m_operation= op; + state->m_name= NULL; + state->m_class= klass; return reinterpret_cast<PSI_file_locker*> (state); } @@ -2890,10 +2940,12 @@ get_thread_file_descriptor_locker_v1(PSI_file_locker_state *state, if (op == PSI_FILE_CLOSE) file_handle_array[index]= NULL; - DBUG_ASSERT(pfs_file->m_class != NULL); if (! pfs_file->m_enabled) return NULL; + DBUG_ASSERT(pfs_file->m_class != NULL); + PFS_file_class *klass= pfs_file->m_class; + register uint flags; if (flag_thread_instrumentation) @@ -2927,7 +2979,7 @@ get_thread_file_descriptor_locker_v1(PSI_file_locker_state *state, wait->m_nesting_event_type= parent_event->m_event_type; wait->m_thread= pfs_thread; - wait->m_class= pfs_file->m_class; + wait->m_class= klass; wait->m_timer_start= 0; wait->m_timer_end= 0; wait->m_object_instance_addr= pfs_file; @@ -2958,6 +3010,8 @@ get_thread_file_descriptor_locker_v1(PSI_file_locker_state *state, state->m_flags= flags; state->m_file= reinterpret_cast<PSI_file*> (pfs_file); state->m_operation= op; + state->m_name= NULL; + state->m_class= klass; return reinterpret_cast<PSI_file_locker*> (state); } @@ -2991,14 +3045,6 @@ start_socket_wait_v1(PSI_socket_locker_state *state, if (unlikely(pfs_thread == NULL)) return NULL; -#ifdef LATER - /* - Needs refinement, because of KILL. 
- */ - DBUG_ASSERT(pfs_thread == - my_pthread_getspecific_ptr(PFS_thread*, THR_PFS)); -#endif - if (!pfs_thread->m_enabled) return NULL; @@ -3112,22 +3158,15 @@ static void unlock_mutex_v1(PSI_mutex *mutex) PFS_mutex::m_lock_stat is not exposed in user visible tables currently, so there is no point spending time computing it. */ - PFS_thread *pfs_thread= reinterpret_cast<PFS_thread*> (thread); - DBUG_ASSERT(pfs_thread != NULL); - - if (unlikely(! flag_events_waits_current)) - return; - if (! pfs_mutex->m_class->m_enabled) + if (! pfs_mutex->m_enabled) return; - if (! pfs_thread->m_enabled) + + if (! pfs_mutex->m_timed) return; - if (pfs_mutex->m_class->m_timed) - { - ulonglong locked_time; - locked_time= get_timer_pico_value(wait_timer) - pfs_mutex->m_last_locked; - aggregate_single_stat_chain(&pfs_mutex->m_lock_stat, locked_time); - } + ulonglong locked_time; + locked_time= get_timer_pico_value(wait_timer) - pfs_mutex->m_last_locked; + pfs_mutex->m_mutex_stat.m_lock_stat.aggregate_value(locked_time); #endif } @@ -3185,32 +3224,23 @@ static void unlock_rwlock_v1(PSI_rwlock *rwlock) #ifdef LATER_WL2333 /* See WL#2333: SHOW ENGINE ... LOCK STATUS. */ - PFS_thread *pfs_thread= reinterpret_cast<PFS_thread*> (thread); - DBUG_ASSERT(pfs_thread != NULL); - if (unlikely(! flag_events_waits_current)) - return; - if (! pfs_rwlock->m_class->m_enabled) + if (! pfs_rwlock->m_enabled) return; - if (! pfs_thread->m_enabled) + + if (! 
pfs_rwlock->m_timed) return; ulonglong locked_time; if (last_writer) { - if (pfs_rwlock->m_class->m_timed) - { - locked_time= get_timer_pico_value(wait_timer) - pfs_rwlock->m_last_written; - aggregate_single_stat_chain(&pfs_rwlock->m_write_lock_stat, locked_time); - } + locked_time= get_timer_pico_value(wait_timer) - pfs_rwlock->m_last_written; + pfs_rwlock->m_rwlock_stat.m_write_lock_stat.aggregate_value(locked_time); } else if (last_reader) { - if (pfs_rwlock->m_class->m_timed) - { - locked_time= get_timer_pico_value(wait_timer) - pfs_rwlock->m_last_read; - aggregate_single_stat_chain(&pfs_rwlock->m_read_lock_stat, locked_time); - } + locked_time= get_timer_pico_value(wait_timer) - pfs_rwlock->m_last_read; + pfs_rwlock->m_rwlock_stat.m_read_lock_stat.aggregate_value(locked_time); } #else (void) last_reader; @@ -3352,17 +3382,16 @@ static void end_idle_wait_v1(PSI_idle_locker* locker) PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread); PFS_single_stat *event_name_array; event_name_array= thread->m_instr_class_waits_stats; - uint index= global_idle_class.m_event_name_index; if (flags & STATE_FLAG_TIMED) { /* Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME (timed) */ - event_name_array[index].aggregate_value(wait_time); + event_name_array[GLOBAL_IDLE_EVENT_INDEX].aggregate_value(wait_time); } else { /* Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME (counted) */ - event_name_array[index].aggregate_counted(); + event_name_array[GLOBAL_IDLE_EVENT_INDEX].aggregate_counted(); } if (flags & STATE_FLAG_EVENT) @@ -3379,6 +3408,17 @@ static void end_idle_wait_v1(PSI_idle_locker* locker) thread->m_events_waits_current--; } } + + if (flags & STATE_FLAG_TIMED) + { + /* Aggregate to EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME (timed) */ + global_idle_stat.aggregate_value(wait_time); + } + else + { + /* Aggregate to EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME (counted) */ + global_idle_stat.aggregate_counted(); + } } /** @@ -3404,12 +3444,12 @@ 
static void end_mutex_wait_v1(PSI_mutex_locker* locker, int rc) timer_end= state->m_timer(); wait_time= timer_end - state->m_timer_start; /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (timed) */ - mutex->m_wait_stat.aggregate_value(wait_time); + mutex->m_mutex_stat.m_wait_stat.aggregate_value(wait_time); } else { /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (counted) */ - mutex->m_wait_stat.aggregate_counted(); + mutex->m_mutex_stat.m_wait_stat.aggregate_counted(); } if (likely(rc == 0)) @@ -3471,12 +3511,12 @@ static void end_rwlock_rdwait_v1(PSI_rwlock_locker* locker, int rc) timer_end= state->m_timer(); wait_time= timer_end - state->m_timer_start; /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (timed) */ - rwlock->m_wait_stat.aggregate_value(wait_time); + rwlock->m_rwlock_stat.m_wait_stat.aggregate_value(wait_time); } else { /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (counted) */ - rwlock->m_wait_stat.aggregate_counted(); + rwlock->m_rwlock_stat.m_wait_stat.aggregate_counted(); } if (rc == 0) @@ -3551,12 +3591,12 @@ static void end_rwlock_wrwait_v1(PSI_rwlock_locker* locker, int rc) timer_end= state->m_timer(); wait_time= timer_end - state->m_timer_start; /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (timed) */ - rwlock->m_wait_stat.aggregate_value(wait_time); + rwlock->m_rwlock_stat.m_wait_stat.aggregate_value(wait_time); } else { /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (counted) */ - rwlock->m_wait_stat.aggregate_counted(); + rwlock->m_rwlock_stat.m_wait_stat.aggregate_counted(); } if (likely(rc == 0)) @@ -3622,12 +3662,12 @@ static void end_cond_wait_v1(PSI_cond_locker* locker, int rc) timer_end= state->m_timer(); wait_time= timer_end - state->m_timer_start; /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (timed) */ - cond->m_wait_stat.aggregate_value(wait_time); + cond->m_cond_stat.m_wait_stat.aggregate_value(wait_time); } else { /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (counted) */ - cond->m_wait_stat.aggregate_counted(); 
+ cond->m_cond_stat.m_wait_stat.aggregate_counted(); } if (state->m_flags & STATE_FLAG_THREAD) @@ -3682,23 +3722,27 @@ static void end_table_io_wait_v1(PSI_table_locker* locker) DBUG_ASSERT(table != NULL); PFS_single_stat *stat; + PFS_table_io_stat *table_io_stat; DBUG_ASSERT((state->m_index < table->m_share->m_key_count) || - (state->m_index == MAX_KEY)); + (state->m_index == MAX_INDEXES)); + + table_io_stat= & table->m_table_stat.m_index_stat[state->m_index]; + table_io_stat->m_has_data= true; switch (state->m_io_operation) { case PSI_TABLE_FETCH_ROW: - stat= & table->m_table_stat.m_index_stat[state->m_index].m_fetch; + stat= & table_io_stat->m_fetch; break; case PSI_TABLE_WRITE_ROW: - stat= & table->m_table_stat.m_index_stat[state->m_index].m_insert; + stat= & table_io_stat->m_insert; break; case PSI_TABLE_UPDATE_ROW: - stat= & table->m_table_stat.m_index_stat[state->m_index].m_update; + stat= & table_io_stat->m_update; break; case PSI_TABLE_DELETE_ROW: - stat= & table->m_table_stat.m_index_stat[state->m_index].m_delete; + stat= & table_io_stat->m_delete; break; default: DBUG_ASSERT(false); @@ -3719,22 +3763,40 @@ static void end_table_io_wait_v1(PSI_table_locker* locker) stat->aggregate_counted(); } - if (flags & STATE_FLAG_EVENT) + if (flags & STATE_FLAG_THREAD) { - DBUG_ASSERT(flags & STATE_FLAG_THREAD); PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread); DBUG_ASSERT(thread != NULL); - PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait); - DBUG_ASSERT(wait != NULL); + PFS_single_stat *event_name_array; + event_name_array= thread->m_instr_class_waits_stats; - wait->m_timer_end= timer_end; - wait->m_end_event_id= thread->m_event_id; - if (flag_events_waits_history) - insert_events_waits_history(thread, wait); - if (flag_events_waits_history_long) - insert_events_waits_history_long(wait); - thread->m_events_waits_current--; + /* + Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME + (for wait/io/table/sql/handler) 
+ */ + if (flags & STATE_FLAG_TIMED) + { + event_name_array[GLOBAL_TABLE_IO_EVENT_INDEX].aggregate_value(wait_time); + } + else + { + event_name_array[GLOBAL_TABLE_IO_EVENT_INDEX].aggregate_counted(); + } + + if (flags & STATE_FLAG_EVENT) + { + PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait); + DBUG_ASSERT(wait != NULL); + + wait->m_timer_end= timer_end; + wait->m_end_event_id= thread->m_event_id; + if (flag_events_waits_history) + insert_events_waits_history(thread, wait); + if (flag_events_waits_history_long) + insert_events_waits_history_long(wait); + thread->m_events_waits_current--; + } } table->m_has_io_stats= true; @@ -3770,22 +3832,40 @@ static void end_table_lock_wait_v1(PSI_table_locker* locker) stat->aggregate_counted(); } - if (flags & STATE_FLAG_EVENT) + if (flags & STATE_FLAG_THREAD) { - DBUG_ASSERT(flags & STATE_FLAG_THREAD); PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread); DBUG_ASSERT(thread != NULL); - PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait); - DBUG_ASSERT(wait != NULL); + PFS_single_stat *event_name_array; + event_name_array= thread->m_instr_class_waits_stats; - wait->m_timer_end= timer_end; - wait->m_end_event_id= thread->m_event_id; - if (flag_events_waits_history) - insert_events_waits_history(thread, wait); - if (flag_events_waits_history_long) - insert_events_waits_history_long(wait); - thread->m_events_waits_current--; + /* + Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME + (for wait/lock/table/sql/handler) + */ + if (flags & STATE_FLAG_TIMED) + { + event_name_array[GLOBAL_TABLE_LOCK_EVENT_INDEX].aggregate_value(wait_time); + } + else + { + event_name_array[GLOBAL_TABLE_LOCK_EVENT_INDEX].aggregate_counted(); + } + + if (flags & STATE_FLAG_EVENT) + { + PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait); + DBUG_ASSERT(wait != NULL); + + wait->m_timer_end= timer_end; + wait->m_end_event_id= thread->m_event_id; + if 
(flag_events_waits_history) + insert_events_waits_history(thread, wait); + if (flag_events_waits_history_long) + insert_events_waits_history_long(wait); + thread->m_events_waits_current--; + } } table->m_has_lock_stats= true; @@ -3803,25 +3883,50 @@ static void end_file_wait_v1(PSI_file_locker *locker, Implementation of the file instrumentation interface. @sa PSI_v1::start_file_open_wait. */ -static PSI_file* start_file_open_wait_v1(PSI_file_locker *locker, - const char *src_file, - uint src_line) +static void start_file_open_wait_v1(PSI_file_locker *locker, + const char *src_file, + uint src_line) { - PSI_file_locker_state *state= reinterpret_cast<PSI_file_locker_state*> (locker); - DBUG_ASSERT(state != NULL); - start_file_wait_v1(locker, 0, src_file, src_line); - return state->m_file; + return; } /** Implementation of the file instrumentation interface. @sa PSI_v1::end_file_open_wait. */ -static void end_file_open_wait_v1(PSI_file_locker *locker) +static PSI_file* end_file_open_wait_v1(PSI_file_locker *locker, + void *result) { + PSI_file_locker_state *state= reinterpret_cast<PSI_file_locker_state*> (locker); + DBUG_ASSERT(state != NULL); + + switch (state->m_operation) + { + case PSI_FILE_STAT: + break; + case PSI_FILE_STREAM_OPEN: + case PSI_FILE_CREATE: + if (result != NULL) + { + PFS_file_class *klass= reinterpret_cast<PFS_file_class*> (state->m_class); + PFS_thread *thread= reinterpret_cast<PFS_thread*> (state->m_thread); + const char *name= state->m_name; + uint len= strlen(name); + PFS_file *pfs_file= find_or_create_file(thread, klass, name, len, true); + state->m_file= reinterpret_cast<PSI_file*> (pfs_file); + } + break; + case PSI_FILE_OPEN: + default: + DBUG_ASSERT(false); + break; + } + end_file_wait_v1(locker, 0); + + return state->m_file; } /** @@ -3831,25 +3936,33 @@ static void end_file_open_wait_v1(PSI_file_locker *locker) static void end_file_open_wait_and_bind_to_descriptor_v1 (PSI_file_locker *locker, File file) { + PFS_file *pfs_file= NULL; 
int index= (int) file; PSI_file_locker_state *state= reinterpret_cast<PSI_file_locker_state*> (locker); DBUG_ASSERT(state != NULL); - end_file_wait_v1(locker, 0); + if (index >= 0) + { + PFS_file_class *klass= reinterpret_cast<PFS_file_class*> (state->m_class); + PFS_thread *thread= reinterpret_cast<PFS_thread*> (state->m_thread); + const char *name= state->m_name; + uint len= strlen(name); + pfs_file= find_or_create_file(thread, klass, name, len, true); + state->m_file= reinterpret_cast<PSI_file*> (pfs_file); + } - PFS_file *pfs_file= reinterpret_cast<PFS_file*> (state->m_file); - DBUG_ASSERT(pfs_file != NULL); + end_file_wait_v1(locker, 0); if (likely(index >= 0)) { if (likely(index < file_handle_max)) file_handle_array[index]= pfs_file; else + { + if (pfs_file != NULL) + release_file(pfs_file); file_handle_lost++; - } - else - { - release_file(pfs_file); + } } } @@ -3896,7 +4009,7 @@ static void end_file_wait_v1(PSI_file_locker *locker, PSI_file_locker_state *state= reinterpret_cast<PSI_file_locker_state*> (locker); DBUG_ASSERT(state != NULL); PFS_file *file= reinterpret_cast<PFS_file *> (state->m_file); - DBUG_ASSERT(file != NULL); + PFS_file_class *klass= reinterpret_cast<PFS_file_class *> (state->m_class); PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread); ulonglong timer_end= 0; @@ -3905,15 +4018,26 @@ static void end_file_wait_v1(PSI_file_locker *locker, register uint flags= state->m_flags; size_t bytes= ((int)byte_count > -1 ? 
byte_count : 0); + PFS_file_stat *file_stat; + + if (file != NULL) + { + file_stat= & file->m_file_stat; + } + else + { + file_stat= & klass->m_file_stat; + } + switch (state->m_operation) { /* Group read operations */ case PSI_FILE_READ: - byte_stat= &file->m_file_stat.m_io_stat.m_read; + byte_stat= &file_stat->m_io_stat.m_read; break; /* Group write operations */ case PSI_FILE_WRITE: - byte_stat= &file->m_file_stat.m_io_stat.m_write; + byte_stat= &file_stat->m_io_stat.m_write; break; /* Group remaining operations as miscellaneous */ case PSI_FILE_CREATE: @@ -3931,7 +4055,7 @@ static void end_file_wait_v1(PSI_file_locker *locker, case PSI_FILE_SYNC: case PSI_FILE_STAT: case PSI_FILE_CLOSE: - byte_stat= &file->m_file_stat.m_io_stat.m_misc; + byte_stat= &file_stat->m_io_stat.m_misc; break; default: DBUG_ASSERT(false); @@ -3959,7 +4083,7 @@ static void end_file_wait_v1(PSI_file_locker *locker, PFS_single_stat *event_name_array; event_name_array= thread->m_instr_class_waits_stats; - uint index= file->m_class->m_event_name_index; + uint index= klass->m_event_name_index; if (flags & STATE_FLAG_TIMED) { @@ -3980,6 +4104,9 @@ static void end_file_wait_v1(PSI_file_locker *locker, wait->m_timer_end= timer_end; wait->m_number_of_bytes= bytes; wait->m_end_event_id= thread->m_event_id; + wait->m_object_instance_addr= file; + wait->m_weak_file= file; + wait->m_weak_version= (file ? file->get_version() : 0); if (flag_events_waits_history) insert_events_waits_history(thread, wait); @@ -3988,22 +4115,79 @@ static void end_file_wait_v1(PSI_file_locker *locker, thread->m_events_waits_current--; } } +} - /* Release or destroy the file if necessary */ - switch(state->m_operation) +/** + Implementation of the file instrumentation interface. + @sa PSI_v1::start_file_close_wait. 
+*/ +static void start_file_close_wait_v1(PSI_file_locker *locker, + const char *src_file, + uint src_line) +{ + PFS_thread *thread; + const char *name; + uint len; + PFS_file *pfs_file; + PSI_file_locker_state *state= reinterpret_cast<PSI_file_locker_state*> (locker); + DBUG_ASSERT(state != NULL); + + switch (state->m_operation) { - case PSI_FILE_CLOSE: - case PSI_FILE_STREAM_CLOSE: - case PSI_FILE_STAT: - release_file(file); - break; case PSI_FILE_DELETE: - DBUG_ASSERT(thread != NULL); - destroy_file(thread, file); + thread= reinterpret_cast<PFS_thread*> (state->m_thread); + name= state->m_name; + len= strlen(name); + pfs_file= find_or_create_file(thread, NULL, name, len, false); + state->m_file= reinterpret_cast<PSI_file*> (pfs_file); + break; + case PSI_FILE_STREAM_CLOSE: + case PSI_FILE_CLOSE: break; default: + DBUG_ASSERT(false); break; } + + start_file_wait_v1(locker, 0, src_file, src_line); + + return; +} + +/** + Implementation of the file instrumentation interface. + @sa PSI_v1::end_file_close_wait. 
+*/ +static void end_file_close_wait_v1(PSI_file_locker *locker, int rc) +{ + PSI_file_locker_state *state= reinterpret_cast<PSI_file_locker_state*> (locker); + DBUG_ASSERT(state != NULL); + + end_file_wait_v1(locker, 0); + + if (rc == 0) + { + PFS_thread *thread= reinterpret_cast<PFS_thread*> (state->m_thread); + PFS_file *file= reinterpret_cast<PFS_file*> (state->m_file); + + /* Release or destroy the file if necessary */ + switch(state->m_operation) + { + case PSI_FILE_CLOSE: + case PSI_FILE_STREAM_CLOSE: + if (file != NULL) + release_file(file); + break; + case PSI_FILE_DELETE: + if (file != NULL) + destroy_file(thread, file); + break; + default: + DBUG_ASSERT(false); + break; + } + } + return; } static void start_stage_v1(PSI_stage_key key, const char *src_file, int src_line) @@ -4165,7 +4349,8 @@ static void end_stage_v1() static PSI_statement_locker* get_thread_statement_locker_v1(PSI_statement_locker_state *state, - PSI_statement_key key) + PSI_statement_key key, + const void *charset) { DBUG_ASSERT(state != NULL); if (! 
flag_global_instrumentation) @@ -4262,9 +4447,11 @@ get_thread_statement_locker_v1(PSI_statement_locker_state *state, if (flag_statements_digest) { + const CHARSET_INFO *cs= static_cast <const CHARSET_INFO*> (charset); flags|= STATE_FLAG_DIGEST; state->m_digest_state.m_last_id_index= 0; digest_reset(& state->m_digest_state.m_digest_storage); + state->m_digest_state.m_digest_storage.m_charset_number= cs->number; } state->m_discarded= false; @@ -4288,6 +4475,8 @@ get_thread_statement_locker_v1(PSI_statement_locker_state *state, state->m_no_index_used= 0; state->m_no_good_index_used= 0; + state->m_schema_name_length= 0; + return reinterpret_cast<PSI_statement_locker*> (state); } @@ -4352,6 +4541,13 @@ static void start_statement_v1(PSI_statement_locker *locker, state->m_timer_start= timer_start; } + compile_time_assert(PSI_SCHEMA_NAME_LEN == NAME_LEN); + DBUG_ASSERT(db_len <= sizeof(state->m_schema_name)); + + if (db_len > 0) + memcpy(state->m_schema_name, db, db_len); + state->m_schema_name_length= db_len; + if (flags & STATE_FLAG_EVENT) { PFS_events_statements *pfs= reinterpret_cast<PFS_events_statements*> (state->m_statement); @@ -4563,11 +4759,10 @@ static void end_statement_v1(PSI_statement_locker *locker, void *stmt_da) if (flags & STATE_FLAG_DIGEST) { digest_storage= &state->m_digest_state.m_digest_storage; - - /* - Populate PFS_statements_digest_stat with computed digest information. - */ - digest_stat= find_or_create_digest(thread, digest_storage); + /* Populate PFS_statements_digest_stat with computed digest information.*/ + digest_stat= find_or_create_digest(thread, digest_storage, + state->m_schema_name, + state->m_schema_name_length); } if (flags & STATE_FLAG_EVENT) @@ -4633,11 +4828,10 @@ static void end_statement_v1(PSI_statement_locker *locker, void *stmt_da) { /* Set digest stat. */ digest_storage= &state->m_digest_state.m_digest_storage; - - /* - Populate PFS_statements_digest_stat with computed digest information. 
- */ - digest_stat= find_or_create_digest(thread, digest_storage); + /* Populate statements_digest_stat with computed digest information. */ + digest_stat= find_or_create_digest(thread, digest_storage, + state->m_schema_name, + state->m_schema_name_length); } } @@ -4869,6 +5063,42 @@ static void set_socket_thread_owner_v1(PSI_socket *socket) pfs_socket->m_thread_owner= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS); } + +/** + Implementation of the thread attribute connection interface + @sa PSI_v1::set_thread_connect_attr. +*/ +static int set_thread_connect_attrs_v1(const char *buffer, uint length, + const void *from_cs) +{ + + PFS_thread *thd= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS); + + DBUG_ASSERT(buffer != NULL); + + if (likely(thd != NULL) && session_connect_attrs_size_per_thread > 0) + { + /* copy from the input buffer as much as we can fit */ + uint copy_size= (uint)(length < session_connect_attrs_size_per_thread ? + length : session_connect_attrs_size_per_thread); + thd->m_lock.allocated_to_dirty(); + memcpy(thd->m_session_connect_attrs, buffer, copy_size); + thd->m_session_connect_attrs_length= copy_size; + thd->m_session_connect_attrs_cs= (const CHARSET_INFO *) from_cs; + thd->m_lock.dirty_to_allocated(); + + if (copy_size == length) + return 0; + else + { + session_connect_attrs_lost++; + return 1; + } + } + return 0; +} + + /** Implementation of the instrumentation interface. @sa PSI_v1. 
@@ -4939,6 +5169,8 @@ PSI_v1 PFS_v1= end_file_open_wait_and_bind_to_descriptor_v1, start_file_wait_v1, end_file_wait_v1, + start_file_close_wait_v1, + end_file_close_wait_v1, start_stage_v1, end_stage_v1, get_thread_statement_locker_v1, @@ -4968,7 +5200,8 @@ PSI_v1 PFS_v1= set_socket_info_v1, set_socket_thread_owner_v1, pfs_digest_start_v1, - pfs_digest_add_token_v1 + pfs_digest_add_token_v1, + set_thread_connect_attrs_v1, }; static void* get_interface(int version) diff --git a/storage/perfschema/pfs_account.cc b/storage/perfschema/pfs_account.cc index 18716478681..9221fc3b991 100644 --- a/storage/perfschema/pfs_account.cc +++ b/storage/perfschema/pfs_account.cc @@ -45,7 +45,7 @@ static PFS_single_stat *account_instr_class_waits_array= NULL; static PFS_stage_stat *account_instr_class_stages_array= NULL; static PFS_statement_stat *account_instr_class_statements_array= NULL; -static LF_HASH account_hash; +LF_HASH account_hash; static bool account_hash_inited= false; /** @@ -149,10 +149,11 @@ C_MODE_END */ int init_account_hash(void) { - if (! account_hash_inited) + if ((! 
account_hash_inited) && (account_max > 0)) { lf_hash_init(&account_hash, sizeof(PFS_account*), LF_HASH_UNIQUE, 0, 0, account_hash_get_key, &my_charset_bin); + account_hash.size= account_max; account_hash_inited= true; } return 0; diff --git a/storage/perfschema/pfs_account.h b/storage/perfschema/pfs_account.h index 77a9dfab7ba..1ac379e0fc9 100644 --- a/storage/perfschema/pfs_account.h +++ b/storage/perfschema/pfs_account.h @@ -46,7 +46,7 @@ struct PFS_account_key uint m_key_length; }; -struct PFS_account : PFS_connection_slice +struct PFS_ALIGNED PFS_account : PFS_connection_slice { public: inline void init_refcount(void) @@ -115,6 +115,8 @@ extern ulong account_lost; extern PFS_account *account_array; +extern LF_HASH account_hash; + /** @} */ #endif diff --git a/storage/perfschema/pfs_atomic.h b/storage/perfschema/pfs_atomic.h index ffb4c24ecbf..61b8c2b2804 100644 --- a/storage/perfschema/pfs_atomic.h +++ b/storage/perfschema/pfs_atomic.h @@ -43,6 +43,16 @@ public: } /** Atomic load. */ + static inline int64 load_64(volatile int64 *ptr) + { + int64 result; + rdlock(ptr); + result= my_atomic_load64(ptr); + rdunlock(ptr); + return result; + } + + /** Atomic load. */ static inline uint32 load_u32(volatile uint32 *ptr) { uint32 result; @@ -52,6 +62,16 @@ public: return result; } + /** Atomic load. */ + static inline uint64 load_u64(volatile uint64 *ptr) + { + uint64 result; + rdlock(ptr); + result= (uint64) my_atomic_load64((int64*) ptr); + rdunlock(ptr); + return result; + } + /** Atomic store. */ static inline void store_32(volatile int32 *ptr, int32 value) { @@ -61,6 +81,14 @@ public: } /** Atomic store. */ + static inline void store_64(volatile int64 *ptr, int64 value) + { + wrlock(ptr); + my_atomic_store64(ptr, value); + wrunlock(ptr); + } + + /** Atomic store. */ static inline void store_u32(volatile uint32 *ptr, uint32 value) { wrlock(ptr); @@ -68,6 +96,14 @@ public: wrunlock(ptr); } + /** Atomic store. 
*/ + static inline void store_u64(volatile uint64 *ptr, uint64 value) + { + wrlock(ptr); + my_atomic_store64((int64*) ptr, (int64) value); + wrunlock(ptr); + } + /** Atomic add. */ static inline int32 add_32(volatile int32 *ptr, int32 value) { @@ -79,6 +115,16 @@ public: } /** Atomic add. */ + static inline int64 add_64(volatile int64 *ptr, int64 value) + { + int64 result; + wrlock(ptr); + result= my_atomic_add64(ptr, value); + wrunlock(ptr); + return result; + } + + /** Atomic add. */ static inline uint32 add_u32(volatile uint32 *ptr, uint32 value) { uint32 result; @@ -88,6 +134,16 @@ public: return result; } + /** Atomic add. */ + static inline uint64 add_u64(volatile uint64 *ptr, uint64 value) + { + uint64 result; + wrlock(ptr); + result= (uint64) my_atomic_add64((int64*) ptr, (int64) value); + wrunlock(ptr); + return result; + } + /** Atomic compare and swap. */ static inline bool cas_32(volatile int32 *ptr, int32 *old_value, int32 new_value) @@ -100,6 +156,17 @@ public: } /** Atomic compare and swap. */ + static inline bool cas_64(volatile int64 *ptr, int64 *old_value, + int64 new_value) + { + bool result; + wrlock(ptr); + result= my_atomic_cas64(ptr, old_value, new_value); + wrunlock(ptr); + return result; + } + + /** Atomic compare and swap. */ static inline bool cas_u32(volatile uint32 *ptr, uint32 *old_value, uint32 new_value) { @@ -111,6 +178,18 @@ public: return result; } + /** Atomic compare and swap. 
*/ + static inline bool cas_u64(volatile uint64 *ptr, uint64 *old_value, + uint64 new_value) + { + bool result; + wrlock(ptr); + result= my_atomic_cas64((int64*) ptr, (int64*) old_value, + (uint64) new_value); + wrunlock(ptr); + return result; + } + private: static my_atomic_rwlock_t m_rwlock_array[256]; diff --git a/storage/perfschema/pfs_autosize.cc b/storage/perfschema/pfs_autosize.cc new file mode 100644 index 00000000000..38bd36d8321 --- /dev/null +++ b/storage/perfschema/pfs_autosize.cc @@ -0,0 +1,366 @@ +/* Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */ + +/** + @file storage/perfschema/pfs_autosize.cc + Private interface for the server (implementation). 
+*/ + +#include "my_global.h" +#include "sql_const.h" +#include "pfs_server.h" + +#include <algorithm> +using std::min; +using std::max; + +static const ulong fixed_mutex_instances= 500; +static const ulong fixed_rwlock_instances= 200; +static const ulong fixed_cond_instances= 50; +static const ulong fixed_file_instances= 200; +static const ulong fixed_socket_instances= 10; +static const ulong fixed_thread_instances= 50; + +static const ulong mutex_per_connection= 3; +static const ulong rwlock_per_connection= 1; +static const ulong cond_per_connection= 2; +static const ulong file_per_connection= 0; +static const ulong socket_per_connection= 1; +static const ulong thread_per_connection= 1; + +static const ulong mutex_per_handle= 0; +static const ulong rwlock_per_handle= 0; +static const ulong cond_per_handle= 0; +static const ulong file_per_handle= 0; +static const ulong socket_per_handle= 0; +static const ulong thread_per_handle= 0; + +static const ulong mutex_per_share= 5; +static const ulong rwlock_per_share= 3; +static const ulong cond_per_share= 1; +static const ulong file_per_share= 3; +static const ulong socket_per_share= 0; +static const ulong thread_per_share= 0; + +struct PFS_sizing_data +{ + /** Default value for @c PFS_param.m_account_sizing. */ + ulong m_account_sizing; + /** Default value for @c PFS_param.m_user_sizing. */ + ulong m_user_sizing; + /** Default value for @c PFS_param.m_host_sizing. */ + ulong m_host_sizing; + + /** Default value for @c PFS_param.m_events_waits_history_sizing. */ + ulong m_events_waits_history_sizing; + /** Default value for @c PFS_param.m_events_waits_history_long_sizing. */ + ulong m_events_waits_history_long_sizing; + /** Default value for @c PFS_param.m_events_stages_history_sizing. */ + ulong m_events_stages_history_sizing; + /** Default value for @c PFS_param.m_events_stages_history_long_sizing. */ + ulong m_events_stages_history_long_sizing; + /** Default value for @c PFS_param.m_events_statements_history_sizing. 
*/ + ulong m_events_statements_history_sizing; + /** Default value for @c PFS_param.m_events_statements_history_long_sizing. */ + ulong m_events_statements_history_long_sizing; + /** Default value for @c PFS_param.m_digest_sizing. */ + ulong m_digest_sizing; + /** Default value for @c PFS_param.m_session_connect_attrs_sizing. */ + ulong m_session_connect_attrs_sizing; + + /** + Minimum number of tables to keep statistics for. + On small deployments, all the tables can fit into the table definition cache, + and this value can be 0. + On big deployments, the table definition cache is only a subset of all the tables + in the database, which are accounted for here. + */ + ulong m_min_number_of_tables; + + /** + Load factor for 'volatile' objects (mutexes, table handles, ...). + Instrumented objects that: + - use little memory + - are created/destroyed very frequently + should be stored in a low density (mostly empty) memory buffer, + to optimize for speed. + */ + float m_load_factor_volatile; + /** + Load factor for 'normal' objects (files). + Instrumented objects that: + - use a medium amount of memory + - are created/destroyed + should be stored in a medium density memory buffer, + as a trade off between space and speed. + */ + float m_load_factor_normal; + /** + Load factor for 'static' objects (table shares). + Instrumented objects that: + - use a lot of memory + - are created/destroyed very rarely + can be stored in a high density (mostly packed) memory buffer, + to optimize for space. + */ + float m_load_factor_static; +}; + +PFS_sizing_data small_data= +{ + /* Account / user / host */ + 10, 5, 20, + /* History sizes */ + 5, 100, 5, 100, 5, 100, + /* Digests */ + 1000, + /* Session connect attrs. */ + 512, + /* Min tables */ + 200, + /* Load factors */ + 0.90, 0.90, 0.90 +}; + +PFS_sizing_data medium_data= +{ + /* Account / user / host */ + 100, 100, 100, + /* History sizes */ + 10, 1000, 10, 1000, 10, 1000, + /* Digests */ + 5000, + /* Session connect attrs. 
*/ + 512, + /* Min tables */ + 500, + /* Load factors */ + 0.70, 0.80, 0.90 +}; + +PFS_sizing_data large_data= +{ + /* Account / user / host */ + 100, 100, 100, + /* History sizes */ + 10, 10000, 10, 10000, 10, 10000, + /* Digests */ + 10000, + /* Session connect attrs. */ + 512, + /* Min tables */ + 10000, + /* Load factors */ + 0.50, 0.65, 0.80 +}; + +static inline ulong apply_load_factor(ulong raw_value, float factor) +{ + float value = ((float) raw_value) / factor; + return (ulong) ceil(value); +} + +PFS_sizing_data *estimate_hints(PFS_global_param *param) +{ + if ((param->m_hints.m_max_connections <= MAX_CONNECTIONS_DEFAULT) && + (param->m_hints.m_table_definition_cache <= TABLE_DEF_CACHE_DEFAULT) && + (param->m_hints.m_table_open_cache <= TABLE_OPEN_CACHE_DEFAULT)) + { + /* The my.cnf used is either unchanged, or lower than factory defaults. */ + return & small_data; + } + + if ((param->m_hints.m_max_connections <= MAX_CONNECTIONS_DEFAULT * 2) && + (param->m_hints.m_table_definition_cache <= TABLE_DEF_CACHE_DEFAULT * 2) && + (param->m_hints.m_table_open_cache <= TABLE_OPEN_CACHE_DEFAULT * 2)) + { + /* Some defaults have been increased, to "moderate" values. */ + return & medium_data; + } + + /* Looks like a server in production. 
*/ + return & large_data; +} + +static void apply_heuristic(PFS_global_param *p, PFS_sizing_data *h) +{ + ulong count; + ulong con = p->m_hints.m_max_connections; + ulong handle = p->m_hints.m_table_open_cache; + ulong share = p->m_hints.m_table_definition_cache; + ulong file = p->m_hints.m_open_files_limit; + + if (p->m_table_sizing < 0) + { + count= handle; + + p->m_table_sizing= apply_load_factor(count, h->m_load_factor_volatile); + } + + if (p->m_table_share_sizing < 0) + { + count= share; + + count= max<ulong>(count, h->m_min_number_of_tables); + p->m_table_share_sizing= apply_load_factor(count, h->m_load_factor_static); + } + + if (p->m_account_sizing < 0) + { + p->m_account_sizing= h->m_account_sizing; + } + + if (p->m_user_sizing < 0) + { + p->m_user_sizing= h->m_user_sizing; + } + + if (p->m_host_sizing < 0) + { + p->m_host_sizing= h->m_host_sizing; + } + + if (p->m_events_waits_history_sizing < 0) + { + p->m_events_waits_history_sizing= h->m_events_waits_history_sizing; + } + + if (p->m_events_waits_history_long_sizing < 0) + { + p->m_events_waits_history_long_sizing= h->m_events_waits_history_long_sizing; + } + + if (p->m_events_stages_history_sizing < 0) + { + p->m_events_stages_history_sizing= h->m_events_stages_history_sizing; + } + + if (p->m_events_stages_history_long_sizing < 0) + { + p->m_events_stages_history_long_sizing= h->m_events_stages_history_long_sizing; + } + + if (p->m_events_statements_history_sizing < 0) + { + p->m_events_statements_history_sizing= h->m_events_statements_history_sizing; + } + + if (p->m_events_statements_history_long_sizing < 0) + { + p->m_events_statements_history_long_sizing= h->m_events_statements_history_long_sizing; + } + + if (p->m_digest_sizing < 0) + { + p->m_digest_sizing= h->m_digest_sizing; + } + + if (p->m_session_connect_attrs_sizing < 0) + { + p->m_session_connect_attrs_sizing= h->m_session_connect_attrs_sizing; + } + + if (p->m_mutex_sizing < 0) + { + count= fixed_mutex_instances + + con * 
mutex_per_connection + + handle * mutex_per_handle + + share * mutex_per_share; + + p->m_mutex_sizing= apply_load_factor(count, h->m_load_factor_volatile); + } + + if (p->m_rwlock_sizing < 0) + { + count= fixed_rwlock_instances + + con * rwlock_per_connection + + handle * rwlock_per_handle + + share * rwlock_per_share; + + p->m_rwlock_sizing= apply_load_factor(count, h->m_load_factor_volatile); + } + + if (p->m_cond_sizing < 0) + { + ulong count; + count= fixed_cond_instances + + con * cond_per_connection + + handle * cond_per_handle + + share * cond_per_share; + + p->m_cond_sizing= apply_load_factor(count, h->m_load_factor_volatile); + } + + if (p->m_file_sizing < 0) + { + count= fixed_file_instances + + con * file_per_connection + + handle * file_per_handle + + share * file_per_share; + + count= max<ulong>(count, file); + p->m_file_sizing= apply_load_factor(count, h->m_load_factor_normal); + } + + if (p->m_socket_sizing < 0) + { + count= fixed_socket_instances + + con * socket_per_connection + + handle * socket_per_handle + + share * socket_per_share; + + p->m_socket_sizing= apply_load_factor(count, h->m_load_factor_volatile); + } + + if (p->m_thread_sizing < 0) + { + count= fixed_thread_instances + + con * thread_per_connection + + handle * thread_per_handle + + share * thread_per_share; + + p->m_thread_sizing= apply_load_factor(count, h->m_load_factor_volatile); + } +} + +void pfs_automated_sizing(PFS_global_param *param) +{ + PFS_sizing_data *heuristic; + heuristic= estimate_hints(param); + apply_heuristic(param, heuristic); + + DBUG_ASSERT(param->m_account_sizing >= 0); + DBUG_ASSERT(param->m_digest_sizing >= 0); + DBUG_ASSERT(param->m_host_sizing >= 0); + DBUG_ASSERT(param->m_user_sizing >= 0); + + DBUG_ASSERT(param->m_events_waits_history_sizing >= 0); + DBUG_ASSERT(param->m_events_waits_history_long_sizing >= 0); + DBUG_ASSERT(param->m_events_stages_history_sizing >= 0); + DBUG_ASSERT(param->m_events_stages_history_long_sizing >= 0); + 
DBUG_ASSERT(param->m_events_statements_history_sizing >= 0); + DBUG_ASSERT(param->m_events_statements_history_long_sizing >= 0); + DBUG_ASSERT(param->m_session_connect_attrs_sizing >= 0); + + DBUG_ASSERT(param->m_mutex_sizing >= 0); + DBUG_ASSERT(param->m_rwlock_sizing >= 0); + DBUG_ASSERT(param->m_cond_sizing >= 0); + DBUG_ASSERT(param->m_file_sizing >= 0); + DBUG_ASSERT(param->m_socket_sizing >= 0); + DBUG_ASSERT(param->m_thread_sizing >= 0); + DBUG_ASSERT(param->m_table_sizing >= 0); + DBUG_ASSERT(param->m_table_share_sizing >= 0); +} + diff --git a/storage/perfschema/pfs_digest.cc b/storage/perfschema/pfs_digest.cc index 92c27b2e85f..c5df64d9243 100644 --- a/storage/perfschema/pfs_digest.cc +++ b/storage/perfschema/pfs_digest.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved. +/* Copyright (c) 2008, 2012, Oracle and/or its affiliates. All rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -30,6 +30,7 @@ #include "table_helper.h" #include "my_md5.h" #include "sql_lex.h" +#include "sql_string.h" #include <string.h> /* Generated code */ @@ -58,7 +59,6 @@ ulong digest_max= 0; ulong digest_lost= 0; - /** EVENTS_STATEMENTS_HISTORY_LONG circular buffer. */ PFS_statements_digest_stat *statements_digest_stat_array= NULL; /** Consumer flag for table EVENTS_STATEMENTS_SUMMARY_BY_DIGEST. 
*/ @@ -69,7 +69,7 @@ bool flag_statements_digest= true; */ volatile uint32 digest_index= 1; -static LF_HASH digest_hash; +LF_HASH digest_hash; static bool digest_hash_inited= false; /** @@ -123,8 +123,8 @@ static uchar *digest_hash_get_key(const uchar *entry, size_t *length, DBUG_ASSERT(typed_entry != NULL); digest= *typed_entry; DBUG_ASSERT(digest != NULL); - *length= PFS_MD5_SIZE; - result= digest->m_digest_hash.m_md5; + *length= sizeof (PFS_digest_key); + result= & digest->m_digest_key; return const_cast<uchar*> (reinterpret_cast<const uchar*> (result)); } C_MODE_END @@ -136,11 +136,12 @@ C_MODE_END */ int init_digest_hash(void) { - if (! digest_hash_inited) + if ((! digest_hash_inited) && (digest_max > 0)) { lf_hash_init(&digest_hash, sizeof(PFS_statements_digest_stat*), LF_HASH_UNIQUE, 0, 0, digest_hash_get_key, &my_charset_bin); + digest_hash.size= digest_max; digest_hash_inited= true; } return 0; @@ -167,8 +168,10 @@ static LF_PINS* get_digest_hash_pins(PFS_thread *thread) } PFS_statement_stat* -find_or_create_digest(PFS_thread* thread, - PSI_digest_storage* digest_storage) +find_or_create_digest(PFS_thread *thread, + PSI_digest_storage *digest_storage, + const char *schema_name, + uint schema_name_length) { if (statements_digest_stat_array == NULL) return NULL; @@ -180,13 +183,21 @@ find_or_create_digest(PFS_thread* thread, if (unlikely(pins == NULL)) return NULL; + /* + Note: the LF_HASH key is a block of memory, + make sure to clean unused bytes, + so that memcmp() can compare keys. + */ + PFS_digest_key hash_key; + memset(& hash_key, 0, sizeof(hash_key)); /* Compute MD5 Hash of the tokens received. 
*/ - PFS_digest_hash md5; - compute_md5_hash((char *) md5.m_md5, + compute_md5_hash((char *) hash_key.m_md5, (char *) digest_storage->m_token_array, digest_storage->m_byte_count); - - unsigned char* hash_key= md5.m_md5; + /* Add the current schema to the key */ + hash_key.m_schema_name_length= schema_name_length; + if (schema_name_length > 0) + memcpy(hash_key.m_schema_name, schema_name, schema_name_length); int res; ulong safe_index; @@ -202,7 +213,7 @@ search: /* Lookup LF_HASH using this new key. */ entry= reinterpret_cast<PFS_statements_digest_stat**> (lf_hash_search(&digest_hash, pins, - hash_key, PFS_MD5_SIZE)); + &hash_key, sizeof(PFS_digest_key))); if (entry && (entry != MY_ERRPTR)) { @@ -244,7 +255,7 @@ search: pfs= &statements_digest_stat_array[safe_index]; /* Copy digest hash/LF Hash search key. */ - memcpy(pfs->m_digest_hash.m_md5, md5.m_md5, PFS_MD5_SIZE); + memcpy(& pfs->m_digest_key, &hash_key, sizeof(PFS_digest_key)); /* Copy digest storage to statement_digest_stat_array so that it could be @@ -278,7 +289,7 @@ search: return NULL; } -void purge_digest(PFS_thread* thread, unsigned char* hash_key) +void purge_digest(PFS_thread* thread, PFS_digest_key *hash_key) { LF_PINS *pins= get_digest_hash_pins(thread); if (unlikely(pins == NULL)) @@ -289,12 +300,12 @@ void purge_digest(PFS_thread* thread, unsigned char* hash_key) /* Lookup LF_HASH using this new key. */ entry= reinterpret_cast<PFS_statements_digest_stat**> (lf_hash_search(&digest_hash, pins, - hash_key, PFS_MD5_SIZE)); + hash_key, sizeof(PFS_digest_key))); if (entry && (entry != MY_ERRPTR)) - { + { lf_hash_delete(&digest_hash, pins, - hash_key, PFS_MD5_SIZE); + hash_key, sizeof(PFS_digest_key)); } lf_hash_search_unpin(pins); return; @@ -313,7 +324,7 @@ void PFS_statements_digest_stat::reset_index(PFS_thread *thread) /* Only remove entries that exists in the HASH index. 
*/ if (m_digest_storage.m_byte_count > 0) { - purge_digest(thread, m_digest_hash.m_md5); + purge_digest(thread, & m_digest_key); } } @@ -347,98 +358,130 @@ void reset_esms_by_digest() */ void get_digest_text(char* digest_text, PSI_digest_storage* digest_storage) { + DBUG_ASSERT(digest_storage != NULL); bool truncated= false; int byte_count= digest_storage->m_byte_count; - int need_bytes; + int bytes_needed= 0; uint tok= 0; - char *id_string; - int id_length; int current_byte= 0; lex_token_string *tok_data; /* -4 is to make sure extra space for '...' and a '\0' at the end. */ - int available_bytes_to_write= COL_DIGEST_TEXT_SIZE - 4; + int bytes_available= COL_DIGEST_TEXT_SIZE - 4; + + /* Convert text to utf8 */ + const CHARSET_INFO *from_cs= get_charset(digest_storage->m_charset_number, MYF(0)); + const CHARSET_INFO *to_cs= &my_charset_utf8_bin; + + if (from_cs == NULL) + { + /* + Can happen, as we do dirty reads on digest_storage, + which can be written to in another thread. + */ + *digest_text= '\0'; + return; + } + + /* + Max converted size is number of characters * max multibyte length of the + target charset, which is 4 for UTF8. + */ + const uint max_converted_size= PSI_MAX_DIGEST_STORAGE_SIZE * 4; + char id_buffer[max_converted_size]; + char *id_string; + int id_length; + bool convert_text= !my_charset_same(from_cs, to_cs); DBUG_ASSERT(byte_count <= PSI_MAX_DIGEST_STORAGE_SIZE); while ((current_byte < byte_count) && - (available_bytes_to_write > 0) && - (! truncated)) + (bytes_available > 0) && + !truncated) { current_byte= read_token(digest_storage, current_byte, &tok); - tok_data= & lex_token_array[tok]; + tok_data= &lex_token_array[tok]; switch (tok) { /* All identifiers are printed with their name. 
*/ case IDENT: - current_byte= read_identifier(digest_storage, current_byte, - & id_string, & id_length); - need_bytes= id_length + 1; /* <id> space */ - if (need_bytes <= available_bytes_to_write) + case IDENT_QUOTED: { - if (id_length > 0) + char *id_ptr; + int id_len; + uint err_cs= 0; + + /* Get the next identifier from the storage buffer. */ + current_byte= read_identifier(digest_storage, current_byte, + &id_ptr, &id_len); + if (convert_text) { - strncpy(digest_text, id_string, id_length); - digest_text+= id_length; + /* Verify that the converted text will fit. */ + if (to_cs->mbmaxlen*id_len > max_converted_size) + { + truncated= true; + break; + } + /* Convert identifier string into the storage character set. */ + id_length= my_convert(id_buffer, max_converted_size, to_cs, + id_ptr, id_len, from_cs, &err_cs); + id_string= id_buffer; } - *digest_text= ' '; - digest_text++; - available_bytes_to_write-= need_bytes; - } - else - { - truncated= true; - } - break; - case IDENT_QUOTED: - current_byte= read_identifier(digest_storage, current_byte, - & id_string, & id_length); - need_bytes= id_length + 3; /* quote <id> quote space */ - if (need_bytes <= available_bytes_to_write) - { - *digest_text= '`'; - digest_text++; - if (id_length > 0) + else { - strncpy(digest_text, id_string, id_length); - digest_text+= id_length; + id_string= id_ptr; + id_length= id_len; + } + + if (id_length == 0 || err_cs != 0) + { + truncated= true; + break; + } + /* Copy the converted identifier into the digest string. */ + bytes_needed= id_length + (tok == IDENT ? 
1 : 3); + if (bytes_needed <= bytes_available) + { + if (tok == IDENT_QUOTED) + *digest_text++= '`'; + if (id_length > 0) + { + memcpy(digest_text, id_string, id_length); + digest_text+= id_length; + } + if (tok == IDENT_QUOTED) + *digest_text++= '`'; + *digest_text++= ' '; + bytes_available-= bytes_needed; + } + else + { + truncated= true; } - *digest_text= '`'; - digest_text++; - *digest_text= ' '; - digest_text++; - available_bytes_to_write-= need_bytes; - } - else - { - truncated= true; } break; /* Everything else is printed as is. */ default: /* - Make sure not to overflow digest_text buffer while writing - this token string. + Make sure not to overflow digest_text buffer. +1 is to make sure extra space for ' '. */ int tok_length= tok_data->m_token_length; - need_bytes= tok_length + 1; + bytes_needed= tok_length + 1; - if (need_bytes <= available_bytes_to_write) + if (bytes_needed <= bytes_available) { - strncpy(digest_text, - tok_data->m_token_string, - tok_length); + strncpy(digest_text, tok_data->m_token_string, tok_length); digest_text+= tok_length; - *digest_text= ' '; - digest_text++; - available_bytes_to_write-= need_bytes; + *digest_text++= ' '; + bytes_available-= bytes_needed; } else { truncated= true; } + break; } } @@ -524,7 +567,11 @@ PSI_digest_locker* pfs_digest_add_token_v1(PSI_digest_locker *locker, digest_storage= &state->m_digest_storage; - if (digest_storage->m_full) + /* + Stop collecting further tokens if digest storage is full or + if END token is received. + */ + if (digest_storage->m_full || token == END_OF_INPUT) return NULL; /* @@ -555,19 +602,23 @@ PSI_digest_locker* pfs_digest_add_token_v1(PSI_digest_locker *locker, TOK_PFS_GENERIC_VALUE := BIN_NUM | DECIMAL_NUM | ... 
| ULONGLONG_NUM */ token= TOK_PFS_GENERIC_VALUE; - + } + /* fall through */ + case NULL_SYM: + { if ((last_token2 == TOK_PFS_GENERIC_VALUE || - last_token2 == TOK_PFS_GENERIC_VALUE_LIST) && + last_token2 == TOK_PFS_GENERIC_VALUE_LIST || + last_token2 == NULL_SYM) && (last_token == ',')) { /* REDUCE: TOK_PFS_GENERIC_VALUE_LIST := - TOK_PFS_GENERIC_VALUE ',' TOK_PFS_GENERIC_VALUE + (TOK_PFS_GENERIC_VALUE|NULL_SYM) ',' (TOK_PFS_GENERIC_VALUE|NULL_SYM) REDUCE: TOK_PFS_GENERIC_VALUE_LIST := - TOK_PFS_GENERIC_VALUE_LIST ',' TOK_PFS_GENERIC_VALUE + TOK_PFS_GENERIC_VALUE_LIST ',' (TOK_PFS_GENERIC_VALUE|NULL_SYM) */ digest_storage->m_byte_count-= 2*PFS_SIZE_OF_A_TOKEN; token= TOK_PFS_GENERIC_VALUE_LIST; diff --git a/storage/perfschema/pfs_digest.h b/storage/perfschema/pfs_digest.h index 2646596171c..d2453dc32c6 100644 --- a/storage/perfschema/pfs_digest.h +++ b/storage/perfschema/pfs_digest.h @@ -38,32 +38,26 @@ struct PFS_thread; /** Structure to store a MD5 hash value (digest) for a statement. */ -struct PFS_digest_hash +struct PFS_digest_key { unsigned char m_md5[PFS_MD5_SIZE]; + char m_schema_name[NAME_LEN]; + uint m_schema_name_length; }; /** A statement digest stat record. */ -struct PFS_statements_digest_stat +struct PFS_ALIGNED PFS_statements_digest_stat { - /** - Digest MD5 Hash. - */ - PFS_digest_hash m_digest_hash; + /** Digest Schema + MD5 Hash. */ + PFS_digest_key m_digest_key; - /** - Digest Storage. - */ + /** Digest Storage. */ PSI_digest_storage m_digest_storage; - /** - Statement stat. - */ + /** Statement stat. */ PFS_statement_stat m_stat; - /** - First Seen/last seen. 
- */ + /** First and last seen timestamps.*/ ulonglong m_first_seen; ulonglong m_last_seen; @@ -78,10 +72,12 @@ void cleanup_digest(); int init_digest_hash(void); void cleanup_digest_hash(void); -PFS_statement_stat* find_or_create_digest(PFS_thread*, - PSI_digest_storage*); +PFS_statement_stat* find_or_create_digest(PFS_thread *thread, + PSI_digest_storage *digest_storage, + const char *schema_name, + uint schema_name_length); -void get_digest_text(char* digest_text, PSI_digest_storage*); +void get_digest_text(char *digest_text, PSI_digest_storage *digest_storage); void reset_esms_by_digest(); @@ -90,8 +86,8 @@ extern PFS_statements_digest_stat *statements_digest_stat_array; /* Instrumentation callbacks for pfs.cc */ -struct PSI_digest_locker* pfs_digest_start_v1(PSI_statement_locker *locker); -PSI_digest_locker* pfs_digest_add_token_v1(PSI_digest_locker *locker, +struct PSI_digest_locker *pfs_digest_start_v1(PSI_statement_locker *locker); +PSI_digest_locker *pfs_digest_add_token_v1(PSI_digest_locker *locker, uint token, OPAQUE_LEX_YYSTYPE *yylval); @@ -99,6 +95,7 @@ static inline void digest_reset(PSI_digest_storage *digest) { digest->m_full= false; digest->m_byte_count= 0; + digest->m_charset_number= 0; } static inline void digest_copy(PSI_digest_storage *to, const PSI_digest_storage *from) @@ -107,20 +104,21 @@ static inline void digest_copy(PSI_digest_storage *to, const PSI_digest_storage { to->m_full= from->m_full; to->m_byte_count= from->m_byte_count; + to->m_charset_number= from->m_charset_number; DBUG_ASSERT(to->m_byte_count <= PSI_MAX_DIGEST_STORAGE_SIZE); memcpy(to->m_token_array, from->m_token_array, to->m_byte_count); } else { - DBUG_ASSERT(! from->m_full); DBUG_ASSERT(from->m_byte_count == 0); to->m_full= false; to->m_byte_count= 0; + to->m_charset_number= 0; } } /** - Function to read a single token from token array. + Read a single token from token array. 
*/ inline int read_token(PSI_digest_storage *digest_storage, int index, uint *tok) @@ -141,7 +139,7 @@ inline int read_token(PSI_digest_storage *digest_storage, } /** - Function to store a single token in token array. + Store a single token in token array. */ inline void store_token(PSI_digest_storage* digest_storage, uint token) { @@ -162,7 +160,7 @@ inline void store_token(PSI_digest_storage* digest_storage, uint token) } /** - Function to read an identifier from token array. + Read an identifier from token array. */ inline int read_identifier(PSI_digest_storage* digest_storage, int index, char ** id_string, int *id_length) @@ -186,7 +184,7 @@ inline int read_identifier(PSI_digest_storage* digest_storage, } /** - Function to store an identifier in token array. + Store an identifier in token array. */ inline void store_token_identifier(PSI_digest_storage* digest_storage, uint token, @@ -207,9 +205,7 @@ inline void store_token_identifier(PSI_digest_storage* digest_storage, dest[3]= (id_length >> 8) & 0xff; /* Write the string data */ if (id_length > 0) - { - strncpy((char *)(dest + 4), id_name, id_length); - } + memcpy((char *)(dest + 4), id_name, id_length); digest_storage->m_byte_count+= bytes_needed; } else @@ -218,4 +214,6 @@ inline void store_token_identifier(PSI_digest_storage* digest_storage, } } +extern LF_HASH digest_hash; + #endif diff --git a/storage/perfschema/pfs_engine_table.cc b/storage/perfschema/pfs_engine_table.cc index 304e837fa84..8f6f0fa3bcd 100644 --- a/storage/perfschema/pfs_engine_table.cc +++ b/storage/perfschema/pfs_engine_table.cc @@ -20,6 +20,7 @@ #include "my_global.h" #include "my_pthread.h" +#include "hostname.h" /* For Host_entry */ #include "pfs_engine_table.h" #include "table_events_waits.h" @@ -69,6 +70,8 @@ #include "table_socket_instances.h" #include "table_socket_summary_by_instance.h" #include "table_socket_summary_by_event_name.h" +#include "table_session_connect_attrs.h" +#include "table_session_account_connect_attrs.h" /* 
For show status */ #include "pfs_column_values.h" @@ -145,6 +148,8 @@ static PFS_engine_table_share *all_shares[]= &table_socket_instances::m_share, &table_socket_summary_by_instance::m_share, &table_socket_summary_by_event_name::m_share, + &table_session_connect_attrs::m_share, + &table_session_account_connect_attrs::m_share, NULL }; @@ -683,20 +688,22 @@ PFS_unknown_acl pfs_unknown_acl; ACL_internal_access_result PFS_unknown_acl::check(ulong want_access, ulong *save_priv) const { - const ulong always_forbidden= INSERT_ACL | UPDATE_ACL | DELETE_ACL - | CREATE_ACL | REFERENCES_ACL | INDEX_ACL | ALTER_ACL - | CREATE_VIEW_ACL | TRIGGER_ACL | LOCK_TABLES_ACL; + const ulong always_forbidden= CREATE_ACL + | REFERENCES_ACL | INDEX_ACL | ALTER_ACL + | CREATE_VIEW_ACL | TRIGGER_ACL; if (unlikely(want_access & always_forbidden)) return ACL_INTERNAL_ACCESS_DENIED; /* - There is no point in hidding (by enforcing ACCESS_DENIED for SELECT_ACL + There is no point in hiding (by enforcing ACCESS_DENIED for SELECT_ACL on performance_schema.*) tables that do not exist anyway. When SELECT_ACL is granted on performance_schema.* or *.*, SELECT * from performance_schema.wrong_table will fail with a more understandable ER_NO_SUCH_TABLE error, instead of ER_TABLEACCESS_DENIED_ERROR. + The same goes for other DML (INSERT_ACL | UPDATE_ACL | DELETE_ACL), + for ease of use: error messages will be less surprising. 
*/ return ACL_INTERNAL_ACCESS_CHECK_GRANT; } @@ -978,363 +985,445 @@ bool pfs_show_status(handlerton *hton, THD *thd, total_memory+= size; break; case 56: - name= "events_waits_summary_global_by_event_name.row_size"; - size= sizeof(PFS_single_stat); - break; - case 57: - name= "events_waits_summary_global_by_event_name.row_count"; - size= wait_class_max; - break; - case 58: - name= "events_waits_summary_global_by_event_name.memory"; - size= wait_class_max * sizeof(PFS_single_stat); - total_memory+= size; - break; - case 59: name= "(pfs_account).row_size"; size= sizeof(PFS_account); break; - case 60: + case 57: name= "(pfs_account).row_count"; size= account_max; break; - case 61: + case 58: name= "(pfs_account).memory"; size= account_max * sizeof(PFS_account); total_memory+= size; break; - case 62: + case 59: name= "events_waits_summary_by_account_by_event_name.row_size"; size= sizeof(PFS_single_stat); break; - case 63: + case 60: name= "events_waits_summary_by_account_by_event_name.row_count"; size= account_max * wait_class_max; break; - case 64: + case 61: name= "events_waits_summary_by_account_by_event_name.memory"; size= account_max * wait_class_max * sizeof(PFS_single_stat); total_memory+= size; break; - case 65: + case 62: name= "events_waits_summary_by_user_by_event_name.row_size"; size= sizeof(PFS_single_stat); break; - case 66: + case 63: name= "events_waits_summary_by_user_by_event_name.row_count"; size= user_max * wait_class_max; break; - case 67: + case 64: name= "events_waits_summary_by_user_by_event_name.memory"; size= user_max * wait_class_max * sizeof(PFS_single_stat); total_memory+= size; break; - case 68: + case 65: name= "events_waits_summary_by_host_by_event_name.row_size"; size= sizeof(PFS_single_stat); break; - case 69: + case 66: name= "events_waits_summary_by_host_by_event_name.row_count"; size= host_max * wait_class_max; break; - case 70: + case 67: name= "events_waits_summary_by_host_by_event_name.memory"; size= host_max * wait_class_max * 
sizeof(PFS_single_stat); total_memory+= size; break; - case 71: + case 68: name= "(pfs_user).row_size"; size= sizeof(PFS_user); break; - case 72: + case 69: name= "(pfs_user).row_count"; size= user_max; break; - case 73: + case 70: name= "(pfs_user).memory"; size= user_max * sizeof(PFS_user); total_memory+= size; break; - case 74: + case 71: name= "(pfs_host).row_size"; size= sizeof(PFS_host); break; - case 75: + case 72: name= "(pfs_host).row_count"; size= host_max; break; - case 76: + case 73: name= "(pfs_host).memory"; size= host_max * sizeof(PFS_host); total_memory+= size; break; - case 77: + case 74: name= "(pfs_stage_class).row_size"; size= sizeof(PFS_stage_class); break; - case 78: + case 75: name= "(pfs_stage_class).row_count"; size= stage_class_max; break; - case 79: + case 76: name= "(pfs_stage_class).memory"; size= stage_class_max * sizeof(PFS_stage_class); total_memory+= size; break; - case 80: + case 77: name= "events_stages_history.row_size"; size= sizeof(PFS_events_stages); break; - case 81: + case 78: name= "events_stages_history.row_count"; size= events_stages_history_per_thread * thread_max; break; - case 82: + case 79: name= "events_stages_history.memory"; size= events_stages_history_per_thread * thread_max * sizeof(PFS_events_stages); total_memory+= size; break; - case 83: + case 80: name= "events_stages_history_long.row_size"; size= sizeof(PFS_events_stages); break; - case 84: + case 81: name= "events_stages_history_long.row_count"; size= events_stages_history_long_size; break; - case 85: + case 82: name= "events_stages_history_long.memory"; size= events_stages_history_long_size * sizeof(PFS_events_stages); total_memory+= size; break; - case 86: + case 83: name= "events_stages_summary_by_thread_by_event_name.row_size"; size= sizeof(PFS_stage_stat); break; - case 87: + case 84: name= "events_stages_summary_by_thread_by_event_name.row_count"; size= thread_max * stage_class_max; break; - case 88: + case 85: name= 
"events_stages_summary_by_thread_by_event_name.memory"; size= thread_max * stage_class_max * sizeof(PFS_stage_stat); total_memory+= size; break; - case 89: + case 86: name= "events_stages_summary_global_by_event_name.row_size"; size= sizeof(PFS_stage_stat); break; - case 90: + case 87: name= "events_stages_summary_global_by_event_name.row_count"; size= stage_class_max; break; - case 91: + case 88: name= "events_stages_summary_global_by_event_name.memory"; size= stage_class_max * sizeof(PFS_stage_stat); total_memory+= size; break; - case 92: + case 89: name= "events_stages_summary_by_account_by_event_name.row_size"; size= sizeof(PFS_stage_stat); break; - case 93: + case 90: name= "events_stages_summary_by_account_by_event_name.row_count"; size= account_max * stage_class_max; break; - case 94: + case 91: name= "events_stages_summary_by_account_by_event_name.memory"; size= account_max * stage_class_max * sizeof(PFS_stage_stat); total_memory+= size; break; - case 95: + case 92: name= "events_stages_summary_by_user_by_event_name.row_size"; size= sizeof(PFS_stage_stat); break; - case 96: + case 93: name= "events_stages_summary_by_user_by_event_name.row_count"; size= user_max * stage_class_max; break; - case 97: + case 94: name= "events_stages_summary_by_user_by_event_name.memory"; size= user_max * stage_class_max * sizeof(PFS_stage_stat); total_memory+= size; break; - case 98: + case 95: name= "events_stages_summary_by_host_by_event_name.row_size"; size= sizeof(PFS_stage_stat); break; - case 99: + case 96: name= "events_stages_summary_by_host_by_event_name.row_count"; size= host_max * stage_class_max; break; - case 100: + case 97: name= "events_stages_summary_by_host_by_event_name.memory"; size= host_max * stage_class_max * sizeof(PFS_stage_stat); total_memory+= size; break; - case 101: + case 98: name= "(pfs_statement_class).row_size"; size= sizeof(PFS_statement_class); break; - case 102: + case 99: name= "(pfs_statement_class).row_count"; size= statement_class_max; 
break; - case 103: + case 100: name= "(pfs_statement_class).memory"; size= statement_class_max * sizeof(PFS_statement_class); total_memory+= size; break; - case 104: + case 101: name= "events_statements_history.row_size"; size= sizeof(PFS_events_statements); break; - case 105: + case 102: name= "events_statements_history.row_count"; size= events_statements_history_per_thread * thread_max; break; - case 106: + case 103: name= "events_statements_history.memory"; size= events_statements_history_per_thread * thread_max * sizeof(PFS_events_statements); total_memory+= size; break; - case 107: + case 104: name= "events_statements_history_long.row_size"; size= sizeof(PFS_events_statements); break; - case 108: + case 105: name= "events_statements_history_long.row_count"; size= events_statements_history_long_size; break; - case 109: + case 106: name= "events_statements_history_long.memory"; size= events_statements_history_long_size * sizeof(PFS_events_statements); total_memory+= size; break; - case 110: + case 107: name= "events_statements_summary_by_thread_by_event_name.row_size"; size= sizeof(PFS_statement_stat); break; - case 111: + case 108: name= "events_statements_summary_by_thread_by_event_name.row_count"; size= thread_max * statement_class_max; break; - case 112: + case 109: name= "events_statements_summary_by_thread_by_event_name.memory"; size= thread_max * statement_class_max * sizeof(PFS_statement_stat); total_memory+= size; break; - case 113: + case 110: name= "events_statements_summary_global_by_event_name.row_size"; size= sizeof(PFS_statement_stat); break; - case 114: + case 111: name= "events_statements_summary_global_by_event_name.row_count"; size= statement_class_max; break; - case 115: + case 112: name= "events_statements_summary_global_by_event_name.memory"; size= statement_class_max * sizeof(PFS_statement_stat); total_memory+= size; break; - case 116: + case 113: name= "events_statements_summary_by_account_by_event_name.row_size"; size= 
sizeof(PFS_statement_stat); break; - case 117: + case 114: name= "events_statements_summary_by_account_by_event_name.row_count"; size= account_max * statement_class_max; break; - case 118: + case 115: name= "events_statements_summary_by_account_by_event_name.memory"; size= account_max * statement_class_max * sizeof(PFS_statement_stat); total_memory+= size; break; - case 119: + case 116: name= "events_statements_summary_by_user_by_event_name.row_size"; size= sizeof(PFS_statement_stat); break; - case 120: + case 117: name= "events_statements_summary_by_user_by_event_name.row_count"; size= user_max * statement_class_max; break; - case 121: + case 118: name= "events_statements_summary_by_user_by_event_name.memory"; size= user_max * statement_class_max * sizeof(PFS_statement_stat); total_memory+= size; break; - case 122: + case 119: name= "events_statements_summary_by_host_by_event_name.row_size"; size= sizeof(PFS_statement_stat); break; - case 123: + case 120: name= "events_statements_summary_by_host_by_event_name.row_count"; size= host_max * statement_class_max; break; - case 124: + case 121: name= "events_statements_summary_by_host_by_event_name.memory"; size= host_max * statement_class_max * sizeof(PFS_statement_stat); total_memory+= size; break; - case 125: + case 122: name= "events_statements_current.row_size"; size= sizeof(PFS_events_statements); break; - case 126: + case 123: name= "events_statements_current.row_count"; size= thread_max * statement_stack_max; break; - case 127: + case 124: name= "events_statements_current.memory"; size= thread_max * statement_stack_max * sizeof(PFS_events_statements); total_memory+= size; break; - case 128: + case 125: name= "(pfs_socket_class).row_size"; size= sizeof(PFS_socket_class); break; - case 129: + case 126: name= "(pfs_socket_class).row_count"; size= socket_class_max; break; - case 130: + case 127: name= "(pfs_socket_class).memory"; size= socket_class_max * sizeof(PFS_socket_class); total_memory+= size; break; - case 
131: + case 128: name= "socket_instances.row_size"; size= sizeof(PFS_socket); break; - case 132: + case 129: name= "socket_instances.row_count"; size= socket_max; break; - case 133: + case 130: name= "socket_instances.memory"; size= socket_max * sizeof(PFS_socket); total_memory+= size; break; - case 134: + case 131: name= "events_statements_summary_by_digest.row_size"; size= sizeof(PFS_statements_digest_stat); break; - case 135: + case 132: name= "events_statements_summary_by_digest.row_count"; size= digest_max; break; - case 136: + case 133: name= "events_statements_summary_by_digest.memory"; size= digest_max * sizeof(PFS_statements_digest_stat); total_memory+= size; - break; + break; + case 134: + name= "session_connect_attrs.row_size"; + size= thread_max; + break; + case 135: + name= "session_connect_attrs.row_count"; + size= session_connect_attrs_size_per_thread; + break; + case 136: + name= "session_connect_attrs.memory"; + size= thread_max * session_connect_attrs_size_per_thread; + total_memory+= size; + break; + + case 137: + name= "(account_hash).count"; + size= account_hash.count; + break; + case 138: + name= "(account_hash).size"; + size= account_hash.size; + break; + case 139: + name= "(digest_hash).count"; + size= digest_hash.count; + break; + case 140: + name= "(digest_hash).size"; + size= digest_hash.size; + break; + case 141: + name= "(filename_hash).count"; + size= filename_hash.count; + break; + case 142: + name= "(filename_hash).size"; + size= filename_hash.size; + break; + case 143: + name= "(host_hash).count"; + size= host_hash.count; + break; + case 144: + name= "(host_hash).size"; + size= host_hash.size; + break; + case 145: + name= "(setup_actor_hash).count"; + size= setup_actor_hash.count; + break; + case 146: + name= "(setup_actor_hash).size"; + size= setup_actor_hash.size; + break; + case 147: + name= "(setup_object_hash).count"; + size= setup_object_hash.count; + break; + case 148: + name= "(setup_object_hash).size"; + size= 
setup_object_hash.size; + break; + case 149: + name= "(table_share_hash).count"; + size= table_share_hash.count; + break; + case 150: + name= "(table_share_hash).size"; + size= table_share_hash.size; + break; + case 151: + name= "(user_hash).count"; + size= user_hash.count; + break; + case 152: + name= "(user_hash).size"; + size= user_hash.size; + break; + case 153: + /* + This is not a performance_schema buffer, + the data is maintained in the server, + in hostname_cache. + Print the size only, there are: + - no host_cache.count + - no host_cache.memory + */ + name= "host_cache.size"; +#ifdef NOT_YET_IMPLEMENTED + size= sizeof(Host_entry); +#else + size= 0; +#endif + break; + /* This case must be last, for aggregation in total_memory. */ - case 137: + case 154: name= "performance_schema.memory"; size= total_memory; /* This will fail if something is not advertised here */ diff --git a/storage/perfschema/pfs_engine_table.h b/storage/perfschema/pfs_engine_table.h index 40f5404d0b7..981d72ee19e 100644 --- a/storage/perfschema/pfs_engine_table.h +++ b/storage/perfschema/pfs_engine_table.h @@ -263,7 +263,7 @@ public: ~PFS_readonly_acl() {} - ACL_internal_access_result check(ulong want_access, ulong *save_priv) const; + virtual ACL_internal_access_result check(ulong want_access, ulong *save_priv) const; }; /** Singleton instance of PFS_readonly_acl. */ diff --git a/storage/perfschema/pfs_events.h b/storage/perfschema/pfs_events.h index c9586df11bd..97fb7e08d63 100644 --- a/storage/perfschema/pfs_events.h +++ b/storage/perfschema/pfs_events.h @@ -29,7 +29,7 @@ struct PFS_instr_class; struct PFS_events { /** THREAD_ID. */ - ulong m_thread_internal_id; + ulonglong m_thread_internal_id; /** EVENT_ID. */ ulonglong m_event_id; /** END_EVENT_ID. 
*/ diff --git a/storage/perfschema/pfs_events_waits.cc b/storage/perfschema/pfs_events_waits.cc index 2ee9ec292a2..c8a9d20a2f1 100644 --- a/storage/perfschema/pfs_events_waits.cc +++ b/storage/perfschema/pfs_events_waits.cc @@ -230,16 +230,6 @@ void reset_events_waits_by_host() } } -/** Reset table EVENTS_WAITS_GLOBAL_BY_EVENT_NAME data. */ -void reset_events_waits_global() -{ - PFS_single_stat *stat= global_instr_class_waits_array; - PFS_single_stat *stat_last= global_instr_class_waits_array + wait_class_max; - - for ( ; stat < stat_last; stat++) - stat->reset(); -} - void reset_table_waits_by_table() { PFS_table_share *pfs= table_share_array; diff --git a/storage/perfschema/pfs_global.cc b/storage/perfschema/pfs_global.cc index 6c3b79a3e1f..0c022b85748 100644 --- a/storage/perfschema/pfs_global.cc +++ b/storage/perfschema/pfs_global.cc @@ -18,13 +18,16 @@ Miscellaneous global dependencies (implementation). */ -#include "my_global.h" -#include "my_sys.h" #include "pfs_global.h" -#include "my_net.h" +#include <my_sys.h> +#include <my_net.h> +#ifdef HAVE_MALLOC_H +#include <malloc.h> /* memalign() may be here */ +#endif -#include <stdlib.h> -#include <string.h> +#ifdef HAVE_UNISTD_H +#include <unistd.h> +#endif #ifdef __WIN__ #include <winsock2.h> @@ -45,18 +48,65 @@ void *pfs_malloc(size_t size, myf flags) DBUG_ASSERT(! 
pfs_initialized); DBUG_ASSERT(size > 0); - void *ptr= malloc(size); - if (likely(ptr != NULL)) - pfs_allocated_memory+= size; - if (likely((ptr != NULL) && (flags & MY_ZEROFILL))) + void *ptr; + +#ifdef PFS_ALIGNEMENT +#ifdef HAVE_POSIX_MEMALIGN + /* Linux */ + if (unlikely(posix_memalign(& ptr, PFS_ALIGNEMENT, size))) + return NULL; +#else +#ifdef HAVE_MEMALIGN + /* Solaris */ + ptr= memalign(PFS_ALIGNEMENT, size); + if (unlikely(ptr == NULL)) + return NULL; +#else +#ifdef HAVE_ALIGNED_MALLOC + /* Windows */ + ptr= _aligned_malloc(size, PFS_ALIGNEMENT); + if (unlikely(ptr == NULL)) + return NULL; +#else +#error "Missing implementation for PFS_ALIGNENT" +#endif /* HAVE_ALIGNED_MALLOC */ +#endif /* HAVE_MEMALIGN */ +#endif /* HAVE_POSIX_MEMALIGN */ +#else /* PFS_ALIGNMENT */ + /* Everything else */ + ptr= malloc(size); + if (unlikely(ptr == NULL)) + return NULL; +#endif + + pfs_allocated_memory+= size; + if (flags & MY_ZEROFILL) memset(ptr, 0, size); return ptr; } void pfs_free(void *ptr) { - if (ptr != NULL) - free(ptr); + if (ptr == NULL) + return; + +#ifdef HAVE_POSIX_MEMALIGN + /* Allocated with posix_memalign() */ + free(ptr); +#else +#ifdef HAVE_MEMALIGN + /* Allocated with memalign() */ + free(ptr); +#else +#ifdef HAVE_ALIGNED_MALLOC + /* Allocated with _aligned_malloc() */ + _aligned_free(ptr); +#else + /* Allocated with malloc() */ + free(ptr); +#endif /* HAVE_ALIGNED_MALLOC */ +#endif /* HAVE_MEMALIGN */ +#endif /* HAVE_POSIX_MEMALIGN */ } void pfs_print_error(const char *format, ...) diff --git a/storage/perfschema/pfs_global.h b/storage/perfschema/pfs_global.h index 693153cb097..cddf688ddf4 100644 --- a/storage/perfschema/pfs_global.h +++ b/storage/perfschema/pfs_global.h @@ -16,6 +16,9 @@ #ifndef PFS_GLOBAL_H #define PFS_GLOBAL_H +#include "my_global.h" +#include "my_compiler.h" + /** @file storage/perfschema/pfs_global.h Miscellaneous global dependencies (declarations). 
@@ -26,6 +29,18 @@ extern bool pfs_initialized; /** Total memory allocated by the performance schema, in bytes. */ extern ulonglong pfs_allocated_memory; +#if defined(HAVE_POSIX_MEMALIGN) || defined(HAVE_MEMALIGN) || defined(HAVE_ALIGNED_MALLOC) +#define PFS_ALIGNEMENT 64 +#define PFS_ALIGNED MY_ALIGNED(PFS_ALIGNEMENT) +#else +/* + Known platforms that do not provide aligned memory: + - MacOSX Darwin (osx10.5) + For these platforms, compile without the alignment optimization. +*/ +#define PFS_ALIGNED +#endif /* HAVE_POSIX_MEMALIGN || HAVE_MEMALIGN || HAVE_ALIGNED_MALLOC */ + void *pfs_malloc(size_t size, myf flags); /** diff --git a/storage/perfschema/pfs_host.cc b/storage/perfschema/pfs_host.cc index 82b78e19ce8..09763b0bd8b 100644 --- a/storage/perfschema/pfs_host.cc +++ b/storage/perfschema/pfs_host.cc @@ -42,7 +42,7 @@ static PFS_single_stat *host_instr_class_waits_array= NULL; static PFS_stage_stat *host_instr_class_stages_array= NULL; static PFS_statement_stat *host_instr_class_statements_array= NULL; -static LF_HASH host_hash; +LF_HASH host_hash; static bool host_hash_inited= false; /** @@ -146,10 +146,11 @@ C_MODE_END */ int init_host_hash(void) { - if (! host_hash_inited) + if ((! 
host_hash_inited) && (host_max > 0)) { lf_hash_init(&host_hash, sizeof(PFS_host*), LF_HASH_UNIQUE, 0, 0, host_hash_get_key, &my_charset_bin); + host_hash.size= host_max; host_hash_inited= true; } return 0; diff --git a/storage/perfschema/pfs_host.h b/storage/perfschema/pfs_host.h index d04b88e62f3..eb0ff6efc6f 100644 --- a/storage/perfschema/pfs_host.h +++ b/storage/perfschema/pfs_host.h @@ -44,7 +44,7 @@ struct PFS_host_key uint m_key_length; }; -struct PFS_host : PFS_connection_slice +struct PFS_ALIGNED PFS_host : PFS_connection_slice { public: inline void init_refcount(void) @@ -105,6 +105,8 @@ extern ulong host_lost; extern PFS_host *host_array; +extern LF_HASH host_hash; + /** @} */ #endif diff --git a/storage/perfschema/pfs_instr.cc b/storage/perfschema/pfs_instr.cc index 39caabaf030..25e78ee7b5e 100644 --- a/storage/perfschema/pfs_instr.cc +++ b/storage/perfschema/pfs_instr.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved. +/* Copyright (c) 2008, 2012, Oracle and/or its affiliates. All rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -82,6 +82,10 @@ uint statement_stack_max; ulong locker_lost= 0; /** Number of statement lost. @sa STATEMENT_STACK_SIZE. */ ulong statement_lost= 0; +/** Size of connection attribute storage per thread */ +ulong session_connect_attrs_size_per_thread; +/** Number of connection attributes lost */ +ulong session_connect_attrs_lost= 0; /** Mutex instrumentation instances array. 
@@ -140,11 +144,10 @@ PFS_table *table_array= NULL; */ PFS_socket *socket_array= NULL; -PFS_single_stat *global_instr_class_waits_array= NULL; PFS_stage_stat *global_instr_class_stages_array= NULL; PFS_statement_stat *global_instr_class_statements_array= NULL; -static volatile uint32 thread_internal_id_counter= 0; +static volatile uint64 thread_internal_id_counter= 0; static uint thread_instr_class_waits_sizing; static uint thread_instr_class_stages_sizing; @@ -157,9 +160,10 @@ static PFS_events_waits *thread_waits_history_array= NULL; static PFS_events_stages *thread_stages_history_array= NULL; static PFS_events_statements *thread_statements_history_array= NULL; static PFS_events_statements *thread_statements_stack_array= NULL; +static char *thread_session_connect_attrs_array= NULL; /** Hash table for instrumented files. */ -static LF_HASH filename_hash; +LF_HASH filename_hash; /** True if filename_hash is initialized. */ static bool filename_hash_inited= false; @@ -174,6 +178,7 @@ int init_instruments(const PFS_global_param *param) uint thread_stages_history_sizing; uint thread_statements_history_sizing; uint thread_statements_stack_sizing; + uint thread_session_connect_attrs_sizing; uint index; DBUG_ENTER("init_instruments"); @@ -221,6 +226,11 @@ int init_instruments(const PFS_global_param *param) thread_instr_class_statements_sizing= param->m_thread_sizing * param->m_statement_class_sizing; + session_connect_attrs_size_per_thread= param->m_session_connect_attrs_sizing; + thread_session_connect_attrs_sizing= param->m_thread_sizing + * session_connect_attrs_size_per_thread; + session_connect_attrs_lost= 0; + mutex_array= NULL; rwlock_array= NULL; cond_array= NULL; @@ -366,6 +376,14 @@ int init_instruments(const PFS_global_param *param) thread_instr_class_statements_array[index].reset(); } + if (thread_session_connect_attrs_sizing > 0) + { + thread_session_connect_attrs_array= + (char *)pfs_malloc(thread_session_connect_attrs_sizing, MYF(MY_ZEROFILL)); + if 
(unlikely(thread_session_connect_attrs_array == NULL)) + return 1; + } + for (index= 0; index < thread_max; index++) { thread_array[index].m_waits_history= @@ -382,18 +400,8 @@ int init_instruments(const PFS_global_param *param) &thread_statements_stack_array[index * statement_stack_max]; thread_array[index].m_instr_class_statements_stats= &thread_instr_class_statements_array[index * statement_class_max]; - } - - if (wait_class_max > 0) - { - global_instr_class_waits_array= - PFS_MALLOC_ARRAY(wait_class_max, - PFS_single_stat, MYF(MY_ZEROFILL)); - if (unlikely(global_instr_class_waits_array == NULL)) - DBUG_RETURN(1); - - for (index= 0; index < wait_class_max; index++) - global_instr_class_waits_array[index].reset(); + thread_array[index].m_session_connect_attrs= + &thread_session_connect_attrs_array[index * session_connect_attrs_size_per_thread]; } if (stage_class_max > 0) @@ -461,8 +469,6 @@ void cleanup_instruments(void) thread_statements_stack_array= NULL; pfs_free(thread_instr_class_waits_array); thread_instr_class_waits_array= NULL; - pfs_free(global_instr_class_waits_array); - global_instr_class_waits_array= NULL; pfs_free(global_instr_class_stages_array); global_instr_class_stages_array= NULL; pfs_free(global_instr_class_statements_array); @@ -471,6 +477,9 @@ void cleanup_instruments(void) thread_instr_class_statements_array= NULL; pfs_free(thread_instr_class_stages_array); thread_instr_class_stages_array= NULL; + pfs_free(thread_session_connect_attrs_array); + thread_session_connect_attrs_array=NULL; + DBUG_VOID_RETURN; } @@ -502,10 +511,11 @@ int init_file_hash(void) { DBUG_ENTER("init_file_hash"); - if (! filename_hash_inited) + if ((! 
filename_hash_inited) && (file_max > 0)) { lf_hash_init(&filename_hash, sizeof(PFS_file*), LF_HASH_UNIQUE, 0, 0, filename_hash_get_key, &my_charset_bin); + filename_hash.size= file_max; filename_hash_inited= true; } DBUG_RETURN(0); @@ -604,7 +614,7 @@ void PFS_scan::init(uint random, uint max_size) */ PFS_mutex* create_mutex(PFS_mutex_class *klass, const void *identity) { - static uint mutex_monotonic_index= 0; + static uint PFS_ALIGNED mutex_monotonic_index= 0; uint index; uint attempts= 0; PFS_mutex *pfs; @@ -642,8 +652,7 @@ PFS_mutex* create_mutex(PFS_mutex_class *klass, const void *identity) pfs->m_class= klass; pfs->m_enabled= klass->m_enabled && flag_global_instrumentation; pfs->m_timed= klass->m_timed; - pfs->m_wait_stat.reset(); - pfs->m_lock_stat.reset(); + pfs->m_mutex_stat.reset(); pfs->m_owner= NULL; pfs->m_last_locked= 0; pfs->m_lock.dirty_to_allocated(); @@ -667,10 +676,9 @@ void destroy_mutex(PFS_mutex *pfs) DBUG_ENTER("destroy_mutex"); DBUG_ASSERT(pfs != NULL); PFS_mutex_class *klass= pfs->m_class; - /* Aggregate to EVENTS_WAITS_SUMMARY_BY_EVENT_NAME */ - uint index= klass->m_event_name_index; - global_instr_class_waits_array[index].aggregate(& pfs->m_wait_stat); - pfs->m_wait_stat.reset(); + /* Aggregate to EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME */ + klass->m_mutex_stat.aggregate(& pfs->m_mutex_stat); + pfs->m_mutex_stat.reset(); if (klass->is_singleton()) klass->m_singleton= NULL; pfs->m_lock.allocated_to_free(); @@ -685,7 +693,7 @@ void destroy_mutex(PFS_mutex *pfs) */ PFS_rwlock* create_rwlock(PFS_rwlock_class *klass, const void *identity) { - static uint rwlock_monotonic_index= 0; + static uint PFS_ALIGNED rwlock_monotonic_index= 0; uint index; uint attempts= 0; PFS_rwlock *pfs; @@ -705,10 +713,8 @@ PFS_rwlock* create_rwlock(PFS_rwlock_class *klass, const void *identity) pfs->m_class= klass; pfs->m_enabled= klass->m_enabled && flag_global_instrumentation; pfs->m_timed= klass->m_timed; - pfs->m_wait_stat.reset(); + pfs->m_rwlock_stat.reset(); 
pfs->m_lock.dirty_to_allocated(); - pfs->m_read_lock_stat.reset(); - pfs->m_write_lock_stat.reset(); pfs->m_writer= NULL; pfs->m_readers= 0; pfs->m_last_written= 0; @@ -733,10 +739,9 @@ void destroy_rwlock(PFS_rwlock *pfs) DBUG_ENTER("destroy_rwlock"); DBUG_ASSERT(pfs != NULL); PFS_rwlock_class *klass= pfs->m_class; - /* Aggregate to EVENTS_WAITS_SUMMARY_BY_EVENT_NAME */ - uint index= klass->m_event_name_index; - global_instr_class_waits_array[index].aggregate(& pfs->m_wait_stat); - pfs->m_wait_stat.reset(); + /* Aggregate to EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME */ + klass->m_rwlock_stat.aggregate(& pfs->m_rwlock_stat); + pfs->m_rwlock_stat.reset(); if (klass->is_singleton()) klass->m_singleton= NULL; pfs->m_lock.allocated_to_free(); @@ -751,7 +756,7 @@ void destroy_rwlock(PFS_rwlock *pfs) */ PFS_cond* create_cond(PFS_cond_class *klass, const void *identity) { - static uint cond_monotonic_index= 0; + static uint PFS_ALIGNED cond_monotonic_index= 0; uint index; uint attempts= 0; PFS_cond *pfs; @@ -796,9 +801,8 @@ void destroy_cond(PFS_cond *pfs) DBUG_ASSERT(pfs != NULL); PFS_cond_class *klass= pfs->m_class; - /* Aggregate to EVENTS_WAITS_SUMMARY_BY_EVENT_NAME */ - uint index= klass->m_event_name_index; - global_instr_class_waits_array[index].aggregate(& pfs->m_wait_stat); + /* Aggregate to EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME */ + klass->m_cond_stat.aggregate(& pfs->m_cond_stat); pfs->m_wait_stat.reset(); if (klass->is_singleton()) klass->m_singleton= NULL; @@ -812,19 +816,32 @@ PFS_thread* PFS_thread::get_current_thread() return pfs; } +void PFS_thread::reset_session_connect_attrs() +{ + m_session_connect_attrs_length= 0; + m_session_connect_attrs_cs= NULL; + + if ((m_session_connect_attrs != NULL) && + (session_connect_attrs_size_per_thread > 0) ) + { + /* Do not keep user data */ + memset(m_session_connect_attrs, 0, session_connect_attrs_size_per_thread); + } +} + /** Create instrumentation for a thread instance. 
@param klass the thread class @param identity the thread address, or a value characteristic of this thread - @param thread_id the PROCESSLIST thread id, + @param processlist_id the PROCESSLIST id, or 0 if unknown @return a thread instance, or NULL */ PFS_thread* create_thread(PFS_thread_class *klass, const void *identity, - ulong thread_id) + ulonglong processlist_id) { - static uint thread_monotonic_index= 0; + static uint PFS_ALIGNED thread_monotonic_index= 0; uint index; uint attempts= 0; PFS_thread *pfs; @@ -841,9 +858,9 @@ PFS_thread* create_thread(PFS_thread_class *klass, const void *identity, if (pfs->m_lock.free_to_dirty()) { pfs->m_thread_internal_id= - PFS_atomic::add_u32(&thread_internal_id_counter, 1); + PFS_atomic::add_u64(&thread_internal_id_counter, 1); pfs->m_parent_thread_internal_id= 0; - pfs->m_thread_id= thread_id; + pfs->m_processlist_id= processlist_id; pfs->m_event_id= 1; pfs->m_enabled= true; pfs->m_class= klass; @@ -856,6 +873,7 @@ PFS_thread* create_thread(PFS_thread_class *klass, const void *identity, pfs->m_statements_history_index= 0; pfs->reset_stats(); + pfs->reset_session_connect_attrs(); pfs->m_filename_hash_pins= NULL; pfs->m_table_share_hash_pins= NULL; @@ -871,8 +889,11 @@ PFS_thread* create_thread(PFS_thread_class *klass, const void *identity, pfs->m_dbname_length= 0; pfs->m_command= 0; pfs->m_start_time= 0; + pfs->m_processlist_state_ptr= NULL; pfs->m_processlist_state_length= 0; + pfs->m_processlist_info_ptr= NULL; pfs->m_processlist_info_length= 0; + pfs->m_processlist_lock.set_allocated(); pfs->m_host= NULL; pfs->m_user= NULL; @@ -999,6 +1020,7 @@ PFS_socket *sanitize_socket(PFS_socket *unsafe) void destroy_thread(PFS_thread *pfs) { DBUG_ASSERT(pfs != NULL); + pfs->reset_session_connect_attrs(); if (pfs->m_account != NULL) { pfs->m_account->release(); @@ -1084,11 +1106,12 @@ LF_PINS* get_filename_hash_pins(PFS_thread *thread) @param klass the file class @param filename the file name @param len the length in bytes of filename 
+ @param create create a file instance if none found @return a file instance, or NULL */ PFS_file* find_or_create_file(PFS_thread *thread, PFS_file_class *klass, - const char *filename, uint len) + const char *filename, uint len, bool create) { PFS_file *pfs; LF_PINS *pins; @@ -1096,6 +1119,8 @@ find_or_create_file(PFS_thread *thread, PFS_file_class *klass, const char *safe_filename; DBUG_ENTER("find_or_create_file"); + DBUG_ASSERT(klass != NULL || ! create); + pins= get_filename_hash_pins(thread); if (unlikely(pins == NULL)) { @@ -1171,7 +1196,7 @@ find_or_create_file(PFS_thread *thread, PFS_file_class *klass, /* Append the unresolved file name to the resolved path */ char *ptr= buffer + strlen(buffer); char *buf_end= &buffer[sizeof(buffer)-1]; - if (buf_end > ptr) + if ((buf_end > ptr) && (*(ptr-1) != FN_LIBCHAR)) *ptr++= FN_LIBCHAR; if (buf_end > ptr) strncpy(ptr, safe_filename + dirlen, buf_end - ptr); @@ -1183,7 +1208,7 @@ find_or_create_file(PFS_thread *thread, PFS_file_class *klass, PFS_file **entry; uint retry_count= 0; const uint retry_max= 3; - static uint file_monotonic_index= 0; + static uint PFS_ALIGNED file_monotonic_index= 0; uint index; uint attempts= 0; @@ -1202,6 +1227,12 @@ search: lf_hash_search_unpin(pins); + if (! create) + { + /* No lost counter, just looking for the file existence. 
*/ + return NULL; + } + while (++attempts <= file_max) { /* See create_mutex() */ @@ -1218,7 +1249,6 @@ search: strncpy(pfs->m_filename, normalized_filename, normalized_length); pfs->m_filename[normalized_length]= '\0'; pfs->m_filename_length= normalized_length; - pfs->m_wait_stat.reset(); pfs->m_file_stat.m_open_count= 1; pfs->m_file_stat.m_io_stat.reset(); pfs->m_identity= (const void *)pfs; @@ -1285,14 +1315,9 @@ void destroy_file(PFS_thread *thread, PFS_file *pfs) DBUG_ASSERT(pfs != NULL); PFS_file_class *klass= pfs->m_class; - /* Aggregate to EVENTS_WAITS_SUMMARY_BY_EVENT_NAME */ - uint index= klass->m_event_name_index; - global_instr_class_waits_array[index].aggregate(& pfs->m_wait_stat); - pfs->m_wait_stat.reset(); - /* Aggregate to FILE_SUMMARY_BY_EVENT_NAME */ - klass->m_file_stat.m_io_stat.aggregate(& pfs->m_file_stat.m_io_stat); - pfs->m_file_stat.m_io_stat.reset(); + klass->m_file_stat.aggregate(& pfs->m_file_stat); + pfs->m_file_stat.reset(); if (klass->is_singleton()) klass->m_singleton= NULL; @@ -1318,7 +1343,7 @@ void destroy_file(PFS_thread *thread, PFS_file *pfs) PFS_table* create_table(PFS_table_share *share, PFS_thread *opening_thread, const void *identity) { - static uint table_monotonic_index= 0; + static uint PFS_ALIGNED table_monotonic_index= 0; uint index; uint attempts= 0; PFS_table *pfs; @@ -1364,23 +1389,33 @@ void PFS_table::sanitized_aggregate(void) and not own the table handle. 
*/ PFS_table_share *safe_share= sanitize_table_share(m_share); - PFS_thread *safe_thread= sanitize_thread(m_thread_owner); - if ((safe_share != NULL && safe_thread != NULL) && - (m_has_io_stats || m_has_lock_stats)) + if (safe_share != NULL) { - safe_aggregate(& m_table_stat, safe_share, safe_thread); - m_has_io_stats= false; - m_has_lock_stats= false; + if (m_has_io_stats && m_has_lock_stats) + { + safe_aggregate(& m_table_stat, safe_share); + m_has_io_stats= false; + m_has_lock_stats= false; + } + else if (m_has_io_stats) + { + safe_aggregate_io(& m_table_stat, safe_share); + m_has_io_stats= false; + } + else if (m_has_lock_stats) + { + safe_aggregate_lock(& m_table_stat, safe_share); + m_has_lock_stats= false; + } } } void PFS_table::sanitized_aggregate_io(void) { PFS_table_share *safe_share= sanitize_table_share(m_share); - PFS_thread *safe_thread= sanitize_thread(m_thread_owner); - if (safe_share != NULL && safe_thread != NULL && m_has_io_stats) + if (safe_share != NULL && m_has_io_stats) { - safe_aggregate_io(& m_table_stat, safe_share, safe_thread); + safe_aggregate_io(& m_table_stat, safe_share); m_has_io_stats= false; } } @@ -1388,96 +1423,44 @@ void PFS_table::sanitized_aggregate_io(void) void PFS_table::sanitized_aggregate_lock(void) { PFS_table_share *safe_share= sanitize_table_share(m_share); - PFS_thread *safe_thread= sanitize_thread(m_thread_owner); - if (safe_share != NULL && safe_thread != NULL && m_has_lock_stats) + if (safe_share != NULL && m_has_lock_stats) { - safe_aggregate_lock(& m_table_stat, safe_share, safe_thread); + safe_aggregate_lock(& m_table_stat, safe_share); m_has_lock_stats= false; } } void PFS_table::safe_aggregate(PFS_table_stat *table_stat, - PFS_table_share *table_share, - PFS_thread *thread) + PFS_table_share *table_share) { DBUG_ASSERT(table_stat != NULL); DBUG_ASSERT(table_share != NULL); - DBUG_ASSERT(thread != NULL); - - if (flag_thread_instrumentation && thread->m_enabled) - { - PFS_single_stat *event_name_array; - uint 
index; - event_name_array= thread->m_instr_class_waits_stats; - /* - Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME - (for wait/io/table/sql/handler) - */ - index= global_table_io_class.m_event_name_index; - table_stat->sum_io(& event_name_array[index]); - - /* - Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME - (for wait/lock/table/sql/handler) - */ - index= global_table_lock_class.m_event_name_index; - table_stat->sum_lock(& event_name_array[index]); - } + uint key_count= sanitize_index_count(table_share->m_key_count); /* Aggregate to TABLE_IO_SUMMARY, TABLE_LOCK_SUMMARY */ - table_share->m_table_stat.aggregate(table_stat); + table_share->m_table_stat.aggregate(table_stat, key_count); table_stat->fast_reset(); } void PFS_table::safe_aggregate_io(PFS_table_stat *table_stat, - PFS_table_share *table_share, - PFS_thread *thread) + PFS_table_share *table_share) { DBUG_ASSERT(table_stat != NULL); DBUG_ASSERT(table_share != NULL); - DBUG_ASSERT(thread != NULL); - - if (flag_thread_instrumentation && thread->m_enabled) - { - PFS_single_stat *event_name_array; - uint index; - event_name_array= thread->m_instr_class_waits_stats; - /* - Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME - (for wait/io/table/sql/handler) - */ - index= global_table_io_class.m_event_name_index; - table_stat->sum_io(& event_name_array[index]); - } + uint key_count= sanitize_index_count(table_share->m_key_count); /* Aggregate to TABLE_IO_SUMMARY */ - table_share->m_table_stat.aggregate_io(table_stat); + table_share->m_table_stat.aggregate_io(table_stat, key_count); table_stat->fast_reset_io(); } void PFS_table::safe_aggregate_lock(PFS_table_stat *table_stat, - PFS_table_share *table_share, - PFS_thread *thread) + PFS_table_share *table_share) { DBUG_ASSERT(table_stat != NULL); DBUG_ASSERT(table_share != NULL); - DBUG_ASSERT(thread != NULL); - - if (flag_thread_instrumentation && thread->m_enabled) - { - PFS_single_stat *event_name_array; - uint index; - 
event_name_array= thread->m_instr_class_waits_stats; - - /* - Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME - (for wait/lock/table/sql/handler) - */ - index= global_table_lock_class.m_event_name_index; - table_stat->sum_lock(& event_name_array[index]); - } /* Aggregate to TABLE_LOCK_SUMMARY */ table_share->m_table_stat.aggregate_lock(table_stat); @@ -1504,47 +1487,59 @@ void destroy_table(PFS_table *pfs) @param identity the socket descriptor @return a socket instance, or NULL */ -PFS_socket* create_socket(PFS_socket_class *klass, const void *identity) +PFS_socket* create_socket(PFS_socket_class *klass, const my_socket *fd, + const struct sockaddr *addr, socklen_t addr_len) { - PFS_scan scan; + static uint PFS_ALIGNED socket_monotonic_index= 0; + uint index; + uint attempts= 0; + PFS_socket *pfs; DBUG_ENTER("create_socket"); - /** - Unlike other instrumented objects, there is no socket 'object' to use as a - unique identifier. Instead, a pointer to the PFS_socket object will be used - to identify this socket instance. The socket descriptor will be used to - seed the the random index assignment. - */ - my_socket fd= likely(identity != NULL) ? 
- *(reinterpret_cast<const my_socket*>(identity)) : 0; - my_ptrdiff_t ptr= fd; - uint random= randomized_index((const void *)ptr, socket_max); - - for (scan.init(random, socket_max); - scan.has_pass(); - scan.next_pass()) - { - PFS_socket *pfs= socket_array + scan.first(); - PFS_socket *pfs_last= socket_array + scan.last(); - for ( ; pfs < pfs_last; pfs++) + uint fd_used= 0; + uint addr_len_used= addr_len; + + if (fd != NULL) + fd_used= *fd; + + if (addr_len_used > sizeof(sockaddr_storage)) + addr_len_used= sizeof(sockaddr_storage); + + while (++attempts <= socket_max) + { + index= PFS_atomic::add_u32(& socket_monotonic_index, 1) % socket_max; + pfs= socket_array + index; + + if (pfs->m_lock.is_free()) { - if (pfs->m_lock.is_free()) + if (pfs->m_lock.free_to_dirty()) { - if (pfs->m_lock.free_to_dirty()) + pfs->m_fd= fd_used; + /* There is no socket object, so we use the instrumentation. */ + pfs->m_identity= pfs; + pfs->m_class= klass; + pfs->m_enabled= klass->m_enabled && flag_global_instrumentation; + pfs->m_timed= klass->m_timed; + pfs->m_idle= false; + pfs->m_socket_stat.reset(); + pfs->m_thread_owner= NULL; + + pfs->m_addr_len= addr_len_used; + if ((addr != NULL) && (addr_len_used > 0)) { - pfs->m_fd= fd; - pfs->m_identity= pfs; - pfs->m_class= klass; - pfs->m_enabled= klass->m_enabled && flag_global_instrumentation; - pfs->m_timed= klass->m_timed; - pfs->m_idle= false; - pfs->m_socket_stat.reset(); - pfs->m_lock.dirty_to_allocated(); - pfs->m_thread_owner= NULL; - if (klass->is_singleton()) - klass->m_singleton= pfs; - DBUG_RETURN(pfs); + pfs->m_addr_len= addr_len_used; + memcpy(&pfs->m_sock_addr, addr, addr_len_used); } + else + { + pfs->m_addr_len= 0; + } + + pfs->m_lock.dirty_to_allocated(); + + if (klass->is_singleton()) + klass->m_singleton= pfs; + DBUG_RETURN(pfs); } } } @@ -1598,7 +1593,7 @@ static void reset_mutex_waits_by_instance(void) DBUG_ENTER("reset_mutex_waits_by_instance"); for ( ; pfs < pfs_last; pfs++) - pfs->m_wait_stat.reset(); + 
pfs->m_mutex_stat.reset(); DBUG_VOID_RETURN; } @@ -1609,7 +1604,7 @@ static void reset_rwlock_waits_by_instance(void) DBUG_ENTER("reset_rwlock_waits_by_instance"); for ( ; pfs < pfs_last; pfs++) - pfs->m_wait_stat.reset(); + pfs->m_rwlock_stat.reset(); DBUG_VOID_RETURN; } @@ -1620,7 +1615,7 @@ static void reset_cond_waits_by_instance(void) DBUG_ENTER("reset_cond_waits_by_instance"); for ( ; pfs < pfs_last; pfs++) - pfs->m_wait_stat.reset(); + pfs->m_cond_stat.reset(); DBUG_VOID_RETURN; } @@ -1678,15 +1673,6 @@ void reset_socket_instance_io(void) DBUG_VOID_RETURN; } -void reset_global_wait_stat() -{ - PFS_single_stat *stat= global_instr_class_waits_array; - PFS_single_stat *stat_last= global_instr_class_waits_array + wait_class_max; - - for ( ; stat < stat_last; stat++) - stat->reset(); -} - void aggregate_all_event_names(PFS_single_stat *from_array, PFS_single_stat *to_array) { diff --git a/storage/perfschema/pfs_instr.h b/storage/perfschema/pfs_instr.h index b579c1d7902..2ea44830d2b 100644 --- a/storage/perfschema/pfs_instr.h +++ b/storage/perfschema/pfs_instr.h @@ -34,6 +34,8 @@ struct PFS_socket_class; #else #include <arpa/inet.h> #endif +#include "my_global.h" +#include "my_compiler.h" #include "pfs_lock.h" #include "pfs_stat.h" #include "pfs_instr_class.h" @@ -63,24 +65,17 @@ struct PFS_instr bool m_enabled; /** Timed flag. */ bool m_timed; - /** Instrument wait statistics. */ - PFS_single_stat m_wait_stat; }; /** Instrumented mutex implementation. @see PSI_mutex. */ -struct PFS_mutex : public PFS_instr +struct PFS_ALIGNED PFS_mutex : public PFS_instr { /** Mutex identity, typically a pthread_mutex_t. */ const void *m_identity; /** Mutex class. */ PFS_mutex_class *m_class; - /** Instrument wait statistics. */ - PFS_single_stat m_wait_stat; - /** - Mutex lock usage statistics. - This statistic is not exposed in user visible tables yet. - */ - PFS_single_stat m_lock_stat; + /** Instrument statistics. */ + PFS_mutex_stat m_mutex_stat; /** Current owner. 
*/ PFS_thread *m_owner; /** @@ -91,24 +86,14 @@ struct PFS_mutex : public PFS_instr }; /** Instrumented rwlock implementation. @see PSI_rwlock. */ -struct PFS_rwlock : public PFS_instr +struct PFS_ALIGNED PFS_rwlock : public PFS_instr { /** RWLock identity, typically a pthread_rwlock_t. */ const void *m_identity; /** RWLock class. */ PFS_rwlock_class *m_class; - /** Instrument wait statistics. */ - PFS_single_stat m_wait_stat; - /** - RWLock read lock usage statistics. - This statistic is not exposed in user visible tables yet. - */ - PFS_single_stat m_read_lock_stat; - /** - RWLock write lock usage statistics. - This statistic is not exposed in user visible tables yet. - */ - PFS_single_stat m_write_lock_stat; + /** Instrument statistics. */ + PFS_rwlock_stat m_rwlock_stat; /** Current writer thread. */ PFS_thread *m_writer; /** Current count of readers. */ @@ -126,7 +111,7 @@ struct PFS_rwlock : public PFS_instr }; /** Instrumented cond implementation. @see PSI_cond. */ -struct PFS_cond : public PFS_instr +struct PFS_ALIGNED PFS_cond : public PFS_instr { /** Condition identity, typically a pthread_cond_t. */ const void *m_identity; @@ -139,7 +124,7 @@ struct PFS_cond : public PFS_instr }; /** Instrumented File and FILE implementation. @see PSI_file. */ -struct PFS_file : public PFS_instr +struct PFS_ALIGNED PFS_file : public PFS_instr { uint32 get_version() { return m_lock.get_version(); } @@ -152,14 +137,12 @@ struct PFS_file : public PFS_instr uint m_filename_length; /** File class. */ PFS_file_class *m_class; - /** Instrument wait statistics. */ - PFS_single_stat m_wait_stat; /** File usage statistics. */ PFS_file_stat m_file_stat; }; /** Instrumented table implementation. @see PSI_table. */ -struct PFS_table +struct PFS_ALIGNED PFS_table { /** True if table io instrumentation is enabled. 
@@ -196,12 +179,22 @@ public: */ void aggregate(void) { - if (likely((m_thread_owner != NULL) && (m_has_io_stats || m_has_lock_stats))) + if (m_has_io_stats && m_has_lock_stats) { - safe_aggregate(& m_table_stat, m_share, m_thread_owner); + safe_aggregate(& m_table_stat, m_share); m_has_io_stats= false; m_has_lock_stats= false; } + else if (m_has_io_stats) + { + safe_aggregate_io(& m_table_stat, m_share); + m_has_io_stats= false; + } + else if (m_has_lock_stats) + { + safe_aggregate_lock(& m_table_stat, m_share); + m_has_lock_stats= false; + } } /** @@ -238,18 +231,15 @@ public: private: static void safe_aggregate(PFS_table_stat *stat, - PFS_table_share *safe_share, - PFS_thread *safe_thread); + PFS_table_share *safe_share); static void safe_aggregate_io(PFS_table_stat *stat, - PFS_table_share *safe_share, - PFS_thread *safe_thread); + PFS_table_share *safe_share); static void safe_aggregate_lock(PFS_table_stat *stat, - PFS_table_share *safe_share, - PFS_thread *safe_thread); + PFS_table_share *safe_share); }; /** Instrumented socket implementation. @see PSI_socket. */ -struct PFS_socket : public PFS_instr +struct PFS_ALIGNED PFS_socket : public PFS_instr { uint32 get_version() { return m_lock.get_version(); } @@ -371,7 +361,7 @@ private: /** Instrumented thread implementation. @see PSI_thread. */ -struct PFS_thread : PFS_connection_slice +struct PFS_ALIGNED PFS_thread : PFS_connection_slice { static PFS_thread* get_current_thread(void); @@ -400,11 +390,11 @@ struct PFS_thread : PFS_connection_slice /** Pins for digest_hash. */ LF_PINS *m_digest_hash_pins; /** Internal thread identifier, unique. */ - ulong m_thread_internal_id; + ulonglong m_thread_internal_id; /** Parent internal thread identifier. */ - ulong m_parent_thread_internal_id; + ulonglong m_parent_thread_internal_id; /** External (SHOW PROCESSLIST) thread identifier, not unique. */ - ulong m_thread_id; + ulong m_processlist_id; /** Thread class. 
*/ PFS_thread_class *m_class; /** @@ -486,6 +476,8 @@ struct PFS_thread : PFS_connection_slice int m_command; /** Start time. */ time_t m_start_time; + /** Lock for Processlist state, Processlist info. */ + pfs_lock m_processlist_lock; /** Processlist state. */ const char *m_processlist_state_ptr; /** Length of @c m_processlist_state_ptr. */ @@ -504,9 +496,18 @@ struct PFS_thread : PFS_connection_slice PFS_host *m_host; PFS_user *m_user; PFS_account *m_account; + + /** Reset session connect attributes */ + void reset_session_connect_attrs(); + + /** a buffer for the connection attributes */ + char *m_session_connect_attrs; + /** length used by @c m_connect_attrs */ + uint m_session_connect_attrs_length; + /** character set in which @c m_connect_attrs are encoded */ + const CHARSET_INFO *m_session_connect_attrs_cs; }; -extern PFS_single_stat *global_instr_class_waits_array; extern PFS_stage_stat *global_instr_class_stages_array; extern PFS_statement_stat *global_instr_class_statements_array; @@ -529,12 +530,12 @@ PFS_cond* create_cond(PFS_cond_class *klass, const void *identity); void destroy_cond(PFS_cond *pfs); PFS_thread* create_thread(PFS_thread_class *klass, const void *identity, - ulong thread_id); + ulonglong processlist_id); void destroy_thread(PFS_thread *pfs); PFS_file* find_or_create_file(PFS_thread *thread, PFS_file_class *klass, - const char *filename, uint len); + const char *filename, uint len, bool create); void release_file(PFS_file *pfs); void destroy_file(PFS_thread *thread, PFS_file *pfs); @@ -542,7 +543,10 @@ PFS_table* create_table(PFS_table_share *share, PFS_thread *opening_thread, const void *identity); void destroy_table(PFS_table *pfs); -PFS_socket* create_socket(PFS_socket_class *socket_class, const void *identity); +PFS_socket* create_socket(PFS_socket_class *socket_class, + const my_socket *fd, + const struct sockaddr *addr, + socklen_t addr_len); void destroy_socket(PFS_socket *pfs); /* For iterators and show status. 
*/ @@ -568,6 +572,8 @@ extern ulong events_stages_history_per_thread; extern ulong events_statements_history_per_thread; extern ulong locker_lost; extern ulong statement_lost; +extern ulong session_connect_attrs_lost; +extern ulong session_connect_attrs_size_per_thread; /* Exposing the data directly, for iterators. */ @@ -624,6 +630,8 @@ void update_socket_derived_flags(); /** Update derived flags for all instruments. */ void update_instruments_derived_flags(); +extern LF_HASH filename_hash; + /** @} */ #endif diff --git a/storage/perfschema/pfs_instr_class.cc b/storage/perfschema/pfs_instr_class.cc index 0a4b47404a4..05c85104a94 100644 --- a/storage/perfschema/pfs_instr_class.cc +++ b/storage/perfschema/pfs_instr_class.cc @@ -135,9 +135,12 @@ static PFS_thread_class *thread_class_array= NULL; */ PFS_table_share *table_share_array= NULL; -PFS_instr_class global_table_io_class; -PFS_instr_class global_table_lock_class; -PFS_instr_class global_idle_class; +PFS_ALIGNED PFS_single_stat global_idle_stat; +PFS_ALIGNED PFS_table_io_stat global_table_io_stat; +PFS_ALIGNED PFS_table_lock_stat global_table_lock_stat; +PFS_ALIGNED PFS_instr_class global_table_io_class; +PFS_ALIGNED PFS_instr_class global_table_lock_class; +PFS_ALIGNED PFS_instr_class global_idle_class; /** Class-timer map */ enum_timer_name *class_timers[] = @@ -165,7 +168,7 @@ enum_timer_name *class_timers[] = @sa table_share_hash_get_key @sa get_table_share_hash_pins */ -static LF_HASH table_share_hash; +LF_HASH table_share_hash; /** True if table_share_hash is initialized. 
*/ static bool table_share_hash_inited= false; @@ -193,19 +196,17 @@ uint mutex_class_start= 0; uint rwlock_class_start= 0; uint cond_class_start= 0; uint file_class_start= 0; -uint table_class_start= 0; uint wait_class_max= 0; uint socket_class_start= 0; void init_event_name_sizing(const PFS_global_param *param) { - mutex_class_start= 0; + mutex_class_start= 3; /* global table io, table lock, idle */ rwlock_class_start= mutex_class_start + param->m_mutex_class_sizing; cond_class_start= rwlock_class_start + param->m_rwlock_class_sizing; file_class_start= cond_class_start + param->m_cond_class_sizing; socket_class_start= file_class_start + param->m_file_class_sizing; - table_class_start= socket_class_start + param->m_socket_class_sizing; - wait_class_max= table_class_start + 3; /* global table io, lock, idle */ + wait_class_max= socket_class_start + param->m_socket_class_sizing; } void register_global_classes() @@ -213,19 +214,19 @@ void register_global_classes() /* Table IO class */ init_instr_class(&global_table_io_class, "wait/io/table/sql/handler", 25, 0, PFS_CLASS_TABLE_IO); - global_table_io_class.m_event_name_index= table_class_start; + global_table_io_class.m_event_name_index= GLOBAL_TABLE_IO_EVENT_INDEX; configure_instr_class(&global_table_io_class); /* Table lock class */ init_instr_class(&global_table_lock_class, "wait/lock/table/sql/handler", 27, 0, PFS_CLASS_TABLE_LOCK); - global_table_lock_class.m_event_name_index= table_class_start + 1; + global_table_lock_class.m_event_name_index= GLOBAL_TABLE_LOCK_EVENT_INDEX; configure_instr_class(&global_table_lock_class); /* Idle class */ init_instr_class(&global_idle_class, "idle", 4, 0, PFS_CLASS_IDLE); - global_idle_class.m_event_name_index= table_class_start + 2; + global_idle_class.m_event_name_index= GLOBAL_IDLE_EVENT_INDEX; configure_instr_class(&global_idle_class); } @@ -384,6 +385,7 @@ int init_table_share_hash(void) { lf_hash_init(&table_share_hash, sizeof(PFS_table_share*), LF_HASH_UNIQUE, 0, 0, 
table_share_hash_get_key, &my_charset_bin); + table_share_hash.size= table_share_max; table_share_hash_inited= true; } return 0; @@ -715,7 +717,7 @@ PFS_sync_key register_mutex_class(const char *name, uint name_length, */ entry= &mutex_class_array[index]; init_instr_class(entry, name, name_length, flags, PFS_CLASS_MUTEX); - entry->m_lock_stat.reset(); + entry->m_mutex_stat.reset(); entry->m_event_name_index= mutex_class_start + index; entry->m_singleton= NULL; entry->m_enabled= false; /* disabled by default */ @@ -781,8 +783,7 @@ PFS_sync_key register_rwlock_class(const char *name, uint name_length, { entry= &rwlock_class_array[index]; init_instr_class(entry, name, name_length, flags, PFS_CLASS_RWLOCK); - entry->m_read_lock_stat.reset(); - entry->m_write_lock_stat.reset(); + entry->m_rwlock_stat.reset(); entry->m_event_name_index= rwlock_class_start + index; entry->m_singleton= NULL; entry->m_enabled= false; /* disabled by default */ @@ -1193,7 +1194,7 @@ static void set_keys(PFS_table_share *pfs, const TABLE_SHARE *share) pfs_key->m_name_length= len; } - pfs_key_last= pfs->m_keys + MAX_KEY; + pfs_key_last= pfs->m_keys + MAX_INDEXES; for ( ; pfs_key < pfs_key_last; pfs_key++) pfs_key->m_name_length= 0; } @@ -1256,7 +1257,7 @@ PFS_table_share* find_or_create_table_share(PFS_thread *thread, const uint retry_max= 3; bool enabled= true; bool timed= true; - static uint table_share_monotonic_index= 0; + static uint PFS_ALIGNED table_share_monotonic_index= 0; uint index; uint attempts= 0; PFS_table_share *pfs; @@ -1299,8 +1300,7 @@ search: while (++attempts <= table_share_max) { /* See create_mutex() */ - PFS_atomic::add_u32(& table_share_monotonic_index, 1); - index= table_share_monotonic_index % table_share_max; + index= PFS_atomic::add_u32(& table_share_monotonic_index, 1) % table_share_max; pfs= table_share_array + index; if (pfs->m_lock.is_free()) @@ -1353,17 +1353,28 @@ search: void PFS_table_share::aggregate_io(void) { - uint index= 
global_table_io_class.m_event_name_index; - PFS_single_stat *table_io_total= & global_instr_class_waits_array[index]; - m_table_stat.sum_io(table_io_total); + uint safe_key_count= sanitize_index_count(m_key_count); + PFS_table_io_stat *from_stat; + PFS_table_io_stat *from_stat_last; + PFS_table_io_stat sum_io; + + /* Aggregate stats for each index, if any */ + from_stat= & m_table_stat.m_index_stat[0]; + from_stat_last= from_stat + safe_key_count; + for ( ; from_stat < from_stat_last ; from_stat++) + sum_io.aggregate(from_stat); + + /* Aggregate stats for the table */ + sum_io.aggregate(& m_table_stat.m_index_stat[MAX_INDEXES]); + + /* Add this table stats to the global sink. */ + global_table_io_stat.aggregate(& sum_io); m_table_stat.fast_reset_io(); } void PFS_table_share::aggregate_lock(void) { - uint index= global_table_lock_class.m_event_name_index; - PFS_single_stat *table_lock_total= & global_instr_class_waits_array[index]; - m_table_stat.sum_lock(table_lock_total); + global_table_lock_stat.aggregate(& m_table_stat.m_lock_stat); m_table_stat.fast_reset_lock(); } @@ -1418,6 +1429,16 @@ PFS_table_share *sanitize_table_share(PFS_table_share *unsafe) SANITIZE_ARRAY_BODY(PFS_table_share, table_share_array, table_share_max, unsafe); } +/** Reset the wait statistics per instrument class. */ +void reset_events_waits_by_class() +{ + reset_file_class_io(); + reset_socket_class_io(); + global_idle_stat.reset(); + global_table_io_stat.reset(); + global_table_lock_stat.reset(); +} + /** Reset the io statistics per file class. 
*/ void reset_file_class_io(void) { diff --git a/storage/perfschema/pfs_instr_class.h b/storage/perfschema/pfs_instr_class.h index bef25e76467..d0b90734b66 100644 --- a/storage/perfschema/pfs_instr_class.h +++ b/storage/perfschema/pfs_instr_class.h @@ -16,7 +16,10 @@ #ifndef PFS_INSTR_CLASS_H #define PFS_INSTR_CLASS_H +#include "my_global.h" #include "mysql_com.h" /* NAME_LEN */ +#include "lf.h" +#include "pfs_global.h" /** @file storage/perfschema/pfs_instr_class.h @@ -112,7 +115,6 @@ extern uint mutex_class_start; extern uint rwlock_class_start; extern uint cond_class_start; extern uint file_class_start; -extern uint table_class_start; extern uint socket_class_start; extern uint wait_class_max; @@ -166,13 +168,10 @@ struct PFS_instr_class struct PFS_mutex; /** Instrumentation metadata for a MUTEX. */ -struct PFS_mutex_class : public PFS_instr_class +struct PFS_ALIGNED PFS_mutex_class : public PFS_instr_class { - /** - Lock statistics. - This statistic is not exposed in user visible tables yet. - */ - PFS_single_stat m_lock_stat; + /** Mutex usage statistics. */ + PFS_mutex_stat m_mutex_stat; /** Singleton instance. */ PFS_mutex *m_singleton; }; @@ -180,18 +179,10 @@ struct PFS_mutex_class : public PFS_instr_class struct PFS_rwlock; /** Instrumentation metadata for a RWLOCK. */ -struct PFS_rwlock_class : public PFS_instr_class +struct PFS_ALIGNED PFS_rwlock_class : public PFS_instr_class { - /** - Read lock statistics. - This statistic is not exposed in user visible tables yet. - */ - PFS_single_stat m_read_lock_stat; - /** - Write lock statistics. - This statistic is not exposed in user visible tables yet. - */ - PFS_single_stat m_write_lock_stat; + /** Rwlock usage statistics. */ + PFS_rwlock_stat m_rwlock_stat; /** Singleton instance. */ PFS_rwlock *m_singleton; }; @@ -199,7 +190,7 @@ struct PFS_rwlock_class : public PFS_instr_class struct PFS_cond; /** Instrumentation metadata for a COND. 
*/ -struct PFS_cond_class : public PFS_instr_class +struct PFS_ALIGNED PFS_cond_class : public PFS_instr_class { /** Condition usage statistics. @@ -211,7 +202,7 @@ struct PFS_cond_class : public PFS_instr_class }; /** Instrumentation metadata of a thread. */ -struct PFS_thread_class +struct PFS_ALIGNED PFS_thread_class { /** True if this thread instrument is enabled. */ bool m_enabled; @@ -249,7 +240,7 @@ struct PFS_table_key }; /** Instrumentation metadata for a table share. */ -struct PFS_table_share +struct PFS_ALIGNED PFS_table_share { public: uint32 get_version() @@ -318,13 +309,31 @@ public: /** Table statistics. */ PFS_table_stat m_table_stat; /** Index names. */ - PFS_table_key m_keys[MAX_KEY]; + PFS_table_key m_keys[MAX_INDEXES]; private: /** Number of opened table handles. */ int m_refcount; }; +/** Statistics for the IDLE instrument. */ +extern PFS_single_stat global_idle_stat; +/** Statistics for dropped table io. */ +extern PFS_table_io_stat global_table_io_stat; +/** Statistics for dropped table lock. */ +extern PFS_table_lock_stat global_table_lock_stat; + +inline uint sanitize_index_count(uint count) +{ + if (likely(count <= MAX_INDEXES)) + return count; + return 0; +} + +#define GLOBAL_TABLE_IO_EVENT_INDEX 0 +#define GLOBAL_TABLE_LOCK_EVENT_INDEX 1 +#define GLOBAL_IDLE_EVENT_INDEX 2 + /** Instrument controlling all table io. This instrument is used with table SETUP_OBJECTS. @@ -345,7 +354,7 @@ extern PFS_instr_class global_idle_class; struct PFS_file; /** Instrumentation metadata for a file. */ -struct PFS_file_class : public PFS_instr_class +struct PFS_ALIGNED PFS_file_class : public PFS_instr_class { /** File usage statistics. */ PFS_file_stat m_file_stat; @@ -354,21 +363,21 @@ struct PFS_file_class : public PFS_instr_class }; /** Instrumentation metadata for a stage. */ -struct PFS_stage_class : public PFS_instr_class +struct PFS_ALIGNED PFS_stage_class : public PFS_instr_class { /** Stage usage statistics. 
*/ PFS_stage_stat m_stage_stat; }; /** Instrumentation metadata for a statement. */ -struct PFS_statement_class : public PFS_instr_class +struct PFS_ALIGNED PFS_statement_class : public PFS_instr_class { }; struct PFS_socket; /** Instrumentation metadata for a socket. */ -struct PFS_socket_class : public PFS_instr_class +struct PFS_ALIGNED PFS_socket_class : public PFS_instr_class { /** Socket usage statistics. */ PFS_socket_stat m_socket_stat; @@ -483,12 +492,15 @@ extern PFS_cond_class *cond_class_array; extern PFS_file_class *file_class_array; extern PFS_table_share *table_share_array; +void reset_events_waits_by_class(); void reset_file_class_io(); void reset_socket_class_io(); /** Update derived flags for all table shares. */ void update_table_share_derived_flags(PFS_thread *thread); +extern LF_HASH table_share_hash; + /** @} */ #endif diff --git a/storage/perfschema/pfs_lock.h b/storage/perfschema/pfs_lock.h index 65937e94ece..09efecd1c5f 100644 --- a/storage/perfschema/pfs_lock.h +++ b/storage/perfschema/pfs_lock.h @@ -33,7 +33,7 @@ Values of a free record should not be read by a reader. Writers can concurrently attempt to allocate a free record. */ -#define PFS_LOCK_FREE 0 +#define PFS_LOCK_FREE 0x00 /** State of a dirty record. Values of a dirty record should not be read by a reader, @@ -41,14 +41,18 @@ Only one writer, the writer which owns the record, should modify the record content. */ -#define PFS_LOCK_DIRTY 1 +#define PFS_LOCK_DIRTY 0x01 /** State of an allocated record. Values of an allocated record are safe to read by a reader. A writer may modify some but not all properties of the record: only modifying values that can never cause the reader to crash is allowed. */ -#define PFS_LOCK_ALLOCATED 2 +#define PFS_LOCK_ALLOCATED 0x02 + +#define VERSION_MASK 0xFFFFFFFC +#define STATE_MASK 0x00000003 +#define VERSION_INC 4 /** A 'lock' protecting performance schema internal buffers. @@ -60,15 +64,11 @@ struct pfs_lock { /** - The record internal state. 
+ The record internal version and state @sa PFS_LOCK_FREE @sa PFS_LOCK_DIRTY @sa PFS_LOCK_ALLOCATED - */ - volatile int32 m_state; - /** - The record internal version number. - This version number is to transform the 'ABA' problem + The version number is to transform the 'ABA' problem (see http://en.wikipedia.org/wiki/ABA_problem) into an 'A(n)BA(n + 1)' problem, where 'n' is the m_version number. When the performance schema instrumentation deletes a record, @@ -76,21 +76,23 @@ struct pfs_lock the version number is incremented, so that a reader can detect that the record was changed. Note that the version number is never reset to zero when a new record is created. + The version number is stored in the high 30 bits. + The state is stored in the low 2 bits. */ - volatile uint32 m_version; + volatile uint32 m_version_state; /** Returns true if the record is free. */ bool is_free(void) { - /* This is a dirty read */ - return (m_state == PFS_LOCK_FREE); + uint32 copy= m_version_state; /* non volatile copy, and dirty read */ + return ((copy & STATE_MASK) == PFS_LOCK_FREE); } /** Returns true if the record contains values that can be read. 
*/ bool is_populated(void) { - int32 copy= m_state; /* non volatile copy, and dirty read */ - return (copy == PFS_LOCK_ALLOCATED); + uint32 copy= m_version_state; /* non volatile copy, and dirty read */ + return ((copy & STATE_MASK) == PFS_LOCK_ALLOCATED); } /** @@ -101,10 +103,11 @@ struct pfs_lock */ bool free_to_dirty(void) { - int32 old_state= PFS_LOCK_FREE; - int32 new_state= PFS_LOCK_DIRTY; + uint32 copy= m_version_state; /* non volatile copy, and dirty read */ + uint32 old_val= (copy & VERSION_MASK) + PFS_LOCK_FREE; + uint32 new_val= (copy & VERSION_MASK) + PFS_LOCK_DIRTY; - return (PFS_atomic::cas_32(&m_state, &old_state, new_state)); + return (PFS_atomic::cas_u32(&m_version_state, &old_val, new_val)); } /** @@ -114,8 +117,13 @@ struct pfs_lock */ void allocated_to_dirty(void) { - DBUG_ASSERT(m_state == PFS_LOCK_ALLOCATED); - PFS_atomic::store_32(&m_state, PFS_LOCK_DIRTY); + uint32 copy= PFS_atomic::load_u32(&m_version_state); + /* Make sure the record was ALLOCATED. */ + DBUG_ASSERT((copy & STATE_MASK) == PFS_LOCK_ALLOCATED); + /* Keep the same version, set the DIRTY state */ + uint32 new_val= (copy & VERSION_MASK) + PFS_LOCK_DIRTY; + /* We own the record, no need to use compare and swap. */ + PFS_atomic::store_u32(&m_version_state, new_val); } /** @@ -125,9 +133,26 @@ struct pfs_lock */ void dirty_to_allocated(void) { - DBUG_ASSERT(m_state == PFS_LOCK_DIRTY); - PFS_atomic::add_u32(&m_version, 1); - PFS_atomic::store_32(&m_state, PFS_LOCK_ALLOCATED); + uint32 copy= PFS_atomic::load_u32(&m_version_state); + /* Make sure the record was DIRTY. */ + DBUG_ASSERT((copy & STATE_MASK) == PFS_LOCK_DIRTY); + /* Increment the version, set the ALLOCATED state */ + uint32 new_val= (copy & VERSION_MASK) + VERSION_INC + PFS_LOCK_ALLOCATED; + PFS_atomic::store_u32(&m_version_state, new_val); + } + + /** + Initialize a lock to allocated. + This transition should be executed by the writer that owns the record and the lock, + after the record is in a state ready to be read. 
+ */ + void set_allocated(void) + { + /* Do not set the version to 0, read the previous value. */ + uint32 copy= PFS_atomic::load_u32(&m_version_state); + /* Increment the version, set the ALLOCATED state */ + uint32 new_val= (copy & VERSION_MASK) + VERSION_INC + PFS_LOCK_ALLOCATED; + PFS_atomic::store_u32(&m_version_state, new_val); } /** @@ -136,8 +161,12 @@ struct pfs_lock */ void dirty_to_free(void) { - DBUG_ASSERT(m_state == PFS_LOCK_DIRTY); - PFS_atomic::store_32(&m_state, PFS_LOCK_FREE); + uint32 copy= PFS_atomic::load_u32(&m_version_state); + /* Make sure the record was DIRTY. */ + DBUG_ASSERT((copy & STATE_MASK) == PFS_LOCK_DIRTY); + /* Keep the same version, set the FREE state */ + uint32 new_val= (copy & VERSION_MASK) + PFS_LOCK_FREE; + PFS_atomic::store_u32(&m_version_state, new_val); } /** @@ -153,8 +182,12 @@ struct pfs_lock The correct assert to use here to guarantee data integrity is simply: DBUG_ASSERT(m_state == PFS_LOCK_ALLOCATED); */ - DBUG_ASSERT(m_state == PFS_LOCK_ALLOCATED); - PFS_atomic::store_32(&m_state, PFS_LOCK_FREE); + uint32 copy= PFS_atomic::load_u32(&m_version_state); + /* Make sure the record was ALLOCATED. 
*/ + DBUG_ASSERT(((copy & STATE_MASK) == PFS_LOCK_ALLOCATED)); + /* Keep the same version, set the FREE state */ + uint32 new_val= (copy & VERSION_MASK) + PFS_LOCK_FREE; + PFS_atomic::store_u32(&m_version_state, new_val); } /** @@ -163,8 +196,7 @@ struct pfs_lock */ void begin_optimistic_lock(struct pfs_lock *copy) { - copy->m_version= PFS_atomic::load_u32(&m_version); - copy->m_state= PFS_atomic::load_32(&m_state); + copy->m_version_state= PFS_atomic::load_u32(&m_version_state); } /** @@ -174,19 +206,20 @@ struct pfs_lock */ bool end_optimistic_lock(struct pfs_lock *copy) { - /* - return true if: - - the version + state has not changed - - and there was valid data to look at - */ - return ((copy->m_version == PFS_atomic::load_u32(&m_version)) && - (copy->m_state == PFS_atomic::load_32(&m_state)) && - (copy->m_state == PFS_LOCK_ALLOCATED)); + /* Check there was valid data to look at. */ + if ((copy->m_version_state & STATE_MASK) != PFS_LOCK_ALLOCATED) + return false; + + /* Check the version + state has not changed. */ + if (copy->m_version_state != PFS_atomic::load_u32(&m_version_state)) + return false; + + return true; } uint32 get_version() { - return PFS_atomic::load_u32(&m_version); + return (PFS_atomic::load_u32(&m_version_state) & VERSION_MASK); } }; diff --git a/storage/perfschema/pfs_server.cc b/storage/perfschema/pfs_server.cc index 3df0f27f652..383a46785fb 100644 --- a/storage/perfschema/pfs_server.cc +++ b/storage/perfschema/pfs_server.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved. +/* Copyright (c) 2008, 2012, Oracle and/or its affiliates. All rights reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -50,11 +50,16 @@ static void cleanup_performance_schema(void); void cleanup_instrument_config(void); struct PSI_bootstrap* -initialize_performance_schema(const PFS_global_param *param) +initialize_performance_schema(PFS_global_param *param) { pfs_initialized= false; PFS_table_stat::g_reset_template.reset(); + global_idle_stat.reset(); + global_table_io_stat.reset(); + global_table_lock_stat.reset(); + + pfs_automated_sizing(param); if (! param->m_enabled) { diff --git a/storage/perfschema/pfs_server.h b/storage/perfschema/pfs_server.h index f65febdeb6d..e0c782fde58 100644 --- a/storage/perfschema/pfs_server.h +++ b/storage/perfschema/pfs_server.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved. +/* Copyright (c) 2008, 2012, Oracle and/or its affiliates. All rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -24,96 +24,50 @@ #ifndef PFS_MAX_MUTEX_CLASS #define PFS_MAX_MUTEX_CLASS 200 #endif -#ifndef PFS_MAX_MUTEX - #define PFS_MAX_MUTEX 1000000 -#endif #ifndef PFS_MAX_RWLOCK_CLASS #define PFS_MAX_RWLOCK_CLASS 30 #endif -#ifndef PFS_MAX_RWLOCK - #define PFS_MAX_RWLOCK 1000000 -#endif #ifndef PFS_MAX_COND_CLASS #define PFS_MAX_COND_CLASS 80 #endif -#ifndef PFS_MAX_COND - #define PFS_MAX_COND 1000 -#endif #ifndef PFS_MAX_THREAD_CLASS #define PFS_MAX_THREAD_CLASS 50 #endif -#ifndef PFS_MAX_THREAD - #define PFS_MAX_THREAD 1000 -#endif #ifndef PFS_MAX_FILE_CLASS #define PFS_MAX_FILE_CLASS 50 #endif -#ifndef PFS_MAX_FILE - #define PFS_MAX_FILE 10000 -#endif #ifndef PFS_MAX_FILE_HANDLE #define PFS_MAX_FILE_HANDLE 32768 #endif -#ifndef PFS_MAX_SOCKETS - #define PFS_MAX_SOCKETS 1000 -#endif #ifndef PFS_MAX_SOCKET_CLASS #define PFS_MAX_SOCKET_CLASS 10 #endif -#ifndef PFS_MAX_TABLE_SHARE 
- #define PFS_MAX_TABLE_SHARE 1000 -#endif -#ifndef PFS_MAX_TABLE - #define PFS_MAX_TABLE 10000 -#endif -#ifndef PFS_WAITS_HISTORY_SIZE - #define PFS_WAITS_HISTORY_SIZE 10 -#endif -#ifndef PFS_WAITS_HISTORY_LONG_SIZE - #define PFS_WAITS_HISTORY_LONG_SIZE 10000 -#endif #ifndef PFS_MAX_SETUP_ACTOR #define PFS_MAX_SETUP_ACTOR 100 #endif #ifndef PFS_MAX_SETUP_OBJECT #define PFS_MAX_SETUP_OBJECT 100 #endif -#ifndef PFS_MAX_HOST - #define PFS_MAX_HOST 100 -#endif -#ifndef PFS_MAX_USER - #define PFS_MAX_USER 100 -#endif -#ifndef PFS_MAX_ACCOUNT - #define PFS_MAX_ACCOUNT 100 -#endif #ifndef PFS_MAX_STAGE_CLASS #define PFS_MAX_STAGE_CLASS 150 #endif -#ifndef PFS_STAGES_HISTORY_SIZE - #define PFS_STAGES_HISTORY_SIZE 10 -#endif -#ifndef PFS_STAGES_HISTORY_LONG_SIZE - #define PFS_STAGES_HISTORY_LONG_SIZE 10000 -#endif -#ifndef PFS_STATEMENTS_HISTORY_SIZE - #define PFS_STATEMENTS_HISTORY_SIZE 10 -#endif -#ifndef PFS_STATEMENTS_HISTORY_LONG_SIZE - #define PFS_STATEMENTS_HISTORY_LONG_SIZE 10000 -#endif #ifndef PFS_STATEMENTS_STACK_SIZE #define PFS_STATEMENTS_STACK_SIZE 10 #endif -#ifndef PFS_DIGEST_SIZE - #define PFS_DIGEST_SIZE 200 -#endif + +struct PFS_sizing_hints +{ + long m_table_definition_cache; + long m_table_open_cache; + long m_max_connections; + long m_open_files_limit; +}; /** Performance schema global sizing parameters. */ struct PFS_global_param { /** True if the performance schema is enabled. */ - bool m_enabled; + bool m_enabled; /** Default values for SETUP_CONSUMERS. */ bool m_consumer_events_stages_current_enabled; bool m_consumer_events_stages_history_enabled; @@ -155,7 +109,7 @@ struct PFS_global_param Maximum number of instrumented table share. @sa table_share_lost. */ - ulong m_table_share_sizing; + long m_table_share_sizing; /** Maximum number of instrumented file classes. @sa file_class_lost. @@ -165,81 +119,86 @@ struct PFS_global_param Maximum number of instrumented mutex instances. @sa mutex_lost. 
*/ - ulong m_mutex_sizing; + long m_mutex_sizing; /** Maximum number of instrumented rwlock instances. @sa rwlock_lost. */ - ulong m_rwlock_sizing; + long m_rwlock_sizing; /** Maximum number of instrumented cond instances. @sa cond_lost. */ - ulong m_cond_sizing; + long m_cond_sizing; /** Maximum number of instrumented thread instances. @sa thread_lost. */ - ulong m_thread_sizing; + long m_thread_sizing; /** Maximum number of instrumented table handles. @sa table_lost. */ - ulong m_table_sizing; + long m_table_sizing; /** Maximum number of instrumented file instances. @sa file_lost. */ - ulong m_file_sizing; + long m_file_sizing; /** Maximum number of instrumented file handles. @sa file_handle_lost. */ - ulong m_file_handle_sizing; + long m_file_handle_sizing; /** Maxium number of instrumented socket instances @sa socket_lost */ - ulong m_socket_sizing; + long m_socket_sizing; /** Maximum number of instrumented socket classes. @sa socket_class_lost. */ ulong m_socket_class_sizing; /** Maximum number of rows per thread in table EVENTS_WAITS_HISTORY. */ - ulong m_events_waits_history_sizing; + long m_events_waits_history_sizing; /** Maximum number of rows in table EVENTS_WAITS_HISTORY_LONG. */ - ulong m_events_waits_history_long_sizing; + long m_events_waits_history_long_sizing; /** Maximum number of rows in table SETUP_ACTORS. */ ulong m_setup_actor_sizing; /** Maximum number of rows in table SETUP_OBJECTS. */ ulong m_setup_object_sizing; /** Maximum number of rows in table HOSTS. */ - ulong m_host_sizing; + long m_host_sizing; /** Maximum number of rows in table USERS. */ - ulong m_user_sizing; + long m_user_sizing; /** Maximum number of rows in table ACCOUNTS. */ - ulong m_account_sizing; + long m_account_sizing; /** Maximum number of instrumented stage classes. @sa stage_class_lost. */ ulong m_stage_class_sizing; /** Maximum number of rows per thread in table EVENTS_STAGES_HISTORY. 
*/ - ulong m_events_stages_history_sizing; + long m_events_stages_history_sizing; /** Maximum number of rows in table EVENTS_STAGES_HISTORY_LONG. */ - ulong m_events_stages_history_long_sizing; + long m_events_stages_history_long_sizing; /** Maximum number of instrumented statement classes. @sa statement_class_lost. */ ulong m_statement_class_sizing; /** Maximum number of rows per thread in table EVENTS_STATEMENT_HISTORY. */ - ulong m_events_statements_history_sizing; + long m_events_statements_history_sizing; /** Maximum number of rows in table EVENTS_STATEMENTS_HISTORY_LONG. */ - ulong m_events_statements_history_long_sizing; + long m_events_statements_history_long_sizing; /** Maximum number of digests to be captured */ - ulong m_digest_sizing; + long m_digest_sizing; + /** Maximum number of session attribute strings per thread */ + long m_session_connect_attrs_sizing; + + /** Sizing hints, for auto tuning. */ + PFS_sizing_hints m_hints; }; /** @@ -254,7 +213,9 @@ extern PFS_global_param pfs_param; @return A boostrap handle, or NULL. */ struct PSI_bootstrap* -initialize_performance_schema(const PFS_global_param *param); +initialize_performance_schema(PFS_global_param *param); + +void pfs_automated_sizing(PFS_global_param *param); /** Initialize the performance schema ACL. diff --git a/storage/perfschema/pfs_setup_actor.cc b/storage/perfschema/pfs_setup_actor.cc index a587d3643d2..943654ce1c9 100644 --- a/storage/perfschema/pfs_setup_actor.cc +++ b/storage/perfschema/pfs_setup_actor.cc @@ -43,7 +43,7 @@ ulong setup_actor_max; PFS_setup_actor *setup_actor_array= NULL; /** Hash table for setup_actor records. */ -static LF_HASH setup_actor_hash; +LF_HASH setup_actor_hash; /** True if @c setup_actor_hash is initialized. */ static bool setup_actor_hash_inited= false; @@ -100,10 +100,11 @@ C_MODE_END */ int init_setup_actor_hash(void) { - if (! setup_actor_hash_inited) + if ((! 
setup_actor_hash_inited) && (setup_actor_max > 0)) { lf_hash_init(&setup_actor_hash, sizeof(PFS_setup_actor*), LF_HASH_UNIQUE, 0, 0, setup_actor_hash_get_key, &my_charset_bin); + setup_actor_hash.size= setup_actor_max; setup_actor_hash_inited= true; } return 0; @@ -167,7 +168,7 @@ int insert_setup_actor(const String *user, const String *host, const String *rol if (unlikely(pins == NULL)) return HA_ERR_OUT_OF_MEM; - static uint setup_actor_monotonic_index= 0; + static uint PFS_ALIGNED setup_actor_monotonic_index= 0; uint index; uint attempts= 0; PFS_setup_actor *pfs; @@ -175,8 +176,7 @@ int insert_setup_actor(const String *user, const String *host, const String *rol while (++attempts <= setup_actor_max) { /* See create_mutex() */ - PFS_atomic::add_u32(& setup_actor_monotonic_index, 1); - index= setup_actor_monotonic_index % setup_actor_max; + index= PFS_atomic::add_u32(& setup_actor_monotonic_index, 1) % setup_actor_max; pfs= setup_actor_array + index; if (pfs->m_lock.is_free()) diff --git a/storage/perfschema/pfs_setup_actor.h b/storage/perfschema/pfs_setup_actor.h index 8b0ee8a485c..baebd27f0ad 100644 --- a/storage/perfschema/pfs_setup_actor.h +++ b/storage/perfschema/pfs_setup_actor.h @@ -49,7 +49,7 @@ struct PFS_setup_actor_key }; /** A setup_actor record. */ -struct PFS_setup_actor +struct PFS_ALIGNED PFS_setup_actor { /** Internal lock. 
*/ pfs_lock m_lock; @@ -92,6 +92,8 @@ extern ulong setup_actor_max; extern PFS_setup_actor *setup_actor_array; +extern LF_HASH setup_actor_hash; + /** @} */ #endif diff --git a/storage/perfschema/pfs_setup_object.cc b/storage/perfschema/pfs_setup_object.cc index a9e9bb7881b..0ca7986e818 100644 --- a/storage/perfschema/pfs_setup_object.cc +++ b/storage/perfschema/pfs_setup_object.cc @@ -39,7 +39,7 @@ ulong setup_object_max; PFS_setup_object *setup_object_array= NULL; -static LF_HASH setup_object_hash; +LF_HASH setup_object_hash; static bool setup_object_hash_inited= false; /** @@ -95,10 +95,11 @@ C_MODE_END */ int init_setup_object_hash(void) { - if (! setup_object_hash_inited) + if ((! setup_object_hash_inited) && (setup_object_max > 0)) { lf_hash_init(&setup_object_hash, sizeof(PFS_setup_object*), LF_HASH_UNIQUE, 0, 0, setup_object_hash_get_key, &my_charset_bin); + setup_object_hash.size= setup_object_max; setup_object_hash_inited= true; } return 0; @@ -161,7 +162,7 @@ int insert_setup_object(enum_object_type object_type, const String *schema, if (unlikely(pins == NULL)) return HA_ERR_OUT_OF_MEM; - static uint setup_object_monotonic_index= 0; + static uint PFS_ALIGNED setup_object_monotonic_index= 0; uint index; uint attempts= 0; PFS_setup_object *pfs; @@ -169,8 +170,7 @@ int insert_setup_object(enum_object_type object_type, const String *schema, while (++attempts <= setup_object_max) { /* See create_mutex() */ - PFS_atomic::add_u32(& setup_object_monotonic_index, 1); - index= setup_object_monotonic_index % setup_object_max; + index= PFS_atomic::add_u32(& setup_object_monotonic_index, 1) % setup_object_max; pfs= setup_object_array + index; if (pfs->m_lock.is_free()) diff --git a/storage/perfschema/pfs_setup_object.h b/storage/perfschema/pfs_setup_object.h index 44d2b76c627..2615802fe01 100644 --- a/storage/perfschema/pfs_setup_object.h +++ b/storage/perfschema/pfs_setup_object.h @@ -45,7 +45,7 @@ struct PFS_setup_object_key }; /** A setup_object record. 
*/ -struct PFS_setup_object +struct PFS_ALIGNED PFS_setup_object { enum_object_type get_object_type() { @@ -96,6 +96,8 @@ extern ulong setup_object_max; extern PFS_setup_object *setup_object_array; +extern LF_HASH setup_object_hash; + /** @} */ #endif diff --git a/storage/perfschema/pfs_stat.h b/storage/perfschema/pfs_stat.h index 32c462b8ba2..2a255a9e5b2 100644 --- a/storage/perfschema/pfs_stat.h +++ b/storage/perfschema/pfs_stat.h @@ -140,13 +140,90 @@ struct PFS_byte_stat : public PFS_single_stat } }; +/** Statistics for mutex usage. */ +struct PFS_mutex_stat +{ + /** Wait statistics. */ + PFS_single_stat m_wait_stat; + /** + Lock statistics. + This statistic is not exposed in user visible tables yet. + */ + PFS_single_stat m_lock_stat; + + inline void aggregate(const PFS_mutex_stat *stat) + { + m_wait_stat.aggregate(&stat->m_wait_stat); + m_lock_stat.aggregate(&stat->m_lock_stat); + } + + inline void reset(void) + { + m_wait_stat.reset(); + m_lock_stat.reset(); + } +}; + +/** Statistics for rwlock usage. */ +struct PFS_rwlock_stat +{ + /** Wait statistics. */ + PFS_single_stat m_wait_stat; + /** + RWLock read lock usage statistics. + This statistic is not exposed in user visible tables yet. + */ + PFS_single_stat m_read_lock_stat; + /** + RWLock write lock usage statistics. + This statistic is not exposed in user visible tables yet. + */ + PFS_single_stat m_write_lock_stat; + + inline void aggregate(const PFS_rwlock_stat *stat) + { + m_wait_stat.aggregate(&stat->m_wait_stat); + m_read_lock_stat.aggregate(&stat->m_read_lock_stat); + m_write_lock_stat.aggregate(&stat->m_write_lock_stat); + } + + inline void reset(void) + { + m_wait_stat.reset(); + m_read_lock_stat.reset(); + m_write_lock_stat.reset(); + } +}; + /** Statistics for COND usage. */ struct PFS_cond_stat { - /** Number of times a condition was signalled. */ + /** Wait statistics. */ + PFS_single_stat m_wait_stat; + /** + Number of times a condition was signalled. 
+ This statistic is not exposed in user visible tables yet. + */ ulonglong m_signal_count; - /** Number of times a condition was broadcasted. */ + /** + Number of times a condition was broadcast. + This statistic is not exposed in user visible tables yet. + */ ulonglong m_broadcast_count; + + inline void aggregate(const PFS_cond_stat *stat) + { + m_wait_stat.aggregate(&stat->m_wait_stat); + m_signal_count+= stat->m_signal_count; + m_broadcast_count+= stat->m_broadcast_count; + } + + inline void reset(void) + { + m_wait_stat.reset(); + m_signal_count= 0; + m_broadcast_count= 0; + } }; /** Statistics for FILE IO. Used for both waits and byte counts. */ @@ -198,6 +275,11 @@ struct PFS_file_stat /** File IO statistics. */ PFS_file_io_stat m_io_stat; + inline void aggregate(const PFS_file_stat *stat) + { + m_io_stat.aggregate(&stat->m_io_stat); + } + /** Reset file statistics. */ inline void reset(void) { @@ -329,6 +411,7 @@ struct PFS_statement_stat /** Single table io statistic. */ struct PFS_table_io_stat { + bool m_has_data; /** FETCH statistics */ PFS_single_stat m_fetch; /** INSERT statistics */ @@ -338,8 +421,14 @@ struct PFS_table_io_stat /** DELETE statistics */ PFS_single_stat m_delete; + PFS_table_io_stat() + { + m_has_data= false; + } + inline void reset(void) { + m_has_data= false; m_fetch.reset(); m_insert.reset(); m_update.reset(); @@ -348,18 +437,25 @@ struct PFS_table_io_stat inline void aggregate(const PFS_table_io_stat *stat) { - m_fetch.aggregate(&stat->m_fetch); - m_insert.aggregate(&stat->m_insert); - m_update.aggregate(&stat->m_update); - m_delete.aggregate(&stat->m_delete); + if (stat->m_has_data) + { + m_has_data= true; + m_fetch.aggregate(&stat->m_fetch); + m_insert.aggregate(&stat->m_insert); + m_update.aggregate(&stat->m_update); + m_delete.aggregate(&stat->m_delete); + } } inline void sum(PFS_single_stat *result) { - result->aggregate(& m_fetch); - result->aggregate(& m_insert); - result->aggregate(& m_update); - result->aggregate(& 
m_delete); + if (m_has_data) + { + result->aggregate(& m_fetch); + result->aggregate(& m_insert); + result->aggregate(& m_update); + result->aggregate(& m_delete); + } } }; @@ -419,10 +515,10 @@ struct PFS_table_stat { /** Statistics, per index. - Each index stat is in [0, MAX_KEY-1], - stats when using no index are in [MAX_KEY]. + Each index stat is in [0, MAX_INDEXES-1], + stats when using no index are in [MAX_INDEXES]. */ - PFS_table_io_stat m_index_stat[MAX_KEY + 1]; + PFS_table_io_stat m_index_stat[MAX_INDEXES + 1]; /** Statistics, per lock type. @@ -433,7 +529,7 @@ struct PFS_table_stat inline void reset_io(void) { PFS_table_io_stat *stat= & m_index_stat[0]; - PFS_table_io_stat *stat_last= & m_index_stat[MAX_KEY + 1]; + PFS_table_io_stat *stat_last= & m_index_stat[MAX_INDEXES + 1]; for ( ; stat < stat_last ; stat++) stat->reset(); } @@ -466,13 +562,25 @@ struct PFS_table_stat memcpy(this, & g_reset_template, sizeof(*this)); } - inline void aggregate_io(const PFS_table_stat *stat) + inline void aggregate_io(const PFS_table_stat *stat, uint key_count) { - PFS_table_io_stat *to_stat= & m_index_stat[0]; - PFS_table_io_stat *to_stat_last= & m_index_stat[MAX_KEY + 1]; - const PFS_table_io_stat *from_stat= & stat->m_index_stat[0]; + PFS_table_io_stat *to_stat; + PFS_table_io_stat *to_stat_last; + const PFS_table_io_stat *from_stat; + + DBUG_ASSERT(key_count <= MAX_INDEXES); + + /* Aggregate stats for each index, if any */ + to_stat= & m_index_stat[0]; + to_stat_last= to_stat + key_count; + from_stat= & stat->m_index_stat[0]; for ( ; to_stat < to_stat_last ; from_stat++, to_stat++) to_stat->aggregate(from_stat); + + /* Aggregate stats for the table */ + to_stat= & m_index_stat[MAX_INDEXES]; + from_stat= & stat->m_index_stat[MAX_INDEXES]; + to_stat->aggregate(from_stat); } inline void aggregate_lock(const PFS_table_stat *stat) @@ -480,18 +588,27 @@ struct PFS_table_stat m_lock_stat.aggregate(& stat->m_lock_stat); } - inline void aggregate(const PFS_table_stat *stat) + 
inline void aggregate(const PFS_table_stat *stat, uint key_count) { - aggregate_io(stat); + aggregate_io(stat, key_count); aggregate_lock(stat); } - inline void sum_io(PFS_single_stat *result) + inline void sum_io(PFS_single_stat *result, uint key_count) { - PFS_table_io_stat *stat= & m_index_stat[0]; - PFS_table_io_stat *stat_last= & m_index_stat[MAX_KEY + 1]; + PFS_table_io_stat *stat; + PFS_table_io_stat *stat_last; + + DBUG_ASSERT(key_count <= MAX_INDEXES); + + /* Sum stats for each index, if any */ + stat= & m_index_stat[0]; + stat_last= stat + key_count; for ( ; stat < stat_last ; stat++) stat->sum(result); + + /* Sum stats for the table */ + m_index_stat[MAX_INDEXES].sum(result); } inline void sum_lock(PFS_single_stat *result) @@ -499,9 +616,9 @@ struct PFS_table_stat m_lock_stat.sum(result); } - inline void sum(PFS_single_stat *result) + inline void sum(PFS_single_stat *result, uint key_count) { - sum_io(result); + sum_io(result, key_count); sum_lock(result); } diff --git a/storage/perfschema/pfs_timer.cc b/storage/perfschema/pfs_timer.cc index 3d8d2e07ce5..8c3553db2b2 100644 --- a/storage/perfschema/pfs_timer.cc +++ b/storage/perfschema/pfs_timer.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved. +/* Copyright (c) 2008, 2012, Oracle and/or its affiliates. All rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -117,6 +117,75 @@ void init_timers(void) to_pico_data[TIMER_NAME_TICK].m_v0= tick_v0; to_pico_data[TIMER_NAME_TICK].m_factor= tick_to_pico; + + /* + Depending on the platform and build options, + some timers may not be available. + Pick best replacements. + */ + + /* + For STAGE and STATEMENT, a timer with a fixed frequency is better. + The prefered timer is nanosecond, or lower resolutions. + */ + + if (nanosec_to_pico != 0) + { + /* Normal case. 
*/ + stage_timer= TIMER_NAME_NANOSEC; + statement_timer= TIMER_NAME_NANOSEC; + } + else if (microsec_to_pico != 0) + { + /* Windows. */ + stage_timer= TIMER_NAME_MICROSEC; + statement_timer= TIMER_NAME_MICROSEC; + } + else if (millisec_to_pico != 0) + { + /* Robustness, no known cases. */ + stage_timer= TIMER_NAME_MILLISEC; + statement_timer= TIMER_NAME_MILLISEC; + } + else if (tick_to_pico != 0) + { + /* Robustness, no known cases. */ + stage_timer= TIMER_NAME_TICK; + statement_timer= TIMER_NAME_TICK; + } + else + { + /* Robustness, no known cases. */ + stage_timer= TIMER_NAME_CYCLE; + statement_timer= TIMER_NAME_CYCLE; + } + + /* + For IDLE, a timer with a fixed frequency is critical, + as the CPU clock may slow down a lot if the server is completely idle. + The prefered timer is microsecond, or lower resolutions. + */ + + if (microsec_to_pico != 0) + { + /* Normal case. */ + idle_timer= TIMER_NAME_MICROSEC; + } + else if (millisec_to_pico != 0) + { + /* Robustness, no known cases. */ + idle_timer= TIMER_NAME_MILLISEC; + } + else if (tick_to_pico != 0) + { + /* Robustness, no known cases. */ + idle_timer= TIMER_NAME_TICK; + } + else + { + /* Robustness, no known cases. */ + idle_timer= TIMER_NAME_CYCLE; + } } ulonglong get_timer_raw_value(enum_timer_name timer_name) diff --git a/storage/perfschema/pfs_user.cc b/storage/perfschema/pfs_user.cc index d7794a131a1..697b5af2f0d 100644 --- a/storage/perfschema/pfs_user.cc +++ b/storage/perfschema/pfs_user.cc @@ -42,7 +42,7 @@ static PFS_single_stat *user_instr_class_waits_array= NULL; static PFS_stage_stat *user_instr_class_stages_array= NULL; static PFS_statement_stat *user_instr_class_statements_array= NULL; -static LF_HASH user_hash; +LF_HASH user_hash; static bool user_hash_inited= false; /** @@ -146,10 +146,11 @@ C_MODE_END */ int init_user_hash(void) { - if (! user_hash_inited) + if ((! 
user_hash_inited) && (user_max > 0)) { lf_hash_init(&user_hash, sizeof(PFS_user*), LF_HASH_UNIQUE, 0, 0, user_hash_get_key, &my_charset_bin); + user_hash.size= user_max; user_hash_inited= true; } return 0; diff --git a/storage/perfschema/pfs_user.h b/storage/perfschema/pfs_user.h index 0f937c6c927..dda7e221ca8 100644 --- a/storage/perfschema/pfs_user.h +++ b/storage/perfschema/pfs_user.h @@ -44,7 +44,7 @@ struct PFS_user_key uint m_key_length; }; -struct PFS_user : public PFS_connection_slice +struct PFS_ALIGNED PFS_user : public PFS_connection_slice { public: inline void init_refcount(void) @@ -108,6 +108,8 @@ extern ulong user_lost; extern PFS_user *user_array; +extern LF_HASH user_hash; + /** @} */ #endif diff --git a/storage/perfschema/pfs_visitor.cc b/storage/perfschema/pfs_visitor.cc index fe2b16a2f76..616bc27900a 100644 --- a/storage/perfschema/pfs_visitor.cc +++ b/storage/perfschema/pfs_visitor.cc @@ -666,7 +666,7 @@ void PFS_connection_wait_visitor::visit_global() it is more efficient. 
*/ DBUG_ASSERT(m_index == global_idle_class.m_event_name_index); - m_stat.aggregate(& global_instr_class_waits_array[m_index]); + m_stat.aggregate(& global_idle_stat); } void PFS_connection_wait_visitor::visit_host(PFS_host *pfs) @@ -883,54 +883,44 @@ PFS_instance_wait_visitor::PFS_instance_wait_visitor() PFS_instance_wait_visitor::~PFS_instance_wait_visitor() {} -void PFS_instance_wait_visitor::visit_mutex_class(PFS_mutex_class *pfs) +void PFS_instance_wait_visitor::visit_mutex_class(PFS_mutex_class *pfs) { - uint index= pfs->m_event_name_index; - m_stat.aggregate(& global_instr_class_waits_array[index]); + m_stat.aggregate(&pfs->m_mutex_stat.m_wait_stat); } -void PFS_instance_wait_visitor::visit_rwlock_class(PFS_rwlock_class *pfs) +void PFS_instance_wait_visitor::visit_rwlock_class(PFS_rwlock_class *pfs) { - uint index= pfs->m_event_name_index; - m_stat.aggregate(& global_instr_class_waits_array[index]); + m_stat.aggregate(&pfs->m_rwlock_stat.m_wait_stat); } -void PFS_instance_wait_visitor::visit_cond_class(PFS_cond_class *pfs) +void PFS_instance_wait_visitor::visit_cond_class(PFS_cond_class *pfs) { - uint index= pfs->m_event_name_index; - m_stat.aggregate(& global_instr_class_waits_array[index]); + m_stat.aggregate(&pfs->m_cond_stat.m_wait_stat); } -void PFS_instance_wait_visitor::visit_file_class(PFS_file_class *pfs) +void PFS_instance_wait_visitor::visit_file_class(PFS_file_class *pfs) { - uint index= pfs->m_event_name_index; - m_stat.aggregate(& global_instr_class_waits_array[index]); + pfs->m_file_stat.m_io_stat.sum_waits(&m_stat); } -void PFS_instance_wait_visitor::visit_socket_class(PFS_socket_class *pfs) +void PFS_instance_wait_visitor::visit_socket_class(PFS_socket_class *pfs) { - /* Collect global wait stats */ - uint index= pfs->m_event_name_index; - m_stat.aggregate(&global_instr_class_waits_array[index]); - - /* If deferred, then pull wait stats directly from the socket class. 
*/ - if (pfs->is_deferred()) - pfs->m_socket_stat.m_io_stat.sum_waits(&m_stat); + pfs->m_socket_stat.m_io_stat.sum_waits(&m_stat); } -void PFS_instance_wait_visitor::visit_mutex(PFS_mutex *pfs) +void PFS_instance_wait_visitor::visit_mutex(PFS_mutex *pfs) { - m_stat.aggregate(& pfs->m_wait_stat); + m_stat.aggregate(& pfs->m_mutex_stat.m_wait_stat); } -void PFS_instance_wait_visitor::visit_rwlock(PFS_rwlock *pfs) +void PFS_instance_wait_visitor::visit_rwlock(PFS_rwlock *pfs) { - m_stat.aggregate(& pfs->m_wait_stat); + m_stat.aggregate(& pfs->m_rwlock_stat.m_wait_stat); } -void PFS_instance_wait_visitor::visit_cond(PFS_cond *pfs) +void PFS_instance_wait_visitor::visit_cond(PFS_cond *pfs) { - m_stat.aggregate(& pfs->m_wait_stat); + m_stat.aggregate(& pfs->m_cond_stat.m_wait_stat); } void PFS_instance_wait_visitor::visit_file(PFS_file *pfs) @@ -959,23 +949,24 @@ PFS_object_wait_visitor::~PFS_object_wait_visitor() void PFS_object_wait_visitor::visit_global() { - uint index; - - index= global_table_io_class.m_event_name_index; - m_stat.aggregate(& global_instr_class_waits_array[index]); - - index= global_table_lock_class.m_event_name_index; - m_stat.aggregate(& global_instr_class_waits_array[index]); + global_table_io_stat.sum(& m_stat); + global_table_lock_stat.sum(& m_stat); } void PFS_object_wait_visitor::visit_table_share(PFS_table_share *pfs) { - pfs->m_table_stat.sum(& m_stat); + uint safe_key_count= sanitize_index_count(pfs->m_key_count); + pfs->m_table_stat.sum(& m_stat, safe_key_count); } void PFS_object_wait_visitor::visit_table(PFS_table *pfs) { - pfs->m_table_stat.sum(& m_stat); + PFS_table_share *table_share= sanitize_table_share(pfs->m_share); + if (table_share != NULL) + { + uint safe_key_count= sanitize_index_count(table_share->m_key_count); + pfs->m_table_stat.sum(& m_stat, safe_key_count); + } } PFS_table_io_wait_visitor::PFS_table_io_wait_visitor() @@ -986,21 +977,21 @@ PFS_table_io_wait_visitor::~PFS_table_io_wait_visitor() void 
PFS_table_io_wait_visitor::visit_global() { - uint index= global_table_io_class.m_event_name_index; - m_stat.aggregate(& global_instr_class_waits_array[index]); + global_table_io_stat.sum(& m_stat); } void PFS_table_io_wait_visitor::visit_table_share(PFS_table_share *pfs) { PFS_table_io_stat io_stat; + uint safe_key_count= sanitize_index_count(pfs->m_key_count); uint index; /* Aggregate index stats */ - for (index= 0; index < pfs->m_key_count; index++) + for (index= 0; index < safe_key_count; index++) io_stat.aggregate(& pfs->m_table_stat.m_index_stat[index]); /* Aggregate global stats */ - io_stat.aggregate(& pfs->m_table_stat.m_index_stat[MAX_KEY]); + io_stat.aggregate(& pfs->m_table_stat.m_index_stat[MAX_INDEXES]); io_stat.sum(& m_stat); } @@ -1012,14 +1003,15 @@ void PFS_table_io_wait_visitor::visit_table(PFS_table *pfs) if (likely(safe_share != NULL)) { PFS_table_io_stat io_stat; + uint safe_key_count= sanitize_index_count(safe_share->m_key_count); uint index; /* Aggregate index stats */ - for (index= 0; index < safe_share->m_key_count; index++) + for (index= 0; index < safe_key_count; index++) io_stat.aggregate(& pfs->m_table_stat.m_index_stat[index]); /* Aggregate global stats */ - io_stat.aggregate(& pfs->m_table_stat.m_index_stat[MAX_KEY]); + io_stat.aggregate(& pfs->m_table_stat.m_index_stat[MAX_INDEXES]); io_stat.sum(& m_stat); } @@ -1035,14 +1027,15 @@ PFS_table_io_stat_visitor::~PFS_table_io_stat_visitor() void PFS_table_io_stat_visitor::visit_table_share(PFS_table_share *pfs) { + uint safe_key_count= sanitize_index_count(pfs->m_key_count); uint index; /* Aggregate index stats */ - for (index= 0; index < pfs->m_key_count; index++) + for (index= 0; index < safe_key_count; index++) m_stat.aggregate(& pfs->m_table_stat.m_index_stat[index]); /* Aggregate global stats */ - m_stat.aggregate(& pfs->m_table_stat.m_index_stat[MAX_KEY]); + m_stat.aggregate(& pfs->m_table_stat.m_index_stat[MAX_INDEXES]); } void PFS_table_io_stat_visitor::visit_table(PFS_table 
*pfs) @@ -1051,14 +1044,15 @@ void PFS_table_io_stat_visitor::visit_table(PFS_table *pfs) if (likely(safe_share != NULL)) { + uint safe_key_count= sanitize_index_count(safe_share->m_key_count); uint index; /* Aggregate index stats */ - for (index= 0; index < safe_share->m_key_count; index++) + for (index= 0; index < safe_key_count; index++) m_stat.aggregate(& pfs->m_table_stat.m_index_stat[index]); /* Aggregate global stats */ - m_stat.aggregate(& pfs->m_table_stat.m_index_stat[MAX_KEY]); + m_stat.aggregate(& pfs->m_table_stat.m_index_stat[MAX_INDEXES]); } } @@ -1090,8 +1084,7 @@ PFS_table_lock_wait_visitor::~PFS_table_lock_wait_visitor() void PFS_table_lock_wait_visitor::visit_global() { - uint index= global_table_lock_class.m_event_name_index; - m_stat.aggregate(& global_instr_class_waits_array[index]); + global_table_lock_stat.sum(& m_stat); } void PFS_table_lock_wait_visitor::visit_table_share(PFS_table_share *pfs) diff --git a/storage/perfschema/table_esgs_by_thread_by_event_name.cc b/storage/perfschema/table_esgs_by_thread_by_event_name.cc index 2a69ec24277..eeef6c3fbb2 100644 --- a/storage/perfschema/table_esgs_by_thread_by_event_name.cc +++ b/storage/perfschema/table_esgs_by_thread_by_event_name.cc @@ -33,7 +33,7 @@ static const TABLE_FIELD_TYPE field_types[]= { { { C_STRING_WITH_LEN("THREAD_ID") }, - { C_STRING_WITH_LEN("int(11)") }, + { C_STRING_WITH_LEN("bigint(20)") }, { NULL, 0} }, { @@ -212,7 +212,7 @@ int table_esgs_by_thread_by_event_name switch(f->field_index) { case 0: /* THREAD_ID */ - set_field_ulong(f, m_row.m_thread_internal_id); + set_field_ulonglong(f, m_row.m_thread_internal_id); break; case 1: /* NAME */ m_row.m_event_name.set_field(f); diff --git a/storage/perfschema/table_esgs_by_thread_by_event_name.h b/storage/perfschema/table_esgs_by_thread_by_event_name.h index 049c8997396..5295a9eacdf 100644 --- a/storage/perfschema/table_esgs_by_thread_by_event_name.h +++ b/storage/perfschema/table_esgs_by_thread_by_event_name.h @@ -39,7 +39,7 @@ 
struct row_esgs_by_thread_by_event_name { /** Column THREAD_ID. */ - ulong m_thread_internal_id; + ulonglong m_thread_internal_id; /** Column EVENT_NAME. */ PFS_event_name_row m_event_name; /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER_WAIT. */ diff --git a/storage/perfschema/table_esgs_global_by_event_name.cc b/storage/perfschema/table_esgs_global_by_event_name.cc index 2ac22fb1551..276ac8d7704 100644 --- a/storage/perfschema/table_esgs_global_by_event_name.cc +++ b/storage/perfschema/table_esgs_global_by_event_name.cc @@ -95,6 +95,9 @@ int table_esgs_global_by_event_name::delete_all_rows(void) { reset_events_stages_by_thread(); + reset_events_stages_by_account(); + reset_events_stages_by_user(); + reset_events_stages_by_host(); reset_events_stages_global(); return 0; } diff --git a/storage/perfschema/table_esms_by_digest.cc b/storage/perfschema/table_esms_by_digest.cc index dac8d3b01dc..d0250c14e5d 100644 --- a/storage/perfschema/table_esms_by_digest.cc +++ b/storage/perfschema/table_esms_by_digest.cc @@ -36,6 +36,11 @@ THR_LOCK table_esms_by_digest::m_table_lock; static const TABLE_FIELD_TYPE field_types[]= { { + { C_STRING_WITH_LEN("SCHEMA_NAME") }, + { C_STRING_WITH_LEN("varchar(64)") }, + { NULL, 0} + }, + { { C_STRING_WITH_LEN("DIGEST") }, { C_STRING_WITH_LEN("varchar(32)") }, { NULL, 0} @@ -45,7 +50,7 @@ static const TABLE_FIELD_TYPE field_types[]= { C_STRING_WITH_LEN("longtext") }, { NULL, 0} }, - { + { { C_STRING_WITH_LEN("COUNT_STAR") }, { C_STRING_WITH_LEN("bigint(20)") }, { NULL, 0} @@ -170,7 +175,7 @@ static const TABLE_FIELD_TYPE field_types[]= { C_STRING_WITH_LEN("timestamp") }, { NULL, 0} }, - { + { { C_STRING_WITH_LEN("LAST_SEEN") }, { C_STRING_WITH_LEN("timestamp") }, { NULL, 0} @@ -179,7 +184,7 @@ static const TABLE_FIELD_TYPE field_types[]= TABLE_FIELD_DEF table_esms_by_digest::m_field_def= -{ 28, field_types }; +{ 29, field_types }; PFS_engine_table_share table_esms_by_digest::m_share= @@ -303,18 +308,19 @@ int table_esms_by_digest { 
switch(f->field_index) { - case 0: /* DIGEST */ - case 1: /* DIGEST_TEXT */ + case 0: /* SCHEMA_NAME */ + case 1: /* DIGEST */ + case 2: /* DIGEST_TEXT */ m_row.m_digest.set_field(f->field_index, f); break; - case 26: /* FIRST_SEEN */ + case 27: /* FIRST_SEEN */ set_field_timestamp(f, m_row.m_first_seen); break; - case 27: /* LAST_SEEN */ + case 28: /* LAST_SEEN */ set_field_timestamp(f, m_row.m_last_seen); break; - default: /* 1, ... COUNT/SUM/MIN/AVG/MAX */ - m_row.m_stat.set_field(f->field_index - 2, f); + default: /* 3, ... COUNT/SUM/MIN/AVG/MAX */ + m_row.m_stat.set_field(f->field_index - 3, f); break; } } diff --git a/storage/perfschema/table_esms_by_thread_by_event_name.cc b/storage/perfschema/table_esms_by_thread_by_event_name.cc index 5a7faca1b79..fccdf5dea60 100644 --- a/storage/perfschema/table_esms_by_thread_by_event_name.cc +++ b/storage/perfschema/table_esms_by_thread_by_event_name.cc @@ -33,7 +33,7 @@ static const TABLE_FIELD_TYPE field_types[]= { { { C_STRING_WITH_LEN("THREAD_ID") }, - { C_STRING_WITH_LEN("int(11)") }, + { C_STRING_WITH_LEN("bigint(20)") }, { NULL, 0} }, { @@ -308,7 +308,7 @@ int table_esms_by_thread_by_event_name switch(f->field_index) { case 0: /* THREAD_ID */ - set_field_ulong(f, m_row.m_thread_internal_id); + set_field_ulonglong(f, m_row.m_thread_internal_id); break; case 1: /* EVENT_NAME */ m_row.m_event_name.set_field(f); diff --git a/storage/perfschema/table_esms_by_thread_by_event_name.h b/storage/perfschema/table_esms_by_thread_by_event_name.h index 2f36606a5e1..9fb9f7c58dc 100644 --- a/storage/perfschema/table_esms_by_thread_by_event_name.h +++ b/storage/perfschema/table_esms_by_thread_by_event_name.h @@ -39,7 +39,7 @@ struct row_esms_by_thread_by_event_name { /** Column THREAD_ID. */ - ulong m_thread_internal_id; + ulonglong m_thread_internal_id; /** Column EVENT_NAME. */ PFS_event_name_row m_event_name; /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER_WAIT. 
*/ diff --git a/storage/perfschema/table_esms_global_by_event_name.cc b/storage/perfschema/table_esms_global_by_event_name.cc index 22c87f09137..efcb5b6fa7c 100644 --- a/storage/perfschema/table_esms_global_by_event_name.cc +++ b/storage/perfschema/table_esms_global_by_event_name.cc @@ -190,6 +190,9 @@ int table_esms_global_by_event_name::delete_all_rows(void) { reset_events_statements_by_thread(); + reset_events_statements_by_account(); + reset_events_statements_by_user(); + reset_events_statements_by_host(); reset_events_statements_global(); return 0; } diff --git a/storage/perfschema/table_events_stages.cc b/storage/perfschema/table_events_stages.cc index e438249fbd3..854e1be15cd 100644 --- a/storage/perfschema/table_events_stages.cc +++ b/storage/perfschema/table_events_stages.cc @@ -32,7 +32,7 @@ static const TABLE_FIELD_TYPE field_types[]= { { { C_STRING_WITH_LEN("THREAD_ID") }, - { C_STRING_WITH_LEN("int(11)") }, + { C_STRING_WITH_LEN("bigint(20)") }, { NULL, 0} }, { @@ -207,7 +207,7 @@ int table_events_stages_common::read_row_values(TABLE *table, switch(f->field_index) { case 0: /* THREAD_ID */ - set_field_ulong(f, m_row.m_thread_internal_id); + set_field_ulonglong(f, m_row.m_thread_internal_id); break; case 1: /* EVENT_ID */ set_field_ulonglong(f, m_row.m_event_id); diff --git a/storage/perfschema/table_events_stages.h b/storage/perfschema/table_events_stages.h index 6bc712c15a5..09c555c80fd 100644 --- a/storage/perfschema/table_events_stages.h +++ b/storage/perfschema/table_events_stages.h @@ -36,7 +36,7 @@ struct PFS_thread; struct row_events_stages { /** Column THREAD_ID. */ - ulong m_thread_internal_id; + ulonglong m_thread_internal_id; /** Column EVENT_ID. */ ulonglong m_event_id; /** Column END_EVENT_ID. 
*/ diff --git a/storage/perfschema/table_events_statements.cc b/storage/perfschema/table_events_statements.cc index d453b14470f..fb2b4b242d4 100644 --- a/storage/perfschema/table_events_statements.cc +++ b/storage/perfschema/table_events_statements.cc @@ -35,7 +35,7 @@ static const TABLE_FIELD_TYPE field_types[]= { { { C_STRING_WITH_LEN("THREAD_ID") }, - { C_STRING_WITH_LEN("int(11)") }, + { C_STRING_WITH_LEN("bigint(20)") }, { NULL, 0} }, { @@ -372,7 +372,7 @@ void table_events_statements_common::make_row(PFS_events_statements *statement) PSI_digest_storage *digest= & statement->m_digest_storage; if (digest->m_byte_count > 0) { - PFS_digest_hash md5; + PFS_digest_key md5; compute_md5_hash((char *) md5.m_md5, (char *) digest->m_token_array, digest->m_byte_count); @@ -420,7 +420,7 @@ int table_events_statements_common::read_row_values(TABLE *table, switch(f->field_index) { case 0: /* THREAD_ID */ - set_field_ulong(f, m_row.m_thread_internal_id); + set_field_ulonglong(f, m_row.m_thread_internal_id); break; case 1: /* EVENT_ID */ set_field_ulonglong(f, m_row.m_event_id); diff --git a/storage/perfschema/table_events_statements.h b/storage/perfschema/table_events_statements.h index acd82de4fcf..dcc6611f555 100644 --- a/storage/perfschema/table_events_statements.h +++ b/storage/perfschema/table_events_statements.h @@ -37,7 +37,7 @@ struct PFS_thread; struct row_events_statements { /** Column THREAD_ID. */ - ulong m_thread_internal_id; + ulonglong m_thread_internal_id; /** Column EVENT_ID. */ ulonglong m_event_id; /** Column END_EVENT_ID. 
*/ diff --git a/storage/perfschema/table_events_waits.cc b/storage/perfschema/table_events_waits.cc index d1c82e81f75..82d8ba2a0cc 100644 --- a/storage/perfschema/table_events_waits.cc +++ b/storage/perfschema/table_events_waits.cc @@ -34,7 +34,7 @@ static const TABLE_FIELD_TYPE field_types[]= { { { C_STRING_WITH_LEN("THREAD_ID") }, - { C_STRING_WITH_LEN("int(11)") }, + { C_STRING_WITH_LEN("bigint(20)") }, { NULL, 0} }, { @@ -239,7 +239,8 @@ int table_events_waits_common::make_table_object_columns(volatile PFS_events_wai /* INDEX NAME */ safe_index= wait->m_index; - if (safe_index < MAX_KEY && safe_index < safe_table_share->m_key_count) + uint safe_key_count= sanitize_index_count(safe_table_share->m_key_count); + if (safe_index < safe_key_count) { PFS_table_key *key= & safe_table_share->m_keys[safe_index]; m_row.m_index_name_length= key->m_name_length; @@ -602,7 +603,7 @@ int table_events_waits_common::read_row_values(TABLE *table, switch(f->field_index) { case 0: /* THREAD_ID */ - set_field_ulong(f, m_row.m_thread_internal_id); + set_field_ulonglong(f, m_row.m_thread_internal_id); break; case 1: /* EVENT_ID */ set_field_ulonglong(f, m_row.m_event_id); diff --git a/storage/perfschema/table_events_waits.h b/storage/perfschema/table_events_waits.h index 72065c765ca..065bf95e5a6 100644 --- a/storage/perfschema/table_events_waits.h +++ b/storage/perfschema/table_events_waits.h @@ -36,7 +36,7 @@ struct PFS_thread; struct row_events_waits { /** Column THREAD_ID. */ - ulong m_thread_internal_id; + ulonglong m_thread_internal_id; /** Column EVENT_ID. */ ulonglong m_event_id; /** Column END_EVENT_ID. 
*/ diff --git a/storage/perfschema/table_events_waits_summary.cc b/storage/perfschema/table_events_waits_summary.cc index 2a144a07344..f437e83f3ff 100644 --- a/storage/perfschema/table_events_waits_summary.cc +++ b/storage/perfschema/table_events_waits_summary.cc @@ -139,7 +139,7 @@ void table_events_waits_summary_by_instance::make_mutex_row(PFS_mutex *pfs) if (unlikely(safe_class == NULL)) return; - make_instr_row(pfs, safe_class, pfs->m_identity, &pfs->m_wait_stat); + make_instr_row(pfs, safe_class, pfs->m_identity, &pfs->m_mutex_stat.m_wait_stat); } /** @@ -153,7 +153,7 @@ void table_events_waits_summary_by_instance::make_rwlock_row(PFS_rwlock *pfs) if (unlikely(safe_class == NULL)) return; - make_instr_row(pfs, safe_class, pfs->m_identity, &pfs->m_wait_stat); + make_instr_row(pfs, safe_class, pfs->m_identity, &pfs->m_rwlock_stat.m_wait_stat); } /** @@ -167,7 +167,7 @@ void table_events_waits_summary_by_instance::make_cond_row(PFS_cond *pfs) if (unlikely(safe_class == NULL)) return; - make_instr_row(pfs, safe_class, pfs->m_identity, &pfs->m_wait_stat); + make_instr_row(pfs, safe_class, pfs->m_identity, &pfs->m_cond_stat.m_wait_stat); } /** @@ -181,11 +181,13 @@ void table_events_waits_summary_by_instance::make_file_row(PFS_file *pfs) if (unlikely(safe_class == NULL)) return; + PFS_single_stat sum; + pfs->m_file_stat.m_io_stat.sum_waits(& sum); /* Files don't have a in memory structure associated to it, so we use the address of the PFS_file buffer as object_instance_begin */ - make_instr_row(pfs, safe_class, pfs, &pfs->m_wait_stat); + make_instr_row(pfs, safe_class, pfs, & sum); } /** diff --git a/storage/perfschema/table_ews_by_thread_by_event_name.cc b/storage/perfschema/table_ews_by_thread_by_event_name.cc index 25e3cf395c4..4db97b1c98c 100644 --- a/storage/perfschema/table_ews_by_thread_by_event_name.cc +++ b/storage/perfschema/table_ews_by_thread_by_event_name.cc @@ -33,7 +33,7 @@ static const TABLE_FIELD_TYPE field_types[]= { { { 
C_STRING_WITH_LEN("THREAD_ID") }, - { C_STRING_WITH_LEN("int(11)") }, + { C_STRING_WITH_LEN("bigint(20)") }, { NULL, 0} }, { @@ -282,7 +282,7 @@ int table_ews_by_thread_by_event_name switch(f->field_index) { case 0: /* THREAD_ID */ - set_field_ulong(f, m_row.m_thread_internal_id); + set_field_ulonglong(f, m_row.m_thread_internal_id); break; case 1: /* EVENT_NAME */ m_row.m_event_name.set_field(f); diff --git a/storage/perfschema/table_ews_by_thread_by_event_name.h b/storage/perfschema/table_ews_by_thread_by_event_name.h index b0710bb8a57..989356be646 100644 --- a/storage/perfschema/table_ews_by_thread_by_event_name.h +++ b/storage/perfschema/table_ews_by_thread_by_event_name.h @@ -39,7 +39,7 @@ struct row_ews_by_thread_by_event_name { /** Column THREAD_ID. */ - ulong m_thread_internal_id; + ulonglong m_thread_internal_id; /** Column EVENT_NAME. */ PFS_event_name_row m_event_name; /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER_WAIT. */ diff --git a/storage/perfschema/table_ews_global_by_event_name.cc b/storage/perfschema/table_ews_global_by_event_name.cc index c71a1ed479e..1e165c36bc5 100644 --- a/storage/perfschema/table_ews_global_by_event_name.cc +++ b/storage/perfschema/table_ews_global_by_event_name.cc @@ -97,7 +97,7 @@ table_ews_global_by_event_name::delete_all_rows(void) reset_events_waits_by_instance(); reset_table_waits_by_table_handle(); reset_table_waits_by_table(); - reset_events_waits_global(); + reset_events_waits_by_class(); return 0; } @@ -121,9 +121,6 @@ int table_ews_global_by_event_name::rnd_next(void) PFS_socket_class *socket_class; PFS_instr_class *instr_class; - if (global_instr_class_waits_array == NULL) - return HA_ERR_END_OF_FILE; - for (m_pos.set_at(&m_next_pos); m_pos.has_more_view(); m_pos.next_view()) @@ -218,9 +215,6 @@ table_ews_global_by_event_name::rnd_pos(const void *pos) set_position(pos); - if (global_instr_class_waits_array == NULL) - return HA_ERR_END_OF_FILE; - switch (m_pos.m_index_1) { case 
pos_ews_global_by_event_name::VIEW_MUTEX: diff --git a/storage/perfschema/table_helper.cc b/storage/perfschema/table_helper.cc index d3954179539..9f803434ab6 100644 --- a/storage/perfschema/table_helper.cc +++ b/storage/perfschema/table_helper.cc @@ -110,26 +110,30 @@ int PFS_digest_row::make_row(PFS_statements_digest_stat* pfs) */ if (pfs->m_digest_storage.m_byte_count != 0) { + m_schema_name_length= pfs->m_digest_key.m_schema_name_length; + if (m_schema_name_length > 0) + memcpy(m_schema_name, pfs->m_digest_key.m_schema_name, m_schema_name_length); /* Calculate digest from MD5 HASH collected to be shown as DIGEST in this row. */ - MD5_HASH_TO_STRING(pfs->m_digest_hash.m_md5, m_digest); + MD5_HASH_TO_STRING(pfs->m_digest_key.m_md5, m_digest); m_digest_length= MD5_HASH_TO_STRING_LENGTH; - /* - Caclulate digest_text information from the token array collected + /* + Calculate digest_text information from the token array collected to be shown as DIGEST_TEXT column. - */ + */ get_digest_text(m_digest_text, &pfs->m_digest_storage); m_digest_text_length= strlen(m_digest_text); } else { + m_schema_name_length= 0; m_digest_length= 0; m_digest_text_length= 0; } - + return 0; } @@ -137,14 +141,21 @@ void PFS_digest_row::set_field(uint index, Field *f) { switch (index) { - case 0: /* DIGEST */ + case 0: /* SCHEMA_NAME */ + if (m_schema_name_length > 0) + PFS_engine_table::set_field_varchar_utf8(f, m_schema_name, + m_schema_name_length); + else + f->set_null(); + break; + case 1: /* DIGEST */ if (m_digest_length > 0) PFS_engine_table::set_field_varchar_utf8(f, m_digest, m_digest_length); else f->set_null(); break; - case 1: /* DIGEST_TEXT */ + case 2: /* DIGEST_TEXT */ if (m_digest_text_length > 0) PFS_engine_table::set_field_longtext_utf8(f, m_digest_text, m_digest_text_length); @@ -199,7 +210,7 @@ int PFS_index_row::make_row(PFS_table_share *pfs, uint table_index) if (m_object_row.make_row(pfs)) return 1; - if (table_index < MAX_KEY) + if (table_index < MAX_INDEXES) { 
PFS_table_key *key= &pfs->m_keys[table_index]; m_index_name_length= key->m_name_length; diff --git a/storage/perfschema/table_helper.h b/storage/perfschema/table_helper.h index 798ff16f4e5..769122570eb 100644 --- a/storage/perfschema/table_helper.h +++ b/storage/perfschema/table_helper.h @@ -127,6 +127,10 @@ struct PFS_account_row /** Row fragment for columns DIGEST, DIGEST_TEXT. */ struct PFS_digest_row { + /** Column SCHEMA_NAME. */ + char m_schema_name[NAME_LEN]; + /** Length in bytes of @c m_schema_name. */ + uint m_schema_name_length; /** Column DIGEST. */ char m_digest[COL_DIGEST_SIZE]; /** Length in bytes of @c m_digest. */ diff --git a/storage/perfschema/table_host_cache.cc b/storage/perfschema/table_host_cache.cc index d243204ddcd..9c44a1fe235 100644 --- a/storage/perfschema/table_host_cache.cc +++ b/storage/perfschema/table_host_cache.cc @@ -266,9 +266,6 @@ void table_host_cache::materialize(THD *thd) index++; row++; current= current->next(); - /* Host cache is a circular linked list. 
*/ - if (current == first) - break; } m_all_rows= rows; diff --git a/storage/perfschema/table_os_global_by_type.cc b/storage/perfschema/table_os_global_by_type.cc index 82d176cd5b2..70d9d6819ac 100644 --- a/storage/perfschema/table_os_global_by_type.cc +++ b/storage/perfschema/table_os_global_by_type.cc @@ -174,6 +174,7 @@ void table_os_global_by_type::make_row(PFS_table_share *share) { pfs_lock lock; PFS_single_stat cumulated_stat; + uint safe_key_count; m_row_exists= false; @@ -184,7 +185,11 @@ void table_os_global_by_type::make_row(PFS_table_share *share) m_row.m_schema_name_length= share->m_schema_name_length; memcpy(m_row.m_object_name, share->m_table_name, share->m_table_name_length); m_row.m_object_name_length= share->m_table_name_length; - share->m_table_stat.sum(& cumulated_stat); + + /* This is a dirty read, some thread can write data while we are reading it */ + safe_key_count= sanitize_index_count(share->m_key_count); + + share->m_table_stat.sum(& cumulated_stat, safe_key_count); if (! share->m_lock.end_optimistic_lock(&lock)) return; @@ -204,7 +209,7 @@ void table_os_global_by_type::make_row(PFS_table_share *share) If the opened table handle is for this table share, aggregate the table handle statistics. */ - table->m_table_stat.sum(& cumulated_stat); + table->m_table_stat.sum(& cumulated_stat, safe_key_count); } } } diff --git a/storage/perfschema/table_session_account_connect_attrs.cc b/storage/perfschema/table_session_account_connect_attrs.cc new file mode 100644 index 00000000000..4a3fcc22341 --- /dev/null +++ b/storage/perfschema/table_session_account_connect_attrs.cc @@ -0,0 +1,70 @@ +/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */ + +#include "table_session_account_connect_attrs.h" + +THR_LOCK table_session_account_connect_attrs::m_table_lock; + +PFS_engine_table_share +table_session_account_connect_attrs::m_share= +{ + { C_STRING_WITH_LEN("session_account_connect_attrs") }, + &pfs_readonly_acl, + &table_session_account_connect_attrs::create, + NULL, /* write_row */ + NULL, /* delete_all_rows */ + NULL, /* get_row_count */ + 1000, /* records */ + sizeof(pos_connect_attr_by_thread_by_attr), /* ref length */ + &m_table_lock, + &m_field_def, + false /* checked */ +}; + +PFS_engine_table* table_session_account_connect_attrs::create() +{ + return new table_session_account_connect_attrs(); +} + +table_session_account_connect_attrs::table_session_account_connect_attrs() + : table_session_connect(&m_share) +{} + +bool +table_session_account_connect_attrs::thread_fits(PFS_thread *thread) +{ + PFS_thread *current_thread= PFS_thread::get_current_thread(); + /* The current thread may not have instrumentation attached. */ + if (current_thread == NULL) + return false; + + /* The thread we compare to, by definition, has some instrumentation. 
*/ + DBUG_ASSERT(thread != NULL); + + uint username_length= current_thread->m_username_length; + uint hostname_length= current_thread->m_hostname_length; + + if ( (thread->m_username_length != username_length) + || (thread->m_hostname_length != hostname_length)) + return false; + + if (memcmp(thread->m_username, current_thread->m_username, username_length) != 0) + return false; + + if (memcmp(thread->m_hostname, current_thread->m_hostname, hostname_length) != 0) + return false; + + return true; +} diff --git a/storage/perfschema/table_session_account_connect_attrs.h b/storage/perfschema/table_session_account_connect_attrs.h new file mode 100644 index 00000000000..ba8893e7cad --- /dev/null +++ b/storage/perfschema/table_session_account_connect_attrs.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */ + +#ifndef TABLE_SESSION_ACCOUNT_CONNECT_ATTRS_H +#define TABLE_SESSION_ACCOUNT_CONNECT_ATTRS_H + +#include "table_session_connect.h" +/** + \addtogroup Performance_schema_tables + @{ +*/ + +/** Table PERFORMANCE_SCHEMA.SESSION_ACCOUNT_CONNECT_ATTRS. 
*/ +class table_session_account_connect_attrs : public table_session_connect +{ +public: + /** Table share */ + static PFS_engine_table_share m_share; + /** Table builder */ + static PFS_engine_table* create(); + +protected: + table_session_account_connect_attrs(); + +public: + ~table_session_account_connect_attrs() + {} + +protected: + virtual bool thread_fits(PFS_thread *thread); + +private: + /** Table share lock. */ + static THR_LOCK m_table_lock; +}; + +/** @} */ +#endif diff --git a/storage/perfschema/table_session_connect.cc b/storage/perfschema/table_session_connect.cc new file mode 100644 index 00000000000..bd905b5756c --- /dev/null +++ b/storage/perfschema/table_session_connect.cc @@ -0,0 +1,268 @@ +/* Copyright (c) 2008, 2012, Oracle and/or its affiliates. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */ + +#include "table_session_connect.h" + +static const TABLE_FIELD_TYPE field_types[]= +{ + { + { C_STRING_WITH_LEN("PROCESSLIST_ID") }, + { C_STRING_WITH_LEN("int(11)") }, + { NULL, 0} + }, + { + { C_STRING_WITH_LEN("ATTR_NAME") }, + { C_STRING_WITH_LEN("varchar(32)") }, + { NULL, 0} + }, + { + { C_STRING_WITH_LEN("ATTR_VALUE") }, + { C_STRING_WITH_LEN("varchar(1024)") }, + { NULL, 0} + }, + { + { C_STRING_WITH_LEN("ORDINAL_POSITION") }, + { C_STRING_WITH_LEN("int(11)") }, + { NULL, 0} + } +}; + +TABLE_FIELD_DEF table_session_connect::m_field_def= +{ 4, field_types }; + +table_session_connect::table_session_connect(const PFS_engine_table_share *share) : + cursor_by_thread_connect_attr(share) +{} + +/** + Take a length encoded string + + @arg ptr inout the input string array + @arg dest where to store the result + @arg dest_size max size of @c dest + @arg copied_len the actual length of the data copied + @arg start_ptr pointer to the start of input + @arg input_length the length of the incoming data + @arg copy_data copy the data or just skip the input + @arg from_cs character set in which @c ptr is encoded + @arg nchars_max maximum number of characters to read + @return status + @retval true parsing failed + @retval false parsing succeeded +*/ +bool parse_length_encoded_string(const char **ptr, + char *dest, uint dest_size, + uint *copied_len, + const char *start_ptr, uint input_length, + bool copy_data, + const CHARSET_INFO *from_cs, + uint nchars_max) +{ + ulong copy_length, data_length; + const char *well_formed_error_pos= NULL, *cannot_convert_error_pos= NULL, + *from_end_pos= NULL; + + copy_length= data_length= net_field_length((uchar **) ptr); + + /* we don't tolerate NULL as a length */ + if (data_length == NULL_LENGTH) + return true; + + if (*ptr - 
start_ptr + data_length > input_length) + return true; + + copy_length= well_formed_copy_nchars(&my_charset_utf8_bin, dest, dest_size, + from_cs, *ptr, data_length, nchars_max, + &well_formed_error_pos, + &cannot_convert_error_pos, + &from_end_pos); + *copied_len= copy_length; + (*ptr)+= data_length; + + return false; +} + +/** + Take the nth attribute name/value pair + + Parse the attributes blob form the beginning, skipping the attributes + whose number is lower than the one we seek. + When we reach the attribute at an index we're looking for the values + are copied to the output parameters. + If parsing fails or no more attributes are found the function stops + and returns an error code. + + @arg connect_attrs pointer to the connect attributes blob + @arg connect_attrs_length length of @c connect_attrs + @arg connect_attrs_cs character set used to encode @c connect_attrs + @arg ordinal index of the attribute we need + @arg attr_name [out] buffer to receive the attribute name + @arg max_attr_name max size of @c attr_name in bytes + @arg attr_name_length [out] number of bytes written in @attr_name + @arg attr_value [out] buffer to receive the attribute name + @arg max_attr_value max size of @c attr_value in bytes + @arg attr_value_length [out] number of bytes written in @attr_value + @return status + @retval true requested attribute pair is found and copied + @retval false error. Either because of parsing or too few attributes. 
+*/ +bool read_nth_attr(const char *connect_attrs, + uint connect_attrs_length, + const CHARSET_INFO *connect_attrs_cs, + uint ordinal, + char *attr_name, uint max_attr_name, + uint *attr_name_length, + char *attr_value, uint max_attr_value, + uint *attr_value_length) +{ + uint idx; + const char *ptr; + + for (ptr= connect_attrs, idx= 0; + (uint)(ptr - connect_attrs) < connect_attrs_length && idx <= ordinal; + idx++) + { + uint copy_length; + /* do the copying only if we absolutely have to */ + bool fill_in_attr_name= idx == ordinal; + bool fill_in_attr_value= idx == ordinal; + + /* read the key */ + if (parse_length_encoded_string(&ptr, + attr_name, max_attr_name, ©_length, + connect_attrs, + connect_attrs_length, + fill_in_attr_name, + connect_attrs_cs, 32) || + !copy_length + ) + return false; + + if (idx == ordinal) + *attr_name_length= copy_length; + + /* read the value */ + if (parse_length_encoded_string(&ptr, + attr_value, max_attr_value, ©_length, + connect_attrs, + connect_attrs_length, + fill_in_attr_value, + connect_attrs_cs, 1024)) + return false; + + if (idx == ordinal) + *attr_value_length= copy_length; + + if (idx == ordinal) + return true; + } + + return false; +} + +void table_session_connect::make_row(PFS_thread *pfs, uint ordinal) +{ + pfs_lock lock; + PFS_thread_class *safe_class; + + m_row_exists= false; + + /* Protect this reader against thread termination */ + pfs->m_lock.begin_optimistic_lock(&lock); + safe_class= sanitize_thread_class(pfs->m_class); + if (unlikely(safe_class == NULL)) + return; + + /* Filtering threads must be done under the protection of the optimistic lock. */ + if (! 
thread_fits(pfs)) + return; + + /* populate the row */ + if (read_nth_attr(pfs->m_session_connect_attrs, + pfs->m_session_connect_attrs_length, + pfs->m_session_connect_attrs_cs, + ordinal, + m_row.m_attr_name, (uint) sizeof(m_row.m_attr_name), + &m_row.m_attr_name_length, + m_row.m_attr_value, (uint) sizeof(m_row.m_attr_value), + &m_row.m_attr_value_length)) + { + /* we don't expect internal threads to have connection attributes */ + DBUG_ASSERT(pfs->m_processlist_id != 0); + + m_row.m_ordinal_position= ordinal; + m_row.m_process_id= pfs->m_processlist_id; + } + else + return; + + if (pfs->m_lock.end_optimistic_lock(& lock)) + m_row_exists= true; +} + +int table_session_connect::read_row_values(TABLE *table, + unsigned char *buf, + Field **fields, + bool read_all) +{ + Field *f; + + if (unlikely(!m_row_exists)) + return HA_ERR_RECORD_DELETED; + + /* Set the null bits */ + DBUG_ASSERT(table->s->null_bytes == 1); + buf[0]= 0; + + for (; (f= *fields) ; fields++) + { + if (read_all || bitmap_is_set(table->read_set, f->field_index)) + { + switch(f->field_index) + { + case FO_PROCESS_ID: + if (m_row.m_process_id != 0) + set_field_ulong(f, m_row.m_process_id); + else + f->set_null(); + break; + case FO_ATTR_NAME: + set_field_varchar_utf8(f, m_row.m_attr_name, + m_row.m_attr_name_length); + break; + case FO_ATTR_VALUE: + if (m_row.m_attr_value_length) + set_field_varchar_utf8(f, m_row.m_attr_value, + m_row.m_attr_value_length); + else + f->set_null(); + break; + case FO_ORDINAL_POSITION: + set_field_ulong(f, m_row.m_ordinal_position); + break; + default: + DBUG_ASSERT(false); + } + } + } + return 0; +} + +bool +table_session_connect::thread_fits(PFS_thread *thread) +{ + return true; +} + diff --git a/storage/perfschema/table_session_connect.h b/storage/perfschema/table_session_connect.h new file mode 100644 index 00000000000..097623d2c80 --- /dev/null +++ b/storage/perfschema/table_session_connect.h @@ -0,0 +1,77 @@ +/* Copyright (c) 2012, Oracle and/or its affiliates. 
All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */ + +#ifndef TABLE_SESSION_CONNECT_H +#define TABLE_SESSION_CONNECT_H + +#include "pfs_column_types.h" +#include "cursor_by_thread_connect_attr.h" +#include "table_helper.h" + +#define MAX_ATTR_NAME_CHARS 32 +#define MAX_ATTR_VALUE_CHARS 1024 +#define MAX_UTF8_BYTES 6 + +/** symbolic names for field offsets, keep in sync with field_types */ +enum field_offsets { + FO_PROCESS_ID, + FO_ATTR_NAME, + FO_ATTR_VALUE, + FO_ORDINAL_POSITION +}; + +/** + A row of PERFORMANCE_SCHEMA.SESSION_CONNECT_ATTRS and + PERFORMANCE_SCHEMA.SESSION_ACCOUNT_CONNECT_ATTRS. +*/ +struct row_session_connect_attrs +{ + /** Column PROCESS_ID. */ + ulong m_process_id; + /** Column ATTR_NAME. In UTF-8 */ + char m_attr_name[MAX_ATTR_NAME_CHARS * MAX_UTF8_BYTES]; + /** Length in bytes of @c m_attr_name. */ + uint m_attr_name_length; + /** Column ATTR_VALUE. In UTF-8 */ + char m_attr_value[MAX_ATTR_VALUE_CHARS * MAX_UTF8_BYTES]; + /** Length in bytes of @c m_attr_name. */ + uint m_attr_value_length; + /** Column ORDINAL_POSITION. 
*/ + ulong m_ordinal_position; +}; + +class table_session_connect : public cursor_by_thread_connect_attr +{ +protected: + table_session_connect(const PFS_engine_table_share *share); + +public: + ~table_session_connect() + {} + +protected: + virtual void make_row(PFS_thread *pfs, uint ordinal); + virtual bool thread_fits(PFS_thread *thread); + virtual int read_row_values(TABLE *table, unsigned char *buf, + Field **fields, bool read_all); +protected: + /** Fields definition. */ + static TABLE_FIELD_DEF m_field_def; + /** Current row. */ + row_session_connect_attrs m_row; +}; + +/** @} */ +#endif diff --git a/storage/perfschema/table_session_connect_attrs.cc b/storage/perfschema/table_session_connect_attrs.cc new file mode 100644 index 00000000000..9e1804b7294 --- /dev/null +++ b/storage/perfschema/table_session_connect_attrs.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */ + +#include "table_session_connect_attrs.h" + +THR_LOCK table_session_connect_attrs::m_table_lock; + +PFS_engine_table_share +table_session_connect_attrs::m_share= +{ + { C_STRING_WITH_LEN("session_connect_attrs") }, + &pfs_readonly_acl, + &table_session_connect_attrs::create, + NULL, /* write_row */ + NULL, /* delete_all_rows */ + NULL, /* get_row_count */ + 1000, /* records */ + sizeof(pos_connect_attr_by_thread_by_attr), /* ref length */ + &m_table_lock, + &m_field_def, + false /* checked */ +}; + +PFS_engine_table* table_session_connect_attrs::create() +{ + return new table_session_connect_attrs(); +} + +table_session_connect_attrs::table_session_connect_attrs() + : table_session_connect(&m_share) +{} diff --git a/storage/perfschema/table_session_connect_attrs.h b/storage/perfschema/table_session_connect_attrs.h new file mode 100644 index 00000000000..b10b106ba0d --- /dev/null +++ b/storage/perfschema/table_session_connect_attrs.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */ + +#ifndef TABLE_SESSION_CONNECT_ATTRS_H +#define TABLE_SESSION_CONNECT_ATTRS_H + +#include "table_session_connect.h" +/** + \addtogroup Performance_schema_tables + @{ +*/ + +/** Table PERFORMANCE_SCHEMA.SESSION_CONNECT_ATTRS. */ +class table_session_connect_attrs : public table_session_connect +{ +public: + /** Table share */ + static PFS_engine_table_share m_share; + /** Table builder */ + static PFS_engine_table* create(); + +protected: + table_session_connect_attrs(); + +public: + ~table_session_connect_attrs() + {} + +private: + /** Table share lock. */ + static THR_LOCK m_table_lock; +}; + +/** @} */ +#endif diff --git a/storage/perfschema/table_setup_actors.cc b/storage/perfschema/table_setup_actors.cc index 15d3d9d22a8..91dbb942ead 100644 --- a/storage/perfschema/table_setup_actors.cc +++ b/storage/perfschema/table_setup_actors.cc @@ -105,6 +105,9 @@ int table_setup_actors::write_row(TABLE *table, unsigned char *buf, } } + if (user->length() == 0 || host->length() == 0 || role->length() == 0) + return HA_ERR_WRONG_COMMAND; + return insert_setup_actor(user, host, role); } @@ -264,39 +267,13 @@ int table_setup_actors::delete_row_values(TABLE *table, const unsigned char *buf, Field **fields) { - Field *f; - String user_data("", 0, &my_charset_utf8_bin); - String host_data("", 0, &my_charset_utf8_bin); - String role_data("", 0, &my_charset_utf8_bin); - String *user= NULL; - String *host= NULL; - String *role= NULL; - - for (; (f= *fields) ; fields++) - { - if (bitmap_is_set(table->read_set, f->field_index)) - { - switch(f->field_index) - { - case 0: /* HOST */ - host= get_field_char_utf8(f, &host_data); - break; - case 1: /* USER */ - user= get_field_char_utf8(f, &user_data); - break; - case 2: /* ROLE */ - role= get_field_char_utf8(f, &role_data); - break; - 
default: - DBUG_ASSERT(false); - } - } - } + DBUG_ASSERT(m_row_exists); - DBUG_ASSERT(user != NULL); - DBUG_ASSERT(host != NULL); - DBUG_ASSERT(role != NULL); + CHARSET_INFO *cs= &my_charset_utf8_bin; + String user(m_row.m_username, m_row.m_username_length, cs); + String role(m_row.m_rolename, m_row.m_rolename_length, cs); + String host(m_row.m_hostname, m_row.m_hostname_length, cs); - return delete_setup_actor(user, host, role); + return delete_setup_actor(&user, &host, &role); } diff --git a/storage/perfschema/table_setup_objects.cc b/storage/perfschema/table_setup_objects.cc index 33e360e989b..11fab913ac4 100644 --- a/storage/perfschema/table_setup_objects.cc +++ b/storage/perfschema/table_setup_objects.cc @@ -339,42 +339,15 @@ int table_setup_objects::delete_row_values(TABLE *table, const unsigned char *buf, Field **fields) { - int result; - Field *f; - enum_object_type object_type= OBJECT_TYPE_TABLE; - String object_schema_data("", 0, &my_charset_utf8_bin); - String object_name_data("", 0, &my_charset_utf8_bin); - String *object_schema= NULL; - String *object_name= NULL; + DBUG_ASSERT(m_row_exists); - for (; (f= *fields) ; fields++) - { - if (bitmap_is_set(table->read_set, f->field_index)) - { - switch(f->field_index) - { - case 0: /* OBJECT_TYPE */ - object_type= (enum_object_type) get_field_enum(f); - break; - case 1: /* OBJECT_SCHEMA */ - object_schema= get_field_varchar_utf8(f, &object_schema_data); - break; - case 2: /* OBJECT_NAME */ - object_name= get_field_varchar_utf8(f, &object_name_data); - break; - case 3: /* ENABLED */ - case 4: /* TIMED */ - break; - default: - DBUG_ASSERT(false); - } - } - } + CHARSET_INFO *cs= &my_charset_utf8_bin; + enum_object_type object_type= OBJECT_TYPE_TABLE; + String object_schema(m_row.m_schema_name, m_row.m_schema_name_length, cs); + String object_name(m_row.m_object_name, m_row.m_object_name_length, cs); - DBUG_ASSERT(object_schema != NULL); - DBUG_ASSERT(object_name != NULL); + int result= 
delete_setup_object(object_type, &object_schema, &object_name); - result= delete_setup_object(object_type, object_schema, object_name); if (result == 0) result= update_derived_flags(); return result; diff --git a/storage/perfschema/table_socket_instances.cc b/storage/perfschema/table_socket_instances.cc index f913c8fcc65..0fa1d2b1a3a 100644 --- a/storage/perfschema/table_socket_instances.cc +++ b/storage/perfschema/table_socket_instances.cc @@ -42,7 +42,7 @@ static const TABLE_FIELD_TYPE field_types[]= }, { { C_STRING_WITH_LEN("THREAD_ID") }, - { C_STRING_WITH_LEN("int(11)") }, + { C_STRING_WITH_LEN("bigint(20)") }, { NULL, 0} }, { @@ -205,7 +205,7 @@ int table_socket_instances::read_row_values(TABLE *table, break; case 2: /* THREAD_ID */ if (m_row.m_thread_id_set) - set_field_ulong(f, m_row.m_thread_id); + set_field_ulonglong(f, m_row.m_thread_id); else f->set_null(); break; diff --git a/storage/perfschema/table_socket_instances.h b/storage/perfschema/table_socket_instances.h index 2a80aeaa76a..080f11c1ba8 100644 --- a/storage/perfschema/table_socket_instances.h +++ b/storage/perfschema/table_socket_instances.h @@ -39,7 +39,7 @@ struct row_socket_instances /** Column OBJECT_INSTANCE_BEGIN */ const void *m_identity; /** Column THREAD_ID */ - uint m_thread_id; + ulonglong m_thread_id; /** True if thread_is is set */ bool m_thread_id_set; /** Column SOCKET_ID */ diff --git a/storage/perfschema/table_sync_instances.cc b/storage/perfschema/table_sync_instances.cc index 9631c5fb205..4d7c48efdc1 100644 --- a/storage/perfschema/table_sync_instances.cc +++ b/storage/perfschema/table_sync_instances.cc @@ -43,7 +43,7 @@ static const TABLE_FIELD_TYPE mutex_field_types[]= }, { { C_STRING_WITH_LEN("LOCKED_BY_THREAD_ID") }, - { C_STRING_WITH_LEN("int(11)") }, + { C_STRING_WITH_LEN("bigint(20)") }, { NULL, 0} } }; @@ -178,7 +178,7 @@ int table_mutex_instances::read_row_values(TABLE *table, break; case 2: /* LOCKED_BY_THREAD_ID */ if (m_row.m_locked) - set_field_ulong(f, 
m_row.m_locked_by_thread_id); + set_field_ulonglong(f, m_row.m_locked_by_thread_id); else f->set_null(); break; @@ -207,7 +207,7 @@ static const TABLE_FIELD_TYPE rwlock_field_types[]= }, { { C_STRING_WITH_LEN("WRITE_LOCKED_BY_THREAD_ID") }, - { C_STRING_WITH_LEN("int(11)") }, + { C_STRING_WITH_LEN("bigint(20)") }, { NULL, 0} }, { @@ -351,7 +351,7 @@ int table_rwlock_instances::read_row_values(TABLE *table, break; case 2: /* WRITE_LOCKED_BY_THREAD_ID */ if (m_row.m_write_locked) - set_field_ulong(f, m_row.m_write_locked_by_thread_id); + set_field_ulonglong(f, m_row.m_write_locked_by_thread_id); else f->set_null(); break; diff --git a/storage/perfschema/table_sync_instances.h b/storage/perfschema/table_sync_instances.h index b6fc78e1cd5..ff7b2765a11 100644 --- a/storage/perfschema/table_sync_instances.h +++ b/storage/perfschema/table_sync_instances.h @@ -45,7 +45,7 @@ struct row_mutex_instances /** True if column LOCKED_BY_THREAD_ID is not null. */ bool m_locked; /** Column LOCKED_BY_THREAD_ID. */ - ulong m_locked_by_thread_id; + ulonglong m_locked_by_thread_id; }; /** Table PERFORMANCE_SCHEMA.MUTEX_INSTANCES. */ @@ -102,7 +102,7 @@ struct row_rwlock_instances /** True if column WRITE_LOCKED_BY_THREAD_ID is not null. */ bool m_write_locked; /** Column WRITE_LOCKED_BY_THREAD_ID. */ - ulong m_write_locked_by_thread_id; + ulonglong m_write_locked_by_thread_id; /** Column READ_LOCKED_BY_COUNT. */ ulong m_readers; }; diff --git a/storage/perfschema/table_threads.cc b/storage/perfschema/table_threads.cc index 91300d6b67e..b1ec2ad754e 100644 --- a/storage/perfschema/table_threads.cc +++ b/storage/perfschema/table_threads.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved. +/* Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -26,7 +26,7 @@ static const TABLE_FIELD_TYPE field_types[]= { { { C_STRING_WITH_LEN("THREAD_ID") }, - { C_STRING_WITH_LEN("int(11)") }, + { C_STRING_WITH_LEN("bigint(20)") }, { NULL, 0} }, { @@ -41,7 +41,7 @@ static const TABLE_FIELD_TYPE field_types[]= }, { { C_STRING_WITH_LEN("PROCESSLIST_ID") }, - { C_STRING_WITH_LEN("int(11)") }, + { C_STRING_WITH_LEN("bigint(20)") }, { NULL, 0} }, { @@ -81,7 +81,7 @@ static const TABLE_FIELD_TYPE field_types[]= }, { { C_STRING_WITH_LEN("PARENT_THREAD_ID") }, - { C_STRING_WITH_LEN("int(11)") }, + { C_STRING_WITH_LEN("bigint(20)") }, { NULL, 0} }, { @@ -129,6 +129,7 @@ table_threads::table_threads() void table_threads::make_row(PFS_thread *pfs) { pfs_lock lock; + pfs_lock processlist_lock; PFS_thread_class *safe_class; m_row_exists= false; @@ -142,7 +143,7 @@ void table_threads::make_row(PFS_thread *pfs) m_row.m_thread_internal_id= pfs->m_thread_internal_id; m_row.m_parent_thread_internal_id= pfs->m_parent_thread_internal_id; - m_row.m_thread_id= pfs->m_thread_id; + m_row.m_processlist_id= pfs->m_processlist_id; m_row.m_name= safe_class->m_name; m_row.m_name_length= safe_class->m_name_length; @@ -166,12 +167,30 @@ void table_threads::make_row(PFS_thread *pfs) m_row.m_command= pfs->m_command; m_row.m_start_time= pfs->m_start_time; + + /* Protect this reader against attribute changes. */ + pfs->m_processlist_lock.begin_optimistic_lock(&processlist_lock); + /* FIXME: need to copy it ? */ m_row.m_processlist_state_ptr= pfs->m_processlist_state_ptr; m_row.m_processlist_state_length= pfs->m_processlist_state_length; /* FIXME: need to copy it ? */ m_row.m_processlist_info_ptr= pfs->m_processlist_info_ptr; m_row.m_processlist_info_length= pfs->m_processlist_info_length; + + if (! 
pfs->m_processlist_lock.end_optimistic_lock(& processlist_lock)) + { + /* + Columns PROCESSLIST_STATE or PROCESSLIST_INFO are being + updated while we read them, and are unsafe to use. + Do not discard the entire row. + Do not loop waiting for a stable value. + Just return NULL values for these columns. + */ + m_row.m_processlist_state_length= 0; + m_row.m_processlist_info_length= 0; + } + m_row.m_enabled_ptr= &pfs->m_enabled; if (pfs->m_lock.end_optimistic_lock(& lock)) @@ -200,20 +219,20 @@ int table_threads::read_row_values(TABLE *table, switch(f->field_index) { case 0: /* THREAD_ID */ - set_field_ulong(f, m_row.m_thread_internal_id); + set_field_ulonglong(f, m_row.m_thread_internal_id); break; case 1: /* NAME */ set_field_varchar_utf8(f, m_row.m_name, m_row.m_name_length); break; case 2: /* TYPE */ - if (m_row.m_thread_id != 0) + if (m_row.m_processlist_id != 0) set_field_varchar_utf8(f, "FOREGROUND", 10); else set_field_varchar_utf8(f, "BACKGROUND", 10); break; case 3: /* PROCESSLIST_ID */ - if (m_row.m_thread_id != 0) - set_field_ulong(f, m_row.m_thread_id); + if (m_row.m_processlist_id != 0) + set_field_ulonglong(f, m_row.m_processlist_id); else f->set_null(); break; @@ -239,7 +258,7 @@ int table_threads::read_row_values(TABLE *table, f->set_null(); break; case 7: /* PROCESSLIST_COMMAND */ - if (m_row.m_thread_id != 0) + if (m_row.m_processlist_id != 0) set_field_varchar_utf8(f, command_name[m_row.m_command].str, command_name[m_row.m_command].length); else @@ -271,7 +290,7 @@ int table_threads::read_row_values(TABLE *table, break; case 11: /* PARENT_THREAD_ID */ if (m_row.m_parent_thread_internal_id != 0) - set_field_ulong(f, m_row.m_parent_thread_internal_id); + set_field_ulonglong(f, m_row.m_parent_thread_internal_id); else f->set_null(); break; diff --git a/storage/perfschema/table_threads.h b/storage/perfschema/table_threads.h index 9819822f8c8..bce45c0cbce 100644 --- a/storage/perfschema/table_threads.h +++ b/storage/perfschema/table_threads.h @@ -32,9 
+32,9 @@ struct PFS_thread; struct row_threads { /** Column THREAD_ID. */ - ulong m_thread_internal_id; + ulonglong m_thread_internal_id; /** Column PROCESSLIST_ID. */ - ulong m_thread_id; + ulonglong m_processlist_id; /** Column NAME. */ const char* m_name; /** Length in bytes of @c m_name. */ @@ -66,7 +66,7 @@ struct row_threads /** Column INSTRUMENTED. */ bool *m_enabled_ptr; /** Column PARENT_THREAD_ID. */ - ulong m_parent_thread_internal_id; + ulonglong m_parent_thread_internal_id; }; /** Table PERFORMANCE_SCHEMA.THREADS. */ diff --git a/storage/perfschema/table_tiws_by_index_usage.cc b/storage/perfschema/table_tiws_by_index_usage.cc index d354c40d3ed..71455793516 100644 --- a/storage/perfschema/table_tiws_by_index_usage.cc +++ b/storage/perfschema/table_tiws_by_index_usage.cc @@ -290,15 +290,16 @@ int table_tiws_by_index_usage::rnd_next(void) table_share= &table_share_array[m_pos.m_index_1]; if (table_share->m_lock.is_populated()) { - if (m_pos.m_index_2 < table_share->m_key_count) + uint safe_key_count= sanitize_index_count(table_share->m_key_count); + if (m_pos.m_index_2 < safe_key_count) { make_row(table_share, m_pos.m_index_2); m_next_pos.set_after(&m_pos); return 0; } - if (m_pos.m_index_2 <= MAX_KEY) + if (m_pos.m_index_2 <= MAX_INDEXES) { - m_pos.m_index_2= MAX_KEY; + m_pos.m_index_2= MAX_INDEXES; make_row(table_share, m_pos.m_index_2); m_next_pos.set_after(&m_pos); return 0; @@ -319,12 +320,13 @@ table_tiws_by_index_usage::rnd_pos(const void *pos) table_share= &table_share_array[m_pos.m_index_1]; if (table_share->m_lock.is_populated()) { - if (m_pos.m_index_2 < table_share->m_key_count) + uint safe_key_count= sanitize_index_count(table_share->m_key_count); + if (m_pos.m_index_2 < safe_key_count) { make_row(table_share, m_pos.m_index_2); return 0; } - if (m_pos.m_index_2 == MAX_KEY) + if (m_pos.m_index_2 == MAX_INDEXES) { make_row(table_share, m_pos.m_index_2); return 0; diff --git a/storage/perfschema/unittest/CMakeLists.txt 
b/storage/perfschema/unittest/CMakeLists.txt index 757bc24c566..c3a7fe5c72f 100644 --- a/storage/perfschema/unittest/CMakeLists.txt +++ b/storage/perfschema/unittest/CMakeLists.txt @@ -1,5 +1,4 @@ -# Copyright (c) 2009, 2010 Sun Microsystems, Inc. -# Use is subject to license terms. +# Copyright (c) 2009, 2012, Oracle and/or its affiliates. All rights reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -11,18 +10,44 @@ # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software Foundation, -# 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02111-1307 USA INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/include/mysql ${CMAKE_SOURCE_DIR}/regex ${CMAKE_SOURCE_DIR}/sql - ${CMAKE_SOURCE_DIR}/extra/yassl/include + ${SSL_INCLUDE_DIRS} ${CMAKE_SOURCE_DIR}/unittest/mytap ${CMAKE_SOURCE_DIR}/storage/perfschema) -ADD_DEFINITIONS(-DMYSQL_SERVER) +ADD_DEFINITIONS(-DMYSQL_SERVER ${SSL_DEFINES}) MY_ADD_TESTS(pfs_instr_class pfs_instr_class-oom pfs_instr pfs_instr-oom pfs_account-oom pfs_host-oom pfs_user-oom pfs EXT "cc" LINK_LIBRARIES perfschema mysys) + +IF(WIN32) + SET(MYSQLD_EXTRA_SOURCES ${CMAKE_SOURCE_DIR}/sql/nt_servc.cc) +ENDIF() + +# We need the server libs to test the blob parser. +# Add sql_builtin.cc here, to force linkage of plugins below. 
+# Also add mysys/string.c (see Bug#45488) +ADD_EXECUTABLE(pfs_connect_attr-t + pfs_connect_attr-t.cc + ${CMAKE_BINARY_DIR}/sql/sql_builtin.cc + ${CMAKE_SOURCE_DIR}/mysys/string.c + ${MYSQLD_EXTRA_SOURCES} +) +ADD_DEPENDENCIES(pfs_connect_attr-t GenServerSource) +TARGET_LINK_LIBRARIES(pfs_connect_attr-t mytap perfschema) +# We need to explicitly link in everything referenced in sql/sql_builtin.cc +TARGET_LINK_LIBRARIES(pfs_connect_attr-t ${MYSQLD_STATIC_PLUGIN_LIBS}) +TARGET_LINK_LIBRARIES(pfs_connect_attr-t sql binlog rpl master slave sql) +TARGET_LINK_LIBRARIES(pfs_connect_attr-t mysys mysys_ssl) +TARGET_LINK_LIBRARIES(pfs_connect_attr-t vio ${SSL_LIBRARIES}) +TARGET_LINK_LIBRARIES(pfs_connect_attr-t strings dbug regex mysys zlib) +ADD_TEST(pfs_connect_attr pfs_connect_attr-t) + +# On windows, pfs_connect_attr-t may depend on openssl dlls. +COPY_OPENSSL_DLLS(copy_openssl_pfs_unittest) diff --git a/storage/perfschema/unittest/pfs-t.cc b/storage/perfschema/unittest/pfs-t.cc index 6b30c0cc498..31f68195f1b 100644 --- a/storage/perfschema/unittest/pfs-t.cc +++ b/storage/perfschema/unittest/pfs-t.cc @@ -111,6 +111,7 @@ void test_bootstrap() param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; param.m_digest_sizing= 0; + param.m_session_connect_attrs_sizing= 0; boot= initialize_performance_schema(& param); ok(boot != NULL, "boot"); @@ -168,6 +169,7 @@ PSI * load_perfschema() param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; param.m_digest_sizing= 0; + param.m_session_connect_attrs_sizing= 0; /* test_bootstrap() covered this, assuming it just works */ boot= initialize_performance_schema(& param); @@ -759,21 +761,21 @@ void test_init_disabled() /* disabled S-A + disabled T-1: no instrumentation */ socket_class_A->m_enabled= false; - socket_A1= psi->init_socket(socket_key_A, NULL); + socket_A1= psi->init_socket(socket_key_A, NULL, NULL, 0); ok(socket_A1 == NULL, "socket_A1 not 
instrumented"); /* enabled S-A + disabled T-1: instrumentation (for later) */ socket_class_A->m_enabled= true; - socket_A1= psi->init_socket(socket_key_A, NULL); + socket_A1= psi->init_socket(socket_key_A, NULL, NULL, 0); ok(socket_A1 != NULL, "socket_A1 instrumented"); /* broken key + disabled T-1: no instrumentation */ socket_class_A->m_enabled= true; - socket_A1= psi->init_socket(0, NULL); + socket_A1= psi->init_socket(0, NULL, NULL, 0); ok(socket_A1 == NULL, "socket key 0 not instrumented"); - socket_A1= psi->init_socket(99, NULL); + socket_A1= psi->init_socket(99, NULL, NULL, 0); ok(socket_A1 == NULL, "broken socket key not instrumented"); /* Pretend thread T-1 is enabled */ @@ -892,16 +894,16 @@ void test_init_disabled() /* enabled S-A + enabled T-1: instrumentation */ socket_class_A->m_enabled= true; - socket_A1= psi->init_socket(socket_key_A, NULL); + socket_A1= psi->init_socket(socket_key_A, NULL, NULL, 0); ok(socket_A1 != NULL, "instrumented"); psi->destroy_socket(socket_A1); /* broken key + enabled T-1: no instrumentation */ socket_class_A->m_enabled= true; - socket_A1= psi->init_socket(0, NULL); + socket_A1= psi->init_socket(0, NULL, NULL, 0); ok(socket_A1 == NULL, "not instrumented"); - socket_A1= psi->init_socket(99, NULL); + socket_A1= psi->init_socket(99, NULL, NULL, 0); ok(socket_A1 == NULL, "not instrumented"); /* Pretend the running thread is not instrumented */ @@ -996,21 +998,21 @@ void test_init_disabled() /* disabled S-A + unknown thread: no instrumentation */ socket_class_A->m_enabled= false; - socket_A1= psi->init_socket(socket_key_A, NULL); + socket_A1= psi->init_socket(socket_key_A, NULL, NULL, 0); ok(socket_A1 == NULL, "socket_A1 not instrumented"); /* enabled S-A + unknown thread: instrumentation (for later) */ socket_class_A->m_enabled= true; - socket_A1= psi->init_socket(socket_key_A, NULL); + socket_A1= psi->init_socket(socket_key_A, NULL, NULL, 0); ok(socket_A1 != NULL, "socket_A1 instrumented"); /* broken key + unknown thread: no 
instrumentation */ socket_class_A->m_enabled= true; - socket_A1= psi->init_socket(0, NULL); + socket_A1= psi->init_socket(0, NULL, NULL, 0); ok(socket_A1 == NULL, "socket key 0 not instrumented"); - socket_A1= psi->init_socket(99, NULL); + socket_A1= psi->init_socket(99, NULL, NULL, 0); ok(socket_A1 == NULL, "broken socket key not instrumented"); shutdown_performance_schema(); @@ -1126,7 +1128,7 @@ void test_locker_disabled() ok(file_A1 != NULL, "instrumented"); socket_class_A->m_enabled= true; - socket_A1= psi->init_socket(socket_key_A, NULL); + socket_A1= psi->init_socket(socket_key_A, NULL, NULL, 0); ok(socket_A1 != NULL, "instrumented"); /* Socket lockers require a thread owner */ @@ -1294,10 +1296,10 @@ void test_locker_disabled() cond_locker= psi->start_cond_wait(&cond_state, cond_A1, mutex_A1, PSI_COND_WAIT, __FILE__, __LINE__); ok(cond_locker != NULL, "locker"); psi->end_cond_wait(cond_locker, 0); - file_locker= psi->get_thread_file_name_locker(&file_state, file_key_A, PSI_FILE_OPEN, "xxx", NULL); + file_locker= psi->get_thread_file_name_locker(&file_state, file_key_A, PSI_FILE_STREAM_OPEN, "xxx", NULL); ok(file_locker != NULL, "locker"); psi->start_file_open_wait(file_locker, __FILE__, __LINE__); - psi->end_file_open_wait(file_locker); + psi->end_file_open_wait(file_locker, NULL); file_locker= psi->get_thread_file_stream_locker(&file_state, file_A1, PSI_FILE_READ); ok(file_locker != NULL, "locker"); psi->start_file_wait(file_locker, 10, __FILE__, __LINE__); @@ -1314,7 +1316,7 @@ void test_locker_disabled() /* ---------------------------------------------- */ socket_class_A->m_enabled= true; - socket_A1= psi->init_socket(socket_key_A, NULL); + socket_A1= psi->init_socket(socket_key_A, NULL, NULL, 0); ok(socket_A1 != NULL, "instrumented"); /* Socket thread owner has not been set */ socket_locker= psi->start_socket_wait(&socket_state, socket_A1, PSI_SOCKET_SEND, 12, "foo.cc", 12); @@ -1485,6 +1487,8 @@ void test_event_name_index() memset(& param, 0xFF, 
sizeof(param)); param.m_enabled= true; + /* NOTE: Need to add 3 to each index: table io, table lock, idle */ + /* Per mutex info waits should be at [0..9] */ param.m_mutex_class_sizing= 10; /* Per rwlock info waits should be at [10..29] */ @@ -1509,6 +1513,7 @@ void test_event_name_index() param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; param.m_digest_sizing= 0; + param.m_session_connect_attrs_sizing= 0; param.m_mutex_sizing= 0; param.m_rwlock_sizing= 0; @@ -1540,10 +1545,10 @@ void test_event_name_index() psi->register_mutex("X", dummy_mutexes, 2); mutex_class= find_mutex_class(dummy_mutex_key_1); ok(mutex_class != NULL, "mutex class 1"); - ok(mutex_class->m_event_name_index == 0, "index 0"); + ok(mutex_class->m_event_name_index == 3, "index 3"); mutex_class= find_mutex_class(dummy_mutex_key_2); ok(mutex_class != NULL, "mutex class 2"); - ok(mutex_class->m_event_name_index == 1, "index 1"); + ok(mutex_class->m_event_name_index == 4, "index 4"); PFS_rwlock_class *rwlock_class; PSI_rwlock_key dummy_rwlock_key_1; @@ -1557,10 +1562,10 @@ void test_event_name_index() psi->register_rwlock("X", dummy_rwlocks, 2); rwlock_class= find_rwlock_class(dummy_rwlock_key_1); ok(rwlock_class != NULL, "rwlock class 1"); - ok(rwlock_class->m_event_name_index == 10, "index 10"); + ok(rwlock_class->m_event_name_index == 13, "index 13"); rwlock_class= find_rwlock_class(dummy_rwlock_key_2); ok(rwlock_class != NULL, "rwlock class 2"); - ok(rwlock_class->m_event_name_index == 11, "index 11"); + ok(rwlock_class->m_event_name_index == 14, "index 14"); PFS_cond_class *cond_class; PSI_cond_key dummy_cond_key_1; @@ -1574,10 +1579,10 @@ void test_event_name_index() psi->register_cond("X", dummy_conds, 2); cond_class= find_cond_class(dummy_cond_key_1); ok(cond_class != NULL, "cond class 1"); - ok(cond_class->m_event_name_index == 30, "index 30"); + ok(cond_class->m_event_name_index == 33, "index 33"); cond_class= find_cond_class(dummy_cond_key_2); 
ok(cond_class != NULL, "cond class 2"); - ok(cond_class->m_event_name_index == 31, "index 31"); + ok(cond_class->m_event_name_index == 34, "index 34"); PFS_file_class *file_class; PSI_file_key dummy_file_key_1; @@ -1591,10 +1596,10 @@ void test_event_name_index() psi->register_file("X", dummy_files, 2); file_class= find_file_class(dummy_file_key_1); ok(file_class != NULL, "file class 1"); - ok(file_class->m_event_name_index == 70, "index 70"); + ok(file_class->m_event_name_index == 73, "index 73"); file_class= find_file_class(dummy_file_key_2); ok(file_class != NULL, "file class 2"); - ok(file_class->m_event_name_index == 71, "index 71"); + ok(file_class->m_event_name_index == 74, "index 74"); PFS_socket_class *socket_class; PSI_socket_key dummy_socket_key_1; @@ -1608,13 +1613,13 @@ void test_event_name_index() psi->register_socket("X", dummy_sockets, 2); socket_class= find_socket_class(dummy_socket_key_1); ok(socket_class != NULL, "socket class 1"); - ok(socket_class->m_event_name_index == 150, "index 150"); + ok(socket_class->m_event_name_index == 153, "index 153"); socket_class= find_socket_class(dummy_socket_key_2); ok(socket_class != NULL, "socket class 2"); - ok(socket_class->m_event_name_index == 151, "index 151"); + ok(socket_class->m_event_name_index == 154, "index 154"); - ok(global_table_io_class.m_event_name_index == 310, "index 310"); - ok(global_table_lock_class.m_event_name_index == 311, "index 311"); + ok(global_table_io_class.m_event_name_index == 0, "index 0"); + ok(global_table_lock_class.m_event_name_index == 1, "index 1"); ok(wait_class_max= 313, "313 event names"); // 3 global classes } diff --git a/storage/perfschema/unittest/pfs_account-oom-t.cc b/storage/perfschema/unittest/pfs_account-oom-t.cc index d0c139476b0..0e48ab68ef7 100644 --- a/storage/perfschema/unittest/pfs_account-oom-t.cc +++ b/storage/perfschema/unittest/pfs_account-oom-t.cc @@ -59,6 +59,7 @@ void test_oom() param.m_statement_class_sizing= 50; 
param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; + param.m_session_connect_attrs_sizing= 0; /* Setup */ diff --git a/storage/perfschema/unittest/pfs_connect_attr-t.cc b/storage/perfschema/unittest/pfs_connect_attr-t.cc new file mode 100644 index 00000000000..7bee1d063a1 --- /dev/null +++ b/storage/perfschema/unittest/pfs_connect_attr-t.cc @@ -0,0 +1,345 @@ +/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */ + +#include <my_global.h> +#include <my_pthread.h> +#include <pfs_server.h> +#include <pfs_instr_class.h> +#include <pfs_instr.h> +#include <pfs_global.h> +#include <tap.h> + + +#include <string.h> +#include <memory.h> + +/* test helpers, to inspect data */ +bool read_nth_attr(const char *connect_attrs, uint connect_attrs_length, + const CHARSET_INFO *connect_attrs_cs, + uint ordinal, + char *attr_name, uint max_attr_name, + uint *attr_name_length, + char *attr_value, uint max_attr_value, + uint *attr_value_length); + +void test_blob_parser() +{ + char name[100], value[4096]; + unsigned char packet[10000], *ptr; + uint name_len, value_len, idx, packet_length; + bool result; + const CHARSET_INFO *cs= &my_charset_utf8_bin; + + diag("test_blob_parser"); + + result= read_nth_attr("", 0, cs, 0, + name, 32, &name_len, value, 1024, &value_len); + ok(result 
== false, "zero length blob"); + + + result= read_nth_attr("\x1", 1, cs, 0, + name, 32, &name_len, value, 1024, &value_len); + ok(result == false, "invalid key length"); + + + result= read_nth_attr("\x2k1\x1", 4, cs, 0, + name, 32, &name_len, value, 1024, &value_len); + ok(result == false, "invalid value length"); + + + result= read_nth_attr("\x2k1\x2v1", 6, cs, 0, + name, 32, &name_len, value, 1024, &value_len); + ok(result == true, "one pair return"); + ok(name_len == 2, "one pair attr name length"); + ok(!strncmp(name, "k1", name_len), "one pair attr name"); + ok(value_len == 2, "one pair value length"); + ok(!strncmp(value, "v1", value_len), "one pair value"); + + result= read_nth_attr("\x2k1\x2v1", 6, cs, 1, + name, 32, &name_len, value, 1024, &value_len); + ok(result == false, "no second arg"); + + result= read_nth_attr("\x2k1\x2v1\x2k2\x2v2", 12, cs, 1, + name, 32, &name_len, value, 1024, &value_len); + ok(result == true, "two pairs return"); + ok(name_len == 2, "two pairs attr name length"); + ok(!strncmp(name, "k2", name_len), "two pairs attr name"); + ok(value_len == 2, "two pairs value length"); + ok(!strncmp(value, "v2", value_len), "two pairs value"); + + result= read_nth_attr("\x2k1\xff\x2k2\x2v2", 12, cs, 1, + name, 32, &name_len, value, 1024, &value_len); + ok(result == false, "two pairs first value bad return"); + + result= read_nth_attr("\x2k1\x2v1\x2k2\x2v2", 10, cs, 1, + name, 32, &name_len, value, 1024, &value_len); + ok(result == false, "two pairs wrong global length"); + + result= read_nth_attr("\x21z123456789z123456789z123456789z12\x2v1", 37, cs, 0, + name, 32, &name_len, value, 1024, &value_len); + ok(result == true, "attr name overflow"); + ok(name_len == 32, "attr name overflow length"); + ok(!strncmp(name, "z123456789z123456789z123456789z1", name_len), + "attr name overflow name"); + ok(value_len == 2, "attr name overflow value length"); + ok(!strncmp(value, "v1", value_len), "attr name overflow value"); + + packet[0]= 2; + packet[1]= 
'k'; + packet[2]= '1'; + ptr= net_store_length(packet + 3, 1025); + for (idx= 0; idx < 1025; idx++) + *ptr++= '0' + (idx % 10); + packet_length= (uint) (ptr - packet); + result= read_nth_attr((char *) packet, packet_length, cs, 0, + name, 32, &name_len, value, 1024, &value_len); + ok(result == true, "attr value overflow"); + ok(name_len == 2, "attr value overflow length"); + ok(!strncmp(name, "k1", name_len), "attr value overflow name"); + ok(value_len == 1024, "attr value overflow value length"); + for (idx= 0; idx < 1024; idx++) + { + if (value[idx] != (char) ('0' + (idx % 10))) + break; + } + ok (idx == 1024, "attr value overflow value"); + + result= read_nth_attr("\x21z123456789z123456789z123456789z12\x2v1\x2k2\x2v2", + 43, cs, 1, + name, 32, &name_len, value, 1024, &value_len); + ok(result == true, "prev attr name overflow"); + ok(name_len == 2, "prev attr name overflow length"); + ok(!strncmp(name, "k2", name_len), + "prev attr name overflow name"); + ok(value_len == 2, "prev attr name overflow value length"); + ok(!strncmp(value, "v2", value_len), "prev attr name overflow value"); + + + packet[1]= 'k'; + packet[2]= '1'; + packet[3]= 2; + packet[4]= 'v'; + packet[5]= '1'; + + for(idx= 251; idx < 256; idx++) + { + packet[0]= idx; + result= read_nth_attr((char *) packet, 6, cs, 0, + name, 32, &name_len, value, 1024, &value_len); + ok(result == false, "invalid string length %d", idx); + } + + memset(packet, 0, sizeof(packet)); + for (idx=0; idx < 1660 /* *6 = 9960 */; idx++) + memcpy(packet + idx * 6, "\x2k1\x2v1", 6); + result= read_nth_attr((char *) packet, 8192, cs, 1364, + name, 32, &name_len, value, 1024, &value_len); + ok(result == true, "last valid attribute %d", 1364); + result= read_nth_attr((char *) packet, 8192, cs, 1365, + name, 32, &name_len, value, 1024, &value_len); + ok(result == false, "first attribute that's cut %d", 1365); +} + +void test_multibyte_lengths() +{ + char name[100], value[4096]; + uint name_len, value_len; + bool result; + const 
CHARSET_INFO *cs= &my_charset_utf8_bin; + + unsigned char var_len_packet[] = { + 252, 2, 0, 'k', '1', + 253, 2, 0, 0, 'v', '1', + 254, 2, 0, 0, 0, 0, 0, 0, 0, 'k', '2', + 254, 2, 0, 0, 0, 0, 0, 0, 0, 'v', '2' + }; + + result= read_nth_attr((char *) var_len_packet, sizeof(var_len_packet), cs, 0, + name, 32, &name_len, value, 1024, &value_len); + ok(result == true, "multibyte lengths return"); + ok(name_len == 2, "multibyte lengths name length"); + ok(!strncmp(name, "k1", name_len), "multibyte lengths attr name"); + ok(value_len == 2, "multibyte lengths value length"); + ok(!strncmp(value, "v1", value_len), "multibyte lengths value"); + + result= read_nth_attr((char *) var_len_packet, sizeof(var_len_packet), cs, 1, + name, 32, &name_len, value, 1024, &value_len); + ok(result == true, "multibyte lengths second attr return"); + ok(name_len == 2, "multibyte lengths second attr name length"); + ok(!strncmp(name, "k2", name_len), "multibyte lengths second attr attr name"); + ok(value_len == 2, "multibyte lengths value length"); + ok(!strncmp(value, "v2", value_len), "multibyte lengths second attr value"); +} + + +void test_utf8_parser() +{ + /* utf8 max byte length per character is 6 */ + char name[33 * 6], value[1024 * 6], packet[1500 * 6], *ptr; + uint name_len, value_len; + bool result; + const CHARSET_INFO *cs= &my_charset_utf8_bin; + + /* note : this is encoded in utf-8 */ + const char *attr1= "Георги"; + const char *val1= "Кодинов"; + const char *attr2= "Пловдив"; + const char *val2= "България"; + + ptr= packet; + *ptr++= strlen(attr1); + memcpy(ptr, attr1, strlen(attr1)); + ptr+= strlen(attr1); + *ptr++= strlen(val1); + memcpy(ptr, val1, strlen(val1)); + ptr+= strlen(val1); + + *ptr++= strlen(attr2); + memcpy(ptr, attr2, strlen(attr2)); + ptr+= strlen(attr2); + *ptr++= strlen(val2); + memcpy(ptr, val2, strlen(val2)); + ptr+= strlen(val2); + + diag("test_utf8_parser attr pair #1"); + + result= read_nth_attr((char *) packet, ptr - packet, cs, 0, + name, sizeof(name), 
&name_len, + value, sizeof(value), &value_len); + ok(result == true, "return"); + ok(name_len == strlen(attr1), "name length"); + ok(!strncmp(name, attr1, name_len), "attr name"); + ok(value_len == strlen(val1), "value length"); + ok(!strncmp(value, val1, value_len), "value"); + + diag("test_utf8_parser attr pair #2"); + result= read_nth_attr((char *) packet, ptr - packet, cs, 1, + name, sizeof(name), &name_len, + value, sizeof(value), &value_len); + ok(result == true, "return"); + ok(name_len == strlen(attr2), "name length"); + ok(!strncmp(name, attr2, name_len), "attr name"); + ok(value_len == strlen(val2), "value length"); + ok(!strncmp(value, val2, value_len), "value"); +} + + +void test_utf8_parser_bad_encoding() +{ + /* utf8 max byte length per character is 3*/ + char name[33 * 3], value[1024 * 3], packet[1500 * 3], *ptr; + uint name_len, value_len; + bool result; + const CHARSET_INFO *cs= &my_charset_utf8_bin; + + /* note : this is encoded in utf-8 */ + const char *attr= "Георги"; + const char *val= "Кодинов"; + + ptr= packet; + *ptr++= strlen(attr); + memcpy(ptr, attr, strlen(attr)); + ptr[0]= 0xFA; // invalid UTF-8 char + ptr+= strlen(attr); + *ptr++= strlen(val); + memcpy(ptr, val, strlen(val)); + ptr+= strlen(val); + + diag("test_utf8_parser_bad_encoding"); + + result= read_nth_attr((char *) packet, ptr - packet, cs, 0, + name, sizeof(name), &name_len, + value, sizeof(value), &value_len); + ok(result == false, "return"); +} + +const CHARSET_INFO *cs_cp1251; + +void test_cp1251_parser() +{ + /* utf8 max byte length per character is 3*/ + char name[33 * 3], value[1024 * 3], packet[1500 * 3], *ptr; + uint name_len, value_len; + bool result; + + /* note : this is Георги in windows-1251 */ + const char *attr1= "\xc3\xe5\xee\xf0\xe3\xe8"; + /* note : this is Кодинов in windows-1251 */ + const char *val1= "\xca\xee\xe4\xe8\xed\xee\xe2"; + /* note : this is Пловдив in windows-1251 */ + const char *attr2= "\xcf\xeb\xee\xe2\xe4\xe8\xe2"; + /* note : this is 
България in windows-1251 */ + const char *val2= "\xc1\xfa\xeb\xe3\xe0\xf0\xe8\xff"; + + ptr= packet; + *ptr++= strlen(attr1); + memcpy(ptr, attr1, strlen(attr1)); + ptr+= strlen(attr1); + *ptr++= strlen(val1); + memcpy(ptr, val1, strlen(val1)); + ptr+= strlen(val1); + + *ptr++= strlen(attr2); + memcpy(ptr, attr2, strlen(attr2)); + ptr+= strlen(attr2); + *ptr++= strlen(val2); + memcpy(ptr, val2, strlen(val2)); + ptr+= strlen(val2); + + diag("test_cp1251_parser attr pair #1"); + + result= read_nth_attr((char *) packet, ptr - packet, cs_cp1251, 0, + name, sizeof(name), &name_len, + value, sizeof(value), &value_len); + ok(result == true, "return"); + /* need to compare to the UTF-8 equivalents */ + ok(name_len == strlen("Георги"), "name length"); + ok(!strncmp(name, "Георги", name_len), "attr name"); + ok(value_len == strlen("Кодинов"), "value length"); + ok(!strncmp(value, "Кодинов", value_len), "value"); + + diag("test_cp1251_parser attr pair #2"); + result= read_nth_attr((char *) packet, ptr - packet, cs_cp1251, 1, + name, sizeof(name), &name_len, + value, sizeof(value), &value_len); + ok(result == true, "return"); + /* need to compare to the UTF-8 equivalents */ + ok(name_len == strlen("Пловдив"), "name length"); + ok(!strncmp(name, "Пловдив", name_len), "attr name"); + ok(value_len == strlen("България"), "value length"); + ok(!strncmp(value, "България", value_len), "value"); +} + + +void do_all_tests() +{ + test_blob_parser(); + test_multibyte_lengths(); + test_utf8_parser(); + test_utf8_parser_bad_encoding(); + test_cp1251_parser(); +} + +int main(int, char **) +{ + MY_INIT("pfs_connect_attr-t"); + + cs_cp1251= get_charset_by_csname("cp1251", MY_CS_PRIMARY, MYF(0)); + if (!cs_cp1251) + diag("skipping the cp1251 tests : missing character set"); + plan(59 + (cs_cp1251 ? 
10 : 0)); + do_all_tests(); + return 0; +} diff --git a/storage/perfschema/unittest/pfs_host-oom-t.cc b/storage/perfschema/unittest/pfs_host-oom-t.cc index a4fb36b0b08..cc445620496 100644 --- a/storage/perfschema/unittest/pfs_host-oom-t.cc +++ b/storage/perfschema/unittest/pfs_host-oom-t.cc @@ -59,6 +59,7 @@ void test_oom() param.m_statement_class_sizing= 50; param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; + param.m_session_connect_attrs_sizing= 0; /* Setup */ diff --git a/storage/perfschema/unittest/pfs_instr-oom-t.cc b/storage/perfschema/unittest/pfs_instr-oom-t.cc index 41bb4ed6c5a..5d9873d7927 100644 --- a/storage/perfschema/unittest/pfs_instr-oom-t.cc +++ b/storage/perfschema/unittest/pfs_instr-oom-t.cc @@ -63,6 +63,7 @@ void test_oom() param.m_statement_class_sizing= 0; param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; + param.m_session_connect_attrs_sizing= 0; init_event_name_sizing(& param); rc= init_instruments(& param); @@ -98,6 +99,7 @@ void test_oom() param.m_statement_class_sizing= 0; param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; + param.m_session_connect_attrs_sizing= 0; init_event_name_sizing(& param); rc= init_instruments(& param); @@ -133,6 +135,7 @@ void test_oom() param.m_statement_class_sizing= 0; param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; + param.m_session_connect_attrs_sizing= 0; init_event_name_sizing(& param); rc= init_instruments(& param); @@ -168,6 +171,7 @@ void test_oom() param.m_statement_class_sizing= 0; param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; + param.m_session_connect_attrs_sizing= 0; init_event_name_sizing(& param); rc= init_instruments(& param); @@ -201,6 +205,7 @@ void test_oom() param.m_statement_class_sizing= 0; param.m_events_statements_history_sizing= 0; 
param.m_events_statements_history_long_sizing= 0; + param.m_session_connect_attrs_sizing= 0; init_event_name_sizing(& param); rc= init_instruments(& param); @@ -236,6 +241,7 @@ void test_oom() param.m_statement_class_sizing= 0; param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; + param.m_session_connect_attrs_sizing= 0; init_event_name_sizing(& param); rc= init_instruments(& param); @@ -271,6 +277,7 @@ void test_oom() param.m_statement_class_sizing= 0; param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; + param.m_session_connect_attrs_sizing= 0; init_event_name_sizing(& param); rc= init_instruments(& param); @@ -308,6 +315,7 @@ void test_oom() param.m_statement_class_sizing= 0; param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; + param.m_session_connect_attrs_sizing= 0; stub_alloc_fails_after_count= 2; init_event_name_sizing(& param); @@ -341,6 +349,7 @@ void test_oom() param.m_statement_class_sizing= 0; param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; + param.m_session_connect_attrs_sizing= 0; stub_alloc_fails_after_count= 2; init_event_name_sizing(& param); @@ -383,8 +392,9 @@ void test_oom() param.m_statement_class_sizing= 0; param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; + param.m_session_connect_attrs_sizing= 0; - stub_alloc_fails_after_count= 2; + stub_alloc_fails_after_count= 1; init_event_name_sizing(& param); rc= init_instruments(& param); ok(rc == 1, "oom (per thread waits)"); @@ -417,6 +427,7 @@ void test_oom() param.m_statement_class_sizing= 0; param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; + param.m_session_connect_attrs_sizing= 0; stub_alloc_fails_after_count= 3; init_event_name_sizing(& param); @@ -451,6 +462,7 @@ void test_oom() param.m_statement_class_sizing= 0; 
param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; + param.m_session_connect_attrs_sizing= 0; stub_alloc_fails_after_count= 2; init_event_name_sizing(& param); @@ -485,6 +497,7 @@ void test_oom() param.m_statement_class_sizing= 0; param.m_events_statements_history_sizing= 10; param.m_events_statements_history_long_sizing= 0; + param.m_session_connect_attrs_sizing= 0; stub_alloc_fails_after_count= 2; init_event_name_sizing(& param); @@ -519,6 +532,7 @@ void test_oom() param.m_statement_class_sizing= 50; param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; + param.m_session_connect_attrs_sizing= 0; stub_alloc_fails_after_count= 2; init_event_name_sizing(& param); @@ -553,6 +567,7 @@ void test_oom() param.m_statement_class_sizing= 0; param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; + param.m_session_connect_attrs_sizing= 0; stub_alloc_fails_after_count= 1; init_event_name_sizing(& param); @@ -587,6 +602,7 @@ void test_oom() param.m_statement_class_sizing= 0; param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; + param.m_session_connect_attrs_sizing= 0; stub_alloc_fails_after_count= 3; init_event_name_sizing(& param); @@ -624,6 +640,7 @@ void test_oom() param.m_statement_class_sizing= 20; param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; + param.m_session_connect_attrs_sizing= 0; stub_alloc_fails_after_count= 3; init_event_name_sizing(& param); diff --git a/storage/perfschema/unittest/pfs_instr-t.cc b/storage/perfschema/unittest/pfs_instr-t.cc index b0839de70b2..4ef240ea819 100644 --- a/storage/perfschema/unittest/pfs_instr-t.cc +++ b/storage/perfschema/unittest/pfs_instr-t.cc @@ -60,6 +60,8 @@ void test_no_instruments() param.m_statement_class_sizing= 0; param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; + 
param.m_digest_sizing= 0; + param.m_session_connect_attrs_sizing= 0; init_event_name_sizing(& param); rc= init_instruments(& param); @@ -117,6 +119,8 @@ void test_no_instances() param.m_statement_class_sizing= 0; param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; + param.m_digest_sizing= 0; + param.m_session_connect_attrs_sizing= 0; init_event_name_sizing(& param); rc= init_instruments(& param); @@ -153,19 +157,19 @@ void test_no_instances() PFS_thread fake_thread; fake_thread.m_filename_hash_pins= NULL; - file= find_or_create_file(& fake_thread, & dummy_file_class, "dummy", 5); + file= find_or_create_file(& fake_thread, & dummy_file_class, "dummy", 5, true); ok(file == NULL, "no file"); ok(file_lost == 1, "lost 1"); - file= find_or_create_file(& fake_thread, & dummy_file_class, "dummy", 5); + file= find_or_create_file(& fake_thread, & dummy_file_class, "dummy", 5, true); ok(file == NULL, "no file"); ok(file_lost == 2, "lost 2"); init_file_hash(); - file= find_or_create_file(& fake_thread, & dummy_file_class, "dummy", 5); + file= find_or_create_file(& fake_thread, & dummy_file_class, "dummy", 5, true); ok(file == NULL, "no file"); ok(file_lost == 3, "lost 3"); - file= find_or_create_file(& fake_thread, & dummy_file_class, "dummy", 5); + file= find_or_create_file(& fake_thread, & dummy_file_class, "dummy", 5, true); ok(file == NULL, "no file"); ok(file_lost == 4, "lost 4"); @@ -173,7 +177,7 @@ void test_no_instances() int size= sizeof(long_file_name); memset(long_file_name, 'X', size); - file= find_or_create_file(& fake_thread, & dummy_file_class, long_file_name, size); + file= find_or_create_file(& fake_thread, & dummy_file_class, long_file_name, size, true); ok(file == NULL, "no file"); ok(file_lost == 5, "lost 5"); @@ -184,10 +188,10 @@ void test_no_instances() ok(table == NULL, "no table"); ok(table_lost == 2, "lost 2"); - socket= create_socket(& dummy_socket_class, NULL); + socket= create_socket(& dummy_socket_class, 
NULL, NULL, 0); ok(socket == NULL, "no socket"); ok(socket_lost == 1, "lost 1"); - socket= create_socket(& dummy_socket_class, NULL); + socket= create_socket(& dummy_socket_class, NULL, NULL, 0); ok(socket == NULL, "no socket"); ok(socket_lost == 2, "lost 2"); @@ -255,6 +259,8 @@ void test_with_instances() param.m_statement_class_sizing= 0; param.m_events_statements_history_sizing= 0; param.m_events_statements_history_long_sizing= 0; + param.m_digest_sizing= 0; + param.m_session_connect_attrs_sizing= 0; init_event_name_sizing(& param); rc= init_instruments(& param); @@ -325,50 +331,50 @@ void test_with_instances() PFS_thread fake_thread; fake_thread.m_filename_hash_pins= NULL; - file_1= find_or_create_file(& fake_thread, & dummy_file_class, "dummy", 5); + file_1= find_or_create_file(& fake_thread, & dummy_file_class, "dummy", 5, true); ok(file_1 == NULL, "no file"); ok(file_lost == 1, "lost 1"); - file_1= find_or_create_file(& fake_thread, & dummy_file_class, "dummy", 5); + file_1= find_or_create_file(& fake_thread, & dummy_file_class, "dummy", 5, true); ok(file_1 == NULL, "no file"); ok(file_lost == 2, "lost 2"); init_file_hash(); file_lost= 0; - file_1= find_or_create_file(& fake_thread, & dummy_file_class, "dummy_A", 7); + file_1= find_or_create_file(& fake_thread, & dummy_file_class, "dummy_A", 7, true); ok(file_1 != NULL, "file"); ok(file_1->m_file_stat.m_open_count == 1, "open count 1"); ok(file_lost == 0, "not lost"); - file_2= find_or_create_file(& fake_thread, & dummy_file_class, "dummy_A", 7); + file_2= find_or_create_file(& fake_thread, & dummy_file_class, "dummy_A", 7, true); ok(file_1 == file_2, "same file"); ok(file_1->m_file_stat.m_open_count == 2, "open count 2"); ok(file_lost == 0, "not lost"); release_file(file_2); ok(file_1->m_file_stat.m_open_count == 1, "open count 1"); - file_2= find_or_create_file(& fake_thread, & dummy_file_class, "dummy_B", 7); + file_2= find_or_create_file(& fake_thread, & dummy_file_class, "dummy_B", 7, true); ok(file_2 
!= NULL, "file"); ok(file_lost == 0, "not lost"); - file_2= find_or_create_file(& fake_thread, & dummy_file_class, "dummy_C", 7); + file_2= find_or_create_file(& fake_thread, & dummy_file_class, "dummy_C", 7, true); ok(file_2 == NULL, "no file"); ok(file_lost == 1, "lost"); release_file(file_1); /* the file still exists, not destroyed */ ok(file_1->m_file_stat.m_open_count == 0, "open count 0"); - file_2= find_or_create_file(& fake_thread, & dummy_file_class, "dummy_D", 7); + file_2= find_or_create_file(& fake_thread, & dummy_file_class, "dummy_D", 7, true); ok(file_2 == NULL, "no file"); ok(file_lost == 2, "lost"); - socket_1= create_socket(& dummy_socket_class, NULL); + socket_1= create_socket(& dummy_socket_class, NULL, NULL, 0); ok(socket_1 != NULL, "socket"); ok(socket_lost == 0, "not lost"); - socket_2= create_socket(& dummy_socket_class, NULL); + socket_2= create_socket(& dummy_socket_class, NULL, NULL, 0); ok(socket_2 != NULL, "socket"); ok(socket_lost == 0, "not lost"); - socket_2= create_socket(& dummy_socket_class, NULL); + socket_2= create_socket(& dummy_socket_class, NULL, NULL, 0); ok(socket_2 == NULL, "no socket"); ok(socket_lost == 1, "lost 1"); destroy_socket(socket_1); - socket_2= create_socket(& dummy_socket_class, NULL); + socket_2= create_socket(& dummy_socket_class, NULL, NULL, 0); ok(socket_2 != NULL, "socket"); ok(socket_lost == 1, "no new loss"); diff --git a/storage/perfschema/unittest/pfs_instr_class-t.cc b/storage/perfschema/unittest/pfs_instr_class-t.cc index 9e3efde656e..7b3ffccffcc 100644 --- a/storage/perfschema/unittest/pfs_instr_class-t.cc +++ b/storage/perfschema/unittest/pfs_instr_class-t.cc @@ -475,6 +475,7 @@ void test_table_registration() #endif } +#ifdef LATER void set_wait_stat(PFS_instr_class *klass) { PFS_single_stat *stat; @@ -501,6 +502,7 @@ bool is_empty_stat(PFS_instr_class *klass) return false; return true; } +#endif void test_instruments_reset() { |