diff options
author | unknown <knielsen@knielsen-hq.org> | 2010-10-19 12:27:31 +0200 |
---|---|---|
committer | unknown <knielsen@knielsen-hq.org> | 2010-10-19 12:27:31 +0200 |
commit | 4f907dc75b44bca9d9f3c293e7ada0911645837d (patch) | |
tree | 1935936ba46eb124e5bbfe4b84a61ee1e0c26ff6 | |
parent | 9ec2d5b6e510b9a3ad2fd54c37eb302c0121f9e3 (diff) | |
download | mariadb-git-4f907dc75b44bca9d9f3c293e7ada0911645837d.tar.gz |
Updated with changes from lp:percona-server/release-5.1.51-12 as of October 19, 2010
51 files changed, 1168 insertions, 388 deletions
diff --git a/ChangeLog b/ChangeLog index 5ebcf1e87a2..43f87a1baf5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,58 @@ +2010-08-24 The InnoDB Team + + * handler/ha_innodb.c, dict/dict0dict.c: + Fix Bug #55832 selects crash too easily when innodb_force_recovery>3 + +2010-08-03 The InnoDB Team + + * include/dict0dict.h, include/dict0dict.ic, row/row0mysql.c: + Fix bug #54678, InnoDB, TRUNCATE, ALTER, I_S SELECT, crash or deadlock + +2010-08-03 The InnoDB Team + + * dict/dict0load.c, handler/ha_innodb.cc, include/db0err.h, + include/dict0load.h, include/dict0mem.h, include/que0que.h, + row/row0merge.c, row/row0mysql.c: + Fix Bug#54582 stack overflow when opening many tables linked + with foreign keys at once + +2010-08-03 The InnoDB Team + + * include/ut0mem.h, ut/ut0mem.c: + Fix Bug #55627 segv in ut_free pars_lexer_close innobase_shutdown + innodb-use-sys-malloc=0 + +2010-08-01 The InnoDB Team + + * handler/ha_innodb.cc + Fix Bug #55382 Assignment with SELECT expressions takes unexpected + S locks in READ COMMITTED +>>>>>>> MERGE-SOURCE + +2010-07-27 The InnoDB Team + + * include/mem0pool.h, mem/mem0mem.c, mem/mem0pool.c, srv/srv0start.c: + Fix Bug#55581 shutdown with innodb-use-sys-malloc=0: assert + mutex->magic_n == MUTEX_MAGIC_N. 
+ +2010-06-30 The InnoDB Team + + * btr/btr0sea.c, ha/ha0ha.c, handler/ha_innodb.cc, include/btr0sea.h: + Fix Bug#54311 Crash on CHECK PARTITION after concurrent LOAD DATA + and adaptive_hash_index=OFF + +2010-06-29 The InnoDB Team + * row/row0row.c, row/row0undo.c, row/row0upd.c: + Fix Bug#54408 txn rollback after recovery: row0umod.c:673 + dict_table_get_format(index->table) + +2010-06-29 The InnoDB Team + + * btr/btr0cur.c, include/btr0cur.h, + include/row0mysql.h, row/row0merge.c, row/row0sel.c: + Fix Bug#54358 READ UNCOMMITTED access failure of off-page DYNAMIC + or COMPRESSED columns + 2010-06-24 The InnoDB Team * handler/ha_innodb.cc: diff --git a/Makefile.am b/Makefile.am index aa01aabcdc8..c73eb3d4f47 100644 --- a/Makefile.am +++ b/Makefile.am @@ -326,14 +326,14 @@ libinnobase_a_SOURCES= \ ut/ut0vec.c \ ut/ut0wqueue.c -libinnobase_a_CXXFLAGS= $(AM_CFLAGS) +libinnobase_a_CXXFLAGS= $(AM_CXXFLAGS) libinnobase_a_CFLAGS= $(AM_CFLAGS) EXTRA_LTLIBRARIES= ha_innodb_plugin.la pkgplugin_LTLIBRARIES= @plugin_innodb_plugin_shared_target@ ha_innodb_plugin_la_LDFLAGS= -module -rpath $(pkgplugindir) -ha_innodb_plugin_la_CXXFLAGS= $(AM_CFLAGS) $(INNODB_DYNAMIC_CFLAGS) +ha_innodb_plugin_la_CXXFLAGS= $(AM_CXXFLAGS) $(INNODB_DYNAMIC_CFLAGS) ha_innodb_plugin_la_CFLAGS= $(AM_CFLAGS) $(INNODB_DYNAMIC_CFLAGS) ha_innodb_plugin_la_SOURCES= $(libinnobase_a_SOURCES) diff --git a/Makefile.in b/Makefile.in index ab393ac1d62..f60347b7980 100644 --- a/Makefile.in +++ b/Makefile.in @@ -56,7 +56,8 @@ subdir = storage/innodb_plugin DIST_COMMON = $(noinst_HEADERS) $(srcdir)/Makefile.am \ $(srcdir)/Makefile.in COPYING ChangeLog ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/config/ac-macros/alloca.m4 \ +am__aclocal_m4_deps = $(top_srcdir)/config/ac-macros/maintainer.m4 \ + $(top_srcdir)/config/ac-macros/alloca.m4 \ $(top_srcdir)/config/ac-macros/check_cpu.m4 \ $(top_srcdir)/config/ac-macros/character_sets.m4 \ $(top_srcdir)/config/ac-macros/compiler_flag.m4 \ @@ 
-275,6 +276,8 @@ ABI_CHECK = @ABI_CHECK@ ACLOCAL = @ACLOCAL@ ALLOCA = @ALLOCA@ AMTAR = @AMTAR@ +AM_CFLAGS = @AM_CFLAGS@ +AM_CXXFLAGS = @AM_CXXFLAGS@ AR = @AR@ ARFLAGS = @ARFLAGS@ AS = @AS@ @@ -883,12 +886,12 @@ libinnobase_a_SOURCES = \ ut/ut0vec.c \ ut/ut0wqueue.c -libinnobase_a_CXXFLAGS = $(AM_CFLAGS) +libinnobase_a_CXXFLAGS = $(AM_CXXFLAGS) libinnobase_a_CFLAGS = $(AM_CFLAGS) EXTRA_LTLIBRARIES = ha_innodb_plugin.la pkgplugin_LTLIBRARIES = @plugin_innodb_plugin_shared_target@ ha_innodb_plugin_la_LDFLAGS = -module -rpath $(pkgplugindir) -ha_innodb_plugin_la_CXXFLAGS = $(AM_CFLAGS) $(INNODB_DYNAMIC_CFLAGS) +ha_innodb_plugin_la_CXXFLAGS = $(AM_CXXFLAGS) $(INNODB_DYNAMIC_CFLAGS) ha_innodb_plugin_la_CFLAGS = $(AM_CFLAGS) $(INNODB_DYNAMIC_CFLAGS) ha_innodb_plugin_la_SOURCES = $(libinnobase_a_SOURCES) EXTRA_DIST = CMakeLists.txt plug.in \ diff --git a/btr/btr0cur.c b/btr/btr0cur.c index 9b87d969a64..3fc2b48162a 100644 --- a/btr/btr0cur.c +++ b/btr/btr0cur.c @@ -3866,9 +3866,10 @@ btr_cur_set_ownership_of_extern_field( Marks not updated extern fields as not-owned by this record. The ownership is transferred to the updated record which is inserted elsewhere in the index tree. In purge only the owner of externally stored field is allowed -to free the field. */ +to free the field. 
+@return TRUE if BLOB ownership was transferred */ UNIV_INTERN -void +ibool btr_cur_mark_extern_inherited_fields( /*=================================*/ page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed @@ -3882,13 +3883,14 @@ btr_cur_mark_extern_inherited_fields( ulint n; ulint j; ulint i; + ibool change_ownership = FALSE; ut_ad(rec_offs_validate(rec, NULL, offsets)); ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); if (!rec_offs_any_extern(offsets)) { - return; + return(FALSE); } n = rec_offs_n_fields(offsets); @@ -3911,10 +3913,14 @@ btr_cur_mark_extern_inherited_fields( btr_cur_set_ownership_of_extern_field( page_zip, rec, index, offsets, i, FALSE, mtr); + + change_ownership = TRUE; updated: ; } } + + return(change_ownership); } /*******************************************************************//** @@ -5202,7 +5208,7 @@ btr_copy_externally_stored_field( /*******************************************************************//** Copies an externally stored field of a record to mem heap. -@return the field copied to heap */ +@return the field copied to heap, or NULL if the field is incomplete */ UNIV_INTERN byte* btr_rec_copy_externally_stored_field( @@ -5232,6 +5238,18 @@ btr_rec_copy_externally_stored_field( data = rec_get_nth_field(rec, offsets, no, &local_len); + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + if (UNIV_UNLIKELY + (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) { + /* The externally stored field was not written yet. + This record should only be seen by + recv_recovery_rollback_active() or any + TRX_ISO_READ_UNCOMMITTED transactions. */ + return(NULL); + } + return(btr_copy_externally_stored_field(len, data, zip_size, local_len, heap)); } diff --git a/btr/btr0sea.c b/btr/btr0sea.c index 36dadd47e69..6628333d32a 100644 --- a/btr/btr0sea.c +++ b/btr/btr0sea.c @@ -46,6 +46,7 @@ Created 2/17/1996 Heikki Tuuri /** Flag: has the search system been enabled? 
Protected by btr_search_latch and btr_search_enabled_mutex. */ UNIV_INTERN char btr_search_enabled = TRUE; +UNIV_INTERN ibool btr_search_fully_disabled = FALSE; /** Mutex protecting btr_search_enabled */ static mutex_t btr_search_enabled_mutex; @@ -201,12 +202,19 @@ btr_search_disable(void) mutex_enter(&btr_search_enabled_mutex); rw_lock_x_lock(&btr_search_latch); + /* Disable access to hash index, also tell ha_insert_for_fold() + stop adding new nodes to hash index, but still allow updating + existing nodes */ btr_search_enabled = FALSE; /* Clear all block->is_hashed flags and remove all entries from btr_search_sys->hash_index. */ buf_pool_drop_hash_index(); + /* hash index has been cleaned up, disallow any operation to + the hash index */ + btr_search_fully_disabled = TRUE; + /* btr_search_enabled_mutex should guarantee this. */ ut_ad(!btr_search_enabled); @@ -225,6 +233,7 @@ btr_search_enable(void) rw_lock_x_lock(&btr_search_latch); btr_search_enabled = TRUE; + btr_search_fully_disabled = FALSE; rw_lock_x_unlock(&btr_search_latch); mutex_exit(&btr_search_enabled_mutex); @@ -1488,7 +1497,7 @@ btr_search_build_page_hash_index( rw_lock_x_lock(&btr_search_latch); - if (UNIV_UNLIKELY(!btr_search_enabled)) { + if (UNIV_UNLIKELY(btr_search_fully_disabled)) { goto exit_func; } @@ -1850,6 +1859,7 @@ function_exit: } } +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG /********************************************************************//** Validates the search system. 
@return TRUE if ok */ @@ -2019,3 +2029,4 @@ btr_search_validate(void) return(ok); } +#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */ diff --git a/buf/buf0buf.c b/buf/buf0buf.c index 8b12e9c67b0..48f367f0957 100644 --- a/buf/buf0buf.c +++ b/buf/buf0buf.c @@ -792,7 +792,7 @@ buf_block_reuse( ptrdiff_t frame_offset) { /* block_init */ - block->frame = ((void*)(block->frame) + frame_offset); + block->frame += frame_offset; UNIV_MEM_DESC(block->frame, UNIV_PAGE_SIZE, block); @@ -809,7 +809,7 @@ buf_block_reuse( #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ if (block->page.zip.data) - block->page.zip.data = ((void*)(block->page.zip.data) + frame_offset); + block->page.zip.data += frame_offset; block->is_hashed = FALSE; @@ -845,6 +845,8 @@ buf_chunk_init( although it already should be. */ mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE); + srv_buffer_pool_shm_is_reused = FALSE; + if (srv_buffer_pool_shm_key) { /* zip_hash size */ zip_hash_n = (mem_size / UNIV_PAGE_SIZE) * 2; @@ -870,39 +872,46 @@ buf_chunk_init( ut_a(buf_pool->n_chunks == 1); fprintf(stderr, - "InnoDB: Notice: innodb_buffer_pool_shm_key option is specified.\n" - "InnoDB: This option may not be safe to keep consistency of datafiles.\n" - "InnoDB: Because InnoDB cannot lock datafiles when shutdown until reusing shared memory segment.\n" - "InnoDB: You should ensure no change of InnoDB files while using innodb_buffer_pool_shm_key.\n"); + "InnoDB: Warning: The innodb_buffer_pool_shm_key option has been specified.\n" + "InnoDB: Do not change the following between restarts of the server while this option is being used:\n" + "InnoDB: * the mysqld executable between restarts of the server.\n" + "InnoDB: * the value of innodb_buffer_pool_size.\n" + "InnoDB: * the value of innodb_page_size.\n" + "InnoDB: * datafiles created by InnoDB during this session.\n" + "InnoDB: Otherwise, data corruption in datafiles may result.\n"); /* FIXME: This is vague id still */ - binary_id = (ulint) ((void*)mtr_commit - 
(void*)btr_root_get) - + (ulint) ((void*)os_get_os_version - (void*)buf_calc_page_new_checksum) - + (ulint) ((void*)page_dir_find_owner_slot - (void*)dfield_data_is_binary_equal) - + (ulint) ((void*)que_graph_publish - (void*)dict_casedn_str) - + (ulint) ((void*)read_view_oldest_copy_or_open_new - (void*)fil_space_get_version) - + (ulint) ((void*)rec_get_n_extern_new - (void*)fsp_get_size_low) - + (ulint) ((void*)row_get_trx_id_offset - (void*)ha_create_func) - + (ulint) ((void*)srv_set_io_thread_op_info - (void*)thd_is_replication_slave_thread) - + (ulint) ((void*)mutex_create_func - (void*)ibuf_inside) - + (ulint) ((void*)trx_set_detailed_error - (void*)lock_check_trx_id_sanity) - + (ulint) ((void*)ut_time - (void*)mem_heap_strdup); + binary_id = (ulint) ((byte*)mtr_commit - (byte*)btr_root_get) + + (ulint) ((byte*)os_get_os_version - (byte*)buf_calc_page_new_checksum) + + (ulint) ((byte*)page_dir_find_owner_slot - (byte*)dfield_data_is_binary_equal) + + (ulint) ((byte*)que_graph_publish - (byte*)dict_casedn_str) + + (ulint) ((byte*)read_view_oldest_copy_or_open_new - (byte*)fil_space_get_version) + + (ulint) ((byte*)rec_get_n_extern_new - (byte*)fsp_get_size_low) + + (ulint) ((byte*)row_get_trx_id_offset - (byte*)ha_create_func) + + (ulint) ((byte*)srv_set_io_thread_op_info - (byte*)thd_is_replication_slave_thread) + + (ulint) ((byte*)mutex_create_func - (byte*)ibuf_inside) + + (ulint) ((byte*)trx_set_detailed_error - (byte*)lock_check_trx_id_sanity) + + (ulint) ((byte*)ut_time - (byte*)mem_heap_strdup); chunk->mem = os_shm_alloc(&chunk->mem_size, srv_buffer_pool_shm_key, &is_new); if (UNIV_UNLIKELY(chunk->mem == NULL)) { return(NULL); } - +init_again: #ifdef UNIV_SET_MEM_TO_ZERO if (is_new) { memset(chunk->mem, '\0', chunk->mem_size); } #endif + /* for ut_fold_binary_32(), these values should be 32-bit aligned */ + ut_a(sizeof(buf_shm_info_t) % 4 == 0); + ut_a((ulint)chunk->mem % 4 == 0); + ut_a(chunk->mem_size % 4 == 0); shm_info = chunk->mem; - zip_hash_tmp = 
(hash_table_t*)((void*)chunk->mem + chunk->mem_size - zip_hash_mem_size); + zip_hash_tmp = (hash_table_t*)((byte*)chunk->mem + chunk->mem_size - zip_hash_mem_size); if (is_new) { strncpy(shm_info->head_str, BUF_SHM_INFO_HEAD, 8); @@ -932,16 +941,6 @@ buf_chunk_init( "InnoDB: Error: The shared memory was not initialized yet.\n"); return(NULL); } - if (!shm_info->clean) { - fprintf(stderr, - "InnoDB: Error: The shared memory was not shut down cleanly.\n"); - return(NULL); - } - if (!shm_info->reusable) { - fprintf(stderr, - "InnoDB: Error: The shared memory has unrecoverable contents.\n"); - return(NULL); - } if (shm_info->buf_pool_size != srv_buf_pool_size) { fprintf(stderr, "InnoDB: Error: srv_buf_pool_size is different (shm=%lu current=%lu).\n", @@ -954,14 +953,34 @@ buf_chunk_init( shm_info->page_size, srv_page_size); return(NULL); } + if (!shm_info->reusable) { + fprintf(stderr, + "InnoDB: Warning: The shared memory has unrecoverable contents.\n" + "InnoDB: The shared memory segment is initialized.\n"); + is_new = TRUE; + goto init_again; + } + if (!shm_info->clean) { + fprintf(stderr, + "InnoDB: Warning: The shared memory was not shut down cleanly.\n" + "InnoDB: The shared memory segment is initialized.\n"); + is_new = TRUE; + goto init_again; + } ut_a(shm_info->zip_hash_offset == chunk->mem_size - zip_hash_mem_size); ut_a(shm_info->zip_hash_n == zip_hash_n); /* check checksum */ - checksum = ut_fold_binary(chunk->mem + sizeof(buf_shm_info_t), - chunk->mem_size - sizeof(buf_shm_info_t)); - if (shm_info->checksum != checksum) { + if (srv_buffer_pool_shm_checksum) { + checksum = ut_fold_binary_32((byte*)chunk->mem + sizeof(buf_shm_info_t), + chunk->mem_size - sizeof(buf_shm_info_t)); + } else { + checksum = BUF_NO_CHECKSUM_MAGIC; + } + + if (shm_info->checksum != BUF_NO_CHECKSUM_MAGIC + && shm_info->checksum != checksum) { fprintf(stderr, "InnoDB: Error: checksum of the shared memory is not match. 
" "(stored=%lu calculated=%lu)\n", @@ -979,6 +998,8 @@ buf_chunk_init( } else { /* adjust offset is done later */ hash_create_reuse(zip_hash_tmp); + + srv_buffer_pool_shm_is_reused = TRUE; } } else { chunk->mem = os_mem_alloc_large(&chunk->mem_size); @@ -992,7 +1013,7 @@ buf_chunk_init( /* Allocate the block descriptors from the start of the memory block. */ if (srv_buffer_pool_shm_key) { - chunk->blocks = chunk->mem + sizeof(buf_shm_info_t); + chunk->blocks = (buf_block_t*)((byte*)chunk->mem + sizeof(buf_shm_info_t)); } else { chunk->blocks = chunk->mem; } @@ -1039,10 +1060,10 @@ buf_chunk_init( } chunk->size = shm_info->chunk_backup.size; - phys_offset = (void*)frame - (void*)((void*)chunk->mem + shm_info->frame_offset); - logi_offset = (void*)frame - (void*)chunk->blocks[0].frame; + phys_offset = frame - ((byte*)chunk->mem + shm_info->frame_offset); + logi_offset = frame - chunk->blocks[0].frame; previous_frame_address = chunk->blocks[0].frame; - blocks_offset = (void*)chunk->blocks - (void*)shm_info->chunk_backup.blocks; + blocks_offset = (byte*)chunk->blocks - (byte*)shm_info->chunk_backup.blocks; if (phys_offset || logi_offset || blocks_offset) { fprintf(stderr, @@ -1053,8 +1074,8 @@ buf_chunk_init( "InnoDB: Pysical offset : %ld (%#lx)\n" "InnoDB: Logical offset (frames) : %ld (%#lx)\n" "InnoDB: Logical offset (blocks) : %ld (%#lx)\n", - (void*)((void*)chunk->mem + shm_info->frame_offset), - (void*)chunk->blocks[0].frame, (void*)frame, + (byte*)chunk->mem + shm_info->frame_offset, + chunk->blocks[0].frame, frame, phys_offset, phys_offset, logi_offset, logi_offset, blocks_offset, blocks_offset); } else { @@ -1066,24 +1087,24 @@ buf_chunk_init( fprintf(stderr, "InnoDB: Aligning physical offset..."); - memmove((void*)frame, (void*)((void*)chunk->mem + shm_info->frame_offset), + memmove(frame, (byte*)chunk->mem + shm_info->frame_offset, chunk->size * UNIV_PAGE_SIZE); fprintf(stderr, " Done.\n"); } + /* buf_block_t */ + block = chunk->blocks; + for (i = 
chunk->size; i--; ) { + buf_block_reuse(block, logi_offset); + block++; + } + if (logi_offset || blocks_offset) { fprintf(stderr, "InnoDB: Aligning logical offset..."); - /* buf_block_t */ - block = chunk->blocks; - - for (i = chunk->size; i--; ) { - buf_block_reuse(block, logi_offset); - block++; - } /* buf_pool_t buf_pool_backup */ UT_LIST_OFFSET(flush_list, buf_page_t, shm_info->buf_pool_backup.flush_list, @@ -1094,7 +1115,7 @@ buf_chunk_init( previous_frame_address, logi_offset, blocks_offset); if (shm_info->buf_pool_backup.LRU_old) shm_info->buf_pool_backup.LRU_old = - ((void*)(shm_info->buf_pool_backup.LRU_old) + (buf_page_t*)((byte*)(shm_info->buf_pool_backup.LRU_old) + (((void*)shm_info->buf_pool_backup.LRU_old > previous_frame_address) ? logi_offset : blocks_offset)); @@ -1141,7 +1162,7 @@ buf_chunk_init( } if (shm_info) { - shm_info->frame_offset = (void*)chunk->blocks[0].frame - (void*)chunk->mem; + shm_info->frame_offset = chunk->blocks[0].frame - (byte*)chunk->mem; } return(chunk); @@ -1396,10 +1417,10 @@ buf_pool_init(void) if (srv_buffer_pool_shm_key) { buf_shm_info_t* shm_info; - ut_a(chunk->blocks == chunk->mem + sizeof(buf_shm_info_t)); + ut_a((byte*)chunk->blocks == (byte*)chunk->mem + sizeof(buf_shm_info_t)); shm_info = chunk->mem; - buf_pool->zip_hash = (hash_table_t*)((void*)chunk->mem + shm_info->zip_hash_offset); + buf_pool->zip_hash = (hash_table_t*)((byte*)chunk->mem + shm_info->zip_hash_offset); if(shm_info->is_new) { shm_info->is_new = FALSE; /* initialization was finished */ @@ -1504,7 +1525,7 @@ buf_pool_free(void) chunk = buf_pool->chunks; shm_info = chunk->mem; - ut_a(chunk->blocks == chunk->mem + sizeof(buf_shm_info_t)); + ut_a((byte*)chunk->blocks == (byte*)chunk->mem + sizeof(buf_shm_info_t)); /* validation the shared memory segment doesn't have unrecoverable contents. 
*/ /* Currently, validation became not needed */ @@ -1514,8 +1535,12 @@ buf_pool_free(void) memcpy(&(shm_info->chunk_backup), chunk, sizeof(buf_chunk_t)); if (srv_fast_shutdown < 2) { - shm_info->checksum = ut_fold_binary(chunk->mem + sizeof(buf_shm_info_t), - chunk->mem_size - sizeof(buf_shm_info_t)); + if (srv_buffer_pool_shm_checksum) { + shm_info->checksum = ut_fold_binary_32((byte*)chunk->mem + sizeof(buf_shm_info_t), + chunk->mem_size - sizeof(buf_shm_info_t)); + } else { + shm_info->checksum = BUF_NO_CHECKSUM_MAGIC; + } shm_info->clean = TRUE; } diff --git a/buf/buf0lru.c b/buf/buf0lru.c index cc730cad40d..c82a2d545d5 100644 --- a/buf/buf0lru.c +++ b/buf/buf0lru.c @@ -2228,6 +2228,26 @@ end: return(ret); } + +typedef struct { + ib_uint32_t space_id; + ib_uint32_t page_no; +} dump_record_t; + +static int dump_record_cmp(const void *a, const void *b) +{ + const dump_record_t *rec1 = (dump_record_t *) a; + const dump_record_t *rec2 = (dump_record_t *) b; + + if (rec1->space_id < rec2->space_id) + return -1; + if (rec1->space_id > rec2->space_id) + return 1; + if (rec1->page_no < rec2->page_no) + return -1; + return rec1->page_no > rec2->page_no; +} + /********************************************************************//** Read the pages based on the specific file.*/ UNIV_INTERN @@ -2245,25 +2265,34 @@ buf_LRU_file_restore(void) ulint req = 0; ibool terminated = FALSE; ibool ret = FALSE; - - buffer_base = ut_malloc(2 * UNIV_PAGE_SIZE); - buffer = ut_align(buffer_base, UNIV_PAGE_SIZE); - if (!buffer) { - fprintf(stderr, - " InnoDB: cannot allocate buffer.\n"); - goto end; - } + dump_record_t* records; + ulint size; + ulint size_high; + ulint length; dump_file = os_file_create_simple_no_error_handling( LRU_DUMP_FILE, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success); - if (!success) { + if (!success || !os_file_get_size(dump_file, &size, &size_high)) { os_file_get_last_error(TRUE); fprintf(stderr, " InnoDB: cannot open %s\n", LRU_DUMP_FILE); goto end; } + if (size == 0 
|| size_high > 0 || size % 8) { + fprintf(stderr, " InnoDB: broken LRU dump file\n"); + goto end; + } + buffer_base = ut_malloc(2 * UNIV_PAGE_SIZE); + buffer = ut_align(buffer_base, UNIV_PAGE_SIZE); + records = ut_malloc(size); + if (!buffer || !records) { + fprintf(stderr, + " InnoDB: cannot allocate buffer.\n"); + goto end; + } buffers = 0; + length = 0; while (!terminated) { success = os_file_read(dump_file, buffer, (buffers << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL, @@ -2272,15 +2301,14 @@ buf_LRU_file_restore(void) if (!success) { fprintf(stderr, " InnoDB: cannot read page %lu of %s," - " or meet unexpected terminal.", + " or meet unexpected terminal.\n", buffers, LRU_DUMP_FILE); goto end; } for (offset = 0; offset < UNIV_PAGE_SIZE/4; offset += 2) { - ulint space_id, zip_size, page_no; - ulint err; - ib_int64_t tablespace_version; + ulint space_id; + ulint page_no; space_id = mach_read_from_4(buffer + offset * 4); page_no = mach_read_from_4(buffer + (offset + 1) * 4); @@ -2290,31 +2318,61 @@ buf_LRU_file_restore(void) break; } - if (offset % 16 == 15) { - os_aio_simulated_wake_handler_threads(); - buf_flush_free_margin(FALSE); + records[length].space_id = space_id; + records[length].page_no = page_no; + length++; + if (length * 8 >= size) { + fprintf(stderr, + " InnoDB: could not find the " + "end-of-file marker after reading " + "the expected %lu bytes from the " + "LRU dump file.\n" + " InnoDB: this could be caused by a " + "broken or incomplete file.\n" + " InnoDB: trying to process what has " + "been read so far.\n", + size); + terminated= TRUE; + break; } + } + buffers++; + } - zip_size = fil_space_get_zip_size(space_id); - if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { - continue; - } + qsort(records, length, sizeof(dump_record_t), dump_record_cmp); - if (fil_area_is_exist(space_id, zip_size, page_no, 0, - zip_size ? 
zip_size : UNIV_PAGE_SIZE)) { + for (offset = 0; offset < length; offset++) { + ulint space_id; + ulint page_no; + ulint zip_size; + ulint err; + ib_int64_t tablespace_version; - tablespace_version = fil_space_get_version(space_id); + space_id = records[offset].space_id; + page_no = records[offset].page_no; - req++; - reads += buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE - | OS_AIO_SIMULATED_WAKE_LATER, - space_id, zip_size, TRUE, - tablespace_version, page_no, NULL); - buf_LRU_stat_inc_io(); - } + if (offset % 16 == 15) { + os_aio_simulated_wake_handler_threads(); + buf_flush_free_margin(FALSE); } - buffers++; + zip_size = fil_space_get_zip_size(space_id); + if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { + continue; + } + + if (fil_area_is_exist(space_id, zip_size, page_no, 0, + zip_size ? zip_size : UNIV_PAGE_SIZE)) { + + tablespace_version = fil_space_get_version(space_id); + + req++; + reads += buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE + | OS_AIO_SIMULATED_WAKE_LATER, + space_id, zip_size, TRUE, + tablespace_version, page_no, NULL); + buf_LRU_stat_inc_io(); + } } os_aio_simulated_wake_handler_threads(); @@ -2330,6 +2388,8 @@ end: os_file_close(dump_file); if (buffer_base) ut_free(buffer_base); + if (records) + ut_free(records); return(ret); } diff --git a/dict/dict0crea.c b/dict/dict0crea.c index 45e86d94f0d..0dbbf96780f 100644 --- a/dict/dict0crea.c +++ b/dict/dict0crea.c @@ -1245,13 +1245,13 @@ dict_create_index_step( goto function_exit; } - if (srv_use_sys_stats_table) { + if (srv_use_sys_stats_table + && !((node->table->flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY)) { node->state = INDEX_BUILD_STATS_COLS; } else { node->state = INDEX_CREATE_INDEX_TREE; } } - if (node->state == INDEX_BUILD_STATS_COLS) { if (node->stats_no <= dict_index_get_n_unique(node->index)) { diff --git a/dict/dict0dict.c b/dict/dict0dict.c index 51ee7f9246f..3db0b362a60 100644 --- a/dict/dict0dict.c +++ b/dict/dict0dict.c @@ -569,8 +569,7 @@ dict_table_get_on_id( { 
dict_table_t* table; - if (ut_dulint_cmp(table_id, DICT_FIELDS_ID) <= 0 - || trx->dict_operation_lock_mode == RW_X_LATCH) { + if (trx->dict_operation_lock_mode == RW_X_LATCH) { /* Note: An X latch implies that the transaction already owns the dictionary mutex. */ @@ -4514,7 +4513,6 @@ dict_update_statistics_low( ibool sync) /*!< in: TRUE if must update SYS_STATS */ { dict_index_t* index; - ulint size; ulint sum_of_index_sizes = 0; if (table->ibd_file_missing) { @@ -4529,15 +4527,7 @@ dict_update_statistics_low( return; } - /* If we have set a high innodb_force_recovery level, do not calculate - statistics, as a badly corrupted index can cause a crash in it. */ - - if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) { - - return; - } - - if (srv_use_sys_stats_table && !sync) { + if (srv_use_sys_stats_table && !((table->flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY) && !sync) { /* reload statistics from SYS_STATS table */ if (dict_reload_statistics(table, &sum_of_index_sizes)) { /* success */ @@ -4565,33 +4555,55 @@ dict_update_statistics_low( return; } - while (index) { + + do { if (table->is_corrupt) { ut_a(srv_pass_corrupt_table); return; } - size = btr_get_size(index, BTR_TOTAL_SIZE); + if (UNIV_LIKELY + (srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE + || (srv_force_recovery < SRV_FORCE_NO_LOG_REDO + && dict_index_is_clust(index)))) { + ulint size; + size = btr_get_size(index, BTR_TOTAL_SIZE); - index->stat_index_size = size; + index->stat_index_size = size; - sum_of_index_sizes += size; + sum_of_index_sizes += size; - size = btr_get_size(index, BTR_N_LEAF_PAGES); + size = btr_get_size(index, BTR_N_LEAF_PAGES); - if (size == 0) { - /* The root node of the tree is a leaf */ - size = 1; - } + if (size == 0) { + /* The root node of the tree is a leaf */ + size = 1; + } - index->stat_n_leaf_pages = size; + index->stat_n_leaf_pages = size; + + btr_estimate_number_of_different_key_vals(index); + } else { + /* If we have set a high innodb_force_recovery + level, do not 
calculate statistics, as a badly + corrupted index can cause a crash in it. + Initialize some bogus index cardinality + statistics, so that the data can be queried in + various means, also via secondary indexes. */ + ulint i; + + sum_of_index_sizes++; + index->stat_index_size = index->stat_n_leaf_pages = 1; - btr_estimate_number_of_different_key_vals(index); + for (i = dict_index_get_n_unique(index); i; ) { + index->stat_n_diff_key_vals[i--] = 1; + } + } index = dict_table_get_next_index(index); - } + } while (index); - if (srv_use_sys_stats_table) { + if (srv_use_sys_stats_table && !((table->flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY)) { /* store statistics to SYS_STATS table */ dict_store_statistics(table); } diff --git a/dict/dict0load.c b/dict/dict0load.c index 90c52bfba87..bc05282668e 100644 --- a/dict/dict0load.c +++ b/dict/dict0load.c @@ -1009,16 +1009,27 @@ err_exit: err = dict_load_indexes(table, heap); + /* Initialize table foreign_child value. Its value could be + changed when dict_load_foreigns() is called below */ + table->fk_max_recusive_level = 0; + /* If the force recovery flag is set, we open the table irrespective of the error condition, since the user may want to dump data from the clustered index. However we load the foreign key information only if all indexes were loaded. */ if (err == DB_SUCCESS) { - err = dict_load_foreigns(table->name, TRUE); + err = dict_load_foreigns(table->name, TRUE, TRUE); + + if (err != DB_SUCCESS) { + dict_table_remove_from_cache(table); + table = NULL; + } } else if (!srv_force_recovery) { dict_table_remove_from_cache(table); table = NULL; } + + table->fk_max_recusive_level = 0; #if 0 if (err != DB_SUCCESS && table != NULL) { @@ -1072,6 +1083,8 @@ dict_load_table_on_id( ut_ad(mutex_own(&(dict_sys->mutex))); + table = NULL; + /* NOTE that the operation of this function is protected by the dictionary mutex, and therefore no deadlocks can occur with other dictionary operations. 
*/ @@ -1098,15 +1111,17 @@ dict_load_table_on_id( BTR_SEARCH_LEAF, &pcur, &mtr); rec = btr_pcur_get_rec(&pcur); - if (!btr_pcur_is_on_user_rec(&pcur) - || rec_get_deleted_flag(rec, 0)) { + if (!btr_pcur_is_on_user_rec(&pcur)) { /* Not found */ + goto func_exit; + } - btr_pcur_close(&pcur); - mtr_commit(&mtr); - mem_heap_free(heap); - - return(NULL); + /* Find the first record that is not delete marked */ + while (rec_get_deleted_flag(rec, 0)) { + if (!btr_pcur_move_to_next_user_rec(&pcur, &mtr)) { + goto func_exit; + } + rec = btr_pcur_get_rec(&pcur); } /*---------------------------------------------------*/ @@ -1119,19 +1134,14 @@ dict_load_table_on_id( /* Check if the table id in record is the one searched for */ if (ut_dulint_cmp(table_id, mach_read_from_8(field)) != 0) { - - btr_pcur_close(&pcur); - mtr_commit(&mtr); - mem_heap_free(heap); - - return(NULL); + goto func_exit; } /* Now we get the table name from the record */ field = rec_get_nth_field_old(rec, 1, &len); /* Load the table definition to memory */ table = dict_load_table(mem_heap_strdupl(heap, (char*) field, len)); - +func_exit: btr_pcur_close(&pcur); mtr_commit(&mtr); mem_heap_free(heap); @@ -1241,8 +1251,12 @@ dict_load_foreign( /*==============*/ const char* id, /*!< in: foreign constraint id as a null-terminated string */ - ibool check_charsets) + ibool check_charsets, /*!< in: TRUE=check charset compatibility */ + ibool check_recursive) + /*!< in: Whether to record the foreign table + parent count to avoid unlimited recursive + load of chained foreign tables */ { dict_foreign_t* foreign; dict_table_t* sys_foreign; @@ -1256,6 +1270,8 @@ dict_load_foreign( ulint len; ulint n_fields_and_type; mtr_t mtr; + dict_table_t* for_table; + dict_table_t* ref_table; ut_ad(mutex_own(&(dict_sys->mutex))); @@ -1340,11 +1356,54 @@ dict_load_foreign( dict_load_foreign_cols(id, foreign); - /* If the foreign table is not yet in the dictionary cache, we - have to load it so that we are able to make type comparisons 
- in the next function call. */ - - dict_table_get_low(foreign->foreign_table_name); + ref_table = dict_table_check_if_in_cache_low( + foreign->referenced_table_name); + + /* We could possibly wind up in a deep recursive calls if + we call dict_table_get_low() again here if there + is a chain of tables concatenated together with + foreign constraints. In such case, each table is + both a parent and child of the other tables, and + act as a "link" in such table chains. + To avoid such scenario, we would need to check the + number of ancesters the current table has. If that + exceeds DICT_FK_MAX_CHAIN_LEN, we will stop loading + the child table. + Foreign constraints are loaded in a Breath First fashion, + that is, the index on FOR_NAME is scanned first, and then + index on REF_NAME. So foreign constrains in which + current table is a child (foreign table) are loaded first, + and then those constraints where current table is a + parent (referenced) table. + Thus we could check the parent (ref_table) table's + reference count (fk_max_recusive_level) to know how deep the + recursive call is. If the parent table (ref_table) is already + loaded, and its fk_max_recusive_level is larger than + DICT_FK_MAX_CHAIN_LEN, we will stop the recursive loading + by skipping loading the child table. It will not affect foreign + constraint check for DMLs since child table will be loaded + at that time for the constraint check. */ + if (!ref_table + || ref_table->fk_max_recusive_level < DICT_FK_MAX_RECURSIVE_LOAD) { + + /* If the foreign table is not yet in the dictionary cache, we + have to load it so that we are able to make type comparisons + in the next function call. 
*/ + + for_table = dict_table_get_low(foreign->foreign_table_name); + + if (for_table && ref_table && check_recursive) { + /* This is to record the longest chain of ancesters + this table has, if the parent has more ancesters + than this table has, record it after add 1 (for this + parent */ + if (ref_table->fk_max_recusive_level + >= for_table->fk_max_recusive_level) { + for_table->fk_max_recusive_level = + ref_table->fk_max_recusive_level + 1; + } + } + } /* Note that there may already be a foreign constraint object in the dictionary cache for this constraint: then the following @@ -1369,6 +1428,8 @@ ulint dict_load_foreigns( /*===============*/ const char* table_name, /*!< in: table name */ + ibool check_recursive,/*!< in: Whether to check recursive + load of tables chained by FK */ ibool check_charsets) /*!< in: TRUE=check charset compatibility */ { @@ -1470,7 +1531,7 @@ loop: /* Load the foreign constraint definition to the dictionary cache */ - err = dict_load_foreign(id, check_charsets); + err = dict_load_foreign(id, check_charsets, check_recursive); if (err != DB_SUCCESS) { btr_pcur_close(&pcur); @@ -1498,6 +1559,11 @@ load_next_index: mtr_start(&mtr); + /* Switch to scan index on REF_NAME, fk_max_recusive_level + already been updated when scanning FOR_NAME index, no need to + update again */ + check_recursive = FALSE; + goto start_load; } diff --git a/fil/fil0fil.c b/fil/fil0fil.c index 17b821bb7ee..3193fb661f1 100644 --- a/fil/fil0fil.c +++ b/fil/fil0fil.c @@ -3043,6 +3043,10 @@ fil_open_single_table_tablespace( if (srv_expand_import && (space_id != id || space_flags != (flags & ~(~0 << DICT_TF_BITS)))) { + ibool file_is_corrupt = FALSE; + byte* buf3; + byte* descr_page; + ibool descr_is_corrupt = FALSE; dulint old_id[31]; dulint new_id[31]; ulint root_page[31]; @@ -3052,16 +3056,37 @@ fil_open_single_table_tablespace( ulint i; int len; ib_uint64_t current_lsn; - ulint size_low, size_high, size; - ib_int64_t size_bytes; + ulint size_low, size_high, size, 
free_limit; + ib_int64_t size_bytes, free_limit_bytes; dict_table_t* table; dict_index_t* index; fil_system_t* system; fil_node_t* node = NULL; fil_space_t* space; + buf3 = ut_malloc(2 * UNIV_PAGE_SIZE); + descr_page = ut_align(buf3, UNIV_PAGE_SIZE); + current_lsn = log_get_lsn(); + /* check the header page's consistency */ + if (buf_page_is_corrupted(page, + dict_table_flags_to_zip_size(space_flags))) { + fprintf(stderr, "InnoDB: page 0 of %s seems corrupt.\n", filepath); + file_is_corrupt = TRUE; + descr_is_corrupt = TRUE; + } + + /* store as first descr page */ + memcpy(descr_page, page, UNIV_PAGE_SIZE); + + /* get free limit (page number) of the table space */ +/* these should be same to the definition in fsp0fsp.c */ +#define FSP_HEADER_OFFSET FIL_PAGE_DATA +#define FSP_FREE_LIMIT 12 + free_limit = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + page); + free_limit_bytes = (ib_int64_t)free_limit * (ib_int64_t)UNIV_PAGE_SIZE; + /* overwrite fsp header */ fsp_header_init_fields(page, id, flags); mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, id); @@ -3086,6 +3111,12 @@ fil_open_single_table_tablespace( size_bytes = (((ib_int64_t)size_high) << 32) + (ib_int64_t)size_low; + if (size_bytes < free_limit_bytes) { + free_limit_bytes = size_bytes; + fprintf(stderr, "InnoDB: free limit of %s is larger than its real size.\n", filepath); + file_is_corrupt = TRUE; + } + /* get cruster index information */ table = dict_table_get_low(name); index = dict_table_get_first_index(table); @@ -3107,16 +3138,19 @@ fil_open_single_table_tablespace( info_file_path, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success); if (!success) { fprintf(stderr, "InnoDB: cannot open %s\n", info_file_path); + file_is_corrupt = TRUE; goto skip_info; } success = os_file_read(info_file, page, 0, 0, UNIV_PAGE_SIZE); if (!success) { fprintf(stderr, "InnoDB: cannot read %s\n", info_file_path); + file_is_corrupt = TRUE; goto skip_info; } if (mach_read_from_4(page) != 0x78706f72UL || 
mach_read_from_4(page + 4) != 0x74696e66UL) { fprintf(stderr, "InnoDB: %s seems not to be a correct .exp file\n", info_file_path); + file_is_corrupt = TRUE; goto skip_info; } @@ -3153,20 +3187,29 @@ skip_info: fprintf(stderr, "InnoDB: Progress in %%:"); - for (offset = 0; offset < size_bytes; offset += UNIV_PAGE_SIZE) { + for (offset = 0; offset < free_limit_bytes; offset += UNIV_PAGE_SIZE) { ulint checksum_field; ulint old_checksum_field; + ibool page_is_corrupt; success = os_file_read(file, page, (ulint)(offset & 0xFFFFFFFFUL), (ulint)(offset >> 32), UNIV_PAGE_SIZE); - /* skip inconsistent pages, it may be free page. */ + page_is_corrupt = FALSE; + + /* check consistency */ if (memcmp(page + FIL_PAGE_LSN + 4, page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) { - goto skip_write; + page_is_corrupt = TRUE; + } + + if (mach_read_from_4(page + FIL_PAGE_OFFSET) + != offset / UNIV_PAGE_SIZE) { + + page_is_corrupt = TRUE; } checksum_field = mach_read_from_4(page @@ -3182,7 +3225,7 @@ skip_info: && old_checksum_field != buf_calc_page_old_checksum(page)) { - goto skip_write; + page_is_corrupt = TRUE; } if (!srv_fast_checksum @@ -3191,7 +3234,7 @@ skip_info: && checksum_field != buf_calc_page_new_checksum(page)) { - goto skip_write; + page_is_corrupt = TRUE; } if (srv_fast_checksum @@ -3202,6 +3245,77 @@ skip_info: && checksum_field != buf_calc_page_new_checksum(page)) { + page_is_corrupt = TRUE; + } + + /* if it is free page, inconsistency is acceptable */ + if (!offset) { + /* header page*/ + /* it should be overwritten already */ + ut_a(!page_is_corrupt); + + } else if (!((offset / UNIV_PAGE_SIZE) % UNIV_PAGE_SIZE)) { + /* descr page (not header) */ + if (page_is_corrupt) { + file_is_corrupt = TRUE; + descr_is_corrupt = TRUE; + } else { + ut_a(fil_page_get_type(page) == FIL_PAGE_TYPE_XDES); + descr_is_corrupt = FALSE; + } + + /* store as descr page */ + memcpy(descr_page, page, UNIV_PAGE_SIZE); + + } else if (descr_is_corrupt) { + /* unknown state of the page 
*/ + if (page_is_corrupt) { + file_is_corrupt = TRUE; + } + + } else { + /* check free page or not */ + /* These definitions should be same to fsp0fsp.c */ +#define FSP_HEADER_SIZE (32 + 5 * FLST_BASE_NODE_SIZE) + +#define XDES_BITMAP (FLST_NODE_SIZE + 12) +#define XDES_BITS_PER_PAGE 2 +#define XDES_FREE_BIT 0 +#define XDES_SIZE \ + (XDES_BITMAP + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE * XDES_BITS_PER_PAGE)) +#define XDES_ARR_OFFSET (FSP_HEADER_OFFSET + FSP_HEADER_SIZE) + + /*descr = descr_page + XDES_ARR_OFFSET + XDES_SIZE * xdes_calc_descriptor_index(zip_size, offset)*/ + /*xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)*/ + byte* descr; + ulint index; + ulint byte_index; + ulint bit_index; + + descr = descr_page + XDES_ARR_OFFSET + + XDES_SIZE * (ut_2pow_remainder((offset / UNIV_PAGE_SIZE), UNIV_PAGE_SIZE) / FSP_EXTENT_SIZE); + + index = XDES_FREE_BIT + XDES_BITS_PER_PAGE * ((offset / UNIV_PAGE_SIZE) % FSP_EXTENT_SIZE); + byte_index = index / 8; + bit_index = index % 8; + + if (ut_bit_get_nth(mach_read_from_1(descr + XDES_BITMAP + byte_index), bit_index)) { + /* free page */ + if (page_is_corrupt) { + goto skip_write; + } + } else { + /* not free */ + if (page_is_corrupt) { + file_is_corrupt = TRUE; + } + } + } + + if (page_is_corrupt) { + fprintf(stderr, " [errp:%lld]", offset / UNIV_PAGE_SIZE); + + /* cannot treat corrupt page */ goto skip_write; } @@ -3294,11 +3408,11 @@ skip_info: } skip_write: - if (size_bytes - && ((ib_int64_t)((offset + UNIV_PAGE_SIZE) * 100) / size_bytes) - != ((offset * 100) / size_bytes)) { + if (free_limit_bytes + && ((ib_int64_t)((offset + UNIV_PAGE_SIZE) * 100) / free_limit_bytes) + != ((offset * 100) / free_limit_bytes)) { fprintf(stderr, " %lu", - (ulong)((ib_int64_t)((offset + UNIV_PAGE_SIZE) * 100) / size_bytes)); + (ulong)((ib_int64_t)((offset + UNIV_PAGE_SIZE) * 100) / free_limit_bytes)); } } @@ -3379,6 +3493,26 @@ skip_write: node->size = size; } mutex_exit(&(system->mutex)); + + ut_free(buf3); + + if 
(file_is_corrupt) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: file ", + stderr); + ut_print_filename(stderr, filepath); + fprintf(stderr, " seems to be corrupt.\n" + "InnoDB: anyway, all not corrupt pages were tried to be converted to salvage.\n" + "InnoDB: ##### CAUTION #####\n" + "InnoDB: ## The .ibd must cause to crash InnoDB, though re-import would seem to be succeeded.\n" + "InnoDB: ## If you don't have knowledge about salvaging data from .ibd, you should not use the file.\n" + "InnoDB: ###################\n"); + success = FALSE; + + ut_free(buf2); + + goto func_exit; + } } ut_free(buf2); diff --git a/ha/ha0ha.c b/ha/ha0ha.c index 9d9d341ad39..7f11917de0a 100644 --- a/ha/ha0ha.c +++ b/ha/ha0ha.c @@ -31,9 +31,7 @@ Created 8/22/1994 Heikki Tuuri #ifdef UNIV_DEBUG # include "buf0buf.h" #endif /* UNIV_DEBUG */ -#ifdef UNIV_SYNC_DEBUG -# include "btr0sea.h" -#endif /* UNIV_SYNC_DEBUG */ +#include "btr0sea.h" #include "page0page.h" /*************************************************************//** @@ -127,7 +125,8 @@ ha_clear( /*************************************************************//** Inserts an entry into a hash table. If an entry with the same fold number is found, its node is updated to point to the new data, and no new node -is inserted. +is inserted. If btr_search_enabled is set to FALSE, we will only allow +updating existing nodes, but no new node is allowed to be added. 
@return TRUE if succeed, FALSE if no more memory could be allocated */ UNIV_INTERN ibool @@ -174,6 +173,7 @@ ha_insert_for_fold_func( prev_block->n_pointers--; block->n_pointers++; } + ut_ad(!btr_search_fully_disabled); # endif /* !UNIV_HOTBACKUP */ prev_node->block = block; @@ -186,6 +186,13 @@ ha_insert_for_fold_func( prev_node = prev_node->next; } + /* We are in the process of disabling hash index, do not add + new chain node */ + if (!btr_search_enabled) { + ut_ad(!btr_search_fully_disabled); + return(TRUE); + } + /* We have to allocate a new chain node */ node = mem_heap_alloc(hash_get_heap(table, fold), sizeof(ha_node_t)); @@ -347,6 +354,7 @@ ha_remove_all_nodes_to_page( #endif } +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG /*************************************************************//** Validates a given range of the cells in hash table. @return TRUE if ok */ @@ -393,6 +401,7 @@ ha_validate( return(ok); } +#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */ /*************************************************************//** Prints info of a hash table. 
*/ diff --git a/ha/hash0hash.c b/ha/hash0hash.c index bc058cd4729..0f4fc55d895 100644 --- a/ha/hash0hash.c +++ b/ha/hash0hash.c @@ -161,7 +161,7 @@ hash_create_init( offset = (sizeof(hash_table_t) + 7) / 8; offset *= 8; - table->array = (hash_cell_t*)(((void*)table) + offset); + table->array = (hash_cell_t*)(((byte*)table) + offset); table->n_cells = prime; # if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG table->adaptive = FALSE; @@ -187,7 +187,7 @@ hash_create_reuse( offset = (sizeof(hash_table_t) + 7) / 8; offset *= 8; - table->array = (hash_cell_t*)(((void*)table) + offset); + table->array = (hash_cell_t*)(((byte*)table) + offset); ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); } diff --git a/handler/ha_innodb.cc b/handler/ha_innodb.cc index 80d1e0b0435..3b833537e18 100644 --- a/handler/ha_innodb.cc +++ b/handler/ha_innodb.cc @@ -197,6 +197,7 @@ static my_bool innobase_rollback_on_timeout = FALSE; static my_bool innobase_create_status_file = FALSE; static my_bool innobase_stats_on_metadata = TRUE; static my_bool innobase_use_sys_stats_table = FALSE; +static my_bool innobase_buffer_pool_shm_checksum = TRUE; static char* internal_innobase_data_file_path = NULL; @@ -816,6 +817,19 @@ convert_error_code_to_mysql( case DB_INTERRUPTED: my_error(ER_QUERY_INTERRUPTED, MYF(0)); /* fall through */ + + case DB_FOREIGN_EXCEED_MAX_CASCADE: + push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN, + HA_ERR_ROW_IS_REFERENCED, + "InnoDB: Cannot delete/update " + "rows with cascading foreign key " + "constraints that exceed max " + "depth of %d. 
Please " + "drop extra constraints and try " + "again", DICT_FK_MAX_RECURSIVE_LOAD); + + /* fall through */ + case DB_ERROR: default: return(-1); /* unspecified error */ @@ -2417,6 +2431,7 @@ innobase_change_buffering_inited_ok: srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite; srv_use_checksums = (ibool) innobase_use_checksums; srv_fast_checksum = (ibool) innobase_fast_checksum; + srv_buffer_pool_shm_checksum = (ibool) innobase_buffer_pool_shm_checksum; #ifdef HAVE_LARGE_PAGES if ((os_use_large_pages = (ibool) my_use_large_pages)) @@ -2553,6 +2568,7 @@ skip_overwrite: /* Get the current high water mark format. */ innobase_file_format_check = (char*) trx_sys_file_format_max_get(); + btr_search_fully_disabled = (!btr_search_enabled); DBUG_RETURN(FALSE); error: DBUG_RETURN(TRUE); @@ -3650,12 +3666,19 @@ ha_innobase::innobase_initialize_autoinc() err = row_search_max_autoinc(index, col_name, &read_auto_inc); switch (err) { - case DB_SUCCESS: + case DB_SUCCESS: { + ulonglong col_max_value; + + col_max_value = innobase_get_int_col_max_value(field); + /* At the this stage we do not know the increment - or the offset, so use a default increment of 1. */ - auto_inc = read_auto_inc + 1; - break; + nor the offset, so use a default increment of 1. */ + auto_inc = innobase_next_autoinc( + read_auto_inc, 1, 1, col_max_value); + + break; + } case DB_RECORD_NOT_FOUND: ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: MySQL and InnoDB data " @@ -3969,8 +3992,6 @@ retry: dict_table_get_format(prebuilt->table)); } - info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST); - /* Only if the table has an AUTOINC column. 
*/ if (prebuilt->table != NULL && table->found_next_number_field != NULL) { dict_table_autoinc_lock(prebuilt->table); @@ -3987,6 +4008,8 @@ retry: dict_table_autoinc_unlock(prebuilt->table); } + info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST); + DBUG_RETURN(0); } @@ -7911,28 +7934,15 @@ ha_innobase::info( dict_index_t* index; ha_rows rec_per_key; ib_int64_t n_rows; - ulong j; - ulong i; char path[FN_REFLEN]; os_file_stat_t stat_info; - DBUG_ENTER("info"); /* If we are forcing recovery at a high level, we will suppress statistics calculation on tables, because that may crash the server if an index is badly corrupted. */ - if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) { - - /* We return success (0) instead of HA_ERR_CRASHED, - because we want MySQL to process this query and not - stop, like it would do if it received the error code - HA_ERR_CRASHED. */ - - DBUG_RETURN(0); - } - /* We do not know if MySQL can call this function before calling external_lock(). To be safe, update the thd of the current table handle. */ @@ -7955,7 +7965,7 @@ ha_innobase::info( /* In sql_show we call with this flag: update then statistics so that they are up-to-date */ - if (srv_use_sys_stats_table + if (srv_use_sys_stats_table && !((ib_table->flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY) && thd_sql_command(user_thd) == SQLCOM_ANALYZE) { /* If the indexes on the table don't have enough rows in SYS_STATS system table, */ /* they need to be created. */ @@ -8050,12 +8060,18 @@ ha_innobase::info( acquiring latches inside InnoDB, we do not call it if we are asked by MySQL to avoid locking. Another reason to avoid the call is that it uses quite a lot of CPU. - See Bug#38185. - We do not update delete_length if no locking is requested - so the "old" value can remain. delete_length is initialized - to 0 in the ha_statistics' constructor. */ - if (!(flag & HA_STATUS_NO_LOCK) && srv_stats_update_need_lock) { - + See Bug#38185. 
*/ + if (flag & HA_STATUS_NO_LOCK || !srv_stats_update_need_lock) { + /* We do not update delete_length if no + locking is requested so the "old" value can + remain. delete_length is initialized to 0 in + the ha_statistics' constructor. */ + } else if (UNIV_UNLIKELY + (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE)) { + /* Avoid accessing the tablespace if + innodb_crash_recovery is set to a high value. */ + stats.delete_length = 0; + } else { /* lock the data dictionary to avoid races with ibd_file_missing and tablespace_discarded */ row_mysql_lock_data_dictionary(prebuilt->trx); @@ -8100,6 +8116,7 @@ ha_innobase::info( } if (flag & HA_STATUS_CONST) { + ulong i; /* Verify the number of index in InnoDB and MySQL matches up. If prebuilt->clust_index_was_generated holds, InnoDB defines GEN_CLUST_INDEX internally */ @@ -8116,6 +8133,7 @@ ha_innobase::info( } for (i = 0; i < table->s->keys; i++) { + ulong j; /* We could get index quickly through internal index mapping with the index translation table. The identity of index (match up index name with @@ -8181,6 +8199,11 @@ ha_innobase::info( } } + if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) { + + goto func_exit; + } + if (flag & HA_STATUS_ERRKEY) { const dict_index_t* err_index; @@ -8201,6 +8224,7 @@ ha_innobase::info( stats.auto_increment_value = innobase_peek_autoinc(); } +func_exit: prebuilt->trx->op_info = (char*)""; DBUG_RETURN(0); @@ -9691,7 +9715,8 @@ ha_innobase::store_lock( && (sql_command == SQLCOM_INSERT_SELECT || sql_command == SQLCOM_REPLACE_SELECT || sql_command == SQLCOM_UPDATE - || sql_command == SQLCOM_CREATE_TABLE)) { + || sql_command == SQLCOM_CREATE_TABLE + || sql_command == SQLCOM_SET_OPTION)) { /* If we either have innobase_locks_unsafe_for_binlog option set or this session is using READ COMMITTED @@ -9699,9 +9724,9 @@ ha_innobase::store_lock( is not set to serializable and MySQL is doing INSERT INTO...SELECT or REPLACE INTO...SELECT or UPDATE ... = (SELECT ...) or CREATE ... - SELECT... 
without FOR UPDATE or IN SHARE - MODE in select, then we use consistent read - for select. */ + SELECT... or SET ... = (SELECT ...) without + FOR UPDATE or IN SHARE MODE in select, + then we use consistent read for select. */ prebuilt->select_lock_type = LOCK_NONE; prebuilt->stored_select_lock_type = LOCK_NONE; @@ -11341,9 +11366,14 @@ static MYSQL_SYSVAR_LONGLONG(buffer_pool_size, innobase_buffer_pool_size, static MYSQL_SYSVAR_UINT(buffer_pool_shm_key, srv_buffer_pool_shm_key, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, - "[experimental] The key value of shared memory segment for the buffer pool. 0 means disable the feature (default).", + "[experimental] The key value of shared memory segment for the buffer pool. 0 (default) disables the feature.", NULL, NULL, 0, 0, INT_MAX32, 0); +static MYSQL_SYSVAR_BOOL(buffer_pool_shm_checksum, innobase_buffer_pool_shm_checksum, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Enable buffer_pool_shm checksum validation (enabled by default).", + NULL, NULL, TRUE); + static MYSQL_SYSVAR_ULONG(commit_concurrency, innobase_commit_concurrency, PLUGIN_VAR_RQCMDARG, "Helps in performance tuning in heavily concurrent environments.", @@ -11590,6 +11620,12 @@ static MYSQL_SYSVAR_ULONG(dict_size_limit, srv_dict_size_limit, "Limit the allocated memory for dictionary cache. (0: unlimited)", NULL, NULL, 0, 0, LONG_MAX, 0); +static MYSQL_SYSVAR_UINT(auto_lru_dump, srv_auto_lru_dump, + PLUGIN_VAR_RQCMDARG, + "Time in seconds between automatic buffer pool dumps. 
" + "0 (the default) disables automatic dumps.", + NULL, NULL, 0, 0, UINT_MAX32, 0); + static MYSQL_SYSVAR_ULONG(pass_corrupt_table, srv_pass_corrupt_table, PLUGIN_VAR_RQCMDARG, "Pass corruptions of user tables as 'corrupt table' instead of not crashing itself, " @@ -11604,6 +11640,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(autoextend_increment), MYSQL_SYSVAR(buffer_pool_size), MYSQL_SYSVAR(buffer_pool_shm_key), + MYSQL_SYSVAR(buffer_pool_shm_checksum), MYSQL_SYSVAR(checksums), MYSQL_SYSVAR(fast_checksum), MYSQL_SYSVAR(commit_concurrency), @@ -11681,6 +11718,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(change_buffering), MYSQL_SYSVAR(read_ahead_threshold), MYSQL_SYSVAR(io_capacity), + MYSQL_SYSVAR(auto_lru_dump), MYSQL_SYSVAR(use_purge_thread), MYSQL_SYSVAR(pass_corrupt_table), NULL diff --git a/include/btr0cur.h b/include/btr0cur.h index 716f15c4267..e151fdcb563 100644 --- a/include/btr0cur.h +++ b/include/btr0cur.h @@ -468,9 +468,10 @@ btr_estimate_number_of_different_key_vals( Marks not updated extern fields as not-owned by this record. The ownership is transferred to the updated record which is inserted elsewhere in the index tree. In purge only the owner of externally stored field is allowed -to free the field. */ +to free the field. +@return TRUE if BLOB ownership was transferred */ UNIV_INTERN -void +ibool btr_cur_mark_extern_inherited_fields( /*=================================*/ page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed @@ -570,7 +571,7 @@ btr_copy_externally_stored_field_prefix( ulint local_len);/*!< in: length of data, in bytes */ /*******************************************************************//** Copies an externally stored field of a record to mem heap. 
-@return the field copied to heap */ +@return the field copied to heap, or NULL if the field is incomplete */ UNIV_INTERN byte* btr_rec_copy_externally_stored_field( diff --git a/include/btr0sea.h b/include/btr0sea.h index 7f4842d0897..f6d194319ae 100644 --- a/include/btr0sea.h +++ b/include/btr0sea.h @@ -187,6 +187,7 @@ btr_search_update_hash_on_delete( btr_cur_t* cursor);/*!< in: cursor which was positioned on the record to delete using btr_cur_search_..., the record is not yet deleted */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG /********************************************************************//** Validates the search system. @return TRUE if ok */ @@ -194,10 +195,19 @@ UNIV_INTERN ibool btr_search_validate(void); /*======================*/ +#else +# define btr_search_validate() TRUE +#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */ /** Flag: has the search system been enabled? Protected by btr_search_latch and btr_search_enabled_mutex. */ -extern char btr_search_enabled; +extern char btr_search_enabled; + +/** Flag: whether the search system has completed its disabling process, +It is set to TRUE right after buf_pool_drop_hash_index() in +btr_search_disable(), indicating hash index entries are cleaned up. +Protected by btr_search_latch and btr_search_enabled_mutex. */ +extern ibool btr_search_fully_disabled; /** The search info struct in an index */ struct btr_search_struct{ diff --git a/include/buf0buf.h b/include/buf0buf.h index 9484146d8a3..e06927f42f0 100644 --- a/include/buf0buf.h +++ b/include/buf0buf.h @@ -1305,7 +1305,7 @@ Compute the hash fold value for blocks in buf_pool->zip_hash. 
*/ /* the fold should be relative when srv_buffer_pool_shm_key is enabled */ #define BUF_POOL_ZIP_FOLD_PTR(ptr) (!srv_buffer_pool_shm_key\ ?((ulint) (ptr) / UNIV_PAGE_SIZE)\ - :((ulint) ((void*)ptr - (void*)(buf_pool->chunks->blocks->frame)) / UNIV_PAGE_SIZE)) + :((ulint) ((byte*)ptr - (byte*)(buf_pool->chunks->blocks->frame)) / UNIV_PAGE_SIZE)) #define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->frame) #define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b)) /* @} */ diff --git a/include/db0err.h b/include/db0err.h index c841c2b4afe..c7fa6d2a444 100644 --- a/include/db0err.h +++ b/include/db0err.h @@ -94,6 +94,9 @@ enum db_err { DB_PRIMARY_KEY_IS_NULL, /* a column in the PRIMARY KEY was found to be NULL */ + DB_FOREIGN_EXCEED_MAX_CASCADE, /* Foreign key constraint related + cascading delete/update exceeds + maximum allowed depth */ /* The following are partial failure codes */ DB_FAIL = 1000, diff --git a/include/dict0dict.h b/include/dict0dict.h index 3c5e620d3c1..d18b3ecb1b0 100644 --- a/include/dict0dict.h +++ b/include/dict0dict.h @@ -680,6 +680,22 @@ ulint dict_table_zip_size( /*================*/ const dict_table_t* table); /*!< in: table */ +/*********************************************************************//** +Obtain exclusive locks on all index trees of the table. This is to prevent +accessing index trees while InnoDB is updating internal metadata for +operations such as truncate tables. */ +UNIV_INLINE +void +dict_table_x_lock_indexes( +/*======================*/ + dict_table_t* table); /*!< in: table */ +/*********************************************************************//** +Release the exclusive locks on all index tree. */ +UNIV_INLINE +void +dict_table_x_unlock_indexes( +/*========================*/ + dict_table_t* table); /*!< in: table */ /********************************************************************//** Checks if a column is in the ordering columns of the clustered index of a table. 
Column prefixes are treated like whole columns. diff --git a/include/dict0dict.ic b/include/dict0dict.ic index aada3096261..bd7534dc7e2 100644 --- a/include/dict0dict.ic +++ b/include/dict0dict.ic @@ -452,6 +452,48 @@ dict_table_zip_size( return(dict_table_flags_to_zip_size(table->flags)); } +/*********************************************************************//** +Obtain exclusive locks on all index trees of the table. This is to prevent +accessing index trees while InnoDB is updating internal metadata for +operations such as truncate tables. */ +UNIV_INLINE +void +dict_table_x_lock_indexes( +/*======================*/ + dict_table_t* table) /*!< in: table */ +{ + dict_index_t* index; + + ut_a(table); + ut_ad(mutex_own(&(dict_sys->mutex))); + + /* Loop through each index of the table and lock them */ + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + rw_lock_x_lock(dict_index_get_lock(index)); + } +} + +/*********************************************************************//** +Release the exclusive locks on all index tree. */ +UNIV_INLINE +void +dict_table_x_unlock_indexes( +/*========================*/ + dict_table_t* table) /*!< in: table */ +{ + dict_index_t* index; + + ut_a(table); + ut_ad(mutex_own(&(dict_sys->mutex))); + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + rw_lock_x_unlock(dict_index_get_lock(index)); + } +} /********************************************************************//** Gets the number of fields in the internal representation of an index, including fields added by the dictionary system. 
diff --git a/include/dict0load.h b/include/dict0load.h index 60b8c1fb632..f41882019d5 100644 --- a/include/dict0load.h +++ b/include/dict0load.h @@ -97,6 +97,8 @@ ulint dict_load_foreigns( /*===============*/ const char* table_name, /*!< in: table name */ + ibool check_recursive,/*!< in: Whether to check recursive + load of tables chained by FK */ ibool check_charsets);/*!< in: TRUE=check charsets compatibility */ /********************************************************************//** diff --git a/include/dict0mem.h b/include/dict0mem.h index 37c5a4a24fc..6736c2a3a36 100644 --- a/include/dict0mem.h +++ b/include/dict0mem.h @@ -112,6 +112,21 @@ ROW_FORMAT=REDUNDANT. */ in table->flags. */ /* @} */ +/** Tables could be chained together with Foreign key constraint. When +first load the parent table, we would load all of its descedents. +This could result in rescursive calls and out of stack error eventually. +DICT_FK_MAX_RECURSIVE_LOAD defines the maximum number of recursive loads, +when exceeded, the child table will not be loaded. It will be loaded when +the foreign constraint check needs to be run. */ +#define DICT_FK_MAX_RECURSIVE_LOAD 250 + +/** Similarly, when tables are chained together with foreign key constraints +with on cascading delete/update clause, delete from parent table could +result in recursive cascading calls. This defines the maximum number of +such cascading deletes/updates allowed. When exceeded, the delete from +parent table will fail, and user has to drop excessive foreign constraint +before proceeds. */ +#define FK_MAX_CASCADE_DEL 300 /**********************************************************************//** Creates a table memory object. @@ -434,6 +449,12 @@ struct dict_table_struct{ NOT allowed until this count gets to zero; MySQL does NOT itself check the number of open handles at drop */ + unsigned fk_max_recusive_level:8; + /*!< maximum recursive level we support when + loading tables chained together with FK + constraints. 
If exceeds this level, we will + stop loading child table into memory along with + its parent table */ ulint n_foreign_key_checks_running; /*!< count of how many foreign key check operations are currently being performed diff --git a/include/ha0ha.h b/include/ha0ha.h index 1ffbd3440aa..3299000bf3c 100644 --- a/include/ha0ha.h +++ b/include/ha0ha.h @@ -186,6 +186,7 @@ ha_remove_all_nodes_to_page( hash_table_t* table, /*!< in: hash table */ ulint fold, /*!< in: fold value */ const page_t* page); /*!< in: buffer page */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG /*************************************************************//** Validates a given range of the cells in hash table. @return TRUE if ok */ @@ -196,6 +197,7 @@ ha_validate( hash_table_t* table, /*!< in: hash table */ ulint start_index, /*!< in: start index */ ulint end_index); /*!< in: end index */ +#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */ /*************************************************************//** Prints info of a hash table. 
*/ UNIV_INTERN diff --git a/include/hash0hash.h b/include/hash0hash.h index 9cb410e2ad7..492c767acc4 100644 --- a/include/hash0hash.h +++ b/include/hash0hash.h @@ -363,14 +363,14 @@ do {\ NODE_TYPE* node2222;\ \ if ((TABLE)->array[i2222].node) \ - (TABLE)->array[i2222].node \ - += (((TABLE)->array[i2222].node > (void*)FADDR)?FOFFSET:BOFFSET);\ + (TABLE)->array[i2222].node = (void*)((byte*)(TABLE)->array[i2222].node \ + + (((TABLE)->array[i2222].node > (void*)FADDR)?FOFFSET:BOFFSET));\ node2222 = HASH_GET_FIRST((TABLE), i2222);\ \ while (node2222) {\ if (node2222->PTR_NAME) \ - node2222->PTR_NAME = ((void*)node2222->PTR_NAME) \ - + ((((void*)node2222->PTR_NAME) > (void*)FADDR)?FOFFSET:BOFFSET);\ + node2222->PTR_NAME = (void*)((byte*)(node2222->PTR_NAME) \ + + ((((void*)node2222->PTR_NAME) > (void*)FADDR)?FOFFSET:BOFFSET));\ \ node2222 = node2222->PTR_NAME;\ }\ diff --git a/include/mem0pool.h b/include/mem0pool.h index 5e93bf88a47..fa8be296ec9 100644 --- a/include/mem0pool.h +++ b/include/mem0pool.h @@ -100,18 +100,6 @@ mem_pool_get_reserved( /*==================*/ mem_pool_t* pool); /*!< in: memory pool */ /********************************************************************//** -Reserves the mem pool mutex. */ -UNIV_INTERN -void -mem_pool_mutex_enter(void); -/*======================*/ -/********************************************************************//** -Releases the mem pool mutex. */ -UNIV_INTERN -void -mem_pool_mutex_exit(void); -/*=====================*/ -/********************************************************************//** Validates a memory pool. 
@return TRUE if ok */ UNIV_INTERN diff --git a/include/que0que.h b/include/que0que.h index 09a671f49b1..ed48f980294 100644 --- a/include/que0que.h +++ b/include/que0que.h @@ -381,6 +381,9 @@ struct que_thr_struct{ thus far */ ulint lock_state; /*!< lock state of thread (table or row) */ + ulint fk_cascade_depth; /*!< maximum cascading call depth + supported for foreign key constraint + related delete/updates */ }; #define QUE_THR_MAGIC_N 8476583 diff --git a/include/row0mysql.h b/include/row0mysql.h index a604f6e3724..f8fab59ef80 100644 --- a/include/row0mysql.h +++ b/include/row0mysql.h @@ -630,7 +630,11 @@ struct row_prebuilt_struct { the secondary index, then this is set to TRUE */ unsigned templ_contains_blob:1;/*!< TRUE if the template contains - BLOB column(s) */ + a column with DATA_BLOB == + get_innobase_type_from_mysql_type(); + not to be confused with InnoDB + externally stored columns + (VARCHAR can be off-page too) */ mysql_row_templ_t* mysql_template;/*!< template used to transform rows fast between MySQL and Innobase formats; memory for this template diff --git a/include/srv0srv.h b/include/srv0srv.h index 43f35e8b8a5..16a90fdc015 100644 --- a/include/srv0srv.h +++ b/include/srv0srv.h @@ -157,6 +157,8 @@ extern ulint srv_mem_pool_size; extern ulint srv_lock_table_size; extern uint srv_buffer_pool_shm_key; +extern ibool srv_buffer_pool_shm_is_reused; +extern ibool srv_buffer_pool_shm_checksum; extern ibool srv_thread_concurrency_timer_based; @@ -340,6 +342,9 @@ extern ulint srv_buf_pool_flushed; reading of a disk page */ extern ulint srv_buf_pool_reads; +/** Time in seconds between automatic buffer pool dumps */ +extern uint srv_auto_lru_dump; + /** Status variables to be passed to MySQL */ typedef struct export_var_struct export_struc; @@ -608,6 +613,16 @@ srv_error_monitor_thread( /*=====================*/ void* arg); /*!< in: a dummy parameter required by os_thread_create */ +/*********************************************************************//** 
+A thread which restores the buffer pool from a dump file on startup and does +periodic buffer pool dumps. +@return a dummy parameter */ +UNIV_INTERN +os_thread_ret_t +srv_LRU_dump_restore_thread( +/*====================*/ + void* arg); /*!< in: a dummy parameter required by + os_thread_create */ /******************************************************************//** Outputs to a file the output of the InnoDB Monitor. @return FALSE if not all information printed diff --git a/include/univ.i b/include/univ.i index 6166e9c4248..cdc1815de16 100644 --- a/include/univ.i +++ b/include/univ.i @@ -46,10 +46,10 @@ Created 1/20/1994 Heikki Tuuri #define INNODB_VERSION_MAJOR 1 #define INNODB_VERSION_MINOR 0 -#define INNODB_VERSION_BUGFIX 10 +#define INNODB_VERSION_BUGFIX 12 #ifndef PERCONA_INNODB_VERSION -#define PERCONA_INNODB_VERSION 12.0 +#define PERCONA_INNODB_VERSION 12.1 #endif diff --git a/include/ut0lst.h b/include/ut0lst.h index 39553e270f7..a40c8054082 100644 --- a/include/ut0lst.h +++ b/include/ut0lst.h @@ -269,10 +269,10 @@ do { \ TYPE* ut_list_node_313; \ \ if ((BASE).start) \ - (BASE).start = ((void*)((BASE).start) \ + (BASE).start = (void*)((byte*)((BASE).start) \ + (((void*)((BASE).start) > (void*)FADDR)?FOFFSET:BOFFSET));\ if ((BASE).end) \ - (BASE).end = ((void*)((BASE).end) \ + (BASE).end = (void*)((byte*)((BASE).end) \ + (((void*)((BASE).end) > (void*)FADDR)?FOFFSET:BOFFSET));\ \ ut_list_node_313 = (BASE).start; \ @@ -280,10 +280,10 @@ do { \ for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) { \ ut_a(ut_list_node_313); \ if ((ut_list_node_313->NAME).prev) \ - (ut_list_node_313->NAME).prev = ((void*)((ut_list_node_313->NAME).prev)\ + (ut_list_node_313->NAME).prev = (void*)((byte*)((ut_list_node_313->NAME).prev)\ + (((void*)((ut_list_node_313->NAME).prev) > (void*)FADDR)?FOFFSET:BOFFSET));\ if ((ut_list_node_313->NAME).next) \ - (ut_list_node_313->NAME).next = ((void*)((ut_list_node_313->NAME).next)\ + (ut_list_node_313->NAME).next = 
(void*)((byte*)((ut_list_node_313->NAME).next)\ + (((void*)((ut_list_node_313->NAME).next)> (void*)FADDR)?FOFFSET:BOFFSET));\ ut_list_node_313 = (ut_list_node_313->NAME).next; \ } \ diff --git a/include/ut0mem.h b/include/ut0mem.h index cf41cba4643..f14606be966 100644 --- a/include/ut0mem.h +++ b/include/ut0mem.h @@ -113,7 +113,8 @@ ut_test_malloc( ulint n); /*!< in: try to allocate this many bytes */ #endif /* !UNIV_HOTBACKUP */ /**********************************************************************//** -Frees a memory block allocated with ut_malloc. */ +Frees a memory block allocated with ut_malloc. Freeing a NULL pointer is +a nop. */ UNIV_INTERN void ut_free( diff --git a/lock/lock0lock.c b/lock/lock0lock.c index 7ec4a53e0ea..1ded67d9147 100644 --- a/lock/lock0lock.c +++ b/lock/lock0lock.c @@ -4606,7 +4606,7 @@ print_rec: nth_lock++; if (nth_lock >= srv_show_locks_held) { - fputs("TOO LOCKS PRINTED FOR THIS TRX:" + fputs("TOO MANY LOCKS PRINTED FOR THIS TRX:" " SUPPRESSING FURTHER PRINTS\n", file); diff --git a/log/log0recv.c b/log/log0recv.c index bbb634addb0..200b3b088a7 100644 --- a/log/log0recv.c +++ b/log/log0recv.c @@ -2901,6 +2901,7 @@ recv_init_crash_recovery(void) /*==========================*/ { ut_a(!recv_needed_recovery); + ut_a(!srv_buffer_pool_shm_is_reused); recv_needed_recovery = TRUE; diff --git a/mem/mem0mem.c b/mem/mem0mem.c index c0ce8a3e1ac..1dd4db30841 100644 --- a/mem/mem0mem.c +++ b/mem/mem0mem.c @@ -367,7 +367,7 @@ mem_heap_create_block( block->line = line; #ifdef MEM_PERIODIC_CHECK - mem_pool_mutex_enter(); + mutex_enter(&(mem_comm_pool->mutex)); if (!mem_block_list_inited) { mem_block_list_inited = TRUE; @@ -376,7 +376,7 @@ mem_heap_create_block( UT_LIST_ADD_LAST(mem_block_list, mem_block_list, block); - mem_pool_mutex_exit(); + mutex_exit(&(mem_comm_pool->mutex)); #endif mem_block_set_len(block, len); mem_block_set_type(block, type); @@ -479,11 +479,11 @@ mem_heap_block_free( UT_LIST_REMOVE(list, heap->base, block); #ifdef 
MEM_PERIODIC_CHECK - mem_pool_mutex_enter(); + mutex_enter(&(mem_comm_pool->mutex)); UT_LIST_REMOVE(mem_block_list, mem_block_list, block); - mem_pool_mutex_exit(); + mutex_exit(&(mem_comm_pool->mutex)); #endif ut_ad(heap->total_size >= block->len); @@ -556,7 +556,7 @@ mem_validate_all_blocks(void) { mem_block_t* block; - mem_pool_mutex_enter(); + mutex_enter(&(mem_comm_pool->mutex)); block = UT_LIST_GET_FIRST(mem_block_list); @@ -568,6 +568,6 @@ mem_validate_all_blocks(void) block = UT_LIST_GET_NEXT(mem_block_list, block); } - mem_pool_mutex_exit(); + mutex_exit(&(mem_comm_pool->mutex)); } #endif diff --git a/mem/mem0pool.c b/mem/mem0pool.c index c4f8af607e0..3291453eeb5 100644 --- a/mem/mem0pool.c +++ b/mem/mem0pool.c @@ -34,6 +34,7 @@ Created 5/12/1997 Heikki Tuuri #include "ut0lst.h" #include "ut0byte.h" #include "mem0mem.h" +#include "srv0start.h" /* We would like to use also the buffer frames to allocate memory. This would be desirable, because then the memory consumption of the database @@ -121,23 +122,33 @@ mysql@lists.mysql.com */ UNIV_INTERN ulint mem_n_threads_inside = 0; /********************************************************************//** -Reserves the mem pool mutex. */ -UNIV_INTERN +Reserves the mem pool mutex if we are not in server shutdown. Use +this function only in memory free functions, since only memory +free functions are used during server shutdown. */ +UNIV_INLINE void -mem_pool_mutex_enter(void) -/*======================*/ +mem_pool_mutex_enter( +/*=================*/ + mem_pool_t* pool) /*!< in: memory pool */ { - mutex_enter(&(mem_comm_pool->mutex)); + if (srv_shutdown_state < SRV_SHUTDOWN_EXIT_THREADS) { + mutex_enter(&(pool->mutex)); + } } /********************************************************************//** -Releases the mem pool mutex. */ -UNIV_INTERN +Releases the mem pool mutex if we are not in server shutdown. 
As +its corresponding mem_pool_mutex_enter() function, use it only +in memory free functions */ +UNIV_INLINE void -mem_pool_mutex_exit(void) -/*=====================*/ +mem_pool_mutex_exit( +/*================*/ + mem_pool_t* pool) /*!< in: memory pool */ { - mutex_exit(&(mem_comm_pool->mutex)); + if (srv_shutdown_state < SRV_SHUTDOWN_EXIT_THREADS) { + mutex_exit(&(pool->mutex)); + } } /********************************************************************//** @@ -567,7 +578,7 @@ mem_area_free( n = ut_2_log(size); - mutex_enter(&(pool->mutex)); + mem_pool_mutex_enter(pool); mem_n_threads_inside++; ut_a(mem_n_threads_inside == 1); @@ -595,7 +606,7 @@ mem_area_free( pool->reserved += ut_2_exp(n); mem_n_threads_inside--; - mutex_exit(&(pool->mutex)); + mem_pool_mutex_exit(pool); mem_area_free(new_ptr, pool); @@ -611,7 +622,7 @@ mem_area_free( } mem_n_threads_inside--; - mutex_exit(&(pool->mutex)); + mem_pool_mutex_exit(pool); ut_ad(mem_pool_validate(pool)); } @@ -630,7 +641,7 @@ mem_pool_validate( ulint free; ulint i; - mutex_enter(&(pool->mutex)); + mem_pool_mutex_enter(pool); free = 0; @@ -658,7 +669,7 @@ mem_pool_validate( ut_a(free + pool->reserved == pool->size); - mutex_exit(&(pool->mutex)); + mem_pool_mutex_exit(pool); return(TRUE); } diff --git a/os/os0proc.c b/os/os0proc.c index 8f6c7f430f7..4567d96b6f4 100644 --- a/os/os0proc.c +++ b/os/os0proc.c @@ -243,13 +243,13 @@ os_shm_alloc( ibool* is_new) { void* ptr; +#if defined HAVE_SYS_IPC_H && HAVE_SYS_SHM_H ulint size; int shmid; *is_new = FALSE; -#if defined HAVE_SYS_IPC_H && HAVE_SYS_SHM_H fprintf(stderr, - "InnoDB: The shared memory key %#x (%d) is specified.\n", + "InnoDB: The shared memory segment containing the buffer pool is: key %#x (%d).\n", key, key); # if defined HAVE_LARGE_PAGES && defined UNIV_LINUX if (!os_use_large_pages || !os_large_page_size) { @@ -266,12 +266,12 @@ os_shm_alloc( if (shmid < 0) { if (errno == EEXIST) { fprintf(stderr, - "InnoDB: HugeTLB: The shared memory segment seems to exist 
already.\n"); + "InnoDB: HugeTLB: The shared memory segment exists.\n"); shmid = shmget((key_t)key, (size_t)size, SHM_HUGETLB | SHM_R | SHM_W); if (shmid < 0) { fprintf(stderr, - "InnoDB: HugeTLB: Warning: Failed to allocate %lu bytes.(reuse) errno %d\n", + "InnoDB: HugeTLB: Warning: Failed to allocate %lu bytes. (reuse) errno %d\n", size, errno); goto skip; } else { @@ -280,14 +280,14 @@ os_shm_alloc( } } else { fprintf(stderr, - "InnoDB: HugeTLB: Warning: Failed to allocate %lu bytes.(new) errno %d\n", + "InnoDB: HugeTLB: Warning: Failed to allocate %lu bytes. (new) errno %d\n", size, errno); goto skip; } } else { *is_new = TRUE; fprintf(stderr, - "InnoDB: HugeTLB: The new shared memory segment is created.\n"); + "InnoDB: HugeTLB: A new shared memory segment has been created .\n"); } ptr = shmat(shmid, NULL, 0); @@ -323,12 +323,12 @@ skip: if (shmid < 0) { if (errno == EEXIST) { fprintf(stderr, - "InnoDB: The shared memory segment seems to exist already.\n"); + "InnoDB: A shared memory segment containing the buffer pool seems to already exist.\n"); shmid = shmget((key_t)key, (size_t)size, SHM_R | SHM_W); if (shmid < 0) { fprintf(stderr, - "InnoDB: Warning: Failed to allocate %lu bytes.(reuse) errno %d\n", + "InnoDB: Warning: Failed to allocate %lu bytes. (reuse) errno %d\n", size, errno); ptr = NULL; goto end; @@ -338,7 +338,7 @@ skip: } } else { fprintf(stderr, - "InnoDB: Warning: Failed to allocate %lu bytes.(new) errno %d\n", + "InnoDB: Warning: Failed to allocate %lu bytes. 
(new) errno %d\n", size, errno); ptr = NULL; goto end; @@ -346,7 +346,7 @@ skip: } else { *is_new = TRUE; fprintf(stderr, - "InnoDB: The new shared memory segment is created.\n"); + "InnoDB: A new shared memory segment has been created.\n"); } ptr = shmat(shmid, NULL, 0); diff --git a/percona-suite/percona_query_response_time-stored.result b/percona-suite/percona_query_response_time-stored.result index 386180c791a..df51f2bfd58 100644 --- a/percona-suite/percona_query_response_time-stored.result +++ b/percona-suite/percona_query_response_time-stored.result @@ -22,12 +22,13 @@ SHOW GLOBAL VARIABLES where Variable_name like 'QUERY_RESPONSE_TIME_RANGE_BASE'; Variable_name Value query_response_time_range_base 2 FLUSH QUERY_RESPONSE_TIME; -SELECT c.count, +SELECT d.count, (SELECT SUM(a.count) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as a WHERE a.count != 0) as query_count, -(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as not_zero_region_count, +(SELECT SUM((b.total * 1000000) DIV 1000000) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as query_total, +(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count != 0) as not_zero_region_count, (SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME) as region_count -FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count > 0; -count query_count not_zero_region_count region_count +FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as d WHERE d.count > 0; +count query_count query_total not_zero_region_count region_count SELECT COUNT(*) as region_count FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME; region_count 44 @@ -76,7 +77,7 @@ time 2097152.00000 4194304.00000 8388608.00000 -TOO LONG QUERY +TOO LONG SET GLOBAL ENABLE_QUERY_RESPONSE_TIME_STATS=1; SELECT test_f(); test_f() @@ -91,14 +92,15 @@ SELECT test_f(); test_f() Hello, world! 
SET GLOBAL ENABLE_QUERY_RESPONSE_TIME_STATS=0; -SELECT c.count, +SELECT d.count, (SELECT SUM(a.count) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as a WHERE a.count != 0) as query_count, -(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as not_zero_region_count, +(SELECT SUM((b.total * 1000000) DIV 1000000) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as query_total, +(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count != 0) as not_zero_region_count, (SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME) as region_count -FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count > 0; -count query_count not_zero_region_count region_count -1 5 2 44 -4 5 2 44 +FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as d WHERE d.count > 0; +count query_count query_total not_zero_region_count region_count +1 5 4 2 44 +4 5 4 2 44 SELECT COUNT(*) as region_count FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME; region_count 44 @@ -147,7 +149,7 @@ time 2097152.00000 4194304.00000 8388608.00000 -TOO LONG QUERY +TOO LONG SHOW GLOBAL VARIABLES where Variable_name like 'QUERY_RESPONSE_TIME_RANGE_BASE'; Variable_name Value query_response_time_range_base 2 @@ -161,14 +163,15 @@ SELECT test_f(); test_f() Hello, world! 
SET GLOBAL ENABLE_QUERY_RESPONSE_TIME_STATS=0; -SELECT c.count, +SELECT d.count, (SELECT SUM(a.count) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as a WHERE a.count != 0) as query_count, -(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as not_zero_region_count, +(SELECT SUM((b.total * 1000000) DIV 1000000) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as query_total, +(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count != 0) as not_zero_region_count, (SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME) as region_count -FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count > 0; -count query_count not_zero_region_count region_count -1 2 2 14 -1 2 2 14 +FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as d WHERE d.count > 0; +count query_count query_total not_zero_region_count region_count +1 2 1 2 14 +1 2 1 2 14 SELECT COUNT(*) as region_count FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME; region_count 14 @@ -187,7 +190,7 @@ time 10000.000000 100000.000000 1000000.00000 -TOO LONG QUERY +TOO LONG SHOW GLOBAL VARIABLES where Variable_name like 'QUERY_RESPONSE_TIME_RANGE_BASE'; Variable_name Value query_response_time_range_base 10 @@ -201,14 +204,15 @@ SELECT test_f(); test_f() Hello, world! 
SET GLOBAL ENABLE_QUERY_RESPONSE_TIME_STATS=0; -SELECT c.count, +SELECT d.count, (SELECT SUM(a.count) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as a WHERE a.count != 0) as query_count, -(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as not_zero_region_count, +(SELECT SUM((b.total * 1000000) DIV 1000000) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as query_total, +(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count != 0) as not_zero_region_count, (SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME) as region_count -FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count > 0; -count query_count not_zero_region_count region_count -1 2 2 17 -1 2 2 17 +FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as d WHERE d.count > 0; +count query_count query_total not_zero_region_count region_count +1 2 1 2 17 +1 2 1 2 17 SELECT COUNT(*) as region_count FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME; region_count 17 @@ -230,7 +234,7 @@ time 117649.000000 823543.000000 5764801.00000 -TOO LONG QUERY +TOO LONG SHOW GLOBAL VARIABLES where Variable_name like 'QUERY_RESPONSE_TIME_RANGE_BASE'; Variable_name Value query_response_time_range_base 7 @@ -244,14 +248,15 @@ SELECT test_f(); test_f() Hello, world! 
SET GLOBAL ENABLE_QUERY_RESPONSE_TIME_STATS=0; -SELECT c.count, +SELECT d.count, (SELECT SUM(a.count) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as a WHERE a.count != 0) as query_count, -(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as not_zero_region_count, +(SELECT SUM((b.total * 1000000) DIV 1000000) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as query_total, +(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count != 0) as not_zero_region_count, (SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME) as region_count -FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count > 0; -count query_count not_zero_region_count region_count -1 2 2 7 -1 2 2 7 +FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as d WHERE d.count > 0; +count query_count query_total not_zero_region_count region_count +1 2 1 2 7 +1 2 1 2 7 SELECT COUNT(*) as region_count FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME; region_count 7 @@ -263,7 +268,7 @@ time 156.000000 24336.000000 3796416.00000 -TOO LONG QUERY +TOO LONG SHOW GLOBAL VARIABLES where Variable_name like 'QUERY_RESPONSE_TIME_RANGE_BASE'; Variable_name Value query_response_time_range_base 156 @@ -277,14 +282,15 @@ SELECT test_f(); test_f() Hello, world! 
SET GLOBAL ENABLE_QUERY_RESPONSE_TIME_STATS=0; -SELECT c.count, +SELECT d.count, (SELECT SUM(a.count) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as a WHERE a.count != 0) as query_count, -(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as not_zero_region_count, +(SELECT SUM((b.total * 1000000) DIV 1000000) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as query_total, +(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count != 0) as not_zero_region_count, (SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME) as region_count -FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count > 0; -count query_count not_zero_region_count region_count -1 2 2 6 -1 2 2 6 +FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as d WHERE d.count > 0; +count query_count query_total not_zero_region_count region_count +1 2 1 2 6 +1 2 1 2 6 SELECT COUNT(*) as region_count FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME; region_count 6 @@ -295,7 +301,7 @@ time 1.000000 1000.000000 1000000.00000 -TOO LONG QUERY +TOO LONG SHOW GLOBAL VARIABLES where Variable_name like 'QUERY_RESPONSE_TIME_RANGE_BASE'; Variable_name Value query_response_time_range_base 1000 diff --git a/percona-suite/percona_query_response_time.result b/percona-suite/percona_query_response_time.result index 3c12284a525..54657b6ca06 100644 --- a/percona-suite/percona_query_response_time.result +++ b/percona-suite/percona_query_response_time.result @@ -9,12 +9,13 @@ SHOW GLOBAL VARIABLES where Variable_name like 'QUERY_RESPONSE_TIME_RANGE_BASE'; Variable_name Value query_response_time_range_base 2 FLUSH QUERY_RESPONSE_TIME; -SELECT c.count, +SELECT d.count, (SELECT SUM(a.count) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as a WHERE a.count != 0) as query_count, -(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as not_zero_region_count, +(SELECT SUM((b.total * 1000000) DIV 1000000) FROM 
INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as query_total, +(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count != 0) as not_zero_region_count, (SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME) as region_count -FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count > 0; -count query_count not_zero_region_count region_count +FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as d WHERE d.count > 0; +count query_count query_total not_zero_region_count region_count SELECT COUNT(*) as region_count FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME; region_count 44 @@ -63,7 +64,7 @@ time 2097152.00000 4194304.00000 8388608.00000 -TOO LONG QUERY +TOO LONG SET GLOBAL ENABLE_QUERY_RESPONSE_TIME_STATS=1; SELECT SLEEP(0.31); SLEEP(0.31) @@ -123,17 +124,18 @@ SELECT SLEEP(2.5); SLEEP(2.5) 0 SET GLOBAL ENABLE_QUERY_RESPONSE_TIME_STATS=0; -SELECT c.count, +SELECT d.count, (SELECT SUM(a.count) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as a WHERE a.count != 0) as query_count, -(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as not_zero_region_count, +(SELECT SUM((b.total * 1000000) DIV 1000000) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as query_total, +(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count != 0) as not_zero_region_count, (SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME) as region_count -FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count > 0; -count query_count not_zero_region_count region_count -1 20 5 44 -10 20 5 44 -1 20 5 44 -5 20 5 44 -3 20 5 44 +FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as d WHERE d.count > 0; +count query_count query_total not_zero_region_count region_count +1 20 15 5 44 +10 20 15 5 44 +1 20 15 5 44 +5 20 15 5 44 +3 20 15 5 44 SELECT COUNT(*) as region_count FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME; region_count 44 @@ -182,7 +184,7 @@ time 2097152.00000 4194304.00000 
8388608.00000 -TOO LONG QUERY +TOO LONG SHOW GLOBAL VARIABLES where Variable_name like 'QUERY_RESPONSE_TIME_RANGE_BASE'; Variable_name Value query_response_time_range_base 2 @@ -250,15 +252,16 @@ SELECT SLEEP(2.5); SLEEP(2.5) 0 SET GLOBAL ENABLE_QUERY_RESPONSE_TIME_STATS=0; -SELECT c.count, +SELECT d.count, (SELECT SUM(a.count) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as a WHERE a.count != 0) as query_count, -(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as not_zero_region_count, +(SELECT SUM((b.total * 1000000) DIV 1000000) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as query_total, +(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count != 0) as not_zero_region_count, (SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME) as region_count -FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count > 0; -count query_count not_zero_region_count region_count -1 20 3 14 -11 20 3 14 -8 20 3 14 +FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as d WHERE d.count > 0; +count query_count query_total not_zero_region_count region_count +1 20 17 3 14 +11 20 17 3 14 +8 20 17 3 14 SELECT COUNT(*) as region_count FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME; region_count 14 @@ -277,7 +280,7 @@ time 10000.000000 100000.000000 1000000.00000 -TOO LONG QUERY +TOO LONG SHOW GLOBAL VARIABLES where Variable_name like 'QUERY_RESPONSE_TIME_RANGE_BASE'; Variable_name Value query_response_time_range_base 10 @@ -345,15 +348,16 @@ SELECT SLEEP(2.5); SLEEP(2.5) 0 SET GLOBAL ENABLE_QUERY_RESPONSE_TIME_STATS=0; -SELECT c.count, +SELECT d.count, (SELECT SUM(a.count) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as a WHERE a.count != 0) as query_count, -(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as not_zero_region_count, +(SELECT SUM((b.total * 1000000) DIV 1000000) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as query_total, +(SELECT 
COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count != 0) as not_zero_region_count, (SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME) as region_count -FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count > 0; -count query_count not_zero_region_count region_count -1 20 3 17 -11 20 3 17 -8 20 3 17 +FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as d WHERE d.count > 0; +count query_count query_total not_zero_region_count region_count +1 20 17 3 17 +11 20 17 3 17 +8 20 17 3 17 SELECT COUNT(*) as region_count FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME; region_count 17 @@ -375,7 +379,7 @@ time 117649.000000 823543.000000 5764801.00000 -TOO LONG QUERY +TOO LONG SHOW GLOBAL VARIABLES where Variable_name like 'QUERY_RESPONSE_TIME_RANGE_BASE'; Variable_name Value query_response_time_range_base 7 @@ -443,15 +447,16 @@ SELECT SLEEP(2.5); SLEEP(2.5) 0 SET GLOBAL ENABLE_QUERY_RESPONSE_TIME_STATS=0; -SELECT c.count, +SELECT d.count, (SELECT SUM(a.count) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as a WHERE a.count != 0) as query_count, -(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as not_zero_region_count, +(SELECT SUM((b.total * 1000000) DIV 1000000) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as query_total, +(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count != 0) as not_zero_region_count, (SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME) as region_count -FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count > 0; -count query_count not_zero_region_count region_count -1 20 3 7 -11 20 3 7 -8 20 3 7 +FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as d WHERE d.count > 0; +count query_count query_total not_zero_region_count region_count +1 20 17 3 7 +11 20 17 3 7 +8 20 17 3 7 SELECT COUNT(*) as region_count FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME; region_count 7 @@ -463,7 +468,7 @@ time 156.000000 24336.000000 3796416.00000 
-TOO LONG QUERY +TOO LONG SHOW GLOBAL VARIABLES where Variable_name like 'QUERY_RESPONSE_TIME_RANGE_BASE'; Variable_name Value query_response_time_range_base 156 @@ -531,15 +536,16 @@ SELECT SLEEP(2.5); SLEEP(2.5) 0 SET GLOBAL ENABLE_QUERY_RESPONSE_TIME_STATS=0; -SELECT c.count, +SELECT d.count, (SELECT SUM(a.count) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as a WHERE a.count != 0) as query_count, -(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as not_zero_region_count, +(SELECT SUM((b.total * 1000000) DIV 1000000) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as query_total, +(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count != 0) as not_zero_region_count, (SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME) as region_count -FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count > 0; -count query_count not_zero_region_count region_count -1 20 3 6 -11 20 3 6 -8 20 3 6 +FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as d WHERE d.count > 0; +count query_count query_total not_zero_region_count region_count +1 20 17 3 6 +11 20 17 3 6 +8 20 17 3 6 SELECT COUNT(*) as region_count FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME; region_count 6 @@ -550,7 +556,7 @@ time 1.000000 1000.000000 1000000.00000 -TOO LONG QUERY +TOO LONG SHOW GLOBAL VARIABLES where Variable_name like 'QUERY_RESPONSE_TIME_RANGE_BASE'; Variable_name Value query_response_time_range_base 1000 diff --git a/percona-suite/percona_query_response_time_show.inc b/percona-suite/percona_query_response_time_show.inc index 761b2c6f0df..709abf9872e 100644 --- a/percona-suite/percona_query_response_time_show.inc +++ b/percona-suite/percona_query_response_time_show.inc @@ -1,7 +1,8 @@ -SELECT c.count, +SELECT d.count, (SELECT SUM(a.count) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as a WHERE a.count != 0) as query_count, -(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as 
not_zero_region_count, +(SELECT SUM((b.total * 1000000) DIV 1000000) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as b WHERE b.count != 0) as query_total, +(SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count != 0) as not_zero_region_count, (SELECT COUNT(*) FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME) as region_count -FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as c WHERE c.count > 0; +FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME as d WHERE d.count > 0; SELECT COUNT(*) as region_count FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME; SELECT time FROM INFORMATION_SCHEMA.QUERY_RESPONSE_TIME; diff --git a/percona-suite/percona_server_variables.result b/percona-suite/percona_server_variables.result index 25f961b201f..cb2af0d3d0e 100644 --- a/percona-suite/percona_server_variables.result +++ b/percona-suite/percona_server_variables.result @@ -77,8 +77,10 @@ innodb_adaptive_checkpoint Value innodb_adaptive_flushing Value innodb_adaptive_hash_index Value innodb_additional_mem_pool_size Value +innodb_auto_lru_dump Value innodb_autoextend_increment Value innodb_autoinc_lock_mode Value +innodb_buffer_pool_shm_checksum Value innodb_buffer_pool_shm_key Value innodb_buffer_pool_size Value innodb_change_buffering Value diff --git a/row/row0merge.c b/row/row0merge.c index 47c03c77850..65102851bdf 100644 --- a/row/row0merge.c +++ b/row/row0merge.c @@ -1787,6 +1787,11 @@ row_merge_copy_blobs( (below). */ data = btr_rec_copy_externally_stored_field( mrec, offsets, zip_size, i, &len, heap); + /* Because we have locked the table, any records + written by incomplete transactions must have been + rolled back already. There must not be any incomplete + BLOB columns. 
*/ + ut_a(data); dfield_set_data(field, data, len); } @@ -2399,7 +2404,7 @@ row_merge_rename_tables( goto err_exit; } - err = dict_load_foreigns(old_name, TRUE); + err = dict_load_foreigns(old_name, FALSE, TRUE); if (err != DB_SUCCESS) { err_exit: diff --git a/row/row0mysql.c b/row/row0mysql.c index 98e2d22c56b..1d8b4be1a25 100644 --- a/row/row0mysql.c +++ b/row/row0mysql.c @@ -576,6 +576,13 @@ handle_new_error: "InnoDB: " REFMAN "forcing-recovery.html" " for help.\n", stderr); break; + case DB_FOREIGN_EXCEED_MAX_CASCADE: + fprintf(stderr, "InnoDB: Cannot delete/update rows with" + " cascading foreign key constraints that exceed max" + " depth of %lu\n" + "Please drop excessive foreign constraints" + " and try again\n", (ulong) DICT_FK_MAX_RECURSIVE_LOAD); + break; default: fprintf(stderr, "InnoDB: unknown error code %lu\n", (ulong) err); @@ -1391,11 +1398,15 @@ row_update_for_mysql( run_again: thr->run_node = node; thr->prev_node = node; + thr->fk_cascade_depth = 0; row_upd_step(thr); err = trx->error_state; + /* Reset fk_cascade_depth back to 0 */ + thr->fk_cascade_depth = 0; + if (err != DB_SUCCESS) { que_thr_stop_for_mysql(thr); @@ -1586,6 +1597,12 @@ row_update_cascade_for_mysql( trx_t* trx; trx = thr_get_trx(thr); + + thr->fk_cascade_depth++; + + if (thr->fk_cascade_depth > FK_MAX_CASCADE_DEL) { + return (DB_FOREIGN_EXCEED_MAX_CASCADE); + } run_again: thr->run_node = node; thr->prev_node = node; @@ -2105,7 +2122,7 @@ row_table_add_foreign_constraints( name, reject_fks); if (err == DB_SUCCESS) { /* Check that also referencing constraints are ok */ - err = dict_load_foreigns(name, TRUE); + err = dict_load_foreigns(name, FALSE, TRUE); } if (err != DB_SUCCESS) { @@ -2798,6 +2815,15 @@ row_truncate_table_for_mysql( trx->table_id = table->id; + /* Lock all index trees for this table, as we will + truncate the table/index and possibly change their metadata. 
+ All DML/DDL are blocked by table level lock, with + a few exceptions such as queries into information schema + about the table, MySQL could try to access index stats + for this kind of query, we need to use index locks to + sync up */ + dict_table_x_lock_indexes(table); + if (table->space && !table->dir_path_of_temp_table) { /* Discard and create the single-table tablespace. */ ulint space = table->space; @@ -2814,6 +2840,7 @@ row_truncate_table_for_mysql( || fil_create_new_single_table_tablespace( space, table->name, FALSE, flags, FIL_IBD_FILE_INITIAL_SIZE) != DB_SUCCESS) { + dict_table_x_unlock_indexes(table); ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: TRUNCATE TABLE %s failed to" @@ -2917,6 +2944,10 @@ next_rec: mem_heap_free(heap); + /* Done with index truncation, release index tree locks, + subsequent work relates to table level metadata change */ + dict_table_x_unlock_indexes(table); + dict_hdr_get_new_id(&new_id, NULL, NULL); info = pars_info_create(); @@ -3966,7 +3997,7 @@ end: an ALTER, not in a RENAME. */ err = dict_load_foreigns( - new_name, !old_is_tmp || trx->check_foreigns); + new_name, FALSE, !old_is_tmp || trx->check_foreigns); if (err != DB_SUCCESS) { ut_print_timestamp(stderr); diff --git a/row/row0row.c b/row/row0row.c index cb7dfa2b7c9..8e806a14a98 100644 --- a/row/row0row.c +++ b/row/row0row.c @@ -294,7 +294,13 @@ row_build( ut_ad(dtuple_check_typed(row)); - if (j) { + if (!ext) { + /* REDUNDANT and COMPACT formats store a local + 768-byte prefix of each externally stored + column. No cache is needed. 
*/ + ut_ad(dict_table_get_format(index->table) + < DICT_TF_FORMAT_ZIP); + } else if (j) { *ext = row_ext_create(j, ext_cols, row, dict_table_zip_size(index->table), heap); diff --git a/row/row0sel.c b/row/row0sel.c index 0db4fb6f3db..a1511e35435 100644 --- a/row/row0sel.c +++ b/row/row0sel.c @@ -416,7 +416,7 @@ row_sel_fetch_columns( field_no))) { /* Copy an externally stored field to the - temporary heap */ + temporary heap, if possible. */ heap = mem_heap_create(1); @@ -425,6 +425,17 @@ row_sel_fetch_columns( dict_table_zip_size(index->table), field_no, &len, heap); + /* data == NULL means that the + externally stored field was not + written yet. This record + should only be seen by + recv_recovery_rollback_active() or any + TRX_ISO_READ_UNCOMMITTED + transactions. The InnoDB SQL parser + (the sole caller of this function) + does not implement READ UNCOMMITTED, + and it is not involved during rollback. */ + ut_a(data); ut_a(len != UNIV_SQL_NULL); needs_copy = TRUE; @@ -926,6 +937,7 @@ row_sel_get_clust_rec( when plan->clust_pcur was positioned. The latch will not be released until mtr_commit(mtr). */ + ut_ad(!rec_get_deleted_flag(clust_rec, rec_offs_comp(offsets))); row_sel_fetch_columns(index, clust_rec, offsets, UT_LIST_GET_FIRST(plan->columns)); *out_rec = clust_rec; @@ -1628,6 +1640,13 @@ skip_lock: } if (old_vers == NULL) { + /* The record does not exist + in our read view. Skip it, but + first attempt to determine + whether the index segment we + are searching through has been + exhausted. */ + offsets = rec_get_offsets( rec, index, offsets, ULINT_UNDEFINED, &heap); @@ -2647,9 +2666,8 @@ Convert a row in the Innobase format to a row in the MySQL format. Note that the template in prebuilt may advise us to copy only a few columns to mysql_rec, other columns are left blank. All columns may not be needed in the query. 
-@return TRUE if success, FALSE if could not allocate memory for a BLOB -(though we may also assert in that case) */ -static +@return TRUE on success, FALSE if not all columns could be retrieved */ +static __attribute__((warn_unused_result)) ibool row_sel_store_mysql_rec( /*====================*/ @@ -2719,6 +2737,21 @@ row_sel_store_mysql_rec( dict_table_zip_size(prebuilt->table), templ->rec_field_no, &len, heap); + if (UNIV_UNLIKELY(!data)) { + /* The externally stored field + was not written yet. This + record should only be seen by + recv_recovery_rollback_active() + or any TRX_ISO_READ_UNCOMMITTED + transactions. */ + + if (extern_field_heap) { + mem_heap_free(extern_field_heap); + } + + return(FALSE); + } + ut_a(len != UNIV_SQL_NULL); } else { /* Field is stored in the row. */ @@ -3136,9 +3169,10 @@ row_sel_pop_cached_row_for_mysql( } /********************************************************************//** -Pushes a row for MySQL to the fetch cache. */ -UNIV_INLINE -void +Pushes a row for MySQL to the fetch cache. +@return TRUE on success, FALSE if the record contains incomplete BLOBs */ +UNIV_INLINE __attribute__((warn_unused_result)) +ibool row_sel_push_cache_row_for_mysql( /*=============================*/ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */ @@ -3180,10 +3214,11 @@ row_sel_push_cache_row_for_mysql( prebuilt->fetch_cache[ prebuilt->n_fetch_cached], prebuilt, rec, offsets))) { - ut_error; + return(FALSE); } prebuilt->n_fetch_cached++; + return(TRUE); } /*********************************************************************//** @@ -3576,14 +3611,25 @@ row_search_for_mysql( row_sel_try_search_shortcut_for_mysql(). The latch will not be released until mtr_commit(&mtr). 
*/ + ut_ad(!rec_get_deleted_flag(rec, comp)); if (!row_sel_store_mysql_rec(buf, prebuilt, rec, offsets)) { - err = DB_TOO_BIG_RECORD; - - /* We let the main loop to do the - error handling */ - goto shortcut_fails_too_big_rec; + /* Only fresh inserts may contain + incomplete externally stored + columns. Pretend that such + records do not exist. Such + records may only be accessed + at the READ UNCOMMITTED + isolation level or when + rolling back a recovered + transaction. Rollback happens + at a lower level, not here. */ + ut_a(trx->isolation_level + == TRX_ISO_READ_UNCOMMITTED); + + /* Proceed as in case SEL_RETRY. */ + break; } mtr_commit(&mtr); @@ -3623,7 +3669,7 @@ release_search_latch_if_needed: default: ut_ad(0); } -shortcut_fails_too_big_rec: + mtr_commit(&mtr); mtr_start(&mtr); } @@ -4217,7 +4263,7 @@ no_gap_lock: rec = old_vers; } - } else if (!lock_sec_rec_cons_read_sees(rec, trx->read_view)) { + } else { /* We are looking into a non-clustered index, and to get the right version of the record we have to look also into the clustered index: this @@ -4225,8 +4271,12 @@ no_gap_lock: information via the clustered index record. */ ut_ad(index != clust_index); + ut_ad(!dict_index_is_clust(index)); - goto requires_clust_rec; + if (!lock_sec_rec_cons_read_sees( + rec, trx->read_view)) { + goto requires_clust_rec; + } } } @@ -4349,8 +4399,13 @@ requires_clust_rec: ULINT_UNDEFINED, &heap); result_rec = rec; } + + /* result_rec can legitimately be delete-marked + now that it has been established that it points to a + clustered index record that exists in the read view. */ } else { result_rec = rec; + ut_ad(!rec_get_deleted_flag(rec, comp)); } /* We found a qualifying record 'result_rec'. At this point, @@ -4382,9 +4437,18 @@ requires_clust_rec: not cache rows because there the cursor is a scrollable cursor. 
*/ - row_sel_push_cache_row_for_mysql(prebuilt, result_rec, - offsets); - if (prebuilt->n_fetch_cached == MYSQL_FETCH_CACHE_SIZE) { + if (!row_sel_push_cache_row_for_mysql(prebuilt, result_rec, + offsets)) { + /* Only fresh inserts may contain incomplete + externally stored columns. Pretend that such + records do not exist. Such records may only be + accessed at the READ UNCOMMITTED isolation + level or when rolling back a recovered + transaction. Rollback happens at a lower + level, not here. */ + ut_a(trx->isolation_level == TRX_ISO_READ_UNCOMMITTED); + } else if (prebuilt->n_fetch_cached + == MYSQL_FETCH_CACHE_SIZE) { goto got_row; } @@ -4400,9 +4464,17 @@ requires_clust_rec: } else { if (!row_sel_store_mysql_rec(buf, prebuilt, result_rec, offsets)) { - err = DB_TOO_BIG_RECORD; - - goto lock_wait_or_error; + /* Only fresh inserts may contain + incomplete externally stored + columns. Pretend that such records do + not exist. Such records may only be + accessed at the READ UNCOMMITTED + isolation level or when rolling back a + recovered transaction. Rollback + happens at a lower level, not here. */ + ut_a(trx->isolation_level + == TRX_ISO_READ_UNCOMMITTED); + goto next_rec; } } diff --git a/row/row0undo.c b/row/row0undo.c index 9ef842b5114..fd28a4f6520 100644 --- a/row/row0undo.c +++ b/row/row0undo.c @@ -199,8 +199,24 @@ row_undo_search_clust_to_pcur( ret = FALSE; } else { + row_ext_t** ext; + + if (dict_table_get_format(node->table) >= DICT_TF_FORMAT_ZIP) { + /* In DYNAMIC or COMPRESSED format, there is + no prefix of externally stored columns in the + clustered index record. Build a cache of + column prefixes. */ + ext = &node->ext; + } else { + /* REDUNDANT and COMPACT formats store a local + 768-byte prefix of each externally stored + column. No cache is needed. 
*/ + ext = NULL; + node->ext = NULL; + } + node->row = row_build(ROW_COPY_DATA, clust_index, rec, - offsets, NULL, &node->ext, node->heap); + offsets, NULL, ext, node->heap); if (node->update) { node->undo_row = dtuple_copy(node->row, node->heap); row_upd_replace(node->undo_row, &node->undo_ext, diff --git a/row/row0upd.c b/row/row0upd.c index d0aaecd3dae..04c3139fcc7 100644 --- a/row/row0upd.c +++ b/row/row0upd.c @@ -1398,6 +1398,7 @@ row_upd_store_row( dict_index_t* clust_index; rec_t* rec; mem_heap_t* heap = NULL; + row_ext_t** ext; ulint offsets_[REC_OFFS_NORMAL_SIZE]; const ulint* offsets; rec_offs_init(offsets_); @@ -1414,8 +1415,22 @@ row_upd_store_row( offsets = rec_get_offsets(rec, clust_index, offsets_, ULINT_UNDEFINED, &heap); + + if (dict_table_get_format(node->table) >= DICT_TF_FORMAT_ZIP) { + /* In DYNAMIC or COMPRESSED format, there is no prefix + of externally stored columns in the clustered index + record. Build a cache of column prefixes. */ + ext = &node->ext; + } else { + /* REDUNDANT and COMPACT formats store a local + 768-byte prefix of each externally stored column. + No cache is needed. 
*/ + ext = NULL; + node->ext = NULL; + } + node->row = row_build(ROW_COPY_DATA, clust_index, rec, offsets, - NULL, &node->ext, node->heap); + NULL, ext, node->heap); if (node->is_delete) { node->upd_row = NULL; node->upd_ext = NULL; @@ -1583,6 +1598,7 @@ row_upd_clust_rec_by_insert( dict_table_t* table; dtuple_t* entry; ulint err; + ibool change_ownership = FALSE; ut_ad(node); ut_ad(dict_index_is_clust(index)); @@ -1615,9 +1631,9 @@ row_upd_clust_rec_by_insert( index = dict_table_get_first_index(table); offsets = rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED, &heap); - btr_cur_mark_extern_inherited_fields( - btr_cur_get_page_zip(btr_cur), - rec, index, offsets, node->update, mtr); + change_ownership = btr_cur_mark_extern_inherited_fields( + btr_cur_get_page_zip(btr_cur), rec, index, offsets, + node->update, mtr); if (check_ref) { /* NOTE that the following call loses the position of pcur ! */ @@ -1646,10 +1662,11 @@ row_upd_clust_rec_by_insert( row_upd_index_entry_sys_field(entry, index, DATA_TRX_ID, trx->id); - if (node->upd_ext) { + if (change_ownership) { /* If we return from a lock wait, for example, we may have extern fields marked as not-owned in entry (marked in the - if-branch above). We must unmark them. */ + if-branch above). We must unmark them, take the ownership + back. */ btr_cur_unmark_dtuple_extern_fields(entry); diff --git a/srv/srv0srv.c b/srv/srv0srv.c index 4f66144b37c..d1a8d2c4dd9 100644 --- a/srv/srv0srv.c +++ b/srv/srv0srv.c @@ -213,6 +213,8 @@ UNIV_INTERN ulint srv_lock_table_size = ULINT_MAX; /* key value for shm */ UNIV_INTERN uint srv_buffer_pool_shm_key = 0; +UNIV_INTERN ibool srv_buffer_pool_shm_is_reused = FALSE; +UNIV_INTERN ibool srv_buffer_pool_shm_checksum = TRUE; /* This parameter is deprecated. Use srv_n_io_[read|write]_threads instead. 
*/ @@ -307,6 +309,9 @@ UNIV_INTERN ulint srv_buf_pool_flushed = 0; reading of a disk page */ UNIV_INTERN ulint srv_buf_pool_reads = 0; +/** Time in seconds between automatic buffer pool dumps */ +UNIV_INTERN uint srv_auto_lru_dump = 0; + /* structure to pass status variables to MySQL */ UNIV_INTERN export_struc export_vars; @@ -2555,6 +2560,56 @@ loop: OS_THREAD_DUMMY_RETURN; } +/*********************************************************************//** +A thread which restores the buffer pool from a dump file on startup and does +periodic buffer pool dumps. +@return a dummy parameter */ +UNIV_INTERN +os_thread_ret_t +srv_LRU_dump_restore_thread( +/*====================*/ + void* arg __attribute__((unused))) + /*!< in: a dummy parameter required by + os_thread_create */ +{ + uint auto_lru_dump; + time_t last_dump_time; + time_t time_elapsed; + +#ifdef UNIV_DEBUG_THREAD_CREATION + fprintf(stderr, "LRU dump/restore thread starts, id %lu\n", + os_thread_pf(os_thread_get_curr_id())); +#endif + + if (srv_auto_lru_dump) + buf_LRU_file_restore(); + + last_dump_time = time(NULL); + +loop: + os_thread_sleep(5000000); + + if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) { + goto exit_func; + } + + time_elapsed = time(NULL) - last_dump_time; + auto_lru_dump = srv_auto_lru_dump; + if (auto_lru_dump > 0 && (time_t) auto_lru_dump < time_elapsed) { + last_dump_time = time(NULL); + buf_LRU_file_dump(); + } + + goto loop; +exit_func: + /* We count the number of threads in os_thread_exit(). A created + thread should always use that to exit and not use return() to exit. */ + + os_thread_exit(NULL); + + OS_THREAD_DUMMY_RETURN; +} + /*******************************************************************//** Tells the InnoDB server that there has been activity in the database and wakes up the master thread if it is suspended (not sleeping). 
Used diff --git a/srv/srv0start.c b/srv/srv0start.c index 93fccabeff6..b36faf2d2d7 100644 --- a/srv/srv0start.c +++ b/srv/srv0start.c @@ -126,9 +126,9 @@ static mutex_t ios_mutex; static ulint ios; /** io_handler_thread parameters for thread identification */ -static ulint n[SRV_MAX_N_IO_THREADS + 6 + 64]; +static ulint n[SRV_MAX_N_IO_THREADS + 7 + 64]; /** io_handler_thread identifiers */ -static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + 64]; +static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 7 + 64]; /** We use this mutex to test the return value of pthread_mutex_trylock on successful locking. HP-UX does NOT return 0, though Linux et al do. */ @@ -1719,8 +1719,8 @@ innobase_start_or_create_for_mysql(void) Note that this is not as heavy weight as it seems. At this point there will be only ONE page in the buf_LRU and there must be no page in the buf_flush list. */ - /* TODO: treat more correctly */ - if (!srv_buffer_pool_shm_key) + /* buffer_pool_shm should not be reused when recovery was needed. 
*/ + if (!srv_buffer_pool_shm_is_reused) buf_pool_invalidate(); /* We always try to do a recovery, even if the database had @@ -1835,6 +1835,10 @@ innobase_start_or_create_for_mysql(void) os_thread_create(&srv_monitor_thread, NULL, thread_ids + 4 + SRV_MAX_N_IO_THREADS); + /* Create the thread which automatically dumps/restores buffer pool */ + os_thread_create(&srv_LRU_dump_restore_thread, NULL, + thread_ids + 5 + SRV_MAX_N_IO_THREADS); + srv_is_being_started = FALSE; if (trx_doublewrite == NULL) { @@ -1859,13 +1863,13 @@ innobase_start_or_create_for_mysql(void) ulint i; os_thread_create(&srv_purge_thread, NULL, thread_ids - + (5 + SRV_MAX_N_IO_THREADS)); + + (6 + SRV_MAX_N_IO_THREADS)); for (i = 0; i < srv_use_purge_thread - 1; i++) { - n[6 + i + SRV_MAX_N_IO_THREADS] = i; /* using as index for arrays in purge_sys */ + n[7 + i + SRV_MAX_N_IO_THREADS] = i; /* using as index for arrays in purge_sys */ os_thread_create(&srv_purge_worker_thread, - n + (6 + i + SRV_MAX_N_IO_THREADS), - thread_ids + (6 + i + SRV_MAX_N_IO_THREADS)); + n + (7 + i + SRV_MAX_N_IO_THREADS), + thread_ids + (7 + i + SRV_MAX_N_IO_THREADS)); } } #ifdef UNIV_DEBUG @@ -2213,9 +2217,13 @@ innobase_shutdown_for_mysql(void) pars_lexer_close(); log_mem_free(); buf_pool_free(); - ut_free_all_mem(); mem_close(); + /* ut_free_all_mem() frees all allocated memory not freed yet + in shutdown, and it will also free the ut_list_mutex, so it + should be the last one for all operation */ + ut_free_all_mem(); + if (os_thread_count != 0 || os_event_count != 0 || os_mutex_count != 0 diff --git a/trx/trx0sys.c b/trx/trx0sys.c index ad4471ada0b..11581a3f2ae 100644 --- a/trx/trx0sys.c +++ b/trx/trx0sys.c @@ -541,8 +541,8 @@ start_again: log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE); fprintf(stderr, "InnoDB: Doublewrite buffer created in the doublewrite file\n"); + trx_sys_multiple_tablespace_format = TRUE; } - trx_doublewrite_buf_is_being_created = FALSE; } } diff --git a/trx/trx0undo.c b/trx/trx0undo.c index 
063a2f2b8a6..ec4beb5660a 100644 --- a/trx/trx0undo.c +++ b/trx/trx0undo.c @@ -1976,7 +1976,8 @@ trx_undo_update_cleanup( UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_cached, undo); } else { - ut_ad(undo->state == TRX_UNDO_TO_PURGE); + ut_ad(undo->state == TRX_UNDO_TO_PURGE + || undo->state == TRX_UNDO_TO_FREE); trx_undo_mem_free(undo); } diff --git a/ut/ut0mem.c b/ut/ut0mem.c index 35a325b9ccd..bf55e4273b6 100644 --- a/ut/ut0mem.c +++ b/ut/ut0mem.c @@ -290,7 +290,8 @@ ut_test_malloc( #endif /* !UNIV_HOTBACKUP */ /**********************************************************************//** -Frees a memory block allocated with ut_malloc. */ +Frees a memory block allocated with ut_malloc. Freeing a NULL pointer is +a nop. */ UNIV_INTERN void ut_free( @@ -300,7 +301,9 @@ ut_free( #ifndef UNIV_HOTBACKUP ut_mem_block_t* block; - if (UNIV_LIKELY(srv_use_sys_malloc)) { + if (ptr == NULL) { + return; + } else if (UNIV_LIKELY(srv_use_sys_malloc)) { free(ptr); return; } |