diff options
Diffstat (limited to 'innobase')
47 files changed, 1271 insertions, 375 deletions
diff --git a/innobase/btr/btr0btr.c b/innobase/btr/btr0btr.c index e4e957ea7b6..33a3ac70c90 100644 --- a/innobase/btr/btr0btr.c +++ b/innobase/btr/btr0btr.c @@ -21,7 +21,7 @@ Created 6/2/1994 Heikki Tuuri #include "lock0lock.h" #include "ibuf0ibuf.h" -/* +/** Node pointers ------------- Leaf pages of a B-tree contain the index records stored in the @@ -2255,6 +2255,7 @@ btr_index_rec_validate( ulint len; ulint n; ulint i; + char err_buf[1000]; n = dict_index_get_n_fields(index); @@ -2262,6 +2263,9 @@ btr_index_rec_validate( fprintf(stderr, "Record has %lu fields, should have %lu\n", rec_get_n_fields(rec), n); + rec_sprintf(err_buf, 900, rec); + fprintf(stderr, "InnoDB: record %s\n", err_buf); + return(FALSE); } @@ -2276,6 +2280,9 @@ btr_index_rec_validate( "Record field %lu len is %lu, should be %lu\n", i, len, dtype_get_fixed_size(type)); + rec_sprintf(err_buf, 900, rec); + fprintf(stderr, "InnoDB: record %s\n", err_buf); + return(FALSE); } } @@ -2330,7 +2337,6 @@ btr_validate_level( ulint level) /* in: level number */ { ulint space; - mtr_t mtr; page_t* page; page_t* right_page; page_t* father_page; @@ -2344,6 +2350,8 @@ btr_validate_level( dtuple_t* node_ptr_tuple; ibool ret = TRUE; dict_index_t* index; + mtr_t mtr; + char err_buf[1000]; mtr_start(&mtr); @@ -2382,9 +2390,9 @@ loop: if (level == 0) { if (!btr_index_page_validate(page, index)) { fprintf(stderr, - "Error in page %lu in index %s\n", - buf_frame_get_page_no(page), index->name); - + "Error in page %lu in index %s, level %lu\n", + buf_frame_get_page_no(page), index->name, + level); ret = FALSE; } } @@ -2402,12 +2410,32 @@ loop: right_page = btr_page_get(space, right_page_no, RW_X_LATCH, &mtr); - ut_a(cmp_rec_rec(page_rec_get_prev(page_get_supremum_rec(page)), + if (cmp_rec_rec(page_rec_get_prev(page_get_supremum_rec(page)), page_rec_get_next(page_get_infimum_rec(right_page)), - UT_LIST_GET_FIRST(tree->tree_indexes)) < 0); + UT_LIST_GET_FIRST(tree->tree_indexes)) >= 0) { + + fprintf(stderr, + "InnoDB: 
Error on pages %lu and %lu in index %s\n", + buf_frame_get_page_no(page), + right_page_no, + index->name); + + fprintf(stderr, + "InnoDB: records in wrong order on adjacent pages\n"); + + rec_sprintf(err_buf, 900, + page_rec_get_prev(page_get_supremum_rec(page))); + fprintf(stderr, "InnoDB: record %s\n", err_buf); + + rec_sprintf(err_buf, 900, + page_rec_get_next(page_get_infimum_rec(right_page))); + fprintf(stderr, "InnoDB: record %s\n", err_buf); + + ret = FALSE; + } } - if ((level > 0) && (left_page_no == FIL_NULL)) { + if (level > 0 && left_page_no == FIL_NULL) { ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits( page_rec_get_next(page_get_infimum_rec(page)))); } @@ -2418,8 +2446,38 @@ loop: node_ptr = btr_page_get_father_node_ptr(tree, page, &mtr); - ut_a(node_ptr == btr_page_get_father_for_rec(tree, page, - page_rec_get_prev(page_get_supremum_rec(page)), &mtr)); + if (btr_node_ptr_get_child_page_no(node_ptr) != + buf_frame_get_page_no(page) + || node_ptr != btr_page_get_father_for_rec(tree, page, + page_rec_get_prev(page_get_supremum_rec(page)), + &mtr)) { + fprintf(stderr, + "InnoDB: Error on page %lu in index %s\n", + buf_frame_get_page_no(page), + index->name); + + fprintf(stderr, + "InnoDB: node pointer to the page is wrong\n"); + + rec_sprintf(err_buf, 900, node_ptr); + + fprintf(stderr, "InnoDB: node ptr %s\n", err_buf); + + fprintf(stderr, + "InnoDB: node ptr child page n:o %lu\n", + btr_node_ptr_get_child_page_no(node_ptr)); + + rec_sprintf(err_buf, 900, + btr_page_get_father_for_rec(tree, page, + page_rec_get_prev(page_get_supremum_rec(page)), + &mtr)); + + fprintf(stderr, "InnoDB: record on page %s\n", + err_buf); + ret = FALSE; + + goto node_ptr_fails; + } father_page = buf_frame_align(node_ptr); @@ -2431,7 +2489,33 @@ loop: page_rec_get_next( page_get_infimum_rec(page)), 0, heap); - ut_a(cmp_dtuple_rec(node_ptr_tuple, node_ptr) == 0); + + if (cmp_dtuple_rec(node_ptr_tuple, node_ptr) != 0) { + + fprintf(stderr, + "InnoDB: Error on page %lu in index 
%s\n", + buf_frame_get_page_no(page), + index->name); + + fprintf(stderr, + "InnoDB: Error: node ptrs differ on levels > 0\n"); + + rec_sprintf(err_buf, 900, node_ptr); + + fprintf(stderr, "InnoDB: node ptr %s\n", + err_buf); + rec_sprintf(err_buf, 900, + page_rec_get_next( + page_get_infimum_rec(page))); + + fprintf(stderr, "InnoDB: first rec %s\n", + err_buf); + ret = FALSE; + mem_heap_free(heap); + + goto node_ptr_fails; + } + mem_heap_free(heap); } @@ -2454,21 +2538,51 @@ loop: if (page_rec_get_next(node_ptr) != page_get_supremum_rec(father_page)) { - ut_a(right_node_ptr == - page_rec_get_next(node_ptr)); + if (right_node_ptr != + page_rec_get_next(node_ptr)) { + ret = FALSE; + fprintf(stderr, + "InnoDB: node pointer to the right page is wrong\n"); + + fprintf(stderr, + "InnoDB: Error on page %lu in index %s\n", + buf_frame_get_page_no(page), + index->name); + } } else { right_father_page = buf_frame_align( right_node_ptr); - ut_a(right_node_ptr == page_rec_get_next( + if (right_node_ptr != page_rec_get_next( page_get_infimum_rec( - right_father_page))); - ut_a(buf_frame_get_page_no(right_father_page) - == btr_page_get_next(father_page, &mtr)); + right_father_page))) { + ret = FALSE; + fprintf(stderr, + "InnoDB: node pointer 2 to the right page is wrong\n"); + + fprintf(stderr, + "InnoDB: Error on page %lu in index %s\n", + buf_frame_get_page_no(page), + index->name); + } + + if (buf_frame_get_page_no(right_father_page) + != btr_page_get_next(father_page, &mtr)) { + + ret = FALSE; + fprintf(stderr, + "InnoDB: node pointer 3 to the right page is wrong\n"); + + fprintf(stderr, + "InnoDB: Error on page %lu in index %s\n", + buf_frame_get_page_no(page), + index->name); + } } } } +node_ptr_fails: mtr_commit(&mtr); if (right_page_no != FIL_NULL) { diff --git a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c index 25c74f21fd3..4217a3ba99b 100644 --- a/innobase/buf/buf0flu.c +++ b/innobase/buf/buf0flu.c @@ -410,9 +410,9 @@ buf_flush_try_page( block->io_fix = 
BUF_IO_WRITE; block->flush_type = flush_type; - if (buf_pool->n_flush[block->flush_type] == 0) { + if (buf_pool->n_flush[flush_type] == 0) { - os_event_reset(buf_pool->no_flush[block->flush_type]); + os_event_reset(buf_pool->no_flush[flush_type]); } (buf_pool->n_flush[flush_type])++; @@ -460,6 +460,11 @@ buf_flush_try_page( block->io_fix = BUF_IO_WRITE; block->flush_type = flush_type; + if (buf_pool->n_flush[flush_type] == 0) { + + os_event_reset(buf_pool->no_flush[flush_type]); + } + (buf_pool->n_flush[flush_type])++; rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE); @@ -609,7 +614,7 @@ buf_flush_batch( ut_ad((flush_type == BUF_FLUSH_LRU) || (flush_type == BUF_FLUSH_LIST)); ut_ad((flush_type != BUF_FLUSH_LIST) || sync_thread_levels_empty_gen(TRUE)); - + mutex_enter(&(buf_pool->mutex)); if ((buf_pool->n_flush[flush_type] > 0) diff --git a/innobase/buf/buf0lru.c b/innobase/buf/buf0lru.c index eb63fa99f4a..c3118544492 100644 --- a/innobase/buf/buf0lru.c +++ b/innobase/buf/buf0lru.c @@ -10,6 +10,7 @@ Created 11/5/1995 Heikki Tuuri #ifdef UNIV_NONINL #include "buf0lru.ic" +#include "srv0srv.h" /* Needed to getsrv_print_innodb_monitor */ #endif #include "ut0byte.h" @@ -107,20 +108,15 @@ buf_LRU_search_and_free_block( means that we should search farther */ { buf_block_t* block; - ulint distance; ibool freed; - ulint i; mutex_enter(&(buf_pool->mutex)); freed = FALSE; - distance = BUF_LRU_FREE_SEARCH_LEN * (1 + n_iterations / 5); - - i = 0; block = UT_LIST_GET_LAST(buf_pool->LRU); - while (i < distance && block != NULL) { + while (block != NULL) { if (buf_flush_ready_for_replace(block)) { @@ -203,6 +199,8 @@ buf_LRU_get_free_block(void) buf_block_t* block = NULL; ibool freed; ulint n_iterations = 0; + ibool mon_value_was; + ibool started_monitor = FALSE; loop: mutex_enter(&(buf_pool->mutex)); @@ -222,7 +220,11 @@ loop: block->state = BUF_BLOCK_READY_FOR_USE; mutex_exit(&(buf_pool->mutex)); - + + if (started_monitor) { + srv_print_innodb_monitor = mon_value_was; + } + 
return(block); } @@ -237,14 +239,41 @@ loop: goto loop; } - /* No free block was found near the end of the list: try to flush - the LRU list */ - - buf_flush_free_margin(); + if (n_iterations > 30) { + ut_print_timestamp(stderr); + fprintf(stderr, + " ***********************************************\n" + "InnoDB: Warning: difficult to find free blocks from\n" + "InnoDB: the buffer pool (%lu search iterations)! Consider\n" + "InnoDB: increasing the buffer pool size.\n", + n_iterations); + fprintf(stderr, + "InnoDB: It is also possible that in your Unix version\n" + "InnoDB: fsync is very slow, or completely frozen inside\n" + "InnoDB: the OS kernel. Then upgrading to a newer version\n" + "InnoDB: of your operating system may help. Look at the\n" + "InnoDB: number of fsyncs in diagnostic info below.\n"); + + fprintf(stderr, + "InnoDB: Pending flushes (fsync) log: %lu; buffer pool: %lu\n", + fil_n_pending_log_flushes, + fil_n_pending_tablespace_flushes); + fprintf(stderr, + "InnoDB: %lu OS file reads, %lu OS file writes, %lu OS fsyncs\n", + os_n_file_reads, os_n_file_writes, os_n_fsyncs); + + fprintf(stderr, + "InnoDB: Starting InnoDB Monitor to print further\n" + "InnoDB: diagnostics to the standard output.\n"); + + mon_value_was = srv_print_innodb_monitor; + started_monitor = TRUE; + srv_print_innodb_monitor = TRUE; + } - os_event_wait(buf_pool->no_flush[BUF_FLUSH_LRU]); + /* No free block was found: try to flush the LRU list */ - n_iterations++; + buf_flush_free_margin(); os_aio_simulated_wake_handler_threads(); @@ -253,18 +282,7 @@ loop: os_thread_sleep(500000); } - if (n_iterations > 20) { -/* buf_print(); - os_aio_print(); - rw_lock_list_print_info(); -*/ - if (n_iterations > 30) { - fprintf(stderr, - "InnoDB: Warning: difficult to find free blocks from\n" - "InnoDB: the buffer pool (%lu search iterations)! 
Consider\n" - "InnoDB: increasing the buffer pool size.\n", n_iterations); - } - } + n_iterations++; goto loop; } diff --git a/innobase/dict/dict0boot.c b/innobase/dict/dict0boot.c index 35fdfce16a6..206fbe32940 100644 --- a/innobase/dict/dict0boot.c +++ b/innobase/dict/dict0boot.c @@ -24,6 +24,65 @@ Created 4/18/1996 Heikki Tuuri #include "os0file.h" /************************************************************************** +Gets a pointer to the dictionary header and x-latches its page. */ + +dict_hdr_t* +dict_hdr_get( +/*=========*/ + /* out: pointer to the dictionary header, + page x-latched */ + mtr_t* mtr) /* in: mtr */ +{ + dict_hdr_t* header; + + ut_ad(mtr); + + header = DICT_HDR + buf_page_get(DICT_HDR_SPACE, DICT_HDR_PAGE_NO, + RW_X_LATCH, mtr); + buf_page_dbg_add_level(header, SYNC_DICT_HEADER); + + return(header); +} + +/************************************************************************** +Returns a new table, index, or tree id. */ + +dulint +dict_hdr_get_new_id( +/*================*/ + /* out: the new id */ + ulint type) /* in: DICT_HDR_ROW_ID, ... */ +{ + dict_hdr_t* dict_hdr; + dulint id; + mtr_t mtr; + + ut_ad((type == DICT_HDR_TABLE_ID) || (type == DICT_HDR_INDEX_ID) + || (type == DICT_HDR_MIX_ID)); + + mtr_start(&mtr); + + dict_hdr = dict_hdr_get(&mtr); + + id = mtr_read_dulint(dict_hdr + type, MLOG_8BYTES, &mtr); + + /* Add some dummy code here because otherwise pgcc seems to + compile wrong */ + + if (0 == ut_dulint_cmp(id, ut_dulint_max)) { + printf("Max id\n"); + } + + id = ut_dulint_add(id, 1); + + mlog_write_dulint(dict_hdr + type, id, MLOG_8BYTES, &mtr); + + mtr_commit(&mtr); + + return(id); +} + +/************************************************************************** Writes the current value of the row id counter to the dictionary header file page. 
*/ diff --git a/innobase/dict/dict0crea.c b/innobase/dict/dict0crea.c index 9d79983c9e5..b3bf9157e18 100644 --- a/innobase/dict/dict0crea.c +++ b/innobase/dict/dict0crea.c @@ -1076,7 +1076,7 @@ dict_create_or_check_foreign_constraint_tables(void) } fprintf(stderr, - "InnoDB: creating foreign key constraint system tables\n"); + "InnoDB: Creating foreign key constraint system tables\n"); /* NOTE: in dict_load_foreigns we use the fact that there are 2 secondary indexes on SYS_FOREIGN, and they @@ -1112,6 +1112,8 @@ dict_create_or_check_foreign_constraint_tables(void) error = trx->error_state; if (error != DB_SUCCESS) { + fprintf(stderr, "InnoDB: error %lu in creation\n", error); + ut_a(error == DB_OUT_OF_FILE_SPACE); fprintf(stderr, "InnoDB: creation failed\n"); @@ -1133,7 +1135,7 @@ dict_create_or_check_foreign_constraint_tables(void) if (error == DB_SUCCESS) { fprintf(stderr, - "InnoDB: foreign key constraint system tables created\n"); + "InnoDB: Foreign key constraint system tables created\n"); } mutex_exit(&(dict_sys->mutex)); diff --git a/innobase/dict/dict0dict.c b/innobase/dict/dict0dict.c index e0a7fd327a5..07a9b472d66 100644 --- a/innobase/dict/dict0dict.c +++ b/innobase/dict/dict0dict.c @@ -246,7 +246,7 @@ dict_table_get_index_noninline( } /************************************************************************ -Initializes the autoinc counter. It is not an error to initialize already +Initializes the autoinc counter. It is not an error to initialize an already initialized counter. */ void @@ -2811,3 +2811,63 @@ dict_field_print_low( printf(" %s", field->name); } + +/************************************************************************** +Sprintfs to a string info on foreign keys of a table. 
*/ + +void +dict_print_info_on_foreign_keys( +/*============================*/ + char* str, /* in/out: pointer to a string */ + ulint len, /* in: space in str available for info */ + dict_table_t* table) /* in: table */ +{ + dict_foreign_t* foreign; + ulint i; + char* buf2; + char buf[10000]; + + buf2 = buf; + + mutex_enter(&(dict_sys->mutex)); + + foreign = UT_LIST_GET_FIRST(table->foreign_list); + + if (foreign == NULL) { + mutex_exit(&(dict_sys->mutex)); + + return; + } + + while (foreign != NULL) { + buf2 += sprintf(buf2, "; ("); + + for (i = 0; i < foreign->n_fields; i++) { + buf2 += sprintf(buf2, "%s", + foreign->foreign_col_names[i]); + if (i + 1 < foreign->n_fields) { + buf2 += sprintf(buf2, " "); + } + } + + buf2 += sprintf(buf2, ") REFER %s(", + foreign->referenced_table_name); + + for (i = 0; i < foreign->n_fields; i++) { + buf2 += sprintf(buf2, "%s", + foreign->referenced_col_names[i]); + if (i + 1 < foreign->n_fields) { + buf2 += sprintf(buf2, " "); + } + } + + buf2 += sprintf(buf2, ")"); + + foreign = UT_LIST_GET_NEXT(foreign_list, foreign); + } + + mutex_exit(&(dict_sys->mutex)); + + buf[len - 1] = '\0'; + ut_memcpy(str, buf, len); +} diff --git a/innobase/ibuf/ibuf0ibuf.c b/innobase/ibuf/ibuf0ibuf.c index d289b176efa..bd1f0e6e1d8 100644 --- a/innobase/ibuf/ibuf0ibuf.c +++ b/innobase/ibuf/ibuf0ibuf.c @@ -2024,7 +2024,7 @@ ibuf_insert_low( ulint n_stored; ulint bits; - ut_ad(!(index->type & (DICT_UNIQUE | DICT_CLUSTERED))); + ut_a(!(index->type & (DICT_UNIQUE | DICT_CLUSTERED))); ut_ad(dtuple_check_typed(entry)); do_merge = FALSE; diff --git a/innobase/include/dict0boot.h b/innobase/include/dict0boot.h index 71180439913..cb631be7e35 100644 --- a/innobase/include/dict0boot.h +++ b/innobase/include/dict0boot.h @@ -22,7 +22,7 @@ typedef byte dict_hdr_t; /************************************************************************** Gets a pointer to the dictionary header and x-latches its page. 
*/ -UNIV_INLINE + dict_hdr_t* dict_hdr_get( /*=========*/ @@ -31,7 +31,7 @@ dict_hdr_get( mtr_t* mtr); /* in: mtr */ /************************************************************************** Returns a new row, table, index, or tree id. */ -UNIV_INLINE + dulint dict_hdr_get_new_id( /*================*/ diff --git a/innobase/include/dict0boot.ic b/innobase/include/dict0boot.ic index 8f1e214701f..8a91feed018 100644 --- a/innobase/include/dict0boot.ic +++ b/innobase/include/dict0boot.ic @@ -16,58 +16,6 @@ dict_hdr_flush_row_id(void); /************************************************************************** -Gets a pointer to the dictionary header and x-latches its page. */ -UNIV_INLINE -dict_hdr_t* -dict_hdr_get( -/*=========*/ - /* out: pointer to the dictionary header, - page x-latched */ - mtr_t* mtr) /* in: mtr */ -{ - dict_hdr_t* header; - - ut_ad(mtr); - - header = DICT_HDR + buf_page_get(DICT_HDR_SPACE, DICT_HDR_PAGE_NO, - RW_X_LATCH, mtr); - buf_page_dbg_add_level(header, SYNC_DICT_HEADER); - - return(header); -} - -/************************************************************************** -Returns a new table, index, or tree id. */ -UNIV_INLINE -dulint -dict_hdr_get_new_id( -/*================*/ - /* out: the new id */ - ulint type) /* in: DICT_HDR_ROW_ID, ... */ -{ - dict_hdr_t* dict_hdr; - dulint id; - mtr_t mtr; - - ut_ad((type == DICT_HDR_TABLE_ID) || (type == DICT_HDR_INDEX_ID) - || (type == DICT_HDR_MIX_ID)); - - mtr_start(&mtr); - - dict_hdr = dict_hdr_get(&mtr); - - id = mtr_read_dulint(dict_hdr + type, MLOG_8BYTES, &mtr); - - id = ut_dulint_add(id, 1); - - mlog_write_dulint(dict_hdr + type, id, MLOG_8BYTES, &mtr); - - mtr_commit(&mtr); - - return(id); -} - -/************************************************************************** Returns a new row id. 
*/ UNIV_INLINE dulint diff --git a/innobase/include/dict0dict.h b/innobase/include/dict0dict.h index 56b55b8a417..a4ab4faa25c 100644 --- a/innobase/include/dict0dict.h +++ b/innobase/include/dict0dict.h @@ -261,6 +261,15 @@ void dict_table_print_by_name( /*=====================*/ char* name); +/************************************************************************** +Sprintfs to a string info on foreign keys of a table. */ + +void +dict_print_info_on_foreign_keys( +/*============================*/ + char* str, /* in/out: pointer to a string */ + ulint len, /* in: space in str available for info */ + dict_table_t* table); /* in: table */ /************************************************************************ Gets the first index on the table (the clustered index). */ UNIV_INLINE diff --git a/innobase/include/os0file.h b/innobase/include/os0file.h index 75bbbba549f..411a9fb2c21 100644 --- a/innobase/include/os0file.h +++ b/innobase/include/os0file.h @@ -106,6 +106,10 @@ log. */ #define OS_WIN95 2 #define OS_WINNT 3 +extern ulint os_n_file_reads; +extern ulint os_n_file_writes; +extern ulint os_n_fsyncs; + /*************************************************************************** Gets the operating system version. Currently works only on Windows. */ diff --git a/innobase/include/os0sync.h b/innobase/include/os0sync.h index 78374cf8ede..26f7dff5d8b 100644 --- a/innobase/include/os0sync.h +++ b/innobase/include/os0sync.h @@ -14,7 +14,7 @@ Created 9/6/1995 Heikki Tuuri #ifdef __WIN__ #define os_fast_mutex_t CRITICAL_SECTION -typedef void* os_event_t; +typedef void* os_event_t; #else diff --git a/innobase/include/page0page.ic b/innobase/include/page0page.ic index a029604c2bc..6e33fe2ca5d 100644 --- a/innobase/include/page0page.ic +++ b/innobase/include/page0page.ic @@ -396,45 +396,6 @@ page_rec_check( return(TRUE); } -/****************************************************************** -Used to check the consistency of a directory slot. 
*/ -UNIV_INLINE -ibool -page_dir_slot_check( -/*================*/ - /* out: TRUE if succeed */ - page_dir_slot_t* slot) /* in: slot */ -{ - page_t* page; - ulint n_slots; - ulint n_owned; - - ut_a(slot); - - page = buf_frame_align(slot); - - n_slots = page_header_get_field(page, PAGE_N_DIR_SLOTS); - - ut_a(slot <= page_dir_get_nth_slot(page, 0)); - ut_a(slot >= page_dir_get_nth_slot(page, n_slots - 1)); - - ut_a(page_rec_check(page + mach_read_from_2(slot))); - - n_owned = rec_get_n_owned(page + mach_read_from_2(slot)); - - if (slot == page_dir_get_nth_slot(page, 0)) { - ut_a(n_owned == 1); - } else if (slot == page_dir_get_nth_slot(page, n_slots - 1)) { - ut_a(n_owned >= 1); - ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED); - } else { - ut_a(n_owned >= PAGE_DIR_SLOT_MIN_N_OWNED); - ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED); - } - - return(TRUE); -} - /******************************************************************* Gets the record pointed to by a directory slot. */ UNIV_INLINE diff --git a/innobase/include/read0read.h b/innobase/include/read0read.h index dea952c8547..cebb2d6701c 100644 --- a/innobase/include/read0read.h +++ b/innobase/include/read0read.h @@ -53,6 +53,13 @@ read_view_sees_trx_id( /* out: TRUE if sees */ read_view_t* view, /* in: read view */ dulint trx_id); /* in: trx id */ +/************************************************************************* +Prints a read view to stderr. 
*/ + +void +read_view_print( +/*============*/ + read_view_t* view); /* in: read view */ /* Read view lists the trx ids of those transactions for which a consistent diff --git a/innobase/include/row0mysql.h b/innobase/include/row0mysql.h index 4e90c0ac590..32354219e64 100644 --- a/innobase/include/row0mysql.h +++ b/innobase/include/row0mysql.h @@ -342,6 +342,12 @@ struct row_prebuilt_struct { the row id: in this case this flag is set to TRUE */ dict_index_t* index; /* current index for a search, if any */ + ulint read_just_key; /* set to 1 when MySQL calls + ha_innobase::extra with the + argument HA_EXTRA_KEYREAD; it is enough + to read just columns defined in + the index (i.e., no read of the + clustered index record necessary) */ ulint template_type; /* ROW_MYSQL_WHOLE_ROW, ROW_MYSQL_REC_FIELDS, ROW_MYSQL_DUMMY_TEMPLATE, or diff --git a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h index ca89de4b9a8..df8e85d05f7 100644 --- a/innobase/include/srv0srv.h +++ b/innobase/include/srv0srv.h @@ -15,6 +15,7 @@ Created 10/10/1995 Heikki Tuuri #include "os0sync.h" #include "com0com.h" #include "que0types.h" +#include "trx0types.h" /* When this event is set the lock timeout and InnoDB monitor @@ -64,6 +65,8 @@ extern ulint srv_lock_wait_timeout; extern char* srv_unix_file_flush_method_str; extern ulint srv_unix_file_flush_method; extern ulint srv_force_recovery; +extern ulint srv_thread_concurrency; +extern ibool srv_fast_shutdown; extern ibool srv_use_doublewrite_buf; @@ -83,6 +86,9 @@ extern ibool srv_print_innodb_tablespace_monitor; extern ibool srv_print_verbose_log; extern ibool srv_print_innodb_table_monitor; +extern ibool srv_lock_timeout_and_monitor_active; +extern ibool srv_error_monitor_active; + extern ulint srv_n_spin_wait_rounds; extern ulint srv_spin_wait_delay; extern ibool srv_priority_boost; @@ -160,7 +166,11 @@ of lower numbers are included. 
*/ #define SRV_FORCE_NO_IBUF_MERGE 4 /* prevent also ibuf operations: if they would cause a crash, better not do them */ -#define SRV_FORCE_NO_LOG_REDO 5 /* do not do the log roll-forward +#define SRV_FORCE_NO_UNDO_LOG_SCAN 5 /* do not look at undo logs when + starting the database: InnoDB will + treat even incomplete transactions + as committed */ +#define SRV_FORCE_NO_LOG_REDO 6 /* do not do the log roll-forward in connection with recovery */ /************************************************************************* @@ -235,6 +245,29 @@ mutex, for performace reasons). */ void srv_active_wake_master_thread(void); /*===============================*/ +/************************************************************************* +Puts an OS thread to wait if there are too many concurrent threads +(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */ + +void +srv_conc_enter_innodb( +/*==================*/ + trx_t* trx); /* in: transaction object associated with the + thread */ +/************************************************************************* +This lets a thread enter InnoDB regardless of the number of threads inside +InnoDB. This must be called when a thread ends a lock wait. */ + +void +srv_conc_force_enter_innodb(void); +/*=============================*/ +/************************************************************************* +This must be called when a thread exits InnoDB. This must also be called +when a thread goes to wait for a lock. */ + +void +srv_conc_exit_innodb(void); +/*======================*/ /******************************************************************* Puts a MySQL OS thread to wait for a lock to be released. 
*/ diff --git a/innobase/include/srv0start.h b/innobase/include/srv0start.h index e2b20f3b5fc..6dbdcd27250 100644 --- a/innobase/include/srv0start.h +++ b/innobase/include/srv0start.h @@ -29,6 +29,15 @@ innobase_shutdown_for_mysql(void); /*=============================*/ /* out: DB_SUCCESS or error code */ -extern ibool srv_startup_is_before_trx_rollback_phase; +extern ibool srv_startup_is_before_trx_rollback_phase; +extern ibool srv_is_being_shut_down; + +/* At a shutdown the value first climbs from 0 to SRV_SHUTDOWN_CLEANUP +and then to SRV_SHUTDOWN_LAST_PHASE */ + +extern ulint srv_shutdown_state; + +#define SRV_SHUTDOWN_CLEANUP 1 +#define SRV_SHUTDOWN_LAST_PHASE 2 #endif diff --git a/innobase/include/trx0purge.h b/innobase/include/trx0purge.h index 8870ebc936c..087be2f060e 100644 --- a/innobase/include/trx0purge.h +++ b/innobase/include/trx0purge.h @@ -94,6 +94,12 @@ trx_purge(void); /*===========*/ /* out: number of undo log pages handled in the batch */ +/********************************************************************** +Prints information of the purge system to stderr. */ + +void +trx_purge_sys_print(void); +/*======================*/ /* The control structure used in the purge operation */ struct trx_purge_struct{ diff --git a/innobase/include/trx0rec.h b/innobase/include/trx0rec.h index edfc283d1b2..50d942d9040 100644 --- a/innobase/include/trx0rec.h +++ b/innobase/include/trx0rec.h @@ -127,7 +127,9 @@ Builds an update vector based on a remaining part of an undo log record. 
*/ byte* trx_undo_update_rec_get_update( /*===========================*/ - /* out: remaining part of the record */ + /* out: remaining part of the record, + NULL if an error detected, which means that + the record is corrupted */ byte* ptr, /* in: remaining part in update undo log record, after reading the row reference NOTE that this copy of the undo log record must @@ -235,7 +237,8 @@ trx_undo_prev_version_build( /*========================*/ /* out: DB_SUCCESS, or DB_MISSING_HISTORY if the previous version is not >= purge_view, - which means that it may have been removed */ + which means that it may have been removed, + DB_ERROR if corrupted record */ rec_t* index_rec,/* in: clustered index record in the index tree */ mtr_t* index_mtr,/* in: mtr which contains the latch to diff --git a/innobase/include/trx0trx.h b/innobase/include/trx0trx.h index 8db0b39d3b4..58cef01b376 100644 --- a/innobase/include/trx0trx.h +++ b/innobase/include/trx0trx.h @@ -17,11 +17,6 @@ Created 3/26/1996 Heikki Tuuri #include "mem0mem.h" #include "read0types.h" -/* If this flag is defined, then unneeded update undo logs are discarded, -saving CPU time. The kernel mutex contention is increased, however. */ - -#define TRX_UPDATE_UNDO_OPT - extern ulint trx_n_mysql_transactions; /************************************************************************ @@ -130,14 +125,6 @@ void trx_mark_sql_stat_end( /*==================*/ trx_t* trx); /* in: trx handle */ -/************************************************************************** -Marks the latest SQL statement ended but does not start a new transaction -if the trx is not started. */ - -void -trx_mark_sql_stat_end_do_not_start_new( -/*===================================*/ - trx_t* trx); /* in: trx handle */ /************************************************************************ Assigns a read view for a consistent read query. 
All the consistent reads within the same transaction will get the same read view, which is created diff --git a/innobase/include/univ.i b/innobase/include/univ.i index c56f21d6617..584757529cf 100644 --- a/innobase/include/univ.i +++ b/innobase/include/univ.i @@ -104,8 +104,12 @@ memory is read outside the allocated blocks. */ #define UNIV_INLINE __inline #else /* config.h contains the right def for 'inline' for the current compiler */ +#if (__GNUC__ == 2) #define UNIV_INLINE extern inline - +#else +/* extern inline doesn't work with gcc 3.0.2 */ +#define UNIV_INLINE static inline +#endif #endif #else diff --git a/innobase/lock/lock0lock.c b/innobase/lock/lock0lock.c index fa0641bad73..0b2b85d0337 100644 --- a/innobase/lock/lock0lock.c +++ b/innobase/lock/lock0lock.c @@ -3472,14 +3472,18 @@ lock_print_info(void) ulint i; mtr_t mtr; + printf("Trx id counter %lu %lu\n", + ut_dulint_get_high(trx_sys->max_trx_id), + ut_dulint_get_low(trx_sys->max_trx_id)); + printf( - "Purge done for all trx's with n:o < %lu %lu, undo n:o < %lu %lu\n", + "Purge done for trx's n:o < %lu %lu undo n:o < %lu %lu\n", ut_dulint_get_high(purge_sys->purge_trx_no), ut_dulint_get_low(purge_sys->purge_trx_no), ut_dulint_get_high(purge_sys->purge_undo_no), ut_dulint_get_low(purge_sys->purge_undo_no)); - lock_mutex_enter_kernel(); + lock_mutex_enter_kernel(); printf("Total number of lock structs in row lock hash table %lu\n", lock_get_n_rec_locks()); diff --git a/innobase/log/log0log.c b/innobase/log/log0log.c index 06ac7a578a5..e787176bb21 100644 --- a/innobase/log/log0log.c +++ b/innobase/log/log0log.c @@ -20,6 +20,7 @@ Created 12/9/1995 Heikki Tuuri #include "fil0fil.h" #include "dict0boot.h" #include "srv0srv.h" +#include "srv0start.h" #include "trx0sys.h" #include "trx0trx.h" @@ -2656,6 +2657,8 @@ logs_empty_and_mark_files_at_shutdown(void) } /* Wait until the master thread and all other operations are idle: our algorithm only works if the server is idle at shutdown */ + + srv_shutdown_state = 
SRV_SHUTDOWN_CLEANUP; loop: os_thread_sleep(100000); @@ -2737,7 +2740,21 @@ loop: goto loop; } + + if (srv_lock_timeout_and_monitor_active) { + + goto loop; + } + + /* We now suspend also the InnoDB error monitor thread */ + srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE; + + if (srv_error_monitor_active) { + + goto loop; + } + fil_write_flushed_lsn_to_data_files(lsn, arch_log_no); fil_flush_file_spaces(FIL_TABLESPACE); diff --git a/innobase/log/log0recv.c b/innobase/log/log0recv.c index eb3eadcede9..999429cbfcd 100644 --- a/innobase/log/log0recv.c +++ b/innobase/log/log0recv.c @@ -61,7 +61,11 @@ buffer pool before the pages have been recovered to the up-to-date state */ /* Recovery is running and no operations on the log files are allowed yet: the variable name is misleading */ -ibool recv_no_ibuf_operations = FALSE; +ibool recv_no_ibuf_operations = FALSE; + +/* the following counter is used to decide when to print info on +log scan */ +ulint recv_scan_print_counter = 0; /************************************************************ Creates the recovery system. 
*/ @@ -1812,10 +1816,19 @@ recv_scan_log_recs( *group_scanned_lsn = scanned_lsn; if (more_data) { - fprintf(stderr, + recv_scan_print_counter++; + + if (recv_scan_print_counter < 10 + || (recv_scan_print_counter % 10 == 0)) { + fprintf(stderr, "InnoDB: Doing recovery: scanned up to log sequence number %lu %lu\n", ut_dulint_get_high(*group_scanned_lsn), ut_dulint_get_low(*group_scanned_lsn)); + if (recv_scan_print_counter == 10) { + fprintf(stderr, +"InnoDB: After this prints a line for every 10th scan sweep:\n"); + } + } /* Try to parse more log records */ @@ -1911,6 +1924,15 @@ recv_recovery_from_checkpoint_start( recv_sys_init(); } + if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) { + fprintf(stderr, + "InnoDB: The user has set SRV_FORCE_NO_LOG_REDO on\n"); + fprintf(stderr, + "InnoDB: Skipping log redo\n"); + + return(DB_SUCCESS); + } + sync_order_checks_on = TRUE; recv_recovery_on = TRUE; @@ -2028,10 +2050,8 @@ recv_recovery_from_checkpoint_start( while (group) { old_scanned_lsn = recv_sys->scanned_lsn; - if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) { - recv_group_scan_log_recs(group, &contiguous_lsn, + recv_group_scan_log_recs(group, &contiguous_lsn, &group_scanned_lsn); - } group->scanned_lsn = group_scanned_lsn; @@ -2124,7 +2144,10 @@ recv_recovery_from_checkpoint_finish(void) /* Apply the hashed log records to the respective file pages */ - recv_apply_hashed_log_recs(TRUE); + if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) { + + recv_apply_hashed_log_recs(TRUE); + } if (log_debug_writes) { fprintf(stderr, diff --git a/innobase/os/os0file.c b/innobase/os/os0file.c index ced601d4de1..9fecf2c04fd 100644 --- a/innobase/os/os0file.c +++ b/innobase/os/os0file.c @@ -235,7 +235,7 @@ os_file_handle_error( fprintf(stderr, "InnoDB: Cannot continue operation.\n" "InnoDB: Disk is full. 
Try to clean the disk to free space.\n" - "InnoDB: Delete possible created file and restart.\n"); + "InnoDB: Delete a possible created file and restart.\n"); exit(1); @@ -453,8 +453,17 @@ os_file_get_size( return(TRUE); #else - *size = (ulint) lseek(file, 0, SEEK_END); - *size_high = 0; + off_t offs; + + offs = lseek(file, 0, SEEK_END); + + if (sizeof(off_t) > 4) { + *size = (ulint)(offs & 0xFFFFFFFF); + *size_high = (ulint)(offs >> 32); + } else { + *size = (ulint) offs; + *size_high = 0; + } return(TRUE); #endif @@ -474,17 +483,19 @@ os_file_set_size( size */ ulint size_high)/* in: most significant 32 bits of size */ { - ulint offset; - ulint n_bytes; - ulint low; - ibool ret; - ibool retry; - ulint i; - byte* buf; + ib_longlong offset; + ib_longlong low; + ulint n_bytes; + ibool ret; + ibool retry; + byte* buf; + ulint i; + + ut_a(size == (size & 0xFFFFFFFF)); try_again: /* We use a very big 8 MB buffer in writing because Linux may be - extremely slow in fdatasync on 1 MB writes */ + extremely slow in fsync on 1 MB writes */ buf = ut_malloc(UNIV_PAGE_SIZE * 512); @@ -494,21 +505,19 @@ try_again: } offset = 0; - low = size; -#if (UNIV_WORD_SIZE == 8) - low = low + (size_high << 32); -#else - UT_NOT_USED(size_high); -#endif + low = (ib_longlong)size + (((ib_longlong)size_high) << 32); + while (offset < low) { if (low - offset < UNIV_PAGE_SIZE * 512) { - n_bytes = low - offset; + n_bytes = (ulint)(low - offset); } else { - n_bytes = UNIV_PAGE_SIZE * 512; + n_bytes = UNIV_PAGE_SIZE * 512; } - ret = os_file_write(name, file, buf, offset, 0, n_bytes); - + ret = os_file_write(name, file, buf, + (ulint)(offset & 0xFFFFFFFF), + (ulint)(offset >> 32), + n_bytes); if (!ret) { ut_free(buf); goto error_handling; @@ -582,7 +591,6 @@ os_file_flush( #endif } - #ifndef __WIN__ /*********************************************************************** Does a synchronous read operation in Posix. 
*/ @@ -594,9 +602,29 @@ os_file_pread( os_file_t file, /* in: handle to a file */ void* buf, /* in: buffer where to read */ ulint n, /* in: number of bytes to read */ - ulint offset) /* in: offset from where to read */ + ulint offset, /* in: least significant 32 bits of file + offset from where to read */ + ulint offset_high) /* in: most significant 32 bits of + offset */ { - off_t offs = (off_t)offset; + off_t offs; + + ut_a((offset & 0xFFFFFFFF) == offset); + + /* If off_t is > 4 bytes in size, then we assume we can pass a + 64-bit address */ + + if (sizeof(off_t) > 4) { + offs = (off_t)offset + (((off_t)offset_high) << 32); + + } else { + offs = (off_t)offset; + + if (offset_high > 0) { + fprintf(stderr, + "InnoDB: Error: file read at offset > 4 GB\n"); + } + } os_n_file_reads++; @@ -639,10 +667,30 @@ os_file_pwrite( os_file_t file, /* in: handle to a file */ void* buf, /* in: buffer from where to write */ ulint n, /* in: number of bytes to write */ - ulint offset) /* in: offset where to write */ + ulint offset, /* in: least significant 32 bits of file + offset where to write */ + ulint offset_high) /* in: most significant 32 bits of + offset */ { ssize_t ret; - off_t offs = (off_t)offset; + off_t offs; + + ut_a((offset & 0xFFFFFFFF) == offset); + + /* If off_t is > 4 bytes in size, then we assume we can pass a + 64-bit address */ + + if (sizeof(off_t) > 4) { + offs = (off_t)offset + (((off_t)offset_high) << 32); + + } else { + offs = (off_t)offset; + + if (offset_high > 0) { + fprintf(stderr, + "InnoDB: Error: file write at offset > 4 GB\n"); + } + } os_n_file_writes++; @@ -724,6 +772,8 @@ os_file_read( ibool retry; ulint i; + ut_a((offset & 0xFFFFFFFF) == offset); + os_n_file_reads++; try_again: @@ -758,21 +808,18 @@ try_again: #else ibool retry; ssize_t ret; - -#if (UNIV_WORD_SIZE == 8) - offset = offset + (offset_high << 32); -#else - UT_NOT_USED(offset_high); -#endif + try_again: - ret = os_file_pread(file, buf, n, offset); + ret = os_file_pread(file, buf, 
n, offset, offset_high); if ((ulint)ret == n) { return(TRUE); } #endif +#ifdef __WIN__ error_handling: +#endif retry = os_file_handle_error(file, NULL); if (retry) { @@ -811,6 +858,8 @@ os_file_write( ibool retry; ulint i; + ut_a((offset & 0xFFFFFFFF) == offset); + os_n_file_writes++; try_again: ut_ad(file); @@ -852,19 +901,16 @@ try_again: ibool retry; ssize_t ret; -#if (UNIV_WORD_SIZE == 8) - offset = offset + (offset_high << 32); -#else - UT_NOT_USED(offset_high); -#endif try_again: - ret = os_file_pwrite(file, buf, n, offset); + ret = os_file_pwrite(file, buf, n, offset, offset_high); if ((ulint)ret == n) { return(TRUE); } #endif +#ifdef __WIN__ error_handling: +#endif retry = os_file_handle_error(file, name); if (retry) { @@ -1108,6 +1154,8 @@ os_aio_get_array_and_local_segment( /*********************************************************************** Gets an integer value designating a specified aio array. This is used to give numbers to signals in Posix aio. */ + +#if !defined(WIN_ASYNC_IO) && defined(POSIX_ASYNC_IO) static ulint os_aio_get_array_no( @@ -1161,6 +1209,7 @@ os_aio_get_array_from_no( return(NULL); } } +#endif /* if !defined(WIN_ASYNC_IO) && defined(POSIX_ASYNC_IO) */ /*********************************************************************** Requests for a slot in the aio array. 
If no slot is available, waits until @@ -1321,8 +1370,8 @@ os_aio_simulated_wake_handler_thread( arrays */ { os_aio_array_t* array; - ulint segment; os_aio_slot_t* slot; + ulint segment; ulint n; ulint i; @@ -1817,7 +1866,8 @@ restart: n_consecutive = 0; - /* Look for an i/o request at the lowest offset in the array */ + /* Look for an i/o request at the lowest offset in the array + (we ignore the high 32 bits of the offset in these heuristics) */ lowest_offset = ULINT_MAX; @@ -1912,7 +1962,7 @@ consecutive_loop: } } - srv_io_thread_op_info[global_segment] = "doing file i/o"; + srv_io_thread_op_info[global_segment] = (char*) "doing file i/o"; /* Do the i/o with ordinary, synchronous i/o functions: */ if (slot->type == OS_FILE_WRITE) { @@ -1924,7 +1974,7 @@ consecutive_loop: } ut_a(ret); - srv_io_thread_op_info[global_segment] = "file i/o done"; + srv_io_thread_op_info[global_segment] = (char*) "file i/o done"; /* printf("aio: %lu consecutive %lu:th segment, first offs %lu blocks\n", n_consecutive, global_segment, slot->offset @@ -1981,7 +2031,7 @@ wait_for_io: os_mutex_exit(array->mutex); - srv_io_thread_op_info[global_segment] = "waiting for i/o request"; + srv_io_thread_op_info[global_segment] = (char*) "waiting for i/o request"; os_event_wait(os_aio_segment_wait_events[global_segment]); diff --git a/innobase/page/page0page.c b/innobase/page/page0page.c index 427064bc89c..a75a7279fb5 100644 --- a/innobase/page/page0page.c +++ b/innobase/page/page0page.c @@ -63,6 +63,45 @@ Assuming a page size of 8 kB, a typical index page of a secondary index contains 300 index entries, and the size of the page directory is 50 x 4 bytes = 200 bytes. */ +/****************************************************************** +Used to check the consistency of a directory slot. 
*/ +static +ibool +page_dir_slot_check( +/*================*/ + /* out: TRUE if succeed */ + page_dir_slot_t* slot) /* in: slot */ +{ + page_t* page; + ulint n_slots; + ulint n_owned; + + ut_a(slot); + + page = buf_frame_align(slot); + + n_slots = page_header_get_field(page, PAGE_N_DIR_SLOTS); + + ut_a(slot <= page_dir_get_nth_slot(page, 0)); + ut_a(slot >= page_dir_get_nth_slot(page, n_slots - 1)); + + ut_a(page_rec_check(page + mach_read_from_2(slot))); + + n_owned = rec_get_n_owned(page + mach_read_from_2(slot)); + + if (slot == page_dir_get_nth_slot(page, 0)) { + ut_a(n_owned == 1); + } else if (slot == page_dir_get_nth_slot(page, n_slots - 1)) { + ut_a(n_owned >= 1); + ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED); + } else { + ut_a(n_owned >= PAGE_DIR_SLOT_MIN_N_OWNED); + ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED); + } + + return(TRUE); +} + /***************************************************************** Sets the max trx id field value. */ @@ -1228,7 +1267,6 @@ page_validate( mem_heap_t* heap; page_cur_t cur; byte* buf; - ulint i; ulint count; ulint own_count; ulint slot_no; @@ -1238,6 +1276,8 @@ page_validate( ulint offs; ulint n_slots; ibool ret = FALSE; + ulint i; + char err_buf[1000]; heap = mem_heap_create(UNIV_PAGE_SIZE); @@ -1285,9 +1325,14 @@ page_validate( if ((count >= 2) && (!page_cur_is_after_last(&cur))) { if (!(1 == cmp_rec_rec(rec, old_rec, index))) { fprintf(stderr, - "Records in wrong order in index %s\n", - index->name); - + "Records in wrong order in index %s\n", + index->name); + rec_sprintf(err_buf, 900, old_rec); + fprintf(stderr, "InnoDB: record %s\n", err_buf); + + rec_sprintf(err_buf, 900, rec); + fprintf(stderr, "InnoDB: record %s\n", err_buf); + goto func_exit; } } diff --git a/innobase/read/read0read.c b/innobase/read/read0read.c index 84e2c93b30c..a5048c0c909 100644 --- a/innobase/read/read0read.c +++ b/innobase/read/read0read.c @@ -38,8 +38,8 @@ read_view_create_low( 
/************************************************************************* Makes a copy of the oldest existing read view, with the exception that also the creating trx of the oldest view is set as not visible in the 'copied' -view. Opens a new view if no views currently exist. The view must be -closed with ..._close. This is used in purge. */ +view. Opens a new view if no views currently exist. The view must be closed +with ..._close. This is used in purge. */ read_view_t* read_view_oldest_copy_or_open_new( @@ -160,7 +160,7 @@ read_view_open_now( /* NOTE that a transaction whose trx number is < trx_sys->max_trx_id can still be active, if it is - in the middle of the commit! Note that when a + in the middle of its commit! Note that when a transaction starts, we initialize trx->no to ut_dulint_max. */ @@ -199,3 +199,37 @@ read_view_close( UT_LIST_REMOVE(view_list, trx_sys->view_list, view); } + +/************************************************************************* +Prints a read view to stderr. 
*/ + +void +read_view_print( +/*============*/ + read_view_t* view) /* in: read view */ +{ + ulint n_ids; + ulint i; + + fprintf(stderr, "Read view low limit trx n:o %lu %lu\n", + ut_dulint_get_high(view->low_limit_no), + ut_dulint_get_low(view->low_limit_no)); + + fprintf(stderr, "Read view up limit trx id %lu %lu\n", + ut_dulint_get_high(view->up_limit_id), + ut_dulint_get_low(view->up_limit_id)); + + fprintf(stderr, "Read view low limit trx id %lu %lu\n", + ut_dulint_get_high(view->low_limit_id), + ut_dulint_get_low(view->low_limit_id)); + + fprintf(stderr, "Read view individually stored trx ids:\n"); + + n_ids = view->n_trx_ids; + + for (i = 0; i < n_ids; i++) { + fprintf(stderr, "Read view trx id %lu %lu\n", + ut_dulint_get_high(read_view_get_nth_trx_id(view, i)), + ut_dulint_get_low(read_view_get_nth_trx_id(view, i))); + } +} diff --git a/innobase/rem/rem0cmp.c b/innobase/rem/rem0cmp.c index c3687ebb0e0..f3d9d579aa0 100644 --- a/innobase/rem/rem0cmp.c +++ b/innobase/rem/rem0cmp.c @@ -84,7 +84,7 @@ cmp_collate( record */ { ut_ad((type->mtype == DATA_CHAR) || (type->mtype == DATA_VARCHAR)); - + return((ulint) srv_latin1_ordering[code]); } diff --git a/innobase/rem/rem0rec.c b/innobase/rem/rem0rec.c index 88009f2f5c9..749e19575bc 100644 --- a/innobase/rem/rem0rec.c +++ b/innobase/rem/rem0rec.c @@ -451,24 +451,31 @@ rec_validate( /* out: TRUE if ok */ rec_t* rec) /* in: physical record */ { - ulint i; byte* data; ulint len; ulint n_fields; ulint len_sum = 0; ulint sum = 0; + ulint i; ut_a(rec); n_fields = rec_get_n_fields(rec); if ((n_fields == 0) || (n_fields > REC_MAX_N_FIELDS)) { - ut_a(0); + fprintf(stderr, "InnoDB: Error: record has %lu fields\n", + n_fields); + return(FALSE); } for (i = 0; i < n_fields; i++) { data = rec_get_nth_field(rec, i, &len); - ut_a((len < UNIV_PAGE_SIZE) || (len == UNIV_SQL_NULL)); + if (!((len < UNIV_PAGE_SIZE) || (len == UNIV_SQL_NULL))) { + fprintf(stderr, + "InnoDB: Error: record field %lu len %lu\n", i, + len); + return(FALSE); 
+ } if (len != UNIV_SQL_NULL) { len_sum += len; @@ -481,7 +488,12 @@ rec_validate( } } - ut_a(len_sum == (ulint)(rec_get_end(rec) - rec)); + if (len_sum != (ulint)(rec_get_end(rec) - rec)) { + fprintf(stderr, + "InnoDB: Error: record len should be %lu, len %lu\n", + len_sum, (ulint)(rec_get_end(rec) - rec)); + return(FALSE); + } rec_dummy = sum; /* This is here only to fool the compiler */ diff --git a/innobase/row/row0ins.c b/innobase/row/row0ins.c index 92cac5a55cf..2c6d3b0ed00 100644 --- a/innobase/row/row0ins.c +++ b/innobase/row/row0ins.c @@ -277,7 +277,8 @@ row_ins_clust_index_entry_by_modify( heap = mem_heap_create(1024); /* Build an update vector containing all the fields to be modified; - NOTE that this vector may contain also system columns! */ + NOTE that this vector may NOT contain system columns trx_id or + roll_ptr */ update = row_upd_build_difference_binary(cursor->index, entry, ext_vec, n_ext_vec, rec, heap); @@ -1221,6 +1222,8 @@ row_ins_step( trx = thr_get_trx(thr); + trx_start_if_not_started(trx); + node = thr->run_node; ut_ad(que_node_get_type(node) == QUE_NODE_INSERT); @@ -1241,8 +1244,6 @@ row_ins_step( /* It may be that the current session has not yet started its transaction, or it has been committed: */ - trx_start_if_not_started(trx); - if (UT_DULINT_EQ(trx->id, node->trx_id)) { /* No need to do IX-locking or write trx id to buf */ diff --git a/innobase/row/row0mysql.c b/innobase/row/row0mysql.c index 13c0332dcef..19d2fb744c9 100644 --- a/innobase/row/row0mysql.c +++ b/innobase/row/row0mysql.c @@ -571,6 +571,8 @@ row_insert_for_mysql( trx->op_info = "inserting"; + trx_start_if_not_started(trx); + if (node == NULL) { row_get_prebuilt_insert_row(prebuilt); node = prebuilt->ins_node; @@ -754,6 +756,8 @@ row_update_for_mysql( trx->op_info = "updating or deleting"; + trx_start_if_not_started(trx); + node = prebuilt->upd_node; clust_index = dict_table_get_first_index(table); @@ -947,6 +951,8 @@ row_create_table_for_mysql( trx->op_info = "creating 
table"; + trx_start_if_not_started(trx); + namelen = ut_strlen(table->name); keywordlen = ut_strlen("innodb_monitor"); @@ -1034,7 +1040,7 @@ row_create_table_for_mysql( "InnoDB: Error: table %s already exists in InnoDB internal\n" "InnoDB: data dictionary. Have you deleted the .frm file\n" "InnoDB: and not used DROP TABLE? Have you used DROP DATABASE\n" - "InnoDB: for InnoDB tables in MySQL version <= 3.23.42?\n" + "InnoDB: for InnoDB tables in MySQL version <= 3.23.43?\n" "InnoDB: See the Restrictions section of the InnoDB manual.\n", table->name); fprintf(stderr, @@ -1077,6 +1083,8 @@ row_create_index_for_mysql( trx->op_info = "creating index"; + trx_start_if_not_started(trx); + /* Serialize data dictionary operations with dictionary mutex: no deadlocks can occur then in these operations */ @@ -1146,6 +1154,8 @@ row_table_add_foreign_constraints( trx->op_info = "adding foreign keys"; + trx_start_if_not_started(trx); + /* Serialize data dictionary operations with dictionary mutex: no deadlocks can occur then in these operations */ @@ -1218,6 +1228,8 @@ row_drop_table_for_mysql( trx->op_info = "dropping table"; + trx_start_if_not_started(trx); + namelen = ut_strlen(name); keywordlen = ut_strlen("innodb_monitor"); @@ -1435,6 +1447,8 @@ row_drop_database_for_mysql( trx->op_info = "dropping database"; + trx_start_if_not_started(trx); + mutex_enter(&(dict_sys->mutex)); while (table_name = dict_get_first_table_name_in_db(name)) { @@ -1454,6 +1468,8 @@ row_drop_database_for_mysql( mutex_exit(&(dict_sys->mutex)); + trx_commit_for_mysql(trx); + trx->op_info = ""; return(err); @@ -1496,6 +1512,7 @@ row_rename_table_for_mysql( } trx->op_info = "renaming table"; + trx_start_if_not_started(trx); str1 = "PROCEDURE RENAME_TABLE_PROC () IS\n" @@ -1602,6 +1619,7 @@ row_scan_and_check_index( rec_t* rec; ibool is_ok = TRUE; int cmp; + char err_buf[1000]; *n_rows = 0; @@ -1649,15 +1667,27 @@ loop: if (cmp > 0) { fprintf(stderr, "Error: index records in a wrong order in index %s\n", - 
index->name); + index->name); + + dtuple_sprintf(err_buf, 900, prev_entry); + fprintf(stderr, "InnoDB: prev record %s\n", err_buf); + + rec_sprintf(err_buf, 900, rec); + fprintf(stderr, "InnoDB: record %s\n", err_buf); is_ok = FALSE; } else if ((index->type & DICT_UNIQUE) && matched_fields >= dict_index_get_n_ordering_defined_by_user(index)) { - fprintf(stderr, - "Error: duplicate key in index %s\n", - index->name); + + fprintf(stderr, "Error: duplicate key in index %s\n", + index->name); + + dtuple_sprintf(err_buf, 900, prev_entry); + fprintf(stderr, "InnoDB: prev record %s\n", err_buf); + + rec_sprintf(err_buf, 900, rec); + fprintf(stderr, "InnoDB: record %s\n", err_buf); is_ok = FALSE; } diff --git a/innobase/row/row0purge.c b/innobase/row/row0purge.c index 0dffa273938..390f1b59a4d 100644 --- a/innobase/row/row0purge.c +++ b/innobase/row/row0purge.c @@ -208,7 +208,7 @@ row_purge_remove_sec_if_poss_low( ibool found; ulint err; mtr_t mtr; - mtr_t mtr_vers; + mtr_t* mtr_vers; UT_NOT_USED(thr); @@ -235,17 +235,21 @@ row_purge_remove_sec_if_poss_low( which cannot be purged yet, requires its existence. If some requires, we should do nothing. */ - mtr_start(&mtr_vers); + mtr_vers = mem_alloc(sizeof(mtr_t)); + + mtr_start(mtr_vers); - success = row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, &mtr_vers); + success = row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, mtr_vers); if (success) { old_has = row_vers_old_has_index_entry(TRUE, btr_pcur_get_rec(&(node->pcur)), - &mtr_vers, index, entry); + mtr_vers, index, entry); } - btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers); + btr_pcur_commit_specify_mtr(&(node->pcur), mtr_vers); + + mem_free(mtr_vers); if (!success || !old_has) { /* Remove the index record */ @@ -489,11 +493,6 @@ row_purge_parse_undo_rec( return(FALSE); } - /* NOTE that the table has to be explicitly released later */ - - /* TODO: currently nothing prevents dropping of table when purge - is accessing it! 
*/ - mutex_enter(&(dict_sys->mutex)); node->table = dict_table_get_on_id_low(table_id, thr_get_trx(thr)); diff --git a/innobase/row/row0row.c b/innobase/row/row0row.c index 59169ef2a98..40a775143f4 100644 --- a/innobase/row/row0row.c +++ b/innobase/row/row0row.c @@ -455,12 +455,25 @@ row_build_row_ref_in_tuple( ulint pos; ulint i; - ut_ad(ref && index && rec); + ut_a(ref && index && rec); table = index->table; + + if (!table) { + fprintf(stderr, "InnoDB: table %s for index %s not found\n", + index->table_name, index->name); + ut_a(0); + } clust_index = dict_table_get_first_index(table); - + + if (!clust_index) { + fprintf(stderr, + "InnoDB: clust index for table %s for index %s not found\n", + index->table_name, index->name); + ut_a(0); + } + ref_len = dict_index_get_n_unique(clust_index); ut_ad(ref_len == dtuple_get_n_fields(ref)); @@ -555,6 +568,8 @@ row_search_on_row_ref( index = dict_table_get_first_index(table); + ut_a(dtuple_get_n_fields(ref) == dict_index_get_n_unique(index)); + btr_pcur_open(index, ref, PAGE_CUR_LE, mode, pcur, mtr); low_match = btr_pcur_get_low_match(pcur); diff --git a/innobase/row/row0sel.c b/innobase/row/row0sel.c index e42486f1e17..2cccc217621 100644 --- a/innobase/row/row0sel.c +++ b/innobase/row/row0sel.c @@ -2671,6 +2671,8 @@ no_shortcut: trx->has_search_latch = FALSE; } + trx_start_if_not_started(trx); + /* Note that if the search mode was GE or G, then the cursor naturally moves upward (in fetch next) in alphabetical order, otherwise downward */ @@ -2715,16 +2717,12 @@ no_shortcut: /* No need to set an intention lock or assign a read view */ } else if (prebuilt->select_lock_type == LOCK_NONE) { - /* This is a consistent read */ - trx_start_if_not_started(trx); - + /* This is a consistent read */ /* Assign a read view for the query */ trx_assign_read_view(trx); prebuilt->sql_stat_start = FALSE; - } else { - trx_start_if_not_started(trx); - + } else { if (prebuilt->select_lock_type == LOCK_S) { err = lock_table(0, index->table, 
LOCK_IS, thr); } else { diff --git a/innobase/row/row0umod.c b/innobase/row/row0umod.c index 37f5b1f0bc1..9e8ba87fc2f 100644 --- a/innobase/row/row0umod.c +++ b/innobase/row/row0umod.c @@ -299,13 +299,13 @@ row_undo_mod_del_mark_or_remove_sec_low( BTR_MODIFY_TREE */ { ibool found; - mtr_t mtr; - mtr_t mtr_vers; btr_pcur_t pcur; btr_cur_t* btr_cur; ibool success; ibool old_has; ulint err; + mtr_t mtr; + mtr_t mtr_vers; log_free_check(); mtr_start(&mtr); @@ -338,7 +338,7 @@ row_undo_mod_del_mark_or_remove_sec_low( success = btr_pcur_restore_position(BTR_SEARCH_LEAF, &(node->pcur), &mtr_vers); - ut_ad(success); + ut_a(success); old_has = row_vers_old_has_index_entry(FALSE, btr_pcur_get_rec(&(node->pcur)), @@ -361,7 +361,7 @@ row_undo_mod_del_mark_or_remove_sec_low( ut_ad(mode == BTR_MODIFY_TREE); btr_cur_pessimistic_delete(&err, FALSE, btr_cur, - TRUE, &mtr); + TRUE, &mtr); /* The delete operation may fail if we have little file space left: TODO: easiest to crash the database @@ -413,12 +413,12 @@ row_undo_mod_del_unmark_sec( dict_index_t* index, /* in: index */ dtuple_t* entry) /* in: index entry */ { - mtr_t mtr; btr_pcur_t pcur; btr_cur_t* btr_cur; ulint err; ibool found; - char* err_buf; + mtr_t mtr; + char err_buf[1000]; UT_NOT_USED(node); @@ -428,12 +428,10 @@ row_undo_mod_del_unmark_sec( found = row_search_index_entry(index, entry, BTR_MODIFY_LEAF, &pcur, &mtr); if (!found) { - err_buf = mem_alloc(1000); - dtuple_sprintf(err_buf, 900, entry); - fprintf(stderr, "InnoDB: error in sec index entry del undo in\n" "InnoDB: index %s table %s\n", index->name, index->table->name); + dtuple_sprintf(err_buf, 900, entry); fprintf(stderr, "InnoDB: tuple %s\n", err_buf); rec_sprintf(err_buf, 900, btr_pcur_get_rec(&pcur)); @@ -444,8 +442,6 @@ row_undo_mod_del_unmark_sec( fprintf(stderr, "InnoDB: to mysql@lists.mysql.com\n"); trx_print(thr_get_trx(thr)); - - mem_free(err_buf); } else { btr_cur = btr_pcur_get_btr_cur(&pcur); diff --git a/innobase/row/row0upd.c 
b/innobase/row/row0upd.c index fa859729141..435cfa3485e 100644 --- a/innobase/row/row0upd.c +++ b/innobase/row/row0upd.c @@ -700,7 +700,7 @@ row_upd_build_difference_binary( ulint i; /* This function is used only for a clustered index */ - ut_ad(index->type & DICT_CLUSTERED); + ut_a(index->type & DICT_CLUSTERED); update = upd_create(dtuple_get_n_fields(entry), heap); @@ -718,10 +718,14 @@ row_upd_build_difference_binary( /* NOTE: we compare the fields as binary strings! (No collation) */ - if ((rec_get_nth_field_extern_bit(rec, i) - != upd_ext_vec_contains(ext_vec, n_ext_vec, i)) - || ((i != trx_id_pos) && (i != roll_ptr_pos) - && !dfield_data_is_binary_equal(dfield, len, data))) { + if (i == trx_id_pos || i == roll_ptr_pos) { + + goto skip_compare; + } + + if (rec_get_nth_field_extern_bit(rec, i) + != upd_ext_vec_contains(ext_vec, n_ext_vec, i) + || !dfield_data_is_binary_equal(dfield, len, data)) { upd_field = upd_get_nth_field(update, n_diff); @@ -737,6 +741,8 @@ row_upd_build_difference_binary( n_diff++; } +skip_compare: + ; } update->n_fields = n_diff; @@ -1011,13 +1017,13 @@ row_upd_sec_index_entry( ibool found; dict_index_t* index; dtuple_t* entry; - mtr_t mtr; btr_pcur_t pcur; btr_cur_t* btr_cur; mem_heap_t* heap; rec_t* rec; - char* err_buf; ulint err = DB_SUCCESS; + mtr_t mtr; + char err_buf[1000]; index = node->index; @@ -1038,12 +1044,10 @@ row_upd_sec_index_entry( rec = btr_cur_get_rec(btr_cur); if (!found) { - err_buf = mem_alloc(1000); - dtuple_sprintf(err_buf, 900, entry); - fprintf(stderr, "InnoDB: error in sec index entry update in\n" "InnoDB: index %s table %s\n", index->name, index->table->name); + dtuple_sprintf(err_buf, 900, entry); fprintf(stderr, "InnoDB: tuple %s\n", err_buf); rec_sprintf(err_buf, 900, rec); @@ -1054,8 +1058,6 @@ row_upd_sec_index_entry( fprintf(stderr, "InnoDB: to mysql@lists.mysql.com\n"); trx_print(thr_get_trx(thr)); - - mem_free(err_buf); } else { /* Delete mark the old index record; it can already be delete marked if 
we return after a lock wait in @@ -1620,6 +1622,8 @@ row_upd_step( trx = thr_get_trx(thr); + trx_start_if_not_started(trx); + node = thr->run_node; sel_node = node->select; @@ -1638,8 +1642,6 @@ row_upd_step( /* It may be that the current session has not yet started its transaction, or it has been committed: */ - trx_start_if_not_started(thr_get_trx(thr)); - err = lock_table(0, node->table, LOCK_IX, thr); if (err != DB_SUCCESS) { diff --git a/innobase/row/row0vers.c b/innobase/row/row0vers.c index 5b62cd2b7e3..9508e73f45d 100644 --- a/innobase/row/row0vers.c +++ b/innobase/row/row0vers.c @@ -300,7 +300,7 @@ row_vers_old_has_index_entry( if heap2 != NULL */ } - if ((err != DB_SUCCESS) || !prev_version) { + if (err != DB_SUCCESS || !prev_version) { /* Versions end here */ mem_heap_free(heap); diff --git a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c index 45f7b1b6879..a00f5c95c7c 100644 --- a/innobase/srv/srv0srv.c +++ b/innobase/srv/srv0srv.c @@ -24,7 +24,7 @@ thread library. This might confuse NT though. Created 10/8/1995 Heikki Tuuri *******************************************************/ - +/* Dummy comment */ #include "srv0srv.h" #include "ut0mem.h" @@ -48,11 +48,15 @@ Created 10/8/1995 Heikki Tuuri #include "buf0flu.h" #include "btr0sea.h" #include "dict0load.h" +#include "srv0start.h" /* The following counter is incremented whenever there is some user activity in the server */ ulint srv_activity_count = 0; +ibool srv_lock_timeout_and_monitor_active = FALSE; +ibool srv_error_monitor_active = FALSE; + char* srv_main_thread_op_info = ""; /* Server parameters which are read from the initfile */ @@ -106,9 +110,48 @@ char* srv_unix_file_flush_method_str = NULL; ulint srv_unix_file_flush_method = 0; /* If the following is != 0 we do not allow inserts etc. 
This protects -the user from forgetting innodb_force_recovery keyword to my.cnf */ +the user from forgetting the innodb_force_recovery keyword to my.cnf */ ulint srv_force_recovery = 0; +/*-----------------------*/ +/* The following controls how many threads we let inside InnoDB concurrently: +threads waiting for locks are not counted into the number because otherwise +we could get a deadlock. MySQL creates a thread for each user session, and +semaphore contention and convoy problems can occur without this restriction. +Value 10 should be good if there are less than 4 processors + 4 disks in the +computer. Bigger computers need bigger values. */ + +ulint srv_thread_concurrency = 4; + +os_fast_mutex_t srv_conc_mutex; /* this mutex protects srv_conc data + structures */ +ulint srv_conc_n_threads = 0; /* number of OS threads currently + inside InnoDB */ + +typedef struct srv_conc_slot_struct srv_conc_slot_t; +struct srv_conc_slot_struct{ + os_event_t event; /* event to wait */ + ibool reserved; /* TRUE if slot + reserved */ + ibool wait_ended; /* TRUE when another + thread has already set + the event and the + thread in this slot is + free to proceed; but + reserved may still be + TRUE at that point */ + UT_LIST_NODE_T(srv_conc_slot_t) srv_conc_queue; /* queue node */ +}; + +UT_LIST_BASE_NODE_T(srv_conc_slot_t) srv_conc_queue; /* queue of threads + waiting to get in */ +srv_conc_slot_t srv_conc_slots[OS_THREAD_MAX_N]; /* array of wait + slots */ +/*-----------------------*/ +/* If the following is set TRUE then we do not run purge and insert buffer +merge to completion before shutdown */ + +ibool srv_fast_shutdown = FALSE; ibool srv_use_doublewrite_buf = TRUE; @@ -1512,8 +1555,9 @@ void srv_init(void) /*==========*/ { - srv_slot_t* slot; - ulint i; + srv_conc_slot_t* conc_slot; + srv_slot_t* slot; + ulint i; srv_sys = mem_alloc(sizeof(srv_sys_t)); @@ -1556,6 +1600,19 @@ srv_init(void) ut_a(srv_sys->operational); UT_LIST_INIT(srv_sys->tasks); + + /* Init the server 
concurrency restriction data structures */ + + os_fast_mutex_init(&srv_conc_mutex); + + UT_LIST_INIT(srv_conc_queue); + + for (i = 0; i < OS_THREAD_MAX_N; i++) { + conc_slot = srv_conc_slots + i; + conc_slot->reserved = FALSE; + conc_slot->event = os_event_create(NULL); + ut_a(conc_slot->event); + } } /************************************************************************* @@ -1572,6 +1629,140 @@ srv_general_init(void) } /************************************************************************* +Puts an OS thread to wait if there are too many concurrent threads +(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */ + +void +srv_conc_enter_innodb( +/*==================*/ + trx_t* trx) /* in: transaction object associated with the + thread */ +{ + srv_conc_slot_t* slot; + ulint i; + + os_fast_mutex_lock(&srv_conc_mutex); + + if (srv_conc_n_threads < srv_thread_concurrency) { + srv_conc_n_threads++; + + os_fast_mutex_unlock(&srv_conc_mutex); + + return; + } + + /* Too many threads inside: put the current thread to a queue */ + + for (i = 0; i < OS_THREAD_MAX_N; i++) { + slot = srv_conc_slots + i; + + if (!slot->reserved) { + break; + } + } + + if (i == OS_THREAD_MAX_N) { + /* Could not find a free wait slot, we must let the + thread enter */ + + srv_conc_n_threads++; + + os_fast_mutex_unlock(&srv_conc_mutex); + + return; + } + + /* Release possible search system latch this thread has */ + if (trx->has_search_latch) { + trx_search_latch_release_if_reserved(trx); + } + + /* Add to the queue */ + slot->reserved = TRUE; + slot->wait_ended = FALSE; + + UT_LIST_ADD_LAST(srv_conc_queue, srv_conc_queue, slot); + + os_event_reset(slot->event); + + os_fast_mutex_unlock(&srv_conc_mutex); + + /* Go to wait for the event; when a thread leaves InnoDB it will + release this thread */ + + os_event_wait(slot->event); + + os_fast_mutex_lock(&srv_conc_mutex); + + /* NOTE that the thread which released this thread already + incremented the thread counter on 
behalf of this thread */ + + slot->reserved = FALSE; + + UT_LIST_REMOVE(srv_conc_queue, srv_conc_queue, slot); + + os_fast_mutex_unlock(&srv_conc_mutex); +} + +/************************************************************************* +This lets a thread enter InnoDB regardless of the number of threads inside +InnoDB. This must be called when a thread ends a lock wait. */ + +void +srv_conc_force_enter_innodb(void) +/*=============================*/ +{ + os_fast_mutex_lock(&srv_conc_mutex); + + srv_conc_n_threads++; + + os_fast_mutex_unlock(&srv_conc_mutex); +} + +/************************************************************************* +This must be called when a thread exits InnoDB. This must also be called +when a thread goes to wait for a lock. */ + +void +srv_conc_exit_innodb(void) +/*======================*/ +{ + srv_conc_slot_t* slot = NULL; + + os_fast_mutex_lock(&srv_conc_mutex); + + ut_a(srv_conc_n_threads > 0); + + srv_conc_n_threads--; + + if (srv_conc_n_threads < srv_thread_concurrency) { + /* Look for a slot where a thread is waiting and no other + thread has yet released the thread */ + + slot = UT_LIST_GET_FIRST(srv_conc_queue); + + while (slot && slot->wait_ended == TRUE) { + slot = UT_LIST_GET_NEXT(srv_conc_queue, slot); + } + + if (slot != NULL) { + slot->wait_ended = TRUE; + + /* We increment the count on behalf of the released + thread */ + + srv_conc_n_threads++; + } + } + + os_fast_mutex_unlock(&srv_conc_mutex); + + if (slot != NULL) { + os_event_set(slot->event); + } +} + +/************************************************************************* Normalizes init parameter values to use units we use inside InnoDB. 
*/ static ulint @@ -1713,10 +1904,20 @@ srv_suspend_mysql_thread( mutex_exit(&kernel_mutex); + /* We must declare this OS thread to exit InnoDB, since a possible + other thread holding a lock which this thread waits for must be + allowed to enter, sooner or later */ + + srv_conc_exit_innodb(); + /* Wait for the release */ os_event_wait(event); + /* Return back inside InnoDB */ + + srv_conc_force_enter_innodb(); + mutex_enter(&kernel_mutex); /* Release the slot for others to use */ @@ -1792,6 +1993,8 @@ srv_lock_timeout_and_monitor_thread( UT_NOT_USED(arg); last_monitor_time = time(NULL); loop: + srv_lock_timeout_and_monitor_active = TRUE; + /* When someone is waiting for a lock, we wake up every second and check if a timeout has passed for a lock wait */ @@ -1809,9 +2012,9 @@ loop: if (time_elapsed > 15) { - last_monitor_time = time(NULL); - if (srv_print_innodb_monitor) { + + last_monitor_time = time(NULL); printf("=====================================\n"); ut_print_timestamp(stdout); @@ -1849,8 +2052,9 @@ loop: printf("--------------\n" "ROW OPERATIONS\n" "--------------\n"); - printf("InnoDB main thread state: %s\n", - srv_main_thread_op_info); + printf( + "%lu queries inside InnoDB; main thread: %s\n", + srv_conc_n_threads, srv_main_thread_op_info); printf( "Number of rows inserted %lu, updated %lu, deleted %lu, read %lu\n", srv_n_rows_inserted, @@ -1934,7 +2138,7 @@ loop: (wait_time > (double) srv_lock_wait_timeout || wait_time < 0)) { - /* Timeout exceeded or a wrap over in system + /* Timeout exceeded or a wrap-around in system time counter: cancel the lock request queued by the transaction and release possible other transactions waiting behind */ @@ -1949,6 +2153,10 @@ loop: mutex_exit(&kernel_mutex); + if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) { + goto exit_func; + } + if (some_waits || srv_print_innodb_monitor || srv_print_innodb_lock_monitor || srv_print_innodb_tablespace_monitor @@ -1958,11 +2166,16 @@ loop: /* No one was waiting for a lock and no 
monitor was active: suspend this thread */ - + + srv_lock_timeout_and_monitor_active = FALSE; + os_event_wait(srv_lock_timeout_thread_event); goto loop; +exit_func: + srv_lock_timeout_and_monitor_active = FALSE; + #ifndef __WIN__ return(NULL); #else @@ -1987,11 +2200,18 @@ srv_error_monitor_thread( { UT_NOT_USED(arg); loop: + srv_error_monitor_active = TRUE; + os_thread_sleep(10000000); sync_array_print_long_waits(); - goto loop; + if (srv_shutdown_state < SRV_SHUTDOWN_LAST_PHASE) { + + goto loop; + } + + srv_error_monitor_active = FALSE; #ifndef __WIN__ return(NULL); @@ -2079,13 +2299,12 @@ loop: for (i = 0; i < 10; i++) { n_ios_old = log_sys->n_log_ios + buf_pool->n_pages_read + buf_pool->n_pages_written; - srv_main_thread_op_info = "sleeping"; os_thread_sleep(1000000); if (srv_force_recovery >= SRV_FORCE_NO_BACKGROUND) { - goto loop; + goto suspend_thread; } /* We flush the log once in a second even if no commit @@ -2112,6 +2331,11 @@ loop: log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP); } + if (srv_fast_shutdown && srv_shutdown_state > 0) { + + goto background_loop; + } + if (srv_activity_count == old_activity_count) { if (srv_print_thread_releases) { @@ -2160,6 +2384,11 @@ loop: while (n_pages_purged) { + if (srv_fast_shutdown && srv_shutdown_state > 0) { + + goto background_loop; + } + srv_main_thread_op_info = "purging"; n_pages_purged = trx_purge(); @@ -2247,7 +2476,12 @@ background_loop: log_archive_do(FALSE, &n_bytes_archived); - if (n_pages_purged + n_bytes_merged + n_pages_flushed + if (srv_fast_shutdown && srv_shutdown_state > 0) { + if (n_pages_flushed + n_bytes_archived != 0) { + + goto background_loop; + } + } else if (n_pages_purged + n_bytes_merged + n_pages_flushed + n_bytes_archived != 0) { goto background_loop; } @@ -2261,6 +2495,7 @@ background_loop: /* There is no work for background operations either: suspend master thread to wait for more server activity */ +suspend_thread: srv_main_thread_op_info = "suspending"; 
mutex_enter(&kernel_mutex); diff --git a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c index bdc8225a14f..7a429bdfed5 100644 --- a/innobase/srv/srv0start.c +++ b/innobase/srv/srv0start.c @@ -60,6 +60,10 @@ ibool srv_startup_is_before_trx_rollback_phase = FALSE; ibool srv_is_being_started = FALSE; ibool srv_was_started = FALSE; +/* At a shutdown the value first climbs to SRV_SHUTDOWN_CLEANUP +and then to SRV_SHUTDOWN_LAST_PHASE */ +ulint srv_shutdown_state = 0; + ibool measure_cont = FALSE; os_file_t files[1000]; @@ -175,6 +179,34 @@ srv_add_path_separator_if_needed( } /************************************************************************* +Calculates the low 32 bits when a file size which is given as a number +database pages is converted to the number of bytes. */ +static +ulint +srv_calc_low32( +/*===========*/ + /* out: low 32 bytes of file size when + expressed in bytes */ + ulint file_size) /* in: file size in database pages */ +{ + return(0xFFFFFFFF & (file_size << UNIV_PAGE_SIZE_SHIFT)); +} + +/************************************************************************* +Calculates the high 32 bits when a file size which is given as a number +database pages is converted to the number of bytes. */ +static +ulint +srv_calc_high32( +/*============*/ + /* out: high 32 bytes of file size when + expressed in bytes */ + ulint file_size) /* in: file size in database pages */ +{ + return(file_size >> (32 - UNIV_PAGE_SIZE_SHIFT)); +} + +/************************************************************************* Creates or opens the log files. 
*/ static ulint @@ -214,8 +246,7 @@ open_or_create_log_file( return(DB_ERROR); } - files[i] = os_file_create( - name, OS_FILE_OPEN, OS_FILE_AIO, + files[i] = os_file_create(name, OS_FILE_OPEN, OS_FILE_AIO, OS_LOG_FILE, &ret); if (!ret) { fprintf(stderr, @@ -227,8 +258,9 @@ open_or_create_log_file( ret = os_file_get_size(files[i], &size, &size_high); ut_a(ret); - if (size != UNIV_PAGE_SIZE * srv_log_file_size - || size_high != 0) { + if (size != srv_calc_low32(srv_log_file_size) + || size_high != srv_calc_high32(srv_log_file_size)) { + fprintf(stderr, "InnoDB: Error: log file %s is of different size\n" "InnoDB: than specified in the .cnf file!\n", name); @@ -241,11 +273,13 @@ open_or_create_log_file( fprintf(stderr, "InnoDB: Log file %s did not exist: new to be created\n", name); - fprintf(stderr, "InnoDB: Setting log file %s size to %lu\n", - name, UNIV_PAGE_SIZE * srv_log_file_size); + fprintf(stderr, "InnoDB: Setting log file %s size to %lu MB\n", + name, srv_log_file_size + >> (20 - UNIV_PAGE_SIZE_SHIFT)); ret = os_file_set_size(name, files[i], - UNIV_PAGE_SIZE * srv_log_file_size, 0); + srv_calc_low32(srv_log_file_size), + srv_calc_high32(srv_log_file_size)); if (!ret) { fprintf(stderr, "InnoDB: Error in creating %s: probably out of disk space\n", @@ -277,8 +311,7 @@ open_or_create_log_file( if (k == 0 && i == 0) { arch_space_id = 2 * k + 1 + SRV_LOG_SPACE_FIRST_ID; - fil_space_create("arch_log_space", arch_space_id, - FIL_LOG); + fil_space_create("arch_log_space", arch_space_id, FIL_LOG); } else { arch_space_id = ULINT_UNDEFINED; } @@ -396,9 +429,14 @@ open_or_create_data_files( &size_high); ut_a(ret); - if (size != - UNIV_PAGE_SIZE * srv_data_file_sizes[i] - || size_high != 0) { + /* File sizes in srv_... 
are given in + database pages */ + + if (size != srv_calc_low32( + srv_data_file_sizes[i]) + || size_high != srv_calc_high32( + srv_data_file_sizes[i])) { + fprintf(stderr, "InnoDB: Error: data file %s is of different size\n" "InnoDB: than specified in the .cnf file!\n", name); @@ -426,14 +464,17 @@ open_or_create_data_files( *create_new_db = TRUE; } - fprintf(stderr, "InnoDB: Setting file %s size to %lu\n", - name, UNIV_PAGE_SIZE * srv_data_file_sizes[i]); + fprintf(stderr, + "InnoDB: Setting file %s size to %lu MB\n", + name, (srv_data_file_sizes[i] + >> (20 - UNIV_PAGE_SIZE_SHIFT))); fprintf(stderr, "InnoDB: Database physically writes the file full: wait...\n"); ret = os_file_set_size(name, files[i], - UNIV_PAGE_SIZE * srv_data_file_sizes[i], 0); + srv_calc_low32(srv_data_file_sizes[i]), + srv_calc_high32(srv_data_file_sizes[i])); if (!ret) { fprintf(stderr, @@ -673,16 +714,28 @@ innobase_start_or_create_for_mysql(void) return(DB_ERROR); } - sum_of_new_sizes = 0; + if (sizeof(ulint) == 4 + && srv_n_log_files * srv_log_file_size >= 262144) { + + fprintf(stderr, + "InnoDB: Error: combined size of log files must be < 4 GB\n" + "InnoDB: on 32-bit computers\n"); + return(DB_ERROR); + } + + sum_of_new_sizes = 0; + for (i = 0; i < srv_n_data_files; i++) { - if (srv_data_file_sizes[i] >= 262144) { +#ifndef __WIN__ + if (sizeof(off_t) < 5 && srv_data_file_sizes[i] >= 262144) { fprintf(stderr, - "InnoDB: Error: file size must be < 4 GB, or on some OS's < 2 GB\n"); + "InnoDB: Error: file size must be < 4 GB with this MySQL binary\n" + "InnoDB: and operating system combination, in some OS's < 2 GB\n"); return(DB_ERROR); } - +#endif sum_of_new_sizes += srv_data_file_sizes[i]; } @@ -889,7 +942,6 @@ innobase_start_or_create_for_mysql(void) /* Create the thread which warns of long semaphore waits */ os_thread_create(&srv_error_monitor_thread, NULL, thread_ids + 3 + SRV_MAX_N_IO_THREADS); - srv_was_started = TRUE; srv_is_being_started = FALSE; @@ -945,7 +997,7 @@ 
innobase_shutdown_for_mysql(void) the tablespace header(s), and copy all log data to archive */ logs_empty_and_mark_files_at_shutdown(); - + ut_free_all_mem(); return((int) DB_SUCCESS); diff --git a/innobase/sync/sync0arr.c b/innobase/sync/sync0arr.c index e4c351b9d21..b82c2a4a2df 100644 --- a/innobase/sync/sync0arr.c +++ b/innobase/sync/sync0arr.c @@ -48,6 +48,8 @@ struct sync_cell_struct { void* wait_object; /* pointer to the object the thread is waiting for; if NULL the cell is free for use */ + mutex_t* old_wait_mutex; /* the latest wait mutex in cell */ + rw_lock_t* old_wait_rw_lock;/* the latest wait rw-lock in cell */ ulint request_type; /* lock type requested on the object */ char* file; /* in debug version file where @@ -353,6 +355,13 @@ sync_array_reserve_cell( cell->thread = os_thread_get_curr_id(); cell->wait_object = object; + + if (type == SYNC_MUTEX) { + cell->old_wait_mutex = object; + } else { + cell->old_wait_rw_lock = object; + } + cell->request_type = type; cell->waiting = FALSE; @@ -448,7 +457,9 @@ sync_array_cell_print( difftime(time(NULL), cell->reservation_time)); if (type == SYNC_MUTEX) { - mutex = (mutex_t*)cell->wait_object; + /* We use old_wait_mutex in case the cell has already + been freed meanwhile */ + mutex = cell->old_wait_mutex; fprintf(file, "Mutex at %lx created file %s line %lu, lock var %lu\n", @@ -466,7 +477,7 @@ sync_array_cell_print( fprintf(file, "S-lock on"); } - rwlock = (rw_lock_t*)cell->wait_object; + rwlock = cell->old_wait_rw_lock; fprintf(file, " RW-latch at %lx created in file %s line %lu\n", (ulint)rwlock, rwlock->cfile_name, rwlock->cline); diff --git a/innobase/trx/trx0purge.c b/innobase/trx/trx0purge.c index c50ffb65e00..a91ac135f81 100644 --- a/innobase/trx/trx0purge.c +++ b/innobase/trx/trx0purge.c @@ -537,13 +537,13 @@ trx_purge_truncate_history(void) /* We play safe and set the truncate limit at most to the purge view low_limit number, though this is not necessary */ - if (ut_dulint_cmp(limit_trx_no, 
(purge_sys->view)->low_limit_no) >= 0) { - limit_trx_no = (purge_sys->view)->low_limit_no; + if (ut_dulint_cmp(limit_trx_no, purge_sys->view->low_limit_no) >= 0) { + limit_trx_no = purge_sys->view->low_limit_no; limit_undo_no = ut_dulint_zero; } ut_ad((ut_dulint_cmp(limit_trx_no, - (purge_sys->view)->low_limit_no) <= 0)); + purge_sys->view->low_limit_no) <= 0)); rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); @@ -565,7 +565,7 @@ trx_purge_truncate_if_arr_empty(void) { ut_ad(mutex_own(&(purge_sys->mutex))); - if ((purge_sys->arr)->n_used == 0) { + if (purge_sys->arr->n_used == 0) { trx_purge_truncate_history(); @@ -783,7 +783,7 @@ trx_purge_get_next_rec( ut_ad(mutex_own(&(purge_sys->mutex))); ut_ad(purge_sys->next_stored); - space = (purge_sys->rseg)->space; + space = purge_sys->rseg->space; page_no = purge_sys->page_no; offset = purge_sys->offset; @@ -936,7 +936,7 @@ trx_purge_fetch_next_rec( } if (ut_dulint_cmp(purge_sys->purge_trx_no, - (purge_sys->view)->low_limit_no) >= 0) { + purge_sys->view->low_limit_no) >= 0) { purge_sys->state = TRX_STOP_PURGE; trx_purge_truncate_if_arr_empty(); @@ -1072,3 +1072,28 @@ trx_purge(void) return(purge_sys->n_pages_handled - old_pages_handled); } + +/********************************************************************** +Prints information of the purge system to stderr. 
*/ + +void +trx_purge_sys_print(void) +/*=====================*/ +{ + fprintf(stderr, "InnoDB: Purge system view:\n"); + read_view_print(purge_sys->view); + + fprintf(stderr, "InnoDB: Purge trx n:o %lu %lu, undo n_o %lu %lu\n", + ut_dulint_get_high(purge_sys->purge_trx_no), + ut_dulint_get_low(purge_sys->purge_trx_no), + ut_dulint_get_high(purge_sys->purge_undo_no), + ut_dulint_get_low(purge_sys->purge_undo_no)); + fprintf(stderr, + "InnoDB: Purge next stored %lu, page_no %lu, offset %lu,\n" + "InnoDB: Purge hdr_page_no %lu, hdr_offset %lu\n", + purge_sys->next_stored, + purge_sys->page_no, + purge_sys->offset, + purge_sys->hdr_page_no, + purge_sys->hdr_offset); +} diff --git a/innobase/trx/trx0rec.c b/innobase/trx/trx0rec.c index 73153cfaa37..abce470bd1c 100644 --- a/innobase/trx/trx0rec.c +++ b/innobase/trx/trx0rec.c @@ -329,7 +329,7 @@ trx_undo_rec_get_pars( /************************************************************************** Reads from an undo log record a stored column value. 
*/ -UNIV_INLINE +static byte* trx_undo_rec_get_col_val( /*=====================*/ @@ -374,13 +374,14 @@ trx_undo_rec_get_row_ref( mem_heap_t* heap) /* in: memory heap from which the memory needed is allocated */ { - ulint i; dfield_t* dfield; byte* field; ulint len; ulint ref_len; + ulint i; ut_ad(index && ptr && ref && heap); + ut_a(index->type & DICT_CLUSTERED); ref_len = dict_index_get_n_unique(index); @@ -411,12 +412,13 @@ trx_undo_rec_skip_row_ref( record, at the start of the row reference */ dict_index_t* index) /* in: clustered index */ { - ulint i; - byte* field; - ulint len; - ulint ref_len; + byte* field; + ulint len; + ulint ref_len; + ulint i; ut_ad(index && ptr); + ut_a(index->type & DICT_CLUSTERED); ref_len = dict_index_get_n_unique(index); @@ -468,7 +470,7 @@ trx_undo_page_report_modify( byte* type_cmpl_ptr; ulint i; - ut_ad(index->type & DICT_CLUSTERED); + ut_a(index->type & DICT_CLUSTERED); ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_UPDATE); table = index->table; @@ -603,7 +605,7 @@ trx_undo_page_report_modify( /* Notify purge that it eventually has to free the old externally stored field */ - (trx->update_undo)->del_marks = TRUE; + trx->update_undo->del_marks = TRUE; *type_cmpl_ptr = *type_cmpl_ptr | TRX_UNDO_UPD_EXTERN; } else { @@ -634,7 +636,7 @@ trx_undo_page_report_modify( if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { - (trx->update_undo)->del_marks = TRUE; + trx->update_undo->del_marks = TRUE; if (trx_undo_left(undo_page, ptr) < 5) { @@ -787,7 +789,9 @@ Builds an update vector based on a remaining part of an undo log record. 
*/ byte* trx_undo_update_rec_get_update( /*===========================*/ - /* out: remaining part of the record */ + /* out: remaining part of the record, + NULL if an error detected, which means that + the record is corrupted */ byte* ptr, /* in: remaining part in update undo log record, after reading the row reference NOTE that this copy of the undo log record must @@ -816,6 +820,8 @@ trx_undo_update_rec_get_update( ulint field_no; ulint i; + ut_a(index->type & DICT_CLUSTERED); + if (type != TRX_UNDO_DEL_MARK_REC) { ptr = trx_undo_update_rec_get_n_upd_fields(ptr, &n_fields); } else { @@ -846,11 +852,28 @@ trx_undo_update_rec_get_update( index); dfield_set_data(&(upd_field->new_val), buf, DATA_ROLL_PTR_LEN); - /* Store then the updated ordinary columns to update vector */ + /* Store then the updated ordinary columns to the update vector */ for (i = 0; i < n_fields; i++) { ptr = trx_undo_update_rec_get_field_no(ptr, &field_no); + + if (field_no >= dict_index_get_n_fields(index)) { + fprintf(stderr, + "InnoDB: Error: trying to access update undo rec field %lu in table %s\n" + "InnoDB: index %s, but index has only %lu fields\n", + field_no, index->table_name, index->name, + dict_index_get_n_fields(index)); + fprintf(stderr, + "InnoDB: Send a detailed bug report to mysql@lists.mysql.com"); + + fprintf(stderr, + "InnoDB: Run also CHECK TABLE on table %s\n", index->table_name); + fprintf(stderr, + "InnoDB: n_fields = %lu, i = %lu, ptr %lx\n", n_fields, i, (ulint)ptr); + return(NULL); + } + ptr = trx_undo_rec_get_col_val(ptr, &field, &len); upd_field = upd_get_nth_field(update, i); @@ -1005,7 +1028,7 @@ trx_undo_report_row_operation( the update vector, otherwise NULL */ ulint cmpl_info, /* in: compiler info on secondary index updates */ - rec_t* rec, /* in: case of an update or delete + rec_t* rec, /* in: in case of an update or delete marking, the record in the clustered index, otherwise NULL */ dulint* roll_ptr) /* out: rollback pointer to the @@ -1017,11 +1040,13 @@ 
trx_undo_report_row_operation( trx_undo_t* undo; page_t* undo_page; ulint offset; - mtr_t mtr; ulint page_no; ibool is_insert; trx_rseg_t* rseg; + mtr_t mtr; + ut_a(index->type & DICT_CLUSTERED); + if (flags & BTR_NO_UNDO_LOG_FLAG) { *roll_ptr = ut_dulint_zero; @@ -1030,7 +1055,7 @@ trx_undo_report_row_operation( } ut_ad(thr); - ut_ad(index->type & DICT_CLUSTERED); + ut_a(index->type & DICT_CLUSTERED); ut_ad((op_type != TRX_UNDO_INSERT_OP) || (clust_entry && !update && !rec)); @@ -1165,6 +1190,7 @@ trx_undo_get_undo_rec_low( dulint roll_ptr, /* in: roll pointer to record */ mem_heap_t* heap) /* in: memory heap where copied */ { + trx_undo_rec_t* undo_rec; ulint rseg_id; ulint page_no; ulint offset; @@ -1172,7 +1198,6 @@ trx_undo_get_undo_rec_low( trx_rseg_t* rseg; ibool is_insert; mtr_t mtr; - trx_undo_rec_t* undo_rec; trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id, &page_no, &offset); @@ -1234,7 +1259,8 @@ trx_undo_prev_version_build( /*========================*/ /* out: DB_SUCCESS, or DB_MISSING_HISTORY if the previous version is not >= purge_view, - which means that it may have been removed */ + which means that it may have been removed, + DB_ERROR if corrupted record */ rec_t* index_rec,/* in: clustered index record in the index tree */ mtr_t* index_mtr,/* in: mtr which contains the latch to @@ -1255,6 +1281,7 @@ trx_undo_prev_version_build( dulint table_id; dulint trx_id; dulint roll_ptr; + dulint old_roll_ptr; upd_t* update; byte* ptr; ulint info_bits; @@ -1263,19 +1290,38 @@ trx_undo_prev_version_build( byte* buf; ulint err; ulint i; + char err_buf[1000]; ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); ut_ad(mtr_memo_contains(index_mtr, buf_block_align(index_rec), MTR_MEMO_PAGE_S_FIX) || mtr_memo_contains(index_mtr, buf_block_align(index_rec), MTR_MEMO_PAGE_X_FIX)); + if (!(index->type & DICT_CLUSTERED)) { + fprintf(stderr, + "InnoDB: Error: trying to access update undo rec for table %s\n" + "InnoDB: index %s which is not a clustered 
index\n", + index->table_name, index->name); + fprintf(stderr, + "InnoDB: Send a detailed bug report to mysql@lists.mysql.com"); + + rec_sprintf(err_buf, 900, index_rec); + fprintf(stderr, "InnoDB: index record %s\n", err_buf); + + rec_sprintf(err_buf, 900, rec); + fprintf(stderr, "InnoDB: record version %s\n", err_buf); + + return(DB_ERROR); + } roll_ptr = row_get_rec_roll_ptr(rec, index); + old_roll_ptr = roll_ptr; + + *old_vers = NULL; if (trx_undo_roll_ptr_is_insert(roll_ptr)) { /* The record rec is the first inserted version */ - *old_vers = NULL; return(DB_SUCCESS); } @@ -1286,8 +1332,6 @@ trx_undo_prev_version_build( if (err != DB_SUCCESS) { - *old_vers = NULL; - return(err); } @@ -1298,8 +1342,70 @@ trx_undo_prev_version_build( &info_bits); ptr = trx_undo_rec_skip_row_ref(ptr, index); - trx_undo_update_rec_get_update(ptr, index, type, trx_id, roll_ptr, - info_bits, heap, &update); + ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id, + roll_ptr, info_bits, heap, &update); + + if (ut_dulint_cmp(table_id, index->table->id) != 0) { + ptr = NULL; + + fprintf(stderr, + "InnoDB: Error: trying to access update undo rec for table %s\n" + "InnoDB: but the table id in the undo record is wrong\n", + index->table_name); + fprintf(stderr, + "InnoDB: Send a detailed bug report to mysql@lists.mysql.com\n"); + + fprintf(stderr, + "InnoDB: Run also CHECK TABLE on table %s\n", index->table_name); + } + + if (ptr == NULL) { + /* The record was corrupted, return an error; these printfs + should catch an elusive bug in row_vers_old_has_index_entry */ + + fprintf(stderr, + "InnoDB: Table name %s, index name %s, n_uniq %lu\n", + index->table_name, index->name, + dict_index_get_n_unique(index)); + + fprintf(stderr, + "InnoDB: undo rec address %lx, type %lu cmpl_info %lu\n", + (ulint)undo_rec, type, cmpl_info); + fprintf(stderr, + "InnoDB: undo rec table id %lu %lu, index table id %lu %lu\n", + ut_dulint_get_high(table_id), + ut_dulint_get_low(table_id), + 
ut_dulint_get_high(index->table->id), + ut_dulint_get_low(index->table->id)); + + ut_sprintf_buf(err_buf, undo_rec, 150); + + fprintf(stderr, "InnoDB: dump of 150 bytes in undo rec: %s\n", + err_buf); + rec_sprintf(err_buf, 900, index_rec); + fprintf(stderr, "InnoDB: index record %s\n", err_buf); + + rec_sprintf(err_buf, 900, rec); + fprintf(stderr, "InnoDB: record version %s\n", err_buf); + + fprintf(stderr, + "InnoDB: Record trx id %lu %lu, update rec trx id %lu %lu\n", + ut_dulint_get_high(rec_trx_id), + ut_dulint_get_low(rec_trx_id), + ut_dulint_get_high(trx_id), + ut_dulint_get_low(trx_id)); + + fprintf(stderr, + "InnoDB: Roll ptr in rec %lu %lu, in update rec %lu %lu\n", + ut_dulint_get_high(old_roll_ptr), + ut_dulint_get_low(old_roll_ptr), + ut_dulint_get_high(roll_ptr), + ut_dulint_get_low(roll_ptr)); + + trx_purge_sys_print(); + + return(DB_ERROR); + } if (row_upd_changes_field_size(rec, index, update)) { diff --git a/innobase/trx/trx0roll.c b/innobase/trx/trx0roll.c index 2adeb1cf57c..47fffea5e40 100644 --- a/innobase/trx/trx0roll.c +++ b/innobase/trx/trx0roll.c @@ -45,6 +45,8 @@ trx_general_rollback_for_mysql( que_thr_t* thr; roll_node_t* roll_node; + trx_start_if_not_started(trx); + heap = mem_heap_create(512); roll_node = roll_node_create(heap); @@ -108,6 +110,8 @@ trx_rollback_for_mysql( err = trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx_mark_sql_stat_end(trx); + /* Tell Innobase server that there might be work for utility threads: */ @@ -144,7 +148,7 @@ trx_rollback_last_sql_stat_for_mysql( err = trx_general_rollback_for_mysql(trx, TRUE, &(trx->last_sql_stat_start)); trx_mark_sql_stat_end(trx); - + /* Tell Innobase server that there might be work for utility threads: */ @@ -229,8 +233,9 @@ loop: ut_a(thr == que_fork_start_command(fork, SESS_COMM_EXECUTE, 0)); - fprintf(stderr, "InnoDB: Rolling back trx no %lu\n", - ut_dulint_get_low(trx->id)); + fprintf(stderr, "InnoDB: Rolling back trx with id %lu %lu\n", + ut_dulint_get_high(trx->id), + 
ut_dulint_get_low(trx->id)); mutex_exit(&kernel_mutex); if (trx->dict_operation) { @@ -246,7 +251,7 @@ loop: mutex_exit(&kernel_mutex); fprintf(stderr, - "InnoDB: Waiting rollback of trx no %lu to end\n", + "InnoDB: Waiting for rollback of trx id %lu to end\n", ut_dulint_get_low(trx->id)); os_thread_sleep(100000); @@ -272,7 +277,8 @@ loop: mutex_exit(&(dict_sys->mutex)); } - fprintf(stderr, "InnoDB: Rolling back of trx no %lu completed\n", + fprintf(stderr, "InnoDB: Rolling back of trx id %lu %lu completed\n", + ut_dulint_get_high(trx->id), ut_dulint_get_low(trx->id)); mem_heap_free(heap); diff --git a/innobase/trx/trx0sys.c b/innobase/trx/trx0sys.c index 0b8664013d7..e79e4594637 100644 --- a/innobase/trx/trx0sys.c +++ b/innobase/trx/trx0sys.c @@ -518,6 +518,10 @@ trx_sys_init_at_db_start(void) fprintf(stderr, "InnoDB: %lu uncommitted transaction(s) which must be rolled back\n", UT_LIST_GET_LEN(trx_sys->trx_list)); + + fprintf(stderr, "Trx id counter is %lu %lu\n", + ut_dulint_get_high(trx_sys->max_trx_id), + ut_dulint_get_low(trx_sys->max_trx_id)); } UT_LIST_INIT(trx_sys->view_list); diff --git a/innobase/trx/trx0trx.c b/innobase/trx/trx0trx.c index 13b37775dce..18c80819245 100644 --- a/innobase/trx/trx0trx.c +++ b/innobase/trx/trx0trx.c @@ -499,7 +499,7 @@ trx_commit_off_kernel( rseg = trx->rseg; - if ((trx->insert_undo != NULL) || (trx->update_undo != NULL)) { + if (trx->insert_undo != NULL || trx->update_undo != NULL) { mutex_exit(&kernel_mutex); @@ -524,7 +524,13 @@ trx_commit_off_kernel( if (undo) { mutex_enter(&kernel_mutex); -#ifdef TRX_UPDATE_UNDO_OPT +#ifdef notdefined + /* ########## There is a bug here: purge and rollback + need the whole stack of old record versions even if no + consistent read would need them!! This is because they + decide on the basis of the old versions when we can + remove delete marked secondary index records! 
*/ + if (!undo->del_marks && (undo->size == 1) && (UT_LIST_GET_LEN(trx_sys->view_list) == 1)) { @@ -584,9 +590,7 @@ trx_commit_off_kernel( mutex_enter(&kernel_mutex); } -#ifdef TRX_UPDATE_UNDO_OPT -shortcut: -#endif + ut_ad(trx->conc_state == TRX_ACTIVE); ut_ad(mutex_own(&kernel_mutex)); @@ -1286,6 +1290,8 @@ trx_commit_for_mysql( sig to the transaction, we must here make sure that trx has been started. */ + ut_a(trx); + trx->op_info = "committing"; trx_start_if_not_started(trx); @@ -1309,29 +1315,13 @@ trx_mark_sql_stat_end( /*==================*/ trx_t* trx) /* in: trx handle */ { - trx_start_if_not_started(trx); - - mutex_enter(&kernel_mutex); - - trx->last_sql_stat_start.least_undo_no = trx->undo_no; - - mutex_exit(&kernel_mutex); -} - -/************************************************************************** -Marks the latest SQL statement ended but does not start a new transaction -if the trx is not started. */ + ut_a(trx); -void -trx_mark_sql_stat_end_do_not_start_new( -/*===================================*/ - trx_t* trx) /* in: trx handle */ -{ - mutex_enter(&kernel_mutex); + if (trx->conc_state == TRX_NOT_STARTED) { + trx->undo_no = ut_dulint_zero; + } trx->last_sql_stat_start.least_undo_no = trx->undo_no; - - mutex_exit(&kernel_mutex); } /************************************************************************** diff --git a/innobase/trx/trx0undo.c b/innobase/trx/trx0undo.c index 598090bdee2..8b83163bfc2 100644 --- a/innobase/trx/trx0undo.c +++ b/innobase/trx/trx0undo.c @@ -1220,8 +1220,14 @@ trx_undo_lists_init( for (i = 0; i < TRX_RSEG_N_SLOTS; i++) { page_no = trx_rsegf_get_nth_undo(rseg_header, i, &mtr); - if (page_no != FIL_NULL) { - + /* In forced recovery: try to avoid operations which look + at database pages; undo logs are rapidly changing data, and + the probability that they are in an inconsistent state is + high */ + + if (page_no != FIL_NULL + && srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) { + undo = 
trx_undo_mem_create_at_db_start(rseg, i, page_no, &mtr); size += undo->size; diff --git a/innobase/ut/ut0mem.c b/innobase/ut/ut0mem.c index 630bd3a9b71..84a79d39556 100644 --- a/innobase/ut/ut0mem.c +++ b/innobase/ut/ut0mem.c @@ -78,7 +78,7 @@ ut_malloc_low( fprintf(stderr, "InnoDB: Fatal error: cannot allocate %lu bytes of\n" "InnoDB: memory with malloc! Total allocated memory\n" - "InnoDB: by InnoDB %lu bytes. Operating system errno: %lu\n" + "InnoDB: by InnoDB %lu bytes. Operating system errno: %d\n" "InnoDB: Cannot continue operation!\n" "InnoDB: Check if you should increase the swap file or\n" "InnoDB: ulimits of your operating system.\n", @@ -155,7 +155,7 @@ ut_free_all_mem(void) os_fast_mutex_lock(&ut_list_mutex); - while (block = UT_LIST_GET_FIRST(ut_mem_block_list)) { + while ((block = UT_LIST_GET_FIRST(ut_mem_block_list))) { ut_a(block->magic_n == UT_MEM_MAGIC_N); ut_a(ut_total_allocated_memory >= block->size); |