diff options
author | unknown <heikki@hundin.mysql.fi> | 2002-10-29 23:16:46 +0200 |
---|---|---|
committer | unknown <heikki@hundin.mysql.fi> | 2002-10-29 23:16:46 +0200 |
commit | 20f0aa168fff3389824f63d17a0b16a4f4e80604 (patch) | |
tree | e57bf300e559932ce45e0f749d7349577e7e0479 | |
parent | c327bf8a0f9c23f996b6822db65b09402f0f93ef (diff) | |
download | mariadb-git-20f0aa168fff3389824f63d17a0b16a4f4e80604.tar.gz |
Many files:
Merge InnoDB-4.0.5: adds support for the new isolation levels READ COMMITTED and READ UNCOMMITTED, and adds selective deadlock resolution
mysqld.cc:
Change MySQL default isolation level to REPEATABLE READ; note that InnoDB has always had that default, and BDB and MyISAM always run at SERIALIZABLE level anyway
sql/mysqld.cc:
Change MySQL default isolation level to REPEATABLE READ; note that InnoDB has always had that default, and BDB and MyISAM always run at SERIALIZABLE level anyway
sql/ha_innodb.cc:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
sql/ha_innodb.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/buf0buf.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/dict0dict.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/fil0fil.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/lock0lock.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/os0file.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/os0proc.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/os0thread.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/page0cur.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/page0page.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/read0read.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/rem0rec.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/srv0srv.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/sync0rw.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/sync0sync.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/trx0purge.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/trx0trx.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/rem0rec.ic:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/btr/btr0btr.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/btr/btr0cur.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/btr/btr0pcur.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/buf/buf0buf.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/buf/buf0flu.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/dict/dict0dict.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/fil/fil0fil.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/fsp/fsp0fsp.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/ibuf/ibuf0ibuf.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/lock/lock0lock.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/mem/mem0dbg.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/os/os0file.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/os/os0proc.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/page/page0cur.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/page/page0page.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/pars/lexyy.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/pars/pars0grm.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/read/read0read.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/row/row0ins.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/row/row0mysql.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/row/row0purge.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/row/row0sel.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/row/row0uins.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/row/row0undo.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/row/row0upd.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/srv/srv0srv.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/srv/srv0start.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/sync/sync0rw.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/sync/sync0sync.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/trx/trx0purge.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/trx/trx0trx.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
51 files changed, 1572 insertions, 575 deletions
diff --git a/innobase/btr/btr0btr.c b/innobase/btr/btr0btr.c index 7a7678b2dcf..62a86d342a2 100644 --- a/innobase/btr/btr0btr.c +++ b/innobase/btr/btr0btr.c @@ -274,6 +274,7 @@ btr_page_create( ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); page_create(page, mtr); + buf_block_align(page)->check_index_page_at_flush = TRUE; btr_page_set_index_id(page, tree->id, mtr); } @@ -713,6 +714,7 @@ btr_create( /* Create a new index page on the the allocated segment page */ page = page_create(frame, mtr); + buf_block_align(page)->check_index_page_at_flush = TRUE; /* Set the index id of the page */ btr_page_set_index_id(page, index_id, mtr); @@ -847,6 +849,7 @@ btr_page_reorganize_low( segment headers, next page-field, etc.) is preserved intact */ page_create(page, mtr); + buf_block_align(page)->check_index_page_at_flush = TRUE; /* Copy the records from the temporary space to the recreated page; do not copy the lock bits yet */ @@ -919,6 +922,7 @@ btr_page_empty( segment headers, next page-field, etc.) 
is preserved intact */ page_create(page, mtr); + buf_block_align(page)->check_index_page_at_flush = TRUE; } /***************************************************************** diff --git a/innobase/btr/btr0cur.c b/innobase/btr/btr0cur.c index 3d6b63def6c..24f0447d55d 100644 --- a/innobase/btr/btr0cur.c +++ b/innobase/btr/btr0cur.c @@ -121,16 +121,19 @@ btr_cur_latch_leaves( { ulint left_page_no; ulint right_page_no; - + page_t* get_page; + ut_ad(tree && page && mtr); if (latch_mode == BTR_SEARCH_LEAF) { - btr_page_get(space, page_no, RW_S_LATCH, mtr); + get_page = btr_page_get(space, page_no, RW_S_LATCH, mtr); + buf_block_align(get_page)->check_index_page_at_flush = TRUE; } else if (latch_mode == BTR_MODIFY_LEAF) { - btr_page_get(space, page_no, RW_X_LATCH, mtr); + get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr); + buf_block_align(get_page)->check_index_page_at_flush = TRUE; } else if (latch_mode == BTR_MODIFY_TREE) { @@ -138,15 +141,22 @@ btr_cur_latch_leaves( left_page_no = btr_page_get_prev(page, mtr); if (left_page_no != FIL_NULL) { - btr_page_get(space, left_page_no, RW_X_LATCH, mtr); + get_page = btr_page_get(space, left_page_no, + RW_X_LATCH, mtr); + buf_block_align(get_page)->check_index_page_at_flush = + TRUE; } - btr_page_get(space, page_no, RW_X_LATCH, mtr); + get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr); + buf_block_align(get_page)->check_index_page_at_flush = TRUE; right_page_no = btr_page_get_next(page, mtr); if (right_page_no != FIL_NULL) { - btr_page_get(space, right_page_no, RW_X_LATCH, mtr); + get_page = btr_page_get(space, right_page_no, + RW_X_LATCH, mtr); + buf_block_align(get_page)->check_index_page_at_flush = + TRUE; } } else if (latch_mode == BTR_SEARCH_PREV) { @@ -157,9 +167,12 @@ btr_cur_latch_leaves( if (left_page_no != FIL_NULL) { cursor->left_page = btr_page_get(space, left_page_no, RW_S_LATCH, mtr); + buf_block_align( + cursor->left_page)->check_index_page_at_flush = TRUE; } - btr_page_get(space, page_no, 
RW_S_LATCH, mtr); + get_page = btr_page_get(space, page_no, RW_S_LATCH, mtr); + buf_block_align(get_page)->check_index_page_at_flush = TRUE; } else if (latch_mode == BTR_MODIFY_PREV) { @@ -169,9 +182,12 @@ btr_cur_latch_leaves( if (left_page_no != FIL_NULL) { cursor->left_page = btr_page_get(space, left_page_no, RW_X_LATCH, mtr); + buf_block_align( + cursor->left_page)->check_index_page_at_flush = TRUE; } - btr_page_get(space, page_no, RW_X_LATCH, mtr); + get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr); + buf_block_align(get_page)->check_index_page_at_flush = TRUE; } else { ut_error; } @@ -274,6 +290,7 @@ btr_cur_search_to_nth_level( if (btr_search_latch.writer == RW_LOCK_NOT_LOCKED && latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ && !estimate + && mode != PAGE_CUR_LE_OR_EXTENDS && btr_search_guess_on_hash(index, info, tuple, mode, latch_mode, cursor, has_search_latch, mtr)) { @@ -334,12 +351,18 @@ btr_cur_search_to_nth_level( rw_latch = RW_NO_LATCH; buf_mode = BUF_GET; + /* We use these modified search modes on non-leaf levels of the + B-tree. These let us end up in the right B-tree leaf. In that leaf + we use the original search mode. 
*/ + if (mode == PAGE_CUR_GE) { page_mode = PAGE_CUR_L; } else if (mode == PAGE_CUR_G) { page_mode = PAGE_CUR_LE; } else if (mode == PAGE_CUR_LE) { page_mode = PAGE_CUR_LE; + } else if (mode == PAGE_CUR_LE_OR_EXTENDS) { + page_mode = PAGE_CUR_LE_OR_EXTENDS; } else { ut_ad(mode == PAGE_CUR_L); page_mode = PAGE_CUR_L; @@ -390,6 +413,8 @@ retry_page_get: goto retry_page_get; } + + buf_block_align(page)->check_index_page_at_flush = TRUE; #ifdef UNIV_SYNC_DEBUG if (rw_latch != RW_NO_LATCH) { @@ -543,6 +568,8 @@ btr_cur_open_at_index_side( ut_ad(0 == ut_dulint_cmp(tree->id, btr_page_get_index_id(page))); + buf_block_align(page)->check_index_page_at_flush = TRUE; + if (height == ULINT_UNDEFINED) { /* We are in the root node */ diff --git a/innobase/btr/btr0pcur.c b/innobase/btr/btr0pcur.c index 8ca3d41f7f9..b2115dfdd6c 100644 --- a/innobase/btr/btr0pcur.c +++ b/innobase/btr/btr0pcur.c @@ -354,6 +354,7 @@ btr_pcur_move_to_next_page( ut_ad(next_page_no != FIL_NULL); next_page = btr_page_get(space, next_page_no, cursor->latch_mode, mtr); + buf_block_align(next_page)->check_index_page_at_flush = TRUE; btr_leaf_page_release(page, cursor->latch_mode, mtr); diff --git a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c index ee8e8b91f8d..4524fa1a4f9 100644 --- a/innobase/buf/buf0buf.c +++ b/innobase/buf/buf0buf.c @@ -331,6 +331,11 @@ buf_page_print( index->table_name, index->name); } + } else if (fil_page_get_type(read_buf) == FIL_PAGE_INODE) { + fprintf(stderr, "InnoDB: Page may be an 'inode' page\n"); + } else if (fil_page_get_type(read_buf) == FIL_PAGE_IBUF_FREE_LIST) { + fprintf(stderr, + "InnoDB: Page may be an insert buffer free list page\n"); } } @@ -351,6 +356,8 @@ buf_block_init( block->file_page_was_freed = FALSE; + block->check_index_page_at_flush = FALSE; + rw_lock_create(&(block->lock)); ut_ad(rw_lock_validate(&(block->lock))); @@ -617,6 +624,29 @@ buf_page_peek_block( } /************************************************************************ +Resets the 
check_index_page_at_flush field of a page if found in the buffer +pool. */ + +void +buf_reset_check_index_page_at_flush( +/*================================*/ + ulint space, /* in: space id */ + ulint offset) /* in: page number */ +{ + buf_block_t* block; + + mutex_enter_fast(&(buf_pool->mutex)); + + block = buf_page_hash_get(space, offset); + + if (block) { + block->check_index_page_at_flush = FALSE; + } + + mutex_exit(&(buf_pool->mutex)); +} + +/************************************************************************ Returns the current state of is_hashed of a page. FALSE if the page is not in the pool. NOTE that this operation does not fix the page in the pool if it is found there. */ @@ -1185,6 +1215,8 @@ buf_page_init( block->space = space; block->offset = offset; + block->check_index_page_at_flush = FALSE; + block->lock_hash_val = lock_rec_hash(space, offset); block->lock_mutex = NULL; diff --git a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c index 4c6850af078..78bde60c9b2 100644 --- a/innobase/buf/buf0flu.c +++ b/innobase/buf/buf0flu.c @@ -15,6 +15,7 @@ Created 11/11/1995 Heikki Tuuri #include "ut0byte.h" #include "ut0lst.h" +#include "page0page.h" #include "fil0fil.h" #include "buf0buf.h" #include "buf0lru.h" @@ -225,6 +226,24 @@ buf_flush_buffered_writes(void) return; } + for (i = 0; i < trx_doublewrite->first_free; i++) { + block = trx_doublewrite->buf_block_arr[i]; + + if (block->check_index_page_at_flush + && !page_simple_validate(block->frame)) { + + buf_page_print(block->frame); + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Apparent corruption of an index page\n" + "InnoDB: to be written to data file. 
We intentionally crash server\n" + "InnoDB: to prevent corrupt data from ending up in data\n" + "InnoDB: files.\n"); + ut_a(0); + } + } + if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { len = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE; } else { diff --git a/innobase/dict/dict0dict.c b/innobase/dict/dict0dict.c index 095c27f1c5f..2ee2c9d18a9 100644 --- a/innobase/dict/dict0dict.c +++ b/innobase/dict/dict0dict.c @@ -29,7 +29,14 @@ Created 1/8/1996 Heikki Tuuri dict_sys_t* dict_sys = NULL; /* the dictionary system */ -rw_lock_t dict_foreign_key_check_lock; +rw_lock_t dict_operation_lock; /* table create, drop, etc. reserve + this in X-mode, implicit or backround + operations purge, rollback, foreign + key checks reserve this in S-mode; we + cannot trust that MySQL protects + implicit or background operations + from dropping a table: this is our + mechanism */ #define DICT_HEAP_SIZE 100 /* initial memory heap size when creating a table or index object */ @@ -509,9 +516,8 @@ dict_init(void) UT_LIST_INIT(dict_sys->table_LRU); - rw_lock_create(&dict_foreign_key_check_lock); - rw_lock_set_level(&dict_foreign_key_check_lock, - SYNC_FOREIGN_KEY_CHECK); + rw_lock_create(&dict_operation_lock); + rw_lock_set_level(&dict_operation_lock, SYNC_DICT_OPERATION); } /************************************************************************** @@ -1851,14 +1857,14 @@ loop: /************************************************************************* Accepts a specified string. Comparisons are case-insensitive. 
*/ -static + char* dict_accept( /*========*/ /* out: if string was accepted, the pointer is moved after that, else ptr is returned */ char* ptr, /* in: scan from this */ - const char* string, /* in: accept only this string as the next + const char* string,/* in: accept only this string as the next non-whitespace string */ ibool* success)/* out: TRUE if accepted */ { diff --git a/innobase/fil/fil0fil.c b/innobase/fil/fil0fil.c index 3e0f21395ef..98980f6c337 100644 --- a/innobase/fil/fil0fil.c +++ b/innobase/fil/fil0fil.c @@ -967,6 +967,7 @@ fil_extend_last_data_file( fil_node_t* node; fil_space_t* space; fil_system_t* system = fil_system; + byte* buf2; byte* buf; ibool success; ulint i; @@ -981,19 +982,23 @@ fil_extend_last_data_file( fil_node_prepare_for_io(node, system, space); - buf = mem_alloc(1024 * 1024); + buf2 = mem_alloc(1024 * 1024 + UNIV_PAGE_SIZE); + buf = ut_align(buf2, UNIV_PAGE_SIZE); memset(buf, '\0', 1024 * 1024); for (i = 0; i < size_increase / ((1024 * 1024) / UNIV_PAGE_SIZE); i++) { - success = os_file_write(node->name, node->handle, buf, + /* If we use native Windows aio, then also this write is + done using it */ + + success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC, + node->name, node->handle, buf, (node->size << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFF, node->size >> (32 - UNIV_PAGE_SIZE_SHIFT), - 1024 * 1024); + 1024 * 1024, NULL, NULL); if (!success) { - break; } @@ -1003,7 +1008,7 @@ fil_extend_last_data_file( os_has_said_disk_full = FALSE; } - mem_free(buf); + mem_free(buf2); fil_node_complete_io(node, system, OS_FILE_WRITE); @@ -1528,7 +1533,6 @@ fil_page_set_type( ulint type) /* in: type */ { ut_ad(page); - ut_ad((type == FIL_PAGE_INDEX) || (type == FIL_PAGE_UNDO_LOG)); mach_write_to_2(page + FIL_PAGE_TYPE, type); } diff --git a/innobase/fsp/fsp0fsp.c b/innobase/fsp/fsp0fsp.c index 1abb043fdc2..ff586819d4a 100644 --- a/innobase/fsp/fsp0fsp.c +++ b/innobase/fsp/fsp0fsp.c @@ -769,6 +769,8 @@ fsp_init_file_page_low( #endif page = buf_frame_align(ptr); 
+ buf_block_align(page)->check_index_page_at_flush = FALSE; + #ifdef UNIV_BASIC_LOG_DEBUG /* printf("In log debug version: Erase the contents of the file page\n"); */ @@ -1097,7 +1099,7 @@ fsp_fill_free_list( /* Initialize the ibuf page in a separate mini-transaction because it is low in the latching - order, and we must be able to release the its latch + order, and we must be able to release its latch before returning from the fsp routine */ mtr_start(&ibuf_mtr); @@ -1264,7 +1266,12 @@ fsp_alloc_free_page( free = xdes_find_bit(descr, XDES_FREE_BIT, TRUE, hint % FSP_EXTENT_SIZE, mtr); - ut_a(free != ULINT_UNDEFINED); + if (free == ULINT_UNDEFINED) { + + ut_print_buf(((byte*)descr) - 500, 1000); + + ut_a(0); + } xdes_set_bit(descr, XDES_FREE_BIT, free, FALSE, mtr); @@ -1412,7 +1419,12 @@ fsp_free_extent( descr = xdes_get_descriptor_with_space_hdr(header, space, page, mtr); - ut_a(xdes_get_state(descr, mtr) != XDES_FREE); + if (xdes_get_state(descr, mtr) == XDES_FREE) { + + ut_print_buf(((byte*)descr) - 500, 1000); + + ut_a(0); + } xdes_init(descr, mtr); @@ -1523,6 +1535,10 @@ fsp_alloc_seg_inode_page( page = buf_page_get(space, page_no, RW_X_LATCH, mtr); + buf_block_align(page)->check_index_page_at_flush = FALSE; + + fil_page_set_type(page, FIL_PAGE_INODE); + buf_page_dbg_add_level(page, SYNC_FSP_PAGE); for (i = 0; i < FSP_SEG_INODES_PER_PAGE; i++) { @@ -2298,6 +2314,8 @@ fseg_alloc_free_page_low( fseg_mark_page_used(seg_inode, space, ret_page, mtr); } + buf_reset_check_index_page_at_flush(space, ret_page); + return(ret_page); } diff --git a/innobase/ibuf/ibuf0ibuf.c b/innobase/ibuf/ibuf0ibuf.c index b7d691485cc..143b3bfa584 100644 --- a/innobase/ibuf/ibuf0ibuf.c +++ b/innobase/ibuf/ibuf0ibuf.c @@ -1295,6 +1295,8 @@ ibuf_add_free_page( flst_add_last(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr); + fil_page_set_type(page, FIL_PAGE_IBUF_FREE_LIST); + ibuf_data->seg_size++; ibuf_data->free_list_len++; @@ -1305,6 
+1307,7 @@ ibuf_add_free_page( ibuf_bitmap_page_set_bits(bitmap_page, page_no, IBUF_BITMAP_IBUF, TRUE, &mtr); + mtr_commit(&mtr); mutex_exit(&ibuf_mutex); diff --git a/innobase/include/buf0buf.h b/innobase/include/buf0buf.h index 591c0ec54ab..f76c437bd1d 100644 --- a/innobase/include/buf0buf.h +++ b/innobase/include/buf0buf.h @@ -274,6 +274,15 @@ buf_page_peek_block( ulint space, /* in: space id */ ulint offset);/* in: page number */ /************************************************************************ +Resets the check_index_page_at_flush field of a page if found in the buffer +pool. */ + +void +buf_reset_check_index_page_at_flush( +/*================================*/ + ulint space, /* in: space id */ + ulint offset);/* in: page number */ +/************************************************************************ Sets file_page_was_freed TRUE if the page is found in the buffer pool. This function should be called when we free a file page and want the debug version to check that it is not accessed any more unless @@ -648,6 +657,14 @@ struct buf_block_struct{ then it can wait for this rw-lock */ buf_block_t* hash; /* node used in chaining to the page hash table */ + ibool check_index_page_at_flush; + /* TRUE if we know that this is + an index page, and want the database + to check its consistency before flush; + note that there may be pages in the + buffer pool which are index pages, + but this flag is not set because + we do not keep track of all pages */ /* 2. Page flushing fields */ UT_LIST_NODE_T(buf_block_t) flush_list; diff --git a/innobase/include/dict0dict.h b/innobase/include/dict0dict.h index dd92c5aa467..b5e6e04a1de 100644 --- a/innobase/include/dict0dict.h +++ b/innobase/include/dict0dict.h @@ -26,6 +26,18 @@ Created 1/8/1996 Heikki Tuuri #include "ut0byte.h" #include "trx0types.h" +/************************************************************************* +Accepts a specified string. Comparisons are case-insensitive. 
*/ + +char* +dict_accept( +/*========*/ + /* out: if string was accepted, the pointer + is moved after that, else ptr is returned */ + char* ptr, /* in: scan from this */ + const char* string,/* in: accept only this string as the next + non-whitespace string */ + ibool* success);/* out: TRUE if accepted */ /************************************************************************ Decrements the count of open MySQL handles to a table. */ @@ -798,7 +810,7 @@ dict_mutex_exit_for_mysql(void); extern dict_sys_t* dict_sys; /* the dictionary system */ -extern rw_lock_t dict_foreign_key_check_lock; +extern rw_lock_t dict_operation_lock; /* Dictionary system struct */ struct dict_sys_struct{ diff --git a/innobase/include/fil0fil.h b/innobase/include/fil0fil.h index 63e20221c16..23ef0304b2d 100644 --- a/innobase/include/fil0fil.h +++ b/innobase/include/fil0fil.h @@ -73,6 +73,8 @@ extern fil_addr_t fil_addr_null; /* File page types */ #define FIL_PAGE_INDEX 17855 #define FIL_PAGE_UNDO_LOG 2 +#define FIL_PAGE_INODE 3 +#define FIL_PAGE_IBUF_FREE_LIST 4 /* Space types */ #define FIL_TABLESPACE 501 diff --git a/innobase/include/lock0lock.h b/innobase/include/lock0lock.h index 288356d3270..d3b3d55d015 100644 --- a/innobase/include/lock0lock.h +++ b/innobase/include/lock0lock.h @@ -292,16 +292,12 @@ lock_sec_rec_modify_check_and_lock( dict_index_t* index, /* in: secondary index */ que_thr_t* thr); /* in: query thread */ /************************************************************************* -Checks if locks of other transactions prevent an immediate read, or passing -over by a read cursor, of a clustered index record. If they do, first tests -if the query thread should anyway be suspended for some reason; if not, then -puts the transaction and the query thread to the lock wait state and inserts a -waiting request for a record lock to the lock queue. Sets the requested mode -lock on the record. 
*/ +Like the counterpart for a clustered index below, but now we read a +secondary index record. */ ulint -lock_clust_rec_read_check_and_lock( -/*===============================*/ +lock_sec_rec_read_check_and_lock( +/*=============================*/ /* out: DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set, @@ -309,18 +305,24 @@ lock_clust_rec_read_check_and_lock( rec_t* rec, /* in: user record or page supremum record which should be read or passed over by a read cursor */ - dict_index_t* index, /* in: clustered index */ + dict_index_t* index, /* in: secondary index */ ulint mode, /* in: mode of the lock which the read cursor should set on records: LOCK_S or LOCK_X; the latter is possible in SELECT FOR UPDATE */ + ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ que_thr_t* thr); /* in: query thread */ /************************************************************************* -Like the counterpart for a clustered index above, but now we read a -secondary index record. */ +Checks if locks of other transactions prevent an immediate read, or passing +over by a read cursor, of a clustered index record. If they do, first tests +if the query thread should anyway be suspended for some reason; if not, then +puts the transaction and the query thread to the lock wait state and inserts a +waiting request for a record lock to the lock queue. Sets the requested mode +lock on the record. 
*/ ulint -lock_sec_rec_read_check_and_lock( -/*=============================*/ +lock_clust_rec_read_check_and_lock( +/*===============================*/ /* out: DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set, @@ -328,10 +330,12 @@ lock_sec_rec_read_check_and_lock( rec_t* rec, /* in: user record or page supremum record which should be read or passed over by a read cursor */ - dict_index_t* index, /* in: secondary index */ + dict_index_t* index, /* in: clustered index */ ulint mode, /* in: mode of the lock which the read cursor should set on records: LOCK_S or LOCK_X; the latter is possible in SELECT FOR UPDATE */ + ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ que_thr_t* thr); /* in: query thread */ /************************************************************************* Checks that a record is seen in a consistent read. */ @@ -509,6 +513,7 @@ lock_validate(void); extern lock_sys_t* lock_sys; /* Lock modes and types */ +/* Basic modes */ #define LOCK_NONE 0 /* this flag is used elsewhere to note consistent read */ #define LOCK_IS 2 /* intention shared */ @@ -519,15 +524,20 @@ extern lock_sys_t* lock_sys; in an exclusive mode */ #define LOCK_MODE_MASK 0xF /* mask used to extract mode from the type_mode field in a lock */ +/* Lock types */ #define LOCK_TABLE 16 /* these type values should be so high that */ #define LOCK_REC 32 /* they can be ORed to the lock mode */ #define LOCK_TYPE_MASK 0xF0 /* mask used to extract lock type from the type_mode field in a lock */ +/* Waiting lock flag */ #define LOCK_WAIT 256 /* this wait bit should be so high that it can be ORed to the lock mode and type; when this bit is set, it means that the lock has not yet been granted, it is just waiting for its turn in the wait queue */ +/* Precise modes */ +#define LOCK_ORDINARY 0 /* this flag denotes an ordinary next-key lock + in contrast to LOCK_GAP or LOCK_REC_NOT_GAP */ #define LOCK_GAP 
512 /* this gap bit should be so high that it can be ORed to the other flags; when this bit is set, it means that the @@ -537,7 +547,15 @@ extern lock_sys_t* lock_sys; the bit is set; locks of this type are created when records are removed from the index chain of records */ -#define LOCK_INSERT_INTENTION 1024 /* this bit is set when we place a waiting +#define LOCK_REC_NOT_GAP 1024 /* this bit means that the lock is only on + the index record and does NOT block inserts + to the gap before the index record; this is + used in the case when we retrieve a record + with a unique key, and is also used in + locking plain SELECTs (not part of UPDATE + or DELETE) when the user has set the READ + COMMITTED isolation level */ +#define LOCK_INSERT_INTENTION 2048 /* this bit is set when we place a waiting gap type record lock request in order to let an insert of an index record to wait until there are no conflicting locks by other diff --git a/innobase/include/os0file.h b/innobase/include/os0file.h index d65c7fd47e3..a7624a90d5e 100644 --- a/innobase/include/os0file.h +++ b/innobase/include/os0file.h @@ -111,6 +111,7 @@ log. */ #define OS_WIN31 1 #define OS_WIN95 2 #define OS_WINNT 3 +#define OS_WIN2000 4 extern ulint os_n_file_reads; extern ulint os_n_file_writes; @@ -122,7 +123,7 @@ Gets the operating system version. Currently works only on Windows. */ ulint os_get_os_version(void); /*===================*/ - /* out: OS_WIN95, OS_WIN31, OS_WINNT (2000 == NT) */ + /* out: OS_WIN95, OS_WIN31, OS_WINNT, or OS_WIN2000 */ /******************************************************************** Creates the seek mutexes used in positioned reads and writes. 
*/ diff --git a/innobase/include/os0proc.h b/innobase/include/os0proc.h index 9da1f33e070..79750e5c1f7 100644 --- a/innobase/include/os0proc.h +++ b/innobase/include/os0proc.h @@ -16,6 +16,15 @@ typedef void* os_process_t; typedef unsigned long int os_process_id_t; /******************************************************************** +Converts the current process id to a number. It is not guaranteed that the +number is unique. In Linux returns the 'process number' of the current +thread. That number is the same as one sees in 'top', for example. In Linux +the thread id is not the same as one sees in 'top'. */ + +ulint +os_proc_get_number(void); +/*====================*/ +/******************************************************************** Allocates non-cacheable memory. */ void* diff --git a/innobase/include/os0thread.h b/innobase/include/os0thread.h index 8355afa46e9..efc8651e06d 100644 --- a/innobase/include/os0thread.h +++ b/innobase/include/os0thread.h @@ -16,11 +16,8 @@ Created 9/8/1995 Heikki Tuuri this is also the size of the wait slot array for MySQL threads which can wait inside InnoDB */ #ifdef __WIN__ -/* Windows 95/98/ME seemed to have difficulties creating the all -the event semaphores for the wait array slots. If the computer had -<= 64 MB memory, InnoDB startup could take minutes or even crash. -That is why we set this to only 1000 in Windows. 
*/ - +/* Create fewer event semaphores because Win 98/ME had difficulty creating +40000 event semaphores */ #define OS_THREAD_MAX_N 1000 #else #define OS_THREAD_MAX_N 10000 diff --git a/innobase/include/page0cur.h b/innobase/include/page0cur.h index 144e0e02b21..c3f0decdb4b 100644 --- a/innobase/include/page0cur.h +++ b/innobase/include/page0cur.h @@ -26,7 +26,12 @@ Created 10/4/1994 Heikki Tuuri #define PAGE_CUR_GE 2 #define PAGE_CUR_L 3 #define PAGE_CUR_LE 4 -#define PAGE_CUR_DBG 5 +#define PAGE_CUR_LE_OR_EXTENDS 5 /* This is a search mode used in + "column LIKE 'abc%' ORDER BY column DESC"; + we have to find strings which are <= 'abc' or + which extend it */ +#define PAGE_CUR_DBG 6 + extern ulint page_cur_short_succ; diff --git a/innobase/include/page0page.h b/innobase/include/page0page.h index 2f77127466f..b5e33af5bc0 100644 --- a/innobase/include/page0page.h +++ b/innobase/include/page0page.h @@ -666,6 +666,16 @@ page_rec_validate( /* out: TRUE if ok */ rec_t* rec); /* in: record on the page */ /******************************************************************* +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. */ + +ibool +page_simple_validate( +/*=================*/ + /* out: TRUE if ok */ + page_t* page); /* in: index page */ +/******************************************************************* This function checks the consistency of an index page. */ ibool diff --git a/innobase/include/read0read.h b/innobase/include/read0read.h index cebb2d6701c..db6bf888095 100644 --- a/innobase/include/read0read.h +++ b/innobase/include/read0read.h @@ -45,6 +45,14 @@ read_view_close( /*============*/ read_view_t* view); /* in: read view */ /************************************************************************* +Closes a consistent read view for MySQL.
This function is called at an SQL +statement end if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */ + +void +read_view_close_for_mysql( +/*======================*/ + trx_t* trx); /* in: trx which has a read view */ +/************************************************************************* Checks if a read view sees the specified transaction. */ UNIV_INLINE ibool diff --git a/innobase/include/rem0rec.h b/innobase/include/rem0rec.h index 12e3a8b39d6..b28f39925c1 100644 --- a/innobase/include/rem0rec.h +++ b/innobase/include/rem0rec.h @@ -148,12 +148,22 @@ data field in the record. */ byte* rec_get_nth_field( /*==============*/ - /* out: pointer to the field, NULL if SQL null */ + /* out: pointer to the field */ rec_t* rec, /* in: record */ ulint n, /* in: index of the field */ ulint* len); /* out: length of the field; UNIV_SQL_NULL if SQL null */ /**************************************************************** +Return field length or UNIV_SQL_NULL. */ +UNIV_INLINE +ulint +rec_get_nth_field_len( +/*==================*/ + /* out: length of the field; UNIV_SQL_NULL if SQL + null */ + rec_t* rec, /* in: record */ + ulint n); /* in: index of the field */ +/**************************************************************** Gets the physical size of a field. Also an SQL null may have a field of size > 0, if the data type is of a fixed size. */ UNIV_INLINE diff --git a/innobase/include/rem0rec.ic b/innobase/include/rem0rec.ic index aaa3c58a003..9dfd4faeec8 100644 --- a/innobase/include/rem0rec.ic +++ b/innobase/include/rem0rec.ic @@ -65,6 +65,24 @@ a field stored to another page: */ #define REC_2BYTE_EXTERN_MASK 0x4000 +/**************************************************************** +Return field length or UNIV_SQL_NULL. 
*/ +UNIV_INLINE +ulint +rec_get_nth_field_len( +/*==================*/ + /* out: length of the field; UNIV_SQL_NULL if SQL + null */ + rec_t* rec, /* in: record */ + ulint n) /* in: index of the field */ +{ + ulint len; + + rec_get_nth_field(rec, n, &len); + + return(len); +} + /*************************************************************** Sets the value of the ith field SQL null bit. */ diff --git a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h index 9de5e9ccfc9..4d2768cf109 100644 --- a/innobase/include/srv0srv.h +++ b/innobase/include/srv0srv.h @@ -57,8 +57,6 @@ extern ulint srv_flush_log_at_trx_commit; extern byte srv_latin1_ordering[256];/* The sort order table of the latin1 character set */ -extern ibool srv_use_native_aio; - extern ulint srv_pool_size; extern ulint srv_mem_pool_size; extern ulint srv_lock_table_size; @@ -70,8 +68,9 @@ extern dulint srv_archive_recovery_limit_lsn; extern ulint srv_lock_wait_timeout; -extern char* srv_unix_file_flush_method_str; +extern char* srv_file_flush_method_str; extern ulint srv_unix_file_flush_method; +extern ulint srv_win_file_flush_method; extern ulint srv_force_recovery; extern ulint srv_thread_concurrency; @@ -154,13 +153,19 @@ typedef struct srv_sys_struct srv_sys_t; /* The server system */ extern srv_sys_t* srv_sys; -/* Alternatives for the field flush option in Unix; see the InnoDB manual about +/* Alternatives for the file flush option in Unix; see the InnoDB manual about what these mean */ -#define SRV_UNIX_FDATASYNC 1 +#define SRV_UNIX_FDATASYNC 1 /* This is the default; it is currently mapped + to a call of fsync() because fdatasync() + seemed to corrupt files in Linux and Solaris */ #define SRV_UNIX_O_DSYNC 2 #define SRV_UNIX_LITTLESYNC 3 #define SRV_UNIX_NOSYNC 4 +/* Alternatives for file i/o in Windows */ +#define SRV_WIN_IO_NORMAL 1 +#define SRV_WIN_IO_UNBUFFERED 2 /* This is the default */ + /* Alternatives for srv_force_recovery. 
Non-zero values are intended to help the user get a damaged database up so that he can dump intact tables and rows with SELECT INTO OUTFILE. The database must not otherwise @@ -311,15 +316,17 @@ srv_conc_exit_innodb( trx_t* trx); /* in: transaction object associated with the thread */ /******************************************************************* -Puts a MySQL OS thread to wait for a lock to be released. */ +Puts a MySQL OS thread to wait for a lock to be released. If an error +occurs during the wait trx->error_state associated with thr is +!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK +are possible errors. DB_DEADLOCK is returned if selective deadlock +resolution chose this transaction as a victim. */ -ibool +void srv_suspend_mysql_thread( /*=====================*/ - /* out: TRUE if the lock wait timeout was - exceeded */ - que_thr_t* thr); /* in: query thread associated with - the MySQL OS thread */ + que_thr_t* thr); /* in: query thread associated with the MySQL + OS thread */ /************************************************************************ Releases a MySQL OS thread waiting for a lock to be released, if the thread is already suspended. */ @@ -407,3 +414,4 @@ struct srv_sys_struct{ extern ulint srv_n_threads_active[]; #endif + diff --git a/innobase/include/sync0rw.h b/innobase/include/sync0rw.h index 7ad38f5bc7f..5aa3dcdffc3 100644 --- a/innobase/include/sync0rw.h +++ b/innobase/include/sync0rw.h @@ -335,7 +335,8 @@ ibool rw_lock_own( /*========*/ rw_lock_t* lock, /* in: rw-lock */ - ulint lock_type); /* in: lock type */ + ulint lock_type); /* in: lock type: RW_LOCK_SHARED, + RW_LOCK_EX */ /********************************************************************** Checks if somebody has locked the rw-lock in the specified mode. 
*/ diff --git a/innobase/include/sync0sync.h b/innobase/include/sync0sync.h index 5bfa0bc2d48..320f8faf12d 100644 --- a/innobase/include/sync0sync.h +++ b/innobase/include/sync0sync.h @@ -371,10 +371,12 @@ or row lock! */ #define SYNC_NO_ORDER_CHECK 3000 /* this can be used to suppress latching order checking */ #define SYNC_LEVEL_NONE 2000 /* default: level not defined */ -#define SYNC_FOREIGN_KEY_CHECK 1001 +#define SYNC_DICT_OPERATION 1001 /* table create, drop, etc. reserve + this in X-mode; implicit or background + operations (purge, rollback, foreign + key checks) reserve this in S-mode */ #define SYNC_DICT 1000 #define SYNC_DICT_AUTOINC_MUTEX 999 -#define SYNC_PURGE_IS_RUNNING 997 #define SYNC_DICT_HEADER 995 #define SYNC_IBUF_HEADER 914 #define SYNC_IBUF_PESS_INSERT_MUTEX 912 diff --git a/innobase/include/trx0purge.h b/innobase/include/trx0purge.h index 087be2f060e..049c79aec9b 100644 --- a/innobase/include/trx0purge.h +++ b/innobase/include/trx0purge.h @@ -111,9 +111,6 @@ struct trx_purge_struct{ of the trx system and it never ends */ que_t* query; /* The query graph which will do the parallelized purge operation */ - rw_lock_t purge_is_running;/* Purge operation set an x-latch here - while it is accessing a table: this - prevents dropping of the table */ rw_lock_t latch; /* The latch protecting the purge view. A purge operation must acquire an x-latch here for the instant at which diff --git a/innobase/include/trx0trx.h b/innobase/include/trx0trx.h index 9b29c481b6d..874b126e47c 100644 --- a/innobase/include/trx0trx.h +++ b/innobase/include/trx0trx.h @@ -327,6 +327,7 @@ struct trx_struct{ time_t start_time; /* time the trx object was created or the state last time became TRX_ACTIVE */ + ulint isolation_level;/* TRX_ISO_REPEATABLE_READ, ...
*/ ibool check_foreigns; /* normally TRUE, but if the user wants to suppress foreign key checks, (in table imports, for example) we @@ -350,6 +351,9 @@ struct trx_struct{ /*------------------------------*/ void* mysql_thd; /* MySQL thread handle corresponding to this trx, or NULL */ + char** mysql_query_str;/* pointer to the field in mysqld_thd + which contains the pointer to the + current SQL query string */ char* mysql_log_file_name; /* if MySQL binlog is used, this field contains a pointer to the latest file @@ -371,6 +375,9 @@ struct trx_struct{ replication has processed */ os_thread_id_t mysql_thread_id;/* id of the MySQL thread associated with this transaction object */ + ulint mysql_process_no;/* since in Linux, 'top' reports + process id's and not thread id's, we + store the process number too */ /*------------------------------*/ ulint n_mysql_tables_in_use; /* number of Innobase tables used in the processing of the current @@ -379,9 +386,9 @@ struct trx_struct{ /* how many tables the current SQL statement uses, except those in consistent read */ - ibool has_dict_foreign_key_check_lock; + ibool has_dict_operation_lock; /* TRUE if the trx currently holds - an s-lock on dict_foreign_... */ + an s-lock on dict_operation_lock */ ibool has_search_latch; /* TRUE if this trx has latched the search system latch in S-mode */ @@ -523,6 +530,41 @@ struct trx_struct{ #define TRX_QUE_ROLLING_BACK 3 /* transaction is rolling back */ #define TRX_QUE_COMMITTING 4 /* transaction is committing */ +/* Transaction isolation levels */ +#define TRX_ISO_READ_UNCOMMITTED 1 /* dirty read: non-locking + SELECTs are performed so that + we do not look at a possible + earlier version of a record; + thus they are not 'consistent' + reads under this isolation + level; otherwise like level + 2 */ + +#define TRX_ISO_READ_COMMITTED 2 /* somewhat Oracle-like + isolation, except that in + range UPDATE and DELETE we + must block phantom rows + with next-key locks; + SELECT ... 
FOR UPDATE and ... + LOCK IN SHARE MODE only lock + the index records, NOT the + gaps before them, and thus + allow free inserting; + each consistent read reads its + own snapshot */ + +#define TRX_ISO_REPEATABLE_READ 3 /* this is the default; + all consistent reads in the + same trx read the same + snapshot; + full next-key locking used + in locking reads to block + insertions into gaps */ + +#define TRX_ISO_SERIALIZABLE 4 /* all plain SELECTs are + converted to LOCK IN SHARE + MODE reads */ + /* Types of a trx signal */ #define TRX_SIG_NO_SIGNAL 100 #define TRX_SIG_TOTAL_ROLLBACK 1 diff --git a/innobase/lock/lock0lock.c b/innobase/lock/lock0lock.c index 866fe556af9..92ee5ee6cbe 100644 --- a/innobase/lock/lock0lock.c +++ b/innobase/lock/lock0lock.c @@ -70,6 +70,11 @@ A waiting record lock can also be of the gap type. A waiting lock request can be granted when there is no conflicting mode lock request by another transaction ahead of it in the explicit lock queue. +In version 4.0.5 we added yet another explicit lock type: LOCK_REC_NOT_GAP. +It only locks the record it is placed on, not the gap before the record. +This lock type is necessary to emulate an Oracle-like READ COMMITTED isolation +level. 
+ ------------------------------------------------------------------------- RULE 1: If there is an implicit x-lock on a record, and there are non-gap ------- @@ -294,7 +299,9 @@ struct lock_struct{ UT_LIST_NODE_T(lock_t) trx_locks; /* list of the locks of the transaction */ - ulint type_mode; /* lock type, mode, gap flag, and + ulint type_mode; /* lock type, mode, LOCK_GAP or + LOCK_REC_NOT_GAP, + LOCK_INSERT_INTENTION, wait flag, ORed */ hash_node_t hash; /* hash chain node for a record lock */ dict_index_t* index; /* index for a record lock */ @@ -309,6 +316,10 @@ Monitor will then fetch it and print */ ibool lock_deadlock_found = FALSE; char* lock_latest_err_buf; /* We allocate 5000 bytes for this */ +/* Flags for recursive deadlock search */ +#define LOCK_VICTIM_IS_START 1 +#define LOCK_VICTIM_IS_OTHER 2 + /************************************************************************ Checks if a lock request results in a deadlock. */ static @@ -700,23 +711,23 @@ lock_rec_get_gap( } /************************************************************************* -Sets the gap flag of a record lock. */ +Gets the LOCK_REC_NOT_GAP flag of a record lock. 
*/ UNIV_INLINE -void -lock_rec_set_gap( -/*=============*/ - lock_t* lock, /* in: record lock */ - ibool val) /* in: value to set: TRUE or FALSE */ +ibool +lock_rec_get_rec_not_gap( +/*=====================*/ + /* out: TRUE if LOCK_REC_NOT_GAP flag set */ + lock_t* lock) /* in: record lock */ { ut_ad(lock); - ut_ad((val == TRUE) || (val == FALSE)); ut_ad(lock_get_type(lock) == LOCK_REC); - if (val) { - lock->type_mode = lock->type_mode | LOCK_GAP; - } else { - lock->type_mode = lock->type_mode & ~LOCK_GAP; + if (lock->type_mode & LOCK_REC_NOT_GAP) { + + return(TRUE); } + + return(FALSE); } /************************************************************************* @@ -740,26 +751,6 @@ lock_rec_get_insert_intention( } /************************************************************************* -Sets the waiting insert flag of a record lock. */ -UNIV_INLINE -void -lock_rec_set_insert_intention( -/*==========================*/ - lock_t* lock, /* in: record lock */ - ibool val) /* in: value to set: TRUE or FALSE */ -{ - ut_ad(lock); - ut_ad((val == TRUE) || (val == FALSE)); - ut_ad(lock_get_type(lock) == LOCK_REC); - - if (val) { - lock->type_mode = lock->type_mode | LOCK_INSERT_INTENTION; - } else { - lock->type_mode = lock->type_mode & ~LOCK_INSERT_INTENTION; - } -} - -/************************************************************************* Calculates if lock mode 1 is stronger or equal to lock mode 2. 
*/ UNIV_INLINE ibool @@ -848,48 +839,53 @@ lock_rec_has_to_wait( /* out: TRUE if new lock has to wait for lock2 to be removed */ trx_t* trx, /* in: trx of new lock */ - ulint mode, /* in: LOCK_S or LOCK_X */ - ulint gap, /* in: LOCK_GAP or 0 */ - ulint insert_intention, - /* in: LOCK_INSERT_INTENTION or 0 */ + ulint type_mode,/* in: precise mode of the new lock to set: + LOCK_S or LOCK_X, possibly ORed to + LOCK_GAP or LOCK_REC_NOT_GAP, LOCK_INSERT_INTENTION */ lock_t* lock2) /* in: another record lock; NOTE that it is assumed that this has a lock bit set on the same record as - in lock1 */ + in the new lock we are setting */ { ut_ad(trx && lock2); ut_ad(lock_get_type(lock2) == LOCK_REC); - ut_ad(mode == LOCK_S || mode == LOCK_X); - ut_ad(gap == LOCK_GAP || gap == 0); - ut_ad(insert_intention == LOCK_INSERT_INTENTION - || insert_intention == 0); - if (trx != lock2->trx && !lock_mode_compatible(mode, + if (trx != lock2->trx + && !lock_mode_compatible(LOCK_MODE_MASK & type_mode, lock_get_mode(lock2))) { - /* We have somewhat complex rules when gap type - record locks cause waits */ + /* We have somewhat complex rules when gap type record locks + cause waits */ - if (!gap && lock_rec_get_insert_intention(lock2)) { + if ((type_mode & LOCK_REC_NOT_GAP) + && lock_rec_get_gap(lock2)) { + /* Lock on just the record does not need to wait for + a gap type lock */ + + return(FALSE); + } + + if ((type_mode & LOCK_GAP) + && lock_rec_get_rec_not_gap(lock2)) { + + /* Lock on gap does not need to wait for + a LOCK_REC_NOT_GAP type lock */ - /* Request of a full next-key record does not - need to wait for an insert intention lock to be - removed. This is ok since our rules allow conflicting - locks on gaps. This eliminates a spurious deadlock - caused by a next-key lock waiting for an insert - intention lock; when the insert intention lock was - granted, the insert deadlocked on the waiting - next-key lock. 
*/ - return(FALSE); } - if (insert_intention && lock_rec_get_insert_intention(lock2)) { + if (lock_rec_get_insert_intention(lock2)) { - /* An insert intention is not disturbed by another - insert intention; this removes a spurious deadlock - caused by inserts which had to wait for a next-key - lock to be removed */ + /* No lock request needs to wait for an insert + intention lock to be removed. This is ok since our + rules allow conflicting locks on gaps. This eliminates + a spurious deadlock caused by a next-key lock waiting + for an insert intention lock; when the insert + intention lock was granted, the insert deadlocked on + the waiting next-key lock. + Also, insert intention locks do not disturb each + other. */ + return(FALSE); } @@ -921,10 +917,7 @@ lock_has_to_wait( ut_ad(lock_get_type(lock2) == LOCK_REC); return(lock_rec_has_to_wait(lock1->trx, - lock_get_mode(lock1), - lock_rec_get_gap(lock1), - lock_rec_get_insert_intention(lock1), - lock2)); + lock1->type_mode, lock2)); } return(TRUE); @@ -1386,32 +1379,41 @@ lock_table_has( /*============= FUNCTIONS FOR ANALYZING RECORD LOCK QUEUE ================*/ /************************************************************************* -Checks if a transaction has a GRANTED explicit lock on rec, where the gap -flag or the insert intention flag is not set, stronger or equal to mode. -Note that locks on the supremum of a page are a special case here, since -they are always gap type locks, even if the gap flag is not set in them. */ +Checks if a transaction has a GRANTED explicit lock on rec stronger or equal +to precise_mode. 
*/ UNIV_INLINE lock_t* lock_rec_has_expl( /*==============*/ /* out: lock or NULL */ - ulint mode, /* in: lock mode */ + ulint precise_mode,/* in: LOCK_S or LOCK_X possibly ORed to + LOCK_GAP or LOCK_REC_NOT_GAP, + for a supremum record we regard this always a gap + type request */ rec_t* rec, /* in: record */ trx_t* trx) /* in: transaction */ { lock_t* lock; - - ut_ad(mutex_own(&kernel_mutex)); - ut_ad((mode == LOCK_X) || (mode == LOCK_S)); + ut_ad(mutex_own(&kernel_mutex)); + ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S + || (precise_mode & LOCK_MODE_MASK) == LOCK_X); + ut_ad(!(precise_mode & LOCK_INSERT_INTENTION)); + lock = lock_rec_get_first(rec); while (lock) { if (lock->trx == trx - && lock_mode_stronger_or_eq(lock_get_mode(lock), mode) + && lock_mode_stronger_or_eq(lock_get_mode(lock), + precise_mode & LOCK_MODE_MASK) && !lock_get_wait(lock) - && !lock_rec_get_insert_intention(lock) - && !lock_rec_get_gap(lock)) { + && (!lock_rec_get_rec_not_gap(lock) + || (precise_mode & LOCK_REC_NOT_GAP) + || page_rec_is_supremum(rec)) + && (!lock_rec_get_gap(lock) + || (precise_mode & LOCK_GAP) + || page_rec_is_supremum(rec)) + && (!lock_rec_get_insert_intention(lock))) { return(lock); } @@ -1429,7 +1431,7 @@ lock_t* lock_rec_other_has_expl_req( /*========================*/ /* out: lock or NULL */ - ulint mode, /* in: lock mode */ + ulint mode, /* in: LOCK_S or LOCK_X */ ulint gap, /* in: LOCK_GAP if also gap locks are taken into account, or 0 if not */ ulint wait, /* in: LOCK_WAIT if also waiting locks are @@ -1471,27 +1473,21 @@ lock_t* lock_rec_other_has_conflicting( /*===========================*/ /* out: lock or NULL */ - ulint mode, /* in: lock mode of the lock we are going to reserve */ - ulint gap, /* in: LOCK_GAP if we are going to reserve a gap type - lock, else 0 */ - ulint insert_intention, - /* in: LOCK_INSERT_INTENTION if we are going to - reserve an insert intention lock */ + ulint mode, /* in: LOCK_S or LOCK_X, + possibly ORed to LOCK_GAP or 
LOCK_REC_NOT_GAP, + LOCK_INSERT_INTENTION */ rec_t* rec, /* in: record to look at */ trx_t* trx) /* in: our transaction */ { lock_t* lock; ut_ad(mutex_own(&kernel_mutex)); - ut_ad(mode == LOCK_X || mode == LOCK_S); - ut_ad(gap == 0 || gap == LOCK_GAP); - ut_ad(insert_intention == LOCK_INSERT_INTENTION - || insert_intention == 0); + lock = lock_rec_get_first(rec); while (lock) { - if (lock_rec_has_to_wait(trx, mode, gap, insert_intention, - lock)) { + if (lock_rec_has_to_wait(trx, mode, lock)) { + return(lock); } @@ -1607,14 +1603,14 @@ lock_rec_create( page_no = buf_frame_get_page_no(page); heap_no = rec_get_heap_no(rec); - /* If rec is the supremum record, then we reset the gap bit, as - all locks on the supremum are automatically of the gap type, and - we try to avoid unnecessary memory consumption of a new record lock - struct for a gap type lock */ + /* If rec is the supremum record, then we reset the gap and + LOCK_REC_NOT_GAP bits, as all locks on the supremum are + automatically of the gap type */ if (rec == page_get_supremum_rec(page)) { + ut_ad(!(type_mode & LOCK_REC_NOT_GAP)); - type_mode = type_mode & ~LOCK_GAP; + type_mode = type_mode & ~(LOCK_GAP | LOCK_REC_NOT_GAP); } /* Make lock bitmap bigger by a safety margin */ @@ -1666,10 +1662,14 @@ ulint lock_rec_enqueue_waiting( /*=====================*/ /* out: DB_LOCK_WAIT, DB_DEADLOCK, or - DB_QUE_THR_SUSPENDED */ + DB_QUE_THR_SUSPENDED, or DB_SUCCESS; + DB_SUCCESS means that there was a deadlock, + but another transaction was chosen as a + victim, and we got the lock immediately: + no need to wait then */ ulint type_mode,/* in: lock mode this transaction is - requesting: LOCK_S or LOCK_X, ORed with - LOCK_GAP if a gap lock is requested, ORed + requesting: LOCK_S or LOCK_X, possibly ORed + with LOCK_GAP or LOCK_REC_NOT_GAP, ORed with LOCK_INSERT_INTENTION if this waiting lock request is set when performing an insert of an index record */ @@ -1718,6 +1718,14 @@ index->table_name); return(DB_DEADLOCK); } + /*
If there was a deadlock but we chose another transaction as a + victim, it is possible that we already have the lock now granted! */ + + if (trx->wait_lock == NULL) { + + return(DB_SUCCESS); + } + trx->que_state = TRX_QUE_LOCK_WAIT; trx->wait_started = time(NULL); @@ -1744,8 +1752,8 @@ lock_rec_add_to_queue( /*==================*/ /* out: lock where the bit was set, NULL if out of memory */ - ulint type_mode,/* in: lock mode, wait, and gap flags; type - is ignored and replaced by LOCK_REC */ + ulint type_mode,/* in: lock mode, wait, gap etc. flags; + type is ignored and replaced by LOCK_REC */ rec_t* rec, /* in: record on page */ dict_index_t* index, /* in: index of record */ trx_t* trx) /* in: transaction */ @@ -1759,12 +1767,11 @@ lock_rec_add_to_queue( ut_ad(mutex_own(&kernel_mutex)); ut_ad((type_mode & (LOCK_WAIT | LOCK_GAP)) || ((type_mode & LOCK_MODE_MASK) != LOCK_S) - || !lock_rec_other_has_expl_req(LOCK_X, 0, LOCK_WAIT, - rec, trx)); + || !lock_rec_other_has_expl_req(LOCK_X, 0, LOCK_WAIT, rec, trx)); ut_ad((type_mode & (LOCK_WAIT | LOCK_GAP)) || ((type_mode & LOCK_MODE_MASK) != LOCK_X) - || !lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT, - rec, trx)); + || !lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT, rec, trx)); + type_mode = type_mode | LOCK_REC; page = buf_frame_align(rec); @@ -1775,12 +1782,15 @@ lock_rec_add_to_queue( struct for a gap type lock */ if (rec == page_get_supremum_rec(page)) { + ut_ad(!(type_mode & LOCK_REC_NOT_GAP)); - type_mode = type_mode & ~LOCK_GAP; + /* There should never be LOCK_REC_NOT_GAP on a supremum + record, but let us play safe */ + + type_mode = type_mode & ~(LOCK_GAP | LOCK_REC_NOT_GAP); } - /* Look for a waiting lock request on the same record, or for a - similar record lock on the same page */ + /* Look for a waiting lock request on the same record or on a gap */ heap_no = rec_get_heap_no(rec); lock = lock_rec_get_first_on_page(rec); @@ -1795,6 +1805,9 @@ lock_rec_add_to_queue( lock = 
lock_rec_get_next_on_page(lock); } + /* Look for a similar record lock on the same page: if one is found + and there are no waiting lock requests, we can just set the bit */ + similar_lock = lock_rec_find_similar_on_page(type_mode, rec, trx); if (similar_lock && !somebody_waits && !(type_mode & LOCK_WAIT)) { @@ -1822,7 +1835,8 @@ lock_rec_lock_fast( ibool impl, /* in: if TRUE, no lock is set if no wait is necessary: we assume that the caller will set an implicit lock */ - ulint mode, /* in: lock mode */ + ulint mode, /* in: lock mode: LOCK_X or LOCK_S possibly + ORed to either LOCK_GAP or LOCK_REC_NOT_GAP */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: index of record */ que_thr_t* thr) /* in: query thread */ @@ -1831,8 +1845,16 @@ lock_rec_lock_fast( ulint heap_no; ut_ad(mutex_own(&kernel_mutex)); - ut_ad((mode == LOCK_X) || (mode == LOCK_S)); - + ut_ad((LOCK_MODE_MASK & mode) != LOCK_S + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); + ut_ad((LOCK_MODE_MASK & mode) != LOCK_X + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); + ut_ad((LOCK_MODE_MASK & mode) == LOCK_S + || (LOCK_MODE_MASK & mode) == LOCK_X); + ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP + || mode - (LOCK_MODE_MASK & mode) == 0 + || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP); + heap_no = rec_get_heap_no(rec); lock = lock_rec_get_first_on_page(rec); @@ -1877,7 +1899,8 @@ lock_rec_lock_slow( ibool impl, /* in: if TRUE, no lock is set if no wait is necessary: we assume that the caller will set an implicit lock */ - ulint mode, /* in: lock mode */ + ulint mode, /* in: lock mode: LOCK_X or LOCK_S possibly + ORed to either LOCK_GAP or LOCK_REC_NOT_GAP */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: index of record */ que_thr_t* thr) /* in: query thread */ @@ -1886,20 +1909,24 @@ lock_rec_lock_slow( ulint err; ut_ad(mutex_own(&kernel_mutex)); - ut_ad((mode == LOCK_X) || (mode == LOCK_S)); - + ut_ad((LOCK_MODE_MASK & mode) != LOCK_S + || 
lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); + ut_ad((LOCK_MODE_MASK & mode) != LOCK_X + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); + ut_ad((LOCK_MODE_MASK & mode) == LOCK_S + || (LOCK_MODE_MASK & mode) == LOCK_X); + ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP + || mode - (LOCK_MODE_MASK & mode) == 0 + || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP); + trx = thr_get_trx(thr); - - ut_ad((mode != LOCK_S) || lock_table_has(trx, index->table, - LOCK_IS)); - ut_ad((mode != LOCK_X) || lock_table_has(trx, index->table, - LOCK_IX)); + if (lock_rec_has_expl(mode, rec, trx)) { /* The trx already has a strong enough lock on rec: do nothing */ err = DB_SUCCESS; - } else if (lock_rec_other_has_conflicting(mode, 0, 0, rec, trx)) { + } else if (lock_rec_other_has_conflicting(mode, rec, trx)) { /* If another transaction has a non-gap conflicting request in the queue, as this transaction does not have a lock strong @@ -1935,7 +1962,8 @@ lock_rec_lock( ibool impl, /* in: if TRUE, no lock is set if no wait is necessary: we assume that the caller will set an implicit lock */ - ulint mode, /* in: lock mode */ + ulint mode, /* in: lock mode: LOCK_X or LOCK_S possibly + ORed to either LOCK_GAP or LOCK_REC_NOT_GAP */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: index of record */ que_thr_t* thr) /* in: query thread */ @@ -1943,11 +1971,16 @@ lock_rec_lock( ulint err; ut_ad(mutex_own(&kernel_mutex)); - ut_ad((mode != LOCK_S) || lock_table_has(thr_get_trx(thr), - index->table, LOCK_IS)); - ut_ad((mode != LOCK_X) || lock_table_has(thr_get_trx(thr), - index->table, LOCK_IX)); - + ut_ad((LOCK_MODE_MASK & mode) != LOCK_S + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); + ut_ad((LOCK_MODE_MASK & mode) != LOCK_X + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); + ut_ad((LOCK_MODE_MASK & mode) == LOCK_S + || (LOCK_MODE_MASK & mode) == LOCK_X); + ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP + || mode - 
(LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP + || mode - (LOCK_MODE_MASK & mode) == 0); + if (lock_rec_lock_fast(impl, mode, rec, index, thr)) { /* We try a simplified and faster subroutine for the most @@ -2011,26 +2044,33 @@ lock_grant( ut_ad(mutex_own(&kernel_mutex)); lock_reset_lock_and_trx_wait(lock); - - if (lock_get_mode(lock) == LOCK_AUTO_INC) { - if (lock->trx->auto_inc_lock != NULL) { - fprintf(stderr, - "InnoDB: Error: trx already had an AUTO-INC lock!\n"); - } + if (lock_get_mode(lock) == LOCK_AUTO_INC) { - /* Store pointer to lock to trx so that we know to - release it at the end of the SQL statement */ + if (lock->trx->auto_inc_lock != NULL) { + fprintf(stderr, + "InnoDB: Error: trx already had an AUTO-INC lock!\n"); + } - lock->trx->auto_inc_lock = lock; - } + /* Store pointer to lock to trx so that we know to + release it at the end of the SQL statement */ + + lock->trx->auto_inc_lock = lock; + } if (lock_print_waits) { printf("Lock wait for trx %lu ends\n", ut_dulint_get_low(lock->trx->id)); } + + /* If we are resolving a deadlock by choosing another transaction + as a victim, then our original transaction may not be in the + TRX_QUE_LOCK_WAIT state, and there is no need to end the lock wait + for it */ - trx_end_lock_wait(lock->trx); + if (lock->trx->que_state == TRX_QUE_LOCK_WAIT) { + trx_end_lock_wait(lock->trx); + } } /***************************************************************** @@ -2080,7 +2120,7 @@ lock_rec_dequeue_from_page( ut_ad(lock_get_type(in_lock) == LOCK_REC); trx = in_lock->trx; - + space = in_lock->un_member.rec_lock.space; page_no = in_lock->un_member.rec_lock.page_no; @@ -2199,9 +2239,10 @@ lock_rec_reset_and_release_wait( } /***************************************************************** -Makes a record to inherit the locks of another record as gap type locks, but -does not reset the lock bits of the other record. Also waiting lock requests -on rec are inherited as GRANTED gap locks. 
*/ +Makes a record to inherit the locks (except LOCK_INSERT_INTENTION type) +of another record as gap type locks, but does not reset the lock bits of +the other record. Also waiting lock requests on rec are inherited as +GRANTED gap locks. */ void lock_rec_inherit_to_gap( @@ -2217,9 +2258,45 @@ lock_rec_inherit_to_gap( lock = lock_rec_get_first(rec); while (lock != NULL) { - lock_rec_add_to_queue(((lock->type_mode | LOCK_GAP) - & ~LOCK_WAIT), + if (!lock_rec_get_insert_intention(lock)) { + + lock_rec_add_to_queue(LOCK_REC | lock_get_mode(lock) + | LOCK_GAP, heir, lock->index, lock->trx); + } + + lock = lock_rec_get_next(rec, lock); + } +} + +/***************************************************************** +Makes a record to inherit the gap locks (except LOCK_INSERT_INTENTION type) +of another record as gap type locks, but does not reset the lock bits of the +other record. Also waiting lock requests are inherited as GRANTED gap locks. */ + +void +lock_rec_inherit_to_gap_if_gap_lock( +/*================================*/ + rec_t* heir, /* in: record which inherits */ + rec_t* rec) /* in: record from which inherited; does NOT reset + the locks on this record */ +{ + lock_t* lock; + + ut_ad(mutex_own(&kernel_mutex)); + + lock = lock_rec_get_first(rec); + + while (lock != NULL) { + if (!lock_rec_get_insert_intention(lock) + && (page_rec_is_supremum(rec) + || !lock_rec_get_rec_not_gap(lock))) { + + lock_rec_add_to_queue(LOCK_REC | lock_get_mode(lock) + | LOCK_GAP, + heir, lock->index, lock->trx); + } + lock = lock_rec_get_next(rec, lock); } } @@ -2778,9 +2855,10 @@ lock_update_insert( { lock_mutex_enter_kernel(); - /* Inherit the locks for rec, in gap mode, from the next record */ + /* Inherit the gap-locking locks for rec, in gap mode, from the next + record */ - lock_rec_inherit_to_gap(rec, page_rec_get_next(rec)); + lock_rec_inherit_to_gap_if_gap_lock(rec, page_rec_get_next(rec)); lock_mutex_exit_kernel(); } @@ -2859,20 +2937,23 @@ static ibool lock_deadlock_occurs( 
/*=================*/ - /* out: TRUE if a deadlock was detected */ + /* out: TRUE if a deadlock was detected and we + chose trx as a victim; FALSE if no deadlock, or + there was a deadlock, but we chose other + transaction(s) as victim(s) */ lock_t* lock, /* in: lock the transaction is requesting */ trx_t* trx) /* in: transaction */ { dict_table_t* table; dict_index_t* index; trx_t* mark_trx; - ibool ret; + ulint ret; ulint cost = 0; char* err_buf; ut_ad(trx && lock); ut_ad(mutex_own(&kernel_mutex)); - +retry: /* We check that adding this trx to the waits-for graph does not produce a cycle. First mark all active transactions with 0: */ @@ -2886,7 +2967,14 @@ lock_deadlock_occurs( ret = lock_deadlock_recursive(trx, trx, lock, &cost); - if (ret) { + if (ret == LOCK_VICTIM_IS_OTHER) { + /* We chose some other trx as a victim: retry if there still + is a deadlock */ + + goto retry; + } + + if (ret == LOCK_VICTIM_IS_START) { if (lock_get_type(lock) == LOCK_TABLE) { table = lock->un_member.tab_lock.table; index = NULL; @@ -2898,19 +2986,6 @@ lock_deadlock_occurs( lock_deadlock_found = TRUE; err_buf = lock_latest_err_buf + strlen(lock_latest_err_buf); - - err_buf += sprintf(err_buf, - "*** (2) WAITING FOR THIS LOCK TO BE GRANTED:\n"); - - ut_a(err_buf <= lock_latest_err_buf + 4000); - - if (lock_get_type(lock) == LOCK_REC) { - lock_rec_print(err_buf, lock); - err_buf += strlen(err_buf); - } else { - lock_table_print(err_buf, lock); - err_buf += strlen(err_buf); - } ut_a(err_buf <= lock_latest_err_buf + 4000); @@ -2923,30 +2998,39 @@ lock_deadlock_occurs( sess_raise_error_low(trx, DB_DEADLOCK, lock->type_mode, table, index, NULL, NULL, NULL); */ + + return(TRUE); } - return(ret); + return(FALSE); } /************************************************************************ Looks recursively for a deadlock. 
*/ static -ibool +ulint lock_deadlock_recursive( /*====================*/ - /* out: TRUE if a deadlock was detected - or the calculation took too long */ + /* out: 0 if no deadlock found, + LOCK_VICTIM_IS_START if there was a deadlock + and we chose 'start' as the victim, + LOCK_VICTIM_IS_OTHER if a deadlock + was found and we chose some other trx as a + victim: we must do the search again in this + last case because there may be another + deadlock! */ trx_t* start, /* in: recursion starting point */ trx_t* trx, /* in: a transaction waiting for a lock */ lock_t* wait_lock, /* in: the lock trx is waiting to be granted */ ulint* cost) /* in/out: number of calculation steps thus far: if this exceeds LOCK_MAX_N_STEPS_... - we return TRUE */ + we return LOCK_VICTIM_IS_START */ { lock_t* lock; ulint bit_no; trx_t* lock_trx; char* err_buf; + ulint ret; ut_a(trx && start && wait_lock); ut_ad(mutex_own(&kernel_mutex)); @@ -2955,14 +3039,14 @@ lock_deadlock_recursive( /* We have already exhaustively searched the subtree starting from this trx */ - return(FALSE); + return(0); } *cost = *cost + 1; if (*cost > LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK) { - return(TRUE); + return(LOCK_VICTIM_IS_START); } lock = wait_lock; @@ -2998,6 +3082,9 @@ lock_deadlock_recursive( lock_trx = lock->trx; if (lock_trx == start) { + /* We came back to the recursion starting + point: a deadlock detected */ + err_buf = lock_latest_err_buf; ut_sprintf_timestamp(err_buf); @@ -3045,11 +3132,59 @@ lock_deadlock_recursive( ut_a(err_buf <= lock_latest_err_buf + 4000); + err_buf += sprintf(err_buf, + "*** (2) WAITING FOR THIS LOCK TO BE GRANTED:\n"); + + ut_a(err_buf <= lock_latest_err_buf + 4000); + + if (lock_get_type(start->wait_lock) + == LOCK_REC) { + lock_rec_print(err_buf, + start->wait_lock); + err_buf += strlen(err_buf); + } else { + lock_table_print(err_buf, + start->wait_lock); + err_buf += strlen(err_buf); + } + if (lock_print_waits) { printf("Deadlock detected\n"); } - return(TRUE); + if 
(ut_dulint_cmp(wait_lock->trx->undo_no, + start->undo_no) >= 0) { + /* Our recursion starting point + transaction is 'smaller', let us + choose 'start' as the victim and roll + it back */ + + return(LOCK_VICTIM_IS_START); + } + + lock_deadlock_found = TRUE; + + ut_a(err_buf <= lock_latest_err_buf + 4000); + + /* Let us choose the transaction of wait_lock + as a victim to try to avoid deadlocking our + recursion starting point transaction */ + + err_buf += sprintf(err_buf, + "*** WE ROLL BACK TRANSACTION (1)\n"); + + wait_lock->trx->error_state = DB_DEADLOCK; + + lock_cancel_waiting_and_release(wait_lock); + + /* Since trx and wait_lock are no longer + in the waits-for graph, we return LOCK_VICTIM_IS_OTHER; + note that our selective algorithm can choose + several transactions as victims, but still + we may end up rolling back also the recursion + starting point transaction! */ + + return(LOCK_VICTIM_IS_OTHER); } if (lock_trx->que_state == TRX_QUE_LOCK_WAIT) { @@ -3058,10 +3193,11 @@ lock_deadlock_recursive( incompatible mode, and is itself waiting for a lock */ - if (lock_deadlock_recursive(start, lock_trx, - lock_trx->wait_lock, cost)) { + ret = lock_deadlock_recursive(start, lock_trx, + lock_trx->wait_lock, cost); + if (ret != 0) { - return(TRUE); + return(ret); } } } @@ -3153,12 +3289,16 @@ lock_table_remove_low( /************************************************************************* Enqueues a waiting request for a table lock which cannot be granted immediately. Checks for deadlocks. 
*/ - +static ulint lock_table_enqueue_waiting( /*=======================*/ /* out: DB_LOCK_WAIT, DB_DEADLOCK, or - DB_QUE_THR_SUSPENDED */ + DB_QUE_THR_SUSPENDED, or DB_SUCCESS; + DB_SUCCESS means that there was a deadlock, + but another transaction was chosen as a + victim, and we got the lock immediately: + no need to wait then */ ulint mode, /* in: lock mode this transaction is requesting */ dict_table_t* table, /* in: table */ @@ -3205,6 +3345,13 @@ table->name); return(DB_DEADLOCK); } + if (trx->wait_lock == NULL) { + /* Deadlock resolution chose another transaction as a victim, + and we accidentally got our lock granted! */ + + return(DB_SUCCESS); + } + trx->que_state = TRX_QUE_LOCK_WAIT; trx->wait_started = time(NULL); @@ -3292,7 +3439,7 @@ lock_table( if (lock_table_other_has_incompatible(trx, LOCK_WAIT, table, mode)) { /* Another trx has a request on the table in an incompatible - mode: this trx must wait */ + mode: this trx may have to wait */ err = lock_table_enqueue_waiting(mode, table, thr); @@ -3659,7 +3806,11 @@ lock_rec_print( } if (lock_rec_get_gap(lock)) { - buf += sprintf(buf, " gap type lock"); + buf += sprintf(buf, " locks gap before rec"); + } + + if (lock_rec_get_rec_not_gap(lock)) { + buf += sprintf(buf, " locks rec but not gap"); } if (lock_rec_get_insert_intention(lock)) { @@ -3776,8 +3927,8 @@ lock_print_info( mtr_t mtr; if (buf_end - buf < 600) { - sprintf(buf, "... output truncated!\n"); - + sprintf(buf, "... output truncated!\n"); + return; } @@ -3802,8 +3953,8 @@ lock_print_info( if ((ulint)(buf_end - buf) < 100 + strlen(lock_latest_err_buf)) { - lock_mutex_exit_kernel(); - sprintf(buf, "... output truncated!\n"); + lock_mutex_exit_kernel(); + sprintf(buf, "... output truncated!\n"); return; } @@ -3826,8 +3977,8 @@ lock_print_info( while (trx) { if (buf_end - buf < 900) { - lock_mutex_exit_kernel(); - sprintf(buf, "... output truncated!\n"); + lock_mutex_exit_kernel(); + sprintf(buf, "... 
output truncated!\n"); return; } @@ -3879,8 +4030,8 @@ loop: buf += strlen(buf); if (buf_end - buf < 500) { - lock_mutex_exit_kernel(); - sprintf(buf, "... output truncated!\n"); + lock_mutex_exit_kernel(); + sprintf(buf, "... output truncated!\n"); return; } @@ -3936,7 +4087,7 @@ loop: } if (buf_end - buf < 500) { - lock_mutex_exit_kernel(); + lock_mutex_exit_kernel(); sprintf(buf, "... output truncated!\n"); return; @@ -4080,7 +4231,8 @@ lock_rec_queue_validate( if (impl_trx && lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT, rec, impl_trx)) { - ut_a(lock_rec_has_expl(LOCK_X, rec, impl_trx)); + ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec, + impl_trx)); } } @@ -4095,7 +4247,8 @@ lock_rec_queue_validate( if (impl_trx && lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT, rec, impl_trx)) { - ut_a(lock_rec_has_expl(LOCK_X, rec, impl_trx)); + ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec, + impl_trx)); } } @@ -4359,8 +4512,8 @@ lock_rec_insert_check_and_lock( *inherit = TRUE; - /* If another transaction has an explicit lock request, gap or not, - waiting or granted, on the successor, the insert has to wait. + /* If another transaction has an explicit lock request which locks + the gap, waiting or granted, on the successor, the insert has to wait. An exception is the case where the lock by the another transaction is a gap type lock which it placed to wait for its turn to insert. We @@ -4369,8 +4522,10 @@ lock_rec_insert_check_and_lock( had to wait for their insert. Both had waiting gap type lock requests on the successor, which produced an unnecessary deadlock. */ - if (lock_rec_other_has_conflicting(LOCK_X, LOCK_GAP, - LOCK_INSERT_INTENTION, next_rec, trx)) { + if (lock_rec_other_has_conflicting(LOCK_X | LOCK_GAP + | LOCK_INSERT_INTENTION, next_rec, trx)) { + + /* Note that we may get DB_SUCCESS also here! 
*/ err = lock_rec_enqueue_waiting(LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION, next_rec, index, thr); @@ -4418,9 +4573,11 @@ lock_rec_convert_impl_to_expl( /* If the transaction has no explicit x-lock set on the record, set one for it */ - if (!lock_rec_has_expl(LOCK_X, rec, impl_trx)) { + if (!lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec, + impl_trx)) { - lock_rec_add_to_queue(LOCK_REC | LOCK_X, rec, index, + lock_rec_add_to_queue(LOCK_REC | LOCK_X + | LOCK_REC_NOT_GAP, rec, index, impl_trx); } } @@ -4466,7 +4623,7 @@ lock_clust_rec_modify_check_and_lock( lock_rec_convert_impl_to_expl(rec, index); - err = lock_rec_lock(TRUE, LOCK_X, rec, index, thr); + err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP, rec, index, thr); lock_mutex_exit_kernel(); @@ -4511,7 +4668,7 @@ lock_sec_rec_modify_check_and_lock( ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); - err = lock_rec_lock(TRUE, LOCK_X, rec, index, thr); + err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP, rec, index, thr); lock_mutex_exit_kernel(); @@ -4545,6 +4702,8 @@ lock_sec_rec_read_check_and_lock( ulint mode, /* in: mode of the lock which the read cursor should set on records: LOCK_S or LOCK_X; the latter is possible in SELECT FOR UPDATE */ + ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ que_thr_t* thr) /* in: query thread */ { ulint err; @@ -4576,7 +4735,7 @@ lock_sec_rec_read_check_and_lock( lock_rec_convert_impl_to_expl(rec, index); } - err = lock_rec_lock(FALSE, mode, rec, index, thr); + err = lock_rec_lock(FALSE, mode | gap_mode, rec, index, thr); lock_mutex_exit_kernel(); @@ -4607,13 +4766,16 @@ lock_clust_rec_read_check_and_lock( ulint mode, /* in: mode of the lock which the read cursor should set on records: LOCK_S or LOCK_X; the latter is possible in SELECT FOR UPDATE */ + ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ que_thr_t* thr) /* in: query thread */ { ulint err; ut_ad(index->type & DICT_CLUSTERED); 
ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec)); - + ut_ad(gap_mode == LOCK_ORDINARY || gap_mode == LOCK_GAP + || gap_mode == LOCK_REC_NOT_GAP); if (flags & BTR_NO_LOCKING_FLAG) { return(DB_SUCCESS); @@ -4631,7 +4793,7 @@ lock_clust_rec_read_check_and_lock( lock_rec_convert_impl_to_expl(rec, index); } - err = lock_rec_lock(FALSE, mode, rec, index, thr); + err = lock_rec_lock(FALSE, mode | gap_mode, rec, index, thr); lock_mutex_exit_kernel(); diff --git a/innobase/mem/mem0dbg.c b/innobase/mem/mem0dbg.c index 23585e494b8..22d0bab0da2 100644 --- a/innobase/mem/mem0dbg.c +++ b/innobase/mem/mem0dbg.c @@ -347,9 +347,19 @@ mem_hash_remove( NULL, NULL); if (error) { printf("Inconsistency in memory heap or buffer n:o %lu created\n", - node->nth_heap); + node->nth_heap); printf("in %s line %lu and tried to free in %s line %lu.\n", node->file_name, node->line, file_name, line); + + printf( + "Hex dump of 400 bytes around memory heap first block start:\n"); + + ut_print_buf((byte*)(node->heap) - 200, 400); + + printf("\nDump of the mem heap:\n"); + + mem_heap_validate_or_print(node->heap, NULL, TRUE, &error, &size, + NULL, NULL); ut_error; } diff --git a/innobase/os/os0file.c b/innobase/os/os0file.c index 098d5b25e89..9eae358c7fb 100644 --- a/innobase/os/os0file.c +++ b/innobase/os/os0file.c @@ -148,7 +148,7 @@ Gets the operating system version. Currently works only on Windows. 
*/ ulint os_get_os_version(void) /*===================*/ - /* out: OS_WIN95, OS_WIN31, OS_WINNT (2000 == NT) */ + /* out: OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000 */ { #ifdef __WIN__ OSVERSIONINFO os_info; @@ -162,7 +162,11 @@ os_get_os_version(void) } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) { return(OS_WIN95); } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) { - return(OS_WINNT); + if (os_info.dwMajorVersion <= 4) { + return(OS_WINNT); + } else { + return(OS_WIN2000); + } } else { ut_error; return(0); @@ -268,9 +272,7 @@ os_file_get_last_error(void) } /******************************************************************** -Does error handling when a file operation fails. If we have run out -of disk space, then the user can clean the disk. If we do not find -a specified file, then the user can copy it to disk. */ +Does error handling when a file operation fails. */ static ibool os_file_handle_error( @@ -503,7 +505,11 @@ try_again: value 2 denotes that we do not flush the log at every commit, but only once per second */ } else { - attributes = attributes | FILE_FLAG_NO_BUFFERING; + if (srv_win_file_flush_method == + SRV_WIN_IO_UNBUFFERED) { + attributes = attributes + | FILE_FLAG_NO_BUFFERING; + } } #endif } else if (purpose == OS_FILE_NORMAL) { @@ -514,7 +520,11 @@ try_again: value 2 denotes that we do not flush the log at every commit, but only once per second */ } else { - attributes = attributes | FILE_FLAG_NO_BUFFERING; + if (srv_win_file_flush_method == + SRV_WIN_IO_UNBUFFERED) { + attributes = attributes + | FILE_FLAG_NO_BUFFERING; + } } #endif } else { @@ -1752,6 +1762,7 @@ os_aio( os_aio_array_t* array; os_aio_slot_t* slot; #ifdef WIN_ASYNC_IO + ibool retval; BOOL ret = TRUE; DWORD len = n; void* dummy_mess1; @@ -1824,6 +1835,8 @@ try_again: if (os_aio_use_native_aio) { #ifdef WIN_ASYNC_IO os_n_file_reads++; + os_bytes_read_since_printout += len; + ret = ReadFile(file, buf, (DWORD)n, &len, &(slot->control)); #elif 
defined(POSIX_ASYNC_IO) @@ -1870,10 +1883,12 @@ try_again: where we also use async i/o: in Windows we must use the same wait mechanism as for async i/o */ - return(os_aio_windows_handle(ULINT_UNDEFINED, + retval = os_aio_windows_handle(ULINT_UNDEFINED, slot->pos, &dummy_mess1, &dummy_mess2, - &dummy_type)); + &dummy_type); + + return(retval); } return(TRUE); @@ -1897,8 +1912,6 @@ try_again: goto try_again; } - ut_error; - return(FALSE); } @@ -1958,14 +1971,14 @@ os_aio_windows_handle( n = array->n_slots / array->n_segments; if (array == os_aio_sync_array) { - srv_io_thread_op_info[orig_seg] = "wait windows aio for 1 page"; + srv_io_thread_op_info[orig_seg] = "wait Windows aio for 1 page"; ut_ad(pos < array->n_slots); os_event_wait(array->events[pos]); i = pos; } else { srv_io_thread_op_info[orig_seg] = - "wait windows aio for n pages"; + "wait Windows aio"; i = os_event_wait_multiple(n, (array->events) + segment * n); } @@ -1991,10 +2004,8 @@ os_aio_windows_handle( ut_a(TRUE == os_file_flush(slot->file)); } } else { - os_file_get_last_error(); - - ut_error; - + os_file_handle_error(slot->file, slot->name); + ret_val = FALSE; } diff --git a/innobase/os/os0proc.c b/innobase/os/os0proc.c index 43a2db4d306..1ee448a4a44 100644 --- a/innobase/os/os0proc.c +++ b/innobase/os/os0proc.c @@ -19,6 +19,23 @@ Created 9/30/1995 Heikki Tuuri #include "ut0mem.h" /******************************************************************** +Converts the current process id to a number. It is not guaranteed that the +number is unique. In Linux returns the 'process number' of the current +thread. That number is the same as one sees in 'top', for example. In Linux +the thread id is not the same as one sees in 'top'. */ + +ulint +os_proc_get_number(void) +/*====================*/ +{ +#ifdef __WIN__ + return((ulint)GetCurrentProcessId()); +#else + return((ulint)getpid()); +#endif +} + +/******************************************************************** Allocates non-cacheable memory. 
*/ void* diff --git a/innobase/page/page0cur.c b/innobase/page/page0cur.c index 2909573b14b..bb49e9080ce 100644 --- a/innobase/page/page0cur.c +++ b/innobase/page/page0cur.c @@ -169,7 +169,7 @@ page_cur_search_with_match( ut_ad(dtuple_check_typed(tuple)); ut_ad((mode == PAGE_CUR_L) || (mode == PAGE_CUR_LE) || (mode == PAGE_CUR_G) || (mode == PAGE_CUR_GE) - || (mode == PAGE_CUR_DBG)); + || (mode == PAGE_CUR_LE_OR_EXTENDS) || (mode == PAGE_CUR_DBG)); #ifdef PAGE_CUR_ADAPT if ((page_header_get_field(page, PAGE_LEVEL) == 0) @@ -232,9 +232,26 @@ page_cur_search_with_match( low_matched_bytes = cur_matched_bytes; } else if (cmp == -1) { - up = mid; - up_matched_fields = cur_matched_fields; - up_matched_bytes = cur_matched_bytes; + + if (mode == PAGE_CUR_LE_OR_EXTENDS + && dfield_get_len(dtuple_get_nth_field(tuple, + cur_matched_fields)) + == cur_matched_bytes + && rec_get_nth_field_len(mid_rec, + cur_matched_fields) + != UNIV_SQL_NULL) { + + /* This means current dfield is not SQL + NULL, and the current rec field extends it */ + + low = mid; + low_matched_fields = cur_matched_fields; + low_matched_bytes = cur_matched_bytes; + } else { + up = mid; + up_matched_fields = cur_matched_fields; + up_matched_bytes = cur_matched_bytes; + } } else if ((mode == PAGE_CUR_G) || (mode == PAGE_CUR_LE)) { low = mid; @@ -252,8 +269,8 @@ page_cur_search_with_match( slot = page_dir_get_nth_slot(page, up); up_rec = page_dir_slot_get_rec(slot); - /* Perform linear search until the upper and lower records - come to distance 1 of each other. */ + /* Perform linear search until the upper and lower records come to + distance 1 of each other. 
*/ while (page_rec_get_next(low_rec) != up_rec) { @@ -272,10 +289,25 @@ page_cur_search_with_match( low_matched_bytes = cur_matched_bytes; } else if (cmp == -1) { - up_rec = mid_rec; - up_matched_fields = cur_matched_fields; - up_matched_bytes = cur_matched_bytes; - + if (mode == PAGE_CUR_LE_OR_EXTENDS + && dfield_get_len(dtuple_get_nth_field(tuple, + cur_matched_fields)) + == cur_matched_bytes + && rec_get_nth_field_len(mid_rec, + cur_matched_fields) + != UNIV_SQL_NULL) { + + /* This means current dfield is not SQL + NULL, and the current rec field extends it */ + + low = mid; + low_matched_fields = cur_matched_fields; + low_matched_bytes = cur_matched_bytes; + } else { + up_rec = mid_rec; + up_matched_fields = cur_matched_fields; + up_matched_bytes = cur_matched_bytes; + } } else if ((mode == PAGE_CUR_G) || (mode == PAGE_CUR_LE)) { low_rec = mid_rec; low_matched_fields = cur_matched_fields; diff --git a/innobase/page/page0page.c b/innobase/page/page0page.c index ed74736c8da..7d0d88c6afc 100644 --- a/innobase/page/page0page.c +++ b/innobase/page/page0page.c @@ -1313,6 +1313,194 @@ page_rec_validate( } /******************************************************************* +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. */ + +ibool +page_simple_validate( +/*=================*/ + /* out: TRUE if ok */ + page_t* page) /* in: index page */ +{ + page_cur_t cur; + page_dir_slot_t* slot; + ulint slot_no; + ulint n_slots; + rec_t* rec; + byte* rec_heap_top; + ulint count; + ulint own_count; + ibool ret = FALSE; + + /* Check first that the record heap and the directory do not + overlap. 
*/ + + n_slots = page_dir_get_n_slots(page); + + if (n_slots > UNIV_PAGE_SIZE / 4) { + fprintf(stderr, + "Nonsensical number %lu of page dir slots\n", n_slots); + + goto func_exit; + } + + rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP); + + if (rec_heap_top > page_dir_get_nth_slot(page, n_slots - 1)) { + + fprintf(stderr, + "Record heap and dir overlap on a page, heap top %lu, dir %lu\n", + (ulint)(page_header_get_ptr(page, PAGE_HEAP_TOP) - page), + (ulint)(page_dir_get_nth_slot(page, n_slots - 1) - page)); + + goto func_exit; + } + + /* Validate the record list in a loop checking also that it is + consistent with the page record directory. */ + + count = 0; + own_count = 1; + slot_no = 0; + slot = page_dir_get_nth_slot(page, slot_no); + + page_cur_set_before_first(page, &cur); + + for (;;) { + rec = (&cur)->rec; + + if (rec > rec_heap_top) { + fprintf(stderr, + "Record %lu is above rec heap top %lu\n", + (ulint)(rec - page), (ulint)(rec_heap_top - page)); + + goto func_exit; + } + + if (rec_get_n_owned(rec) != 0) { + /* This is a record pointed to by a dir slot */ + if (rec_get_n_owned(rec) != own_count) { + + fprintf(stderr, + "Wrong owned count %lu, %lu, rec %lu\n", + rec_get_n_owned(rec), own_count, + (ulint)(rec - page)); + + goto func_exit; + } + + if (page_dir_slot_get_rec(slot) != rec) { + fprintf(stderr, + "Dir slot does not point to right rec %lu\n", + (ulint)(rec - page)); + + goto func_exit; + } + + own_count = 0; + + if (!page_cur_is_after_last(&cur)) { + slot_no++; + slot = page_dir_get_nth_slot(page, slot_no); + } + } + + if (page_cur_is_after_last(&cur)) { + + break; + } + + if (rec_get_next_offs(rec) < FIL_PAGE_DATA + || rec_get_next_offs(rec) >= UNIV_PAGE_SIZE) { + fprintf(stderr, + "Next record offset nonsensical %lu for rec %lu\n", + rec_get_next_offs(rec), + (ulint)(rec - page)); + + goto func_exit; + } + + count++; + + if (count > UNIV_PAGE_SIZE) { + fprintf(stderr, + "Page record list appears to be circular %lu\n", + count); + goto 
func_exit; + } + + page_cur_move_to_next(&cur); + own_count++; + } + + if (rec_get_n_owned(rec) == 0) { + fprintf(stderr, "n owned is zero in a supremum rec\n"); + + goto func_exit; + } + + if (slot_no != n_slots - 1) { + fprintf(stderr, "n slots wrong %lu, %lu\n", + slot_no, n_slots - 1); + goto func_exit; + } + + if (page_header_get_field(page, PAGE_N_RECS) + 2 != count + 1) { + fprintf(stderr, "n recs wrong %lu %lu\n", + page_header_get_field(page, PAGE_N_RECS) + 2, count + 1); + + goto func_exit; + } + + /* Check then the free list */ + rec = page_header_get_ptr(page, PAGE_FREE); + + while (rec != NULL) { + if (rec < page + FIL_PAGE_DATA + || rec >= page + UNIV_PAGE_SIZE) { + fprintf(stderr, + "Free list record has a nonsensical offset %lu\n", + (ulint)(rec - page)); + + goto func_exit; + } + + if (rec > rec_heap_top) { + fprintf(stderr, + "Free list record %lu is above rec heap top %lu\n", + (ulint)(rec - page), (ulint)(rec_heap_top - page)); + + goto func_exit; + } + + count++; + + if (count > UNIV_PAGE_SIZE) { + fprintf(stderr, + "Page free list appears to be circular %lu\n", + count); + goto func_exit; + } + + rec = page_rec_get_next(rec); + } + + if (page_header_get_field(page, PAGE_N_HEAP) != count + 1) { + + fprintf(stderr, "N heap is wrong %lu, %lu\n", + page_header_get_field(page, PAGE_N_HEAP), count + 1); + + goto func_exit; + } + + ret = TRUE; + +func_exit: + return(ret); +} + +/******************************************************************* This function checks the consistency of an index page. 
*/ ibool @@ -1339,6 +1527,14 @@ page_validate( ulint i; char err_buf[1000]; + if (!page_simple_validate(page)) { + buf_page_print(page); + + fprintf(stderr, "Apparent corruption in a page in index %s\n", + index->name); + return(FALSE); + } + heap = mem_heap_create(UNIV_PAGE_SIZE); /* The following buffer is used to check that the diff --git a/innobase/pars/lexyy.c b/innobase/pars/lexyy.c index 782fca35f66..f7edc9d195f 100644 --- a/innobase/pars/lexyy.c +++ b/innobase/pars/lexyy.c @@ -4,8 +4,6 @@ * $Header: /home/daffy/u0/vern/flex/RCS/flex.skl,v 2.91 96/09/10 16:58:48 vern Exp $ */ -#include "univ.i" - #define FLEX_SCANNER #define YY_FLEX_MAJOR_VERSION 2 #define YY_FLEX_MINOR_VERSION 5 @@ -609,18 +607,13 @@ How to make the InnoDB parser and lexer C files: 6. Remove the #include of unistd.h from about line 2500 of lexyy.c -7. Move #include <math.h> in pars0grm.c after #include "univ.i" to remove - a large file compilation error on AIX. - -8. Move #include "univ.i" in lexyy.c to the file start to remove a large - file compilation error on AIX. - These instructions seem to work at least with bison-1.28 and flex-2.5.4 on Linux. *******************************************************/ #line 36 "pars0lex.l" #define YYSTYPE que_node_t* +#include "univ.i" #include "pars0pars.h" #include "pars0grm.h" #include "pars0sym.h" diff --git a/innobase/pars/pars0grm.c b/innobase/pars/pars0grm.c index ce575063610..05b75398084 100644 --- a/innobase/pars/pars0grm.c +++ b/innobase/pars/pars0grm.c @@ -102,8 +102,6 @@ que_node_t */ #include "que0que.h" #include "row0sel.h" -#include <math.h> - #define YYSTYPE que_node_t* /* #define __STDC__ */ diff --git a/innobase/read/read0read.c b/innobase/read/read0read.c index a5048c0c909..5c1d2d5418e 100644 --- a/innobase/read/read0read.c +++ b/innobase/read/read0read.c @@ -201,6 +201,28 @@ read_view_close( } /************************************************************************* +Closes a consistent read view for MySQL. 
This function is called at an SQL +statement end if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */ + +void +read_view_close_for_mysql( +/*======================*/ + trx_t* trx) /* in: trx which has a read view */ +{ + ut_a(trx->read_view); + + mutex_enter(&kernel_mutex); + + read_view_close(trx->read_view); + + mem_heap_empty(trx->read_view_heap); + + trx->read_view = NULL; + + mutex_exit(&kernel_mutex); +} + +/************************************************************************* Prints a read view to stderr. */ void diff --git a/innobase/row/row0ins.c b/innobase/row/row0ins.c index 941c9d5759d..4e8b487a0f1 100644 --- a/innobase/row/row0ins.c +++ b/innobase/row/row0ins.c @@ -321,59 +321,6 @@ row_ins_clust_index_entry_by_modify( return(err); } -/******************************************************************* -Checks if a unique key violation to rec would occur at the index entry -insert. */ -static -ibool -row_ins_dupl_error_with_rec( -/*========================*/ - /* out: TRUE if error */ - rec_t* rec, /* in: user record; NOTE that we assume - that the caller already has a record lock on - the record! 
*/ - dtuple_t* entry, /* in: entry to insert */ - dict_index_t* index) /* in: index */ -{ - ulint matched_fields; - ulint matched_bytes; - ulint n_unique; - ulint i; - - n_unique = dict_index_get_n_unique(index); - - matched_fields = 0; - matched_bytes = 0; - - cmp_dtuple_rec_with_match(entry, rec, &matched_fields, &matched_bytes); - - if (matched_fields < n_unique) { - - return(FALSE); - } - - /* In a unique secondary index we allow equal key values if they - contain SQL NULLs */ - - if (!(index->type & DICT_CLUSTERED)) { - - for (i = 0; i < n_unique; i++) { - if (UNIV_SQL_NULL == dfield_get_len( - dtuple_get_nth_field(entry, i))) { - - return(FALSE); - } - } - } - - if (!rec_get_deleted_flag(rec)) { - - return(TRUE); - } - - return(FALSE); -} - /************************************************************************* Either deletes or sets the referencing columns SQL NULL in a child row. Used in ON DELETE ... clause for foreign keys when a parent row is @@ -533,8 +480,12 @@ row_ins_foreign_delete_or_set_null( err = lock_table(0, table, LOCK_IX, thr); if (err == DB_SUCCESS) { + /* Here it suffices to use a LOCK_REC_NOT_GAP type lock; + we already have a normal shared lock on the appropriate + gap if the search criterion was not unique */ + err = lock_clust_rec_read_check_and_lock(0, clust_rec, - clust_index, LOCK_X, thr); + clust_index, LOCK_X, LOCK_REC_NOT_GAP, thr); } if (err != DB_SUCCESS) { @@ -630,12 +581,14 @@ nonstandard_exit_func: /************************************************************************* Sets a shared lock on a record. Used in locking possible duplicate key -records. */ +records and also in checking foreign key constraints. 
*/ static ulint row_ins_set_shared_rec_lock( /*========================*/ /* out: DB_SUCCESS or error code */ + ulint type, /* in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP type lock */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: index */ que_thr_t* thr) /* in: query thread */ @@ -644,10 +597,10 @@ row_ins_set_shared_rec_lock( if (index->type & DICT_CLUSTERED) { err = lock_clust_rec_read_check_and_lock(0, rec, index, LOCK_S, - thr); + type, thr); } else { err = lock_sec_rec_read_check_and_lock(0, rec, index, LOCK_S, - thr); + type, thr); } return(err); @@ -656,7 +609,7 @@ row_ins_set_shared_rec_lock( /******************************************************************* Checks if foreign key constraint fails for an index entry. Sets shared locks which lock either the success or the failure of the constraint. NOTE that -the caller must have a shared latch on dict_foreign_key_check_lock. */ +the caller must have a shared latch on dict_operation_lock. */ ulint row_ins_check_foreign_constraint( @@ -679,7 +632,7 @@ row_ins_check_foreign_constraint( dict_table_t* check_table; dict_index_t* check_index; ulint n_fields_cmp; - ibool timeout_expired; + ibool unique_search; rec_t* rec; btr_pcur_t pcur; ibool moved; @@ -689,7 +642,9 @@ row_ins_check_foreign_constraint( mtr_t mtr; run_again: - ut_ad(rw_lock_own(&dict_foreign_key_check_lock, RW_LOCK_SHARED)); + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED)); + + err = DB_SUCCESS; if (thr_get_trx(thr)->check_foreigns == FALSE) { /* The user has suppressed foreign key checks currently for @@ -748,6 +703,14 @@ run_again: dtuple_set_n_fields_cmp(entry, foreign->n_fields); + if (dict_index_get_n_unique(check_index) <= foreign->n_fields) { + /* We can just set a LOCK_REC_NOT_GAP type lock */ + + unique_search = TRUE; + } else { + unique_search = FALSE; + } + btr_pcur_open(check_index, entry, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur, &mtr); @@ -761,25 +724,45 @@ run_again: goto next_rec; } - /* Try to place a lock 
on the index record */ - - err = row_ins_set_shared_rec_lock(rec, check_index, thr); - - if (err != DB_SUCCESS) { - - break; - } - if (rec == page_get_supremum_rec(buf_frame_align(rec))) { + err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, rec, + check_index, thr); + if (err != DB_SUCCESS) { + + break; + } + goto next_rec; } cmp = cmp_dtuple_rec(entry, rec); if (cmp == 0) { - if (!rec_get_deleted_flag(rec)) { + if (rec_get_deleted_flag(rec)) { + err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, + rec, check_index, thr); + if (err != DB_SUCCESS) { + + break; + } + } else { /* Found a matching record */ + + if (unique_search) { + err = row_ins_set_shared_rec_lock( + LOCK_REC_NOT_GAP, + rec, check_index, thr); + } else { + err = row_ins_set_shared_rec_lock( + LOCK_ORDINARY, + rec, check_index, thr); + } + + if (err != DB_SUCCESS) { + + break; + } /* printf( "FOREIGN: Found matching record from %s %s\n", @@ -807,6 +790,13 @@ run_again: } if (cmp < 0) { + err = row_ins_set_shared_rec_lock(LOCK_GAP, + rec, check_index, thr); + if (err != DB_SUCCESS) { + + break; + } + if (check_ref) { err = DB_NO_REFERENCED_ROW; } else { @@ -844,14 +834,14 @@ do_possible_lock_wait: que_thr_stop_for_mysql(thr); - timeout_expired = srv_suspend_mysql_thread(thr); + srv_suspend_mysql_thread(thr); - if (!timeout_expired) { + if (thr_get_trx(thr)->error_state == DB_SUCCESS) { goto run_again; } - err = DB_LOCK_WAIT_TIMEOUT; + err = thr_get_trx(thr)->error_state; } return(err); @@ -890,21 +880,21 @@ row_ins_check_foreign_constraints( trx); } - if (!trx->has_dict_foreign_key_check_lock) { + if (!trx->has_dict_operation_lock) { got_s_lock = TRUE; - rw_lock_s_lock(&dict_foreign_key_check_lock); + rw_lock_s_lock(&dict_operation_lock); - trx->has_dict_foreign_key_check_lock = TRUE; + trx->has_dict_operation_lock = TRUE; } err = row_ins_check_foreign_constraint(TRUE, foreign, table, index, entry, thr); if (got_s_lock) { - rw_lock_s_unlock(&dict_foreign_key_check_lock); + 
rw_lock_s_unlock(&dict_operation_lock); - trx->has_dict_foreign_key_check_lock = FALSE; + trx->has_dict_operation_lock = FALSE; } if (err != DB_SUCCESS) { @@ -919,6 +909,59 @@ row_ins_check_foreign_constraints( } /******************************************************************* +Checks if a unique key violation to rec would occur at the index entry +insert. */ +static +ibool +row_ins_dupl_error_with_rec( +/*========================*/ + /* out: TRUE if error */ + rec_t* rec, /* in: user record; NOTE that we assume + that the caller already has a record lock on + the record! */ + dtuple_t* entry, /* in: entry to insert */ + dict_index_t* index) /* in: index */ +{ + ulint matched_fields; + ulint matched_bytes; + ulint n_unique; + ulint i; + + n_unique = dict_index_get_n_unique(index); + + matched_fields = 0; + matched_bytes = 0; + + cmp_dtuple_rec_with_match(entry, rec, &matched_fields, &matched_bytes); + + if (matched_fields < n_unique) { + + return(FALSE); + } + + /* In a unique secondary index we allow equal key values if they + contain SQL NULLs */ + + if (!(index->type & DICT_CLUSTERED)) { + + for (i = 0; i < n_unique; i++) { + if (UNIV_SQL_NULL == dfield_get_len( + dtuple_get_nth_field(entry, i))) { + + return(FALSE); + } + } + } + + if (!rec_get_deleted_flag(rec)) { + + return(TRUE); + } + + return(FALSE); +} + +/******************************************************************* Scans a unique non-clustered index at a given index entry to determine whether a uniqueness violation has occurred for the key value of the entry. Set shared locks on possible duplicate records. 
*/ @@ -976,9 +1019,10 @@ row_ins_scan_sec_index_for_duplicate( goto next_rec; } - /* Try to place a lock on the index record */ + /* Try to place a lock on the index record */ - err = row_ins_set_shared_rec_lock(rec, index, thr); + err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, rec, index, + thr); if (err != DB_SUCCESS) { @@ -1082,8 +1126,8 @@ row_ins_duplicate_error_in_clust( sure that in roll-forward we get the same duplicate errors as in original execution */ - err = row_ins_set_shared_rec_lock(rec, cursor->index, - thr); + err = row_ins_set_shared_rec_lock(LOCK_REC_NOT_GAP, + rec, cursor->index, thr); if (err != DB_SUCCESS) { return(err); @@ -1105,8 +1149,8 @@ row_ins_duplicate_error_in_clust( if (rec != page_get_supremum_rec(page)) { - err = row_ins_set_shared_rec_lock(rec, cursor->index, - thr); + err = row_ins_set_shared_rec_lock(LOCK_REC_NOT_GAP, + rec, cursor->index, thr); if (err != DB_SUCCESS) { return(err); diff --git a/innobase/row/row0mysql.c b/innobase/row/row0mysql.c index cea8f1316fe..6fde57eb75a 100644 --- a/innobase/row/row0mysql.c +++ b/innobase/row/row0mysql.c @@ -27,6 +27,7 @@ Created 9/17/2000 Heikki Tuuri #include "lock0lock.h" #include "rem0cmp.h" #include "log0log.h" +#include "btr0sea.h" /* A dummy variable used to fool the compiler */ ibool row_mysql_identically_false = FALSE; @@ -203,7 +204,6 @@ row_mysql_handle_errors( que_thr_t* thr, /* in: query thread */ trx_savept_t* savept) /* in: savepoint or NULL */ { - ibool timeout_expired; ulint err; handle_new_error: @@ -240,11 +240,9 @@ handle_new_error: /* MySQL will roll back the latest SQL statement */ } else if (err == DB_LOCK_WAIT) { - timeout_expired = srv_suspend_mysql_thread(thr); - - if (timeout_expired) { - trx->error_state = DB_LOCK_WAIT_TIMEOUT; + srv_suspend_mysql_thread(thr); + if (trx->error_state != DB_SUCCESS) { que_thr_stop_for_mysql(thr); goto handle_new_error; @@ -1146,7 +1144,7 @@ row_mysql_lock_data_dictionary(void) /* Serialize data dictionary operations with 
dictionary mutex: no deadlocks or lock waits can occur then in these operations */ - rw_lock_x_lock(&(dict_foreign_key_check_lock)); + rw_lock_x_lock(&dict_operation_lock); mutex_enter(&(dict_sys->mutex)); } @@ -1161,7 +1159,7 @@ row_mysql_unlock_data_dictionary(void) no deadlocks can occur then in these operations */ mutex_exit(&(dict_sys->mutex)); - rw_lock_x_unlock(&(dict_foreign_key_check_lock)); + rw_lock_x_unlock(&dict_operation_lock); } /************************************************************************* @@ -1184,6 +1182,7 @@ row_create_table_for_mysql( ulint err; ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); ut_ad(mutex_own(&(dict_sys->mutex))); if (srv_created_new_raw) { @@ -1383,7 +1382,8 @@ row_create_index_for_mysql( ulint namelen; ulint keywordlen; ulint err; - + + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); ut_ad(mutex_own(&(dict_sys->mutex))); ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); @@ -1464,6 +1464,7 @@ row_table_add_foreign_constraints( ulint err; ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); ut_a(sql_string); trx->op_info = (char *) "adding foreign keys"; @@ -1846,12 +1847,16 @@ row_drop_table_for_mysql( no deadlocks can occur then in these operations */ if (!has_dict_mutex) { - /* Prevent foreign key checks while we are dropping the table */ - rw_lock_x_lock(&(dict_foreign_key_check_lock)); + /* Prevent foreign key checks etc. 
while we are dropping the + table */ + rw_lock_x_lock(&dict_operation_lock); mutex_enter(&(dict_sys->mutex)); } + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); + graph = pars_sql(buf); ut_a(graph); @@ -1861,9 +1866,6 @@ row_drop_table_for_mysql( graph->fork_type = QUE_FORK_MYSQL_INTERFACE; - /* Prevent purge from running while we are dropping the table */ - rw_lock_s_lock(&(purge_sys->purge_is_running)); - table = dict_table_get_low(name); if (!table) { @@ -1944,12 +1946,11 @@ row_drop_table_for_mysql( } } -funct_exit: - rw_lock_s_unlock(&(purge_sys->purge_is_running)); +funct_exit: if (!has_dict_mutex) { mutex_exit(&(dict_sys->mutex)); - rw_lock_x_unlock(&(dict_foreign_key_check_lock)); + rw_lock_x_unlock(&dict_operation_lock); } que_graph_free(graph); @@ -1985,7 +1986,7 @@ row_drop_database_for_mysql( trx_start_if_not_started(trx); loop: - rw_lock_x_lock(&(dict_foreign_key_check_lock)); + rw_lock_x_lock(&dict_operation_lock); mutex_enter(&(dict_sys->mutex)); while ((table_name = dict_get_first_table_name_in_db(name))) { @@ -2000,7 +2001,7 @@ loop: if (table->n_mysql_handles_opened > 0) { mutex_exit(&(dict_sys->mutex)); - rw_lock_x_unlock(&(dict_foreign_key_check_lock)); + rw_lock_x_unlock(&dict_operation_lock); ut_print_timestamp(stderr); fprintf(stderr, @@ -2028,7 +2029,7 @@ loop: } mutex_exit(&(dict_sys->mutex)); - rw_lock_x_unlock(&(dict_foreign_key_check_lock)); + rw_lock_x_unlock(&dict_operation_lock); trx_commit_for_mysql(trx); @@ -2165,7 +2166,7 @@ row_rename_table_for_mysql( /* Serialize data dictionary operations with dictionary mutex: no deadlocks can occur then in these operations */ - rw_lock_x_lock(&(dict_foreign_key_check_lock)); + rw_lock_x_lock(&dict_operation_lock); mutex_enter(&(dict_sys->mutex)); table = dict_table_get_low(old_name); @@ -2249,7 +2250,7 @@ row_rename_table_for_mysql( } funct_exit: mutex_exit(&(dict_sys->mutex)); - rw_lock_x_unlock(&(dict_foreign_key_check_lock)); + 
rw_lock_x_unlock(&dict_operation_lock); que_graph_free(graph); @@ -2394,18 +2395,28 @@ row_check_table_for_mysql( row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL handle */ { - dict_table_t* table = prebuilt->table; + dict_table_t* table = prebuilt->table; dict_index_t* index; ulint n_rows; ulint n_rows_in_table = ULINT_UNDEFINED; - ulint ret = DB_SUCCESS; - + ulint ret = DB_SUCCESS; + ulint old_isolation_level; + prebuilt->trx->op_info = (char *) "checking table"; + old_isolation_level = prebuilt->trx->isolation_level; + + /* We must run the index record counts at an isolation level + >= READ COMMITTED, because a dirty read can see a wrong number + of records in some index; to play safe, we use always + REPEATABLE READ here */ + + prebuilt->trx->isolation_level = TRX_ISO_REPEATABLE_READ; + index = dict_table_get_first_index(table); while (index != NULL) { - /* fprintf(stderr, "Validating index %s\n", index->name); */ + /* fprintf(stderr, "Validating index %s\n", index->name); */ if (!btr_validate_tree(index->tree)) { ret = DB_ERROR; @@ -2433,6 +2444,9 @@ row_check_table_for_mysql( index = dict_table_get_next_index(index); } + /* Restore the original isolation level */ + prebuilt->trx->isolation_level = old_isolation_level; + /* We validate also the whole adaptive hash index for all tables at every CHECK TABLE */ diff --git a/innobase/row/row0purge.c b/innobase/row/row0purge.c index 60e057b816e..3d9ae6aad8b 100644 --- a/innobase/row/row0purge.c +++ b/innobase/row/row0purge.c @@ -453,7 +453,9 @@ static ibool row_purge_parse_undo_rec( /*=====================*/ - /* out: TRUE if purge operation required */ + /* out: TRUE if purge operation required: + NOTE that then the CALLER must s-unlock + dict_operation_lock! */ purge_node_t* node, /* in: row undo node */ ibool* updated_extern, /* out: TRUE if an externally stored field @@ -493,18 +495,20 @@ row_purge_parse_undo_rec( return(FALSE); } + /* Prevent DROP TABLE etc. 
from running when we are doing the purge + for this row */ + + rw_lock_s_lock(&dict_operation_lock); mutex_enter(&(dict_sys->mutex)); node->table = dict_table_get_on_id_low(table_id, thr_get_trx(thr)); - rw_lock_x_lock(&(purge_sys->purge_is_running)); - mutex_exit(&(dict_sys->mutex)); if (node->table == NULL) { /* The table has been dropped: no need to do purge */ - rw_lock_x_unlock(&(purge_sys->purge_is_running)); + rw_lock_s_unlock(&dict_operation_lock); return(FALSE); } @@ -514,7 +518,7 @@ row_purge_parse_undo_rec( if (clust_index == NULL) { /* The table was corrupt in the data dictionary */ - rw_lock_x_unlock(&(purge_sys->purge_is_running)); + rw_lock_s_unlock(&dict_operation_lock); return(FALSE); } @@ -573,6 +577,8 @@ row_purge( } else { purge_needed = row_purge_parse_undo_rec(node, &updated_extern, thr); + /* If purge_needed == TRUE, we must also remember to unlock + dict_operation_lock! */ } if (purge_needed) { @@ -594,7 +600,7 @@ row_purge( btr_pcur_close(&(node->pcur)); } - rw_lock_x_unlock(&(purge_sys->purge_is_running)); + rw_lock_s_unlock(&dict_operation_lock); } /* Do some cleanup */ diff --git a/innobase/row/row0sel.c b/innobase/row/row0sel.c index 4af04251996..fcf48dd15cf 100644 --- a/innobase/row/row0sel.c +++ b/innobase/row/row0sel.c @@ -606,7 +606,7 @@ row_sel_get_clust_rec( /* Try to place a lock on the index record */ err = lock_clust_rec_read_check_and_lock(0, clust_rec, index, - node->row_lock_mode, thr); + node->row_lock_mode, LOCK_ORDINARY, thr); if (err != DB_SUCCESS) { return(err); @@ -621,7 +621,7 @@ row_sel_get_clust_rec( node->read_view)) { err = row_sel_build_prev_vers(node->read_view, plan, - clust_rec, &old_vers, mtr); + clust_rec, &old_vers, mtr); if (err != DB_SUCCESS) { return(err); @@ -678,16 +678,17 @@ sel_set_rec_lock( rec_t* rec, /* in: record */ dict_index_t* index, /* in: index */ ulint mode, /* in: lock mode */ + ulint type, /* in: LOCK_ORDINARY, LOCK_GAP, or LOCK_REC_NOT_GAP */ que_thr_t* thr) /* in: query thread */ { ulint
err; if (index->type & DICT_CLUSTERED) { err = lock_clust_rec_read_check_and_lock(0, rec, index, mode, - thr); + type, thr); } else { err = lock_sec_rec_read_check_and_lock(0, rec, index, mode, - thr); + type, thr); } return(err); @@ -1154,7 +1155,7 @@ rec_loop: if (!consistent_read) { err = sel_set_rec_lock(page_rec_get_next(rec), index, - node->row_lock_mode, thr); + node->row_lock_mode, LOCK_ORDINARY, thr); if (err != DB_SUCCESS) { /* Note that in this case we will store in pcur the PREDECESSOR of the record we are waiting @@ -1180,8 +1181,8 @@ rec_loop: if (!consistent_read) { /* Try to place a lock on the index record */ - err = sel_set_rec_lock(rec, index, node->row_lock_mode, thr); - + err = sel_set_rec_lock(rec, index, node->row_lock_mode, + LOCK_ORDINARY, thr); if (err != DB_SUCCESS) { goto lock_wait_or_error; @@ -2200,6 +2201,7 @@ row_sel_get_clust_rec_for_mysql( rec_t* old_vers; ulint err; trx_t* trx; + char err_buf[1000]; *out_rec = NULL; @@ -2213,14 +2215,40 @@ row_sel_get_clust_rec_for_mysql( clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur); - ut_ad(page_rec_is_user_rec(clust_rec)); + if (!page_rec_is_user_rec(clust_rec)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: error clustered record for sec rec not found\n" + "InnoDB: index %s table %s\n", sec_index->name, + sec_index->table->name); + + rec_sprintf(err_buf, 900, rec); + fprintf(stderr, "InnoDB: sec index record %s\n", err_buf); + + rec_sprintf(err_buf, 900, clust_rec); + fprintf(stderr, "InnoDB: clust index record %s\n", err_buf); + + trx_print(err_buf, trx); + + fprintf(stderr, + "%s\nInnoDB: Make a detailed bug report and send it\n", + err_buf); + fprintf(stderr, "InnoDB: to mysql@lists.mysql.com\n"); + + clust_rec = NULL; + + goto func_exit; + } if (prebuilt->select_lock_type != LOCK_NONE) { - /* Try to place a lock on the index record */ + /* Try to place a lock on the index record; we are searching + the clust rec with a unique condition, hence + we set a LOCK_REC_NOT_GAP 
type lock */ err = lock_clust_rec_read_check_and_lock(0, clust_rec, clust_index, - prebuilt->select_lock_type, thr); + prebuilt->select_lock_type, + LOCK_REC_NOT_GAP, thr); if (err != DB_SUCCESS) { return(err); @@ -2232,8 +2260,12 @@ row_sel_get_clust_rec_for_mysql( trx = thr_get_trx(thr); old_vers = NULL; - - if (!lock_clust_rec_cons_read_sees(clust_rec, clust_index, + + /* If the isolation level allows reading of uncommitted data, + then we never look for an earlier version */ + + if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED + && !lock_clust_rec_cons_read_sees(clust_rec, clust_index, trx->read_view)) { err = row_sel_build_prev_vers_for_mysql( @@ -2275,6 +2307,7 @@ row_sel_get_clust_rec_for_mysql( } } +func_exit: *out_rec = clust_rec; if (prebuilt->select_lock_type == LOCK_X) { @@ -2407,7 +2440,7 @@ row_sel_push_cache_row_for_mysql( /************************************************************************* Tries to do a shortcut to fetch a clustered index record with a unique key, using the hash index if possible (not always). We assume that the search -mode is PAGE_CUR_GE, it is a consistent read, trx has already a read view, +mode is PAGE_CUR_GE, it is a consistent read, there is a read view in trx, btr search latch has been locked in S-mode. 
*/ static ulint @@ -2426,7 +2459,7 @@ row_sel_try_search_shortcut_for_mysql( ut_ad(index->type & DICT_CLUSTERED); ut_ad(!prebuilt->templ_contains_blob); - + btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF, pcur, #ifndef UNIV_SEARCH_DEBUG @@ -2516,17 +2549,22 @@ row_search_for_mysql( ibool was_lock_wait; ulint ret; ulint shortcut; + ibool unique_search = FALSE; ibool unique_search_from_clust_index = FALSE; ibool mtr_has_extra_clust_latch = FALSE; ibool moves_up = FALSE; + ibool set_also_gap_locks = TRUE; + /* if the query is a plain + locking SELECT, and the isolation + level is <= TRX_ISO_READ_COMMITTED, + then this is set to FALSE */ + ibool success; ulint cnt = 0; mtr_t mtr; ut_ad(index && pcur && search_tuple); ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); - - ut_ad(sync_thread_levels_empty_gen(FALSE)); - + if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) { fprintf(stderr, "InnoDB: Error: trying to free a corrupt\n" @@ -2543,6 +2581,9 @@ row_search_for_mysql( printf("N tables locked %lu\n", trx->mysql_n_tables_locked); */ + /*-------------------------------------------------------------*/ + /* PHASE 1: Try to pop the row from the prefetch cache */ + if (direction == 0) { trx->op_info = (char *) "starting index read"; @@ -2608,18 +2649,35 @@ row_search_for_mysql( mtr_start(&mtr); - /* Since we must release the search system latch when we retrieve an - externally stored field, we cannot use the adaptive hash index in a - search in the case the row may be long and there may be externally - stored fields */ + /* In a search where at most one record in the index may match, we + can use a LOCK_REC_NOT_GAP type record lock when locking a non-delete + marked matching record. + + Note that in a unique secondary index there may be different delete + marked versions of a record where only the primary key values differ: + thus in a secondary index we must use next-key locks when locking + delete marked records. 
*/ if (match_mode == ROW_SEL_EXACT - && index->type & DICT_UNIQUE - && index->type & DICT_CLUSTERED - && !prebuilt->templ_contains_blob - && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8) - && dtuple_get_n_fields(search_tuple) + && index->type & DICT_UNIQUE + && dtuple_get_n_fields(search_tuple) == dict_index_get_n_unique(index)) { + unique_search = TRUE; + } + + /*-------------------------------------------------------------*/ + /* PHASE 2: Try fast adaptive hash index search if possible */ + + /* Next test if this is the special case where we can use the fast + adaptive hash index to try the search. Since we must release the + search system latch when we retrieve an externally stored field, we + cannot use the adaptive hash index in a search in the case the row + may be long and there may be externally stored fields */ + + if (unique_search + && index->type & DICT_CLUSTERED + && !prebuilt->templ_contains_blob + && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) { if (direction == ROW_SEL_NEXT) { /* MySQL sometimes seems to do fetch next even @@ -2642,8 +2700,9 @@ row_search_for_mysql( unique_search_from_clust_index = TRUE; - if (trx->mysql_n_tables_locked == 0 - && !prebuilt->sql_stat_start) { + if (prebuilt->select_lock_type == LOCK_NONE + && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED + && trx->read_view) { /* This is a SELECT query done as a consistent read, and the read view has already been allocated: @@ -2722,13 +2781,34 @@ row_search_for_mysql( mtr_start(&mtr); } } -no_shortcut: + +no_shortcut: + /*-------------------------------------------------------------*/ + /* PHASE 3: Open or restore index cursor position */ + if (trx->has_search_latch) { rw_lock_s_unlock(&btr_search_latch); trx->has_search_latch = FALSE; } trx_start_if_not_started(trx); + + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED + && prebuilt->select_lock_type != LOCK_NONE + && trx->mysql_query_str) { + + /* Scan the MySQL query string; check if SELECT is the first + word there */ + 
+ dict_accept(*trx->mysql_query_str, "SELECT", &success); + + if (success) { + /* It is a plain locking SELECT and the isolation + level is low: do not lock gaps */ + + set_also_gap_locks = FALSE; + } + } /* Note that if the search mode was GE or G, then the cursor naturally moves upward (in fetch next) in alphabetical order, @@ -2793,8 +2873,10 @@ no_shortcut: prebuilt->sql_stat_start = FALSE; } - /*-------------------------------------------------------------*/ rec_loop: + /*-------------------------------------------------------------*/ + /* PHASE 4: Look for matching records in a loop */ + cons_read_requires_clust_rec = FALSE; rec = btr_pcur_get_rec(pcur); @@ -2812,22 +2894,24 @@ rec_loop: goto next_rec; } - - if (prebuilt->select_lock_type != LOCK_NONE) { - /* Try to place a lock on the index record */ - err = sel_set_rec_lock(rec, index, prebuilt->select_lock_type, - thr); - if (err != DB_SUCCESS) { + if (rec == page_get_supremum_rec(buf_frame_align(rec))) { - goto lock_wait_or_error; - } - } + if (prebuilt->select_lock_type != LOCK_NONE + && set_also_gap_locks) { - if (rec == page_get_supremum_rec(buf_frame_align(rec))) { + /* Try to place a lock on the index record */ + + err = sel_set_rec_lock(rec, index, + prebuilt->select_lock_type, + LOCK_ORDINARY, thr); + if (err != DB_SUCCESS) { + goto lock_wait_or_error; + } + } /* A page supremum record cannot be in the result set: skip - it now when we have placed a possible lock on it */ + it now that we have placed a possible lock on it */ goto next_rec; } @@ -2850,6 +2934,19 @@ rec_loop: if (0 != cmp_dtuple_rec(search_tuple, rec)) { + if (prebuilt->select_lock_type != LOCK_NONE + && set_also_gap_locks) { + /* Try to place a lock on the index record */ + + err = sel_set_rec_lock(rec, index, + prebuilt->select_lock_type, + LOCK_GAP, thr); + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + } + btr_pcur_store_position(pcur, &mtr); ret = DB_RECORD_NOT_FOUND; @@ -2862,6 +2959,19 @@ rec_loop: if 
(!cmp_dtuple_is_prefix_of_rec(search_tuple, rec)) { + if (prebuilt->select_lock_type != LOCK_NONE + && set_also_gap_locks) { + /* Try to place a lock on the index record */ + + err = sel_set_rec_lock(rec, index, + prebuilt->select_lock_type, + LOCK_GAP, thr); + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + } + btr_pcur_store_position(pcur, &mtr); ret = DB_RECORD_NOT_FOUND; @@ -2874,16 +2984,39 @@ rec_loop: /* We are ready to look at a possible new index entry in the result set: the cursor is now placed on a user record */ - /* Get the right version of the row in a consistent read */ - - if (prebuilt->select_lock_type == LOCK_NONE) { + if (prebuilt->select_lock_type != LOCK_NONE) { + /* Try to place a lock on the index record; note that delete + marked records are a special case in a unique search. If there + is a non-delete marked record, then it is enough to lock its + existence with LOCK_REC_NOT_GAP. */ + + if (!set_also_gap_locks + || (unique_search && !rec_get_deleted_flag(rec))) { + err = sel_set_rec_lock(rec, index, + prebuilt->select_lock_type, + LOCK_REC_NOT_GAP, thr); + } else { + err = sel_set_rec_lock(rec, index, + prebuilt->select_lock_type, + LOCK_ORDINARY, thr); + } + + if (err != DB_SUCCESS) { + goto lock_wait_or_error; + } + } else { /* This is a non-locking consistent read: if necessary, fetch a previous version of the record */ cons_read_requires_clust_rec = FALSE; - if (index == clust_index) { + if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) { + + /* Do nothing: we let a non-locking SELECT read the + latest version of the record */ + + } else if (index == clust_index) { if (!lock_clust_rec_cons_read_sees(rec, index, trx->read_view)) { @@ -3020,8 +3153,11 @@ got_row: ret = DB_SUCCESS; goto normal_return; - /*-------------------------------------------------------------*/ + next_rec: + /*-------------------------------------------------------------*/ + /* PHASE 5: Move the cursor to the next index record */ + if 
(mtr_has_extra_clust_latch) { /* We must commit mtr if we are moving to the next non-clustered index record, because we could break the @@ -3064,8 +3200,10 @@ next_rec: cnt++; goto rec_loop; - /*-------------------------------------------------------------*/ + lock_wait_or_error: + /*-------------------------------------------------------------*/ + btr_pcur_store_position(pcur, &mtr); mtr_commit(&mtr); @@ -3096,6 +3234,7 @@ lock_wait_or_error: return(err); normal_return: + /*-------------------------------------------------------------*/ que_thr_stop_for_mysql_no_error(thr, trx); mtr_commit(&mtr); @@ -3156,10 +3295,12 @@ row_search_check_if_query_cache_permitted( ret = TRUE; - /* Assign a read view for the transaction if it does not yet - have one */ + /* If the isolation level is high, assign a read view for the + transaction if it does not yet have one */ + + if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ + && !trx->read_view) { - if (!trx->read_view) { trx->read_view = read_view_open_now(trx, trx->read_view_heap); } diff --git a/innobase/row/row0uins.c b/innobase/row/row0uins.c index 9990f893432..fff67dcd627 100644 --- a/innobase/row/row0uins.c +++ b/innobase/row/row0uins.c @@ -254,7 +254,8 @@ row_undo_ins_parse_undo_rec( node->table = dict_table_get_on_id(table_id, node->trx); if (node->table == NULL) { - return; + + return; } clust_index = dict_table_get_first_index(node->table); @@ -281,7 +282,7 @@ row_undo_ins( ut_ad(node && thr); ut_ad(node->state == UNDO_NODE_INSERT); - + row_undo_ins_parse_undo_rec(node, thr); if (node->table == NULL) { @@ -292,6 +293,7 @@ row_undo_ins( if (!found) { trx_undo_rec_release(node->trx, node->undo_no); + return(DB_SUCCESS); } diff --git a/innobase/row/row0undo.c b/innobase/row/row0undo.c index 5119254f405..b40d36533a4 100644 --- a/innobase/row/row0undo.c +++ b/innobase/row/row0undo.c @@ -211,7 +211,6 @@ row_undo( if (node->state == UNDO_NODE_FETCH_NEXT) { - /* The call below also starts &mtr */ node->undo_rec = 
trx_roll_pop_top_rec_of_trx(trx, trx->roll_limit, &roll_ptr, @@ -254,6 +253,10 @@ row_undo( } } + /* Prevent DROP TABLE etc. while we are rolling back this row */ + + rw_lock_s_lock(&dict_operation_lock); + if (node->state == UNDO_NODE_INSERT) { err = row_undo_ins(node, thr); @@ -264,6 +267,8 @@ row_undo( err = row_undo_mod(node, thr); } + rw_lock_s_unlock(&dict_operation_lock); + /* Do some cleanup */ btr_pcur_close(&(node->pcur)); diff --git a/innobase/row/row0upd.c b/innobase/row/row0upd.c index 25c82f39da9..0be4f901d16 100644 --- a/innobase/row/row0upd.c +++ b/innobase/row/row0upd.c @@ -79,7 +79,7 @@ ibool row_upd_index_is_referenced( /*========================*/ /* out: TRUE if referenced; NOTE that since - we do not hold dict_foreign_key_check_lock + we do not hold dict_operation_lock when leaving the function, it may be that the referencing table has been dropped when we leave this function: this function is only @@ -95,8 +95,8 @@ row_upd_index_is_referenced( return(FALSE); } - if (!trx->has_dict_foreign_key_check_lock) { - rw_lock_s_lock(&dict_foreign_key_check_lock); + if (!trx->has_dict_operation_lock) { + rw_lock_s_lock(&dict_operation_lock); } foreign = UT_LIST_GET_FIRST(table->referenced_list); @@ -104,8 +104,8 @@ row_upd_index_is_referenced( while (foreign) { if (foreign->referenced_index == index) { - if (!trx->has_dict_foreign_key_check_lock) { - rw_lock_s_unlock(&dict_foreign_key_check_lock); + if (!trx->has_dict_operation_lock) { + rw_lock_s_unlock(&dict_operation_lock); } return(TRUE); @@ -114,8 +114,8 @@ row_upd_index_is_referenced( foreign = UT_LIST_GET_NEXT(referenced_list, foreign); } - if (!trx->has_dict_foreign_key_check_lock) { - rw_lock_s_unlock(&dict_foreign_key_check_lock); + if (!trx->has_dict_operation_lock) { + rw_lock_s_unlock(&dict_operation_lock); } return(FALSE); @@ -162,12 +162,12 @@ row_upd_check_references_constraints( mtr_start(mtr); - if (!trx->has_dict_foreign_key_check_lock) { + if (!trx->has_dict_operation_lock) { 
got_s_lock = TRUE; - rw_lock_s_lock(&dict_foreign_key_check_lock); + rw_lock_s_lock(&dict_operation_lock); - trx->has_dict_foreign_key_check_lock = TRUE; + trx->has_dict_operation_lock = TRUE; } foreign = UT_LIST_GET_FIRST(table->referenced_list); @@ -189,7 +189,7 @@ row_upd_check_references_constraints( } /* NOTE that if the thread ends up waiting for a lock - we will release dict_foreign_key_check_lock + we will release dict_operation_lock temporarily! But the counter on the table protects 'foreign' from being dropped while the check is running. */ @@ -212,8 +212,8 @@ row_upd_check_references_constraints( if (err != DB_SUCCESS) { if (got_s_lock) { rw_lock_s_unlock( - &dict_foreign_key_check_lock); - trx->has_dict_foreign_key_check_lock + &dict_operation_lock); + trx->has_dict_operation_lock = FALSE; } @@ -227,8 +227,8 @@ row_upd_check_references_constraints( } if (got_s_lock) { - rw_lock_s_unlock(&dict_foreign_key_check_lock); - trx->has_dict_foreign_key_check_lock = FALSE; + rw_lock_s_unlock(&dict_operation_lock); + trx->has_dict_operation_lock = FALSE; } mem_heap_free(heap); diff --git a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c index d754f603efc..11e45df4ce3 100644 --- a/innobase/srv/srv0srv.c +++ b/innobase/srv/srv0srv.c @@ -135,8 +135,6 @@ byte srv_latin1_ordering[256] /* The sort order table of the latin1 , 0x44, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x5D, 0xF7 , 0xD8, 0x55, 0x55, 0x55, 0x59, 0x59, 0xDE, 0xFF }; - -ibool srv_use_native_aio = FALSE; ulint srv_pool_size = ULINT_MAX; /* size in database pages; MySQL originally sets this @@ -151,8 +149,9 @@ dulint srv_archive_recovery_limit_lsn; ulint srv_lock_wait_timeout = 1024 * 1024 * 1024; -char* srv_unix_file_flush_method_str = NULL; -ulint srv_unix_file_flush_method = 0; +char* srv_file_flush_method_str = NULL; +ulint srv_unix_file_flush_method = SRV_UNIX_FDATASYNC; +ulint srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; /* If the following is != 0 we do not allow inserts etc. 
This protects the user from forgetting the innodb_force_recovery keyword to my.cnf */ @@ -281,6 +280,9 @@ time_t srv_last_monitor_time; mutex_t srv_innodb_monitor_mutex; +ulint srv_main_thread_process_no = 0; +ulint srv_main_thread_id = 0; + /* IMPLEMENTATION OF THE SERVER MAIN PROGRAM ========================================= @@ -2046,13 +2048,15 @@ srv_table_reserve_slot_for_mysql(void) } /******************************************************************* -Puts a MySQL OS thread to wait for a lock to be released. */ +Puts a MySQL OS thread to wait for a lock to be released. If an error +occurs during the wait trx->error_state associated with thr is +!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK +are possible errors. DB_DEADLOCK is returned if selective deadlock +resolution chose this transaction as a victim. */ -ibool +void srv_suspend_mysql_thread( /*=====================*/ - /* out: TRUE if the lock wait timeout was - exceeded */ que_thr_t* thr) /* in: query thread associated with the MySQL OS thread */ { @@ -2069,13 +2073,15 @@ srv_suspend_mysql_thread( mutex_enter(&kernel_mutex); + trx->error_state = DB_SUCCESS; + if (thr->state == QUE_THR_RUNNING) { /* The lock has already been released: no need to suspend */ mutex_exit(&kernel_mutex); - return(FALSE); + return; } slot = srv_table_reserve_slot_for_mysql(); @@ -2101,18 +2107,18 @@ srv_suspend_mysql_thread( srv_conc_force_exit_innodb(thr_get_trx(thr)); /* Release possible foreign key check latch */ - if (trx->has_dict_foreign_key_check_lock) { + if (trx->has_dict_operation_lock) { - rw_lock_s_unlock(&dict_foreign_key_check_lock); + rw_lock_s_unlock(&dict_operation_lock); } /* Wait for the release */ os_event_wait(event); - if (trx->has_dict_foreign_key_check_lock) { + if (trx->has_dict_operation_lock) { - rw_lock_s_lock(&dict_foreign_key_check_lock); + rw_lock_s_lock(&dict_operation_lock); } /* Return back inside InnoDB */ @@ -2131,10 +2137,9 @@ srv_suspend_mysql_thread( if 
(srv_lock_wait_timeout < 100000000 && wait_time > (double)srv_lock_wait_timeout) { - return(TRUE); - } - return(FALSE); + trx->error_state = DB_LOCK_WAIT_TIMEOUT; + } } /************************************************************************ @@ -2300,9 +2305,19 @@ srv_sprintf_innodb_monitor( "ROW OPERATIONS\n" "--------------\n"); buf += sprintf(buf, - "%ld queries inside InnoDB, %ld queries in queue; main thread: %s\n", - srv_conc_n_threads, srv_conc_n_waiting_threads, + "%ld queries inside InnoDB, %ld queries in queue\n", + srv_conc_n_threads, srv_conc_n_waiting_threads); +#ifdef UNIV_LINUX + buf += sprintf(buf, + "Main thread process no %lu, state: %s\n", + srv_main_thread_process_no, + srv_main_thread_op_info); +#else + buf += sprintf(buf, + "Main thread id %lu, state: %s\n", + srv_main_thread_id, srv_main_thread_op_info); +#endif buf += sprintf(buf, "Number of rows inserted %lu, updated %lu, deleted %lu, read %lu\n", srv_n_rows_inserted, @@ -2636,6 +2651,9 @@ srv_master_thread( UT_NOT_USED(arg); + srv_main_thread_process_no = os_proc_get_number(); + srv_main_thread_id = os_thread_pf(os_thread_get_curr_id()); + srv_table_reserve_slot(SRV_MASTER); mutex_enter(&kernel_mutex); diff --git a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c index dfa122b2ece..d6d610bb5b8 100644 --- a/innobase/srv/srv0start.c +++ b/innobase/srv/srv0start.c @@ -515,7 +515,7 @@ srv_calc_high32( } /************************************************************************* -Creates or opens the log files. */ +Creates or opens the log files and closes them. */ static ulint open_or_create_log_file( @@ -640,7 +640,7 @@ open_or_create_log_file( } /************************************************************************* -Creates or opens database data files. */ +Creates or opens database data files and closes them. 
*/ static ulint open_or_create_data_files( @@ -965,31 +965,63 @@ innobase_start_or_create_for_mysql(void) srv_is_being_started = TRUE; srv_startup_is_before_trx_rollback_phase = TRUE; + os_aio_use_native_aio = FALSE; + +#ifdef __WIN__ + if (os_get_os_version() == OS_WIN95 + || os_get_os_version() == OS_WIN31 + || os_get_os_version() == OS_WINNT) { + + /* On Win 95, 98, ME, Win32 subsystem for Windows 3.1, + and NT use simulated aio. In NT Windows provides async i/o, + but when run in conjunction with InnoDB Hot Backup, it seemed + to corrupt the data files. */ + + os_aio_use_native_aio = FALSE; + } else { + /* On Win 2000 and XP use async i/o */ + os_aio_use_native_aio = TRUE; + } +#endif + if (srv_file_flush_method_str == NULL) { + /* These are the default options */ + + srv_unix_file_flush_method = SRV_UNIX_FDATASYNC; - if (0 == ut_strcmp(srv_unix_file_flush_method_str, "fdatasync")) { + srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; +#ifndef __WIN__ + } else if (0 == ut_strcmp(srv_file_flush_method_str, "fdatasync")) { srv_unix_file_flush_method = SRV_UNIX_FDATASYNC; - } else if (0 == ut_strcmp(srv_unix_file_flush_method_str, "O_DSYNC")) { + } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DSYNC")) { srv_unix_file_flush_method = SRV_UNIX_O_DSYNC; - } else if (0 == ut_strcmp(srv_unix_file_flush_method_str, + } else if (0 == ut_strcmp(srv_file_flush_method_str, "littlesync")) { srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC; - } else if (0 == ut_strcmp(srv_unix_file_flush_method_str, "nosync")) { + } else if (0 == ut_strcmp(srv_file_flush_method_str, "nosync")) { srv_unix_file_flush_method = SRV_UNIX_NOSYNC; +#else + } else if (0 == ut_strcmp(srv_file_flush_method_str, "normal")) { + srv_win_file_flush_method = SRV_WIN_IO_NORMAL; + os_aio_use_native_aio = FALSE; + + } else if (0 == ut_strcmp(srv_file_flush_method_str, "unbuffered")) { + srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; + os_aio_use_native_aio = FALSE; + + } else if (0 == 
ut_strcmp(srv_file_flush_method_str, + "async_unbuffered")) { + srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; +#endif } else { fprintf(stderr, "InnoDB: Unrecognized value %s for innodb_flush_method\n", - srv_unix_file_flush_method_str); + srv_file_flush_method_str); return(DB_ERROR); } - /* - printf("srv_unix set to %lu\n", srv_unix_file_flush_method); - */ - os_aio_use_native_aio = srv_use_native_aio; - err = srv_boot(); if (err != DB_SUCCESS) { @@ -999,34 +1031,15 @@ innobase_start_or_create_for_mysql(void) /* Restrict the maximum number of file i/o threads */ if (srv_n_file_io_threads > SRV_MAX_N_IO_THREADS) { + srv_n_file_io_threads = SRV_MAX_N_IO_THREADS; } -#if !(defined(WIN_ASYNC_IO) || defined(POSIX_ASYNC_IO)) - /* In simulated aio we currently have use only for 4 threads */ - - os_aio_use_native_aio = FALSE; - - srv_n_file_io_threads = 4; -#endif - -#ifdef __WIN__ - if (os_get_os_version() == OS_WIN95 - || os_get_os_version() == OS_WIN31) { + if (!os_aio_use_native_aio) { + /* In simulated aio we currently have use only for 4 threads */ - /* On Win 95, 98, ME, and Win32 subsystem for Windows 3.1 use - simulated aio */ + srv_n_file_io_threads = 4; - os_aio_use_native_aio = FALSE; - srv_n_file_io_threads = 4; - } else { - /* On NT and Win 2000 always use aio */ - os_aio_use_native_aio = TRUE; - } -#endif - os_aio_use_native_aio = FALSE; - - if (!os_aio_use_native_aio) { os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD * srv_n_file_io_threads, srv_n_file_io_threads, @@ -1047,15 +1060,6 @@ innobase_start_or_create_for_mysql(void) lock_sys_create(srv_lock_table_size); -#ifdef POSIX_ASYNC_IO - if (os_aio_use_native_aio) { - /* There is only one thread per async io array: - one for ibuf i/o, one for log i/o, one for ordinary reads, - one for ordinary writes; we need only 4 i/o threads */ - - srv_n_file_io_threads = 4; - } -#endif /* Create i/o-handler threads: */ for (i = 0; i < srv_n_file_io_threads; i++) { diff --git a/innobase/sync/sync0rw.c 
b/innobase/sync/sync0rw.c index fe837b119f3..b214bca0470 100644 --- a/innobase/sync/sync0rw.c +++ b/innobase/sync/sync0rw.c @@ -663,7 +663,8 @@ rw_lock_own( /*========*/ /* out: TRUE if locked */ rw_lock_t* lock, /* in: rw-lock */ - ulint lock_type) /* in: lock type */ + ulint lock_type) /* in: lock type: RW_LOCK_SHARED, + RW_LOCK_EX */ { rw_lock_debug_t* info; diff --git a/innobase/sync/sync0sync.c b/innobase/sync/sync0sync.c index 3ea996afd6b..376be2e723a 100644 --- a/innobase/sync/sync0sync.c +++ b/innobase/sync/sync0sync.c @@ -901,8 +901,7 @@ sync_thread_levels_empty_gen( if (slot->latch != NULL && (!dict_mutex_allowed || (slot->level != SYNC_DICT - && slot->level != SYNC_FOREIGN_KEY_CHECK - && slot->level != SYNC_PURGE_IS_RUNNING))) { + && slot->level != SYNC_DICT_OPERATION))) { lock = slot->latch; mutex = slot->latch; @@ -1087,12 +1086,10 @@ sync_thread_add_level( SYNC_IBUF_PESS_INSERT_MUTEX)); } else if (level == SYNC_DICT_AUTOINC_MUTEX) { ut_a(sync_thread_levels_g(array, SYNC_DICT_AUTOINC_MUTEX)); - } else if (level == SYNC_FOREIGN_KEY_CHECK) { - ut_a(sync_thread_levels_g(array, SYNC_FOREIGN_KEY_CHECK)); + } else if (level == SYNC_DICT_OPERATION) { + ut_a(sync_thread_levels_g(array, SYNC_DICT_OPERATION)); } else if (level == SYNC_DICT_HEADER) { ut_a(sync_thread_levels_g(array, SYNC_DICT_HEADER)); - } else if (level == SYNC_PURGE_IS_RUNNING) { - ut_a(sync_thread_levels_g(array, SYNC_PURGE_IS_RUNNING)); } else if (level == SYNC_DICT) { ut_a(buf_debug_prints || sync_thread_levels_g(array, SYNC_DICT)); diff --git a/innobase/trx/trx0purge.c b/innobase/trx/trx0purge.c index 97362d00b4b..d58240d3c11 100644 --- a/innobase/trx/trx0purge.c +++ b/innobase/trx/trx0purge.c @@ -209,9 +209,6 @@ trx_purge_sys_create(void) purge_sys->purge_undo_no = ut_dulint_zero; purge_sys->next_stored = FALSE; - rw_lock_create(&(purge_sys->purge_is_running)); - rw_lock_set_level(&(purge_sys->purge_is_running), - SYNC_PURGE_IS_RUNNING); rw_lock_create(&(purge_sys->latch)); 
rw_lock_set_level(&(purge_sys->latch), SYNC_PURGE_LATCH); diff --git a/innobase/trx/trx0trx.c b/innobase/trx/trx0trx.c index 994a6777924..7566fe1839e 100644 --- a/innobase/trx/trx0trx.c +++ b/innobase/trx/trx0trx.c @@ -23,7 +23,7 @@ Created 3/26/1996 Heikki Tuuri #include "srv0srv.h" #include "thr0loc.h" #include "btr0sea.h" - +#include "os0proc.h" /* Copy of the prototype for innobase_mysql_print_thd: this copy MUST be equal to the one in mysql/sql/ha_innobase.cc ! */ @@ -85,12 +85,14 @@ trx_create( trx->conc_state = TRX_NOT_STARTED; trx->start_time = time(NULL); + trx->isolation_level = TRX_ISO_REPEATABLE_READ; trx->check_foreigns = TRUE; trx->check_unique_secondary = TRUE; trx->dict_operation = FALSE; trx->mysql_thd = NULL; + trx->mysql_query_str = NULL; trx->n_mysql_tables_in_use = 0; trx->mysql_n_tables_locked = 0; @@ -132,7 +134,7 @@ trx_create( trx->lock_heap = mem_heap_create_in_buffer(256); UT_LIST_INIT(trx->trx_locks); - trx->has_dict_foreign_key_check_lock = FALSE; + trx->has_dict_operation_lock = FALSE; trx->has_search_latch = FALSE; trx->search_latch_timeout = BTR_SEA_TIMEOUT; @@ -175,6 +177,8 @@ trx_allocate_for_mysql(void) mutex_exit(&kernel_mutex); trx->mysql_thread_id = os_thread_get_curr_id(); + + trx->mysql_process_no = os_proc_get_number(); return(trx); } @@ -1497,9 +1501,12 @@ trx_print( default: buf += sprintf(buf, " state %lu", trx->conc_state); } +#ifdef UNIV_LINUX + buf += sprintf(buf, ", process no %lu", trx->mysql_process_no); +#else buf += sprintf(buf, ", OS thread id %lu", os_thread_pf(trx->mysql_thread_id)); - +#endif if (ut_strlen(trx->op_info) > 0) { buf += sprintf(buf, " %s", trx->op_info); } diff --git a/sql/ha_innodb.cc b/sql/ha_innodb.cc index 9e174d7ee59..f4125f2259e 100644 --- a/sql/ha_innodb.cc +++ b/sql/ha_innodb.cc @@ -97,6 +97,8 @@ are determined in innobase_init below: */ char* innobase_data_home_dir = NULL; char* innobase_log_group_home_dir = NULL; char* innobase_log_arch_dir = NULL; +/* The following has a misleading
name: starting from 4.0.5 this also +affects Windows */ char* innobase_unix_file_flush_method = NULL; /* Below we have boolean-valued start-up parameters, and their default @@ -346,7 +348,8 @@ check_trx_exists( trx = trx_allocate_for_mysql(); trx->mysql_thd = thd; - + trx->mysql_query_str = &((*thd).query); + thd->transaction.all.innobase_tid = trx; /* The execution of a single SQL statement is denoted by @@ -713,9 +716,10 @@ innobase_init(void) DBUG_RETURN(TRUE); } - srv_unix_file_flush_method_str = (innobase_unix_file_flush_method ? + + srv_file_flush_method_str = (innobase_unix_file_flush_method ? innobase_unix_file_flush_method : - (char*)"fdatasync"); + NULL); srv_n_log_groups = (ulint) innobase_mirrored_log_groups; srv_n_log_files = (ulint) innobase_log_files_in_group; @@ -725,8 +729,6 @@ innobase_init(void) srv_log_buffer_size = (ulint) innobase_log_buffer_size; srv_flush_log_at_trx_commit = (ulint) innobase_flush_log_at_trx_commit; - srv_use_native_aio = 0; - srv_pool_size = (ulint) innobase_buffer_pool_size; srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size; @@ -2179,8 +2181,16 @@ convert_search_mode_to_innobase( case HA_READ_AFTER_KEY: return(PAGE_CUR_G); case HA_READ_BEFORE_KEY: return(PAGE_CUR_L); case HA_READ_PREFIX: return(PAGE_CUR_GE); - case HA_READ_PREFIX_LAST: return(PAGE_CUR_LE); - /* HA_READ_PREFIX_LAST does not yet work in InnoDB! */ + case HA_READ_PREFIX_LAST: + /* ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: Using HA_READ_PREFIX_LAST\n"); */ + return(PAGE_CUR_LE); + + /* InnoDB does not yet support ..PREFIX_LAST! + We have to add a new search flag + PAGE_CUR_LE_OR_PREFIX to InnoDB. 
*/ + /* the above PREFIX flags mean that the last field in the key value may just be a prefix of the complete fixed length field */ @@ -3639,7 +3649,6 @@ ha_innobase::reset(void) return(0); } - /********************************************************************** When we create a temporary table inside MySQL LOCK TABLES, MySQL will not call external_lock for the temporary table when it uses it. Instead, @@ -3661,6 +3670,14 @@ ha_innobase::start_stmt( innobase_release_stat_resources(trx); trx_mark_sql_stat_end(trx); + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED + && trx->read_view) { + /* At low transaction isolation levels we let + each consistent read set its own snapshot */ + + read_view_close_for_mysql(trx); + } + auto_inc_counter_for_this_stat = 0; prebuilt->sql_stat_start = TRUE; prebuilt->hint_no_need_to_fetch_extra_cols = TRUE; @@ -3681,6 +3698,24 @@ ha_innobase::start_stmt( } /********************************************************************** +Maps a MySQL trx isolation level code to the InnoDB isolation level code */ +inline +ulint +innobase_map_isolation_level( +/*=========================*/ + /* out: InnoDB isolation level */ + enum_tx_isolation iso) /* in: MySQL isolation level code */ +{ + switch(iso) { + case ISO_READ_COMMITTED: return(TRX_ISO_READ_COMMITTED); + case ISO_REPEATABLE_READ: return(TRX_ISO_REPEATABLE_READ); + case ISO_SERIALIZABLE: return(TRX_ISO_SERIALIZABLE); + case ISO_READ_UNCOMMITTED: return(TRX_ISO_READ_UNCOMMITTED); + default: ut_a(0); return(0); + } +} + +/********************************************************************** As MySQL will execute an external lock for every new table it uses when it starts to process an SQL statement (an exception is when MySQL calls start_stmt for the handle) we can use this function to store the pointer to @@ -3726,7 +3761,13 @@ ha_innobase::external_lock( thd->transaction.all.innodb_active_trans = 1; trx->n_mysql_tables_in_use++; - if (thd->variables.tx_isolation == 
ISO_SERIALIZABLE + if (thd->variables.tx_isolation != ISO_REPEATABLE_READ) { + trx->isolation_level = innobase_map_isolation_level( + (enum_tx_isolation) + thd->variables.tx_isolation); + } + + if (trx->isolation_level == TRX_ISO_SERIALIZABLE && prebuilt->select_lock_type == LOCK_NONE) { /* To get serializable execution we let InnoDB @@ -3753,6 +3794,15 @@ ha_innobase::external_lock( innobase_release_stat_resources(trx); + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED + && trx->read_view) { + + /* At low transaction isolation levels we let + each consistent read set its own snapshot */ + + read_view_close_for_mysql(trx); + } + if (!(thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) { @@ -3777,14 +3827,13 @@ innodb_show_status( char* buf; DBUG_ENTER("innodb_show_status"); - + if (innodb_skip) { + fprintf(stderr, + "Cannot call SHOW INNODB STATUS because skip-innodb is defined\n"); - fprintf(stderr, - "Cannot call SHOW INNODB STATUS because skip-innodb is defined\n"); - - DBUG_RETURN(-1); - } + DBUG_RETURN(-1); + } /* We let the InnoDB Monitor to output at most 100 kB of text, add a safety margin of 10 kB for buffer overruns */ diff --git a/sql/ha_innodb.h b/sql/ha_innodb.h index 357fb31b5e3..d2639f39c5b 100644 --- a/sql/ha_innodb.h +++ b/sql/ha_innodb.h @@ -96,7 +96,7 @@ class ha_innobase: public handler ulong index_flags(uint idx) const { return (HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER | - HA_KEY_READ_ONLY | HA_NOT_READ_PREFIX_LAST); + HA_KEY_READ_ONLY); } uint max_record_length() const { return HA_MAX_REC_LENGTH; } uint max_keys() const { return MAX_KEY; } diff --git a/sql/mysqld.cc b/sql/mysqld.cc index b2114b2192a..ebc6100a71b 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -3879,7 +3879,7 @@ static void set_options(void) /* Set default values for some variables */ global_system_variables.table_type=DB_TYPE_MYISAM; - global_system_variables.tx_isolation=ISO_READ_COMMITTED; + global_system_variables.tx_isolation=ISO_REPEATABLE_READ; 
global_system_variables.select_limit= (ulong) HA_POS_ERROR; max_system_variables.select_limit= (ulong) HA_POS_ERROR; global_system_variables.max_join_size= (ulong) HA_POS_ERROR; @@ -4351,7 +4351,7 @@ get_one_option(int optid, const struct my_option *opt __attribute__((unused)), } global_system_variables.tx_isolation= ((opt_sql_mode & MODE_SERIALIZABLE) ? ISO_SERIALIZABLE : - ISO_READ_COMMITTED); + ISO_REPEATABLE_READ); break; } case OPT_MASTER_PASSWORD: |