path: root/innobase
author     unknown <heikki@hundin.mysql.fi>    2002-10-29 23:16:46 +0200
committer  unknown <heikki@hundin.mysql.fi>    2002-10-29 23:16:46 +0200
commit     3cb98f0d66c8030a3532b67ff74e7211cca4c079 (patch)
tree       e57bf300e559932ce45e0f749d7349577e7e0479 /innobase
parent     2d9a473bb67eb5d46ef3facf9384e2b9a621b79e (diff)
download   mariadb-git-3cb98f0d66c8030a3532b67ff74e7211cca4c079.tar.gz
Many files:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED
  now supported, selective deadlock resolution

mysqld.cc, sql/mysqld.cc:
  Change MySQL default isolation level to REPEATABLE READ; note that InnoDB has
  always had that default, and BDB and MyISAM always run at SERIALIZABLE level
  anyway

sql/ha_innodb.cc, sql/ha_innodb.h and every changed innobase/ file (see the
diffstat below):
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED
  now supported, selective deadlock resolution
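To make the locking consequence of the new isolation levels concrete, here is a
minimal standalone C sketch. It is not part of this commit: the mapping function
and main() are invented, while the constants mirror values added by this patch in
trx0trx.h and lock0lock.h. The idea it illustrates is that a locking read at READ
COMMITTED or below asks only for a record lock, whereas REPEATABLE READ and
SERIALIZABLE keep full next-key locking.

/* Illustrative sketch only, not InnoDB source. */
#include <stdio.h>

#define TRX_ISO_READ_UNCOMMITTED 1
#define TRX_ISO_READ_COMMITTED   2
#define TRX_ISO_REPEATABLE_READ  3    /* new MySQL default; InnoDB default all along */
#define TRX_ISO_SERIALIZABLE     4

#define LOCK_ORDINARY            0    /* next-key lock: record and the gap before it */
#define LOCK_REC_NOT_GAP      1024    /* record only; inserts into the gap stay allowed */

static unsigned long select_gap_mode(unsigned long iso_level)
{
	if (iso_level <= TRX_ISO_READ_COMMITTED) {
		/* READ COMMITTED / READ UNCOMMITTED: lock only the index
		record, do not block inserts into the preceding gap */
		return(LOCK_REC_NOT_GAP);
	}

	/* REPEATABLE READ and SERIALIZABLE keep full next-key locking */
	return(LOCK_ORDINARY);
}

int main(void)
{
	printf("READ COMMITTED  -> %lu\n", select_gap_mode(TRX_ISO_READ_COMMITTED));
	printf("REPEATABLE READ -> %lu\n", select_gap_mode(TRX_ISO_REPEATABLE_READ));

	return(0);
}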
Diffstat (limited to 'innobase')
-rw-r--r--  innobase/btr/btr0btr.c          |   4
-rw-r--r--  innobase/btr/btr0cur.c          |  43
-rw-r--r--  innobase/btr/btr0pcur.c         |   1
-rw-r--r--  innobase/buf/buf0buf.c          |  32
-rw-r--r--  innobase/buf/buf0flu.c          |  19
-rw-r--r--  innobase/dict/dict0dict.c       |  18
-rw-r--r--  innobase/fil/fil0fil.c          |  16
-rw-r--r--  innobase/fsp/fsp0fsp.c          |  24
-rw-r--r--  innobase/ibuf/ibuf0ibuf.c       |   3
-rw-r--r--  innobase/include/buf0buf.h      |  17
-rw-r--r--  innobase/include/dict0dict.h    |  14
-rw-r--r--  innobase/include/fil0fil.h      |   2
-rw-r--r--  innobase/include/lock0lock.h    |  48
-rw-r--r--  innobase/include/os0file.h      |   3
-rw-r--r--  innobase/include/os0proc.h      |   9
-rw-r--r--  innobase/include/os0thread.h    |   7
-rw-r--r--  innobase/include/page0cur.h     |   7
-rw-r--r--  innobase/include/page0page.h    |  10
-rw-r--r--  innobase/include/read0read.h    |   8
-rw-r--r--  innobase/include/rem0rec.h      |  12
-rw-r--r--  innobase/include/rem0rec.ic     |  18
-rw-r--r--  innobase/include/srv0srv.h      |  30
-rw-r--r--  innobase/include/sync0rw.h      |   3
-rw-r--r--  innobase/include/sync0sync.h    |   6
-rw-r--r--  innobase/include/trx0purge.h    |   3
-rw-r--r--  innobase/include/trx0trx.h      |  46
-rw-r--r--  innobase/lock/lock0lock.c       | 552
-rw-r--r--  innobase/mem/mem0dbg.c          |  12
-rw-r--r--  innobase/os/os0file.c           |  45
-rw-r--r--  innobase/os/os0proc.c           |  17
-rw-r--r--  innobase/page/page0cur.c        |  52
-rw-r--r--  innobase/page/page0page.c       | 196
-rw-r--r--  innobase/pars/lexyy.c           |   9
-rw-r--r--  innobase/pars/pars0grm.c        |   2
-rw-r--r--  innobase/read/read0read.c       |  22
-rw-r--r--  innobase/row/row0ins.c          | 212
-rw-r--r--  innobase/row/row0mysql.c        |  64
-rw-r--r--  innobase/row/row0purge.c        |  18
-rw-r--r--  innobase/row/row0sel.c          | 241
-rw-r--r--  innobase/row/row0uins.c         |   6
-rw-r--r--  innobase/row/row0undo.c         |   7
-rw-r--r--  innobase/row/row0upd.c          |  30
-rw-r--r--  innobase/srv/srv0srv.c          |  54
-rw-r--r--  innobase/srv/srv0start.c        |  92
-rw-r--r--  innobase/sync/sync0rw.c         |   3
-rw-r--r--  innobase/sync/sync0sync.c       |   9
-rw-r--r--  innobase/trx/trx0purge.c        |   3
-rw-r--r--  innobase/trx/trx0trx.c          |  13
48 files changed, 1505 insertions, 557 deletions
diff --git a/innobase/btr/btr0btr.c b/innobase/btr/btr0btr.c
index 7a7678b2dcf..62a86d342a2 100644
--- a/innobase/btr/btr0btr.c
+++ b/innobase/btr/btr0btr.c
@@ -274,6 +274,7 @@ btr_page_create(
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
MTR_MEMO_PAGE_X_FIX));
page_create(page, mtr);
+ buf_block_align(page)->check_index_page_at_flush = TRUE;
btr_page_set_index_id(page, tree->id, mtr);
}
@@ -713,6 +714,7 @@ btr_create(
/* Create a new index page on the the allocated segment page */
page = page_create(frame, mtr);
+ buf_block_align(page)->check_index_page_at_flush = TRUE;
/* Set the index id of the page */
btr_page_set_index_id(page, index_id, mtr);
@@ -847,6 +849,7 @@ btr_page_reorganize_low(
segment headers, next page-field, etc.) is preserved intact */
page_create(page, mtr);
+ buf_block_align(page)->check_index_page_at_flush = TRUE;
/* Copy the records from the temporary space to the recreated page;
do not copy the lock bits yet */
@@ -919,6 +922,7 @@ btr_page_empty(
segment headers, next page-field, etc.) is preserved intact */
page_create(page, mtr);
+ buf_block_align(page)->check_index_page_at_flush = TRUE;
}
/*****************************************************************
diff --git a/innobase/btr/btr0cur.c b/innobase/btr/btr0cur.c
index 3d6b63def6c..24f0447d55d 100644
--- a/innobase/btr/btr0cur.c
+++ b/innobase/btr/btr0cur.c
@@ -121,16 +121,19 @@ btr_cur_latch_leaves(
{
ulint left_page_no;
ulint right_page_no;
-
+ page_t* get_page;
+
ut_ad(tree && page && mtr);
if (latch_mode == BTR_SEARCH_LEAF) {
- btr_page_get(space, page_no, RW_S_LATCH, mtr);
+ get_page = btr_page_get(space, page_no, RW_S_LATCH, mtr);
+ buf_block_align(get_page)->check_index_page_at_flush = TRUE;
} else if (latch_mode == BTR_MODIFY_LEAF) {
- btr_page_get(space, page_no, RW_X_LATCH, mtr);
+ get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr);
+ buf_block_align(get_page)->check_index_page_at_flush = TRUE;
} else if (latch_mode == BTR_MODIFY_TREE) {
@@ -138,15 +141,22 @@ btr_cur_latch_leaves(
left_page_no = btr_page_get_prev(page, mtr);
if (left_page_no != FIL_NULL) {
- btr_page_get(space, left_page_no, RW_X_LATCH, mtr);
+ get_page = btr_page_get(space, left_page_no,
+ RW_X_LATCH, mtr);
+ buf_block_align(get_page)->check_index_page_at_flush =
+ TRUE;
}
- btr_page_get(space, page_no, RW_X_LATCH, mtr);
+ get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr);
+ buf_block_align(get_page)->check_index_page_at_flush = TRUE;
right_page_no = btr_page_get_next(page, mtr);
if (right_page_no != FIL_NULL) {
- btr_page_get(space, right_page_no, RW_X_LATCH, mtr);
+ get_page = btr_page_get(space, right_page_no,
+ RW_X_LATCH, mtr);
+ buf_block_align(get_page)->check_index_page_at_flush =
+ TRUE;
}
} else if (latch_mode == BTR_SEARCH_PREV) {
@@ -157,9 +167,12 @@ btr_cur_latch_leaves(
if (left_page_no != FIL_NULL) {
cursor->left_page = btr_page_get(space, left_page_no,
RW_S_LATCH, mtr);
+ buf_block_align(
+ cursor->left_page)->check_index_page_at_flush = TRUE;
}
- btr_page_get(space, page_no, RW_S_LATCH, mtr);
+ get_page = btr_page_get(space, page_no, RW_S_LATCH, mtr);
+ buf_block_align(get_page)->check_index_page_at_flush = TRUE;
} else if (latch_mode == BTR_MODIFY_PREV) {
@@ -169,9 +182,12 @@ btr_cur_latch_leaves(
if (left_page_no != FIL_NULL) {
cursor->left_page = btr_page_get(space, left_page_no,
RW_X_LATCH, mtr);
+ buf_block_align(
+ cursor->left_page)->check_index_page_at_flush = TRUE;
}
- btr_page_get(space, page_no, RW_X_LATCH, mtr);
+ get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr);
+ buf_block_align(get_page)->check_index_page_at_flush = TRUE;
} else {
ut_error;
}
@@ -274,6 +290,7 @@ btr_cur_search_to_nth_level(
if (btr_search_latch.writer == RW_LOCK_NOT_LOCKED
&& latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ
&& !estimate
+ && mode != PAGE_CUR_LE_OR_EXTENDS
&& btr_search_guess_on_hash(index, info, tuple, mode,
latch_mode, cursor,
has_search_latch, mtr)) {
@@ -334,12 +351,18 @@ btr_cur_search_to_nth_level(
rw_latch = RW_NO_LATCH;
buf_mode = BUF_GET;
+ /* We use these modified search modes on non-leaf levels of the
+ B-tree. These let us end up in the right B-tree leaf. In that leaf
+ we use the original search mode. */
+
if (mode == PAGE_CUR_GE) {
page_mode = PAGE_CUR_L;
} else if (mode == PAGE_CUR_G) {
page_mode = PAGE_CUR_LE;
} else if (mode == PAGE_CUR_LE) {
page_mode = PAGE_CUR_LE;
+ } else if (mode == PAGE_CUR_LE_OR_EXTENDS) {
+ page_mode = PAGE_CUR_LE_OR_EXTENDS;
} else {
ut_ad(mode == PAGE_CUR_L);
page_mode = PAGE_CUR_L;
@@ -390,6 +413,8 @@ retry_page_get:
goto retry_page_get;
}
+
+ buf_block_align(page)->check_index_page_at_flush = TRUE;
#ifdef UNIV_SYNC_DEBUG
if (rw_latch != RW_NO_LATCH) {
@@ -543,6 +568,8 @@ btr_cur_open_at_index_side(
ut_ad(0 == ut_dulint_cmp(tree->id,
btr_page_get_index_id(page)));
+ buf_block_align(page)->check_index_page_at_flush = TRUE;
+
if (height == ULINT_UNDEFINED) {
/* We are in the root node */
diff --git a/innobase/btr/btr0pcur.c b/innobase/btr/btr0pcur.c
index 8ca3d41f7f9..b2115dfdd6c 100644
--- a/innobase/btr/btr0pcur.c
+++ b/innobase/btr/btr0pcur.c
@@ -354,6 +354,7 @@ btr_pcur_move_to_next_page(
ut_ad(next_page_no != FIL_NULL);
next_page = btr_page_get(space, next_page_no, cursor->latch_mode, mtr);
+ buf_block_align(next_page)->check_index_page_at_flush = TRUE;
btr_leaf_page_release(page, cursor->latch_mode, mtr);
diff --git a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c
index ee8e8b91f8d..4524fa1a4f9 100644
--- a/innobase/buf/buf0buf.c
+++ b/innobase/buf/buf0buf.c
@@ -331,6 +331,11 @@ buf_page_print(
index->table_name,
index->name);
}
+ } else if (fil_page_get_type(read_buf) == FIL_PAGE_INODE) {
+ fprintf(stderr, "InnoDB: Page may be an 'inode' page\n");
+ } else if (fil_page_get_type(read_buf) == FIL_PAGE_IBUF_FREE_LIST) {
+ fprintf(stderr,
+ "InnoDB: Page may be an insert buffer free list page\n");
}
}
@@ -351,6 +356,8 @@ buf_block_init(
block->file_page_was_freed = FALSE;
+ block->check_index_page_at_flush = FALSE;
+
rw_lock_create(&(block->lock));
ut_ad(rw_lock_validate(&(block->lock)));
@@ -617,6 +624,29 @@ buf_page_peek_block(
}
/************************************************************************
+Resets the check_index_page_at_flush field of a page if found in the buffer
+pool. */
+
+void
+buf_reset_check_index_page_at_flush(
+/*================================*/
+ ulint space, /* in: space id */
+ ulint offset) /* in: page number */
+{
+ buf_block_t* block;
+
+ mutex_enter_fast(&(buf_pool->mutex));
+
+ block = buf_page_hash_get(space, offset);
+
+ if (block) {
+ block->check_index_page_at_flush = FALSE;
+ }
+
+ mutex_exit(&(buf_pool->mutex));
+}
+
+/************************************************************************
Returns the current state of is_hashed of a page. FALSE if the page is
not in the pool. NOTE that this operation does not fix the page in the
pool if it is found there. */
@@ -1185,6 +1215,8 @@ buf_page_init(
block->space = space;
block->offset = offset;
+ block->check_index_page_at_flush = FALSE;
+
block->lock_hash_val = lock_rec_hash(space, offset);
block->lock_mutex = NULL;
diff --git a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c
index 4c6850af078..78bde60c9b2 100644
--- a/innobase/buf/buf0flu.c
+++ b/innobase/buf/buf0flu.c
@@ -15,6 +15,7 @@ Created 11/11/1995 Heikki Tuuri
#include "ut0byte.h"
#include "ut0lst.h"
+#include "page0page.h"
#include "fil0fil.h"
#include "buf0buf.h"
#include "buf0lru.h"
@@ -225,6 +226,24 @@ buf_flush_buffered_writes(void)
return;
}
+ for (i = 0; i < trx_doublewrite->first_free; i++) {
+ block = trx_doublewrite->buf_block_arr[i];
+
+ if (block->check_index_page_at_flush
+ && !page_simple_validate(block->frame)) {
+
+ buf_page_print(block->frame);
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Apparent corruption of an index page\n"
+ "InnoDB: to be written to data file. We intentionally crash server\n"
+ "InnoDB: to prevent corrupt data from ending up in data\n"
+ "InnoDB: files.\n");
+ ut_a(0);
+ }
+ }
+
if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
len = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
} else {
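The buf0flu.c hunk above adds a last-chance corruption check before the doublewrite
buffer is written out: every queued block that is known to be an index page is run
through page_simple_validate(), and the server is crashed on purpose rather than
letting a corrupt page reach a data file. A minimal sketch of the same pattern
follows; the stub types and the always-succeeding validator are stand-ins, not
InnoDB source.

/* Illustrative sketch only, not InnoDB source. */
#include <assert.h>
#include <stdio.h>

typedef int ibool;
#define TRUE  1
#define FALSE 0

typedef struct {
	ibool		check_index_page_at_flush; /* set when we know this is an index page */
	unsigned char	frame[16384];              /* page image to be written */
} buf_block_t;

/* stand-in for page_simple_validate(): must never crash, even on garbage input */
static ibool page_simple_validate(const unsigned char* page)
{
	(void) page;
	return(TRUE);
}

/* stand-in for the loop added to buf_flush_buffered_writes() */
static void check_blocks_before_doublewrite(buf_block_t** arr, unsigned long n)
{
	unsigned long	i;

	for (i = 0; i < n; i++) {
		buf_block_t*	block = arr[i];

		if (block->check_index_page_at_flush
		    && !page_simple_validate(block->frame)) {

			fprintf(stderr, "Apparent corruption of an index page;"
				" crashing on purpose\n");
			assert(0);	/* plays the role of ut_a(0) */
		}
	}
}

int main(void)
{
	buf_block_t	block = { TRUE, {0} };
	buf_block_t*	arr[1] = { &block };

	check_blocks_before_doublewrite(arr, 1);
	puts("all queued index pages validated");

	return(0);
}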
diff --git a/innobase/dict/dict0dict.c b/innobase/dict/dict0dict.c
index 095c27f1c5f..2ee2c9d18a9 100644
--- a/innobase/dict/dict0dict.c
+++ b/innobase/dict/dict0dict.c
@@ -29,7 +29,14 @@ Created 1/8/1996 Heikki Tuuri
dict_sys_t* dict_sys = NULL; /* the dictionary system */
-rw_lock_t dict_foreign_key_check_lock;
+rw_lock_t dict_operation_lock; /* table create, drop, etc. reserve
+ this in X-mode, implicit or backround
+ operations purge, rollback, foreign
+ key checks reserve this in S-mode; we
+ cannot trust that MySQL protects
+ implicit or background operations
+ from dropping a table: this is our
+ mechanism */
#define DICT_HEAP_SIZE 100 /* initial memory heap size when
creating a table or index object */
@@ -509,9 +516,8 @@ dict_init(void)
UT_LIST_INIT(dict_sys->table_LRU);
- rw_lock_create(&dict_foreign_key_check_lock);
- rw_lock_set_level(&dict_foreign_key_check_lock,
- SYNC_FOREIGN_KEY_CHECK);
+ rw_lock_create(&dict_operation_lock);
+ rw_lock_set_level(&dict_operation_lock, SYNC_DICT_OPERATION);
}
/**************************************************************************
@@ -1851,14 +1857,14 @@ loop:
/*************************************************************************
Accepts a specified string. Comparisons are case-insensitive. */
-static
+
char*
dict_accept(
/*========*/
/* out: if string was accepted, the pointer
is moved after that, else ptr is returned */
char* ptr, /* in: scan from this */
- const char* string, /* in: accept only this string as the next
+ const char* string,/* in: accept only this string as the next
non-whitespace string */
ibool* success)/* out: TRUE if accepted */
{
diff --git a/innobase/fil/fil0fil.c b/innobase/fil/fil0fil.c
index 3e0f21395ef..98980f6c337 100644
--- a/innobase/fil/fil0fil.c
+++ b/innobase/fil/fil0fil.c
@@ -967,6 +967,7 @@ fil_extend_last_data_file(
fil_node_t* node;
fil_space_t* space;
fil_system_t* system = fil_system;
+ byte* buf2;
byte* buf;
ibool success;
ulint i;
@@ -981,19 +982,23 @@ fil_extend_last_data_file(
fil_node_prepare_for_io(node, system, space);
- buf = mem_alloc(1024 * 1024);
+ buf2 = mem_alloc(1024 * 1024 + UNIV_PAGE_SIZE);
+ buf = ut_align(buf2, UNIV_PAGE_SIZE);
memset(buf, '\0', 1024 * 1024);
for (i = 0; i < size_increase / ((1024 * 1024) / UNIV_PAGE_SIZE); i++) {
- success = os_file_write(node->name, node->handle, buf,
+ /* If we use native Windows aio, then also this write is
+ done using it */
+
+ success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC,
+ node->name, node->handle, buf,
(node->size << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFF,
node->size >> (32 - UNIV_PAGE_SIZE_SHIFT),
- 1024 * 1024);
+ 1024 * 1024, NULL, NULL);
if (!success) {
-
break;
}
@@ -1003,7 +1008,7 @@ fil_extend_last_data_file(
os_has_said_disk_full = FALSE;
}
- mem_free(buf);
+ mem_free(buf2);
fil_node_complete_io(node, system, OS_FILE_WRITE);
@@ -1528,7 +1533,6 @@ fil_page_set_type(
ulint type) /* in: type */
{
ut_ad(page);
- ut_ad((type == FIL_PAGE_INDEX) || (type == FIL_PAGE_UNDO_LOG));
mach_write_to_2(page + FIL_PAGE_TYPE, type);
}
diff --git a/innobase/fsp/fsp0fsp.c b/innobase/fsp/fsp0fsp.c
index 1abb043fdc2..ff586819d4a 100644
--- a/innobase/fsp/fsp0fsp.c
+++ b/innobase/fsp/fsp0fsp.c
@@ -769,6 +769,8 @@ fsp_init_file_page_low(
#endif
page = buf_frame_align(ptr);
+ buf_block_align(page)->check_index_page_at_flush = FALSE;
+
#ifdef UNIV_BASIC_LOG_DEBUG
/* printf("In log debug version: Erase the contents of the file page\n");
*/
@@ -1097,7 +1099,7 @@ fsp_fill_free_list(
/* Initialize the ibuf page in a separate
mini-transaction because it is low in the latching
- order, and we must be able to release the its latch
+ order, and we must be able to release its latch
before returning from the fsp routine */
mtr_start(&ibuf_mtr);
@@ -1264,7 +1266,12 @@ fsp_alloc_free_page(
free = xdes_find_bit(descr, XDES_FREE_BIT, TRUE,
hint % FSP_EXTENT_SIZE, mtr);
- ut_a(free != ULINT_UNDEFINED);
+ if (free == ULINT_UNDEFINED) {
+
+ ut_print_buf(((byte*)descr) - 500, 1000);
+
+ ut_a(0);
+ }
xdes_set_bit(descr, XDES_FREE_BIT, free, FALSE, mtr);
@@ -1412,7 +1419,12 @@ fsp_free_extent(
descr = xdes_get_descriptor_with_space_hdr(header, space, page, mtr);
- ut_a(xdes_get_state(descr, mtr) != XDES_FREE);
+ if (xdes_get_state(descr, mtr) == XDES_FREE) {
+
+ ut_print_buf(((byte*)descr) - 500, 1000);
+
+ ut_a(0);
+ }
xdes_init(descr, mtr);
@@ -1523,6 +1535,10 @@ fsp_alloc_seg_inode_page(
page = buf_page_get(space, page_no, RW_X_LATCH, mtr);
+ buf_block_align(page)->check_index_page_at_flush = FALSE;
+
+ fil_page_set_type(page, FIL_PAGE_INODE);
+
buf_page_dbg_add_level(page, SYNC_FSP_PAGE);
for (i = 0; i < FSP_SEG_INODES_PER_PAGE; i++) {
@@ -2298,6 +2314,8 @@ fseg_alloc_free_page_low(
fseg_mark_page_used(seg_inode, space, ret_page, mtr);
}
+ buf_reset_check_index_page_at_flush(space, ret_page);
+
return(ret_page);
}
diff --git a/innobase/ibuf/ibuf0ibuf.c b/innobase/ibuf/ibuf0ibuf.c
index b7d691485cc..143b3bfa584 100644
--- a/innobase/ibuf/ibuf0ibuf.c
+++ b/innobase/ibuf/ibuf0ibuf.c
@@ -1295,6 +1295,8 @@ ibuf_add_free_page(
flst_add_last(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr);
+ fil_page_set_type(page, FIL_PAGE_IBUF_FREE_LIST);
+
ibuf_data->seg_size++;
ibuf_data->free_list_len++;
@@ -1305,6 +1307,7 @@ ibuf_add_free_page(
ibuf_bitmap_page_set_bits(bitmap_page, page_no, IBUF_BITMAP_IBUF,
TRUE, &mtr);
+
mtr_commit(&mtr);
mutex_exit(&ibuf_mutex);
diff --git a/innobase/include/buf0buf.h b/innobase/include/buf0buf.h
index 591c0ec54ab..f76c437bd1d 100644
--- a/innobase/include/buf0buf.h
+++ b/innobase/include/buf0buf.h
@@ -274,6 +274,15 @@ buf_page_peek_block(
ulint space, /* in: space id */
ulint offset);/* in: page number */
/************************************************************************
+Resets the check_index_page_at_flush field of a page if found in the buffer
+pool. */
+
+void
+buf_reset_check_index_page_at_flush(
+/*================================*/
+ ulint space, /* in: space id */
+ ulint offset);/* in: page number */
+/************************************************************************
Sets file_page_was_freed TRUE if the page is found in the buffer pool.
This function should be called when we free a file page and want the
debug version to check that it is not accessed any more unless
@@ -648,6 +657,14 @@ struct buf_block_struct{
then it can wait for this rw-lock */
buf_block_t* hash; /* node used in chaining to the page
hash table */
+ ibool check_index_page_at_flush;
+ /* TRUE if we know that this is
+ an index page, and want the database
+ to check its consistency before flush;
+ note that there may be pages in the
+ buffer pool which are index pages,
+ but this flag is not set because
+ we do not keep track of all pages */
/* 2. Page flushing fields */
UT_LIST_NODE_T(buf_block_t) flush_list;
diff --git a/innobase/include/dict0dict.h b/innobase/include/dict0dict.h
index dd92c5aa467..b5e6e04a1de 100644
--- a/innobase/include/dict0dict.h
+++ b/innobase/include/dict0dict.h
@@ -26,6 +26,18 @@ Created 1/8/1996 Heikki Tuuri
#include "ut0byte.h"
#include "trx0types.h"
+/*************************************************************************
+Accepts a specified string. Comparisons are case-insensitive. */
+
+char*
+dict_accept(
+/*========*/
+ /* out: if string was accepted, the pointer
+ is moved after that, else ptr is returned */
+ char* ptr, /* in: scan from this */
+ const char* string,/* in: accept only this string as the next
+ non-whitespace string */
+ ibool* success);/* out: TRUE if accepted */
/************************************************************************
Decrements the count of open MySQL handles to a table. */
@@ -798,7 +810,7 @@ dict_mutex_exit_for_mysql(void);
extern dict_sys_t* dict_sys; /* the dictionary system */
-extern rw_lock_t dict_foreign_key_check_lock;
+extern rw_lock_t dict_operation_lock;
/* Dictionary system struct */
struct dict_sys_struct{
diff --git a/innobase/include/fil0fil.h b/innobase/include/fil0fil.h
index 63e20221c16..23ef0304b2d 100644
--- a/innobase/include/fil0fil.h
+++ b/innobase/include/fil0fil.h
@@ -73,6 +73,8 @@ extern fil_addr_t fil_addr_null;
/* File page types */
#define FIL_PAGE_INDEX 17855
#define FIL_PAGE_UNDO_LOG 2
+#define FIL_PAGE_INODE 3
+#define FIL_PAGE_IBUF_FREE_LIST 4
/* Space types */
#define FIL_TABLESPACE 501
diff --git a/innobase/include/lock0lock.h b/innobase/include/lock0lock.h
index 288356d3270..d3b3d55d015 100644
--- a/innobase/include/lock0lock.h
+++ b/innobase/include/lock0lock.h
@@ -292,16 +292,12 @@ lock_sec_rec_modify_check_and_lock(
dict_index_t* index, /* in: secondary index */
que_thr_t* thr); /* in: query thread */
/*************************************************************************
-Checks if locks of other transactions prevent an immediate read, or passing
-over by a read cursor, of a clustered index record. If they do, first tests
-if the query thread should anyway be suspended for some reason; if not, then
-puts the transaction and the query thread to the lock wait state and inserts a
-waiting request for a record lock to the lock queue. Sets the requested mode
-lock on the record. */
+Like the counterpart for a clustered index below, but now we read a
+secondary index record. */
ulint
-lock_clust_rec_read_check_and_lock(
-/*===============================*/
+lock_sec_rec_read_check_and_lock(
+/*=============================*/
/* out: DB_SUCCESS, DB_LOCK_WAIT,
DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set,
@@ -309,18 +305,24 @@ lock_clust_rec_read_check_and_lock(
rec_t* rec, /* in: user record or page supremum record
which should be read or passed over by a read
cursor */
- dict_index_t* index, /* in: clustered index */
+ dict_index_t* index, /* in: secondary index */
ulint mode, /* in: mode of the lock which the read cursor
should set on records: LOCK_S or LOCK_X; the
latter is possible in SELECT FOR UPDATE */
+ ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
que_thr_t* thr); /* in: query thread */
/*************************************************************************
-Like the counterpart for a clustered index above, but now we read a
-secondary index record. */
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record. */
ulint
-lock_sec_rec_read_check_and_lock(
-/*=============================*/
+lock_clust_rec_read_check_and_lock(
+/*===============================*/
/* out: DB_SUCCESS, DB_LOCK_WAIT,
DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set,
@@ -328,10 +330,12 @@ lock_sec_rec_read_check_and_lock(
rec_t* rec, /* in: user record or page supremum record
which should be read or passed over by a read
cursor */
- dict_index_t* index, /* in: secondary index */
+ dict_index_t* index, /* in: clustered index */
ulint mode, /* in: mode of the lock which the read cursor
should set on records: LOCK_S or LOCK_X; the
latter is possible in SELECT FOR UPDATE */
+ ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
que_thr_t* thr); /* in: query thread */
/*************************************************************************
Checks that a record is seen in a consistent read. */
@@ -509,6 +513,7 @@ lock_validate(void);
extern lock_sys_t* lock_sys;
/* Lock modes and types */
+/* Basic modes */
#define LOCK_NONE 0 /* this flag is used elsewhere to note
consistent read */
#define LOCK_IS 2 /* intention shared */
@@ -519,15 +524,20 @@ extern lock_sys_t* lock_sys;
in an exclusive mode */
#define LOCK_MODE_MASK 0xF /* mask used to extract mode from the
type_mode field in a lock */
+/* Lock types */
#define LOCK_TABLE 16 /* these type values should be so high that */
#define LOCK_REC 32 /* they can be ORed to the lock mode */
#define LOCK_TYPE_MASK 0xF0 /* mask used to extract lock type from the
type_mode field in a lock */
+/* Waiting lock flag */
#define LOCK_WAIT 256 /* this wait bit should be so high that
it can be ORed to the lock mode and type;
when this bit is set, it means that the
lock has not yet been granted, it is just
waiting for its turn in the wait queue */
+/* Precise modes */
+#define LOCK_ORDINARY 0 /* this flag denotes an ordinary next-key lock
+ in contrast to LOCK_GAP or LOCK_REC_NOT_GAP */
#define LOCK_GAP 512 /* this gap bit should be so high that
it can be ORed to the other flags;
when this bit is set, it means that the
@@ -537,7 +547,15 @@ extern lock_sys_t* lock_sys;
the bit is set; locks of this type are created
when records are removed from the index chain
of records */
-#define LOCK_INSERT_INTENTION 1024 /* this bit is set when we place a waiting
+#define LOCK_REC_NOT_GAP 1024 /* this bit means that the lock is only on
+ the index record and does NOT block inserts
+ to the gap before the index record; this is
+ used in the case when we retrieve a record
+ with a unique key, and is also used in
+ locking plain SELECTs (not part of UPDATE
+ or DELETE) when the user has set the READ
+ COMMITTED isolation level */
+#define LOCK_INSERT_INTENTION 2048 /* this bit is set when we place a waiting
gap type record lock request in order to let
an insert of an index record to wait until
there are no conflicting locks by other
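The precise-mode flags above are ORed into the single type_mode word of a lock
struct. The small standalone sketch below shows how that word can be taken apart
with LOCK_MODE_MASK and how the new compatibility rule between gap and record-only
locks plays out. The helper function is invented, the LOCK_S/LOCK_X values are
representative, and the remaining constants are those defined in the header above.

/* Illustrative sketch only, not InnoDB source. */
#include <stdio.h>

typedef int ibool;
#define TRUE  1
#define FALSE 0

#define LOCK_S			4	/* shared (value representative) */
#define LOCK_X			5	/* exclusive (value representative) */
#define LOCK_MODE_MASK		0xF

#define LOCK_REC		32
#define LOCK_TYPE_MASK		0xF0

#define LOCK_WAIT		256
#define LOCK_ORDINARY		0
#define LOCK_GAP		512
#define LOCK_REC_NOT_GAP	1024
#define LOCK_INSERT_INTENTION	2048

/* Invented helper: does a new record lock request (type_mode1) have to wait for
an existing lock (type_mode2) of another transaction? Only the gap/not-gap part
of the rule is shown; mode compatibility is reduced to "S is compatible with S". */
static ibool rec_lock_has_to_wait(unsigned long type_mode1, unsigned long type_mode2)
{
	if ((type_mode1 & LOCK_MODE_MASK) == LOCK_S
	    && (type_mode2 & LOCK_MODE_MASK) == LOCK_S) {

		return(FALSE);		/* S is compatible with S */
	}

	if ((type_mode1 & LOCK_REC_NOT_GAP) && (type_mode2 & LOCK_GAP)) {

		return(FALSE);		/* record-only lock ignores gap locks */
	}

	if ((type_mode1 & LOCK_GAP) && (type_mode2 & LOCK_REC_NOT_GAP)) {

		return(FALSE);		/* gap lock ignores record-only locks */
	}

	if (type_mode2 & LOCK_INSERT_INTENTION) {

		return(FALSE);		/* nothing waits for an insert intention lock */
	}

	return(TRUE);
}

int main(void)
{
	unsigned long	not_gap_x = LOCK_REC | LOCK_X | LOCK_REC_NOT_GAP;
	unsigned long	gap_s	  = LOCK_REC | LOCK_S | LOCK_GAP;

	printf("X rec-not-gap vs S gap: wait=%d\n",
	       rec_lock_has_to_wait(not_gap_x, gap_s));

	return(0);
}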
diff --git a/innobase/include/os0file.h b/innobase/include/os0file.h
index d65c7fd47e3..a7624a90d5e 100644
--- a/innobase/include/os0file.h
+++ b/innobase/include/os0file.h
@@ -111,6 +111,7 @@ log. */
#define OS_WIN31 1
#define OS_WIN95 2
#define OS_WINNT 3
+#define OS_WIN2000 4
extern ulint os_n_file_reads;
extern ulint os_n_file_writes;
@@ -122,7 +123,7 @@ Gets the operating system version. Currently works only on Windows. */
ulint
os_get_os_version(void);
/*===================*/
- /* out: OS_WIN95, OS_WIN31, OS_WINNT (2000 == NT) */
+ /* out: OS_WIN95, OS_WIN31, OS_WINNT, or OS_WIN2000 */
/********************************************************************
Creates the seek mutexes used in positioned reads and writes. */
diff --git a/innobase/include/os0proc.h b/innobase/include/os0proc.h
index 9da1f33e070..79750e5c1f7 100644
--- a/innobase/include/os0proc.h
+++ b/innobase/include/os0proc.h
@@ -16,6 +16,15 @@ typedef void* os_process_t;
typedef unsigned long int os_process_id_t;
/********************************************************************
+Converts the current process id to a number. It is not guaranteed that the
+number is unique. In Linux returns the 'process number' of the current
+thread. That number is the same as one sees in 'top', for example. In Linux
+the thread id is not the same as one sees in 'top'. */
+
+ulint
+os_proc_get_number(void);
+/*====================*/
+/********************************************************************
Allocates non-cacheable memory. */
void*
diff --git a/innobase/include/os0thread.h b/innobase/include/os0thread.h
index 8355afa46e9..efc8651e06d 100644
--- a/innobase/include/os0thread.h
+++ b/innobase/include/os0thread.h
@@ -16,11 +16,8 @@ Created 9/8/1995 Heikki Tuuri
this is also the size of the wait slot array for MySQL threads which
can wait inside InnoDB */
#ifdef __WIN__
-/* Windows 95/98/ME seemed to have difficulties creating the all
-the event semaphores for the wait array slots. If the computer had
-<= 64 MB memory, InnoDB startup could take minutes or even crash.
-That is why we set this to only 1000 in Windows. */
-
+/* Create less event semaphores because Win 98/ME had difficult creating
+40000 event semaphores */
#define OS_THREAD_MAX_N 1000
#else
#define OS_THREAD_MAX_N 10000
diff --git a/innobase/include/page0cur.h b/innobase/include/page0cur.h
index 144e0e02b21..c3f0decdb4b 100644
--- a/innobase/include/page0cur.h
+++ b/innobase/include/page0cur.h
@@ -26,7 +26,12 @@ Created 10/4/1994 Heikki Tuuri
#define PAGE_CUR_GE 2
#define PAGE_CUR_L 3
#define PAGE_CUR_LE 4
-#define PAGE_CUR_DBG 5
+#define PAGE_CUR_LE_OR_EXTENDS 5 /* This is a search mode used in
+ "column LIKE 'abc%' ORDER BY column DESC";
+ we have to find strings which are <= 'abc' or
+ which extend it */
+#define PAGE_CUR_DBG 6
+
extern ulint page_cur_short_succ;
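As an illustration of the new PAGE_CUR_LE_OR_EXTENDS mode, the following sketch
(an invented helper working on plain C strings rather than InnoDB records) shows
the acceptance test it implies for a descending scan serving
"column LIKE 'abc%' ORDER BY column DESC": a stored value qualifies if it is less
than or equal to the search prefix, or if it extends it.

/* Illustrative sketch only, not InnoDB source. */
#include <stdio.h>
#include <string.h>

typedef int ibool;
#define TRUE  1
#define FALSE 0

/* Invented illustration of the PAGE_CUR_LE_OR_EXTENDS test: TRUE if the stored
value is less than or equal to the search prefix, or begins with it. */
static ibool le_or_extends(const char* stored, const char* prefix)
{
	size_t	len = strlen(prefix);

	if (strncmp(stored, prefix, len) == 0) {

		return(TRUE);			/* 'abcd' extends 'abc' */
	}

	return(strcmp(stored, prefix) <= 0);	/* 'abb' <= 'abc' */
}

int main(void)
{
	printf("abcd: %d\n", le_or_extends("abcd", "abc"));	/* 1: extends the prefix */
	printf("abb : %d\n", le_or_extends("abb", "abc"));	/* 1: less than the prefix */
	printf("abd : %d\n", le_or_extends("abd", "abc"));	/* 0: greater, not a prefix match */

	return(0);
}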
diff --git a/innobase/include/page0page.h b/innobase/include/page0page.h
index 2f77127466f..b5e33af5bc0 100644
--- a/innobase/include/page0page.h
+++ b/innobase/include/page0page.h
@@ -666,6 +666,16 @@ page_rec_validate(
/* out: TRUE if ok */
rec_t* rec); /* in: record on the page */
/*******************************************************************
+This function checks the consistency of an index page when we do not
+know the index. This is also resilient so that this should never crash
+even if the page is total garbage. */
+
+ibool
+page_simple_validate(
+/*=================*/
+ /* out: TRUE if ok */
+ page_t* page); /* in: index page */
+/*******************************************************************
This function checks the consistency of an index page. */
ibool
diff --git a/innobase/include/read0read.h b/innobase/include/read0read.h
index cebb2d6701c..db6bf888095 100644
--- a/innobase/include/read0read.h
+++ b/innobase/include/read0read.h
@@ -45,6 +45,14 @@ read_view_close(
/*============*/
read_view_t* view); /* in: read view */
/*************************************************************************
+Closes a consistent read view for MySQL. This function is called at an SQL
+statement end if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */
+
+void
+read_view_close_for_mysql(
+/*======================*/
+ trx_t* trx); /* in: trx which has a read view */
+/*************************************************************************
Checks if a read view sees the specified transaction. */
UNIV_INLINE
ibool
diff --git a/innobase/include/rem0rec.h b/innobase/include/rem0rec.h
index 12e3a8b39d6..b28f39925c1 100644
--- a/innobase/include/rem0rec.h
+++ b/innobase/include/rem0rec.h
@@ -148,12 +148,22 @@ data field in the record. */
byte*
rec_get_nth_field(
/*==============*/
- /* out: pointer to the field, NULL if SQL null */
+ /* out: pointer to the field */
rec_t* rec, /* in: record */
ulint n, /* in: index of the field */
ulint* len); /* out: length of the field; UNIV_SQL_NULL
if SQL null */
/****************************************************************
+Return field length or UNIV_SQL_NULL. */
+UNIV_INLINE
+ulint
+rec_get_nth_field_len(
+/*==================*/
+ /* out: length of the field; UNIV_SQL_NULL if SQL
+ null */
+ rec_t* rec, /* in: record */
+ ulint n); /* in: index of the field */
+/****************************************************************
Gets the physical size of a field. Also an SQL null may have a field of
size > 0, if the data type is of a fixed size. */
UNIV_INLINE
diff --git a/innobase/include/rem0rec.ic b/innobase/include/rem0rec.ic
index aaa3c58a003..9dfd4faeec8 100644
--- a/innobase/include/rem0rec.ic
+++ b/innobase/include/rem0rec.ic
@@ -65,6 +65,24 @@ a field stored to another page: */
#define REC_2BYTE_EXTERN_MASK 0x4000
+/****************************************************************
+Return field length or UNIV_SQL_NULL. */
+UNIV_INLINE
+ulint
+rec_get_nth_field_len(
+/*==================*/
+ /* out: length of the field; UNIV_SQL_NULL if SQL
+ null */
+ rec_t* rec, /* in: record */
+ ulint n) /* in: index of the field */
+{
+ ulint len;
+
+ rec_get_nth_field(rec, n, &len);
+
+ return(len);
+}
+
/***************************************************************
Sets the value of the ith field SQL null bit. */
diff --git a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h
index 9de5e9ccfc9..4d2768cf109 100644
--- a/innobase/include/srv0srv.h
+++ b/innobase/include/srv0srv.h
@@ -57,8 +57,6 @@ extern ulint srv_flush_log_at_trx_commit;
extern byte srv_latin1_ordering[256];/* The sort order table of the latin1
character set */
-extern ibool srv_use_native_aio;
-
extern ulint srv_pool_size;
extern ulint srv_mem_pool_size;
extern ulint srv_lock_table_size;
@@ -70,8 +68,9 @@ extern dulint srv_archive_recovery_limit_lsn;
extern ulint srv_lock_wait_timeout;
-extern char* srv_unix_file_flush_method_str;
+extern char* srv_file_flush_method_str;
extern ulint srv_unix_file_flush_method;
+extern ulint srv_win_file_flush_method;
extern ulint srv_force_recovery;
extern ulint srv_thread_concurrency;
@@ -154,13 +153,19 @@ typedef struct srv_sys_struct srv_sys_t;
/* The server system */
extern srv_sys_t* srv_sys;
-/* Alternatives for the field flush option in Unix; see the InnoDB manual about
+/* Alternatives for the file flush option in Unix; see the InnoDB manual about
what these mean */
-#define SRV_UNIX_FDATASYNC 1
+#define SRV_UNIX_FDATASYNC 1 /* This is the default; it is currently mapped
+ to a call of fsync() because fdatasync()
+ seemed to corrupt files in Linux and Solaris */
#define SRV_UNIX_O_DSYNC 2
#define SRV_UNIX_LITTLESYNC 3
#define SRV_UNIX_NOSYNC 4
+/* Alternatives for file i/o in Windows */
+#define SRV_WIN_IO_NORMAL 1
+#define SRV_WIN_IO_UNBUFFERED 2 /* This is the default */
+
/* Alternatives for srv_force_recovery. Non-zero values are intended
to help the user get a damaged database up so that he can dump intact
tables and rows with SELECT INTO OUTFILE. The database must not otherwise
@@ -311,15 +316,17 @@ srv_conc_exit_innodb(
trx_t* trx); /* in: transaction object associated with the
thread */
/*******************************************************************
-Puts a MySQL OS thread to wait for a lock to be released. */
+Puts a MySQL OS thread to wait for a lock to be released. If an error
+occurs during the wait trx->error_state associated with thr is
+!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
+are possible errors. DB_DEADLOCK is returned if selective deadlock
+resolution chose this transaction as a victim. */
-ibool
+void
srv_suspend_mysql_thread(
/*=====================*/
- /* out: TRUE if the lock wait timeout was
- exceeded */
- que_thr_t* thr); /* in: query thread associated with
- the MySQL OS thread */
+ que_thr_t* thr); /* in: query thread associated with the MySQL
+ OS thread */
/************************************************************************
Releases a MySQL OS thread waiting for a lock to be released, if the
thread is already suspended. */
@@ -407,3 +414,4 @@ struct srv_sys_struct{
extern ulint srv_n_threads_active[];
#endif
+
diff --git a/innobase/include/sync0rw.h b/innobase/include/sync0rw.h
index 7ad38f5bc7f..5aa3dcdffc3 100644
--- a/innobase/include/sync0rw.h
+++ b/innobase/include/sync0rw.h
@@ -335,7 +335,8 @@ ibool
rw_lock_own(
/*========*/
rw_lock_t* lock, /* in: rw-lock */
- ulint lock_type); /* in: lock type */
+ ulint lock_type); /* in: lock type: RW_LOCK_SHARED,
+ RW_LOCK_EX */
/**********************************************************************
Checks if somebody has locked the rw-lock in the specified mode. */
diff --git a/innobase/include/sync0sync.h b/innobase/include/sync0sync.h
index 5bfa0bc2d48..320f8faf12d 100644
--- a/innobase/include/sync0sync.h
+++ b/innobase/include/sync0sync.h
@@ -371,10 +371,12 @@ or row lock! */
#define SYNC_NO_ORDER_CHECK 3000 /* this can be used to suppress
latching order checking */
#define SYNC_LEVEL_NONE 2000 /* default: level not defined */
-#define SYNC_FOREIGN_KEY_CHECK 1001
+#define SYNC_DICT_OPERATION 1001 /* table create, drop, etc. reserve
+ this in X-mode, implicit or backround
+ operations purge, rollback, foreign
+ key checks reserve this in S-mode */
#define SYNC_DICT 1000
#define SYNC_DICT_AUTOINC_MUTEX 999
-#define SYNC_PURGE_IS_RUNNING 997
#define SYNC_DICT_HEADER 995
#define SYNC_IBUF_HEADER 914
#define SYNC_IBUF_PESS_INSERT_MUTEX 912
diff --git a/innobase/include/trx0purge.h b/innobase/include/trx0purge.h
index 087be2f060e..049c79aec9b 100644
--- a/innobase/include/trx0purge.h
+++ b/innobase/include/trx0purge.h
@@ -111,9 +111,6 @@ struct trx_purge_struct{
of the trx system and it never ends */
que_t* query; /* The query graph which will do the
parallelized purge operation */
- rw_lock_t purge_is_running;/* Purge operation set an x-latch here
- while it is accessing a table: this
- prevents dropping of the table */
rw_lock_t latch; /* The latch protecting the purge view.
A purge operation must acquire an
x-latch here for the instant at which
diff --git a/innobase/include/trx0trx.h b/innobase/include/trx0trx.h
index 9b29c481b6d..874b126e47c 100644
--- a/innobase/include/trx0trx.h
+++ b/innobase/include/trx0trx.h
@@ -327,6 +327,7 @@ struct trx_struct{
time_t start_time; /* time the trx object was created
or the state last time became
TRX_ACTIVE */
+ ulint isolation_level;/* TRX_ISO_REPEATABLE_READ, ... */
ibool check_foreigns; /* normally TRUE, but if the user
wants to suppress foreign key checks,
(in table imports, for example) we
@@ -350,6 +351,9 @@ struct trx_struct{
/*------------------------------*/
void* mysql_thd; /* MySQL thread handle corresponding
to this trx, or NULL */
+ char** mysql_query_str;/* pointer to the field in mysqld_thd
+ which contains the pointer to the
+ current SQL query string */
char* mysql_log_file_name;
/* if MySQL binlog is used, this field
contains a pointer to the latest file
@@ -371,6 +375,9 @@ struct trx_struct{
replication has processed */
os_thread_id_t mysql_thread_id;/* id of the MySQL thread associated
with this transaction object */
+ ulint mysql_process_no;/* since in Linux, 'top' reports
+ process id's and not thread id's, we
+ store the process number too */
/*------------------------------*/
ulint n_mysql_tables_in_use; /* number of Innobase tables
used in the processing of the current
@@ -379,9 +386,9 @@ struct trx_struct{
/* how many tables the current SQL
statement uses, except those
in consistent read */
- ibool has_dict_foreign_key_check_lock;
+ ibool has_dict_operation_lock;
/* TRUE if the trx currently holds
- an s-lock on dict_foreign_... */
+ an s-lock on dict_operation_lock */
ibool has_search_latch;
/* TRUE if this trx has latched the
search system latch in S-mode */
@@ -523,6 +530,41 @@ struct trx_struct{
#define TRX_QUE_ROLLING_BACK 3 /* transaction is rolling back */
#define TRX_QUE_COMMITTING 4 /* transaction is committing */
+/* Transaction isolation levels */
+#define TRX_ISO_READ_UNCOMMITTED 1 /* dirty read: non-locking
+ SELECTs are performed so that
+ we do not look at a possible
+ earlier version of a record;
+ thus they are not 'consistent'
+ reads under this isolation
+ level; otherwise like level
+ 2 */
+
+#define TRX_ISO_READ_COMMITTED 2 /* somewhat Oracle-like
+ isolation, except that in
+ range UPDATE and DELETE we
+ must block phantom rows
+ with next-key locks;
+ SELECT ... FOR UPDATE and ...
+ LOCK IN SHARE MODE only lock
+ the index records, NOT the
+ gaps before them, and thus
+ allow free inserting;
+ each consistent read reads its
+ own snapshot */
+
+#define TRX_ISO_REPEATABLE_READ 3 /* this is the default;
+ all consistent reads in the
+ same trx read the same
+ snapshot;
+ full next-key locking used
+ in locking reads to block
+ insertions into gaps */
+
+#define TRX_ISO_SERIALIZABLE 4 /* all plain SELECTs are
+ converted to LOCK IN SHARE
+ MODE reads */
+
/* Types of a trx signal */
#define TRX_SIG_NO_SIGNAL 100
#define TRX_SIG_TOTAL_ROLLBACK 1
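The comment on TRX_ISO_READ_COMMITTED above says that each consistent read gets its
own snapshot, and read0read.h adds read_view_close_for_mysql() for exactly that
purpose. Below is a rough sketch of the statement-end rule: the structs are stubs
and the function body is invented; only the "<= TRX_ISO_READ_COMMITTED" condition
comes from the header comment.

/* Illustrative sketch only, not InnoDB source. */
#include <stdio.h>
#include <stdlib.h>

#define TRX_ISO_READ_UNCOMMITTED 1
#define TRX_ISO_READ_COMMITTED   2
#define TRX_ISO_REPEATABLE_READ  3
#define TRX_ISO_SERIALIZABLE     4

typedef struct read_view_struct {
	unsigned long	low_limit_id;	/* placeholder field */
} read_view_t;

typedef struct trx_struct {
	unsigned long	isolation_level;
	read_view_t*	read_view;	/* consistent read view, or NULL */
} trx_t;

/* stand-in for read_view_close_for_mysql(): called at SQL statement end */
static void stmt_end_close_read_view(trx_t* trx)
{
	if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
	    && trx->read_view != NULL) {

		free(trx->read_view);	/* the real code returns it to the trx heap */
		trx->read_view = NULL;	/* the next statement opens a fresh snapshot */
	}
}

int main(void)
{
	trx_t	trx;

	trx.isolation_level = TRX_ISO_READ_COMMITTED;
	trx.read_view = malloc(sizeof(read_view_t));

	stmt_end_close_read_view(&trx);
	printf("read view after statement end: %s\n",
	       trx.read_view ? "kept" : "closed");

	return(0);
}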
diff --git a/innobase/lock/lock0lock.c b/innobase/lock/lock0lock.c
index 866fe556af9..92ee5ee6cbe 100644
--- a/innobase/lock/lock0lock.c
+++ b/innobase/lock/lock0lock.c
@@ -70,6 +70,11 @@ A waiting record lock can also be of the gap type. A waiting lock request
can be granted when there is no conflicting mode lock request by another
transaction ahead of it in the explicit lock queue.
+In version 4.0.5 we added yet another explicit lock type: LOCK_REC_NOT_GAP.
+It only locks the record it is placed on, not the gap before the record.
+This lock type is necessary to emulate an Oracle-like READ COMMITTED isolation
+level.
+
-------------------------------------------------------------------------
RULE 1: If there is an implicit x-lock on a record, and there are non-gap
-------
@@ -294,7 +299,9 @@ struct lock_struct{
UT_LIST_NODE_T(lock_t)
trx_locks; /* list of the locks of the
transaction */
- ulint type_mode; /* lock type, mode, gap flag, and
+ ulint type_mode; /* lock type, mode, LOCK_GAP or
+ LOCK_REC_NOT_GAP,
+ LOCK_INSERT_INTENTION,
wait flag, ORed */
hash_node_t hash; /* hash chain node for a record lock */
dict_index_t* index; /* index for a record lock */
@@ -309,6 +316,10 @@ Monitor will then fetch it and print */
ibool lock_deadlock_found = FALSE;
char* lock_latest_err_buf; /* We allocate 5000 bytes for this */
+/* Flags for recursive deadlock search */
+#define LOCK_VICTIM_IS_START 1
+#define LOCK_VICTIM_IS_OTHER 2
+
/************************************************************************
Checks if a lock request results in a deadlock. */
static
@@ -700,23 +711,23 @@ lock_rec_get_gap(
}
/*************************************************************************
-Sets the gap flag of a record lock. */
+Gets the LOCK_REC_NOT_GAP flag of a record lock. */
UNIV_INLINE
-void
-lock_rec_set_gap(
-/*=============*/
- lock_t* lock, /* in: record lock */
- ibool val) /* in: value to set: TRUE or FALSE */
+ibool
+lock_rec_get_rec_not_gap(
+/*=====================*/
+ /* out: TRUE if LOCK_REC_NOT_GAP flag set */
+ lock_t* lock) /* in: record lock */
{
ut_ad(lock);
- ut_ad((val == TRUE) || (val == FALSE));
ut_ad(lock_get_type(lock) == LOCK_REC);
- if (val) {
- lock->type_mode = lock->type_mode | LOCK_GAP;
- } else {
- lock->type_mode = lock->type_mode & ~LOCK_GAP;
+ if (lock->type_mode & LOCK_REC_NOT_GAP) {
+
+ return(TRUE);
}
+
+ return(FALSE);
}
/*************************************************************************
@@ -740,26 +751,6 @@ lock_rec_get_insert_intention(
}
/*************************************************************************
-Sets the waiting insert flag of a record lock. */
-UNIV_INLINE
-void
-lock_rec_set_insert_intention(
-/*==========================*/
- lock_t* lock, /* in: record lock */
- ibool val) /* in: value to set: TRUE or FALSE */
-{
- ut_ad(lock);
- ut_ad((val == TRUE) || (val == FALSE));
- ut_ad(lock_get_type(lock) == LOCK_REC);
-
- if (val) {
- lock->type_mode = lock->type_mode | LOCK_INSERT_INTENTION;
- } else {
- lock->type_mode = lock->type_mode & ~LOCK_INSERT_INTENTION;
- }
-}
-
-/*************************************************************************
Calculates if lock mode 1 is stronger or equal to lock mode 2. */
UNIV_INLINE
ibool
@@ -848,48 +839,53 @@ lock_rec_has_to_wait(
/* out: TRUE if new lock has to wait for lock2 to be
removed */
trx_t* trx, /* in: trx of new lock */
- ulint mode, /* in: LOCK_S or LOCK_X */
- ulint gap, /* in: LOCK_GAP or 0 */
- ulint insert_intention,
- /* in: LOCK_INSERT_INTENTION or 0 */
+ ulint type_mode,/* in: precise mode of the new lock to set:
+ LOCK_S or LOCK_X, possibly ORed to
+ LOCK_GAP or LOCK_REC_NOT_GAP, LOCK_INSERT_INTENTION */
lock_t* lock2) /* in: another record lock; NOTE that it is assumed
that this has a lock bit set on the same record as
- in lock1 */
+ in the new lock we are setting */
{
ut_ad(trx && lock2);
ut_ad(lock_get_type(lock2) == LOCK_REC);
- ut_ad(mode == LOCK_S || mode == LOCK_X);
- ut_ad(gap == LOCK_GAP || gap == 0);
- ut_ad(insert_intention == LOCK_INSERT_INTENTION
- || insert_intention == 0);
- if (trx != lock2->trx && !lock_mode_compatible(mode,
+ if (trx != lock2->trx
+ && !lock_mode_compatible(LOCK_MODE_MASK & type_mode,
lock_get_mode(lock2))) {
- /* We have somewhat complex rules when gap type
- record locks cause waits */
+ /* We have somewhat complex rules when gap type record locks
+ cause waits */
- if (!gap && lock_rec_get_insert_intention(lock2)) {
+ if ((type_mode & LOCK_REC_NOT_GAP)
+ && lock_rec_get_gap(lock2)) {
+ /* Lock on just the record does not need to wait for
+ a gap type lock */
+
+ return(FALSE);
+ }
+
+ if ((type_mode & LOCK_GAP)
+ && lock_rec_get_rec_not_gap(lock2)) {
+
+ /* Lock on gap does not need to wait for
+ a LOCK_REC_NOT_GAP type lock */
- /* Request of a full next-key record does not
- need to wait for an insert intention lock to be
- removed. This is ok since our rules allow conflicting
- locks on gaps. This eliminates a spurious deadlock
- caused by a next-key lock waiting for an insert
- intention lock; when the insert intention lock was
- granted, the insert deadlocked on the waiting
- next-key lock. */
-
return(FALSE);
}
- if (insert_intention && lock_rec_get_insert_intention(lock2)) {
+ if (lock_rec_get_insert_intention(lock2)) {
- /* An insert intention is not disturbed by another
- insert intention; this removes a spurious deadlock
- caused by inserts which had to wait for a next-key
- lock to be removed */
+ /* No lock request needs to wait for an insert
+ intention lock to be removed. This is ok since our
+ rules allow conflicting locks on gaps. This eliminates
+ a spurious deadlock caused by a next-key lock waiting
+ for an insert intention lock; when the insert
+ intention lock was granted, the insert deadlocked on
+ the waiting next-key lock.
+ Also, insert intention locks do not disturb each
+ other. */
+
return(FALSE);
}
@@ -921,10 +917,7 @@ lock_has_to_wait(
ut_ad(lock_get_type(lock2) == LOCK_REC);
return(lock_rec_has_to_wait(lock1->trx,
- lock_get_mode(lock1),
- lock_rec_get_gap(lock1),
- lock_rec_get_insert_intention(lock1),
- lock2));
+ lock1->type_mode, lock2));
}
return(TRUE);
@@ -1386,32 +1379,41 @@ lock_table_has(
/*============= FUNCTIONS FOR ANALYZING RECORD LOCK QUEUE ================*/
/*************************************************************************
-Checks if a transaction has a GRANTED explicit lock on rec, where the gap
-flag or the insert intention flag is not set, stronger or equal to mode.
-Note that locks on the supremum of a page are a special case here, since
-they are always gap type locks, even if the gap flag is not set in them. */
+Checks if a transaction has a GRANTED explicit lock on rec stronger or equal
+to precise_mode. */
UNIV_INLINE
lock_t*
lock_rec_has_expl(
/*==============*/
/* out: lock or NULL */
- ulint mode, /* in: lock mode */
+ ulint precise_mode,/* in: LOCK_S or LOCK_X possibly ORed to
+ LOCK_GAP or LOCK_REC_NOT_GAP,
+ for a supremum record we regard this always a gap
+ type request */
rec_t* rec, /* in: record */
trx_t* trx) /* in: transaction */
{
lock_t* lock;
-
- ut_ad(mutex_own(&kernel_mutex));
- ut_ad((mode == LOCK_X) || (mode == LOCK_S));
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S
+ || (precise_mode & LOCK_MODE_MASK) == LOCK_X);
+ ut_ad(!(precise_mode & LOCK_INSERT_INTENTION));
+
lock = lock_rec_get_first(rec);
while (lock) {
if (lock->trx == trx
- && lock_mode_stronger_or_eq(lock_get_mode(lock), mode)
+ && lock_mode_stronger_or_eq(lock_get_mode(lock),
+ precise_mode & LOCK_MODE_MASK)
&& !lock_get_wait(lock)
- && !lock_rec_get_insert_intention(lock)
- && !lock_rec_get_gap(lock)) {
+ && (!lock_rec_get_rec_not_gap(lock)
+ || (precise_mode & LOCK_REC_NOT_GAP)
+ || page_rec_is_supremum(rec))
+ && (!lock_rec_get_gap(lock)
+ || (precise_mode & LOCK_GAP)
+ || page_rec_is_supremum(rec))
+ && (!lock_rec_get_insert_intention(lock))) {
return(lock);
}
@@ -1429,7 +1431,7 @@ lock_t*
lock_rec_other_has_expl_req(
/*========================*/
/* out: lock or NULL */
- ulint mode, /* in: lock mode */
+ ulint mode, /* in: LOCK_S or LOCK_X */
ulint gap, /* in: LOCK_GAP if also gap locks are taken
into account, or 0 if not */
ulint wait, /* in: LOCK_WAIT if also waiting locks are
@@ -1471,27 +1473,21 @@ lock_t*
lock_rec_other_has_conflicting(
/*===========================*/
/* out: lock or NULL */
- ulint mode, /* in: lock mode of the lock we are going to reserve */
- ulint gap, /* in: LOCK_GAP if we are going to reserve a gap type
- lock, else 0 */
- ulint insert_intention,
- /* in: LOCK_INSERT_INTENTION if we are going to
- reserve an insert intention lock */
+ ulint mode, /* in: LOCK_S or LOCK_X,
+ possibly ORed to LOCK_GAP or LOC_REC_NOT_GAP,
+ LOCK_INSERT_INTENTION */
rec_t* rec, /* in: record to look at */
trx_t* trx) /* in: our transaction */
{
lock_t* lock;
ut_ad(mutex_own(&kernel_mutex));
- ut_ad(mode == LOCK_X || mode == LOCK_S);
- ut_ad(gap == 0 || gap == LOCK_GAP);
- ut_ad(insert_intention == LOCK_INSERT_INTENTION
- || insert_intention == 0);
+
lock = lock_rec_get_first(rec);
while (lock) {
- if (lock_rec_has_to_wait(trx, mode, gap, insert_intention,
- lock)) {
+ if (lock_rec_has_to_wait(trx, mode, lock)) {
+
return(lock);
}
@@ -1607,14 +1603,14 @@ lock_rec_create(
page_no = buf_frame_get_page_no(page);
heap_no = rec_get_heap_no(rec);
- /* If rec is the supremum record, then we reset the gap bit, as
- all locks on the supremum are automatically of the gap type, and
- we try to avoid unnecessary memory consumption of a new record lock
- struct for a gap type lock */
+ /* If rec is the supremum record, then we reset the gap and
+ LOCK_REC_NOT_GAP bits, as all locks on the supremum are
+ automatically of the gap type */
if (rec == page_get_supremum_rec(page)) {
+ ut_ad(!(type_mode & LOCK_REC_NOT_GAP));
- type_mode = type_mode & ~LOCK_GAP;
+ type_mode = type_mode & ~(LOCK_GAP | LOCK_REC_NOT_GAP);
}
/* Make lock bitmap bigger by a safety margin */
@@ -1666,10 +1662,14 @@ ulint
lock_rec_enqueue_waiting(
/*=====================*/
/* out: DB_LOCK_WAIT, DB_DEADLOCK, or
- DB_QUE_THR_SUSPENDED */
+ DB_QUE_THR_SUSPENDED, or DB_SUCCESS;
+ DB_SUCCESS means that there was a deadlock,
+ but another transaction was chosen as a
+ victim, and we got the lock immediately:
+ no need to wait then */
ulint type_mode,/* in: lock mode this transaction is
- requesting: LOCK_S or LOCK_X, ORed with
- LOCK_GAP if a gap lock is requested, ORed
+ requesting: LOCK_S or LOCK_X, possibly ORed
+ with LOCK_GAP or LOCK_REC_NOT_GAP, ORed
with LOCK_INSERT_INTENTION if this waiting
lock request is set when performing an
insert of an index record */
@@ -1718,6 +1718,14 @@ index->table_name);
return(DB_DEADLOCK);
}
+ /* If there was a deadlock but we chose another transaction as a
+ victim, it is possible that we already have the lock now granted! */
+
+ if (trx->wait_lock == NULL) {
+
+ return(DB_SUCCESS);
+ }
+
trx->que_state = TRX_QUE_LOCK_WAIT;
trx->wait_started = time(NULL);
@@ -1744,8 +1752,8 @@ lock_rec_add_to_queue(
/*==================*/
/* out: lock where the bit was set, NULL if out
of memory */
- ulint type_mode,/* in: lock mode, wait, and gap flags; type
- is ignored and replaced by LOCK_REC */
+ ulint type_mode,/* in: lock mode, wait, gap etc. flags;
+ type is ignored and replaced by LOCK_REC */
rec_t* rec, /* in: record on page */
dict_index_t* index, /* in: index of record */
trx_t* trx) /* in: transaction */
@@ -1759,12 +1767,11 @@ lock_rec_add_to_queue(
ut_ad(mutex_own(&kernel_mutex));
ut_ad((type_mode & (LOCK_WAIT | LOCK_GAP))
|| ((type_mode & LOCK_MODE_MASK) != LOCK_S)
- || !lock_rec_other_has_expl_req(LOCK_X, 0, LOCK_WAIT,
- rec, trx));
+ || !lock_rec_other_has_expl_req(LOCK_X, 0, LOCK_WAIT, rec, trx));
ut_ad((type_mode & (LOCK_WAIT | LOCK_GAP))
|| ((type_mode & LOCK_MODE_MASK) != LOCK_X)
- || !lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT,
- rec, trx));
+ || !lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT, rec, trx));
+
type_mode = type_mode | LOCK_REC;
page = buf_frame_align(rec);
@@ -1775,12 +1782,15 @@ lock_rec_add_to_queue(
struct for a gap type lock */
if (rec == page_get_supremum_rec(page)) {
+ ut_ad(!(type_mode & LOCK_REC_NOT_GAP));
- type_mode = type_mode & ~LOCK_GAP;
+ /* There should never be LOCK_REC_NOT_GAP on a supremum
+ record, but let us play safe */
+
+ type_mode = type_mode & ~(LOCK_GAP | LOCK_REC_NOT_GAP);
}
- /* Look for a waiting lock request on the same record, or for a
- similar record lock on the same page */
+ /* Look for a waiting lock request on the same record or on a gap */
heap_no = rec_get_heap_no(rec);
lock = lock_rec_get_first_on_page(rec);
@@ -1795,6 +1805,9 @@ lock_rec_add_to_queue(
lock = lock_rec_get_next_on_page(lock);
}
+ /* Look for a similar record lock on the same page: if one is found
+ and there are no waiting lock requests, we can just set the bit */
+
similar_lock = lock_rec_find_similar_on_page(type_mode, rec, trx);
if (similar_lock && !somebody_waits && !(type_mode & LOCK_WAIT)) {
@@ -1822,7 +1835,8 @@ lock_rec_lock_fast(
ibool impl, /* in: if TRUE, no lock is set if no wait
is necessary: we assume that the caller will
set an implicit lock */
- ulint mode, /* in: lock mode */
+ ulint mode, /* in: lock mode: LOCK_X or LOCK_S possibly
+ ORed to either LOCK_GAP or LOCK_REC_NOT_GAP */
rec_t* rec, /* in: record */
dict_index_t* index, /* in: index of record */
que_thr_t* thr) /* in: query thread */
@@ -1831,8 +1845,16 @@ lock_rec_lock_fast(
ulint heap_no;
ut_ad(mutex_own(&kernel_mutex));
- ut_ad((mode == LOCK_X) || (mode == LOCK_S));
-
+ ut_ad((LOCK_MODE_MASK & mode) != LOCK_S
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
+ ut_ad((LOCK_MODE_MASK & mode) != LOCK_X
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+ ut_ad((LOCK_MODE_MASK & mode) == LOCK_S
+ || (LOCK_MODE_MASK & mode) == LOCK_X);
+ ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP
+ || mode - (LOCK_MODE_MASK & mode) == 0
+ || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP);
+
heap_no = rec_get_heap_no(rec);
lock = lock_rec_get_first_on_page(rec);
@@ -1877,7 +1899,8 @@ lock_rec_lock_slow(
ibool impl, /* in: if TRUE, no lock is set if no wait is
necessary: we assume that the caller will set
an implicit lock */
- ulint mode, /* in: lock mode */
+ ulint mode, /* in: lock mode: LOCK_X or LOCK_S possibly
+ ORed to either LOCK_GAP or LOCK_REC_NOT_GAP */
rec_t* rec, /* in: record */
dict_index_t* index, /* in: index of record */
que_thr_t* thr) /* in: query thread */
@@ -1886,20 +1909,24 @@ lock_rec_lock_slow(
ulint err;
ut_ad(mutex_own(&kernel_mutex));
- ut_ad((mode == LOCK_X) || (mode == LOCK_S));
-
+ ut_ad((LOCK_MODE_MASK & mode) != LOCK_S
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
+ ut_ad((LOCK_MODE_MASK & mode) != LOCK_X
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+ ut_ad((LOCK_MODE_MASK & mode) == LOCK_S
+ || (LOCK_MODE_MASK & mode) == LOCK_X);
+ ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP
+ || mode - (LOCK_MODE_MASK & mode) == 0
+ || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP);
+
trx = thr_get_trx(thr);
-
- ut_ad((mode != LOCK_S) || lock_table_has(trx, index->table,
- LOCK_IS));
- ut_ad((mode != LOCK_X) || lock_table_has(trx, index->table,
- LOCK_IX));
+
if (lock_rec_has_expl(mode, rec, trx)) {
/* The trx already has a strong enough lock on rec: do
nothing */
err = DB_SUCCESS;
- } else if (lock_rec_other_has_conflicting(mode, 0, 0, rec, trx)) {
+ } else if (lock_rec_other_has_conflicting(mode, rec, trx)) {
/* If another transaction has a non-gap conflicting request in
the queue, as this transaction does not have a lock strong
@@ -1935,7 +1962,8 @@ lock_rec_lock(
ibool impl, /* in: if TRUE, no lock is set if no wait is
necessary: we assume that the caller will set
an implicit lock */
- ulint mode, /* in: lock mode */
+ ulint mode, /* in: lock mode: LOCK_X or LOCK_S possibly
+ ORed to either LOCK_GAP or LOCK_REC_NOT_GAP */
rec_t* rec, /* in: record */
dict_index_t* index, /* in: index of record */
que_thr_t* thr) /* in: query thread */
@@ -1943,11 +1971,16 @@ lock_rec_lock(
ulint err;
ut_ad(mutex_own(&kernel_mutex));
- ut_ad((mode != LOCK_S) || lock_table_has(thr_get_trx(thr),
- index->table, LOCK_IS));
- ut_ad((mode != LOCK_X) || lock_table_has(thr_get_trx(thr),
- index->table, LOCK_IX));
-
+ ut_ad((LOCK_MODE_MASK & mode) != LOCK_S
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
+ ut_ad((LOCK_MODE_MASK & mode) != LOCK_X
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+ ut_ad((LOCK_MODE_MASK & mode) == LOCK_S
+ || (LOCK_MODE_MASK & mode) == LOCK_X);
+ ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP
+ || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP
+ || mode - (LOCK_MODE_MASK & mode) == 0);
+
if (lock_rec_lock_fast(impl, mode, rec, index, thr)) {
/* We try a simplified and faster subroutine for the most
@@ -2011,26 +2044,33 @@ lock_grant(
ut_ad(mutex_own(&kernel_mutex));
lock_reset_lock_and_trx_wait(lock);
-
- if (lock_get_mode(lock) == LOCK_AUTO_INC) {
- if (lock->trx->auto_inc_lock != NULL) {
- fprintf(stderr,
- "InnoDB: Error: trx already had an AUTO-INC lock!\n");
- }
+ if (lock_get_mode(lock) == LOCK_AUTO_INC) {
- /* Store pointer to lock to trx so that we know to
- release it at the end of the SQL statement */
+ if (lock->trx->auto_inc_lock != NULL) {
+ fprintf(stderr,
+ "InnoDB: Error: trx already had an AUTO-INC lock!\n");
+ }
- lock->trx->auto_inc_lock = lock;
- }
+ /* Store pointer to lock to trx so that we know to
+ release it at the end of the SQL statement */
+
+ lock->trx->auto_inc_lock = lock;
+ }
if (lock_print_waits) {
printf("Lock wait for trx %lu ends\n",
ut_dulint_get_low(lock->trx->id));
}
+
+ /* If we are resolving a deadlock by choosing another transaction
+ as a victim, then our original transaction may not be in the
+ TRX_QUE_LOCK_WAIT state, and there is no need to end the lock wait
+ for it */
- trx_end_lock_wait(lock->trx);
+ if (lock->trx->que_state == TRX_QUE_LOCK_WAIT) {
+ trx_end_lock_wait(lock->trx);
+ }
}
/*****************************************************************
@@ -2080,7 +2120,7 @@ lock_rec_dequeue_from_page(
ut_ad(lock_get_type(in_lock) == LOCK_REC);
trx = in_lock->trx;
-
+
space = in_lock->un_member.rec_lock.space;
page_no = in_lock->un_member.rec_lock.page_no;
@@ -2199,9 +2239,10 @@ lock_rec_reset_and_release_wait(
}
/*****************************************************************
-Makes a record to inherit the locks of another record as gap type locks, but
-does not reset the lock bits of the other record. Also waiting lock requests
-on rec are inherited as GRANTED gap locks. */
+Makes a record inherit the locks (except LOCK_INSERT_INTENTION type)
+of another record as gap type locks, but does not reset the lock bits of
+the other record. Also waiting lock requests on rec are inherited as
+GRANTED gap locks. */
void
lock_rec_inherit_to_gap(
@@ -2217,9 +2258,45 @@ lock_rec_inherit_to_gap(
lock = lock_rec_get_first(rec);
while (lock != NULL) {
- lock_rec_add_to_queue(((lock->type_mode | LOCK_GAP)
- & ~LOCK_WAIT),
+ if (!lock_rec_get_insert_intention(lock)) {
+
+ lock_rec_add_to_queue(LOCK_REC | lock_get_mode(lock)
+ | LOCK_GAP,
heir, lock->index, lock->trx);
+ }
+
+ lock = lock_rec_get_next(rec, lock);
+ }
+}
+
+/*****************************************************************
+Makes a record inherit the gap locks (except LOCK_INSERT_INTENTION type)
+of another record as gap type locks, but does not reset the lock bits of the
+other record. Also waiting lock requests are inherited as GRANTED gap locks. */
+
+void
+lock_rec_inherit_to_gap_if_gap_lock(
+/*================================*/
+ rec_t* heir, /* in: record which inherits */
+ rec_t* rec) /* in: record from which inherited; does NOT reset
+ the locks on this record */
+{
+ lock_t* lock;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ lock = lock_rec_get_first(rec);
+
+ while (lock != NULL) {
+ if (!lock_rec_get_insert_intention(lock)
+ && (page_rec_is_supremum(rec)
+ || !lock_rec_get_rec_not_gap(lock))) {
+
+ lock_rec_add_to_queue(LOCK_REC | lock_get_mode(lock)
+ | LOCK_GAP,
+ heir, lock->index, lock->trx);
+ }
+
lock = lock_rec_get_next(rec, lock);
}
}
@@ -2778,9 +2855,10 @@ lock_update_insert(
{
lock_mutex_enter_kernel();
- /* Inherit the locks for rec, in gap mode, from the next record */
+ /* Make rec inherit, as gap type locks, the gap-locking locks from
+ the next record */
- lock_rec_inherit_to_gap(rec, page_rec_get_next(rec));
+ lock_rec_inherit_to_gap_if_gap_lock(rec, page_rec_get_next(rec));
lock_mutex_exit_kernel();
}
@@ -2859,20 +2937,23 @@ static
ibool
lock_deadlock_occurs(
/*=================*/
- /* out: TRUE if a deadlock was detected */
+ /* out: TRUE if a deadlock was detected and we
+ chose trx as a victim; FALSE if no deadlock, or
+ there was a deadlock, but we chose other
+ transaction(s) as victim(s) */
lock_t* lock, /* in: lock the transaction is requesting */
trx_t* trx) /* in: transaction */
{
dict_table_t* table;
dict_index_t* index;
trx_t* mark_trx;
- ibool ret;
+ ulint ret;
ulint cost = 0;
char* err_buf;
ut_ad(trx && lock);
ut_ad(mutex_own(&kernel_mutex));
-
+retry:
/* We check that adding this trx to the waits-for graph
does not produce a cycle. First mark all active transactions
with 0: */
@@ -2886,7 +2967,14 @@ lock_deadlock_occurs(
ret = lock_deadlock_recursive(trx, trx, lock, &cost);
- if (ret) {
+ if (ret == LOCK_VICTIM_IS_OTHER) {
+ /* We chose some other trx as a victim: retry if there still
+ is a deadlock */
+
+ goto retry;
+ }
+
+ if (ret == LOCK_VICTIM_IS_START) {
if (lock_get_type(lock) == LOCK_TABLE) {
table = lock->un_member.tab_lock.table;
index = NULL;
@@ -2898,19 +2986,6 @@ lock_deadlock_occurs(
lock_deadlock_found = TRUE;
err_buf = lock_latest_err_buf + strlen(lock_latest_err_buf);
-
- err_buf += sprintf(err_buf,
- "*** (2) WAITING FOR THIS LOCK TO BE GRANTED:\n");
-
- ut_a(err_buf <= lock_latest_err_buf + 4000);
-
- if (lock_get_type(lock) == LOCK_REC) {
- lock_rec_print(err_buf, lock);
- err_buf += strlen(err_buf);
- } else {
- lock_table_print(err_buf, lock);
- err_buf += strlen(err_buf);
- }
ut_a(err_buf <= lock_latest_err_buf + 4000);
@@ -2923,30 +2998,39 @@ lock_deadlock_occurs(
sess_raise_error_low(trx, DB_DEADLOCK, lock->type_mode, table,
index, NULL, NULL, NULL);
*/
+
+ return(TRUE);
}
- return(ret);
+ return(FALSE);
}
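
The retry loop above re-runs the waits-for search whenever some other transaction was chosen as the victim, because removing one victim may leave another cycle behind. A compact standalone model of that control flow, with local stand-in return codes (not the header's LOCK_VICTIM_IS_... values):

#include <stdio.h>

#define DEADLOCK_NONE    0
#define VICTIM_IS_START  1
#define VICTIM_IS_OTHER  2

/* hypothetical search: report one 'other' victim, then no more cycles */
static int
toy_search(int* cycles_left)
{
	if (*cycles_left > 0) {
		(*cycles_left)--;
		return(VICTIM_IS_OTHER);
	}
	return(DEADLOCK_NONE);
}

/* returns 1 only if the requesting transaction itself must be rolled back */
static int
toy_deadlock_occurs(int cycles_left)
{
	int	ret;
retry:
	ret = toy_search(&cycles_left);

	if (ret == VICTIM_IS_OTHER) {
		/* some other trx was rolled back: there may still be a
		cycle involving us, so search again */
		goto retry;
	}

	return(ret == VICTIM_IS_START);
}

int
main(void)
{
	printf("%d\n", toy_deadlock_occurs(2));	/* 0: others were the victims */
	return(0);
}
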
/************************************************************************
Looks recursively for a deadlock. */
static
-ibool
+ulint
lock_deadlock_recursive(
/*====================*/
- /* out: TRUE if a deadlock was detected
- or the calculation took too long */
+ /* out: 0 if no deadlock found,
+ LOCK_VICTIM_IS_START if there was a deadlock
+ and we chose 'start' as the victim,
+ LOCK_VICTIM_IS_OTHER if a deadlock
+ was found and we chose some other trx as a
+ victim: we must do the search again in this
+ last case because there may be another
+ deadlock! */
trx_t* start, /* in: recursion starting point */
trx_t* trx, /* in: a transaction waiting for a lock */
lock_t* wait_lock, /* in: the lock trx is waiting to be granted */
ulint* cost) /* in/out: number of calculation steps thus
far: if this exceeds LOCK_MAX_N_STEPS_...
- we return TRUE */
+ we return LOCK_VICTIM_IS_START */
{
lock_t* lock;
ulint bit_no;
trx_t* lock_trx;
char* err_buf;
+ ulint ret;
ut_a(trx && start && wait_lock);
ut_ad(mutex_own(&kernel_mutex));
@@ -2955,14 +3039,14 @@ lock_deadlock_recursive(
/* We have already exhaustively searched the subtree starting
from this trx */
- return(FALSE);
+ return(0);
}
*cost = *cost + 1;
if (*cost > LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK) {
- return(TRUE);
+ return(LOCK_VICTIM_IS_START);
}
lock = wait_lock;
@@ -2998,6 +3082,9 @@ lock_deadlock_recursive(
lock_trx = lock->trx;
if (lock_trx == start) {
+ /* We came back to the recursion starting
+ point: a deadlock detected */
+
err_buf = lock_latest_err_buf;
ut_sprintf_timestamp(err_buf);
@@ -3045,11 +3132,59 @@ lock_deadlock_recursive(
ut_a(err_buf <= lock_latest_err_buf + 4000);
+ err_buf += sprintf(err_buf,
+ "*** (2) WAITING FOR THIS LOCK TO BE GRANTED:\n");
+
+ ut_a(err_buf <= lock_latest_err_buf + 4000);
+
+ if (lock_get_type(start->wait_lock)
+ == LOCK_REC) {
+ lock_rec_print(err_buf,
+ start->wait_lock);
+ err_buf += strlen(err_buf);
+ } else {
+ lock_table_print(err_buf,
+ start->wait_lock);
+ err_buf += strlen(err_buf);
+ }
+
if (lock_print_waits) {
printf("Deadlock detected\n");
}
- return(TRUE);
+ if (ut_dulint_cmp(wait_lock->trx->undo_no,
+ start->undo_no) >= 0) {
+ /* Our recursion starting point
+ transaction is 'smaller', let us
+ choose 'start' as the victim and
+ roll it back */
+
+ return(LOCK_VICTIM_IS_START);
+ }
+
+ lock_deadlock_found = TRUE;
+
+ ut_a(err_buf <= lock_latest_err_buf + 4000);
+
+ /* Let us choose the transaction of wait_lock
+ as a victim to try to avoid deadlocking our
+ recursion starting point transaction */
+
+ err_buf += sprintf(err_buf,
+ "*** WE ROLL BACK TRANSACTION (1)\n");
+
+ wait_lock->trx->error_state = DB_DEADLOCK;
+
+ lock_cancel_waiting_and_release(wait_lock);
+
+ /* Since trx and wait_lock are no longer
+ in the waits-for graph, we can return
+ LOCK_VICTIM_IS_OTHER; note that our selective
+ algorithm can choose several transactions as
+ victims, but still we may end up also rolling
+ back the recursion starting point transaction! */
+
+ return(LOCK_VICTIM_IS_OTHER);
}
if (lock_trx->que_state == TRX_QUE_LOCK_WAIT) {
@@ -3058,10 +3193,11 @@ lock_deadlock_recursive(
incompatible mode, and is itself waiting for
a lock */
- if (lock_deadlock_recursive(start, lock_trx,
- lock_trx->wait_lock, cost)) {
+ ret = lock_deadlock_recursive(start, lock_trx,
+ lock_trx->wait_lock, cost);
+ if (ret != 0) {
- return(TRUE);
+ return(ret);
}
}
}
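
The undo_no comparison above picks the cheaper transaction to roll back: whichever of the two has written less undo so far becomes the victim, and the recursion starting point survives whenever the other side has done at least as much work. A small sketch of that choice, with a toy integer standing in for dulint:

#include <stdio.h>

typedef unsigned long long toy_dulint;	/* stands in for InnoDB's dulint */

/* Returns 1 if 'start' should be the victim, 0 if the other waiting
transaction (the one holding wait_lock) should be rolled back instead. */
static int
toy_choose_start_as_victim(toy_dulint other_undo_no, toy_dulint start_undo_no)
{
	/* roll back the transaction that has written the smaller undo log */
	return(other_undo_no >= start_undo_no);
}

int
main(void)
{
	/* start has written less undo than the other trx: roll back start */
	printf("%d\n", toy_choose_start_as_victim(1000, 10));	/* 1 */

	/* the other trx is cheaper: sacrifice it and keep searching */
	printf("%d\n", toy_choose_start_as_victim(10, 1000));	/* 0 */
	return(0);
}
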
@@ -3153,12 +3289,16 @@ lock_table_remove_low(
/*************************************************************************
Enqueues a waiting request for a table lock which cannot be granted
immediately. Checks for deadlocks. */
-
+static
ulint
lock_table_enqueue_waiting(
/*=======================*/
/* out: DB_LOCK_WAIT, DB_DEADLOCK, or
- DB_QUE_THR_SUSPENDED */
+ DB_QUE_THR_SUSPENDED, or DB_SUCCESS;
+ DB_SUCCESS means that there was a deadlock,
+ but another transaction was chosen as a
+ victim, and we got the lock immediately:
+ no need to wait then */
ulint mode, /* in: lock mode this transaction is
requesting */
dict_table_t* table, /* in: table */
@@ -3205,6 +3345,13 @@ table->name);
return(DB_DEADLOCK);
}
+ if (trx->wait_lock == NULL) {
+ /* Deadlock resolution chose another transaction as a victim,
+ and we accidentally got our lock granted! */
+
+ return(DB_SUCCESS);
+ }
+
trx->que_state = TRX_QUE_LOCK_WAIT;
trx->wait_started = time(NULL);
@@ -3292,7 +3439,7 @@ lock_table(
if (lock_table_other_has_incompatible(trx, LOCK_WAIT, table, mode)) {
/* Another trx has a request on the table in an incompatible
- mode: this trx must wait */
+ mode: this trx may have to wait */
err = lock_table_enqueue_waiting(mode, table, thr);
@@ -3659,7 +3806,11 @@ lock_rec_print(
}
if (lock_rec_get_gap(lock)) {
- buf += sprintf(buf, " gap type lock");
+ buf += sprintf(buf, " locks gap before rec");
+ }
+
+ if (lock_rec_get_rec_not_gap(lock)) {
+ buf += sprintf(buf, " locks rec but not gap");
}
if (lock_rec_get_insert_intention(lock)) {
@@ -3776,8 +3927,8 @@ lock_print_info(
mtr_t mtr;
if (buf_end - buf < 600) {
- sprintf(buf, "... output truncated!\n");
-
+ sprintf(buf, "... output truncated!\n");
+
return;
}
@@ -3802,8 +3953,8 @@ lock_print_info(
if ((ulint)(buf_end - buf)
< 100 + strlen(lock_latest_err_buf)) {
- lock_mutex_exit_kernel();
- sprintf(buf, "... output truncated!\n");
+ lock_mutex_exit_kernel();
+ sprintf(buf, "... output truncated!\n");
return;
}
@@ -3826,8 +3977,8 @@ lock_print_info(
while (trx) {
if (buf_end - buf < 900) {
- lock_mutex_exit_kernel();
- sprintf(buf, "... output truncated!\n");
+ lock_mutex_exit_kernel();
+ sprintf(buf, "... output truncated!\n");
return;
}
@@ -3879,8 +4030,8 @@ loop:
buf += strlen(buf);
if (buf_end - buf < 500) {
- lock_mutex_exit_kernel();
- sprintf(buf, "... output truncated!\n");
+ lock_mutex_exit_kernel();
+ sprintf(buf, "... output truncated!\n");
return;
}
@@ -3936,7 +4087,7 @@ loop:
}
if (buf_end - buf < 500) {
- lock_mutex_exit_kernel();
+ lock_mutex_exit_kernel();
sprintf(buf, "... output truncated!\n");
return;
@@ -4080,7 +4231,8 @@ lock_rec_queue_validate(
if (impl_trx && lock_rec_other_has_expl_req(LOCK_S, 0,
LOCK_WAIT, rec, impl_trx)) {
- ut_a(lock_rec_has_expl(LOCK_X, rec, impl_trx));
+ ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec,
+ impl_trx));
}
}
@@ -4095,7 +4247,8 @@ lock_rec_queue_validate(
if (impl_trx && lock_rec_other_has_expl_req(LOCK_S, 0,
LOCK_WAIT, rec, impl_trx)) {
- ut_a(lock_rec_has_expl(LOCK_X, rec, impl_trx));
+ ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec,
+ impl_trx));
}
}
@@ -4359,8 +4512,8 @@ lock_rec_insert_check_and_lock(
*inherit = TRUE;
- /* If another transaction has an explicit lock request, gap or not,
- waiting or granted, on the successor, the insert has to wait.
+ /* If another transaction has an explicit lock request which locks
+ the gap, waiting or granted, on the successor, the insert has to wait.
An exception is the case where the lock by the another transaction
is a gap type lock which it placed to wait for its turn to insert. We
@@ -4369,8 +4522,10 @@ lock_rec_insert_check_and_lock(
had to wait for their insert. Both had waiting gap type lock requests
on the successor, which produced an unnecessary deadlock. */
- if (lock_rec_other_has_conflicting(LOCK_X, LOCK_GAP,
- LOCK_INSERT_INTENTION, next_rec, trx)) {
+ if (lock_rec_other_has_conflicting(LOCK_X | LOCK_GAP
+ | LOCK_INSERT_INTENTION, next_rec, trx)) {
+
+ /* Note that we may get DB_SUCCESS also here! */
err = lock_rec_enqueue_waiting(LOCK_X | LOCK_GAP
| LOCK_INSERT_INTENTION,
next_rec, index, thr);
@@ -4418,9 +4573,11 @@ lock_rec_convert_impl_to_expl(
/* If the transaction has no explicit x-lock set on the
record, set one for it */
- if (!lock_rec_has_expl(LOCK_X, rec, impl_trx)) {
+ if (!lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec,
+ impl_trx)) {
- lock_rec_add_to_queue(LOCK_REC | LOCK_X, rec, index,
+ lock_rec_add_to_queue(LOCK_REC | LOCK_X
+ | LOCK_REC_NOT_GAP, rec, index,
impl_trx);
}
}
@@ -4466,7 +4623,7 @@ lock_clust_rec_modify_check_and_lock(
lock_rec_convert_impl_to_expl(rec, index);
- err = lock_rec_lock(TRUE, LOCK_X, rec, index, thr);
+ err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP, rec, index, thr);
lock_mutex_exit_kernel();
@@ -4511,7 +4668,7 @@ lock_sec_rec_modify_check_and_lock(
ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
- err = lock_rec_lock(TRUE, LOCK_X, rec, index, thr);
+ err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP, rec, index, thr);
lock_mutex_exit_kernel();
@@ -4545,6 +4702,8 @@ lock_sec_rec_read_check_and_lock(
ulint mode, /* in: mode of the lock which the read cursor
should set on records: LOCK_S or LOCK_X; the
latter is possible in SELECT FOR UPDATE */
+ ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
que_thr_t* thr) /* in: query thread */
{
ulint err;
@@ -4576,7 +4735,7 @@ lock_sec_rec_read_check_and_lock(
lock_rec_convert_impl_to_expl(rec, index);
}
- err = lock_rec_lock(FALSE, mode, rec, index, thr);
+ err = lock_rec_lock(FALSE, mode | gap_mode, rec, index, thr);
lock_mutex_exit_kernel();
@@ -4607,13 +4766,16 @@ lock_clust_rec_read_check_and_lock(
ulint mode, /* in: mode of the lock which the read cursor
should set on records: LOCK_S or LOCK_X; the
latter is possible in SELECT FOR UPDATE */
+ ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
que_thr_t* thr) /* in: query thread */
{
ulint err;
ut_ad(index->type & DICT_CLUSTERED);
ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec));
-
+ ut_ad(gap_mode == LOCK_ORDINARY || gap_mode == LOCK_GAP
+ || gap_mode == LOCK_REC_NOT_GAP);
if (flags & BTR_NO_LOCKING_FLAG) {
return(DB_SUCCESS);
@@ -4631,7 +4793,7 @@ lock_clust_rec_read_check_and_lock(
lock_rec_convert_impl_to_expl(rec, index);
}
- err = lock_rec_lock(FALSE, mode, rec, index, thr);
+ err = lock_rec_lock(FALSE, mode | gap_mode, rec, index, thr);
lock_mutex_exit_kernel();
diff --git a/innobase/mem/mem0dbg.c b/innobase/mem/mem0dbg.c
index 23585e494b8..22d0bab0da2 100644
--- a/innobase/mem/mem0dbg.c
+++ b/innobase/mem/mem0dbg.c
@@ -347,9 +347,19 @@ mem_hash_remove(
NULL, NULL);
if (error) {
printf("Inconsistency in memory heap or buffer n:o %lu created\n",
- node->nth_heap);
+ node->nth_heap);
printf("in %s line %lu and tried to free in %s line %lu.\n",
node->file_name, node->line, file_name, line);
+
+ printf(
+ "Hex dump of 400 bytes around memory heap first block start:\n");
+
+ ut_print_buf((byte*)(node->heap) - 200, 400);
+
+ printf("\nDump of the mem heap:\n");
+
+ mem_heap_validate_or_print(node->heap, NULL, TRUE, &error, &size,
+ NULL, NULL);
ut_error;
}
diff --git a/innobase/os/os0file.c b/innobase/os/os0file.c
index 098d5b25e89..9eae358c7fb 100644
--- a/innobase/os/os0file.c
+++ b/innobase/os/os0file.c
@@ -148,7 +148,7 @@ Gets the operating system version. Currently works only on Windows. */
ulint
os_get_os_version(void)
/*===================*/
- /* out: OS_WIN95, OS_WIN31, OS_WINNT (2000 == NT) */
+ /* out: OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000 */
{
#ifdef __WIN__
OSVERSIONINFO os_info;
@@ -162,7 +162,11 @@ os_get_os_version(void)
} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
return(OS_WIN95);
} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
- return(OS_WINNT);
+ if (os_info.dwMajorVersion <= 4) {
+ return(OS_WINNT);
+ } else {
+ return(OS_WIN2000);
+ }
} else {
ut_error;
return(0);
@@ -268,9 +272,7 @@ os_file_get_last_error(void)
}
/********************************************************************
-Does error handling when a file operation fails. If we have run out
-of disk space, then the user can clean the disk. If we do not find
-a specified file, then the user can copy it to disk. */
+Does error handling when a file operation fails. */
static
ibool
os_file_handle_error(
@@ -503,7 +505,11 @@ try_again:
value 2 denotes that we do not flush the log at every
commit, but only once per second */
} else {
- attributes = attributes | FILE_FLAG_NO_BUFFERING;
+ if (srv_win_file_flush_method ==
+ SRV_WIN_IO_UNBUFFERED) {
+ attributes = attributes
+ | FILE_FLAG_NO_BUFFERING;
+ }
}
#endif
} else if (purpose == OS_FILE_NORMAL) {
@@ -514,7 +520,11 @@ try_again:
value 2 denotes that we do not flush the log at every
commit, but only once per second */
} else {
- attributes = attributes | FILE_FLAG_NO_BUFFERING;
+ if (srv_win_file_flush_method ==
+ SRV_WIN_IO_UNBUFFERED) {
+ attributes = attributes
+ | FILE_FLAG_NO_BUFFERING;
+ }
}
#endif
} else {
@@ -1752,6 +1762,7 @@ os_aio(
os_aio_array_t* array;
os_aio_slot_t* slot;
#ifdef WIN_ASYNC_IO
+ ibool retval;
BOOL ret = TRUE;
DWORD len = n;
void* dummy_mess1;
@@ -1824,6 +1835,8 @@ try_again:
if (os_aio_use_native_aio) {
#ifdef WIN_ASYNC_IO
os_n_file_reads++;
+ os_bytes_read_since_printout += len;
+
ret = ReadFile(file, buf, (DWORD)n, &len,
&(slot->control));
#elif defined(POSIX_ASYNC_IO)
@@ -1870,10 +1883,12 @@ try_again:
where we also use async i/o: in Windows we must
use the same wait mechanism as for async i/o */
- return(os_aio_windows_handle(ULINT_UNDEFINED,
+ retval = os_aio_windows_handle(ULINT_UNDEFINED,
slot->pos,
&dummy_mess1, &dummy_mess2,
- &dummy_type));
+ &dummy_type);
+
+ return(retval);
}
return(TRUE);
@@ -1897,8 +1912,6 @@ try_again:
goto try_again;
}
- ut_error;
-
return(FALSE);
}
@@ -1958,14 +1971,14 @@ os_aio_windows_handle(
n = array->n_slots / array->n_segments;
if (array == os_aio_sync_array) {
- srv_io_thread_op_info[orig_seg] = "wait windows aio for 1 page";
+ srv_io_thread_op_info[orig_seg] = "wait Windows aio for 1 page";
ut_ad(pos < array->n_slots);
os_event_wait(array->events[pos]);
i = pos;
} else {
srv_io_thread_op_info[orig_seg] =
- "wait windows aio for n pages";
+ "wait Windows aio";
i = os_event_wait_multiple(n, (array->events) + segment * n);
}
@@ -1991,10 +2004,8 @@ os_aio_windows_handle(
ut_a(TRUE == os_file_flush(slot->file));
}
} else {
- os_file_get_last_error();
-
- ut_error;
-
+ os_file_handle_error(slot->file, slot->name);
+
ret_val = FALSE;
}
diff --git a/innobase/os/os0proc.c b/innobase/os/os0proc.c
index 43a2db4d306..1ee448a4a44 100644
--- a/innobase/os/os0proc.c
+++ b/innobase/os/os0proc.c
@@ -19,6 +19,23 @@ Created 9/30/1995 Heikki Tuuri
#include "ut0mem.h"
/********************************************************************
+Converts the current process id to a number. The number is not guaranteed
+to be unique. On Linux this returns the 'process number' of the current
+thread, which is the number one sees in 'top', for example; note that on
+Linux the thread id is not the same as the number shown in 'top'. */
+
+ulint
+os_proc_get_number(void)
+/*====================*/
+{
+#ifdef __WIN__
+ return((ulint)GetCurrentProcessId());
+#else
+ return((ulint)getpid());
+#endif
+}
+
+/********************************************************************
Allocates non-cacheable memory. */
void*
diff --git a/innobase/page/page0cur.c b/innobase/page/page0cur.c
index 2909573b14b..bb49e9080ce 100644
--- a/innobase/page/page0cur.c
+++ b/innobase/page/page0cur.c
@@ -169,7 +169,7 @@ page_cur_search_with_match(
ut_ad(dtuple_check_typed(tuple));
ut_ad((mode == PAGE_CUR_L) || (mode == PAGE_CUR_LE)
|| (mode == PAGE_CUR_G) || (mode == PAGE_CUR_GE)
- || (mode == PAGE_CUR_DBG));
+ || (mode == PAGE_CUR_LE_OR_EXTENDS) || (mode == PAGE_CUR_DBG));
#ifdef PAGE_CUR_ADAPT
if ((page_header_get_field(page, PAGE_LEVEL) == 0)
@@ -232,9 +232,26 @@ page_cur_search_with_match(
low_matched_bytes = cur_matched_bytes;
} else if (cmp == -1) {
- up = mid;
- up_matched_fields = cur_matched_fields;
- up_matched_bytes = cur_matched_bytes;
+
+ if (mode == PAGE_CUR_LE_OR_EXTENDS
+ && dfield_get_len(dtuple_get_nth_field(tuple,
+ cur_matched_fields))
+ == cur_matched_bytes
+ && rec_get_nth_field_len(mid_rec,
+ cur_matched_fields)
+ != UNIV_SQL_NULL) {
+
+ /* This means current dfield is not SQL
+ NULL, and the current rec field extends it */
+
+ low = mid;
+ low_matched_fields = cur_matched_fields;
+ low_matched_bytes = cur_matched_bytes;
+ } else {
+ up = mid;
+ up_matched_fields = cur_matched_fields;
+ up_matched_bytes = cur_matched_bytes;
+ }
} else if ((mode == PAGE_CUR_G) || (mode == PAGE_CUR_LE)) {
low = mid;
@@ -252,8 +269,8 @@ page_cur_search_with_match(
slot = page_dir_get_nth_slot(page, up);
up_rec = page_dir_slot_get_rec(slot);
- /* Perform linear search until the upper and lower records
- come to distance 1 of each other. */
+ /* Perform linear search until the upper and lower records come to
+ distance 1 of each other. */
while (page_rec_get_next(low_rec) != up_rec) {
@@ -272,10 +289,25 @@ page_cur_search_with_match(
low_matched_bytes = cur_matched_bytes;
} else if (cmp == -1) {
- up_rec = mid_rec;
- up_matched_fields = cur_matched_fields;
- up_matched_bytes = cur_matched_bytes;
-
+ if (mode == PAGE_CUR_LE_OR_EXTENDS
+ && dfield_get_len(dtuple_get_nth_field(tuple,
+ cur_matched_fields))
+ == cur_matched_bytes
+ && rec_get_nth_field_len(mid_rec,
+ cur_matched_fields)
+ != UNIV_SQL_NULL) {
+
+ /* This means current dfield is not SQL
+ NULL, and the current rec field extends it */
+
+ low = mid;
+ low_matched_fields = cur_matched_fields;
+ low_matched_bytes = cur_matched_bytes;
+ } else {
+ up_rec = mid_rec;
+ up_matched_fields = cur_matched_fields;
+ up_matched_bytes = cur_matched_bytes;
+ }
} else if ((mode == PAGE_CUR_G) || (mode == PAGE_CUR_LE)) {
low_rec = mid_rec;
low_matched_fields = cur_matched_fields;
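
The PAGE_CUR_LE_OR_EXTENDS branches above treat a record as not greater than the search tuple when the record field merely extends a fully matched search field (and is not SQL NULL). A rough standalone sketch of that 'extends' test, using plain C strings in place of dfields and record fields:

#include <stdio.h>
#include <string.h>

/* Models the PAGE_CUR_LE_OR_EXTENDS special case: if the search field is a
complete prefix of the record field, the record is treated as not greater
than the search tuple. An empty record field stands in for SQL NULL. */
static int
toy_rec_extends_search_field(const char* search_field, const char* rec_field)
{
	size_t	len = strlen(search_field);

	return(strncmp(search_field, rec_field, len) == 0
	       && rec_field[0] != '\0');
}

int
main(void)
{
	printf("%d\n", toy_rec_extends_search_field("abc", "abcde"));	/* 1 */
	printf("%d\n", toy_rec_extends_search_field("abc", "abd"));	/* 0 */
	return(0);
}
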
diff --git a/innobase/page/page0page.c b/innobase/page/page0page.c
index ed74736c8da..7d0d88c6afc 100644
--- a/innobase/page/page0page.c
+++ b/innobase/page/page0page.c
@@ -1313,6 +1313,194 @@ page_rec_validate(
}
/*******************************************************************
+This function checks the consistency of an index page when we do not
+know the index. It is written to be resilient so that it should never
+crash even if the page is total garbage. */
+
+ibool
+page_simple_validate(
+/*=================*/
+ /* out: TRUE if ok */
+ page_t* page) /* in: index page */
+{
+ page_cur_t cur;
+ page_dir_slot_t* slot;
+ ulint slot_no;
+ ulint n_slots;
+ rec_t* rec;
+ byte* rec_heap_top;
+ ulint count;
+ ulint own_count;
+ ibool ret = FALSE;
+
+ /* Check first that the record heap and the directory do not
+ overlap. */
+
+ n_slots = page_dir_get_n_slots(page);
+
+ if (n_slots > UNIV_PAGE_SIZE / 4) {
+ fprintf(stderr,
+ "Nonsensical number %lu of page dir slots\n", n_slots);
+
+ goto func_exit;
+ }
+
+ rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP);
+
+ if (rec_heap_top > page_dir_get_nth_slot(page, n_slots - 1)) {
+
+ fprintf(stderr,
+ "Record heap and dir overlap on a page, heap top %lu, dir %lu\n",
+ (ulint)(page_header_get_ptr(page, PAGE_HEAP_TOP) - page),
+ (ulint)(page_dir_get_nth_slot(page, n_slots - 1) - page));
+
+ goto func_exit;
+ }
+
+ /* Validate the record list in a loop checking also that it is
+ consistent with the page record directory. */
+
+ count = 0;
+ own_count = 1;
+ slot_no = 0;
+ slot = page_dir_get_nth_slot(page, slot_no);
+
+ page_cur_set_before_first(page, &cur);
+
+ for (;;) {
+ rec = (&cur)->rec;
+
+ if (rec > rec_heap_top) {
+ fprintf(stderr,
+ "Record %lu is above rec heap top %lu\n",
+ (ulint)(rec - page), (ulint)(rec_heap_top - page));
+
+ goto func_exit;
+ }
+
+ if (rec_get_n_owned(rec) != 0) {
+ /* This is a record pointed to by a dir slot */
+ if (rec_get_n_owned(rec) != own_count) {
+
+ fprintf(stderr,
+ "Wrong owned count %lu, %lu, rec %lu\n",
+ rec_get_n_owned(rec), own_count,
+ (ulint)(rec - page));
+
+ goto func_exit;
+ }
+
+ if (page_dir_slot_get_rec(slot) != rec) {
+ fprintf(stderr,
+ "Dir slot does not point to right rec %lu\n",
+ (ulint)(rec - page));
+
+ goto func_exit;
+ }
+
+ own_count = 0;
+
+ if (!page_cur_is_after_last(&cur)) {
+ slot_no++;
+ slot = page_dir_get_nth_slot(page, slot_no);
+ }
+ }
+
+ if (page_cur_is_after_last(&cur)) {
+
+ break;
+ }
+
+ if (rec_get_next_offs(rec) < FIL_PAGE_DATA
+ || rec_get_next_offs(rec) >= UNIV_PAGE_SIZE) {
+ fprintf(stderr,
+ "Next record offset nonsensical %lu for rec %lu\n",
+ rec_get_next_offs(rec),
+ (ulint)(rec - page));
+
+ goto func_exit;
+ }
+
+ count++;
+
+ if (count > UNIV_PAGE_SIZE) {
+ fprintf(stderr,
+ "Page record list appears to be circular %lu\n",
+ count);
+ goto func_exit;
+ }
+
+ page_cur_move_to_next(&cur);
+ own_count++;
+ }
+
+ if (rec_get_n_owned(rec) == 0) {
+ fprintf(stderr, "n owned is zero in a supremum rec\n");
+
+ goto func_exit;
+ }
+
+ if (slot_no != n_slots - 1) {
+ fprintf(stderr, "n slots wrong %lu, %lu\n",
+ slot_no, n_slots - 1);
+ goto func_exit;
+ }
+
+ if (page_header_get_field(page, PAGE_N_RECS) + 2 != count + 1) {
+ fprintf(stderr, "n recs wrong %lu %lu\n",
+ page_header_get_field(page, PAGE_N_RECS) + 2, count + 1);
+
+ goto func_exit;
+ }
+
+ /* Check then the free list */
+ rec = page_header_get_ptr(page, PAGE_FREE);
+
+ while (rec != NULL) {
+ if (rec < page + FIL_PAGE_DATA
+ || rec >= page + UNIV_PAGE_SIZE) {
+ fprintf(stderr,
+ "Free list record has a nonsensical offset %lu\n",
+ (ulint)(rec - page));
+
+ goto func_exit;
+ }
+
+ if (rec > rec_heap_top) {
+ fprintf(stderr,
+ "Free list record %lu is above rec heap top %lu\n",
+ (ulint)(rec - page), (ulint)(rec_heap_top - page));
+
+ goto func_exit;
+ }
+
+ count++;
+
+ if (count > UNIV_PAGE_SIZE) {
+ fprintf(stderr,
+ "Page free list appears to be circular %lu\n",
+ count);
+ goto func_exit;
+ }
+
+ rec = page_rec_get_next(rec);
+ }
+
+ if (page_header_get_field(page, PAGE_N_HEAP) != count + 1) {
+
+ fprintf(stderr, "N heap is wrong %lu, %lu\n",
+ page_header_get_field(page, PAGE_N_HEAP), count + 1);
+
+ goto func_exit;
+ }
+
+ ret = TRUE;
+
+func_exit:
+ return(ret);
+}
+
+/*******************************************************************
This function checks the consistency of an index page. */
ibool
@@ -1339,6 +1527,14 @@ page_validate(
ulint i;
char err_buf[1000];
+ if (!page_simple_validate(page)) {
+ buf_page_print(page);
+
+ fprintf(stderr, "Apparent corruption in a page in index %s\n",
+ index->name);
+ return(FALSE);
+ }
+
heap = mem_heap_create(UNIV_PAGE_SIZE);
/* The following buffer is used to check that the
diff --git a/innobase/pars/lexyy.c b/innobase/pars/lexyy.c
index 782fca35f66..f7edc9d195f 100644
--- a/innobase/pars/lexyy.c
+++ b/innobase/pars/lexyy.c
@@ -4,8 +4,6 @@
* $Header: /home/daffy/u0/vern/flex/RCS/flex.skl,v 2.91 96/09/10 16:58:48 vern Exp $
*/
-#include "univ.i"
-
#define FLEX_SCANNER
#define YY_FLEX_MAJOR_VERSION 2
#define YY_FLEX_MINOR_VERSION 5
@@ -609,18 +607,13 @@ How to make the InnoDB parser and lexer C files:
6. Remove the #include of unistd.h from about line 2500 of lexyy.c
-7. Move #include <math.h> in pars0grm.c after #include "univ.i" to remove
- a large file compilation error on AIX.
-
-8. Move #include "univ.i" in lexyy.c to the file start to remove a large
- file compilation error on AIX.
-
These instructions seem to work at least with bison-1.28 and flex-2.5.4 on
Linux.
*******************************************************/
#line 36 "pars0lex.l"
#define YYSTYPE que_node_t*
+#include "univ.i"
#include "pars0pars.h"
#include "pars0grm.h"
#include "pars0sym.h"
diff --git a/innobase/pars/pars0grm.c b/innobase/pars/pars0grm.c
index ce575063610..05b75398084 100644
--- a/innobase/pars/pars0grm.c
+++ b/innobase/pars/pars0grm.c
@@ -102,8 +102,6 @@ que_node_t */
#include "que0que.h"
#include "row0sel.h"
-#include <math.h>
-
#define YYSTYPE que_node_t*
/* #define __STDC__ */
diff --git a/innobase/read/read0read.c b/innobase/read/read0read.c
index a5048c0c909..5c1d2d5418e 100644
--- a/innobase/read/read0read.c
+++ b/innobase/read/read0read.c
@@ -201,6 +201,28 @@ read_view_close(
}
/*************************************************************************
+Closes a consistent read view for MySQL. This function is called at an SQL
+statement end if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */
+
+void
+read_view_close_for_mysql(
+/*======================*/
+ trx_t* trx) /* in: trx which has a read view */
+{
+ ut_a(trx->read_view);
+
+ mutex_enter(&kernel_mutex);
+
+ read_view_close(trx->read_view);
+
+ mem_heap_empty(trx->read_view_heap);
+
+ trx->read_view = NULL;
+
+ mutex_exit(&kernel_mutex);
+}
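
read_view_close_for_mysql() is what makes READ COMMITTED statement-scoped: the snapshot is thrown away at statement end, so the next statement builds a fresh one. A toy model of that statement-end decision (the struct and constants below are simplified stand-ins, not the real trx_t):

#include <stdio.h>

#define TRX_ISO_READ_COMMITTED  1   /* toy values only */
#define TRX_ISO_REPEATABLE_READ 2

struct toy_trx {
	int	isolation_level;
	int	has_read_view;	/* models trx->read_view != NULL */
};

static void
toy_statement_end(struct toy_trx* trx)
{
	/* at <= READ COMMITTED the snapshot lives only for one statement */
	if (trx->has_read_view
	    && trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
		trx->has_read_view = 0;	/* read_view_close_for_mysql() */
	}
}

int
main(void)
{
	struct toy_trx	trx = { TRX_ISO_READ_COMMITTED, 1 };

	toy_statement_end(&trx);
	printf("snapshot kept across statements: %d\n", trx.has_read_view);
	return(0);
}
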
+
+/*************************************************************************
Prints a read view to stderr. */
void
diff --git a/innobase/row/row0ins.c b/innobase/row/row0ins.c
index 941c9d5759d..4e8b487a0f1 100644
--- a/innobase/row/row0ins.c
+++ b/innobase/row/row0ins.c
@@ -321,59 +321,6 @@ row_ins_clust_index_entry_by_modify(
return(err);
}
-/*******************************************************************
-Checks if a unique key violation to rec would occur at the index entry
-insert. */
-static
-ibool
-row_ins_dupl_error_with_rec(
-/*========================*/
- /* out: TRUE if error */
- rec_t* rec, /* in: user record; NOTE that we assume
- that the caller already has a record lock on
- the record! */
- dtuple_t* entry, /* in: entry to insert */
- dict_index_t* index) /* in: index */
-{
- ulint matched_fields;
- ulint matched_bytes;
- ulint n_unique;
- ulint i;
-
- n_unique = dict_index_get_n_unique(index);
-
- matched_fields = 0;
- matched_bytes = 0;
-
- cmp_dtuple_rec_with_match(entry, rec, &matched_fields, &matched_bytes);
-
- if (matched_fields < n_unique) {
-
- return(FALSE);
- }
-
- /* In a unique secondary index we allow equal key values if they
- contain SQL NULLs */
-
- if (!(index->type & DICT_CLUSTERED)) {
-
- for (i = 0; i < n_unique; i++) {
- if (UNIV_SQL_NULL == dfield_get_len(
- dtuple_get_nth_field(entry, i))) {
-
- return(FALSE);
- }
- }
- }
-
- if (!rec_get_deleted_flag(rec)) {
-
- return(TRUE);
- }
-
- return(FALSE);
-}
-
/*************************************************************************
Either deletes or sets the referencing columns SQL NULL in a child row.
Used in ON DELETE ... clause for foreign keys when a parent row is
@@ -533,8 +480,12 @@ row_ins_foreign_delete_or_set_null(
err = lock_table(0, table, LOCK_IX, thr);
if (err == DB_SUCCESS) {
+ /* Here it suffices to use a LOCK_REC_NOT_GAP type lock;
+ we already have a normal shared lock on the appropriate
+ gap if the search criterion was not unique */
+
err = lock_clust_rec_read_check_and_lock(0, clust_rec,
- clust_index, LOCK_X, thr);
+ clust_index, LOCK_X, LOCK_REC_NOT_GAP, thr);
}
if (err != DB_SUCCESS) {
@@ -630,12 +581,14 @@ nonstandard_exit_func:
/*************************************************************************
Sets a shared lock on a record. Used in locking possible duplicate key
-records. */
+records and also in checking foreign key constraints. */
static
ulint
row_ins_set_shared_rec_lock(
/*========================*/
/* out: DB_SUCCESS or error code */
+ ulint type, /* in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP type lock */
rec_t* rec, /* in: record */
dict_index_t* index, /* in: index */
que_thr_t* thr) /* in: query thread */
@@ -644,10 +597,10 @@ row_ins_set_shared_rec_lock(
if (index->type & DICT_CLUSTERED) {
err = lock_clust_rec_read_check_and_lock(0, rec, index, LOCK_S,
- thr);
+ type, thr);
} else {
err = lock_sec_rec_read_check_and_lock(0, rec, index, LOCK_S,
- thr);
+ type, thr);
}
return(err);
@@ -656,7 +609,7 @@ row_ins_set_shared_rec_lock(
/*******************************************************************
Checks if foreign key constraint fails for an index entry. Sets shared locks
which lock either the success or the failure of the constraint. NOTE that
-the caller must have a shared latch on dict_foreign_key_check_lock. */
+the caller must have a shared latch on dict_operation_lock. */
ulint
row_ins_check_foreign_constraint(
@@ -679,7 +632,7 @@ row_ins_check_foreign_constraint(
dict_table_t* check_table;
dict_index_t* check_index;
ulint n_fields_cmp;
- ibool timeout_expired;
+ ibool unique_search;
rec_t* rec;
btr_pcur_t pcur;
ibool moved;
@@ -689,7 +642,9 @@ row_ins_check_foreign_constraint(
mtr_t mtr;
run_again:
- ut_ad(rw_lock_own(&dict_foreign_key_check_lock, RW_LOCK_SHARED));
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED));
+
+ err = DB_SUCCESS;
if (thr_get_trx(thr)->check_foreigns == FALSE) {
/* The user has suppressed foreign key checks currently for
@@ -748,6 +703,14 @@ run_again:
dtuple_set_n_fields_cmp(entry, foreign->n_fields);
+ if (dict_index_get_n_unique(check_index) <= foreign->n_fields) {
+ /* We can just set a LOCK_REC_NOT_GAP type lock */
+
+ unique_search = TRUE;
+ } else {
+ unique_search = FALSE;
+ }
+
btr_pcur_open(check_index, entry, PAGE_CUR_GE,
BTR_SEARCH_LEAF, &pcur, &mtr);
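
The unique_search decision above chooses between a plain record lock and a next-key lock for the foreign key check: if the foreign key columns cover the check index's unique prefix, one key value can match at most one committed row. A small sketch of that selection rule, with illustrative flag values (not the header's):

#include <stdio.h>

#define LOCK_ORDINARY    0
#define LOCK_GAP         512
#define LOCK_REC_NOT_GAP 1024

/* If the foreign key columns cover the unique prefix of the index used for
the check, a matching record can be locked without its gap; otherwise the
whole next-key range must be protected so no new matches can appear. */
static unsigned
toy_fk_match_lock_type(unsigned n_unique_in_check_index,
		       unsigned n_foreign_key_fields)
{
	if (n_unique_in_check_index <= n_foreign_key_fields) {
		return(LOCK_REC_NOT_GAP);
	}
	return(LOCK_ORDINARY);
}

int
main(void)
{
	printf("%u\n", toy_fk_match_lock_type(1, 1));	/* 1024 */
	printf("%u\n", toy_fk_match_lock_type(2, 1));	/* 0 */
	return(0);
}
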
@@ -761,25 +724,45 @@ run_again:
goto next_rec;
}
- /* Try to place a lock on the index record */
-
- err = row_ins_set_shared_rec_lock(rec, check_index, thr);
-
- if (err != DB_SUCCESS) {
-
- break;
- }
-
if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
+ err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, rec,
+ check_index, thr);
+ if (err != DB_SUCCESS) {
+
+ break;
+ }
+
goto next_rec;
}
cmp = cmp_dtuple_rec(entry, rec);
if (cmp == 0) {
- if (!rec_get_deleted_flag(rec)) {
+ if (rec_get_deleted_flag(rec)) {
+ err = row_ins_set_shared_rec_lock(LOCK_ORDINARY,
+ rec, check_index, thr);
+ if (err != DB_SUCCESS) {
+
+ break;
+ }
+ } else {
/* Found a matching record */
+
+ if (unique_search) {
+ err = row_ins_set_shared_rec_lock(
+ LOCK_REC_NOT_GAP,
+ rec, check_index, thr);
+ } else {
+ err = row_ins_set_shared_rec_lock(
+ LOCK_ORDINARY,
+ rec, check_index, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+
+ break;
+ }
/* printf(
"FOREIGN: Found matching record from %s %s\n",
@@ -807,6 +790,13 @@ run_again:
}
if (cmp < 0) {
+ err = row_ins_set_shared_rec_lock(LOCK_GAP,
+ rec, check_index, thr);
+ if (err != DB_SUCCESS) {
+
+ break;
+ }
+
if (check_ref) {
err = DB_NO_REFERENCED_ROW;
} else {
@@ -844,14 +834,14 @@ do_possible_lock_wait:
que_thr_stop_for_mysql(thr);
- timeout_expired = srv_suspend_mysql_thread(thr);
+ srv_suspend_mysql_thread(thr);
- if (!timeout_expired) {
+ if (thr_get_trx(thr)->error_state == DB_SUCCESS) {
goto run_again;
}
- err = DB_LOCK_WAIT_TIMEOUT;
+ err = thr_get_trx(thr)->error_state;
}
return(err);
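
After srv_suspend_mysql_thread() returns, the foreign key check no longer looks at a timeout flag; it re-runs when trx->error_state is DB_SUCCESS and otherwise propagates whatever error (timeout or deadlock) was stored there. A minimal sketch of that retry pattern, with toy codes and a hypothetical suspend function:

#include <stdio.h>

#define DB_SUCCESS            0   /* toy codes */
#define DB_LOCK_WAIT_TIMEOUT  3
#define DB_DEADLOCK           4

struct toy_trx {
	unsigned	error_state;
};

/* hypothetical wait: here the wait ends because we were picked as a
deadlock victim, so error_state carries DB_DEADLOCK */
static void
toy_suspend_thread(struct toy_trx* trx)
{
	trx->error_state = DB_DEADLOCK;
}

static unsigned
toy_wait_and_retry(struct toy_trx* trx)
{
run_again:
	toy_suspend_thread(trx);

	if (trx->error_state == DB_SUCCESS) {
		/* the lock was granted: redo the interrupted operation */
		goto run_again;
	}

	/* propagate timeout or deadlock to the caller */
	return(trx->error_state);
}

int
main(void)
{
	struct toy_trx	trx = { DB_SUCCESS };

	printf("%u\n", toy_wait_and_retry(&trx));	/* 4 */
	return(0);
}
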
@@ -890,21 +880,21 @@ row_ins_check_foreign_constraints(
trx);
}
- if (!trx->has_dict_foreign_key_check_lock) {
+ if (!trx->has_dict_operation_lock) {
got_s_lock = TRUE;
- rw_lock_s_lock(&dict_foreign_key_check_lock);
+ rw_lock_s_lock(&dict_operation_lock);
- trx->has_dict_foreign_key_check_lock = TRUE;
+ trx->has_dict_operation_lock = TRUE;
}
err = row_ins_check_foreign_constraint(TRUE, foreign,
table, index, entry, thr);
if (got_s_lock) {
- rw_lock_s_unlock(&dict_foreign_key_check_lock);
+ rw_lock_s_unlock(&dict_operation_lock);
- trx->has_dict_foreign_key_check_lock = FALSE;
+ trx->has_dict_operation_lock = FALSE;
}
if (err != DB_SUCCESS) {
@@ -919,6 +909,59 @@ row_ins_check_foreign_constraints(
}
/*******************************************************************
+Checks if a unique key violation to rec would occur at the index entry
+insert. */
+static
+ibool
+row_ins_dupl_error_with_rec(
+/*========================*/
+ /* out: TRUE if error */
+ rec_t* rec, /* in: user record; NOTE that we assume
+ that the caller already has a record lock on
+ the record! */
+ dtuple_t* entry, /* in: entry to insert */
+ dict_index_t* index) /* in: index */
+{
+ ulint matched_fields;
+ ulint matched_bytes;
+ ulint n_unique;
+ ulint i;
+
+ n_unique = dict_index_get_n_unique(index);
+
+ matched_fields = 0;
+ matched_bytes = 0;
+
+ cmp_dtuple_rec_with_match(entry, rec, &matched_fields, &matched_bytes);
+
+ if (matched_fields < n_unique) {
+
+ return(FALSE);
+ }
+
+ /* In a unique secondary index we allow equal key values if they
+ contain SQL NULLs */
+
+ if (!(index->type & DICT_CLUSTERED)) {
+
+ for (i = 0; i < n_unique; i++) {
+ if (UNIV_SQL_NULL == dfield_get_len(
+ dtuple_get_nth_field(entry, i))) {
+
+ return(FALSE);
+ }
+ }
+ }
+
+ if (!rec_get_deleted_flag(rec)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************************
Scans a unique non-clustered index at a given index entry to determine
whether a uniqueness violation has occurred for the key value of the entry.
Set shared locks on possible duplicate records. */
@@ -976,9 +1019,10 @@ row_ins_scan_sec_index_for_duplicate(
goto next_rec;
}
- /* Try to place a lock on the index record */
+ /* Try to place a lock on the index record */
- err = row_ins_set_shared_rec_lock(rec, index, thr);
+ err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, rec, index,
+ thr);
if (err != DB_SUCCESS) {
@@ -1082,8 +1126,8 @@ row_ins_duplicate_error_in_clust(
sure that in roll-forward we get the same duplicate
errors as in original execution */
- err = row_ins_set_shared_rec_lock(rec, cursor->index,
- thr);
+ err = row_ins_set_shared_rec_lock(LOCK_REC_NOT_GAP,
+ rec, cursor->index, thr);
if (err != DB_SUCCESS) {
return(err);
@@ -1105,8 +1149,8 @@ row_ins_duplicate_error_in_clust(
if (rec != page_get_supremum_rec(page)) {
- err = row_ins_set_shared_rec_lock(rec, cursor->index,
- thr);
+ err = row_ins_set_shared_rec_lock(LOCK_REC_NOT_GAP,
+ rec, cursor->index, thr);
if (err != DB_SUCCESS) {
return(err);
diff --git a/innobase/row/row0mysql.c b/innobase/row/row0mysql.c
index cea8f1316fe..6fde57eb75a 100644
--- a/innobase/row/row0mysql.c
+++ b/innobase/row/row0mysql.c
@@ -27,6 +27,7 @@ Created 9/17/2000 Heikki Tuuri
#include "lock0lock.h"
#include "rem0cmp.h"
#include "log0log.h"
+#include "btr0sea.h"
/* A dummy variable used to fool the compiler */
ibool row_mysql_identically_false = FALSE;
@@ -203,7 +204,6 @@ row_mysql_handle_errors(
que_thr_t* thr, /* in: query thread */
trx_savept_t* savept) /* in: savepoint or NULL */
{
- ibool timeout_expired;
ulint err;
handle_new_error:
@@ -240,11 +240,9 @@ handle_new_error:
/* MySQL will roll back the latest SQL statement */
} else if (err == DB_LOCK_WAIT) {
- timeout_expired = srv_suspend_mysql_thread(thr);
-
- if (timeout_expired) {
- trx->error_state = DB_LOCK_WAIT_TIMEOUT;
+ srv_suspend_mysql_thread(thr);
+ if (trx->error_state != DB_SUCCESS) {
que_thr_stop_for_mysql(thr);
goto handle_new_error;
@@ -1146,7 +1144,7 @@ row_mysql_lock_data_dictionary(void)
/* Serialize data dictionary operations with dictionary mutex:
no deadlocks or lock waits can occur then in these operations */
- rw_lock_x_lock(&(dict_foreign_key_check_lock));
+ rw_lock_x_lock(&dict_operation_lock);
mutex_enter(&(dict_sys->mutex));
}
@@ -1161,7 +1159,7 @@ row_mysql_unlock_data_dictionary(void)
no deadlocks can occur then in these operations */
mutex_exit(&(dict_sys->mutex));
- rw_lock_x_unlock(&(dict_foreign_key_check_lock));
+ rw_lock_x_unlock(&dict_operation_lock);
}
/*************************************************************************
@@ -1184,6 +1182,7 @@ row_create_table_for_mysql(
ulint err;
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
ut_ad(mutex_own(&(dict_sys->mutex)));
if (srv_created_new_raw) {
@@ -1383,7 +1382,8 @@ row_create_index_for_mysql(
ulint namelen;
ulint keywordlen;
ulint err;
-
+
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
ut_ad(mutex_own(&(dict_sys->mutex)));
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
@@ -1464,6 +1464,7 @@ row_table_add_foreign_constraints(
ulint err;
ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
ut_a(sql_string);
trx->op_info = (char *) "adding foreign keys";
@@ -1846,12 +1847,16 @@ row_drop_table_for_mysql(
no deadlocks can occur then in these operations */
if (!has_dict_mutex) {
- /* Prevent foreign key checks while we are dropping the table */
- rw_lock_x_lock(&(dict_foreign_key_check_lock));
+ /* Prevent foreign key checks etc. while we are dropping the
+ table */
+ rw_lock_x_lock(&dict_operation_lock);
mutex_enter(&(dict_sys->mutex));
}
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+
graph = pars_sql(buf);
ut_a(graph);
@@ -1861,9 +1866,6 @@ row_drop_table_for_mysql(
graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
- /* Prevent purge from running while we are dropping the table */
- rw_lock_s_lock(&(purge_sys->purge_is_running));
-
table = dict_table_get_low(name);
if (!table) {
@@ -1944,12 +1946,11 @@ row_drop_table_for_mysql(
}
}
-funct_exit:
- rw_lock_s_unlock(&(purge_sys->purge_is_running));
+funct_exit:
if (!has_dict_mutex) {
mutex_exit(&(dict_sys->mutex));
- rw_lock_x_unlock(&(dict_foreign_key_check_lock));
+ rw_lock_x_unlock(&dict_operation_lock);
}
que_graph_free(graph);
@@ -1985,7 +1986,7 @@ row_drop_database_for_mysql(
trx_start_if_not_started(trx);
loop:
- rw_lock_x_lock(&(dict_foreign_key_check_lock));
+ rw_lock_x_lock(&dict_operation_lock);
mutex_enter(&(dict_sys->mutex));
while ((table_name = dict_get_first_table_name_in_db(name))) {
@@ -2000,7 +2001,7 @@ loop:
if (table->n_mysql_handles_opened > 0) {
mutex_exit(&(dict_sys->mutex));
- rw_lock_x_unlock(&(dict_foreign_key_check_lock));
+ rw_lock_x_unlock(&dict_operation_lock);
ut_print_timestamp(stderr);
fprintf(stderr,
@@ -2028,7 +2029,7 @@ loop:
}
mutex_exit(&(dict_sys->mutex));
- rw_lock_x_unlock(&(dict_foreign_key_check_lock));
+ rw_lock_x_unlock(&dict_operation_lock);
trx_commit_for_mysql(trx);
@@ -2165,7 +2166,7 @@ row_rename_table_for_mysql(
/* Serialize data dictionary operations with dictionary mutex:
no deadlocks can occur then in these operations */
- rw_lock_x_lock(&(dict_foreign_key_check_lock));
+ rw_lock_x_lock(&dict_operation_lock);
mutex_enter(&(dict_sys->mutex));
table = dict_table_get_low(old_name);
@@ -2249,7 +2250,7 @@ row_rename_table_for_mysql(
}
funct_exit:
mutex_exit(&(dict_sys->mutex));
- rw_lock_x_unlock(&(dict_foreign_key_check_lock));
+ rw_lock_x_unlock(&dict_operation_lock);
que_graph_free(graph);
@@ -2394,18 +2395,28 @@ row_check_table_for_mysql(
row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL
handle */
{
- dict_table_t* table = prebuilt->table;
+ dict_table_t* table = prebuilt->table;
dict_index_t* index;
ulint n_rows;
ulint n_rows_in_table = ULINT_UNDEFINED;
- ulint ret = DB_SUCCESS;
-
+ ulint ret = DB_SUCCESS;
+ ulint old_isolation_level;
+
prebuilt->trx->op_info = (char *) "checking table";
+ old_isolation_level = prebuilt->trx->isolation_level;
+
+ /* We must run the index record counts at an isolation level
+ >= READ COMMITTED, because a dirty read can see a wrong number
+ of records in some index; to play safe, we always use
+ REPEATABLE READ here */
+
+ prebuilt->trx->isolation_level = TRX_ISO_REPEATABLE_READ;
+
index = dict_table_get_first_index(table);
while (index != NULL) {
- /* fprintf(stderr, "Validating index %s\n", index->name); */
+ /* fprintf(stderr, "Validating index %s\n", index->name); */
if (!btr_validate_tree(index->tree)) {
ret = DB_ERROR;
@@ -2433,6 +2444,9 @@ row_check_table_for_mysql(
index = dict_table_get_next_index(index);
}
+ /* Restore the original isolation level */
+ prebuilt->trx->isolation_level = old_isolation_level;
+
/* We validate also the whole adaptive hash index for all tables
at every CHECK TABLE */
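
CHECK TABLE temporarily raises the isolation level so the per-index row counts are not distorted by dirty reads, then restores the caller's level. A standalone sketch of that save/override/restore pattern (toy constants, not the real trx fields):

#include <stdio.h>

#define TRX_ISO_READ_UNCOMMITTED 0   /* toy values */
#define TRX_ISO_REPEATABLE_READ  2

struct toy_trx {
	int	isolation_level;
};

/* hypothetical row-count pass; a dirty read could miscount, so the caller
forces at least REPEATABLE READ around it */
static unsigned long
toy_count_index_rows(const struct toy_trx* trx)
{
	(void) trx;
	return(42);
}

int
main(void)
{
	struct toy_trx	trx = { TRX_ISO_READ_UNCOMMITTED };
	int		old_level = trx.isolation_level;

	trx.isolation_level = TRX_ISO_REPEATABLE_READ;

	printf("rows: %lu\n", toy_count_index_rows(&trx));

	trx.isolation_level = old_level;	/* restore the caller's level */
	return(0);
}
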
diff --git a/innobase/row/row0purge.c b/innobase/row/row0purge.c
index 60e057b816e..3d9ae6aad8b 100644
--- a/innobase/row/row0purge.c
+++ b/innobase/row/row0purge.c
@@ -453,7 +453,9 @@ static
ibool
row_purge_parse_undo_rec(
/*=====================*/
- /* out: TRUE if purge operation required */
+ /* out: TRUE if purge operation required:
+ NOTE that then the CALLER must s-unlock
+ dict_operation_lock! */
purge_node_t* node, /* in: row undo node */
ibool* updated_extern,
/* out: TRUE if an externally stored field
@@ -493,18 +495,20 @@ row_purge_parse_undo_rec(
return(FALSE);
}
+ /* Prevent DROP TABLE etc. from running when we are doing the purge
+ for this row */
+
+ rw_lock_s_lock(&dict_operation_lock);
mutex_enter(&(dict_sys->mutex));
node->table = dict_table_get_on_id_low(table_id, thr_get_trx(thr));
- rw_lock_x_lock(&(purge_sys->purge_is_running));
-
mutex_exit(&(dict_sys->mutex));
if (node->table == NULL) {
/* The table has been dropped: no need to do purge */
- rw_lock_x_unlock(&(purge_sys->purge_is_running));
+ rw_lock_s_unlock(&dict_operation_lock);
return(FALSE);
}
@@ -514,7 +518,7 @@ row_purge_parse_undo_rec(
if (clust_index == NULL) {
/* The table was corrupt in the data dictionary */
- rw_lock_x_unlock(&(purge_sys->purge_is_running));
+ rw_lock_s_unlock(&dict_operation_lock);
return(FALSE);
}
@@ -573,6 +577,8 @@ row_purge(
} else {
purge_needed = row_purge_parse_undo_rec(node, &updated_extern,
thr);
+ /* If purge_needed == TRUE, we must also remember to unlock
+ dict_operation_lock! */
}
if (purge_needed) {
@@ -594,7 +600,7 @@ row_purge(
btr_pcur_close(&(node->pcur));
}
- rw_lock_x_unlock(&(purge_sys->purge_is_running));
+ rw_lock_s_unlock(&dict_operation_lock);
}
/* Do some cleanup */
diff --git a/innobase/row/row0sel.c b/innobase/row/row0sel.c
index 4af04251996..fcf48dd15cf 100644
--- a/innobase/row/row0sel.c
+++ b/innobase/row/row0sel.c
@@ -606,7 +606,7 @@ row_sel_get_clust_rec(
/* Try to place a lock on the index record */
err = lock_clust_rec_read_check_and_lock(0, clust_rec, index,
- node->row_lock_mode, thr);
+ node->row_lock_mode, LOCK_ORDINARY, thr);
if (err != DB_SUCCESS) {
return(err);
@@ -621,7 +621,7 @@ row_sel_get_clust_rec(
node->read_view)) {
err = row_sel_build_prev_vers(node->read_view, plan,
- clust_rec, &old_vers, mtr);
+ clust_rec, &old_vers, mtr);
if (err != DB_SUCCESS) {
return(err);
@@ -678,16 +678,17 @@ sel_set_rec_lock(
rec_t* rec, /* in: record */
dict_index_t* index, /* in: index */
ulint mode, /* in: lock mode */
+ ulint type, /* in: LOCK_ORDINARY, LOCK_GAP, or LOCK_REC_NOT_GAP */
que_thr_t* thr) /* in: query thread */
{
ulint err;
if (index->type & DICT_CLUSTERED) {
err = lock_clust_rec_read_check_and_lock(0, rec, index, mode,
- thr);
+ type, thr);
} else {
err = lock_sec_rec_read_check_and_lock(0, rec, index, mode,
- thr);
+ type, thr);
}
return(err);
@@ -1154,7 +1155,7 @@ rec_loop:
if (!consistent_read) {
err = sel_set_rec_lock(page_rec_get_next(rec), index,
- node->row_lock_mode, thr);
+ node->row_lock_mode, LOCK_ORDINARY, thr);
if (err != DB_SUCCESS) {
/* Note that in this case we will store in pcur
the PREDECESSOR of the record we are waiting
@@ -1180,8 +1181,8 @@ rec_loop:
if (!consistent_read) {
/* Try to place a lock on the index record */
- err = sel_set_rec_lock(rec, index, node->row_lock_mode, thr);
-
+ err = sel_set_rec_lock(rec, index, node->row_lock_mode,
+ LOCK_ORDINARY, thr);
if (err != DB_SUCCESS) {
goto lock_wait_or_error;
@@ -2200,6 +2201,7 @@ row_sel_get_clust_rec_for_mysql(
rec_t* old_vers;
ulint err;
trx_t* trx;
+ char err_buf[1000];
*out_rec = NULL;
@@ -2213,14 +2215,40 @@ row_sel_get_clust_rec_for_mysql(
clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur);
- ut_ad(page_rec_is_user_rec(clust_rec));
+ if (!page_rec_is_user_rec(clust_rec)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: error clustered record for sec rec not found\n"
+ "InnoDB: index %s table %s\n", sec_index->name,
+ sec_index->table->name);
+
+ rec_sprintf(err_buf, 900, rec);
+ fprintf(stderr, "InnoDB: sec index record %s\n", err_buf);
+
+ rec_sprintf(err_buf, 900, clust_rec);
+ fprintf(stderr, "InnoDB: clust index record %s\n", err_buf);
+
+ trx_print(err_buf, trx);
+
+ fprintf(stderr,
+ "%s\nInnoDB: Make a detailed bug report and send it\n",
+ err_buf);
+ fprintf(stderr, "InnoDB: to mysql@lists.mysql.com\n");
+
+ clust_rec = NULL;
+
+ goto func_exit;
+ }
if (prebuilt->select_lock_type != LOCK_NONE) {
- /* Try to place a lock on the index record */
+ /* Try to place a lock on the index record; we are searching
+ the clust rec with a unique condition, hence
+ we set a LOCK_REC_NOT_GAP type lock */
err = lock_clust_rec_read_check_and_lock(0, clust_rec,
clust_index,
- prebuilt->select_lock_type, thr);
+ prebuilt->select_lock_type,
+ LOCK_REC_NOT_GAP, thr);
if (err != DB_SUCCESS) {
return(err);
@@ -2232,8 +2260,12 @@ row_sel_get_clust_rec_for_mysql(
trx = thr_get_trx(thr);
old_vers = NULL;
-
- if (!lock_clust_rec_cons_read_sees(clust_rec, clust_index,
+
+ /* If the isolation level allows reading of uncommitted data,
+ then we never look for an earlier version */
+
+ if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
+ && !lock_clust_rec_cons_read_sees(clust_rec, clust_index,
trx->read_view)) {
err = row_sel_build_prev_vers_for_mysql(
@@ -2275,6 +2307,7 @@ row_sel_get_clust_rec_for_mysql(
}
}
+func_exit:
*out_rec = clust_rec;
if (prebuilt->select_lock_type == LOCK_X) {
@@ -2407,7 +2440,7 @@ row_sel_push_cache_row_for_mysql(
/*************************************************************************
Tries to do a shortcut to fetch a clustered index record with a unique key,
using the hash index if possible (not always). We assume that the search
-mode is PAGE_CUR_GE, it is a consistent read, trx has already a read view,
+mode is PAGE_CUR_GE, it is a consistent read, there is a read view in trx,
btr search latch has been locked in S-mode. */
static
ulint
@@ -2426,7 +2459,7 @@ row_sel_try_search_shortcut_for_mysql(
ut_ad(index->type & DICT_CLUSTERED);
ut_ad(!prebuilt->templ_contains_blob);
-
+
btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
BTR_SEARCH_LEAF, pcur,
#ifndef UNIV_SEARCH_DEBUG
@@ -2516,17 +2549,22 @@ row_search_for_mysql(
ibool was_lock_wait;
ulint ret;
ulint shortcut;
+ ibool unique_search = FALSE;
ibool unique_search_from_clust_index = FALSE;
ibool mtr_has_extra_clust_latch = FALSE;
ibool moves_up = FALSE;
+ ibool set_also_gap_locks = TRUE;
+ /* if the query is a plain
+ locking SELECT, and the isolation
+ level is <= TRX_ISO_READ_COMMITTED,
+ then this is set to FALSE */
+ ibool success;
ulint cnt = 0;
mtr_t mtr;
ut_ad(index && pcur && search_tuple);
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
-
- ut_ad(sync_thread_levels_empty_gen(FALSE));
-
+
if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) {
fprintf(stderr,
"InnoDB: Error: trying to free a corrupt\n"
@@ -2543,6 +2581,9 @@ row_search_for_mysql(
printf("N tables locked %lu\n", trx->mysql_n_tables_locked);
*/
+ /*-------------------------------------------------------------*/
+ /* PHASE 1: Try to pop the row from the prefetch cache */
+
if (direction == 0) {
trx->op_info = (char *) "starting index read";
@@ -2608,18 +2649,35 @@ row_search_for_mysql(
mtr_start(&mtr);
- /* Since we must release the search system latch when we retrieve an
- externally stored field, we cannot use the adaptive hash index in a
- search in the case the row may be long and there may be externally
- stored fields */
+ /* In a search where at most one record in the index may match, we
+ can use a LOCK_REC_NOT_GAP type record lock when locking a non-delete
+ marked matching record.
+
+ Note that in a unique secondary index there may be different delete
+ marked versions of a record where only the primary key values differ:
+ thus in a secondary index we must use next-key locks when locking
+ delete marked records. */
if (match_mode == ROW_SEL_EXACT
- && index->type & DICT_UNIQUE
- && index->type & DICT_CLUSTERED
- && !prebuilt->templ_contains_blob
- && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)
- && dtuple_get_n_fields(search_tuple)
+ && index->type & DICT_UNIQUE
+ && dtuple_get_n_fields(search_tuple)
== dict_index_get_n_unique(index)) {
+ unique_search = TRUE;
+ }
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 2: Try fast adaptive hash index search if possible */
+
+ /* Next test if this is the special case where we can use the fast
+ adaptive hash index to try the search. Since we must release the
+ search system latch when we retrieve an externally stored field, we
+ cannot use the adaptive hash index in a search where the row
+ may be long and there may be externally stored fields */
+
+ if (unique_search
+ && index->type & DICT_CLUSTERED
+ && !prebuilt->templ_contains_blob
+ && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) {
if (direction == ROW_SEL_NEXT) {
/* MySQL sometimes seems to do fetch next even
@@ -2642,8 +2700,9 @@ row_search_for_mysql(
unique_search_from_clust_index = TRUE;
- if (trx->mysql_n_tables_locked == 0
- && !prebuilt->sql_stat_start) {
+ if (prebuilt->select_lock_type == LOCK_NONE
+ && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
+ && trx->read_view) {
/* This is a SELECT query done as a consistent read,
and the read view has already been allocated:
@@ -2722,13 +2781,34 @@ row_search_for_mysql(
mtr_start(&mtr);
}
}
-no_shortcut:
+
+no_shortcut:
+ /*-------------------------------------------------------------*/
+ /* PHASE 3: Open or restore index cursor position */
+
if (trx->has_search_latch) {
rw_lock_s_unlock(&btr_search_latch);
trx->has_search_latch = FALSE;
}
trx_start_if_not_started(trx);
+
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
+ && prebuilt->select_lock_type != LOCK_NONE
+ && trx->mysql_query_str) {
+
+ /* Scan the MySQL query string; check if SELECT is the first
+ word there */
+
+ dict_accept(*trx->mysql_query_str, "SELECT", &success);
+
+ if (success) {
+ /* It is a plain locking SELECT and the isolation
+ level is low: do not lock gaps */
+
+ set_also_gap_locks = FALSE;
+ }
+ }
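
The block above only needs to know whether the statement text begins with the keyword SELECT; inside InnoDB that scan is done by dict_accept(). As a rough illustration of the idea only (is_plain_select() below is an invented stand-in, not the dict_accept() implementation), such a check could look like:

    #include <ctype.h>

    /* Invented, simplified stand-in for the dict_accept() call above:
    returns 1 if the query string starts with the keyword SELECT,
    ignoring leading whitespace, and 0 otherwise */
    static int
    is_plain_select(const char* query)
    {
            int     i;

            while (*query && isspace((unsigned char) *query)) {
                    query++;
            }

            for (i = 0; i < 6; i++) {
                    if (toupper((unsigned char) query[i]) != "SELECT"[i]) {
                            return(0);
                    }
            }

            /* The keyword must end here, so that e.g. SELECTED does
            not match */
            return(query[6] == '\0' || isspace((unsigned char) query[6]));
    }

In the patch the answer only decides whether gap locking is switched off; when the word is not found, set_also_gap_locks simply stays TRUE.
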
/* Note that if the search mode was GE or G, then the cursor
naturally moves upward (in fetch next) in alphabetical order,
@@ -2793,8 +2873,10 @@ no_shortcut:
prebuilt->sql_stat_start = FALSE;
}
- /*-------------------------------------------------------------*/
rec_loop:
+ /*-------------------------------------------------------------*/
+ /* PHASE 4: Look for matching records in a loop */
+
cons_read_requires_clust_rec = FALSE;
rec = btr_pcur_get_rec(pcur);
@@ -2812,22 +2894,24 @@ rec_loop:
goto next_rec;
}
-
- if (prebuilt->select_lock_type != LOCK_NONE) {
- /* Try to place a lock on the index record */
- err = sel_set_rec_lock(rec, index, prebuilt->select_lock_type,
- thr);
- if (err != DB_SUCCESS) {
+ if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
- goto lock_wait_or_error;
- }
- }
+ if (prebuilt->select_lock_type != LOCK_NONE
+ && set_also_gap_locks) {
- if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
+ /* Try to place a lock on the index record */
+
+ err = sel_set_rec_lock(rec, index,
+ prebuilt->select_lock_type,
+ LOCK_ORDINARY, thr);
+ if (err != DB_SUCCESS) {
+ goto lock_wait_or_error;
+ }
+ }
/* A page supremum record cannot be in the result set: skip
- it now when we have placed a possible lock on it */
+ it now that we have placed a possible lock on it */
goto next_rec;
}
@@ -2850,6 +2934,19 @@ rec_loop:
if (0 != cmp_dtuple_rec(search_tuple, rec)) {
+ if (prebuilt->select_lock_type != LOCK_NONE
+ && set_also_gap_locks) {
+ /* Try to place a lock on the index record */
+
+ err = sel_set_rec_lock(rec, index,
+ prebuilt->select_lock_type,
+ LOCK_GAP, thr);
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+ }
+
btr_pcur_store_position(pcur, &mtr);
ret = DB_RECORD_NOT_FOUND;
@@ -2862,6 +2959,19 @@ rec_loop:
if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec)) {
+ if (prebuilt->select_lock_type != LOCK_NONE
+ && set_also_gap_locks) {
+ /* Try to place a lock on the index record */
+
+ err = sel_set_rec_lock(rec, index,
+ prebuilt->select_lock_type,
+ LOCK_GAP, thr);
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+ }
+
btr_pcur_store_position(pcur, &mtr);
ret = DB_RECORD_NOT_FOUND;
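
Taken together, the record-locking hunks above and below pick the lock type from the search context. The sketch that follows is only a condensed restatement of that decision: the helper and the SEL_NO_LOCK marker are invented for illustration, while ulint, ibool and the LOCK_* constants are the ones from the InnoDB headers (univ.i, lock0lock.h); the real code calls sel_set_rec_lock() directly at each site.

    #define SEL_NO_LOCK     ((ulint) 999999)        /* "place no lock at all" */

    static ulint
    sel_choose_lock_type(
            ibool   set_also_gap_locks,     /* FALSE for a plain locking SELECT
                                            at isolation <= READ COMMITTED */
            ibool   unique_search,          /* exact match on a unique key */
            ibool   rec_is_supremum,        /* cursor is on the page supremum */
            ibool   rec_matches,            /* record belongs to the result set */
            ibool   rec_is_del_marked)      /* record carries the delete mark */
    {
            if (rec_is_supremum || !rec_matches) {
                    /* Such records are locked only to protect the gap in
                    front of them against phantom inserts: skip the lock
                    entirely when gap locking is disabled */
                    if (!set_also_gap_locks) {
                            return(SEL_NO_LOCK);
                    }

                    return(rec_is_supremum ? LOCK_ORDINARY : LOCK_GAP);
            }

            if (!set_also_gap_locks
                || (unique_search && !rec_is_del_marked)) {
                    /* Locking just the record is enough */
                    return(LOCK_REC_NOT_GAP);
            }

            /* Next-key lock; in a unique secondary index delete marked
            versions of a record differ only in the primary key, so the
            gap must still be protected */
            return(LOCK_ORDINARY);
    }
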
@@ -2874,16 +2984,39 @@ rec_loop:
/* We are ready to look at a possible new index entry in the result
set: the cursor is now placed on a user record */
- /* Get the right version of the row in a consistent read */
-
- if (prebuilt->select_lock_type == LOCK_NONE) {
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* Try to place a lock on the index record; note that delete
+ marked records are a special case in a unique search. If there
+ is a non-delete marked record, then it is enough to lock its
+ existence with LOCK_REC_NOT_GAP. */
+
+ if (!set_also_gap_locks
+ || (unique_search && !rec_get_deleted_flag(rec))) {
+ err = sel_set_rec_lock(rec, index,
+ prebuilt->select_lock_type,
+ LOCK_REC_NOT_GAP, thr);
+ } else {
+ err = sel_set_rec_lock(rec, index,
+ prebuilt->select_lock_type,
+ LOCK_ORDINARY, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+ goto lock_wait_or_error;
+ }
+ } else {
/* This is a non-locking consistent read: if necessary, fetch
a previous version of the record */
cons_read_requires_clust_rec = FALSE;
- if (index == clust_index) {
+ if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
+
+ /* Do nothing: we let a non-locking SELECT read the
+ latest version of the record */
+
+ } else if (index == clust_index) {
if (!lock_clust_rec_cons_read_sees(rec, index,
trx->read_view)) {
@@ -3020,8 +3153,11 @@ got_row:
ret = DB_SUCCESS;
goto normal_return;
- /*-------------------------------------------------------------*/
+
next_rec:
+ /*-------------------------------------------------------------*/
+ /* PHASE 5: Move the cursor to the next index record */
+
if (mtr_has_extra_clust_latch) {
/* We must commit mtr if we are moving to the next
non-clustered index record, because we could break the
@@ -3064,8 +3200,10 @@ next_rec:
cnt++;
goto rec_loop;
- /*-------------------------------------------------------------*/
+
lock_wait_or_error:
+ /*-------------------------------------------------------------*/
+
btr_pcur_store_position(pcur, &mtr);
mtr_commit(&mtr);
@@ -3096,6 +3234,7 @@ lock_wait_or_error:
return(err);
normal_return:
+ /*-------------------------------------------------------------*/
que_thr_stop_for_mysql_no_error(thr, trx);
mtr_commit(&mtr);
@@ -3156,10 +3295,12 @@ row_search_check_if_query_cache_permitted(
ret = TRUE;
- /* Assign a read view for the transaction if it does not yet
- have one */
+ /* If the isolation level is high, assign a read view for the
+ transaction if it does not yet have one */
+
+ if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ
+ && !trx->read_view) {
- if (!trx->read_view) {
trx->read_view = read_view_open_now(trx,
trx->read_view_heap);
}
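
Two of the hunks above encode the same consistent-read rule: under READ UNCOMMITTED the newest record version is always returned, while at the higher levels the transaction's read view decides whether an older version must be reconstructed. A rough summary of that rule as a predicate (the helper name is invented; the types and the two functions it calls are the ones visible in this diff):

    /* Hypothetical helper; the real code tests this inline in
    row_search_for_mysql() and row_sel_get_clust_rec_for_mysql() */
    static ibool
    sel_needs_prev_version(
            trx_t*          trx,            /* transaction doing a non-locking read */
            rec_t*          rec,            /* clustered index record found */
            dict_index_t*   clust_index)    /* the clustered index */
    {
            if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
                    /* Dirty reads are allowed: always return the newest
                    version, no read view is consulted */
                    return(FALSE);
            }

            /* At READ COMMITTED and above the record qualifies only if the
            transaction's read view can see it; otherwise an older version
            is built with row_sel_build_prev_vers_for_mysql() */
            return(!lock_clust_rec_cons_read_sees(rec, clust_index,
                                                  trx->read_view));
    }

The query-cache hunk right above applies the same threshold from the other side: a read view is pre-assigned only when the isolation level is REPEATABLE READ or stricter.
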
diff --git a/innobase/row/row0uins.c b/innobase/row/row0uins.c
index 9990f893432..fff67dcd627 100644
--- a/innobase/row/row0uins.c
+++ b/innobase/row/row0uins.c
@@ -254,7 +254,8 @@ row_undo_ins_parse_undo_rec(
node->table = dict_table_get_on_id(table_id, node->trx);
if (node->table == NULL) {
- return;
+
+ return;
}
clust_index = dict_table_get_first_index(node->table);
@@ -281,7 +282,7 @@ row_undo_ins(
ut_ad(node && thr);
ut_ad(node->state == UNDO_NODE_INSERT);
-
+
row_undo_ins_parse_undo_rec(node, thr);
if (node->table == NULL) {
@@ -292,6 +293,7 @@ row_undo_ins(
if (!found) {
trx_undo_rec_release(node->trx, node->undo_no);
+
return(DB_SUCCESS);
}
diff --git a/innobase/row/row0undo.c b/innobase/row/row0undo.c
index 5119254f405..b40d36533a4 100644
--- a/innobase/row/row0undo.c
+++ b/innobase/row/row0undo.c
@@ -211,7 +211,6 @@ row_undo(
if (node->state == UNDO_NODE_FETCH_NEXT) {
- /* The call below also starts &mtr */
node->undo_rec = trx_roll_pop_top_rec_of_trx(trx,
trx->roll_limit,
&roll_ptr,
@@ -254,6 +253,10 @@ row_undo(
}
}
+ /* Prevent DROP TABLE etc. while we are rolling back this row */
+
+ rw_lock_s_lock(&dict_operation_lock);
+
if (node->state == UNDO_NODE_INSERT) {
err = row_undo_ins(node, thr);
@@ -264,6 +267,8 @@ row_undo(
err = row_undo_mod(node, thr);
}
+ rw_lock_s_unlock(&dict_operation_lock);
+
/* Do some cleanup */
btr_pcur_close(&(node->pcur));
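
The S-latch added above pairs with dictionary operations taking the same latch in exclusive mode, so the rollback of a row cannot overlap a DROP TABLE on that table. A schematic of the two sides (the DDL-side lines are a simplified illustration, not a quote from the dictionary code):

    /* Undo side, as added to row_undo() above */
    rw_lock_s_lock(&dict_operation_lock);   /* shared: many rollbacks may
                                            proceed concurrently */
    err = (node->state == UNDO_NODE_INSERT)
            ? row_undo_ins(node, thr)
            : row_undo_mod(node, thr);
    rw_lock_s_unlock(&dict_operation_lock);

    /* DDL side (simplified): DROP TABLE etc. take the latch exclusively,
    so they wait until no row-level rollback is in progress */
    rw_lock_x_lock(&dict_operation_lock);
    /* ... remove the table from the dictionary and drop its indexes ... */
    rw_lock_x_unlock(&dict_operation_lock);
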
diff --git a/innobase/row/row0upd.c b/innobase/row/row0upd.c
index 25c82f39da9..0be4f901d16 100644
--- a/innobase/row/row0upd.c
+++ b/innobase/row/row0upd.c
@@ -79,7 +79,7 @@ ibool
row_upd_index_is_referenced(
/*========================*/
/* out: TRUE if referenced; NOTE that since
- we do not hold dict_foreign_key_check_lock
+ we do not hold dict_operation_lock
when leaving the function, it may be that
the referencing table has been dropped when
we leave this function: this function is only
@@ -95,8 +95,8 @@ row_upd_index_is_referenced(
return(FALSE);
}
- if (!trx->has_dict_foreign_key_check_lock) {
- rw_lock_s_lock(&dict_foreign_key_check_lock);
+ if (!trx->has_dict_operation_lock) {
+ rw_lock_s_lock(&dict_operation_lock);
}
foreign = UT_LIST_GET_FIRST(table->referenced_list);
@@ -104,8 +104,8 @@ row_upd_index_is_referenced(
while (foreign) {
if (foreign->referenced_index == index) {
- if (!trx->has_dict_foreign_key_check_lock) {
- rw_lock_s_unlock(&dict_foreign_key_check_lock);
+ if (!trx->has_dict_operation_lock) {
+ rw_lock_s_unlock(&dict_operation_lock);
}
return(TRUE);
@@ -114,8 +114,8 @@ row_upd_index_is_referenced(
foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
}
- if (!trx->has_dict_foreign_key_check_lock) {
- rw_lock_s_unlock(&dict_foreign_key_check_lock);
+ if (!trx->has_dict_operation_lock) {
+ rw_lock_s_unlock(&dict_operation_lock);
}
return(FALSE);
@@ -162,12 +162,12 @@ row_upd_check_references_constraints(
mtr_start(mtr);
- if (!trx->has_dict_foreign_key_check_lock) {
+ if (!trx->has_dict_operation_lock) {
got_s_lock = TRUE;
- rw_lock_s_lock(&dict_foreign_key_check_lock);
+ rw_lock_s_lock(&dict_operation_lock);
- trx->has_dict_foreign_key_check_lock = TRUE;
+ trx->has_dict_operation_lock = TRUE;
}
foreign = UT_LIST_GET_FIRST(table->referenced_list);
@@ -189,7 +189,7 @@ row_upd_check_references_constraints(
}
/* NOTE that if the thread ends up waiting for a lock
- we will release dict_foreign_key_check_lock
+ we will release dict_operation_lock
temporarily! But the counter on the table
protects 'foreign' from being dropped while the check
is running. */
@@ -212,8 +212,8 @@ row_upd_check_references_constraints(
if (err != DB_SUCCESS) {
if (got_s_lock) {
rw_lock_s_unlock(
- &dict_foreign_key_check_lock);
- trx->has_dict_foreign_key_check_lock
+ &dict_operation_lock);
+ trx->has_dict_operation_lock
= FALSE;
}
@@ -227,8 +227,8 @@ row_upd_check_references_constraints(
}
if (got_s_lock) {
- rw_lock_s_unlock(&dict_foreign_key_check_lock);
- trx->has_dict_foreign_key_check_lock = FALSE;
+ rw_lock_s_unlock(&dict_operation_lock);
+ trx->has_dict_operation_lock = FALSE;
}
mem_heap_free(heap);
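
The dict_foreign_key_check_lock to dict_operation_lock rename widens the latch to cover all dictionary operations, but the acquisition pattern above is unchanged: take the S-latch only when the transaction does not already hold it, and release only what was taken here. Reduced to its skeleton (illustrative, with the foreign key check itself elided):

    ibool   got_s_lock      = FALSE;

    if (!trx->has_dict_operation_lock) {
            got_s_lock = TRUE;

            rw_lock_s_lock(&dict_operation_lock);
            trx->has_dict_operation_lock = TRUE;
    }

    /* ... scan table->referenced_list and run the foreign key check;
    a lock wait inside it may release the latch temporarily ... */

    if (got_s_lock) {
            rw_lock_s_unlock(&dict_operation_lock);
            trx->has_dict_operation_lock = FALSE;
    }
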
diff --git a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c
index d754f603efc..11e45df4ce3 100644
--- a/innobase/srv/srv0srv.c
+++ b/innobase/srv/srv0srv.c
@@ -135,8 +135,6 @@ byte srv_latin1_ordering[256] /* The sort order table of the latin1
, 0x44, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x5D, 0xF7
, 0xD8, 0x55, 0x55, 0x55, 0x59, 0x59, 0xDE, 0xFF
};
-
-ibool srv_use_native_aio = FALSE;
ulint srv_pool_size = ULINT_MAX; /* size in database pages;
MySQL originally sets this
@@ -151,8 +149,9 @@ dulint srv_archive_recovery_limit_lsn;
ulint srv_lock_wait_timeout = 1024 * 1024 * 1024;
-char* srv_unix_file_flush_method_str = NULL;
-ulint srv_unix_file_flush_method = 0;
+char* srv_file_flush_method_str = NULL;
+ulint srv_unix_file_flush_method = SRV_UNIX_FDATASYNC;
+ulint srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
/* If the following is != 0 we do not allow inserts etc. This protects
the user from forgetting the innodb_force_recovery keyword to my.cnf */
@@ -281,6 +280,9 @@ time_t srv_last_monitor_time;
mutex_t srv_innodb_monitor_mutex;
+ulint srv_main_thread_process_no = 0;
+ulint srv_main_thread_id = 0;
+
/*
IMPLEMENTATION OF THE SERVER MAIN PROGRAM
=========================================
@@ -2046,13 +2048,15 @@ srv_table_reserve_slot_for_mysql(void)
}
/*******************************************************************
-Puts a MySQL OS thread to wait for a lock to be released. */
+Puts a MySQL OS thread to wait for a lock to be released. If an error
+occurs during the wait, the trx->error_state associated with thr is
+!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
+are possible errors. DB_DEADLOCK is returned if selective deadlock
+resolution chose this transaction as a victim. */
-ibool
+void
srv_suspend_mysql_thread(
/*=====================*/
- /* out: TRUE if the lock wait timeout was
- exceeded */
que_thr_t* thr) /* in: query thread associated with the MySQL
OS thread */
{
@@ -2069,13 +2073,15 @@ srv_suspend_mysql_thread(
mutex_enter(&kernel_mutex);
+ trx->error_state = DB_SUCCESS;
+
if (thr->state == QUE_THR_RUNNING) {
/* The lock has already been released: no need to suspend */
mutex_exit(&kernel_mutex);
- return(FALSE);
+ return;
}
slot = srv_table_reserve_slot_for_mysql();
@@ -2101,18 +2107,18 @@ srv_suspend_mysql_thread(
srv_conc_force_exit_innodb(thr_get_trx(thr));
/* Release possible foreign key check latch */
- if (trx->has_dict_foreign_key_check_lock) {
+ if (trx->has_dict_operation_lock) {
- rw_lock_s_unlock(&dict_foreign_key_check_lock);
+ rw_lock_s_unlock(&dict_operation_lock);
}
/* Wait for the release */
os_event_wait(event);
- if (trx->has_dict_foreign_key_check_lock) {
+ if (trx->has_dict_operation_lock) {
- rw_lock_s_lock(&dict_foreign_key_check_lock);
+ rw_lock_s_lock(&dict_operation_lock);
}
/* Return back inside InnoDB */
@@ -2131,10 +2137,9 @@ srv_suspend_mysql_thread(
if (srv_lock_wait_timeout < 100000000 &&
wait_time > (double)srv_lock_wait_timeout) {
- return(TRUE);
- }
- return(FALSE);
+ trx->error_state = DB_LOCK_WAIT_TIMEOUT;
+ }
}
/************************************************************************
@@ -2300,9 +2305,19 @@ srv_sprintf_innodb_monitor(
"ROW OPERATIONS\n"
"--------------\n");
buf += sprintf(buf,
- "%ld queries inside InnoDB, %ld queries in queue; main thread: %s\n",
- srv_conc_n_threads, srv_conc_n_waiting_threads,
+ "%ld queries inside InnoDB, %ld queries in queue\n",
+ srv_conc_n_threads, srv_conc_n_waiting_threads);
+#ifdef UNIV_LINUX
+ buf += sprintf(buf,
+ "Main thread process no %lu, state: %s\n",
+ srv_main_thread_process_no,
+ srv_main_thread_op_info);
+#else
+ buf += sprintf(buf,
+ "Main thread id %lu, state: %s\n",
+ srv_main_thread_id,
srv_main_thread_op_info);
+#endif
buf += sprintf(buf,
"Number of rows inserted %lu, updated %lu, deleted %lu, read %lu\n",
srv_n_rows_inserted,
@@ -2636,6 +2651,9 @@ srv_master_thread(
UT_NOT_USED(arg);
+ srv_main_thread_process_no = os_proc_get_number();
+ srv_main_thread_id = os_thread_pf(os_thread_get_curr_id());
+
srv_table_reserve_slot(SRV_MASTER);
mutex_enter(&kernel_mutex);
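
Since srv_suspend_mysql_thread() now returns void, the timeout and deadlock outcomes travel through trx->error_state instead of the old ibool return value. A hypothetical caller-side check (convert_error() is invented for the sketch; thr_get_trx() and the error codes are the ones used in this diff):

    srv_suspend_mysql_thread(thr);          /* blocks until the lock is
                                            granted or an error is set */

    err = thr_get_trx(thr)->error_state;

    if (err == DB_LOCK_WAIT_TIMEOUT) {
            /* innodb_lock_wait_timeout expired during the wait */

            return(convert_error(err));
    }

    if (err == DB_DEADLOCK) {
            /* selective deadlock resolution chose this transaction as
            the victim: the whole transaction should be rolled back */

            return(convert_error(err));
    }
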
diff --git a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c
index dfa122b2ece..d6d610bb5b8 100644
--- a/innobase/srv/srv0start.c
+++ b/innobase/srv/srv0start.c
@@ -515,7 +515,7 @@ srv_calc_high32(
}
/*************************************************************************
-Creates or opens the log files. */
+Creates or opens the log files and closes them. */
static
ulint
open_or_create_log_file(
@@ -640,7 +640,7 @@ open_or_create_log_file(
}
/*************************************************************************
-Creates or opens database data files. */
+Creates or opens database data files and closes them. */
static
ulint
open_or_create_data_files(
@@ -965,31 +965,63 @@ innobase_start_or_create_for_mysql(void)
srv_is_being_started = TRUE;
srv_startup_is_before_trx_rollback_phase = TRUE;
+ os_aio_use_native_aio = FALSE;
+
+#ifdef __WIN__
+ if (os_get_os_version() == OS_WIN95
+ || os_get_os_version() == OS_WIN31
+ || os_get_os_version() == OS_WINNT) {
+
+ /* On Win 95, 98, ME, the Win32 subsystem for Windows 3.1,
+ and NT, use simulated aio. On NT, Windows does provide async i/o,
+ but when run in conjunction with InnoDB Hot Backup, it seemed
+ to corrupt the data files. */
+
+ os_aio_use_native_aio = FALSE;
+ } else {
+ /* On Win 2000 and XP use async i/o */
+ os_aio_use_native_aio = TRUE;
+ }
+#endif
+ if (srv_file_flush_method_str == NULL) {
+ /* These are the default options */
+
+ srv_unix_file_flush_method = SRV_UNIX_FDATASYNC;
- if (0 == ut_strcmp(srv_unix_file_flush_method_str, "fdatasync")) {
+ srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+#ifndef __WIN__
+ } else if (0 == ut_strcmp(srv_file_flush_method_str, "fdatasync")) {
srv_unix_file_flush_method = SRV_UNIX_FDATASYNC;
- } else if (0 == ut_strcmp(srv_unix_file_flush_method_str, "O_DSYNC")) {
+ } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DSYNC")) {
srv_unix_file_flush_method = SRV_UNIX_O_DSYNC;
- } else if (0 == ut_strcmp(srv_unix_file_flush_method_str,
+ } else if (0 == ut_strcmp(srv_file_flush_method_str,
"littlesync")) {
srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC;
- } else if (0 == ut_strcmp(srv_unix_file_flush_method_str, "nosync")) {
+ } else if (0 == ut_strcmp(srv_file_flush_method_str, "nosync")) {
srv_unix_file_flush_method = SRV_UNIX_NOSYNC;
+#else
+ } else if (0 == ut_strcmp(srv_file_flush_method_str, "normal")) {
+ srv_win_file_flush_method = SRV_WIN_IO_NORMAL;
+ os_aio_use_native_aio = FALSE;
+
+ } else if (0 == ut_strcmp(srv_file_flush_method_str, "unbuffered")) {
+ srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+ os_aio_use_native_aio = FALSE;
+
+ } else if (0 == ut_strcmp(srv_file_flush_method_str,
+ "async_unbuffered")) {
+ srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+#endif
} else {
fprintf(stderr,
"InnoDB: Unrecognized value %s for innodb_flush_method\n",
- srv_unix_file_flush_method_str);
+ srv_file_flush_method_str);
return(DB_ERROR);
}
- /*
- printf("srv_unix set to %lu\n", srv_unix_file_flush_method);
- */
- os_aio_use_native_aio = srv_use_native_aio;
-
err = srv_boot();
if (err != DB_SUCCESS) {
@@ -999,34 +1031,15 @@ innobase_start_or_create_for_mysql(void)
/* Restrict the maximum number of file i/o threads */
if (srv_n_file_io_threads > SRV_MAX_N_IO_THREADS) {
+
srv_n_file_io_threads = SRV_MAX_N_IO_THREADS;
}
-#if !(defined(WIN_ASYNC_IO) || defined(POSIX_ASYNC_IO))
- /* In simulated aio we currently have use only for 4 threads */
-
- os_aio_use_native_aio = FALSE;
-
- srv_n_file_io_threads = 4;
-#endif
-
-#ifdef __WIN__
- if (os_get_os_version() == OS_WIN95
- || os_get_os_version() == OS_WIN31) {
+ if (!os_aio_use_native_aio) {
+ /* In simulated aio we currently have a use for only 4 threads */
- /* On Win 95, 98, ME, and Win32 subsystem for Windows 3.1 use
- simulated aio */
+ srv_n_file_io_threads = 4;
- os_aio_use_native_aio = FALSE;
- srv_n_file_io_threads = 4;
- } else {
- /* On NT and Win 2000 always use aio */
- os_aio_use_native_aio = TRUE;
- }
-#endif
- os_aio_use_native_aio = FALSE;
-
- if (!os_aio_use_native_aio) {
os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD
* srv_n_file_io_threads,
srv_n_file_io_threads,
@@ -1047,15 +1060,6 @@ innobase_start_or_create_for_mysql(void)
lock_sys_create(srv_lock_table_size);
-#ifdef POSIX_ASYNC_IO
- if (os_aio_use_native_aio) {
- /* There is only one thread per async io array:
- one for ibuf i/o, one for log i/o, one for ordinary reads,
- one for ordinary writes; we need only 4 i/o threads */
-
- srv_n_file_io_threads = 4;
- }
-#endif
/* Create i/o-handler threads: */
for (i = 0; i < srv_n_file_io_threads; i++) {
diff --git a/innobase/sync/sync0rw.c b/innobase/sync/sync0rw.c
index fe837b119f3..b214bca0470 100644
--- a/innobase/sync/sync0rw.c
+++ b/innobase/sync/sync0rw.c
@@ -663,7 +663,8 @@ rw_lock_own(
/*========*/
/* out: TRUE if locked */
rw_lock_t* lock, /* in: rw-lock */
- ulint lock_type) /* in: lock type */
+ ulint lock_type) /* in: lock type: RW_LOCK_SHARED,
+ RW_LOCK_EX */
{
rw_lock_debug_t* info;
diff --git a/innobase/sync/sync0sync.c b/innobase/sync/sync0sync.c
index 3ea996afd6b..376be2e723a 100644
--- a/innobase/sync/sync0sync.c
+++ b/innobase/sync/sync0sync.c
@@ -901,8 +901,7 @@ sync_thread_levels_empty_gen(
if (slot->latch != NULL && (!dict_mutex_allowed ||
(slot->level != SYNC_DICT
- && slot->level != SYNC_FOREIGN_KEY_CHECK
- && slot->level != SYNC_PURGE_IS_RUNNING))) {
+ && slot->level != SYNC_DICT_OPERATION))) {
lock = slot->latch;
mutex = slot->latch;
@@ -1087,12 +1086,10 @@ sync_thread_add_level(
SYNC_IBUF_PESS_INSERT_MUTEX));
} else if (level == SYNC_DICT_AUTOINC_MUTEX) {
ut_a(sync_thread_levels_g(array, SYNC_DICT_AUTOINC_MUTEX));
- } else if (level == SYNC_FOREIGN_KEY_CHECK) {
- ut_a(sync_thread_levels_g(array, SYNC_FOREIGN_KEY_CHECK));
+ } else if (level == SYNC_DICT_OPERATION) {
+ ut_a(sync_thread_levels_g(array, SYNC_DICT_OPERATION));
} else if (level == SYNC_DICT_HEADER) {
ut_a(sync_thread_levels_g(array, SYNC_DICT_HEADER));
- } else if (level == SYNC_PURGE_IS_RUNNING) {
- ut_a(sync_thread_levels_g(array, SYNC_PURGE_IS_RUNNING));
} else if (level == SYNC_DICT) {
ut_a(buf_debug_prints
|| sync_thread_levels_g(array, SYNC_DICT));
diff --git a/innobase/trx/trx0purge.c b/innobase/trx/trx0purge.c
index 97362d00b4b..d58240d3c11 100644
--- a/innobase/trx/trx0purge.c
+++ b/innobase/trx/trx0purge.c
@@ -209,9 +209,6 @@ trx_purge_sys_create(void)
purge_sys->purge_undo_no = ut_dulint_zero;
purge_sys->next_stored = FALSE;
- rw_lock_create(&(purge_sys->purge_is_running));
- rw_lock_set_level(&(purge_sys->purge_is_running),
- SYNC_PURGE_IS_RUNNING);
rw_lock_create(&(purge_sys->latch));
rw_lock_set_level(&(purge_sys->latch), SYNC_PURGE_LATCH);
diff --git a/innobase/trx/trx0trx.c b/innobase/trx/trx0trx.c
index 994a6777924..7566fe1839e 100644
--- a/innobase/trx/trx0trx.c
+++ b/innobase/trx/trx0trx.c
@@ -23,7 +23,7 @@ Created 3/26/1996 Heikki Tuuri
#include "srv0srv.h"
#include "thr0loc.h"
#include "btr0sea.h"
-
+#include "os0proc.h"
/* Copy of the prototype for innobase_mysql_print_thd: this
copy MUST be equal to the one in mysql/sql/ha_innobase.cc ! */
@@ -85,12 +85,14 @@ trx_create(
trx->conc_state = TRX_NOT_STARTED;
trx->start_time = time(NULL);
+ trx->isolation_level = TRX_ISO_REPEATABLE_READ;
trx->check_foreigns = TRUE;
trx->check_unique_secondary = TRUE;
trx->dict_operation = FALSE;
trx->mysql_thd = NULL;
+ trx->mysql_query_str = NULL;
trx->n_mysql_tables_in_use = 0;
trx->mysql_n_tables_locked = 0;
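
Comparisons such as trx->isolation_level <= TRX_ISO_READ_COMMITTED and >= TRX_ISO_REPEATABLE_READ, used throughout this patch, assume the isolation levels are defined as increasing integers from weakest to strictest. The values below only illustrate that assumed ordering; the authoritative definitions are in trx0trx.h:

    #define TRX_ISO_READ_UNCOMMITTED        1       /* weakest: dirty reads allowed */
    #define TRX_ISO_READ_COMMITTED          2       /* no dirty reads; plain locking
                                                    SELECTs skip gap locks */
    #define TRX_ISO_REPEATABLE_READ         3       /* new MySQL/InnoDB default */
    #define TRX_ISO_SERIALIZABLE            4       /* strictest */
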
@@ -132,7 +134,7 @@ trx_create(
trx->lock_heap = mem_heap_create_in_buffer(256);
UT_LIST_INIT(trx->trx_locks);
- trx->has_dict_foreign_key_check_lock = FALSE;
+ trx->has_dict_operation_lock = FALSE;
trx->has_search_latch = FALSE;
trx->search_latch_timeout = BTR_SEA_TIMEOUT;
@@ -175,6 +177,8 @@ trx_allocate_for_mysql(void)
mutex_exit(&kernel_mutex);
trx->mysql_thread_id = os_thread_get_curr_id();
+
+ trx->mysql_process_no = os_proc_get_number();
return(trx);
}
@@ -1497,9 +1501,12 @@ trx_print(
default: buf += sprintf(buf, " state %lu", trx->conc_state);
}
+#ifdef UNIV_LINUX
+ buf += sprintf(buf, ", process no %lu", trx->mysql_process_no);
+#else
buf += sprintf(buf, ", OS thread id %lu",
os_thread_pf(trx->mysql_thread_id));
-
+#endif
if (ut_strlen(trx->op_info) > 0) {
buf += sprintf(buf, " %s", trx->op_info);
}