summaryrefslogtreecommitdiff
path: root/storage
diff options
context:
space:
mode:
Diffstat (limited to 'storage')
-rw-r--r--storage/innobase/btr/btr0btr.c39
-rw-r--r--storage/innobase/buf/buf0buf.c166
-rw-r--r--storage/innobase/buf/buf0flu.c88
-rw-r--r--storage/innobase/buf/buf0lru.c29
-rw-r--r--storage/innobase/dict/dict0crea.c13
-rw-r--r--storage/innobase/fil/fil0fil.c42
-rw-r--r--storage/innobase/include/btr0btr.h12
-rw-r--r--storage/innobase/include/buf0buf.h40
-rw-r--r--storage/innobase/include/buf0buf.ic19
-rw-r--r--storage/innobase/include/dict0crea.h6
-rw-r--r--storage/innobase/include/ut0lst.h41
-rw-r--r--storage/innobase/row/row0mysql.c8
12 files changed, 366 insertions, 137 deletions
diff --git a/storage/innobase/btr/btr0btr.c b/storage/innobase/btr/btr0btr.c
index 7a8e5626e25..50a349e78d6 100644
--- a/storage/innobase/btr/btr0btr.c
+++ b/storage/innobase/btr/btr0btr.c
@@ -1949,7 +1949,12 @@ btr_lift_page_up(
mtr_t* mtr) /* in: mtr */
{
page_t* father_page;
+ page_t* iter_page;
+ page_t* pages[BTR_MAX_LEVELS];
ulint page_level;
+ ulint root_page_no;
+ ulint ancestors;
+ ulint i;
ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
@@ -1959,6 +1964,30 @@ btr_lift_page_up(
btr_page_get_father_node_ptr(index, page, mtr));
page_level = btr_page_get_level(page, mtr);
+ root_page_no = dict_index_get_page(index);
+
+ ancestors = 1;
+ pages[0] = father_page;
+
+ /* Store all ancestor pages so we can reset their levels later on.
+ We have to do all the searches on the tree now because later on,
+ after we've replaced the first level, the tree is in an inconsistent
+ state and can not be searched. */
+ iter_page = father_page;
+ for (;;) {
+ if (buf_block_get_page_no(buf_block_align(iter_page))
+ == root_page_no) {
+
+ break;
+ }
+
+ ut_a(ancestors < BTR_MAX_LEVELS);
+
+ iter_page = buf_frame_align(
+ btr_page_get_father_node_ptr(index, iter_page, mtr));
+
+ pages[ancestors++] = iter_page;
+ }
btr_search_drop_page_hash_index(page);
@@ -1970,7 +1999,15 @@ btr_lift_page_up(
index, mtr);
lock_update_copy_and_discard(father_page, page);
- btr_page_set_level(father_page, page_level, mtr);
+ /* Go upward to root page, decreasing levels by one. */
+ for (i = 0; i < ancestors; i++) {
+ iter_page = pages[i];
+
+ ut_ad(btr_page_get_level(iter_page, mtr) == (page_level + 1));
+
+ btr_page_set_level(iter_page, page_level, mtr);
+ page_level++;
+ }
/* Free the file page */
btr_page_free(index, page, mtr);
diff --git a/storage/innobase/buf/buf0buf.c b/storage/innobase/buf/buf0buf.c
index 222acc015a6..99e365d6129 100644
--- a/storage/innobase/buf/buf0buf.c
+++ b/storage/innobase/buf/buf0buf.c
@@ -221,6 +221,9 @@ in the free list to the frames.
5) When we have AWE enabled, we disable adaptive hash indexes.
*/
+/* Value in microseconds */
+static const int WAIT_FOR_READ = 20000;
+
buf_pool_t* buf_pool = NULL; /* The buffer buf_pool of the database */
#ifdef UNIV_DEBUG
@@ -539,6 +542,8 @@ buf_block_init(
block->n_pointers = 0;
+ mutex_create(&block->mutex, SYNC_BUF_BLOCK);
+
rw_lock_create(&block->lock, SYNC_LEVEL_VARYING);
ut_ad(rw_lock_validate(&(block->lock)));
@@ -813,8 +818,15 @@ buf_awe_map_page_to_frame(
bck = UT_LIST_GET_LAST(buf_pool->awe_LRU_free_mapped);
while (bck) {
- if (bck->state == BUF_BLOCK_FILE_PAGE
- && (bck->buf_fix_count != 0 || bck->io_fix != 0)) {
+ ibool skip;
+
+ mutex_enter(&bck->mutex);
+
+ skip = (bck->state == BUF_BLOCK_FILE_PAGE
+ && (bck->buf_fix_count != 0 || bck->io_fix != 0));
+
+ if (skip) {
+ mutex_exit(&bck->mutex);
/* We have to skip this */
bck = UT_LIST_GET_PREV(awe_LRU_free_mapped, bck);
@@ -848,6 +860,8 @@ buf_awe_map_page_to_frame(
buf_pool->n_pages_awe_remapped++;
+ mutex_exit(&bck->mutex);
+
return;
}
}
@@ -886,13 +900,22 @@ buf_block_make_young(
/*=================*/
buf_block_t* block) /* in: block to make younger */
{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!mutex_own(&(buf_pool->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* Note that we read freed_page_clock's without holding any mutex:
+ this is allowed since the result is used only in heuristics */
+
if (buf_pool->freed_page_clock >= block->freed_page_clock
- + 1 + (buf_pool->curr_size / 1024)) {
+ + 1 + (buf_pool->curr_size / 4)) {
+ mutex_enter(&buf_pool->mutex);
/* There has been freeing activity in the LRU list:
best to move to the head of the LRU list */
buf_LRU_make_block_young(block);
+ mutex_exit(&buf_pool->mutex);
}
}
@@ -927,12 +950,16 @@ buf_block_free(
/*===========*/
buf_block_t* block) /* in, own: block to be freed */
{
- ut_a(block->state != BUF_BLOCK_FILE_PAGE);
-
mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&block->mutex);
+
+ ut_a(block->state != BUF_BLOCK_FILE_PAGE);
+
buf_LRU_block_free_non_file_page(block);
+ mutex_exit(&block->mutex);
+
mutex_exit(&(buf_pool->mutex));
}
@@ -1151,9 +1178,8 @@ buf_page_get_gen(
#endif
buf_pool->n_page_gets++;
loop:
- mutex_enter_fast(&(buf_pool->mutex));
-
block = NULL;
+ mutex_enter_fast(&(buf_pool->mutex));
if (guess) {
block = buf_block_align(guess);
@@ -1191,6 +1217,8 @@ loop:
goto loop;
}
+ mutex_enter(&block->mutex);
+
ut_a(block->state == BUF_BLOCK_FILE_PAGE);
must_read = FALSE;
@@ -1200,9 +1228,9 @@ loop:
must_read = TRUE;
if (mode == BUF_GET_IF_IN_POOL) {
-
/* The page is only being read to buffer */
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&buf_pool->mutex);
+ mutex_exit(&block->mutex);
return(NULL);
}
@@ -1226,7 +1254,7 @@ loop:
#else
buf_block_buf_fix_inc(block);
#endif
- buf_block_make_young(block);
+ mutex_exit(&buf_pool->mutex);
/* Check if this is the first access to the page */
@@ -1234,10 +1262,13 @@ loop:
block->accessed = TRUE;
+ mutex_exit(&block->mutex);
+
+ buf_block_make_young(block);
+
#ifdef UNIV_DEBUG_FILE_ACCESSES
ut_a(block->file_page_was_freed == FALSE);
#endif
- mutex_exit(&(buf_pool->mutex));
#ifdef UNIV_DEBUG
buf_dbg_counter++;
@@ -1262,13 +1293,14 @@ loop:
}
if (!success) {
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&block->mutex);
block->buf_fix_count--;
+
+ mutex_exit(&block->mutex);
#ifdef UNIV_SYNC_DEBUG
rw_lock_s_unlock(&(block->debug_latch));
#endif
- mutex_exit(&(buf_pool->mutex));
return(NULL);
}
@@ -1279,18 +1311,16 @@ loop:
completes */
for (;;) {
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&block->mutex);
if (block->io_fix == BUF_IO_READ) {
- mutex_exit(&(buf_pool->mutex));
-
- /* Sleep 20 milliseconds */
+ mutex_exit(&block->mutex);
- os_thread_sleep(20000);
+ os_thread_sleep(WAIT_FOR_READ);
} else {
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&block->mutex);
break;
}
@@ -1349,14 +1379,14 @@ buf_page_optimistic_get_func(
ut_ad(mtr && block);
ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
- mutex_enter(&(buf_pool->mutex));
-
/* If AWE is used, block may have a different frame now, e.g., NULL */
+ mutex_enter(&block->mutex);
+
if (UNIV_UNLIKELY(block->state != BUF_BLOCK_FILE_PAGE)
|| UNIV_UNLIKELY(block->frame != guess)) {
-exit_func:
- mutex_exit(&(buf_pool->mutex));
+
+ mutex_exit(&block->mutex);
return(FALSE);
}
@@ -1366,15 +1396,14 @@ exit_func:
#else
buf_block_buf_fix_inc(block);
#endif
- buf_block_make_young(block);
-
- /* Check if this is the first access to the page */
-
accessed = block->accessed;
-
block->accessed = TRUE;
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&block->mutex);
+
+ buf_block_make_young(block);
+
+ /* Check if this is the first access to the page */
ut_ad(!ibuf_inside() || ibuf_page(block->space, block->offset));
@@ -1389,13 +1418,16 @@ exit_func:
}
if (UNIV_UNLIKELY(!success)) {
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&block->mutex);
block->buf_fix_count--;
+
+ mutex_exit(&block->mutex);
+
#ifdef UNIV_SYNC_DEBUG
rw_lock_s_unlock(&(block->debug_latch));
#endif
- goto exit_func;
+ return(FALSE);
}
if (UNIV_UNLIKELY(!UT_DULINT_EQ(modify_clock, block->modify_clock))) {
@@ -1408,13 +1440,16 @@ exit_func:
rw_lock_x_unlock(&(block->lock));
}
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&block->mutex);
block->buf_fix_count--;
+
+ mutex_exit(&block->mutex);
+
#ifdef UNIV_SYNC_DEBUG
rw_lock_s_unlock(&(block->debug_latch));
#endif
- goto exit_func;
+ return(FALSE);
}
mtr_memo_push(mtr, block, fix_type);
@@ -1471,10 +1506,10 @@ buf_page_get_known_nowait(
ut_ad(mtr);
ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
- mutex_enter(&(buf_pool->mutex));
-
block = buf_block_align(guess);
+ mutex_enter(&block->mutex);
+
if (block->state == BUF_BLOCK_REMOVE_HASH) {
/* Another thread is just freeing the block from the LRU list
of the buffer pool: do not try to access this page; this
@@ -1483,7 +1518,7 @@ buf_page_get_known_nowait(
we have already removed it from the page address hash table
of the buffer pool. */
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&block->mutex);
return(FALSE);
}
@@ -1495,12 +1530,12 @@ buf_page_get_known_nowait(
#else
buf_block_buf_fix_inc(block);
#endif
+ mutex_exit(&block->mutex);
+
if (mode == BUF_MAKE_YOUNG) {
buf_block_make_young(block);
}
- mutex_exit(&(buf_pool->mutex));
-
ut_ad(!ibuf_inside() || (mode == BUF_KEEP_OLD));
if (rw_latch == RW_S_LATCH) {
@@ -1514,13 +1549,15 @@ buf_page_get_known_nowait(
}
if (!success) {
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&block->mutex);
block->buf_fix_count--;
+
+ mutex_exit(&block->mutex);
+
#ifdef UNIV_SYNC_DEBUG
rw_lock_s_unlock(&(block->debug_latch));
#endif
- mutex_exit(&(buf_pool->mutex));
return(FALSE);
}
@@ -1568,7 +1605,6 @@ buf_page_init_for_backup_restore(
block->offset = offset;
block->lock_hash_val = 0;
- block->lock_mutex = NULL;
block->freed_page_clock = 0;
@@ -1601,6 +1637,7 @@ buf_page_init(
{
#ifdef UNIV_SYNC_DEBUG
ut_ad(mutex_own(&(buf_pool->mutex)));
+ ut_ad(mutex_own(&(block->mutex)));
#endif /* UNIV_SYNC_DEBUG */
ut_a(block->state != BUF_BLOCK_FILE_PAGE);
@@ -1615,7 +1652,6 @@ buf_page_init(
block->index = NULL;
block->lock_hash_val = lock_rec_hash(space, offset);
- block->lock_mutex = NULL;
/* Insert into the hash table of file pages */
@@ -1709,6 +1745,7 @@ buf_page_init_for_read(
ut_a(block);
mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&block->mutex);
if (fil_tablespace_deleted_or_being_deleted_in_mem(
space, tablespace_version)) {
@@ -1722,7 +1759,9 @@ buf_page_init_for_read(
deleted or is being deleted, or the page is
already in buf_pool, return */
+ mutex_exit(&block->mutex);
mutex_exit(&(buf_pool->mutex));
+
buf_block_free(block);
if (mode == BUF_READ_IBUF_PAGES_ONLY) {
@@ -1742,6 +1781,7 @@ buf_page_init_for_read(
buf_LRU_add_block(block, TRUE); /* TRUE == to old blocks */
block->io_fix = BUF_IO_READ;
+
buf_pool->n_pend_reads++;
/* We set a pass-type x-lock on the frame because then the same
@@ -1753,6 +1793,7 @@ buf_page_init_for_read(
rw_lock_x_lock_gen(&(block->lock), BUF_IO_READ);
+ mutex_exit(&block->mutex);
mutex_exit(&(buf_pool->mutex));
if (mode == BUF_READ_IBUF_PAGES_ONLY) {
@@ -1817,6 +1858,8 @@ buf_page_create(
block = free_block;
+ mutex_enter(&block->mutex);
+
buf_page_init(space, offset, block);
/* The block must be put to the LRU list */
@@ -1827,13 +1870,15 @@ buf_page_create(
#else
buf_block_buf_fix_inc(block);
#endif
+ buf_pool->n_pages_created++;
+
+ mutex_exit(&(buf_pool->mutex));
+
mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX);
block->accessed = TRUE;
- buf_pool->n_pages_created++;
-
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&block->mutex);
/* Delete possible entries for the page from the insert buffer:
such can exist if the page belonged to an index which was dropped */
@@ -1885,6 +1930,12 @@ buf_page_io_complete(
ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+ /* We do not need protect block->io_fix here by block->mutex to read
+ it because this is the only function where we can change the value
+ from BUF_IO_READ or BUF_IO_WRITE to some other value, and our code
+ ensures that this is the only thread that handles the i/o for this
+ block. */
+
io_type = block->io_fix;
if (io_type == BUF_IO_READ) {
@@ -1986,11 +2037,12 @@ buf_page_io_complete(
}
}
+ mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&block->mutex);
+
#ifdef UNIV_IBUF_DEBUG
ut_a(ibuf_count_get(block->space, block->offset) == 0);
#endif
- mutex_enter(&(buf_pool->mutex));
-
/* Because this thread which does the unlocking is not the same that
did the locking, we use a pass value != 0 in unlock, which simply
removes the newest lock debug record, without checking the thread
@@ -2033,6 +2085,7 @@ buf_page_io_complete(
#endif /* UNIV_DEBUG */
}
+ mutex_exit(&block->mutex);
mutex_exit(&(buf_pool->mutex));
#ifdef UNIV_DEBUG
@@ -2095,6 +2148,8 @@ buf_validate(void)
block = buf_pool_get_nth_block(buf_pool, i);
+ mutex_enter(&block->mutex);
+
if (block->state == BUF_BLOCK_FILE_PAGE) {
ut_a(buf_page_hash_get(block->space,
@@ -2139,6 +2194,8 @@ buf_validate(void)
} else if (block->state == BUF_BLOCK_NOT_USED) {
n_free++;
}
+
+ mutex_exit(&block->mutex);
}
if (n_lru + n_free > buf_pool->curr_size) {
@@ -2286,9 +2343,14 @@ buf_get_latched_pages_number(void)
block = buf_pool_get_nth_block(buf_pool, i);
- if (((block->buf_fix_count != 0) || (block->io_fix != 0))
- && block->magic_n == BUF_BLOCK_MAGIC_N) {
- fixed_pages_number++;
+ if (block->magic_n == BUF_BLOCK_MAGIC_N) {
+ mutex_enter(&block->mutex);
+
+ if (block->buf_fix_count != 0 || block->io_fix != 0) {
+ fixed_pages_number++;
+ }
+
+ mutex_exit(&block->mutex);
}
}
@@ -2458,6 +2520,8 @@ buf_all_freed(void)
block = buf_pool_get_nth_block(buf_pool, i);
+ mutex_enter(&block->mutex);
+
if (block->state == BUF_BLOCK_FILE_PAGE) {
if (!buf_flush_ready_for_replace(block)) {
@@ -2469,6 +2533,8 @@ buf_all_freed(void)
ut_error;
}
}
+
+ mutex_exit(&block->mutex);
}
mutex_exit(&(buf_pool->mutex));
diff --git a/storage/innobase/buf/buf0flu.c b/storage/innobase/buf/buf0flu.c
index 8ef8c3b9358..49b81196a64 100644
--- a/storage/innobase/buf/buf0flu.c
+++ b/storage/innobase/buf/buf0flu.c
@@ -113,6 +113,7 @@ buf_flush_ready_for_replace(
{
#ifdef UNIV_SYNC_DEBUG
ut_ad(mutex_own(&(buf_pool->mutex)));
+ ut_ad(mutex_own(&block->mutex));
#endif /* UNIV_SYNC_DEBUG */
if (block->state != BUF_BLOCK_FILE_PAGE) {
ut_print_timestamp(stderr);
@@ -148,6 +149,7 @@ buf_flush_ready_for_flush(
{
#ifdef UNIV_SYNC_DEBUG
ut_ad(mutex_own(&(buf_pool->mutex)));
+ ut_ad(mutex_own(&(block->mutex)));
#endif /* UNIV_SYNC_DEBUG */
ut_a(block->state == BUF_BLOCK_FILE_PAGE);
@@ -559,8 +561,15 @@ buf_flush_try_page(
ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);
+ if (!block) {
+ mutex_exit(&(buf_pool->mutex));
+ return(0);
+ }
+
+ mutex_enter(&block->mutex);
+
if (flush_type == BUF_FLUSH_LIST
- && block && buf_flush_ready_for_flush(block, flush_type)) {
+ && buf_flush_ready_for_flush(block, flush_type)) {
block->io_fix = BUF_IO_WRITE;
@@ -598,6 +607,7 @@ buf_flush_try_page(
locked = TRUE;
}
+ mutex_exit(&block->mutex);
mutex_exit(&(buf_pool->mutex));
if (!locked) {
@@ -618,7 +628,7 @@ buf_flush_try_page(
return(1);
- } else if (flush_type == BUF_FLUSH_LRU && block
+ } else if (flush_type == BUF_FLUSH_LRU
&& buf_flush_ready_for_flush(block, flush_type)) {
/* VERY IMPORTANT:
@@ -659,13 +669,14 @@ buf_flush_try_page(
buf_pool mutex: this ensures that the latch is acquired
immediately. */
+ mutex_exit(&block->mutex);
mutex_exit(&(buf_pool->mutex));
buf_flush_write_block_low(block);
return(1);
- } else if (flush_type == BUF_FLUSH_SINGLE_PAGE && block
+ } else if (flush_type == BUF_FLUSH_SINGLE_PAGE
&& buf_flush_ready_for_flush(block, flush_type)) {
block->io_fix = BUF_IO_WRITE;
@@ -692,6 +703,7 @@ buf_flush_try_page(
(buf_pool->n_flush[flush_type])++;
+ mutex_exit(&block->mutex);
mutex_exit(&(buf_pool->mutex));
rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);
@@ -709,11 +721,12 @@ buf_flush_try_page(
buf_flush_write_block_low(block);
return(1);
- } else {
- mutex_exit(&(buf_pool->mutex));
-
- return(0);
}
+
+ mutex_exit(&block->mutex);
+ mutex_exit(&(buf_pool->mutex));
+
+ return(0);
}
/***************************************************************
@@ -758,34 +771,48 @@ buf_flush_try_neighbors(
block = buf_page_hash_get(space, i);
ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);
- if (block && flush_type == BUF_FLUSH_LRU && i != offset
- && !block->old) {
+ if (!block) {
+
+ continue;
+
+ } else if (flush_type == BUF_FLUSH_LRU && i != offset
+ && !block->old) {
/* We avoid flushing 'non-old' blocks in an LRU flush,
because the flushed blocks are soon freed */
continue;
- }
+ } else {
- if (block && buf_flush_ready_for_flush(block, flush_type)
- && (i == offset || block->buf_fix_count == 0)) {
- /* We only try to flush those neighbors != offset
- where the buf fix count is zero, as we then know that
- we probably can latch the page without a semaphore
- wait. Semaphore waits are expensive because we must
- flush the doublewrite buffer before we start
- waiting. */
+ mutex_enter(&block->mutex);
- mutex_exit(&(buf_pool->mutex));
+ if (buf_flush_ready_for_flush(block, flush_type)
+ && (i == offset || block->buf_fix_count == 0)) {
+ /* We only try to flush those
+ neighbors != offset where the buf fix count is
+ zero, as we then know that we probably can
+ latch the page without a semaphore wait.
+ Semaphore waits are expensive because we must
+ flush the doublewrite buffer before we start
+ waiting. */
- /* Note: as we release the buf_pool mutex above, in
- buf_flush_try_page we cannot be sure the page is still
- in a flushable state: therefore we check it again
- inside that function. */
+ mutex_exit(&block->mutex);
- count += buf_flush_try_page(space, i, flush_type);
+ mutex_exit(&(buf_pool->mutex));
- mutex_enter(&(buf_pool->mutex));
+ /* Note: as we release the buf_pool mutex
+ above, in buf_flush_try_page we cannot be sure
+ the page is still in a flushable state:
+ therefore we check it again inside that
+ function. */
+
+ count += buf_flush_try_page(space, i,
+ flush_type);
+
+ mutex_enter(&(buf_pool->mutex));
+ } else {
+ mutex_exit(&block->mutex);
+ }
}
}
@@ -879,12 +906,15 @@ buf_flush_batch(
while ((block != NULL) && !found) {
ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+ mutex_enter(&block->mutex);
+
if (buf_flush_ready_for_flush(block, flush_type)) {
found = TRUE;
space = block->space;
offset = block->offset;
+ mutex_exit(&block->mutex);
mutex_exit(&(buf_pool->mutex));
old_page_count = page_count;
@@ -901,10 +931,14 @@ buf_flush_batch(
} else if (flush_type == BUF_FLUSH_LRU) {
+ mutex_exit(&block->mutex);
+
block = UT_LIST_GET_PREV(LRU, block);
} else {
ut_ad(flush_type == BUF_FLUSH_LIST);
+ mutex_exit(&block->mutex);
+
block = UT_LIST_GET_PREV(flush_list, block);
}
}
@@ -986,10 +1020,14 @@ buf_flush_LRU_recommendation(void)
+ BUF_FLUSH_EXTRA_MARGIN)
&& (distance < BUF_LRU_FREE_SEARCH_LEN)) {
+ mutex_enter(&block->mutex);
+
if (buf_flush_ready_for_replace(block)) {
n_replaceable++;
}
+ mutex_exit(&block->mutex);
+
distance++;
block = UT_LIST_GET_PREV(LRU, block);
diff --git a/storage/innobase/buf/buf0lru.c b/storage/innobase/buf/buf0lru.c
index 4ebe94c40ec..377552ece6c 100644
--- a/storage/innobase/buf/buf0lru.c
+++ b/storage/innobase/buf/buf0lru.c
@@ -86,6 +86,11 @@ scan_again:
block = UT_LIST_GET_LAST(buf_pool->LRU);
while (block != NULL) {
+ buf_block_t* prev_block;
+
+ mutex_enter(&block->mutex);
+ prev_block = UT_LIST_GET_PREV(LRU, block);
+
ut_a(block->state == BUF_BLOCK_FILE_PAGE);
if (block->space == id
@@ -112,6 +117,8 @@ scan_again:
if (block->is_hashed) {
page_no = block->offset;
+ mutex_exit(&block->mutex);
+
mutex_exit(&(buf_pool->mutex));
/* Note that the following call will acquire
@@ -138,7 +145,8 @@ scan_again:
buf_LRU_block_free_hashed_page(block);
}
next_page:
- block = UT_LIST_GET_PREV(LRU, block);
+ mutex_exit(&block->mutex);
+ block = prev_block;
}
mutex_exit(&(buf_pool->mutex));
@@ -211,6 +219,9 @@ buf_LRU_search_and_free_block(
while (block != NULL) {
ut_a(block->in_LRU_list);
+
+ mutex_enter(&block->mutex);
+
if (buf_flush_ready_for_replace(block)) {
#ifdef UNIV_DEBUG
@@ -226,6 +237,7 @@ buf_LRU_search_and_free_block(
buf_LRU_block_remove_hashed_page(block);
mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&block->mutex);
/* Remove possible adaptive hash index built on the
page; in the case of AWE the block may not have a
@@ -234,15 +246,21 @@ buf_LRU_search_and_free_block(
if (block->frame) {
btr_search_drop_page_hash_index(block->frame);
}
- mutex_enter(&(buf_pool->mutex));
ut_a(block->buf_fix_count == 0);
+ mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&block->mutex);
+
buf_LRU_block_free_hashed_page(block);
freed = TRUE;
+ mutex_exit(&block->mutex);
break;
}
+
+ mutex_exit(&block->mutex);
+
block = UT_LIST_GET_PREV(LRU, block);
distance++;
@@ -428,8 +446,12 @@ loop:
}
}
+ mutex_enter(&block->mutex);
+
block->state = BUF_BLOCK_READY_FOR_USE;
+ mutex_exit(&block->mutex);
+
mutex_exit(&(buf_pool->mutex));
if (started_monitor) {
@@ -838,6 +860,7 @@ buf_LRU_block_free_non_file_page(
{
#ifdef UNIV_SYNC_DEBUG
ut_ad(mutex_own(&(buf_pool->mutex)));
+ ut_ad(mutex_own(&block->mutex));
#endif /* UNIV_SYNC_DEBUG */
ut_ad(block);
@@ -877,6 +900,7 @@ buf_LRU_block_remove_hashed_page(
{
#ifdef UNIV_SYNC_DEBUG
ut_ad(mutex_own(&(buf_pool->mutex)));
+ ut_ad(mutex_own(&block->mutex));
#endif /* UNIV_SYNC_DEBUG */
ut_ad(block);
@@ -939,6 +963,7 @@ buf_LRU_block_free_hashed_page(
{
#ifdef UNIV_SYNC_DEBUG
ut_ad(mutex_own(&(buf_pool->mutex)));
+ ut_ad(mutex_own(&block->mutex));
#endif /* UNIV_SYNC_DEBUG */
ut_a(block->state == BUF_BLOCK_REMOVE_HASH);
diff --git a/storage/innobase/dict/dict0crea.c b/storage/innobase/dict/dict0crea.c
index 75422b929b1..33e328d1e0b 100644
--- a/storage/innobase/dict/dict0crea.c
+++ b/storage/innobase/dict/dict0crea.c
@@ -700,8 +700,10 @@ dict_truncate_index_tree(
/* out: new root page number, or
FIL_NULL on failure */
dict_table_t* table, /* in: the table the index belongs to */
- rec_t* rec, /* in: record in the clustered index of
- SYS_INDEXES table */
+ btr_pcur_t* pcur, /* in/out: persistent cursor pointing to
+ record in the clustered index of
+ SYS_INDEXES table. The cursor may be
+ repositioned in this call. */
mtr_t* mtr) /* in: mtr having the latch
on the record page. The mtr may be
committed and restarted in this call. */
@@ -710,6 +712,7 @@ dict_truncate_index_tree(
ulint space;
ulint type;
dulint index_id;
+ rec_t* rec;
byte* ptr;
ulint len;
ulint comp;
@@ -720,6 +723,7 @@ dict_truncate_index_tree(
#endif /* UNIV_SYNC_DEBUG */
ut_a(!dict_table_is_comp(dict_sys->sys_indexes));
+ rec = btr_pcur_get_rec(pcur);
ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len);
ut_ad(len == 4);
@@ -785,10 +789,11 @@ dict_truncate_index_tree(
/* We will need to commit the mini-transaction in order to avoid
deadlocks in the btr_create() call, because otherwise we would
be freeing and allocating pages in the same mini-transaction. */
+ btr_pcur_store_position(pcur, mtr);
mtr_commit(mtr);
- /* mtr_commit() will invalidate rec. */
- rec = NULL;
+
mtr_start(mtr);
+ btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);
/* Find the index corresponding to this SYS_INDEXES record. */
for (index = UT_LIST_GET_FIRST(table->indexes);
diff --git a/storage/innobase/fil/fil0fil.c b/storage/innobase/fil/fil0fil.c
index 1aff3739103..883fbd09ee4 100644
--- a/storage/innobase/fil/fil0fil.c
+++ b/storage/innobase/fil/fil0fil.c
@@ -4416,29 +4416,47 @@ fil_flush_file_spaces(
{
fil_system_t* system = fil_system;
fil_space_t* space;
+ ulint* space_ids;
+ ulint n_space_ids;
+ ulint i;
mutex_enter(&(system->mutex));
- space = UT_LIST_GET_FIRST(system->unflushed_spaces);
+ n_space_ids = UT_LIST_GET_LEN(system->unflushed_spaces);
+ if (n_space_ids == 0) {
- while (space) {
- if (space->purpose == purpose && !space->is_being_deleted) {
+ mutex_exit(&system->mutex);
+ return;
+ }
- space->n_pending_flushes++; /* prevent dropping of
- the space while we are
- flushing */
- mutex_exit(&(system->mutex));
+ /* Assemble a list of space ids to flush. Previously, we
+ traversed system->unflushed_spaces and called UT_LIST_GET_NEXT()
+ on a space that was just removed from the list by fil_flush().
+ Thus, the space could be dropped and the memory overwritten. */
+ space_ids = mem_alloc(n_space_ids * sizeof *space_ids);
- fil_flush(space->id);
+ n_space_ids = 0;
- mutex_enter(&(system->mutex));
+ for (space = UT_LIST_GET_FIRST(system->unflushed_spaces);
+ space;
+ space = UT_LIST_GET_NEXT(unflushed_spaces, space)) {
- space->n_pending_flushes--;
+ if (space->purpose == purpose && !space->is_being_deleted) {
+
+ space_ids[n_space_ids++] = space->id;
}
- space = UT_LIST_GET_NEXT(unflushed_spaces, space);
}
- mutex_exit(&(system->mutex));
+ mutex_exit(&system->mutex);
+
+ /* Flush the spaces. It will not hurt to call fil_flush() on
+ a non-existing space id. */
+ for (i = 0; i < n_space_ids; i++) {
+
+ fil_flush(space_ids[i]);
+ }
+
+ mem_free(space_ids);
}
/**********************************************************************
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h
index 664607a26aa..1573de7e818 100644
--- a/storage/innobase/include/btr0btr.h
+++ b/storage/innobase/include/btr0btr.h
@@ -23,7 +23,17 @@ special big record storage structure */
#define BTR_PAGE_MAX_REC_SIZE (UNIV_PAGE_SIZE / 2 - 200)
-/* Latching modes for the search function (in btr0cur.*) */
+/* Maximum depth of a B-tree in InnoDB. Note that this isn't a maximum as
+such; none of the tree operations avoid producing trees bigger than this. It
+is instead a "max depth that other code must work with", useful for e.g.
+fixed-size arrays that must store some information about each level in a
+tree. In other words: if a B-tree with bigger depth than this is
+encountered, it is not acceptable for it to lead to mysterious memory
+corruption, but it is acceptable for the program to die with a clear assert
+failure. */
+#define BTR_MAX_LEVELS 100
+
+/* Latching modes for btr_cur_search_to_nth_level(). */
#define BTR_SEARCH_LEAF RW_S_LATCH
#define BTR_MODIFY_LEAF RW_X_LATCH
#define BTR_NO_LATCHES RW_NO_LATCH
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
index 2fc6bf4bcb9..979c28f64ed 100644
--- a/storage/innobase/include/buf0buf.h
+++ b/storage/innobase/include/buf0buf.h
@@ -461,8 +461,8 @@ Gets the mutex number protecting the page record lock hash chain in the lock
table. */
UNIV_INLINE
mutex_t*
-buf_frame_get_lock_mutex(
-/*=====================*/
+buf_frame_get_mutex(
+/*================*/
/* out: mutex */
byte* ptr); /* in: pointer to within a buffer frame */
/***********************************************************************
@@ -713,7 +713,10 @@ struct buf_block_struct{
ulint magic_n; /* magic number to check */
ulint state; /* state of the control block:
- BUF_BLOCK_NOT_USED, ... */
+ BUF_BLOCK_NOT_USED, ...; changing
+ this is only allowed when a thread
+ has BOTH the buffer pool mutex AND
+ block->mutex locked */
byte* frame; /* pointer to buffer frame which
is of size UNIV_PAGE_SIZE, and
aligned to an address divisible by
@@ -731,8 +734,12 @@ struct buf_block_struct{
ulint offset; /* page number within the space */
ulint lock_hash_val; /* hashed value of the page address
in the record lock hash table */
- mutex_t* lock_mutex; /* mutex protecting the chain in the
- record lock hash table */
+ mutex_t mutex; /* mutex protecting this block:
+ state (also protected by the buffer
+ pool mutex), io_fix, buf_fix_count,
+ and accessed; we introduce this new
+ mutex in InnoDB-5.1 to relieve
+ contention on the buffer pool mutex */
rw_lock_t lock; /* read-write lock of the buffer
frame */
buf_block_t* hash; /* node used in chaining to the page
@@ -788,20 +795,27 @@ struct buf_block_struct{
in heuristic algorithms, because of
the possibility of a wrap-around! */
ulint freed_page_clock;/* the value of freed_page_clock
- buffer pool when this block was
- last time put to the head of the
- LRU list */
+ of the buffer pool when this block was
+ the last time put to the head of the
+ LRU list; a thread is allowed to
+ read this for heuristic purposes
+ without holding any mutex or latch */
ibool old; /* TRUE if the block is in the old
blocks in the LRU list */
ibool accessed; /* TRUE if the page has been accessed
while in the buffer pool: read-ahead
may read in pages which have not been
- accessed yet */
+ accessed yet; this is protected by
+ block->mutex; a thread is allowed to
+ read this for heuristic purposes
+ without holding any mutex or latch */
ulint buf_fix_count; /* count of how manyfold this block
- is currently bufferfixed */
+ is currently bufferfixed; this is
+ protected by block->mutex */
ulint io_fix; /* if a read is pending to the frame,
io_fix is BUF_IO_READ, in the case
- of a write BUF_IO_WRITE, otherwise 0 */
+ of a write BUF_IO_WRITE, otherwise 0;
+ this is protected by block->mutex */
/* 4. Optimistic search field */
dulint modify_clock; /* this clock is incremented every
@@ -959,7 +973,9 @@ struct buf_pool_struct{
number of buffer blocks removed from
the end of the LRU list; NOTE that
this counter may wrap around at 4
- billion! */
+ billion! A thread is allowed to
+ read this for heuristic purposes
+ without holding any mutex or latch */
ulint LRU_flush_ended;/* when an LRU flush ends for a page,
this is incremented by one; this is
set to zero when a buffer block is
diff --git a/storage/innobase/include/buf0buf.ic b/storage/innobase/include/buf0buf.ic
index 015de862705..43895295734 100644
--- a/storage/innobase/include/buf0buf.ic
+++ b/storage/innobase/include/buf0buf.ic
@@ -337,8 +337,8 @@ Gets the mutex number protecting the page record lock hash chain in the lock
table. */
UNIV_INLINE
mutex_t*
-buf_frame_get_lock_mutex(
-/*=====================*/
+buf_frame_get_mutex(
+/*================*/
/* out: mutex */
byte* ptr) /* in: pointer to within a buffer frame */
{
@@ -346,7 +346,7 @@ buf_frame_get_lock_mutex(
block = buf_block_align(ptr);
- return(block->lock_mutex);
+ return(&block->mutex);
}
/*************************************************************************
@@ -519,6 +519,7 @@ buf_block_buf_fix_inc_debug(
ret = rw_lock_s_lock_func_nowait(&(block->debug_latch), file, line);
ut_ad(ret == TRUE);
+ ut_ad(mutex_own(&block->mutex));
#endif
block->buf_fix_count++;
}
@@ -531,6 +532,9 @@ buf_block_buf_fix_inc(
/*==================*/
buf_block_t* block) /* in: block to bufferfix */
{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&block->mutex));
+#endif
block->buf_fix_count++;
}
#endif /* UNIV_SYNC_DEBUG */
@@ -625,23 +629,24 @@ buf_page_release(
ut_ad(block);
- mutex_enter_fast(&(buf_pool->mutex));
-
ut_a(block->state == BUF_BLOCK_FILE_PAGE);
ut_a(block->buf_fix_count > 0);
if (rw_latch == RW_X_LATCH && mtr->modifications) {
-
+ mutex_enter(&buf_pool->mutex);
buf_flush_note_modification(block, mtr);
+ mutex_exit(&buf_pool->mutex);
}
+ mutex_enter(&block->mutex);
+
#ifdef UNIV_SYNC_DEBUG
rw_lock_s_unlock(&(block->debug_latch));
#endif
buf_fix_count = block->buf_fix_count;
block->buf_fix_count = buf_fix_count - 1;
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&block->mutex);
if (rw_latch == RW_S_LATCH) {
rw_lock_s_unlock(&(block->lock));
diff --git a/storage/innobase/include/dict0crea.h b/storage/innobase/include/dict0crea.h
index bd0890dbfa7..f0f30481abe 100644
--- a/storage/innobase/include/dict0crea.h
+++ b/storage/innobase/include/dict0crea.h
@@ -62,8 +62,10 @@ dict_truncate_index_tree(
/* out: new root page number, or
FIL_NULL on failure */
dict_table_t* table, /* in: the table the index belongs to */
- rec_t* rec, /* in: record in the clustered index of
- SYS_INDEXES table */
+ btr_pcur_t* pcur, /* in/out: persistent cursor pointing to
+ record in the clustered index of
+ SYS_INDEXES table. The cursor may be
+ repositioned in this call. */
mtr_t* mtr); /* in: mtr having the latch
on the record page. The mtr may be
committed and restarted in this call. */
diff --git a/storage/innobase/include/ut0lst.h b/storage/innobase/include/ut0lst.h
index 819f2e770ba..d19885d6edc 100644
--- a/storage/innobase/include/ut0lst.h
+++ b/storage/innobase/include/ut0lst.h
@@ -123,27 +123,36 @@ name, NODE1 and NODE2 are pointers to nodes. */
}\
}\
+/* Invalidate the pointers in a list node. */
+#ifdef UNIV_DEBUG
+# define UT_LIST_REMOVE_CLEAR(NAME, N) \
+((N)->NAME.prev = (N)->NAME.next = (void*) -1)
+#else
+# define UT_LIST_REMOVE_CLEAR(NAME, N) while (0)
+#endif
+
/***********************************************************************
Removes a node from a two-way linked list. BASE has to be the base node
(not a pointer to it). N has to be the pointer to the node to be removed
from the list. NAME is the list name. */
-#define UT_LIST_REMOVE(NAME, BASE, N)\
-{\
- ut_ad(N);\
- ut_a((BASE).count > 0);\
- ((BASE).count)--;\
- if (((N)->NAME).next != NULL) {\
- ((((N)->NAME).next)->NAME).prev = ((N)->NAME).prev;\
- } else {\
- (BASE).end = ((N)->NAME).prev;\
- }\
- if (((N)->NAME).prev != NULL) {\
- ((((N)->NAME).prev)->NAME).next = ((N)->NAME).next;\
- } else {\
- (BASE).start = ((N)->NAME).next;\
- }\
-}\
+#define UT_LIST_REMOVE(NAME, BASE, N) \
+do { \
+ ut_ad(N); \
+ ut_a((BASE).count > 0); \
+ ((BASE).count)--; \
+ if (((N)->NAME).next != NULL) { \
+ ((((N)->NAME).next)->NAME).prev = ((N)->NAME).prev; \
+ } else { \
+ (BASE).end = ((N)->NAME).prev; \
+ } \
+ if (((N)->NAME).prev != NULL) { \
+ ((((N)->NAME).prev)->NAME).next = ((N)->NAME).next; \
+ } else { \
+ (BASE).start = ((N)->NAME).next; \
+ } \
+ UT_LIST_REMOVE_CLEAR(NAME, N); \
+} while (0)
/************************************************************************
Gets the next node in a two-way list. NAME is the name of the list
diff --git a/storage/innobase/row/row0mysql.c b/storage/innobase/row/row0mysql.c
index a2895bae71a..41a468e5026 100644
--- a/storage/innobase/row/row0mysql.c
+++ b/storage/innobase/row/row0mysql.c
@@ -2820,12 +2820,10 @@ row_truncate_table_for_mysql(
goto next_rec;
}
- btr_pcur_store_position(&pcur, &mtr);
+ /* This call may commit and restart mtr
+ and reposition pcur. */
+ root_page_no = dict_truncate_index_tree(table, &pcur, &mtr);
- /* This call may commit and restart mtr. */
- root_page_no = dict_truncate_index_tree(table, rec, &mtr);
-
- btr_pcur_restore_position(BTR_MODIFY_LEAF, &pcur, &mtr);
rec = btr_pcur_get_rec(&pcur);
if (root_page_no != FIL_NULL) {