diff options
Diffstat (limited to 'storage')
-rw-r--r-- | storage/innobase/btr/btr0btr.c | 39 | ||||
-rw-r--r-- | storage/innobase/buf/buf0buf.c | 166 | ||||
-rw-r--r-- | storage/innobase/buf/buf0flu.c | 88 | ||||
-rw-r--r-- | storage/innobase/buf/buf0lru.c | 29 | ||||
-rw-r--r-- | storage/innobase/dict/dict0crea.c | 13 | ||||
-rw-r--r-- | storage/innobase/fil/fil0fil.c | 42 | ||||
-rw-r--r-- | storage/innobase/include/btr0btr.h | 12 | ||||
-rw-r--r-- | storage/innobase/include/buf0buf.h | 40 | ||||
-rw-r--r-- | storage/innobase/include/buf0buf.ic | 19 | ||||
-rw-r--r-- | storage/innobase/include/dict0crea.h | 6 | ||||
-rw-r--r-- | storage/innobase/include/ut0lst.h | 41 | ||||
-rw-r--r-- | storage/innobase/row/row0mysql.c | 8 |
12 files changed, 366 insertions, 137 deletions
diff --git a/storage/innobase/btr/btr0btr.c b/storage/innobase/btr/btr0btr.c index 7a8e5626e25..50a349e78d6 100644 --- a/storage/innobase/btr/btr0btr.c +++ b/storage/innobase/btr/btr0btr.c @@ -1949,7 +1949,12 @@ btr_lift_page_up( mtr_t* mtr) /* in: mtr */ { page_t* father_page; + page_t* iter_page; + page_t* pages[BTR_MAX_LEVELS]; ulint page_level; + ulint root_page_no; + ulint ancestors; + ulint i; ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL); ut_ad(btr_page_get_next(page, mtr) == FIL_NULL); @@ -1959,6 +1964,30 @@ btr_lift_page_up( btr_page_get_father_node_ptr(index, page, mtr)); page_level = btr_page_get_level(page, mtr); + root_page_no = dict_index_get_page(index); + + ancestors = 1; + pages[0] = father_page; + + /* Store all ancestor pages so we can reset their levels later on. + We have to do all the searches on the tree now because later on, + after we've replaced the first level, the tree is in an inconsistent + state and can not be searched. */ + iter_page = father_page; + for (;;) { + if (buf_block_get_page_no(buf_block_align(iter_page)) + == root_page_no) { + + break; + } + + ut_a(ancestors < BTR_MAX_LEVELS); + + iter_page = buf_frame_align( + btr_page_get_father_node_ptr(index, iter_page, mtr)); + + pages[ancestors++] = iter_page; + } btr_search_drop_page_hash_index(page); @@ -1970,7 +1999,15 @@ btr_lift_page_up( index, mtr); lock_update_copy_and_discard(father_page, page); - btr_page_set_level(father_page, page_level, mtr); + /* Go upward to root page, decreasing levels by one. */ + for (i = 0; i < ancestors; i++) { + iter_page = pages[i]; + + ut_ad(btr_page_get_level(iter_page, mtr) == (page_level + 1)); + + btr_page_set_level(iter_page, page_level, mtr); + page_level++; + } /* Free the file page */ btr_page_free(index, page, mtr); diff --git a/storage/innobase/buf/buf0buf.c b/storage/innobase/buf/buf0buf.c index 222acc015a6..99e365d6129 100644 --- a/storage/innobase/buf/buf0buf.c +++ b/storage/innobase/buf/buf0buf.c @@ -221,6 +221,9 @@ in the free list to the frames. 5) When we have AWE enabled, we disable adaptive hash indexes. */ +/* Value in microseconds */ +static const int WAIT_FOR_READ = 20000; + buf_pool_t* buf_pool = NULL; /* The buffer buf_pool of the database */ #ifdef UNIV_DEBUG @@ -539,6 +542,8 @@ buf_block_init( block->n_pointers = 0; + mutex_create(&block->mutex, SYNC_BUF_BLOCK); + rw_lock_create(&block->lock, SYNC_LEVEL_VARYING); ut_ad(rw_lock_validate(&(block->lock))); @@ -813,8 +818,15 @@ buf_awe_map_page_to_frame( bck = UT_LIST_GET_LAST(buf_pool->awe_LRU_free_mapped); while (bck) { - if (bck->state == BUF_BLOCK_FILE_PAGE - && (bck->buf_fix_count != 0 || bck->io_fix != 0)) { + ibool skip; + + mutex_enter(&bck->mutex); + + skip = (bck->state == BUF_BLOCK_FILE_PAGE + && (bck->buf_fix_count != 0 || bck->io_fix != 0)); + + if (skip) { + mutex_exit(&bck->mutex); /* We have to skip this */ bck = UT_LIST_GET_PREV(awe_LRU_free_mapped, bck); @@ -848,6 +860,8 @@ buf_awe_map_page_to_frame( buf_pool->n_pages_awe_remapped++; + mutex_exit(&bck->mutex); + return; } } @@ -886,13 +900,22 @@ buf_block_make_young( /*=================*/ buf_block_t* block) /* in: block to make younger */ { +#ifdef UNIV_SYNC_DEBUG + ut_ad(!mutex_own(&(buf_pool->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + + /* Note that we read freed_page_clock's without holding any mutex: + this is allowed since the result is used only in heuristics */ + if (buf_pool->freed_page_clock >= block->freed_page_clock - + 1 + (buf_pool->curr_size / 1024)) { + + 1 + (buf_pool->curr_size / 4)) { + mutex_enter(&buf_pool->mutex); /* There has been freeing activity in the LRU list: best to move to the head of the LRU list */ buf_LRU_make_block_young(block); + mutex_exit(&buf_pool->mutex); } } @@ -927,12 +950,16 @@ buf_block_free( /*===========*/ buf_block_t* block) /* in, own: block to be freed */ { - ut_a(block->state != BUF_BLOCK_FILE_PAGE); - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&block->mutex); + + ut_a(block->state != BUF_BLOCK_FILE_PAGE); + buf_LRU_block_free_non_file_page(block); + mutex_exit(&block->mutex); + mutex_exit(&(buf_pool->mutex)); } @@ -1151,9 +1178,8 @@ buf_page_get_gen( #endif buf_pool->n_page_gets++; loop: - mutex_enter_fast(&(buf_pool->mutex)); - block = NULL; + mutex_enter_fast(&(buf_pool->mutex)); if (guess) { block = buf_block_align(guess); @@ -1191,6 +1217,8 @@ loop: goto loop; } + mutex_enter(&block->mutex); + ut_a(block->state == BUF_BLOCK_FILE_PAGE); must_read = FALSE; @@ -1200,9 +1228,9 @@ loop: must_read = TRUE; if (mode == BUF_GET_IF_IN_POOL) { - /* The page is only being read to buffer */ - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&buf_pool->mutex); + mutex_exit(&block->mutex); return(NULL); } @@ -1226,7 +1254,7 @@ loop: #else buf_block_buf_fix_inc(block); #endif - buf_block_make_young(block); + mutex_exit(&buf_pool->mutex); /* Check if this is the first access to the page */ @@ -1234,10 +1262,13 @@ loop: block->accessed = TRUE; + mutex_exit(&block->mutex); + + buf_block_make_young(block); + #ifdef UNIV_DEBUG_FILE_ACCESSES ut_a(block->file_page_was_freed == FALSE); #endif - mutex_exit(&(buf_pool->mutex)); #ifdef UNIV_DEBUG buf_dbg_counter++; @@ -1262,13 +1293,14 @@ loop: } if (!success) { - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&block->mutex); block->buf_fix_count--; + + mutex_exit(&block->mutex); #ifdef UNIV_SYNC_DEBUG rw_lock_s_unlock(&(block->debug_latch)); #endif - mutex_exit(&(buf_pool->mutex)); return(NULL); } @@ -1279,18 +1311,16 @@ loop: completes */ for (;;) { - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&block->mutex); if (block->io_fix == BUF_IO_READ) { - mutex_exit(&(buf_pool->mutex)); - - /* Sleep 20 milliseconds */ + mutex_exit(&block->mutex); - os_thread_sleep(20000); + os_thread_sleep(WAIT_FOR_READ); } else { - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&block->mutex); break; } @@ -1349,14 +1379,14 @@ buf_page_optimistic_get_func( ut_ad(mtr && block); ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH)); - mutex_enter(&(buf_pool->mutex)); - /* If AWE is used, block may have a different frame now, e.g., NULL */ + mutex_enter(&block->mutex); + if (UNIV_UNLIKELY(block->state != BUF_BLOCK_FILE_PAGE) || UNIV_UNLIKELY(block->frame != guess)) { -exit_func: - mutex_exit(&(buf_pool->mutex)); + + mutex_exit(&block->mutex); return(FALSE); } @@ -1366,15 +1396,14 @@ exit_func: #else buf_block_buf_fix_inc(block); #endif - buf_block_make_young(block); - - /* Check if this is the first access to the page */ - accessed = block->accessed; - block->accessed = TRUE; - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&block->mutex); + + buf_block_make_young(block); + + /* Check if this is the first access to the page */ ut_ad(!ibuf_inside() || ibuf_page(block->space, block->offset)); @@ -1389,13 +1418,16 @@ exit_func: } if (UNIV_UNLIKELY(!success)) { - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&block->mutex); block->buf_fix_count--; + + mutex_exit(&block->mutex); + #ifdef UNIV_SYNC_DEBUG rw_lock_s_unlock(&(block->debug_latch)); #endif - goto exit_func; + return(FALSE); } if (UNIV_UNLIKELY(!UT_DULINT_EQ(modify_clock, block->modify_clock))) { @@ -1408,13 +1440,16 @@ exit_func: rw_lock_x_unlock(&(block->lock)); } - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&block->mutex); block->buf_fix_count--; + + mutex_exit(&block->mutex); + #ifdef UNIV_SYNC_DEBUG rw_lock_s_unlock(&(block->debug_latch)); #endif - goto exit_func; + return(FALSE); } mtr_memo_push(mtr, block, fix_type); @@ -1471,10 +1506,10 @@ buf_page_get_known_nowait( ut_ad(mtr); ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH)); - mutex_enter(&(buf_pool->mutex)); - block = buf_block_align(guess); + mutex_enter(&block->mutex); + if (block->state == BUF_BLOCK_REMOVE_HASH) { /* Another thread is just freeing the block from the LRU list of the buffer pool: do not try to access this page; this @@ -1483,7 +1518,7 @@ buf_page_get_known_nowait( we have already removed it from the page address hash table of the buffer pool. */ - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&block->mutex); return(FALSE); } @@ -1495,12 +1530,12 @@ buf_page_get_known_nowait( #else buf_block_buf_fix_inc(block); #endif + mutex_exit(&block->mutex); + if (mode == BUF_MAKE_YOUNG) { buf_block_make_young(block); } - mutex_exit(&(buf_pool->mutex)); - ut_ad(!ibuf_inside() || (mode == BUF_KEEP_OLD)); if (rw_latch == RW_S_LATCH) { @@ -1514,13 +1549,15 @@ buf_page_get_known_nowait( } if (!success) { - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&block->mutex); block->buf_fix_count--; + + mutex_exit(&block->mutex); + #ifdef UNIV_SYNC_DEBUG rw_lock_s_unlock(&(block->debug_latch)); #endif - mutex_exit(&(buf_pool->mutex)); return(FALSE); } @@ -1568,7 +1605,6 @@ buf_page_init_for_backup_restore( block->offset = offset; block->lock_hash_val = 0; - block->lock_mutex = NULL; block->freed_page_clock = 0; @@ -1601,6 +1637,7 @@ buf_page_init( { #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(mutex_own(&(block->mutex))); #endif /* UNIV_SYNC_DEBUG */ ut_a(block->state != BUF_BLOCK_FILE_PAGE); @@ -1615,7 +1652,6 @@ buf_page_init( block->index = NULL; block->lock_hash_val = lock_rec_hash(space, offset); - block->lock_mutex = NULL; /* Insert into the hash table of file pages */ @@ -1709,6 +1745,7 @@ buf_page_init_for_read( ut_a(block); mutex_enter(&(buf_pool->mutex)); + mutex_enter(&block->mutex); if (fil_tablespace_deleted_or_being_deleted_in_mem( space, tablespace_version)) { @@ -1722,7 +1759,9 @@ buf_page_init_for_read( deleted or is being deleted, or the page is already in buf_pool, return */ + mutex_exit(&block->mutex); mutex_exit(&(buf_pool->mutex)); + buf_block_free(block); if (mode == BUF_READ_IBUF_PAGES_ONLY) { @@ -1742,6 +1781,7 @@ buf_page_init_for_read( buf_LRU_add_block(block, TRUE); /* TRUE == to old blocks */ block->io_fix = BUF_IO_READ; + buf_pool->n_pend_reads++; /* We set a pass-type x-lock on the frame because then the same @@ -1753,6 +1793,7 @@ buf_page_init_for_read( rw_lock_x_lock_gen(&(block->lock), BUF_IO_READ); + mutex_exit(&block->mutex); mutex_exit(&(buf_pool->mutex)); if (mode == BUF_READ_IBUF_PAGES_ONLY) { @@ -1817,6 +1858,8 @@ buf_page_create( block = free_block; + mutex_enter(&block->mutex); + buf_page_init(space, offset, block); /* The block must be put to the LRU list */ @@ -1827,13 +1870,15 @@ buf_page_create( #else buf_block_buf_fix_inc(block); #endif + buf_pool->n_pages_created++; + + mutex_exit(&(buf_pool->mutex)); + mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX); block->accessed = TRUE; - buf_pool->n_pages_created++; - - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&block->mutex); /* Delete possible entries for the page from the insert buffer: such can exist if the page belonged to an index which was dropped */ @@ -1885,6 +1930,12 @@ buf_page_io_complete( ut_a(block->state == BUF_BLOCK_FILE_PAGE); + /* We do not need protect block->io_fix here by block->mutex to read + it because this is the only function where we can change the value + from BUF_IO_READ or BUF_IO_WRITE to some other value, and our code + ensures that this is the only thread that handles the i/o for this + block. */ + io_type = block->io_fix; if (io_type == BUF_IO_READ) { @@ -1986,11 +2037,12 @@ buf_page_io_complete( } } + mutex_enter(&(buf_pool->mutex)); + mutex_enter(&block->mutex); + #ifdef UNIV_IBUF_DEBUG ut_a(ibuf_count_get(block->space, block->offset) == 0); #endif - mutex_enter(&(buf_pool->mutex)); - /* Because this thread which does the unlocking is not the same that did the locking, we use a pass value != 0 in unlock, which simply removes the newest lock debug record, without checking the thread @@ -2033,6 +2085,7 @@ buf_page_io_complete( #endif /* UNIV_DEBUG */ } + mutex_exit(&block->mutex); mutex_exit(&(buf_pool->mutex)); #ifdef UNIV_DEBUG @@ -2095,6 +2148,8 @@ buf_validate(void) block = buf_pool_get_nth_block(buf_pool, i); + mutex_enter(&block->mutex); + if (block->state == BUF_BLOCK_FILE_PAGE) { ut_a(buf_page_hash_get(block->space, @@ -2139,6 +2194,8 @@ buf_validate(void) } else if (block->state == BUF_BLOCK_NOT_USED) { n_free++; } + + mutex_exit(&block->mutex); } if (n_lru + n_free > buf_pool->curr_size) { @@ -2286,9 +2343,14 @@ buf_get_latched_pages_number(void) block = buf_pool_get_nth_block(buf_pool, i); - if (((block->buf_fix_count != 0) || (block->io_fix != 0)) - && block->magic_n == BUF_BLOCK_MAGIC_N) { - fixed_pages_number++; + if (block->magic_n == BUF_BLOCK_MAGIC_N) { + mutex_enter(&block->mutex); + + if (block->buf_fix_count != 0 || block->io_fix != 0) { + fixed_pages_number++; + } + + mutex_exit(&block->mutex); } } @@ -2458,6 +2520,8 @@ buf_all_freed(void) block = buf_pool_get_nth_block(buf_pool, i); + mutex_enter(&block->mutex); + if (block->state == BUF_BLOCK_FILE_PAGE) { if (!buf_flush_ready_for_replace(block)) { @@ -2469,6 +2533,8 @@ buf_all_freed(void) ut_error; } } + + mutex_exit(&block->mutex); } mutex_exit(&(buf_pool->mutex)); diff --git a/storage/innobase/buf/buf0flu.c b/storage/innobase/buf/buf0flu.c index 8ef8c3b9358..49b81196a64 100644 --- a/storage/innobase/buf/buf0flu.c +++ b/storage/innobase/buf/buf0flu.c @@ -113,6 +113,7 @@ buf_flush_ready_for_replace( { #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(mutex_own(&block->mutex)); #endif /* UNIV_SYNC_DEBUG */ if (block->state != BUF_BLOCK_FILE_PAGE) { ut_print_timestamp(stderr); @@ -148,6 +149,7 @@ buf_flush_ready_for_flush( { #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(mutex_own(&(block->mutex))); #endif /* UNIV_SYNC_DEBUG */ ut_a(block->state == BUF_BLOCK_FILE_PAGE); @@ -559,8 +561,15 @@ buf_flush_try_page( ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE); + if (!block) { + mutex_exit(&(buf_pool->mutex)); + return(0); + } + + mutex_enter(&block->mutex); + if (flush_type == BUF_FLUSH_LIST - && block && buf_flush_ready_for_flush(block, flush_type)) { + && buf_flush_ready_for_flush(block, flush_type)) { block->io_fix = BUF_IO_WRITE; @@ -598,6 +607,7 @@ buf_flush_try_page( locked = TRUE; } + mutex_exit(&block->mutex); mutex_exit(&(buf_pool->mutex)); if (!locked) { @@ -618,7 +628,7 @@ buf_flush_try_page( return(1); - } else if (flush_type == BUF_FLUSH_LRU && block + } else if (flush_type == BUF_FLUSH_LRU && buf_flush_ready_for_flush(block, flush_type)) { /* VERY IMPORTANT: @@ -659,13 +669,14 @@ buf_flush_try_page( buf_pool mutex: this ensures that the latch is acquired immediately. */ + mutex_exit(&block->mutex); mutex_exit(&(buf_pool->mutex)); buf_flush_write_block_low(block); return(1); - } else if (flush_type == BUF_FLUSH_SINGLE_PAGE && block + } else if (flush_type == BUF_FLUSH_SINGLE_PAGE && buf_flush_ready_for_flush(block, flush_type)) { block->io_fix = BUF_IO_WRITE; @@ -692,6 +703,7 @@ buf_flush_try_page( (buf_pool->n_flush[flush_type])++; + mutex_exit(&block->mutex); mutex_exit(&(buf_pool->mutex)); rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE); @@ -709,11 +721,12 @@ buf_flush_try_page( buf_flush_write_block_low(block); return(1); - } else { - mutex_exit(&(buf_pool->mutex)); - - return(0); } + + mutex_exit(&block->mutex); + mutex_exit(&(buf_pool->mutex)); + + return(0); } /*************************************************************** @@ -758,34 +771,48 @@ buf_flush_try_neighbors( block = buf_page_hash_get(space, i); ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE); - if (block && flush_type == BUF_FLUSH_LRU && i != offset - && !block->old) { + if (!block) { + + continue; + + } else if (flush_type == BUF_FLUSH_LRU && i != offset + && !block->old) { /* We avoid flushing 'non-old' blocks in an LRU flush, because the flushed blocks are soon freed */ continue; - } + } else { - if (block && buf_flush_ready_for_flush(block, flush_type) - && (i == offset || block->buf_fix_count == 0)) { - /* We only try to flush those neighbors != offset - where the buf fix count is zero, as we then know that - we probably can latch the page without a semaphore - wait. Semaphore waits are expensive because we must - flush the doublewrite buffer before we start - waiting. */ + mutex_enter(&block->mutex); - mutex_exit(&(buf_pool->mutex)); + if (buf_flush_ready_for_flush(block, flush_type) + && (i == offset || block->buf_fix_count == 0)) { + /* We only try to flush those + neighbors != offset where the buf fix count is + zero, as we then know that we probably can + latch the page without a semaphore wait. + Semaphore waits are expensive because we must + flush the doublewrite buffer before we start + waiting. */ - /* Note: as we release the buf_pool mutex above, in - buf_flush_try_page we cannot be sure the page is still - in a flushable state: therefore we check it again - inside that function. */ + mutex_exit(&block->mutex); - count += buf_flush_try_page(space, i, flush_type); + mutex_exit(&(buf_pool->mutex)); - mutex_enter(&(buf_pool->mutex)); + /* Note: as we release the buf_pool mutex + above, in buf_flush_try_page we cannot be sure + the page is still in a flushable state: + therefore we check it again inside that + function. */ + + count += buf_flush_try_page(space, i, + flush_type); + + mutex_enter(&(buf_pool->mutex)); + } else { + mutex_exit(&block->mutex); + } } } @@ -879,12 +906,15 @@ buf_flush_batch( while ((block != NULL) && !found) { ut_a(block->state == BUF_BLOCK_FILE_PAGE); + mutex_enter(&block->mutex); + if (buf_flush_ready_for_flush(block, flush_type)) { found = TRUE; space = block->space; offset = block->offset; + mutex_exit(&block->mutex); mutex_exit(&(buf_pool->mutex)); old_page_count = page_count; @@ -901,10 +931,14 @@ buf_flush_batch( } else if (flush_type == BUF_FLUSH_LRU) { + mutex_exit(&block->mutex); + block = UT_LIST_GET_PREV(LRU, block); } else { ut_ad(flush_type == BUF_FLUSH_LIST); + mutex_exit(&block->mutex); + block = UT_LIST_GET_PREV(flush_list, block); } } @@ -986,10 +1020,14 @@ buf_flush_LRU_recommendation(void) + BUF_FLUSH_EXTRA_MARGIN) && (distance < BUF_LRU_FREE_SEARCH_LEN)) { + mutex_enter(&block->mutex); + if (buf_flush_ready_for_replace(block)) { n_replaceable++; } + mutex_exit(&block->mutex); + distance++; block = UT_LIST_GET_PREV(LRU, block); diff --git a/storage/innobase/buf/buf0lru.c b/storage/innobase/buf/buf0lru.c index 4ebe94c40ec..377552ece6c 100644 --- a/storage/innobase/buf/buf0lru.c +++ b/storage/innobase/buf/buf0lru.c @@ -86,6 +86,11 @@ scan_again: block = UT_LIST_GET_LAST(buf_pool->LRU); while (block != NULL) { + buf_block_t* prev_block; + + mutex_enter(&block->mutex); + prev_block = UT_LIST_GET_PREV(LRU, block); + ut_a(block->state == BUF_BLOCK_FILE_PAGE); if (block->space == id @@ -112,6 +117,8 @@ scan_again: if (block->is_hashed) { page_no = block->offset; + mutex_exit(&block->mutex); + mutex_exit(&(buf_pool->mutex)); /* Note that the following call will acquire @@ -138,7 +145,8 @@ scan_again: buf_LRU_block_free_hashed_page(block); } next_page: - block = UT_LIST_GET_PREV(LRU, block); + mutex_exit(&block->mutex); + block = prev_block; } mutex_exit(&(buf_pool->mutex)); @@ -211,6 +219,9 @@ buf_LRU_search_and_free_block( while (block != NULL) { ut_a(block->in_LRU_list); + + mutex_enter(&block->mutex); + if (buf_flush_ready_for_replace(block)) { #ifdef UNIV_DEBUG @@ -226,6 +237,7 @@ buf_LRU_search_and_free_block( buf_LRU_block_remove_hashed_page(block); mutex_exit(&(buf_pool->mutex)); + mutex_exit(&block->mutex); /* Remove possible adaptive hash index built on the page; in the case of AWE the block may not have a @@ -234,15 +246,21 @@ buf_LRU_search_and_free_block( if (block->frame) { btr_search_drop_page_hash_index(block->frame); } - mutex_enter(&(buf_pool->mutex)); ut_a(block->buf_fix_count == 0); + mutex_enter(&(buf_pool->mutex)); + mutex_enter(&block->mutex); + buf_LRU_block_free_hashed_page(block); freed = TRUE; + mutex_exit(&block->mutex); break; } + + mutex_exit(&block->mutex); + block = UT_LIST_GET_PREV(LRU, block); distance++; @@ -428,8 +446,12 @@ loop: } } + mutex_enter(&block->mutex); + block->state = BUF_BLOCK_READY_FOR_USE; + mutex_exit(&block->mutex); + mutex_exit(&(buf_pool->mutex)); if (started_monitor) { @@ -838,6 +860,7 @@ buf_LRU_block_free_non_file_page( { #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(mutex_own(&block->mutex)); #endif /* UNIV_SYNC_DEBUG */ ut_ad(block); @@ -877,6 +900,7 @@ buf_LRU_block_remove_hashed_page( { #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(mutex_own(&block->mutex)); #endif /* UNIV_SYNC_DEBUG */ ut_ad(block); @@ -939,6 +963,7 @@ buf_LRU_block_free_hashed_page( { #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(mutex_own(&block->mutex)); #endif /* UNIV_SYNC_DEBUG */ ut_a(block->state == BUF_BLOCK_REMOVE_HASH); diff --git a/storage/innobase/dict/dict0crea.c b/storage/innobase/dict/dict0crea.c index 75422b929b1..33e328d1e0b 100644 --- a/storage/innobase/dict/dict0crea.c +++ b/storage/innobase/dict/dict0crea.c @@ -700,8 +700,10 @@ dict_truncate_index_tree( /* out: new root page number, or FIL_NULL on failure */ dict_table_t* table, /* in: the table the index belongs to */ - rec_t* rec, /* in: record in the clustered index of - SYS_INDEXES table */ + btr_pcur_t* pcur, /* in/out: persistent cursor pointing to + record in the clustered index of + SYS_INDEXES table. The cursor may be + repositioned in this call. */ mtr_t* mtr) /* in: mtr having the latch on the record page. The mtr may be committed and restarted in this call. */ @@ -710,6 +712,7 @@ dict_truncate_index_tree( ulint space; ulint type; dulint index_id; + rec_t* rec; byte* ptr; ulint len; ulint comp; @@ -720,6 +723,7 @@ dict_truncate_index_tree( #endif /* UNIV_SYNC_DEBUG */ ut_a(!dict_table_is_comp(dict_sys->sys_indexes)); + rec = btr_pcur_get_rec(pcur); ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len); ut_ad(len == 4); @@ -785,10 +789,11 @@ dict_truncate_index_tree( /* We will need to commit the mini-transaction in order to avoid deadlocks in the btr_create() call, because otherwise we would be freeing and allocating pages in the same mini-transaction. */ + btr_pcur_store_position(pcur, mtr); mtr_commit(mtr); - /* mtr_commit() will invalidate rec. */ - rec = NULL; + mtr_start(mtr); + btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr); /* Find the index corresponding to this SYS_INDEXES record. */ for (index = UT_LIST_GET_FIRST(table->indexes); diff --git a/storage/innobase/fil/fil0fil.c b/storage/innobase/fil/fil0fil.c index 1aff3739103..883fbd09ee4 100644 --- a/storage/innobase/fil/fil0fil.c +++ b/storage/innobase/fil/fil0fil.c @@ -4416,29 +4416,47 @@ fil_flush_file_spaces( { fil_system_t* system = fil_system; fil_space_t* space; + ulint* space_ids; + ulint n_space_ids; + ulint i; mutex_enter(&(system->mutex)); - space = UT_LIST_GET_FIRST(system->unflushed_spaces); + n_space_ids = UT_LIST_GET_LEN(system->unflushed_spaces); + if (n_space_ids == 0) { - while (space) { - if (space->purpose == purpose && !space->is_being_deleted) { + mutex_exit(&system->mutex); + return; + } - space->n_pending_flushes++; /* prevent dropping of - the space while we are - flushing */ - mutex_exit(&(system->mutex)); + /* Assemble a list of space ids to flush. Previously, we + traversed system->unflushed_spaces and called UT_LIST_GET_NEXT() + on a space that was just removed from the list by fil_flush(). + Thus, the space could be dropped and the memory overwritten. */ + space_ids = mem_alloc(n_space_ids * sizeof *space_ids); - fil_flush(space->id); + n_space_ids = 0; - mutex_enter(&(system->mutex)); + for (space = UT_LIST_GET_FIRST(system->unflushed_spaces); + space; + space = UT_LIST_GET_NEXT(unflushed_spaces, space)) { - space->n_pending_flushes--; + if (space->purpose == purpose && !space->is_being_deleted) { + + space_ids[n_space_ids++] = space->id; } - space = UT_LIST_GET_NEXT(unflushed_spaces, space); } - mutex_exit(&(system->mutex)); + mutex_exit(&system->mutex); + + /* Flush the spaces. It will not hurt to call fil_flush() on + a non-existing space id. */ + for (i = 0; i < n_space_ids; i++) { + + fil_flush(space_ids[i]); + } + + mem_free(space_ids); } /********************************************************************** diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h index 664607a26aa..1573de7e818 100644 --- a/storage/innobase/include/btr0btr.h +++ b/storage/innobase/include/btr0btr.h @@ -23,7 +23,17 @@ special big record storage structure */ #define BTR_PAGE_MAX_REC_SIZE (UNIV_PAGE_SIZE / 2 - 200) -/* Latching modes for the search function (in btr0cur.*) */ +/* Maximum depth of a B-tree in InnoDB. Note that this isn't a maximum as +such; none of the tree operations avoid producing trees bigger than this. It +is instead a "max depth that other code must work with", useful for e.g. +fixed-size arrays that must store some information about each level in a +tree. In other words: if a B-tree with bigger depth than this is +encountered, it is not acceptable for it to lead to mysterious memory +corruption, but it is acceptable for the program to die with a clear assert +failure. */ +#define BTR_MAX_LEVELS 100 + +/* Latching modes for btr_cur_search_to_nth_level(). */ #define BTR_SEARCH_LEAF RW_S_LATCH #define BTR_MODIFY_LEAF RW_X_LATCH #define BTR_NO_LATCHES RW_NO_LATCH diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 2fc6bf4bcb9..979c28f64ed 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -461,8 +461,8 @@ Gets the mutex number protecting the page record lock hash chain in the lock table. */ UNIV_INLINE mutex_t* -buf_frame_get_lock_mutex( -/*=====================*/ +buf_frame_get_mutex( +/*================*/ /* out: mutex */ byte* ptr); /* in: pointer to within a buffer frame */ /*********************************************************************** @@ -713,7 +713,10 @@ struct buf_block_struct{ ulint magic_n; /* magic number to check */ ulint state; /* state of the control block: - BUF_BLOCK_NOT_USED, ... */ + BUF_BLOCK_NOT_USED, ...; changing + this is only allowed when a thread + has BOTH the buffer pool mutex AND + block->mutex locked */ byte* frame; /* pointer to buffer frame which is of size UNIV_PAGE_SIZE, and aligned to an address divisible by @@ -731,8 +734,12 @@ struct buf_block_struct{ ulint offset; /* page number within the space */ ulint lock_hash_val; /* hashed value of the page address in the record lock hash table */ - mutex_t* lock_mutex; /* mutex protecting the chain in the - record lock hash table */ + mutex_t mutex; /* mutex protecting this block: + state (also protected by the buffer + pool mutex), io_fix, buf_fix_count, + and accessed; we introduce this new + mutex in InnoDB-5.1 to relieve + contention on the buffer pool mutex */ rw_lock_t lock; /* read-write lock of the buffer frame */ buf_block_t* hash; /* node used in chaining to the page @@ -788,20 +795,27 @@ struct buf_block_struct{ in heuristic algorithms, because of the possibility of a wrap-around! */ ulint freed_page_clock;/* the value of freed_page_clock - buffer pool when this block was - last time put to the head of the - LRU list */ + of the buffer pool when this block was + the last time put to the head of the + LRU list; a thread is allowed to + read this for heuristic purposes + without holding any mutex or latch */ ibool old; /* TRUE if the block is in the old blocks in the LRU list */ ibool accessed; /* TRUE if the page has been accessed while in the buffer pool: read-ahead may read in pages which have not been - accessed yet */ + accessed yet; this is protected by + block->mutex; a thread is allowed to + read this for heuristic purposes + without holding any mutex or latch */ ulint buf_fix_count; /* count of how manyfold this block - is currently bufferfixed */ + is currently bufferfixed; this is + protected by block->mutex */ ulint io_fix; /* if a read is pending to the frame, io_fix is BUF_IO_READ, in the case - of a write BUF_IO_WRITE, otherwise 0 */ + of a write BUF_IO_WRITE, otherwise 0; + this is protected by block->mutex */ /* 4. Optimistic search field */ dulint modify_clock; /* this clock is incremented every @@ -959,7 +973,9 @@ struct buf_pool_struct{ number of buffer blocks removed from the end of the LRU list; NOTE that this counter may wrap around at 4 - billion! */ + billion! A thread is allowed to + read this for heuristic purposes + without holding any mutex or latch */ ulint LRU_flush_ended;/* when an LRU flush ends for a page, this is incremented by one; this is set to zero when a buffer block is diff --git a/storage/innobase/include/buf0buf.ic b/storage/innobase/include/buf0buf.ic index 015de862705..43895295734 100644 --- a/storage/innobase/include/buf0buf.ic +++ b/storage/innobase/include/buf0buf.ic @@ -337,8 +337,8 @@ Gets the mutex number protecting the page record lock hash chain in the lock table. */ UNIV_INLINE mutex_t* -buf_frame_get_lock_mutex( -/*=====================*/ +buf_frame_get_mutex( +/*================*/ /* out: mutex */ byte* ptr) /* in: pointer to within a buffer frame */ { @@ -346,7 +346,7 @@ buf_frame_get_lock_mutex( block = buf_block_align(ptr); - return(block->lock_mutex); + return(&block->mutex); } /************************************************************************* @@ -519,6 +519,7 @@ buf_block_buf_fix_inc_debug( ret = rw_lock_s_lock_func_nowait(&(block->debug_latch), file, line); ut_ad(ret == TRUE); + ut_ad(mutex_own(&block->mutex)); #endif block->buf_fix_count++; } @@ -531,6 +532,9 @@ buf_block_buf_fix_inc( /*==================*/ buf_block_t* block) /* in: block to bufferfix */ { +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&block->mutex)); +#endif block->buf_fix_count++; } #endif /* UNIV_SYNC_DEBUG */ @@ -625,23 +629,24 @@ buf_page_release( ut_ad(block); - mutex_enter_fast(&(buf_pool->mutex)); - ut_a(block->state == BUF_BLOCK_FILE_PAGE); ut_a(block->buf_fix_count > 0); if (rw_latch == RW_X_LATCH && mtr->modifications) { - + mutex_enter(&buf_pool->mutex); buf_flush_note_modification(block, mtr); + mutex_exit(&buf_pool->mutex); } + mutex_enter(&block->mutex); + #ifdef UNIV_SYNC_DEBUG rw_lock_s_unlock(&(block->debug_latch)); #endif buf_fix_count = block->buf_fix_count; block->buf_fix_count = buf_fix_count - 1; - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&block->mutex); if (rw_latch == RW_S_LATCH) { rw_lock_s_unlock(&(block->lock)); diff --git a/storage/innobase/include/dict0crea.h b/storage/innobase/include/dict0crea.h index bd0890dbfa7..f0f30481abe 100644 --- a/storage/innobase/include/dict0crea.h +++ b/storage/innobase/include/dict0crea.h @@ -62,8 +62,10 @@ dict_truncate_index_tree( /* out: new root page number, or FIL_NULL on failure */ dict_table_t* table, /* in: the table the index belongs to */ - rec_t* rec, /* in: record in the clustered index of - SYS_INDEXES table */ + btr_pcur_t* pcur, /* in/out: persistent cursor pointing to + record in the clustered index of + SYS_INDEXES table. The cursor may be + repositioned in this call. */ mtr_t* mtr); /* in: mtr having the latch on the record page. The mtr may be committed and restarted in this call. */ diff --git a/storage/innobase/include/ut0lst.h b/storage/innobase/include/ut0lst.h index 819f2e770ba..d19885d6edc 100644 --- a/storage/innobase/include/ut0lst.h +++ b/storage/innobase/include/ut0lst.h @@ -123,27 +123,36 @@ name, NODE1 and NODE2 are pointers to nodes. */ }\ }\ +/* Invalidate the pointers in a list node. */ +#ifdef UNIV_DEBUG +# define UT_LIST_REMOVE_CLEAR(NAME, N) \ +((N)->NAME.prev = (N)->NAME.next = (void*) -1) +#else +# define UT_LIST_REMOVE_CLEAR(NAME, N) while (0) +#endif + /*********************************************************************** Removes a node from a two-way linked list. BASE has to be the base node (not a pointer to it). N has to be the pointer to the node to be removed from the list. NAME is the list name. */ -#define UT_LIST_REMOVE(NAME, BASE, N)\ -{\ - ut_ad(N);\ - ut_a((BASE).count > 0);\ - ((BASE).count)--;\ - if (((N)->NAME).next != NULL) {\ - ((((N)->NAME).next)->NAME).prev = ((N)->NAME).prev;\ - } else {\ - (BASE).end = ((N)->NAME).prev;\ - }\ - if (((N)->NAME).prev != NULL) {\ - ((((N)->NAME).prev)->NAME).next = ((N)->NAME).next;\ - } else {\ - (BASE).start = ((N)->NAME).next;\ - }\ -}\ +#define UT_LIST_REMOVE(NAME, BASE, N) \ +do { \ + ut_ad(N); \ + ut_a((BASE).count > 0); \ + ((BASE).count)--; \ + if (((N)->NAME).next != NULL) { \ + ((((N)->NAME).next)->NAME).prev = ((N)->NAME).prev; \ + } else { \ + (BASE).end = ((N)->NAME).prev; \ + } \ + if (((N)->NAME).prev != NULL) { \ + ((((N)->NAME).prev)->NAME).next = ((N)->NAME).next; \ + } else { \ + (BASE).start = ((N)->NAME).next; \ + } \ + UT_LIST_REMOVE_CLEAR(NAME, N); \ +} while (0) /************************************************************************ Gets the next node in a two-way list. NAME is the name of the list diff --git a/storage/innobase/row/row0mysql.c b/storage/innobase/row/row0mysql.c index a2895bae71a..41a468e5026 100644 --- a/storage/innobase/row/row0mysql.c +++ b/storage/innobase/row/row0mysql.c @@ -2820,12 +2820,10 @@ row_truncate_table_for_mysql( goto next_rec; } - btr_pcur_store_position(&pcur, &mtr); + /* This call may commit and restart mtr + and reposition pcur. */ + root_page_no = dict_truncate_index_tree(table, &pcur, &mtr); - /* This call may commit and restart mtr. */ - root_page_no = dict_truncate_index_tree(table, rec, &mtr); - - btr_pcur_restore_position(BTR_MODIFY_LEAF, &pcur, &mtr); rec = btr_pcur_get_rec(&pcur); if (root_page_no != FIL_NULL) { |