author | tsmith@quadxeon.mysql.com | 2006-11-09 05:02:37 +0100
committer | tsmith@quadxeon.mysql.com | 2006-11-09 05:02:37 +0100
commit | f1e0cf9d285269571e96ac4fc058e7494a69c7c7 (patch)
tree | 03d4d71d67b4aaee305c7431903f593ad5229733 /innobase
parent | 41117b1226bcb02536bf963d9a18a6140d5ffe1a (diff)
download | mariadb-git-f1e0cf9d285269571e96ac4fc058e7494a69c7c7.tar.gz
This ChangeSet must be null-merged to 5.1. Applied innodb-5.0 snapshots ss982, ss998, and ss1003.
Fixes:
- Bug #15815: Very poor performance with multiple queries running concurrently
- Bug #22868: 'Thread thrashing' with > 50 concurrent conns under an upd-intensive workload
- Bug #23769: Debug assertion failure with innodb_locks_unsafe_for_binlog
- Bug #24089: Race condition in fil_flush_file_spaces()
Diffstat (limited to 'innobase')
-rw-r--r-- | innobase/buf/buf0buf.c | 181
-rw-r--r-- | innobase/buf/buf0flu.c | 92
-rw-r--r-- | innobase/buf/buf0lru.c | 25
-rw-r--r-- | innobase/dict/dict0crea.c | 13
-rw-r--r-- | innobase/fil/fil0fil.c | 44
-rw-r--r-- | innobase/include/buf0buf.h | 40
-rw-r--r-- | innobase/include/buf0buf.ic | 19
-rw-r--r-- | innobase/include/dict0crea.h | 6
-rw-r--r-- | innobase/include/sync0arr.h | 13
-rw-r--r-- | innobase/include/sync0rw.h | 1
-rw-r--r-- | innobase/include/sync0rw.ic | 6
-rw-r--r-- | innobase/include/sync0sync.h | 1
-rw-r--r-- | innobase/os/os0sync.c | 55
-rw-r--r-- | innobase/row/row0mysql.c | 8
-rw-r--r-- | innobase/row/row0sel.c | 6
-rw-r--r-- | innobase/srv/srv0start.c | 5
-rw-r--r-- | innobase/sync/sync0arr.c | 119
-rw-r--r-- | innobase/sync/sync0rw.c | 2
-rw-r--r-- | innobase/sync/sync0sync.c | 8
19 files changed, 419 insertions(+), 225 deletions(-)
diff --git a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c
index db09a931c29..f24f1744363 100644
--- a/innobase/buf/buf0buf.c
+++ b/innobase/buf/buf0buf.c
@@ -221,6 +221,9 @@
 in the free list to the frames.
 
 5) When we have AWE enabled, we disable adaptive hash indexes. */
 
+/* Value in microseconds */
+static const int WAIT_FOR_READ = 20000;
+
 buf_pool_t*	buf_pool = NULL; /* The buffer buf_pool of the database */
 
 #ifdef UNIV_DEBUG
@@ -488,6 +491,9 @@ buf_block_init(
 
 	block->n_pointers = 0;
 
+	mutex_create(&block->mutex);
+	mutex_set_level(&block->mutex, SYNC_BUF_BLOCK);
+
 	rw_lock_create(&(block->lock));
 	ut_ad(rw_lock_validate(&(block->lock)));
 
@@ -756,8 +762,15 @@ buf_awe_map_page_to_frame(
 	bck = UT_LIST_GET_LAST(buf_pool->awe_LRU_free_mapped);
 
 	while (bck) {
-		if (bck->state == BUF_BLOCK_FILE_PAGE
-		    && (bck->buf_fix_count != 0 || bck->io_fix != 0)) {
+		ibool	skip;
+
+		mutex_enter(&bck->mutex);
+
+		skip = (bck->state == BUF_BLOCK_FILE_PAGE
+			&& (bck->buf_fix_count != 0 || bck->io_fix != 0));
+
+		if (skip) {
+			mutex_exit(&bck->mutex);
 
 			/* We have to skip this */
 			bck = UT_LIST_GET_PREV(awe_LRU_free_mapped, bck);
@@ -790,6 +803,8 @@ buf_awe_map_page_to_frame(
 
 			buf_pool->n_pages_awe_remapped++;
 
+			mutex_exit(&bck->mutex);
+
 			return;
 		}
 	}
@@ -828,13 +843,22 @@ buf_block_make_young(
 /*=================*/
 	buf_block_t*	block)	/* in: block to make younger */
 {
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!mutex_own(&(buf_pool->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+	/* Note that we read freed_page_clock's without holding any mutex:
+	this is allowed since the result is used only in heuristics */
+
 	if (buf_pool->freed_page_clock >= block->freed_page_clock
-			+ 1 + (buf_pool->curr_size / 1024)) {
+			+ 1 + (buf_pool->curr_size / 4)) {
+
+		mutex_enter(&buf_pool->mutex);
 
 		/* There has been freeing activity in the LRU list:
 		best to move to the head of the LRU list */
 
 		buf_LRU_make_block_young(block);
+
+		mutex_exit(&buf_pool->mutex);
 	}
 }
@@ -869,12 +893,16 @@ buf_block_free(
 /*===========*/
 	buf_block_t*	block)	/* in, own: block to be freed */
 {
-	ut_a(block->state != BUF_BLOCK_FILE_PAGE);
-
 	mutex_enter(&(buf_pool->mutex));
 
+	mutex_enter(&block->mutex);
+
+	ut_a(block->state != BUF_BLOCK_FILE_PAGE);
+
 	buf_LRU_block_free_non_file_page(block);
 
+	mutex_exit(&block->mutex);
+
 	mutex_exit(&(buf_pool->mutex));
 }
@@ -1093,9 +1121,8 @@ buf_page_get_gen(
 #endif
 	buf_pool->n_page_gets++;
 loop:
-	mutex_enter_fast(&(buf_pool->mutex));
-
 	block = NULL;
+	mutex_enter_fast(&(buf_pool->mutex));
 
 	if (guess) {
 		block = buf_block_align(guess);
@@ -1133,6 +1160,8 @@ loop:
 		goto loop;
 	}
 
+	mutex_enter(&block->mutex);
+
 	ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 
 	must_read = FALSE;
@@ -1142,9 +1171,9 @@ loop:
 		must_read = TRUE;
 
 		if (mode == BUF_GET_IF_IN_POOL) {
-
 			/* The page is only being read to buffer */
-			mutex_exit(&(buf_pool->mutex));
+			mutex_exit(&buf_pool->mutex);
+			mutex_exit(&block->mutex);
 
 			return(NULL);
 		}
@@ -1168,7 +1197,7 @@ loop:
 #else
 	buf_block_buf_fix_inc(block);
 #endif
-	buf_block_make_young(block);
+	mutex_exit(&buf_pool->mutex);
 
 	/* Check if this is the first access to the page */
 
@@ -1176,10 +1205,13 @@ loop:
 
 	block->accessed = TRUE;
 
+	mutex_exit(&block->mutex);
+
+	buf_block_make_young(block);
+
 #ifdef UNIV_DEBUG_FILE_ACCESSES
 	ut_a(block->file_page_was_freed == FALSE);
 #endif
-	mutex_exit(&(buf_pool->mutex));
 
 #ifdef UNIV_DEBUG
 	buf_dbg_counter++;
@@ -1204,13 +1236,14 @@ loop:
 	}
 
 	if (!success) {
-		mutex_enter(&(buf_pool->mutex));
+		mutex_enter(&block->mutex);
 
 		block->buf_fix_count--;
+
+		mutex_exit(&block->mutex);
 #ifdef UNIV_SYNC_DEBUG
 		rw_lock_s_unlock(&(block->debug_latch));
 #endif
-		mutex_exit(&(buf_pool->mutex));
 
 		return(NULL);
 	}
 
@@ -1221,18 +1254,16 @@ loop:
 	completes */
 
 	for (;;) {
-		mutex_enter(&(buf_pool->mutex));
+		mutex_enter(&block->mutex);
 
 		if (block->io_fix == BUF_IO_READ) {
-			mutex_exit(&(buf_pool->mutex));
+			mutex_exit(&block->mutex);
 
-			/* Sleep 20 milliseconds */
-
-			os_thread_sleep(20000);
+			os_thread_sleep(WAIT_FOR_READ);
 		} else {
-			mutex_exit(&(buf_pool->mutex));
+			mutex_exit(&block->mutex);
 
 			break;
 		}
@@ -1290,15 +1321,15 @@ buf_page_optimistic_get_func(
 	ut_ad(mtr && block);
 	ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
-
-	mutex_enter(&(buf_pool->mutex));
 
 	/* If AWE is used, block may have a different frame now, e.g., NULL */
-
+
+	mutex_enter(&block->mutex);
+
 	if (UNIV_UNLIKELY(block->state != BUF_BLOCK_FILE_PAGE)
-	    || UNIV_UNLIKELY(block->frame != guess)) {
-exit_func:
-		mutex_exit(&(buf_pool->mutex));
+	    || UNIV_UNLIKELY(block->frame != guess)) {
+
+		mutex_exit(&block->mutex);
 
 		return(FALSE);
 	}
@@ -1308,15 +1339,14 @@ buf_page_optimistic_get_func(
 #else
 	buf_block_buf_fix_inc(block);
 #endif
-	buf_block_make_young(block);
-
-	/* Check if this is the first access to the page */
-
 	accessed = block->accessed;
-
 	block->accessed = TRUE;
 
-	mutex_exit(&(buf_pool->mutex));
+	mutex_exit(&block->mutex);
+
+	buf_block_make_young(block);
+
+	/* Check if this is the first access to the page */
 
 	ut_ad(!ibuf_inside() || ibuf_page(block->space, block->offset));
@@ -1331,13 +1361,16 @@ buf_page_optimistic_get_func(
 	}
 
 	if (UNIV_UNLIKELY(!success)) {
-		mutex_enter(&(buf_pool->mutex));
+		mutex_enter(&block->mutex);
 
 		block->buf_fix_count--;
+
+		mutex_exit(&block->mutex);
+
 #ifdef UNIV_SYNC_DEBUG
 		rw_lock_s_unlock(&(block->debug_latch));
 #endif
-		goto exit_func;
+		return(FALSE);
 	}
 
 	if (UNIV_UNLIKELY(!UT_DULINT_EQ(modify_clock, block->modify_clock))) {
@@ -1350,13 +1383,16 @@ buf_page_optimistic_get_func(
 		rw_lock_x_unlock(&(block->lock));
 	}
 
-	mutex_enter(&(buf_pool->mutex));
+	mutex_enter(&block->mutex);
 
 	block->buf_fix_count--;
+
+	mutex_exit(&block->mutex);
+
 #ifdef UNIV_SYNC_DEBUG
 	rw_lock_s_unlock(&(block->debug_latch));
 #endif
-	goto exit_func;
+	return(FALSE);
 	}
 
 	mtr_memo_push(mtr, block, fix_type);
@@ -1413,10 +1449,10 @@ buf_page_get_known_nowait(
 	ut_ad(mtr);
 	ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
 
-	mutex_enter(&(buf_pool->mutex));
-
 	block = buf_block_align(guess);
 
+	mutex_enter(&block->mutex);
+
 	if (block->state == BUF_BLOCK_REMOVE_HASH) {
 		/* Another thread is just freeing the block from the LRU list
 		of the buffer pool: do not try to access this page; this
 		we have already removed it from the page address hash table
 		of the buffer pool. */
-		mutex_exit(&(buf_pool->mutex));
+		mutex_exit(&block->mutex);
 
 		return(FALSE);
 	}
@@ -1437,12 +1473,12 @@ buf_page_get_known_nowait(
 #else
 	buf_block_buf_fix_inc(block);
 #endif
+	mutex_exit(&block->mutex);
+
 	if (mode == BUF_MAKE_YOUNG) {
 		buf_block_make_young(block);
 	}
 
-	mutex_exit(&(buf_pool->mutex));
-
 	ut_ad(!ibuf_inside() || (mode == BUF_KEEP_OLD));
 
 	if (rw_latch == RW_S_LATCH) {
@@ -1456,13 +1492,15 @@ buf_page_get_known_nowait(
 	}
 
 	if (!success) {
-		mutex_enter(&(buf_pool->mutex));
+		mutex_enter(&block->mutex);
 
 		block->buf_fix_count--;
+
+		mutex_exit(&block->mutex);
+
 #ifdef UNIV_SYNC_DEBUG
 		rw_lock_s_unlock(&(block->debug_latch));
 #endif
-		mutex_exit(&(buf_pool->mutex));
 
 		return(FALSE);
 	}
@@ -1510,8 +1548,7 @@ buf_page_init_for_backup_restore(
 	block->offset = offset;
 
 	block->lock_hash_val = 0;
-	block->lock_mutex = NULL;
-
+
 	block->freed_page_clock = 0;
 
 	block->newest_modification = ut_dulint_zero;
@@ -1543,6 +1580,7 @@ buf_page_init(
 {
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(mutex_own(&(buf_pool->mutex)));
+	ut_ad(mutex_own(&(block->mutex)));
 #endif /* UNIV_SYNC_DEBUG */
 
 	ut_a(block->state != BUF_BLOCK_FILE_PAGE);
@@ -1557,8 +1595,7 @@ buf_page_init(
 	block->index = NULL;
 
 	block->lock_hash_val = lock_rec_hash(space, offset);
-	block->lock_mutex = NULL;
-
+
 	/* Insert into the hash table of file pages */
 
 	if (buf_page_hash_get(space, offset)) {
@@ -1650,6 +1687,7 @@ buf_page_init_for_read(
 	ut_a(block);
 
 	mutex_enter(&(buf_pool->mutex));
+	mutex_enter(&block->mutex);
 
 	if (fil_tablespace_deleted_or_being_deleted_in_mem(space,
 						tablespace_version)) {
@@ -1662,7 +1700,9 @@ buf_page_init_for_read(
 		/* The page belongs to a space which has been deleted or is
 		being deleted, or the page is already in buf_pool, return */
 
+		mutex_exit(&block->mutex);
 		mutex_exit(&(buf_pool->mutex));
+
 		buf_block_free(block);
 
 		if (mode == BUF_READ_IBUF_PAGES_ONLY) {
@@ -1682,6 +1722,7 @@ buf_page_init_for_read(
 	buf_LRU_add_block(block, TRUE);	/* TRUE == to old blocks */
 
 	block->io_fix = BUF_IO_READ;
+
 	buf_pool->n_pend_reads++;
 
 	/* We set a pass-type x-lock on the frame because then the same
@@ -1693,6 +1734,7 @@ buf_page_init_for_read(
 
 	rw_lock_x_lock_gen(&(block->lock), BUF_IO_READ);
 
+	mutex_exit(&block->mutex);
 	mutex_exit(&(buf_pool->mutex));
 
 	if (mode == BUF_READ_IBUF_PAGES_ONLY) {
@@ -1757,6 +1799,8 @@ buf_page_create(
 
 	block = free_block;
 
+	mutex_enter(&block->mutex);
+
 	buf_page_init(space, offset, block);
 
 	/* The block must be put to the LRU list */
@@ -1767,13 +1811,15 @@ buf_page_create(
 #else
 	buf_block_buf_fix_inc(block);
 #endif
+	buf_pool->n_pages_created++;
+
+	mutex_exit(&(buf_pool->mutex));
+
 	mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX);
 
 	block->accessed = TRUE;
 
-	buf_pool->n_pages_created++;
-
-	mutex_exit(&(buf_pool->mutex));
+	mutex_exit(&block->mutex);
 
 	/* Delete possible entries for the page from the insert buffer:
 	such can exist if the page belonged to an index which was dropped */
@@ -1822,6 +1868,12 @@ buf_page_io_complete(
 
 	ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 
+	/* We do not need protect block->io_fix here by block->mutex to read
+	it because this is the only function where we can change the value
+	from BUF_IO_READ or BUF_IO_WRITE to some other value, and our code
+	ensures that this is the only thread that handles the i/o for this
+	block. */
+
 	io_type = block->io_fix;
 
 	if (io_type == BUF_IO_READ) {
@@ -1890,11 +1942,12 @@ buf_page_io_complete(
 		}
 	}
 
+	mutex_enter(&(buf_pool->mutex));
+	mutex_enter(&block->mutex);
+
 #ifdef UNIV_IBUF_DEBUG
 	ut_a(ibuf_count_get(block->space, block->offset) == 0);
 #endif
-	mutex_enter(&(buf_pool->mutex));
-
 	/* Because this thread which does the unlocking is not the same that
 	did the locking, we use a pass value != 0 in unlock, which simply
 	removes the newest lock debug record, without checking the thread
@@ -1937,6 +1990,7 @@ buf_page_io_complete(
 #endif /* UNIV_DEBUG */
 	}
 
+	mutex_exit(&block->mutex);
 	mutex_exit(&(buf_pool->mutex));
 
 #ifdef UNIV_DEBUG
@@ -1999,6 +2053,8 @@ buf_validate(void)
 
 		block = buf_pool_get_nth_block(buf_pool, i);
 
+		mutex_enter(&block->mutex);
+
 		if (block->state == BUF_BLOCK_FILE_PAGE) {
 
 			ut_a(buf_page_hash_get(block->space,
@@ -2042,6 +2098,8 @@ buf_validate(void)
 		} else if (block->state == BUF_BLOCK_NOT_USED) {
 			n_free++;
 		}
+
+		mutex_exit(&block->mutex);
 	}
 
 	if (n_lru + n_free > buf_pool->curr_size) {
@@ -2185,11 +2243,17 @@ buf_get_latched_pages_number(void)
 
 	for (i = 0; i < buf_pool->curr_size; i++) {
 
-		block = buf_pool_get_nth_block(buf_pool, i);
+		block = buf_pool_get_nth_block(buf_pool, i);
+
+		if (block->magic_n == BUF_BLOCK_MAGIC_N) {
+			mutex_enter(&block->mutex);
 
-		if (((block->buf_fix_count != 0) || (block->io_fix != 0)) &&
-		    block->magic_n == BUF_BLOCK_MAGIC_N )
-			fixed_pages_number++;
+			if (block->buf_fix_count != 0 || block->io_fix != 0) {
+				fixed_pages_number++;
+			}
+
+			mutex_exit(&block->mutex);
+		}
 	}
 
 	mutex_exit(&(buf_pool->mutex));
@@ -2354,16 +2418,21 @@ buf_all_freed(void)
 
 		block = buf_pool_get_nth_block(buf_pool, i);
 
+		mutex_enter(&block->mutex);
+
 		if (block->state == BUF_BLOCK_FILE_PAGE) {
 
 			if (!buf_flush_ready_for_replace(block)) {
 
 				fprintf(stderr,
 					"Page %lu %lu still fixed or dirty\n",
-					(ulong) block->space, (ulong) block->offset);
+					(ulong) block->space,
+					(ulong) block->offset);
 				ut_error;
 			}
 		}
+
+		mutex_exit(&block->mutex);
 	}
 
 	mutex_exit(&(buf_pool->mutex));
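The buf0buf.c changes above are the heart of the Bug #15815/#22868 fix: the hot per-page fields (buf_fix_count, io_fix, accessed) move from the protection of the single buffer pool mutex to a new per-block mutex, so pinning a page no longer serializes every thread on one lock. A minimal sketch of that split using POSIX threads; all names here (block_pin, block_evict, and so on) are illustrative stand-ins, not the InnoDB API:

/* A global pool mutex guards the shared lists; each block carries its own
   mutex for the fields touched on every page access.  Hypothetical names. */
#include <pthread.h>
#include <stdio.h>

struct block {
	pthread_mutex_t	mutex;		/* protects fix_count and io_fix */
	int		fix_count;	/* threads that have the page pinned */
	int		io_fix;		/* nonzero while an I/O is pending */
};

static pthread_mutex_t pool_mutex = PTHREAD_MUTEX_INITIALIZER; /* lists only */

static void block_pin(struct block *b)
{
	/* No pool_mutex here: pins on different pages no longer contend. */
	pthread_mutex_lock(&b->mutex);
	b->fix_count++;
	pthread_mutex_unlock(&b->mutex);
}

static void block_unpin(struct block *b)
{
	pthread_mutex_lock(&b->mutex);
	b->fix_count--;
	pthread_mutex_unlock(&b->mutex);
}

/* State changes take BOTH mutexes, pool first, block second -- the same
   lock order the patch documents for block->state. */
static void block_evict(struct block *b)
{
	pthread_mutex_lock(&pool_mutex);
	pthread_mutex_lock(&b->mutex);
	/* ... unlink from the LRU and hash lists, reset the state ... */
	pthread_mutex_unlock(&b->mutex);
	pthread_mutex_unlock(&pool_mutex);
}

int main(void)
{
	struct block b = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

	block_pin(&b);
	printf("fix_count = %d\n", b.fix_count);
	block_unpin(&b);
	block_evict(&b);
	return 0;
}

The design point is that the pool mutex is now held only while shared list structures change; routine per-page traffic takes only the block's own mutex.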
diff --git a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c
index e39d1ae0a71..fc7b60bf5fb 100644
--- a/innobase/buf/buf0flu.c
+++ b/innobase/buf/buf0flu.c
@@ -114,6 +114,7 @@ buf_flush_ready_for_replace(
 {
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(mutex_own(&(buf_pool->mutex)));
+	ut_ad(mutex_own(&block->mutex));
 #endif /* UNIV_SYNC_DEBUG */
 	if (block->state != BUF_BLOCK_FILE_PAGE) {
 		ut_print_timestamp(stderr);
@@ -148,6 +149,7 @@ buf_flush_ready_for_flush(
 {
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(mutex_own(&(buf_pool->mutex)));
+	ut_ad(mutex_own(&(block->mutex)));
 #endif /* UNIV_SYNC_DEBUG */
 	ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 
@@ -539,8 +541,15 @@ buf_flush_try_page(
 
 	ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);
 
+	if (!block) {
+		mutex_exit(&(buf_pool->mutex));
+		return(0);
+	}
+
+	mutex_enter(&block->mutex);
+
 	if (flush_type == BUF_FLUSH_LIST
-	    && block && buf_flush_ready_for_flush(block, flush_type)) {
+	    && buf_flush_ready_for_flush(block, flush_type)) {
 
 		block->io_fix = BUF_IO_WRITE;
@@ -578,6 +587,7 @@ buf_flush_try_page(
 			locked = TRUE;
 		}
 
+		mutex_exit(&block->mutex);
 		mutex_exit(&(buf_pool->mutex));
 
 		if (!locked) {
@@ -598,8 +608,8 @@ buf_flush_try_page(
 
 		return(1);
 
-	} else if (flush_type == BUF_FLUSH_LRU && block
-		   && buf_flush_ready_for_flush(block, flush_type)) {
+	} else if (flush_type == BUF_FLUSH_LRU
+		   && buf_flush_ready_for_flush(block, flush_type)) {
 
 		/* VERY IMPORTANT:
 		Because any thread may call the LRU flush, even when owning
@@ -639,14 +649,15 @@ buf_flush_try_page(
 		buf_pool mutex: this ensures that
 		the latch is acquired immediately. */
 
+		mutex_exit(&block->mutex);
 		mutex_exit(&(buf_pool->mutex));
 
 		buf_flush_write_block_low(block);
 
 		return(1);
 
-	} else if (flush_type == BUF_FLUSH_SINGLE_PAGE && block
-		   && buf_flush_ready_for_flush(block, flush_type)) {
+	} else if (flush_type == BUF_FLUSH_SINGLE_PAGE
+		   && buf_flush_ready_for_flush(block, flush_type)) {
 
 		block->io_fix = BUF_IO_WRITE;
@@ -672,6 +683,7 @@ buf_flush_try_page(
 
 		(buf_pool->n_flush[flush_type])++;
 
+		mutex_exit(&block->mutex);
 		mutex_exit(&(buf_pool->mutex));
 
 		rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);
@@ -688,11 +700,12 @@ buf_flush_try_page(
 		buf_flush_write_block_low(block);
 
 		return(1);
-	} else {
-		mutex_exit(&(buf_pool->mutex));
+	}
 
-		return(0);
-	}
+	mutex_exit(&block->mutex);
+	mutex_exit(&(buf_pool->mutex));
+
+	return(0);
 }
 
 /***************************************************************
@@ -737,34 +750,48 @@ buf_flush_try_neighbors(
 		block = buf_page_hash_get(space, i);
 		ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);
 
-		if (block && flush_type == BUF_FLUSH_LRU && i != offset
-		    && !block->old) {
+		if (!block) {
+
+			continue;
+
+		} else if (flush_type == BUF_FLUSH_LRU && i != offset
+			   && !block->old) {
 
 			/* We avoid flushing 'non-old' blocks in an LRU flush,
 			because the flushed blocks are soon freed */
 
 			continue;
-		}
+		} else {
+
+			mutex_enter(&block->mutex);
+
+			if (buf_flush_ready_for_flush(block, flush_type)
+			    && (i == offset || block->buf_fix_count == 0)) {
+				/* We only try to flush those
+				neighbors != offset where the buf fix count is
+				zero, as we then know that we probably can
+				latch the page without a semaphore wait.
+				Semaphore waits are expensive because we must
+				flush the doublewrite buffer before we start
+				waiting. */
 
-		if (block && buf_flush_ready_for_flush(block, flush_type)
-		    && (i == offset || block->buf_fix_count == 0)) {
-			/* We only try to flush those neighbors != offset
-			where the buf fix count is zero, as we then know that
-			we probably can latch the page without a semaphore
-			wait. Semaphore waits are expensive because we must
-			flush the doublewrite buffer before we start
-			waiting. */
+				mutex_exit(&block->mutex);
 
-			mutex_exit(&(buf_pool->mutex));
+				mutex_exit(&(buf_pool->mutex));
 
-			/* Note: as we release the buf_pool mutex above, in
-			buf_flush_try_page we cannot be sure the page is still
-			in a flushable state: therefore we check it again
-			inside that function. */
+				/* Note: as we release the buf_pool mutex
+				above, in buf_flush_try_page we cannot be sure
+				the page is still in a flushable state:
+				therefore we check it again inside that
+				function. */
 
-			count += buf_flush_try_page(space, i, flush_type);
+				count += buf_flush_try_page(space, i,
+							    flush_type);
 
-			mutex_enter(&(buf_pool->mutex));
+				mutex_enter(&(buf_pool->mutex));
+			} else {
+				mutex_exit(&block->mutex);
+			}
 		}
 	}
@@ -858,12 +885,15 @@ buf_flush_batch(
 		while ((block != NULL) && !found) {
 			ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 
+			mutex_enter(&block->mutex);
+
 			if (buf_flush_ready_for_flush(block, flush_type)) {
 
 				found = TRUE;
 				space = block->space;
 				offset = block->offset;
 
+				mutex_exit(&block->mutex);
 				mutex_exit(&(buf_pool->mutex));
 
 				old_page_count = page_count;
@@ -881,10 +911,14 @@ buf_flush_batch(
 
 			} else if (flush_type == BUF_FLUSH_LRU) {
 
+				mutex_exit(&block->mutex);
+
 				block = UT_LIST_GET_PREV(LRU, block);
 			} else {
 				ut_ad(flush_type == BUF_FLUSH_LIST);
 
+				mutex_exit(&block->mutex);
+
 				block = UT_LIST_GET_PREV(flush_list, block);
 			}
 		}
@@ -966,10 +1000,14 @@ buf_flush_LRU_recommendation(void)
 	       + BUF_FLUSH_EXTRA_MARGIN)
 	       && (distance < BUF_LRU_FREE_SEARCH_LEN)) {
 
+		mutex_enter(&block->mutex);
+
 		if (buf_flush_ready_for_replace(block)) {
 			n_replaceable++;
 		}
 
+		mutex_exit(&block->mutex);
+
 		distance++;
 
 		block = UT_LIST_GET_PREV(LRU, block);
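The comment added in buf_flush_try_neighbors() states the rule this whole file now follows: once the buf_pool mutex is released, a page seen as flushable may have changed, so buf_flush_try_page() must re-check under the locks. A sketch of that check-drop-recheck discipline; the names (ready_for_flush, try_flush) are hypothetical, not the InnoDB functions:

/* The flushability test is only meaningful while the page mutex is held,
   so any caller that dropped the mutex must test again before acting. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct page {
	pthread_mutex_t	mutex;
	bool		dirty;
	int		fix_count;
};

/* Valid only while p->mutex is held. */
static bool ready_for_flush(const struct page *p)
{
	return p->dirty && p->fix_count == 0;
}

static bool try_flush(struct page *p)
{
	bool flushed = false;

	pthread_mutex_lock(&p->mutex);
	/* The caller saw the page as flushable, but it released the mutex
	   before calling us, so the answer may have changed: check again. */
	if (ready_for_flush(p)) {
		p->dirty = false;	/* stand-in for the real write-out */
		flushed = true;
	}
	pthread_mutex_unlock(&p->mutex);
	return flushed;
}

int main(void)
{
	struct page p = { PTHREAD_MUTEX_INITIALIZER, true, 0 };

	printf("flushed: %d\n", try_flush(&p));
	return 0;
}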
diff --git a/innobase/buf/buf0lru.c b/innobase/buf/buf0lru.c
index 8b135cc5db3..dfee2add045 100644
--- a/innobase/buf/buf0lru.c
+++ b/innobase/buf/buf0lru.c
@@ -86,6 +86,9 @@ scan_again:
 	block = UT_LIST_GET_LAST(buf_pool->LRU);
 
 	while (block != NULL) {
+
+		mutex_enter(&block->mutex);
+
 		ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 
 		if (block->space == id
@@ -112,6 +115,8 @@ scan_again:
 			if (block->is_hashed) {
 				page_no = block->offset;
 
+				mutex_exit(&block->mutex);
+
 				mutex_exit(&(buf_pool->mutex));
 
 				/* Note that the following call will acquire
@@ -138,6 +143,7 @@ scan_again:
 			buf_LRU_block_free_hashed_page(block);
 		}
 next_page:
+		mutex_exit(&block->mutex);
 		block = UT_LIST_GET_PREV(LRU, block);
 	}
@@ -211,6 +217,9 @@ buf_LRU_search_and_free_block(
 
 	while (block != NULL) {
 		ut_a(block->in_LRU_list);
+
+		mutex_enter(&block->mutex);
+
 		if (buf_flush_ready_for_replace(block)) {
 
 #ifdef UNIV_DEBUG
@@ -225,6 +234,7 @@ buf_LRU_search_and_free_block(
 			buf_LRU_block_remove_hashed_page(block);
 
 			mutex_exit(&(buf_pool->mutex));
+			mutex_exit(&block->mutex);
 
 			/* Remove possible adaptive hash index built on the
 			page; in the case of AWE the block may not have a
@@ -233,15 +243,21 @@ buf_LRU_search_and_free_block(
 			if (block->frame) {
 				btr_search_drop_page_hash_index(block->frame);
 			}
-			mutex_enter(&(buf_pool->mutex));
 
 			ut_a(block->buf_fix_count == 0);
 
+			mutex_enter(&(buf_pool->mutex));
+			mutex_enter(&block->mutex);
+
 			buf_LRU_block_free_hashed_page(block);
 
 			freed = TRUE;
+			mutex_exit(&block->mutex);
 
 			break;
 		}
+
+		mutex_exit(&block->mutex);
+
 		block = UT_LIST_GET_PREV(LRU, block);
 
 		distance++;
@@ -415,8 +431,12 @@ loop:
 		}
 	}
 
+	mutex_enter(&block->mutex);
+
 	block->state = BUF_BLOCK_READY_FOR_USE;
 
+	mutex_exit(&block->mutex);
+
 	mutex_exit(&(buf_pool->mutex));
 
 	if (started_monitor) {
@@ -818,6 +838,7 @@ buf_LRU_block_free_non_file_page(
 {
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(mutex_own(&(buf_pool->mutex)));
+	ut_ad(mutex_own(&block->mutex));
#endif /* UNIV_SYNC_DEBUG */
 	ut_ad(block);
 
@@ -857,6 +878,7 @@ buf_LRU_block_remove_hashed_page(
 {
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(mutex_own(&(buf_pool->mutex)));
+	ut_ad(mutex_own(&block->mutex));
 #endif /* UNIV_SYNC_DEBUG */
 	ut_ad(block);
 
@@ -914,6 +936,7 @@ buf_LRU_block_free_hashed_page(
 {
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(mutex_own(&(buf_pool->mutex)));
+	ut_ad(mutex_own(&block->mutex));
 #endif /* UNIV_SYNC_DEBUG */
 	ut_a(block->state == BUF_BLOCK_REMOVE_HASH);
diff --git a/innobase/dict/dict0crea.c b/innobase/dict/dict0crea.c
index c7d6ffd2c22..e20d8b6e83a 100644
--- a/innobase/dict/dict0crea.c
+++ b/innobase/dict/dict0crea.c
@@ -724,8 +724,10 @@ dict_truncate_index_tree(
 				/* out: new root page number, or FIL_NULL on
 				failure */
 	dict_table_t*	table,	/* in: the table the index belongs to */
-	rec_t*		rec,	/* in: record in the clustered index of
-				SYS_INDEXES table */
+	btr_pcur_t*	pcur,	/* in/out: persistent cursor pointing to
+				record in the clustered index of
+				SYS_INDEXES table. The cursor may be
+				repositioned in this call. */
 	mtr_t*		mtr)	/* in: mtr having the latch
 				on the record page. The mtr may be
 				committed and restarted in this call. */
@@ -734,6 +736,7 @@ dict_truncate_index_tree(
 	ulint		space;
 	ulint		type;
 	dulint		index_id;
+	rec_t*		rec;
 	byte*		ptr;
 	ulint		len;
 	ulint		comp;
@@ -744,6 +747,7 @@ dict_truncate_index_tree(
 #endif /* UNIV_SYNC_DEBUG */
 	ut_a(!dict_sys->sys_indexes->comp);
 
+	rec = btr_pcur_get_rec(pcur);
 	ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len);
 
 	ut_ad(len == 4);
@@ -809,10 +813,11 @@ dict_truncate_index_tree(
 	/* We will need to commit the mini-transaction in order to avoid
 	deadlocks in the btr_create() call, because otherwise we would
 	be freeing and allocating pages in the same mini-transaction. */
+	btr_pcur_store_position(pcur, mtr);
 	mtr_commit(mtr);
-	/* mtr_commit() will invalidate rec. */
-	rec = NULL;
+
+	mtr_start(mtr);
+	btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);
 
 	/* Find the index corresponding to this SYS_INDEXES record. */
 	for (index = UT_LIST_GET_FIRST(table->indexes);
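dict_truncate_index_tree() now takes a persistent cursor and brackets the mtr_commit()/mtr_start() pair with btr_pcur_store_position() and btr_pcur_restore_position(), because committing the mini-transaction invalidates the raw record pointer. A toy version of that store/restore idiom, with a sorted array standing in for the B-tree and every name hypothetical:

/* Save a logical position (the key) before an operation that invalidates
   the physical one, then re-seek afterwards.  Hypothetical names. */
#include <stddef.h>
#include <stdio.h>

struct cursor {
	const int	*table;		/* the "index" being scanned */
	size_t		n;
	size_t		pos;		/* physical slot: can be invalidated */
	int		saved_key;	/* logical position: survives */
};

static void cursor_store_position(struct cursor *c)
{
	c->saved_key = c->table[c->pos];
}

static void cursor_restore_position(struct cursor *c)
{
	/* Re-seek by key, since the physical slot may have moved. */
	for (c->pos = 0; c->pos < c->n; c->pos++) {
		if (c->table[c->pos] >= c->saved_key) {
			return;
		}
	}
}

int main(void)
{
	const int	keys[] = { 3, 7, 9, 12 };
	struct cursor	c = { keys, 4, 2, 0 };

	cursor_store_position(&c);
	c.pos = 999;			/* "mtr_commit(): pointer invalid" */
	cursor_restore_position(&c);
	printf("restored to key %d\n", keys[c.pos]);
	return 0;
}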
diff --git a/innobase/fil/fil0fil.c b/innobase/fil/fil0fil.c
index 64987294654..65320b57183 100644
--- a/innobase/fil/fil0fil.c
+++ b/innobase/fil/fil0fil.c
@@ -4285,29 +4285,47 @@ fil_flush_file_spaces(
 {
 	fil_system_t*	system	= fil_system;
 	fil_space_t*	space;
+	ulint*		space_ids;
+	ulint		n_space_ids;
+	ulint		i;
 
 	mutex_enter(&(system->mutex));
 
-	space = UT_LIST_GET_FIRST(system->unflushed_spaces);
+	n_space_ids = UT_LIST_GET_LEN(system->unflushed_spaces);
+	if (n_space_ids == 0) {
 
-	while (space) {
-		if (space->purpose == purpose && !space->is_being_deleted) {
+		mutex_exit(&system->mutex);
+		return;
+	}
 
-			space->n_pending_flushes++;	/* prevent dropping of the
-							space while we are
-							flushing */
-			mutex_exit(&(system->mutex));
+	/* Assemble a list of space ids to flush.  Previously, we
+	traversed system->unflushed_spaces and called UT_LIST_GET_NEXT()
+	on a space that was just removed from the list by fil_flush().
+	Thus, the space could be dropped and the memory overwritten. */
+	space_ids = mem_alloc(n_space_ids * sizeof *space_ids);
 
-			fil_flush(space->id);
+	n_space_ids = 0;
 
-			mutex_enter(&(system->mutex));
+	for (space = UT_LIST_GET_FIRST(system->unflushed_spaces);
+	     space;
+	     space = UT_LIST_GET_NEXT(unflushed_spaces, space)) {
+
+		if (space->purpose == purpose && !space->is_being_deleted) {
 
-			space->n_pending_flushes--;
+			space_ids[n_space_ids++] = space->id;
 		}
-		space = UT_LIST_GET_NEXT(unflushed_spaces, space);
 	}
-
-	mutex_exit(&(system->mutex));
+
+	mutex_exit(&system->mutex);
+
+	/* Flush the spaces.  It will not hurt to call fil_flush() on
+	a non-existing space id. */
+	for (i = 0; i < n_space_ids; i++) {
+
+		fil_flush(space_ids[i]);
+	}
+
+	mem_free(space_ids);
 }
 
 /**********************************************************************
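The fil_flush_file_spaces() rewrite is the fix for Bug #24089: instead of walking system->unflushed_spaces while repeatedly releasing the mutex (fil_flush() can unlink the very node the iterator holds), it snapshots the space ids under the mutex and flushes from the private copy. The same pattern in generic form; all names are hypothetical:

/* Snapshot-under-lock: copy the ids out while the list is stable, then do
   the slow work without touching the live list.  Hypothetical names. */
#include <pthread.h>
#include <stdlib.h>

struct space {
	unsigned long	id;
	struct space	*next;
};

static pthread_mutex_t	list_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct space	*unflushed;	/* shared list, guarded by list_mutex */
static size_t		unflushed_len;

/* Harmless when called with an id that was dropped meanwhile. */
static void flush_space(unsigned long id)
{
	(void) id;	/* fsync() the data files of this space in real code */
}

static void flush_all_spaces(void)
{
	unsigned long	*ids;
	size_t		n = 0, i;
	struct space	*s;

	/* Copy the ids while the list cannot change under us... */
	pthread_mutex_lock(&list_mutex);
	ids = malloc(unflushed_len * sizeof *ids);
	for (s = unflushed; s != NULL && ids != NULL; s = s->next) {
		ids[n++] = s->id;
	}
	pthread_mutex_unlock(&list_mutex);

	/* ...then work from the private copy: no iterator can dangle. */
	for (i = 0; i < n; i++) {
		flush_space(ids[i]);
	}
	free(ids);
}

int main(void)
{
	flush_all_spaces();
	return 0;
}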
diff --git a/innobase/include/buf0buf.h b/innobase/include/buf0buf.h
index fc1d9a64c7f..11e5bb39e63 100644
--- a/innobase/include/buf0buf.h
+++ b/innobase/include/buf0buf.h
@@ -461,8 +461,8 @@
 Gets the mutex number protecting the page record lock hash chain in the lock
 table. */
 UNIV_INLINE
 mutex_t*
-buf_frame_get_lock_mutex(
-/*=====================*/
+buf_frame_get_mutex(
+/*================*/
 	/* out: mutex */
 	byte*	ptr);	/* in: pointer to within a buffer frame */
@@ -713,7 +713,10 @@ struct buf_block_struct{
 
 	ulint		magic_n;	/* magic number to check */
 	ulint		state;		/* state of the control block:
-					BUF_BLOCK_NOT_USED, ... */
+					BUF_BLOCK_NOT_USED, ...; changing
+					this is only allowed when a thread
+					has BOTH the buffer pool mutex AND
+					block->mutex locked */
 	byte*		frame;		/* pointer to buffer frame which
 					is of size UNIV_PAGE_SIZE, and
 					aligned to an address divisible by
@@ -731,8 +734,12 @@ struct buf_block_struct{
 	ulint		offset;		/* page number within the space */
 	ulint		lock_hash_val;	/* hashed value of the page address
 					in the record lock hash table */
-	mutex_t*	lock_mutex;	/* mutex protecting the chain in the
-					record lock hash table */
+	mutex_t		mutex;		/* mutex protecting this block:
+					state (also protected by the buffer
+					pool mutex), io_fix, buf_fix_count,
+					and accessed; we introduce this new
+					mutex in InnoDB-5.1 to relieve
+					contention on the buffer pool mutex */
 	rw_lock_t	lock;		/* read-write lock of the buffer
 					frame */
 	buf_block_t*	hash;		/* node used in chaining to the page
@@ -788,20 +795,27 @@ struct buf_block_struct{
 					in heuristic algorithms, because of
 					the possibility of a wrap-around! */
 	ulint		freed_page_clock;/* the value of freed_page_clock
-					buffer pool when this block was
-					last time put to the head of the
-					LRU list */
+					of the buffer pool when this block was
+					the last time put to the head of the
+					LRU list; a thread is allowed to
+					read this for heuristic purposes
+					without holding any mutex or latch */
 	ibool		old;		/* TRUE if the block is in the old
 					blocks in the LRU list */
 	ibool		accessed;	/* TRUE if the page has been accessed
 					while in the buffer pool: read-ahead
 					may read in pages which have not been
-					accessed yet */
+					accessed yet; this is protected by
+					block->mutex; a thread is allowed to
+					read this for heuristic purposes
+					without holding any mutex or latch */
 	ulint		buf_fix_count;	/* count of how manyfold this block
-					is currently bufferfixed */
+					is currently bufferfixed; this is
+					protected by block->mutex */
 	ulint		io_fix;		/* if a read is pending to the frame,
 					io_fix is BUF_IO_READ, in the case
-					of a write BUF_IO_WRITE, otherwise 0 */
+					of a write BUF_IO_WRITE, otherwise 0;
+					this is protected by block->mutex */
 
 	/* 4. Optimistic search field */
 
 	dulint		modify_clock;	/* this clock is incremented every
@@ -962,7 +976,9 @@ struct buf_pool_struct{
 					number of buffer blocks removed from
 					the end of the LRU list; NOTE that
 					this counter may wrap around at 4
-					billion! */
+					billion! A thread is allowed to
+					read this for heuristic purposes
+					without holding any mutex or latch */
 	ulint		LRU_flush_ended;/* when an LRU flush ends for a page,
 					this is incremented by one; this is
 					set to zero when a buffer block is
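Several of the new buf0buf.h comments explicitly permit reading a field (freed_page_clock, accessed) with no mutex at all, because the value only feeds a heuristic; the mutex is taken only when the heuristic decides to act, which is exactly what buf_block_make_young() now does. A sketch of that idiom with hypothetical names:

/* Unlocked read for a heuristic: a stale value costs at most one missed
   or one extra LRU promotion, which the algorithm tolerates. */
#include <pthread.h>

static pthread_mutex_t	pool_mutex = PTHREAD_MUTEX_INITIALIZER;
static unsigned long	freed_clock;	/* written only under pool_mutex */

void make_young(unsigned long block_clock, unsigned long pool_size)
{
	/* Dirty read, no mutex: mirrors the buf_block_make_young() test. */
	if (freed_clock < block_clock + 1 + pool_size / 4) {
		return;		/* recently young enough: nothing to do */
	}

	/* Only the action is serialized. */
	pthread_mutex_lock(&pool_mutex);
	/* ... move the block to the head of the LRU list ... */
	pthread_mutex_unlock(&pool_mutex);
}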
diff --git a/innobase/include/buf0buf.ic b/innobase/include/buf0buf.ic
index af32db10b5f..ddc91b8d66c 100644
--- a/innobase/include/buf0buf.ic
+++ b/innobase/include/buf0buf.ic
@@ -330,8 +330,8 @@
 Gets the mutex number protecting the page record lock hash chain in the lock
 table. */
 UNIV_INLINE
 mutex_t*
-buf_frame_get_lock_mutex(
-/*=====================*/
+buf_frame_get_mutex(
+/*================*/
 	/* out: mutex */
 	byte*	ptr)	/* in: pointer to within a buffer frame */
 {
@@ -339,7 +339,7 @@ buf_frame_get_lock_mutex(
 
 	block = buf_block_align(ptr);
 
-	return(block->lock_mutex);
+	return(&block->mutex);
 }
 
 /*************************************************************************
@@ -512,6 +512,7 @@ buf_block_buf_fix_inc_debug(
 	ret = rw_lock_s_lock_func_nowait(&(block->debug_latch), file, line);
 
 	ut_ad(ret == TRUE);
+	ut_ad(mutex_own(&block->mutex));
 #endif
 	block->buf_fix_count++;
 }
@@ -524,6 +525,9 @@ buf_block_buf_fix_inc(
 /*==================*/
 	buf_block_t*	block)	/* in: block to bufferfix */
 {
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&block->mutex));
+#endif
 	block->buf_fix_count++;
 }
 #endif /* UNIV_SYNC_DEBUG */
@@ -618,23 +622,24 @@ buf_page_release(
 
 	ut_ad(block);
 
-	mutex_enter_fast(&(buf_pool->mutex));
-
 	ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 	ut_a(block->buf_fix_count > 0);
 
 	if (rw_latch == RW_X_LATCH && mtr->modifications) {
-
+		mutex_enter(&buf_pool->mutex);
 		buf_flush_note_modification(block, mtr);
+		mutex_exit(&buf_pool->mutex);
 	}
 
+	mutex_enter(&block->mutex);
+
 #ifdef UNIV_SYNC_DEBUG
 	rw_lock_s_unlock(&(block->debug_latch));
 #endif
 	buf_fix_count = block->buf_fix_count;
 	block->buf_fix_count = buf_fix_count - 1;
 
-	mutex_exit(&(buf_pool->mutex));
+	mutex_exit(&block->mutex);
 
 	if (rw_latch == RW_S_LATCH) {
 		rw_lock_s_unlock(&(block->lock));
diff --git a/innobase/include/dict0crea.h b/innobase/include/dict0crea.h
index 5dd571be59c..44acca61c5e 100644
--- a/innobase/include/dict0crea.h
+++ b/innobase/include/dict0crea.h
@@ -62,8 +62,10 @@ dict_truncate_index_tree(
 				/* out: new root page number, or FIL_NULL on
 				failure */
 	dict_table_t*	table,	/* in: the table the index belongs to */
-	rec_t*		rec,	/* in: record in the clustered index of
-				SYS_INDEXES table */
+	btr_pcur_t*	pcur,	/* in/out: persistent cursor pointing to
+				record in the clustered index of
+				SYS_INDEXES table. The cursor may be
+				repositioned in this call. */
 	mtr_t*		mtr);	/* in: mtr having the latch
 				on the record page. The mtr may be
 				committed and restarted in this call. */
diff --git a/innobase/include/sync0arr.h b/innobase/include/sync0arr.h
index fecd910683e..ba712d14aad 100644
--- a/innobase/include/sync0arr.h
+++ b/innobase/include/sync0arr.h
@@ -75,17 +75,12 @@ sync_array_free_cell(
 	sync_array_t*	arr,	/* in: wait array */
 	ulint		index);	/* in: index of the cell in array */
 /**************************************************************************
-Looks for the cells in the wait array which refer
-to the wait object specified,
-and sets their corresponding events to the signaled state. In this
-way releases the threads waiting for the object to contend for the object.
-It is possible that no such cell is found, in which case does nothing. */
+Note that one of the wait objects was signalled. */
 
 void
-sync_array_signal_object(
-/*=====================*/
-	sync_array_t*	arr,	/* in: wait array */
-	void*		object);/* in: wait object */
+sync_array_object_signalled(
+/*========================*/
+	sync_array_t*	arr);	/* in: wait array */
 /**************************************************************************
 If the wakeup algorithm does not work perfectly at semaphore relases,
 this function will do the waking (see the comment in mutex_exit). This
diff --git a/innobase/include/sync0rw.h b/innobase/include/sync0rw.h
index 911c8ac3f4a..741f9500612 100644
--- a/innobase/include/sync0rw.h
+++ b/innobase/include/sync0rw.h
@@ -411,6 +411,7 @@ blocked by readers, a writer may queue for the lock by setting the writer
 field. Then no new readers are allowed in. */
 
 struct rw_lock_struct {
+	os_event_t	event;	/* Used by sync0arr.c for thread queueing */
 	ulint	reader_count;	/* Number of readers who have locked this
 				lock in the shared mode */
 	ulint	writer;		/* This field is set to RW_LOCK_EX if there
diff --git a/innobase/include/sync0rw.ic b/innobase/include/sync0rw.ic
index 9e15475ae53..31a1ea6562a 100644
--- a/innobase/include/sync0rw.ic
+++ b/innobase/include/sync0rw.ic
@@ -382,7 +382,8 @@ rw_lock_s_unlock_func(
 	mutex_exit(mutex);
 
 	if (UNIV_UNLIKELY(sg)) {
-		sync_array_signal_object(sync_primary_wait_array, lock);
+		os_event_set(lock->event);
+		sync_array_object_signalled(sync_primary_wait_array);
 	}
 
 	ut_ad(rw_lock_validate(lock));
@@ -462,7 +463,8 @@ rw_lock_x_unlock_func(
 	mutex_exit(&(lock->mutex));
 
 	if (UNIV_UNLIKELY(sg)) {
-		sync_array_signal_object(sync_primary_wait_array, lock);
+		os_event_set(lock->event);
+		sync_array_object_signalled(sync_primary_wait_array);
 	}
 
 	ut_ad(rw_lock_validate(lock));
diff --git a/innobase/include/sync0sync.h b/innobase/include/sync0sync.h
index 5955ab9a06a..9893921c5d2 100644
--- a/innobase/include/sync0sync.h
+++ b/innobase/include/sync0sync.h
@@ -453,6 +453,7 @@ Do not use its fields directly! The structure used in the spin lock
 implementation of a mutual exclusion semaphore. */
 
 struct mutex_struct {
+	os_event_t	event;	/* Used by sync0arr.c for the wait queue */
 	ulint	lock_word;	/* This ulint is the target of the atomic
 				test-and-set instruction in Win32 */
 #if !defined(_WIN32) || !defined(UNIV_CAN_USE_X86_ASSEMBLER)
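The header changes above embed an os_event_t in every mutex and rw-lock, so rw_lock_s_unlock_func() and friends can wake exactly the waiters of that one object instead of scanning the primary wait array for matching cells (the old sync_array_signal_object()). A sketch of per-object signalling, with a POSIX condition variable standing in for os_event_t and all names hypothetical:

/* One wait queue per lock object: a release wakes only that object's
   waiters; the global wait array keeps nothing but a counter. */
#include <pthread.h>
#include <stdbool.h>

struct event {
	pthread_mutex_t	m;
	pthread_cond_t	c;
	bool		is_set;
};

struct rwlock {
	struct event	event;	/* embedded, like rw_lock_struct::event */
	/* ... lock word, reader count, writer field ... */
};

void event_init(struct event *e)
{
	pthread_mutex_init(&e->m, NULL);
	pthread_cond_init(&e->c, NULL);
	e->is_set = false;
}

void event_set(struct event *e)
{
	pthread_mutex_lock(&e->m);
	e->is_set = true;
	pthread_cond_broadcast(&e->c);	/* wakes this object's waiters only */
	pthread_mutex_unlock(&e->m);
}

static unsigned long sg_count;	/* statistics only, cf. arr->sg_count */

void lock_release_wake(struct rwlock *lk)
{
	event_set(&lk->event);
	sg_count++;	/* cf. sync_array_object_signalled(): just a count */
}

The win is that an unlock no longer iterates over every reserved wait cell while holding the wait-array mutex, which is what made heavy waiting so expensive under Bug #22868.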
diff --git a/innobase/os/os0sync.c b/innobase/os/os0sync.c
index 8bafb73baf8..a3204a7b3e8 100644
--- a/innobase/os/os0sync.c
+++ b/innobase/os/os0sync.c
@@ -21,6 +21,7 @@ Created 9/6/1995 Heikki Tuuri
 
 /* Type definition for an operating system mutex struct */
 struct os_mutex_struct{
+	os_event_t	event;	/* Used by sync0arr.c for queing threads */
 	void*		handle;	/* OS handle to mutex */
 	ulint		count;	/* we use this counter to check
 				that the same thread does not
@@ -35,6 +36,7 @@ struct os_mutex_struct{
 /* Mutex protecting counts and the lists of OS mutexes and events */
 os_mutex_t	os_sync_mutex;
 ibool		os_sync_mutex_inited	= FALSE;
+ibool		os_sync_free_called	= FALSE;
 
 /* This is incremented by 1 in os_thread_create and decremented by 1 in
 os_thread_exit */
@@ -50,6 +52,10 @@ ulint	os_event_count		= 0;
 ulint	os_mutex_count		= 0;
 ulint	os_fast_mutex_count	= 0;
 
+/* Because a mutex is embedded inside an event and there is an
+event embedded inside a mutex, on free, this generates a recursive call.
+This version of the free event function doesn't acquire the global lock */
+static void os_event_free_internal(os_event_t event);
 
 /*************************************************************
 Initializes global event and OS 'slow' mutex lists. */
@@ -76,6 +82,7 @@ os_sync_free(void)
 	os_event_t	event;
 	os_mutex_t	mutex;
 
+	os_sync_free_called = TRUE;
 	event = UT_LIST_GET_FIRST(os_event_list);
 
 	while (event) {
@@ -99,6 +106,7 @@ os_sync_free(void)
 
 		mutex = UT_LIST_GET_FIRST(os_mutex_list);
 	}
+	os_sync_free_called = FALSE;
 }
 
 /*************************************************************
@@ -146,14 +154,21 @@ os_event_create(
 	event->signal_count = 0;
 #endif /* __WIN__ */
 
-	/* Put to the list of events */
-	os_mutex_enter(os_sync_mutex);
+	/* The os_sync_mutex can be NULL because during startup an event
+	can be created [ because it's embedded in the mutex/rwlock ] before
+	this module has been initialized */
+	if (os_sync_mutex != NULL) {
+		os_mutex_enter(os_sync_mutex);
+	}
 
+	/* Put to the list of events */
 	UT_LIST_ADD_FIRST(os_event_list, os_event_list, event);
 
 	os_event_count++;
 
-	os_mutex_exit(os_sync_mutex);
+	if (os_sync_mutex != NULL) {
+		os_mutex_exit(os_sync_mutex);
+	}
 
 	return(event);
 }
@@ -256,6 +271,35 @@ os_event_reset(
 }
 
 /**************************************************************
+Frees an event object, without acquiring the global lock. */
+static
+void
+os_event_free_internal(
+/*===================*/
+	os_event_t	event)	/* in: event to free */
+{
+#ifdef __WIN__
+	ut_a(event);
+
+	ut_a(CloseHandle(event->handle));
+#else
+	ut_a(event);
+
+	/* This is to avoid freeing the mutex twice */
+	os_fast_mutex_free(&(event->os_mutex));
+
+	ut_a(0 == pthread_cond_destroy(&(event->cond_var)));
+#endif
+	/* Remove from the list of events */
+
+	UT_LIST_REMOVE(os_event_list, os_event_list, event);
+
+	os_event_count--;
+
+	ut_free(event);
+}
+
+/**************************************************************
 Frees an event object. */
 
 void
@@ -456,6 +500,7 @@ os_mutex_create(
 
 	mutex_str->handle = mutex;
 	mutex_str->count = 0;
+	mutex_str->event = os_event_create(NULL);
 
 	if (os_sync_mutex_inited) {
 		/* When creating os_sync_mutex itself we cannot reserve it */
@@ -532,6 +577,10 @@ os_mutex_free(
 {
 	ut_a(mutex);
 
+	if (!os_sync_free_called) {
+		os_event_free_internal(mutex->event);
+	}
+
 	if (os_sync_mutex_inited) {
 		os_mutex_enter(os_sync_mutex);
 	}
diff --git a/innobase/row/row0mysql.c b/innobase/row/row0mysql.c
index 955c7139de7..efbb93ba9f5 100644
--- a/innobase/row/row0mysql.c
+++ b/innobase/row/row0mysql.c
@@ -2920,12 +2920,10 @@ do not allow the TRUNCATE. We also reserve the data dictionary latch. */
 			goto next_rec;
 		}
 
-		btr_pcur_store_position(&pcur, &mtr);
+		/* This call may commit and restart mtr
+		and reposition pcur. */
+		root_page_no = dict_truncate_index_tree(table, &pcur, &mtr);
 
-		/* This call may commit and restart mtr. */
-		root_page_no = dict_truncate_index_tree(table, rec, &mtr);
-
-		btr_pcur_restore_position(BTR_MODIFY_LEAF, &pcur, &mtr);
 		rec = btr_pcur_get_rec(&pcur);
 
 		if (root_page_no != FIL_NULL) {
diff --git a/innobase/row/row0sel.c b/innobase/row/row0sel.c
index ec56afbb4f5..f8a65e6ff82 100644
--- a/innobase/row/row0sel.c
+++ b/innobase/row/row0sel.c
@@ -1323,6 +1323,12 @@ rec_loop:
 					ULINT_UNDEFINED, &heap);
 
 		if (srv_locks_unsafe_for_binlog) {
+
+			if (page_rec_is_supremum(rec)) {
+
+				goto next_rec;
+			}
+
 			lock_type = LOCK_REC_NOT_GAP;
 		} else {
 			lock_type = LOCK_ORDINARY;
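os0sync.c has to solve two bootstrap problems the embedded events create: an event can be constructed before os_sync_mutex itself exists (so os_event_create() must tolerate a NULL global mutex), and freeing a mutex frees its event, which must not recurse through the global lock (hence os_event_free_internal()). A sketch of the first guard, with hypothetical names; startup is assumed single-threaded, as in the patch:

/* List registration that tolerates being called before the module's own
   global mutex has been created.  Hypothetical names. */
#include <pthread.h>
#include <stdlib.h>

struct event { struct event *next; /* ... handle, condition variable ... */ };

static pthread_mutex_t	*global_mutex;	/* NULL until the module is set up */
static struct event	*event_list;

struct event *event_create(void)
{
	struct event *e = malloc(sizeof *e);

	if (e == NULL) {
		return NULL;
	}

	/* During startup we may be called while creating the very mutex
	   that would protect this list; that window is single-threaded,
	   so skipping the lock is safe then. */
	if (global_mutex != NULL) {
		pthread_mutex_lock(global_mutex);
	}
	e->next = event_list;	/* put on the global list */
	event_list = e;
	if (global_mutex != NULL) {
		pthread_mutex_unlock(global_mutex);
	}
	return e;
}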
diff --git a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c
index 8530f117c9d..b41dcbe44cd 100644
--- a/innobase/srv/srv0start.c
+++ b/innobase/srv/srv0start.c
@@ -1142,10 +1142,7 @@ innobase_start_or_create_for_mysql(void)
 
 #if defined(__NETWARE__)
 
-/* Create less event semaphores because Win 98/ME had difficulty creating
-40000 event semaphores.
-Comment from Novell, Inc.: also, these just take a lot of memory on
-NetWare. */
+	/* Comment from Novell, Inc.: These take a lot of memory on NetWare.*/
 	srv_max_n_threads = 1000;
 #else
 	if (srv_pool_size >= 1000 * 1024) {
diff --git a/innobase/sync/sync0arr.c b/innobase/sync/sync0arr.c
index 198ef49ca9f..64f9310bad3 100644
--- a/innobase/sync/sync0arr.c
+++ b/innobase/sync/sync0arr.c
@@ -62,9 +62,6 @@ struct sync_cell_struct {
 	ibool		waiting;	/* TRUE if the thread has already
 					called sync_array_event_wait
 					on this cell */
-	ibool		event_set;	/* TRUE if the event is set */
-	os_event_t	event;		/* operating system event
-					semaphore handle */
 	time_t		reservation_time;/* time when the thread reserved
 					the wait cell */
 };
@@ -218,10 +215,7 @@ sync_array_create(
 	for (i = 0; i < n_cells; i++) {
 		cell = sync_array_get_nth_cell(arr, i);
 		cell->wait_object = NULL;
-
-		/* Create an operating system event semaphore with no name */
-		cell->event = os_event_create(NULL);
-		cell->event_set = FALSE; /* it is created in reset state */
+		cell->waiting = FALSE;
 	}
 
 	return(arr);
@@ -235,19 +229,12 @@ sync_array_free(
 /*============*/
 	sync_array_t*	arr)	/* in, own: sync wait array */
 {
-	ulint		i;
-	sync_cell_t*	cell;
 	ulint		protection;
 
 	ut_a(arr->n_reserved == 0);
 
 	sync_array_validate(arr);
 
-	for (i = 0; i < arr->n_cells; i++) {
-		cell = sync_array_get_nth_cell(arr, i);
-		os_event_free(cell->event);
-	}
-
 	protection = arr->protection;
 
 	/* Release the mutex protecting the wait array complex */
@@ -293,27 +280,19 @@ sync_array_validate(
 }
 
 /***********************************************************************
-Puts the cell event in set state. */
-static
-void
-sync_cell_event_set(
-/*================*/
-	sync_cell_t*	cell)	/* in: array cell */
-{
-	os_event_set(cell->event);
-	cell->event_set = TRUE;
-}
-
-/***********************************************************************
 Puts the cell event in reset state. */
 static
 void
 sync_cell_event_reset(
 /*==================*/
-	sync_cell_t*	cell)	/* in: array cell */
+	ulint		type,	/* in: lock type mutex/rw_lock */
+	void*		object)	/* in: the rw_lock/mutex object */
 {
-	os_event_reset(cell->event);
-	cell->event_set = FALSE;
+	if (type == SYNC_MUTEX) {
+		os_event_reset(((mutex_t *) object)->event);
+	} else {
+		os_event_reset(((rw_lock_t *) object)->event);
+	}
 }
 
 /**********************************************************************
@@ -346,14 +325,7 @@ sync_array_reserve_cell(
 
 		if (cell->wait_object == NULL) {
 
-			/* Make sure the event is reset */
-			if (cell->event_set) {
-				sync_cell_event_reset(cell);
-			}
-
-			cell->reservation_time = time(NULL);
-			cell->thread = os_thread_get_curr_id();
-
+			cell->waiting = FALSE;
 			cell->wait_object = object;
 
 			if (type == SYNC_MUTEX) {
@@ -363,7 +335,6 @@ sync_array_reserve_cell(
 			}
 
 			cell->request_type = type;
-			cell->waiting = FALSE;
 
 			cell->file = file;
 			cell->line = line;
@@ -373,6 +344,13 @@ sync_array_reserve_cell(
 			*index = i;
 
 			sync_array_exit(arr);
+
+			/* Make sure the event is reset */
+			sync_cell_event_reset(type, object);
+
+			cell->reservation_time = time(NULL);
+
+			cell->thread = os_thread_get_curr_id();
 
 			return;
 		}
@@ -408,7 +386,12 @@ sync_array_wait_event(
 	ut_a(!cell->waiting);
 	ut_ad(os_thread_get_curr_id() == cell->thread);
 
-	event = cell->event;
+	if (cell->request_type == SYNC_MUTEX) {
+		event = ((mutex_t*) cell->wait_object)->event;
+	} else {
+		event = ((rw_lock_t*) cell->wait_object)->event;
+	}
+
 	cell->waiting = TRUE;
 
 #ifdef UNIV_SYNC_DEBUG
@@ -510,10 +493,6 @@ sync_array_cell_print(
 	if (!cell->waiting) {
 		fputs("wait has ended\n", file);
 	}
-
-	if (cell->event_set) {
-		fputs("wait is ending\n", file);
-	}
 }
 
 #ifdef UNIV_SYNC_DEBUG
@@ -623,7 +602,7 @@ sync_array_detect_deadlock(
 
 	depth++;
 
-	if (cell->event_set || !cell->waiting) {
+	if (!cell->waiting) {
 
 		return(FALSE); /* No deadlock here */
 	}
@@ -802,6 +781,7 @@ sync_array_free_cell(
 
 	ut_a(cell->wait_object != NULL);
 
+	cell->waiting = FALSE;
 	cell->wait_object = NULL;
 
 	ut_a(arr->n_reserved > 0);
@@ -811,44 +791,17 @@ sync_array_free_cell(
 }
 
 /**************************************************************************
-Looks for the cells in the wait array which refer to the wait object
-specified, and sets their corresponding events to the signaled state. In this
-way releases the threads waiting for the object to contend for the object.
-It is possible that no such cell is found, in which case does nothing. */
+Increments the signalled count. */
*/ void -sync_array_signal_object( -/*=====================*/ - sync_array_t* arr, /* in: wait array */ - void* object) /* in: wait object */ +sync_array_object_signalled( +/*========================*/ + sync_array_t* arr) /* in: wait array */ { - sync_cell_t* cell; - ulint count; - ulint i; - sync_array_enter(arr); arr->sg_count++; - i = 0; - count = 0; - - while (count < arr->n_reserved) { - - cell = sync_array_get_nth_cell(arr, i); - - if (cell->wait_object != NULL) { - - count++; - if (cell->wait_object == object) { - - sync_cell_event_set(cell); - } - } - - i++; - } - sync_array_exit(arr); } @@ -881,7 +834,17 @@ sync_arr_wake_threads_if_sema_free(void) if (sync_arr_cell_can_wake_up(cell)) { - sync_cell_event_set(cell); + if (cell->request_type == SYNC_MUTEX) { + mutex_t* mutex; + + mutex = cell->wait_object; + os_event_set(mutex->event); + } else { + rw_lock_t* lock; + + lock = cell->wait_object; + os_event_set(lock->event); + } } } @@ -911,7 +874,7 @@ sync_array_print_long_waits(void) cell = sync_array_get_nth_cell(sync_primary_wait_array, i); - if (cell->wait_object != NULL + if (cell->wait_object != NULL && cell->waiting && difftime(time(NULL), cell->reservation_time) > 240) { fputs("InnoDB: Warning: a long semaphore wait:\n", stderr); @@ -919,7 +882,7 @@ sync_array_print_long_waits(void) noticed = TRUE; } - if (cell->wait_object != NULL + if (cell->wait_object != NULL && cell->waiting && difftime(time(NULL), cell->reservation_time) > fatal_timeout) { fatal = TRUE; diff --git a/innobase/sync/sync0rw.c b/innobase/sync/sync0rw.c index 973b46fdd50..050de73db9e 100644 --- a/innobase/sync/sync0rw.c +++ b/innobase/sync/sync0rw.c @@ -128,6 +128,7 @@ rw_lock_create_func( lock->last_x_file_name = "not yet reserved"; lock->last_s_line = 0; lock->last_x_line = 0; + lock->event = os_event_create(NULL); mutex_enter(&rw_lock_list_mutex); @@ -163,6 +164,7 @@ rw_lock_free( mutex_free(rw_lock_get_mutex(lock)); mutex_enter(&rw_lock_list_mutex); + os_event_free(lock->event); if (UT_LIST_GET_PREV(list, lock)) { ut_a(UT_LIST_GET_PREV(list, lock)->magic_n == RW_LOCK_MAGIC_N); diff --git a/innobase/sync/sync0sync.c b/innobase/sync/sync0sync.c index 43249f4b96f..95bf83dce79 100644 --- a/innobase/sync/sync0sync.c +++ b/innobase/sync/sync0sync.c @@ -212,6 +212,7 @@ mutex_create_func( os_fast_mutex_init(&(mutex->os_fast_mutex)); mutex->lock_word = 0; #endif + mutex->event = os_event_create(NULL); mutex_set_waiters(mutex, 0); mutex->magic_n = MUTEX_MAGIC_N; #ifdef UNIV_SYNC_DEBUG @@ -288,6 +289,8 @@ mutex_free( mutex_exit(&mutex_list_mutex); } + os_event_free(mutex->event); + #if !defined(_WIN32) || !defined(UNIV_CAN_USE_X86_ASSEMBLER) os_fast_mutex_free(&(mutex->os_fast_mutex)); #endif @@ -564,8 +567,8 @@ mutex_signal_object( /* The memory order of resetting the waiters field and signaling the object is important. See LEMMA 1 above. */ - - sync_array_signal_object(sync_primary_wait_array, mutex); + os_event_set(mutex->event); + sync_array_object_signalled(sync_primary_wait_array); } #ifdef UNIV_SYNC_DEBUG @@ -1114,6 +1117,7 @@ sync_thread_add_level( ut_a(sync_thread_levels_g(array, SYNC_PURGE_SYS)); } else if (level == SYNC_TREE_NODE) { ut_a(sync_thread_levels_contain(array, SYNC_INDEX_TREE) + || sync_thread_levels_contain(array, SYNC_DICT_OPERATION) || sync_thread_levels_g(array, SYNC_TREE_NODE - 1)); } else if (level == SYNC_TREE_NODE_FROM_HASH) { ut_a(1); |