diff options
author | Sergey Vojtovich <svoj@sun.com> | 2010-04-01 16:56:22 +0400 |
---|---|---|
committer | Sergey Vojtovich <svoj@sun.com> | 2010-04-01 16:56:22 +0400 |
commit | 3520a0bd7ce771cdc05cb0abf6457ad5fb8a194e (patch) | |
tree | 14eed13dad977fcfa6598a89777a3fd65da645f6 /storage/innodb_plugin/buf | |
parent | fdb9161579dc2ed30c535a15e30dc6f5b685e9ca (diff) | |
download | mariadb-git-3520a0bd7ce771cdc05cb0abf6457ad5fb8a194e.tar.gz |
Applying InnoDB snapshot
Detailed revision comments:
r6860 | jyang | 2010-03-23 18:20:36 +0200 (Tue, 23 Mar 2010) | 5 lines
branches/zip: This is a patch from Inaam that uses a red-black tree
to speed up insertions into the flush_list and thus the recovery
process. The patch has been tested by Nokia.
Diffstat (limited to 'storage/innodb_plugin/buf')
-rw-r--r-- | storage/innodb_plugin/buf/buf0buddy.c | 2 | ||||
-rw-r--r-- | storage/innodb_plugin/buf/buf0buf.c | 23 | ||||
-rw-r--r-- | storage/innodb_plugin/buf/buf0flu.c | 254 | ||||
-rw-r--r-- | storage/innodb_plugin/buf/buf0lru.c | 22 | ||||
-rw-r--r-- | storage/innodb_plugin/buf/buf0rea.c | 6 |
5 files changed, 259 insertions, 48 deletions
diff --git a/storage/innodb_plugin/buf/buf0buddy.c b/storage/innodb_plugin/buf/buf0buddy.c index f0e1395c307..b879e97a989 100644 --- a/storage/innodb_plugin/buf/buf0buddy.c +++ b/storage/innodb_plugin/buf/buf0buddy.c @@ -391,6 +391,8 @@ buf_buddy_relocate_block( UT_LIST_ADD_FIRST(list, buf_pool->zip_clean, dpage); } + UNIV_MEM_INVALID(bpage, sizeof *bpage); + mutex_exit(&buf_pool_zip_mutex); return(TRUE); } diff --git a/storage/innodb_plugin/buf/buf0buf.c b/storage/innodb_plugin/buf/buf0buf.c index 40f7925f520..d4a88565570 100644 --- a/storage/innodb_plugin/buf/buf0buf.c +++ b/storage/innodb_plugin/buf/buf0buf.c @@ -1191,8 +1191,6 @@ buf_relocate( HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage); HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, dpage); - - UNIV_MEM_INVALID(bpage, sizeof *bpage); } /********************************************************************//** @@ -2224,22 +2222,8 @@ wait_until_unfixed: ut_ad(!block->page.in_flush_list); } else { /* Relocate buf_pool->flush_list. 
*/ - buf_page_t* b; - - b = UT_LIST_GET_PREV(list, &block->page); - ut_ad(block->page.in_flush_list); - UT_LIST_REMOVE(list, buf_pool->flush_list, - &block->page); - - if (b) { - UT_LIST_INSERT_AFTER( - list, buf_pool->flush_list, b, - &block->page); - } else { - UT_LIST_ADD_FIRST( - list, buf_pool->flush_list, - &block->page); - } + buf_flush_relocate_on_flush_list(bpage, + &block->page); } /* Buffer-fix, I/O-fix, and X-latch the block @@ -2253,6 +2237,9 @@ wait_until_unfixed: block->page.buf_fix_count = 1; buf_block_set_io_fix(block, BUF_IO_READ); rw_lock_x_lock(&block->lock); + + UNIV_MEM_INVALID(bpage, sizeof *bpage); + mutex_exit(&block->mutex); mutex_exit(&buf_pool_zip_mutex); buf_pool->n_pend_unzip++; diff --git a/storage/innodb_plugin/buf/buf0flu.c b/storage/innodb_plugin/buf/buf0flu.c index 8b614ce90e5..86b9ced73fa 100644 --- a/storage/innodb_plugin/buf/buf0flu.c +++ b/storage/innodb_plugin/buf/buf0flu.c @@ -88,6 +88,138 @@ buf_flush_validate_low(void); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ /********************************************************************//** +Insert a block in the flush_rbt and returns a pointer to its +predecessor or NULL if no predecessor. The ordering is maintained +on the basis of the <oldest_modification, space, offset> key. +@return pointer to the predecessor or NULL if no predecessor. */ +static +buf_page_t* +buf_flush_insert_in_flush_rbt( +/*==========================*/ + buf_page_t* bpage) /*!< in: bpage to be inserted. */ +{ + buf_page_t* prev = NULL; + const ib_rbt_node_t* c_node; + const ib_rbt_node_t* p_node; + + ut_ad(buf_pool_mutex_own()); + + /* Insert this buffer into the rbt. */ + c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage); + ut_a(c_node != NULL); + + /* Get the predecessor. 
*/ + p_node = rbt_prev(buf_pool->flush_rbt, c_node); + + if (p_node != NULL) { + prev = *rbt_value(buf_page_t*, p_node); + ut_a(prev != NULL); + } + + return(prev); +} + +/********************************************************************//** +Delete a bpage from the flush_rbt. */ +static +void +buf_flush_delete_from_flush_rbt( +/*============================*/ + buf_page_t* bpage) /*!< in: bpage to be removed. */ +{ + + ibool ret = FALSE; + + ut_ad(buf_pool_mutex_own()); + ret = rbt_delete(buf_pool->flush_rbt, &bpage); + ut_ad(ret); +} + +/********************************************************************//** +Compare two modified blocks in the buffer pool. The key for comparison +is: +key = <oldest_modification, space, offset> +This comparison is used to maintian ordering of blocks in the +buf_pool->flush_rbt. +Note that for the purpose of flush_rbt, we only need to order blocks +on the oldest_modification. The other two fields are used to uniquely +identify the blocks. +@return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */ +static +int +buf_flush_block_cmp( +/*================*/ + const void* p1, /*!< in: block1 */ + const void* p2) /*!< in: block2 */ +{ + int ret; + + ut_ad(p1 != NULL); + ut_ad(p2 != NULL); + + const buf_page_t* b1 = *(const buf_page_t**) p1; + const buf_page_t* b2 = *(const buf_page_t**) p2; + + ut_ad(b1 != NULL); + ut_ad(b2 != NULL); + + ut_ad(b1->in_flush_list); + ut_ad(b2->in_flush_list); + + if (b2->oldest_modification + > b1->oldest_modification) { + return(1); + } + + if (b2->oldest_modification + < b1->oldest_modification) { + return(-1); + } + + /* If oldest_modification is same then decide on the space. */ + ret = (int)(b2->space - b1->space); + + /* Or else decide ordering on the offset field. */ + return(ret ? ret : (int)(b2->offset - b1->offset)); +} + +/********************************************************************//** +Initialize the red-black tree to speed up insertions into the flush_list +during recovery process. 
Should be called at the start of recovery +process before any page has been read/written. */ +UNIV_INTERN +void +buf_flush_init_flush_rbt(void) +/*==========================*/ +{ + buf_pool_mutex_enter(); + + /* Create red black tree for speedy insertions in flush list. */ + buf_pool->flush_rbt = rbt_create(sizeof(buf_page_t*), + buf_flush_block_cmp); + buf_pool_mutex_exit(); +} + +/********************************************************************//** +Frees up the red-black tree. */ +UNIV_INTERN +void +buf_flush_free_flush_rbt(void) +/*==========================*/ +{ + buf_pool_mutex_enter(); + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(buf_flush_validate_low()); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + + rbt_free(buf_pool->flush_rbt); + buf_pool->flush_rbt = NULL; + + buf_pool_mutex_exit(); +} + +/********************************************************************//** Inserts a modified block into the flush list. */ UNIV_INTERN void @@ -100,6 +232,13 @@ buf_flush_insert_into_flush_list( || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification <= block->page.oldest_modification)); + /* If we are in the recovery then we need to update the flush + red-black tree as well. */ + if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { + buf_flush_insert_sorted_into_flush_list(block); + return; + } + ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_ad(block->page.in_LRU_list); ut_ad(block->page.in_page_hash); @@ -136,12 +275,27 @@ buf_flush_insert_sorted_into_flush_list( ut_d(block->page.in_flush_list = TRUE); prev_b = NULL; - b = UT_LIST_GET_FIRST(buf_pool->flush_list); - while (b && b->oldest_modification > block->page.oldest_modification) { - ut_ad(b->in_flush_list); - prev_b = b; - b = UT_LIST_GET_NEXT(list, b); + /* For the most part when this function is called the flush_rbt + should not be NULL. 
In a very rare boundary case it is possible + that the flush_rbt has already been freed by the recovery thread + before the last page was hooked up in the flush_list by the + io-handler thread. In that case we'll just do a simple + linear search in the else block. */ + if (buf_pool->flush_rbt) { + + prev_b = buf_flush_insert_in_flush_rbt(&block->page); + + } else { + + b = UT_LIST_GET_FIRST(buf_pool->flush_list); + + while (b && b->oldest_modification + > block->page.oldest_modification) { + ut_ad(b->in_flush_list); + prev_b = b; + b = UT_LIST_GET_NEXT(list, b); + } } if (prev_b == NULL) { @@ -237,7 +391,6 @@ buf_flush_remove( ut_ad(buf_pool_mutex_own()); ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_ad(bpage->in_flush_list); - ut_d(bpage->in_flush_list = FALSE); switch (buf_page_get_state(bpage)) { case BUF_BLOCK_ZIP_PAGE: @@ -259,6 +412,15 @@ buf_flush_remove( break; } + /* If the flush_rbt is active then delete from it as well. */ + if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { + buf_flush_delete_from_flush_rbt(bpage); + } + + /* Must be done after we have removed it from the flush_rbt + because we assert on in_flush_list in comparison function. */ + ut_d(bpage->in_flush_list = FALSE); + bpage->oldest_modification = 0; ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list, @@ -266,6 +428,63 @@ buf_flush_remove( } /********************************************************************//** +Relocates a buffer control block on the flush_list. +Note that it is assumed that the contents of bpage has already been +copied to dpage. 
*/ +UNIV_INTERN +void +buf_flush_relocate_on_flush_list( +/*=============================*/ + buf_page_t* bpage, /*!< in/out: control block being moved */ + buf_page_t* dpage) /*!< in/out: destination block */ +{ + buf_page_t* prev; + buf_page_t* prev_b = NULL; + + ut_ad(buf_pool_mutex_own()); + + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + + ut_ad(bpage->in_flush_list); + ut_ad(dpage->in_flush_list); + + /* If recovery is active we must swap the control blocks in + the flush_rbt as well. */ + if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { + buf_flush_delete_from_flush_rbt(bpage); + prev_b = buf_flush_insert_in_flush_rbt(dpage); + } + + /* Must be done after we have removed it from the flush_rbt + because we assert on in_flush_list in comparison function. */ + ut_d(bpage->in_flush_list = FALSE); + + prev = UT_LIST_GET_PREV(list, bpage); + UT_LIST_REMOVE(list, buf_pool->flush_list, bpage); + + if (prev) { + ut_ad(prev->in_flush_list); + UT_LIST_INSERT_AFTER( + list, + buf_pool->flush_list, + prev, dpage); + } else { + UT_LIST_ADD_FIRST( + list, + buf_pool->flush_list, + dpage); + } + + /* Just an extra check. Previous in flush_list + should be the same control block as in flush_rbt. */ + ut_a(!buf_pool->flush_rbt || prev_b == prev); + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(buf_flush_validate_low()); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +} + +/********************************************************************//** Updates the flush system data structures when a write is completed. */ UNIV_INTERN void @@ -1367,24 +1586,45 @@ ibool buf_flush_validate_low(void) /*========================*/ { - buf_page_t* bpage; + buf_page_t* bpage; + const ib_rbt_node_t* rnode = NULL; UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list, ut_ad(ut_list_node_313->in_flush_list)); bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); + /* If we are in recovery mode i.e.: flush_rbt != NULL + then each block in the flush_list must also be present + in the flush_rbt. 
*/ + if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { + rnode = rbt_first(buf_pool->flush_rbt); + } + while (bpage != NULL) { const ib_uint64_t om = bpage->oldest_modification; ut_ad(bpage->in_flush_list); ut_a(buf_page_in_file(bpage)); ut_a(om > 0); + if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { + ut_a(rnode); + buf_page_t* rpage = *rbt_value(buf_page_t*, + rnode); + ut_a(rpage); + ut_a(rpage == bpage); + rnode = rbt_next(buf_pool->flush_rbt, rnode); + } + bpage = UT_LIST_GET_NEXT(list, bpage); ut_a(!bpage || om >= bpage->oldest_modification); } + /* By this time we must have exhausted the traversal of + flush_rbt (if active) as well. */ + ut_a(rnode == NULL); + return(TRUE); } diff --git a/storage/innodb_plugin/buf/buf0lru.c b/storage/innodb_plugin/buf/buf0lru.c index 09878f789d4..9cfa02ba3ac 100644 --- a/storage/innodb_plugin/buf/buf0lru.c +++ b/storage/innodb_plugin/buf/buf0lru.c @@ -1530,26 +1530,8 @@ alloc: if (b->state == BUF_BLOCK_ZIP_PAGE) { buf_LRU_insert_zip_clean(b); } else { - buf_page_t* prev; - - ut_ad(b->in_flush_list); - ut_d(bpage->in_flush_list = FALSE); - - prev = UT_LIST_GET_PREV(list, b); - UT_LIST_REMOVE(list, buf_pool->flush_list, b); - - if (prev) { - ut_ad(prev->in_flush_list); - UT_LIST_INSERT_AFTER( - list, - buf_pool->flush_list, - prev, b); - } else { - UT_LIST_ADD_FIRST( - list, - buf_pool->flush_list, - b); - } + /* Relocate on buf_pool->flush_list. 
*/ + buf_flush_relocate_on_flush_list(bpage, b); } bpage->zip.data = NULL; diff --git a/storage/innodb_plugin/buf/buf0rea.c b/storage/innodb_plugin/buf/buf0rea.c index dd98ea17eb5..a973b1b2d26 100644 --- a/storage/innodb_plugin/buf/buf0rea.c +++ b/storage/innodb_plugin/buf/buf0rea.c @@ -608,14 +608,14 @@ buf_read_recv_pages( while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) { os_aio_simulated_wake_handler_threads(); - os_thread_sleep(500000); + os_thread_sleep(10000); count++; - if (count > 100) { + if (count > 1000) { fprintf(stderr, "InnoDB: Error: InnoDB has waited for" - " 50 seconds for pending\n" + " 10 seconds for pending\n" "InnoDB: reads to the buffer pool to" " be finished.\n" "InnoDB: Number of pending reads %lu," |