diff options
author | Marko Mäkelä <marko.makela@mariadb.com> | 2020-10-15 10:27:25 +0300 |
---|---|---|
committer | Marko Mäkelä <marko.makela@mariadb.com> | 2020-10-15 10:33:23 +0300 |
commit | 46b1f500983d45e89dc84bb9820023bd51a4cda8 (patch) | |
tree | 7bc4337e341848169d4f4d28a8de0bbc1583ebb5 | |
parent | b535a7904425d730c4dff185f0f313ed99dfbb50 (diff) | |
download | mariadb-git-46b1f500983d45e89dc84bb9820023bd51a4cda8.tar.gz |
MDEV-23399: Remove buf_pool.flush_rbt
Normally, buf_pool.flush_list must be sorted by
buf_page_t::oldest_modification, so that log_checkpoint()
can choose MIN(oldest_modification) as the checkpoint LSN.
During recovery, buf_pool.flush_rbt used to guarantee the
ordering. However, we can allow the buf_pool.flush_list to
be in an arbitrary order during recovery, and simply ensure
that it is in the correct order by the time a log checkpoint
needs to be executed.
recv_sys_t::apply(): To keep it simple, we will always flush the
buffer pool at the end of each batch.
Note that log_checkpoint() will invoke recv_sys_t::apply() in case
a checkpoint is initiated during the last batch of recovery,
when we already allow writes to data pages and the redo log.
Reviewed by: Vladislav Vaintroub
-rw-r--r-- | storage/innobase/buf/buf0buf.cc | 7 |
-rw-r--r-- | storage/innobase/buf/buf0flu.cc | 207 |
-rw-r--r-- | storage/innobase/include/buf0buf.h | 17 |
-rw-r--r-- | storage/innobase/include/buf0flu.h | 14 |
-rw-r--r-- | storage/innobase/log/log0recv.cc | 8 |
5 files changed, 8 insertions, 245 deletions
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 8c5eba321a4..078361092fc 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -1607,12 +1607,6 @@ void buf_pool_t::close() mutex_free(&mutex); mutex_free(&flush_list_mutex); - if (flush_rbt) - { - rbt_free(flush_rbt); - flush_rbt= nullptr; - } - for (buf_page_t *bpage= UT_LIST_GET_LAST(LRU), *prev_bpage= nullptr; bpage; bpage= prev_bpage) { @@ -2113,7 +2107,6 @@ inline void buf_pool_t::resize() ut_ad(curr_size == old_size); ut_ad(n_chunks_new == n_chunks); ut_ad(UT_LIST_GET_LEN(withdraw) == 0); - ut_ad(flush_rbt == NULL); n_chunks_new = (new_instance_size << srv_page_size_shift) / srv_buf_pool_chunk_unit; diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index d0fbd59f968..10ed54be452 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -236,134 +236,6 @@ static void buf_flush_validate_skip() } #endif /* UNIV_DEBUG */ -/******************************************************************//** -Insert a block in the flush_rbt and returns a pointer to its -predecessor or NULL if no predecessor. The ordering is maintained -on the basis of the <oldest_modification, space, offset> key. -@return pointer to the predecessor or NULL if no predecessor. */ -static -buf_page_t* -buf_flush_insert_in_flush_rbt( -/*==========================*/ - buf_page_t* bpage) /*!< in: bpage to be inserted. */ -{ - const ib_rbt_node_t* c_node; - const ib_rbt_node_t* p_node; - buf_page_t* prev = NULL; - - ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE); - ut_ad(mutex_own(&buf_pool.flush_list_mutex)); - - /* Insert this buffer into the rbt. */ - c_node = rbt_insert(buf_pool.flush_rbt, &bpage, &bpage); - ut_a(c_node != NULL); - - /* Get the predecessor. 
*/ - p_node = rbt_prev(buf_pool.flush_rbt, c_node); - - if (p_node != NULL) { - buf_page_t** value; - value = rbt_value(buf_page_t*, p_node); - prev = *value; - ut_a(prev != NULL); - } - - return(prev); -} - -/*********************************************************//** -Delete a bpage from the flush_rbt. */ -static -void -buf_flush_delete_from_flush_rbt( -/*============================*/ - buf_page_t* bpage) /*!< in: bpage to be removed. */ -{ - ut_ad(mutex_own(&buf_pool.flush_list_mutex)); - -#ifdef UNIV_DEBUG - ibool ret = -#endif /* UNIV_DEBUG */ - rbt_delete(buf_pool.flush_rbt, &bpage); - - ut_ad(ret); -} - -/*****************************************************************//** -Compare two modified blocks in the buffer pool. The key for comparison -is: -key = <oldest_modification, space, offset> -This comparison is used to maintian ordering of blocks in the -buf_pool.flush_rbt. -Note that for the purpose of flush_rbt, we only need to order blocks -on the oldest_modification. The other two fields are used to uniquely -identify the blocks. -@return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */ -static -int -buf_flush_block_cmp( -/*================*/ - const void* p1, /*!< in: block1 */ - const void* p2) /*!< in: block2 */ -{ - const buf_page_t* b1 = *static_cast<const buf_page_t*const*>(p1); - const buf_page_t* b2 = *static_cast<const buf_page_t*const*>(p2); - - ut_ad(b1 != NULL); - ut_ad(b2 != NULL); - - ut_ad(mutex_own(&buf_pool.flush_list_mutex)); - - const lsn_t m1 = b1->oldest_modification(), - m2 = b2->oldest_modification(); - - ut_ad(m1); - ut_ad(m2); - - if (m2 > m1) { - return(1); - } else if (m2 < m1) { - return(-1); - } - - if (b2->id() > b1->id()) { - return 1; - } - if (b2->id() < b1->id()) { - return -1; - } - return 0; -} - -/********************************************************************//** -Initialize the red-black tree to speed up insertions into the flush_list -during recovery process. 
Should be called at the start of recovery -process before any page has been read/written. */ -void -buf_flush_init_flush_rbt(void) -/*==========================*/ -{ - mutex_enter(&buf_pool.flush_list_mutex); - ut_ad(buf_pool.flush_rbt == NULL); - /* Create red black tree for speedy insertions in flush list. */ - buf_pool.flush_rbt = rbt_create( - sizeof(buf_page_t*), buf_flush_block_cmp); - mutex_exit(&buf_pool.flush_list_mutex); -} - -/********************************************************************//** -Frees up the red-black tree. */ -void -buf_flush_free_flush_rbt(void) -/*==========================*/ -{ - mutex_enter(&buf_pool.flush_list_mutex); - ut_d(buf_flush_validate_low()); - rbt_free(buf_pool.flush_rbt); - buf_pool.flush_rbt = NULL; - mutex_exit(&buf_pool.flush_list_mutex); -} - /** Insert a modified block into the flush list. @param[in,out] block modified block @param[in] lsn oldest modification */ @@ -380,32 +252,7 @@ void buf_flush_insert_into_flush_list(buf_block_t* block, lsn_t lsn) block->physical_size()); incr_flush_list_size_in_bytes(block); - if (UNIV_LIKELY_NULL(buf_pool.flush_rbt)) { - ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE); - /* The field in_LRU_list is protected by buf_pool.mutex, which - we are not holding. However, while a block is in the flush - list, it is dirty and cannot be discarded, not from the - page_hash or from the LRU list. At most, the uncompressed - page frame of a compressed block may be discarded or created - (copying the block->page to or from a buf_page_t that is - dynamically allocated from buf_buddy_alloc()). Because those - transitions hold buf_pool.flush_list_mutex (via - buf_flush_relocate_on_flush_list()), there is no possibility - of a race condition in the assertions below. */ - ut_ad(block->page.in_LRU_list); - /* buf_buddy_block_register() will take a block in the - BUF_BLOCK_MEMORY state, not a file page. 
*/ - ut_ad(!block->page.in_zip_hash); - - if (buf_page_t* prev_b = - buf_flush_insert_in_flush_rbt(&block->page)) { - UT_LIST_INSERT_AFTER(buf_pool.flush_list, prev_b, &block->page); - goto func_exit; - } - } - UT_LIST_ADD_FIRST(buf_pool.flush_list, &block->page); -func_exit: ut_d(buf_flush_validate_skip()); mutex_exit(&buf_pool.flush_list_mutex); } @@ -430,14 +277,6 @@ void buf_flush_remove(buf_page_t* bpage) the bpage from flush list. */ buf_pool.flush_hp.adjust(bpage); UT_LIST_REMOVE(buf_pool.flush_list, bpage); - - /* If the flush_rbt is active then delete from there as well. */ - if (UNIV_LIKELY_NULL(buf_pool.flush_rbt)) { - buf_flush_delete_from_flush_rbt(bpage); - } - - /* Must be done after we have removed it from the flush_rbt - because we assert on it in buf_flush_block_cmp(). */ bpage->clear_oldest_modification(); buf_pool.stat.flush_list_bytes -= bpage->physical_size(); @@ -467,7 +306,6 @@ buf_flush_relocate_on_flush_list( buf_page_t* dpage) /*!< in/out: destination block */ { buf_page_t* prev; - buf_page_t* prev_b = NULL; ut_ad(mutex_own(&buf_pool.mutex)); mutex_enter(&buf_pool.flush_list_mutex); @@ -481,19 +319,10 @@ buf_flush_relocate_on_flush_list( having the buf_pool mutex. */ ut_ad(dpage->oldest_modification()); - /* If recovery is active we must swap the control blocks in - the flush_rbt as well. */ - if (UNIV_LIKELY_NULL(buf_pool.flush_rbt)) { - buf_flush_delete_from_flush_rbt(bpage); - prev_b = buf_flush_insert_in_flush_rbt(dpage); - } - /* Important that we adjust the hazard pointer before removing the bpage from the flush list. */ buf_pool.flush_hp.adjust(bpage); - /* Must be done after we have removed it from the flush_rbt - because we assert on it in buf_flush_block_cmp(). */ bpage->clear_oldest_modification(); prev = UT_LIST_GET_PREV(list, bpage); @@ -506,9 +335,6 @@ buf_flush_relocate_on_flush_list( UT_LIST_ADD_FIRST(buf_pool.flush_list, dpage); } - /* Just an extra check. 
Previous in flush_list - should be the same control block as in flush_rbt. */ - ut_a(!buf_pool.flush_rbt || prev_b == prev); ut_d(buf_flush_validate_low()); mutex_exit(&buf_pool.flush_list_mutex); } @@ -2889,7 +2715,6 @@ struct Check { static void buf_flush_validate_low() { buf_page_t* bpage; - const ib_rbt_node_t* rnode = NULL; ut_ad(mutex_own(&buf_pool.flush_list_mutex)); @@ -2897,13 +2722,6 @@ static void buf_flush_validate_low() bpage = UT_LIST_GET_FIRST(buf_pool.flush_list); - /* If we are in recovery mode i.e.: flush_rbt != NULL - then each block in the flush_list must also be present - in the flush_rbt. */ - if (UNIV_LIKELY_NULL(buf_pool.flush_rbt)) { - rnode = rbt_first(buf_pool.flush_rbt); - } - while (bpage != NULL) { const lsn_t om = bpage->oldest_modification(); /* A page in buf_pool.flush_list can be in @@ -2912,29 +2730,14 @@ static void buf_flush_validate_low() original descriptor can have this state and still be in the flush list waiting to acquire the buf_pool.flush_list_mutex to complete the relocation. */ - ut_a(bpage->in_file() - || bpage->state() == BUF_BLOCK_REMOVE_HASH); - ut_a(om > 0); - - if (UNIV_LIKELY_NULL(buf_pool.flush_rbt)) { - buf_page_t** prpage; - - ut_a(rnode != NULL); - prpage = rbt_value(buf_page_t*, rnode); - - ut_a(*prpage != NULL); - ut_a(*prpage == bpage); - rnode = rbt_next(buf_pool.flush_rbt, rnode); - } + ut_ad(bpage->in_file() + || bpage->state() == BUF_BLOCK_REMOVE_HASH); + ut_ad(om > 0); bpage = UT_LIST_GET_NEXT(list, bpage); - - ut_a(!bpage || om >= bpage->oldest_modification()); + ut_ad(!bpage || recv_recovery_is_on() + || om >= bpage->oldest_modification()); } - - /* By this time we must have exhausted the traversal of - flush_rbt (if active) as well. */ - ut_a(rnode == NULL); } /** Validate the flush list. 
*/ diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 3eb1773a5b6..9b58fa76c01 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -39,7 +39,6 @@ Created 11/5/1995 Heikki Tuuri #include "hash0hash.h" #include "ut0byte.h" #include "page0types.h" -#include "ut0rbt.h" #include "log0log.h" #include "srv0srv.h" #include <ostream> @@ -1910,7 +1909,7 @@ public: FlushListMutex flush_list_mutex;/*!< mutex protecting the flush list access. This mutex - protects flush_list, flush_rbt + protects flush_list and bpage::list pointers when the bpage is on flush_list. It also protects writes to @@ -1934,20 +1933,6 @@ public: of the given type running; os_event_set() and os_event_reset() are protected by buf_pool_t::mutex */ - ib_rbt_t* flush_rbt; /*!< a red-black tree is used - exclusively during recovery to - speed up insertions in the - flush_list. This tree contains - blocks in order of - oldest_modification LSN and is - kept in sync with the - flush_list. - Each member of the tree MUST - also be on the flush_list. - This tree is relevant only in - recovery and is set to NULL - once the recovery is over. - Protected by flush_list_mutex */ unsigned freed_page_clock;/*!< a sequence number used to count the number of buffer blocks removed from the end of diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h index f7f89f1a9e9..17568d0e2b1 100644 --- a/storage/innobase/include/buf0flu.h +++ b/storage/innobase/include/buf0flu.h @@ -164,20 +164,6 @@ void buf_flush_wait_LRU_batch_end(); void buf_flush_validate(); #endif /* UNIV_DEBUG */ -/********************************************************************//** -Initialize the red-black tree to speed up insertions into the flush_list -during recovery process. Should be called at the start of recovery -process before any page has been read/written. 
*/ -void -buf_flush_init_flush_rbt(void); -/*==========================*/ - -/********************************************************************//** -Frees up the red-black tree. */ -void -buf_flush_free_flush_rbt(void); -/*==========================*/ - /** Write a flushable page from buf_pool to a file. buf_pool.mutex must be held. @param bpage buffer control block diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index 6f3f0ffd68a..aff1d011a8d 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -2684,6 +2684,8 @@ void recv_sys_t::apply(bool last_batch) ut_ad(!log_mutex_own()); mutex_exit(&mutex); + /* Instead of flushing, last_batch could sort the buf_pool.flush_list + in ascending order of buf_page_t::oldest_modification. */ buf_flush_wait_LRU_batch_end(); buf_flush_sync(); @@ -3271,10 +3273,6 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0); ut_d(mutex_exit(&buf_pool.flush_list_mutex)); - /* Initialize red-black tree for fast insertions into the - flush_list during recovery process. */ - buf_flush_init_flush_rbt(); - if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) { ib::info() << "innodb_force_recovery=6 skips redo log apply"; @@ -3567,8 +3565,6 @@ void recv_recovery_from_checkpoint_finish() recv_sys.debug_free(); - /* Free up the flush_rbt. */ - buf_flush_free_flush_rbt(); /* Enable innodb_sync_debug checks */ ut_d(sync_check_enable()); } |