summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Marko Mäkelä <marko.makela@mariadb.com> 2020-10-15 10:27:25 +0300
committer Marko Mäkelä <marko.makela@mariadb.com> 2020-10-15 10:33:23 +0300
commit 46b1f500983d45e89dc84bb9820023bd51a4cda8 (patch)
tree 7bc4337e341848169d4f4d28a8de0bbc1583ebb5
parent b535a7904425d730c4dff185f0f313ed99dfbb50 (diff)
download mariadb-git-46b1f500983d45e89dc84bb9820023bd51a4cda8.tar.gz
MDEV-23399: Remove buf_pool.flush_rbt

Normally, buf_pool.flush_list must be sorted by buf_page_t::oldest_modification, so that log_checkpoint() can choose MIN(oldest_modification) as the checkpoint LSN.

During recovery, buf_pool.flush_rbt used to guarantee the ordering. However, we can allow the buf_pool.flush_list to be in an arbitrary order during recovery, and simply ensure that it is in the correct order by the time a log checkpoint needs to be executed.

recv_sys_t::apply(): To keep it simple, we will always flush the buffer pool at the end of each batch.

Note that log_checkpoint() will invoke recv_sys_t::apply() in case a checkpoint is initiated during the last batch of recovery, when we already allow writes to data pages and the redo log.

Reviewed by: Vladislav Vaintroub

-rw-r--r-- storage/innobase/buf/buf0buf.cc 7
-rw-r--r-- storage/innobase/buf/buf0flu.cc 207
-rw-r--r-- storage/innobase/include/buf0buf.h 17
-rw-r--r-- storage/innobase/include/buf0flu.h 14
-rw-r--r-- storage/innobase/log/log0recv.cc 8
5 files changed, 8 insertions, 245 deletions
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
index 8c5eba321a4..078361092fc 100644
--- a/storage/innobase/buf/buf0buf.cc
+++ b/storage/innobase/buf/buf0buf.cc
@@ -1607,12 +1607,6 @@ void buf_pool_t::close()
mutex_free(&mutex);
mutex_free(&flush_list_mutex);
- if (flush_rbt)
- {
- rbt_free(flush_rbt);
- flush_rbt= nullptr;
- }
-
for (buf_page_t *bpage= UT_LIST_GET_LAST(LRU), *prev_bpage= nullptr; bpage;
bpage= prev_bpage)
{
@@ -2113,7 +2107,6 @@ inline void buf_pool_t::resize()
ut_ad(curr_size == old_size);
ut_ad(n_chunks_new == n_chunks);
ut_ad(UT_LIST_GET_LEN(withdraw) == 0);
- ut_ad(flush_rbt == NULL);
n_chunks_new = (new_instance_size << srv_page_size_shift)
/ srv_buf_pool_chunk_unit;
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
index d0fbd59f968..10ed54be452 100644
--- a/storage/innobase/buf/buf0flu.cc
+++ b/storage/innobase/buf/buf0flu.cc
@@ -236,134 +236,6 @@ static void buf_flush_validate_skip()
}
#endif /* UNIV_DEBUG */
-/******************************************************************//**
-Insert a block in the flush_rbt and returns a pointer to its
-predecessor or NULL if no predecessor. The ordering is maintained
-on the basis of the <oldest_modification, space, offset> key.
-@return pointer to the predecessor or NULL if no predecessor. */
-static
-buf_page_t*
-buf_flush_insert_in_flush_rbt(
-/*==========================*/
- buf_page_t* bpage) /*!< in: bpage to be inserted. */
-{
- const ib_rbt_node_t* c_node;
- const ib_rbt_node_t* p_node;
- buf_page_t* prev = NULL;
-
- ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE);
- ut_ad(mutex_own(&buf_pool.flush_list_mutex));
-
- /* Insert this buffer into the rbt. */
- c_node = rbt_insert(buf_pool.flush_rbt, &bpage, &bpage);
- ut_a(c_node != NULL);
-
- /* Get the predecessor. */
- p_node = rbt_prev(buf_pool.flush_rbt, c_node);
-
- if (p_node != NULL) {
- buf_page_t** value;
- value = rbt_value(buf_page_t*, p_node);
- prev = *value;
- ut_a(prev != NULL);
- }
-
- return(prev);
-}
-
-/*********************************************************//**
-Delete a bpage from the flush_rbt. */
-static
-void
-buf_flush_delete_from_flush_rbt(
-/*============================*/
- buf_page_t* bpage) /*!< in: bpage to be removed. */
-{
- ut_ad(mutex_own(&buf_pool.flush_list_mutex));
-
-#ifdef UNIV_DEBUG
- ibool ret =
-#endif /* UNIV_DEBUG */
- rbt_delete(buf_pool.flush_rbt, &bpage);
-
- ut_ad(ret);
-}
-
-/*****************************************************************//**
-Compare two modified blocks in the buffer pool. The key for comparison
-is:
-key = <oldest_modification, space, offset>
-This comparison is used to maintian ordering of blocks in the
-buf_pool.flush_rbt.
-Note that for the purpose of flush_rbt, we only need to order blocks
-on the oldest_modification. The other two fields are used to uniquely
-identify the blocks.
-@return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
-static
-int
-buf_flush_block_cmp(
-/*================*/
- const void* p1, /*!< in: block1 */
- const void* p2) /*!< in: block2 */
-{
- const buf_page_t* b1 = *static_cast<const buf_page_t*const*>(p1);
- const buf_page_t* b2 = *static_cast<const buf_page_t*const*>(p2);
-
- ut_ad(b1 != NULL);
- ut_ad(b2 != NULL);
-
- ut_ad(mutex_own(&buf_pool.flush_list_mutex));
-
- const lsn_t m1 = b1->oldest_modification(),
- m2 = b2->oldest_modification();
-
- ut_ad(m1);
- ut_ad(m2);
-
- if (m2 > m1) {
- return(1);
- } else if (m2 < m1) {
- return(-1);
- }
-
- if (b2->id() > b1->id()) {
- return 1;
- }
- if (b2->id() < b1->id()) {
- return -1;
- }
- return 0;
-}
-
-/********************************************************************//**
-Initialize the red-black tree to speed up insertions into the flush_list
-during recovery process. Should be called at the start of recovery
-process before any page has been read/written. */
-void
-buf_flush_init_flush_rbt(void)
-/*==========================*/
-{
- mutex_enter(&buf_pool.flush_list_mutex);
- ut_ad(buf_pool.flush_rbt == NULL);
- /* Create red black tree for speedy insertions in flush list. */
- buf_pool.flush_rbt = rbt_create(
- sizeof(buf_page_t*), buf_flush_block_cmp);
- mutex_exit(&buf_pool.flush_list_mutex);
-}
-
-/********************************************************************//**
-Frees up the red-black tree. */
-void
-buf_flush_free_flush_rbt(void)
-/*==========================*/
-{
- mutex_enter(&buf_pool.flush_list_mutex);
- ut_d(buf_flush_validate_low());
- rbt_free(buf_pool.flush_rbt);
- buf_pool.flush_rbt = NULL;
- mutex_exit(&buf_pool.flush_list_mutex);
-}
-
/** Insert a modified block into the flush list.
@param[in,out] block modified block
@param[in] lsn oldest modification */
@@ -380,32 +252,7 @@ void buf_flush_insert_into_flush_list(buf_block_t* block, lsn_t lsn)
block->physical_size());
incr_flush_list_size_in_bytes(block);
- if (UNIV_LIKELY_NULL(buf_pool.flush_rbt)) {
- ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE);
- /* The field in_LRU_list is protected by buf_pool.mutex, which
- we are not holding. However, while a block is in the flush
- list, it is dirty and cannot be discarded, not from the
- page_hash or from the LRU list. At most, the uncompressed
- page frame of a compressed block may be discarded or created
- (copying the block->page to or from a buf_page_t that is
- dynamically allocated from buf_buddy_alloc()). Because those
- transitions hold buf_pool.flush_list_mutex (via
- buf_flush_relocate_on_flush_list()), there is no possibility
- of a race condition in the assertions below. */
- ut_ad(block->page.in_LRU_list);
- /* buf_buddy_block_register() will take a block in the
- BUF_BLOCK_MEMORY state, not a file page. */
- ut_ad(!block->page.in_zip_hash);
-
- if (buf_page_t* prev_b =
- buf_flush_insert_in_flush_rbt(&block->page)) {
- UT_LIST_INSERT_AFTER(buf_pool.flush_list, prev_b, &block->page);
- goto func_exit;
- }
- }
-
UT_LIST_ADD_FIRST(buf_pool.flush_list, &block->page);
-func_exit:
ut_d(buf_flush_validate_skip());
mutex_exit(&buf_pool.flush_list_mutex);
}
@@ -430,14 +277,6 @@ void buf_flush_remove(buf_page_t* bpage)
the bpage from flush list. */
buf_pool.flush_hp.adjust(bpage);
UT_LIST_REMOVE(buf_pool.flush_list, bpage);
-
- /* If the flush_rbt is active then delete from there as well. */
- if (UNIV_LIKELY_NULL(buf_pool.flush_rbt)) {
- buf_flush_delete_from_flush_rbt(bpage);
- }
-
- /* Must be done after we have removed it from the flush_rbt
- because we assert on it in buf_flush_block_cmp(). */
bpage->clear_oldest_modification();
buf_pool.stat.flush_list_bytes -= bpage->physical_size();
@@ -467,7 +306,6 @@ buf_flush_relocate_on_flush_list(
buf_page_t* dpage) /*!< in/out: destination block */
{
buf_page_t* prev;
- buf_page_t* prev_b = NULL;
ut_ad(mutex_own(&buf_pool.mutex));
mutex_enter(&buf_pool.flush_list_mutex);
@@ -481,19 +319,10 @@ buf_flush_relocate_on_flush_list(
having the buf_pool mutex. */
ut_ad(dpage->oldest_modification());
- /* If recovery is active we must swap the control blocks in
- the flush_rbt as well. */
- if (UNIV_LIKELY_NULL(buf_pool.flush_rbt)) {
- buf_flush_delete_from_flush_rbt(bpage);
- prev_b = buf_flush_insert_in_flush_rbt(dpage);
- }
-
/* Important that we adjust the hazard pointer before removing
the bpage from the flush list. */
buf_pool.flush_hp.adjust(bpage);
- /* Must be done after we have removed it from the flush_rbt
- because we assert on it in buf_flush_block_cmp(). */
bpage->clear_oldest_modification();
prev = UT_LIST_GET_PREV(list, bpage);
@@ -506,9 +335,6 @@ buf_flush_relocate_on_flush_list(
UT_LIST_ADD_FIRST(buf_pool.flush_list, dpage);
}
- /* Just an extra check. Previous in flush_list
- should be the same control block as in flush_rbt. */
- ut_a(!buf_pool.flush_rbt || prev_b == prev);
ut_d(buf_flush_validate_low());
mutex_exit(&buf_pool.flush_list_mutex);
}
@@ -2889,7 +2715,6 @@ struct Check {
static void buf_flush_validate_low()
{
buf_page_t* bpage;
- const ib_rbt_node_t* rnode = NULL;
ut_ad(mutex_own(&buf_pool.flush_list_mutex));
@@ -2897,13 +2722,6 @@ static void buf_flush_validate_low()
bpage = UT_LIST_GET_FIRST(buf_pool.flush_list);
- /* If we are in recovery mode i.e.: flush_rbt != NULL
- then each block in the flush_list must also be present
- in the flush_rbt. */
- if (UNIV_LIKELY_NULL(buf_pool.flush_rbt)) {
- rnode = rbt_first(buf_pool.flush_rbt);
- }
-
while (bpage != NULL) {
const lsn_t om = bpage->oldest_modification();
/* A page in buf_pool.flush_list can be in
@@ -2912,29 +2730,14 @@ static void buf_flush_validate_low()
original descriptor can have this state and still be
in the flush list waiting to acquire the
buf_pool.flush_list_mutex to complete the relocation. */
- ut_a(bpage->in_file()
- || bpage->state() == BUF_BLOCK_REMOVE_HASH);
- ut_a(om > 0);
-
- if (UNIV_LIKELY_NULL(buf_pool.flush_rbt)) {
- buf_page_t** prpage;
-
- ut_a(rnode != NULL);
- prpage = rbt_value(buf_page_t*, rnode);
-
- ut_a(*prpage != NULL);
- ut_a(*prpage == bpage);
- rnode = rbt_next(buf_pool.flush_rbt, rnode);
- }
+ ut_ad(bpage->in_file()
+ || bpage->state() == BUF_BLOCK_REMOVE_HASH);
+ ut_ad(om > 0);
bpage = UT_LIST_GET_NEXT(list, bpage);
-
- ut_a(!bpage || om >= bpage->oldest_modification());
+ ut_ad(!bpage || recv_recovery_is_on()
+ || om >= bpage->oldest_modification());
}
-
- /* By this time we must have exhausted the traversal of
- flush_rbt (if active) as well. */
- ut_a(rnode == NULL);
}
/** Validate the flush list. */
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
index 3eb1773a5b6..9b58fa76c01 100644
--- a/storage/innobase/include/buf0buf.h
+++ b/storage/innobase/include/buf0buf.h
@@ -39,7 +39,6 @@ Created 11/5/1995 Heikki Tuuri
#include "hash0hash.h"
#include "ut0byte.h"
#include "page0types.h"
-#include "ut0rbt.h"
#include "log0log.h"
#include "srv0srv.h"
#include <ostream>
@@ -1910,7 +1909,7 @@ public:
FlushListMutex flush_list_mutex;/*!< mutex protecting the
flush list access. This mutex
- protects flush_list, flush_rbt
+ protects flush_list
and bpage::list pointers when
the bpage is on flush_list. It
also protects writes to
@@ -1934,20 +1933,6 @@ public:
of the given type running;
os_event_set() and os_event_reset()
are protected by buf_pool_t::mutex */
- ib_rbt_t* flush_rbt; /*!< a red-black tree is used
- exclusively during recovery to
- speed up insertions in the
- flush_list. This tree contains
- blocks in order of
- oldest_modification LSN and is
- kept in sync with the
- flush_list.
- Each member of the tree MUST
- also be on the flush_list.
- This tree is relevant only in
- recovery and is set to NULL
- once the recovery is over.
- Protected by flush_list_mutex */
unsigned freed_page_clock;/*!< a sequence number used
to count the number of buffer
blocks removed from the end of
diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h
index f7f89f1a9e9..17568d0e2b1 100644
--- a/storage/innobase/include/buf0flu.h
+++ b/storage/innobase/include/buf0flu.h
@@ -164,20 +164,6 @@ void buf_flush_wait_LRU_batch_end();
void buf_flush_validate();
#endif /* UNIV_DEBUG */
-/********************************************************************//**
-Initialize the red-black tree to speed up insertions into the flush_list
-during recovery process. Should be called at the start of recovery
-process before any page has been read/written. */
-void
-buf_flush_init_flush_rbt(void);
-/*==========================*/
-
-/********************************************************************//**
-Frees up the red-black tree. */
-void
-buf_flush_free_flush_rbt(void);
-/*==========================*/
-
/** Write a flushable page from buf_pool to a file.
buf_pool.mutex must be held.
@param bpage buffer control block
diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc
index 6f3f0ffd68a..aff1d011a8d 100644
--- a/storage/innobase/log/log0recv.cc
+++ b/storage/innobase/log/log0recv.cc
@@ -2684,6 +2684,8 @@ void recv_sys_t::apply(bool last_batch)
ut_ad(!log_mutex_own());
mutex_exit(&mutex);
+ /* Instead of flushing, last_batch could sort the buf_pool.flush_list
+ in ascending order of buf_page_t::oldest_modification. */
buf_flush_wait_LRU_batch_end();
buf_flush_sync();
@@ -3271,10 +3273,6 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0);
ut_d(mutex_exit(&buf_pool.flush_list_mutex));
- /* Initialize red-black tree for fast insertions into the
- flush_list during recovery process. */
- buf_flush_init_flush_rbt();
-
if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) {
ib::info() << "innodb_force_recovery=6 skips redo log apply";
@@ -3567,8 +3565,6 @@ void recv_recovery_from_checkpoint_finish()
recv_sys.debug_free();
- /* Free up the flush_rbt. */
- buf_flush_free_flush_rbt();
/* Enable innodb_sync_debug checks */
ut_d(sync_check_enable());
}