summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author    Marko Mäkelä <marko.makela@mariadb.com>  2021-07-22 08:34:49 +0300
committer Marko Mäkelä <marko.makela@mariadb.com>  2021-07-22 10:05:13 +0300
commit   bf435a3f4daa90dd6b4b94ed62f05a8e30fecc3d (patch)
tree     4c6733ac04154261a99534fb19e10e908cbeb02c
parent   316a8cebf5c5833ca1dde5ff6e81d9c27c3af0ed (diff)
download mariadb-git-bf435a3f4daa90dd6b4b94ed62f05a8e30fecc3d.tar.gz
MDEV-26200 buf_pool.flush_list corrupted by buffer pool resizing or ROW_FORMAT=COMPRESSED
The lazy deletion of clean blocks from buf_pool.flush_list that was introduced in commit 6441bc614a99f5cd6357c8a23b9f583c56d0a90c (MDEV-25113) introduced a race condition around the function buf_flush_relocate_on_flush_list(). The test innodb_zip.wl5522_debug_zip as well as the buffer pool resizing tests would occasionally fail in debug builds due to buf_pool.flush_list.count disagreeing with the actual length of the doubly-linked list. The safe procedure for relocating a block in buf_pool.flush_list should be as follows, now that we implement lazy deletion from buf_pool.flush_list: 1. Acquire buf_pool.mutex. 2. Acquire the exclusive buf_pool.page_hash.latch. 3. Acquire buf_pool.flush_list_mutex. 4. Copy the block descriptor. 5. Invoke buf_flush_relocate_on_flush_list(). 6. Release buf_pool.flush_list_mutex. buf_flush_relocate_on_flush_list(): Assert that buf_pool.flush_list_mutex is being held. Invoke buf_page_t::oldest_modification() only once, using std::memory_order_relaxed, now that the mutex protects us. buf_LRU_free_page(), buf_LRU_block_remove_hashed(): Avoid an unlock-lock cycle on hash_lock. (We must not acquire hash_lock while already holding buf_pool.flush_list_mutex, because that could lead to a deadlock due to latching order violation.)
-rw-r--r--  storage/innobase/buf/buf0buf.cc |  6
-rw-r--r--  storage/innobase/buf/buf0flu.cc | 31
-rw-r--r--  storage/innobase/buf/buf0lru.cc | 14
3 files changed, 25 insertions(+), 26 deletions(-)
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
index dcf95fdd5f2..aca0d6b5ae8 100644
--- a/storage/innobase/buf/buf0buf.cc
+++ b/storage/innobase/buf/buf0buf.cc
@@ -1566,6 +1566,7 @@ inline bool buf_pool_t::realloc(buf_block_t *block)
if (block->page.can_relocate()) {
memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(
new_block->frame, block->frame, srv_page_size);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
new (&new_block->page) buf_page_t(block->page);
/* relocate LRU list */
@@ -1625,6 +1626,7 @@ inline bool buf_pool_t::realloc(buf_block_t *block)
buf_flush_relocate_on_flush_list(&block->page,
&new_block->page);
}
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
block->page.set_corrupt_id();
/* set other flags of buf_block_t */
@@ -3131,12 +3133,14 @@ evict_from_pool:
/* Note: this is the uncompressed block and it is not
accessible by other threads yet because it is not in
any list or hash table */
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
buf_relocate(bpage, &block->page);
/* Set after buf_relocate(). */
block->page.set_buf_fix_count(1);
buf_flush_relocate_on_flush_list(bpage, &block->page);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
/* Buffer-fix, I/O-fix, and X-latch the block
for the duration of the decompression.
@@ -3646,8 +3650,10 @@ loop:
}
rw_lock_x_lock(&free_block->lock);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
buf_relocate(&block->page, &free_block->page);
buf_flush_relocate_on_flush_list(&block->page, &free_block->page);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
free_block->page.set_state(BUF_BLOCK_FILE_PAGE);
buf_unzip_LRU_add_block(free_block, FALSE);
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
index 1c425f308ef..10a84d99a2e 100644
--- a/storage/innobase/buf/buf0flu.cc
+++ b/storage/innobase/buf/buf0flu.cc
@@ -286,43 +286,29 @@ buf_flush_relocate_on_flush_list(
{
buf_page_t* prev;
- mysql_mutex_assert_owner(&buf_pool.mutex);
+ mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
ut_ad(!fsp_is_system_temporary(bpage->id().space()));
- const lsn_t lsn = bpage->oldest_modification_acquire();
+ const lsn_t lsn = bpage->oldest_modification();
if (!lsn) {
return;
}
ut_ad(lsn == 1 || lsn > 2);
-
- mysql_mutex_lock(&buf_pool.flush_list_mutex);
-
- /* FIXME: Can we avoid holding buf_pool.mutex here? */
ut_ad(dpage->oldest_modification() == lsn);
- if (ut_d(const lsn_t o_lsn =) bpage->oldest_modification()) {
- ut_ad(o_lsn == lsn);
+ /* Important that we adjust the hazard pointer before removing
+ the bpage from the flush list. */
+ buf_pool.flush_hp.adjust(bpage);
- /* Important that we adjust the hazard pointer before removing
- the bpage from the flush list. */
- buf_pool.flush_hp.adjust(bpage);
+ prev = UT_LIST_GET_PREV(list, bpage);
+ UT_LIST_REMOVE(buf_pool.flush_list, bpage);
- prev = UT_LIST_GET_PREV(list, bpage);
- UT_LIST_REMOVE(buf_pool.flush_list, bpage);
-
- bpage->clear_oldest_modification();
- } else {
- /* bpage was removed from buf_pool.flush_list
- since we last checked, and before we acquired
- buf_pool.flush_list_mutex. */
- goto was_clean;
- }
+ bpage->clear_oldest_modification();
if (lsn == 1) {
buf_pool.stat.flush_list_bytes -= dpage->physical_size();
-was_clean:
dpage->list.prev = nullptr;
dpage->list.next = nullptr;
dpage->clear_oldest_modification();
@@ -334,7 +320,6 @@ was_clean:
}
ut_d(buf_flush_validate_low());
- mysql_mutex_unlock(&buf_pool.flush_list_mutex);
}
/** Complete write of a file page from buf_pool.
diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc
index cb42d4e2ae3..b282eb17dae 100644
--- a/storage/innobase/buf/buf0lru.cc
+++ b/storage/innobase/buf/buf0lru.cc
@@ -845,6 +845,7 @@ func_exit:
} else if (bpage->state() == BUF_BLOCK_FILE_PAGE) {
b = buf_page_alloc_descriptor();
ut_a(b);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
new (b) buf_page_t(*bpage);
b->set_state(BUF_BLOCK_ZIP_PAGE);
}
@@ -859,6 +860,8 @@ func_exit:
ut_ad(bpage->can_relocate());
if (!buf_LRU_block_remove_hashed(bpage, id, hash_lock, zip)) {
+ ut_ad(!b);
+ mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
return(true);
}
@@ -872,8 +875,6 @@ func_exit:
if (UNIV_LIKELY_NULL(b)) {
buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b);
- hash_lock->write_lock();
-
ut_ad(!buf_pool.page_hash_get_low(id, fold));
ut_ad(b->zip_size());
@@ -940,6 +941,7 @@ func_exit:
}
buf_flush_relocate_on_flush_list(bpage, b);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
bpage->zip.data = nullptr;
@@ -950,6 +952,8 @@ func_exit:
hash_lock. */
b->set_io_fix(BUF_IO_PIN);
hash_lock->write_unlock();
+ } else if (!zip) {
+ hash_lock->write_unlock();
}
buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
@@ -1182,6 +1186,10 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
MEM_UNDEFINED(((buf_block_t*) bpage)->frame, srv_page_size);
bpage->set_state(BUF_BLOCK_REMOVE_HASH);
+ if (!zip) {
+ return true;
+ }
+
/* Question: If we release hash_lock here
then what protects us against:
1) Some other thread buffer fixing this page
@@ -1203,7 +1211,7 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
page_hash. */
hash_lock->write_unlock();
- if (zip && bpage->zip.data) {
+ if (bpage->zip.data) {
/* Free the compressed page. */
void* data = bpage->zip.data;
bpage->zip.data = NULL;