author     Marko Mäkelä <marko.makela@mariadb.com>  2020-11-02 12:49:19 +0200
committer  Marko Mäkelä <marko.makela@mariadb.com>  2020-11-02 12:49:19 +0200
commit     09a1f0075a8d5752dd7b2940a20d86a040af1741 (patch)
tree       42c96cf95d5df2950b77329c76c0024f33088aff /storage/innobase/buf
parent     e6f95b23f425001a14a528256354e0faf4e272f6 (diff)
parent     440d4b282dd4992d64abdd6289859598db7e5f75 (diff)
download   mariadb-git-09a1f0075a8d5752dd7b2940a20d86a040af1741.tar.gz
Merge 10.5 into 10.6
Diffstat (limited to 'storage/innobase/buf')
-rw-r--r--  storage/innobase/buf/buf0block_hint.cc  |   59
-rw-r--r--  storage/innobase/buf/buf0buddy.cc       |   32
-rw-r--r--  storage/innobase/buf/buf0buf.cc         |  494
-rw-r--r--  storage/innobase/buf/buf0dblwr.cc       | 1579
-rw-r--r--  storage/innobase/buf/buf0dump.cc        |   57
-rw-r--r--  storage/innobase/buf/buf0flu.cc         | 2766
-rw-r--r--  storage/innobase/buf/buf0lru.cc         |  624
-rw-r--r--  storage/innobase/buf/buf0rea.cc         |  190
8 files changed, 2152 insertions, 3649 deletions
diff --git a/storage/innobase/buf/buf0block_hint.cc b/storage/innobase/buf/buf0block_hint.cc
new file mode 100644
index 00000000000..6d99d0b61f0
--- /dev/null
+++ b/storage/innobase/buf/buf0block_hint.cc
@@ -0,0 +1,59 @@
+/*****************************************************************************
+
+Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License, version 2.0, as published by the
+Free Software Foundation.
+
+This program is also distributed with certain software (including but not
+limited to OpenSSL) that is licensed under separate terms, as designated in a
+particular file or component or in included license documentation. The authors
+of MySQL hereby grant you an additional permission to link the program and
+your derivative works with the separately licensed software that they have
+included with MySQL.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
+for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+#include "buf0block_hint.h"
+namespace buf {
+
+void Block_hint::buffer_fix_block_if_still_valid()
+{
+ /* To check if m_block belongs to the current buf_pool, we must
+ prevent freeing memory while we check, and until we buffer-fix the
+ block. For this purpose it is enough to latch any of the many
+ latches taken by buf_pool_t::resize().
+
+ Similar to buf_page_optimistic_get(), we must validate
+ m_block->page.id() after acquiring the hash_lock, because the object
+ may have been freed and not actually attached to buf_pool.page_hash
+ at the moment. (The block could have been reused to store a
+ different page, and that slice of buf_pool.page_hash could be protected
+ by another hash_lock that we are not holding.)
+
+ Finally, assuming that we have the correct hash bucket latched, we must
+ validate m_block->state() to ensure that the block is not being freed. */
+ if (m_block)
+ {
+ const ulint fold= m_page_id.fold();
+ page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
+ if (buf_pool.is_uncompressed(m_block) && m_page_id == m_block->page.id() &&
+ m_block->page.state() == BUF_BLOCK_FILE_PAGE)
+ buf_block_buf_fix_inc(m_block, __FILE__, __LINE__);
+ else
+ clear();
+ hash_lock->read_unlock();
+ }
+}
+} // namespace buf
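
The comment above describes a three-step revalidation: latch the hash bucket, re-check the page identity, re-check the state, and only then buffer-fix. A compact standalone sketch of the same optimistic-revalidation pattern; all names below are illustrative, not InnoDB's:

    #include <mutex>

    struct Page { unsigned id; bool live; int pins; };

    /* Trust a cached pointer only after re-checking, under the bucket
       latch, that it still denotes the same live page; cf.
       page_hash_latch::lock<false>(), m_block->page.id(), state() and
       buf_block_buf_fix_inc() above. */
    Page *refix_if_still_valid(Page *cached, unsigned expected_id,
                               std::mutex &bucket)
    {
      if (!cached)
        return nullptr;
      std::lock_guard<std::mutex> g(bucket);
      if (cached->id != expected_id || !cached->live)
        return nullptr;               /* cf. clear() */
      ++cached->pins;                 /* pin before the latch is released */
      return cached;
    }
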
diff --git a/storage/innobase/buf/buf0buddy.cc b/storage/innobase/buf/buf0buddy.cc
index 8280377b42a..f822adc3389 100644
--- a/storage/innobase/buf/buf0buddy.cc
+++ b/storage/innobase/buf/buf0buddy.cc
@@ -192,7 +192,7 @@ static bool buf_buddy_check_free(const buf_buddy_free_t* buf, ulint i)
{
const ulint size = BUF_BUDDY_LOW << i;
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(!ut_align_offset(buf, size));
ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
@@ -261,7 +261,7 @@ UNIV_INLINE
void
buf_buddy_add_to_free(buf_buddy_free_t* buf, ulint i)
{
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(buf_pool.zip_free[i].start != buf);
buf_buddy_stamp_free(buf, i);
@@ -276,7 +276,7 @@ UNIV_INLINE
void
buf_buddy_remove_from_free(buf_buddy_free_t* buf, ulint i)
{
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(buf_buddy_check_free(buf, i));
UT_LIST_REMOVE(buf_pool.zip_free[i], buf);
@@ -290,7 +290,7 @@ static buf_buddy_free_t* buf_buddy_alloc_zip(ulint i)
{
buf_buddy_free_t* buf;
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
ut_a(i < BUF_BUDDY_SIZES);
ut_a(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
@@ -319,7 +319,7 @@ static buf_buddy_free_t* buf_buddy_alloc_zip(ulint i)
if (buf) {
buf_buddy_free_t* buddy =
reinterpret_cast<buf_buddy_free_t*>(
- buf->stamp.bytes
+ reinterpret_cast<byte*>(buf)
+ (BUF_BUDDY_LOW << i));
ut_ad(!buf_pool.contains_zip(buddy));
buf_buddy_add_to_free(buddy, i);
@@ -350,7 +350,7 @@ buf_buddy_block_free(void* buf)
buf_page_t* bpage;
buf_block_t* block;
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
ut_a(!ut_align_offset(buf, srv_page_size));
HASH_SEARCH(hash, &buf_pool.zip_hash, fold, buf_page_t*, bpage,
@@ -433,7 +433,7 @@ byte *buf_buddy_alloc_low(ulint i, bool *lru)
{
buf_block_t* block;
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
if (i < BUF_BUDDY_SIZES) {
@@ -480,19 +480,17 @@ static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force)
{
buf_page_t* bpage;
const ulint size = BUF_BUDDY_LOW << i;
- ulint space;
- ulint offset;
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(!ut_align_offset(src, size));
ut_ad(!ut_align_offset(dst, size));
ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
MEM_CHECK_ADDRESSABLE(dst, size);
- space = mach_read_from_4((const byte*) src
- + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
- offset = mach_read_from_4((const byte*) src
- + FIL_PAGE_OFFSET);
+ uint32_t space = mach_read_from_4(static_cast<const byte*>(src)
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ uint32_t offset = mach_read_from_4(static_cast<const byte*>(src)
+ + FIL_PAGE_OFFSET);
/* Suppress Valgrind or MSAN warnings. */
MEM_MAKE_DEFINED(&space, sizeof space);
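
The hunk above also narrows space and offset from ulint to uint32_t: both header fields occupy exactly four bytes on the page. mach_read_from_4() decodes them as big-endian; a self-contained sketch of the equivalent decoding (read_be32 is an illustrative name, not InnoDB's):

    #include <cstdint>

    /* Big-endian 32-bit decode, equivalent to InnoDB's mach_read_from_4();
       page header fields such as FIL_PAGE_OFFSET are stored this way. */
    static inline uint32_t read_be32(const unsigned char *b)
    {
      return uint32_t{b[0]} << 24 | uint32_t{b[1]} << 16 |
             uint32_t{b[2]} << 8 | uint32_t{b[3]};
    }
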
@@ -584,7 +582,7 @@ void buf_buddy_free_low(void* buf, ulint i)
{
buf_buddy_free_t* buddy;
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(i <= BUF_BUDDY_SIZES);
ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
ut_ad(buf_pool.buddy_stat[i].used > 0);
@@ -670,7 +668,7 @@ buf_buddy_realloc(void* buf, ulint size)
buf_block_t* block = NULL;
ulint i = buf_buddy_get_slot(size);
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(i <= BUF_BUDDY_SIZES);
ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
@@ -711,7 +709,7 @@ buf_buddy_realloc(void* buf, ulint size)
/** Combine all pairs of free buddies. */
void buf_buddy_condense_free()
{
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(buf_pool.curr_size < buf_pool.old_size);
for (ulint i = 0; i < UT_ARR_SIZE(buf_pool.zip_free); ++i) {
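
Every hunk in this file makes the same substitution: InnoDB's ut_ad(mutex_own(...)) debug assertion becomes the server-wide mysql_mutex_assert_owner(), matching the switch of buf_pool.mutex to mysql_mutex_t elsewhere in this merge. A minimal sketch of that wrapper's lifecycle, assuming the standard mysql/psi wrapper API; the PSI key 0 and demo() are placeholders:

    #include <mysql/psi/mysql_thread.h>

    static mysql_mutex_t demo_mutex;

    void demo()
    {
      /* 0: no PERFORMANCE_SCHEMA instrumentation key registered */
      mysql_mutex_init(0, &demo_mutex, MY_MUTEX_INIT_FAST);
      mysql_mutex_lock(&demo_mutex);
      /* Expands to nothing unless the server is built with SAFE_MUTEX;
         replaces ut_ad(mutex_own(...)) in the hunks above. */
      mysql_mutex_assert_owner(&demo_mutex);
      mysql_mutex_unlock(&demo_mutex);
      mysql_mutex_destroy(&demo_mutex);
    }
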
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
index ec4b7395ca5..9a180614afd 100644
--- a/storage/innobase/buf/buf0buf.cc
+++ b/storage/innobase/buf/buf0buf.cc
@@ -212,12 +212,12 @@ but not written to disk yet. The block with the oldest modification
which has not yet been written to disk is at the end of the chain.
The access to this list is protected by buf_pool.flush_list_mutex.
-The chain of unmodified compressed blocks (buf_pool.zip_clean)
-contains the control blocks (buf_page_t) of those compressed pages
+The control blocks for uncompressed pages are accessible via
+buf_block_t objects that are reachable via buf_pool.chunks[].
+The control blocks (buf_page_t) of those ROW_FORMAT=COMPRESSED pages
that are not in buf_pool.flush_list and for which no uncompressed
-page has been allocated in the buffer pool. The control blocks for
-uncompressed pages are accessible via buf_block_t objects that are
-reachable via buf_pool.chunks[].
+page has been allocated in buf_pool are only accessible via
+buf_pool.LRU.
The chains of free memory blocks (buf_pool.zip_free[]) are used by
the buddy allocator (buf0buddy.cc) to keep track of currently unused
@@ -415,7 +415,7 @@ static bool buf_tmp_page_decrypt(byte* tmp_frame, byte* src_frame)
static bool buf_page_decrypt_after_read(buf_page_t *bpage,
const fil_node_t &node)
{
- ut_ad(node.space->pending_io());
+ ut_ad(node.space->referenced());
ut_ad(node.space->id == bpage->id().space());
const auto flags = node.space->flags;
@@ -475,7 +475,7 @@ decompress_with_slot:
slot->release();
ut_ad(!write_size
|| fil_page_type_validate(node.space, dst_frame));
- ut_ad(node.space->pending_io());
+ ut_ad(node.space->referenced());
return write_size != 0;
}
@@ -516,34 +516,9 @@ decrypt_failed:
goto decompress;
}
- ut_ad(node.space->pending_io());
+ ut_ad(node.space->referenced());
return true;
}
-
-/**
-@return the smallest oldest_modification lsn for any page.
-@retval 0 if all modified persistent pages have been flushed */
-lsn_t buf_pool_t::get_oldest_modification()
-{
- mutex_enter(&flush_list_mutex);
-
- /* FIXME: Keep temporary tablespace pages in a separate flush
- list. We would only need to write out temporary pages if the
- page is about to be evicted from the buffer pool, and the page
- contents is still needed (the page has not been freed). */
- const buf_page_t *bpage;
- for (bpage= UT_LIST_GET_LAST(flush_list);
- bpage && fsp_is_system_temporary(bpage->id().space());
- bpage= UT_LIST_GET_PREV(list, bpage))
- ut_ad(bpage->oldest_modification());
-
- lsn_t oldest_lsn= bpage ? bpage->oldest_modification() : 0;
- mutex_exit(&flush_list_mutex);
-
- /* The result may become stale as soon as we released the mutex.
- On log checkpoint, also log_sys.flush_order_mutex will be needed. */
- return oldest_lsn;
-}
#endif /* !UNIV_INNOCHECKSUM */
/** Checks if the page is in crc32 checksum format.
@@ -719,9 +694,9 @@ static void buf_page_check_lsn(bool check_lsn, const byte* read_buf)
phase it makes no sense to spam the log with error messages. */
if (current_lsn < page_lsn) {
- const ulint space_id = mach_read_from_4(
+ const uint32_t space_id = mach_read_from_4(
read_buf + FIL_PAGE_SPACE_ID);
- const ulint page_no = mach_read_from_4(
+ const uint32_t page_no = mach_read_from_4(
read_buf + FIL_PAGE_OFFSET);
ib::error() << "Page " << page_id_t(space_id, page_no)
@@ -1063,14 +1038,14 @@ buf_madvise_do_dump()
ret+= madvise(recv_sys.buf, recv_sys.len, MADV_DODUMP);
}
- mutex_enter(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
auto chunk = buf_pool.chunks;
for (ulint n = buf_pool.n_chunks; n--; chunk++) {
ret+= madvise(chunk->mem, chunk->mem_size(), MADV_DODUMP);
}
- mutex_exit(&buf_pool.mutex);
+	mysql_mutex_unlock(&buf_pool.mutex);
return ret;
}
#endif
@@ -1504,8 +1479,6 @@ bool buf_pool_t::create()
NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE;
ut_ad(!resizing);
- ut_ad(!withdrawing);
- ut_ad(!withdraw_clock());
ut_ad(!chunks_old);
chunk_t::map_reg= UT_NEW_NOKEY(chunk_t::map());
@@ -1546,7 +1519,7 @@ bool buf_pool_t::create()
while (++chunk < chunks + n_chunks);
ut_ad(is_initialised());
- mutex_create(LATCH_ID_BUF_POOL, &mutex);
+ mysql_mutex_init(buf_pool_mutex_key, &mutex, MY_MUTEX_INIT_FAST);
UT_LIST_INIT(LRU, &buf_page_t::LRU);
UT_LIST_INIT(withdraw, &buf_page_t::list);
@@ -1554,8 +1527,6 @@ bool buf_pool_t::create()
UT_LIST_INIT(flush_list, &buf_page_t::list);
UT_LIST_INIT(unzip_LRU, &buf_block_t::unzip_LRU);
- ut_d(UT_LIST_INIT(zip_clean, &buf_page_t::list));
-
for (size_t i= 0; i < UT_ARR_SIZE(zip_free); ++i)
UT_LIST_INIT(zip_free[i], &buf_buddy_free_t::list);
ulint s= curr_size;
@@ -1572,17 +1543,18 @@ bool buf_pool_t::create()
zip_hash.create(2 * curr_size);
last_printout_time= time(NULL);
- mutex_create(LATCH_ID_FLUSH_LIST, &flush_list_mutex);
+ mysql_mutex_init(flush_list_mutex_key, &flush_list_mutex,
+ MY_MUTEX_INIT_FAST);
- for (int i= 0; i < 3; i++)
- no_flush[i]= os_event_create(0);
+ mysql_cond_init(0, &done_flush_LRU, nullptr);
+ mysql_cond_init(0, &done_flush_list, nullptr);
+ mysql_cond_init(0, &do_flush_list, nullptr);
try_LRU_scan= true;
ut_d(flush_hp.m_mutex= &flush_list_mutex;);
ut_d(lru_hp.m_mutex= &mutex);
ut_d(lru_scan_itr.m_mutex= &mutex);
- ut_d(single_scan_itr.m_mutex= &mutex);
io_buf.create((srv_n_read_io_threads + srv_n_write_io_threads) *
OS_AIO_N_PENDING_IOS_PER_THREAD);
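
The three condition variables replace the os_event array no_flush[]. A sketch of the wait/broadcast protocol they support, using the same mysql_cond_t API as the hunk above (initialization mirrors mysql_cond_init(0, &cond, nullptr)); the counter and function names are illustrative:

    #include <mysql/psi/mysql_thread.h>

    static mysql_mutex_t mtx;
    static mysql_cond_t done_flush;
    static size_t pending_flushes;

    void wait_until_flushes_done()
    {
      mysql_mutex_lock(&mtx);
      while (pending_flushes)               /* wakeups may be spurious */
        mysql_cond_wait(&done_flush, &mtx); /* atomically unlocks, sleeps */
      mysql_mutex_unlock(&mtx);
    }

    void report_flush_done()
    {
      mysql_mutex_lock(&mtx);
      if (!--pending_flushes)
        mysql_cond_broadcast(&done_flush);  /* wake every waiter */
      mysql_mutex_unlock(&mtx);
    }
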
@@ -1606,14 +1578,8 @@ void buf_pool_t::close()
if (!is_initialised())
return;
- mutex_free(&mutex);
- mutex_free(&flush_list_mutex);
-
- if (flush_rbt)
- {
- rbt_free(flush_rbt);
- flush_rbt= nullptr;
- }
+ mysql_mutex_destroy(&mutex);
+ mysql_mutex_destroy(&flush_list_mutex);
for (buf_page_t *bpage= UT_LIST_GET_LAST(LRU), *prev_bpage= nullptr; bpage;
bpage= prev_bpage)
@@ -1641,8 +1607,9 @@ void buf_pool_t::close()
allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx);
}
- for (int i= 0; i < 3; ++i)
- os_event_destroy(no_flush[i]);
+ mysql_cond_destroy(&done_flush_LRU);
+ mysql_cond_destroy(&done_flush_list);
+ mysql_cond_destroy(&do_flush_list);
ut_free(chunks);
chunks= nullptr;
@@ -1668,8 +1635,7 @@ inline bool buf_pool_t::realloc(buf_block_t *block)
{
buf_block_t* new_block;
- ut_ad(withdrawing);
- ut_ad(mutex_own(&mutex));
+ mysql_mutex_assert_owner(&mutex);
ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
new_block = buf_LRU_get_free_only();
@@ -1740,13 +1706,8 @@ inline bool buf_pool_t::realloc(buf_block_t *block)
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
MEM_UNDEFINED(block->frame, srv_page_size);
block->page.set_state(BUF_BLOCK_REMOVE_HASH);
-
- /* Relocate flush_list. */
- if (block->page.oldest_modification()) {
- buf_flush_relocate_on_flush_list(
- &block->page, &new_block->page);
- }
-
+ buf_flush_relocate_on_flush_list(&block->page,
+ &new_block->page);
block->page.set_corrupt_id();
/* set other flags of buf_block_t */
@@ -1812,16 +1773,16 @@ inline bool buf_pool_t::withdraw_blocks()
<< withdraw_target << " blocks";
/* Minimize zip_free[i] lists */
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
buf_buddy_condense_free();
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
while (UT_LIST_GET_LEN(withdraw) < withdraw_target) {
/* try to withdraw from free_list */
ulint count1 = 0;
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
block = reinterpret_cast<buf_block_t*>(
UT_LIST_GET_FIRST(free));
while (block != NULL
@@ -1836,7 +1797,7 @@ inline bool buf_pool_t::withdraw_blocks()
UT_LIST_GET_NEXT(
list, &block->page));
- if (buf_pool.will_be_withdrawn(block->page)) {
+ if (will_be_withdrawn(block->page)) {
/* This should be withdrawn */
UT_LIST_REMOVE(free, &block->page);
UT_LIST_ADD_LAST(withdraw, &block->page);
@@ -1846,40 +1807,29 @@ inline bool buf_pool_t::withdraw_blocks()
block = next_block;
}
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
/* reserve free_list length */
if (UT_LIST_GET_LEN(withdraw) < withdraw_target) {
- ulint scan_depth;
- flush_counters_t n;
-
- /* cap scan_depth with current LRU size. */
- mutex_enter(&mutex);
- scan_depth = UT_LIST_GET_LEN(LRU);
- mutex_exit(&mutex);
-
- scan_depth = ut_min(
- ut_max(withdraw_target
- - UT_LIST_GET_LEN(withdraw),
- static_cast<ulint>(srv_LRU_scan_depth)),
- scan_depth);
+ ulint n_flushed = buf_flush_lists(
+ std::max<ulint>(withdraw_target
+ - UT_LIST_GET_LEN(withdraw),
+ srv_LRU_scan_depth), 0);
+ buf_flush_wait_batch_end_acquiring_mutex(true);
- buf_flush_do_batch(true, scan_depth, 0, &n);
- buf_flush_wait_batch_end(true);
-
- if (n.flushed) {
+ if (n_flushed) {
MONITOR_INC_VALUE_CUMULATIVE(
MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
MONITOR_LRU_BATCH_FLUSH_COUNT,
MONITOR_LRU_BATCH_FLUSH_PAGES,
- n.flushed);
+ n_flushed);
}
}
/* relocate blocks/buddies in withdrawn area */
ulint count2 = 0;
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
buf_page_t* bpage;
bpage = UT_LIST_GET_FIRST(LRU);
while (bpage != NULL) {
@@ -1900,7 +1850,7 @@ inline bool buf_pool_t::withdraw_blocks()
}
if (bpage->state() == BUF_BLOCK_FILE_PAGE
- && buf_pool.will_be_withdrawn(*bpage)) {
+ && will_be_withdrawn(*bpage)) {
if (bpage->can_relocate()) {
buf_pool_mutex_exit_forbid();
if (!realloc(
@@ -1919,7 +1869,7 @@ inline bool buf_pool_t::withdraw_blocks()
bpage = next_bpage;
}
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
buf_resize_status(
"withdrawing blocks. (" ULINTPF "/" ULINTPF ")",
@@ -1956,9 +1906,6 @@ inline bool buf_pool_t::withdraw_blocks()
ib::info() << "withdrawn target: " << UT_LIST_GET_LEN(withdraw)
<< " blocks";
- /* retry is not needed */
- ++withdraw_clock_;
-
return(false);
}
@@ -2039,7 +1986,7 @@ inline void buf_pool_t::page_hash_table::write_unlock_all()
inline void buf_pool_t::write_lock_all_page_hash()
{
- ut_ad(mutex_own(&mutex));
+ mysql_mutex_assert_owner(&mutex);
page_hash.write_lock_all();
for (page_hash_table *old_page_hash= freed_page_hash; old_page_hash;
old_page_hash= static_cast<page_hash_table*>
@@ -2111,16 +2058,15 @@ inline void buf_pool_t::resize()
srv_buf_pool_old_size, srv_buf_pool_size,
srv_buf_pool_chunk_unit);
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
ut_ad(curr_size == old_size);
ut_ad(n_chunks_new == n_chunks);
ut_ad(UT_LIST_GET_LEN(withdraw) == 0);
- ut_ad(flush_rbt == NULL);
n_chunks_new = (new_instance_size << srv_page_size_shift)
/ srv_buf_pool_chunk_unit;
curr_size = n_chunks_new * chunks->size;
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
#ifdef BTR_CUR_HASH_ADAPT
/* disable AHI if needed */
@@ -2153,7 +2099,6 @@ inline void buf_pool_t::resize()
ut_ad(withdraw_target == 0);
withdraw_target = w;
- withdrawing.store(true, std::memory_order_relaxed);
}
buf_resize_status("Withdrawing blocks to be shrunken.");
@@ -2169,7 +2114,6 @@ withdraw_retry:
if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
/* abort to resize for shutdown. */
- withdrawing.store(false, std::memory_order_relaxed);
return;
}
@@ -2210,8 +2154,6 @@ withdraw_retry:
goto withdraw_retry;
}
- withdrawing.store(false, std::memory_order_relaxed);
-
buf_resize_status("Latching whole of buffer pool.");
#ifndef DBUG_OFF
@@ -2234,7 +2176,7 @@ withdraw_retry:
/* Indicate critical path */
resizing.store(true, std::memory_order_relaxed);
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
write_lock_all_page_hash();
chunk_t::map_reg = UT_NEW_NOKEY(chunk_t::map());
@@ -2398,7 +2340,7 @@ calc_buf_pool_size:
ib::info() << "hash tables were resized";
}
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
write_unlock_all_page_hash();
UT_DELETE(chunk_map_old);
@@ -2463,10 +2405,10 @@ static void buf_resize_callback(void *)
{
DBUG_ENTER("buf_resize_callback");
ut_a(srv_shutdown_state == SRV_SHUTDOWN_NONE);
- mutex_enter(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
const auto size= srv_buf_pool_size;
const bool work= srv_buf_pool_old_size != size;
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
if (work)
buf_pool.resize();
@@ -2504,7 +2446,7 @@ static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage)
{
const ulint fold= bpage->id().fold();
ut_ad(bpage->state() == BUF_BLOCK_ZIP_PAGE);
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(buf_pool.hash_lock_get(bpage->id())->is_write_locked());
ut_a(bpage->io_fix() == BUF_IO_NONE);
ut_a(!bpage->buf_fix_count());
@@ -2577,7 +2519,7 @@ retry:
(*hash_lock)->write_unlock();
/* Allocate a watch[] and then try to insert it into the page_hash. */
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
/* The maximum number of purge tasks should never exceed
the UT_ARR_SIZE(watch) - 1, and there is no way for a purge task to hold a
@@ -2601,17 +2543,17 @@ retry:
*hash_lock= page_hash.lock_get(fold);
(*hash_lock)->write_lock();
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
buf_page_t *bpage= page_hash_get_low(id, fold);
if (UNIV_LIKELY_NULL(bpage))
{
(*hash_lock)->write_unlock();
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
w->set_state(BUF_BLOCK_NOT_USED);
*hash_lock= page_hash.lock_get(fold);
(*hash_lock)->write_lock();
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
goto retry;
}
@@ -2624,7 +2566,7 @@ retry:
}
ut_error;
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
return nullptr;
}
@@ -2742,10 +2684,10 @@ err_exit:
{
discard_attempted= true;
hash_lock->read_unlock();
- mutex_enter(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
if (buf_page_t *bpage= buf_pool.page_hash_get_low(page_id, fold))
buf_LRU_free_page(bpage, false);
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
goto lookup;
}
@@ -2816,7 +2758,7 @@ buf_zip_decompress(
ulint size = page_zip_get_size(&block->page.zip);
/* The tablespace will not be found if this function is called
during IMPORT. */
- fil_space_t* space= fil_space_acquire_for_io(block->page.id().space());
+ fil_space_t* space= fil_space_t::get(block->page.id().space());
const unsigned key_version = mach_read_from_4(
frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
fil_space_crypt_t* crypt_data = space ? space->crypt_data : NULL;
@@ -2853,7 +2795,7 @@ buf_zip_decompress(
if (page_zip_decompress(&block->page.zip,
block->frame, TRUE)) {
if (space) {
- space->release_for_io();
+ space->release();
}
return(TRUE);
}
@@ -2872,7 +2814,7 @@ buf_zip_decompress(
/* Copy to uncompressed storage. */
memcpy(block->frame, frame, block->zip_size());
if (space) {
- space->release_for_io();
+ space->release();
}
return(TRUE);
@@ -2896,7 +2838,7 @@ err_exit:
dict_set_corrupted_by_space(space);
}
- space->release_for_io();
+ space->release();
}
return(FALSE);
@@ -3075,16 +3017,16 @@ buf_page_get_low(
break;
default:
ut_error;
+ case BUF_GET_POSSIBLY_FREED:
+ break;
case BUF_GET_NO_LATCH:
ut_ad(rw_latch == RW_NO_LATCH);
/* fall through */
case BUF_GET:
case BUF_GET_IF_IN_POOL_OR_WATCH:
- case BUF_GET_POSSIBLY_FREED:
- fil_space_t* s = fil_space_acquire_for_io(page_id.space());
+ fil_space_t* s = fil_space_get(page_id.space());
ut_ad(s);
ut_ad(s->zip_size() == zip_size);
- s->release_for_io();
}
#endif /* UNIV_DEBUG */
@@ -3154,7 +3096,7 @@ lookup:
}
/* The call path is buf_read_page() ->
- buf_read_page_low() (fil_io()) ->
+ buf_read_page_low() (fil_space_t::io()) ->
buf_page_read_complete() ->
buf_decrypt_after_read(). Here fil_space_t* is used
and we decrypt -> buf_page_check_corrupt() where page
@@ -3208,11 +3150,10 @@ lookup:
asserting. */
if (page_id.space() == TRX_SYS_SPACE) {
} else if (page_id.space() == SRV_TMP_SPACE_ID) {
- } else if (fil_space_t* space
- = fil_space_acquire_for_io(
+ } else if (fil_space_t* space= fil_space_t::get(
page_id.space())) {
bool set = dict_set_corrupted_by_space(space);
- space->release_for_io();
+ space->release();
if (set) {
return NULL;
}
@@ -3270,14 +3211,14 @@ got_block:
if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) {
evict_from_pool:
ut_ad(!fix_block->page.oldest_modification());
- mutex_enter(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
fix_block->unfix();
if (!buf_LRU_free_page(&fix_block->page, true)) {
ut_ad(0);
}
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
return(NULL);
}
@@ -3326,7 +3267,7 @@ evict_from_pool:
block = buf_LRU_get_free_block(false);
buf_block_init_low(block);
- mutex_enter(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
hash_lock = buf_pool.page_hash.lock_get(fold);
hash_lock->write_lock();
@@ -3345,7 +3286,7 @@ evict_from_pool:
hash_lock->write_unlock();
buf_LRU_block_free_non_file_page(block);
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
/* Try again */
goto loop;
@@ -3364,12 +3305,7 @@ evict_from_pool:
/* Set after buf_relocate(). */
block->page.set_buf_fix_count(1);
- if (!block->page.oldest_modification()) {
- ut_d(UT_LIST_REMOVE(buf_pool.zip_clean, &block->page));
- } else {
- /* Relocate buf_pool.flush_list. */
- buf_flush_relocate_on_flush_list(bpage, &block->page);
- }
+ buf_flush_relocate_on_flush_list(bpage, &block->page);
/* Buffer-fix, I/O-fix, and X-latch the block
for the duration of the decompression.
@@ -3384,7 +3320,7 @@ evict_from_pool:
MEM_UNDEFINED(bpage, sizeof *bpage);
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
hash_lock->write_unlock();
buf_pool.n_pend_unzip++;
@@ -3424,27 +3360,27 @@ evict_from_pool:
ut_ad(fix_block->page.state() == BUF_BLOCK_FILE_PAGE);
#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
-
+re_evict:
if (mode != BUF_GET_IF_IN_POOL
&& mode != BUF_GET_IF_IN_POOL_OR_WATCH) {
} else if (!ibuf_debug) {
- } else if (fil_space_t* space =
- fil_space_acquire_for_io(page_id.space())) {
+ } else if (fil_space_t* space = fil_space_t::get(page_id.space())) {
/* Try to evict the block from the buffer pool, to use the
insert buffer (change buffer) as much as possible. */
- mutex_enter(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
fix_block->unfix();
/* Blocks cannot be relocated or enter or exit the
buf_pool while we are holding the buf_pool.mutex. */
+ const bool evicted = buf_LRU_free_page(&fix_block->page, true);
+ space->release();
- if (buf_LRU_free_page(&fix_block->page, true)) {
- space->release_for_io();
+ if (evicted) {
hash_lock = buf_pool.page_hash.lock_get(fold);
hash_lock->write_lock();
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
/* We may set the watch, as it would have
been set if the page were not in the
buffer pool in the first place. */
@@ -3468,20 +3404,16 @@ evict_from_pool:
return(NULL);
}
- bool flushed = fix_block->page.ready_for_flush()
- && buf_flush_page(&fix_block->page,
- IORequest::SINGLE_PAGE, space, true);
- space->release_for_io();
- if (flushed) {
- guess = fix_block;
- goto loop;
- }
-
fix_block->fix();
+ mysql_mutex_unlock(&buf_pool.mutex);
+ buf_flush_lists(ULINT_UNDEFINED, LSN_MAX);
+ buf_flush_wait_batch_end_acquiring_mutex(false);
- /* Failed to evict the page; change it directly */
+ if (!fix_block->page.oldest_modification()) {
+ goto re_evict;
+ }
- mutex_exit(&buf_pool.mutex);
+ /* Failed to evict the page; change it directly */
}
#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
@@ -3661,10 +3593,13 @@ buf_page_optimistic_get(
return FALSE;
}
- page_hash_latch *hash_lock = buf_pool.hash_lock_get(block->page.id());
+ const page_id_t id(block->page.id());
+
+ page_hash_latch *hash_lock = buf_pool.hash_lock_get(id);
hash_lock->read_lock();
- if (UNIV_UNLIKELY(block->page.state() != BUF_BLOCK_FILE_PAGE
+ if (UNIV_UNLIKELY(id != block->page.id()
+ || block->page.state() != BUF_BLOCK_FILE_PAGE
|| block->page.io_fix() != BUF_IO_NONE)) {
hash_lock->read_unlock();
return(FALSE);
@@ -3677,8 +3612,7 @@ buf_page_optimistic_get(
buf_page_make_young_if_needed(&block->page);
- ut_ad(!ibuf_inside(mtr)
- || ibuf_page(block->page.id(), block->zip_size(), NULL));
+ ut_ad(!ibuf_inside(mtr) || ibuf_page(id, block->zip_size(), NULL));
mtr_memo_type_t fix_type;
@@ -3691,6 +3625,8 @@ buf_page_optimistic_get(
&block->lock, file, line);
}
+ ut_ad(id == block->page.id());
+
if (!success) {
buf_block_buf_fix_dec(block);
return(FALSE);
@@ -3805,23 +3741,22 @@ FILE_PAGE (the other is buf_page_get_gen).
@param[in] offset offset of the tablespace
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in,out] mtr mini-transaction
+@param[in,out] free_block pre-allocated buffer block
@return pointer to the block, page bufferfixed */
buf_block_t*
buf_page_create(fil_space_t *space, uint32_t offset,
- ulint zip_size, mtr_t *mtr)
+ ulint zip_size, mtr_t *mtr, buf_block_t *free_block)
{
page_id_t page_id(space->id, offset);
ut_ad(mtr->is_active());
ut_ad(page_id.space() != 0 || !zip_size);
space->free_page(offset, false);
-loop:
- buf_block_t *free_block= buf_LRU_get_free_block(false);
free_block->initialise(page_id, zip_size, 1);
const ulint fold= page_id.fold();
-
- mutex_enter(&buf_pool.mutex);
+loop:
+ mysql_mutex_lock(&buf_pool.mutex);
buf_block_t *block= reinterpret_cast<buf_block_t*>
(buf_pool.page_hash_get_low(page_id, fold));
@@ -3832,7 +3767,7 @@ loop:
#ifdef BTR_CUR_HASH_ADAPT
const dict_index_t *drop_hash_entry= nullptr;
#endif
- switch (block->page.state()) {
+ switch (UNIV_EXPECT(block->page.state(), BUF_BLOCK_FILE_PAGE)) {
default:
ut_ad(0);
break;
@@ -3843,16 +3778,15 @@ loop:
while (block->page.io_fix() != BUF_IO_NONE ||
num_fix_count != block->page.buf_fix_count())
{
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
os_thread_yield();
- mutex_enter(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
}
}
rw_lock_x_lock(&block->lock);
#ifdef BTR_CUR_HASH_ADAPT
drop_hash_entry= block->index;
#endif
- buf_LRU_block_free_non_file_page(free_block);
break;
case BUF_BLOCK_ZIP_PAGE:
page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
@@ -3860,20 +3794,13 @@ loop:
if (block->page.io_fix() != BUF_IO_NONE)
{
hash_lock->write_unlock();
- buf_LRU_block_free_non_file_page(free_block);
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
goto loop;
}
rw_lock_x_lock(&free_block->lock);
buf_relocate(&block->page, &free_block->page);
-
- if (block->page.oldest_modification() > 0)
- buf_flush_relocate_on_flush_list(&block->page, &free_block->page);
-#ifdef UNIV_DEBUG
- else
- UT_LIST_REMOVE(buf_pool.zip_clean, &block->page);
-#endif
+ buf_flush_relocate_on_flush_list(&block->page, &free_block->page);
free_block->page.set_state(BUF_BLOCK_FILE_PAGE);
buf_unzip_LRU_add_block(free_block, FALSE);
@@ -3884,7 +3811,7 @@ loop:
break;
}
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
#ifdef BTR_CUR_HASH_ADAPT
if (drop_hash_entry)
@@ -3893,6 +3820,13 @@ loop:
mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
+ if (block->page.ibuf_exist)
+ {
+ if (!recv_recovery_is_on())
+ ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size, true);
+ block->page.ibuf_exist= false;
+ }
+
return block;
}
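
Together with the later hunk that drops ibuf_page_exists() from buf_page_read_complete(), this moves the change-buffer probe off the read-completion path: read completion merely sets ibuf_exist for leaf index pages, and the merge (or, for a re-created page, the discard via ibuf_merge_or_delete_for_page(nullptr, ...)) happens lazily on next use. The shape of that deferral, with illustrative names:

    struct PageCtl { bool ibuf_exist; };

    void on_read_complete(PageCtl &page, bool is_leaf_index_page)
    {
      /* Cheap flag instead of a change-buffer lookup during page I/O. */
      if (is_leaf_index_page)
        page.ibuf_exist= true;
    }

    void on_next_use(PageCtl &page)
    {
      if (page.ibuf_exist)
      {
        /* Apply or discard buffered changes, then clear the flag;
           cf. ibuf_merge_or_delete_for_page() in the hunk above. */
        page.ibuf_exist= false;
      }
    }
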
@@ -3942,7 +3876,7 @@ loop:
else
hash_lock->write_unlock();
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
block->page.set_accessed();
@@ -4090,12 +4024,12 @@ static void buf_mark_space_corrupt(buf_page_t* bpage, const fil_space_t& space)
/** Release and evict a corrupted page.
@param bpage page that was being read */
-void buf_pool_t::corrupted_evict(buf_page_t *bpage)
+ATTRIBUTE_COLD void buf_pool_t::corrupted_evict(buf_page_t *bpage)
{
const page_id_t id(bpage->id());
page_hash_latch *hash_lock= hash_lock_get(id);
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
hash_lock->write_lock();
ut_ad(bpage->io_fix() == BUF_IO_READ);
@@ -4110,7 +4044,7 @@ void buf_pool_t::corrupted_evict(buf_page_t *bpage)
/* remove from LRU and page_hash */
buf_LRU_free_one_page(bpage, id, hash_lock);
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
ut_d(auto n=) n_pend_reads--;
ut_ad(n > 0);
@@ -4120,6 +4054,7 @@ void buf_pool_t::corrupted_evict(buf_page_t *bpage)
@param[in] bpage Corrupted page
@param[in] node data file
Also remove the bpage from LRU list. */
+ATTRIBUTE_COLD
static void buf_corrupt_page_release(buf_page_t *bpage, const fil_node_t &node)
{
ut_ad(bpage->id().space() == node.space->id);
@@ -4162,7 +4097,7 @@ after decryption normal page checksum does not match.
static dberr_t buf_page_check_corrupt(buf_page_t *bpage,
const fil_node_t &node)
{
- ut_ad(node.space->pending_io());
+ ut_ad(node.space->referenced());
byte* dst_frame = (bpage->zip.data) ? bpage->zip.data :
((buf_block_t*) bpage)->frame;
@@ -4236,7 +4171,7 @@ dberr_t buf_page_read_complete(buf_page_t *bpage, const fil_node_t &node)
{
const page_id_t id(bpage->id());
ut_ad(bpage->in_file());
- ut_ad(id.space() || !buf_dblwr_page_inside(id.page_no()));
+ ut_ad(!buf_dblwr.is_inside(id));
ut_ad(id.space() == node.space->id);
ut_ad(bpage->zip_size() == node.space->zip_size());
@@ -4303,7 +4238,7 @@ dberr_t buf_page_read_complete(buf_page_t *bpage, const fil_node_t &node)
}
err= buf_page_check_corrupt(bpage, node);
- if (err != DB_SUCCESS)
+ if (UNIV_UNLIKELY(err != DB_SUCCESS))
{
database_corrupted:
/* Not a real corruption if it was triggered by error injection */
@@ -4363,7 +4298,7 @@ release_page:
if (bpage->state() == BUF_BLOCK_FILE_PAGE && !recv_no_ibuf_operations &&
(!id.space() || !is_predefined_tablespace(id.space())) &&
fil_page_get_type(frame) == FIL_PAGE_INDEX &&
- page_is_leaf(frame) && ibuf_page_exists(id, bpage->zip_size()))
+ page_is_leaf(frame))
bpage->ibuf_exist= true;
if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)))
@@ -4391,12 +4326,12 @@ release_page:
@retval nullptr if all freed */
void buf_pool_t::assert_all_freed()
{
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
const chunk_t *chunk= chunks;
for (auto i= n_chunks; i--; chunk++)
if (const buf_block_t* block= chunk->not_freed())
ib::fatal() << "Page " << block->page.id() << " still fixed or dirty";
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
}
#endif /* UNIV_DEBUG */
@@ -4411,33 +4346,20 @@ void buf_refresh_io_stats()
All pages must be in a replaceable state (not modified or latched). */
void buf_pool_invalidate()
{
- mutex_enter(&buf_pool.mutex);
- ut_ad(!buf_pool.init_flush[IORequest::LRU]);
- ut_ad(!buf_pool.init_flush[IORequest::FLUSH_LIST]);
- ut_ad(!buf_pool.init_flush[IORequest::SINGLE_PAGE]);
- ut_ad(!buf_pool.n_flush[IORequest::SINGLE_PAGE]);
-
- if (buf_pool.n_flush[IORequest::LRU]) {
- mutex_exit(&buf_pool.mutex);
- buf_flush_wait_batch_end(true);
- mutex_enter(&buf_pool.mutex);
- }
+ mysql_mutex_lock(&buf_pool.mutex);
- if (buf_pool.n_flush[IORequest::FLUSH_LIST]) {
- mutex_exit(&buf_pool.mutex);
- buf_flush_wait_batch_end(false);
- mutex_enter(&buf_pool.mutex);
- }
+ buf_flush_wait_batch_end(true);
+ buf_flush_wait_batch_end(false);
/* It is possible that a write batch that has been posted
earlier is still not complete. For buffer pool invalidation to
proceed we must ensure there is NO write activity happening. */
- ut_d(mutex_exit(&buf_pool.mutex));
+ ut_d(mysql_mutex_unlock(&buf_pool.mutex));
ut_d(buf_pool.assert_all_freed());
- ut_d(mutex_enter(&buf_pool.mutex));
+ ut_d(mysql_mutex_lock(&buf_pool.mutex));
- while (buf_LRU_scan_and_free_block(true));
+ while (buf_LRU_scan_and_free_block());
ut_ad(UT_LIST_GET_LEN(buf_pool.LRU) == 0);
ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0);
@@ -4448,7 +4370,7 @@ void buf_pool_invalidate()
memset(&buf_pool.stat, 0x00, sizeof(buf_pool.stat));
buf_refresh_io_stats();
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
}
#ifdef UNIV_DEBUG
@@ -4460,7 +4382,7 @@ void buf_pool_t::validate()
ulint n_free = 0;
ulint n_zip = 0;
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
chunk_t* chunk = chunks;
@@ -4474,8 +4396,8 @@ void buf_pool_t::validate()
for (j = chunk->size; j--; block++) {
switch (block->page.state()) {
case BUF_BLOCK_ZIP_PAGE:
- /* These should only occur on
- zip_clean, zip_free[], or flush_list. */
+			/* Such block descriptors should be
+			allocated by malloc() only. */
ut_error;
break;
@@ -4499,37 +4421,9 @@ void buf_pool_t::validate()
}
}
- /* Check clean compressed-only blocks. */
-
- for (buf_page_t* b = UT_LIST_GET_FIRST(zip_clean); b;
- b = UT_LIST_GET_NEXT(list, b)) {
- ut_ad(b->state() == BUF_BLOCK_ZIP_PAGE);
- ut_ad(!b->oldest_modification());
- switch (b->io_fix()) {
- case BUF_IO_NONE:
- case BUF_IO_PIN:
- /* All clean blocks should be I/O-unfixed. */
- break;
- case BUF_IO_READ:
- /* In buf_LRU_free_page(), we temporarily set
- b->io_fix = BUF_IO_READ for a newly allocated
- control block in order to prevent
- buf_page_get_gen() from decompressing the block. */
- break;
- default:
- ut_error;
- break;
- }
-
- const page_id_t id = b->id();
- ut_ad(page_hash_get_low(id, id.fold()) == b);
- n_lru++;
- n_zip++;
- }
-
/* Check dirty blocks. */
- mutex_enter(&flush_list_mutex);
+ mysql_mutex_lock(&flush_list_mutex);
for (buf_page_t* b = UT_LIST_GET_FIRST(flush_list); b;
b = UT_LIST_GET_NEXT(list, b)) {
ut_ad(b->oldest_modification());
@@ -4555,7 +4449,7 @@ void buf_pool_t::validate()
ut_ad(UT_LIST_GET_LEN(flush_list) == n_flushing);
- mutex_exit(&flush_list_mutex);
+ mysql_mutex_unlock(&flush_list_mutex);
if (curr_size == old_size
&& n_lru + n_free > curr_size + n_zip) {
@@ -4565,7 +4459,7 @@ void buf_pool_t::validate()
<< " zip " << n_zip << ". Aborting...";
}
- ut_ad(UT_LIST_GET_LEN(LRU) == n_lru);
+ ut_ad(UT_LIST_GET_LEN(LRU) >= n_lru);
if (curr_size == old_size
&& UT_LIST_GET_LEN(free) != n_free) {
@@ -4575,7 +4469,7 @@ void buf_pool_t::validate()
<< ", free blocks " << n_free << ". Aborting...";
}
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
ut_d(buf_LRU_validate());
ut_d(buf_flush_validate());
@@ -4603,8 +4497,8 @@ void buf_pool_t::print()
counts = static_cast<ulint*>(ut_malloc_nokey(sizeof(ulint) * size));
- mutex_enter(&mutex);
- mutex_enter(&flush_list_mutex);
+ mysql_mutex_lock(&mutex);
+ mysql_mutex_lock(&flush_list_mutex);
ib::info()
<< "[buffer pool: size=" << curr_size
@@ -4614,16 +4508,15 @@ void buf_pool_t::print()
<< UT_LIST_GET_LEN(flush_list)
<< ", n pending decompressions=" << n_pend_unzip
<< ", n pending reads=" << n_pend_reads
- << ", n pending flush LRU=" << n_flush[IORequest::LRU]
- << " list=" << n_flush[IORequest::FLUSH_LIST]
- << " single page=" << n_flush[IORequest::SINGLE_PAGE]
+ << ", n pending flush LRU=" << n_flush_LRU
+ << " list=" << n_flush_list
<< ", pages made young=" << stat.n_pages_made_young
<< ", not young=" << stat.n_pages_not_made_young
<< ", pages read=" << stat.n_pages_read
<< ", created=" << stat.n_pages_created
<< ", written=" << stat.n_pages_written << "]";
- mutex_exit(&flush_list_mutex);
+ mysql_mutex_unlock(&flush_list_mutex);
/* Count the number of blocks belonging to each index in the buffer */
@@ -4664,7 +4557,7 @@ void buf_pool_t::print()
}
}
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
for (i = 0; i < n_found; i++) {
index = dict_index_get_if_in_cache(index_ids[i]);
@@ -4692,66 +4585,18 @@ void buf_pool_t::print()
/** @return the number of latched pages in the buffer pool */
ulint buf_get_latched_pages_number()
{
- buf_page_t* b;
- ulint i;
- ulint fixed_pages_number = 0;
+ ulint fixed_pages_number= 0;
- mutex_enter(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
- auto chunk = buf_pool.chunks;
-
- for (i = buf_pool.n_chunks; i--; chunk++) {
- buf_block_t* block= chunk->blocks;
-
- for (auto j= chunk->size; j--; block++) {
- if (block->page.state() == BUF_BLOCK_FILE_PAGE
- && (block->page.buf_fix_count()
- || block->page.io_fix() != BUF_IO_NONE)) {
+ for (buf_page_t *b= UT_LIST_GET_FIRST(buf_pool.LRU); b;
+ b= UT_LIST_GET_NEXT(LRU, b))
+ if (b->in_file() && (b->buf_fix_count() || b->io_fix() != BUF_IO_NONE))
+ fixed_pages_number++;
- fixed_pages_number++;
- }
- }
- }
+ mysql_mutex_unlock(&buf_pool.mutex);
- /* Traverse the lists of clean and dirty compressed-only blocks. */
-
- for (b = UT_LIST_GET_FIRST(buf_pool.zip_clean); b;
- b = UT_LIST_GET_NEXT(list, b)) {
- ut_a(b->state() == BUF_BLOCK_ZIP_PAGE);
- ut_a(!b->oldest_modification());
- ut_a(b->io_fix() != BUF_IO_WRITE);
-
- if (b->buf_fix_count() || b->io_fix() != BUF_IO_NONE) {
- fixed_pages_number++;
- }
- }
-
- mutex_enter(&buf_pool.flush_list_mutex);
- for (b = UT_LIST_GET_FIRST(buf_pool.flush_list); b;
- b = UT_LIST_GET_NEXT(list, b)) {
- ut_ad(b->oldest_modification());
-
- switch (b->state()) {
- case BUF_BLOCK_ZIP_PAGE:
- if (b->buf_fix_count() || b->io_fix() != BUF_IO_NONE) {
- fixed_pages_number++;
- }
- continue;
- case BUF_BLOCK_FILE_PAGE:
- /* uncompressed page */
- continue;
- case BUF_BLOCK_NOT_USED:
- case BUF_BLOCK_MEMORY:
- case BUF_BLOCK_REMOVE_HASH:
- break;
- }
- ut_error;
- }
-
- mutex_exit(&buf_pool.flush_list_mutex);
- mutex_exit(&buf_pool.mutex);
-
- return(fixed_pages_number);
+ return fixed_pages_number;
}
#endif /* UNIV_DEBUG */
@@ -4762,8 +4607,8 @@ void buf_stats_get_pool_info(buf_pool_info_t *pool_info)
time_t current_time;
double time_elapsed;
- mutex_enter(&buf_pool.mutex);
- mutex_enter(&buf_pool.flush_list_mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
pool_info->pool_size = buf_pool.curr_size;
@@ -4779,19 +4624,11 @@ void buf_stats_get_pool_info(buf_pool_info_t *pool_info)
pool_info->n_pend_reads = buf_pool.n_pend_reads;
- pool_info->n_pending_flush_lru =
- (buf_pool.n_flush[IORequest::LRU]
- + buf_pool.init_flush[IORequest::LRU]);
-
- pool_info->n_pending_flush_list =
- (buf_pool.n_flush[IORequest::FLUSH_LIST]
- + buf_pool.init_flush[IORequest::FLUSH_LIST]);
+ pool_info->n_pending_flush_lru = buf_pool.n_flush_LRU;
- pool_info->n_pending_flush_single_page =
- (buf_pool.n_flush[IORequest::SINGLE_PAGE]
- + buf_pool.init_flush[IORequest::SINGLE_PAGE]);
+ pool_info->n_pending_flush_list = buf_pool.n_flush_list;
- mutex_exit(&buf_pool.flush_list_mutex);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
current_time = time(NULL);
time_elapsed = 0.001 + difftime(current_time,
@@ -4882,7 +4719,7 @@ void buf_stats_get_pool_info(buf_pool_info_t *pool_info)
pool_info->unzip_cur = buf_LRU_stat_cur.unzip;
buf_refresh_io_stats();
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
}
/*********************************************************************//**
@@ -4905,8 +4742,7 @@ buf_print_io_instance(
"Percent of dirty pages(LRU & free pages): %.3f\n"
"Max dirty pages percent: %.3f\n"
"Pending reads " ULINTPF "\n"
- "Pending writes: LRU " ULINTPF ", flush list " ULINTPF
- ", single page " ULINTPF "\n",
+ "Pending writes: LRU " ULINTPF ", flush list " ULINTPF "\n",
pool_info->pool_size,
pool_info->free_list_len,
pool_info->lru_len,
@@ -4919,8 +4755,7 @@ buf_print_io_instance(
srv_max_buf_pool_modified_pct,
pool_info->n_pend_reads,
pool_info->n_pending_flush_lru,
- pool_info->n_pending_flush_list,
- pool_info->n_pending_flush_single_page);
+ pool_info->n_pending_flush_list);
fprintf(file,
"Pages made young " ULINTPF ", not young " ULINTPF "\n"
@@ -5021,17 +4856,4 @@ std::ostream& operator<<(std::ostream &out, const page_id_t page_id)
<< ", page number=" << page_id.page_no() << "]";
return out;
}
-
-/**
-Calculate the length of trim (punch_hole) operation.
-@param[in] bpage Page control block
-@param[in] write_length Write length
-@return length of the trim or zero. */
-ulint
-buf_page_get_trim_length(
- const buf_page_t* bpage,
- ulint write_length)
-{
- return bpage->physical_size() - write_length;
-}
#endif /* !UNIV_INNOCHECKSUM */
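
One API change in this file deserves a caller-side illustration: buf_page_create() now takes a pre-allocated free_block, so its retry loop no longer calls buf_LRU_get_free_block() itself. A hypothetical caller under that contract; whether and where the unused spare is returned to the free list is an assumption here, not shown by this diff:

    buf_block_t *create_page(fil_space_t *space, uint32_t offset,
                             ulint zip_size, mtr_t *mtr)
    {
      /* Allocate the spare before taking buf_pool.mutex:
         buf_LRU_get_free_block() may flush and sleep. */
      buf_block_t *free_block= buf_LRU_get_free_block(false);
      buf_block_t *block= buf_page_create(space, offset, zip_size, mtr,
                                          free_block);
      if (block != free_block)
      {
        /* The page already existed; return the unused spare
           (assumed cleanup, done under buf_pool.mutex). */
        mysql_mutex_lock(&buf_pool.mutex);
        buf_LRU_block_free_non_file_page(free_block);
        mysql_mutex_unlock(&buf_pool.mutex);
      }
      return block;
    }
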
diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc
index 4b2822bf865..f17cf6cc128 100644
--- a/storage/innobase/buf/buf0dblwr.cc
+++ b/storage/innobase/buf/buf0dblwr.cc
@@ -29,6 +29,7 @@ Created 2011/12/19
#include "buf0checksum.h"
#include "srv0start.h"
#include "srv0srv.h"
+#include "sync0sync.h"
#include "page0zip.h"
#include "trx0sys.h"
#include "fil0crypt.h"
@@ -37,41 +38,7 @@ Created 2011/12/19
using st_::span;
/** The doublewrite buffer */
-buf_dblwr_t* buf_dblwr = NULL;
-
-/** Set to TRUE when the doublewrite buffer is being created */
-ibool buf_dblwr_being_created = FALSE;
-
-#define TRX_SYS_DOUBLEWRITE_BLOCKS 2
-
-/****************************************************************//**
-Determines if a page number is located inside the doublewrite buffer.
-@return TRUE if the location is inside the two blocks of the
-doublewrite buffer */
-ibool
-buf_dblwr_page_inside(
-/*==================*/
- ulint page_no) /*!< in: page number */
-{
- if (buf_dblwr == NULL) {
-
- return(FALSE);
- }
-
- if (page_no >= buf_dblwr->block1
- && page_no < buf_dblwr->block1
- + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
- return(TRUE);
- }
-
- if (page_no >= buf_dblwr->block2
- && page_no < buf_dblwr->block2
- + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
- return(TRUE);
- }
-
- return(FALSE);
-}
+buf_dblwr_t buf_dblwr;
/** @return the TRX_SYS page */
inline buf_block_t *buf_dblwr_trx_sys_get(mtr_t *mtr)
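
The removed buf_dblwr_page_inside() resurfaces as buf_dblwr_t::is_inside(), used by the buf_page_read_complete() assertion earlier in this merge. A hedged reconstruction of the check against the two block ranges: block1, block2, block_size() and is_initialised() all appear in this diff, while the ordered-comparison and offset-addition operators on page_id_t are assumed:

    /* Reconstruction, not the committed body: a page lies inside the
       doublewrite buffer iff it falls in either of the two blocks of
       block_size() pages in the system tablespace. */
    bool buf_dblwr_t::is_inside(const page_id_t id) const
    {
      if (!is_initialised())
        return false;
      ut_ad(block1 < block2);
      if (id < block1)
        return false;
      const uint32_t size= block_size();
      return id < block1 + size || (id >= block2 && id < block2 + size);
    }
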
@@ -82,619 +49,450 @@ inline buf_block_t *buf_dblwr_trx_sys_get(mtr_t *mtr)
return block;
}
-/****************************************************************//**
-Creates or initialializes the doublewrite buffer at a database start. */
-static void buf_dblwr_init(const byte *doublewrite)
+/** Initialize the doublewrite buffer data structure.
+@param header doublewrite page header in the TRX_SYS page */
+inline void buf_dblwr_t::init(const byte *header)
{
- ulint buf_size;
+ ut_ad(!active_slot->first_free);
+ ut_ad(!active_slot->reserved);
+ ut_ad(!batch_running);
- buf_dblwr = static_cast<buf_dblwr_t*>(
- ut_zalloc_nokey(sizeof(buf_dblwr_t)));
+ mysql_mutex_init(buf_dblwr_mutex_key, &mutex, nullptr);
+ mysql_cond_init(0, &cond, nullptr);
+ block1= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK1));
+ block2= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK2));
- /* There are two blocks of same size in the doublewrite
- buffer. */
- buf_size = TRX_SYS_DOUBLEWRITE_BLOCKS * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+ const uint32_t buf_size= 2 * block_size();
+ for (int i= 0; i < 2; i++)
+ {
+ slots[i].write_buf= static_cast<byte*>
+ (aligned_malloc(buf_size << srv_page_size_shift, srv_page_size));
+ slots[i].buf_block_arr= static_cast<element*>
+ (ut_zalloc_nokey(buf_size * sizeof(element)));
+ }
+ active_slot= &slots[0];
+}
- /* There must be atleast one buffer for single page writes
- and one buffer for batch writes. */
- ut_a(srv_doublewrite_batch_size > 0
- && srv_doublewrite_batch_size < buf_size);
+/** Create or restore the doublewrite buffer in the TRX_SYS page.
+@return whether the operation succeeded */
+bool buf_dblwr_t::create()
+{
+ if (is_initialised())
+ return true;
- mutex_create(LATCH_ID_BUF_DBLWR, &buf_dblwr->mutex);
+ mtr_t mtr;
+ const ulint size= block_size();
- buf_dblwr->b_event = os_event_create("dblwr_batch_event");
- buf_dblwr->s_event = os_event_create("dblwr_single_event");
- buf_dblwr->first_free = 0;
- buf_dblwr->s_reserved = 0;
- buf_dblwr->b_reserved = 0;
+start_again:
+ mtr.start();
- buf_dblwr->block1 = mach_read_from_4(
- doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1);
- buf_dblwr->block2 = mach_read_from_4(
- doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2);
+ buf_block_t *trx_sys_block= buf_dblwr_trx_sys_get(&mtr);
- buf_dblwr->write_buf = static_cast<byte*>(
- aligned_malloc(buf_size << srv_page_size_shift,
- srv_page_size));
+ if (mach_read_from_4(TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
+ trx_sys_block->frame) == TRX_SYS_DOUBLEWRITE_MAGIC_N)
+ {
+ /* The doublewrite buffer has already been created: just read in
+ some numbers */
+ init(TRX_SYS_DOUBLEWRITE + trx_sys_block->frame);
+ mtr.commit();
+ return true;
+ }
- buf_dblwr->buf_block_arr = static_cast<buf_dblwr_t::element*>(
- ut_zalloc_nokey(buf_size * sizeof(buf_dblwr_t::element)));
-}
+ if (UT_LIST_GET_FIRST(fil_system.sys_space->chain)->size < 3 * size)
+ {
+too_small:
+ ib::error() << "Cannot create doublewrite buffer: "
+ "the first file in innodb_data_file_path must be at least "
+ << (3 * (size >> (20U - srv_page_size_shift))) << "M.";
+ mtr.commit();
+ return false;
+ }
+ else
+ {
+ buf_block_t *b= fseg_create(fil_system.sys_space,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG,
+ &mtr, false, trx_sys_block);
+ if (!b)
+ goto too_small;
+ ib::info() << "Doublewrite buffer not found: creating new";
+
+ /* FIXME: After this point, the doublewrite buffer creation
+ is not atomic. The doublewrite buffer should not exist in
+ the InnoDB system tablespace file in the first place.
+ It could be located in separate optional file(s) in a
+ user-specified location. */
+
+ /* fseg_create acquires a second latch on the page,
+ therefore we must declare it: */
+ buf_block_dbg_add_level(b, SYNC_NO_ORDER_CHECK);
+ }
-/** Create the doublewrite buffer if the doublewrite buffer header
-is not present in the TRX_SYS page.
-@return whether the operation succeeded
-@retval true if the doublewrite buffer exists or was created
-@retval false if the creation failed (too small first data file) */
-bool
-buf_dblwr_create()
-{
- buf_block_t* block2;
- buf_block_t* new_block;
- byte* fseg_header;
- ulint page_no;
- ulint prev_page_no;
- ulint i;
- mtr_t mtr;
-
- if (buf_dblwr) {
- /* Already inited */
- return(true);
- }
+ byte *fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG +
+ trx_sys_block->frame;
+ for (uint32_t prev_page_no= 0, i= 0, extent_size= FSP_EXTENT_SIZE;
+ i < 2 * size + extent_size / 2; i++)
+ {
+ buf_block_t *new_block= fseg_alloc_free_page(fseg_header, prev_page_no + 1,
+ FSP_UP, &mtr);
+ if (!new_block)
+ {
+ ib::error() << "Cannot create doublewrite buffer: "
+ " you must increase your tablespace size."
+ " Cannot continue operation.";
+ /* This may essentially corrupt the doublewrite
+ buffer. However, usually the doublewrite buffer
+ is created at database initialization, and it
+ should not matter (just remove all newly created
+ InnoDB files and restart). */
+ mtr.commit();
+ return false;
+ }
-start_again:
- mtr.start();
- buf_dblwr_being_created = TRUE;
-
- buf_block_t *trx_sys_block = buf_dblwr_trx_sys_get(&mtr);
-
- if (mach_read_from_4(TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC
- + trx_sys_block->frame)
- == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
- /* The doublewrite buffer has already been created:
- just read in some numbers */
-
- buf_dblwr_init(TRX_SYS_DOUBLEWRITE + trx_sys_block->frame);
-
- mtr.commit();
- buf_dblwr_being_created = FALSE;
- return(true);
- } else {
- if (UT_LIST_GET_FIRST(fil_system.sys_space->chain)->size
- < 3 * FSP_EXTENT_SIZE) {
- goto too_small;
- }
- }
-
- block2 = fseg_create(fil_system.sys_space,
- TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG,
- &mtr, false, trx_sys_block);
-
- if (block2 == NULL) {
-too_small:
- ib::error()
- << "Cannot create doublewrite buffer: "
- "the first file in innodb_data_file_path"
- " must be at least "
- << (3 * (FSP_EXTENT_SIZE
- >> (20U - srv_page_size_shift)))
- << "M.";
- mtr.commit();
- return(false);
- }
-
- ib::info() << "Doublewrite buffer not found: creating new";
-
- /* FIXME: After this point, the doublewrite buffer creation
- is not atomic. The doublewrite buffer should not exist in
- the InnoDB system tablespace file in the first place.
- It could be located in separate optional file(s) in a
- user-specified location. */
-
- /* fseg_create acquires a second latch on the page,
- therefore we must declare it: */
-
- buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);
-
- fseg_header = TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG
- + trx_sys_block->frame;
- prev_page_no = 0;
-
- for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCKS * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
- + FSP_EXTENT_SIZE / 2; i++) {
- new_block = fseg_alloc_free_page(
- fseg_header, prev_page_no + 1, FSP_UP, &mtr);
- if (new_block == NULL) {
- ib::error() << "Cannot create doublewrite buffer: "
- " you must increase your tablespace size."
- " Cannot continue operation.";
- /* This may essentially corrupt the doublewrite
- buffer. However, usually the doublewrite buffer
- is created at database initialization, and it
- should not matter (just remove all newly created
- InnoDB files and restart). */
- mtr.commit();
- return(false);
- }
-
- /* We read the allocated pages to the buffer pool;
- when they are written to disk in a flush, the space
- id and page number fields are also written to the
- pages. When we at database startup read pages
- from the doublewrite buffer, we know that if the
- space id and page number in them are the same as
- the page position in the tablespace, then the page
- has not been written to in doublewrite. */
-
- ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
- page_no = new_block->page.id().page_no();
- /* We only do this in the debug build, to ensure that
- the check in buf_flush_init_for_writing() will see a valid
- page type. The flushes of new_block are actually
- unnecessary here. */
- ut_d(mtr.write<2>(*new_block,
- FIL_PAGE_TYPE + new_block->frame,
- FIL_PAGE_TYPE_SYS));
-
- if (i == FSP_EXTENT_SIZE / 2) {
- ut_a(page_no == FSP_EXTENT_SIZE);
- mtr.write<4>(*trx_sys_block,
- TRX_SYS_DOUBLEWRITE
- + TRX_SYS_DOUBLEWRITE_BLOCK1
- + trx_sys_block->frame,
- page_no);
- mtr.write<4>(*trx_sys_block,
- TRX_SYS_DOUBLEWRITE
- + TRX_SYS_DOUBLEWRITE_REPEAT
- + TRX_SYS_DOUBLEWRITE_BLOCK1
- + trx_sys_block->frame,
- page_no);
-
- } else if (i == FSP_EXTENT_SIZE / 2
- + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
- ut_a(page_no == 2 * FSP_EXTENT_SIZE);
- mtr.write<4>(*trx_sys_block,
- TRX_SYS_DOUBLEWRITE
- + TRX_SYS_DOUBLEWRITE_BLOCK2
- + trx_sys_block->frame,
- page_no);
- mtr.write<4>(*trx_sys_block,
- TRX_SYS_DOUBLEWRITE
- + TRX_SYS_DOUBLEWRITE_REPEAT
- + TRX_SYS_DOUBLEWRITE_BLOCK2
- + trx_sys_block->frame,
- page_no);
- } else if (i > FSP_EXTENT_SIZE / 2) {
- ut_a(page_no == prev_page_no + 1);
- }
-
- if (((i + 1) & 15) == 0) {
- /* rw_locks can only be recursively x-locked
- 2048 times. (on 32 bit platforms,
- (lint) 0 - (X_LOCK_DECR * 2049)
- is no longer a negative number, and thus
- lock_word becomes like a shared lock).
- For 4k page size this loop will
- lock the fseg header too many times. Since
- this code is not done while any other threads
- are active, restart the MTR occasionally. */
- mtr.commit();
- mtr.start();
- trx_sys_block = buf_dblwr_trx_sys_get(&mtr);
- fseg_header = TRX_SYS_DOUBLEWRITE
- + TRX_SYS_DOUBLEWRITE_FSEG
- + trx_sys_block->frame;
- }
-
- prev_page_no = page_no;
- }
-
- mtr.write<4>(*trx_sys_block,
- TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC
- + trx_sys_block->frame,
- TRX_SYS_DOUBLEWRITE_MAGIC_N);
- mtr.write<4>(*trx_sys_block,
- TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC
- + TRX_SYS_DOUBLEWRITE_REPEAT
- + trx_sys_block->frame,
- TRX_SYS_DOUBLEWRITE_MAGIC_N);
-
- mtr.write<4>(*trx_sys_block,
- TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED
- + trx_sys_block->frame,
- TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N);
- mtr.commit();
-
- /* Flush the modified pages to disk and make a checkpoint */
- log_make_checkpoint();
- buf_dblwr_being_created = FALSE;
-
- /* Remove doublewrite pages from LRU */
- buf_pool_invalidate();
-
- ib::info() << "Doublewrite buffer created";
-
- goto start_again;
+ /* We read the allocated pages to the buffer pool; when they are
+ written to disk in a flush, the space id and page number fields
+ are also written to the pages. When we at database startup read
+ pages from the doublewrite buffer, we know that if the space id
+ and page number in them are the same as the page position in the
+ tablespace, then the page has not been written to in
+ doublewrite. */
+
+ ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
+ const page_id_t id= new_block->page.id();
+ /* We only do this in the debug build, to ensure that the check in
+ buf_flush_init_for_writing() will see a valid page type. The
+ flushes of new_block are actually unnecessary here. */
+ ut_d(mtr.write<2>(*new_block, FIL_PAGE_TYPE + new_block->frame,
+ FIL_PAGE_TYPE_SYS));
+
+ if (i == size / 2)
+ {
+ ut_a(id.page_no() == size);
+ mtr.write<4>(*trx_sys_block,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK1 +
+ trx_sys_block->frame, id.page_no());
+ mtr.write<4>(*trx_sys_block,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT +
+ TRX_SYS_DOUBLEWRITE_BLOCK1 + trx_sys_block->frame,
+ id.page_no());
+ }
+ else if (i == size / 2 + size)
+ {
+ ut_a(id.page_no() == 2 * size);
+ mtr.write<4>(*trx_sys_block,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK2 +
+ trx_sys_block->frame, id.page_no());
+ mtr.write<4>(*trx_sys_block,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT +
+ TRX_SYS_DOUBLEWRITE_BLOCK2 + trx_sys_block->frame,
+ id.page_no());
+ }
+ else if (i > size / 2)
+ ut_a(id.page_no() == prev_page_no + 1);
+
+ if (((i + 1) & 15) == 0) {
+ /* rw_locks can only be recursively x-locked 2048 times. (on 32
+ bit platforms, (lint) 0 - (X_LOCK_DECR * 2049) is no longer a
+ negative number, and thus lock_word becomes like a shared lock).
+ For 4k page size this loop will lock the fseg header too many
+ times. Since this code is not done while any other threads are
+ active, restart the MTR occasionally. */
+ mtr.commit();
+ mtr.start();
+ trx_sys_block= buf_dblwr_trx_sys_get(&mtr);
+ fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG +
+ trx_sys_block->frame;
+ }
+
+ prev_page_no= id.page_no();
+ }
+
+ mtr.write<4>(*trx_sys_block,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
+ trx_sys_block->frame, TRX_SYS_DOUBLEWRITE_MAGIC_N);
+ mtr.write<4>(*trx_sys_block,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
+ TRX_SYS_DOUBLEWRITE_REPEAT + trx_sys_block->frame,
+ TRX_SYS_DOUBLEWRITE_MAGIC_N);
+
+ mtr.write<4>(*trx_sys_block,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED +
+ trx_sys_block->frame, TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N);
+ mtr.commit();
+
+ /* Flush the modified pages to disk and make a checkpoint */
+ log_make_checkpoint();
+
+ /* Remove doublewrite pages from LRU */
+ buf_pool_invalidate();
+
+ ib::info() << "Doublewrite buffer created";
+ goto start_again;
}
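
The creation logic above depends on one invariant: every page in the doublewrite area keeps carrying its own space id and page number until a real data page is copied over it, which is how startup can tell an unused slot from a used one. A standalone sketch of that test with simplified types; the field offsets (4 for the page number, 34 for the space id) and the helper names are assumptions, not the InnoDB declarations:

#include <cstdint>

// Big-endian 32-bit read, as mach_read_from_4() would do.
static uint32_t be_read4(const unsigned char *p)
{
  return uint32_t{p[0]} << 24 | uint32_t{p[1]} << 16 |
         uint32_t{p[2]} << 8 | p[3];
}

enum { PAGE_NO_OFFSET = 4, SPACE_ID_OFFSET = 34 };  // assumed header offsets

// True if the doublewrite slot at (space_id, page_no) still "points at
// itself", i.e. no data page was ever flushed through this slot.
static bool dblwr_slot_untouched(const unsigned char *page,
                                 uint32_t space_id, uint32_t page_no)
{
  return be_read4(page + SPACE_ID_OFFSET) == space_id &&
         be_read4(page + PAGE_NO_OFFSET) == page_no;
}
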
-/**
-At database startup initializes the doublewrite buffer memory structure if
-we already have a doublewrite buffer created in the data files. If we are
-upgrading to an InnoDB version which supports multiple tablespaces, then this
-function performs the necessary update operations. If we are in a crash
-recovery, this function loads the pages from double write buffer into memory.
-@param[in] file File handle
-@param[in] path Path name of file
+/** Initialize the doublewrite buffer memory structure on recovery.
+If we are upgrading from a version before MySQL 4.1, then this
+function performs the necessary update operations to support
+innodb_file_per_table. If we are in crash recovery, this function
+loads the pages from the doublewrite buffer into memory.
+@param file File handle
+@param path Path name of file
@return DB_SUCCESS or error code */
-dberr_t
-buf_dblwr_init_or_load_pages(
- pfs_os_file_t file,
- const char* path)
+dberr_t buf_dblwr_t::init_or_load_pages(pfs_os_file_t file, const char *path)
{
- byte* buf;
- byte* page;
- ulint block1;
- ulint block2;
- ulint space_id;
- byte* read_buf;
- byte* doublewrite;
- ibool reset_space_ids = FALSE;
- recv_dblwr_t& recv_dblwr = recv_sys.dblwr;
-
- /* We do the file i/o past the buffer pool */
- read_buf = static_cast<byte*>(
- aligned_malloc(2 * srv_page_size, srv_page_size));
-
- /* Read the trx sys header to check if we are using the doublewrite
- buffer */
- dberr_t err;
-
- IORequest read_request(IORequest::READ);
-
- err = os_file_read(
- read_request,
- file, read_buf, TRX_SYS_PAGE_NO << srv_page_size_shift,
- srv_page_size);
-
- if (err != DB_SUCCESS) {
-
- ib::error()
- << "Failed to read the system tablespace header page";
+ ut_ad(this == &buf_dblwr);
+ const uint32_t size= block_size();
+
+ /* We do the file i/o past the buffer pool */
+ byte *read_buf= static_cast<byte*>(aligned_malloc(srv_page_size,
+ srv_page_size));
+ /* Read the TRX_SYS header to check if we are using the doublewrite buffer */
+ dberr_t err= os_file_read(IORequestRead, file, read_buf,
+ TRX_SYS_PAGE_NO << srv_page_size_shift,
+ srv_page_size);
+
+ if (err != DB_SUCCESS)
+ {
+ ib::error() << "Failed to read the system tablespace header page";
func_exit:
- aligned_free(read_buf);
- return(err);
- }
-
- doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
-
- /* TRX_SYS_PAGE_NO is not encrypted see fil_crypt_rotate_page() */
-
- if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
- == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
- /* The doublewrite buffer has been created */
-
- buf_dblwr_init(doublewrite);
-
- block1 = buf_dblwr->block1;
- block2 = buf_dblwr->block2;
-
- buf = buf_dblwr->write_buf;
- } else {
- err = DB_SUCCESS;
- goto func_exit;
- }
-
- if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED)
- != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) {
-
- /* We are upgrading from a version < 4.1.x to a version where
- multiple tablespaces are supported. We must reset the space id
- field in the pages in the doublewrite buffer because starting
- from this version the space id is stored to
- FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
-
- reset_space_ids = TRUE;
-
- ib::info() << "Resetting space id's in the doublewrite buffer";
- }
-
- /* Read the pages from the doublewrite buffer to memory */
- err = os_file_read(
- read_request,
- file, buf, block1 << srv_page_size_shift,
- TRX_SYS_DOUBLEWRITE_BLOCK_SIZE << srv_page_size_shift);
-
- if (err != DB_SUCCESS) {
-
- ib::error()
- << "Failed to read the first double write buffer "
- "extent";
- goto func_exit;
- }
-
- err = os_file_read(
- read_request,
- file,
- buf + (TRX_SYS_DOUBLEWRITE_BLOCK_SIZE << srv_page_size_shift),
- block2 << srv_page_size_shift,
- TRX_SYS_DOUBLEWRITE_BLOCK_SIZE << srv_page_size_shift);
-
- if (err != DB_SUCCESS) {
-
- ib::error()
- << "Failed to read the second double write buffer "
- "extent";
- goto func_exit;
- }
-
- /* Check if any of these pages is half-written in data files, in the
- intended position */
-
- page = buf;
-
- for (ulint i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
-
- if (reset_space_ids) {
- ulint source_page_no;
-
- space_id = 0;
- mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
- space_id);
- /* We do not need to calculate new checksums for the
- pages because the field .._SPACE_ID does not affect
- them. Write the page back to where we read it from. */
-
- if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
- source_page_no = block1 + i;
- } else {
- source_page_no = block2
- + i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
- }
-
- err = os_file_write(
- IORequestWrite, path, file, page,
- source_page_no << srv_page_size_shift,
- srv_page_size);
- if (err != DB_SUCCESS) {
-
- ib::error()
- << "Failed to write to the double write"
- " buffer";
- goto func_exit;
- }
- } else if (mach_read_from_8(page + FIL_PAGE_LSN)) {
- /* Each valid page header must contain
- a nonzero FIL_PAGE_LSN field. */
- recv_dblwr.add(page);
- }
-
- page += srv_page_size;
- }
-
- if (reset_space_ids) {
- os_file_flush(file);
- }
-
- err = DB_SUCCESS;
- goto func_exit;
+ aligned_free(read_buf);
+ return err;
+ }
+
+  /* TRX_SYS_PAGE_NO is not encrypted; see fil_crypt_rotate_page() */
+ if (mach_read_from_4(TRX_SYS_DOUBLEWRITE_MAGIC + TRX_SYS_DOUBLEWRITE +
+ read_buf) != TRX_SYS_DOUBLEWRITE_MAGIC_N)
+ {
+ /* There is no doublewrite buffer initialized in the TRX_SYS page.
+ This should normally not be possible; the doublewrite buffer should
+ be initialized when creating the database. */
+ err= DB_SUCCESS;
+ goto func_exit;
+ }
+
+ init(TRX_SYS_DOUBLEWRITE + read_buf);
+
+ const bool upgrade_to_innodb_file_per_table=
+ mach_read_from_4(TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED +
+ TRX_SYS_DOUBLEWRITE + read_buf) !=
+ TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N;
+
+ auto write_buf= active_slot->write_buf;
+ /* Read the pages from the doublewrite buffer to memory */
+ err= os_file_read(IORequestRead, file, write_buf,
+ block1.page_no() << srv_page_size_shift,
+ size << srv_page_size_shift);
+
+ if (err != DB_SUCCESS)
+ {
+ ib::error() << "Failed to read the first double write buffer extent";
+ goto func_exit;
+ }
+
+ err= os_file_read(IORequestRead, file,
+ write_buf + (size << srv_page_size_shift),
+ block2.page_no() << srv_page_size_shift,
+ size << srv_page_size_shift);
+ if (err != DB_SUCCESS)
+ {
+ ib::error() << "Failed to read the second double write buffer extent";
+ goto func_exit;
+ }
+
+ byte *page= write_buf;
+
+ if (UNIV_UNLIKELY(upgrade_to_innodb_file_per_table))
+ {
+ ib::info() << "Resetting space id's in the doublewrite buffer";
+
+ for (ulint i= 0; i < size * 2; i++, page += srv_page_size)
+ {
+ memset(page + FIL_PAGE_SPACE_ID, 0, 4);
+ /* For innodb_checksum_algorithm=innodb, we do not need to
+ calculate new checksums for the pages because the field
+ .._SPACE_ID does not affect them. Write the page back to where
+ we read it from. */
+ const ulint source_page_no= i < size
+ ? block1.page_no() + i
+ : block2.page_no() + i - size;
+ err= os_file_write(IORequestWrite, path, file, page,
+ source_page_no << srv_page_size_shift, srv_page_size);
+ if (err != DB_SUCCESS)
+ {
+ ib::error() << "Failed to upgrade the double write buffer";
+ goto func_exit;
+ }
+ }
+ os_file_flush(file);
+ }
+ else
+ for (ulint i= 0; i < size * 2; i++, page += srv_page_size)
+ if (mach_read_from_8(my_assume_aligned<8>(page + FIL_PAGE_LSN)))
+ /* Each valid page header must contain a nonzero FIL_PAGE_LSN field. */
+ recv_sys.dblwr.add(page);
+
+ err= DB_SUCCESS;
+ goto func_exit;
}
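
A minimal POSIX sketch of the extent reads performed above, assuming a pread()-style interface in place of os_file_read(): both extents land in one contiguous buffer, the second immediately after the first, and page numbers become byte offsets via the page-size shift, exactly as in init_or_load_pages().

#include <cstdint>
#include <cstddef>
#include <unistd.h>

// Read both doublewrite extents (size_in_pages pages each) into buf;
// page_size == 1 << shift.
static bool load_dblwr_extents(int fd, unsigned char *buf,
                               uint32_t block1_page_no,
                               uint32_t block2_page_no,
                               uint32_t size_in_pages, unsigned shift)
{
  const size_t extent_bytes = size_t{size_in_pages} << shift;
  return pread(fd, buf, extent_bytes,
               off_t(uint64_t{block1_page_no} << shift))
             == ssize_t(extent_bytes)
      && pread(fd, buf + extent_bytes, extent_bytes,
               off_t(uint64_t{block2_page_no} << shift))
             == ssize_t(extent_bytes);
}
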
/** Process and remove the double write buffer pages for all tablespaces. */
-void
-buf_dblwr_process()
+void buf_dblwr_t::recover()
{
- ut_ad(recv_sys.parse_start_lsn);
-
- ulint page_no_dblwr = 0;
- byte* read_buf;
- recv_dblwr_t& recv_dblwr = recv_sys.dblwr;
-
- if (!buf_dblwr) {
- return;
- }
-
- read_buf = static_cast<byte*>(
- aligned_malloc(3 * srv_page_size, srv_page_size));
- byte* const buf = read_buf + srv_page_size;
-
- for (recv_dblwr_t::list::iterator i = recv_dblwr.pages.begin();
- i != recv_dblwr.pages.end();
- ++i, ++page_no_dblwr) {
- byte* page = *i;
- const ulint page_no = page_get_page_no(page);
-
- if (!page_no) {
- /* page 0 should have been recovered
- already via Datafile::restore_from_doublewrite() */
- continue;
- }
-
- const ulint space_id = page_get_space_id(page);
- const lsn_t lsn = mach_read_from_8(page + FIL_PAGE_LSN);
-
- if (recv_sys.parse_start_lsn > lsn) {
- /* Pages written before the checkpoint are
- not useful for recovery. */
- continue;
- }
-
- const page_id_t page_id(space_id, page_no);
-
- if (recv_sys.scanned_lsn < lsn) {
- ib::warn() << "Ignoring a doublewrite copy of page "
- << page_id
- << " with future log sequence number "
- << lsn;
- continue;
- }
-
- fil_space_t* space = fil_space_acquire_for_io(space_id);
-
- if (!space) {
- /* Maybe we have dropped the tablespace
- and this page once belonged to it: do nothing */
- continue;
- }
-
- fil_space_open_if_needed(space);
-
- if (UNIV_UNLIKELY(page_no >= space->size)) {
-
- /* Do not report the warning for undo
- tablespaces, because they can be truncated in place. */
- if (!srv_is_undo_tablespace(space_id)) {
- ib::warn() << "A copy of page " << page_no
- << " in the doublewrite buffer slot "
- << page_no_dblwr
- << " is beyond the end of tablespace "
- << space->name
- << " (" << space->size << " pages)";
- }
+ ut_ad(recv_sys.parse_start_lsn);
+ if (!is_initialised())
+ return;
+
+ uint32_t page_no_dblwr= 0;
+ byte *read_buf= static_cast<byte*>(aligned_malloc(3 * srv_page_size,
+ srv_page_size));
+ byte *const buf= read_buf + srv_page_size;
+
+ for (recv_dblwr_t::list::iterator i= recv_sys.dblwr.pages.begin();
+ i != recv_sys.dblwr.pages.end(); ++i, ++page_no_dblwr)
+ {
+ byte *page= *i;
+ const uint32_t page_no= page_get_page_no(page);
+ if (!page_no) /* recovered via Datafile::restore_from_doublewrite() */
+ continue;
+
+ const lsn_t lsn= mach_read_from_8(page + FIL_PAGE_LSN);
+ if (recv_sys.parse_start_lsn > lsn)
+ /* Pages written before the checkpoint are not useful for recovery. */
+ continue;
+ const ulint space_id= page_get_space_id(page);
+ const page_id_t page_id(space_id, page_no);
+
+ if (recv_sys.scanned_lsn < lsn)
+ {
+ ib::warn() << "Ignoring a doublewrite copy of page " << page_id
+ << " with future log sequence number " << lsn;
+ continue;
+ }
+
+ fil_space_t *space= fil_space_t::get(space_id);
+
+ if (!space)
+ /* The tablespace that this page once belonged to does not exist */
+ continue;
+
+ if (UNIV_UNLIKELY(page_no >= space->get_size()))
+ {
+ /* Do not report the warning for undo tablespaces, because they
+ can be truncated in place. */
+ if (!srv_is_undo_tablespace(space_id))
+ ib::warn() << "A copy of page " << page_no
+ << " in the doublewrite buffer slot " << page_no_dblwr
+ << " is beyond the end of tablespace " << space->name
+ << " (" << space->size << " pages)";
next_page:
- space->release_for_io();
- continue;
- }
-
- const ulint physical_size = space->physical_size();
- const ulint zip_size = space->zip_size();
- ut_ad(!buf_is_zeroes(span<const byte>(page, physical_size)));
-
- /* We want to ensure that for partial reads the
- unread portion of the page is NUL. */
- memset(read_buf, 0x0, physical_size);
-
- IORequest request;
-
- request.dblwr_recover();
-
- /* Read in the actual page from the file */
- fil_io_t fio = fil_io(
- request, true,
- page_id, zip_size,
- 0, physical_size, read_buf, NULL);
-
- if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) {
- ib::warn()
- << "Double write buffer recovery: "
- << page_id << " read failed with "
- << "error: " << fio.err;
- }
-
- if (fio.node) {
- fio.node->space->release_for_io();
- }
-
- if (buf_is_zeroes(span<const byte>(read_buf, physical_size))) {
- /* We will check if the copy in the
- doublewrite buffer is valid. If not, we will
- ignore this page (there should be redo log
- records to initialize it). */
- } else if (recv_dblwr.validate_page(
- page_id, read_buf, space, buf)) {
- goto next_page;
- } else {
- /* We intentionally skip this message for
- all-zero pages. */
- ib::info()
- << "Trying to recover page " << page_id
- << " from the doublewrite buffer.";
- }
-
- page = recv_dblwr.find_page(page_id, space, buf);
-
- if (!page) {
- goto next_page;
- }
-
- /* Write the good page from the doublewrite buffer to
- the intended position. */
- fio = fil_io(IORequestWrite, true, page_id, zip_size,
- 0, physical_size, page, nullptr);
-
- if (fio.node) {
- ut_ad(fio.err == DB_SUCCESS);
- ib::info() << "Recovered page " << page_id
- << " to '" << fio.node->name
- << "' from the doublewrite buffer.";
- fio.node->space->release_for_io();
- }
-
- goto next_page;
- }
-
- recv_dblwr.pages.clear();
-
- fil_flush_file_spaces();
- aligned_free(read_buf);
+ space->release();
+ continue;
+ }
+
+ const ulint physical_size= space->physical_size();
+ ut_ad(!buf_is_zeroes(span<const byte>(page, physical_size)));
+
+ /* We want to ensure that for partial reads the unread portion of
+ the page is NUL. */
+ memset(read_buf, 0x0, physical_size);
+
+ /* Read in the actual page from the file */
+ fil_io_t fio= space->io(IORequest(IORequest::DBLWR_RECOVER),
+ os_offset_t{page_no} * physical_size,
+ physical_size, read_buf);
+
+ if (UNIV_UNLIKELY(fio.err != DB_SUCCESS))
+ ib::warn() << "Double write buffer recovery: " << page_id
+ << " (tablespace '" << space->name
+ << "') read failed with error: " << fio.err;
+
+ if (buf_is_zeroes(span<const byte>(read_buf, physical_size)))
+ {
+ /* We will check if the copy in the doublewrite buffer is
+ valid. If not, we will ignore this page (there should be redo
+ log records to initialize it). */
+ }
+ else if (recv_sys.dblwr.validate_page(page_id, read_buf, space, buf))
+ goto next_page;
+ else
+ /* We intentionally skip this message for all-zero pages. */
+ ib::info() << "Trying to recover page " << page_id
+ << " from the doublewrite buffer.";
+
+ page= recv_sys.dblwr.find_page(page_id, space, buf);
+
+ if (!page)
+ goto next_page;
+
+ /* Write the good page from the doublewrite buffer to the intended
+ position. */
+ space->reacquire();
+ fio= space->io(IORequestWrite,
+ os_offset_t{page_id.page_no()} * physical_size,
+ physical_size, page);
+
+ if (fio.err == DB_SUCCESS)
+ ib::info() << "Recovered page " << page_id << " to '" << fio.node->name
+ << "' from the doublewrite buffer.";
+ goto next_page;
+ }
+
+ recv_sys.dblwr.pages.clear();
+ fil_flush_file_spaces();
+ aligned_free(read_buf);
}
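
The two LSN checks in recover() amount to a window test against the range established by redo log scanning; a sketch with simplified types (the real code reads FIL_PAGE_LSN with mach_read_from_8() and logs the skipped cases):

#include <cstdint>

enum class dblwr_copy { too_old, from_the_future, usable };

// A doublewrite copy is applied only if
//   parse_start_lsn <= page_lsn <= scanned_lsn.
static dblwr_copy classify(uint64_t page_lsn, uint64_t parse_start_lsn,
                           uint64_t scanned_lsn)
{
  if (page_lsn < parse_start_lsn)
    return dblwr_copy::too_old;         // predates the checkpoint; useless
  if (page_lsn > scanned_lsn)
    return dblwr_copy::from_the_future; // warned about and ignored
  return dblwr_copy::usable;            // candidate for validation/restore
}
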
-/****************************************************************//**
-Frees doublewrite buffer. */
-void
-buf_dblwr_free()
+/** Free the doublewrite buffer. */
+void buf_dblwr_t::close()
{
- /* Free the double write data structures. */
- ut_a(buf_dblwr != NULL);
- ut_ad(buf_dblwr->s_reserved == 0);
- ut_ad(buf_dblwr->b_reserved == 0);
-
- os_event_destroy(buf_dblwr->b_event);
- os_event_destroy(buf_dblwr->s_event);
- aligned_free(buf_dblwr->write_buf);
- ut_free(buf_dblwr->buf_block_arr);
- mutex_free(&buf_dblwr->mutex);
- ut_free(buf_dblwr);
- buf_dblwr = NULL;
+ if (!is_initialised())
+ return;
+
+ /* Free the double write data structures. */
+ ut_ad(!active_slot->reserved);
+ ut_ad(!active_slot->first_free);
+ ut_ad(!batch_running);
+
+ mysql_cond_destroy(&cond);
+ for (int i= 0; i < 2; i++)
+ {
+ aligned_free(slots[i].write_buf);
+ ut_free(slots[i].buf_block_arr);
+ }
+ mysql_mutex_destroy(&mutex);
+
+ memset((void*) this, 0, sizeof *this);
+ active_slot= &slots[0];
}
/** Update the doublewrite buffer on write completion. */
-void buf_dblwr_update(const buf_page_t &bpage, bool single_page)
+void buf_dblwr_t::write_completed()
{
+ ut_ad(this == &buf_dblwr);
ut_ad(srv_use_doublewrite_buf);
- ut_ad(buf_dblwr);
- ut_ad(!fsp_is_system_temporary(bpage.id().space()));
+ ut_ad(is_initialised());
ut_ad(!srv_read_only_mode);
- if (!single_page)
- {
- mutex_enter(&buf_dblwr->mutex);
-
- ut_ad(buf_dblwr->batch_running);
- ut_ad(buf_dblwr->b_reserved > 0);
- ut_ad(buf_dblwr->b_reserved <= buf_dblwr->first_free);
-
- if (!--buf_dblwr->b_reserved)
- {
- mutex_exit(&buf_dblwr->mutex);
- /* This will finish the batch. Sync data files to the disk. */
- fil_flush_file_spaces();
- mutex_enter(&buf_dblwr->mutex);
-
- /* We can now reuse the doublewrite memory buffer: */
- buf_dblwr->first_free= 0;
- buf_dblwr->batch_running= false;
- os_event_set(buf_dblwr->b_event);
- }
+ mysql_mutex_lock(&mutex);
- mutex_exit(&buf_dblwr->mutex);
- return;
- }
+ ut_ad(batch_running);
+ slot *flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
+ ut_ad(flush_slot->reserved);
+ ut_ad(flush_slot->reserved <= flush_slot->first_free);
- ulint size= TRX_SYS_DOUBLEWRITE_BLOCKS * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
- mutex_enter(&buf_dblwr->mutex);
- for (ulint i= srv_doublewrite_batch_size; i < size; ++i)
+ if (!--flush_slot->reserved)
{
- if (buf_dblwr->buf_block_arr[i].bpage != &bpage)
- continue;
- buf_dblwr->s_reserved--;
- buf_dblwr->buf_block_arr[i].bpage= nullptr;
- os_event_set(buf_dblwr->s_event);
- mutex_exit(&buf_dblwr->mutex);
- return;
+ mysql_mutex_unlock(&mutex);
+ /* This will finish the batch. Sync data files to the disk. */
+ fil_flush_file_spaces();
+ mysql_mutex_lock(&mutex);
+
+ /* We can now reuse the doublewrite memory buffer: */
+ flush_slot->first_free= 0;
+ batch_running= false;
+ mysql_cond_broadcast(&cond);
}
- /* The block must exist as a reserved block. */
- ut_error;
+ mysql_mutex_unlock(&mutex);
}
#ifdef UNIV_DEBUG
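
write_completed() above is a plain countdown: each finished data-page write decrements the batch's reservation count, and whichever thread reaches zero syncs the files and recycles the slot. A sketch with the synchronization elided (the struct and names are hypothetical):

#include <cstddef>

struct batch_slot { size_t reserved; size_t first_free; };

// Returns true for the caller that completed the last write of the batch
// and therefore must fsync the data files and make the slot reusable.
static bool note_page_write_completed(batch_slot &s, bool &batch_running)
{
  if (--s.reserved)
    return false;        // other writes of this batch are still in flight
  s.first_free = 0;      // slot memory can be reused
  batch_running = false; // allow the next batch to start
  return true;
}
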
@@ -718,396 +516,243 @@ static void buf_dblwr_check_page_lsn(const page_t* page, const fil_space_t& s)
static void buf_dblwr_check_page_lsn(const buf_page_t &b, const byte *page)
{
- if (fil_space_t *space= fil_space_acquire_for_io(b.id().space()))
+ if (fil_space_t *space= fil_space_t::get(b.id().space()))
{
buf_dblwr_check_page_lsn(page, *space);
- space->release_for_io();
+ space->release();
}
}
-#endif /* UNIV_DEBUG */
-/********************************************************************//**
-Asserts when a corrupt block is find during writing out data to the
-disk. */
-static
-void
-buf_dblwr_assert_on_corrupt_block(
-/*==============================*/
- const buf_block_t* block) /*!< in: block to check */
+/** Check the LSN values on the page with which this block is associated. */
+static void buf_dblwr_check_block(const buf_page_t *bpage)
{
- buf_page_print(block->frame);
-
- ib::fatal() << "Apparent corruption of an index page "
- << block->page.id()
- << " to be written to data file. We intentionally crash"
- " the server to prevent corrupt data from ending up in"
- " data files.";
+ ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
+ const page_t *page= reinterpret_cast<const buf_block_t*>(bpage)->frame;
+
+ switch (fil_page_get_type(page)) {
+ case FIL_PAGE_INDEX:
+ case FIL_PAGE_TYPE_INSTANT:
+ case FIL_PAGE_RTREE:
+ if (page_is_comp(page))
+ {
+ if (page_simple_validate_new(page))
+ return;
+ }
+ else if (page_simple_validate_old(page))
+ return;
+ /* While it is possible that this is not an index page but just
+ happens to have wrongly set FIL_PAGE_TYPE, such pages should never
+  be modified without also adjusting the page type during page
+ allocation or buf_flush_init_for_writing() or
+ fil_block_reset_type(). */
+ buf_page_print(page);
+
+ ib::fatal() << "Apparent corruption of an index page " << bpage->id()
+ << " to be written to data file. We intentionally crash"
+ " the server to prevent corrupt data from ending up in"
+ " data files.";
+ }
}
+#endif /* UNIV_DEBUG */
-/********************************************************************//**
-Check the LSN values on the page with which this block is associated.
-Also validate the page if the option is set. */
-static
-void
-buf_dblwr_check_block(
-/*==================*/
- const buf_block_t* block) /*!< in: block to check */
+bool buf_dblwr_t::flush_buffered_writes(const ulint size)
{
- ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
-
- switch (fil_page_get_type(block->frame)) {
- case FIL_PAGE_INDEX:
- case FIL_PAGE_TYPE_INSTANT:
- case FIL_PAGE_RTREE:
- if (page_is_comp(block->frame)) {
- if (page_simple_validate_new(block->frame)) {
- return;
- }
- } else if (page_simple_validate_old(block->frame)) {
- return;
- }
- /* While it is possible that this is not an index page
- but just happens to have wrongly set FIL_PAGE_TYPE,
- such pages should never be modified to without also
- adjusting the page type during page allocation or
- buf_flush_init_for_writing() or fil_block_reset_type(). */
- break;
- case FIL_PAGE_TYPE_FSP_HDR:
- case FIL_PAGE_IBUF_BITMAP:
- case FIL_PAGE_TYPE_UNKNOWN:
- /* Do not complain again, we already reset this field. */
- case FIL_PAGE_UNDO_LOG:
- case FIL_PAGE_INODE:
- case FIL_PAGE_IBUF_FREE_LIST:
- case FIL_PAGE_TYPE_SYS:
- case FIL_PAGE_TYPE_TRX_SYS:
- case FIL_PAGE_TYPE_XDES:
- case FIL_PAGE_TYPE_BLOB:
- case FIL_PAGE_TYPE_ZBLOB:
- case FIL_PAGE_TYPE_ZBLOB2:
- /* TODO: validate also non-index pages */
- return;
- case FIL_PAGE_TYPE_ALLOCATED:
- /* empty pages should never be flushed */
- return;
- }
-
- buf_dblwr_assert_on_corrupt_block(block);
-}
+ mysql_mutex_assert_owner(&mutex);
+ ut_ad(size == block_size());
-/********************************************************************//**
-Writes a page that has already been written to the doublewrite buffer
-to the datafile. It is the job of the caller to sync the datafile. */
-static void
-buf_dblwr_write_block_to_datafile(const buf_dblwr_t::element &e, bool sync)
-{
- ut_ad(!sync || e.flush == IORequest::SINGLE_PAGE);
- buf_page_t* bpage = e.bpage;
- ut_a(bpage->in_file());
- IORequest request(IORequest::WRITE, bpage, e.flush);
-
- /* We request frame here to get correct buffer in case of
- encryption and/or page compression */
- void * frame = buf_page_get_frame(bpage);
-
- fil_io_t fio;
-
- if (bpage->zip.data) {
- ut_ad(bpage->zip_size());
-
- fio = fil_io(request, sync, bpage->id(), bpage->zip_size(), 0,
- bpage->zip_size(), frame, bpage);
- } else {
- ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
- ut_ad(!bpage->zip_size());
-
- ut_d(buf_dblwr_check_page_lsn(*bpage, static_cast<const byte*>
- (frame)));
- fio = fil_io(request,
- sync, bpage->id(), bpage->zip_size(), 0,
- e.size, frame, bpage);
- }
-
- if (sync && fio.node) {
- ut_ad(fio.err == DB_SUCCESS);
- fio.node->space->release_for_io();
- }
-}
+ for (;;)
+ {
+ if (!active_slot->first_free)
+ return false;
+ if (!batch_running)
+ break;
+ mysql_cond_wait(&cond, &mutex);
+ }
-/********************************************************************//**
-Flushes possible buffered writes from the doublewrite memory buffer to disk.
-It is very important to call this function after a batch of writes has been posted,
-and also when we may have to wait for a page latch! Otherwise a deadlock
-of threads can occur. */
-void
-buf_dblwr_flush_buffered_writes()
-{
- byte* write_buf;
- ulint first_free;
- ulint len;
-
- if (!srv_use_doublewrite_buf || buf_dblwr == NULL) {
- /* Sync the writes to the disk. */
- os_aio_wait_until_no_pending_writes();
- /* Now we flush the data to disk (for example, with fsync) */
- fil_flush_file_spaces();
- return;
- }
-
- ut_ad(!srv_read_only_mode);
-
-try_again:
- mutex_enter(&buf_dblwr->mutex);
-
- /* Write first to doublewrite buffer blocks. We use synchronous
- aio and thus know that file write has been completed when the
- control returns. */
-
- if (buf_dblwr->first_free == 0) {
-
- mutex_exit(&buf_dblwr->mutex);
- return;
- }
-
- if (buf_dblwr->batch_running) {
- /* Another thread is running the batch right now. Wait
- for it to finish. */
- int64_t sig_count = os_event_reset(buf_dblwr->b_event);
- mutex_exit(&buf_dblwr->mutex);
-
- os_event_wait_low(buf_dblwr->b_event, sig_count);
- goto try_again;
- }
-
- ut_ad(buf_dblwr->first_free == buf_dblwr->b_reserved);
-
- /* Disallow anyone else to post to doublewrite buffer or to
- start another batch of flushing. */
- buf_dblwr->batch_running = true;
- first_free = buf_dblwr->first_free;
-
- /* Now safe to release the mutex. Note that though no other
- thread is allowed to post to the doublewrite batch flushing
- but any threads working on single page flushes are allowed
- to proceed. */
- mutex_exit(&buf_dblwr->mutex);
-
- write_buf = buf_dblwr->write_buf;
-
- for (ulint len2 = 0, i = 0;
- i < buf_dblwr->first_free;
- len2 += srv_page_size, i++) {
-
- buf_page_t* bpage= buf_dblwr->buf_block_arr[i].bpage;
-
- if (bpage->state() != BUF_BLOCK_FILE_PAGE || bpage->zip.data) {
- /* No simple validate for compressed
- pages exists. */
- continue;
- }
-
- /* Check that the actual page in the buffer pool is
- not corrupt and the LSN values are sane. */
- buf_dblwr_check_block(reinterpret_cast<buf_block_t*>(bpage));
- ut_d(buf_dblwr_check_page_lsn(*bpage, write_buf + len2));
- }
-
- /* Write out the first block of the doublewrite buffer */
- len = std::min<ulint>(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
- buf_dblwr->first_free) << srv_page_size_shift;
-
- fil_io_t fio = fil_io(IORequestWrite, true,
- page_id_t(TRX_SYS_SPACE, buf_dblwr->block1), 0,
- 0, len, write_buf, nullptr);
- fio.node->space->release_for_io();
-
- if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
- /* No unwritten pages in the second block. */
- goto flush;
- }
-
- /* Write out the second block of the doublewrite buffer. */
- len = (buf_dblwr->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
- << srv_page_size_shift;
-
- write_buf = buf_dblwr->write_buf
- + (TRX_SYS_DOUBLEWRITE_BLOCK_SIZE << srv_page_size_shift);
-
- fio = fil_io(IORequestWrite, true,
- page_id_t(TRX_SYS_SPACE, buf_dblwr->block2), 0,
- 0, len, write_buf, nullptr);
- fio.node->space->release_for_io();
-
-flush:
- /* increment the doublewrite flushed pages counter */
- srv_stats.dblwr_pages_written.add(buf_dblwr->first_free);
- srv_stats.dblwr_writes.inc();
-
- /* Now flush the doublewrite buffer data to disk */
- fil_flush(TRX_SYS_SPACE);
-
- /* We know that the writes have been flushed to disk now
- and in recovery we will find them in the doublewrite buffer
- blocks. Next do the writes to the intended positions. */
-
- /* Up to this point first_free and buf_dblwr->first_free are
- same because we have set the buf_dblwr->batch_running flag
- disallowing any other thread to post any request but we
- can't safely access buf_dblwr->first_free in the loop below.
- This is so because it is possible that after we are done with
- the last iteration and before we terminate the loop, the batch
- gets finished in the IO helper thread and another thread posts
- a new batch setting buf_dblwr->first_free to a higher value.
- If this happens and we are using buf_dblwr->first_free in the
- loop termination condition then we'll end up dispatching
- the same block twice from two different threads. */
- ut_ad(first_free == buf_dblwr->first_free);
- for (ulint i = 0; i < first_free; i++) {
- buf_dblwr_write_block_to_datafile(
- buf_dblwr->buf_block_arr[i], false);
- }
+ ut_ad(active_slot->reserved == active_slot->first_free);
+ ut_ad(!flushing_buffered_writes);
+
+ /* Disallow anyone else to start another batch of flushing. */
+ slot *flush_slot= active_slot;
+ /* Switch the active slot */
+ active_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
+ ut_a(active_slot->first_free == 0);
+ batch_running= true;
+ const ulint old_first_free= flush_slot->first_free;
+ auto write_buf= flush_slot->write_buf;
+ const bool multi_batch= block1 + static_cast<uint32_t>(size) != block2 &&
+ old_first_free > size;
+ flushing_buffered_writes= 1 + multi_batch;
+ /* Now safe to release the mutex. */
+ mysql_mutex_unlock(&mutex);
+#ifdef UNIV_DEBUG
+ for (ulint len2= 0, i= 0; i < old_first_free; len2 += srv_page_size, i++)
+ {
+ buf_page_t *bpage= flush_slot->buf_block_arr[i].request.bpage;
+
+ if (bpage->zip.data)
+ /* No simple validate for ROW_FORMAT=COMPRESSED pages exists. */
+ continue;
+
+ /* Check that the actual page in the buffer pool is not corrupt
+ and the LSN values are sane. */
+ buf_dblwr_check_block(bpage);
+ ut_d(buf_dblwr_check_page_lsn(*bpage, write_buf + len2));
+ }
+#endif /* UNIV_DEBUG */
+ const IORequest request(nullptr, fil_system.sys_space->chain.start,
+ IORequest::DBLWR_BATCH);
+ ut_a(fil_system.sys_space->acquire());
+ if (multi_batch)
+ {
+ fil_system.sys_space->reacquire();
+ os_aio(request, write_buf,
+ os_offset_t{block1.page_no()} << srv_page_size_shift,
+ size << srv_page_size_shift);
+ os_aio(request, write_buf + (size << srv_page_size_shift),
+ os_offset_t{block2.page_no()} << srv_page_size_shift,
+ (old_first_free - size) << srv_page_size_shift);
+ }
+ else
+ os_aio(request, write_buf,
+ os_offset_t{block1.page_no()} << srv_page_size_shift,
+ old_first_free << srv_page_size_shift);
+ srv_stats.data_written.add(old_first_free);
+ return true;
}
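
A compact single-threaded model of the double buffering performed by flush_buffered_writes(): writers append to the active slot, and starting a batch flips the slots so new writes can accumulate while the previous slot is flushed. The mutex, condition variable, and I/O are omitted; the names are illustrative, not the real members:

#include <cstddef>

struct slot_model { size_t first_free = 0; };

struct dblwr_model
{
  slot_model slots[2];
  slot_model *active = &slots[0];
  bool batch_running = false;

  // Returns the slot to write out, or nullptr if there is nothing to do
  // (the real code waits on a condition variable instead).
  slot_model *start_batch()
  {
    if (!active->first_free || batch_running)
      return nullptr;
    slot_model *flushing = active;                        // freeze this slot
    active = active == &slots[0] ? &slots[1] : &slots[0]; // switch slots
    batch_running = true;    // only one batch may run at a time
    return flushing;
  }
};
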
-/** Schedule a page write. If the doublewrite memory buffer is full,
-buf_dblwr_flush_buffered_writes() will be invoked to make space.
-@param bpage buffer pool page to be written
-@param flush type of flush
-@param size payload size in bytes */
-void buf_dblwr_t::add_to_batch(buf_page_t *bpage, IORequest::flush_t flush,
- size_t size)
+void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request)
{
- ut_ad(bpage->in_file());
- ut_ad(flush == IORequest::LRU || flush == IORequest::FLUSH_LIST);
+ ut_ad(this == &buf_dblwr);
+ ut_ad(srv_use_doublewrite_buf);
+ ut_ad(is_initialised());
+ ut_ad(!srv_read_only_mode);
+ ut_ad(!request.bpage);
+ ut_ad(request.node == fil_system.sys_space->chain.start);
+ ut_ad(request.type == IORequest::DBLWR_BATCH);
+ mysql_mutex_lock(&mutex);
+ ut_ad(batch_running);
+ ut_ad(flushing_buffered_writes);
+ ut_ad(flushing_buffered_writes <= 2);
+ const bool completed= !--flushing_buffered_writes;
+ mysql_mutex_unlock(&mutex);
+
+ if (!completed)
+ return;
-try_again:
- mutex_enter(&mutex);
+ slot *const flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
+ ut_ad(flush_slot->reserved == flush_slot->first_free);
+ /* increment the doublewrite flushed pages counter */
+ srv_stats.dblwr_pages_written.add(flush_slot->first_free);
+ srv_stats.dblwr_writes.inc();
- ut_a(first_free <= srv_doublewrite_batch_size);
+ /* Now flush the doublewrite buffer data to disk */
+ fil_system.sys_space->flush();
- if (batch_running)
+ /* The writes have been flushed to disk now and in recovery we will
+ find them in the doublewrite buffer blocks. Next, write the data pages. */
+ for (ulint i= 0, first_free= flush_slot->first_free; i < first_free; i++)
{
- /* This not nearly as bad as it looks. There is only page_cleaner
- thread which does background flushing in batches therefore it is
- unlikely to be a contention point. The only exception is when a
- user thread is forced to do a flush batch because of a sync
- checkpoint. */
- int64_t sig_count= os_event_reset(b_event);
- mutex_exit(&mutex);
-
- os_event_wait_low(b_event, sig_count);
- goto try_again;
- }
+ auto e= flush_slot->buf_block_arr[i];
+ buf_page_t* bpage= e.request.bpage;
+ ut_ad(bpage->in_file());
- if (first_free == srv_doublewrite_batch_size)
- {
- mutex_exit(&mutex);
- buf_dblwr_flush_buffered_writes();
- goto try_again;
- }
+ /* We request frame here to get correct buffer in case of
+ encryption and/or page compression */
+ void *frame= buf_page_get_frame(bpage);
- byte *p= write_buf + srv_page_size * first_free;
+ auto e_size= e.size;
- /* We request frame here to get correct buffer in case of
- encryption and/or page compression */
- void * frame = buf_page_get_frame(bpage);
+ if (UNIV_LIKELY_NULL(bpage->zip.data))
+ {
+ e_size= bpage->zip_size();
+ ut_ad(e_size);
+ }
+ else
+ {
+ ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
+ ut_ad(!bpage->zip_size());
+ ut_d(buf_dblwr_check_page_lsn(*bpage, static_cast<const byte*>(frame)));
+ }
- memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(p, frame, size);
- ut_ad(!bpage->zip_size() || bpage->zip_size() == size);
- buf_block_arr[first_free++] = { bpage, flush, size };
- b_reserved++;
+ e.request.node->space->io(e.request, bpage->physical_offset(), e_size,
+ frame, bpage);
+ }
+}
- ut_ad(!batch_running);
- ut_ad(first_free == b_reserved);
- ut_ad(b_reserved <= srv_doublewrite_batch_size);
+/** Flush possible buffered writes to persistent storage.
+It is very important to call this function after a batch of writes has been
+posted, and also when we may have to wait for a page latch!
+Otherwise a deadlock of threads can occur. */
+void buf_dblwr_t::flush_buffered_writes()
+{
+ if (!is_initialised() || !srv_use_doublewrite_buf)
+ {
+ os_aio_wait_until_no_pending_writes();
+ fil_flush_file_spaces();
+ return;
+ }
- const bool need_flush= first_free == srv_doublewrite_batch_size;
- mutex_exit(&mutex);
+ ut_ad(!srv_read_only_mode);
+ const ulint size= block_size();
- if (need_flush)
- buf_dblwr_flush_buffered_writes();
+ mysql_mutex_lock(&mutex);
+ if (!flush_buffered_writes(size))
+ mysql_mutex_unlock(&mutex);
}
-/** Write a page to the doublewrite buffer on disk, sync it, then write
-the page to the datafile and sync the datafile. This function is used
-for single page flushes. If all the buffers allocated for single page
-flushes in the doublewrite buffer are in use we wait here for one to
-become free. We are guaranteed that a slot will become free because any
-thread that is using a slot must also release the slot before leaving
-this function.
-@param bpage buffer pool page to be written
-@param sync whether synchronous operation is requested
-@param size payload size in bytes */
-void buf_dblwr_t::write_single_page(buf_page_t *bpage, bool sync, size_t size)
+/** Schedule a page write. If the doublewrite memory buffer is full,
+flush_buffered_writes() will be invoked to make space.
+@param request asynchronous write request
+@param size payload size in bytes */
+void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size)
{
- ut_ad(bpage->in_file());
- ut_ad(srv_use_doublewrite_buf);
- ut_ad(this == buf_dblwr);
+ ut_ad(request.is_async());
+ ut_ad(request.is_write());
+ ut_ad(request.bpage);
+ ut_ad(request.bpage->in_file());
+ ut_ad(request.node);
+ ut_ad(request.node->space->id == request.bpage->id().space());
+ ut_ad(request.node->space->referenced());
+ ut_ad(!srv_read_only_mode);
- /* total number of slots available for single page flushes
- starts from srv_doublewrite_batch_size to the end of the buffer. */
- ulint slots = TRX_SYS_DOUBLEWRITE_BLOCKS * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
- ut_a(slots > srv_doublewrite_batch_size);
- ulint n_slots= slots - srv_doublewrite_batch_size;
+ const ulint buf_size= 2 * block_size();
- if (bpage->state() == BUF_BLOCK_FILE_PAGE)
- {
- /* Check that the actual page in the buffer pool is not corrupt
- and the LSN values are sane. */
- buf_dblwr_check_block(reinterpret_cast<buf_block_t*>(bpage));
-#ifdef UNIV_DEBUG
- /* Check that the page as written to the doublewrite buffer has
- sane LSN values. */
- if (!bpage->zip.data)
- buf_dblwr_check_page_lsn(*bpage, reinterpret_cast<buf_block_t*>
- (bpage)->frame);
-#endif
- }
+ mysql_mutex_lock(&mutex);
-retry:
- mutex_enter(&mutex);
- if (s_reserved == n_slots)
+ for (;;)
{
- /* All slots are reserved. */
- int64_t sig_count = os_event_reset(s_event);
- mutex_exit(&mutex);
- os_event_wait_low(s_event, sig_count);
- goto retry;
- }
-
- ulint i;
- for (i = srv_doublewrite_batch_size; i < slots; ++i)
- if (!buf_block_arr[i].bpage)
- goto found;
- /* We are guaranteed to find a slot. */
- ut_error;
-found:
- s_reserved++;
- buf_block_arr[i]= { bpage, IORequest::SINGLE_PAGE, size };
-
- /* increment the doublewrite flushed pages counter */
- srv_stats.dblwr_pages_written.inc();
- srv_stats.dblwr_writes.inc();
+ ut_ad(active_slot->first_free <= buf_size);
+ if (active_slot->first_free != buf_size)
+ break;
- mutex_exit(&mutex);
+ if (flush_buffered_writes(buf_size / 2))
+ mysql_mutex_lock(&mutex);
+ }
- const ulint offset= i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
- ? block1 + i
- : block2 + i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+ byte *p= active_slot->write_buf + srv_page_size * active_slot->first_free;
/* We request frame here to get correct buffer in case of
encryption and/or page compression */
- void * frame = buf_page_get_frame(bpage);
- ut_ad(!bpage->zip_size() || bpage->zip_size() == size);
- fil_io_t fio= fil_io(IORequestWrite, true, page_id_t(TRX_SYS_SPACE, offset),
- 0, 0, size, frame, nullptr);
- fio.node->space->release_for_io();
-
- /* Now flush the doublewrite buffer data to disk */
- fil_flush(TRX_SYS_SPACE);
-
- /* We know that the write has been flushed to disk now
- and during recovery we will find it in the doublewrite buffer
- blocks. Next do the write to the intended position. */
- buf_dblwr_write_block_to_datafile({bpage, IORequest::SINGLE_PAGE, size},
- sync);
+ void *frame= buf_page_get_frame(request.bpage);
+
+ /* "frame" is at least 1024-byte aligned for ROW_FORMAT=COMPRESSED pages,
+ and at least srv_page_size (4096-byte) for everything else. */
+ memcpy_aligned<UNIV_ZIP_SIZE_MIN>(p, frame, size);
+ /* fil_page_compress() for page_compressed guarantees 256-byte alignment */
+ memset_aligned<256>(p + size, 0, srv_page_size - size);
+ /* FIXME: Inform the compiler that "size" and "srv_page_size - size"
+ are integer multiples of 256, so the above can translate into simple
+ SIMD instructions. Currently, we make no such assumptions about the
+ non-pointer parameters that are passed to the _aligned templates. */
+ ut_ad(!request.bpage->zip_size() || request.bpage->zip_size() == size);
+ ut_ad(active_slot->reserved == active_slot->first_free);
+ ut_ad(active_slot->reserved < buf_size);
+ new (active_slot->buf_block_arr + active_slot->first_free++)
+ element{request, size};
+ active_slot->reserved= active_slot->first_free;
+
+ if (active_slot->first_free != buf_size ||
+ !flush_buffered_writes(buf_size / 2))
+ mysql_mutex_unlock(&mutex);
}
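
The copy-and-pad step in add_to_batch() guarantees that every doublewrite slot holds exactly srv_page_size bytes even when the payload is shorter (ROW_FORMAT=COMPRESSED or page_compressed). A sketch of just that step, with plain memcpy()/memset() standing in for the *_aligned templates:

#include <cstring>
#include <cstddef>

// Copy the page image into the slot and zero the tail, so the slot always
// contains a full page-size image regardless of the payload size.
static void fill_dblwr_slot(unsigned char *slot_buf, const void *frame,
                            size_t payload, size_t page_size)
{
  memcpy(slot_buf, frame, payload);
  memset(slot_buf + payload, 0, page_size - payload);
}
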
diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc
index c67273cfec2..037128cbee2 100644
--- a/storage/innobase/buf/buf0dump.cc
+++ b/storage/innobase/buf/buf0dump.cc
@@ -280,13 +280,13 @@ buf_dump(
ulint n_pages;
ulint j;
- mutex_enter(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
n_pages = UT_LIST_GET_LEN(buf_pool.LRU);
/* skip empty buffer pools */
if (n_pages == 0) {
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
goto done;
}
@@ -314,7 +314,7 @@ buf_dump(
n_pages * sizeof(*dump)));
if (dump == NULL) {
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
fclose(f);
buf_dump_status(STATUS_ERR,
"Cannot allocate " ULINTPF " bytes: %s",
@@ -339,7 +339,7 @@ buf_dump(
dump[j++] = id;
}
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
ut_a(j <= n_pages);
n_pages = j;
@@ -493,8 +493,8 @@ buf_load()
page_id_t* dump;
ulint dump_n;
ulint i;
- ulint space_id;
- ulint page_no;
+ uint32_t space_id;
+ uint32_t page_no;
int fscanf_ret;
/* Ignore any leftovers from before */
@@ -518,7 +518,7 @@ buf_load()
This file is tiny (approx 500KB per 1GB buffer pool), reading it
two times is fine. */
dump_n = 0;
- while (fscanf(f, ULINTPF "," ULINTPF, &space_id, &page_no) == 2
+ while (fscanf(f, "%u,%u", &space_id, &page_no) == 2
&& !SHUTTING_DOWN()) {
dump_n++;
}
@@ -569,8 +569,7 @@ buf_load()
export_vars.innodb_buffer_pool_load_incomplete = 1;
for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
- fscanf_ret = fscanf(f, ULINTPF "," ULINTPF,
- &space_id, &page_no);
+ fscanf_ret = fscanf(f, "%u,%u", &space_id, &page_no);
if (fscanf_ret != 2) {
if (feof(f)) {
@@ -592,9 +591,8 @@ buf_load()
fclose(f);
buf_load_status(STATUS_ERR,
"Error parsing '%s': bogus"
- " space,page " ULINTPF "," ULINTPF
- " at line " ULINTPF ","
- " unable to load buffer pool",
+ " space,page %u,%u at line " ULINTPF
+ ", unable to load buffer pool",
full_filename,
space_id, page_no,
i);
@@ -627,11 +625,11 @@ buf_load()
ulint last_check_time = 0;
ulint last_activity_cnt = 0;
- /* Avoid calling the expensive fil_space_acquire_silent() for each
+ /* Avoid calling the expensive fil_space_t::get() for each
page within the same tablespace. dump[] is sorted by (space, page),
so all pages from a given tablespace are consecutive. */
ulint cur_space_id = dump[0].space();
- fil_space_t* space = fil_space_acquire_silent(cur_space_id);
+ fil_space_t* space = fil_space_t::get(cur_space_id);
ulint zip_size = space ? space->zip_size() : 0;
PSI_stage_progress* pfs_stage_progress __attribute__((unused))
@@ -650,31 +648,40 @@ buf_load()
}
if (this_space_id != cur_space_id) {
- if (space != NULL) {
+ if (space) {
space->release();
}
cur_space_id = this_space_id;
- space = fil_space_acquire_silent(cur_space_id);
+ space = fil_space_t::get(cur_space_id);
- if (space != NULL) {
- zip_size = space->zip_size();
+ if (!space) {
+ continue;
}
+
+ zip_size = space->zip_size();
}
/* JAN: TODO: As we use background page read below,
 	if the tablespace is encrypted we can't use it. */
- if (space == NULL ||
- (space && space->crypt_data &&
- space->crypt_data->encryption != FIL_ENCRYPTION_OFF &&
- space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)) {
+ if (!space || dump[i].page_no() >= space->get_size() ||
+ (space->crypt_data &&
+ space->crypt_data->encryption != FIL_ENCRYPTION_OFF &&
+ space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)) {
+ continue;
+ }
+
+ if (space->is_stopping()) {
+ space->release();
+ space = nullptr;
continue;
}
- buf_read_page_background(dump[i], zip_size, true);
+ space->reacquire();
+ buf_read_page_background(space, dump[i], zip_size, true);
if (buf_load_abort_flag) {
- if (space != NULL) {
+ if (space) {
space->release();
}
buf_load_abort_flag = false;
@@ -702,7 +709,7 @@ buf_load()
#endif
}
- if (space != NULL) {
+ if (space) {
space->release();
}
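
The buf_load() loop above leans on dump[] being sorted by (space, page): the tablespace handle is refreshed only when the space id changes. A self-contained sketch of that caching pattern; space_t, space_get() and space_release() are stubs standing in for fil_space_t::get() and release():

#include <cstdint>
#include <utility>
#include <vector>

struct space_t { uint32_t id; };
static space_t *space_get(uint32_t) { return nullptr; } // stub lookup
static void space_release(space_t *) {}                 // stub release

static void load_pages(const std::vector<std::pair<uint32_t,uint32_t>> &dump)
{
  space_t *space = nullptr;
  uint32_t cur_id = UINT32_MAX;   // sentinel: nothing cached yet
  for (const auto &d : dump)
  {
    if (d.first != cur_id)        // crossed into the next tablespace
    {
      if (space)
        space_release(space);
      cur_id = d.first;
      space = space_get(cur_id);  // may fail if the space was dropped
    }
    if (!space)
      continue;                   // skip all pages of a missing space
    // ... schedule a background read of page (cur_id, d.second) ...
  }
  if (space)
    space_release(space);
}
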
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
index 8f7e2b04782..ac6c45deeab 100644
--- a/storage/innobase/buf/buf0flu.cc
+++ b/storage/innobase/buf/buf0flu.cc
@@ -26,6 +26,7 @@ Created 11/11/1995 Heikki Tuuri
*******************************************************/
#include "univ.i"
+#include <my_service_manager.h>
#include <mysql/service_thd_wait.h>
#include <sql_class.h>
@@ -34,20 +35,10 @@ Created 11/11/1995 Heikki Tuuri
#include "buf0checksum.h"
#include "buf0dblwr.h"
#include "srv0start.h"
-#include "srv0srv.h"
#include "page0zip.h"
-#include "ut0byte.h"
-#include "page0page.h"
#include "fil0fil.h"
-#include "buf0lru.h"
-#include "buf0rea.h"
-#include "ibuf0ibuf.h"
-#include "log0log.h"
#include "log0crypt.h"
-#include "os0file.h"
-#include "trx0sys.h"
#include "srv0mon.h"
-#include "ut0stage.h"
#include "fil0pagecompress.h"
#ifdef UNIV_LINUX
/* include defs for CPU time priority settings */
@@ -55,138 +46,46 @@ Created 11/11/1995 Heikki Tuuri
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/resource.h>
-static const int buf_flush_page_cleaner_priority = -20;
#endif /* UNIV_LINUX */
#ifdef HAVE_LZO
-#include "lzo/lzo1x.h"
-#endif
-
-#ifdef HAVE_SNAPPY
-#include "snappy-c.h"
+# include "lzo/lzo1x.h"
+#elif defined HAVE_SNAPPY
+# include "snappy-c.h"
#endif
-/** Sleep time in microseconds for loop waiting for the oldest
-modification lsn */
-static const ulint buf_flush_wait_flushed_sleep_time = 10000;
-
-#include <my_service_manager.h>
-
-/** Number of pages flushed through non flush_list flushes. */
+/** Number of pages flushed via LRU. Protected by buf_pool.mutex.
+Also included in buf_flush_page_count. */
ulint buf_lru_flush_page_count;
-/** Flag indicating if the page_cleaner is in active state. This flag
-is set to TRUE by the page_cleaner thread when it is spawned and is set
-back to FALSE at shutdown by the page_cleaner as well. Therefore no
-need to protect it by a mutex. It is only ever read by the thread
-doing the shutdown */
+/** Number of pages flushed. Protected by buf_pool.mutex. */
+ulint buf_flush_page_count;
+
+/** Flag indicating if the page_cleaner is in active state. */
bool buf_page_cleaner_is_active;
/** Factor for scan length to determine n_pages for intended oldest LSN
progress */
-static ulint buf_flush_lsn_scan_factor = 3;
+static constexpr ulint buf_flush_lsn_scan_factor = 3;
/** Average redo generation rate */
static lsn_t lsn_avg_rate = 0;
-/** Target oldest LSN for the requested flush_sync */
-static lsn_t buf_flush_sync_lsn = 0;
+/** Target oldest_modification for the page cleaner; writes are protected by
+buf_pool.flush_list_mutex */
+static Atomic_relaxed<lsn_t> buf_flush_sync_lsn;
#ifdef UNIV_PFS_THREAD
mysql_pfs_key_t page_cleaner_thread_key;
#endif /* UNIV_PFS_THREAD */
-/** Event to synchronise with the flushing. */
-os_event_t buf_flush_event;
-
-static void pc_flush_slot_func(void *);
-static tpool::task_group page_cleaner_task_group(1);
-static tpool::waitable_task pc_flush_slot_task(
- pc_flush_slot_func, 0, &page_cleaner_task_group);
-
-/** State for page cleaner array slot */
-enum page_cleaner_state_t {
- /** Not requested any yet. Moved from FINISHED. */
- PAGE_CLEANER_STATE_NONE = 0,
- /** Requested but not started flushing. Moved from NONE. */
- PAGE_CLEANER_STATE_REQUESTED,
- /** Flushing is on going. Moved from REQUESTED. */
- PAGE_CLEANER_STATE_FLUSHING,
- /** Flushing was finished. Moved from FLUSHING. */
- PAGE_CLEANER_STATE_FINISHED
-};
-
-/** Page cleaner request state for buf_pool */
-struct page_cleaner_slot_t {
- page_cleaner_state_t state; /*!< state of the request.
- protected by page_cleaner_t::mutex
- if the worker thread got the slot and
- set to PAGE_CLEANER_STATE_FLUSHING,
- n_flushed_lru and n_flushed_list can be
- updated only by the worker thread */
- /* This value is set during state==PAGE_CLEANER_STATE_NONE */
- ulint n_pages_requested;
- /*!< number of requested pages
- for the slot */
- /* These values are updated during state==PAGE_CLEANER_STATE_FLUSHING,
- and commited with state==PAGE_CLEANER_STATE_FINISHED.
- The consistency is protected by the 'state' */
- ulint n_flushed_lru;
- /*!< number of flushed pages
- by LRU scan flushing */
- ulint n_flushed_list;
- /*!< number of flushed pages
- by flush_list flushing */
- bool succeeded_list;
- /*!< true if flush_list flushing
- succeeded. */
- ulint flush_lru_time;
- /*!< elapsed time for LRU flushing */
- ulint flush_list_time;
- /*!< elapsed time for flush_list
- flushing */
- ulint flush_lru_pass;
- /*!< count to attempt LRU flushing */
- ulint flush_list_pass;
- /*!< count to attempt flush_list
- flushing */
-};
-
/** Page cleaner structure */
-struct page_cleaner_t {
- /* FIXME: do we need mutex? use atomics? */
- ib_mutex_t mutex; /*!< mutex to protect whole of
- page_cleaner_t struct and
- page_cleaner_slot_t slots. */
- os_event_t is_finished; /*!< event to signal that all
- slots were finished. */
- bool requested; /*!< true if requested pages
- to flush */
- lsn_t lsn_limit; /*!< upper limit of LSN to be
- flushed */
-#if 1 /* FIXME: use bool for these, or remove some of these */
- ulint n_slots_requested;
- /*!< number of slots
- in the state
- PAGE_CLEANER_STATE_REQUESTED */
- ulint n_slots_flushing;
- /*!< number of slots
- in the state
- PAGE_CLEANER_STATE_FLUSHING */
- ulint n_slots_finished;
- /*!< number of slots
- in the state
- PAGE_CLEANER_STATE_FINISHED */
-#endif
- ulint flush_time; /*!< elapsed time to flush
- requests for all slots */
- ulint flush_pass; /*!< count to finish to flush
- requests for all slots */
- page_cleaner_slot_t slot;
- bool is_running; /*!< false if attempt
- to shutdown */
-};
-
-static page_cleaner_t page_cleaner;
+static struct
+{
+ /** total elapsed time in adaptive flushing, in seconds */
+ ulint flush_time;
+ /** number of adaptive flushing passes */
+ ulint flush_pass;
+} page_cleaner;
#ifdef UNIV_DEBUG
my_bool innodb_page_cleaner_disabled_debug;
@@ -200,15 +99,6 @@ in thrashing. */
/* @} */
-/** Increases flush_list size in bytes with the page size */
-static inline void incr_flush_list_size_in_bytes(const buf_block_t* block)
-{
- /* FIXME: use std::atomic! */
- ut_ad(mutex_own(&buf_pool.flush_list_mutex));
- buf_pool.stat.flush_list_bytes += block->physical_size();
- ut_ad(buf_pool.stat.flush_list_bytes <= buf_pool.curr_pool_size);
-}
-
#ifdef UNIV_DEBUG
/** Validate the flush list. */
static void buf_flush_validate_low();
@@ -236,223 +126,122 @@ static void buf_flush_validate_skip()
}
#endif /* UNIV_DEBUG */
-/******************************************************************//**
-Insert a block in the flush_rbt and returns a pointer to its
-predecessor or NULL if no predecessor. The ordering is maintained
-on the basis of the <oldest_modification, space, offset> key.
-@return pointer to the predecessor or NULL if no predecessor. */
-static
-buf_page_t*
-buf_flush_insert_in_flush_rbt(
-/*==========================*/
- buf_page_t* bpage) /*!< in: bpage to be inserted. */
-{
- const ib_rbt_node_t* c_node;
- const ib_rbt_node_t* p_node;
- buf_page_t* prev = NULL;
-
- ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE);
- ut_ad(mutex_own(&buf_pool.flush_list_mutex));
-
- /* Insert this buffer into the rbt. */
- c_node = rbt_insert(buf_pool.flush_rbt, &bpage, &bpage);
- ut_a(c_node != NULL);
-
- /* Get the predecessor. */
- p_node = rbt_prev(buf_pool.flush_rbt, c_node);
-
- if (p_node != NULL) {
- buf_page_t** value;
- value = rbt_value(buf_page_t*, p_node);
- prev = *value;
- ut_a(prev != NULL);
- }
-
- return(prev);
-}
-
-/*********************************************************//**
-Delete a bpage from the flush_rbt. */
-static
-void
-buf_flush_delete_from_flush_rbt(
-/*============================*/
- buf_page_t* bpage) /*!< in: bpage to be removed. */
-{
- ut_ad(mutex_own(&buf_pool.flush_list_mutex));
-
-#ifdef UNIV_DEBUG
- ibool ret =
-#endif /* UNIV_DEBUG */
- rbt_delete(buf_pool.flush_rbt, &bpage);
-
- ut_ad(ret);
-}
-
-/*****************************************************************//**
-Compare two modified blocks in the buffer pool. The key for comparison
-is:
-key = <oldest_modification, space, offset>
-This comparison is used to maintian ordering of blocks in the
-buf_pool.flush_rbt.
-Note that for the purpose of flush_rbt, we only need to order blocks
-on the oldest_modification. The other two fields are used to uniquely
-identify the blocks.
-@return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
-static
-int
-buf_flush_block_cmp(
-/*================*/
- const void* p1, /*!< in: block1 */
- const void* p2) /*!< in: block2 */
-{
- const buf_page_t* b1 = *static_cast<const buf_page_t*const*>(p1);
- const buf_page_t* b2 = *static_cast<const buf_page_t*const*>(p2);
-
- ut_ad(b1 != NULL);
- ut_ad(b2 != NULL);
-
- ut_ad(mutex_own(&buf_pool.flush_list_mutex));
-
- const lsn_t m1 = b1->oldest_modification(),
- m2 = b2->oldest_modification();
-
- ut_ad(m1);
- ut_ad(m2);
-
- if (m2 > m1) {
- return(1);
- } else if (m2 < m1) {
- return(-1);
- }
-
- if (b2->id() > b1->id()) {
- return 1;
- }
- if (b2->id() < b1->id()) {
- return -1;
- }
- return 0;
-}
-
-/********************************************************************//**
-Initialize the red-black tree to speed up insertions into the flush_list
-during recovery process. Should be called at the start of recovery
-process before any page has been read/written. */
-void
-buf_flush_init_flush_rbt(void)
-/*==========================*/
-{
- mutex_enter(&buf_pool.flush_list_mutex);
- ut_ad(buf_pool.flush_rbt == NULL);
- /* Create red black tree for speedy insertions in flush list. */
- buf_pool.flush_rbt = rbt_create(
- sizeof(buf_page_t*), buf_flush_block_cmp);
- mutex_exit(&buf_pool.flush_list_mutex);
-}
-
-/********************************************************************//**
-Frees up the red-black tree. */
-void
-buf_flush_free_flush_rbt(void)
-/*==========================*/
-{
- mutex_enter(&buf_pool.flush_list_mutex);
- ut_d(buf_flush_validate_low());
- rbt_free(buf_pool.flush_rbt);
- buf_pool.flush_rbt = NULL;
- mutex_exit(&buf_pool.flush_list_mutex);
-}
-
/** Insert a modified block into the flush list.
@param[in,out] block modified block
@param[in] lsn oldest modification */
void buf_flush_insert_into_flush_list(buf_block_t* block, lsn_t lsn)
{
- ut_ad(!mutex_own(&buf_pool.mutex));
- ut_ad(log_flush_order_mutex_own());
+ mysql_mutex_assert_not_owner(&buf_pool.mutex);
+ mysql_mutex_assert_owner(&log_sys.flush_order_mutex);
ut_ad(lsn);
- mutex_enter(&buf_pool.flush_list_mutex);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
block->page.set_oldest_modification(lsn);
MEM_CHECK_DEFINED(block->page.zip.data
- ? block->page.zip.data : block->frame,
- block->physical_size());
- incr_flush_list_size_in_bytes(block);
-
- if (UNIV_LIKELY_NULL(buf_pool.flush_rbt)) {
- ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE);
- /* The field in_LRU_list is protected by buf_pool.mutex, which
- we are not holding. However, while a block is in the flush
- list, it is dirty and cannot be discarded, not from the
- page_hash or from the LRU list. At most, the uncompressed
- page frame of a compressed block may be discarded or created
- (copying the block->page to or from a buf_page_t that is
- dynamically allocated from buf_buddy_alloc()). Because those
- transitions hold buf_pool.flush_list_mutex (via
- buf_flush_relocate_on_flush_list()), there is no possibility
- of a race condition in the assertions below. */
- ut_ad(block->page.in_LRU_list);
- /* buf_buddy_block_register() will take a block in the
- BUF_BLOCK_MEMORY state, not a file page. */
- ut_ad(!block->page.in_zip_hash);
-
- if (buf_page_t* prev_b =
- buf_flush_insert_in_flush_rbt(&block->page)) {
- UT_LIST_INSERT_AFTER(buf_pool.flush_list, prev_b, &block->page);
- goto func_exit;
- }
- }
+ ? block->page.zip.data : block->frame,
+ block->physical_size());
+ buf_pool.stat.flush_list_bytes += block->physical_size();
+ ut_ad(buf_pool.stat.flush_list_bytes <= buf_pool.curr_pool_size);
UT_LIST_ADD_FIRST(buf_pool.flush_list, &block->page);
-func_exit:
ut_d(buf_flush_validate_skip());
- mutex_exit(&buf_pool.flush_list_mutex);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
}
/** Remove a block from the flush list of modified blocks.
-@param[in] bpage block to be removed from the flush list */
-void buf_flush_remove(buf_page_t* bpage)
+@param[in,out] bpage block to be removed from the flush list */
+static void buf_flush_remove(buf_page_t *bpage)
{
-#if 0 // FIXME: Rate-limit the output. Move this to the page cleaner?
- if (UNIV_UNLIKELY(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE)) {
- service_manager_extend_timeout(
- INNODB_EXTEND_TIMEOUT_INTERVAL,
- "Flush and remove page with tablespace id %u"
- ", flush list length " ULINTPF,
- bpage->space, UT_LIST_GET_LEN(buf_pool.flush_list));
- }
-#endif
- ut_ad(mutex_own(&buf_pool.mutex));
- mutex_enter(&buf_pool.flush_list_mutex);
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
/* Important that we adjust the hazard pointer before removing
the bpage from flush list. */
buf_pool.flush_hp.adjust(bpage);
UT_LIST_REMOVE(buf_pool.flush_list, bpage);
-
- /* If the flush_rbt is active then delete from there as well. */
- if (UNIV_LIKELY_NULL(buf_pool.flush_rbt)) {
- buf_flush_delete_from_flush_rbt(bpage);
- }
-
- /* Must be done after we have removed it from the flush_rbt
- because we assert on it in buf_flush_block_cmp(). */
bpage->clear_oldest_modification();
-#ifdef UNIV_DEBUG
- if (bpage->state() == BUF_BLOCK_ZIP_PAGE) {
- buf_LRU_insert_zip_clean(bpage);
- }
-#endif /* UNIV_DEBUG */
-
buf_pool.stat.flush_list_bytes -= bpage->physical_size();
#ifdef UNIV_DEBUG
buf_flush_validate_skip();
#endif /* UNIV_DEBUG */
+}
+
+/** Remove all dirty pages belonging to a given tablespace when we are
+deleting the data file of that tablespace.
+The pages still remain a part of LRU and are evicted from
+the list as they age towards the tail of the LRU.
+@param id tablespace identifier */
+void buf_flush_remove_pages(ulint id)
+{
+ const page_id_t first(id, 0), end(id + 1, 0);
+ ut_ad(id);
+ mysql_mutex_lock(&buf_pool.mutex);
- mutex_exit(&buf_pool.flush_list_mutex);
+ for (;;)
+ {
+ bool deferred= false;
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
+ {
+ ut_d(const auto s= bpage->state());
+ ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE ||
+ s == BUF_BLOCK_REMOVE_HASH);
+ buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
+
+ const page_id_t bpage_id(bpage->id());
+
+ if (bpage_id < first || bpage_id >= end);
+ else if (bpage->io_fix() != BUF_IO_NONE)
+ deferred= true;
+ else
+ buf_flush_remove(bpage);
+
+ bpage= prev;
+ }
+
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ if (!deferred)
+ break;
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ os_thread_yield();
+ mysql_mutex_lock(&buf_pool.mutex);
+ buf_flush_wait_batch_end(false);
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+}
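
The loop above defers any page whose write is still in progress and retries after yielding and waiting for the LRU flush batch to drain. A plain-C++ sketch of the same remove-or-defer pattern (stand-in types, not actual InnoDB code):

#include <list>
#include <thread>

struct page { unsigned space; bool io_fixed; };

void remove_pages(std::list<page> &flush_list, unsigned space_id)
{
  for (;;)
  {
    bool deferred = false;
    for (auto i = flush_list.begin(); i != flush_list.end(); )
    {
      if (i->space != space_id)
        ++i;
      else if (i->io_fixed)            // write in progress: cannot remove yet
      {
        deferred = true;
        ++i;
      }
      else
        i = flush_list.erase(i);       // buf_flush_remove() analogue
    }
    if (!deferred)
      break;
    std::this_thread::yield();         // os_thread_yield() analogue
    // The real code also calls buf_flush_wait_batch_end(false) here.
  }
}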
+
+/** Try to flush all the dirty pages that belong to a given tablespace.
+@param id tablespace identifier
+@return number of dirty pages that were found for this tablespace */
+ulint buf_flush_dirty_pages(ulint id)
+{
+ ut_ad(!sync_check_iterate(dict_sync_check()));
+
+ ulint n= 0;
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ for (buf_page_t *bpage= UT_LIST_GET_FIRST(buf_pool.flush_list); bpage;
+ bpage= UT_LIST_GET_NEXT(list, bpage))
+ {
+ ut_d(const auto s= bpage->state());
+ ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE ||
+ s == BUF_BLOCK_REMOVE_HASH);
+ ut_ad(bpage->oldest_modification());
+ if (id == bpage->id().space())
+ n++;
+ }
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ if (n)
+ buf_flush_lists(srv_max_io_capacity, LSN_MAX);
+ return n;
}
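
Note that buf_flush_dirty_pages() only counts matches; the writing itself is delegated to a flush batch. A rough standalone model of that count-then-flush shape (hypothetical stand-in types; the batch call is left as a comment):

#include <algorithm>
#include <list>

struct page { unsigned space; };

unsigned long dirty_pages(const std::list<page> &flush_list, unsigned id)
{
  auto n = std::count_if(flush_list.begin(), flush_list.end(),
                         [id](const page &p) { return p.space == id; });
  // if (n) buf_flush_lists(srv_max_io_capacity, LSN_MAX);  // as above
  return static_cast<unsigned long>(n);
}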
/*******************************************************************//**
@@ -466,6 +255,7 @@ use the current list node (bpage) to do the list manipulation because
the list pointers could have changed between the time that we copied
the contents of bpage to the dpage and the flush list manipulation
below. */
+ATTRIBUTE_COLD
void
buf_flush_relocate_on_flush_list(
/*=============================*/
@@ -473,10 +263,14 @@ buf_flush_relocate_on_flush_list(
buf_page_t* dpage) /*!< in/out: destination block */
{
buf_page_t* prev;
- buf_page_t* prev_b = NULL;
- ut_ad(mutex_own(&buf_pool.mutex));
- mutex_enter(&buf_pool.flush_list_mutex);
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ if (!bpage->oldest_modification()) {
+ return;
+ }
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
/* FIXME: At this point we have both buf_pool and flush_list
mutexes. Theoretically removal of a block from flush list is
@@ -487,19 +281,10 @@ buf_flush_relocate_on_flush_list(
having the buf_pool mutex. */
ut_ad(dpage->oldest_modification());
- /* If recovery is active we must swap the control blocks in
- the flush_rbt as well. */
- if (UNIV_LIKELY_NULL(buf_pool.flush_rbt)) {
- buf_flush_delete_from_flush_rbt(bpage);
- prev_b = buf_flush_insert_in_flush_rbt(dpage);
- }
-
/* Important that we adjust the hazard pointer before removing
the bpage from the flush list. */
buf_pool.flush_hp.adjust(bpage);
- /* Must be done after we have removed it from the flush_rbt
- because we assert on it in buf_flush_block_cmp(). */
bpage->clear_oldest_modification();
prev = UT_LIST_GET_PREV(list, bpage);
@@ -507,59 +292,41 @@ buf_flush_relocate_on_flush_list(
if (prev) {
ut_ad(prev->oldest_modification());
- UT_LIST_INSERT_AFTER( buf_pool.flush_list, prev, dpage);
+ UT_LIST_INSERT_AFTER(buf_pool.flush_list, prev, dpage);
} else {
UT_LIST_ADD_FIRST(buf_pool.flush_list, dpage);
}
- /* Just an extra check. Previous in flush_list
- should be the same control block as in flush_rbt. */
- ut_a(!buf_pool.flush_rbt || prev_b == prev);
ut_d(buf_flush_validate_low());
- mutex_exit(&buf_pool.flush_list_mutex);
-}
-
-/** Update the buf_pool data structures on write completion.
-@param[in,out] bpage written page
-@param[in] flush_type write request type
-@param[in] dblwr whether the doublewrite buffer was used */
-static void buf_flush_write_complete(buf_page_t *bpage,
- IORequest::flush_t flush_type, bool dblwr)
-{
- ut_ad(mutex_own(&buf_pool.mutex));
- buf_flush_remove(bpage);
-
- switch (--buf_pool.n_flush[flush_type]) {
-#ifdef UNIV_DEBUG
- case ULINT_UNDEFINED:
- ut_error;
- break;
-#endif
- case 0:
- if (!buf_pool.init_flush[flush_type])
- os_event_set(buf_pool.no_flush[flush_type]);
- }
-
- if (dblwr)
- buf_dblwr_update(*bpage, flush_type == IORequest::SINGLE_PAGE);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
}
/** Complete write of a file page from buf_pool.
-@param bpage written page
-@param request write request
-@param dblwr whether the doublewrite buffer was used
-@param evict whether or not to evict the page from LRU list */
-void buf_page_write_complete(buf_page_t *bpage, const IORequest &request,
- bool dblwr, bool evict)
+@param request write request */
+void buf_page_write_complete(const IORequest &request)
{
ut_ad(request.is_write());
+ ut_ad(!srv_read_only_mode/* ||
+ request.node->space->purpose == FIL_TYPE_TEMPORARY*/);
+ buf_page_t *bpage= request.bpage;
+ ut_ad(bpage);
ut_ad(bpage->in_file());
ut_ad(bpage->io_fix() == BUF_IO_WRITE);
- ut_ad(bpage->id().space() != TRX_SYS_SPACE ||
- !buf_dblwr_page_inside(bpage->id().page_no()));
+ ut_ad(!buf_dblwr.is_inside(bpage->id()));
+ bool dblwr;
+ if (bpage->status == buf_page_t::INIT_ON_FLUSH)
+ {
+ bpage->status= buf_page_t::NORMAL;
+ dblwr= false;
+ }
+ else
+ {
+ ut_ad(bpage->status == buf_page_t::NORMAL);
+ dblwr= request.node->space->use_doublewrite();
+ }
/* We do not need protect io_fix here by mutex to read it because
- this and buf_page_write_complete() are the only functions where we can
+ this and buf_page_read_complete() are the only functions where we can
change the value from BUF_IO_READ or BUF_IO_WRITE to some other
value, and our code ensures that this is the only thread that handles
the i/o for this block. */
@@ -573,9 +340,19 @@ void buf_page_write_complete(buf_page_t *bpage, const IORequest &request,
buf_page_monitor(bpage, BUF_IO_WRITE);
DBUG_PRINT("ib_buf", ("write page %u:%u",
bpage->id().space(), bpage->id().page_no()));
- mutex_enter(&buf_pool.mutex);
+ ut_ad(request.is_LRU() ? buf_pool.n_flush_LRU : buf_pool.n_flush_list);
+
+ mysql_mutex_lock(&buf_pool.mutex);
bpage->set_io_fix(BUF_IO_NONE);
- buf_flush_write_complete(bpage, request.flush_type(), dblwr);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_flush_remove(bpage);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ if (dblwr)
+ {
+ ut_ad(!fsp_is_system_temporary(bpage->id().space()));
+ buf_dblwr.write_completed();
+ }
/* Because this thread which does the unlocking might not be the same that
did the locking, we use a pass value != 0 in unlock, which simply
@@ -585,10 +362,19 @@ void buf_page_write_complete(buf_page_t *bpage, const IORequest &request,
buf_pool.stat.n_pages_written++;
- if (evict)
+ if (request.is_LRU())
+ {
buf_LRU_free_page(bpage, true);
+ if (!--buf_pool.n_flush_LRU)
+ mysql_cond_broadcast(&buf_pool.done_flush_LRU);
+ }
+ else
+ {
+ if (!--buf_pool.n_flush_list)
+ mysql_cond_broadcast(&buf_pool.done_flush_list);
+ }
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
}
/** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page.
@@ -983,72 +769,52 @@ static void buf_release_freed_page(buf_page_t *bpage)
{
ut_ad(bpage->in_file());
const bool uncompressed= bpage->state() == BUF_BLOCK_FILE_PAGE;
- mutex_enter(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
bpage->set_io_fix(BUF_IO_NONE);
bpage->status= buf_page_t::NORMAL;
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
buf_flush_remove(bpage);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
if (uncompressed)
rw_lock_sx_unlock_gen(&reinterpret_cast<buf_block_t*>(bpage)->lock,
BUF_IO_WRITE);
buf_LRU_free_page(bpage, true);
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
}
/** Write a flushable page from buf_pool to a file.
buf_pool.mutex must be held.
@param bpage buffer control block
-@param flush_type type of flush
-@param space tablespace (or nullptr if not known)
-@param sync whether this is a synchronous request
- (only for flush_type=SINGLE_PAGE)
+@param lru true=buf_pool.LRU; false=buf_pool.flush_list
+@param space tablespace
@return whether the page was flushed and buf_pool.mutex was released */
-bool buf_flush_page(buf_page_t *bpage, IORequest::flush_t flush_type,
- fil_space_t *space, bool sync)
+static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
{
ut_ad(bpage->in_file());
ut_ad(bpage->ready_for_flush());
- ut_ad(!sync || flush_type == IORequest::SINGLE_PAGE);
- ut_ad(mutex_own(&buf_pool.mutex));
+ ut_ad((space->purpose == FIL_TYPE_TEMPORARY) ==
+ (space == fil_system.temp_space));
+ ut_ad(space->purpose == FIL_TYPE_TABLESPACE ||
+ space->atomic_write_supported);
+ ut_ad(space->referenced());
rw_lock_t *rw_lock;
- bool no_fix_count= bpage->buf_fix_count() == 0;
if (bpage->state() != BUF_BLOCK_FILE_PAGE)
rw_lock= nullptr;
- else if (!(no_fix_count || flush_type == IORequest::FLUSH_LIST) ||
- (!no_fix_count && srv_shutdown_state <= SRV_SHUTDOWN_CLEANUP &&
- fsp_is_system_temporary(bpage->id().space())))
- /* This is a heuristic, to avoid expensive SX attempts. */
- /* For table residing in temporary tablespace sync is done
- using IO_FIX and so before scheduling for flush ensure that
- page is not fixed. */
- return false;
else
{
rw_lock= &reinterpret_cast<buf_block_t*>(bpage)->lock;
- if (flush_type != IORequest::FLUSH_LIST &&
- !rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE))
+ if (!rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE))
return false;
}
- /* We are committed to flushing by the time we get here */
bpage->set_io_fix(BUF_IO_WRITE);
- mutex_exit(&buf_pool.mutex);
-
- if (flush_type == IORequest::FLUSH_LIST && rw_lock &&
- !rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE))
- {
- if (!fsp_is_system_temporary(bpage->id().space()))
- /* Avoid a potential deadlock with the doublewrite buffer,
- which might be holding another buf_block_t::lock. */
- buf_dblwr_flush_buffered_writes();
- else
- os_aio_wait_until_no_pending_writes();
-
- rw_lock_sx_lock_gen(rw_lock, BUF_IO_WRITE);
- }
+ buf_flush_page_count++;
+ mysql_mutex_unlock(&buf_pool.mutex);
+ mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
/* We are holding rw_lock = buf_block_t::lock in SX mode except if
this is a ROW_FORMAT=COMPRESSED page whose uncompressed page frame
@@ -1057,38 +823,16 @@ bool buf_flush_page(buf_page_t *bpage, IORequest::flush_t flush_type,
Apart from possible rw_lock protection, bpage is also protected by
io_fix and oldest_modification()!=0. Thus, it cannot be relocated in
the buffer pool or removed from flush_list or LRU_list. */
-#if 0 /* rw_lock_own() does not hold because we passed BUF_IO_WRITE above. */
- ut_ad(!rw_lock || rw_lock_own(rw_lock, RW_LOCK_SX));
-#endif
-
- const fil_space_t * const provided_space= space;
- if (!space)
- {
- space= fil_space_acquire_for_io(bpage->id().space());
- if (UNIV_UNLIKELY(!space))
- {
- mutex_enter(&buf_pool.mutex);
- bpage->status= buf_page_t::NORMAL;
- bpage->set_io_fix(BUF_IO_NONE);
- if (rw_lock)
- rw_lock_sx_unlock_gen(rw_lock, BUF_IO_WRITE);
- return false;
- }
- }
- ut_ad((space->purpose == FIL_TYPE_TEMPORARY) ==
- (space == fil_system.temp_space));
-
- const bool full_crc32= space->full_crc32();
- DBUG_PRINT("ib_buf", ("flush %s %u page %u:%u",
- sync ? "sync" : "async", (unsigned) flush_type,
+ DBUG_PRINT("ib_buf", ("%s %u page %u:%u",
+ lru ? "LRU" : "flush_list",
bpage->id().space(), bpage->id().page_no()));
- ut_ad(!mutex_own(&buf_pool.mutex));
- ut_ad(!mutex_own(&buf_pool.flush_list_mutex));
ut_ad(bpage->io_fix() == BUF_IO_WRITE);
ut_ad(bpage->oldest_modification());
ut_ad(bpage->state() ==
(rw_lock ? BUF_BLOCK_FILE_PAGE : BUF_BLOCK_ZIP_PAGE));
+ ut_ad(ULINT_UNDEFINED >
+ (lru ? buf_pool.n_flush_LRU : buf_pool.n_flush_list));
/* Because bpage->status can only be changed while buf_block_t
exists, it cannot be modified for ROW_FORMAT=COMPRESSED pages
@@ -1097,124 +841,89 @@ bool buf_flush_page(buf_page_t *bpage, IORequest::flush_t flush_type,
is protected even if !rw_lock. */
const auto status= bpage->status;
- if (status != buf_page_t::FREED)
- {
- switch (buf_pool.n_flush[flush_type]++) {
- case 0:
- os_event_reset(buf_pool.no_flush[flush_type]);
- break;
-#ifdef UNIV_DEBUG
- case ULINT_UNDEFINED:
- ut_error;
- break;
-#endif
- }
- }
-
+ buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage);
page_t *frame= bpage->zip.data;
- size_t size, orig_size;
- if (UNIV_UNLIKELY(!rw_lock)) /* ROW_FORMAT=COMPRESSED */
+ if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE))
{
- ut_ad(!space->full_crc32());
- ut_ad(!space->is_compressed()); /* not page_compressed */
- orig_size= size= bpage->zip_size();
- if (status != buf_page_t::FREED)
+ const lsn_t lsn= mach_read_from_8(my_assume_aligned<8>
+ (FIL_PAGE_LSN +
+ (frame ? frame : block->frame)));
+ ut_ad(lsn);
+ ut_ad(lsn >= bpage->oldest_modification());
+ ut_ad(!srv_read_only_mode);
+ if (UNIV_UNLIKELY(lsn > log_sys.get_flushed_lsn()))
{
- buf_flush_update_zip_checksum(frame, orig_size);
- frame= buf_page_encrypt(space, bpage, frame, &size);
+ if (rw_lock)
+ rw_lock_sx_unlock_gen(rw_lock, BUF_IO_WRITE);
+ mysql_mutex_lock(&buf_pool.mutex);
+ bpage->set_io_fix(BUF_IO_NONE);
+ return false;
}
- ut_ad(size == bpage->zip_size());
}
+
+ if (status == buf_page_t::FREED)
+ buf_release_freed_page(&block->page);
else
{
- buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage);
- byte *page= block->frame;
- orig_size= size= block->physical_size();
+ space->reacquire();
+ ut_ad(status == buf_page_t::NORMAL || status == buf_page_t::INIT_ON_FLUSH);
+ size_t size, orig_size;
+ IORequest::Type type= lru ? IORequest::WRITE_LRU : IORequest::WRITE_ASYNC;
- if (status != buf_page_t::FREED)
+ if (UNIV_UNLIKELY(!rw_lock)) /* ROW_FORMAT=COMPRESSED */
+ {
+ ut_ad(!space->full_crc32());
+ ut_ad(!space->is_compressed()); /* not page_compressed */
+ orig_size= size= bpage->zip_size();
+ buf_flush_update_zip_checksum(frame, size);
+ frame= buf_page_encrypt(space, bpage, frame, &size);
+ ut_ad(size == bpage->zip_size());
+ }
+ else
{
- if (full_crc32)
+ byte *page= block->frame;
+ orig_size= size= block->physical_size();
+
+ if (space->full_crc32())
{
/* innodb_checksum_algorithm=full_crc32 is not implemented for
ROW_FORMAT=COMPRESSED pages. */
ut_ad(!frame);
- page= buf_page_encrypt(space, bpage, page, &size);
+ page= buf_page_encrypt(space, bpage, page, &size);
+ buf_flush_init_for_writing(block, page, nullptr, true);
}
-
- buf_flush_init_for_writing(block, page, frame ? &bpage->zip : nullptr,
- full_crc32);
-
- if (!full_crc32)
+ else
+ {
+ buf_flush_init_for_writing(block, page, frame ? &bpage->zip : nullptr,
+ false);
page= buf_page_encrypt(space, bpage, frame ? frame : page, &size);
- }
-
- frame= page;
- }
-
- if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE))
- {
- const lsn_t lsn= mach_read_from_8(frame + FIL_PAGE_LSN);
- ut_ad(lsn);
- ut_ad(lsn >= bpage->oldest_modification());
- ut_ad(!srv_read_only_mode);
- log_write_up_to(lsn, true);
- }
- else
- ut_ad(space->atomic_write_supported);
-
- bool use_doublewrite;
- IORequest request(IORequest::WRITE, bpage, flush_type);
-
- ut_ad(status == bpage->status);
-
- switch (status) {
- default:
- ut_ad(status == buf_page_t::FREED);
- buf_release_freed_page(bpage);
- goto done;
- case buf_page_t::NORMAL:
- use_doublewrite= space->use_doublewrite();
+ }
- if (use_doublewrite)
- {
- ut_ad(!srv_read_only_mode);
- if (flush_type == IORequest::SINGLE_PAGE)
- buf_dblwr->write_single_page(bpage, sync, size);
- else
- buf_dblwr->add_to_batch(bpage, flush_type, size);
- break;
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+ if (size != orig_size && space->punch_hole)
+ type= lru ? IORequest::PUNCH_LRU : IORequest::PUNCH;
+#else
+ DBUG_EXECUTE_IF("ignore_punch_hole",
+ if (size != orig_size && space->punch_hole)
+ type= lru ? IORequest::PUNCH_LRU : IORequest::PUNCH;);
+#endif
+ frame= page;
}
- /* fall through */
- case buf_page_t::INIT_ON_FLUSH:
- use_doublewrite= false;
- if (size != orig_size)
- request.set_punch_hole();
- /* FIXME: pass space to fil_io() */
- fil_io_t fio= fil_io(request, sync, bpage->id(), bpage->zip_size(), 0,
- bpage->physical_size(), frame, bpage);
- ut_ad(!fio.node || fio.node->space == space);
- if (fio.node && sync)
- fio.node->space->release_for_io();
- }
-
- if (sync)
- {
- ut_ad(bpage->io_fix() == BUF_IO_WRITE);
- /* When flushing single page synchronously, we flush the changes
- only for the tablespace we are working on. */
- if (space->purpose != FIL_TYPE_TEMPORARY)
- fil_flush(space);
+ ut_ad(status == bpage->status);
- if (size != orig_size && space->punch_hole)
- request.set_punch_hole();
- buf_page_write_complete(bpage, request, use_doublewrite, true/*evict*/);
+ if (lru)
+ buf_pool.n_flush_LRU++;
+ else
+ buf_pool.n_flush_list++;
+ if (status != buf_page_t::NORMAL || !space->use_doublewrite())
+ space->io(IORequest(type, bpage),
+ bpage->physical_offset(), size, frame, bpage);
+ else
+ buf_dblwr.add_to_batch(IORequest(bpage, space->chain.start, type), size);
}
-done:
- if (!provided_space)
- space->release_for_io();
/* Increment the I/O operation count used for selecting LRU policy. */
buf_LRU_stat_inc_io();
return true;
@@ -1222,15 +931,15 @@ done:
/** Check whether a page can be flushed from the buf_pool.
@param id page identifier
-@param flush LRU or FLUSH_LIST
+@param fold id.fold()
+@param lru true=buf_pool.LRU; false=buf_pool.flush_list
@return whether the page can be flushed */
-static bool buf_flush_check_neighbor(const page_id_t id,
- IORequest::flush_t flush)
+static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, bool lru)
{
- ut_ad(flush == IORequest::LRU || flush == IORequest::FLUSH_LIST);
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(fold == id.fold());
- buf_page_t *bpage= buf_pool.page_hash_get_low(id, id.fold());
+ buf_page_t *bpage= buf_pool.page_hash_get_low(id, fold);
if (!bpage || buf_pool.watch_is_sentinel(*bpage))
return false;
@@ -1238,21 +947,20 @@ static bool buf_flush_check_neighbor(const page_id_t id,
/* We avoid flushing 'non-old' blocks in an LRU flush, because the
flushed blocks are soon freed */
- return (flush != IORequest::LRU || bpage->is_old()) &&
- bpage->ready_for_flush();
+ return (!lru || bpage->is_old()) && bpage->ready_for_flush();
}
/** Check which neighbors of a page can be flushed from the buf_pool.
@param space tablespace
@param id page identifier of a dirty page
-@param flush LRU or FLUSH_LIST
+@param contiguous whether to consider contiguous areas of pages
+@param lru true=buf_pool.LRU; false=buf_pool.flush_list
@return last page number that can be flushed */
static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
- page_id_t &id,
- IORequest::flush_t flush)
+ page_id_t &id, bool contiguous,
+ bool lru)
{
ut_ad(id.page_no() < space.size);
- ut_ad(flush == IORequest::LRU || flush == IORequest::FLUSH_LIST);
/* When flushed, dirty blocks are searched in neighborhoods of this
size, and flushed along with the original page. */
const ulint s= buf_pool.curr_size / 16;
@@ -1261,13 +969,19 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
? static_cast<uint32_t>(s) : read_ahead;
page_id_t low= id - (id.page_no() % buf_flush_area);
page_id_t high= low + buf_flush_area;
- high.set_page_no(std::min(high.page_no(),
- static_cast<uint32_t>(space.committed_size - 1)));
+ high.set_page_no(std::min(high.page_no(), space.last_page_number()));
+
+ if (!contiguous)
+ {
+ high= std::max(id + 1, high);
+ id= low;
+ return high;
+ }
/* Determine the contiguous dirty area around id. */
const ulint id_fold= id.fold();
- mutex_enter(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
if (id > low)
{
@@ -1275,8 +989,7 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
for (page_id_t i= id - 1;; --i)
{
fold--;
- ut_ad(i.fold() == fold);
- if (!buf_flush_check_neighbor(i, flush))
+ if (!buf_flush_check_neighbor(i, fold, lru))
{
low= i + 1;
break;
@@ -1292,22 +1005,20 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
while (++i < high)
{
++fold;
- ut_ad(i.fold() == fold);
- if (!buf_flush_check_neighbor(i, flush))
+ if (!buf_flush_check_neighbor(i, fold, lru))
break;
}
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
return i;
}
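
The area computation rounds the page number down to a multiple of buf_flush_area and clamps the end of the range to the last page of the tablespace. A worked example with hypothetical numbers:

#include <algorithm>
#include <cassert>
#include <cstdint>

int main()
{
  const uint32_t buf_flush_area = 64;  // e.g. min(curr_size/16, read-ahead)
  const uint32_t page_no = 200;
  const uint32_t last_page = 230;      // space.last_page_number() stand-in

  uint32_t low = page_no - page_no % buf_flush_area;         // 192
  uint32_t high = std::min(low + buf_flush_area, last_page); // min(256,230)

  assert(low == 192 && high == 230);
  // With contiguous=false, the clamped range is returned as-is instead of
  // being narrowed to the dirty pages surrounding page_no.
}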
+MY_ATTRIBUTE((nonnull))
/** Write punch-hole or zeroes of the freed ranges when
innodb_immediate_scrub_data_uncompressed from the freed ranges.
-@param[in] space tablespace which contains freed ranges
-@param[in] freed_ranges freed ranges of the page to be flushed */
+@param space tablespace which may contain ranges of freed pages */
static void buf_flush_freed_pages(fil_space_t *space)
{
- ut_ad(space != NULL);
const bool punch_hole= space->punch_hole;
if (!srv_immediate_scrub_data_uncompressed && !punch_hole)
return;
@@ -1326,27 +1037,24 @@ static void buf_flush_freed_pages(fil_space_t *space)
for (const auto &range : freed_ranges)
{
- ulint page_size= space->zip_size();
- if (!page_size)
- page_size= srv_page_size;
+ const ulint physical_size= space->physical_size();
if (punch_hole)
{
- const auto len= (range.last - range.first + 1) * page_size;
- const page_id_t page_id(space->id, range.first);
- fil_io_t fio= fil_io(IORequestWrite, true, page_id, space->zip_size(),
- 0, len, nullptr, nullptr, false, true);
- if (fio.node)
- fio.node->space->release_for_io();
+ space->reacquire();
+ space->io(IORequest(IORequest::PUNCH_RANGE),
+ os_offset_t{range.first} * physical_size,
+ (range.last - range.first + 1) * physical_size,
+ nullptr);
}
else if (srv_immediate_scrub_data_uncompressed)
{
- for (auto i= range.first; i <= range.last; i++)
+ for (os_offset_t i= range.first; i <= range.last; i++)
{
- const page_id_t page_id(space->id, i);
- fil_io(IORequestWrite, false, page_id, space->zip_size(), 0,
- space->zip_size() ? space->zip_size() : srv_page_size,
- const_cast<byte*>(field_ref_zero), nullptr, false, false);
+ space->reacquire();
+ space->io(IORequest(IORequest::WRITE_ASYNC),
+ i * physical_size, physical_size,
+ const_cast<byte*>(field_ref_zero));
}
}
buf_pool.stat.n_pages_written+= (range.last - range.first + 1);
@@ -1355,95 +1063,69 @@ static void buf_flush_freed_pages(fil_space_t *space)
/** Flushes to disk all flushable pages within the flush area
and also write zeroes or punch the hole for the freed ranges of pages.
-@param[in] page_id page id
-@param[in] flush LRU or FLUSH_LIST
-@param[in] n_flushed number of pages flushed so far in this batch
-@param[in] n_to_flush maximum number of pages we are allowed to flush
+@param space tablespace
+@param page_id page identifier
+@param contiguous whether to consider contiguous areas of pages
+@param lru true=buf_pool.LRU; false=buf_pool.flush_list
+@param n_flushed number of pages flushed so far in this batch
+@param n_to_flush maximum number of pages we are allowed to flush
@return number of pages flushed */
-static
-ulint
-buf_flush_try_neighbors(
- const page_id_t page_id,
- IORequest::flush_t flush,
- ulint n_flushed,
- ulint n_to_flush)
+static ulint buf_flush_try_neighbors(fil_space_t *space,
+ const page_id_t page_id,
+ bool contiguous, bool lru,
+ ulint n_flushed, ulint n_to_flush)
{
- ulint count = 0;
-
- ut_ad(flush == IORequest::LRU || flush == IORequest::FLUSH_LIST);
- fil_space_t* space = fil_space_acquire_for_io(page_id.space());
- if (!space) {
- return 0;
- }
-
- /* Flush the freed ranges while flushing the neighbors */
- buf_flush_freed_pages(space);
-
- page_id_t id = page_id;
- page_id_t high = (srv_flush_neighbors != 1
- || UT_LIST_GET_LEN(buf_pool.LRU)
- < BUF_LRU_OLD_MIN_LEN
- || !space->is_rotational())
- ? id + 1 /* Flush the minimum. */
- : buf_flush_check_neighbors(*space, id, flush);
-
- for (; id < high; ++id) {
- buf_page_t* bpage;
-
- if ((count + n_flushed) >= n_to_flush) {
-
- /* We have already flushed enough pages and
- should call it a day. There is, however, one
- exception. If the page whose neighbors we
- are flushing has not been flushed yet then
- we'll try to flush the victim that we
- selected originally. */
- if (id <= page_id) {
- id = page_id;
- } else {
- break;
- }
- }
+ ut_ad(space->id == page_id.space());
- const ulint fold = id.fold();
-
- mutex_enter(&buf_pool.mutex);
+ ulint count= 0;
+ page_id_t id= page_id;
+ page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, lru);
- bpage = buf_pool.page_hash_get_low(id, fold);
+ ut_ad(page_id >= id);
+ ut_ad(page_id < high);
- if (bpage == NULL) {
- mutex_exit(&buf_pool.mutex);
- continue;
- }
-
- ut_a(bpage->in_file());
+ for (ulint id_fold= id.fold(); id < high && !space->is_stopping();
+ ++id, ++id_fold)
+ {
+ if (count + n_flushed >= n_to_flush)
+ {
+ if (id > page_id)
+ break;
+ /* If the page whose neighbors we are flushing has not been
+ flushed yet, we must flush the page that we selected originally. */
+ id= page_id;
+ id_fold= id.fold();
+ }
- /* We avoid flushing 'non-old' blocks in an LRU flush,
- because the flushed blocks are soon freed */
+ mysql_mutex_lock(&buf_pool.mutex);
- if (flush != IORequest::LRU
- || id == page_id || bpage->is_old()) {
- if (bpage->ready_for_flush()
- && (id == page_id || bpage->buf_fix_count() == 0)
- && buf_flush_page(bpage, flush, space, false)) {
- ++count;
- continue;
- }
- }
- mutex_exit(&buf_pool.mutex);
- }
+ if (buf_page_t *bpage= buf_pool.page_hash_get_low(id, id_fold))
+ {
+ ut_ad(bpage->in_file());
+ /* We avoid flushing 'non-old' blocks in an LRU flush,
+ because the flushed blocks are soon freed */
+ if (!lru || id == page_id || bpage->is_old())
+ {
+ if (!buf_pool.watch_is_sentinel(*bpage) &&
+ bpage->ready_for_flush() && buf_flush_page(bpage, lru, space))
+ {
+ ++count;
+ continue;
+ }
+ }
+ }
- space->release_for_io();
+ mysql_mutex_unlock(&buf_pool.mutex);
+ }
- if (count > 1) {
- MONITOR_INC_VALUE_CUMULATIVE(
- MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
- MONITOR_FLUSH_NEIGHBOR_COUNT,
- MONITOR_FLUSH_NEIGHBOR_PAGES,
- (count - 1));
- }
+ if (auto n= count - 1)
+ {
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+ MONITOR_FLUSH_NEIGHBOR_COUNT,
+ MONITOR_FLUSH_NEIGHBOR_PAGES, n);
+ }
- return(count);
+ return count;
}
/*******************************************************************//**
@@ -1460,17 +1142,16 @@ static ulint buf_free_from_unzip_LRU_list_batch(ulint max)
{
ulint scanned = 0;
ulint count = 0;
- ulint free_len = UT_LIST_GET_LEN(buf_pool.free);
- ulint lru_len = UT_LIST_GET_LEN(buf_pool.unzip_LRU);
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
- while (block != NULL
+ while (block
&& count < max
- && free_len < srv_LRU_scan_depth
- && lru_len > UT_LIST_GET_LEN(buf_pool.LRU) / 10) {
+ && UT_LIST_GET_LEN(buf_pool.free) < srv_LRU_scan_depth
+ && UT_LIST_GET_LEN(buf_pool.unzip_LRU)
+ > UT_LIST_GET_LEN(buf_pool.LRU) / 10) {
++scanned;
if (buf_LRU_free_page(&block->page, false)) {
@@ -1478,14 +1159,12 @@ static ulint buf_free_from_unzip_LRU_list_batch(ulint max)
released and reacquired */
++count;
block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
- free_len = UT_LIST_GET_LEN(buf_pool.free);
- lru_len = UT_LIST_GET_LEN(buf_pool.unzip_LRU);
} else {
block = UT_LIST_GET_PREV(unzip_LRU, block);
}
}
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
if (scanned) {
MONITOR_INC_VALUE_CUMULATIVE(
@@ -1498,23 +1177,76 @@ static ulint buf_free_from_unzip_LRU_list_batch(ulint max)
return(count);
}
-/** Flush dirty blocks from the end of the LRU list.
-The calling thread is not allowed to own any latches on pages!
+/** Start writing out pages for a tablespace.
+@param id tablespace identifier
+@return tablespace
+@retval nullptr if the pages for this tablespace should be discarded */
+static fil_space_t *buf_flush_space(const uint32_t id)
+{
+ fil_space_t *space= fil_space_t::get(id);
+ if (space)
+ buf_flush_freed_pages(space);
+ return space;
+}
+
+struct flush_counters_t
+{
+ /** number of dirty pages flushed */
+ ulint flushed;
+ /** number of clean pages evicted */
+ ulint evicted;
+};
+
+/** Try to discard a dirty page.
+@param bpage dirty page whose tablespace is not accessible */
+static void buf_flush_discard_page(buf_page_t *bpage)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
+ ut_ad(bpage->in_file());
+ ut_ad(bpage->oldest_modification());
+
+ rw_lock_t *rw_lock;
+
+ if (bpage->state() != BUF_BLOCK_FILE_PAGE)
+ rw_lock= nullptr;
+ else
+ {
+ rw_lock= &reinterpret_cast<buf_block_t*>(bpage)->lock;
+ if (!rw_lock_sx_lock_nowait(rw_lock, 0))
+ return;
+ }
+
+ bpage->status= buf_page_t::NORMAL;
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_flush_remove(bpage);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ if (rw_lock)
+ rw_lock_sx_unlock(rw_lock);
+
+ buf_LRU_free_page(bpage, true);
+}
-@param[in] max desired number of blocks to make available
- in the free list (best effort; not guaranteed)
-@param[out] n counts of flushed and evicted pages */
+/** Flush dirty blocks from the end of the LRU list.
+@param max maximum number of blocks to make available in buf_pool.free
+@param n counts of flushed and evicted pages */
static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
{
ulint scanned= 0;
ulint free_limit= srv_LRU_scan_depth;
- n->flushed = 0;
- n->evicted = 0;
- n->unzip_LRU_evicted = 0;
- ut_ad(mutex_own(&buf_pool.mutex));
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
if (buf_pool.withdraw_target && buf_pool.curr_size < buf_pool.old_size)
free_limit+= buf_pool.withdraw_target - UT_LIST_GET_LEN(buf_pool.withdraw);
+ const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
+ ? 0 : srv_flush_neighbors;
+ fil_space_t *space= nullptr;
+ uint32_t last_space_id= FIL_NULL;
+ static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency");
+ static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency");
+
for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU);
bpage && n->flushed + n->evicted < max &&
UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_MIN_LEN &&
@@ -1536,10 +1268,40 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
/* Block is ready for flush. Dispatch an IO request. The IO
helper thread will put it on free list in IO completion routine. */
const page_id_t page_id(bpage->id());
- mutex_exit(&buf_pool.mutex);
- n->flushed+= buf_flush_try_neighbors(page_id, IORequest::LRU, n->flushed,
- max);
- mutex_enter(&buf_pool.mutex);
+ const uint32_t space_id= page_id.space();
+ if (!space || space->id != space_id)
+ {
+ if (last_space_id != space_id)
+ {
+ if (space)
+ space->release();
+ space= buf_flush_space(space_id);
+ last_space_id= space_id;
+ }
+ else
+ ut_ad(!space);
+ }
+ else if (space->is_stopping())
+ {
+ space->release();
+ space= nullptr;
+ }
+
+ if (!space)
+ buf_flush_discard_page(bpage);
+ else if (neighbors && space->is_rotational())
+ {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ n->flushed+= buf_flush_try_neighbors(space, page_id, neighbors == 1,
+ true, n->flushed, max);
+reacquire_mutex:
+ mysql_mutex_lock(&buf_pool.mutex);
+ }
+ else if (buf_flush_page(bpage, true, space))
+ {
+ ++n->flushed;
+ goto reacquire_mutex;
+ }
}
else
/* Can't evict or dispatch this block. Go to previous. */
@@ -1548,18 +1310,16 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
buf_pool.lru_hp.set(nullptr);
+ if (space)
+ space->release();
+
/* We keep track of all flushes happening as part of LRU flush. When
estimating the desired rate at which flush_list should be flushed,
we factor in this value. */
buf_lru_flush_page_count+= n->flushed;
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
- if (n->evicted)
- MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
- MONITOR_LRU_BATCH_EVICT_COUNT,
- MONITOR_LRU_BATCH_EVICT_PAGES,
- n->evicted);
if (scanned)
MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_SCANNED,
MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
@@ -1569,44 +1329,52 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
/** Flush and move pages from LRU or unzip_LRU list to the free list.
Whether LRU or unzip_LRU is used depends on the state of the system.
-@param[in] max desired number of blocks to make available
- in the free list (best effort; not guaranteed)
-@param[out] n counts of flushed and evicted pages */
-static void buf_do_LRU_batch(ulint max, flush_counters_t* n)
+@param max maximum number of blocks to make available in buf_pool.free
+@return number of flushed pages */
+static ulint buf_do_LRU_batch(ulint max)
{
- n->unzip_LRU_evicted = buf_LRU_evict_from_unzip_LRU()
- ? buf_free_from_unzip_LRU_list_batch(max) : 0;
-
- if (max > n->unzip_LRU_evicted) {
- buf_flush_LRU_list_batch(max - n->unzip_LRU_evicted, n);
- } else {
- n->evicted = 0;
- n->flushed = 0;
- }
+ const ulint n_unzip_LRU_evicted= buf_LRU_evict_from_unzip_LRU()
+ ? buf_free_from_unzip_LRU_list_batch(max)
+ : 0;
+ flush_counters_t n;
+ n.flushed= 0;
+ n.evicted= n_unzip_LRU_evicted;
+ buf_flush_LRU_list_batch(max, &n);
+
+ if (const ulint evicted= n.evicted - n_unzip_LRU_evicted)
+ {
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_EVICT_COUNT,
+ MONITOR_LRU_BATCH_EVICT_PAGES,
+ evicted);
+ }
- /* Add evicted pages from unzip_LRU to the evicted pages from
- the simple LRU. */
- n->evicted += n->unzip_LRU_evicted;
+ return n.flushed;
}
/** This utility flushes dirty blocks from the end of the flush_list.
The calling thread is not allowed to own any latches on pages!
-@param[in] min_n wished minimum mumber of blocks flushed (it is
-not guaranteed that the actual number is that big, though)
-@param[in] lsn_limit all blocks whose oldest_modification is smaller
-than this should be flushed (if their number does not exceed min_n)
+@param max_n maximum number of blocks to flush
+@param lsn once an oldest_modification>=lsn is found, terminate the batch
@return number of blocks for which the write request was queued */
-static ulint buf_do_flush_list_batch(ulint min_n, lsn_t lsn_limit)
+static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
{
ulint count= 0;
ulint scanned= 0;
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
+ ? 0 : srv_flush_neighbors;
+ fil_space_t *space= nullptr;
+ uint32_t last_space_id= FIL_NULL;
+ static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency");
+ static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency");
/* Start from the end of the list looking for a suitable block to be
flushed. */
- mutex_enter(&buf_pool.flush_list_mutex);
- ulint len = UT_LIST_GET_LEN(buf_pool.flush_list);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ ulint len= UT_LIST_GET_LEN(buf_pool.flush_list);
/* In order not to degenerate this scan to O(n*n) we attempt to
preserve pointer of previous block in the flush list. To do so we
@@ -1614,17 +1382,17 @@ static ulint buf_do_flush_list_batch(ulint min_n, lsn_t lsn_limit)
must check the hazard pointer and if it is removing the same block
then it must reset it. */
for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list);
- bpage && len && count < min_n;
+ bpage && len && count < max_n;
bpage= buf_pool.flush_hp.get(), ++scanned, len--)
{
const lsn_t oldest_modification= bpage->oldest_modification();
- if (oldest_modification >= lsn_limit)
+ if (oldest_modification >= lsn)
break;
- ut_a(oldest_modification);
+ ut_ad(oldest_modification);
buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
buf_pool.flush_hp.set(prev);
- mutex_exit(&buf_pool.flush_list_mutex);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
ut_ad(bpage->in_file());
const bool flushed= bpage->ready_for_flush();
@@ -1632,18 +1400,51 @@ static ulint buf_do_flush_list_batch(ulint min_n, lsn_t lsn_limit)
if (flushed)
{
const page_id_t page_id(bpage->id());
- mutex_exit(&buf_pool.mutex);
- count+= buf_flush_try_neighbors(page_id, IORequest::FLUSH_LIST,
- count, min_n);
- mutex_enter(&buf_pool.mutex);
+ const uint32_t space_id= page_id.space();
+ if (!space || space->id != space_id)
+ {
+ if (last_space_id != space_id)
+ {
+ if (space)
+ space->release();
+ space= buf_flush_space(space_id);
+ last_space_id= space_id;
+ }
+ else
+ ut_ad(!space);
+ }
+ else if (space->is_stopping())
+ {
+ space->release();
+ space= nullptr;
+ }
+
+ if (!space)
+ buf_flush_discard_page(bpage);
+ else if (neighbors && space->is_rotational())
+ {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ count+= buf_flush_try_neighbors(space, page_id, neighbors == 1,
+ false, count, max_n);
+reacquire_mutex:
+ mysql_mutex_lock(&buf_pool.mutex);
+ }
+ else if (buf_flush_page(bpage, false, space))
+ {
+ ++count;
+ goto reacquire_mutex;
+ }
}
- mutex_enter(&buf_pool.flush_list_mutex);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
ut_ad(flushed || buf_pool.flush_hp.is_hp(prev));
}
buf_pool.flush_hp.set(nullptr);
- mutex_exit(&buf_pool.flush_list_mutex);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ if (space)
+ space->release();
if (scanned)
MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED,
@@ -1655,351 +1456,391 @@ static ulint buf_do_flush_list_batch(ulint min_n, lsn_t lsn_limit)
MONITOR_FLUSH_BATCH_COUNT,
MONITOR_FLUSH_BATCH_PAGES,
count);
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
return count;
}
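
The scan above may release both mutexes between elements only because buf_pool.flush_hp saves the predecessor, and every concurrent remover adjusts that hazard pointer before unlinking a node. A minimal hand-rolled model of the adjust step (stand-in names; the real hazard pointer also handles relocation):

#include <cassert>

struct node { node *prev = nullptr; node *next = nullptr; };

struct flush_hazard_pointer
{
  node *m_hp = nullptr;
  void set(node *p) { m_hp = p; }
  node *get() const { return m_hp; }
  void adjust(node *to_remove)         // called by a remover, mutex held
  {
    if (m_hp == to_remove)
      m_hp = to_remove->prev;          // skip over the node being unlinked
  }
};

int main()
{
  node a, b, c;                        // list: a <-> b <-> c
  a.next = &b; b.prev = &a; b.next = &c; c.prev = &b;

  flush_hazard_pointer hp;
  hp.set(&b);                          // scanner saved b as its restart point
  hp.adjust(&b);                       // a remover unlinks b first...
  a.next = &c; c.prev = &a;
  assert(hp.get() == &a);              // ...so the scanner resumes at a
}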
-/** This utility flushes dirty blocks from the end of the LRU list or
-flush_list.
-NOTE 1: in the case of an LRU flush the calling thread may own latches to
-pages: to avoid deadlocks, this function must be written so that it cannot
-end up waiting for these latches! NOTE 2: in the case of a flush list flush,
-the calling thread is not allowed to own any latches on pages!
-@param[in] lru true=LRU; false=FLUSH_LIST;
-if !lru, then the caller must not own any latches on pages
-@param[in] min_n wished minimum mumber of blocks flushed (it is
-not guaranteed that the actual number is that big, though)
-@param[in] lsn_limit in the case of !lru all blocks whose
-@param[out] n counts of flushed and evicted pages
-oldest_modification is smaller than this should be flushed (if their number
-does not exceed min_n), otherwise ignored */
-static
-void
-buf_flush_batch(
- bool lru,
- ulint min_n,
- lsn_t lsn_limit,
- flush_counters_t* n)
+/** Wait until a flush batch ends.
+@param lru true=buf_pool.LRU; false=buf_pool.flush_list */
+void buf_flush_wait_batch_end(bool lru)
{
- ut_ad(lru || !sync_check_iterate(dict_sync_check()));
-
- mutex_enter(&buf_pool.mutex);
-
- /* Note: The buffer pool mutex is released and reacquired within
- the flush functions. */
- if (lru) {
- buf_do_LRU_batch(min_n, n);
- } else {
- n->flushed = buf_do_flush_list_batch(min_n, lsn_limit);
- n->evicted = 0;
- }
+ const auto &n_flush= lru ? buf_pool.n_flush_LRU : buf_pool.n_flush_list;
- mutex_exit(&buf_pool.mutex);
-
- DBUG_PRINT("ib_buf",
- (lru ? "LRU flush completed" : "flush_list completed"));
+ if (n_flush)
+ {
+ auto cond= lru ? &buf_pool.done_flush_LRU : &buf_pool.done_flush_list;
+ tpool::tpool_wait_begin();
+ thd_wait_begin(nullptr, THD_WAIT_DISKIO);
+ do
+ mysql_cond_wait(cond, &buf_pool.mutex);
+ while (n_flush);
+ tpool::tpool_wait_end();
+ thd_wait_end(nullptr);
+ mysql_cond_broadcast(cond);
+ }
}
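
Write completion (buf_page_write_complete() above) decrements n_flush_LRU or n_flush_list under buf_pool.mutex and broadcasts the matching condition variable when the count reaches zero; this function is the waiting side. A sketch of the same handshake, with standard C++ primitives standing in for mysql_mutex/mysql_cond:

#include <condition_variable>
#include <mutex>

std::mutex pool_mutex;
std::condition_variable done_flush;
unsigned n_flush = 0;                  // writes in flight for this batch type

void write_completed()                 // I/O completion path
{
  std::lock_guard<std::mutex> g(pool_mutex);
  if (!--n_flush)
    done_flush.notify_all();           // mysql_cond_broadcast() analogue
}

void wait_batch_end()                  // caller must hold no page latches
{
  std::unique_lock<std::mutex> g(pool_mutex);
  done_flush.wait(g, [] { return n_flush == 0; });
}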
-/******************************************************************//**
-Gather the aggregated stats for both flush list and LRU list flushing.
-@param page_count_flush number of pages flushed from the end of the flush_list
-@param page_count_LRU number of pages flushed from the end of the LRU list
-*/
-static
-void
-buf_flush_stats(
-/*============*/
- ulint page_count_flush,
- ulint page_count_LRU)
-{
- DBUG_PRINT("ib_buf", ("flush completed, from flush_list %u pages, "
- "from LRU_list %u pages",
- unsigned(page_count_flush),
- unsigned(page_count_LRU)));
+/** Whether a background log flush is pending */
+static std::atomic_flag log_flush_pending;
- srv_stats.buf_pool_flushed.add(page_count_flush + page_count_LRU);
+/** Advance log_sys.get_flushed_lsn() */
+static void log_flush(void *)
+{
+ /* Between batches, we try to prevent I/O stalls with these calls.
+ This should not be needed for correctness. */
+ os_aio_wait_until_no_pending_writes();
+ fil_flush_file_spaces();
+
+ /* Guarantee progress for buf_flush_lists(). */
+ log_write_up_to(log_sys.get_lsn(), true);
+ log_flush_pending.clear();
}
-/** Start a buffer flush batch for LRU or flush list
-@param[in] lru true=buf_pool.LRU; false=buf_pool.flush_list
-@return whether the flush batch was started (was not already running) */
-static bool buf_flush_start(bool lru)
+static tpool::waitable_task log_flush_task(log_flush, nullptr, nullptr);
+
+/** Write out dirty blocks from buf_pool.flush_list.
+@param max_n desired maximum number of blocks flushed
+@param lsn buf_pool.get_oldest_modification(LSN_MAX) target (0=LRU flush)
+@return the number of processed pages
+@retval 0 if a batch of the same type (lsn==0 or lsn!=0) is already running */
+ulint buf_flush_lists(ulint max_n, lsn_t lsn)
{
- IORequest::flush_t flush_type= lru ? IORequest::LRU : IORequest::FLUSH_LIST;
- mutex_enter(&buf_pool.mutex);
+ auto &n_flush= lsn ? buf_pool.n_flush_list : buf_pool.n_flush_LRU;
+
+ if (n_flush)
+ return 0;
- if (buf_pool.n_flush[flush_type] > 0 || buf_pool.init_flush[flush_type])
+ if (log_sys.get_lsn() > log_sys.get_flushed_lsn())
{
- /* There is already a flush batch of the same type running */
- mutex_exit(&buf_pool.mutex);
- return false;
+ log_flush_task.wait();
+ if (log_sys.get_lsn() > log_sys.get_flushed_lsn() &&
+ !log_flush_pending.test_and_set())
+ srv_thread_pool->submit_task(&log_flush_task);
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+ if (UNIV_UNLIKELY(ibuf_debug))
+ log_write_up_to(log_sys.get_lsn(), true);
+#endif
}
- buf_pool.init_flush[flush_type]= true;
- os_event_reset(buf_pool.no_flush[flush_type]);
- mutex_exit(&buf_pool.mutex);
- return true;
-}
+ auto cond= lsn ? &buf_pool.done_flush_list : &buf_pool.done_flush_LRU;
-/** End a buffer flush batch.
-@param[in] lru true=buf_pool.LRU; false=buf_pool.flush_list */
-static void buf_flush_end(bool lru)
-{
- IORequest::flush_t flush_type= lru ? IORequest::LRU : IORequest::FLUSH_LIST;
+ mysql_mutex_lock(&buf_pool.mutex);
+ const bool running= n_flush != 0;
+ /* FIXME: we are performing a dirty read of buf_pool.flush_list.count
+ while not holding buf_pool.flush_list_mutex */
+ if (running || !UT_LIST_GET_LEN(buf_pool.flush_list))
+ {
+ if (!running)
+ mysql_cond_broadcast(cond);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ return 0;
+ }
+ n_flush++;
+
+ ulint n_flushed= lsn
+ ? buf_do_flush_list_batch(max_n, lsn)
+ : buf_do_LRU_batch(max_n);
- mutex_enter(&buf_pool.mutex);
+ const auto n_flushing= --n_flush;
- buf_pool.init_flush[flush_type]= false;
buf_pool.try_LRU_scan= true;
- if (!buf_pool.n_flush[flush_type])
- /* The running flush batch has ended */
- os_event_set(buf_pool.no_flush[flush_type]);
+ mysql_mutex_unlock(&buf_pool.mutex);
- mutex_exit(&buf_pool.mutex);
+ if (!n_flushing)
+ mysql_cond_broadcast(cond);
- if (!srv_read_only_mode)
- buf_dblwr_flush_buffered_writes();
-}
+ buf_dblwr.flush_buffered_writes();
-/** Wait until a flush batch ends.
-@param[in] lru true=buf_pool.LRU; false=buf_pool.flush_list */
-void buf_flush_wait_batch_end(bool lru)
-{
- thd_wait_begin(nullptr, THD_WAIT_DISKIO);
- os_event_wait(buf_pool.no_flush[lru
- ? IORequest::LRU : IORequest::FLUSH_LIST]);
- thd_wait_end(nullptr);
+ DBUG_PRINT("ib_buf", ("%s completed, " ULINTPF " pages",
+ lsn ? "flush_list" : "LRU flush", n_flushed));
+ return n_flushed;
}
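
log_flush_pending ensures that at most one log_flush_task is queued at any time: the first caller's test_and_set() wins the race and submits, and the task re-arms the flag on completion. A sketch of the same once-only gate with std::atomic_flag (std::thread stands in for srv_thread_pool):

#include <atomic>
#include <thread>

std::atomic_flag flush_pending = ATOMIC_FLAG_INIT;

void flush_work()
{
  // ... write the log up to the current LSN ...
  flush_pending.clear();               // re-arm for the next submission
}

void maybe_submit_flush()
{
  if (!flush_pending.test_and_set())   // only the first caller gets through
    std::thread(flush_work).detach();  // thread-pool submission stand-in
}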
-/** Do flushing batch of a given type.
-NOTE: The calling thread is not allowed to own any latches on pages!
-@param[in] lru true=buf_pool.LRU; false=buf_pool.flush_list
-@param[in] min_n wished minimum mumber of blocks flushed
-(it is not guaranteed that the actual number is that big, though)
-@param[in] lsn_limit if !lru, all blocks whose
-oldest_modification is smaller than this should be flushed (if their number
-does not exceed min_n), otherwise ignored
-@param[out] n_processed the number of pages which were processed is
-passed back to caller. Ignored if NULL
-@retval true if a batch was queued successfully.
-@retval false if another batch of same type was already running. */
-bool buf_flush_do_batch(bool lru, ulint min_n, lsn_t lsn_limit,
- flush_counters_t *n)
+
+/** Initiate a log checkpoint, discarding the start of the log.
+@param oldest_lsn the checkpoint LSN
+@param end_lsn log_sys.get_lsn()
+@return true if success, false if a checkpoint write was already running */
+static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn)
{
- if (n)
- n->flushed= 0;
+ ut_ad(!srv_read_only_mode);
+ mysql_mutex_assert_owner(&log_sys.mutex);
+ ut_ad(oldest_lsn <= end_lsn);
+ ut_ad(end_lsn == log_sys.get_lsn());
+ ut_ad(!recv_no_log_write);
+
+ ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn);
+
+ if (oldest_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT)
+ /* Some log has been written since the previous checkpoint. */;
+ else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
+ /* MariaDB startup expects the redo log file to be logically empty
+ (not even containing a FILE_CHECKPOINT record) after a clean shutdown.
+ Perform an extra checkpoint at shutdown. */;
+ else
+ {
+ /* Do nothing, because nothing was logged (other than a
+ FILE_CHECKPOINT record) since the previous checkpoint. */
+ mysql_mutex_unlock(&log_sys.mutex);
+ return true;
+ }
+
+ /* Repeat the FILE_MODIFY records after the checkpoint, in case some
+ log records between the checkpoint and log_sys.lsn need them.
+ Finally, write a FILE_CHECKPOINT record. Redo log apply expects to
+ see a FILE_CHECKPOINT after the checkpoint, except on clean
+ shutdown, where the log will be empty after the checkpoint.
+
+ It is important that we write out the redo log before any further
+ dirty pages are flushed to the tablespace files. At this point,
+ because we hold log_sys.mutex, mtr_t::commit() in other threads will
+ be blocked, and no pages can be added to the flush lists. */
+ lsn_t flush_lsn= oldest_lsn;
+
+ if (fil_names_clear(flush_lsn, oldest_lsn != end_lsn ||
+ srv_shutdown_state <= SRV_SHUTDOWN_INITIATED))
+ {
+ flush_lsn= log_sys.get_lsn();
+ ut_ad(flush_lsn >= end_lsn + SIZE_OF_FILE_CHECKPOINT);
+ mysql_mutex_unlock(&log_sys.mutex);
+ log_write_up_to(flush_lsn, true, true);
+ mysql_mutex_lock(&log_sys.mutex);
+ if (log_sys.last_checkpoint_lsn >= oldest_lsn)
+ {
+ mysql_mutex_unlock(&log_sys.mutex);
+ return true;
+ }
+ }
+ else
+ ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn);
- if (!buf_flush_start(lru))
+ ut_ad(log_sys.get_flushed_lsn() >= flush_lsn);
+
+ if (log_sys.n_pending_checkpoint_writes)
+ {
+ /* A checkpoint write is running */
+ mysql_mutex_unlock(&log_sys.mutex);
return false;
+ }
- buf_flush_batch(lru, min_n, lsn_limit, n);
- buf_flush_end(lru);
+ log_sys.next_checkpoint_lsn= oldest_lsn;
+ log_write_checkpoint_info(end_lsn);
+ mysql_mutex_assert_not_owner(&log_sys.mutex);
return true;
}
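
The early return above means a checkpoint is only attempted when more log than a single FILE_CHECKPOINT record was written since the previous one. The same comparison with hypothetical numbers (the actual SIZE_OF_FILE_CHECKPOINT constant differs):

#include <cassert>
#include <cstdint>

int main()
{
  const uint64_t SIZE_OF_FILE_CHECKPOINT = 12;  // illustrative value only
  const uint64_t last_checkpoint_lsn = 1000;

  uint64_t oldest_lsn = 1010;  // within the slack: nothing new was logged
  assert(!(oldest_lsn > last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT));

  oldest_lsn = 1100;           // real progress: write a new checkpoint
  assert(oldest_lsn > last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT);
}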
-/** Wait until a flush batch of the given lsn ends
-@param[in] new_oldest target oldest_modified_lsn to wait for */
-void buf_flush_wait_flushed(lsn_t new_oldest)
+/** Make a checkpoint. Note that this function does not flush dirty
+blocks from the buffer pool: it only determines the LSN of the oldest
+modification in the pool and writes that LSN to the
+log file. Use log_make_checkpoint() to also flush the pool.
+@retval true if the checkpoint was or had been made
+@retval false if a checkpoint write was already running */
+static bool log_checkpoint()
{
- for (;;) {
- /* We don't need to wait for fsync of the flushed
- blocks, because anyway we need fsync to make chekpoint.
- So, we don't need to wait for the batch end here. */
-
- mutex_enter(&buf_pool.flush_list_mutex);
-
- buf_page_t* bpage;
- /* FIXME: Keep temporary tablespace pages in a separate flush
- list. We would only need to write out temporary pages if the
- page is about to be evicted from the buffer pool, and the page
- contents is still needed (the page has not been freed). */
- for (bpage = UT_LIST_GET_LAST(buf_pool.flush_list);
- bpage && fsp_is_system_temporary(bpage->id().space());
- bpage = UT_LIST_GET_PREV(list, bpage)) {
- ut_ad(bpage->oldest_modification());
- }
-
- lsn_t oldest = bpage ? bpage->oldest_modification() : 0;
+ if (recv_recovery_is_on())
+ recv_sys.apply(true);
- mutex_exit(&buf_pool.flush_list_mutex);
-
- if (oldest == 0 || oldest >= new_oldest) {
- break;
- }
+ switch (srv_file_flush_method) {
+ case SRV_NOSYNC:
+ case SRV_O_DIRECT_NO_FSYNC:
+ break;
+ default:
+ fil_flush_file_spaces();
+ }
- /* sleep and retry */
- os_thread_sleep(buf_flush_wait_flushed_sleep_time);
+ mysql_mutex_lock(&log_sys.mutex);
+ const lsn_t end_lsn= log_sys.get_lsn();
+ mysql_mutex_lock(&log_sys.flush_order_mutex);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ const lsn_t oldest_lsn= buf_pool.get_oldest_modification(end_lsn);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ mysql_mutex_unlock(&log_sys.flush_order_mutex);
+ return log_checkpoint_low(oldest_lsn, end_lsn);
+}
- MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
- }
+/** Make a checkpoint. */
+ATTRIBUTE_COLD void log_make_checkpoint()
+{
+ buf_flush_wait_flushed(log_sys.get_lsn());
+ while (!log_checkpoint());
}
-/** This utility flushes dirty blocks from the end of the flush list.
-NOTE: The calling thread is not allowed to own any latches on pages!
-@param[in] min_n wished minimum mumber of blocks flushed (it is
-not guaranteed that the actual number is that big, though)
-@param[in] lsn_limit all blocks whose
-oldest_modification is smaller than this should be flushed (if their number
-does not exceed min_n), otherwise ignored
-@param[out] n_processed the number of pages which were processed is
-passed back to caller. Ignored if NULL.
-@retval true if a batch was queued successfully
-@retval false if another batch of same type was already running */
-bool buf_flush_lists(ulint min_n, lsn_t lsn_limit, ulint *n_processed)
+/** Wait until all persistent pages are flushed up to a limit.
+@param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */
+ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn)
{
- flush_counters_t n;
+ ut_ad(sync_lsn);
+ ut_ad(sync_lsn < LSN_MAX);
+ mysql_mutex_assert_not_owner(&log_sys.mutex);
+ ut_ad(!srv_read_only_mode);
- bool success = buf_flush_do_batch(false, min_n, lsn_limit, &n);
+ if (recv_recovery_is_on())
+ recv_sys.apply(true);
- if (n.flushed) {
- buf_flush_stats(n.flushed, 0);
- }
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
- if (n_processed) {
- *n_processed = n.flushed;
- }
+#if 1 /* FIXME: remove this, and guarantee that the page cleaner serves us */
+ if (UNIV_UNLIKELY(!buf_page_cleaner_is_active)
+ ut_d(|| innodb_page_cleaner_disabled_debug))
+ {
+ for (;;)
+ {
+ const lsn_t lsn= buf_pool.get_oldest_modification(sync_lsn);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ if (lsn >= sync_lsn)
+ return;
+ ulint n_pages= buf_flush_lists(srv_max_io_capacity, sync_lsn);
+ buf_flush_wait_batch_end_acquiring_mutex(false);
+ if (n_pages)
+ {
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+ MONITOR_FLUSH_SYNC_COUNT,
+ MONITOR_FLUSH_SYNC_PAGES, n_pages);
+ log_checkpoint();
+ }
+ MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ }
+ return;
+ }
+ else if (UNIV_LIKELY(srv_flush_sync))
+#endif
+ {
+ if (buf_flush_sync_lsn < sync_lsn)
+ {
+ buf_flush_sync_lsn= sync_lsn;
+ mysql_cond_signal(&buf_pool.do_flush_list);
+ }
+ }
+
+ while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn)
+ {
+ tpool::tpool_wait_begin();
+ thd_wait_begin(nullptr, THD_WAIT_DISKIO);
+ mysql_cond_wait(&buf_pool.done_flush_list, &buf_pool.flush_list_mutex);
+ thd_wait_end(nullptr);
+ tpool::tpool_wait_end();
- return success;
+ MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
+ }
+
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
}
-/******************************************************************//**
-This function picks up a single page from the tail of the LRU
-list, flushes it (if it is dirty), removes it from page_hash and LRU
-list and puts it on the free list. It is called from user threads when
-they are unable to find a replaceable page at the tail of the LRU
-list i.e.: when the background LRU flushing in the page_cleaner thread
-is not fast enough to keep pace with the workload.
-@return true if success. */
-bool buf_flush_single_page_from_LRU()
+/** If innodb_flush_sync=ON, initiate a furious flush.
+@param lsn buf_pool.get_oldest_modification(LSN_MAX) target */
+void buf_flush_ahead(lsn_t lsn)
{
- ulint scanned = 0;
- bool freed = false;
+ mysql_mutex_assert_not_owner(&log_sys.mutex);
+ ut_ad(!srv_read_only_mode);
- mutex_enter(&buf_pool.mutex);
+ if (recv_recovery_is_on())
+ recv_sys.apply(true);
- for (buf_page_t* bpage = buf_pool.single_scan_itr.start(); bpage;
- ++scanned, bpage = buf_pool.single_scan_itr.get()) {
+ if (buf_flush_sync_lsn < lsn &&
+ UNIV_LIKELY(srv_flush_sync) && UNIV_LIKELY(buf_page_cleaner_is_active))
+ {
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ if (buf_flush_sync_lsn < lsn)
+ {
+ buf_flush_sync_lsn= lsn;
+ mysql_cond_signal(&buf_pool.do_flush_list);
+ }
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ }
+}
- ut_ad(mutex_own(&buf_pool.mutex));
+/** Wait for pending flushes to complete. */
+void buf_flush_wait_batch_end_acquiring_mutex(bool lru)
+{
+ if (lru ? buf_pool.n_flush_LRU : buf_pool.n_flush_list)
+ {
+ mysql_mutex_lock(&buf_pool.mutex);
+ buf_flush_wait_batch_end(lru);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ }
+}
- buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
- buf_pool.single_scan_itr.set(prev);
+/** Conduct checkpoint-related flushing for innodb_flush_sync=ON,
+and try to initiate checkpoints until the target is met.
+@param lsn minimum value of buf_pool.get_oldest_modification(LSN_MAX) */
+ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
+{
+ ut_ad(!srv_read_only_mode);
- if (!bpage->ready_for_flush()) { // FIXME: ready_for_replace()
- continue;
- }
+ for (;;)
+ {
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
- if (!bpage->buf_fix_count()
- && buf_LRU_free_page(bpage, true)) {
- /* block is ready for eviction i.e., it is
- clean and is not IO-fixed or buffer fixed. */
- freed = true;
- break;
- } else {
- /* Block is ready for flush. Try and dispatch an IO
- request. We'll put it on free list in IO completion
- routine if it is not buffer fixed. The following call
- will release the buf_pool.mutex.
+ if (ulint n_flushed= buf_flush_lists(srv_max_io_capacity, lsn))
+ {
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+ MONITOR_FLUSH_SYNC_COUNT,
+ MONITOR_FLUSH_SYNC_PAGES, n_flushed);
+ }
- Note: There is no guarantee that this page has actually
- been freed, only that it has been flushed to disk */
+ /* Attempt to perform a log checkpoint upon completing each batch. */
+ if (recv_recovery_is_on())
+ recv_sys.apply(true);
- freed = buf_flush_page(bpage, IORequest::SINGLE_PAGE,
- nullptr, true);
+ switch (srv_file_flush_method) {
+ case SRV_NOSYNC:
+ case SRV_O_DIRECT_NO_FSYNC:
+ break;
+ default:
+ fil_flush_file_spaces();
+ }
- if (freed) {
- goto found;
- }
- }
- }
+ mysql_mutex_lock(&log_sys.mutex);
+ const lsn_t newest_lsn= log_sys.get_lsn();
+ mysql_mutex_lock(&log_sys.flush_order_mutex);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ lsn_t measure= buf_pool.get_oldest_modification(0);
+ mysql_mutex_unlock(&log_sys.flush_order_mutex);
+ const lsn_t checkpoint_lsn= measure ? measure : newest_lsn;
- mutex_exit(&buf_pool.mutex);
-found:
- if (scanned) {
- MONITOR_INC_VALUE_CUMULATIVE(
- MONITOR_LRU_SINGLE_FLUSH_SCANNED,
- MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
- MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
- scanned);
- }
+ if (checkpoint_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT)
+ {
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ log_checkpoint_low(checkpoint_lsn, newest_lsn);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ measure= buf_pool.get_oldest_modification(LSN_MAX);
+ }
+ else
+ {
+ mysql_mutex_unlock(&log_sys.mutex);
+ if (!measure)
+ measure= LSN_MAX;
+ }
- ut_ad(!mutex_own(&buf_pool.mutex));
- return(freed);
-}
+ mysql_mutex_assert_not_owner(&log_sys.mutex);
-/**
-Clear up the tail of the LRU list.
-Put replaceable pages at the tail of LRU to the free list.
-Flush dirty pages at the tail of LRU to the disk.
-The depth to which we scan each buffer pool is controlled by dynamic
-config parameter innodb_LRU_scan_depth.
-@return total pages flushed */
-static ulint buf_flush_LRU_list()
-{
- ulint scan_depth, withdraw_depth;
- flush_counters_t n;
-
- memset(&n, 0, sizeof(flush_counters_t));
-
- /* srv_LRU_scan_depth can be arbitrarily large value.
- We cap it with current LRU size. */
- mutex_enter(&buf_pool.mutex);
- scan_depth = UT_LIST_GET_LEN(buf_pool.LRU);
- if (buf_pool.curr_size < buf_pool.old_size
- && buf_pool.withdraw_target > 0) {
- withdraw_depth = buf_pool.withdraw_target
- - UT_LIST_GET_LEN(buf_pool.withdraw);
- } else {
- withdraw_depth = 0;
- }
- mutex_exit(&buf_pool.mutex);
- if (withdraw_depth > srv_LRU_scan_depth) {
- scan_depth = ut_min(withdraw_depth, scan_depth);
- } else {
- scan_depth = ut_min(static_cast<ulint>(srv_LRU_scan_depth),
- scan_depth);
- }
- /* Currently one of page_cleaners is the only thread
- that can trigger an LRU flush at the same time.
- So, it is not possible that a batch triggered during
- last iteration is still running, */
- buf_flush_do_batch(true, scan_depth, 0, &n);
+  /* After attempting a log checkpoint, check whether we have reached our target. */
+ const lsn_t target= buf_flush_sync_lsn;
- return(n.flushed);
-}
+ if (measure >= target)
+ buf_flush_sync_lsn= 0;
-/** Wait for any possible LRU flushes to complete. */
-void buf_flush_wait_LRU_batch_end()
-{
- if (buf_pool.n_flush[IORequest::LRU] || buf_pool.init_flush[IORequest::LRU])
- buf_flush_wait_batch_end(true);
+ /* wake up buf_flush_wait_flushed() */
+ mysql_cond_broadcast(&buf_pool.done_flush_list);
+
+ lsn= std::max(lsn, target);
+
+ if (measure >= lsn)
+ return;
+ }
}
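
The loop above alternates between flushing a batch of dirty pages and attempting a log checkpoint until the oldest modification in the buffer pool reaches the requested LSN. A minimal single-threaded model of that control flow, with simulated state standing in for buf_flush_lists() and log_checkpoint_low() (all names here are illustrative, not the server's):

#include <algorithm>
#include <cstdint>
#include <cstdio>

using lsn_t = std::uint64_t;

struct pool_model {
  lsn_t oldest = 100;       // oldest dirty-page LSN; 0 would mean "clean"
  lsn_t checkpoint = 0;
  // Simulates one buf_flush_lists() batch advancing the oldest LSN.
  void flush_batch(lsn_t to) { oldest = std::min<lsn_t>(oldest + 512, to); }
};

static lsn_t flush_sync_sketch(pool_model& p, lsn_t target)
{
  while (p.oldest < target) {
    p.flush_batch(target);      // buf_flush_lists(srv_max_io_capacity, lsn)
    p.checkpoint = p.oldest;    // log_checkpoint_low(checkpoint_lsn, ...)
  }                             // the real loop also re-reads buf_flush_sync_lsn
  return p.checkpoint;          // and wakes buf_flush_wait_flushed() waiters
}

int main()
{
  pool_model p;
  std::printf("checkpoint advanced to %llu\n",
              static_cast<unsigned long long>(flush_sync_sketch(p, 2000)));
}
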
/*********************************************************************//**
Calculates if flushing is required based on number of dirty pages in
the buffer pool.
+@param dirty_pct 100*flush_list.count / (LRU.count + free.count)
@return percent of io_capacity to flush to manage dirty page ratio */
-static
-ulint
-af_get_pct_for_dirty()
+static ulint af_get_pct_for_dirty(double dirty_pct)
{
- const ulint dirty = UT_LIST_GET_LEN(buf_pool.flush_list);
- if (!dirty) {
- /* No pages modified */
- return 0;
- }
-
- /* 1 + is there to avoid division by zero (in case the buffer
- pool (including the flush_list) was emptied while we are
- looking at it) */
- double dirty_pct = 100 * static_cast<double>(dirty)
- / static_cast<double>(1 + UT_LIST_GET_LEN(buf_pool.LRU)
- + UT_LIST_GET_LEN(buf_pool.free));
-
- ut_a(srv_max_dirty_pages_pct_lwm
- <= srv_max_buf_pool_modified_pct);
+ ut_ad(srv_max_dirty_pages_pct_lwm <= srv_max_buf_pool_modified_pct);
if (srv_max_dirty_pages_pct_lwm == 0) {
/* The user has not set the option to preflush dirty
@@ -2010,7 +1851,7 @@ af_get_pct_for_dirty()
innodb_io_capacity. */
return(100);
}
- } else if (dirty_pct >= srv_max_dirty_pages_pct_lwm) {
+ } else {
/* We should start flushing pages gradually. */
return(static_cast<ulint>((dirty_pct * 100)
/ (srv_max_buf_pool_modified_pct + 1)));
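
As a worked example of the gradual-flushing branch above (values hypothetical): with innodb_max_dirty_pages_pct_lwm set nonzero and innodb_max_dirty_pages_pct=90, a pool that is 30% dirty requests 30 * 100 / (90 + 1) = 32, i.e. about a third of innodb_io_capacity:

#include <cstdio>

// Same arithmetic as the return statement above, isolated for illustration.
static unsigned long pct_of_io_capacity(double dirty_pct, unsigned long max_dirty_pct)
{
  return static_cast<unsigned long>(dirty_pct * 100 / (max_dirty_pct + 1));
}

int main()
{
  std::printf("%lu\n", pct_of_io_capacity(30.0, 90));  // prints 32
}
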
@@ -2028,30 +1869,16 @@ af_get_pct_for_lsn(
/*===============*/
lsn_t age) /*!< in: current age of LSN. */
{
- lsn_t max_async_age;
- lsn_t lsn_age_factor;
lsn_t af_lwm = static_cast<lsn_t>(
srv_adaptive_flushing_lwm
- * static_cast<double>(log_get_capacity()) / 100);
+ * static_cast<double>(log_sys.log_capacity) / 100);
if (age < af_lwm) {
/* No adaptive flushing. */
return(0);
}
- max_async_age = log_get_max_modified_age_async();
-
- if (age < max_async_age && !srv_adaptive_flushing) {
- /* We have still not reached the max_async point and
- the user has disabled adaptive flushing. */
- return(0);
- }
-
- /* If we are here then we know that either:
- 1) User has enabled adaptive flushing
- 2) User may have disabled adaptive flushing but we have reached
- max_async_age. */
- lsn_age_factor = (age * 100) / max_async_age;
+ lsn_t lsn_age_factor = (age * 100) / log_sys.max_modified_age_async;
ut_ad(srv_max_io_capacity >= srv_io_capacity);
return static_cast<ulint>(
@@ -2061,47 +1888,40 @@ af_get_pct_for_lsn(
/ 7.5));
}
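
The middle of the return expression falls outside this hunk; in this era of the code it conventionally scales as lsn_age_factor raised to the 3/2 power, times srv_max_io_capacity/srv_io_capacity, before the visible division by 7.5 (an assumption here, stated as such). A numeric illustration under that assumption:

#include <cmath>
#include <cstdio>

// Assumed shape of the elided expression: pct grows as age_factor^(3/2),
// scaled by the ratio of innodb_io_capacity_max to innodb_io_capacity.
static unsigned long pct_for_lsn_sketch(double max_io, double io, double age_factor)
{
  return static_cast<unsigned long>(
      max_io / io * age_factor * std::sqrt(age_factor) / 7.5);
}

int main()
{
  // Age at 50% of max_modified_age_async, max capacity twice the base:
  // 2 * 50 * sqrt(50) / 7.5 = ~94% of innodb_io_capacity.
  std::printf("%lu\n", pct_for_lsn_sketch(2000, 1000, 50));
}
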
-/*********************************************************************//**
-This function is called approximately once every second by the
-page_cleaner thread. Based on various factors it decides if there is a
-need to do flushing.
+/** This function is called approximately once every second by the
+page_cleaner thread if innodb_adaptive_flushing=ON.
+Based on various factors it decides whether there is a need to flush pages.
@return number of pages recommended to be flushed
-@param last_pages_in the number of pages flushed by the last flush_list
- flushing. */
-static
-ulint
-page_cleaner_flush_pages_recommendation(ulint last_pages_in)
+@param last_pages_in number of pages flushed in previous batch
+@param oldest_lsn buf_pool.get_oldest_modification(0)
+@param dirty_pct 100*flush_list.count / (LRU.count + free.count) */
+static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in,
+ lsn_t oldest_lsn,
+ double dirty_pct)
{
static lsn_t prev_lsn = 0;
static ulint sum_pages = 0;
static ulint avg_page_rate = 0;
static ulint n_iterations = 0;
static time_t prev_time;
- lsn_t oldest_lsn;
- lsn_t cur_lsn;
- lsn_t age;
lsn_t lsn_rate;
ulint n_pages = 0;
- ulint pct_for_dirty = 0;
- ulint pct_for_lsn = 0;
- ulint pct_total = 0;
- cur_lsn = log_sys.get_lsn();
+ const lsn_t cur_lsn = log_sys.get_lsn();
+ ulint pct_for_dirty = af_get_pct_for_dirty(dirty_pct);
+ ut_ad(oldest_lsn <= cur_lsn);
+ ulint pct_for_lsn = af_get_pct_for_lsn(cur_lsn - oldest_lsn);
+ time_t curr_time = time(nullptr);
- if (prev_lsn == 0) {
- /* First time around. */
+ if (!prev_lsn || !pct_for_lsn) {
+ prev_time = curr_time;
prev_lsn = cur_lsn;
- prev_time = time(NULL);
- return(0);
- }
-
- if (prev_lsn == cur_lsn) {
- return(0);
+ return ulint(double(pct_for_dirty) / 100.0
+ * double(srv_io_capacity));
}
sum_pages += last_pages_in;
- time_t curr_time = time(NULL);
double time_elapsed = difftime(curr_time, prev_time);
/* We update our variables every srv_flushing_avg_loops
@@ -2125,66 +1945,18 @@ page_cleaner_flush_pages_recommendation(ulint last_pages_in)
lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;
- /* aggregate stats of all slots */
- mutex_enter(&page_cleaner.mutex);
-
ulint flush_tm = page_cleaner.flush_time;
ulint flush_pass = page_cleaner.flush_pass;
page_cleaner.flush_time = 0;
page_cleaner.flush_pass = 0;
- ulint lru_tm = page_cleaner.slot.flush_lru_time;
- ulint list_tm = page_cleaner.slot.flush_list_time;
- ulint lru_pass = page_cleaner.slot.flush_lru_pass;
- ulint list_pass = page_cleaner.slot.flush_list_pass;
- page_cleaner.slot.flush_lru_time = 0;
- page_cleaner.slot.flush_lru_pass = 0;
- page_cleaner.slot.flush_list_time = 0;
- page_cleaner.slot.flush_list_pass = 0;
- mutex_exit(&page_cleaner.mutex);
-
- /* minimum values are 1, to avoid dividing by zero. */
- if (lru_tm < 1) {
- lru_tm = 1;
- }
- if (list_tm < 1) {
- list_tm = 1;
- }
- if (flush_tm < 1) {
- flush_tm = 1;
+ if (flush_pass) {
+ flush_tm /= flush_pass;
}
- if (lru_pass < 1) {
- lru_pass = 1;
- }
- if (list_pass < 1) {
- list_pass = 1;
- }
- if (flush_pass < 1) {
- flush_pass = 1;
- }
-
- MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_SLOT,
- list_tm / list_pass);
- MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_SLOT,
- lru_tm / lru_pass);
-
- MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_THREAD,
- list_tm / flush_pass);
- MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_THREAD,
- lru_tm / flush_pass);
- MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_EST,
- flush_tm * list_tm / flush_pass
- / (list_tm + lru_tm));
- MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_EST,
- flush_tm * lru_tm / flush_pass
- / (list_tm + lru_tm));
- MONITOR_SET(MONITOR_FLUSH_AVG_TIME, flush_tm / flush_pass);
-
- MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS, list_pass);
- MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_PASS, lru_pass);
- MONITOR_SET(MONITOR_FLUSH_AVG_PASS, flush_pass);
+ MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME, flush_tm);
+ MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS, flush_pass);
prev_lsn = cur_lsn;
prev_time = curr_time;
@@ -2194,49 +1966,32 @@ page_cleaner_flush_pages_recommendation(ulint last_pages_in)
sum_pages = 0;
}
- oldest_lsn = buf_pool.get_oldest_modification();
-
- ut_ad(oldest_lsn <= log_get_lsn());
-
- age = cur_lsn > oldest_lsn ? cur_lsn - oldest_lsn : 0;
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
- pct_for_dirty = af_get_pct_for_dirty();
- pct_for_lsn = af_get_pct_for_lsn(age);
-
- pct_total = ut_max(pct_for_dirty, pct_for_lsn);
+ ulint pct_total = std::max(pct_for_dirty, pct_for_lsn);
/* Estimate pages to be flushed for the lsn progress */
lsn_t target_lsn = oldest_lsn
+ lsn_avg_rate * buf_flush_lsn_scan_factor;
ulint pages_for_lsn = 0;
- mutex_enter(&buf_pool.flush_list_mutex);
for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool.flush_list);
b != NULL;
b = UT_LIST_GET_PREV(list, b)) {
if (b->oldest_modification() > target_lsn) {
break;
}
- ++pages_for_lsn;
+ if (++pages_for_lsn >= srv_max_io_capacity) {
+ break;
+ }
}
- mutex_exit(&buf_pool.flush_list_mutex);
-
- mutex_enter(&page_cleaner.mutex);
- ut_ad(page_cleaner.slot.state == PAGE_CLEANER_STATE_NONE);
- page_cleaner.slot.n_pages_requested
- = pages_for_lsn / buf_flush_lsn_scan_factor + 1;
- mutex_exit(&page_cleaner.mutex);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
pages_for_lsn /= buf_flush_lsn_scan_factor;
if (pages_for_lsn < 1) {
pages_for_lsn = 1;
}
- /* Cap the maximum IO capacity that we are going to use by
- max_io_capacity. Limit the value to avoid too quick increase */
- pages_for_lsn = std::min<ulint>(
- pages_for_lsn, srv_max_io_capacity * 2);
-
n_pages = (ulint(double(srv_io_capacity) * double(pct_total) / 100.0)
+ avg_page_rate + pages_for_lsn) / 3;
@@ -2244,21 +1999,6 @@ page_cleaner_flush_pages_recommendation(ulint last_pages_in)
n_pages = srv_max_io_capacity;
}
- mutex_enter(&page_cleaner.mutex);
- ut_ad(page_cleaner.n_slots_requested == 0);
- ut_ad(page_cleaner.n_slots_flushing == 0);
- ut_ad(page_cleaner.n_slots_finished == 0);
-
- /* if REDO has enough of free space,
- don't care about age distribution of pages */
- if (pct_for_lsn > 30) {
- page_cleaner.slot.n_pages_requested *= n_pages
- / pages_for_lsn + 1;
- } else {
- page_cleaner.slot.n_pages_requested = n_pages;
- }
- mutex_exit(&page_cleaner.mutex);
-
MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);
MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, pages_for_lsn);
@@ -2271,635 +2011,241 @@ page_cleaner_flush_pages_recommendation(ulint last_pages_in)
return(n_pages);
}
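
The recommendation blends three signals: the io_capacity percentage from the dirty-page and LSN heuristics, the recent average page rate, and the pages needed to reach the LSN target, then caps the mean at srv_max_io_capacity. A worked example with hypothetical inputs:

#include <algorithm>
#include <cstdio>

static unsigned long recommend(unsigned long io_capacity, unsigned long max_io,
                               unsigned long pct_total, unsigned long avg_page_rate,
                               unsigned long pages_for_lsn)
{
  const unsigned long n =
      (io_capacity * pct_total / 100 + avg_page_rate + pages_for_lsn) / 3;
  return std::min(n, max_io);  // never exceed innodb_io_capacity_max
}

int main()
{
  // innodb_io_capacity=200 at pct_total=50 contributes 100 pages;
  // with a recent rate of 120 and 80 pages needed for the LSN target:
  // (100 + 120 + 80) / 3 = 100 pages recommended.
  std::printf("%lu\n", recommend(200, 2000, 50, 120, 80));
}
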
-/*********************************************************************//**
-Puts the page_cleaner thread to sleep if it has finished work in less
-than a second
-@retval 0 wake up by event set,
-@retval OS_SYNC_TIME_EXCEEDED if timeout was exceeded
-@param next_loop_time time when next loop iteration should start
-@param sig_count zero or the value returned by previous call of
- os_event_reset()
-@param cur_time current time as in ut_time_ms() */
-static
-ulint
-pc_sleep_if_needed(
-/*===============*/
- ulint next_loop_time,
- int64_t sig_count,
- ulint cur_time)
-{
- /* No sleep if we are cleaning the buffer pool during the shutdown
- with everything else finished */
- if (srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE)
- return OS_SYNC_TIME_EXCEEDED;
-
- if (next_loop_time > cur_time) {
- /* Get sleep interval in micro seconds. We use
- ut_min() to avoid long sleep in case of wrap around. */
- ulint sleep_us;
-
- sleep_us = ut_min(static_cast<ulint>(1000000),
- (next_loop_time - cur_time) * 1000);
-
- return(os_event_wait_time_low(buf_flush_event,
- sleep_us, sig_count));
- }
-
- return(OS_SYNC_TIME_EXCEEDED);
-}
-
-/**
-Requests for all slots to flush.
-@param min_n wished minimum number of blocks flushed
- (it is not guaranteed that the actual number is that big)
-@param lsn_limit in the case of buf_pool.flush_list all blocks whose
- oldest_modification is smaller than this should be flushed
- (if their number does not exceed min_n), otherwise ignored
-*/
-static void pc_request(ulint min_n, lsn_t lsn_limit)
-{
- mutex_enter(&page_cleaner.mutex);
-
- ut_ad(page_cleaner.n_slots_requested == 0);
- ut_ad(page_cleaner.n_slots_flushing == 0);
- ut_ad(page_cleaner.n_slots_finished == 0);
-
- page_cleaner.requested = (min_n > 0);
- page_cleaner.lsn_limit = lsn_limit;
-
- ut_ad(page_cleaner.slot.state == PAGE_CLEANER_STATE_NONE);
-
- if (min_n == 0 || min_n == ULINT_MAX) {
- page_cleaner.slot.n_pages_requested = min_n;
- }
-
- /* page_cleaner.slot.n_pages_requested was already set by
- page_cleaner_flush_pages_recommendation() */
-
- page_cleaner.slot.state = PAGE_CLEANER_STATE_REQUESTED;
-
- page_cleaner.n_slots_requested = 1;
- page_cleaner.n_slots_flushing = 0;
- page_cleaner.n_slots_finished = 0;
-
- mutex_exit(&page_cleaner.mutex);
-}
-
-/**
-Do flush for one slot.
-@return the number of the slots which has not been treated yet. */
-static ulint pc_flush_slot()
-{
- ulint lru_tm = 0;
- ulint list_tm = 0;
- ulint lru_pass = 0;
- ulint list_pass = 0;
-
- mutex_enter(&page_cleaner.mutex);
-
- if (page_cleaner.n_slots_requested) {
- ut_ad(page_cleaner.slot.state == PAGE_CLEANER_STATE_REQUESTED);
- page_cleaner.n_slots_requested--;
- page_cleaner.n_slots_flushing++;
- page_cleaner.slot.state = PAGE_CLEANER_STATE_FLUSHING;
-
- if (UNIV_UNLIKELY(!page_cleaner.is_running)) {
- page_cleaner.slot.n_flushed_lru = 0;
- page_cleaner.slot.n_flushed_list = 0;
- goto finish_mutex;
- }
-
- mutex_exit(&page_cleaner.mutex);
-
- lru_tm = ut_time_ms();
-
- /* Flush pages from end of LRU if required */
- page_cleaner.slot.n_flushed_lru = buf_flush_LRU_list();
-
- lru_tm = ut_time_ms() - lru_tm;
- lru_pass++;
-
- if (UNIV_UNLIKELY(!page_cleaner.is_running)) {
- page_cleaner.slot.n_flushed_list = 0;
- goto finish;
- }
-
- /* Flush pages from flush_list if required */
- if (page_cleaner.requested) {
- flush_counters_t n;
- memset(&n, 0, sizeof(flush_counters_t));
- list_tm = ut_time_ms();
-
- page_cleaner.slot.succeeded_list = buf_flush_do_batch(
- false,
- page_cleaner.slot.n_pages_requested,
- page_cleaner.lsn_limit,
- &n);
-
- page_cleaner.slot.n_flushed_list = n.flushed;
-
- list_tm = ut_time_ms() - list_tm;
- list_pass++;
- } else {
- page_cleaner.slot.n_flushed_list = 0;
- page_cleaner.slot.succeeded_list = true;
- }
-finish:
- mutex_enter(&page_cleaner.mutex);
-finish_mutex:
- page_cleaner.n_slots_flushing--;
- page_cleaner.n_slots_finished++;
- page_cleaner.slot.state = PAGE_CLEANER_STATE_FINISHED;
-
- page_cleaner.slot.flush_lru_time += lru_tm;
- page_cleaner.slot.flush_list_time += list_tm;
- page_cleaner.slot.flush_lru_pass += lru_pass;
- page_cleaner.slot.flush_list_pass += list_pass;
-
- if (page_cleaner.n_slots_requested == 0
- && page_cleaner.n_slots_flushing == 0) {
- os_event_set(page_cleaner.is_finished);
- }
- }
-
- ulint ret = page_cleaner.n_slots_requested;
-
- mutex_exit(&page_cleaner.mutex);
-
- return(ret);
-}
-
-/**
-Wait until all flush requests are finished.
-@param n_flushed_lru number of pages flushed from the end of the LRU list.
-@param n_flushed_list number of pages flushed from the end of the
- flush_list.
-@return true if all flush_list flushing batch were success. */
-static
-bool
-pc_wait_finished(
- ulint* n_flushed_lru,
- ulint* n_flushed_list)
-{
- bool all_succeeded = true;
-
- *n_flushed_lru = 0;
- *n_flushed_list = 0;
-
- os_event_wait(page_cleaner.is_finished);
-
- mutex_enter(&page_cleaner.mutex);
-
- ut_ad(page_cleaner.n_slots_requested == 0);
- ut_ad(page_cleaner.n_slots_flushing == 0);
- ut_ad(page_cleaner.n_slots_finished == 1);
-
- ut_ad(page_cleaner.slot.state == PAGE_CLEANER_STATE_FINISHED);
- page_cleaner.slot.state = PAGE_CLEANER_STATE_NONE;
- *n_flushed_lru = page_cleaner.slot.n_flushed_lru;
- *n_flushed_list = page_cleaner.slot.n_flushed_list;
- all_succeeded = page_cleaner.slot.succeeded_list;
- page_cleaner.slot.n_pages_requested = 0;
-
- page_cleaner.n_slots_finished = 0;
-
- os_event_reset(page_cleaner.is_finished);
-
- mutex_exit(&page_cleaner.mutex);
-
- return(all_succeeded);
-}
-
-#ifdef UNIV_LINUX
-/**
-Set priority for page_cleaner threads.
-@param[in] priority priority intended to set
-@return true if set as intended */
-static
-bool
-buf_flush_page_cleaner_set_priority(
- int priority)
-{
- setpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid),
- priority);
- return(getpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid))
- == priority);
-}
-#endif /* UNIV_LINUX */
-
-#ifdef UNIV_DEBUG
-/** Loop used to disable the page cleaner thread. */
-static void buf_flush_page_cleaner_disabled_loop()
-{
- while (innodb_page_cleaner_disabled_debug
- && srv_shutdown_state == SRV_SHUTDOWN_NONE
- && page_cleaner.is_running) {
- os_thread_sleep(100000);
- }
-}
-#endif /* UNIV_DEBUG */
-
/******************************************************************//**
page_cleaner thread tasked with flushing dirty pages from the buffer
pools. As of now we'll have only one coordinator.
@return a dummy parameter */
static os_thread_ret_t DECLARE_THREAD(buf_flush_page_cleaner)(void*)
{
- my_thread_init();
+ my_thread_init();
#ifdef UNIV_PFS_THREAD
- pfs_register_thread(page_cleaner_thread_key);
+ pfs_register_thread(page_cleaner_thread_key);
#endif /* UNIV_PFS_THREAD */
- ut_ad(!srv_read_only_mode);
+ ut_ad(!srv_read_only_mode);
+ ut_ad(buf_page_cleaner_is_active);
#ifdef UNIV_DEBUG_THREAD_CREATION
- ib::info() << "page_cleaner thread running, id "
- << os_thread_pf(os_thread_get_curr_id());
+ ib::info() << "page_cleaner thread running, id "
+ << os_thread_pf(os_thread_get_curr_id());
#endif /* UNIV_DEBUG_THREAD_CREATION */
#ifdef UNIV_LINUX
- /* linux might be able to set different setting for each thread.
- worth to try to set high priority for page cleaner threads */
- if (buf_flush_page_cleaner_set_priority(
- buf_flush_page_cleaner_priority)) {
-
- ib::info() << "page_cleaner coordinator priority: "
- << buf_flush_page_cleaner_priority;
- } else {
- ib::info() << "If the mysqld execution user is authorized,"
- " page cleaner thread priority can be changed."
- " See the man page of setpriority().";
- }
- /* Signal that setpriority() has been attempted. */
- os_event_set(recv_sys.flush_end);
+  /* Linux can set a different scheduling priority for each thread,
+  so it is worth trying to set a high priority for the page cleaner thread */
+ const pid_t tid= static_cast<pid_t>(syscall(SYS_gettid));
+ setpriority(PRIO_PROCESS, tid, -20);
+ if (getpriority(PRIO_PROCESS, tid) != -20)
+ ib::info() << "If the mysqld execution user is authorized,"
+ " page cleaner thread priority can be changed."
+ " See the man page of setpriority().";
#endif /* UNIV_LINUX */
- do {
- /* treat flushing requests during recovery. */
- ulint n_flushed_lru = 0;
- ulint n_flushed_list = 0;
-
- os_event_wait(recv_sys.flush_start);
-
- if (!recv_writer_thread_active) {
- break;
- }
-
- if (recv_sys.flush_lru) {
- /* Flush pages from end of LRU if required */
- pc_request(0, LSN_MAX);
- while (pc_flush_slot() > 0) {}
- pc_wait_finished(&n_flushed_lru, &n_flushed_list);
- } else {
- /* Flush all pages */
- do {
- pc_request(ULINT_MAX, LSN_MAX);
- while (pc_flush_slot() > 0) {}
- } while (!pc_wait_finished(&n_flushed_lru,
- &n_flushed_list));
- }
-
- os_event_reset(recv_sys.flush_start);
- os_event_set(recv_sys.flush_end);
- } while (recv_writer_thread_active);
-
- os_event_wait(buf_flush_event);
-
- ulint ret_sleep = 0;
- ulint n_evicted = 0;
- ulint n_flushed_last = 0;
- ulint warn_interval = 1;
- ulint warn_count = 0;
- int64_t sig_count = os_event_reset(buf_flush_event);
- ulint next_loop_time = ut_time_ms() + 1000;
- ulint n_flushed = 0;
- ulint last_activity = srv_get_activity_count();
- ulint last_pages = 0;
-
- while (srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
- ulint curr_time = ut_time_ms();
-
- /* The page_cleaner skips sleep if the server is
- idle and there are no pending IOs in the buffer pool
- and there is work to do. */
- if (!n_flushed || !buf_pool.n_pend_reads
- || srv_check_activity(&last_activity)) {
-
- ret_sleep = pc_sleep_if_needed(
- next_loop_time, sig_count, curr_time);
- } else if (curr_time > next_loop_time) {
- ret_sleep = OS_SYNC_TIME_EXCEEDED;
- } else {
- ret_sleep = 0;
- }
-
- if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
- break;
- }
-
- sig_count = os_event_reset(buf_flush_event);
-
- if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
- if (global_system_variables.log_warnings > 2
- && curr_time > next_loop_time + 3000
- && !(test_flags & TEST_SIGINT)) {
- if (warn_count == 0) {
- ib::info() << "page_cleaner: 1000ms"
- " intended loop took "
- << 1000 + curr_time
- - next_loop_time
- << "ms. The settings might not"
- " be optimal. (flushed="
- << n_flushed_last
- << " and evicted="
- << n_evicted
- << ", during the time.)";
- if (warn_interval > 300) {
- warn_interval = 600;
- } else {
- warn_interval *= 2;
- }
-
- warn_count = warn_interval;
- } else {
- --warn_count;
- }
- } else {
- /* reset counter */
- warn_interval = 1;
- warn_count = 0;
- }
-
- next_loop_time = curr_time + 1000;
- n_flushed_last = n_evicted = 0;
- }
-
- if (ret_sleep != OS_SYNC_TIME_EXCEEDED
- && srv_flush_sync
- && buf_flush_sync_lsn > 0) {
- /* woke up for flush_sync */
- mutex_enter(&page_cleaner.mutex);
- lsn_t lsn_limit = buf_flush_sync_lsn;
- buf_flush_sync_lsn = 0;
- mutex_exit(&page_cleaner.mutex);
-
- /* Request flushing for threads */
- pc_request(ULINT_MAX, lsn_limit);
-
- ulint tm = ut_time_ms();
-
- /* Coordinator also treats requests */
- while (pc_flush_slot() > 0) {}
-
- /* only coordinator is using these counters,
- so no need to protect by lock. */
- page_cleaner.flush_time += ut_time_ms() - tm;
- page_cleaner.flush_pass++;
-
- /* Wait for all slots to be finished */
- ulint n_flushed_lru = 0;
- ulint n_flushed_list = 0;
- pc_wait_finished(&n_flushed_lru, &n_flushed_list);
-
- if (n_flushed_list > 0 || n_flushed_lru > 0) {
- buf_flush_stats(n_flushed_list, n_flushed_lru);
-
- MONITOR_INC_VALUE_CUMULATIVE(
- MONITOR_FLUSH_SYNC_TOTAL_PAGE,
- MONITOR_FLUSH_SYNC_COUNT,
- MONITOR_FLUSH_SYNC_PAGES,
- n_flushed_lru + n_flushed_list);
- }
-
- n_flushed = n_flushed_lru + n_flushed_list;
-
- } else if (srv_check_activity(&last_activity)) {
- ulint n_to_flush;
- lsn_t lsn_limit;
-
- /* Estimate pages from flush_list to be flushed */
- if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
- last_activity = srv_get_activity_count();
- n_to_flush =
- page_cleaner_flush_pages_recommendation(
- last_pages);
- lsn_limit = LSN_MAX;
- } else {
- n_to_flush = 0;
- lsn_limit = 0;
- }
-
- /* Request flushing for threads */
- pc_request(n_to_flush, lsn_limit);
-
- ulint tm = ut_time_ms();
-
- /* Coordinator also treats requests */
- while (pc_flush_slot() > 0) {
- /* No op */
- }
-
- /* only coordinator is using these counters,
- so no need to protect by lock. */
- page_cleaner.flush_time += ut_time_ms() - tm;
- page_cleaner.flush_pass++ ;
-
- /* Wait for all slots to be finished */
- ulint n_flushed_lru = 0;
- ulint n_flushed_list = 0;
-
- pc_wait_finished(&n_flushed_lru, &n_flushed_list);
-
- if (n_flushed_list > 0 || n_flushed_lru > 0) {
- buf_flush_stats(n_flushed_list, n_flushed_lru);
- }
-
- if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
- last_pages = n_flushed_list;
- }
+ ulint last_pages= 0;
+ timespec abstime;
+ set_timespec(abstime, 1);
- n_evicted += n_flushed_lru;
- n_flushed_last += n_flushed_list;
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
- n_flushed = n_flushed_lru + n_flushed_list;
+ lsn_t lsn_limit;
- if (n_flushed_lru) {
- MONITOR_INC_VALUE_CUMULATIVE(
- MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
- MONITOR_LRU_BATCH_FLUSH_COUNT,
- MONITOR_LRU_BATCH_FLUSH_PAGES,
- n_flushed_lru);
- }
-
- if (n_flushed_list) {
- MONITOR_INC_VALUE_CUMULATIVE(
- MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
- MONITOR_FLUSH_ADAPTIVE_COUNT,
- MONITOR_FLUSH_ADAPTIVE_PAGES,
- n_flushed_list);
- }
-
- } else if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
- /* no activity, slept enough */
- buf_flush_lists(srv_io_capacity, LSN_MAX, &n_flushed);
-
- n_flushed_last += n_flushed;
-
- if (n_flushed) {
- MONITOR_INC_VALUE_CUMULATIVE(
- MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
- MONITOR_FLUSH_BACKGROUND_COUNT,
- MONITOR_FLUSH_BACKGROUND_PAGES,
- n_flushed);
-
- }
-
- } else {
- /* no activity, but woken up by event */
- n_flushed = 0;
- }
+ for (;;)
+ {
+ lsn_limit= buf_flush_sync_lsn;
- ut_d(buf_flush_page_cleaner_disabled_loop());
- }
+ if (UNIV_UNLIKELY(lsn_limit != 0))
+ {
+furious_flush:
+ buf_flush_sync_for_checkpoint(lsn_limit);
+ last_pages= 0;
+ set_timespec(abstime, 1);
+ continue;
+ }
- ut_ad(srv_shutdown_state > SRV_SHUTDOWN_INITIATED);
- if (srv_fast_shutdown == 2
- || srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
- /* In very fast shutdown or when innodb failed to start, we
- simulate a crash of the buffer pool. We are not required to do
- any flushing. */
- goto thread_exit;
- }
+ if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
+ break;
- /* In case of normal and slow shutdown the page_cleaner thread
- must wait for all other activity in the server to die down.
- Note that we can start flushing the buffer pool as soon as the
- server enters shutdown phase but we must stay alive long enough
- to ensure that any work done by the master or purge threads is
- also flushed.
- During shutdown we pass through two stages. In the first stage,
- when SRV_SHUTDOWN_CLEANUP is set other threads like the master
- and the purge threads may be working as well. We start flushing
- the buffer pool but can't be sure that no new pages are being
- dirtied until we enter SRV_SHUTDOWN_FLUSH_PHASE phase. */
+ mysql_cond_timedwait(&buf_pool.do_flush_list, &buf_pool.flush_list_mutex,
+ &abstime);
+ set_timespec(abstime, 1);
- do {
- pc_request(ULINT_MAX, LSN_MAX);
+ lsn_limit= buf_flush_sync_lsn;
- while (pc_flush_slot() > 0) {}
+ if (UNIV_UNLIKELY(lsn_limit != 0))
+ goto furious_flush;
- ulint n_flushed_lru = 0;
- ulint n_flushed_list = 0;
- pc_wait_finished(&n_flushed_lru, &n_flushed_list);
+ if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
+ break;
- n_flushed = n_flushed_lru + n_flushed_list;
+ const ulint dirty_blocks= UT_LIST_GET_LEN(buf_pool.flush_list);
- /* We sleep only if there are no pages to flush */
- if (n_flushed == 0) {
- os_thread_sleep(100000);
- }
- } while (srv_shutdown_state == SRV_SHUTDOWN_CLEANUP);
+ if (!dirty_blocks)
+ continue;
- /* At this point all threads including the master and the purge
- thread must have been suspended. */
- ut_ad(!srv_any_background_activity());
- ut_ad(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
+ /* We perform dirty reads of the LRU+free list lengths here.
+ Division by zero is not possible, because buf_pool.flush_list is
+ guaranteed to be nonempty, and it is a subset of buf_pool.LRU. */
+ const double dirty_pct= double(dirty_blocks) * 100.0 /
+ double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free));
- /* We can now make a final sweep on flushing the buffer pool
- and exit after we have cleaned the whole buffer pool.
- It is important that we wait for any running batch that has
- been triggered by us to finish. Otherwise we can end up
- considering end of that batch as a finish of our final
- sweep and we'll come out of the loop leaving behind dirty pages
- in the flush_list */
- buf_flush_wait_batch_end(false);
- buf_flush_wait_LRU_batch_end();
+ if (dirty_pct < srv_max_dirty_pages_pct_lwm)
+ continue;
- bool success;
+ const lsn_t oldest_lsn= buf_pool.get_oldest_modification(0);
- do {
- pc_request(ULINT_MAX, LSN_MAX);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
- while (pc_flush_slot() > 0) {}
+ ulint n_flushed;
- ulint n_flushed_lru = 0;
- ulint n_flushed_list = 0;
- success = pc_wait_finished(&n_flushed_lru, &n_flushed_list);
+ if (!srv_adaptive_flushing)
+ {
+ n_flushed= buf_flush_lists(srv_io_capacity, LSN_MAX);
- n_flushed = n_flushed_lru + n_flushed_list;
+ if (n_flushed)
+ {
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+ MONITOR_FLUSH_BACKGROUND_COUNT,
+ MONITOR_FLUSH_BACKGROUND_PAGES,
+ n_flushed);
+do_checkpoint:
+ /* The periodic log_checkpoint() call here makes it harder to
+ reproduce bugs in crash recovery or mariabackup --prepare, or
+ in code that writes the redo log records. Omitting the call
+ here should not affect correctness, because log_free_check()
+ should still be invoking checkpoints when needed. */
+ DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", goto next;);
+
+ if (!recv_recovery_is_on() && srv_operation == SRV_OPERATION_NORMAL)
+ log_checkpoint();
+ }
+ }
+ else if (ulint n= page_cleaner_flush_pages_recommendation(last_pages,
+ oldest_lsn,
+ dirty_pct))
+ {
+ page_cleaner.flush_pass++;
+ const ulint tm= ut_time_ms();
+ last_pages= n_flushed= buf_flush_lists(n, LSN_MAX);
+ page_cleaner.flush_time+= ut_time_ms() - tm;
- buf_flush_wait_batch_end(false);
- buf_flush_wait_LRU_batch_end();
+ if (n_flushed)
+ {
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+ MONITOR_FLUSH_ADAPTIVE_COUNT,
+ MONITOR_FLUSH_ADAPTIVE_PAGES,
+ n_flushed);
+ goto do_checkpoint;
+ }
+ }
- } while (!success || n_flushed > 0);
+#ifdef UNIV_DEBUG
+ while (innodb_page_cleaner_disabled_debug && !buf_flush_sync_lsn &&
+ srv_shutdown_state == SRV_SHUTDOWN_NONE)
+ os_thread_sleep(100000);
+#endif /* UNIV_DEBUG */
- /* Some sanity checks */
- ut_ad(!srv_any_background_activity());
- ut_ad(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
- ut_a(UT_LIST_GET_LEN(buf_pool.flush_list) == 0);
+#ifndef DBUG_OFF
+next:
+#endif /* !DBUG_OFF */
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ }
- /* We have lived our life. Time to die. */
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
-thread_exit:
- page_cleaner.is_running = false;
- mutex_destroy(&page_cleaner.mutex);
+ if (srv_fast_shutdown != 2)
+ {
+ buf_flush_wait_batch_end_acquiring_mutex(true);
+ buf_flush_wait_batch_end_acquiring_mutex(false);
+ }
- os_event_destroy(page_cleaner.is_finished);
+ log_flush_task.wait();
- buf_page_cleaner_is_active = false;
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ lsn_limit= buf_flush_sync_lsn;
+ if (UNIV_UNLIKELY(lsn_limit != 0))
+ goto furious_flush;
+ buf_page_cleaner_is_active= false;
+ mysql_cond_broadcast(&buf_pool.done_flush_list);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
- my_thread_end();
- /* We count the number of threads in os_thread_exit(). A created
- thread should always use that to exit and not use return() to exit. */
- os_thread_exit();
+ my_thread_end();
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+ os_thread_exit();
- OS_THREAD_DUMMY_RETURN;
+ OS_THREAD_DUMMY_RETURN;
}
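
The idle path of the loop above boils down to one predicate: wake up and flush only once the dirty fraction of the pool reaches innodb_max_dirty_pages_pct_lwm. A standalone restatement (the list lengths are dirty reads, exactly as the comment inside the thread notes):

// Sketch of the page cleaner's wake-up decision. flush_list is a subset
// of the LRU list, so the divisor is nonzero whenever dirty pages exist.
static bool should_flush(unsigned long flush_list_len, unsigned long lru_len,
                         unsigned long free_len, double lwm_pct)
{
  if (flush_list_len == 0)
    return false;                 // nothing dirty: keep sleeping
  const double dirty_pct = 100.0 * static_cast<double>(flush_list_len)
                           / static_cast<double>(lru_len + free_len);
  return dirty_pct >= lwm_pct;    // srv_max_dirty_pages_pct_lwm
}
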
-static void pc_flush_slot_func(void*)
+/** Initialize page_cleaner. */
+ATTRIBUTE_COLD void buf_flush_page_cleaner_init()
{
- while (pc_flush_slot() > 0) {};
+ ut_ad(!buf_page_cleaner_is_active);
+ ut_ad(srv_operation == SRV_OPERATION_NORMAL ||
+ srv_operation == SRV_OPERATION_RESTORE ||
+ srv_operation == SRV_OPERATION_RESTORE_EXPORT);
+ buf_flush_sync_lsn= 0;
+ buf_page_cleaner_is_active= true;
+ os_thread_create(buf_flush_page_cleaner);
}
+/** @return the number of dirty pages in the buffer pool */
+static ulint buf_flush_list_length()
+{
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ const ulint len= UT_LIST_GET_LEN(buf_pool.flush_list);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ return len;
+}
-/** Initialize page_cleaner. */
-void buf_flush_page_cleaner_init()
+/** Flush the buffer pool on shutdown. */
+ATTRIBUTE_COLD void buf_flush_buffer_pool()
{
- ut_ad(!page_cleaner.is_running);
+ ut_ad(!buf_page_cleaner_is_active);
+ ut_ad(!buf_flush_sync_lsn);
- mutex_create(LATCH_ID_PAGE_CLEANER, &page_cleaner.mutex);
+ service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "Waiting to flush the buffer pool");
- page_cleaner.is_finished = os_event_create("pc_is_finished");
+ while (buf_pool.n_flush_list || buf_flush_list_length())
+ {
+ buf_flush_lists(srv_max_io_capacity, LSN_MAX);
+ timespec abstime;
- page_cleaner.is_running = true;
+ if (buf_pool.n_flush_list)
+ {
+ service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "Waiting to flush " ULINTPF " pages",
+ buf_flush_list_length());
+ set_timespec(abstime, INNODB_EXTEND_TIMEOUT_INTERVAL / 2);
+ mysql_mutex_lock(&buf_pool.mutex);
+ while (buf_pool.n_flush_list)
+ mysql_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex,
+ &abstime);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ }
+ }
- buf_page_cleaner_is_active = true;
- os_thread_create(buf_flush_page_cleaner, NULL, NULL);
+ ut_ad(!buf_pool.any_io_pending());
+ log_flush_task.wait();
}
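
Shutdown flushing above interleaves full-capacity batches with bounded waits on done_flush_list so that the service manager timeout can be extended while in-flight writes complete. A rough equivalent using standard primitives (the model type and its fields are hypothetical):

#include <chrono>
#include <condition_variable>
#include <mutex>

struct shutdown_model {
  std::mutex mtx;                        // stands in for buf_pool.mutex
  std::condition_variable done_flush;    // stands in for done_flush_list
  unsigned long in_flight = 0;           // like buf_pool.n_flush_list
  unsigned long dirty = 0;               // like the flush_list length
  void flush_batch() { dirty = 0; }      // like buf_flush_lists(max_io, LSN_MAX)
};

void drain_on_shutdown(shutdown_model& m)
{
  std::unique_lock<std::mutex> lk(m.mtx);
  while (m.in_flight || m.dirty) {
    lk.unlock();
    m.flush_batch();                     // issue more writes
    lk.lock();
    // Bounded wait for write completions; the real code extends the
    // systemd watchdog timeout around this wait.
    m.done_flush.wait_for(lk, std::chrono::seconds(15));
  }
}
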
/** Synchronously flush dirty blocks.
NOTE: The calling thread is not allowed to hold any buffer page latches! */
void buf_flush_sync()
{
- bool success;
- do {
- success = buf_flush_lists(ULINT_MAX, LSN_MAX, NULL);
- buf_flush_wait_batch_end(false);
- } while (!success);
-}
-
-/** Request IO burst and wake page_cleaner up.
-@param[in] lsn_limit upper limit of LSN to be flushed */
-void buf_flush_request_force(lsn_t lsn_limit)
-{
- /* adjust based on lsn_avg_rate not to get old */
- lsn_t lsn_target = lsn_limit + lsn_avg_rate * 3;
-
- mutex_enter(&page_cleaner.mutex);
- if (lsn_target > buf_flush_sync_lsn) {
- buf_flush_sync_lsn = lsn_target;
- }
- mutex_exit(&page_cleaner.mutex);
+ ut_ad(!sync_check_iterate(dict_sync_check()));
- os_event_set(buf_flush_event);
+ for (;;)
+ {
+ const ulint n_flushed= buf_flush_lists(srv_max_io_capacity, LSN_MAX);
+ buf_flush_wait_batch_end_acquiring_mutex(false);
+ if (!n_flushed)
+ {
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ const auto len= UT_LIST_GET_LEN(buf_pool.flush_list);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ if (!len)
+ return;
+ }
+ }
}
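
The exit condition above is subtle: a batch that flushed nothing is not enough on its own, because another thread's batch may still be in progress; the flush list must also be observed empty. The shape of the loop, with the helpers passed in (hypothetical stand-ins for buf_flush_lists() and friends):

// Repeats full-capacity batches until one writes nothing and the
// flush list is observed empty.
static void flush_sync_model(unsigned long (*flush_batch)(),
                             void (*wait_batch_end)(),
                             unsigned long (*flush_list_len)())
{
  for (;;) {
    const unsigned long n = flush_batch();  // buf_flush_lists(srv_max_io_capacity, LSN_MAX)
    wait_batch_end();                       // buf_flush_wait_batch_end_acquiring_mutex(false)
    if (n == 0 && flush_list_len() == 0)
      return;                               // the buffer pool is clean
  }
}
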
#ifdef UNIV_DEBUG
@@ -2915,21 +2261,13 @@ struct Check {
static void buf_flush_validate_low()
{
buf_page_t* bpage;
- const ib_rbt_node_t* rnode = NULL;
- ut_ad(mutex_own(&buf_pool.flush_list_mutex));
+ mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
ut_list_validate(buf_pool.flush_list, Check());
bpage = UT_LIST_GET_FIRST(buf_pool.flush_list);
- /* If we are in recovery mode i.e.: flush_rbt != NULL
- then each block in the flush_list must also be present
- in the flush_rbt. */
- if (UNIV_LIKELY_NULL(buf_pool.flush_rbt)) {
- rnode = rbt_first(buf_pool.flush_rbt);
- }
-
while (bpage != NULL) {
const lsn_t om = bpage->oldest_modification();
/* A page in buf_pool.flush_list can be in
@@ -2938,36 +2276,22 @@ static void buf_flush_validate_low()
original descriptor can have this state and still be
in the flush list waiting to acquire the
buf_pool.flush_list_mutex to complete the relocation. */
- ut_a(bpage->in_file()
- || bpage->state() == BUF_BLOCK_REMOVE_HASH);
- ut_a(om > 0);
-
- if (UNIV_LIKELY_NULL(buf_pool.flush_rbt)) {
- buf_page_t** prpage;
-
- ut_a(rnode != NULL);
- prpage = rbt_value(buf_page_t*, rnode);
-
- ut_a(*prpage != NULL);
- ut_a(*prpage == bpage);
- rnode = rbt_next(buf_pool.flush_rbt, rnode);
- }
+ ut_d(const auto s= bpage->state());
+ ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE
+ || s == BUF_BLOCK_REMOVE_HASH);
+ ut_ad(om > 0);
bpage = UT_LIST_GET_NEXT(list, bpage);
-
- ut_a(!bpage || om >= bpage->oldest_modification());
+ ut_ad(!bpage || recv_recovery_is_on()
+ || om >= bpage->oldest_modification());
}
-
- /* By this time we must have exhausted the traversal of
- flush_rbt (if active) as well. */
- ut_a(rnode == NULL);
}
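
Outside of recovery, the assertion above amounts to: buf_pool.flush_list is ordered by non-increasing oldest_modification. The same invariant expressed over a plain container, for illustration:

#include <algorithm>
#include <cstdint>
#include <vector>

// True if the oldest_modification values are non-increasing, which is
// the flush-list ordering asserted above (recovery may relax it).
static bool flush_list_ordered(const std::vector<std::uint64_t>& om)
{
  return std::is_sorted(om.begin(), om.end(),
                        [](std::uint64_t a, std::uint64_t b) { return a > b; });
}
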
/** Validate the flush list. */
void buf_flush_validate()
{
- mutex_enter(&buf_pool.flush_list_mutex);
- buf_flush_validate_low();
- mutex_exit(&buf_pool.flush_list_mutex);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_flush_validate_low();
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
}
#endif /* UNIV_DEBUG */
diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc
index 26c5850d290..f9ed938b20c 100644
--- a/storage/innobase/buf/buf0lru.cc
+++ b/storage/innobase/buf/buf0lru.cc
@@ -25,26 +25,23 @@ Created 11/5/1995 Heikki Tuuri
*******************************************************/
#include "buf0lru.h"
-#include "ut0byte.h"
-#include "ut0rnd.h"
#include "sync0rw.h"
-#include "hash0hash.h"
-#include "os0event.h"
#include "fil0fil.h"
#include "btr0btr.h"
#include "buf0buddy.h"
#include "buf0buf.h"
-#include "buf0dblwr.h"
#include "buf0flu.h"
#include "buf0rea.h"
#include "btr0sea.h"
-#include "ibuf0ibuf.h"
#include "os0file.h"
#include "page0zip.h"
#include "log0recv.h"
#include "srv0srv.h"
#include "srv0mon.h"
+/** Flush this many pages in buf_LRU_get_free_block() */
+size_t innodb_lru_flush_size;
+
/** The number of blocks from the LRU_old pointer onward, including
the block pointed to, must be buf_pool.LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
of the whole LRU list length, except that the tolerance defined below
@@ -52,28 +49,13 @@ is allowed. Note that the tolerance must be small enough such that for
even the BUF_LRU_OLD_MIN_LEN long LRU list, the LRU_old pointer is not
allowed to point to either end of the LRU list. */
-static const ulint BUF_LRU_OLD_TOLERANCE = 20;
+static constexpr ulint BUF_LRU_OLD_TOLERANCE = 20;
/** The minimum amount of non-old blocks when the LRU_old list exists
(that is, when there are more than BUF_LRU_OLD_MIN_LEN blocks).
@see buf_LRU_old_adjust_len */
#define BUF_LRU_NON_OLD_MIN_LEN 5
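
For a sense of scale (assuming the conventional BUF_LRU_OLD_RATIO_DIV of 1024 and the default innodb_old_blocks_pct=37; neither constant appears in this hunk): a 10,000-block LRU list keeps roughly 3,691 blocks in the old sublist, and the pointer is only adjusted once the actual length drifts beyond the 20-block tolerance above:

#include <cstdio>

int main()
{
  // Hypothetical sizing check; both constants are assumptions here.
  const unsigned long ratio = 37 * 1024 / 100;             // 378/1024 parts
  const unsigned long lru_len = 10000;
  const unsigned long old_target = lru_len * ratio / 1024; // ~3691 blocks
  std::printf("old sublist target: %lu (+/- 20 blocks)\n", old_target);
}
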
-#ifdef BTR_CUR_HASH_ADAPT
-/** When dropping the search hash index entries before deleting an ibd
-file, we build a local array of pages belonging to that tablespace
-in the buffer pool. Following is the size of that array.
-We also release buf_pool.mutex after scanning this many pages of the
-flush_list when dropping a table. This is to ensure that other threads
-are not blocked for extended period of time when using very large
-buffer pools. */
-static const ulint BUF_LRU_DROP_SEARCH_SIZE = 1024;
-#endif /* BTR_CUR_HASH_ADAPT */
-
-/** We scan these many blocks when looking for a clean page to evict
-during LRU eviction. */
-static const ulint BUF_LRU_SEARCH_SCAN_THRESHOLD = 100;
-
/** If we switch on the InnoDB monitor because there are too few available
frames in the buffer pool, we set this to TRUE */
static bool buf_lru_switched_on_innodb_mon = false;
@@ -155,7 +137,7 @@ static void buf_LRU_block_free_hashed_page(buf_block_t *block)
static inline void incr_LRU_size_in_bytes(const buf_page_t* bpage)
{
/* FIXME: use atomics, not mutex */
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
buf_pool.stat.LRU_bytes += bpage->physical_size();
@@ -166,7 +148,7 @@ static inline void incr_LRU_size_in_bytes(const buf_page_t* bpage)
instead of the general LRU list */
bool buf_LRU_evict_from_unzip_LRU()
{
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
/* If the unzip_LRU list is empty, we can only use the LRU. */
if (UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0) {
@@ -202,314 +184,19 @@ bool buf_LRU_evict_from_unzip_LRU()
return(unzip_avg <= io_avg * BUF_LRU_IO_TO_UNZIP_FACTOR);
}
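
The predicate above keeps uncompressed frames only while decompression traffic is expensive relative to disk I/O. A standalone restatement, assuming the factor of 50 that BUF_LRU_IO_TO_UNZIP_FACTOR conventionally carries (its definition is outside this hunk):

// Illustrative form of the decision above; io_avg and unzip_avg are the
// weighted per-interval counters kept in buf_pool.stat.
static bool evict_uncompressed_first(double io_avg, double unzip_avg)
{
  const double io_to_unzip_factor = 50;  // assumed BUF_LRU_IO_TO_UNZIP_FACTOR
  // Cheap decompression relative to I/O: drop the uncompressed copy and
  // keep only the ROW_FORMAT=COMPRESSED page in the buffer pool.
  return unzip_avg <= io_avg * io_to_unzip_factor;
}
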
-#ifdef BTR_CUR_HASH_ADAPT
-/**
-While flushing (or removing dirty) pages from a tablespace we don't
-want to hog the CPU and resources. Release the buffer pool and block
-mutex and try to force a context switch. Then reacquire the same mutexes.
-The current page is "fixed" before the release of the mutexes and then
-"unfixed" again once we have reacquired the mutexes.
-@param[in,out] bpage current page */
-static void buf_flush_yield(buf_page_t *bpage)
-{
- mutex_exit(&buf_pool.flush_list_mutex);
- ut_ad(bpage->oldest_modification());
- ut_ad(bpage->in_file());
- ut_ad(bpage->io_fix() == BUF_IO_NONE);
- /** Make the block sticky, so that even after we release buf_pool.mutex:
- (1) it cannot be removed from the buf_pool.flush_list
- (2) bpage cannot be relocated in buf_pool
- (3) bpage->in_LRU_list cannot change
- However, bpage->LRU can change. */
- bpage->set_io_fix(BUF_IO_PIN);
- mutex_exit(&buf_pool.mutex);
-
- /* Try and force a context switch. */
- os_thread_yield();
-
- mutex_enter(&buf_pool.mutex);
- bpage->io_unfix();
- mutex_enter(&buf_pool.flush_list_mutex);
- /* Should not have been removed from the flush
- list during the yield. However, this check is
- not sufficient to catch a remove -> add. */
- ut_ad(bpage->oldest_modification());
-}
-
-/******************************************************************//**
-If we have hogged the resources for too long then release the buffer
-pool and flush list mutex and do a thread yield. Set the current page
-to "sticky" so that it is not relocated during the yield.
-@return true if yielded */
-static MY_ATTRIBUTE((warn_unused_result))
-bool
-buf_flush_try_yield(
-/*================*/
- buf_page_t* bpage, /*!< in/out: bpage to remove */
- ulint processed) /*!< in: number of pages processed */
-{
- /* Every BUF_LRU_DROP_SEARCH_SIZE iterations in the
- loop we release buf_pool.mutex to let other threads
- do their job but only if the block is not IO fixed. This
- ensures that the block stays in its position in the
- flush_list. */
-
- if (bpage != NULL
- && processed >= BUF_LRU_DROP_SEARCH_SIZE
- && bpage->io_fix() == BUF_IO_NONE) {
-
- /* Release the buf_pool.mutex
- to give the other threads a go. */
-
- buf_flush_yield(bpage);
- return(true);
- }
-
- return(false);
-}
-#endif /* BTR_CUR_HASH_ADAPT */
-
-/** Remove a single page from flush_list.
-@param[in,out] bpage buffer page to remove
-@param[in] flush whether to flush the page before removing
-@return true if page was removed. */
-static bool buf_flush_or_remove_page(buf_page_t *bpage, bool flush)
-{
- ut_ad(mutex_own(&buf_pool.mutex));
- ut_ad(mutex_own(&buf_pool.flush_list_mutex));
-
- /* bpage->id and bpage->io_fix are protected by
- buf_pool.mutex (and bpage->id additionally by hash_lock).
- It is safe to check them while holding buf_pool.mutex only. */
-
- if (bpage->io_fix() != BUF_IO_NONE) {
-
- /* We cannot remove this page during this scan
- yet; maybe the system is currently reading it
- in, or flushing the modifications to the file */
- return(false);
-
- }
-
- bool processed = false;
-
- /* We have to release the flush_list_mutex to obey the
- latching order. We are however guaranteed that the page
- will stay in the flush_list and won't be relocated because
- buf_flush_remove() and buf_flush_relocate_on_flush_list()
- need buf_pool.mutex as well. */
-
- mutex_exit(&buf_pool.flush_list_mutex);
-
- ut_ad(bpage->oldest_modification());
-
- if (!flush) {
- buf_flush_remove(bpage);
- processed = true;
- } else if (bpage->ready_for_flush()) {
- processed = buf_flush_page(bpage, IORequest::SINGLE_PAGE,
- nullptr, false);
-
- if (processed) {
- mutex_enter(&buf_pool.mutex);
- }
- }
-
- mutex_enter(&buf_pool.flush_list_mutex);
-
- ut_ad(mutex_own(&buf_pool.mutex));
-
- return(processed);
-}
-
-/** Remove all dirty pages belonging to a given tablespace when we are
-deleting the data file of that tablespace.
-The pages still remain a part of LRU and are evicted from
-the list as they age towards the tail of the LRU.
-@param[in] id tablespace identifier
-@param[in] flush whether to flush the pages before removing
-@param[in] first first page to be flushed or evicted
-@return whether all matching dirty pages were removed */
-static bool buf_flush_or_remove_pages(ulint id, bool flush, ulint first)
-{
- buf_page_t* prev;
- buf_page_t* bpage;
- ulint processed = 0;
-
- mutex_enter(&buf_pool.flush_list_mutex);
-rescan:
- bool all_freed = true;
-
- for (bpage = UT_LIST_GET_LAST(buf_pool.flush_list);
- bpage != NULL;
- bpage = prev) {
-
- ut_a(bpage->in_file());
-
- /* Save the previous link because once we free the
- page we can't rely on the links. */
-
- prev = UT_LIST_GET_PREV(list, bpage);
-
- const page_id_t bpage_id(bpage->id());
-
- if (id != bpage_id.space()) {
- /* Skip this block, because it is for a
- different tablespace. */
- } else if (bpage_id.page_no() < first) {
- /* Skip this block, because it is below the limit. */
- } else if (!buf_flush_or_remove_page(bpage, flush)) {
-
- /* Remove was unsuccessful, we have to try again
- by scanning the entire list from the end.
- This also means that we never released the
- buf_pool mutex. Therefore we can trust the prev
- pointer.
- buf_flush_or_remove_page() released the
- flush list mutex but not the buf_pool mutex.
- Therefore it is possible that a new page was
- added to the flush list. For example, in case
- where we are at the head of the flush list and
- prev == NULL. That is OK because we have the
- tablespace quiesced and no new pages for this
- space-id should enter flush_list. This is
- because the only callers of this function are
- DROP TABLE and FLUSH TABLE FOR EXPORT.
- We know that we'll have to do at least one more
- scan but we don't break out of loop here and
- try to do as much work as we can in this
- iteration. */
-
- all_freed = false;
- } else if (flush) {
-
- /* The processing was successful. And during the
- processing we have released the buf_pool mutex
- when calling buf_page_flush(). We cannot trust
- prev pointer. */
- goto rescan;
- }
-
-#ifdef BTR_CUR_HASH_ADAPT
- ++processed;
-
- /* Yield if we have hogged the CPU and mutexes for too long. */
- if (buf_flush_try_yield(prev, processed)) {
- /* Reset the batch size counter if we had to yield. */
- processed = 0;
- }
-#endif /* BTR_CUR_HASH_ADAPT */
- }
-
- mutex_exit(&buf_pool.flush_list_mutex);
-
- return(all_freed);
-}
-
-/** Remove or flush all the dirty pages that belong to a given tablespace.
-The pages will remain in the LRU list and will be evicted from the LRU list
-as they age and move towards the tail of the LRU list.
-@param[in] id tablespace identifier
-@param[in] flush whether to flush the pages before removing
-@param[in] first first page to be flushed or evicted */
-static void buf_flush_dirty_pages(ulint id, bool flush, ulint first)
-{
- mutex_enter(&buf_pool.mutex);
- while (!buf_flush_or_remove_pages(id, flush, first))
- {
- mutex_exit(&buf_pool.mutex);
- ut_d(buf_flush_validate());
- os_thread_sleep(2000);
- mutex_enter(&buf_pool.mutex);
- }
-
-#ifdef UNIV_DEBUG
- if (!first)
- {
- mutex_enter(&buf_pool.flush_list_mutex);
-
- for (buf_page_t *bpage= UT_LIST_GET_FIRST(buf_pool.flush_list); bpage;
- bpage= UT_LIST_GET_NEXT(list, bpage))
- {
- ut_ad(bpage->in_file());
- ut_ad(bpage->oldest_modification());
- ut_ad(id != bpage->id().space());
- }
-
- mutex_exit(&buf_pool.flush_list_mutex);
- }
-#endif
-
- mutex_exit(&buf_pool.mutex);
-}
-
-/** Empty the flush list for all pages belonging to a tablespace.
-@param[in] id tablespace identifier
-@param[in] flush whether to write the pages to files
-@param[in] first first page to be flushed or evicted */
-void buf_LRU_flush_or_remove_pages(ulint id, bool flush, ulint first)
-{
- /* Pages in the system tablespace must never be discarded. */
- ut_ad(id || flush);
-
- buf_flush_dirty_pages(id, flush, first);
-
- if (flush) {
- /* Ensure that all asynchronous IO is completed. */
- os_aio_wait_until_no_pending_writes();
- fil_flush(id);
- }
-}
-
-#ifdef UNIV_DEBUG
-/********************************************************************//**
-Insert a compressed block into buf_pool.zip_clean in the LRU order. */
-void
-buf_LRU_insert_zip_clean(
-/*=====================*/
- buf_page_t* bpage) /*!< in: pointer to the block in question */
-{
- ut_ad(mutex_own(&buf_pool.mutex));
- ut_ad(bpage->state() == BUF_BLOCK_ZIP_PAGE);
- ut_ad(!bpage->oldest_modification());
-
- /* Find the first successor of bpage in the LRU list
- that is in the zip_clean list. */
- buf_page_t* b = bpage;
-
- do {
- b = UT_LIST_GET_NEXT(LRU, b);
- } while (b && (b->state() != BUF_BLOCK_ZIP_PAGE
- || b->oldest_modification()));
-
- /* Insert bpage before b, i.e., after the predecessor of b. */
- if (b != NULL) {
- b = UT_LIST_GET_PREV(list, b);
- }
-
- if (b != NULL) {
- UT_LIST_INSERT_AFTER(buf_pool.zip_clean, b, bpage);
- } else {
- UT_LIST_ADD_FIRST(buf_pool.zip_clean, bpage);
- }
-}
-#endif /* UNIV_DEBUG */
-
/** Try to free an uncompressed page of a compressed block from the unzip
LRU list. The compressed page is preserved, and it need not be clean.
-@param[in] scan_all true=scan the whole list;
- false=scan srv_LRU_scan_depth / 2 blocks
+@param limit maximum number of blocks to scan
@return true if freed */
-static bool buf_LRU_free_from_unzip_LRU_list(bool scan_all)
+static bool buf_LRU_free_from_unzip_LRU_list(ulint limit)
{
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
if (!buf_LRU_evict_from_unzip_LRU()) {
return(false);
}
ulint scanned = 0;
- const ulint limit = scan_all ? ULINT_UNDEFINED : srv_LRU_scan_depth;
bool freed = false;
for (buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
@@ -540,31 +227,24 @@ static bool buf_LRU_free_from_unzip_LRU_list(bool scan_all)
}
/** Try to free a clean page from the common LRU list.
-@param[in] scan_all true=scan the whole LRU list
- false=use BUF_LRU_SEARCH_SCAN_THRESHOLD
+@param limit maximum number of blocks to scan
@return whether a page was freed */
-static bool buf_LRU_free_from_common_LRU_list(bool scan_all)
+static bool buf_LRU_free_from_common_LRU_list(ulint limit)
{
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
ulint scanned = 0;
bool freed = false;
for (buf_page_t* bpage = buf_pool.lru_scan_itr.start();
- bpage && (scan_all || scanned < BUF_LRU_SEARCH_SCAN_THRESHOLD);
+ bpage && scanned < limit;
++scanned, bpage = buf_pool.lru_scan_itr.get()) {
buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
buf_pool.lru_scan_itr.set(prev);
const auto accessed = bpage->is_accessed();
- freed = bpage->ready_for_replace();
-
- if (freed) {
- freed = buf_LRU_free_page(bpage, true);
- if (!freed) {
- continue;
- }
-
+ if (!bpage->oldest_modification()
+ && buf_LRU_free_page(bpage, true)) {
if (!accessed) {
/* Keep track of pages that are evicted without
ever being accessed. This gives us a measure of
@@ -572,6 +252,7 @@ static bool buf_LRU_free_from_common_LRU_list(bool scan_all)
++buf_pool.stat.n_ra_pages_evicted;
}
+ freed = true;
break;
}
}
@@ -588,15 +269,14 @@ static bool buf_LRU_free_from_common_LRU_list(bool scan_all)
}
/** Try to free a replaceable block.
-@param[in] scan_all true=scan the whole LRU list,
- false=use BUF_LRU_SEARCH_SCAN_THRESHOLD
+@param limit maximum number of blocks to scan
@return true if found and freed */
-bool buf_LRU_scan_and_free_block(bool scan_all)
+bool buf_LRU_scan_and_free_block(ulint limit)
{
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
- return(buf_LRU_free_from_unzip_LRU_list(scan_all)
- || buf_LRU_free_from_common_LRU_list(scan_all));
+ return buf_LRU_free_from_unzip_LRU_list(limit) ||
+ buf_LRU_free_from_common_LRU_list(limit);
}
/** @return a buffer block from the buf_pool.free list
@@ -605,7 +285,7 @@ buf_block_t* buf_LRU_get_free_only()
{
buf_block_t* block;
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
block = reinterpret_cast<buf_block_t*>(
UT_LIST_GET_FIRST(buf_pool.free));
@@ -651,106 +331,89 @@ function will either assert or issue a warning and switch on the
status monitor. */
static void buf_LRU_check_size_of_non_data_objects()
{
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
- if (!recv_recovery_is_on()
- && buf_pool.curr_size == buf_pool.old_size
- && UT_LIST_GET_LEN(buf_pool.free)
- + UT_LIST_GET_LEN(buf_pool.LRU) < buf_pool.curr_size / 20) {
+ if (recv_recovery_is_on() || buf_pool.curr_size != buf_pool.old_size)
+ return;
- ib::fatal() << "Over 95 percent of the buffer pool is"
- " occupied by lock heaps"
+ const auto s= UT_LIST_GET_LEN(buf_pool.free) + UT_LIST_GET_LEN(buf_pool.LRU);
+
+ if (s < buf_pool.curr_size / 20)
+ ib::fatal() << "Over 95 percent of the buffer pool is"
+ " occupied by lock heaps"
#ifdef BTR_CUR_HASH_ADAPT
- " or the adaptive hash index!"
+ " or the adaptive hash index"
#endif /* BTR_CUR_HASH_ADAPT */
- " Check that your transactions do not set too many"
- " row locks, or review if"
- " innodb_buffer_pool_size="
- << (buf_pool.curr_size >> (20U - srv_page_size_shift))
- << "M could be bigger.";
- } else if (!recv_recovery_is_on()
- && buf_pool.curr_size == buf_pool.old_size
- && (UT_LIST_GET_LEN(buf_pool.free)
- + UT_LIST_GET_LEN(buf_pool.LRU))
- < buf_pool.curr_size / 3) {
-
- if (!buf_lru_switched_on_innodb_mon && srv_monitor_timer) {
-
- /* Over 67 % of the buffer pool is occupied by lock
- heaps or the adaptive hash index. This may be a memory
- leak! */
-
- ib::warn() << "Over 67 percent of the buffer pool is"
- " occupied by lock heaps"
+ "! Check that your transactions do not set too many"
+ " row locks, or review if innodb_buffer_pool_size="
+ << (buf_pool.curr_size >> (20U - srv_page_size_shift))
+ << "M could be bigger.";
+
+ if (s < buf_pool.curr_size / 3)
+ {
+ if (!buf_lru_switched_on_innodb_mon && srv_monitor_timer)
+ {
+ /* Over 67 % of the buffer pool is occupied by lock heaps or
+ the adaptive hash index. This may be a memory leak! */
+ ib::warn() << "Over 67 percent of the buffer pool is"
+ " occupied by lock heaps"
#ifdef BTR_CUR_HASH_ADAPT
- " or the adaptive hash index!"
+ " or the adaptive hash index"
#endif /* BTR_CUR_HASH_ADAPT */
- " Check that your transactions do not"
- " set too many row locks."
- " innodb_buffer_pool_size="
- << (buf_pool.curr_size >>
- (20U - srv_page_size_shift)) << "M."
- " Starting the InnoDB Monitor to print"
- " diagnostics.";
-
- buf_lru_switched_on_innodb_mon = true;
- srv_print_innodb_monitor = TRUE;
- srv_monitor_timer_schedule_now();
- }
-
- } else if (buf_lru_switched_on_innodb_mon) {
-
- /* Switch off the InnoDB Monitor; this is a simple way
- to stop the monitor if the situation becomes less urgent,
- but may also surprise users if the user also switched on the
- monitor! */
-
- buf_lru_switched_on_innodb_mon = false;
- srv_print_innodb_monitor = FALSE;
- }
+ "! Check that your transactions do not set too many row locks."
+ " innodb_buffer_pool_size="
+ << (buf_pool.curr_size >> (20U - srv_page_size_shift))
+ << "M. Starting the InnoDB Monitor to print diagnostics.";
+ buf_lru_switched_on_innodb_mon= true;
+ srv_print_innodb_monitor= TRUE;
+ srv_monitor_timer_schedule_now();
+ }
+ }
+ else if (buf_lru_switched_on_innodb_mon)
+ {
+ /* Switch off the InnoDB Monitor; this is a simple way to stop the
+ monitor if the situation becomes less urgent, but may also
+ surprise users who did SET GLOBAL innodb_status_output=ON earlier! */
+ buf_lru_switched_on_innodb_mon= false;
+ srv_print_innodb_monitor= FALSE;
+ }
}
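
The two thresholds above translate to concrete block counts; for a hypothetical 16,384-block buffer pool:

#include <cstdio>

// Fewer than curr_size/20 free+LRU blocks (>95% non-data) is fatal;
// fewer than curr_size/3 (>67% non-data) only warns and starts the monitor.
int main()
{
  const unsigned long curr_size = 16384;
  std::printf("fatal below %lu blocks\n", curr_size / 20);  // 819
  std::printf("warn below %lu blocks\n", curr_size / 3);    // 5461
}
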
-/** Get a free block from the buf_pool. The block is taken off the
-free list. If free list is empty, blocks are moved from the end of the
-LRU list to the free list.
+/** Get a block from the buf_pool.free list.
+If the list is empty, blocks will be moved from the end of buf_pool.LRU
+to buf_pool.free.
This function is called from a user thread when it needs a clean
block to read in a page. Note that we only ever get a block from
the free list. Even when we flush a page or find a page in LRU scan
we put it to free list to be used.
* iteration 0:
- * get a block from free list, success:done
+ * get a block from the buf_pool.free list, success:done
* if buf_pool.try_LRU_scan is set
- * scan LRU up to srv_LRU_scan_depth to find a clean block
- * the above will put the block on free list
+ * scan LRU up to 100 pages to free a clean block
* success:retry the free list
- * flush one dirty page from tail of LRU to disk
- * the above will put the block on free list
+ * flush up to innodb_lru_flush_size LRU blocks to data files
+	  (until UT_LIST_GET_LEN(buf_pool.free) < innodb_lru_scan_depth)
+	* on buf_page_write_complete() the blocks will be put on the buf_pool.free list
* success: retry the free list
-* iteration 1:
- * same as iteration 0 except:
- * scan whole LRU list
- * scan LRU list even if buf_pool.try_LRU_scan is not set
-* iteration > 1:
- * same as iteration 1 but sleep 10ms
+* subsequent iterations: same as iteration 0 except:
+ * scan whole LRU list
+ * scan LRU list even if buf_pool.try_LRU_scan is not set
@param have_mutex whether buf_pool.mutex is already being held
@return the free control block, in state BUF_BLOCK_MEMORY */
buf_block_t* buf_LRU_get_free_block(bool have_mutex)
{
- buf_block_t* block = NULL;
- bool freed = false;
ulint n_iterations = 0;
ulint flush_failures = 0;
MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH);
if (have_mutex) {
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
goto got_mutex;
}
loop:
- mutex_enter(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
got_mutex:
-
buf_LRU_check_size_of_non_data_objects();
DBUG_EXECUTE_IF("ib_lru_force_no_free_page",
@@ -758,49 +421,38 @@ got_mutex:
n_iterations = 21;
goto not_found;});
+retry:
/* If there is a block in the free list, take it */
- block = buf_LRU_get_free_only();
-
- if (block) {
+ if (buf_block_t* block = buf_LRU_get_free_only()) {
if (!have_mutex) {
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
}
memset(&block->page.zip, 0, sizeof block->page.zip);
- return(block);
+ return block;
}
MONITOR_INC( MONITOR_LRU_GET_FREE_LOOPS );
- freed = false;
if (n_iterations || buf_pool.try_LRU_scan) {
/* If no block was in the free list, search from the
end of the LRU list and try to free a block there.
If we are doing for the first time we'll scan only
tail of the LRU list otherwise we scan the whole LRU
list. */
- freed = buf_LRU_scan_and_free_block(n_iterations > 0);
-
- if (!freed && n_iterations == 0) {
- /* Tell other threads that there is no point
- in scanning the LRU list. This flag is set to
- TRUE again when we flush a batch from this
- buffer pool. */
- buf_pool.try_LRU_scan = false;
-
- /* Also tell the page_cleaner thread that
- there is work for it to do. */
- os_event_set(buf_flush_event);
+ if (buf_LRU_scan_and_free_block(n_iterations
+ ? ULINT_UNDEFINED : 100)) {
+ goto retry;
}
+
+ /* Tell other threads that there is no point
+ in scanning the LRU list. */
+ buf_pool.try_LRU_scan = false;
}
#ifndef DBUG_OFF
not_found:
#endif
-
- mutex_exit(&buf_pool.mutex);
-
- if (freed) {
- goto loop;
- }
+ mysql_mutex_unlock(&buf_pool.mutex);
+ buf_flush_wait_batch_end_acquiring_mutex(true);
if (n_iterations > 20 && !buf_lru_free_blocks_error_printed
&& srv_buf_pool_old_size == srv_buf_pool_size) {
@@ -822,18 +474,8 @@ not_found:
buf_lru_free_blocks_error_printed = true;
}
- /* If we have scanned the whole LRU and still are unable to
- find a free block then we should sleep here to let the
- page_cleaner do an LRU batch for us. */
-
- if (!srv_read_only_mode) {
- os_event_set(buf_flush_event);
- }
-
if (n_iterations > 1) {
-
MONITOR_INC( MONITOR_LRU_GET_FREE_WAITS );
- os_thread_sleep(10000);
}
/* No free block was found: try to flush the LRU list.
@@ -844,10 +486,10 @@ not_found:
TODO: A more elegant way would have been to return the freed
up block to the caller here but the code that deals with
removing the block from page_hash and LRU_list is fairly
- involved (particularly in case of compressed pages). We
+ involved (particularly in case of ROW_FORMAT=COMPRESSED pages). We
	can do that in a separate patch sometime in the future. */
- if (!buf_flush_single_page_from_LRU()) {
+ if (!buf_flush_lists(innodb_lru_flush_size, 0)) {
MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT);
++flush_failures;
}
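
Taken together, the hunks above reduce buf_LRU_get_free_block() to a fairly simple retry loop. A condensed sketch of the control flow, assuming the helpers introduced by this patch; this is an illustration, not the literal function body (monitor counters, debug hooks, and the error-reporting path are omitted):

	/* Sketch: the free-block retry loop after this patch (condensed). */
	buf_block_t *sketch_get_free_block()
	{
	  ulint n_iterations= 0;
	  mysql_mutex_lock(&buf_pool.mutex);
	  for (;;)
	  {
	    if (buf_block_t *block= buf_LRU_get_free_only())
	    {
	      mysql_mutex_unlock(&buf_pool.mutex);
	      return block;                      /* free list hit: done */
	    }
	    if ((n_iterations || buf_pool.try_LRU_scan)
	        /* iteration 0: scan up to 100 tail pages; later: whole list */
	        && buf_LRU_scan_and_free_block(n_iterations ? ULINT_UNDEFINED
	                                       : 100))
	      continue;                          /* a block was freed: retry */
	    buf_pool.try_LRU_scan= false;        /* nothing clean in the tail */
	    mysql_mutex_unlock(&buf_pool.mutex);
	    buf_flush_wait_batch_end_acquiring_mutex(true);
	    /* write back up to innodb_lru_flush_size blocks from the LRU */
	    buf_flush_lists(innodb_lru_flush_size, 0);
	    n_iterations++;
	    mysql_mutex_lock(&buf_pool.mutex);
	  }
	}
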
@@ -867,7 +509,7 @@ static void buf_LRU_old_adjust_len()
ulint new_len;
ut_a(buf_pool.LRU_old);
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(buf_pool.LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN);
ut_ad(buf_pool.LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX);
compile_time_assert(BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN
@@ -928,7 +570,7 @@ static void buf_LRU_old_adjust_len()
called when the LRU list grows to BUF_LRU_OLD_MIN_LEN length. */
static void buf_LRU_old_init()
{
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
ut_a(UT_LIST_GET_LEN(buf_pool.LRU) == BUF_LRU_OLD_MIN_LEN);
/* We first initialize all blocks in the LRU list as old and then use
@@ -957,7 +599,7 @@ static void buf_LRU_old_init()
static void buf_unzip_LRU_remove_block_if_needed(buf_page_t* bpage)
{
ut_ad(bpage->in_file());
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
if (bpage->belongs_to_unzip_LRU()) {
buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
@@ -1040,7 +682,7 @@ buf_unzip_LRU_add_block(
ibool old) /*!< in: TRUE if should be put to the end
of the list, else put to the start */
{
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
ut_a(block->page.belongs_to_unzip_LRU());
ut_ad(!block->in_unzip_LRU_list);
ut_d(block->in_unzip_LRU_list = true);
@@ -1064,7 +706,7 @@ buf_LRU_add_block(
LRU list is very short, the block is added to
the start, regardless of this parameter */
{
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(!bpage->in_LRU_list);
if (!old || (UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN)) {
@@ -1124,7 +766,7 @@ void buf_page_make_young(buf_page_t *bpage)
{
ut_ad(bpage->in_file());
- mutex_enter(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
if (UNIV_UNLIKELY(bpage->old))
buf_pool.stat.n_pages_made_young++;
@@ -1132,7 +774,7 @@ void buf_page_make_young(buf_page_t *bpage)
buf_LRU_remove_block(bpage);
buf_LRU_add_block(bpage, false);
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
}
/** Try to free a block. If bpage is a descriptor of a compressed-only
@@ -1147,7 +789,7 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
const page_id_t id(bpage->id());
buf_page_t* b = nullptr;
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(bpage->in_file());
ut_ad(bpage->in_LRU_list);
@@ -1188,7 +830,7 @@ func_exit:
b->set_state(BUF_BLOCK_ZIP_PAGE);
}
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(bpage->in_file());
ut_ad(bpage->in_LRU_list);
@@ -1278,14 +920,7 @@ func_exit:
buf_LRU_add_block(b, b->old);
}
- if (!b->oldest_modification()) {
-#ifdef UNIV_DEBUG
- buf_LRU_insert_zip_clean(b);
-#endif /* UNIV_DEBUG */
- } else {
- /* Relocate on buf_pool.flush_list. */
- buf_flush_relocate_on_flush_list(bpage, b);
- }
+ buf_flush_relocate_on_flush_list(bpage, b);
bpage->zip.data = nullptr;
@@ -1298,25 +933,35 @@ func_exit:
hash_lock->write_unlock();
}
- mutex_exit(&buf_pool.mutex);
-
- /* Remove possible adaptive hash index on the page.
- The page was declared uninitialized by
- buf_LRU_block_remove_hashed(). We need to flag
- the contents of the page valid (which it still is) in
- order to avoid bogus Valgrind or MSAN warnings.*/
buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
- MEM_MAKE_DEFINED(block->frame, srv_page_size);
- btr_search_drop_page_hash_index(block);
- MEM_UNDEFINED(block->frame, srv_page_size);
+#ifdef BTR_CUR_HASH_ADAPT
+ if (block->index) {
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ /* Remove the adaptive hash index on the page.
+ The page was declared uninitialized by
+ buf_LRU_block_remove_hashed(). We need to flag
+ the contents of the page valid (which it still is) in
+ order to avoid bogus Valgrind or MSAN warnings.*/
+
+ MEM_MAKE_DEFINED(block->frame, srv_page_size);
+ btr_search_drop_page_hash_index(block);
+ MEM_UNDEFINED(block->frame, srv_page_size);
+
+ if (UNIV_LIKELY_NULL(b)) {
+ ut_ad(b->zip_size());
+ b->io_unfix();
+ }
+ mysql_mutex_lock(&buf_pool.mutex);
+ } else
+#endif
if (UNIV_LIKELY_NULL(b)) {
ut_ad(b->zip_size());
b->io_unfix();
}
- mutex_enter(&buf_pool.mutex);
buf_LRU_block_free_hashed_page(block);
return(true);
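
The restructured branch above avoids a mutex round-trip when the block carries no adaptive hash index. The underlying pattern, sketched in isolation (the MEM_* macros are MariaDB's Valgrind/MSAN poisoning helpers, no-ops in normal builds):

	/* Sketch: drop buf_pool.mutex only around the expensive AHI removal. */
	if (block->index) {                /* page has an adaptive hash index */
		mysql_mutex_unlock(&buf_pool.mutex);
		MEM_MAKE_DEFINED(block->frame, srv_page_size); /* contents valid */
		btr_search_drop_page_hash_index(block);        /* may scan frame */
		MEM_UNDEFINED(block->frame, srv_page_size);    /* re-poison */
		mysql_mutex_lock(&buf_pool.mutex);
	}
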
@@ -1377,6 +1022,16 @@ buf_LRU_block_free_non_file_page(
MEM_NOACCESS(block->frame, srv_page_size);
}
+/** Release a memory block to the buffer pool. */
+ATTRIBUTE_COLD void buf_pool_t::free_block(buf_block_t *block)
+{
+ ut_ad(this == &buf_pool);
+ mysql_mutex_lock(&mutex);
+ buf_LRU_block_free_non_file_page(block);
+ mysql_mutex_unlock(&mutex);
+}
+
+
/** Remove bpage from buf_pool.LRU and buf_pool.page_hash.
If bpage->state() == BUF_BLOCK_ZIP_PAGE && !bpage->oldest_modification(),
@@ -1395,7 +1050,7 @@ this case the block is already returned to the buddy allocator. */
static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
page_hash_latch *hash_lock, bool zip)
{
- ut_ad(mutex_own(&buf_pool.mutex));
+ mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(hash_lock->is_write_locked());
ut_a(bpage->io_fix() == BUF_IO_NONE);
@@ -1486,9 +1141,6 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
ut_a(bpage->zip.ssize);
ut_ad(!bpage->oldest_modification());
-#ifdef UNIV_DEBUG
- UT_LIST_REMOVE(buf_pool.zip_clean, bpage);
-#endif /* UNIV_DEBUG */
hash_lock->write_unlock();
buf_pool_mutex_exit_forbid();
@@ -1593,7 +1245,7 @@ uint buf_LRU_old_ratio_update(uint old_pct, bool adjust)
}
if (adjust) {
- mutex_enter(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
if (ratio != buf_pool.LRU_old_ratio) {
buf_pool.LRU_old_ratio = ratio;
@@ -1604,7 +1256,7 @@ uint buf_LRU_old_ratio_update(uint old_pct, bool adjust)
}
}
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
} else {
buf_pool.LRU_old_ratio = ratio;
}
@@ -1657,7 +1309,7 @@ void buf_LRU_validate()
ulint old_len;
ulint new_len;
- mutex_enter(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
if (UT_LIST_GET_LEN(buf_pool.LRU) >= BUF_LRU_OLD_MIN_LEN) {
@@ -1735,7 +1387,7 @@ void buf_LRU_validate()
ut_a(block->page.belongs_to_unzip_LRU());
}
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
}
#endif /* UNIV_DEBUG */
@@ -1743,7 +1395,7 @@ void buf_LRU_validate()
/** Dump the LRU list to stderr. */
void buf_LRU_print()
{
- mutex_enter(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU);
bpage != NULL;
@@ -1792,6 +1444,6 @@ void buf_LRU_print()
}
}
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
}
#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */
diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc
index ed36873837e..2f59f1ae4d5 100644
--- a/storage/innobase/buf/buf0rea.cc
+++ b/storage/innobase/buf/buf0rea.cc
@@ -121,7 +121,7 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
const ulint fold= page_id.fold();
- mutex_enter(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
/* We must acquire hash_lock this early to prevent
a race condition with buf_pool_t::watch_remove() */
@@ -237,16 +237,13 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
/* The block must be put to the LRU list, to the old blocks.
The zip size is already set into the page zip */
buf_LRU_add_block(bpage, true/* to old blocks */);
-#ifdef UNIV_DEBUG
- buf_LRU_insert_zip_clean(bpage);
-#endif /* UNIV_DEBUG */
}
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
buf_pool.n_pend_reads++;
goto func_exit_no_mutex;
func_exit:
- mutex_exit(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
func_exit_no_mutex:
if (mode == BUF_READ_IBUF_PAGES_ONLY)
ibuf_mtr_commit(&mtr);
@@ -264,46 +261,48 @@ flag is cleared and the x-lock released by an i/o-handler thread.
@param[out] err DB_SUCCESS or DB_TABLESPACE_DELETED
if we are trying
to read from a non-existent tablespace
+@param[in,out] space tablespace
@param[in] sync true if synchronous aio is desired
@param[in] mode BUF_READ_IBUF_PAGES_ONLY, ...,
@param[in] page_id page id
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in] unzip true=request uncompressed page
-@param[in] ignore whether to ignore out-of-bounds page_id
-@return 1 if a read request was queued, 0 if the page already resided
-in buf_pool, or if the page is in the doublewrite buffer blocks in
-which case it is never read into the pool, or if the tablespace does
-not exist or is being dropped */
+@return whether a read request was queued */
static
-ulint
+bool
buf_read_page_low(
dberr_t* err,
+ fil_space_t* space,
bool sync,
ulint mode,
const page_id_t page_id,
ulint zip_size,
- bool unzip,
- bool ignore = false)
+ bool unzip)
{
buf_page_t* bpage;
*err = DB_SUCCESS;
- if (!page_id.space() && buf_dblwr_page_inside(page_id.page_no())) {
-
+ if (buf_dblwr.is_inside(page_id)) {
ib::error() << "Trying to read doublewrite buffer page "
<< page_id;
- return(0);
+ ut_ad(0);
+nothing_read:
+ space->release();
+ return false;
}
- if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id)) {
+ if (sync) {
+ } else if (trx_sys_hdr_page(page_id)
+ || ibuf_bitmap_page(page_id, zip_size)
+ || (!recv_no_ibuf_operations
+ && ibuf_page(page_id, zip_size, nullptr))) {
/* Trx sys header is so low in the latching order that we play
safe and do not leave the i/o-completion to an asynchronous
- i/o-thread. Ibuf bitmap pages must always be read with
+ i/o-thread. Change buffer pages must always be read with
	synchronous i/o, to make sure they do not get involved in
thread deadlocks. */
-
sync = true;
}
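
The rewritten condition reads naturally as a predicate: the read must be synchronous if the caller requested it, or if the page could participate in a latching-order deadlock when handled by an asynchronous i/o thread. A hedged sketch, using only the predicates named in the hunk above:

	/* Sketch: when buf_read_page_low() forces synchronous I/O. */
	static bool must_read_synchronously(const page_id_t id, ulint zip_size)
	{
		return trx_sys_hdr_page(id)         /* very low in latching order */
			|| ibuf_bitmap_page(id, zip_size) /* change buffer bitmap */
			|| (!recv_no_ibuf_operations
			    && ibuf_page(id, zip_size, nullptr)); /* change buffer page */
	}
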
@@ -314,20 +313,19 @@ buf_read_page_low(
bpage = buf_page_init_for_read(mode, page_id, zip_size, unzip);
if (bpage == NULL) {
-
- return(0);
+ goto nothing_read;
}
- DBUG_LOG("ib_buf",
- "read page " << page_id << " zip_size=" << zip_size
- << " unzip=" << unzip << ',' << (sync ? "sync" : "async"));
-
ut_ad(bpage->in_file());
if (sync) {
- thd_wait_begin(NULL, THD_WAIT_DISKIO);
+ thd_wait_begin(nullptr, THD_WAIT_DISKIO);
}
+ DBUG_LOG("ib_buf",
+ "read page " << page_id << " zip_size=" << zip_size
+ << " unzip=" << unzip << ',' << (sync ? "sync" : "async"));
+
void* dst;
if (zip_size) {
@@ -338,20 +336,18 @@ buf_read_page_low(
dst = ((buf_block_t*) bpage)->frame;
}
- fil_io_t fio = fil_io(
- IORequestRead, sync, page_id, zip_size, 0,
- zip_size ? zip_size : srv_page_size,
- dst, bpage, ignore);
+ const ulint len = zip_size ? zip_size : srv_page_size;
+ auto fio = space->io(IORequest(sync
+ ? IORequest::READ_SYNC
+ : IORequest::READ_ASYNC),
+ page_id.page_no() * len, len, dst, bpage);
*err= fio.err;
if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) {
- if (ignore || fio.err == DB_TABLESPACE_DELETED) {
+ if (!sync || fio.err == DB_TABLESPACE_DELETED) {
buf_pool.corrupted_evict(bpage);
- if (sync && fio.node) {
- fio.node->space->release_for_io();
- }
- return(0);
+ return false;
}
ut_error;
@@ -360,16 +356,16 @@ buf_read_page_low(
if (sync) {
thd_wait_end(NULL);
- /* The i/o was already completed in fil_io() */
+ /* The i/o was already completed in space->io() */
*err = buf_page_read_complete(bpage, *fio.node);
- fio.node->space->release_for_io();
+ space->release();
if (*err != DB_SUCCESS) {
- return(0);
+ return false;
}
}
- return(1);
+ return true;
}
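
Note the new reference-counting contract: buf_read_page_low() now consumes one tablespace reference per call (releasing it on the nothing_read path and after synchronous completion; an asynchronous read hands it to the i/o completion). Callers issuing several reads must therefore re-acquire a reference before each call, as the read-ahead loops in this file do. A minimal usage sketch under that assumption (the page list and its length are hypothetical):

	/* Sketch: issuing several asynchronous reads against one tablespace. */
	if (fil_space_t *space= fil_space_t::get(space_id))
	{
	  for (ulint i= 0; i < n_pages; i++)
	  {
	    if (space->is_stopping())
	      break;
	    dberr_t err;
	    space->reacquire();             /* one reference per queued read */
	    buf_read_page_low(&err, space, false, BUF_READ_ANY_PAGE,
	                      page_id_t(space_id, page_nos[i]),
	                      space->zip_size(), false);
	  }
	  space->release();                 /* drop the caller's reference */
	}
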
/** Applies a random read-ahead in buf_pool if there are at least a threshold
@@ -406,7 +402,7 @@ buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
if (buf_pool.n_pend_reads > buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT)
return 0;
- fil_space_t* space= fil_space_acquire(page_id.space());
+ fil_space_t* space= fil_space_t::get(page_id.space());
if (!space)
return 0;
@@ -414,8 +410,7 @@ buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
ulint count= 5 + buf_read_ahead_area / 8;
const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area);
page_id_t high= low + buf_read_ahead_area;
- high.set_page_no(std::min(high.page_no(),
- static_cast<uint32_t>(space->committed_size - 1)));
+ high.set_page_no(std::min(high.page_no(), space->last_page_number()));
/* Count how many blocks in the area have been recently accessed,
that is, reside near the start of the LRU list. */
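
The read-ahead window is the naturally aligned run of buf_read_ahead_area pages containing the requested page, clamped to the last page of the tablespace. A worked example with illustrative values:

	/* buf_read_ahead_area == 64, requested page_no == 200 */
	uint32_t low_no= 200 - 200 % 64;  /* 192: first page of the area */
	uint32_t high_no= low_no + 64;    /* 256: one past the last page,
	                                     then clamped to
	                                     space->last_page_number() */
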
@@ -431,10 +426,14 @@ buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
goto read_ahead;
}
+no_read_ahead:
space->release();
return 0;
read_ahead:
+ if (!space->acquire_if_not_stopped())
+ goto no_read_ahead;
+
/* Read all the suitable blocks within the area */
const ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;
@@ -442,8 +441,12 @@ read_ahead:
{
if (ibuf_bitmap_page(i, zip_size))
continue;
+ if (space->is_stopping())
+ break;
dberr_t err;
- count+= buf_read_page_low(&err, false, ibuf_mode, i, zip_size, false);
+ space->reacquire();
+ if (buf_read_page_low(&err, space, false, ibuf_mode, i, zip_size, false))
+ count++;
}
if (count)
@@ -474,41 +477,40 @@ after decryption normal page checksum does not match.
@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */
dberr_t buf_read_page(const page_id_t page_id, ulint zip_size)
{
- dberr_t err = DB_SUCCESS;
-
- ulint count = buf_read_page_low(
- &err, true, BUF_READ_ANY_PAGE, page_id, zip_size, false);
-
- srv_stats.buf_pool_reads.add(count);
-
- if (err == DB_TABLESPACE_DELETED) {
- ib::info() << "trying to read page " << page_id
- << " in nonexisting or being-dropped tablespace";
- }
+ fil_space_t *space= fil_space_t::get(page_id.space());
+ if (!space)
+ {
+ ib::info() << "trying to read page " << page_id
+ << " in nonexisting or being-dropped tablespace";
+ return DB_TABLESPACE_DELETED;
+ }
- /* Increment number of I/O operations used for LRU policy. */
- buf_LRU_stat_inc_io();
+ dberr_t err;
+ if (buf_read_page_low(&err, space, true, BUF_READ_ANY_PAGE,
+ page_id, zip_size, false))
+ srv_stats.buf_pool_reads.add(1);
- return(err);
+ buf_LRU_stat_inc_io();
+ return err;
}
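
With the tablespace lookup folded into buf_read_page(), a caller needs only the page identifier. A hedged usage sketch; the failure handler is hypothetical:

	/* Sketch: synchronous read of one page through the buffer pool. */
	dberr_t err= buf_read_page(page_id, zip_size);
	switch (err) {
	case DB_SUCCESS:
	case DB_TABLESPACE_DELETED:    /* missing or dropped: already logged */
		break;
	default:                       /* e.g. DB_PAGE_CORRUPTED,
	                                  DB_DECRYPTION_FAILED */
		handle_read_failure(page_id, err);   /* hypothetical handler */
	}
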
/** High-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
released by the i/o-handler thread.
+@param[in,out] space tablespace
@param[in] page_id page id
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in] sync true if synchronous aio is desired */
-void
-buf_read_page_background(const page_id_t page_id, ulint zip_size, bool sync)
+void buf_read_page_background(fil_space_t *space, const page_id_t page_id,
+ ulint zip_size, bool sync)
{
- ulint count;
dberr_t err;
- count = buf_read_page_low(
- &err, sync,
- BUF_READ_ANY_PAGE,
- page_id, zip_size, false, true);
+ if (buf_read_page_low(&err, space, sync, BUF_READ_ANY_PAGE,
+ page_id, zip_size, false)) {
+ srv_stats.buf_pool_reads.add(1);
+ }
switch (err) {
case DB_SUCCESS:
@@ -530,8 +532,6 @@ buf_read_page_background(const page_id_t page_id, ulint zip_size, bool sync)
<< page_id;
}
- srv_stats.buf_pool_reads.add(count);
-
/* We do not increment number of I/O operations used for LRU policy
here (buf_LRU_stat_inc_io()). We use this in heuristics to decide
 about evicting the uncompressed version of compressed pages from the
@@ -597,12 +597,14 @@ buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
read-ahead, as that could break the ibuf page access order */
return 0;
- fil_space_t *space= fil_space_acquire(page_id.space());
+ fil_space_t *space= fil_space_t::get(page_id.space());
if (!space)
return 0;
- if (high_1.page_no() >= space->committed_size)
+
+ if (high_1.page_no() > space->last_page_number())
{
/* The area is not whole. */
+fail:
space->release();
return 0;
}
@@ -630,8 +632,7 @@ buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
{
hard_fail:
hash_lock->read_unlock();
- space->release();
- return 0;
+ goto fail;
}
const byte *f;
switch (UNIV_EXPECT(bpage->state(), BUF_BLOCK_FILE_PAGE)) {
@@ -663,7 +664,7 @@ hard_fail:
if (id != new_low && id != new_high_1)
/* This is not a border page of the area: return */
goto hard_fail;
- if (new_high_1.page_no() >= space->committed_size)
+ if (new_high_1.page_no() > space->last_page_number())
/* The area is not whole */
goto hard_fail;
}
@@ -673,8 +674,7 @@ failed:
hash_lock->read_unlock();
if (--count)
continue;
- space->release();
- return 0;
+ goto fail;
}
const unsigned accessed= bpage->is_accessed();
@@ -701,8 +701,11 @@ failed:
{
if (ibuf_bitmap_page(new_low, zip_size))
continue;
+ if (space->is_stopping())
+ break;
dberr_t err;
- count+= buf_read_page_low(&err, false, ibuf_mode, new_low, zip_size,
+ space->reacquire();
+ count+= buf_read_page_low(&err, space, false, ibuf_mode, new_low, zip_size,
false);
}
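
Only a border page of its aligned area can trigger linear read-ahead; a request for an interior page returns without doing anything. A sketch of that gate, using the names from the hunks above:

	/* Sketch: the border-page precondition for linear read-ahead. */
	const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area);
	const page_id_t high_1= low + (buf_read_ahead_area - 1);
	if (page_id != low && page_id != high_1)
		return 0;      /* interior page: no read-ahead is attempted */
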
@@ -721,35 +724,25 @@ failed:
}
/** Issues read requests for pages which recovery wants to read in.
-@param[in] sync true if the caller wants this function to wait
-for the highest address page to get read in, before this function returns
@param[in] space_id tablespace id
@param[in] page_nos array of page numbers to read, with the
highest page number the last in the array
-@param[in] n_stored number of page numbers in the array */
-void
-buf_read_recv_pages(
- bool sync,
- ulint space_id,
- const ulint* page_nos,
- ulint n_stored)
+@param[in] n number of page numbers in the array */
+void buf_read_recv_pages(ulint space_id, const uint32_t* page_nos, ulint n)
{
- fil_space_t* space = fil_space_get(space_id);
+ fil_space_t* space = fil_space_t::get(space_id);
- if (space == NULL) {
- /* The tablespace is missing: do nothing */
+ if (!space) {
+ /* The tablespace is missing or unreadable: do nothing */
return;
}
- fil_space_open_if_needed(space);
-
const ulint zip_size = space->zip_size();
- for (ulint i = 0; i < n_stored; i++) {
+ for (ulint i = 0; i < n; i++) {
		/* Ignore if the page is already present in the freed ranges. */
- if (space->freed_ranges.contains(
- static_cast<uint32_t>(page_nos[i]))) {
+ if (space->freed_ranges.contains(page_nos[i])) {
continue;
}
@@ -774,9 +767,10 @@ buf_read_recv_pages(
}
dberr_t err;
- buf_read_page_low(
- &err, sync && i + 1 == n_stored,
- BUF_READ_ANY_PAGE, cur_page_id, zip_size, true);
+ space->reacquire();
+ buf_read_page_low(&err, space, false,
+ BUF_READ_ANY_PAGE, cur_page_id, zip_size,
+ true);
if (err == DB_DECRYPTION_FAILED || err == DB_PAGE_CORRUPTED) {
ib::error() << "Recovery failed to read or decrypt "
@@ -784,6 +778,8 @@ buf_read_recv_pages(
}
}
- DBUG_PRINT("ib_buf", ("recovery read-ahead (%u pages)",
- unsigned(n_stored)));
+
+ DBUG_PRINT("ib_buf", ("recovery read (%u pages) for %s", n,
+ space->chain.start->name));
+ space->release();
}
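
A hedged sketch of a caller driving the revised interface with its uint32_t page-number array (the container shown is illustrative; the actual caller lives in the recovery code):

	/* Sketch: submitting one tablespace's pending recovery reads. */
	#include <vector>

	static void submit_recovery_reads(ulint space_id,
	                                  const std::vector<uint32_t> &pages)
	{
		/* page numbers are expected in ascending order, highest last */
		if (!pages.empty())
			buf_read_recv_pages(space_id, pages.data(), pages.size());
	}
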