diff options
Diffstat (limited to 'storage/innobase/buf/buf0flu.cc')
-rw-r--r-- | storage/innobase/buf/buf0flu.cc | 809 |
1 files changed, 485 insertions, 324 deletions
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index da25b825e7e..b69026ef990 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -26,6 +26,7 @@ Created 11/11/1995 Heikki Tuuri *******************************************************/ #include "univ.i" +#include <my_service_manager.h> #include <mysql/service_thd_wait.h> #include <sql_class.h> @@ -52,10 +53,6 @@ Created 11/11/1995 Heikki Tuuri # include "snappy-c.h" #endif -/** Sleep time in microseconds for loop waiting for the oldest -modification lsn */ -static constexpr ulint buf_flush_wait_flushed_sleep_time = 10000; - /** Number of pages flushed via LRU. Protected by buf_pool.mutex. Also included in buf_flush_page_count. */ ulint buf_lru_flush_page_count; @@ -68,41 +65,27 @@ bool buf_page_cleaner_is_active; /** Factor for scan length to determine n_pages for intended oldest LSN progress */ -static ulint buf_flush_lsn_scan_factor = 3; +static constexpr ulint buf_flush_lsn_scan_factor = 3; /** Average redo generation rate */ static lsn_t lsn_avg_rate = 0; -/** Target oldest LSN for the requested flush_sync */ -static std::atomic<lsn_t> buf_flush_sync_lsn; +/** Target oldest_modification for the page cleaner; writes are protected by +buf_pool.flush_list_mutex */ +static Atomic_relaxed<lsn_t> buf_flush_sync_lsn; #ifdef UNIV_PFS_THREAD mysql_pfs_key_t page_cleaner_thread_key; #endif /* UNIV_PFS_THREAD */ -/** Page cleaner request state for buf_pool */ -struct page_cleaner_slot_t { - ulint n_flushed_list; - /*!< number of flushed pages - by flush_list flushing */ - ulint flush_list_time; - /*!< elapsed time for flush_list - flushing */ - ulint flush_list_pass; - /*!< count to attempt flush_list - flushing */ -}; - /** Page cleaner structure */ -struct page_cleaner_t { - ulint flush_time; /*!< elapsed time to flush - requests for all slots */ - ulint flush_pass; /*!< count to finish to flush - requests for all slots */ - page_cleaner_slot_t slot; -}; - -static page_cleaner_t page_cleaner; +static struct +{ + /** total elapsed time in adaptive flushing, in seconds */ + ulint flush_time; + /** number of adaptive flushing passes */ + ulint flush_pass; +} page_cleaner; #ifdef UNIV_DEBUG my_bool innodb_page_cleaner_disabled_debug; @@ -257,7 +240,7 @@ ulint buf_flush_dirty_pages(ulint id) } mysql_mutex_unlock(&buf_pool.flush_list_mutex); if (n) - buf_flush_lists(ULINT_UNDEFINED, LSN_MAX); + buf_flush_lists(srv_max_io_capacity, LSN_MAX); return n; } @@ -1449,6 +1432,12 @@ static std::atomic_flag log_flush_pending; /** Advance log_sys.get_flushed_lsn() */ static void log_flush(void *) { + /* Between batches, we try to prevent I/O stalls by these calls. + This should not be needed for correctness. */ + os_aio_wait_until_no_pending_writes(); + fil_flush_file_spaces(); + + /* Guarantee progress for buf_flush_lists(). */ log_write_up_to(log_sys.get_lsn(), true); log_flush_pending.clear(); } @@ -1515,65 +1504,199 @@ ulint buf_flush_lists(ulint max_n, lsn_t lsn) return n_flushed; } -/** Request IO burst and wake up the page_cleaner. -@param lsn desired lower bound of oldest_modification */ -static void buf_flush_request_force(lsn_t lsn) + +/** Initiate a log checkpoint, discarding the start of the log. +@param oldest_lsn the checkpoint LSN +@param end_lsn log_sys.get_lsn() +@return true if success, false if a checkpoint write was already running */ +static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn) { - lsn+= lsn_avg_rate * 3; + ut_ad(!srv_read_only_mode); + ut_ad(log_mutex_own()); + ut_ad(oldest_lsn <= end_lsn); + ut_ad(end_lsn == log_sys.get_lsn()); + ut_ad(!recv_no_log_write); + + ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn); + + if (oldest_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT) + /* Some log has been written since the previous checkpoint. */; + else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) + /* MariaDB startup expects the redo log file to be logically empty + (not even containing a FILE_CHECKPOINT record) after a clean shutdown. + Perform an extra checkpoint at shutdown. */; + else + { + /* Do nothing, because nothing was logged (other than a + FILE_CHECKPOINT record) since the previous checkpoint. */ + log_mutex_exit(); + return true; + } - lsn_t o= 0; + /* Repeat the FILE_MODIFY records after the checkpoint, in case some + log records between the checkpoint and log_sys.lsn need them. + Finally, write a FILE_CHECKPOINT record. Redo log apply expects to + see a FILE_CHECKPOINT after the checkpoint, except on clean + shutdown, where the log will be empty after the checkpoint. - while (!buf_flush_sync_lsn.compare_exchange_weak(o, lsn, - std::memory_order_acquire, - std::memory_order_relaxed)) - if (lsn > o) - break; + It is important that we write out the redo log before any further + dirty pages are flushed to the tablespace files. At this point, + because we hold log_sys.mutex, mtr_t::commit() in other threads will + be blocked, and no pages can be added to the flush lists. */ + lsn_t flush_lsn= oldest_lsn; - mysql_cond_signal(&buf_pool.do_flush_list); + if (fil_names_clear(flush_lsn, oldest_lsn != end_lsn || + srv_shutdown_state <= SRV_SHUTDOWN_INITIATED)) + { + flush_lsn= log_sys.get_lsn(); + ut_ad(flush_lsn >= end_lsn + SIZE_OF_FILE_CHECKPOINT); + log_mutex_exit(); + log_write_up_to(flush_lsn, true, true); + log_mutex_enter(); + if (log_sys.last_checkpoint_lsn >= oldest_lsn) + { + log_mutex_exit(); + return true; + } + } + else + ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn); + + ut_ad(log_sys.get_flushed_lsn() >= flush_lsn); + + if (log_sys.n_pending_checkpoint_writes) + { + /* A checkpoint write is running */ + log_mutex_exit(); + return false; + } + + log_sys.next_checkpoint_lsn= oldest_lsn; + log_write_checkpoint_info(end_lsn); + ut_ad(!log_mutex_own()); + + return true; } -/** Wait until a flush batch of the given lsn ends -@param[in] new_oldest target oldest_modified_lsn to wait for */ -void buf_flush_wait_flushed(lsn_t new_oldest) +/** Make a checkpoint. Note that this function does not flush dirty +blocks from the buffer pool: it only checks what is lsn of the oldest +modification in the pool, and writes information about the lsn in +log file. Use log_make_checkpoint() to flush also the pool. +@retval true if the checkpoint was or had been made +@retval false if a checkpoint write was already running */ +static bool log_checkpoint() { - ut_ad(new_oldest); + if (recv_recovery_is_on()) + recv_sys.apply(true); - if (srv_flush_sync) { - /* wake page cleaner for IO burst */ - buf_flush_request_force(new_oldest); - } + switch (srv_file_flush_method) { + case SRV_NOSYNC: + case SRV_O_DIRECT_NO_FSYNC: + break; + default: + fil_flush_file_spaces(); + } - for (;;) { - /* We don't need to wait for fsync of the flushed - blocks, because anyway we need fsync to make chekpoint. - So, we don't need to wait for the batch end here. */ - - mysql_mutex_lock(&buf_pool.flush_list_mutex); - - buf_page_t* bpage; - /* FIXME: Keep temporary tablespace pages in a separate flush - list. We would only need to write out temporary pages if the - page is about to be evicted from the buffer pool, and the page - contents is still needed (the page has not been freed). */ - for (bpage = UT_LIST_GET_LAST(buf_pool.flush_list); - bpage && fsp_is_system_temporary(bpage->id().space()); - bpage = UT_LIST_GET_PREV(list, bpage)) { - ut_ad(bpage->oldest_modification()); - } + log_mutex_enter(); + const lsn_t end_lsn= log_sys.get_lsn(); + log_flush_order_mutex_enter(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + const lsn_t oldest_lsn= buf_pool.get_oldest_modification(end_lsn); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + log_flush_order_mutex_exit(); + return log_checkpoint_low(oldest_lsn, end_lsn); +} - lsn_t oldest = bpage ? bpage->oldest_modification() : 0; +/** Make a checkpoint. */ +ATTRIBUTE_COLD void log_make_checkpoint() +{ + buf_flush_wait_flushed(log_sys.get_lsn()); + while (!log_checkpoint()); +} - mysql_mutex_unlock(&buf_pool.flush_list_mutex); +/** Wait until all persistent pages are flushed up to a limit. +@param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */ +ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn) +{ + ut_ad(sync_lsn); + ut_ad(sync_lsn < LSN_MAX); + ut_ad(!log_mutex_own()); + ut_ad(!srv_read_only_mode); - if (oldest == 0 || oldest >= new_oldest) { - break; - } + if (recv_recovery_is_on()) + recv_sys.apply(true); - /* sleep and retry */ - os_thread_sleep(buf_flush_wait_flushed_sleep_time); + mysql_mutex_lock(&buf_pool.flush_list_mutex); - MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS); - } +#if 1 /* FIXME: remove this, and guarantee that the page cleaner serves us */ + if (UNIV_UNLIKELY(!buf_page_cleaner_is_active) + ut_d(|| innodb_page_cleaner_disabled_debug)) + { + for (;;) + { + const lsn_t lsn= buf_pool.get_oldest_modification(sync_lsn); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + if (lsn >= sync_lsn) + return; + ulint n_pages= buf_flush_lists(srv_max_io_capacity, sync_lsn); + buf_flush_wait_batch_end_acquiring_mutex(false); + if (n_pages) + { + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE, + MONITOR_FLUSH_SYNC_COUNT, + MONITOR_FLUSH_SYNC_PAGES, n_pages); + log_checkpoint(); + } + MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + } + return; + } + else if (UNIV_LIKELY(srv_flush_sync)) +#endif + { + if (buf_flush_sync_lsn < sync_lsn) + { + buf_flush_sync_lsn= sync_lsn; + mysql_cond_signal(&buf_pool.do_flush_list); + } + } + + while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn) + { + tpool::tpool_wait_begin(); + thd_wait_begin(nullptr, THD_WAIT_DISKIO); + mysql_cond_wait(&buf_pool.done_flush_list, &buf_pool.flush_list_mutex); + thd_wait_end(nullptr); + tpool::tpool_wait_end(); + + MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS); + } + + mysql_mutex_unlock(&buf_pool.flush_list_mutex); +} + +/** If innodb_flush_sync=ON, initiate a furious flush. +@param lsn buf_pool.get_oldest_modification(LSN_MAX) target */ +void buf_flush_ahead(lsn_t lsn) +{ + ut_ad(!log_mutex_own()); + ut_ad(!srv_read_only_mode); + + if (recv_recovery_is_on()) + recv_sys.apply(true); + + if (buf_flush_sync_lsn < lsn && + UNIV_LIKELY(srv_flush_sync) && UNIV_LIKELY(buf_page_cleaner_is_active)) + { + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (buf_flush_sync_lsn < lsn) + { + buf_flush_sync_lsn= lsn; + mysql_cond_signal(&buf_pool.do_flush_list); + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + } } /** Wait for pending flushes to complete. */ @@ -1587,29 +1710,84 @@ void buf_flush_wait_batch_end_acquiring_mutex(bool lru) } } +/** Conduct checkpoint-related flushing for innodb_flush_sync=ON, +and try to initiate checkpoints until the target is met. +@param lsn minimum value of buf_pool.get_oldest_modification(LSN_MAX) */ +ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn) +{ + ut_ad(!srv_read_only_mode); + + for (;;) + { + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + if (ulint n_flushed= buf_flush_lists(srv_max_io_capacity, lsn)) + { + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE, + MONITOR_FLUSH_SYNC_COUNT, + MONITOR_FLUSH_SYNC_PAGES, n_flushed); + } + + /* Attempt to perform a log checkpoint upon completing each batch. */ + if (recv_recovery_is_on()) + recv_sys.apply(true); + + switch (srv_file_flush_method) { + case SRV_NOSYNC: + case SRV_O_DIRECT_NO_FSYNC: + break; + default: + fil_flush_file_spaces(); + } + + log_mutex_enter(); + const lsn_t newest_lsn= log_sys.get_lsn(); + log_flush_order_mutex_enter(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + lsn_t measure= buf_pool.get_oldest_modification(0); + log_flush_order_mutex_exit(); + const lsn_t checkpoint_lsn= measure ? measure : newest_lsn; + + if (checkpoint_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT) + { + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + log_checkpoint_low(checkpoint_lsn, newest_lsn); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + measure= buf_pool.get_oldest_modification(LSN_MAX); + } + else + { + log_mutex_exit(); + if (!measure) + measure= LSN_MAX; + } + + ut_ad(!log_mutex_own()); + + /* After attempting log checkpoint, check if we have reached our target. */ + const lsn_t target= buf_flush_sync_lsn; + + if (measure >= target) + buf_flush_sync_lsn= 0; + + /* wake up buf_flush_wait_flushed() */ + mysql_cond_broadcast(&buf_pool.done_flush_list); + + lsn= std::max(lsn, target); + + if (measure >= lsn) + return; + } +} + /*********************************************************************//** Calculates if flushing is required based on number of dirty pages in the buffer pool. +@param dirty_pct 100*flush_list.count / (LRU.count + free.count) @return percent of io_capacity to flush to manage dirty page ratio */ -static -ulint -af_get_pct_for_dirty() +static ulint af_get_pct_for_dirty(double dirty_pct) { - const ulint dirty = UT_LIST_GET_LEN(buf_pool.flush_list); - if (!dirty) { - /* No pages modified */ - return 0; - } - - /* 1 + is there to avoid division by zero (in case the buffer - pool (including the flush_list) was emptied while we are - looking at it) */ - double dirty_pct = 100 * static_cast<double>(dirty) - / static_cast<double>(1 + UT_LIST_GET_LEN(buf_pool.LRU) - + UT_LIST_GET_LEN(buf_pool.free)); - - ut_a(srv_max_dirty_pages_pct_lwm - <= srv_max_buf_pool_modified_pct); + ut_ad(srv_max_dirty_pages_pct_lwm <= srv_max_buf_pool_modified_pct); if (srv_max_dirty_pages_pct_lwm == 0) { /* The user has not set the option to preflush dirty @@ -1620,7 +1798,7 @@ af_get_pct_for_dirty() innodb_io_capacity. */ return(100); } - } else if (dirty_pct >= srv_max_dirty_pages_pct_lwm) { + } else { /* We should start flushing pages gradually. */ return(static_cast<ulint>((dirty_pct * 100) / (srv_max_buf_pool_modified_pct + 1))); @@ -1638,30 +1816,16 @@ af_get_pct_for_lsn( /*===============*/ lsn_t age) /*!< in: current age of LSN. */ { - lsn_t max_async_age; - lsn_t lsn_age_factor; lsn_t af_lwm = static_cast<lsn_t>( srv_adaptive_flushing_lwm - * static_cast<double>(log_get_capacity()) / 100); + * static_cast<double>(log_sys.log_capacity) / 100); if (age < af_lwm) { /* No adaptive flushing. */ return(0); } - max_async_age = log_get_max_modified_age_async(); - - if (age < max_async_age && !srv_adaptive_flushing) { - /* We have still not reached the max_async point and - the user has disabled adaptive flushing. */ - return(0); - } - - /* If we are here then we know that either: - 1) User has enabled adaptive flushing - 2) User may have disabled adaptive flushing but we have reached - max_async_age. */ - lsn_age_factor = (age * 100) / max_async_age; + lsn_t lsn_age_factor = (age * 100) / log_sys.max_modified_age_async; ut_ad(srv_max_io_capacity >= srv_io_capacity); return static_cast<ulint>( @@ -1671,46 +1835,40 @@ af_get_pct_for_lsn( / 7.5)); } -/*********************************************************************//** -This function is called approximately once every second by the -page_cleaner thread. Based on various factors it decides if there is a -need to do flushing. +/** This function is called approximately once every second by the +page_cleaner thread if innodb_adaptive_flushing=ON. +Based on various factors it decides if there is a need to do flushing. @return number of pages recommended to be flushed -@param last_pages_in the number of pages flushed by the last flush_list - flushing. */ -static -ulint -page_cleaner_flush_pages_recommendation(ulint last_pages_in) +@param last_pages_in number of pages flushed in previous batch +@param oldest_lsn buf_pool.get_oldest_modification(0) +@param dirty_pct 100*flush_list.count / (LRU.count + free.count) */ +static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in, + lsn_t oldest_lsn, + double dirty_pct) { static lsn_t prev_lsn = 0; static ulint sum_pages = 0; static ulint avg_page_rate = 0; static ulint n_iterations = 0; static time_t prev_time; - lsn_t oldest_lsn; - lsn_t age; lsn_t lsn_rate; ulint n_pages = 0; - ulint pct_for_dirty = 0; - ulint pct_for_lsn = 0; - ulint pct_total = 0; const lsn_t cur_lsn = log_sys.get_lsn(); + ulint pct_for_dirty = af_get_pct_for_dirty(dirty_pct); + ut_ad(oldest_lsn <= cur_lsn); + ulint pct_for_lsn = af_get_pct_for_lsn(cur_lsn - oldest_lsn); + time_t curr_time = time(nullptr); - if (prev_lsn == 0) { - /* First time around. */ + if (!prev_lsn || !pct_for_lsn) { + prev_time = curr_time; prev_lsn = cur_lsn; - prev_time = time(NULL); - return(0); - } - - if (prev_lsn == cur_lsn) { - return(0); + return ulint(double(pct_for_dirty) / 100.0 + * double(srv_io_capacity)); } sum_pages += last_pages_in; - time_t curr_time = time(NULL); double time_elapsed = difftime(curr_time, prev_time); /* We update our variables every srv_flushing_avg_loops @@ -1740,37 +1898,12 @@ page_cleaner_flush_pages_recommendation(ulint last_pages_in) page_cleaner.flush_time = 0; page_cleaner.flush_pass = 0; - ulint list_tm = page_cleaner.slot.flush_list_time; - ulint list_pass = page_cleaner.slot.flush_list_pass; - page_cleaner.slot.flush_list_time = 0; - page_cleaner.slot.flush_list_pass = 0; - - /* minimum values are 1, to avoid dividing by zero. */ - if (list_tm < 1) { - list_tm = 1; - } - if (flush_tm < 1) { - flush_tm = 1; + if (flush_pass) { + flush_tm /= flush_pass; } - if (list_pass < 1) { - list_pass = 1; - } - if (flush_pass < 1) { - flush_pass = 1; - } - - MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_SLOT, - list_tm / list_pass); - - MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_THREAD, - list_tm / flush_pass); - MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_EST, - flush_tm / flush_pass); - MONITOR_SET(MONITOR_FLUSH_AVG_TIME, flush_tm / flush_pass); - - MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS, list_pass); - MONITOR_SET(MONITOR_FLUSH_AVG_PASS, flush_pass); + MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME, flush_tm); + MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS, flush_pass); prev_lsn = cur_lsn; prev_time = curr_time; @@ -1780,30 +1913,24 @@ page_cleaner_flush_pages_recommendation(ulint last_pages_in) sum_pages = 0; } - oldest_lsn = buf_pool.get_oldest_modification(); - - ut_ad(oldest_lsn <= log_get_lsn()); - - age = cur_lsn > oldest_lsn ? cur_lsn - oldest_lsn : 0; - - pct_for_dirty = af_get_pct_for_dirty(); - pct_for_lsn = af_get_pct_for_lsn(age); + mysql_mutex_lock(&buf_pool.flush_list_mutex); - pct_total = ut_max(pct_for_dirty, pct_for_lsn); + ulint pct_total = std::max(pct_for_dirty, pct_for_lsn); /* Estimate pages to be flushed for the lsn progress */ lsn_t target_lsn = oldest_lsn + lsn_avg_rate * buf_flush_lsn_scan_factor; ulint pages_for_lsn = 0; - mysql_mutex_lock(&buf_pool.flush_list_mutex); for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool.flush_list); b != NULL; b = UT_LIST_GET_PREV(list, b)) { if (b->oldest_modification() > target_lsn) { break; } - ++pages_for_lsn; + if (++pages_for_lsn >= srv_max_io_capacity) { + break; + } } mysql_mutex_unlock(&buf_pool.flush_list_mutex); @@ -1812,11 +1939,6 @@ page_cleaner_flush_pages_recommendation(ulint last_pages_in) pages_for_lsn = 1; } - /* Cap the maximum IO capacity that we are going to use by - max_io_capacity. Limit the value to avoid too quick increase */ - pages_for_lsn = std::min<ulint>( - pages_for_lsn, srv_max_io_capacity * 2); - n_pages = (ulint(double(srv_io_capacity) * double(pct_total) / 100.0) + avg_page_rate + pages_for_lsn) / 3; @@ -1836,183 +1958,222 @@ page_cleaner_flush_pages_recommendation(ulint last_pages_in) return(n_pages); } -/** Initiate a flushing batch. -@param max_n maximum mumber of blocks flushed -@param lsn oldest_modification limit -@return ut_time_ms() at the start of the wait */ -static ulint pc_request_flush_slot(ulint max_n, lsn_t lsn) -{ - ut_ad(max_n); - ut_ad(lsn); - - const ulint flush_start_tm= ut_time_ms(); - page_cleaner.slot.n_flushed_list= buf_flush_lists(max_n, lsn); - page_cleaner.slot.flush_list_time+= ut_time_ms() - flush_start_tm; - page_cleaner.slot.flush_list_pass++; - return flush_start_tm; -} - -#ifdef UNIV_DEBUG -/** Loop used to disable the page cleaner thread. */ -static void buf_flush_page_cleaner_disabled_loop() -{ - while (innodb_page_cleaner_disabled_debug - && srv_shutdown_state == SRV_SHUTDOWN_NONE) { - os_thread_sleep(100000); - } -} -#endif /* UNIV_DEBUG */ - /******************************************************************//** page_cleaner thread tasked with flushing dirty pages from the buffer pools. As of now we'll have only one coordinator. @return a dummy parameter */ static os_thread_ret_t DECLARE_THREAD(buf_flush_page_cleaner)(void*) { - my_thread_init(); + my_thread_init(); #ifdef UNIV_PFS_THREAD - pfs_register_thread(page_cleaner_thread_key); + pfs_register_thread(page_cleaner_thread_key); #endif /* UNIV_PFS_THREAD */ - ut_ad(!srv_read_only_mode); - ut_ad(buf_page_cleaner_is_active); + ut_ad(!srv_read_only_mode); + ut_ad(buf_page_cleaner_is_active); #ifdef UNIV_DEBUG_THREAD_CREATION - ib::info() << "page_cleaner thread running, id " - << os_thread_pf(os_thread_get_curr_id()); + ib::info() << "page_cleaner thread running, id " + << os_thread_pf(os_thread_get_curr_id()); #endif /* UNIV_DEBUG_THREAD_CREATION */ #ifdef UNIV_LINUX - /* linux might be able to set different setting for each thread. - worth to try to set high priority for the page cleaner thread */ - const pid_t tid= static_cast<pid_t>(syscall(SYS_gettid)); - setpriority(PRIO_PROCESS, tid, -20); - if (getpriority(PRIO_PROCESS, tid) != -20) { - ib::info() << "If the mysqld execution user is authorized," - " page cleaner thread priority can be changed." - " See the man page of setpriority()."; - } + /* linux might be able to set different setting for each thread. + worth to try to set high priority for the page cleaner thread */ + const pid_t tid= static_cast<pid_t>(syscall(SYS_gettid)); + setpriority(PRIO_PROCESS, tid, -20); + if (getpriority(PRIO_PROCESS, tid) != -20) + ib::info() << "If the mysqld execution user is authorized," + " page cleaner thread priority can be changed." + " See the man page of setpriority()."; #endif /* UNIV_LINUX */ - ulint curr_time = ut_time_ms(); - ulint n_flushed = 0; - ulint last_activity = srv_get_activity_count(); - ulint last_pages = 0; - - for (ulint next_loop_time = curr_time + 1000; - srv_shutdown_state <= SRV_SHUTDOWN_INITIATED; - curr_time = ut_time_ms()) { - bool sleep_timeout; - - /* The page_cleaner skips sleep if the server is - idle and there are no pending IOs in the buffer pool - and there is work to do. */ - if (next_loop_time <= curr_time) { - sleep_timeout = true; - } else if (!n_flushed || !buf_pool.n_pend_reads - || srv_check_activity(&last_activity)) { - const ulint sleep_ms = std::min<ulint>(next_loop_time - - curr_time, - 1000); - timespec abstime; - set_timespec_nsec(abstime, 1000000ULL * sleep_ms); - mysql_mutex_lock(&buf_pool.flush_list_mutex); - const auto error = mysql_cond_timedwait( - &buf_pool.do_flush_list, - &buf_pool.flush_list_mutex, - &abstime); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - sleep_timeout = error == ETIMEDOUT || error == ETIME; - if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) { - break; - } - } else { - sleep_timeout = false; - } + ulint last_pages= 0; + timespec abstime; + set_timespec(abstime, 1); - if (sleep_timeout) { - /* no activity, slept enough */ - n_flushed = buf_flush_lists(srv_io_capacity, LSN_MAX); - last_pages = n_flushed; + mysql_mutex_lock(&buf_pool.flush_list_mutex); - if (n_flushed) { - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, - MONITOR_FLUSH_BACKGROUND_COUNT, - MONITOR_FLUSH_BACKGROUND_PAGES, - n_flushed); + lsn_t lsn_limit; - } - } else if (lsn_t lsn_limit = buf_flush_sync_lsn.exchange( - 0, std::memory_order_release)) { - page_cleaner.flush_time += ut_time_ms() - - pc_request_flush_slot(ULINT_MAX, lsn_limit); - page_cleaner.flush_pass++; - n_flushed = page_cleaner.slot.n_flushed_list; - - if (n_flushed) { - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_FLUSH_SYNC_TOTAL_PAGE, - MONITOR_FLUSH_SYNC_COUNT, - MONITOR_FLUSH_SYNC_PAGES, - n_flushed); - } - } else if (!srv_check_activity(&last_activity)) { - /* no activity, but woken up by event */ - n_flushed = 0; - } else if (ulint n= page_cleaner_flush_pages_recommendation( - last_pages)) { - /* Estimate pages from flush_list to be flushed */ - ulint tm= pc_request_flush_slot(n, LSN_MAX); - - page_cleaner.flush_time += ut_time_ms() - tm; - page_cleaner.flush_pass++ ; - - n_flushed = page_cleaner.slot.n_flushed_list; - - if (n_flushed) { - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, - MONITOR_FLUSH_ADAPTIVE_COUNT, - MONITOR_FLUSH_ADAPTIVE_PAGES, - n_flushed); - } - } else { - n_flushed = 0; - } + for (;;) + { + lsn_limit= buf_flush_sync_lsn; - if (!n_flushed) { - next_loop_time = curr_time + 1000; - } + if (UNIV_UNLIKELY(lsn_limit != 0)) + { +furious_flush: + buf_flush_sync_for_checkpoint(lsn_limit); + last_pages= 0; + set_timespec(abstime, 1); + continue; + } - ut_d(buf_flush_page_cleaner_disabled_loop()); - } + if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) + break; - if (srv_fast_shutdown != 2) { - buf_flush_wait_batch_end_acquiring_mutex(true); - buf_flush_wait_batch_end_acquiring_mutex(false); - } + mysql_cond_timedwait(&buf_pool.do_flush_list, &buf_pool.flush_list_mutex, + &abstime); + set_timespec(abstime, 1); - mysql_mutex_lock(&buf_pool.flush_list_mutex); - buf_page_cleaner_is_active = false; - mysql_cond_broadcast(&buf_pool.done_flush_list); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); + lsn_limit= buf_flush_sync_lsn; - my_thread_end(); - /* We count the number of threads in os_thread_exit(). A created - thread should always use that to exit and not use return() to exit. */ - os_thread_exit(); + if (UNIV_UNLIKELY(lsn_limit != 0)) + goto furious_flush; - OS_THREAD_DUMMY_RETURN; -} + if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) + break; + + const ulint dirty_blocks= UT_LIST_GET_LEN(buf_pool.flush_list); + + if (!dirty_blocks) + continue; + + /* We perform dirty reads of the LRU+free list lengths here. + Division by zero is not possible, because buf_pool.flush_list is + guaranteed to be nonempty, and it is a subset of buf_pool.LRU. */ + const double dirty_pct= double(dirty_blocks) * 100.0 / + double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free)); + + if (dirty_pct < srv_max_dirty_pages_pct_lwm) + continue; + + const lsn_t oldest_lsn= buf_pool.get_oldest_modification(0); + + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + ulint n_flushed; + + if (!srv_adaptive_flushing) + { + n_flushed= buf_flush_lists(srv_io_capacity, LSN_MAX); + + if (n_flushed) + { + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, + MONITOR_FLUSH_BACKGROUND_COUNT, + MONITOR_FLUSH_BACKGROUND_PAGES, + n_flushed); +do_checkpoint: + /* The periodic log_checkpoint() call here makes it harder to + reproduce bugs in crash recovery or mariabackup --prepare, or + in code that writes the redo log records. Omitting the call + here should not affect correctness, because log_free_check() + should still be invoking checkpoints when needed. */ + DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", goto next;); + + if (!recv_recovery_is_on() && srv_operation == SRV_OPERATION_NORMAL) + log_checkpoint(); + } + } + else if (ulint n= page_cleaner_flush_pages_recommendation(last_pages, + oldest_lsn, + dirty_pct)) + { + page_cleaner.flush_pass++; + const ulint tm= ut_time_ms(); + last_pages= n_flushed= buf_flush_lists(n, LSN_MAX); + page_cleaner.flush_time+= ut_time_ms() - tm; + + if (n_flushed) + { + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, + MONITOR_FLUSH_ADAPTIVE_COUNT, + MONITOR_FLUSH_ADAPTIVE_PAGES, + n_flushed); + goto do_checkpoint; + } + } + +#ifdef UNIV_DEBUG + while (innodb_page_cleaner_disabled_debug && !buf_flush_sync_lsn && + srv_shutdown_state == SRV_SHUTDOWN_NONE) + os_thread_sleep(100000); +#endif /* UNIV_DEBUG */ + +#ifndef DBUG_OFF +next: +#endif /* !DBUG_OFF */ + mysql_mutex_lock(&buf_pool.flush_list_mutex); + } + + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + if (srv_fast_shutdown != 2) + { + buf_flush_wait_batch_end_acquiring_mutex(true); + buf_flush_wait_batch_end_acquiring_mutex(false); + } + + log_flush_task.wait(); + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + lsn_limit= buf_flush_sync_lsn; + if (UNIV_UNLIKELY(lsn_limit != 0)) + goto furious_flush; + buf_page_cleaner_is_active= false; + mysql_cond_broadcast(&buf_pool.done_flush_list); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + my_thread_end(); + /* We count the number of threads in os_thread_exit(). A created + thread should always use that to exit and not use return() to exit. */ + os_thread_exit(); + + OS_THREAD_DUMMY_RETURN; +} /** Initialize page_cleaner. */ -void buf_flush_page_cleaner_init() +ATTRIBUTE_COLD void buf_flush_page_cleaner_init() { ut_ad(!buf_page_cleaner_is_active); + ut_ad(srv_operation == SRV_OPERATION_NORMAL || + srv_operation == SRV_OPERATION_RESTORE || + srv_operation == SRV_OPERATION_RESTORE_EXPORT); + buf_flush_sync_lsn= 0; buf_page_cleaner_is_active= true; os_thread_create(buf_flush_page_cleaner); } +/** @return the number of dirty pages in the buffer pool */ +static ulint buf_flush_list_length() +{ + mysql_mutex_lock(&buf_pool.flush_list_mutex); + const ulint len= UT_LIST_GET_LEN(buf_pool.flush_list); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + return len; +} + +/** Flush the buffer pool on shutdown. */ +ATTRIBUTE_COLD void buf_flush_buffer_pool() +{ + ut_ad(!buf_page_cleaner_is_active); + ut_ad(!buf_flush_sync_lsn); + + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "Waiting to flush the buffer pool"); + + while (buf_pool.n_flush_list || buf_flush_list_length()) + { + buf_flush_lists(srv_max_io_capacity, LSN_MAX); + timespec abstime; + + if (buf_pool.n_flush_list) + { + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "Waiting to flush " ULINTPF " pages", + buf_flush_list_length()); + set_timespec(abstime, INNODB_EXTEND_TIMEOUT_INTERVAL / 2); + mysql_mutex_lock(&buf_pool.mutex); + while (buf_pool.n_flush_list) + mysql_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex, + &abstime); + mysql_mutex_unlock(&buf_pool.mutex); + } + } + + ut_ad(!buf_pool.any_io_pending()); + log_flush_task.wait(); +} + /** Synchronously flush dirty blocks. NOTE: The calling thread is not allowed to hold any buffer page latches! */ void buf_flush_sync() @@ -2021,7 +2182,7 @@ void buf_flush_sync() for (;;) { - const ulint n_flushed= buf_flush_lists(ULINT_UNDEFINED, LSN_MAX); + const ulint n_flushed= buf_flush_lists(srv_max_io_capacity, LSN_MAX); buf_flush_wait_batch_end_acquiring_mutex(false); if (!n_flushed) { |