diff options
31 files changed, 834 insertions, 1189 deletions
diff --git a/extra/mariabackup/xtrabackup.cc b/extra/mariabackup/xtrabackup.cc index 76dabdbfca1..3189bcd14cb 100644 --- a/extra/mariabackup/xtrabackup.cc +++ b/extra/mariabackup/xtrabackup.cc @@ -3011,6 +3011,7 @@ void xb_fil_io_init() { fil_system.create(srv_file_per_table ? 50000 : 5000); + fil_system.space_id_reuse_warned = true; } static diff --git a/mysql-test/suite/innodb/r/innodb_scrub.result b/mysql-test/suite/innodb/r/innodb_scrub.result index f783b9f167c..1a4db0b541e 100644 --- a/mysql-test/suite/innodb/r/innodb_scrub.result +++ b/mysql-test/suite/innodb/r/innodb_scrub.result @@ -6,5 +6,7 @@ UNLOCK TABLES; FOUND 500500 /unicycle|repairman/ in t1.ibd DELETE FROM t1; InnoDB 0 transactions not purged +FLUSH TABLE t1 FOR EXPORT; +UNLOCK TABLES; NOT FOUND /unicycle|repairman/ in t1.ibd DROP TABLE t1; diff --git a/mysql-test/suite/innodb/r/innodb_skip_innodb_is_tables.result b/mysql-test/suite/innodb/r/innodb_skip_innodb_is_tables.result index 6a597a919e1..775bbc017a1 100644 --- a/mysql-test/suite/innodb/r/innodb_skip_innodb_is_tables.result +++ b/mysql-test/suite/innodb/r/innodb_skip_innodb_is_tables.result @@ -89,12 +89,8 @@ buffer_flush_neighbor buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NU buffer_flush_neighbor_pages buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 set_member Pages queued as a neighbor batch buffer_flush_n_to_flush_requested buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of pages requested for flushing. buffer_flush_n_to_flush_by_age buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of pages target by LSN Age for flushing. -buffer_flush_adaptive_avg_time_slot buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Avg time (ms) spent for adaptive flushing recently per slot. -buffer_flush_adaptive_avg_time_thread buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Avg time (ms) spent for adaptive flushing recently per thread. -buffer_flush_adaptive_avg_time_est buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Estimated time (ms) spent for adaptive flushing recently. -buffer_flush_avg_time buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Avg time (ms) spent for flushing recently. +buffer_flush_adaptive_avg_time buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Avg time (ms) spent for adaptive flushing recently. buffer_flush_adaptive_avg_pass buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of adaptive flushes passed during the recent Avg period. -buffer_flush_avg_pass buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of flushes passed during the recent Avg period. buffer_LRU_get_free_loops buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Total loops in LRU get free. buffer_LRU_get_free_waits buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Total sleep waits in LRU get free. buffer_flush_avg_page_rate buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Average number of pages at which flushing is happening @@ -194,7 +190,6 @@ log_lsn_current recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 log_lsn_checkpoint_age recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Current LSN value minus LSN at last checkpoint log_lsn_buf_pool_oldest recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value The oldest modified block LSN in the buffer pool log_max_modified_age_async recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Maximum LSN difference; when exceeded, start asynchronous preflush -log_max_modified_age_sync recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Maximum LSN difference; when exceeded, start synchronous preflush log_pending_log_flushes recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Pending log flushes log_pending_checkpoint_writes recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Pending checkpoints log_num_log_io recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Number of log I/Os @@ -245,7 +240,6 @@ innodb_log_flush_usec server 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NU innodb_dict_lru_usec server 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Time (in microseconds) spent to process DICT LRU list innodb_dict_lru_count_active server 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of tables evicted from DICT LRU list in the active loop innodb_dict_lru_count_idle server 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of tables evicted from DICT LRU list in the idle loop -innodb_checkpoint_usec server 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Time (in microseconds) spent by master thread to do checkpoint innodb_dblwr_writes server 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status_counter Number of doublewrite operations that have been performed (innodb_dblwr_writes) innodb_dblwr_pages_written server 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status_counter Number of pages that have been written for doublewrite operations (innodb_dblwr_pages_written) innodb_page_size server 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value InnoDB page size in bytes (innodb_page_size) diff --git a/mysql-test/suite/innodb/r/monitor.result b/mysql-test/suite/innodb/r/monitor.result index 4aeab1a8402..7bdcf7acb45 100644 --- a/mysql-test/suite/innodb/r/monitor.result +++ b/mysql-test/suite/innodb/r/monitor.result @@ -55,12 +55,8 @@ buffer_flush_neighbor disabled buffer_flush_neighbor_pages disabled buffer_flush_n_to_flush_requested disabled buffer_flush_n_to_flush_by_age disabled -buffer_flush_adaptive_avg_time_slot disabled -buffer_flush_adaptive_avg_time_thread disabled -buffer_flush_adaptive_avg_time_est disabled -buffer_flush_avg_time disabled +buffer_flush_adaptive_avg_time disabled buffer_flush_adaptive_avg_pass disabled -buffer_flush_avg_pass disabled buffer_LRU_get_free_loops disabled buffer_LRU_get_free_waits disabled buffer_flush_avg_page_rate disabled @@ -160,7 +156,6 @@ log_lsn_current disabled log_lsn_checkpoint_age disabled log_lsn_buf_pool_oldest disabled log_max_modified_age_async disabled -log_max_modified_age_sync disabled log_pending_log_flushes disabled log_pending_checkpoint_writes disabled log_num_log_io disabled @@ -211,7 +206,6 @@ innodb_log_flush_usec disabled innodb_dict_lru_usec disabled innodb_dict_lru_count_active disabled innodb_dict_lru_count_idle disabled -innodb_checkpoint_usec disabled innodb_dblwr_writes disabled innodb_dblwr_pages_written disabled innodb_page_size disabled diff --git a/mysql-test/suite/innodb/r/redo_log_during_checkpoint.result b/mysql-test/suite/innodb/r/redo_log_during_checkpoint.result index 3915b07f12e..c7b4bb4403b 100644 --- a/mysql-test/suite/innodb/r/redo_log_during_checkpoint.result +++ b/mysql-test/suite/innodb/r/redo_log_during_checkpoint.result @@ -10,6 +10,7 @@ set global innodb_log_checkpoint_now = 1; ERROR HY000: Lost connection to MySQL server during query # Skip MLOG_FILE_NAME redo records during recovery DROP DATABASE very_long_database_name; +SET GLOBAL innodb_flush_sync=OFF; SET GLOBAL innodb_page_cleaner_disabled_debug = 1; SET GLOBAL innodb_dict_stats_disabled_debug = 1; SET GLOBAL innodb_master_thread_disabled_debug = 1; diff --git a/mysql-test/suite/innodb/t/innodb_scrub.test b/mysql-test/suite/innodb/t/innodb_scrub.test index c1b7fb4df1d..88b4e9cfd76 100644 --- a/mysql-test/suite/innodb/t/innodb_scrub.test +++ b/mysql-test/suite/innodb/t/innodb_scrub.test @@ -22,5 +22,7 @@ let SEARCH_FILE= $MYSQLD_DATADIR/test/t1.ibd; -- source include/search_pattern_in_file.inc DELETE FROM t1; -- source include/wait_all_purged.inc +FLUSH TABLE t1 FOR EXPORT; +UNLOCK TABLES; -- source include/search_pattern_in_file.inc DROP TABLE t1; diff --git a/mysql-test/suite/innodb/t/redo_log_during_checkpoint.test b/mysql-test/suite/innodb/t/redo_log_during_checkpoint.test index 645ae8c7855..85beb3ee1d2 100644 --- a/mysql-test/suite/innodb/t/redo_log_during_checkpoint.test +++ b/mysql-test/suite/innodb/t/redo_log_during_checkpoint.test @@ -44,6 +44,7 @@ set global innodb_log_checkpoint_now = 1; DROP DATABASE very_long_database_name; +SET GLOBAL innodb_flush_sync=OFF; SET GLOBAL innodb_page_cleaner_disabled_debug = 1; SET GLOBAL innodb_dict_stats_disabled_debug = 1; SET GLOBAL innodb_master_thread_disabled_debug = 1; diff --git a/mysql-test/suite/sys_vars/r/innodb_max_dirty_pages_pct_basic.result b/mysql-test/suite/sys_vars/r/innodb_max_dirty_pages_pct_basic.result index ad0ffe9855a..e83f528a94e 100644 --- a/mysql-test/suite/sys_vars/r/innodb_max_dirty_pages_pct_basic.result +++ b/mysql-test/suite/sys_vars/r/innodb_max_dirty_pages_pct_basic.result @@ -7,7 +7,7 @@ SELECT @global_start_value; SET @global_start_max_dirty_lwm_value = @@global.innodb_max_dirty_pages_pct_lwm; SELECT @global_start_max_dirty_lwm_value; @global_start_max_dirty_lwm_value -75 +0 SET @@global.innodb_max_dirty_pages_pct_lwm = 0; SELECT @@global.innodb_max_dirty_pages_pct_lwm; @@global.innodb_max_dirty_pages_pct_lwm @@ -171,5 +171,5 @@ SELECT @@global.innodb_max_dirty_pages_pct; SET @@global.innodb_max_dirty_pages_pct_lwm = @global_start_max_dirty_lwm_value; SELECT @@global.innodb_max_dirty_pages_pct_lwm; @@global.innodb_max_dirty_pages_pct_lwm -75.000000 +0.000000 SET @@global.innodb_max_dirty_pages_pct=@save_innodb_max_dirty_pages_pct; diff --git a/mysql-test/suite/sys_vars/r/innodb_max_dirty_pages_pct_lwm_basic.result b/mysql-test/suite/sys_vars/r/innodb_max_dirty_pages_pct_lwm_basic.result index 313bdf28e82..641386d5f23 100644 --- a/mysql-test/suite/sys_vars/r/innodb_max_dirty_pages_pct_lwm_basic.result +++ b/mysql-test/suite/sys_vars/r/innodb_max_dirty_pages_pct_lwm_basic.result @@ -3,7 +3,7 @@ set @@global.innodb_max_dirty_pages_pct=75; SET @pct_lwm_start_value = @@global.innodb_max_dirty_pages_pct_lwm; SELECT @pct_lwm_start_value; @pct_lwm_start_value -75 +0 SET @pct_start_value = @@global.innodb_max_dirty_pages_pct; SELECT @pct_start_value; @pct_start_value @@ -13,13 +13,13 @@ SET @@global.innodb_max_dirty_pages_pct_lwm = 0; SET @@global.innodb_max_dirty_pages_pct_lwm = DEFAULT; SELECT @@global.innodb_max_dirty_pages_pct_lwm; @@global.innodb_max_dirty_pages_pct_lwm -75.000000 +0.000000 '#---------------------FN_DYNVARS_046_02-------------------------#' SET innodb_max_dirty_pages_pct_lwm = 1; ERROR HY000: Variable 'innodb_max_dirty_pages_pct_lwm' is a GLOBAL variable and should be set with SET GLOBAL SELECT @@innodb_max_dirty_pages_pct_lwm; @@innodb_max_dirty_pages_pct_lwm -75.000000 +0.000000 SELECT local.innodb_max_dirty_pages_pct_lwm; ERROR 42S02: Unknown table 'local' in field list SET global innodb_max_dirty_pages_pct_lwm = 0; @@ -130,5 +130,5 @@ SELECT @@global.innodb_max_dirty_pages_pct; SET @@global.innodb_max_dirty_pages_pct_lwm = @pct_lwm_start_value; SELECT @@global.innodb_max_dirty_pages_pct_lwm; @@global.innodb_max_dirty_pages_pct_lwm -75.000000 +0.000000 SET @@global.innodb_max_dirty_pages_pct=@save_innodb_max_dirty_pages_pct; diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb.result b/mysql-test/suite/sys_vars/r/sysvars_innodb.result index 5227c08052e..eba68ff6185 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_innodb.result +++ b/mysql-test/suite/sys_vars/r/sysvars_innodb.result @@ -1319,7 +1319,7 @@ READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME INNODB_MAX_DIRTY_PAGES_PCT_LWM SESSION_VALUE NULL -DEFAULT_VALUE 75.000000 +DEFAULT_VALUE 0.000000 VARIABLE_SCOPE GLOBAL VARIABLE_TYPE DOUBLE VARIABLE_COMMENT Percentage of dirty pages at which flushing kicks in. diff --git a/storage/innobase/btr/btr0bulk.cc b/storage/innobase/btr/btr0bulk.cc index 791d1d84b53..f602e0d19a2 100644 --- a/storage/innobase/btr/btr0bulk.cc +++ b/storage/innobase/btr/btr0bulk.cc @@ -1108,13 +1108,9 @@ BtrBulk::insert( goto func_exit; } - /* Wake up page cleaner to flush dirty pages. */ srv_inc_activity_count(); - mysql_cond_signal(&buf_pool.do_flush_list); - logFreeCheck(); } - } /* Convert tuple to rec. */ diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 0f25232ebba..daf5e1aa511 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -519,31 +519,6 @@ decrypt_failed: ut_ad(node.space->pending_io()); return true; } - -/** -@return the smallest oldest_modification lsn for any page. -@retval 0 if all modified persistent pages have been flushed */ -lsn_t buf_pool_t::get_oldest_modification() -{ - mysql_mutex_lock(&flush_list_mutex); - - /* FIXME: Keep temporary tablespace pages in a separate flush - list. We would only need to write out temporary pages if the - page is about to be evicted from the buffer pool, and the page - contents is still needed (the page has not been freed). */ - const buf_page_t *bpage; - for (bpage= UT_LIST_GET_LAST(flush_list); - bpage && fsp_is_system_temporary(bpage->id().space()); - bpage= UT_LIST_GET_PREV(list, bpage)) - ut_ad(bpage->oldest_modification()); - - lsn_t oldest_lsn= bpage ? bpage->oldest_modification() : 0; - mysql_mutex_unlock(&flush_list_mutex); - - /* The result may become stale as soon as we released the mutex. - On log checkpoint, also log_sys.flush_order_mutex will be needed. */ - return oldest_lsn; -} #endif /* !UNIV_INNOCHECKSUM */ /** Checks if the page is in crc32 checksum format. @@ -3052,12 +3027,13 @@ buf_page_get_low( break; default: ut_error; + case BUF_GET_POSSIBLY_FREED: + break; case BUF_GET_NO_LATCH: ut_ad(rw_latch == RW_NO_LATCH); /* fall through */ case BUF_GET: case BUF_GET_IF_IN_POOL_OR_WATCH: - case BUF_GET_POSSIBLY_FREED: fil_space_t* s = fil_space_acquire_for_io(page_id.space()); ut_ad(s); ut_ad(s->zip_size() == zip_size); diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc index ee9be7e47d2..b66f5e39744 100644 --- a/storage/innobase/buf/buf0dump.cc +++ b/storage/innobase/buf/buf0dump.cc @@ -665,6 +665,12 @@ buf_load() continue; } + if (space->is_stopping()) { + space->release_for_io(); + space = nullptr; + continue; + } + buf_read_page_background(dump[i], zip_size, true); if (buf_load_abort_flag) { diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index da25b825e7e..b69026ef990 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -26,6 +26,7 @@ Created 11/11/1995 Heikki Tuuri *******************************************************/ #include "univ.i" +#include <my_service_manager.h> #include <mysql/service_thd_wait.h> #include <sql_class.h> @@ -52,10 +53,6 @@ Created 11/11/1995 Heikki Tuuri # include "snappy-c.h" #endif -/** Sleep time in microseconds for loop waiting for the oldest -modification lsn */ -static constexpr ulint buf_flush_wait_flushed_sleep_time = 10000; - /** Number of pages flushed via LRU. Protected by buf_pool.mutex. Also included in buf_flush_page_count. */ ulint buf_lru_flush_page_count; @@ -68,41 +65,27 @@ bool buf_page_cleaner_is_active; /** Factor for scan length to determine n_pages for intended oldest LSN progress */ -static ulint buf_flush_lsn_scan_factor = 3; +static constexpr ulint buf_flush_lsn_scan_factor = 3; /** Average redo generation rate */ static lsn_t lsn_avg_rate = 0; -/** Target oldest LSN for the requested flush_sync */ -static std::atomic<lsn_t> buf_flush_sync_lsn; +/** Target oldest_modification for the page cleaner; writes are protected by +buf_pool.flush_list_mutex */ +static Atomic_relaxed<lsn_t> buf_flush_sync_lsn; #ifdef UNIV_PFS_THREAD mysql_pfs_key_t page_cleaner_thread_key; #endif /* UNIV_PFS_THREAD */ -/** Page cleaner request state for buf_pool */ -struct page_cleaner_slot_t { - ulint n_flushed_list; - /*!< number of flushed pages - by flush_list flushing */ - ulint flush_list_time; - /*!< elapsed time for flush_list - flushing */ - ulint flush_list_pass; - /*!< count to attempt flush_list - flushing */ -}; - /** Page cleaner structure */ -struct page_cleaner_t { - ulint flush_time; /*!< elapsed time to flush - requests for all slots */ - ulint flush_pass; /*!< count to finish to flush - requests for all slots */ - page_cleaner_slot_t slot; -}; - -static page_cleaner_t page_cleaner; +static struct +{ + /** total elapsed time in adaptive flushing, in seconds */ + ulint flush_time; + /** number of adaptive flushing passes */ + ulint flush_pass; +} page_cleaner; #ifdef UNIV_DEBUG my_bool innodb_page_cleaner_disabled_debug; @@ -257,7 +240,7 @@ ulint buf_flush_dirty_pages(ulint id) } mysql_mutex_unlock(&buf_pool.flush_list_mutex); if (n) - buf_flush_lists(ULINT_UNDEFINED, LSN_MAX); + buf_flush_lists(srv_max_io_capacity, LSN_MAX); return n; } @@ -1449,6 +1432,12 @@ static std::atomic_flag log_flush_pending; /** Advance log_sys.get_flushed_lsn() */ static void log_flush(void *) { + /* Between batches, we try to prevent I/O stalls by these calls. + This should not be needed for correctness. */ + os_aio_wait_until_no_pending_writes(); + fil_flush_file_spaces(); + + /* Guarantee progress for buf_flush_lists(). */ log_write_up_to(log_sys.get_lsn(), true); log_flush_pending.clear(); } @@ -1515,65 +1504,199 @@ ulint buf_flush_lists(ulint max_n, lsn_t lsn) return n_flushed; } -/** Request IO burst and wake up the page_cleaner. -@param lsn desired lower bound of oldest_modification */ -static void buf_flush_request_force(lsn_t lsn) + +/** Initiate a log checkpoint, discarding the start of the log. +@param oldest_lsn the checkpoint LSN +@param end_lsn log_sys.get_lsn() +@return true if success, false if a checkpoint write was already running */ +static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn) { - lsn+= lsn_avg_rate * 3; + ut_ad(!srv_read_only_mode); + ut_ad(log_mutex_own()); + ut_ad(oldest_lsn <= end_lsn); + ut_ad(end_lsn == log_sys.get_lsn()); + ut_ad(!recv_no_log_write); + + ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn); + + if (oldest_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT) + /* Some log has been written since the previous checkpoint. */; + else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) + /* MariaDB startup expects the redo log file to be logically empty + (not even containing a FILE_CHECKPOINT record) after a clean shutdown. + Perform an extra checkpoint at shutdown. */; + else + { + /* Do nothing, because nothing was logged (other than a + FILE_CHECKPOINT record) since the previous checkpoint. */ + log_mutex_exit(); + return true; + } - lsn_t o= 0; + /* Repeat the FILE_MODIFY records after the checkpoint, in case some + log records between the checkpoint and log_sys.lsn need them. + Finally, write a FILE_CHECKPOINT record. Redo log apply expects to + see a FILE_CHECKPOINT after the checkpoint, except on clean + shutdown, where the log will be empty after the checkpoint. - while (!buf_flush_sync_lsn.compare_exchange_weak(o, lsn, - std::memory_order_acquire, - std::memory_order_relaxed)) - if (lsn > o) - break; + It is important that we write out the redo log before any further + dirty pages are flushed to the tablespace files. At this point, + because we hold log_sys.mutex, mtr_t::commit() in other threads will + be blocked, and no pages can be added to the flush lists. */ + lsn_t flush_lsn= oldest_lsn; - mysql_cond_signal(&buf_pool.do_flush_list); + if (fil_names_clear(flush_lsn, oldest_lsn != end_lsn || + srv_shutdown_state <= SRV_SHUTDOWN_INITIATED)) + { + flush_lsn= log_sys.get_lsn(); + ut_ad(flush_lsn >= end_lsn + SIZE_OF_FILE_CHECKPOINT); + log_mutex_exit(); + log_write_up_to(flush_lsn, true, true); + log_mutex_enter(); + if (log_sys.last_checkpoint_lsn >= oldest_lsn) + { + log_mutex_exit(); + return true; + } + } + else + ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn); + + ut_ad(log_sys.get_flushed_lsn() >= flush_lsn); + + if (log_sys.n_pending_checkpoint_writes) + { + /* A checkpoint write is running */ + log_mutex_exit(); + return false; + } + + log_sys.next_checkpoint_lsn= oldest_lsn; + log_write_checkpoint_info(end_lsn); + ut_ad(!log_mutex_own()); + + return true; } -/** Wait until a flush batch of the given lsn ends -@param[in] new_oldest target oldest_modified_lsn to wait for */ -void buf_flush_wait_flushed(lsn_t new_oldest) +/** Make a checkpoint. Note that this function does not flush dirty +blocks from the buffer pool: it only checks what is lsn of the oldest +modification in the pool, and writes information about the lsn in +log file. Use log_make_checkpoint() to flush also the pool. +@retval true if the checkpoint was or had been made +@retval false if a checkpoint write was already running */ +static bool log_checkpoint() { - ut_ad(new_oldest); + if (recv_recovery_is_on()) + recv_sys.apply(true); - if (srv_flush_sync) { - /* wake page cleaner for IO burst */ - buf_flush_request_force(new_oldest); - } + switch (srv_file_flush_method) { + case SRV_NOSYNC: + case SRV_O_DIRECT_NO_FSYNC: + break; + default: + fil_flush_file_spaces(); + } - for (;;) { - /* We don't need to wait for fsync of the flushed - blocks, because anyway we need fsync to make chekpoint. - So, we don't need to wait for the batch end here. */ - - mysql_mutex_lock(&buf_pool.flush_list_mutex); - - buf_page_t* bpage; - /* FIXME: Keep temporary tablespace pages in a separate flush - list. We would only need to write out temporary pages if the - page is about to be evicted from the buffer pool, and the page - contents is still needed (the page has not been freed). */ - for (bpage = UT_LIST_GET_LAST(buf_pool.flush_list); - bpage && fsp_is_system_temporary(bpage->id().space()); - bpage = UT_LIST_GET_PREV(list, bpage)) { - ut_ad(bpage->oldest_modification()); - } + log_mutex_enter(); + const lsn_t end_lsn= log_sys.get_lsn(); + log_flush_order_mutex_enter(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + const lsn_t oldest_lsn= buf_pool.get_oldest_modification(end_lsn); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + log_flush_order_mutex_exit(); + return log_checkpoint_low(oldest_lsn, end_lsn); +} - lsn_t oldest = bpage ? bpage->oldest_modification() : 0; +/** Make a checkpoint. */ +ATTRIBUTE_COLD void log_make_checkpoint() +{ + buf_flush_wait_flushed(log_sys.get_lsn()); + while (!log_checkpoint()); +} - mysql_mutex_unlock(&buf_pool.flush_list_mutex); +/** Wait until all persistent pages are flushed up to a limit. +@param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */ +ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn) +{ + ut_ad(sync_lsn); + ut_ad(sync_lsn < LSN_MAX); + ut_ad(!log_mutex_own()); + ut_ad(!srv_read_only_mode); - if (oldest == 0 || oldest >= new_oldest) { - break; - } + if (recv_recovery_is_on()) + recv_sys.apply(true); - /* sleep and retry */ - os_thread_sleep(buf_flush_wait_flushed_sleep_time); + mysql_mutex_lock(&buf_pool.flush_list_mutex); - MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS); - } +#if 1 /* FIXME: remove this, and guarantee that the page cleaner serves us */ + if (UNIV_UNLIKELY(!buf_page_cleaner_is_active) + ut_d(|| innodb_page_cleaner_disabled_debug)) + { + for (;;) + { + const lsn_t lsn= buf_pool.get_oldest_modification(sync_lsn); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + if (lsn >= sync_lsn) + return; + ulint n_pages= buf_flush_lists(srv_max_io_capacity, sync_lsn); + buf_flush_wait_batch_end_acquiring_mutex(false); + if (n_pages) + { + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE, + MONITOR_FLUSH_SYNC_COUNT, + MONITOR_FLUSH_SYNC_PAGES, n_pages); + log_checkpoint(); + } + MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + } + return; + } + else if (UNIV_LIKELY(srv_flush_sync)) +#endif + { + if (buf_flush_sync_lsn < sync_lsn) + { + buf_flush_sync_lsn= sync_lsn; + mysql_cond_signal(&buf_pool.do_flush_list); + } + } + + while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn) + { + tpool::tpool_wait_begin(); + thd_wait_begin(nullptr, THD_WAIT_DISKIO); + mysql_cond_wait(&buf_pool.done_flush_list, &buf_pool.flush_list_mutex); + thd_wait_end(nullptr); + tpool::tpool_wait_end(); + + MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS); + } + + mysql_mutex_unlock(&buf_pool.flush_list_mutex); +} + +/** If innodb_flush_sync=ON, initiate a furious flush. +@param lsn buf_pool.get_oldest_modification(LSN_MAX) target */ +void buf_flush_ahead(lsn_t lsn) +{ + ut_ad(!log_mutex_own()); + ut_ad(!srv_read_only_mode); + + if (recv_recovery_is_on()) + recv_sys.apply(true); + + if (buf_flush_sync_lsn < lsn && + UNIV_LIKELY(srv_flush_sync) && UNIV_LIKELY(buf_page_cleaner_is_active)) + { + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (buf_flush_sync_lsn < lsn) + { + buf_flush_sync_lsn= lsn; + mysql_cond_signal(&buf_pool.do_flush_list); + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + } } /** Wait for pending flushes to complete. */ @@ -1587,29 +1710,84 @@ void buf_flush_wait_batch_end_acquiring_mutex(bool lru) } } +/** Conduct checkpoint-related flushing for innodb_flush_sync=ON, +and try to initiate checkpoints until the target is met. +@param lsn minimum value of buf_pool.get_oldest_modification(LSN_MAX) */ +ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn) +{ + ut_ad(!srv_read_only_mode); + + for (;;) + { + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + if (ulint n_flushed= buf_flush_lists(srv_max_io_capacity, lsn)) + { + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE, + MONITOR_FLUSH_SYNC_COUNT, + MONITOR_FLUSH_SYNC_PAGES, n_flushed); + } + + /* Attempt to perform a log checkpoint upon completing each batch. */ + if (recv_recovery_is_on()) + recv_sys.apply(true); + + switch (srv_file_flush_method) { + case SRV_NOSYNC: + case SRV_O_DIRECT_NO_FSYNC: + break; + default: + fil_flush_file_spaces(); + } + + log_mutex_enter(); + const lsn_t newest_lsn= log_sys.get_lsn(); + log_flush_order_mutex_enter(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + lsn_t measure= buf_pool.get_oldest_modification(0); + log_flush_order_mutex_exit(); + const lsn_t checkpoint_lsn= measure ? measure : newest_lsn; + + if (checkpoint_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT) + { + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + log_checkpoint_low(checkpoint_lsn, newest_lsn); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + measure= buf_pool.get_oldest_modification(LSN_MAX); + } + else + { + log_mutex_exit(); + if (!measure) + measure= LSN_MAX; + } + + ut_ad(!log_mutex_own()); + + /* After attempting log checkpoint, check if we have reached our target. */ + const lsn_t target= buf_flush_sync_lsn; + + if (measure >= target) + buf_flush_sync_lsn= 0; + + /* wake up buf_flush_wait_flushed() */ + mysql_cond_broadcast(&buf_pool.done_flush_list); + + lsn= std::max(lsn, target); + + if (measure >= lsn) + return; + } +} + /*********************************************************************//** Calculates if flushing is required based on number of dirty pages in the buffer pool. +@param dirty_pct 100*flush_list.count / (LRU.count + free.count) @return percent of io_capacity to flush to manage dirty page ratio */ -static -ulint -af_get_pct_for_dirty() +static ulint af_get_pct_for_dirty(double dirty_pct) { - const ulint dirty = UT_LIST_GET_LEN(buf_pool.flush_list); - if (!dirty) { - /* No pages modified */ - return 0; - } - - /* 1 + is there to avoid division by zero (in case the buffer - pool (including the flush_list) was emptied while we are - looking at it) */ - double dirty_pct = 100 * static_cast<double>(dirty) - / static_cast<double>(1 + UT_LIST_GET_LEN(buf_pool.LRU) - + UT_LIST_GET_LEN(buf_pool.free)); - - ut_a(srv_max_dirty_pages_pct_lwm - <= srv_max_buf_pool_modified_pct); + ut_ad(srv_max_dirty_pages_pct_lwm <= srv_max_buf_pool_modified_pct); if (srv_max_dirty_pages_pct_lwm == 0) { /* The user has not set the option to preflush dirty @@ -1620,7 +1798,7 @@ af_get_pct_for_dirty() innodb_io_capacity. */ return(100); } - } else if (dirty_pct >= srv_max_dirty_pages_pct_lwm) { + } else { /* We should start flushing pages gradually. */ return(static_cast<ulint>((dirty_pct * 100) / (srv_max_buf_pool_modified_pct + 1))); @@ -1638,30 +1816,16 @@ af_get_pct_for_lsn( /*===============*/ lsn_t age) /*!< in: current age of LSN. */ { - lsn_t max_async_age; - lsn_t lsn_age_factor; lsn_t af_lwm = static_cast<lsn_t>( srv_adaptive_flushing_lwm - * static_cast<double>(log_get_capacity()) / 100); + * static_cast<double>(log_sys.log_capacity) / 100); if (age < af_lwm) { /* No adaptive flushing. */ return(0); } - max_async_age = log_get_max_modified_age_async(); - - if (age < max_async_age && !srv_adaptive_flushing) { - /* We have still not reached the max_async point and - the user has disabled adaptive flushing. */ - return(0); - } - - /* If we are here then we know that either: - 1) User has enabled adaptive flushing - 2) User may have disabled adaptive flushing but we have reached - max_async_age. */ - lsn_age_factor = (age * 100) / max_async_age; + lsn_t lsn_age_factor = (age * 100) / log_sys.max_modified_age_async; ut_ad(srv_max_io_capacity >= srv_io_capacity); return static_cast<ulint>( @@ -1671,46 +1835,40 @@ af_get_pct_for_lsn( / 7.5)); } -/*********************************************************************//** -This function is called approximately once every second by the -page_cleaner thread. Based on various factors it decides if there is a -need to do flushing. +/** This function is called approximately once every second by the +page_cleaner thread if innodb_adaptive_flushing=ON. +Based on various factors it decides if there is a need to do flushing. @return number of pages recommended to be flushed -@param last_pages_in the number of pages flushed by the last flush_list - flushing. */ -static -ulint -page_cleaner_flush_pages_recommendation(ulint last_pages_in) +@param last_pages_in number of pages flushed in previous batch +@param oldest_lsn buf_pool.get_oldest_modification(0) +@param dirty_pct 100*flush_list.count / (LRU.count + free.count) */ +static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in, + lsn_t oldest_lsn, + double dirty_pct) { static lsn_t prev_lsn = 0; static ulint sum_pages = 0; static ulint avg_page_rate = 0; static ulint n_iterations = 0; static time_t prev_time; - lsn_t oldest_lsn; - lsn_t age; lsn_t lsn_rate; ulint n_pages = 0; - ulint pct_for_dirty = 0; - ulint pct_for_lsn = 0; - ulint pct_total = 0; const lsn_t cur_lsn = log_sys.get_lsn(); + ulint pct_for_dirty = af_get_pct_for_dirty(dirty_pct); + ut_ad(oldest_lsn <= cur_lsn); + ulint pct_for_lsn = af_get_pct_for_lsn(cur_lsn - oldest_lsn); + time_t curr_time = time(nullptr); - if (prev_lsn == 0) { - /* First time around. */ + if (!prev_lsn || !pct_for_lsn) { + prev_time = curr_time; prev_lsn = cur_lsn; - prev_time = time(NULL); - return(0); - } - - if (prev_lsn == cur_lsn) { - return(0); + return ulint(double(pct_for_dirty) / 100.0 + * double(srv_io_capacity)); } sum_pages += last_pages_in; - time_t curr_time = time(NULL); double time_elapsed = difftime(curr_time, prev_time); /* We update our variables every srv_flushing_avg_loops @@ -1740,37 +1898,12 @@ page_cleaner_flush_pages_recommendation(ulint last_pages_in) page_cleaner.flush_time = 0; page_cleaner.flush_pass = 0; - ulint list_tm = page_cleaner.slot.flush_list_time; - ulint list_pass = page_cleaner.slot.flush_list_pass; - page_cleaner.slot.flush_list_time = 0; - page_cleaner.slot.flush_list_pass = 0; - - /* minimum values are 1, to avoid dividing by zero. */ - if (list_tm < 1) { - list_tm = 1; - } - if (flush_tm < 1) { - flush_tm = 1; + if (flush_pass) { + flush_tm /= flush_pass; } - if (list_pass < 1) { - list_pass = 1; - } - if (flush_pass < 1) { - flush_pass = 1; - } - - MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_SLOT, - list_tm / list_pass); - - MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_THREAD, - list_tm / flush_pass); - MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_EST, - flush_tm / flush_pass); - MONITOR_SET(MONITOR_FLUSH_AVG_TIME, flush_tm / flush_pass); - - MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS, list_pass); - MONITOR_SET(MONITOR_FLUSH_AVG_PASS, flush_pass); + MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME, flush_tm); + MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS, flush_pass); prev_lsn = cur_lsn; prev_time = curr_time; @@ -1780,30 +1913,24 @@ page_cleaner_flush_pages_recommendation(ulint last_pages_in) sum_pages = 0; } - oldest_lsn = buf_pool.get_oldest_modification(); - - ut_ad(oldest_lsn <= log_get_lsn()); - - age = cur_lsn > oldest_lsn ? cur_lsn - oldest_lsn : 0; - - pct_for_dirty = af_get_pct_for_dirty(); - pct_for_lsn = af_get_pct_for_lsn(age); + mysql_mutex_lock(&buf_pool.flush_list_mutex); - pct_total = ut_max(pct_for_dirty, pct_for_lsn); + ulint pct_total = std::max(pct_for_dirty, pct_for_lsn); /* Estimate pages to be flushed for the lsn progress */ lsn_t target_lsn = oldest_lsn + lsn_avg_rate * buf_flush_lsn_scan_factor; ulint pages_for_lsn = 0; - mysql_mutex_lock(&buf_pool.flush_list_mutex); for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool.flush_list); b != NULL; b = UT_LIST_GET_PREV(list, b)) { if (b->oldest_modification() > target_lsn) { break; } - ++pages_for_lsn; + if (++pages_for_lsn >= srv_max_io_capacity) { + break; + } } mysql_mutex_unlock(&buf_pool.flush_list_mutex); @@ -1812,11 +1939,6 @@ page_cleaner_flush_pages_recommendation(ulint last_pages_in) pages_for_lsn = 1; } - /* Cap the maximum IO capacity that we are going to use by - max_io_capacity. Limit the value to avoid too quick increase */ - pages_for_lsn = std::min<ulint>( - pages_for_lsn, srv_max_io_capacity * 2); - n_pages = (ulint(double(srv_io_capacity) * double(pct_total) / 100.0) + avg_page_rate + pages_for_lsn) / 3; @@ -1836,183 +1958,222 @@ page_cleaner_flush_pages_recommendation(ulint last_pages_in) return(n_pages); } -/** Initiate a flushing batch. -@param max_n maximum mumber of blocks flushed -@param lsn oldest_modification limit -@return ut_time_ms() at the start of the wait */ -static ulint pc_request_flush_slot(ulint max_n, lsn_t lsn) -{ - ut_ad(max_n); - ut_ad(lsn); - - const ulint flush_start_tm= ut_time_ms(); - page_cleaner.slot.n_flushed_list= buf_flush_lists(max_n, lsn); - page_cleaner.slot.flush_list_time+= ut_time_ms() - flush_start_tm; - page_cleaner.slot.flush_list_pass++; - return flush_start_tm; -} - -#ifdef UNIV_DEBUG -/** Loop used to disable the page cleaner thread. */ -static void buf_flush_page_cleaner_disabled_loop() -{ - while (innodb_page_cleaner_disabled_debug - && srv_shutdown_state == SRV_SHUTDOWN_NONE) { - os_thread_sleep(100000); - } -} -#endif /* UNIV_DEBUG */ - /******************************************************************//** page_cleaner thread tasked with flushing dirty pages from the buffer pools. As of now we'll have only one coordinator. @return a dummy parameter */ static os_thread_ret_t DECLARE_THREAD(buf_flush_page_cleaner)(void*) { - my_thread_init(); + my_thread_init(); #ifdef UNIV_PFS_THREAD - pfs_register_thread(page_cleaner_thread_key); + pfs_register_thread(page_cleaner_thread_key); #endif /* UNIV_PFS_THREAD */ - ut_ad(!srv_read_only_mode); - ut_ad(buf_page_cleaner_is_active); + ut_ad(!srv_read_only_mode); + ut_ad(buf_page_cleaner_is_active); #ifdef UNIV_DEBUG_THREAD_CREATION - ib::info() << "page_cleaner thread running, id " - << os_thread_pf(os_thread_get_curr_id()); + ib::info() << "page_cleaner thread running, id " + << os_thread_pf(os_thread_get_curr_id()); #endif /* UNIV_DEBUG_THREAD_CREATION */ #ifdef UNIV_LINUX - /* linux might be able to set different setting for each thread. - worth to try to set high priority for the page cleaner thread */ - const pid_t tid= static_cast<pid_t>(syscall(SYS_gettid)); - setpriority(PRIO_PROCESS, tid, -20); - if (getpriority(PRIO_PROCESS, tid) != -20) { - ib::info() << "If the mysqld execution user is authorized," - " page cleaner thread priority can be changed." - " See the man page of setpriority()."; - } + /* linux might be able to set different setting for each thread. + worth to try to set high priority for the page cleaner thread */ + const pid_t tid= static_cast<pid_t>(syscall(SYS_gettid)); + setpriority(PRIO_PROCESS, tid, -20); + if (getpriority(PRIO_PROCESS, tid) != -20) + ib::info() << "If the mysqld execution user is authorized," + " page cleaner thread priority can be changed." + " See the man page of setpriority()."; #endif /* UNIV_LINUX */ - ulint curr_time = ut_time_ms(); - ulint n_flushed = 0; - ulint last_activity = srv_get_activity_count(); - ulint last_pages = 0; - - for (ulint next_loop_time = curr_time + 1000; - srv_shutdown_state <= SRV_SHUTDOWN_INITIATED; - curr_time = ut_time_ms()) { - bool sleep_timeout; - - /* The page_cleaner skips sleep if the server is - idle and there are no pending IOs in the buffer pool - and there is work to do. */ - if (next_loop_time <= curr_time) { - sleep_timeout = true; - } else if (!n_flushed || !buf_pool.n_pend_reads - || srv_check_activity(&last_activity)) { - const ulint sleep_ms = std::min<ulint>(next_loop_time - - curr_time, - 1000); - timespec abstime; - set_timespec_nsec(abstime, 1000000ULL * sleep_ms); - mysql_mutex_lock(&buf_pool.flush_list_mutex); - const auto error = mysql_cond_timedwait( - &buf_pool.do_flush_list, - &buf_pool.flush_list_mutex, - &abstime); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - sleep_timeout = error == ETIMEDOUT || error == ETIME; - if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) { - break; - } - } else { - sleep_timeout = false; - } + ulint last_pages= 0; + timespec abstime; + set_timespec(abstime, 1); - if (sleep_timeout) { - /* no activity, slept enough */ - n_flushed = buf_flush_lists(srv_io_capacity, LSN_MAX); - last_pages = n_flushed; + mysql_mutex_lock(&buf_pool.flush_list_mutex); - if (n_flushed) { - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, - MONITOR_FLUSH_BACKGROUND_COUNT, - MONITOR_FLUSH_BACKGROUND_PAGES, - n_flushed); + lsn_t lsn_limit; - } - } else if (lsn_t lsn_limit = buf_flush_sync_lsn.exchange( - 0, std::memory_order_release)) { - page_cleaner.flush_time += ut_time_ms() - - pc_request_flush_slot(ULINT_MAX, lsn_limit); - page_cleaner.flush_pass++; - n_flushed = page_cleaner.slot.n_flushed_list; - - if (n_flushed) { - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_FLUSH_SYNC_TOTAL_PAGE, - MONITOR_FLUSH_SYNC_COUNT, - MONITOR_FLUSH_SYNC_PAGES, - n_flushed); - } - } else if (!srv_check_activity(&last_activity)) { - /* no activity, but woken up by event */ - n_flushed = 0; - } else if (ulint n= page_cleaner_flush_pages_recommendation( - last_pages)) { - /* Estimate pages from flush_list to be flushed */ - ulint tm= pc_request_flush_slot(n, LSN_MAX); - - page_cleaner.flush_time += ut_time_ms() - tm; - page_cleaner.flush_pass++ ; - - n_flushed = page_cleaner.slot.n_flushed_list; - - if (n_flushed) { - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, - MONITOR_FLUSH_ADAPTIVE_COUNT, - MONITOR_FLUSH_ADAPTIVE_PAGES, - n_flushed); - } - } else { - n_flushed = 0; - } + for (;;) + { + lsn_limit= buf_flush_sync_lsn; - if (!n_flushed) { - next_loop_time = curr_time + 1000; - } + if (UNIV_UNLIKELY(lsn_limit != 0)) + { +furious_flush: + buf_flush_sync_for_checkpoint(lsn_limit); + last_pages= 0; + set_timespec(abstime, 1); + continue; + } - ut_d(buf_flush_page_cleaner_disabled_loop()); - } + if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) + break; - if (srv_fast_shutdown != 2) { - buf_flush_wait_batch_end_acquiring_mutex(true); - buf_flush_wait_batch_end_acquiring_mutex(false); - } + mysql_cond_timedwait(&buf_pool.do_flush_list, &buf_pool.flush_list_mutex, + &abstime); + set_timespec(abstime, 1); - mysql_mutex_lock(&buf_pool.flush_list_mutex); - buf_page_cleaner_is_active = false; - mysql_cond_broadcast(&buf_pool.done_flush_list); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); + lsn_limit= buf_flush_sync_lsn; - my_thread_end(); - /* We count the number of threads in os_thread_exit(). A created - thread should always use that to exit and not use return() to exit. */ - os_thread_exit(); + if (UNIV_UNLIKELY(lsn_limit != 0)) + goto furious_flush; - OS_THREAD_DUMMY_RETURN; -} + if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) + break; + + const ulint dirty_blocks= UT_LIST_GET_LEN(buf_pool.flush_list); + + if (!dirty_blocks) + continue; + + /* We perform dirty reads of the LRU+free list lengths here. + Division by zero is not possible, because buf_pool.flush_list is + guaranteed to be nonempty, and it is a subset of buf_pool.LRU. */ + const double dirty_pct= double(dirty_blocks) * 100.0 / + double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free)); + + if (dirty_pct < srv_max_dirty_pages_pct_lwm) + continue; + + const lsn_t oldest_lsn= buf_pool.get_oldest_modification(0); + + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + ulint n_flushed; + + if (!srv_adaptive_flushing) + { + n_flushed= buf_flush_lists(srv_io_capacity, LSN_MAX); + + if (n_flushed) + { + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, + MONITOR_FLUSH_BACKGROUND_COUNT, + MONITOR_FLUSH_BACKGROUND_PAGES, + n_flushed); +do_checkpoint: + /* The periodic log_checkpoint() call here makes it harder to + reproduce bugs in crash recovery or mariabackup --prepare, or + in code that writes the redo log records. Omitting the call + here should not affect correctness, because log_free_check() + should still be invoking checkpoints when needed. */ + DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", goto next;); + + if (!recv_recovery_is_on() && srv_operation == SRV_OPERATION_NORMAL) + log_checkpoint(); + } + } + else if (ulint n= page_cleaner_flush_pages_recommendation(last_pages, + oldest_lsn, + dirty_pct)) + { + page_cleaner.flush_pass++; + const ulint tm= ut_time_ms(); + last_pages= n_flushed= buf_flush_lists(n, LSN_MAX); + page_cleaner.flush_time+= ut_time_ms() - tm; + + if (n_flushed) + { + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, + MONITOR_FLUSH_ADAPTIVE_COUNT, + MONITOR_FLUSH_ADAPTIVE_PAGES, + n_flushed); + goto do_checkpoint; + } + } + +#ifdef UNIV_DEBUG + while (innodb_page_cleaner_disabled_debug && !buf_flush_sync_lsn && + srv_shutdown_state == SRV_SHUTDOWN_NONE) + os_thread_sleep(100000); +#endif /* UNIV_DEBUG */ + +#ifndef DBUG_OFF +next: +#endif /* !DBUG_OFF */ + mysql_mutex_lock(&buf_pool.flush_list_mutex); + } + + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + if (srv_fast_shutdown != 2) + { + buf_flush_wait_batch_end_acquiring_mutex(true); + buf_flush_wait_batch_end_acquiring_mutex(false); + } + + log_flush_task.wait(); + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + lsn_limit= buf_flush_sync_lsn; + if (UNIV_UNLIKELY(lsn_limit != 0)) + goto furious_flush; + buf_page_cleaner_is_active= false; + mysql_cond_broadcast(&buf_pool.done_flush_list); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + my_thread_end(); + /* We count the number of threads in os_thread_exit(). A created + thread should always use that to exit and not use return() to exit. */ + os_thread_exit(); + + OS_THREAD_DUMMY_RETURN; +} /** Initialize page_cleaner. */ -void buf_flush_page_cleaner_init() +ATTRIBUTE_COLD void buf_flush_page_cleaner_init() { ut_ad(!buf_page_cleaner_is_active); + ut_ad(srv_operation == SRV_OPERATION_NORMAL || + srv_operation == SRV_OPERATION_RESTORE || + srv_operation == SRV_OPERATION_RESTORE_EXPORT); + buf_flush_sync_lsn= 0; buf_page_cleaner_is_active= true; os_thread_create(buf_flush_page_cleaner); } +/** @return the number of dirty pages in the buffer pool */ +static ulint buf_flush_list_length() +{ + mysql_mutex_lock(&buf_pool.flush_list_mutex); + const ulint len= UT_LIST_GET_LEN(buf_pool.flush_list); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + return len; +} + +/** Flush the buffer pool on shutdown. */ +ATTRIBUTE_COLD void buf_flush_buffer_pool() +{ + ut_ad(!buf_page_cleaner_is_active); + ut_ad(!buf_flush_sync_lsn); + + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "Waiting to flush the buffer pool"); + + while (buf_pool.n_flush_list || buf_flush_list_length()) + { + buf_flush_lists(srv_max_io_capacity, LSN_MAX); + timespec abstime; + + if (buf_pool.n_flush_list) + { + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "Waiting to flush " ULINTPF " pages", + buf_flush_list_length()); + set_timespec(abstime, INNODB_EXTEND_TIMEOUT_INTERVAL / 2); + mysql_mutex_lock(&buf_pool.mutex); + while (buf_pool.n_flush_list) + mysql_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex, + &abstime); + mysql_mutex_unlock(&buf_pool.mutex); + } + } + + ut_ad(!buf_pool.any_io_pending()); + log_flush_task.wait(); +} + /** Synchronously flush dirty blocks. NOTE: The calling thread is not allowed to hold any buffer page latches! */ void buf_flush_sync() @@ -2021,7 +2182,7 @@ void buf_flush_sync() for (;;) { - const ulint n_flushed= buf_flush_lists(ULINT_UNDEFINED, LSN_MAX); + const ulint n_flushed= buf_flush_lists(srv_max_io_capacity, LSN_MAX); buf_flush_wait_batch_end_acquiring_mutex(false); if (!n_flushed) { diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc index f4207ff393e..bc81a8e9b86 100644 --- a/storage/innobase/buf/buf0rea.cc +++ b/storage/innobase/buf/buf0rea.cc @@ -438,6 +438,8 @@ read_ahead: { if (ibuf_bitmap_page(i, zip_size)) continue; + if (space->is_stopping()) + break; dberr_t err; count+= buf_read_page_low(&err, false, ibuf_mode, i, zip_size, false); } @@ -697,6 +699,8 @@ failed: { if (ibuf_bitmap_page(new_low, zip_size)) continue; + if (space->is_stopping()) + break; dberr_t err; count+= buf_read_page_low(&err, false, ibuf_mode, new_low, zip_size, false); diff --git a/storage/innobase/dict/dict0boot.cc b/storage/innobase/dict/dict0boot.cc index 359684416ed..bd2cf4ffdd8 100644 --- a/storage/innobase/dict/dict0boot.cc +++ b/storage/innobase/dict/dict0boot.cc @@ -273,6 +273,11 @@ dict_boot(void) dict_sys.row_id = DICT_HDR_ROW_ID_WRITE_MARGIN + ut_uint64_align_up(mach_read_from_8(dict_hdr + DICT_HDR_ROW_ID), DICT_HDR_ROW_ID_WRITE_MARGIN); + if (ulint max_space_id = mach_read_from_4(dict_hdr + + DICT_HDR_MAX_SPACE_ID)) { + max_space_id--; + fil_assign_new_space_id(&max_space_id); + } /* Insert into the dictionary cache the descriptions of the basic system tables */ diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 021f2f15e3b..2da60b079f7 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -1142,7 +1142,6 @@ fil_space_create( UT_LIST_INIT(space->chain, &fil_node_t::chain); if ((purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_IMPORT) - && !recv_recovery_is_on() && id > fil_system.max_assigned_id) { if (!fil_system.space_id_reuse_warned) { fil_system.space_id_reuse_warned = true; diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index d6f28192893..97cce77135b 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -19074,7 +19074,7 @@ static MYSQL_SYSVAR_DOUBLE(max_dirty_pages_pct_lwm, srv_max_dirty_pages_pct_lwm, PLUGIN_VAR_RQCMDARG, "Percentage of dirty pages at which flushing kicks in.", - NULL, innodb_max_dirty_pages_pct_lwm_update, 75.0, 0, 99.999, 0); + NULL, innodb_max_dirty_pages_pct_lwm_update, 0, 0, 99.999, 0); static MYSQL_SYSVAR_DOUBLE(adaptive_flushing_lwm, srv_adaptive_flushing_lwm, diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index d5b65bb7ed8..d2b52c4f520 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -1567,8 +1567,18 @@ public: /** @return the smallest oldest_modification lsn for any page - @retval 0 if all modified persistent pages have been flushed */ - lsn_t get_oldest_modification(); + @retval empty_lsn if all modified persistent pages have been flushed */ + lsn_t get_oldest_modification(lsn_t empty_lsn) + { + mysql_mutex_assert_owner(&flush_list_mutex); + const buf_page_t *bpage= UT_LIST_GET_LAST(flush_list); +#if 1 /* MDEV-12227 FIXME: remove this loop */ + for (; bpage && fsp_is_system_temporary(bpage->id().space()); + bpage= UT_LIST_GET_PREV(list, bpage)) + ut_ad(bpage->oldest_modification()); +#endif + return bpage ? bpage->oldest_modification() : empty_lsn; + } /** Determine if a buffer block was created by chunk_t::create(). @param block block descriptor (not dereferenced) diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h index 12ebf6f01e9..148db809077 100644 --- a/storage/innobase/include/buf0flu.h +++ b/storage/innobase/include/buf0flu.h @@ -104,9 +104,13 @@ ulint buf_flush_lists(ulint max_n, lsn_t lsn); /** Wait until a flush batch ends. @param lru true=buf_pool.LRU; false=buf_pool.flush_list */ void buf_flush_wait_batch_end(bool lru); -/** Wait until a flush batch of the given lsn ends -@param[in] new_oldest target oldest_modified_lsn to wait for */ -void buf_flush_wait_flushed(lsn_t new_oldest); +/** Wait until all persistent pages are flushed up to a limit. +@param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */ +ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn); +/** If innodb_flush_sync=ON, initiate a furious flush. +@param lsn buf_pool.get_oldest_modification(LSN_MAX) target */ +void buf_flush_ahead(lsn_t lsn); + /********************************************************************//** This function should be called at a mini-transaction commit, if a page was modified in it. Puts the block to the list of modified blocks, if it not @@ -122,11 +126,14 @@ buf_flush_note_modification( set of mtr's */ /** Initialize page_cleaner. */ -void buf_flush_page_cleaner_init(); +ATTRIBUTE_COLD void buf_flush_page_cleaner_init(); /** Wait for pending flushes to complete. */ void buf_flush_wait_batch_end_acquiring_mutex(bool lru); +/** Flush the buffer pool on shutdown. */ +ATTRIBUTE_COLD void buf_flush_buffer_pool(); + #ifdef UNIV_DEBUG /** Validate the flush list. */ void buf_flush_validate(); diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index c80537f809b..b82857a8e0f 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -88,50 +88,12 @@ log_free_check(void); @param[in] len requested minimum size in bytes */ void log_buffer_extend(ulong len); -/** Check margin not to overwrite transaction log from the last checkpoint. -If would estimate the log write to exceed the log_capacity, -waits for the checkpoint is done enough. -@param[in] len length of the data to be written */ - -void -log_margin_checkpoint_age( - ulint len); - -/** Open the log for log_write_low. The log must be closed with log_close. -@param[in] len length of the data to be written -@return start lsn of the log record */ -lsn_t -log_reserve_and_open( - ulint len); -/************************************************************//** -Writes to the log the string given. It is assumed that the caller holds the -log mutex. */ -void -log_write_low( -/*==========*/ - const byte* str, /*!< in: string */ - ulint str_len); /*!< in: string length */ -/************************************************************//** -Closes the log. -@return lsn */ -lsn_t -log_close(void); -/*===========*/ /** Read the current LSN. */ #define log_get_lsn() log_sys.get_lsn() /** Read the durable LSN */ #define log_get_flush_lsn() log_sys.get_flushed_lsn() -/**************************************************************** -Get log_sys::max_modified_age_async. It is OK to read the value without -holding log_sys::mutex because it is constant. -@return max_modified_age_async */ -UNIV_INLINE -lsn_t -log_get_max_modified_age_async(void); -/*================================*/ - /** Calculate the recommended highest values for lsn - last_checkpoint_lsn and lsn - buf_pool.get_oldest_modification(). @param[in] file_size requested innodb_log_file_size @@ -159,30 +121,22 @@ void log_buffer_flush_to_disk( bool sync = true); -/** Make a checkpoint. Note that this function does not flush dirty -blocks from the buffer pool: it only checks what is lsn of the oldest -modification in the pool, and writes information about the lsn in -log file. Use log_make_checkpoint() to flush also the pool. -@return true if success, false if a checkpoint write was already running */ -bool log_checkpoint(); - /** Make a checkpoint */ -void log_make_checkpoint(); +ATTRIBUTE_COLD void log_make_checkpoint(); /** Make a checkpoint at the latest lsn on shutdown. */ -void logs_empty_and_mark_files_at_shutdown(); +ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown(); /** Write checkpoint info to the log header and invoke log_mutex_exit(). @param[in] end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */ -void log_write_checkpoint_info(lsn_t end_lsn); +ATTRIBUTE_COLD void log_write_checkpoint_info(lsn_t end_lsn); /** Checks that there is enough free space in the log to start a new query step. Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this function may only be called if the calling thread owns no synchronization objects! */ -void -log_check_margins(void); +ATTRIBUTE_COLD void log_check_margins(); /************************************************************//** Gets a log block flush bit. @@ -521,10 +475,6 @@ private: std::atomic<lsn_t> lsn; /** the first guaranteed-durable log sequence number */ std::atomic<lsn_t> flushed_to_disk_lsn; -public: - /** first free offset within the log buffer in use */ - size_t buf_free; -private: /** set when there may be need to flush the log buffer, or preflush buffer pool pages, or initiate a log checkpoint. This must hold if lsn - last_checkpoint_lsn > max_checkpoint_age. */ @@ -534,6 +484,10 @@ public: /** mutex protecting the log */ MY_ALIGNED(CACHE_LINE_SIZE) LogSysMutex mutex; + /** first free offset within the log buffer in use */ + size_t buf_free; + /** recommended maximum size of buf, after which the buffer is flushed */ + size_t max_buf_free; /** mutex to serialize access to the flush list when we are putting dirty blocks in the list. The idea behind this mutex is to be able to release log_sys.mutex during mtr_commit and still ensure that @@ -545,8 +499,6 @@ public: /** log_buffer, writing data to file from this buffer. Before flushing write_buf is swapped with flush_buf */ byte *flush_buf; - /** recommended maximum size of buf, after which the buffer is flushed */ - size_t max_buf_free; /** Log file stuff. Protected by mutex. */ struct file { /** format of the redo log: e.g., FORMAT_10_5 */ @@ -664,17 +616,6 @@ public: buf_pool.get_oldest_modification() is exceeded, we start an asynchronous preflush of pool pages */ - lsn_t max_modified_age_sync; - /*!< when this recommended - value for lsn - - buf_pool.get_oldest_modification() - is exceeded, we start a - synchronous preflush of pool pages */ - lsn_t max_checkpoint_age_async; - /*!< when this checkpoint age - is exceeded we start an - asynchronous writing of a new - checkpoint */ lsn_t max_checkpoint_age; /*!< this is the maximum allowed value for lsn - last_checkpoint_lsn when a @@ -721,7 +662,10 @@ public: { flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); } bool check_flush_or_checkpoint() const - { return check_flush_or_checkpoint_.load(std::memory_order_relaxed); } + { + return UNIV_UNLIKELY + (check_flush_or_checkpoint_.load(std::memory_order_relaxed)); + } void set_check_flush_or_checkpoint(bool flag= true) { check_flush_or_checkpoint_.store(flag, std::memory_order_relaxed); } @@ -784,11 +728,6 @@ extern log_t log_sys; extern bool log_write_lock_own(); #endif -/** Gets the log capacity. It is OK to read the value without -holding log_sys.mutex because it is constant. -@return log capacity */ -inline lsn_t log_get_capacity(void) { return log_sys.log_capacity; } - /** Calculate the offset of a log sequence number. @param[in] lsn log sequence number @return offset within the log */ diff --git a/storage/innobase/include/log0log.ic b/storage/innobase/include/log0log.ic index 4fdc2b0258e..d7232bc5944 100644 --- a/storage/innobase/include/log0log.ic +++ b/storage/innobase/include/log0log.ic @@ -290,18 +290,6 @@ log_reserve_and_write_fast( return lsn; } -/**************************************************************** -Get log_sys::max_modified_age_async. It is OK to read the value without -holding log_sys::mutex because it is constant. -@return max_modified_age_async */ -UNIV_INLINE -lsn_t -log_get_max_modified_age_async(void) -/*================================*/ -{ - return(log_sys.max_modified_age_async); -} - /***********************************************************************//** Checks if there is need for a log buffer flush or a new checkpoint, and does this if yes. Any database operation should call this when it has modified diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index 4487bf94e01..72eadc60ae8 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -628,8 +628,8 @@ private: /** Append the redo log records to the redo log buffer. @param len number of bytes to write - @return start_lsn */ - inline lsn_t finish_write(ulint len); + @return {start_lsn,flush_ahead} */ + inline std::pair<lsn_t,bool> finish_write(ulint len); /** Release the resources */ inline void release_resources(); diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h index a18ff5d49ad..33d8c57a744 100644 --- a/storage/innobase/include/srv0mon.h +++ b/storage/innobase/include/srv0mon.h @@ -195,13 +195,9 @@ enum monitor_id_t { MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, - MONITOR_FLUSH_ADAPTIVE_AVG_TIME_SLOT, - MONITOR_FLUSH_ADAPTIVE_AVG_TIME_THREAD, - MONITOR_FLUSH_ADAPTIVE_AVG_TIME_EST, - MONITOR_FLUSH_AVG_TIME, + MONITOR_FLUSH_ADAPTIVE_AVG_TIME, MONITOR_FLUSH_ADAPTIVE_AVG_PASS, - MONITOR_FLUSH_AVG_PASS, MONITOR_LRU_GET_FREE_LOOPS, MONITOR_LRU_GET_FREE_WAITS, @@ -318,7 +314,6 @@ enum monitor_id_t { MONITOR_LSN_CHECKPOINT_AGE, MONITOR_OVLD_BUF_OLDEST_LSN, MONITOR_OVLD_MAX_AGE_ASYNC, - MONITOR_OVLD_MAX_AGE_SYNC, MONITOR_PENDING_LOG_FLUSH, MONITOR_PENDING_CHECKPOINT_WRITE, MONITOR_LOG_IO, @@ -394,7 +389,6 @@ enum monitor_id_t { MONITOR_SRV_DICT_LRU_MICROSECOND, MONITOR_SRV_DICT_LRU_EVICT_COUNT_ACTIVE, MONITOR_SRV_DICT_LRU_EVICT_COUNT_IDLE, - MONITOR_SRV_CHECKPOINT_MICROSECOND, MONITOR_OVLD_SRV_DBLWR_WRITES, MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN, MONITOR_OVLD_SRV_PAGE_SIZE, diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 915cc3ffd4f..44712c5ae66 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -657,12 +657,6 @@ ulint srv_get_activity_count(void); /*========================*/ -/** Check if there has been any activity. -@param[in,out] activity_count recent activity count to be returned -if there is a change -@return FALSE if no change in activity counter. */ -bool srv_check_activity(ulint *activity_count); - /******************************************************************//** Increment the server activity counter. */ void diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index 7563f30e8fb..472e39130c3 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -63,14 +63,6 @@ to the InnoDB redo log. */ /** Redo log system */ log_t log_sys; -/* These control how often we print warnings if the last checkpoint is too -old */ -static bool log_has_printed_chkp_warning = false; -static time_t log_last_warning_time; - -static bool log_has_printed_chkp_margine_warning = false; -static time_t log_last_margine_warning_time; - /* A margin for free space in the log buffer before a log entry is catenated */ #define LOG_BUF_WRITE_MARGIN (4 * OS_FILE_LOG_BLOCK_SIZE) @@ -79,31 +71,6 @@ static time_t log_last_margine_warning_time; #define LOG_BUF_FLUSH_MARGIN (LOG_BUF_WRITE_MARGIN \ + (4U << srv_page_size_shift)) -/* This parameter controls asynchronous making of a new checkpoint; the value -should be bigger than LOG_POOL_PREFLUSH_RATIO_SYNC */ - -#define LOG_POOL_CHECKPOINT_RATIO_ASYNC 32 - -/* This parameter controls synchronous preflushing of modified buffer pages */ -#define LOG_POOL_PREFLUSH_RATIO_SYNC 16 - -/* The same ratio for asynchronous preflushing; this value should be less than -the previous */ -#define LOG_POOL_PREFLUSH_RATIO_ASYNC 8 - -/** Return the oldest modified LSN in buf_pool.flush_list, -or the latest LSN if all pages are clean. -@return LSN of oldest modification */ -static lsn_t log_buf_pool_get_oldest_modification() -{ - ut_ad(log_mutex_own()); - log_flush_order_mutex_enter(); - lsn_t lsn= buf_pool.get_oldest_modification(); - log_flush_order_mutex_exit(); - - return lsn ? lsn : log_sys.get_lsn(); -} - /** Extends the log buffer. @param[in] len requested minimum size in bytes */ void log_buffer_extend(ulong len) @@ -151,276 +118,6 @@ void log_buffer_extend(ulong len) << new_buf_size << "."; } -/** Calculate actual length in redo buffer and file including -block header and trailer. -@param[in] len length to write -@return actual length to write including header and trailer. */ -static inline -ulint -log_calculate_actual_len( - ulint len) -{ - ut_ad(log_mutex_own()); - - const ulint framing_size = log_sys.framing_size(); - /* actual length stored per block */ - const ulint len_per_blk = OS_FILE_LOG_BLOCK_SIZE - framing_size; - - /* actual data length in last block already written */ - ulint extra_len = (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE); - - ut_ad(extra_len >= LOG_BLOCK_HDR_SIZE); - extra_len -= LOG_BLOCK_HDR_SIZE; - - /* total extra length for block header and trailer */ - extra_len = ((len + extra_len) / len_per_blk) * framing_size; - - return(len + extra_len); -} - -/** Check margin not to overwrite transaction log from the last checkpoint. -If would estimate the log write to exceed the log_capacity, -waits for the checkpoint is done enough. -@param[in] len length of the data to be written */ - -void -log_margin_checkpoint_age( - ulint len) -{ - ulint margin = log_calculate_actual_len(len); - - ut_ad(log_mutex_own()); - - if (margin > log_sys.log_capacity) { - /* return with warning output to avoid deadlock */ - if (!log_has_printed_chkp_margine_warning - || difftime(time(NULL), - log_last_margine_warning_time) > 15) { - log_has_printed_chkp_margine_warning = true; - log_last_margine_warning_time = time(NULL); - - ib::error() << "The transaction log file is too" - " small for the single transaction log (size=" - << len << "). So, the last checkpoint age" - " might exceed the log capacity " - << log_sys.log_capacity << "."; - } - - return; - } - - /* Our margin check should ensure that we never reach this condition. - Try to do checkpoint once. We cannot keep waiting here as it might - result in hang in case the current mtr has latch on oldest lsn */ - const lsn_t lsn = log_sys.get_lsn(); - - if (lsn - log_sys.last_checkpoint_lsn + margin - > log_sys.log_capacity) { - /* The log write of 'len' might overwrite the transaction log - after the last checkpoint. Makes checkpoint. */ - - const bool flushed_enough = lsn - - log_buf_pool_get_oldest_modification() + margin - <= log_sys.log_capacity; - - log_sys.set_check_flush_or_checkpoint(); - log_mutex_exit(); - - DEBUG_SYNC_C("margin_checkpoint_age_rescue"); - - if (!flushed_enough) { - os_thread_sleep(100000); - } - log_checkpoint(); - - log_mutex_enter(); - } - - return; -} - -/** Open the log for log_write_low. The log must be closed with log_close. -@param[in] len length of the data to be written -@return start lsn of the log record */ -lsn_t -log_reserve_and_open( - ulint len) -{ - ulint len_upper_limit; -#ifdef UNIV_DEBUG - ulint count = 0; -#endif /* UNIV_DEBUG */ - -loop: - ut_ad(log_mutex_own()); - - /* Calculate an upper limit for the space the string may take in the - log buffer */ - - len_upper_limit = LOG_BUF_WRITE_MARGIN + srv_log_write_ahead_size - + (5 * len) / 4; - - if (log_sys.buf_free + len_upper_limit > srv_log_buffer_size) { - log_mutex_exit(); - - DEBUG_SYNC_C("log_buf_size_exceeded"); - - /* Not enough free space, do a write of the log buffer */ - log_sys.initiate_write(false); - - srv_stats.log_waits.inc(); - - ut_ad(++count < 50); - - log_mutex_enter(); - goto loop; - } - - return(log_sys.get_lsn()); -} - -/************************************************************//** -Writes to the log the string given. It is assumed that the caller holds the -log mutex. */ -void -log_write_low( -/*==========*/ - const byte* str, /*!< in: string */ - ulint str_len) /*!< in: string length */ -{ - ulint len; - - ut_ad(log_mutex_own()); - const ulint trailer_offset = log_sys.trailer_offset(); -part_loop: - /* Calculate a part length */ - - ulint data_len = (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len; - - if (data_len <= trailer_offset) { - - /* The string fits within the current log block */ - - len = str_len; - } else { - data_len = trailer_offset; - - len = trailer_offset - - log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE; - } - - memcpy(log_sys.buf + log_sys.buf_free, str, len); - - str_len -= len; - str = str + len; - - byte* log_block = static_cast<byte*>( - ut_align_down(log_sys.buf + log_sys.buf_free, - OS_FILE_LOG_BLOCK_SIZE)); - - log_block_set_data_len(log_block, data_len); - lsn_t lsn = log_sys.get_lsn(); - - if (data_len == trailer_offset) { - /* This block became full */ - log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE); - log_block_set_checkpoint_no(log_block, - log_sys.next_checkpoint_no); - len += log_sys.framing_size(); - - lsn += len; - - /* Initialize the next block header */ - log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE, lsn); - } else { - lsn += len; - } - - log_sys.set_lsn(lsn); - log_sys.buf_free += len; - - ut_ad(log_sys.buf_free <= size_t{srv_log_buffer_size}); - - if (str_len > 0) { - goto part_loop; - } - - srv_stats.log_write_requests.inc(); -} - -/************************************************************//** -Closes the log. -@return lsn */ -lsn_t -log_close(void) -/*===========*/ -{ - byte* log_block; - ulint first_rec_group; - lsn_t oldest_lsn; - lsn_t lsn; - lsn_t checkpoint_age; - - ut_ad(log_mutex_own()); - - lsn = log_sys.get_lsn(); - - log_block = static_cast<byte*>( - ut_align_down(log_sys.buf + log_sys.buf_free, - OS_FILE_LOG_BLOCK_SIZE)); - - first_rec_group = log_block_get_first_rec_group(log_block); - - if (first_rec_group == 0) { - /* We initialized a new log block which was not written - full by the current mtr: the next mtr log record group - will start within this block at the offset data_len */ - - log_block_set_first_rec_group( - log_block, log_block_get_data_len(log_block)); - } - - if (log_sys.buf_free > log_sys.max_buf_free) { - log_sys.set_check_flush_or_checkpoint(); - } - - checkpoint_age = lsn - log_sys.last_checkpoint_lsn; - - if (checkpoint_age >= log_sys.log_capacity) { - DBUG_EXECUTE_IF( - "print_all_chkp_warnings", - log_has_printed_chkp_warning = false;); - - if (!log_has_printed_chkp_warning - || difftime(time(NULL), log_last_warning_time) > 15) { - - log_has_printed_chkp_warning = true; - log_last_warning_time = time(NULL); - - ib::error() << "The age of the last checkpoint is " - << checkpoint_age - << ", which exceeds the log capacity " - << log_sys.log_capacity << "."; - } - } - - if (checkpoint_age <= log_sys.max_modified_age_sync || - log_sys.check_flush_or_checkpoint()) { - goto function_exit; - } - - oldest_lsn = log_buf_pool_get_oldest_modification(); - - if (!oldest_lsn - || lsn - oldest_lsn > log_sys.max_modified_age_sync - || checkpoint_age > log_sys.max_checkpoint_age_async) { - log_sys.set_check_flush_or_checkpoint(); - } -function_exit: - - return(lsn); -} - /** Calculate the recommended highest values for lsn - last_checkpoint_lsn and lsn - buf_pool.get_oldest_modification(). @param[in] file_size requested innodb_log_file_size @@ -465,13 +162,7 @@ log_set_capacity(ulonglong file_size) log_sys.log_capacity = smallest_capacity; - log_sys.max_modified_age_async = margin - - margin / LOG_POOL_PREFLUSH_RATIO_ASYNC; - log_sys.max_modified_age_sync = margin - - margin / LOG_POOL_PREFLUSH_RATIO_SYNC; - - log_sys.max_checkpoint_age_async = margin - margin - / LOG_POOL_CHECKPOINT_RATIO_ASYNC; + log_sys.max_modified_age_async = margin - margin / 8; log_sys.max_checkpoint_age = margin; log_mutex_exit(); @@ -518,8 +209,6 @@ void log_t::create() n_log_ios_old= 0; log_capacity= 0; max_modified_age_async= 0; - max_modified_age_sync= 0; - max_checkpoint_age_async= 0; max_checkpoint_age= 0; next_checkpoint_no= 0; next_checkpoint_lsn= 0; @@ -1151,10 +840,7 @@ log_buffer_flush_to_disk( Tries to establish a big enough margin of free space in the log buffer, such that a new log entry can be catenated without an immediate need for a flush. */ -static -void -log_flush_margin(void) -/*==================*/ +ATTRIBUTE_COLD static void log_flush_margin() { lsn_t lsn = 0; @@ -1172,61 +858,9 @@ log_flush_margin(void) } } -/** Advances the smallest lsn for which there are unflushed dirty blocks in the -buffer pool. -NOTE: this function may only be called if the calling thread owns no -synchronization objects! -@param[in] new_oldest try to advance oldest_modified_lsn at least to -this lsn -@return false if there was a flush batch of the same type running, -which means that we could not start this flush batch */ -static bool log_preflush_pool_modified_pages(lsn_t new_oldest) -{ - bool success; - - if (recv_recovery_is_on()) { - /* If the recovery is running, we must first apply all - log records to their respective file pages to get the - right modify lsn values to these pages: otherwise, there - might be pages on disk which are not yet recovered to the - current lsn, and even after calling this function, we could - not know how up-to-date the disk version of the database is, - and we could not make a new checkpoint on the basis of the - info on the buffer pool only. */ - recv_sys.apply(true); - } - - if (new_oldest == LSN_MAX - || !buf_page_cleaner_is_active - || srv_is_being_started) { - - ulint n_pages = buf_flush_lists(ULINT_UNDEFINED, new_oldest); - - buf_flush_wait_batch_end_acquiring_mutex(false); - - MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS); - - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_FLUSH_SYNC_TOTAL_PAGE, - MONITOR_FLUSH_SYNC_COUNT, - MONITOR_FLUSH_SYNC_PAGES, - n_pages); - - const lsn_t oldest = buf_pool.get_oldest_modification(); - success = !oldest || oldest >= new_oldest; - } else { - /* better to wait for flushed by page cleaner */ - buf_flush_wait_flushed(new_oldest); - - success = true; - } - - return(success); -} - /** Write checkpoint info to the log header and invoke log_mutex_exit(). @param[in] end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */ -void log_write_checkpoint_info(lsn_t end_lsn) +ATTRIBUTE_COLD void log_write_checkpoint_info(lsn_t end_lsn) { ut_ad(log_mutex_own()); ut_ad(!srv_read_only_mode); @@ -1296,194 +930,40 @@ void log_write_checkpoint_info(lsn_t end_lsn) log_mutex_exit(); } -/** Make a checkpoint. Note that this function does not flush dirty -blocks from the buffer pool: it only checks what is lsn of the oldest -modification in the pool, and writes information about the lsn in -log file. Use log_make_checkpoint() to flush also the pool. -@return true if success, false if a checkpoint write was already running */ -bool log_checkpoint() -{ - lsn_t oldest_lsn; - - ut_ad(!srv_read_only_mode); - - DBUG_EXECUTE_IF("no_checkpoint", - /* We sleep for a long enough time, forcing - the checkpoint doesn't happen any more. */ - os_thread_sleep(360000000);); - - if (recv_recovery_is_on()) { - recv_sys.apply(true); - } - - switch (srv_file_flush_method) { - case SRV_NOSYNC: - break; - case SRV_O_DSYNC: - case SRV_FSYNC: - case SRV_LITTLESYNC: - case SRV_O_DIRECT: - case SRV_O_DIRECT_NO_FSYNC: -#ifdef _WIN32 - case SRV_ALL_O_DIRECT_FSYNC: -#endif - fil_flush_file_spaces(); - } - - log_mutex_enter(); - - ut_ad(!recv_no_log_write); - oldest_lsn = log_buf_pool_get_oldest_modification(); - - /* Because log also contains headers and dummy log records, - log_buf_pool_get_oldest_modification() will return log_sys.lsn - if the buffer pool contains no dirty buffers. - We must make sure that the log is flushed up to that lsn. - If there are dirty buffers in the buffer pool, then our - write-ahead-logging algorithm ensures that the log has been - flushed up to oldest_lsn. */ - - ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn); - if (oldest_lsn - > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT) { - /* Some log has been written since the previous checkpoint. */ - } else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) { - /* MariaDB startup expects the redo log file to be - logically empty (not even containing a MLOG_CHECKPOINT record) - after a clean shutdown. Perform an extra checkpoint at - shutdown. */ - } else { - /* Do nothing, because nothing was logged (other than - a FILE_CHECKPOINT marker) since the previous checkpoint. */ - log_mutex_exit(); - return(true); - } - /* Repeat the FILE_MODIFY records after the checkpoint, in - case some log records between the checkpoint and log_sys.lsn - need them. Finally, write a FILE_CHECKPOINT marker. Redo log - apply expects to see a FILE_CHECKPOINT after the checkpoint, - except on clean shutdown, where the log will be empty after - the checkpoint. - It is important that we write out the redo log before any - further dirty pages are flushed to the tablespace files. At - this point, because log_mutex_own(), mtr_commit() in other - threads will be blocked, and no pages can be added to the - flush lists. */ - lsn_t flush_lsn = oldest_lsn; - const lsn_t end_lsn = log_sys.get_lsn(); - const bool do_write - = srv_shutdown_state <= SRV_SHUTDOWN_INITIATED - || flush_lsn != end_lsn; - - if (fil_names_clear(flush_lsn, do_write)) { - flush_lsn = log_sys.get_lsn(); - ut_ad(flush_lsn >= end_lsn + SIZE_OF_FILE_CHECKPOINT); - } - - log_mutex_exit(); - - log_write_up_to(flush_lsn, true, true); - - log_mutex_enter(); - - ut_ad(log_sys.get_flushed_lsn() >= flush_lsn); - ut_ad(flush_lsn >= oldest_lsn); - - if (log_sys.last_checkpoint_lsn >= oldest_lsn) { - log_mutex_exit(); - return(true); - } - - if (log_sys.n_pending_checkpoint_writes > 0) { - /* A checkpoint write is running */ - log_mutex_exit(); - - return(false); - } - - log_sys.next_checkpoint_lsn = oldest_lsn; - log_write_checkpoint_info(end_lsn); - ut_ad(!log_mutex_own()); - - return(true); -} - -/** Make a checkpoint */ -void log_make_checkpoint() -{ - /* Preflush pages synchronously */ - - while (!log_preflush_pool_modified_pages(LSN_MAX)) { - /* Flush as much as we can */ - } - - while (!log_checkpoint()) { - /* Force a checkpoint */ - } -} - /****************************************************************//** -Tries to establish a big enough margin of free space in the log groups, such +Tries to establish a big enough margin of free space in the log, such that a new log entry can be catenated without an immediate need for a checkpoint. NOTE: this function may only be called if the calling thread owns no synchronization objects! */ -static -void -log_checkpoint_margin(void) -/*=======================*/ +ATTRIBUTE_COLD static void log_checkpoint_margin() { - ib_uint64_t advance; - bool success; -loop: - advance = 0; - - log_mutex_enter(); - ut_ad(!recv_no_log_write); - - if (!log_sys.check_flush_or_checkpoint()) { - log_mutex_exit(); - return; - } - - const lsn_t oldest_lsn = log_buf_pool_get_oldest_modification(); - const lsn_t lsn = log_sys.get_lsn(); - const lsn_t age = lsn - oldest_lsn; - - if (age > log_sys.max_modified_age_sync) { - - /* A flush is urgent: we have to do a synchronous preflush */ - advance = age - log_sys.max_modified_age_sync; - } - - const lsn_t checkpoint_age = lsn - log_sys.last_checkpoint_lsn; - - ut_ad(log_sys.max_checkpoint_age >= log_sys.max_checkpoint_age_async); - const bool do_checkpoint - = checkpoint_age > log_sys.max_checkpoint_age_async; - - if (checkpoint_age <= log_sys.max_checkpoint_age) { - log_sys.set_check_flush_or_checkpoint(false); - } - - log_mutex_exit(); + while (log_sys.check_flush_or_checkpoint()) + { + log_mutex_enter(); + ut_ad(!recv_no_log_write); - if (advance) { - lsn_t new_oldest = oldest_lsn + advance; + if (!log_sys.check_flush_or_checkpoint()) + { +func_exit: + log_mutex_exit(); + return; + } - success = log_preflush_pool_modified_pages(new_oldest); + const lsn_t lsn= log_sys.get_lsn(); + const lsn_t checkpoint= log_sys.last_checkpoint_lsn; + const lsn_t sync_lsn= checkpoint + log_sys.max_checkpoint_age; + if (lsn <= sync_lsn) + { + log_sys.set_check_flush_or_checkpoint(false); + goto func_exit; + } - /* If the flush succeeded, this thread has done its part - and can proceed. If it did not succeed, there was another - thread doing a flush at the same time. */ - if (!success) { - log_sys.set_check_flush_or_checkpoint(); - goto loop; - } - } + log_mutex_exit(); - if (do_checkpoint) { - log_checkpoint(); - } + /* We must wait to prevent the tail of the log overwriting the head. */ + buf_flush_wait_flushed(std::min(sync_lsn, checkpoint + (1U << 20))); + os_thread_sleep(10000); /* Sleep 10ms to avoid a thundering herd */ + } } /** @@ -1491,7 +971,7 @@ Checks that there is enough free space in the log to start a new query step. Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this function may only be called if the calling thread owns no synchronization objects! */ -void log_check_margins() +ATTRIBUTE_COLD void log_check_margins() { do { @@ -1504,43 +984,8 @@ void log_check_margins() extern void buf_resize_shutdown(); -/** @return the number of dirty pages in the buffer pool */ -static ulint flush_list_length() -{ - mysql_mutex_lock(&buf_pool.flush_list_mutex); - const ulint len= UT_LIST_GET_LEN(buf_pool.flush_list); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - return len; -} - -static void flush_buffer_pool() -{ - service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, - "Waiting to flush the buffer pool"); - while (buf_pool.n_flush_list || flush_list_length()) - { - buf_flush_lists(ULINT_UNDEFINED, LSN_MAX); - timespec abstime; - - if (buf_pool.n_flush_list) - { - service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, - "Waiting to flush " ULINTPF " pages", - flush_list_length()); - set_timespec(abstime, INNODB_EXTEND_TIMEOUT_INTERVAL / 2); - mysql_mutex_lock(&buf_pool.mutex); - while (buf_pool.n_flush_list) - mysql_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex, - &abstime); - mysql_mutex_unlock(&buf_pool.mutex); - } - } - - ut_ad(!buf_pool.any_io_pending()); -} - /** Make a checkpoint at the latest lsn on shutdown. */ -void logs_empty_and_mark_files_at_shutdown() +ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown() { lsn_t lsn; ulint count = 0; @@ -1659,7 +1104,7 @@ wait_suspend_loop: goto loop; } else { - flush_buffer_pool(); + buf_flush_buffer_pool(); } if (log_sys.is_initialised()) { @@ -1777,14 +1222,19 @@ log_print( log_mutex_enter(); + const lsn_t lsn= log_sys.get_lsn(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + const lsn_t pages_flushed = buf_pool.get_oldest_modification(lsn); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + fprintf(file, "Log sequence number " LSN_PF "\n" "Log flushed up to " LSN_PF "\n" "Pages flushed up to " LSN_PF "\n" "Last checkpoint at " LSN_PF "\n", - log_sys.get_lsn(), + lsn, log_sys.get_flushed_lsn(), - log_buf_pool_get_oldest_modification(), + pages_flushed, log_sys.last_checkpoint_lsn); current_time = time(NULL); diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index 17b524f610f..a3a2b8f4f45 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -350,17 +350,6 @@ struct ReleaseBlocks } }; -/** Write the block contents to the REDO log */ -struct mtr_write_log_t { - /** Append a block to the redo log buffer. - @return whether the appending should continue */ - bool operator()(const mtr_buf_t::block_t* block) const - { - log_write_low(block->begin(), block->used()); - return(true); - } -}; - /** Start a mini-transaction. */ void mtr_t::start() { @@ -411,12 +400,12 @@ void mtr_t::commit() { ut_ad(!srv_read_only_mode || m_log_mode == MTR_LOG_NO_REDO); - lsn_t start_lsn; + std::pair<lsn_t,bool> lsns; if (const ulint len= prepare_write()) - start_lsn= finish_write(len); + lsns= finish_write(len); else - start_lsn= m_commit_lsn; + lsns= { m_commit_lsn, false }; if (m_made_dirty) log_flush_order_mutex_enter(); @@ -453,12 +442,18 @@ void mtr_t::commit() } m_memo.for_each_block_in_reverse(CIterate<const ReleaseBlocks> - (ReleaseBlocks(start_lsn, m_commit_lsn, + (ReleaseBlocks(lsns.first, m_commit_lsn, m_memo))); if (m_made_dirty) log_flush_order_mutex_exit(); m_memo.for_each_block_in_reverse(CIterate<ReleaseLatches>()); + + if (lsns.second) + buf_flush_ahead(m_commit_lsn); + + if (m_made_dirty) + srv_stats.log_write_requests.inc(); } else m_memo.for_each_block_in_reverse(CIterate<ReleaseAll>()); @@ -496,6 +491,7 @@ void mtr_t::commit_files(lsn_t checkpoint_lsn) } finish_write(m_log.size()); + srv_stats.log_write_requests.inc(); release_resources(); if (checkpoint_lsn) { @@ -621,6 +617,200 @@ mtr_t::release_page(const void* ptr, mtr_memo_type_t type) ut_ad(0); } +static bool log_margin_warned; +static time_t log_margin_warn_time; +static bool log_close_warned; +static time_t log_close_warn_time; + +/** Check margin not to overwrite transaction log from the last checkpoint. +If would estimate the log write to exceed the log_capacity, +waits for the checkpoint is done enough. +@param len length of the data to be written */ +static void log_margin_checkpoint_age(ulint len) +{ + const ulint framing_size= log_sys.framing_size(); + /* actual length stored per block */ + const ulint len_per_blk= OS_FILE_LOG_BLOCK_SIZE - framing_size; + + /* actual data length in last block already written */ + ulint extra_len= log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE; + + ut_ad(extra_len >= LOG_BLOCK_HDR_SIZE); + extra_len-= LOG_BLOCK_HDR_SIZE; + + /* total extra length for block header and trailer */ + extra_len= ((len + extra_len) / len_per_blk) * framing_size; + + const ulint margin= len + extra_len; + + ut_ad(log_mutex_own()); + + const lsn_t lsn= log_sys.get_lsn(); + + if (UNIV_UNLIKELY(margin > log_sys.log_capacity)) + { + time_t t= time(nullptr); + + /* return with warning output to avoid deadlock */ + if (!log_margin_warned || difftime(t, log_margin_warn_time) > 15) + { + log_margin_warned= true; + log_margin_warn_time= t; + + ib::error() << "innodb_log_file_size is too small " + "for mini-transaction size " << len; + } + } + else if (UNIV_LIKELY(lsn + margin <= log_sys.last_checkpoint_lsn + + log_sys.log_capacity)) + return; + + log_sys.set_check_flush_or_checkpoint(); +} + + +/** Open the log for log_write_low(). The log must be closed with log_close(). +@param len length of the data to be written +@return start lsn of the log record */ +static lsn_t log_reserve_and_open(size_t len) +{ + for (ut_d(ulint count= 0);;) + { + ut_ad(log_mutex_own()); + + /* Calculate an upper limit for the space the string may take in + the log buffer */ + + size_t len_upper_limit= (4 * OS_FILE_LOG_BLOCK_SIZE) + + srv_log_write_ahead_size + (5 * len) / 4; + + if (log_sys.buf_free + len_upper_limit <= srv_log_buffer_size) + break; + + log_mutex_exit(); + DEBUG_SYNC_C("log_buf_size_exceeded"); + + /* Not enough free space, do a write of the log buffer */ + log_sys.initiate_write(false); + + srv_stats.log_waits.inc(); + + ut_ad(++count < 50); + + log_mutex_enter(); + } + + return log_sys.get_lsn(); +} + +/** Append data to the log buffer. */ +static void log_write_low(const void *str, size_t size) +{ + ut_ad(log_mutex_own()); + const ulint trailer_offset= log_sys.trailer_offset(); + + do + { + /* Calculate a part length */ + size_t len= size; + size_t data_len= (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE) + size; + + if (data_len > trailer_offset) + { + data_len= trailer_offset; + len= trailer_offset - log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE; + } + + memcpy(log_sys.buf + log_sys.buf_free, str, len); + + size-= len; + str= static_cast<const char*>(str) + len; + + byte *log_block= static_cast<byte*>(ut_align_down(log_sys.buf + + log_sys.buf_free, + OS_FILE_LOG_BLOCK_SIZE)); + + log_block_set_data_len(log_block, data_len); + lsn_t lsn= log_sys.get_lsn(); + + if (data_len == trailer_offset) + { + /* This block became full */ + log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE); + log_block_set_checkpoint_no(log_block, log_sys.next_checkpoint_no); + len+= log_sys.framing_size(); + lsn+= len; + /* Initialize the next block header */ + log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE, lsn); + } + else + lsn+= len; + + log_sys.set_lsn(lsn); + log_sys.buf_free+= len; + + ut_ad(log_sys.buf_free <= size_t{srv_log_buffer_size}); + } + while (size); +} + +/** Close the log at mini-transaction commit. +@return whether buffer pool flushing is needed */ +static bool log_close(lsn_t lsn) +{ + ut_ad(log_mutex_own()); + ut_ad(lsn == log_sys.get_lsn()); + + byte *log_block= static_cast<byte*>(ut_align_down(log_sys.buf + + log_sys.buf_free, + OS_FILE_LOG_BLOCK_SIZE)); + + if (!log_block_get_first_rec_group(log_block)) + { + /* We initialized a new log block which was not written + full by the current mtr: the next mtr log record group + will start within this block at the offset data_len */ + log_block_set_first_rec_group(log_block, + log_block_get_data_len(log_block)); + } + + if (log_sys.buf_free > log_sys.max_buf_free) + log_sys.set_check_flush_or_checkpoint(); + + const lsn_t checkpoint_age= lsn - log_sys.last_checkpoint_lsn; + + if (UNIV_UNLIKELY(checkpoint_age >= log_sys.log_capacity)) + { + time_t t= time(nullptr); + if (!log_close_warned || difftime(t, log_close_warn_time) > 15) + { + log_close_warned= true; + log_close_warn_time= t; + + ib::error() << "The age of the last checkpoint is " << checkpoint_age + << ", which exceeds the log capacity " + << log_sys.log_capacity << "."; + } + } + else if (UNIV_LIKELY(checkpoint_age <= log_sys.max_checkpoint_age)) + return false; + + log_sys.set_check_flush_or_checkpoint(); + return true; +} + +/** Write the block contents to the REDO log */ +struct mtr_write_log +{ + /** Append a block to the redo log buffer. + @return whether the appending should continue */ + bool operator()(const mtr_buf_t::block_t *block) const + { + log_write_low(block->begin(), block->used()); + return true; + } +}; + /** Prepare to write the mini-transaction log to the redo log buffer. @return number of bytes to write in finish_write() */ inline ulint mtr_t::prepare_write() @@ -668,10 +858,10 @@ inline ulint mtr_t::prepare_write() return(len); } -/** Append the redo log records to the redo log buffer -@param[in] len number of bytes to write -@return start_lsn */ -inline lsn_t mtr_t::finish_write(ulint len) +/** Append the redo log records to the redo log buffer. +@param len number of bytes to write +@return {start_lsn,flush_ahead_lsn} */ +inline std::pair<lsn_t,bool> mtr_t::finish_write(ulint len) { ut_ad(m_log_mode == MTR_LOG_ALL); ut_ad(log_mutex_own()); @@ -688,18 +878,19 @@ inline lsn_t mtr_t::finish_write(ulint len) &start_lsn); if (m_commit_lsn) { - return start_lsn; + return std::make_pair(start_lsn, false); } } /* Open the database log for log_write_low */ start_lsn = log_reserve_and_open(len); - mtr_write_log_t write_log; + mtr_write_log write_log; m_log.for_each_block(write_log); + m_commit_lsn = log_sys.get_lsn(); + bool flush = log_close(m_commit_lsn); - m_commit_lsn = log_close(); - return start_lsn; + return std::make_pair(start_lsn, flush); } /** Find buffer fix count of the given block acquired by the diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc index 81ab97daac9..2af29fded26 100644 --- a/storage/innobase/srv/srv0mon.cc +++ b/storage/innobase/srv/srv0mon.cc @@ -381,36 +381,16 @@ static monitor_info_t innodb_counter_info[] = MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_FLUSH_N_TO_FLUSH_BY_AGE}, - {"buffer_flush_adaptive_avg_time_slot", "buffer", - "Avg time (ms) spent for adaptive flushing recently per slot.", + {"buffer_flush_adaptive_avg_time", "buffer", + "Avg time (ms) spent for adaptive flushing recently.", MONITOR_NONE, - MONITOR_DEFAULT_START, MONITOR_FLUSH_ADAPTIVE_AVG_TIME_SLOT}, - - {"buffer_flush_adaptive_avg_time_thread", "buffer", - "Avg time (ms) spent for adaptive flushing recently per thread.", - MONITOR_NONE, - MONITOR_DEFAULT_START, MONITOR_FLUSH_ADAPTIVE_AVG_TIME_THREAD}, - - {"buffer_flush_adaptive_avg_time_est", "buffer", - "Estimated time (ms) spent for adaptive flushing recently.", - MONITOR_NONE, - MONITOR_DEFAULT_START, MONITOR_FLUSH_ADAPTIVE_AVG_TIME_EST}, - - {"buffer_flush_avg_time", "buffer", - "Avg time (ms) spent for flushing recently.", - MONITOR_NONE, - MONITOR_DEFAULT_START, MONITOR_FLUSH_AVG_TIME}, + MONITOR_DEFAULT_START, MONITOR_FLUSH_ADAPTIVE_AVG_TIME}, {"buffer_flush_adaptive_avg_pass", "buffer", "Number of adaptive flushes passed during the recent Avg period.", MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_FLUSH_ADAPTIVE_AVG_PASS}, - {"buffer_flush_avg_pass", "buffer", - "Number of flushes passed during the recent Avg period.", - MONITOR_NONE, - MONITOR_DEFAULT_START, MONITOR_FLUSH_AVG_PASS}, - {"buffer_LRU_get_free_loops", "buffer", "Total loops in LRU get free.", MONITOR_NONE, @@ -868,12 +848,6 @@ static monitor_info_t innodb_counter_info[] = MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT), MONITOR_DEFAULT_START, MONITOR_OVLD_MAX_AGE_ASYNC}, - {"log_max_modified_age_sync", "recovery", - "Maximum LSN difference; when exceeded, start synchronous preflush", - static_cast<monitor_type_t>( - MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT), - MONITOR_DEFAULT_START, MONITOR_OVLD_MAX_AGE_SYNC}, - {"log_pending_log_flushes", "recovery", "Pending log flushes", static_cast<monitor_type_t>( MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT), @@ -1172,11 +1146,6 @@ static monitor_info_t innodb_counter_info[] = MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_SRV_DICT_LRU_EVICT_COUNT_IDLE}, - {"innodb_checkpoint_usec", "server", - "Time (in microseconds) spent by master thread to do checkpoint", - MONITOR_NONE, - MONITOR_DEFAULT_START, MONITOR_SRV_CHECKPOINT_MICROSECOND}, - {"innodb_dblwr_writes", "server", "Number of doublewrite operations that have been performed" " (innodb_dblwr_writes)", @@ -1956,7 +1925,9 @@ srv_mon_process_existing_counter( break; case MONITOR_OVLD_BUF_OLDEST_LSN: - value = (mon_type_t) buf_pool.get_oldest_modification(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + value = (mon_type_t) buf_pool.get_oldest_modification(0); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); break; case MONITOR_OVLD_LSN_CHECKPOINT: @@ -1967,10 +1938,6 @@ srv_mon_process_existing_counter( value = log_sys.max_modified_age_async; break; - case MONITOR_OVLD_MAX_AGE_SYNC: - value = log_sys.max_modified_age_sync; - break; - #ifdef BTR_CUR_HASH_ADAPT case MONITOR_OVLD_ADAPTIVE_HASH_SEARCH: value = btr_cur_n_sea; diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index 6337428e450..44d33126e48 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -1553,11 +1553,10 @@ srv_get_activity_count(void) return(srv_sys.activity_count); } -/** Check if there has been any activity. -@param[in,out] activity_count recent activity count to be returned -if there is a change -@return FALSE if no change in activity counter. */ -bool srv_check_activity(ulint *activity_count) +/** Check if srv_inc_activity_count() has been called. +@param activity_count copy of srv_sys.activity_count +@return whether the activity_count had changed */ +static bool srv_check_activity(ulint *activity_count) { ulint new_activity_count= srv_sys.activity_count; if (new_activity_count != *activity_count) @@ -1757,28 +1756,6 @@ srv_master_do_active_tasks(void) MONITOR_INC_TIME_IN_MICRO_SECS( MONITOR_SRV_DICT_LRU_MICROSECOND, counter_time); } - - /* The periodic log_checkpoint() call here makes it harder to - reproduce bugs in crash recovery or mariabackup --prepare, or - in code that writes the redo log records. Omitting the call - here should not affect correctness, because log_free_check() - should still be invoking checkpoints when needed. In a - production server, those calls could cause "furious flushing" - and stall the server. Normally we want to perform checkpoints - early and often to avoid those situations. */ - DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", return;); - - if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) { - return; - } - - /* Make a new checkpoint */ - if (cur_time % SRV_MASTER_CHECKPOINT_INTERVAL == 0) { - srv_main_thread_op_info = "making checkpoint"; - log_checkpoint(); - MONITOR_INC_TIME_IN_MICRO_SECS( - MONITOR_SRV_CHECKPOINT_MICROSECOND, counter_time); - } } /*********************************************************************//** @@ -1837,26 +1814,6 @@ srv_master_do_idle_tasks(void) srv_sync_log_buffer_in_background(); MONITOR_INC_TIME_IN_MICRO_SECS( MONITOR_SRV_LOG_FLUSH_MICROSECOND, counter_time); - - /* The periodic log_checkpoint() call here makes it harder to - reproduce bugs in crash recovery or mariabackup --prepare, or - in code that writes the redo log records. Omitting the call - here should not affect correctness, because log_free_check() - should still be invoking checkpoints when needed. In a - production server, those calls could cause "furious flushing" - and stall the server. Normally we want to perform checkpoints - early and often to avoid those situations. */ - DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", return;); - - if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) { - return; - } - - /* Make a new checkpoint */ - srv_main_thread_op_info = "making checkpoint"; - log_checkpoint(); - MONITOR_INC_TIME_IN_MICRO_SECS(MONITOR_SRV_CHECKPOINT_MICROSECOND, - counter_time); } /** diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index f7fd1a3cec0..1746d351263 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -327,6 +327,7 @@ static dberr_t create_log_file(lsn_t lsn, std::string& logfile0) log_mutex_exit(); log_make_checkpoint(); + log_write_up_to(LSN_MAX, true); return DB_SUCCESS; } @@ -1297,6 +1298,7 @@ dberr_t srv_start(bool create_new_db) } std::string logfile0; + bool create_new_log = create_new_db; if (create_new_db) { flushed_lsn = log_sys.get_lsn(); log_sys.set_flushed_lsn(flushed_lsn); @@ -1318,7 +1320,8 @@ dberr_t srv_start(bool create_new_db) return srv_init_abort(err); } - if (srv_log_file_size == 0) { + create_new_log = srv_log_file_size == 0; + if (create_new_log) { if (flushed_lsn < lsn_t(1000)) { ib::error() << "Cannot create log file because" @@ -1433,10 +1436,17 @@ file_checked: return(srv_init_abort(err)); } } else { + /* Suppress warnings in fil_space_t::create() for files + that are being read before dict_boot() has recovered + DICT_HDR_MAX_SPACE_ID. */ + fil_system.space_id_reuse_warned = true; + /* We always try to do a recovery, even if the database had been shut down normally: this is the normal startup path */ - err = recv_recovery_from_checkpoint_start(flushed_lsn); + err = create_new_log + ? DB_SUCCESS + : recv_recovery_from_checkpoint_start(flushed_lsn); recv_sys.close_files(); recv_sys.dblwr.pages.clear(); @@ -1492,6 +1502,8 @@ file_checked: } } + fil_system.space_id_reuse_warned = false; + if (!srv_read_only_mode) { const ulint flags = FSP_FLAGS_PAGE_SSIZE(); for (ulint id = 0; id <= srv_undo_tablespaces; id++) { diff --git a/storage/rocksdb/mysql-test/rocksdb/r/innodb_i_s_tables_disabled.result b/storage/rocksdb/mysql-test/rocksdb/r/innodb_i_s_tables_disabled.result index d4623c2f054..ac6e2bcc633 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/innodb_i_s_tables_disabled.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/innodb_i_s_tables_disabled.result @@ -71,12 +71,8 @@ buffer_flush_neighbor buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NU buffer_flush_neighbor_pages buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 set_member Pages queued as a neighbor batch buffer_flush_n_to_flush_requested buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of pages requested for flushing. buffer_flush_n_to_flush_by_age buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of pages target by LSN Age for flushing. -buffer_flush_adaptive_avg_time_slot buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Avg time (ms) spent for adaptive flushing recently per slot. -buffer_flush_adaptive_avg_time_thread buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Avg time (ms) spent for adaptive flushing recently per thread. -buffer_flush_adaptive_avg_time_est buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Estimated time (ms) spent for adaptive flushing recently. -buffer_flush_avg_time buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Avg time (ms) spent for flushing recently. +buffer_flush_adaptive_avg_time buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Avg time (ms) spent for adaptive flushing recently. buffer_flush_adaptive_avg_pass buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of adaptive flushes passed during the recent Avg period. -buffer_flush_avg_pass buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of flushes passed during the recent Avg period. buffer_LRU_get_free_loops buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Total loops in LRU get free. buffer_LRU_get_free_waits buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Total sleep waits in LRU get free. buffer_flush_avg_page_rate buffer 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Average number of pages at which flushing is happening @@ -176,7 +172,6 @@ log_lsn_current recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 log_lsn_checkpoint_age recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Current LSN value minus LSN at last checkpoint log_lsn_buf_pool_oldest recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value The oldest modified block LSN in the buffer pool log_max_modified_age_async recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Maximum LSN difference; when exceeded, start asynchronous preflush -log_max_modified_age_sync recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Maximum LSN difference; when exceeded, start synchronous preflush log_pending_log_flushes recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Pending log flushes log_pending_checkpoint_writes recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Pending checkpoints log_num_log_io recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Number of log I/Os @@ -227,7 +222,6 @@ innodb_log_flush_usec server 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NU innodb_dict_lru_usec server 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Time (in microseconds) spent to process DICT LRU list innodb_dict_lru_count_active server 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of tables evicted from DICT LRU list in the active loop innodb_dict_lru_count_idle server 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of tables evicted from DICT LRU list in the idle loop -innodb_checkpoint_usec server 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Time (in microseconds) spent by master thread to do checkpoint innodb_dblwr_writes server 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status_counter Number of doublewrite operations that have been performed (innodb_dblwr_writes) innodb_dblwr_pages_written server 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status_counter Number of pages that have been written for doublewrite operations (innodb_dblwr_pages_written) innodb_page_size server 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value InnoDB page size in bytes (innodb_page_size) |