summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Marko Mäkelä <marko.makela@mariadb.com> 2022-12-14 14:43:32 +0200
committer Marko Mäkelä <marko.makela@mariadb.com> 2023-01-11 17:55:56 +0200
commit 24648768b443f6adeb8a0f4302958bfb300d536f (patch)
tree a2fdb83f0ba5d45b120929dfc135736fce768eb6
parent e581396b7aea94485580d2c9edaa9c5df647f2b7 (diff)
download mariadb-git-24648768b443f6adeb8a0f4302958bfb300d536f.tar.gz
MDEV-30136: Deprecate innodb_flush_method
We introduce the following settable Boolean global variables:

innodb_log_file_write_through: Whether writes to ib_logfile0 are write-through (disabling any caching, as in O_SYNC or O_DSYNC).

innodb_data_file_write_through: Whether writes to any InnoDB data files (including the temporary tablespace) are write-through.

innodb_data_file_buffering: Whether the file system cache is enabled for InnoDB data files.

All these parameters are OFF by default, that is, the file system cache will be disabled, but any hardware caching is enabled, that is, explicit calls to fsync(), fdatasync() or similar functions are needed.

On systems that support FUA it may make sense to enable write-through, to avoid extra system calls.

If the deprecated read-only start-up parameter innodb_flush_method is set to one of the following values, then the values of the 4 Boolean flags (the above 3 plus innodb_log_file_buffering) will be set as follows:

O_DSYNC: innodb_log_file_write_through=ON, innodb_data_file_write_through=ON, innodb_data_file_buffering=OFF, and (if supported) innodb_log_file_buffering=OFF.

fsync, littlesync, nosync, or (Microsoft Windows specific) normal: innodb_log_file_write_through=OFF, innodb_data_file_write_through=OFF, and innodb_data_file_buffering=ON.

Note: fsync() or fdatasync() will only be disabled if the separate parameter debug_no_sync (in the code, my_disable_sync) is set.

In mariadb-backup, the parameter innodb_flush_method will be ignored.

The Boolean parameters can be modified by SET GLOBAL while the server is running. This will require reopening the ib_logfile0 or all currently open InnoDB data files.

We will open files straight in O_DSYNC or O_SYNC mode when applicable. We will try to open data files straight in O_DIRECT mode when the page size is at least 4096 bytes. For atomically creating data files, we will invoke os_file_set_nocache() to enable O_DIRECT afterwards, because O_DIRECT is not supported on some file systems. We will also continue to invoke os_file_set_nocache() on ib_logfile0 when innodb_log_file_buffering=OFF can be fulfilled.

For reopening the ib_logfile0, we use the same logic that was developed for online log resizing and reused for updates of innodb_log_file_buffering.

Reopening all data files is implemented in the new function fil_space_t::reopen_all().

Reviewed by: Vladislav Vaintroub
Tested by: Matthias Leich
-rw-r--r-- extra/mariabackup/fil_cur.cc 6
-rw-r--r-- extra/mariabackup/xtrabackup.cc 26
-rw-r--r-- mysql-test/lib/My/Debugger.pm 2
-rw-r--r-- mysql-test/suite/sys_vars/r/sysvars_innodb.result 36
-rw-r--r-- storage/innobase/buf/buf0flu.cc 20
-rw-r--r-- storage/innobase/fil/fil0fil.cc 139
-rw-r--r-- storage/innobase/handler/ha_innodb.cc 115
-rw-r--r-- storage/innobase/include/fil0fil.h 52
-rw-r--r-- storage/innobase/include/log0log.h 4
-rw-r--r-- storage/innobase/log/log0log.cc 27
-rw-r--r-- storage/innobase/os/os0file.cc 198
-rw-r--r-- storage/innobase/srv/srv0srv.cc 3
-rw-r--r-- storage/innobase/trx/trx0trx.cc 2
13 files changed, 379 insertions, 251 deletions
diff --git a/extra/mariabackup/fil_cur.cc b/extra/mariabackup/fil_cur.cc
index e0a4711a2aa..2932fa6d5a6 100644
--- a/extra/mariabackup/fil_cur.cc
+++ b/extra/mariabackup/fil_cur.cc
@@ -199,12 +199,6 @@ xb_fil_cur_open(
return(XB_FIL_CUR_SKIP);
}
- if (srv_file_flush_method == SRV_O_DIRECT
- || srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC) {
-
- os_file_set_nocache(cursor->file, node->name, "OPEN");
- }
-
posix_fadvise(cursor->file, 0, 0, POSIX_FADV_SEQUENTIAL);
cursor->page_size = node->space->physical_size();
diff --git a/extra/mariabackup/xtrabackup.cc b/extra/mariabackup/xtrabackup.cc
index 67560ec03aa..27bb10bf82e 100644
--- a/extra/mariabackup/xtrabackup.cc
+++ b/extra/mariabackup/xtrabackup.cc
@@ -311,6 +311,8 @@ extern const char *innodb_checksum_algorithm_names[];
extern TYPELIB innodb_checksum_algorithm_typelib;
extern const char *innodb_flush_method_names[];
extern TYPELIB innodb_flush_method_typelib;
+/** Ignored option */
+static ulong innodb_flush_method;
static const char *binlog_info_values[] = {"off", "lockless", "on", "auto",
NullS};
@@ -1032,6 +1034,8 @@ enum options_xtrabackup
#if defined __linux__ || defined _WIN32
OPT_INNODB_LOG_FILE_BUFFERING,
#endif
+ OPT_INNODB_DATA_FILE_BUFFERING,
+ OPT_INNODB_DATA_FILE_WRITE_THROUGH,
OPT_INNODB_LOG_FILE_SIZE,
OPT_INNODB_OPEN_FILES,
OPT_XTRA_DEBUG_SYNC,
@@ -1583,10 +1587,10 @@ struct my_option xb_server_options[] =
FALSE, 0, 0, 0, 0, 0},
{"innodb_flush_method", OPT_INNODB_FLUSH_METHOD,
- "With which method to flush data.",
- &srv_file_flush_method, &srv_file_flush_method,
+ "Ignored parameter with no effect",
+ &innodb_flush_method, &innodb_flush_method,
&innodb_flush_method_typelib, GET_ENUM, REQUIRED_ARG,
- IF_WIN(SRV_ALL_O_DIRECT_FSYNC, SRV_O_DIRECT), 0, 0, 0, 0, 0},
+ 4/* O_DIRECT */, 0, 0, 0, 0, 0},
{"innodb_log_buffer_size", OPT_INNODB_LOG_BUFFER_SIZE,
"Redo log buffer size in bytes.",
@@ -1600,6 +1604,16 @@ struct my_option xb_server_options[] =
(G_PTR*) &log_sys.log_buffered, 0, GET_BOOL, NO_ARG,
TRUE, 0, 0, 0, 0, 0},
#endif
+ {"innodb_data_file_buffering", OPT_INNODB_DATA_FILE_BUFFERING,
+ "Whether the file system cache for data files is enabled during --backup",
+ (G_PTR*) &fil_system.buffered,
+ (G_PTR*) &fil_system.buffered, 0, GET_BOOL, NO_ARG,
+ FALSE, 0, 0, 0, 0, 0},
+ {"innodb_data_file_write_through", OPT_INNODB_DATA_FILE_WRITE_THROUGH,
+ "Whether each write to data files writes through",
+ (G_PTR*) &fil_system.write_through,
+ (G_PTR*) &fil_system.write_through, 0, GET_BOOL, NO_ARG,
+ FALSE, 0, 0, 0, 0, 0},
{"innodb_log_file_size", OPT_INNODB_LOG_FILE_SIZE,
"Ignored for mysqld option compatibility",
(G_PTR*) &srv_log_file_size, (G_PTR*) &srv_log_file_size, 0,
@@ -1917,12 +1931,6 @@ xb_get_one_option(const struct my_option *opt,
ADD_PRINT_PARAM_OPT(srv_log_group_home_dir);
break;
- case OPT_INNODB_FLUSH_METHOD:
- ut_a(srv_file_flush_method
- <= IF_WIN(SRV_ALL_O_DIRECT_FSYNC, SRV_O_DIRECT_NO_FSYNC));
- ADD_PRINT_PARAM_OPT(innodb_flush_method_names[srv_file_flush_method]);
- break;
-
case OPT_INNODB_PAGE_SIZE:
ADD_PRINT_PARAM_OPT(innobase_page_size);
diff --git a/mysql-test/lib/My/Debugger.pm b/mysql-test/lib/My/Debugger.pm
index c2062c2eaba..412c028cfc5 100644
--- a/mysql-test/lib/My/Debugger.pm
+++ b/mysql-test/lib/My/Debugger.pm
@@ -78,7 +78,7 @@ my %debuggers = (
options => '-f -o {log} {exe} {args}',
},
rr => {
- options => '_RR_TRACE_DIR={log} rr record {exe} {args} --loose-skip-innodb-use-native-aio --loose-innodb-flush-method=fsync',
+ options => '_RR_TRACE_DIR={log} rr record {exe} {args}',
run => 'env',
pre => sub {
::mtr_error('rr requires kernel.perf_event_paranoid <= 1')
diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb.result b/mysql-test/suite/sys_vars/r/sysvars_innodb.result
index 6eface8c097..998d82587b6 100644
--- a/mysql-test/suite/sys_vars/r/sysvars_innodb.result
+++ b/mysql-test/suite/sys_vars/r/sysvars_innodb.result
@@ -355,6 +355,18 @@ NUMERIC_BLOCK_SIZE 0
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT OPTIONAL
+VARIABLE_NAME INNODB_DATA_FILE_BUFFERING
+SESSION_VALUE NULL
+DEFAULT_VALUE OFF
+VARIABLE_SCOPE GLOBAL
+VARIABLE_TYPE BOOLEAN
+VARIABLE_COMMENT Whether the file system cache for data files is enabled
+NUMERIC_MIN_VALUE NULL
+NUMERIC_MAX_VALUE NULL
+NUMERIC_BLOCK_SIZE NULL
+ENUM_VALUE_LIST OFF,ON
+READ_ONLY NO
+COMMAND_LINE_ARGUMENT OPTIONAL
VARIABLE_NAME INNODB_DATA_FILE_PATH
SESSION_VALUE NULL
DEFAULT_VALUE ibdata1:12M:autoextend
@@ -379,6 +391,18 @@ NUMERIC_BLOCK_SIZE 0
ENUM_VALUE_LIST NULL
READ_ONLY YES
COMMAND_LINE_ARGUMENT REQUIRED
+VARIABLE_NAME INNODB_DATA_FILE_WRITE_THROUGH
+SESSION_VALUE NULL
+DEFAULT_VALUE OFF
+VARIABLE_SCOPE GLOBAL
+VARIABLE_TYPE BOOLEAN
+VARIABLE_COMMENT Whether each write to data files writes through
+NUMERIC_MIN_VALUE NULL
+NUMERIC_MAX_VALUE NULL
+NUMERIC_BLOCK_SIZE NULL
+ENUM_VALUE_LIST OFF,ON
+READ_ONLY NO
+COMMAND_LINE_ARGUMENT OPTIONAL
VARIABLE_NAME INNODB_DATA_HOME_DIR
SESSION_VALUE NULL
DEFAULT_VALUE
@@ -1015,6 +1039,18 @@ NUMERIC_BLOCK_SIZE 4096
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
+VARIABLE_NAME INNODB_LOG_FILE_WRITE_THROUGH
+SESSION_VALUE NULL
+DEFAULT_VALUE OFF
+VARIABLE_SCOPE GLOBAL
+VARIABLE_TYPE BOOLEAN
+VARIABLE_COMMENT Whether each write to ib_logfile0 is write through
+NUMERIC_MIN_VALUE NULL
+NUMERIC_MAX_VALUE NULL
+NUMERIC_BLOCK_SIZE NULL
+ENUM_VALUE_LIST OFF,ON
+READ_ONLY NO
+COMMAND_LINE_ARGUMENT OPTIONAL
VARIABLE_NAME INNODB_LOG_GROUP_HOME_DIR
SESSION_VALUE NULL
DEFAULT_VALUE
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
index 6bf03d3e72a..d71cbfbf743 100644
--- a/storage/innobase/buf/buf0flu.cc
+++ b/storage/innobase/buf/buf0flu.cc
@@ -1724,7 +1724,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
resize_log.write(CHECKPOINT_1, {c, get_block_size()});
}
- if (srv_file_flush_method != SRV_O_DSYNC)
+ if (!log_write_through)
ut_a(log.flush());
latch.wr_lock(SRW_LOCK_CALL);
ut_ad(checkpoint_pending);
@@ -1756,7 +1756,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
if (!is_pmem())
{
- if (srv_file_flush_method != SRV_O_DSYNC)
+ if (!log_write_through)
ut_a(resize_log.flush());
IF_WIN(log.close(),);
}
@@ -1902,13 +1902,7 @@ static bool log_checkpoint()
if (recv_recovery_is_on())
recv_sys.apply(true);
- switch (srv_file_flush_method) {
- case SRV_NOSYNC:
- case SRV_O_DIRECT_NO_FSYNC:
- break;
- default:
- fil_flush_file_spaces();
- }
+ fil_flush_file_spaces();
log_sys.latch.wr_lock(SRW_LOCK_CALL);
const lsn_t end_lsn= log_sys.get_lsn();
@@ -2060,13 +2054,7 @@ ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
MONITOR_FLUSH_SYNC_PAGES, n_flushed);
}
- switch (srv_file_flush_method) {
- case SRV_NOSYNC:
- case SRV_O_DIRECT_NO_FSYNC:
- break;
- default:
- fil_flush_file_spaces();
- }
+ fil_flush_file_spaces();
log_sys.latch.wr_lock(SRW_LOCK_CALL);
const lsn_t newest_lsn= log_sys.get_lsn();
diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc
index 481a2dbce53..cecda94cac4 100644
--- a/storage/innobase/fil/fil0fil.cc
+++ b/storage/innobase/fil/fil0fil.cc
@@ -499,6 +499,9 @@ void fil_space_t::flush_low()
break;
}
+ if (fil_system.is_write_through())
+ goto skip_flush;
+
fil_n_pending_tablespace_flushes++;
for (fil_node_t *node= UT_LIST_GET_FIRST(chain); node;
node= UT_LIST_GET_NEXT(chain, node))
@@ -523,8 +526,9 @@ void fil_space_t::flush_low()
mysql_mutex_unlock(&fil_system.mutex);
}
- clear_flush();
fil_n_pending_tablespace_flushes--;
+skip_flush:
+ clear_flush();
}
/** Try to extend a tablespace.
@@ -753,7 +757,6 @@ inline pfs_os_file_t fil_node_t::close_to_free(bool detach_handle)
{
if (space->is_in_unflushed_spaces)
{
- ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC);
space->is_in_unflushed_spaces= false;
fil_system.unflushed_spaces.remove(*space);
}
@@ -786,7 +789,6 @@ pfs_os_file_t fil_system_t::detach(fil_space_t *space, bool detach_handle)
if (space->is_in_unflushed_spaces)
{
- ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC);
space->is_in_unflushed_spaces= false;
unflushed_spaces.remove(*space);
}
@@ -1320,6 +1322,120 @@ ATTRIBUTE_COLD void fil_system_t::extend_to_recv_size()
mysql_mutex_unlock(&mutex);
}
+ATTRIBUTE_COLD void fil_space_t::reopen_all()
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ fil_system.freeze_space_list++;
+
+ for (fil_space_t &space : fil_system.space_list)
+ {
+ for (fil_node_t *node= UT_LIST_GET_FIRST(space.chain); node;
+ node= UT_LIST_GET_NEXT(chain, node))
+ if (node->is_open())
+ goto need_to_close;
+ continue;
+
+ need_to_close:
+ uint32_t p= space.n_pending.fetch_or(CLOSING, std::memory_order_acquire);
+ if (p & (STOPPING | CLOSING))
+ continue;
+
+ for (fil_node_t *node= UT_LIST_GET_FIRST(space.chain); node;
+ node= UT_LIST_GET_NEXT(chain, node))
+ {
+ if (!node->is_open())
+ continue;
+
+ ulint type= OS_DATA_FILE;
+
+ switch (FSP_FLAGS_GET_ZIP_SSIZE(space.flags)) {
+ case 1: case 2:
+ type= OS_DATA_FILE_NO_O_DIRECT;
+ }
+
+ for (ulint count= 10000; count--;)
+ {
+ p= space.pending();
+
+ if (!(p & CLOSING) || (p & STOPPING))
+ break;
+
+ if (!(p & PENDING) && !node->being_extended)
+ {
+ space.reacquire();
+ mysql_mutex_unlock(&fil_system.mutex);
+ /* Unconditionally flush the file, because
+ fil_system.write_through was updated prematurely,
+ potentially causing some flushes to be lost. */
+ os_file_flush(node->handle);
+ mysql_mutex_lock(&fil_system.mutex);
+ p= space.n_pending.fetch_sub(1, std::memory_order_relaxed) - 1;
+
+ if (!(p & CLOSING) || (p & STOPPING))
+ break;
+
+ if (!(p & PENDING) && !node->being_extended)
+ {
+ ut_a(os_file_close(node->handle));
+ bool success;
+ node->handle= os_file_create(innodb_data_file_key, node->name,
+ node->is_raw_disk
+ ? OS_FILE_OPEN_RAW : OS_FILE_OPEN,
+ OS_FILE_AIO, type,
+ srv_read_only_mode, &success);
+ ut_a(success);
+ goto next_file;
+ }
+ }
+
+ space.reacquire();
+ mysql_mutex_unlock(&fil_system.mutex);
+ std::this_thread::sleep_for(std::chrono::microseconds(100));
+ mysql_mutex_lock(&fil_system.mutex);
+ space.release();
+
+ if (!node->is_open())
+ goto next_file;
+ }
+
+ if (!(p & CLOSING) || (p & STOPPING))
+ next_file:
+ continue;
+
+ sql_print_error("InnoDB: Failed to reopen file '%s' due to " UINT32PF
+ " operations", node->name, p & PENDING);
+ }
+ }
+
+ fil_system.freeze_space_list--;
+}
+
+void fil_system_t::set_write_through(bool write_through)
+{
+ mysql_mutex_lock(&mutex);
+
+ if (write_through != this->write_through)
+ {
+ this->write_through= write_through;
+ fil_space_t::reopen_all();
+ }
+
+ mysql_mutex_unlock(&mutex);
+}
+
+void fil_system_t::set_buffered(bool buffered)
+{
+ mysql_mutex_lock(&mutex);
+
+ if (buffered != this->buffered)
+ {
+ this->buffered= buffered;
+ fil_space_t::reopen_all();
+ }
+
+ mysql_mutex_unlock(&mutex);
+}
+
/** Close all tablespace files at shutdown */
void fil_space_t::close_all()
{
@@ -1340,12 +1456,9 @@ void fil_space_t::close_all()
for (fil_node_t *node= UT_LIST_GET_FIRST(space.chain); node != NULL;
node= UT_LIST_GET_NEXT(chain, node))
{
-
if (!node->is_open())
- {
next:
continue;
- }
for (ulint count= 10000; count--;)
{
@@ -1361,8 +1474,8 @@ void fil_space_t::close_all()
goto next;
}
- ib::error() << "File '" << node->name << "' has " << space.referenced()
- << " operations";
+ sql_print_error("InnoDB: File '%s' has " UINT32PF " operations",
+ node->name, space.referenced());
}
fil_system.detach(&space);
@@ -2598,7 +2711,7 @@ inline void fil_node_t::complete_write()
mysql_mutex_assert_not_owner(&fil_system.mutex);
if (space->purpose != FIL_TYPE_TEMPORARY &&
- srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC &&
+ (!fil_system.is_write_through() && !my_disable_sync) &&
space->set_needs_flush())
{
mysql_mutex_lock(&fil_system.mutex);
@@ -2774,14 +2887,6 @@ write_completed:
possibly cached by the OS. */
void fil_flush_file_spaces()
{
- if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)
- {
- ut_d(mysql_mutex_lock(&fil_system.mutex));
- ut_ad(fil_system.unflushed_spaces.empty());
- ut_d(mysql_mutex_unlock(&fil_system.mutex));
- return;
- }
-
rescan:
mysql_mutex_lock(&fil_system.mutex);
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 86fc747faed..a51e30e28ce 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -366,6 +366,8 @@ const char* innodb_flush_method_names[] = {
NullS
};
+static constexpr ulong innodb_flush_method_default = IF_WIN(6,4);
+
/** Enumeration of innodb_flush_method */
TYPELIB innodb_flush_method_typelib = {
array_elements(innodb_flush_method_names) - 1,
@@ -374,6 +376,9 @@ TYPELIB innodb_flush_method_typelib = {
NULL
};
+/** Deprecated parameter */
+static ulong innodb_flush_method;
+
/** Names of allowed values of innodb_deadlock_report */
static const char *innodb_deadlock_report_names[]= {
"off", /* Do not report any details of deadlocks */
@@ -4005,22 +4010,27 @@ static int innodb_init_params()
data_mysql_default_charset_coll = (ulint) default_charset_info->number;
+ if (innodb_flush_method == 1 /* O_DSYNC */) {
+ log_sys.log_write_through = true;
+ fil_system.write_through = true;
+ fil_system.buffered = false;
+#if defined __linux__ || defined _WIN32
+ log_sys.log_buffered = false;
+ goto skip_buffering_tweak;
+#endif
+ } else if (innodb_flush_method >= 4 /* O_DIRECT */
+ IF_WIN(&& innodb_flush_method < 8 /* normal */,)) {
+ /* O_DIRECT and similar settings do nothing */
#ifndef _WIN32
- if (srv_use_atomic_writes && my_may_have_atomic_write) {
- /*
- Force O_DIRECT on Unixes (on Windows writes are always
- unbuffered)
- */
- switch (srv_file_flush_method) {
- case SRV_O_DIRECT:
- case SRV_O_DIRECT_NO_FSYNC:
- break;
- default:
- srv_file_flush_method = SRV_O_DIRECT;
- fprintf(stderr, "InnoDB: using O_DIRECT due to atomic writes.\n");
- }
- }
+ } else if (srv_use_atomic_writes && my_may_have_atomic_write) {
+ /* If atomic writes are enabled, do the same as with
+ innodb_flush_method=O_DIRECT: retain the default settings */
#endif
+ } else {
+ log_sys.log_write_through = false;
+ fil_system.write_through = false;
+ fil_system.buffered = true;
+ }
#if defined __linux__ || defined _WIN32
if (srv_flush_log_at_trx_commit == 2) {
@@ -4028,6 +4038,7 @@ static int innodb_init_params()
innodb_flush_log_at_trx_commit=2. */
log_sys.log_buffered = true;
}
+skip_buffering_tweak:
#endif
if (srv_read_only_mode) {
@@ -4035,12 +4046,6 @@ static int innodb_init_params()
srv_use_doublewrite_buf = FALSE;
}
-#if !defined LINUX_NATIVE_AIO && !defined HAVE_URING && !defined _WIN32
- /* Currently native AIO is supported only on windows and linux
- and that also when the support is compiled in. In all other
- cases, we ignore the setting of innodb_use_native_aio. */
- srv_use_native_aio = FALSE;
-#endif
#ifdef HAVE_URING
if (srv_use_native_aio && io_uring_may_be_unsafe) {
sql_print_warning("innodb_use_native_aio may cause "
@@ -4048,22 +4053,13 @@ static int innodb_init_params()
"https://jira.mariadb.org/browse/MDEV-26674",
io_uring_may_be_unsafe);
}
+#elif !defined LINUX_NATIVE_AIO && !defined _WIN32
+ /* Currently native AIO is supported only on windows and linux
+ and that also when the support is compiled in. In all other
+ cases, we ignore the setting of innodb_use_native_aio. */
+ srv_use_native_aio = FALSE;
#endif
-#ifndef _WIN32
- ut_ad(srv_file_flush_method <= SRV_O_DIRECT_NO_FSYNC);
-#else
- switch (srv_file_flush_method) {
- case SRV_ALL_O_DIRECT_FSYNC + 1 /* "async_unbuffered"="unbuffered" */:
- srv_file_flush_method = SRV_ALL_O_DIRECT_FSYNC;
- break;
- case SRV_ALL_O_DIRECT_FSYNC + 2 /* "normal"="fsync" */:
- srv_file_flush_method = SRV_FSYNC;
- break;
- default:
- ut_ad(srv_file_flush_method <= SRV_ALL_O_DIRECT_FSYNC);
- }
-#endif
innodb_buffer_pool_size_init();
srv_lock_table_size = 5 * (srv_buf_pool_size >> srv_page_size_shift);
@@ -18409,7 +18405,7 @@ buffer_pool_load_abort(
}
#if defined __linux__ || defined _WIN32
-static void innodb_log_file_buffering_update(THD *thd, st_mysql_sys_var*,
+static void innodb_log_file_buffering_update(THD *, st_mysql_sys_var*,
void *, const void *save)
{
mysql_mutex_unlock(&LOCK_global_system_variables);
@@ -18418,6 +18414,30 @@ static void innodb_log_file_buffering_update(THD *thd, st_mysql_sys_var*,
}
#endif
+static void innodb_log_file_write_through_update(THD *, st_mysql_sys_var*,
+ void *, const void *save)
+{
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ log_sys.set_write_through(*static_cast<const my_bool*>(save));
+ mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+static void innodb_data_file_buffering_update(THD *, st_mysql_sys_var*,
+ void *, const void *save)
+{
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ fil_system.set_buffered(*static_cast<const my_bool*>(save));
+ mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+static void innodb_data_file_write_through_update(THD *, st_mysql_sys_var*,
+ void *, const void *save)
+{
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ fil_system.set_write_through(*static_cast<const my_bool*>(save));
+ mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
static void innodb_log_file_size_update(THD *thd, st_mysql_sys_var*,
void *var, const void *save)
{
@@ -18876,11 +18896,10 @@ static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit,
" guarantees in case of crash. 0 and 2 can be faster than 1 or 3.",
NULL, NULL, 1, 0, 3, 0);
-static MYSQL_SYSVAR_ENUM(flush_method, srv_file_flush_method,
- PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+static MYSQL_SYSVAR_ENUM(flush_method, innodb_flush_method,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_DEPRECATED,
"With which method to flush data.",
- NULL, NULL, IF_WIN(SRV_ALL_O_DIRECT_FSYNC, SRV_O_DIRECT),
- &innodb_flush_method_typelib);
+ NULL, NULL, innodb_flush_method_default, &innodb_flush_method_typelib);
static MYSQL_SYSVAR_STR(log_group_home_dir, srv_log_group_home_dir,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
@@ -19312,6 +19331,21 @@ static MYSQL_SYSVAR_BOOL(log_file_buffering, log_sys.log_buffered,
nullptr, innodb_log_file_buffering_update, FALSE);
#endif
+static MYSQL_SYSVAR_BOOL(log_file_write_through, log_sys.log_write_through,
+ PLUGIN_VAR_OPCMDARG,
+ "Whether each write to ib_logfile0 is write through",
+ nullptr, innodb_log_file_write_through_update, FALSE);
+
+static MYSQL_SYSVAR_BOOL(data_file_buffering, fil_system.buffered,
+ PLUGIN_VAR_OPCMDARG,
+ "Whether the file system cache for data files is enabled",
+ nullptr, innodb_data_file_buffering_update, FALSE);
+
+static MYSQL_SYSVAR_BOOL(data_file_write_through, fil_system.write_through,
+ PLUGIN_VAR_OPCMDARG,
+ "Whether each write to data files writes through",
+ nullptr, innodb_data_file_write_through_update, FALSE);
+
static MYSQL_SYSVAR_ULONGLONG(log_file_size, srv_log_file_size,
PLUGIN_VAR_RQCMDARG,
"Redo log size in bytes.",
@@ -19756,6 +19790,9 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
#if defined __linux__ || defined _WIN32
MYSQL_SYSVAR(log_file_buffering),
#endif
+ MYSQL_SYSVAR(log_file_write_through),
+ MYSQL_SYSVAR(data_file_buffering),
+ MYSQL_SYSVAR(data_file_write_through),
MYSQL_SYSVAR(log_file_size),
MYSQL_SYSVAR(log_group_home_dir),
MYSQL_SYSVAR(max_dirty_pages_pct),
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
index 940e1b68458..210f365ddd8 100644
--- a/storage/innobase/include/fil0fil.h
+++ b/storage/innobase/include/fil0fil.h
@@ -51,35 +51,6 @@ using space_list_t= ilist<fil_space_t, space_list_tag_t>;
// Forward declaration
extern my_bool srv_use_doublewrite_buf;
-/** Possible values of innodb_flush_method */
-enum srv_flush_t
-{
- /** fsync, the default */
- SRV_FSYNC= 0,
- /** open log files in O_DSYNC mode */
- SRV_O_DSYNC,
- /** do not call os_file_flush() when writing data files, but do flush
- after writing to log files */
- SRV_LITTLESYNC,
- /** do not flush after writing */
- SRV_NOSYNC,
- /** invoke os_file_set_nocache() on data files. This implies using
- unbuffered I/O but still fdatasync(), because some filesystems might
- not flush meta-data on write completion */
- SRV_O_DIRECT,
- /** Like O_DIRECT, but skip fdatasync(), assuming that the data is
- durable on write completion */
- SRV_O_DIRECT_NO_FSYNC
-#ifdef _WIN32
- /** Traditional Windows appoach to open all files without caching,
- and do FileFlushBuffers() */
- ,SRV_ALL_O_DIRECT_FSYNC
-#endif
-};
-
-/** innodb_flush_method */
-extern ulong srv_file_flush_method;
-
/** Undo tablespaces starts with space_id. */
extern uint32_t srv_undo_space_id_start;
/** The number of UNDO tablespaces that are open and ready to use. */
@@ -631,6 +602,8 @@ private:
}
public:
+ /** Reopen all files on set_write_through() or set_buffered(). */
+ static void reopen_all();
/** Try to close a file to adhere to the innodb_open_files limit.
@param print_info whether to diagnose why a file cannot be closed
@return whether a file was closed */
@@ -1414,6 +1387,20 @@ public:
fil_space_t* temp_space; /*!< The innodb_temporary tablespace */
/** Map of fil_space_t::id to fil_space_t* */
hash_table_t spaces;
+
+ /** whether each write to data files is durable (O_DSYNC) */
+ my_bool write_through;
+ /** whether data files are buffered (not O_DIRECT) */
+ my_bool buffered;
+
+ /** Try to enable or disable write-through of data files */
+ void set_write_through(bool write_through);
+ /** Try to enable or disable file system caching of data files */
+ void set_buffered(bool buffered);
+
+ TPOOL_SUPPRESS_TSAN bool is_write_through() const { return write_through; }
+ TPOOL_SUPPRESS_TSAN bool is_buffered() const { return buffered; }
+
/** tablespaces for which fil_space_t::needs_flush() holds */
sized_ilist<fil_space_t, unflushed_spaces_tag_t> unflushed_spaces;
/** number of currently open files; protected by mutex */
@@ -1527,12 +1514,7 @@ template<bool have_reference> inline void fil_space_t::flush()
mysql_mutex_assert_not_owner(&fil_system.mutex);
ut_ad(!have_reference || (pending() & PENDING));
ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_IMPORT);
- if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)
- {
- ut_ad(!is_in_unflushed_spaces);
- ut_ad(!needs_flush());
- }
- else if (have_reference)
+ if (have_reference)
flush_low();
else
{
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
index 09e4ece8894..8afa92abc93 100644
--- a/storage/innobase/include/log0log.h
+++ b/storage/innobase/include/log0log.h
@@ -275,6 +275,8 @@ public:
bool log_maybe_unbuffered;
# endif
#endif
+ /** whether each write to ib_logfile0 is durable (O_DSYNC) */
+ my_bool log_write_through;
/** Fields involved in checkpoints @{ */
lsn_t log_capacity; /*!< capacity of the log; if
@@ -362,6 +364,8 @@ public:
/** Try to enable or disable file system caching (update log_buffered) */
void set_buffered(bool buffered);
#endif
+ /** Try to enable or disable durable writes (update log_write_through) */
+ void set_write_through(bool write_through);
void attach(log_file_t file, os_offset_t size);
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
index 2804143721c..4e9ed1263f6 100644
--- a/storage/innobase/log/log0log.cc
+++ b/storage/innobase/log/log0log.cc
@@ -401,6 +401,31 @@ void log_t::set_buffered(bool buffered)
}
#endif
+ /** Try to enable or disable durable writes (update log_write_through) */
+void log_t::set_write_through(bool write_through)
+{
+ if (is_pmem() || high_level_read_only)
+ return;
+ log_resize_acquire();
+ if (!resize_in_progress() && is_opened() &&
+ bool(log_write_through) != write_through)
+ {
+ os_file_close_func(log.m_file);
+ log.m_file= OS_FILE_CLOSED;
+ std::string path{get_log_file_path()};
+ log_write_through= write_through;
+ bool success;
+ log.m_file= os_file_create_func(path.c_str(),
+ OS_FILE_OPEN, OS_FILE_NORMAL, OS_LOG_FILE,
+ false, &success);
+ ut_a(log.m_file != OS_FILE_CLOSED);
+ sql_print_information(log_write_through
+ ? "InnoDB: Log writes write through"
+ : "InnoDB: Log writes may be cached");
+ }
+ log_resize_release();
+}
+
/** Start resizing the log and release the exclusive latch.
@param size requested new file_size
@return whether the resizing was started successfully */
@@ -852,7 +877,7 @@ bool log_t::flush(lsn_t lsn) noexcept
{
ut_ad(lsn >= get_flushed_lsn());
flush_lock.set_pending(lsn);
- const bool success{srv_file_flush_method == SRV_O_DSYNC || log.flush()};
+ const bool success{log_write_through || log.flush()};
if (UNIV_LIKELY(success))
{
flushed_to_disk_lsn.store(lsn, std::memory_order_release);
diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc
index d4cfb6207bf..6141c9dcc37 100644
--- a/storage/innobase/os/os0file.cc
+++ b/storage/innobase/os/os0file.cc
@@ -65,7 +65,9 @@ Created 10/21/1995 Heikki Tuuri
#endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
#ifdef _WIN32
-#include <winioctl.h>
+# include <winioctl.h>
+#elif !defined O_DSYNC
+# define O_DSYNC O_SYNC
#endif
// my_test_if_atomic_write() , my_win_secattr()
@@ -931,6 +933,8 @@ bool
os_file_flush_func(
os_file_t file)
{
+ if (UNIV_UNLIKELY(my_disable_sync)) return true;
+
int ret;
ret = os_file_sync_posix(file);
@@ -981,40 +985,19 @@ os_file_create_simple_func(
*success = false;
- int create_flag;
- const char* mode_str = NULL;
+ int create_flag = O_RDONLY;
ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
- if (create_mode == OS_FILE_OPEN) {
- mode_str = "OPEN";
-
- if (access_type == OS_FILE_READ_ONLY) {
-
- create_flag = O_RDONLY;
-
- } else if (read_only) {
-
- create_flag = O_RDONLY;
-
- } else {
+ if (read_only) {
+ } else if (create_mode == OS_FILE_OPEN) {
+ if (access_type != OS_FILE_READ_ONLY) {
create_flag = O_RDWR;
}
-
- } else if (read_only) {
-
- mode_str = "OPEN";
- create_flag = O_RDONLY;
-
} else if (create_mode == OS_FILE_CREATE) {
-
- mode_str = "CREATE";
create_flag = O_RDWR | O_CREAT | O_EXCL;
-
} else if (create_mode == OS_FILE_CREATE_PATH) {
-
- mode_str = "CREATE PATH";
/* Create subdirs along the path if needed. */
*success = os_file_create_subdirs_if_needed(name);
@@ -1040,40 +1023,32 @@ os_file_create_simple_func(
return(OS_FILE_CLOSED);
}
- bool retry;
+ create_flag |= O_CLOEXEC;
+ if (fil_system.is_write_through()) create_flag |= O_DSYNC;
+ int direct_flag = fil_system.is_buffered() ? 0 : O_DIRECT;
- do {
- file = open(name, create_flag | O_CLOEXEC, os_innodb_umask);
+ for (;;) {
+ file = open(name, create_flag | direct_flag, os_innodb_umask);
if (file == -1) {
+ if (direct_flag && errno == EINVAL) {
+ direct_flag = 0;
+ continue;
+ }
+
*success = false;
- retry = os_file_handle_error(
+ if (!os_file_handle_error(
name,
create_mode == OS_FILE_OPEN
- ? "open" : "create");
+ ? "open" : "create")) {
+ break;
+ }
} else {
*success = true;
- retry = false;
- }
-
- } while (retry);
-
- /* This function is always called for data files, we should disable
- OS caching (O_DIRECT) here as we do in os_file_create_func(), so
- we open the same file in the same mode, see man page of open(2). */
- if (!srv_read_only_mode && *success) {
- switch (srv_file_flush_method) {
- case SRV_O_DSYNC:
- case SRV_O_DIRECT:
- case SRV_O_DIRECT_NO_FSYNC:
- os_file_set_nocache(file, name, mode_str);
- break;
- default:
break;
}
}
-#ifndef _WIN32
if (!read_only
&& *success
&& access_type == OS_FILE_READ_WRITE
@@ -1084,7 +1059,6 @@ os_file_create_simple_func(
close(file);
file = -1;
}
-#endif /* !_WIN32 */
return(file);
}
@@ -1156,8 +1130,8 @@ os_file_create_func(
return(OS_FILE_CLOSED);
);
- int create_flag;
- const char* mode_str = NULL;
+ int create_flag = O_RDONLY | O_CLOEXEC;
+ const char* mode_str = "OPEN";
on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
? true : false;
@@ -1167,30 +1141,17 @@ os_file_create_func(
create_mode &= ulint(~(OS_FILE_ON_ERROR_NO_EXIT
| OS_FILE_ON_ERROR_SILENT));
- if (create_mode == OS_FILE_OPEN
- || create_mode == OS_FILE_OPEN_RAW
- || create_mode == OS_FILE_OPEN_RETRY) {
-
- mode_str = "OPEN";
-
- create_flag = read_only ? O_RDONLY : O_RDWR;
-
- } else if (read_only) {
-
- mode_str = "OPEN";
-
- create_flag = O_RDONLY;
-
+ if (read_only) {
+ } else if (create_mode == OS_FILE_OPEN
+ || create_mode == OS_FILE_OPEN_RAW
+ || create_mode == OS_FILE_OPEN_RETRY) {
+ create_flag = O_RDWR | O_CLOEXEC;
} else if (create_mode == OS_FILE_CREATE) {
-
mode_str = "CREATE";
- create_flag = O_RDWR | O_CREAT | O_EXCL;
-
+ create_flag = O_RDWR | O_CREAT | O_EXCL | O_CLOEXEC;
} else if (create_mode == OS_FILE_OVERWRITE) {
-
mode_str = "OVERWRITE";
- create_flag = O_RDWR | O_CREAT | O_TRUNC;
-
+ create_flag = O_RDWR | O_CREAT | O_TRUNC | O_CLOEXEC;
} else {
ib::error()
<< "Unknown file create mode (" << create_mode << ")"
@@ -1205,25 +1166,30 @@ os_file_create_func(
ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
- /* We let O_DSYNC only affect log files */
+ create_flag |= O_CLOEXEC;
- if (!read_only
- && type == OS_LOG_FILE
- && srv_file_flush_method == SRV_O_DSYNC) {
-#ifdef O_DSYNC
+ int direct_flag = type == OS_DATA_FILE && create_mode != OS_FILE_CREATE
+ && !fil_system.is_buffered()
+ ? O_DIRECT : 0;
+
+ if (read_only) {
+ } else if ((type == OS_LOG_FILE)
+ ? log_sys.log_write_through
+ : fil_system.is_write_through()) {
create_flag |= O_DSYNC;
-#else
- create_flag |= O_SYNC;
-#endif
}
os_file_t file;
- bool retry;
- do {
- file = open(name, create_flag | O_CLOEXEC, os_innodb_umask);
+ for (;;) {
+ file = open(name, create_flag | direct_flag, os_innodb_umask);
if (file == -1) {
+ if (direct_flag && errno == EINVAL) {
+ direct_flag = 0;
+ continue;
+ }
+
const char* operation;
operation = (create_mode == OS_FILE_CREATE
@@ -1232,39 +1198,30 @@ os_file_create_func(
*success = false;
if (on_error_no_exit) {
- retry = os_file_handle_error_no_exit(
- name, operation, on_error_silent);
+ if (os_file_handle_error_no_exit(
+ name, operation, on_error_silent))
+ continue;
} else {
- retry = os_file_handle_error(name, operation);
+ if (os_file_handle_error(name, operation))
+ continue;
}
+
+ return file;
} else {
*success = true;
- retry = false;
+ break;
}
-
- } while (retry);
-
- if (!*success) {
- return file;
}
#if (defined __sun__ && defined DIRECTIO_ON) || defined O_DIRECT
- if (type == OS_DATA_FILE) {
- switch (srv_file_flush_method) {
- case SRV_O_DSYNC:
- case SRV_O_DIRECT:
- case SRV_O_DIRECT_NO_FSYNC:
+ if (type == OS_DATA_FILE && create_mode == OS_FILE_CREATE
+ && !fil_system.is_buffered()) {
# ifdef __linux__
use_o_direct:
# endif
- os_file_set_nocache(file, name, mode_str);
- break;
- default:
- break;
- }
- }
+ os_file_set_nocache(file, name, mode_str);
# ifdef __linux__
- else if (type == OS_LOG_FILE && !log_sys.is_opened()) {
+ } else if (type == OS_LOG_FILE && !log_sys.is_opened()) {
struct stat st;
char b[20 + sizeof "/sys/dev/block/" ":"
"/../queue/physical_block_size"];
@@ -1316,11 +1273,10 @@ skip_o_direct:
log_sys.log_buffered= true;
log_sys.set_block_size(512);
}
- }
# endif
+ }
#endif
-#ifndef _WIN32
if (!read_only
&& create_mode != OS_FILE_OPEN_RAW
&& !my_disable_locking
@@ -1348,7 +1304,6 @@ skip_o_direct:
close(file);
file = -1;
}
-#endif /* !_WIN32 */
return(file);
}
@@ -1786,6 +1741,9 @@ Flushes the write buffers of a given file to the disk.
@return true if success */
bool os_file_flush_func(os_file_t file)
{
+ if (UNIV_UNLIKELY(my_disable_sync))
+ return true;
+
++os_n_fsyncs;
static bool disable_datasync;
@@ -2011,6 +1969,11 @@ os_file_create_simple_func(
return(OS_FILE_CLOSED);
}
+ if (fil_system.is_write_through())
+ attributes |= FILE_FLAG_WRITE_THROUGH;
+ if (!fil_system.is_buffered())
+ attributes |= FILE_FLAG_NO_BUFFERING;
+
bool retry;
do {
@@ -2182,27 +2145,16 @@ os_file_create_func(
if (!log_sys.is_opened() && !log_sys.log_buffered) {
attributes|= FILE_FLAG_NO_BUFFERING;
}
- if (srv_file_flush_method == SRV_O_DSYNC)
+ if (log_sys.log_write_through)
attributes|= FILE_FLAG_WRITE_THROUGH;
- }
- else if (type == OS_DATA_FILE)
- {
- switch (srv_file_flush_method)
- {
- case SRV_FSYNC:
- case SRV_LITTLESYNC:
- case SRV_NOSYNC:
- break;
- default:
+ } else {
+ if (type == OS_DATA_FILE && !fil_system.is_buffered())
attributes|= FILE_FLAG_NO_BUFFERING;
- }
+ if (fil_system.is_write_through())
+ attributes|= FILE_FLAG_WRITE_THROUGH;
}
- DWORD access = GENERIC_READ;
-
- if (!read_only) {
- access |= GENERIC_WRITE;
- }
+ DWORD access = read_only ? GENERIC_READ : GENERIC_READ | GENERIC_WRITE;
for (;;) {
const char *operation;
diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc
index ffb7f53c15c..02c7367cc93 100644
--- a/storage/innobase/srv/srv0srv.cc
+++ b/storage/innobase/srv/srv0srv.cc
@@ -223,9 +223,6 @@ ulong srv_read_ahead_threshold;
buffer in terms of percentage of the buffer pool. */
uint srv_change_buffer_max_size;
-ulong srv_file_flush_method;
-
-
/** copy of innodb_open_files; @see innodb_init_params() */
ulint srv_max_n_open_files;
diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc
index 475ae887d23..0ff20b31771 100644
--- a/storage/innobase/trx/trx0trx.cc
+++ b/storage/innobase/trx/trx0trx.cc
@@ -1168,7 +1168,7 @@ static void trx_flush_log_if_needed_low(lsn_t lsn, const trx_t *trx)
callback= &cb;
}
- log_write_up_to(lsn, srv_file_flush_method != SRV_NOSYNC &&
+ log_write_up_to(lsn, !my_disable_sync &&
(srv_flush_log_at_trx_commit & 1), callback);
}