summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Marko Mäkelä <marko.makela@mariadb.com> 2022-06-14 17:46:47 +0300
committer: Marko Mäkelä <marko.makela@mariadb.com> 2022-06-14 17:46:47 +0300
commit: 4c0cd953abffea841271f3a5cce1712d5e6c5633 (patch)
tree: c7e3267c9fb6d40dd1d93502e6d19e92f2f14605
parent: 813986a6473b23485654d50d3593f1c9aa3658d8 (diff)
download: mariadb-git-4c0cd953abffea841271f3a5cce1712d5e6c5633.tar.gz
MDEV-28766: SET GLOBAL innodb_log_file_buffering
In commit c4c88307091cb16886562e9e7b77f5fd077d34b5 (MDEV-28111) we disabled the file system cache on the InnoDB write-ahead log file (ib_logfile0) by default on Linux. It turns out that especially with innodb_flush_log_at_trx_commit=2, writing to the log via the file system cache typically improves throughput, especially on slow storage or at a small number of concurrent transactions. For other values of innodb_flush_log_at_trx_commit, direct writes were observed to be mostly but not always faster. Whether it pays off to disable the file system cache on the log may depend on the type of storage, the workload, and the operating system kernel version. On Linux and Microsoft Windows, we will introduce the settable Boolean global variable innodb_log_file_buffering that indicates whether the file system cache on the redo log file is enabled. The default value is innodb_log_file_buffering=OFF. If the server is started up with innodb_flush_log_at_trx_commit=2, the value will be changed to innodb_log_file_buffering=ON. When a persistent memory interface is being used for the log, the value cannot be changed from innodb_log_file_buffering=OFF. On Linux, when the physical block size cannot be determined to be a power of 2 between 64 and 4096 bytes, the file system cache cannot be disabled, and innodb_log_file_buffering=ON cannot be changed. Server log messages will indicate whether the file system cache is enabled for the redo log, using one of the following two messages: "[Note] InnoDB: Buffered log writes (block size=512 bytes)" or "[Note] InnoDB: File system buffers for log disabled (block size=512 bytes)". After this change, the startup parameter innodb_flush_method will no longer control whether O_DIRECT will be set on the redo log on Linux. On other operating systems that support O_DIRECT, no interface has been implemented for controlling the file system cache for the redo log. The innodb_flush_method values O_DIRECT, O_DIRECT_NO_FSYNC, O_DSYNC will enable O_DIRECT for data files, not the log.
Tested by: Matthias Leich, Axel Schwenke
-rw-r--r-- mysql-test/suite/sys_vars/r/sysvars_innodb.result | 3
-rw-r--r-- mysql-test/suite/sys_vars/t/sysvars_innodb.test | 1
-rw-r--r-- storage/innobase/handler/ha_innodb.cc | 30
-rw-r--r-- storage/innobase/include/log0log.h | 17
-rw-r--r-- storage/innobase/log/log0log.cc | 75
-rw-r--r-- storage/innobase/os/os0file.cc | 51
6 files changed, 139 insertions, 38 deletions
diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb.result b/mysql-test/suite/sys_vars/r/sysvars_innodb.result
index 11b90f41d5f..c3dd970f6e3 100644
--- a/mysql-test/suite/sys_vars/r/sysvars_innodb.result
+++ b/mysql-test/suite/sys_vars/r/sysvars_innodb.result
@@ -5,6 +5,7 @@ variable_name not in (
'innodb_numa_interleave', # only available WITH_NUMA
'innodb_evict_tables_on_commit_debug', # one may want to override this
'innodb_use_native_aio', # default value depends on OS
+'innodb_log_file_buffering', # only available on Linux and Windows
'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing
order by variable_name;
VARIABLE_NAME INNODB_ADAPTIVE_FLUSHING
@@ -1020,7 +1021,7 @@ SESSION_VALUE NULL
DEFAULT_VALUE
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE VARCHAR
-VARIABLE_COMMENT Path to InnoDB log files.
+VARIABLE_COMMENT Path to ib_logfile0
NUMERIC_MIN_VALUE NULL
NUMERIC_MAX_VALUE NULL
NUMERIC_BLOCK_SIZE NULL
diff --git a/mysql-test/suite/sys_vars/t/sysvars_innodb.test b/mysql-test/suite/sys_vars/t/sysvars_innodb.test
index 15fd99e9984..6d46c22683f 100644
--- a/mysql-test/suite/sys_vars/t/sysvars_innodb.test
+++ b/mysql-test/suite/sys_vars/t/sysvars_innodb.test
@@ -12,5 +12,6 @@ select VARIABLE_NAME, SESSION_VALUE, DEFAULT_VALUE, VARIABLE_SCOPE, VARIABLE_TYP
'innodb_numa_interleave', # only available WITH_NUMA
'innodb_evict_tables_on_commit_debug', # one may want to override this
'innodb_use_native_aio', # default value depends on OS
+ 'innodb_log_file_buffering', # only available on Linux and Windows
'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing
order by variable_name;
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 3ad8ae1c070..d286a034dc1 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -4066,6 +4066,14 @@ static int innodb_init_params()
}
#endif
+#if defined __linux__ || defined _WIN32
+ if (srv_flush_log_at_trx_commit == 2) {
+ /* Do not disable the file system cache if
+ innodb_flush_log_at_trx_commit=2. */
+ log_sys.log_buffered = true;
+ }
+#endif
+
if (srv_read_only_mode) {
ib::info() << "Started in read only mode";
srv_use_doublewrite_buf = FALSE;
@@ -18442,6 +18450,16 @@ buffer_pool_load_abort(
}
}
+#if defined __linux__ || defined _WIN32
+static void innodb_log_file_buffering_update(THD *thd, st_mysql_sys_var*,
+ void *, const void *save)
+{
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ log_sys.set_buffered(*static_cast<const my_bool*>(save));
+ mysql_mutex_lock(&LOCK_global_system_variables);
+}
+#endif
+
/** Update innodb_status_output or innodb_status_output_locks,
which control InnoDB "status monitor" output to the error log.
@param[out] var current value
@@ -18858,7 +18876,7 @@ static MYSQL_SYSVAR_ENUM(flush_method, srv_file_flush_method,
static MYSQL_SYSVAR_STR(log_group_home_dir, srv_log_group_home_dir,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
- "Path to InnoDB log files.", NULL, NULL, NULL);
+ "Path to ib_logfile0", NULL, NULL, NULL);
static MYSQL_SYSVAR_DOUBLE(max_dirty_pages_pct, srv_max_buf_pool_modified_pct,
PLUGIN_VAR_RQCMDARG,
@@ -19250,6 +19268,13 @@ static MYSQL_SYSVAR_SIZE_T(log_buffer_size, log_sys.buf_size,
"Redo log buffer size in bytes.",
NULL, NULL, 16U << 20, 2U << 20, SIZE_T_MAX, 4096);
+#if defined __linux__ || defined _WIN32
+static MYSQL_SYSVAR_BOOL(log_file_buffering, log_sys.log_buffered,
+ PLUGIN_VAR_OPCMDARG,
+ "Whether the file system cache for ib_logfile0 is enabled",
+ nullptr, innodb_log_file_buffering_update, FALSE);
+#endif
+
static MYSQL_SYSVAR_ULONGLONG(log_file_size, srv_log_file_size,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Redo log size in bytes.",
@@ -19692,6 +19717,9 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(deadlock_report),
MYSQL_SYSVAR(page_size),
MYSQL_SYSVAR(log_buffer_size),
+#if defined __linux__ || defined _WIN32
+ MYSQL_SYSVAR(log_file_buffering),
+#endif
MYSQL_SYSVAR(log_file_size),
MYSQL_SYSVAR(log_group_home_dir),
MYSQL_SYSVAR(max_dirty_pages_pct),
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
index 39c2fb8b01e..d1c6e40d946 100644
--- a/storage/innobase/include/log0log.h
+++ b/storage/innobase/include/log0log.h
@@ -249,6 +249,16 @@ public:
uint32_t format;
/** Log file */
log_file_t log;
+#if defined __linux__ || defined _WIN32
+ /** whether file system caching is enabled for the log */
+ my_bool log_buffered;
+# ifdef _WIN32
+ static constexpr bool log_maybe_unbuffered= true;
+# else
+ /** whether file system caching may be disabled */
+ bool log_maybe_unbuffered;
+# endif
+#endif
/** Fields involved in checkpoints @{ */
lsn_t log_capacity; /*!< capacity of the log; if
@@ -289,10 +299,17 @@ public:
bool is_opened() const noexcept { return log.is_opened(); }
+ static constexpr bool resize_in_progress() { return false; }
+
/** Rename a log file after resizing.
@return whether an error occurred */
static bool rename_resized() noexcept;
+#if defined __linux__ || defined _WIN32
+ /** Try to enable or disable file system caching (update log_buffered) */
+ void set_buffered(bool buffered);
+#endif
+
void attach(log_file_t file, os_offset_t size);
void close_file();
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
index 8a0ef712ab1..efdd527a28f 100644
--- a/storage/innobase/log/log0log.cc
+++ b/storage/innobase/log/log0log.cc
@@ -209,6 +209,8 @@ void log_t::attach(log_file_t file, os_offset_t size)
#if defined __linux__ || defined _WIN32
set_block_size(CPU_LEVEL1_DCACHE_LINESIZE);
#endif
+ log_maybe_unbuffered= true;
+ log_buffered= false;
return;
}
}
@@ -220,18 +222,11 @@ void log_t::attach(log_file_t file, os_offset_t size)
#endif
#if defined __linux__ || defined _WIN32
- if (!block_size)
- set_block_size(512);
-# ifdef __linux__
- else if (srv_file_flush_method != SRV_O_DSYNC &&
- srv_file_flush_method != SRV_O_DIRECT &&
- srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC)
- sql_print_information("InnoDB: Buffered log writes (block size=%u bytes)",
- block_size);
-#endif
- else
- sql_print_information("InnoDB: File system buffers for log"
- " disabled (block size=%u bytes)", block_size);
+ sql_print_information("InnoDB: %s (block size=%u bytes)",
+ log_buffered
+ ? "Buffered log writes"
+ : "File system buffers for log disabled",
+ block_size);
#endif
#ifdef HAVE_PMEM
@@ -327,6 +322,62 @@ void log_t::close_file()
ib::fatal() << "closing ib_logfile0 failed: " << err;
}
+#if defined __linux__ || defined _WIN32
+/** Acquire all latches that protect the log. */
+static void log_resize_acquire()
+{
+ if (!log_sys.is_pmem())
+ {
+ while (flush_lock.acquire(log_sys.get_lsn() + 1, nullptr) !=
+ group_commit_lock::ACQUIRED);
+ while (write_lock.acquire(log_sys.get_lsn() + 1, nullptr) !=
+ group_commit_lock::ACQUIRED);
+ }
+
+ log_sys.latch.wr_lock(SRW_LOCK_CALL);
+}
+
+/** Release the latches that protect the log. */
+void log_resize_release()
+{
+ log_sys.latch.wr_unlock();
+
+ if (!log_sys.is_pmem())
+ {
+ lsn_t lsn1= write_lock.release(write_lock.value());
+ lsn_t lsn2= flush_lock.release(flush_lock.value());
+ if (lsn1 || lsn2)
+ log_write_up_to(std::max(lsn1, lsn2), true, nullptr);
+ }
+}
+
+/** Try to enable or disable file system caching (update log_buffered) */
+void log_t::set_buffered(bool buffered)
+{
+ if (!log_maybe_unbuffered || is_pmem() || high_level_read_only)
+ return;
+ log_resize_acquire();
+ if (!resize_in_progress() && is_opened() && bool(log_buffered) != buffered)
+ {
+ os_file_close_func(log.m_file);
+ log.m_file= OS_FILE_CLOSED;
+ std::string path{get_log_file_path()};
+ log_buffered= buffered;
+ bool success;
+ log.m_file= os_file_create_func(path.c_str(),
+ OS_FILE_OPEN, OS_FILE_NORMAL, OS_LOG_FILE,
+ false, &success);
+ ut_a(log.m_file != OS_FILE_CLOSED);
+ sql_print_information("InnoDB: %s (block size=%u bytes)",
+ log_buffered
+ ? "Buffered log writes"
+ : "File system buffers for log disabled",
+ block_size);
+ }
+ log_resize_release();
+}
+#endif
+
/** Write an aligned buffer to ib_logfile0.
@param buf buffer to be written
@param len length of data to be written
diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc
index e212e9a3ef3..01af15befd3 100644
--- a/storage/innobase/os/os0file.cc
+++ b/storage/innobase/os/os0file.cc
@@ -1055,6 +1055,7 @@ os_file_create_simple_func(
we open the same file in the same mode, see man page of open(2). */
if (!srv_read_only_mode && *success) {
switch (srv_file_flush_method) {
+ case SRV_O_DSYNC:
case SRV_O_DIRECT:
case SRV_O_DIRECT_NO_FSYNC:
os_file_set_nocache(file, name, mode_str);
@@ -1240,13 +1241,13 @@ os_file_create_func(
#if (defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)) || defined O_DIRECT
if (type == OS_DATA_FILE) {
-# ifdef __linux__
-use_o_direct:
-# endif
switch (srv_file_flush_method) {
case SRV_O_DSYNC:
case SRV_O_DIRECT:
case SRV_O_DIRECT_NO_FSYNC:
+# ifdef __linux__
+use_o_direct:
+# endif
os_file_set_nocache(file, name, mode_str);
break;
default:
@@ -1263,9 +1264,6 @@ use_o_direct:
goto skip_o_direct;
}
MSAN_STAT_WORKAROUND(&st);
- if (st.st_size & 4095) {
- goto skip_o_direct;
- }
if (snprintf(b, sizeof b,
"/sys/dev/block/%u:%u/queue/physical_block_size",
major(st.st_dev), minor(st.st_dev))
@@ -1298,11 +1296,16 @@ use_o_direct:
if (s > 4096 || s < 64 || !ut_is_2pow(s)) {
goto skip_o_direct;
}
+ log_sys.log_maybe_unbuffered= true;
log_sys.set_block_size(uint32_t(s));
- goto use_o_direct;
+ if (!log_sys.log_buffered && !(st.st_size & (s - 1))) {
+ goto use_o_direct;
+ }
} else {
skip_o_direct:
- log_sys.set_block_size(0);
+ log_sys.log_maybe_unbuffered= false;
+ log_sys.log_buffered= true;
+ log_sys.set_block_size(512);
}
}
# endif
@@ -2057,7 +2060,7 @@ os_file_create_directory(
}
/** Get disk sector size for a file. */
-size_t get_sector_size(HANDLE file)
+static size_t get_sector_size(HANDLE file)
{
FILE_STORAGE_INFO fsi;
ULONG s= 4096;
@@ -2065,9 +2068,7 @@ size_t get_sector_size(HANDLE file)
{
s= fsi.PhysicalBytesPerSectorForPerformance;
if (s > 4096 || s < 64 || !ut_is_2pow(s))
- {
return 4096;
- }
}
return s;
}
@@ -2165,8 +2166,9 @@ os_file_create_func(
? FILE_FLAG_OVERLAPPED : 0;
if (type == OS_LOG_FILE) {
- if(srv_flush_log_at_trx_commit != 2 && !log_sys.is_opened())
+ if (!log_sys.is_opened() && !log_sys.log_buffered) {
attributes|= FILE_FLAG_NO_BUFFERING;
+ }
if (srv_file_flush_method == SRV_O_DSYNC)
attributes|= FILE_FLAG_WRITE_THROUGH;
}
@@ -2197,21 +2199,22 @@ os_file_create_func(
name, access, share_mode, my_win_file_secattr(),
create_flag, attributes, NULL);
- if (file != INVALID_HANDLE_VALUE && type == OS_LOG_FILE
- && (attributes & FILE_FLAG_NO_BUFFERING)) {
- uint32 s= (uint32_t) get_sector_size(file);
- log_sys.set_block_size(uint32_t(s));
- /* FIXME! remove it when backup is fixed, so that it
- does not produce redo with irregular sizes.*/
- if (os_file_get_size(file) % s) {
- attributes &= ~FILE_FLAG_NO_BUFFERING;
- create_flag = OPEN_ALWAYS;
- CloseHandle(file);
- continue;
+ *success = file != INVALID_HANDLE_VALUE;
+
+ if (*success && type == OS_LOG_FILE) {
+ uint32_t s = uint32_t(get_sector_size(file));
+ log_sys.set_block_size(s);
+ if (attributes & FILE_FLAG_NO_BUFFERING) {
+ if (os_file_get_size(file) % s) {
+ attributes &= ~FILE_FLAG_NO_BUFFERING;
+ create_flag = OPEN_ALWAYS;
+ CloseHandle(file);
+ continue;
+ }
+ log_sys.log_buffered = false;
}
}
- *success = (file != INVALID_HANDLE_VALUE);
if (*success) {
break;
}