summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSergey Vojtovich <svoj@mariadb.org>2020-01-21 17:21:20 +0400
committerSergey Vojtovich <svoj@mariadb.org>2020-02-04 23:13:15 +0400
commit41d17ea85c415f4fc4f9bc17d1aebc0ab5e83987 (patch)
tree4940fc774d6762db061c5092f7a7f1892a5fe63a
parentcc276f17661456efbf7ca79708bc661cef4447a2 (diff)
downloadmariadb-git-bb-10.5-svoj-MDEV-17084-redo.tar.gz
InnoDB redo log IO methodsbb-10.5-svoj-MDEV-17084-redo
normal - conventional IO using read()/write()/fdatasync(); mmap - memory-mapped IO, expected to be faster but less secure
-rw-r--r--mysql-test/suite/sys_vars/r/sysvars_innodb.result12
-rw-r--r--storage/innobase/CMakeLists.txt3
-rw-r--r--storage/innobase/handler/ha_innodb.cc22
-rw-r--r--storage/innobase/include/log0log.h27
-rw-r--r--storage/innobase/log/log0log.cc156
5 files changed, 193 insertions, 27 deletions
diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb.result b/mysql-test/suite/sys_vars/r/sysvars_innodb.result
index 5f4b031ada0..2ddca40d4d4 100644
--- a/mysql-test/suite/sys_vars/r/sysvars_innodb.result
+++ b/mysql-test/suite/sys_vars/r/sysvars_innodb.result
@@ -1257,6 +1257,18 @@ NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST NULL
READ_ONLY YES
COMMAND_LINE_ARGUMENT REQUIRED
+VARIABLE_NAME INNODB_LOG_IO_METHOD
+SESSION_VALUE NULL
+DEFAULT_VALUE normal
+VARIABLE_SCOPE GLOBAL
+VARIABLE_TYPE ENUM
+VARIABLE_COMMENT InnoDB redo log IO method: normal (default), mmap.
+NUMERIC_MIN_VALUE NULL
+NUMERIC_MAX_VALUE NULL
+NUMERIC_BLOCK_SIZE NULL
+ENUM_VALUE_LIST normal,mmap
+READ_ONLY YES
+COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME INNODB_LOG_OPTIMIZE_DDL
SESSION_VALUE NULL
DEFAULT_VALUE ON
diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt
index 593567bb097..d21ea39c11a 100644
--- a/storage/innobase/CMakeLists.txt
+++ b/storage/innobase/CMakeLists.txt
@@ -155,7 +155,8 @@ MYSQL_ADD_PLUGIN(innobase ${INNOBASE_SOURCES} STORAGE_ENGINE
${CRC32_LIBRARY}
${NUMA_LIBRARY}
${LIBSYSTEMD}
- ${LINKER_SCRIPT})
+ ${LINKER_SCRIPT}
+ ${LIBPMEM})
IF(NOT TARGET innobase)
RETURN()
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 56dc0087045..484de2f840e 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -406,6 +406,21 @@ static TYPELIB innodb_change_buffering_typelib = {
NULL
};
+static const char *innodb_log_io_method_names[]= /* value names of the log IO method enum, in index order */
+{
+ "normal", /* index 0 */
+ "mmap", /* index 1 */
+ NullS /* terminator of the name list */
+};
+
+static TYPELIB innodb_log_io_method_typelib= /* TYPELIB describing innodb_log_io_method_names */
+{
+ array_elements(innodb_log_io_method_names) - 1, /* number of names; the trailing NullS is not counted */
+ "innodb_log_io_method_typelib",
+ innodb_log_io_method_names,
+ NULL
+};
+
/** Retrieve the FTS Relevance Ranking result for doc with doc_id
of m_prebuilt->fts_doc_id
@param[in,out] fts_hdl FTS handler
@@ -19413,6 +19428,12 @@ static MYSQL_SYSVAR_BOOL(log_optimize_ddl, innodb_log_optimize_ddl,
" allows concurrent backup.",
NULL, NULL, TRUE);
+static MYSQL_SYSVAR_ENUM(log_io_method, innodb_log_io_method, /* system variable backed by innodb_log_io_method */
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, /* takes a required argument; read-only */
+ "InnoDB redo log IO method: normal (default), mmap.",
+ NULL, NULL, 0, /* 0 is the index of "normal"; NOTE(review): the two NULLs are presumably the check/update callbacks - confirm */
+ &innodb_log_io_method_typelib);
+
static MYSQL_SYSVAR_ULONG(autoextend_increment,
sys_tablespace_auto_extend_increment,
PLUGIN_VAR_RQCMDARG,
@@ -20303,6 +20324,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(log_group_home_dir),
MYSQL_SYSVAR(log_compressed_pages),
MYSQL_SYSVAR(log_optimize_ddl),
+ MYSQL_SYSVAR(log_io_method),
MYSQL_SYSVAR(max_dirty_pages_pct),
MYSQL_SYSVAR(max_dirty_pages_pct_lwm),
MYSQL_SYSVAR(adaptive_flushing_lwm),
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
index 4e619a203d9..cf19eaf297a 100644
--- a/storage/innobase/include/log0log.h
+++ b/storage/innobase/include/log0log.h
@@ -40,6 +40,7 @@ Created 12/9/1995 Heikki Tuuri
#include "os0file.h"
#include "span.h"
#include <atomic>
+#include <memory>
using st_::span;
@@ -449,6 +450,7 @@ typedef ib_mutex_t LogSysMutex;
typedef ib_mutex_t FlushOrderMutex;
extern my_bool srv_read_only_mode;
+extern ulong innodb_log_io_method;
/** Redo log buffer */
struct log_t{
@@ -516,6 +518,22 @@ struct log_t{
/** Log files. Protected by mutex or write_mutex. */
struct files {
+ /** Abstract IO backend for one redo log file. An implementation supplies
+ open/close/read/write/flush for a single file. */
+ class file_io
+ {
+ protected:
+   /** whether a completed write() is persistent without a following
+   flush_data_only(). Implementations assign it in open(); it is
+   initialized here so that writes_are_durable() never reads an
+   indeterminate value. */
+   bool durable_writes= false;
+ public:
+   virtual ~file_io() {}
+   /** Opens the file.
+   @param[in] path file name
+   @return DB_SUCCESS or an error code */
+   virtual dberr_t open(const char *path)= 0;
+   /** Closes the file.
+   @return DB_SUCCESS or an error code */
+   virtual dberr_t close()= 0;
+   /** Reads buf.size() bytes starting at offset into buf.
+   @param[in] offset byte offset within this file
+   @param[out] buf destination buffer
+   @return DB_SUCCESS or an error code */
+   virtual dberr_t read(os_offset_t offset, span<byte> buf)= 0;
+   /** Writes buf.size() bytes from buf starting at offset.
+   @param[in] path file name (an implementation may ignore it)
+   @param[in] offset byte offset within this file
+   @param[in] buf source buffer
+   @return DB_SUCCESS or an error code */
+   virtual dberr_t write(const char *path, os_offset_t offset,
+                         span<byte> buf)= 0;
+   /** Makes previously written data persistent.
+   @return DB_SUCCESS or an error code */
+   virtual dberr_t flush_data_only()= 0;
+
+   /** @return whether flush_data_only() can be skipped after write() */
+   bool writes_are_durable() const { return durable_writes; }
+ };
+
/** number of files */
ulint n_files;
/** format of the redo log: e.g., FORMAT_10_4 */
@@ -530,14 +548,14 @@ struct log_t{
lsn_t lsn;
/** the byte offset of the above lsn */
lsn_t lsn_offset;
+ /** file descriptors for all log files */
+ std::vector<std::unique_ptr<file_io>> files;
public:
/** used only in recovery: recovery scan succeeded up to this
lsn in this log group */
lsn_t scanned_lsn;
- /** file descriptors for all log files */
- std::vector<pfs_os_file_t> files;
/** file names for all log files */
std::vector<std::string> file_names;
@@ -553,6 +571,11 @@ struct log_t{
@param[in] total_offset offset in log files treated as a single file
@param[in] buf buffer from which to write */
void write(size_t total_offset, span<byte> buf);
+ /** Checks whether written data is persistent without flush_data_only().
+ The answer is taken from the first log file, so the log files must
+ already be open.
+ @return whether flush_data_only() can be skipped */
+ bool writes_are_durable() const
+ {
+   /* front() on an empty vector is undefined behaviour */
+   ut_ad(!files.empty());
+   return files.front()->writes_are_durable();
+ }
/** flushes OS page cache (excluding metadata!) for all log files */
void flush_data_only();
/** closes all log files */
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
index d0b325c05aa..043de52d737 100644
--- a/storage/innobase/log/log0log.cc
+++ b/storage/innobase/log/log0log.cc
@@ -84,6 +84,8 @@ reduce the size of the log.
/** Redo log system */
log_t log_sys;
+ulong innodb_log_io_method;
+
/* Next log block number to do dummy record filling if no log records written
for a while */
static ulint next_lbn_to_pad = 0;
@@ -587,6 +589,112 @@ void log_t::create()
}
}
+
+/** Redo log file backend that performs its IO through the os_file_*()
+functions on a pfs_os_file_t handle. */
+class file_os_io final: public log_t::files::file_io
+{
+  /** handle assigned by open() */
+  pfs_os_file_t fd;
+public:
+  /** Opens an existing file with os_file_create().
+  @param[in] path file name
+  @return DB_SUCCESS, or DB_ERROR if os_file_create() reported failure */
+  dberr_t open(const char *path) final
+  {
+    bool opened;
+    fd= os_file_create(innodb_log_file_key, path,
+                       OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT,
+                       OS_FILE_NORMAL, OS_LOG_FILE,
+                       srv_read_only_mode, &opened);
+    /* writes count as durable only when the flush method is SRV_O_DSYNC */
+    durable_writes= (srv_file_flush_method == SRV_O_DSYNC);
+    if (opened)
+      return DB_SUCCESS;
+    return DB_ERROR;
+  }
+  /** Closes the handle.
+  @return DB_SUCCESS, or DB_ERROR if os_file_close() failed */
+  dberr_t close() final
+  {
+    if (os_file_close(fd))
+      return DB_SUCCESS;
+    return DB_ERROR;
+  }
+  /** Reads buf.size() bytes at offset into buf.
+  @return the result of os_file_read() */
+  dberr_t read(os_offset_t offset, span<byte> buf) final
+  {
+    const dberr_t err= os_file_read(IORequestRead, fd, buf.data(), offset,
+                                    buf.size());
+    return err;
+  }
+  /** Writes buf.size() bytes from buf at offset.
+  @param[in] path file name, passed on to os_file_write()
+  @return the result of os_file_write() */
+  dberr_t write(const char *path, os_offset_t offset, span<byte> buf) final
+  {
+    const dberr_t err= os_file_write(IORequestWrite, path, fd, buf.data(),
+                                     offset, buf.size());
+    return err;
+  }
+  /** Flushes file data with os_file_flush_data().
+  @return DB_SUCCESS, or DB_ERROR if the flush failed */
+  dberr_t flush_data_only() final
+  {
+    if (os_file_flush_data(fd))
+      return DB_SUCCESS;
+    return DB_ERROR;
+  }
+};
+
+
+#ifdef HAVE_PMEM
+#include <libpmem.h>
+#endif
+/** Redo log file backend that maps the whole file into memory.
+open() first tries a MAP_SYNC mapping (where the platform defines it) and
+falls back to a plain MAP_SHARED mapping. read() and write() copy from and
+to the mapping; flush_data_only() calls my_msync(). */
+class file_mmap_io final: public log_t::files::file_io
+{
+  /** file descriptor, or -1 when the file is not open */
+  File fd= -1;
+  /** start of the mapping, or MAP_FAILED when nothing is mapped */
+  void *addr= MAP_FAILED;
+  /** length of the mapping in bytes (the file size seen by open()) */
+  size_t length= 0;
+
+  /** @return whether [offset, offset + size) lies inside the mapping */
+  bool in_range(os_offset_t offset, size_t size) const
+  {
+    return addr != MAP_FAILED && offset <= length && size <= length - offset;
+  }
+public:
+  /** Opens and maps the file.
+  @param[in] path file name
+  @return DB_SUCCESS, or DB_ERROR if the file could not be opened,
+  examined or mapped */
+  dberr_t open(const char *path) final
+  {
+    fd= mysql_file_open(innodb_log_file_key, path,
+                        srv_read_only_mode ? O_RDONLY : O_RDWR, MYF(MY_WME));
+    if (fd < 0)
+      return DB_ERROR;
+
+    MY_STAT sb;
+    if (!mysql_file_fstat(fd, &sb, MYF(0)))
+    {
+      const int prot= srv_read_only_mode ? PROT_READ : PROT_READ | PROT_WRITE;
+      length= sb.st_size;
+#if defined MAP_SYNC && defined MAP_SHARED_VALIDATE
+      /* These flags are not defined on every platform; where they are
+      missing only the plain MAP_SHARED mapping below is attempted. */
+      addr= my_mmap(0, length, prot, MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
+      if (addr != MAP_FAILED)
+      {
+#ifdef HAVE_PMEM
+        durable_writes= true;
+        ib::info() << "The redo log file is located on a DAX storage. "
+                      "Writes are durable, sync disabled.";
+#else
+        durable_writes= false;
+        ib::info() << "The redo log file is located on a DAX storage, "
+                      "but persistent memory features were disabled "
+                      "(WITH_PMEM=OFF). Page cache is bypassed, sync is "
+                      "required to make writes durable.";
+#endif
+        return DB_SUCCESS;
+      }
+#endif /* MAP_SYNC && MAP_SHARED_VALIDATE */
+      addr= my_mmap(0, length, prot, MAP_SHARED, fd, 0);
+      if (addr != MAP_FAILED)
+      {
+        durable_writes= false;
+        return DB_SUCCESS;
+      }
+    }
+    /* Leave the object in its "not open" state so that a later close()
+    does not unmap or close anything. */
+    mysql_file_close(fd, MYF(MY_WME));
+    fd= -1;
+    length= 0;
+    return DB_ERROR;
+  }
+  /** Unmaps and closes the file. Does nothing if the file is not open.
+  @return DB_SUCCESS, or DB_ERROR if unmapping or closing failed */
+  dberr_t close() final
+  {
+    const int unmap_err= addr != MAP_FAILED ? my_munmap(addr, length) : 0;
+    const int close_err= fd >= 0 ? mysql_file_close(fd, MYF(MY_WME)) : 0;
+    addr= MAP_FAILED;
+    length= 0;
+    fd= -1;
+    return (!unmap_err && !close_err) ? DB_SUCCESS : DB_ERROR;
+  }
+  /** Copies buf.size() bytes at offset from the mapping into buf.
+  @return DB_SUCCESS, or DB_ERROR if the range is outside the mapping */
+  dberr_t read(os_offset_t offset, span<byte> buf) final
+  {
+    if (!in_range(offset, buf.size()))
+      return DB_ERROR;
+    memcpy(buf.data(), static_cast<const char*>(addr) + offset, buf.size());
+    return DB_SUCCESS;
+  }
+  /** Copies buf.size() bytes from buf into the mapping at offset.
+  @return DB_SUCCESS, or DB_ERROR if the range is outside the mapping */
+  dberr_t write(const char *, os_offset_t offset, span<byte> buf) final
+  {
+    if (!in_range(offset, buf.size()))
+      return DB_ERROR;
+    char *dest= static_cast<char*>(addr) + offset;
+#ifdef HAVE_PMEM
+    /* Persist through libpmem only for the mapping that open() marked as
+    durable; otherwise flush_data_only() is what makes data persistent. */
+    if (durable_writes)
+    {
+      pmem_memcpy_persist(dest, buf.data(), buf.size());
+      return DB_SUCCESS;
+    }
+#endif
+    memcpy(dest, buf.data(), buf.size());
+    return DB_SUCCESS;
+  }
+  /** Synchronizes the whole mapping with my_msync(MS_SYNC).
+  Must not be called when writes are durable.
+  @return DB_SUCCESS, or DB_ERROR if my_msync() failed */
+  dberr_t flush_data_only() final
+  {
+    ut_ad(!durable_writes);
+    return my_msync(fd, addr, length, MS_SYNC) ? DB_ERROR : DB_SUCCESS;
+  }
+};
+
+
void log_t::files::set_file_names(std::vector<std::string> names)
{
file_names= std::move(names);
@@ -598,15 +706,18 @@ void log_t::files::open_files()
files.reserve(file_names.size());
for (const auto &name : file_names)
{
- bool success;
- files.push_back(os_file_create(innodb_log_file_key, name.c_str(),
- OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT,
- OS_FILE_NORMAL, OS_LOG_FILE,
- srv_read_only_mode, &success));
- if (!success)
+ file_io *io;
+
+ switch (innodb_log_io_method)
{
- ib::fatal() << "os_file_create(" << name << ") failed";
+ case 1: io= new file_mmap_io; break;
+ default: io= new file_os_io;
}
+ ut_a(io);
+
+ if (io->open(name.c_str()))
+ ib::fatal() << "open(" << name << ") failed";
+ files.emplace_back(io);
}
}
@@ -617,12 +728,8 @@ void log_t::files::read(size_t total_offset, span<byte> buf)
const size_t file_idx= total_offset / static_cast<size_t>(file_size);
const size_t offset= total_offset % static_cast<size_t>(file_size);
- if (const dberr_t err= os_file_read(IORequestRead, files[file_idx],
- buf.data(), offset, buf.size()))
- {
- ib::fatal() << "os_file_read(" << file_names[file_idx] << ") returned "
- << err;
- }
+ if (const dberr_t err= files[file_idx]->read(offset, buf))
+ ib::fatal() << "read(" << file_names[file_idx] << ") returned " << err;
}
void log_t::files::write(size_t total_offset, span<byte> buf)
@@ -632,13 +739,9 @@ void log_t::files::write(size_t total_offset, span<byte> buf)
const size_t file_idx= total_offset / static_cast<size_t>(file_size);
const size_t offset= total_offset % static_cast<size_t>(file_size);
- if (const dberr_t err=
- os_file_write(IORequestWrite, file_names[file_idx].c_str(),
- files[file_idx], buf.data(), offset, buf.size()))
- {
- ib::fatal() << "os_file_write(" << file_names[file_idx] << ") returned "
- << err;
- }
+ if (const dberr_t err= files[file_idx]->write(file_names[file_idx].c_str(),
+ offset, buf))
+ ib::fatal() << "write(" << file_names[file_idx] << ") returned " << err;
}
void log_t::files::flush_data_only()
@@ -648,10 +751,10 @@ void log_t::files::flush_data_only()
log_sys.pending_flushes.fetch_add(1, std::memory_order_acquire);
for (auto it= files.begin(), end= files.end(); it != end; ++it)
{
- if (!os_file_flush_data(*it))
+ if ((*it)->flush_data_only())
{
const auto idx= std::distance(files.begin(), it);
- ib::fatal() << "os_file_flush_data(" << file_names[idx] << ") failed";
+ ib::fatal() << "flush_data_only(" << file_names[idx] << ") failed";
}
}
log_sys.pending_flushes.fetch_sub(1, std::memory_order_release);
@@ -662,10 +765,10 @@ void log_t::files::close_files()
{
for (auto it= files.begin(), end= files.end(); it != end; ++it)
{
- if (!os_file_close(*it))
+ if ((*it)->close())
{
const auto idx= std::distance(files.begin(), it);
- ib::fatal() << "os_file_close(" << file_names[idx] << ") failed";
+ ib::fatal() << "close(" << file_names[idx] << ") failed";
}
}
files.clear();
@@ -934,6 +1037,11 @@ void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key)
return;
}
+ /* FIXME!!! */
+ if (log_sys.log.writes_are_durable()) {
+ flush_to_disk= false;
+ }
+
loop:
ut_ad(++loop_count < 128);