diff options
author | Sergey Vojtovich <svoj@mariadb.org> | 2020-01-21 17:21:20 +0400 |
---|---|---|
committer | Sergey Vojtovich <svoj@mariadb.org> | 2020-02-04 23:13:15 +0400 |
commit | 41d17ea85c415f4fc4f9bc17d1aebc0ab5e83987 (patch) | |
tree | 4940fc774d6762db061c5092f7a7f1892a5fe63a | |
parent | cc276f17661456efbf7ca79708bc661cef4447a2 (diff) | |
download | mariadb-git-bb-10.5-svoj-MDEV-17084-redo.tar.gz |
InnoDB redo log IO methods (branch: bb-10.5-svoj-MDEV-17084-redo)
normal - conventional IO using read()/write()/fdatasync()
mmap - memory-mapped IO, expected to be faster but less secure
-rw-r--r-- | mysql-test/suite/sys_vars/r/sysvars_innodb.result | 12 | ||||
-rw-r--r-- | storage/innobase/CMakeLists.txt | 3 | ||||
-rw-r--r-- | storage/innobase/handler/ha_innodb.cc | 22 | ||||
-rw-r--r-- | storage/innobase/include/log0log.h | 27 | ||||
-rw-r--r-- | storage/innobase/log/log0log.cc | 156 |
5 files changed, 193 insertions, 27 deletions
diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb.result b/mysql-test/suite/sys_vars/r/sysvars_innodb.result index 5f4b031ada0..2ddca40d4d4 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_innodb.result +++ b/mysql-test/suite/sys_vars/r/sysvars_innodb.result @@ -1257,6 +1257,18 @@ NUMERIC_BLOCK_SIZE NULL ENUM_VALUE_LIST NULL READ_ONLY YES COMMAND_LINE_ARGUMENT REQUIRED +VARIABLE_NAME INNODB_LOG_IO_METHOD +SESSION_VALUE NULL +DEFAULT_VALUE normal +VARIABLE_SCOPE GLOBAL +VARIABLE_TYPE ENUM +VARIABLE_COMMENT InnoDB redo log IO method: normal (default), mmap. +NUMERIC_MIN_VALUE NULL +NUMERIC_MAX_VALUE NULL +NUMERIC_BLOCK_SIZE NULL +ENUM_VALUE_LIST normal,mmap +READ_ONLY YES +COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME INNODB_LOG_OPTIMIZE_DDL SESSION_VALUE NULL DEFAULT_VALUE ON diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt index 593567bb097..d21ea39c11a 100644 --- a/storage/innobase/CMakeLists.txt +++ b/storage/innobase/CMakeLists.txt @@ -155,7 +155,8 @@ MYSQL_ADD_PLUGIN(innobase ${INNOBASE_SOURCES} STORAGE_ENGINE ${CRC32_LIBRARY} ${NUMA_LIBRARY} ${LIBSYSTEMD} - ${LINKER_SCRIPT}) + ${LINKER_SCRIPT} + ${LIBPMEM}) IF(NOT TARGET innobase) RETURN() diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 56dc0087045..484de2f840e 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -406,6 +406,21 @@ static TYPELIB innodb_change_buffering_typelib = { NULL }; +static const char *innodb_log_io_method_names[]= +{ + "normal", + "mmap", + NullS +}; + +static TYPELIB innodb_log_io_method_typelib= +{ + array_elements(innodb_log_io_method_names) - 1, + "innodb_log_io_method_typelib", + innodb_log_io_method_names, + NULL +}; + /** Retrieve the FTS Relevance Ranking result for doc with doc_id of m_prebuilt->fts_doc_id @param[in,out] fts_hdl FTS handler @@ -19413,6 +19428,12 @@ static MYSQL_SYSVAR_BOOL(log_optimize_ddl, innodb_log_optimize_ddl, " allows concurrent 
backup.", NULL, NULL, TRUE); +static MYSQL_SYSVAR_ENUM(log_io_method, innodb_log_io_method, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "InnoDB redo log IO method: normal (default), mmap.", + NULL, NULL, 0, + &innodb_log_io_method_typelib); + static MYSQL_SYSVAR_ULONG(autoextend_increment, sys_tablespace_auto_extend_increment, PLUGIN_VAR_RQCMDARG, @@ -20303,6 +20324,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(log_group_home_dir), MYSQL_SYSVAR(log_compressed_pages), MYSQL_SYSVAR(log_optimize_ddl), + MYSQL_SYSVAR(log_io_method), MYSQL_SYSVAR(max_dirty_pages_pct), MYSQL_SYSVAR(max_dirty_pages_pct_lwm), MYSQL_SYSVAR(adaptive_flushing_lwm), diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index 4e619a203d9..cf19eaf297a 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -40,6 +40,7 @@ Created 12/9/1995 Heikki Tuuri #include "os0file.h" #include "span.h" #include <atomic> +#include <memory> using st_::span; @@ -449,6 +450,7 @@ typedef ib_mutex_t LogSysMutex; typedef ib_mutex_t FlushOrderMutex; extern my_bool srv_read_only_mode; +extern ulong innodb_log_io_method; /** Redo log buffer */ struct log_t{ @@ -516,6 +518,22 @@ struct log_t{ /** Log files. Protected by mutex or write_mutex. 
*/ struct files { + class file_io + { + protected: + bool durable_writes; + public: + virtual ~file_io() {} + virtual dberr_t open(const char *path)= 0; + virtual dberr_t close()= 0; + virtual dberr_t read(os_offset_t offset, span<byte> buf)= 0; + virtual dberr_t write(const char *path, os_offset_t offset, + span<byte> buf)= 0; + virtual dberr_t flush_data_only()= 0; + + bool writes_are_durable() const { return durable_writes; } + }; + /** number of files */ ulint n_files; /** format of the redo log: e.g., FORMAT_10_4 */ @@ -530,14 +548,14 @@ struct log_t{ lsn_t lsn; /** the byte offset of the above lsn */ lsn_t lsn_offset; + /** file descriptors for all log files */ + std::vector<std::unique_ptr<file_io>> files; public: /** used only in recovery: recovery scan succeeded up to this lsn in this log group */ lsn_t scanned_lsn; - /** file descriptors for all log files */ - std::vector<pfs_os_file_t> files; /** file names for all log files */ std::vector<std::string> file_names; @@ -553,6 +571,11 @@ struct log_t{ @param[in] total_offset offset in log files treated as a single file @param[in] buf buffer from which to write */ void write(size_t total_offset, span<byte> buf); + /** checks whether flush_data_only() is needed to make data persistend */ + bool writes_are_durable() const + { + return files.front()->writes_are_durable(); + } /** flushes OS page cache (excluding metadata!) for all log files */ void flush_data_only(); /** closes all log files */ diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index d0b325c05aa..043de52d737 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -84,6 +84,8 @@ reduce the size of the log. 
/** Redo log system */ log_t log_sys; +ulong innodb_log_io_method; + /* Next log block number to do dummy record filling if no log records written for a while */ static ulint next_lbn_to_pad = 0; @@ -587,6 +589,112 @@ void log_t::create() } } + +class file_os_io final: public log_t::files::file_io +{ + pfs_os_file_t fd; +public: + dberr_t open(const char *path) final + { + bool success; + fd= os_file_create(innodb_log_file_key, path, + OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT, + OS_FILE_NORMAL, OS_LOG_FILE, + srv_read_only_mode, &success); + durable_writes= srv_file_flush_method == SRV_O_DSYNC; + return success ? DB_SUCCESS : DB_ERROR; + } + dberr_t close() final { return os_file_close(fd) ? DB_SUCCESS : DB_ERROR; } + dberr_t read(os_offset_t offset, span<byte> buf) final + { + return os_file_read(IORequestRead, fd, buf.data(), offset, buf.size()); + } + dberr_t write(const char *path, os_offset_t offset, span<byte> buf) final + { + return os_file_write(IORequestWrite, path, fd, buf.data(), offset, + buf.size()); + } + dberr_t flush_data_only() final + { + return os_file_flush_data(fd) ? DB_SUCCESS : DB_ERROR; + } +}; + + +#ifdef HAVE_PMEM +#include <libpmem.h> +#endif +class file_mmap_io final: public log_t::files::file_io +{ + File fd; + void *addr; + size_t length; +public: + dberr_t open(const char *path) final + { + fd= mysql_file_open(innodb_log_file_key, path, + srv_read_only_mode ? O_RDONLY : O_RDWR, MYF(MY_WME)); + if (fd >= 0) + { + MY_STAT sb; + if (!mysql_file_fstat(fd, &sb, MYF(0))) + { + int prot= srv_read_only_mode ? PROT_READ : PROT_READ | PROT_WRITE; + length= sb.st_size; + addr= my_mmap(0, length, prot, MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0); + if (addr != MAP_FAILED) + { +#ifdef HAVE_PMEM + durable_writes= true; + ib::info() << "The redo log file is located on a DAX storage. 
" + "Writes are durable, sync disabled."; +#else + durable_writes= false; + ib::info() << "The redo log file is located on a DAX storage, " + "but persistent memory features were disabled " + "(WITH_PMEM=OFF). Page cache is bypassed, sync is " + "required to make writes durable."; +#endif + return DB_SUCCESS; + } + addr= my_mmap(0, length, prot, MAP_SHARED, fd, 0); + if (addr != MAP_FAILED) + { + durable_writes= false; + return DB_SUCCESS; + } + } + mysql_file_close(fd, MYF(MY_WME)); + } + return DB_ERROR; + } + dberr_t close() final + { + int err= my_munmap(addr, length); + return (!mysql_file_close(fd, MYF(MY_WME)) && !err) ? DB_SUCCESS : DB_ERROR; + } + dberr_t read(os_offset_t offset, span<byte> buf) final + { + memcpy(buf.data(), (char*) addr + offset, buf.size()); + return DB_SUCCESS; + } + dberr_t write(const char *, os_offset_t offset, span<byte> buf) final + { +#ifdef HAVE_PMEM + pmem_memcpy_persist((char*) addr + offset, buf.data(), buf.size()); +#else + memcpy((char*) addr + offset, buf.data(), buf.size()); +#endif + return DB_SUCCESS; + } + dberr_t flush_data_only() final + { + ut_ad(!durable_writes); + return my_msync(fd, addr, length, MS_SYNC) ? 
DB_ERROR : DB_SUCCESS; + } +}; + + void log_t::files::set_file_names(std::vector<std::string> names) { file_names= std::move(names); @@ -598,15 +706,18 @@ void log_t::files::open_files() files.reserve(file_names.size()); for (const auto &name : file_names) { - bool success; - files.push_back(os_file_create(innodb_log_file_key, name.c_str(), - OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT, - OS_FILE_NORMAL, OS_LOG_FILE, - srv_read_only_mode, &success)); - if (!success) + file_io *io; + + switch (innodb_log_io_method) { - ib::fatal() << "os_file_create(" << name << ") failed"; + case 1: io= new file_mmap_io; break; + default: io= new file_os_io; } + ut_a(io); + + if (io->open(name.c_str())) + ib::fatal() << "open(" << name << ") failed"; + files.emplace_back(io); } } @@ -617,12 +728,8 @@ void log_t::files::read(size_t total_offset, span<byte> buf) const size_t file_idx= total_offset / static_cast<size_t>(file_size); const size_t offset= total_offset % static_cast<size_t>(file_size); - if (const dberr_t err= os_file_read(IORequestRead, files[file_idx], - buf.data(), offset, buf.size())) - { - ib::fatal() << "os_file_read(" << file_names[file_idx] << ") returned " - << err; - } + if (const dberr_t err= files[file_idx]->read(offset, buf)) + ib::fatal() << "read(" << file_names[file_idx] << ") returned " << err; } void log_t::files::write(size_t total_offset, span<byte> buf) @@ -632,13 +739,9 @@ void log_t::files::write(size_t total_offset, span<byte> buf) const size_t file_idx= total_offset / static_cast<size_t>(file_size); const size_t offset= total_offset % static_cast<size_t>(file_size); - if (const dberr_t err= - os_file_write(IORequestWrite, file_names[file_idx].c_str(), - files[file_idx], buf.data(), offset, buf.size())) - { - ib::fatal() << "os_file_write(" << file_names[file_idx] << ") returned " - << err; - } + if (const dberr_t err= files[file_idx]->write(file_names[file_idx].c_str(), + offset, buf)) + ib::fatal() << "write(" << file_names[file_idx] << ") returned " 
<< err; } void log_t::files::flush_data_only() @@ -648,10 +751,10 @@ void log_t::files::flush_data_only() log_sys.pending_flushes.fetch_add(1, std::memory_order_acquire); for (auto it= files.begin(), end= files.end(); it != end; ++it) { - if (!os_file_flush_data(*it)) + if ((*it)->flush_data_only()) { const auto idx= std::distance(files.begin(), it); - ib::fatal() << "os_file_flush_data(" << file_names[idx] << ") failed"; + ib::fatal() << "flush_data_only(" << file_names[idx] << ") failed"; } } log_sys.pending_flushes.fetch_sub(1, std::memory_order_release); @@ -662,10 +765,10 @@ void log_t::files::close_files() { for (auto it= files.begin(), end= files.end(); it != end; ++it) { - if (!os_file_close(*it)) + if ((*it)->close()) { const auto idx= std::distance(files.begin(), it); - ib::fatal() << "os_file_close(" << file_names[idx] << ") failed"; + ib::fatal() << "close(" << file_names[idx] << ") failed"; } } files.clear(); @@ -934,6 +1037,11 @@ void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key) return; } + /* FIXME!!! */ + if (log_sys.log.writes_are_durable()) { + flush_to_disk= false; + } + loop: ut_ad(++loop_count < 128); |