diff options
author | Eugene Kosov <claprix@yandex.ru> | 2020-02-09 22:10:28 +0400 |
---|---|---|
committer | Eugene Kosov <claprix@yandex.ru> | 2020-02-14 14:11:10 +0300 |
commit | 3daef523af25e4f1e4e75d2c26a9b25475f0c679 (patch) | |
tree | 55bf364cef7fa083db1474c754824a325638a682 | |
parent | d901919db2e9faf8f9ef77b0d38e8db6c47e5e33 (diff) | |
download | mariadb-git-3daef523af25e4f1e4e75d2c26a9b25475f0c679.tar.gz |
MDEV-17084 Optimize append only files for NVDIMM
Optionally use libpmem for InnoDB redo log writing.
When the server is built with -DWITH_PMEM=ON, InnoDB tries to detect
whether the redo log is located on persistent memory storage and, if so,
uses a faster file access method.
When the server is built with -DWITH_PMEM=OFF, the preprocessor is
used to ensure that no slowdown is introduced by allocations
and virtual function calls. So, we don't slow down the server
in the common case.
mapped_file_t: maps a file, unmaps it, and returns the mapped memory buffer
file_io: abstraction around memory mapped files and file descriptors.
Allows writing, reading and flushing to files.
file_io::writes_are_durable(): a notable method of the class.
When it returns true, writes are flushed immediately.
file_os_io: file-descriptor-based file access. Depends on global state
such as srv_read_only_mode
file_pmem_io: file access via libpmem
This is a collaborative work with Sergey Vojtovich
-rw-r--r-- | storage/innobase/CMakeLists.txt | 4 | ||||
-rw-r--r-- | storage/innobase/include/log0log.h | 95 | ||||
-rw-r--r-- | storage/innobase/log/log0log.cc | 224 |
3 files changed, 272 insertions, 51 deletions
diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt index b2cf1620ba4..23780f1ff8a 100644 --- a/storage/innobase/CMakeLists.txt +++ b/storage/innobase/CMakeLists.txt @@ -194,3 +194,7 @@ IF(NOT (PLUGIN_INNOBASE STREQUAL DYNAMIC)) TARGET_LINK_LIBRARIES(innobase tpool) ADD_SUBDIRECTORY(${CMAKE_SOURCE_DIR}/extra/mariabackup ${CMAKE_BINARY_DIR}/extra/mariabackup) ENDIF() + +IF(LIBPMEM) + TARGET_LINK_LIBRARIES(innobase LINK_PRIVATE ${LIBPMEM}) +ENDIF() diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index 49851cd6929..bb55a4dab81 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -448,35 +448,90 @@ or the MySQL version that created the redo log file. */ typedef ib_mutex_t LogSysMutex; typedef ib_mutex_t FlushOrderMutex; -/** RAII wrapper over path and file descriptor. Supposed to be used for log -files only */ -class log_file_t +/** Memory mapped file */ +class mapped_file_t +{ +public: + mapped_file_t()= default; + mapped_file_t(const mapped_file_t &)= delete; + mapped_file_t &operator=(const mapped_file_t &)= delete; + mapped_file_t(mapped_file_t &&)= delete; + mapped_file_t &operator=(mapped_file_t &&)= delete; + ~mapped_file_t() noexcept; + + dberr_t map(const char *path, int flags= 0) noexcept; + dberr_t unmap() noexcept; + byte *data() noexcept { return m_area.data(); } + +private: + span<byte> m_area; +}; + +/** Abstraction for reading, writing and flushing file cache to disk */ +class file_io { public: - log_file_t()= default; - log_file_t(std::string path) : m_path{std::move(path)} {} + file_io(bool durable_writes= false) : m_durable_writes(durable_writes) {} + virtual ~file_io() noexcept {}; + virtual dberr_t open(const char *path) noexcept= 0; + virtual dberr_t rename(const char *old_path, + const char *new_path) noexcept= 0; + virtual dberr_t close() noexcept= 0; + virtual dberr_t read(os_offset_t offset, span<byte> buf) noexcept= 0; + virtual dberr_t 
write(const char *path, os_offset_t offset, + span<const byte> buf) noexcept= 0; + virtual dberr_t flush_data_only() noexcept= 0; + + /** Durable writes doesn't require calling flush_data_only() */ + bool writes_are_durable() const noexcept { return m_durable_writes; } + +protected: + bool m_durable_writes; +}; - log_file_t(const log_file_t &)= delete; - log_file_t &operator=(const log_file_t &)= delete; +class file_os_io : public file_io +{ +public: + file_os_io()= default; + file_os_io(const file_os_io &)= delete; + file_os_io &operator=(const file_os_io &)= delete; + file_os_io(file_os_io &&rhs); + file_os_io &operator=(file_os_io &&rhs); + ~file_os_io() noexcept; + + dberr_t open(const char *path) noexcept final; + bool is_opened() const noexcept { return m_fd != OS_FILE_CLOSED; } + dberr_t rename(const char *old_path, const char *new_path) noexcept final; + dberr_t close() noexcept final; + dberr_t read(os_offset_t offset, span<byte> buf) noexcept final; + dberr_t write(const char *path, os_offset_t offset, + span<const byte> buf) noexcept final; + dberr_t flush_data_only() noexcept final; - log_file_t(log_file_t &&rhs); - log_file_t &operator=(log_file_t &&rhs); +private: + pfs_os_file_t m_fd{OS_FILE_CLOSED}; +}; - ~log_file_t(); +/** File abstraction + path */ +class log_file_t +{ +public: + log_file_t(std::string path= "") noexcept : m_path{std::move(path)} {} - bool open(); + dberr_t open() noexcept; + bool is_opened() const noexcept; - bool is_opened() const { return m_fd != OS_FILE_CLOSED; } - const std::string get_path() const { return m_path; } + const std::string &get_path() const noexcept { return m_path; } - dberr_t rename(std::string new_path); - bool close(); - dberr_t read(os_offset_t offset, span<byte> buf); - dberr_t write(os_offset_t offset, span<const byte> buf); - bool flush_data_only(); + dberr_t rename(std::string new_path) noexcept; + dberr_t close() noexcept; + dberr_t read(os_offset_t offset, span<byte> buf) noexcept; + bool 
writes_are_durable() const noexcept; + dberr_t write(os_offset_t offset, span<const byte> buf) noexcept; + dberr_t flush_data_only() noexcept; private: - pfs_os_file_t m_fd; + std::unique_ptr<file_io> m_file; std::string m_path; }; @@ -579,6 +634,8 @@ struct log_t{ @param[in] total_offset offset in log files treated as a single file @param[in] buf buffer where to read */ void read(os_offset_t total_offset, span<byte> buf); + /** Tells whether writes require calling flush_data_only() */ + bool writes_are_durable() const noexcept; /** writes buffer to log files @param[in] total_offset offset in log files treated as a single file @param[in] buf buffer from which to write */ diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index 5a9975f5429..723b3dbc0c5 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -568,74 +568,229 @@ void log_t::create() } } -log_file_t::log_file_t(log_file_t &&rhs) +mapped_file_t::~mapped_file_t() noexcept +{ + if (!m_area.empty()) + unmap(); +} + +dberr_t mapped_file_t::map(const char *path, int flags) noexcept +{ + auto fd= + mysql_file_open(innodb_log_file_key, path, + srv_read_only_mode ? O_RDONLY : O_RDWR, MYF(MY_WME)); + + if (fd == -1) + return DB_ERROR; + + MY_STAT stat; + if (mysql_file_fstat(fd, &stat, MYF(0))) + { + mysql_file_close(fd, MYF(MY_WME)); + return DB_ERROR; + } + + void *ptr= my_mmap(0, stat.st_size, + srv_read_only_mode ? 
PROT_READ : PROT_READ | PROT_WRITE, + MAP_SHARED_VALIDATE | flags, fd, 0); + mysql_file_close(fd, MYF(MY_WME)); + + if (ptr == MAP_FAILED) + return DB_ERROR; + + m_area= {static_cast<byte *>(ptr), + static_cast<span<byte>::index_type>(stat.st_size)}; + return DB_SUCCESS; +} + +dberr_t mapped_file_t::unmap() noexcept +{ + ut_ad(!m_area.empty()); + + if (my_munmap(m_area.data(), m_area.size())) + return DB_ERROR; + + m_area= {}; + return DB_SUCCESS; +} + +file_os_io::file_os_io(file_os_io &&rhs) : m_fd(rhs.m_fd) { - m_fd= std::move(rhs.m_fd); rhs.m_fd= OS_FILE_CLOSED; - m_path= std::move(rhs.m_path); } -log_file_t &log_file_t::operator=(log_file_t &&rhs) + +file_os_io &file_os_io::operator=(file_os_io &&rhs) { std::swap(m_fd, rhs.m_fd); - std::swap(m_path, rhs.m_path); return *this; } -log_file_t::~log_file_t() +file_os_io::~file_os_io() noexcept { if (is_opened()) - os_file_close(m_fd); + close(); } -bool log_file_t::open() +dberr_t file_os_io::open(const char *path) noexcept { - ut_a(!is_opened()); + ut_ad(!is_opened()); bool success; - m_fd= os_file_create(innodb_log_file_key, m_path.c_str(), - OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL, - OS_LOG_FILE, srv_read_only_mode, &success); + auto tmp_fd= os_file_create( + innodb_log_file_key, path, OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT, + OS_FILE_NORMAL, OS_LOG_FILE, srv_read_only_mode, &success); if (!success) - m_fd= OS_FILE_CLOSED; + return DB_ERROR; - return success; + m_durable_writes= srv_file_flush_method == SRV_O_DSYNC; + m_fd= tmp_fd; + return success ? DB_SUCCESS : DB_ERROR; } -dberr_t log_file_t::rename(std::string new_path) +dberr_t file_os_io::rename(const char *old_path, const char *new_path) noexcept { - if (!os_file_rename(innodb_log_file_key, m_path.c_str(), - new_path.c_str())) { + return os_file_rename(innodb_log_file_key, old_path, new_path) ? 
DB_SUCCESS + : DB_ERROR; +} + +dberr_t file_os_io::close() noexcept +{ + if (!os_file_close(m_fd)) return DB_ERROR; + + m_fd= OS_FILE_CLOSED; + return DB_SUCCESS; +} + +dberr_t file_os_io::read(os_offset_t offset, span<byte> buf) noexcept +{ + return os_file_read(IORequestRead, m_fd, buf.data(), offset, buf.size()); +} + +dberr_t file_os_io::write(const char *path, os_offset_t offset, + span<const byte> buf) noexcept +{ + return os_file_write(IORequestWrite, path, m_fd, buf.data(), offset, + buf.size()); +} + +dberr_t file_os_io::flush_data_only() noexcept +{ + return os_file_flush_data(m_fd) ? DB_SUCCESS : DB_ERROR; +} + +#ifdef HAVE_PMEM + +#include <libpmem.h> + +static bool is_pmem(const char *path) noexcept +{ + mapped_file_t mf; + return mf.map(path, MAP_SYNC) == DB_SUCCESS ? true : false; +} + +class file_pmem_io final : public file_io +{ +public: + file_pmem_io() noexcept : file_io(true) {} + + dberr_t open(const char *path) noexcept final + { + return m_file.map(path, MAP_SYNC); + } + dberr_t rename(const char *old_path, const char *new_path) noexcept final + { + return os_file_rename(innodb_log_file_key, old_path, new_path) ? DB_SUCCESS + : DB_ERROR; + } + dberr_t close() noexcept final { return m_file.unmap(); } + dberr_t read(os_offset_t offset, span<byte> buf) noexcept final + { + memcpy(buf.data(), m_file.data() + offset, buf.size()); + return DB_SUCCESS; + } + dberr_t write(const char *, os_offset_t offset, + span<const byte> buf) noexcept final + { + pmem_memcpy_persist(m_file.data() + offset, buf.data(), buf.size()); + return DB_SUCCESS; } + dberr_t flush_data_only() noexcept final + { + ut_ad(0); + return DB_SUCCESS; + } + +private: + mapped_file_t m_file; +}; +#endif + +dberr_t log_file_t::open() noexcept +{ + ut_a(!is_opened()); + +#ifdef HAVE_PMEM + auto ptr= is_pmem(m_path.c_str()) + ? 
std::unique_ptr<file_io>(new file_pmem_io) + : std::unique_ptr<file_io>(new file_os_io); +#else + auto ptr= std::unique_ptr<file_io>(new file_os_io); +#endif + + if (dberr_t err= ptr->open(m_path.c_str())) + return err; + + m_file= std::move(ptr); + return DB_SUCCESS; +} + +bool log_file_t::is_opened() const noexcept +{ + return static_cast<bool>(m_file); +} + +dberr_t log_file_t::rename(std::string new_path) noexcept +{ + if (dberr_t err= m_file->rename(m_path.c_str(), new_path.c_str())) + return err; + m_path = std::move(new_path); return DB_SUCCESS; } -bool log_file_t::close() +dberr_t log_file_t::close() noexcept { ut_a(is_opened()); - bool result= os_file_close(m_fd); - m_fd= OS_FILE_CLOSED; - return result; + + if (dberr_t err= m_file->close()) + return err; + + m_file.reset(); + return DB_SUCCESS; } -dberr_t log_file_t::read(os_offset_t offset, span<byte> buf) +dberr_t log_file_t::read(os_offset_t offset, span<byte> buf) noexcept { ut_ad(is_opened()); - return os_file_read(IORequestRead, m_fd, buf.data(), offset, buf.size()); + return m_file->read(offset, buf); } -dberr_t log_file_t::write(os_offset_t offset, span<const byte> buf) +bool log_file_t::writes_are_durable() const noexcept +{ + return m_file->writes_are_durable(); +} + +dberr_t log_file_t::write(os_offset_t offset, span<const byte> buf) noexcept { ut_ad(is_opened()); - return os_file_write(IORequestWrite, m_path.c_str(), m_fd, buf.data(), - offset, buf.size()); + return m_file->write(m_path.c_str(), offset, buf); } -bool log_file_t::flush_data_only() +dberr_t log_file_t::flush_data_only() noexcept { ut_ad(is_opened()); - return os_file_flush_data(m_fd); + return m_file->flush_data_only(); } void log_t::files::open_files(std::vector<std::string> paths) @@ -645,8 +800,8 @@ void log_t::files::open_files(std::vector<std::string> paths) for (auto &&path : paths) { files.push_back(std::move(path)); - if (!files.back().open()) - ib::fatal() << "create(" << files.back().get_path() << ") failed"; + if 
(files.back().open() != DB_SUCCESS) + ib::fatal() << "open(" << files.back().get_path() << ") failed"; } } @@ -659,6 +814,11 @@ void log_t::files::read(os_offset_t total_offset, span<byte> buf) ib::fatal() << "read(" << file.get_path() << ") returned " << err; } +bool log_t::files::writes_are_durable() const noexcept +{ + return files[0].writes_are_durable(); +} + void log_t::files::write(os_offset_t total_offset, span<byte> buf) { auto &file= files[static_cast<size_t>(total_offset / file_size)]; @@ -673,7 +833,7 @@ void log_t::files::flush_data_only() log_sys.pending_flushes.fetch_add(1, std::memory_order_acquire); for (auto &file : files) { - if (!file.flush_data_only()) + if (file.flush_data_only() != DB_SUCCESS) ib::fatal() << "flush_data_only(" << file.get_path() << ") failed"; } log_sys.pending_flushes.fetch_sub(1, std::memory_order_release); @@ -684,7 +844,7 @@ void log_t::files::close_files() { for (auto &file : files) { - if (file.is_opened() && !file.close()) + if (file.is_opened() && file.close() != DB_SUCCESS) ib::fatal() << "close(" << file.get_path() << ") failed"; } } |