summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Eugene Kosov <claprix@yandex.ru> 2020-02-09 22:10:28 +0400
committer Eugene Kosov <claprix@yandex.ru> 2020-02-14 14:11:10 +0300
commit 3daef523af25e4f1e4e75d2c26a9b25475f0c679 (patch)
tree 55bf364cef7fa083db1474c754824a325638a682
parent d901919db2e9faf8f9ef77b0d38e8db6c47e5e33 (diff)
download mariadb-git-3daef523af25e4f1e4e75d2c26a9b25475f0c679.tar.gz
MDEV-17084 Optimize append only files for NVDIMM
Optionally use libpmem for InnoDB redo log writing. When the server is built with -DWITH_PMEM=ON, InnoDB tries to detect whether the redo log is located on persistent memory storage and, if so, uses a faster file access method. When the server is built with -DWITH_PMEM=OFF, the preprocessor is used to ensure that no slowdown is introduced by allocations and virtual function calls, so the server is not slowed down in the common case. mapped_file_t: maps a file, unmaps it, and returns the mapped memory buffer. file_io: abstraction around memory-mapped files and file descriptors; allows writing, reading and flushing files. file_io::writes_are_durable(): a notable method of the class; when it returns true, writes are flushed immediately (no flush_data_only() call is needed). file_os_io: file-descriptor-based file access; depends on global state such as srv_read_only_mode. file_pmem_io: file access via libpmem. This is collaborative work with Sergey Vojtovich.
-rw-r--r-- storage/innobase/CMakeLists.txt 4
-rw-r--r-- storage/innobase/include/log0log.h 95
-rw-r--r-- storage/innobase/log/log0log.cc 224
3 files changed, 272 insertions, 51 deletions
diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt
index b2cf1620ba4..23780f1ff8a 100644
--- a/storage/innobase/CMakeLists.txt
+++ b/storage/innobase/CMakeLists.txt
@@ -194,3 +194,7 @@ IF(NOT (PLUGIN_INNOBASE STREQUAL DYNAMIC))
TARGET_LINK_LIBRARIES(innobase tpool)
ADD_SUBDIRECTORY(${CMAKE_SOURCE_DIR}/extra/mariabackup ${CMAKE_BINARY_DIR}/extra/mariabackup)
ENDIF()
+
+IF(LIBPMEM)
+ TARGET_LINK_LIBRARIES(innobase LINK_PRIVATE ${LIBPMEM})
+ENDIF()
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
index 49851cd6929..bb55a4dab81 100644
--- a/storage/innobase/include/log0log.h
+++ b/storage/innobase/include/log0log.h
@@ -448,35 +448,90 @@ or the MySQL version that created the redo log file. */
typedef ib_mutex_t LogSysMutex;
typedef ib_mutex_t FlushOrderMutex;
-/** RAII wrapper over path and file descriptor. Supposed to be used for log
-files only */
-class log_file_t
+/** Memory mapped file */
+class mapped_file_t
+{
+public:
+ mapped_file_t()= default;
+ mapped_file_t(const mapped_file_t &)= delete;
+ mapped_file_t &operator=(const mapped_file_t &)= delete;
+ mapped_file_t(mapped_file_t &&)= delete;
+ mapped_file_t &operator=(mapped_file_t &&)= delete;
+ ~mapped_file_t() noexcept;
+
+ dberr_t map(const char *path, int flags= 0) noexcept;
+ dberr_t unmap() noexcept;
+ byte *data() noexcept { return m_area.data(); }
+
+private:
+ span<byte> m_area;
+};
+
+/** Abstraction for reading, writing and flushing file cache to disk */
+class file_io
{
public:
- log_file_t()= default;
- log_file_t(std::string path) : m_path{std::move(path)} {}
+ file_io(bool durable_writes= false) : m_durable_writes(durable_writes) {}
+ virtual ~file_io() noexcept {};
+ virtual dberr_t open(const char *path) noexcept= 0;
+ virtual dberr_t rename(const char *old_path,
+ const char *new_path) noexcept= 0;
+ virtual dberr_t close() noexcept= 0;
+ virtual dberr_t read(os_offset_t offset, span<byte> buf) noexcept= 0;
+ virtual dberr_t write(const char *path, os_offset_t offset,
+ span<const byte> buf) noexcept= 0;
+ virtual dberr_t flush_data_only() noexcept= 0;
+
+ /** Durable writes don't require calling flush_data_only() */
+ bool writes_are_durable() const noexcept { return m_durable_writes; }
+
+protected:
+ bool m_durable_writes;
+};
- log_file_t(const log_file_t &)= delete;
- log_file_t &operator=(const log_file_t &)= delete;
+class file_os_io : public file_io
+{
+public:
+ file_os_io()= default;
+ file_os_io(const file_os_io &)= delete;
+ file_os_io &operator=(const file_os_io &)= delete;
+ file_os_io(file_os_io &&rhs);
+ file_os_io &operator=(file_os_io &&rhs);
+ ~file_os_io() noexcept;
+
+ dberr_t open(const char *path) noexcept final;
+ bool is_opened() const noexcept { return m_fd != OS_FILE_CLOSED; }
+ dberr_t rename(const char *old_path, const char *new_path) noexcept final;
+ dberr_t close() noexcept final;
+ dberr_t read(os_offset_t offset, span<byte> buf) noexcept final;
+ dberr_t write(const char *path, os_offset_t offset,
+ span<const byte> buf) noexcept final;
+ dberr_t flush_data_only() noexcept final;
- log_file_t(log_file_t &&rhs);
- log_file_t &operator=(log_file_t &&rhs);
+private:
+ pfs_os_file_t m_fd{OS_FILE_CLOSED};
+};
- ~log_file_t();
+/** File abstraction + path */
+class log_file_t
+{
+public:
+ log_file_t(std::string path= "") noexcept : m_path{std::move(path)} {}
- bool open();
+ dberr_t open() noexcept;
+ bool is_opened() const noexcept;
- bool is_opened() const { return m_fd != OS_FILE_CLOSED; }
- const std::string get_path() const { return m_path; }
+ const std::string &get_path() const noexcept { return m_path; }
- dberr_t rename(std::string new_path);
- bool close();
- dberr_t read(os_offset_t offset, span<byte> buf);
- dberr_t write(os_offset_t offset, span<const byte> buf);
- bool flush_data_only();
+ dberr_t rename(std::string new_path) noexcept;
+ dberr_t close() noexcept;
+ dberr_t read(os_offset_t offset, span<byte> buf) noexcept;
+ bool writes_are_durable() const noexcept;
+ dberr_t write(os_offset_t offset, span<const byte> buf) noexcept;
+ dberr_t flush_data_only() noexcept;
private:
- pfs_os_file_t m_fd;
+ std::unique_ptr<file_io> m_file;
std::string m_path;
};
@@ -579,6 +634,8 @@ struct log_t{
@param[in] total_offset offset in log files treated as a single file
@param[in] buf buffer where to read */
void read(os_offset_t total_offset, span<byte> buf);
+ /** Tells whether writes are durable, i.e. need no flush_data_only() call */
+ bool writes_are_durable() const noexcept;
/** writes buffer to log files
@param[in] total_offset offset in log files treated as a single file
@param[in] buf buffer from which to write */
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
index 5a9975f5429..723b3dbc0c5 100644
--- a/storage/innobase/log/log0log.cc
+++ b/storage/innobase/log/log0log.cc
@@ -568,74 +568,229 @@ void log_t::create()
}
}
-log_file_t::log_file_t(log_file_t &&rhs)
+mapped_file_t::~mapped_file_t() noexcept
+{
+ if (!m_area.empty())
+ unmap();
+}
+
+dberr_t mapped_file_t::map(const char *path, int flags) noexcept
+{
+ auto fd=
+ mysql_file_open(innodb_log_file_key, path,
+ srv_read_only_mode ? O_RDONLY : O_RDWR, MYF(MY_WME));
+
+ if (fd == -1)
+ return DB_ERROR;
+
+ MY_STAT stat;
+ if (mysql_file_fstat(fd, &stat, MYF(0)))
+ {
+ mysql_file_close(fd, MYF(MY_WME));
+ return DB_ERROR;
+ }
+
+ void *ptr= my_mmap(0, stat.st_size,
+ srv_read_only_mode ? PROT_READ : PROT_READ | PROT_WRITE,
+ MAP_SHARED_VALIDATE | flags, fd, 0);
+ mysql_file_close(fd, MYF(MY_WME));
+
+ if (ptr == MAP_FAILED)
+ return DB_ERROR;
+
+ m_area= {static_cast<byte *>(ptr),
+ static_cast<span<byte>::index_type>(stat.st_size)};
+ return DB_SUCCESS;
+}
+
+dberr_t mapped_file_t::unmap() noexcept
+{
+ ut_ad(!m_area.empty());
+
+ if (my_munmap(m_area.data(), m_area.size()))
+ return DB_ERROR;
+
+ m_area= {};
+ return DB_SUCCESS;
+}
+
+file_os_io::file_os_io(file_os_io &&rhs) : m_fd(rhs.m_fd)
{
- m_fd= std::move(rhs.m_fd);
rhs.m_fd= OS_FILE_CLOSED;
- m_path= std::move(rhs.m_path);
}
-log_file_t &log_file_t::operator=(log_file_t &&rhs)
+
+file_os_io &file_os_io::operator=(file_os_io &&rhs)
{
std::swap(m_fd, rhs.m_fd);
- std::swap(m_path, rhs.m_path);
return *this;
}
-log_file_t::~log_file_t()
+file_os_io::~file_os_io() noexcept
{
if (is_opened())
- os_file_close(m_fd);
+ close();
}
-bool log_file_t::open()
+dberr_t file_os_io::open(const char *path) noexcept
{
- ut_a(!is_opened());
+ ut_ad(!is_opened());
bool success;
- m_fd= os_file_create(innodb_log_file_key, m_path.c_str(),
- OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL,
- OS_LOG_FILE, srv_read_only_mode, &success);
+ auto tmp_fd= os_file_create(
+ innodb_log_file_key, path, OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT,
+ OS_FILE_NORMAL, OS_LOG_FILE, srv_read_only_mode, &success);
if (!success)
- m_fd= OS_FILE_CLOSED;
+ return DB_ERROR;
- return success;
+ m_durable_writes= srv_file_flush_method == SRV_O_DSYNC;
+ m_fd= tmp_fd;
+ return success ? DB_SUCCESS : DB_ERROR;
}
-dberr_t log_file_t::rename(std::string new_path)
+dberr_t file_os_io::rename(const char *old_path, const char *new_path) noexcept
{
- if (!os_file_rename(innodb_log_file_key, m_path.c_str(),
- new_path.c_str())) {
+ return os_file_rename(innodb_log_file_key, old_path, new_path) ? DB_SUCCESS
+ : DB_ERROR;
+}
+
+dberr_t file_os_io::close() noexcept
+{
+ if (!os_file_close(m_fd))
return DB_ERROR;
+
+ m_fd= OS_FILE_CLOSED;
+ return DB_SUCCESS;
+}
+
+dberr_t file_os_io::read(os_offset_t offset, span<byte> buf) noexcept
+{
+ return os_file_read(IORequestRead, m_fd, buf.data(), offset, buf.size());
+}
+
+dberr_t file_os_io::write(const char *path, os_offset_t offset,
+ span<const byte> buf) noexcept
+{
+ return os_file_write(IORequestWrite, path, m_fd, buf.data(), offset,
+ buf.size());
+}
+
+dberr_t file_os_io::flush_data_only() noexcept
+{
+ return os_file_flush_data(m_fd) ? DB_SUCCESS : DB_ERROR;
+}
+
+#ifdef HAVE_PMEM
+
+#include <libpmem.h>
+
+static bool is_pmem(const char *path) noexcept
+{
+ mapped_file_t mf;
+ return mf.map(path, MAP_SYNC) == DB_SUCCESS ? true : false;
+}
+
+class file_pmem_io final : public file_io
+{
+public:
+ file_pmem_io() noexcept : file_io(true) {}
+
+ dberr_t open(const char *path) noexcept final
+ {
+ return m_file.map(path, MAP_SYNC);
+ }
+ dberr_t rename(const char *old_path, const char *new_path) noexcept final
+ {
+ return os_file_rename(innodb_log_file_key, old_path, new_path) ? DB_SUCCESS
+ : DB_ERROR;
+ }
+ dberr_t close() noexcept final { return m_file.unmap(); }
+ dberr_t read(os_offset_t offset, span<byte> buf) noexcept final
+ {
+ memcpy(buf.data(), m_file.data() + offset, buf.size());
+ return DB_SUCCESS;
+ }
+ dberr_t write(const char *, os_offset_t offset,
+ span<const byte> buf) noexcept final
+ {
+ pmem_memcpy_persist(m_file.data() + offset, buf.data(), buf.size());
+ return DB_SUCCESS;
}
+ dberr_t flush_data_only() noexcept final
+ {
+ ut_ad(0);
+ return DB_SUCCESS;
+ }
+
+private:
+ mapped_file_t m_file;
+};
+#endif
+
+dberr_t log_file_t::open() noexcept
+{
+ ut_a(!is_opened());
+
+#ifdef HAVE_PMEM
+ auto ptr= is_pmem(m_path.c_str())
+ ? std::unique_ptr<file_io>(new file_pmem_io)
+ : std::unique_ptr<file_io>(new file_os_io);
+#else
+ auto ptr= std::unique_ptr<file_io>(new file_os_io);
+#endif
+
+ if (dberr_t err= ptr->open(m_path.c_str()))
+ return err;
+
+ m_file= std::move(ptr);
+ return DB_SUCCESS;
+}
+
+bool log_file_t::is_opened() const noexcept
+{
+ return static_cast<bool>(m_file);
+}
+
+dberr_t log_file_t::rename(std::string new_path) noexcept
+{
+ if (dberr_t err= m_file->rename(m_path.c_str(), new_path.c_str()))
+ return err;
+
m_path = std::move(new_path);
return DB_SUCCESS;
}
-bool log_file_t::close()
+dberr_t log_file_t::close() noexcept
{
ut_a(is_opened());
- bool result= os_file_close(m_fd);
- m_fd= OS_FILE_CLOSED;
- return result;
+
+ if (dberr_t err= m_file->close())
+ return err;
+
+ m_file.reset();
+ return DB_SUCCESS;
}
-dberr_t log_file_t::read(os_offset_t offset, span<byte> buf)
+dberr_t log_file_t::read(os_offset_t offset, span<byte> buf) noexcept
{
ut_ad(is_opened());
- return os_file_read(IORequestRead, m_fd, buf.data(), offset, buf.size());
+ return m_file->read(offset, buf);
}
-dberr_t log_file_t::write(os_offset_t offset, span<const byte> buf)
+bool log_file_t::writes_are_durable() const noexcept
+{
+ return m_file->writes_are_durable();
+}
+
+dberr_t log_file_t::write(os_offset_t offset, span<const byte> buf) noexcept
{
ut_ad(is_opened());
- return os_file_write(IORequestWrite, m_path.c_str(), m_fd, buf.data(),
- offset, buf.size());
+ return m_file->write(m_path.c_str(), offset, buf);
}
-bool log_file_t::flush_data_only()
+dberr_t log_file_t::flush_data_only() noexcept
{
ut_ad(is_opened());
- return os_file_flush_data(m_fd);
+ return m_file->flush_data_only();
}
void log_t::files::open_files(std::vector<std::string> paths)
@@ -645,8 +800,8 @@ void log_t::files::open_files(std::vector<std::string> paths)
for (auto &&path : paths)
{
files.push_back(std::move(path));
- if (!files.back().open())
- ib::fatal() << "create(" << files.back().get_path() << ") failed";
+ if (files.back().open() != DB_SUCCESS)
+ ib::fatal() << "open(" << files.back().get_path() << ") failed";
}
}
@@ -659,6 +814,11 @@ void log_t::files::read(os_offset_t total_offset, span<byte> buf)
ib::fatal() << "read(" << file.get_path() << ") returned " << err;
}
+bool log_t::files::writes_are_durable() const noexcept
+{
+ return files[0].writes_are_durable();
+}
+
void log_t::files::write(os_offset_t total_offset, span<byte> buf)
{
auto &file= files[static_cast<size_t>(total_offset / file_size)];
@@ -673,7 +833,7 @@ void log_t::files::flush_data_only()
log_sys.pending_flushes.fetch_add(1, std::memory_order_acquire);
for (auto &file : files)
{
- if (!file.flush_data_only())
+ if (file.flush_data_only() != DB_SUCCESS)
ib::fatal() << "flush_data_only(" << file.get_path() << ") failed";
}
log_sys.pending_flushes.fetch_sub(1, std::memory_order_release);
@@ -684,7 +844,7 @@ void log_t::files::close_files()
{
for (auto &file : files)
{
- if (file.is_opened() && !file.close())
+ if (file.is_opened() && file.close() != DB_SUCCESS)
ib::fatal() << "close(" << file.get_path() << ") failed";
}
}