summary | refs | log | tree | commit | diff
path: root/storage/innobase/log/log0log.cc
diff options
context:
space:
mode:
Diffstat (limited to 'storage/innobase/log/log0log.cc')
-rw-r--r-- storage/innobase/log/log0log.cc 269
1 files changed, 212 insertions, 57 deletions
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
index 83b78ebf385..f1dd89fd5d2 100644
--- a/storage/innobase/log/log0log.cc
+++ b/storage/innobase/log/log0log.cc
@@ -74,6 +74,38 @@ log_t log_sys;
#define LOG_BUF_FLUSH_MARGIN ((4 * 4096) /* cf. log_t::append_prepare() */ \
+ (4U << srv_page_size_shift))
+/**
+ Group commit completion callback used for anything
+ that can run asynchronously.
+*/
+static const completion_callback async_io_callback{nullptr, nullptr};
+
+/**
+ Group commit completion callback that forces synchronous IO.
+*/
+static const completion_callback sync_io_callback{nullptr, nullptr};
+
+#ifndef DBUG_OFF
+/**
+ Crash after a disk flush, as requested via the dbug_debug flag.
+ The flush can be executed by a background thread,
+ where DBUG_EXECUTE_IF() does not work; thus the value
+ is passed via a global variable.
+*/
+static bool crash_after_flush;
+#endif
+
+static void report_aio_error(const char *text, tpool::aiocb *cb);
+
+/** AIO control block with auxiliary information, for async writing.
+Protected by write_lock. */
+struct Log_aiocb : tpool::aiocb
+{
+ lsn_t lsn;
+ bool durable;
+};
+static Log_aiocb log_aiocb;
+
void log_t::set_capacity()
{
#ifndef SUX_LOCK_GENERIC
@@ -155,12 +187,29 @@ dberr_t log_file_t::read(os_offset_t offset, span<byte> buf) noexcept
return os_file_read(IORequestRead, m_file, buf.data(), offset, buf.size());
}
-void log_file_t::write(os_offset_t offset, span<const byte> buf) noexcept
+void log_file_t::write(os_offset_t offset, span<const byte> buf,
+ tpool::aiocb *iocb) noexcept
{
ut_ad(is_opened());
- if (dberr_t err= os_file_write_func(IORequestWrite, "ib_logfile0", m_file,
+ if (iocb)
+ {
+ ut_ad(buf.size() < UINT_MAX);
+ iocb->m_fh= m_file;
+ iocb->m_opcode= tpool::aio_opcode::AIO_PWRITE;
+ iocb->m_offset= offset;
+ iocb->m_buffer= (void *) buf.data();
+ iocb->m_len= (unsigned) buf.size();
+ if (srv_thread_pool->submit_io(iocb))
+ {
+ iocb->m_err= IF_WIN(GetLastError(), errno);
+ report_aio_error("submitting asynchronous write to ib_logfile0", iocb);
+ }
+ }
+ else if (dberr_t err= os_file_write(IORequestWrite, "ib_logfile0", m_file,
buf.data(), offset, buf.size()))
+ {
ib::fatal() << "write(\"ib_logfile0\") returned " << err;
+ }
}
#ifdef HAVE_PMEM
@@ -505,10 +554,13 @@ void log_t::resize_abort() noexcept
/** Write an aligned buffer to ib_logfile0.
@param buf buffer to be written
@param len length of data to be written
-@param offset log file offset */
-static void log_write_buf(const byte *buf, size_t len, lsn_t offset)
+@param offset log file offset
+@param cb completion callback */
+static void log_write_buf(const byte *buf, size_t len, lsn_t offset,
+ tpool::aiocb *cb)
{
- ut_ad(write_lock.is_owner());
+ ut_ad(cb ? !write_lock.has_owner() : write_lock.is_owner());
+ ut_ad(write_lock.locked());
ut_ad(!recv_no_log_write);
ut_d(const size_t block_size_1= log_sys.get_block_size() - 1);
ut_ad(!(offset & block_size_1));
@@ -519,7 +571,7 @@ static void log_write_buf(const byte *buf, size_t len, lsn_t offset)
if (UNIV_LIKELY(offset + len <= log_sys.file_size))
{
write:
- log_sys.log.write(offset, {buf, len});
+ log_sys.log.write(offset, {buf, len}, cb);
return;
}
@@ -730,30 +782,110 @@ ATTRIBUTE_COLD void log_t::resize_write_buf(size_t length) noexcept
resize_flush_buf, offset, length) == DB_SUCCESS);
}
+
+static void report_aio_error(const char *text, tpool::aiocb *cb)
+{
+ ib::fatal() << "IO Error "
+ << cb->m_err IF_WIN(, << " " << strerror(cb->m_err)) << " "
+ << text << "," << cb->m_len << " bytes at offset "
+ << cb->m_offset;
+}
+
+/** Ensure that previous log writes are durable.
+@return new durable lsn target
+@retval 0 if caller does not need to call log_write_up_to() again
+
+*/
+static lsn_t log_flush()
+{
+ ut_ad(!log_sys.is_pmem());
+ lsn_t lsn= write_lock.value();
+ ut_a(log_sys.flush(lsn));
+#ifndef DBUG_OFF
+ if (crash_after_flush)
+ DBUG_SUICIDE();
+#endif
+ return flush_lock.release(lsn);
+}
+
+
+/** Complete write_buf().
+@param lsn new value of write_lsn
+@param durable whether the write was durable
+@return new write target
+@retval 0 if there is no need to call log_write_up_to() */
+inline lsn_t log_t::complete_write_buf(lsn_t lsn, bool durable) noexcept
+{
+ ut_ad(write_lock.is_owner());
+ ut_ad(durable == flush_lock.is_owner());
+
+ ut_a(lsn >= write_lsn);
+
+ write_lsn= lsn;
+ lsn_t pending_lsn= write_lock.release(lsn);
+ if (durable)
+ pending_lsn= std::max(pending_lsn, log_flush());
+ return pending_lsn;
+}
+
+static void aio_complete_write_buf(void *p)
+{
+ ut_ad(write_lock.locked());
+
+ Log_aiocb *cb= static_cast<Log_aiocb *>(p);
+ if (cb->m_err)
+ report_aio_error("in asynchronous write to ib_logfile0", cb);
+ const bool durable{cb->durable};
+#ifdef UNIV_DEBUG
+ if (durable)
+ {
+ ut_ad(flush_lock.locked());
+ flush_lock.set_owner();
+ }
+ write_lock.set_owner();
+#endif
+
+ if (lsn_t ret_lsn= log_sys.complete_write_buf(cb->lsn, durable))
+ {
+ /** Prevent stalls. Also, force the special synchronous callback
+ as an optimization: this avoids threadpool machinery and context
+ switching (we are already in a background thread here).
+ */
+ log_write_up_to(ret_lsn, durable, &sync_io_callback);
+ }
+}
+
+
/** Write buf to ib_logfile0.
@tparam release_latch whether to invoke latch.wr_unlock()
-@return the current log sequence number */
-template<bool release_latch> inline lsn_t log_t::write_buf() noexcept
+@param durable whether to invoke a durable write
+@param sync whether to invoke a synchronous write
+@return new write target
+@retval 0 if there is no need to call log_write_up_to() */
+template<bool release_latch>
+inline lsn_t log_t::write_buf(bool durable, bool sync) noexcept
{
#ifndef SUX_LOCK_GENERIC
ut_ad(latch.is_write_locked());
#endif
ut_ad(!srv_read_only_mode);
ut_ad(!is_pmem());
+ ut_ad(write_lock.is_owner());
+ ut_ad(durable == flush_lock.is_owner());
const lsn_t lsn{get_lsn(std::memory_order_relaxed)};
-
+ DBUG_EXECUTE_IF("crash_after_log_write_upto", crash_after_flush= true;);
if (write_lsn >= lsn)
{
if (release_latch)
latch.wr_unlock();
- ut_ad(write_lsn == lsn);
+ ut_a(write_lsn == lsn);
+ return complete_write_buf(lsn, durable);
}
else
{
ut_ad(!recv_no_log_write);
- write_lock.set_pending(lsn);
- ut_ad(write_lsn >= get_flushed_lsn());
+ ut_a(write_lsn >= get_flushed_lsn());
const size_t block_size_1{get_block_size() - 1};
lsn_t offset{calc_lsn_offset(write_lsn) & ~lsn_t{block_size_1}};
@@ -804,20 +936,34 @@ template<bool release_latch> inline lsn_t log_t::write_buf() noexcept
"InnoDB log write: " LSN_PF, write_lsn);
}
- /* Do the write to the log file */
- log_write_buf(write_buf, length, offset);
if (UNIV_LIKELY_NULL(resize_buf))
resize_write_buf(length);
- write_lsn= lsn;
- }
- return lsn;
+ /* Do the write to the log file */
+ if (sync)
+ {
+ log_write_buf(write_buf, length, offset, nullptr);
+ return complete_write_buf(lsn, durable);
+ }
+
+ /* Async log IO.
+ Note: flush/write lock ownership is going to migrate to a
+ background thread. */
+ ut_d(write_lock.reset_owner());
+ ut_d(if (durable) flush_lock.reset_owner());
+
+ log_aiocb.m_callback= aio_complete_write_buf;
+ log_aiocb.durable= durable;
+ log_aiocb.lsn= lsn;
+ log_write_buf(write_buf, length, offset, &log_aiocb);
+ return 0;
+ }
}
bool log_t::flush(lsn_t lsn) noexcept
{
ut_ad(lsn >= get_flushed_lsn());
- flush_lock.set_pending(lsn);
+ ut_ad(flush_lock.is_owner());
const bool success{srv_file_flush_method == SRV_O_DSYNC || log.flush()};
if (UNIV_LIKELY(success))
{
@@ -827,22 +973,25 @@ bool log_t::flush(lsn_t lsn) noexcept
return success;
}
-/** Ensure that previous log writes are durable.
-@param lsn previously written LSN
-@return new durable lsn target
-@retval 0 if there are no pending callbacks on flush_lock
- or there is another group commit lead.
+/*
+ Decide whether to do synchronous IO.
+ Async might not make sense because of the higher latency or CPU
+ overhead in the threadpool, or because the file is cached and, say,
+ libaio can't do AIO on cached files.
+
+ Async IO apparently always makes sense if the waiter does
+ not care about the result (i.e. a callback with a NULL function).
+
+ NOTE: currently, async IO is mostly unused, because it turns
+ out to be worse in benchmarks. Perhaps there are just too many threads
+ involved in waking and waiting.
*/
-static lsn_t log_flush(lsn_t lsn)
+static bool use_sync_log_write(bool /* durable */,
+ const completion_callback *cb)
{
- ut_ad(!log_sys.is_pmem());
- ut_a(log_sys.flush(lsn));
- DBUG_EXECUTE_IF("crash_after_log_write_upto", DBUG_SUICIDE(););
- return flush_lock.release(lsn);
+ return !cb || cb->m_callback || cb == &sync_io_callback;
}
-static const completion_callback dummy_callback{[](void *) {},nullptr};
-
/** Ensure that the log has been written to the log file up to a given
log entry (such as that of a transaction commit). Start a new write, or
wait and check if an already running write is covering the request.
@@ -859,7 +1008,7 @@ void log_write_up_to(lsn_t lsn, bool durable,
{
/* A non-final batch of recovery is active no writes to the log
are allowed yet. */
- ut_a(!callback);
+ ut_a(!callback || !callback->m_callback);
return;
}
@@ -868,7 +1017,7 @@ void log_write_up_to(lsn_t lsn, bool durable,
#ifdef HAVE_PMEM
if (log_sys.is_pmem())
{
- ut_ad(!callback);
+ ut_ad(!callback || !callback->m_callback);
if (durable)
log_sys.persist(lsn);
return;
@@ -876,42 +1025,49 @@ void log_write_up_to(lsn_t lsn, bool durable,
#endif
repeat:
- if (durable)
- {
- if (flush_lock.acquire(lsn, callback) != group_commit_lock::ACQUIRED)
- return;
- flush_lock.set_pending(log_sys.get_lsn());
- }
+ if (durable &&
+ flush_lock.acquire(lsn, callback) != group_commit_lock::ACQUIRED)
+ return;
- lsn_t pending_write_lsn= 0, pending_flush_lsn= 0;
+ lsn_t pending_lsn= 0;
if (write_lock.acquire(lsn, durable ? nullptr : callback) ==
group_commit_lock::ACQUIRED)
{
+ const bool sync{use_sync_log_write(durable, callback)};
log_sys.latch.wr_lock(SRW_LOCK_CALL);
- pending_write_lsn= write_lock.release(log_sys.write_buf<true>());
+ pending_lsn= log_sys.write_buf<true>(durable, sync);
+ if (!pending_lsn)
+ return;
}
- if (durable)
+ if (durable && !pending_lsn)
{
- pending_flush_lsn= log_flush(write_lock.value());
+ /* We only get here if flush_lock is acquired but write_lock
+ has expired, i.e. the lsn was already written but not flushed yet. */
+ pending_lsn= log_flush();
}
- if (pending_write_lsn || pending_flush_lsn)
+ if (pending_lsn)
{
- /* There is no new group commit lead; some async waiters could stall. */
- callback= &dummy_callback;
- lsn= std::max(pending_write_lsn, pending_flush_lsn);
+ /* There is no new group commit lead; some waiters could stall.
+ If the special sync_io_callback was used, we continue to use it
+ as an optimization to reduce context switches.
+ */
+ if (callback != &sync_io_callback)
+ callback= &async_io_callback;
+ lsn= pending_lsn;
goto repeat;
}
}
/** Write to the log file up to the last log entry.
@param durable whether to wait for a durable write to complete */
-void log_buffer_flush_to_disk(bool durable)
+void log_buffer_flush_to_disk(bool durable, bool wait)
{
ut_ad(!srv_read_only_mode);
- log_write_up_to(log_sys.get_lsn(std::memory_order_acquire), durable);
+ log_write_up_to(log_sys.get_lsn(std::memory_order_acquire), durable,
+ wait ? nullptr : &async_io_callback);
}
/** Prepare to invoke log_write_and_flush(), before acquiring log_sys.latch. */
@@ -926,20 +1082,19 @@ ATTRIBUTE_COLD void log_write_and_flush_prepare()
group_commit_lock::ACQUIRED);
}
-/** Durably write the log up to log_sys.get_lsn(). */
-ATTRIBUTE_COLD void log_write_and_flush()
+/** Durably write the log up to log_sys.get_lsn().
+@return lsn that log_write_up_to() must be invoked with
+@retval 0 if there is no need to invoke log_write_up_to() */
+ATTRIBUTE_COLD __attribute__((warn_unused_result)) lsn_t log_write_and_flush()
{
ut_ad(!srv_read_only_mode);
if (!log_sys.is_pmem())
- {
- const lsn_t lsn{log_sys.write_buf<false>()};
- write_lock.release(lsn);
- log_flush(lsn);
- }
+ return log_sys.write_buf<false>(true, true);
+
#ifdef HAVE_PMEM
- else
- log_sys.persist(log_sys.get_lsn());
+ log_sys.persist(log_sys.get_lsn());
#endif
+ return 0;
}
/********************************************************************