diff options
Diffstat (limited to 'storage/innobase/log/log0log.cc')
-rw-r--r-- | storage/innobase/log/log0log.cc | 269 |
1 files changed, 212 insertions, 57 deletions
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index 83b78ebf385..f1dd89fd5d2 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -74,6 +74,38 @@ log_t log_sys; #define LOG_BUF_FLUSH_MARGIN ((4 * 4096) /* cf. log_t::append_prepare() */ \ + (4U << srv_page_size_shift)) +/** + group commit completion callback used for anything + that can run asynchronously +*/ +static const completion_callback async_io_callback{nullptr, nullptr}; + +/** + group commit completion callback that is forcing synchronous IO +*/ +static const completion_callback sync_io_callback{nullptr, nullptr}; + +#ifndef DBUG_OFF +/** + Crashing after disk flush requested via the dbug_debug flag. + The flush can be executed by a background thread, + where DBUG_EXECUTE_IF() does not work; thus the value + is passed via a global variable. +*/ +static bool crash_after_flush; +#endif + +static void report_aio_error(const char *text, tpool::aiocb *cb); + +/** AIO control block with auxiliary information, for async writing. 
+Protected by write_lock.*/ +struct Log_aiocb : tpool::aiocb +{ + lsn_t lsn; + bool durable; +}; +static Log_aiocb log_aiocb; + void log_t::set_capacity() { #ifndef SUX_LOCK_GENERIC @@ -155,12 +187,29 @@ dberr_t log_file_t::read(os_offset_t offset, span<byte> buf) noexcept return os_file_read(IORequestRead, m_file, buf.data(), offset, buf.size()); } -void log_file_t::write(os_offset_t offset, span<const byte> buf) noexcept +void log_file_t::write(os_offset_t offset, span<const byte> buf, + tpool::aiocb *iocb) noexcept { ut_ad(is_opened()); - if (dberr_t err= os_file_write_func(IORequestWrite, "ib_logfile0", m_file, + if (iocb) + { + ut_ad(buf.size() < UINT_MAX); + iocb->m_fh= m_file; + iocb->m_opcode= tpool::aio_opcode::AIO_PWRITE; + iocb->m_offset= offset; + iocb->m_buffer= (void *) buf.data(); + iocb->m_len= (unsigned) buf.size(); + if (srv_thread_pool->submit_io(iocb)) + { + iocb->m_err= IF_WIN(GetLastError(), errno); + report_aio_error("submitting asynchronous write to ib_logfile0", iocb); + } + } + else if (dberr_t err= os_file_write(IORequestWrite, "ib_logfile0", m_file, buf.data(), offset, buf.size())) + { ib::fatal() << "write(\"ib_logfile0\") returned " << err; + } } #ifdef HAVE_PMEM @@ -505,10 +554,13 @@ void log_t::resize_abort() noexcept /** Write an aligned buffer to ib_logfile0. @param buf buffer to be written @param len length of data to be written -@param offset log file offset */ -static void log_write_buf(const byte *buf, size_t len, lsn_t offset) +@param offset log file offset +@param cb completion callback */ +static void log_write_buf(const byte *buf, size_t len, lsn_t offset, + tpool::aiocb *cb) { - ut_ad(write_lock.is_owner()); + ut_ad(cb ? 
!write_lock.has_owner() : write_lock.is_owner()); + ut_ad(write_lock.locked()); ut_ad(!recv_no_log_write); ut_d(const size_t block_size_1= log_sys.get_block_size() - 1); ut_ad(!(offset & block_size_1)); @@ -519,7 +571,7 @@ static void log_write_buf(const byte *buf, size_t len, lsn_t offset) if (UNIV_LIKELY(offset + len <= log_sys.file_size)) { write: - log_sys.log.write(offset, {buf, len}); + log_sys.log.write(offset, {buf, len}, cb); return; } @@ -730,30 +782,110 @@ ATTRIBUTE_COLD void log_t::resize_write_buf(size_t length) noexcept resize_flush_buf, offset, length) == DB_SUCCESS); } + +static void report_aio_error(const char *text, tpool::aiocb *cb) +{ + ib::fatal() << "IO Error " + << cb->m_err IF_WIN(, << " " << strerror(cb->m_err)) << " " + << text << "," << cb->m_len << " bytes at offset " + << cb->m_offset; +} + +/** Ensure that previous log writes are durable. +@return new durable lsn target +@retval 0 if caller does not need to call log_write_up_to() again + +*/ +static lsn_t log_flush() +{ + ut_ad(!log_sys.is_pmem()); + lsn_t lsn= write_lock.value(); + ut_a(log_sys.flush(lsn)); +#ifndef DBUG_OFF + if (crash_after_flush) + DBUG_SUICIDE(); +#endif + return flush_lock.release(lsn); +} + + +/** Complete write_buf(). 
+@param lsn new value of write_lsn +@param durable whether the write was durable +@return new write target +@retval 0 if there is no need to call log_write_up_to() */ +inline lsn_t log_t::complete_write_buf(lsn_t lsn, bool durable) noexcept +{ + ut_ad(write_lock.is_owner()); + ut_ad(durable == flush_lock.is_owner()); + + ut_a(lsn >= write_lsn); + + write_lsn= lsn; + lsn_t pending_lsn= write_lock.release(lsn); + if (durable) + pending_lsn= std::max(pending_lsn, log_flush()); + return pending_lsn; +} + +static void aio_complete_write_buf(void *p) +{ + ut_ad(write_lock.locked()); + + Log_aiocb *cb= static_cast<Log_aiocb *>(p); + if (cb->m_err) + report_aio_error("in asynchronous write to ib_logfile0", cb); + const bool durable{cb->durable}; +#ifdef UNIV_DEBUG + if (durable) + { + ut_ad(flush_lock.locked()); + flush_lock.set_owner(); + } + write_lock.set_owner(); +#endif + + if (lsn_t ret_lsn= log_sys.complete_write_buf(cb->lsn, durable)) + { + /** prevent stalls. Also, force special synchronous callback + as optimization. We'll avoid threadpool machinery and context + switching (we're already in the background thread here) + */ + log_write_up_to(ret_lsn, durable, &sync_io_callback); + } +} + + /** Write buf to ib_logfile0. 
@tparam release_latch whether to invoke latch.wr_unlock() -@return the current log sequence number */ -template<bool release_latch> inline lsn_t log_t::write_buf() noexcept +@param durable whether to invoke a durable write +@param sync whether to invoke a synchronous write +@return new write target +@retval 0 if there is no need to call log_write_up_to() */ +template<bool release_latch> +inline lsn_t log_t::write_buf(bool durable, bool sync) noexcept { #ifndef SUX_LOCK_GENERIC ut_ad(latch.is_write_locked()); #endif ut_ad(!srv_read_only_mode); ut_ad(!is_pmem()); + ut_ad(write_lock.is_owner()); + ut_ad(durable == flush_lock.is_owner()); const lsn_t lsn{get_lsn(std::memory_order_relaxed)}; - + DBUG_EXECUTE_IF("crash_after_log_write_upto", crash_after_flush= true;); if (write_lsn >= lsn) { if (release_latch) latch.wr_unlock(); - ut_ad(write_lsn == lsn); + ut_a(write_lsn == lsn); + return complete_write_buf(lsn, durable); } else { ut_ad(!recv_no_log_write); - write_lock.set_pending(lsn); - ut_ad(write_lsn >= get_flushed_lsn()); + ut_a(write_lsn >= get_flushed_lsn()); const size_t block_size_1{get_block_size() - 1}; lsn_t offset{calc_lsn_offset(write_lsn) & ~lsn_t{block_size_1}}; @@ -804,20 +936,34 @@ template<bool release_latch> inline lsn_t log_t::write_buf() noexcept "InnoDB log write: " LSN_PF, write_lsn); } - /* Do the write to the log file */ - log_write_buf(write_buf, length, offset); if (UNIV_LIKELY_NULL(resize_buf)) resize_write_buf(length); - write_lsn= lsn; - } - return lsn; + /* Do the write to the log file */ + if (sync) + { + log_write_buf(write_buf, length, offset, nullptr); + return complete_write_buf(lsn, durable); + } + + /* Async log IO + Note : flush/write lock ownership is going to migrate to a + background thread*/ + ut_d(write_lock.reset_owner()); + ut_d(if (durable) flush_lock.reset_owner()); + + log_aiocb.m_callback= aio_complete_write_buf; + log_aiocb.durable= durable; + log_aiocb.lsn= lsn; + log_write_buf(write_buf, length, offset, &log_aiocb); 
+ return 0; + } } bool log_t::flush(lsn_t lsn) noexcept { ut_ad(lsn >= get_flushed_lsn()); - flush_lock.set_pending(lsn); + ut_ad(flush_lock.is_owner()); const bool success{srv_file_flush_method == SRV_O_DSYNC || log.flush()}; if (UNIV_LIKELY(success)) { @@ -827,22 +973,25 @@ bool log_t::flush(lsn_t lsn) noexcept return success; } -/** Ensure that previous log writes are durable. -@param lsn previously written LSN -@return new durable lsn target -@retval 0 if there are no pending callbacks on flush_lock - or there is another group commit lead. +/* + Decide whether to do synchronous IO. + Async might not make sense because of the higher latency or CPU + overhead in threadpool, or because the file is cached, and say libaio + can't do AIO on cached files. + + Async IO apparently always makes sense if the waiter does + not care about result (i.e. callback with a NULL function) + + NOTE: currently, async IO is mostly unused, because it turns + out to be worse in benchmarks. Perhaps it is just too many threads + involved in waking and waiting. */ -static lsn_t log_flush(lsn_t lsn) +static bool use_sync_log_write(bool /* durable */, + const completion_callback *cb) { - ut_ad(!log_sys.is_pmem()); - ut_a(log_sys.flush(lsn)); - DBUG_EXECUTE_IF("crash_after_log_write_upto", DBUG_SUICIDE();); - return flush_lock.release(lsn); + return !cb || cb->m_callback || cb == &sync_io_callback; } -static const completion_callback dummy_callback{[](void *) {},nullptr}; - /** Ensure that the log has been written to the log file up to a given log entry (such as that of a transaction commit). Start a new write, or wait and check if an already running write is covering the request. @@ -859,7 +1008,7 @@ void log_write_up_to(lsn_t lsn, bool durable, { /* A non-final batch of recovery is active; no writes to the log are allowed yet. 
*/ - ut_a(!callback); + ut_a(!callback || !callback->m_callback); return; } @@ -868,7 +1017,7 @@ void log_write_up_to(lsn_t lsn, bool durable, #ifdef HAVE_PMEM if (log_sys.is_pmem()) { - ut_ad(!callback); + ut_ad(!callback || !callback->m_callback); if (durable) log_sys.persist(lsn); return; @@ -876,42 +1025,49 @@ void log_write_up_to(lsn_t lsn, bool durable, #endif repeat: - if (durable) - { - if (flush_lock.acquire(lsn, callback) != group_commit_lock::ACQUIRED) - return; - flush_lock.set_pending(log_sys.get_lsn()); - } + if (durable && + flush_lock.acquire(lsn, callback) != group_commit_lock::ACQUIRED) + return; - lsn_t pending_write_lsn= 0, pending_flush_lsn= 0; + lsn_t pending_lsn= 0; if (write_lock.acquire(lsn, durable ? nullptr : callback) == group_commit_lock::ACQUIRED) { + const bool sync{use_sync_log_write(durable, callback)}; log_sys.latch.wr_lock(SRW_LOCK_CALL); - pending_write_lsn= write_lock.release(log_sys.write_buf<true>()); + pending_lsn= log_sys.write_buf<true>(durable, sync); + if (!pending_lsn) + return; } - if (durable) + if (durable && !pending_lsn) { - pending_flush_lsn= log_flush(write_lock.value()); + /* We only get here if flush_lock is acquired, but write_lock + is expired, i.e. lsn was already written, but not flushed yet. */ + pending_lsn= log_flush(); } - if (pending_write_lsn || pending_flush_lsn) + if (pending_lsn) { - /* There is no new group commit lead; some async waiters could stall. */ - callback= &dummy_callback; - lsn= std::max(pending_write_lsn, pending_flush_lsn); + /* There is no new group commit lead; some waiters could stall. + If the special sync_io_callback was used, we'll continue to use it + as an optimization to reduce context switches. + */ + if (callback != &sync_io_callback) + callback= &async_io_callback; + lsn= pending_lsn; goto repeat; } } /** Write to the log file up to the last log entry. 
@param durable whether to wait for a durable write to complete */ -void log_buffer_flush_to_disk(bool durable) +void log_buffer_flush_to_disk(bool durable, bool wait) { ut_ad(!srv_read_only_mode); - log_write_up_to(log_sys.get_lsn(std::memory_order_acquire), durable); + log_write_up_to(log_sys.get_lsn(std::memory_order_acquire), durable, + wait ? nullptr : &async_io_callback); } /** Prepare to invoke log_write_and_flush(), before acquiring log_sys.latch. */ @@ -926,20 +1082,19 @@ ATTRIBUTE_COLD void log_write_and_flush_prepare() group_commit_lock::ACQUIRED); } -/** Durably write the log up to log_sys.get_lsn(). */ -ATTRIBUTE_COLD void log_write_and_flush() +/** Durably write the log up to log_sys.get_lsn(). +@return lsn that log_write_up_to() must be invoked with +@retval 0 if there is no need to invoke log_write_up_to() */ +ATTRIBUTE_COLD __attribute__((warn_unused_result)) lsn_t log_write_and_flush() { ut_ad(!srv_read_only_mode); if (!log_sys.is_pmem()) - { - const lsn_t lsn{log_sys.write_buf<false>()}; - write_lock.release(lsn); - log_flush(lsn); - } + return log_sys.write_buf<false>(true, true); + #ifdef HAVE_PMEM - else - log_sys.persist(log_sys.get_lsn()); + log_sys.persist(log_sys.get_lsn()); #endif + return 0; } /******************************************************************** |