diff options
author | Marko Mäkelä <marko.makela@mariadb.com> | 2021-03-30 09:58:24 +0300 |
---|---|---|
committer | Marko Mäkelä <marko.makela@mariadb.com> | 2021-03-30 10:29:11 +0300 |
commit | 8c2e3259c13d1d0a494fb3f9c1e5fb780a193ab1 (patch) | |
tree | e8e644ab993da47feec3982913f050a51a0b5066 | |
parent | 0df74a0197a5c2acf50645516fbf6bf20ad7e27f (diff) | |
download | mariadb-git-bb-10.5-MDEV-24302.tar.gz |
MDEV-24302 follow-up: RESET MASTER hangsbb-10.5-MDEV-24302
As pointed out by Andrei Elkin, the previous fix did not fix one
race condition that may have caused the observed hang.
innodb_log_flush_request(): If we are enqueueing the very first
request at the same time the log write is being completed,
we must ensure that a near-concurrent call to log_flush_notify()
will not result in a missed notification. We guarantee this by
release-acquire operations on log_requests.start and
log_sys.flushed_to_disk_lsn.
log_flush_notify_and_unlock(): Cleanup: Always release the mutex.
log_sys_t::get_flushed_lsn(): Use acquire memory order.
log_sys_t::set_flushed_lsn(): Use release memory order.
log_sys_t::set_lsn(): Use release memory order.
log_sys_t::get_lsn(): Use relaxed memory order by default, and
allow the caller to specify acquire memory order explicitly.
Whenever the log_sys.mutex is being held or when log writes are
prohibited during startup, we can use a relaxed load. Likewise,
in some assertions where reading a stale value of log_sys.lsn
should not matter, we can use a relaxed load.
This will cause some additional instructions to be emitted on
architectures that do not implement Total Store Ordering (TSO),
such as POWER, ARM, and RISC-V Weak Memory Ordering (RVWMO).
-rw-r--r-- | include/my_atomic_wrapper.h | 6 | ||||
-rw-r--r-- | storage/innobase/buf/buf0flu.cc | 12 | ||||
-rw-r--r-- | storage/innobase/handler/ha_innodb.cc | 80 | ||||
-rw-r--r-- | storage/innobase/include/log0log.h | 17 | ||||
-rw-r--r-- | storage/innobase/log/log0log.cc | 8 | ||||
-rw-r--r-- | storage/innobase/srv/srv0srv.cc | 9 | ||||
-rw-r--r-- | storage/innobase/srv/srv0start.cc | 8 |
7 files changed, 71 insertions, 69 deletions
diff --git a/include/my_atomic_wrapper.h b/include/my_atomic_wrapper.h index c574fba4a8e..d57559d360d 100644 --- a/include/my_atomic_wrapper.h +++ b/include/my_atomic_wrapper.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2020, MariaDB +/* Copyright (c) 2020, 2021, MariaDB This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -41,7 +41,9 @@ public: Atomic_relaxed(Type val) : m(val) {} Atomic_relaxed() {} - operator Type() const { return m.load(std::memory_order_relaxed); } + Type load(std::memory_order o= std::memory_order_relaxed) const + { return m.load(o); } + operator Type() const { return m.load(); } Type operator=(const Type val) { m.store(val, std::memory_order_relaxed); return val; } Type operator=(const Atomic_relaxed<Type> &rhs) { return *this= Type{rhs}; } diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 9ece21ed3a4..77c6225e455 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -1559,7 +1559,7 @@ static void log_flush(void *) fil_flush_file_spaces(); /* Guarantee progress for buf_flush_lists(). */ - log_write_up_to(log_sys.get_lsn(), true); + log_buffer_flush_to_disk(true); log_flush_pending.clear(); } @@ -1577,15 +1577,17 @@ ulint buf_flush_lists(ulint max_n, lsn_t lsn) if (n_flush) return 0; - if (log_sys.get_lsn() > log_sys.get_flushed_lsn()) + lsn_t flushed_lsn= log_sys.get_flushed_lsn(); + if (log_sys.get_lsn() > flushed_lsn) { log_flush_task.wait(); - if (log_sys.get_lsn() > log_sys.get_flushed_lsn() && + flushed_lsn= log_sys.get_flushed_lsn(); + if (log_sys.get_lsn() > flushed_lsn && !log_flush_pending.test_and_set()) srv_thread_pool->submit_task(&log_flush_task); #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG if (UNIV_UNLIKELY(ibuf_debug)) - log_write_up_to(log_sys.get_lsn(), true); + log_buffer_flush_to_disk(true); #endif } @@ -1730,7 +1732,7 @@ static bool log_checkpoint() /** Make a checkpoint. */ ATTRIBUTE_COLD void log_make_checkpoint() { - buf_flush_wait_flushed(log_sys.get_lsn()); + buf_flush_wait_flushed(log_sys.get_lsn(std::memory_order_acquire)); while (!log_checkpoint()); } diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 683dc92edd6..eda09099eaf 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -1182,7 +1182,7 @@ MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) static struct { /** first request */ - Atomic_relaxed<log_flush_request*> start; + std::atomic<log_flush_request*> start; /** last request */ log_flush_request *end; /** mutex protecting this object */ @@ -4460,13 +4460,11 @@ innobase_rollback_trx( /** Invoke commit_checkpoint_notify_ha() on completed log flush requests. @param pending log_requests.start -@param lsn log_sys.get_flushed_lsn() -@return whether something was notified (and log_requests.mutex released) */ -static bool log_flush_notify_and_unlock(log_flush_request *pending, lsn_t lsn) +@param lsn log_sys.get_flushed_lsn() */ +static void log_flush_notify_and_unlock(log_flush_request *pending, lsn_t lsn) { mysql_mutex_assert_owner(&log_requests.mutex); - ut_ad(pending == log_requests.start); - + ut_ad(pending == log_requests.start.load(std::memory_order_relaxed)); log_flush_request *entry= pending, *last= nullptr; /* Process the first requests that have been completed. Since the list is not necessarily in ascending order of LSN, we may @@ -4478,12 +4476,15 @@ static bool log_flush_notify_and_unlock(log_flush_request *pending, lsn_t lsn) for (; entry && entry->lsn <= lsn; last= entry, entry= entry->next); if (!last) - return false; + { + mysql_mutex_unlock(&log_requests.mutex); + return; + } /* Detach the head of the list that corresponds to persisted log writes. */ - log_requests.start= entry; if (!entry) - log_requests.end= nullptr; + log_requests.end= entry; + log_requests.start.store(entry, std::memory_order_relaxed); mysql_mutex_unlock(&log_requests.mutex); /* Now that we have released the mutex, notify the submitters @@ -4496,21 +4497,16 @@ static bool log_flush_notify_and_unlock(log_flush_request *pending, lsn_t lsn) my_free(entry); } while (entry != last); - - return true; } /** Invoke commit_checkpoint_notify_ha() to notify that outstanding log writes have been completed. */ void log_flush_notify(lsn_t flush_lsn) { - if (log_requests.start) + if (auto pending= log_requests.start.load(std::memory_order_acquire)) { mysql_mutex_lock(&log_requests.mutex); - if (log_flush_request *pending= log_requests.start) - if (log_flush_notify_and_unlock(pending, flush_lsn)) - return; - mysql_mutex_unlock(&log_requests.mutex); + log_flush_notify_and_unlock(pending, flush_lsn); } } @@ -4520,8 +4516,9 @@ checkpoint complete when we have flushed the redo log. If we have already flushed all relevant redo log, we notify immediately.*/ static void innodb_log_flush_request(void *cookie) { - const lsn_t lsn= log_sys.get_lsn(); lsn_t flush_lsn= log_sys.get_flushed_lsn(); + /* Load lsn relaxed after flush_lsn was loaded from the same cache line */ + const lsn_t lsn= log_sys.get_lsn(); if (flush_lsn >= lsn) /* All log is already persistent. */; @@ -4539,24 +4536,38 @@ static void innodb_log_flush_request(void *cookie) req->cookie= cookie; req->lsn= lsn; + log_flush_request *start= nullptr; + mysql_mutex_lock(&log_requests.mutex); - auto old_end= log_requests.end; - log_requests.end= req; - if (old_end) + /* In order to prevent a race condition where log_flush_notify() + would skip a notification due to, we must update log_requests.start from + nullptr (empty) to the first req using std::memory_order_release. */ + if (log_requests.start.compare_exchange_strong(start, req, + std::memory_order_release, + std::memory_order_relaxed)) + { + ut_ad(!log_requests.end); + start= req; + /* In case log_flush_notify() executed + log_requests.start.load(std::memory_order_acquire) right before + our successful compare_exchange, we must re-read flush_lsn to + ensure that our request will be notified immediately if applicable. */ + flush_lsn= log_sys.get_flushed_lsn(); + } + else { /* Append the entry to the list. Because we determined req->lsn before acquiring the mutex, this list may not be ordered by req->lsn, even though log_flush_notify_and_unlock() assumes so. */ - old_end->next= req; - /* This hopefully addresses the hang that was reported in MDEV-24302. - Upon receiving a new request, we will notify old requests of - completion. */ - if (log_flush_notify_and_unlock(log_requests.start, flush_lsn)) - return; + log_requests.end->next= req; } - else - log_requests.start= req; - mysql_mutex_unlock(&log_requests.mutex); + + log_requests.end= req; + + /* This hopefully addresses the hang that was reported in MDEV-24302. + Upon receiving a new request, we will notify old requests of + completion. */ + log_flush_notify_and_unlock(start, flush_lsn); return; } else @@ -18345,16 +18356,17 @@ checkpoint_now_set(THD*, st_mysql_sys_var*, void*, const void* save) if (*(my_bool*) save) { mysql_mutex_unlock(&LOCK_global_system_variables); - while (log_sys.last_checkpoint_lsn + lsn_t lsn; + + while (log_sys.last_checkpoint_lsn.load( + std::memory_order_acquire) + SIZE_OF_FILE_CHECKPOINT - < log_sys.get_lsn()) { + < (lsn= log_sys.get_lsn(std::memory_order_acquire))) { log_make_checkpoint(); log_sys.log.flush(); } - dberr_t err = fil_write_flushed_lsn(log_sys.get_lsn()); - - if (err != DB_SUCCESS) { + if (dberr_t err = fil_write_flushed_lsn(lsn)) { ib::warn() << "Checkpoint set failed " << err; } diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index 1a79738bc9b..849485eae5a 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -2,7 +2,7 @@ Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2009, Google Inc. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -87,12 +87,6 @@ log_free_check(void); @param[in] len requested minimum size in bytes */ void log_buffer_extend(ulong len); -/** Read the current LSN. */ -#define log_get_lsn() log_sys.get_lsn() - -/** Read the durable LSN */ -#define log_get_flush_lsn() log_sys.get_flushed_lsn() - /** Calculate the recommended highest values for lsn - last_checkpoint_lsn and lsn - buf_pool.get_oldest_modification(). @param[in] file_size requested innodb_log_file_size @@ -646,13 +640,14 @@ public: bool is_initialised() const { return m_initialised; } - lsn_t get_lsn() const { return lsn.load(std::memory_order_relaxed); } - void set_lsn(lsn_t lsn) { this->lsn.store(lsn, std::memory_order_relaxed); } + lsn_t get_lsn(std::memory_order order= std::memory_order_relaxed) const + { return lsn.load(order); } + void set_lsn(lsn_t lsn) { this->lsn.store(lsn, std::memory_order_release); } lsn_t get_flushed_lsn() const - { return flushed_to_disk_lsn.load(std::memory_order_relaxed); } + { return flushed_to_disk_lsn.load(std::memory_order_acquire); } void set_flushed_lsn(lsn_t lsn) - { flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); } + { flushed_to_disk_lsn.store(lsn, std::memory_order_release); } bool check_flush_or_checkpoint() const { diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index 71b0c37877e..83036d81658 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -835,12 +835,10 @@ void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key) /** write to the log file up to the last log entry. @param[in] sync whether we want the written log also to be flushed to disk. */ -void -log_buffer_flush_to_disk( - bool sync) +void log_buffer_flush_to_disk(bool sync) { - ut_ad(!srv_read_only_mode); - log_write_up_to(log_get_lsn(), sync); + ut_ad(!srv_read_only_mode); + log_write_up_to(log_sys.get_lsn(std::memory_order_acquire), sync); } /******************************************************************** diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index 6b7b56cb24e..4898b9e4e44 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -1340,14 +1340,7 @@ void srv_monitor_task(void*) where the lsn seems to decrease at times */ lsn_t new_lsn = log_sys.get_lsn(); - if (new_lsn < old_lsn) { - ib::error() << "Old log sequence number " << old_lsn << " was" - << " greater than the new log sequence number " - << new_lsn << ". Please submit a bug report to" - " https://jira.mariadb.org/"; - ut_ad(0); - } - + ut_a(new_lsn >= old_lsn); old_lsn = new_lsn; /* Update the statistics collected for deciding LRU diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 9ea554ec622..3b6a63a3251 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -956,7 +956,7 @@ static lsn_t srv_prepare_to_delete_redo_log_file(bool old_exists) log_sys.log.flush(); } - ut_ad(flushed_lsn == log_get_lsn()); + ut_ad(flushed_lsn == log_sys.get_lsn()); /* Check if the buffer pools are clean. If not retry till it is clean. */ @@ -1346,7 +1346,7 @@ dberr_t srv_start(bool create_new_db) /* Suppress the message about crash recovery. */ - flushed_lsn = log_get_lsn(); + flushed_lsn = log_sys.get_lsn(); goto file_checked; } @@ -1424,7 +1424,7 @@ file_checked: buf_flush_sync(); - flushed_lsn = log_get_lsn(); + flushed_lsn = log_sys.get_lsn(); err = fil_write_flushed_lsn(flushed_lsn); @@ -1600,7 +1600,7 @@ file_checked: InnoDB files is needed. */ ut_ad(!srv_force_recovery); ut_ad(recv_no_log_write); - err = fil_write_flushed_lsn(log_get_lsn()); + err = fil_write_flushed_lsn(log_sys.get_lsn()); DBUG_ASSERT(!buf_pool.any_io_pending()); log_sys.log.close_file(); if (err == DB_SUCCESS) { |