summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarko Mäkelä <marko.makela@mariadb.com>2021-03-30 09:58:24 +0300
committerMarko Mäkelä <marko.makela@mariadb.com>2021-03-30 10:29:11 +0300
commit8c2e3259c13d1d0a494fb3f9c1e5fb780a193ab1 (patch)
treee8e644ab993da47feec3982913f050a51a0b5066
parent0df74a0197a5c2acf50645516fbf6bf20ad7e27f (diff)
downloadmariadb-git-bb-10.5-MDEV-24302.tar.gz
MDEV-24302 follow-up: RESET MASTER hangsbb-10.5-MDEV-24302
As pointed out by Andrei Elkin, the previous fix did not fix one race condition that may have caused the observed hang. innodb_log_flush_request(): If we are enqueueing the very first request at the same time the log write is being completed, we must ensure that a near-concurrent call to log_flush_notify() will not result in a missed notification. We guarantee this by release-acquire operations on log_requests.start and log_sys.flushed_to_disk_lsn. log_flush_notify_and_unlock(): Cleanup: Always release the mutex. log_sys_t::get_flushed_lsn(): Use acquire memory order. log_sys_t::set_flushed_lsn(): Use release memory order. log_sys_t::set_lsn(): Use release memory order. log_sys_t::get_lsn(): Use relaxed memory order by default, and allow the caller to specify acquire memory order explicitly. Whenever the log_sys.mutex is being held or when log writes are prohibited during startup, we can use a relaxed load. Likewise, in some assertions where reading a stale value of log_sys.lsn should not matter, we can use a relaxed load. This will cause some additional instructions to be emitted on architectures that do not implement Total Store Ordering (TSO), such as POWER, ARM, and RISC-V Weak Memory Ordering (RVWMO).
-rw-r--r--include/my_atomic_wrapper.h6
-rw-r--r--storage/innobase/buf/buf0flu.cc12
-rw-r--r--storage/innobase/handler/ha_innodb.cc80
-rw-r--r--storage/innobase/include/log0log.h17
-rw-r--r--storage/innobase/log/log0log.cc8
-rw-r--r--storage/innobase/srv/srv0srv.cc9
-rw-r--r--storage/innobase/srv/srv0start.cc8
7 files changed, 71 insertions, 69 deletions
diff --git a/include/my_atomic_wrapper.h b/include/my_atomic_wrapper.h
index c574fba4a8e..d57559d360d 100644
--- a/include/my_atomic_wrapper.h
+++ b/include/my_atomic_wrapper.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2020, MariaDB
+/* Copyright (c) 2020, 2021, MariaDB
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -41,7 +41,9 @@ public:
Atomic_relaxed(Type val) : m(val) {}
Atomic_relaxed() {}
- operator Type() const { return m.load(std::memory_order_relaxed); }
+ Type load(std::memory_order o= std::memory_order_relaxed) const
+ { return m.load(o); }
+ operator Type() const { return m.load(); }
Type operator=(const Type val)
{ m.store(val, std::memory_order_relaxed); return val; }
Type operator=(const Atomic_relaxed<Type> &rhs) { return *this= Type{rhs}; }
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
index 9ece21ed3a4..77c6225e455 100644
--- a/storage/innobase/buf/buf0flu.cc
+++ b/storage/innobase/buf/buf0flu.cc
@@ -1559,7 +1559,7 @@ static void log_flush(void *)
fil_flush_file_spaces();
/* Guarantee progress for buf_flush_lists(). */
- log_write_up_to(log_sys.get_lsn(), true);
+ log_buffer_flush_to_disk(true);
log_flush_pending.clear();
}
@@ -1577,15 +1577,17 @@ ulint buf_flush_lists(ulint max_n, lsn_t lsn)
if (n_flush)
return 0;
- if (log_sys.get_lsn() > log_sys.get_flushed_lsn())
+ lsn_t flushed_lsn= log_sys.get_flushed_lsn();
+ if (log_sys.get_lsn() > flushed_lsn)
{
log_flush_task.wait();
- if (log_sys.get_lsn() > log_sys.get_flushed_lsn() &&
+ flushed_lsn= log_sys.get_flushed_lsn();
+ if (log_sys.get_lsn() > flushed_lsn &&
!log_flush_pending.test_and_set())
srv_thread_pool->submit_task(&log_flush_task);
#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
if (UNIV_UNLIKELY(ibuf_debug))
- log_write_up_to(log_sys.get_lsn(), true);
+ log_buffer_flush_to_disk(true);
#endif
}
@@ -1730,7 +1732,7 @@ static bool log_checkpoint()
/** Make a checkpoint. */
ATTRIBUTE_COLD void log_make_checkpoint()
{
- buf_flush_wait_flushed(log_sys.get_lsn());
+ buf_flush_wait_flushed(log_sys.get_lsn(std::memory_order_acquire));
while (!log_checkpoint());
}
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 683dc92edd6..eda09099eaf 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -1182,7 +1182,7 @@ MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) static
struct
{
/** first request */
- Atomic_relaxed<log_flush_request*> start;
+ std::atomic<log_flush_request*> start;
/** last request */
log_flush_request *end;
/** mutex protecting this object */
@@ -4460,13 +4460,11 @@ innobase_rollback_trx(
/** Invoke commit_checkpoint_notify_ha() on completed log flush requests.
@param pending log_requests.start
-@param lsn log_sys.get_flushed_lsn()
-@return whether something was notified (and log_requests.mutex released) */
-static bool log_flush_notify_and_unlock(log_flush_request *pending, lsn_t lsn)
+@param lsn log_sys.get_flushed_lsn() */
+static void log_flush_notify_and_unlock(log_flush_request *pending, lsn_t lsn)
{
mysql_mutex_assert_owner(&log_requests.mutex);
- ut_ad(pending == log_requests.start);
-
+ ut_ad(pending == log_requests.start.load(std::memory_order_relaxed));
log_flush_request *entry= pending, *last= nullptr;
/* Process the first requests that have been completed. Since
the list is not necessarily in ascending order of LSN, we may
@@ -4478,12 +4476,15 @@ static bool log_flush_notify_and_unlock(log_flush_request *pending, lsn_t lsn)
for (; entry && entry->lsn <= lsn; last= entry, entry= entry->next);
if (!last)
- return false;
+ {
+ mysql_mutex_unlock(&log_requests.mutex);
+ return;
+ }
/* Detach the head of the list that corresponds to persisted log writes. */
- log_requests.start= entry;
if (!entry)
- log_requests.end= nullptr;
+ log_requests.end= entry;
+ log_requests.start.store(entry, std::memory_order_relaxed);
mysql_mutex_unlock(&log_requests.mutex);
/* Now that we have released the mutex, notify the submitters
@@ -4496,21 +4497,16 @@ static bool log_flush_notify_and_unlock(log_flush_request *pending, lsn_t lsn)
my_free(entry);
}
while (entry != last);
-
- return true;
}
/** Invoke commit_checkpoint_notify_ha() to notify that outstanding
log writes have been completed. */
void log_flush_notify(lsn_t flush_lsn)
{
- if (log_requests.start)
+ if (auto pending= log_requests.start.load(std::memory_order_acquire))
{
mysql_mutex_lock(&log_requests.mutex);
- if (log_flush_request *pending= log_requests.start)
- if (log_flush_notify_and_unlock(pending, flush_lsn))
- return;
- mysql_mutex_unlock(&log_requests.mutex);
+ log_flush_notify_and_unlock(pending, flush_lsn);
}
}
@@ -4520,8 +4516,9 @@ checkpoint complete when we have flushed the redo log.
If we have already flushed all relevant redo log, we notify immediately.*/
static void innodb_log_flush_request(void *cookie)
{
- const lsn_t lsn= log_sys.get_lsn();
lsn_t flush_lsn= log_sys.get_flushed_lsn();
+ /* Load lsn relaxed after flush_lsn was loaded from the same cache line */
+ const lsn_t lsn= log_sys.get_lsn();
if (flush_lsn >= lsn)
/* All log is already persistent. */;
@@ -4539,24 +4536,38 @@ static void innodb_log_flush_request(void *cookie)
req->cookie= cookie;
req->lsn= lsn;
+ log_flush_request *start= nullptr;
+
mysql_mutex_lock(&log_requests.mutex);
- auto old_end= log_requests.end;
- log_requests.end= req;
- if (old_end)
+ /* In order to prevent a race condition where log_flush_notify()
+ would skip a notification due to, we must update log_requests.start from
+ nullptr (empty) to the first req using std::memory_order_release. */
+ if (log_requests.start.compare_exchange_strong(start, req,
+ std::memory_order_release,
+ std::memory_order_relaxed))
+ {
+ ut_ad(!log_requests.end);
+ start= req;
+ /* In case log_flush_notify() executed
+ log_requests.start.load(std::memory_order_acquire) right before
+ our successful compare_exchange, we must re-read flush_lsn to
+ ensure that our request will be notified immediately if applicable. */
+ flush_lsn= log_sys.get_flushed_lsn();
+ }
+ else
{
/* Append the entry to the list. Because we determined req->lsn before
acquiring the mutex, this list may not be ordered by req->lsn,
even though log_flush_notify_and_unlock() assumes so. */
- old_end->next= req;
- /* This hopefully addresses the hang that was reported in MDEV-24302.
- Upon receiving a new request, we will notify old requests of
- completion. */
- if (log_flush_notify_and_unlock(log_requests.start, flush_lsn))
- return;
+ log_requests.end->next= req;
}
- else
- log_requests.start= req;
- mysql_mutex_unlock(&log_requests.mutex);
+
+ log_requests.end= req;
+
+ /* This hopefully addresses the hang that was reported in MDEV-24302.
+ Upon receiving a new request, we will notify old requests of
+ completion. */
+ log_flush_notify_and_unlock(start, flush_lsn);
return;
}
else
@@ -18345,16 +18356,17 @@ checkpoint_now_set(THD*, st_mysql_sys_var*, void*, const void* save)
if (*(my_bool*) save) {
mysql_mutex_unlock(&LOCK_global_system_variables);
- while (log_sys.last_checkpoint_lsn
+ lsn_t lsn;
+
+ while (log_sys.last_checkpoint_lsn.load(
+ std::memory_order_acquire)
+ SIZE_OF_FILE_CHECKPOINT
- < log_sys.get_lsn()) {
+ < (lsn= log_sys.get_lsn(std::memory_order_acquire))) {
log_make_checkpoint();
log_sys.log.flush();
}
- dberr_t err = fil_write_flushed_lsn(log_sys.get_lsn());
-
- if (err != DB_SUCCESS) {
+ if (dberr_t err = fil_write_flushed_lsn(lsn)) {
ib::warn() << "Checkpoint set failed " << err;
}
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
index 1a79738bc9b..849485eae5a 100644
--- a/storage/innobase/include/log0log.h
+++ b/storage/innobase/include/log0log.h
@@ -2,7 +2,7 @@
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved.
Copyright (c) 2009, Google Inc.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -87,12 +87,6 @@ log_free_check(void);
@param[in] len requested minimum size in bytes */
void log_buffer_extend(ulong len);
-/** Read the current LSN. */
-#define log_get_lsn() log_sys.get_lsn()
-
-/** Read the durable LSN */
-#define log_get_flush_lsn() log_sys.get_flushed_lsn()
-
/** Calculate the recommended highest values for lsn - last_checkpoint_lsn
and lsn - buf_pool.get_oldest_modification().
@param[in] file_size requested innodb_log_file_size
@@ -646,13 +640,14 @@ public:
bool is_initialised() const { return m_initialised; }
- lsn_t get_lsn() const { return lsn.load(std::memory_order_relaxed); }
- void set_lsn(lsn_t lsn) { this->lsn.store(lsn, std::memory_order_relaxed); }
+ lsn_t get_lsn(std::memory_order order= std::memory_order_relaxed) const
+ { return lsn.load(order); }
+ void set_lsn(lsn_t lsn) { this->lsn.store(lsn, std::memory_order_release); }
lsn_t get_flushed_lsn() const
- { return flushed_to_disk_lsn.load(std::memory_order_relaxed); }
+ { return flushed_to_disk_lsn.load(std::memory_order_acquire); }
void set_flushed_lsn(lsn_t lsn)
- { flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); }
+ { flushed_to_disk_lsn.store(lsn, std::memory_order_release); }
bool check_flush_or_checkpoint() const
{
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
index 71b0c37877e..83036d81658 100644
--- a/storage/innobase/log/log0log.cc
+++ b/storage/innobase/log/log0log.cc
@@ -835,12 +835,10 @@ void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key)
/** write to the log file up to the last log entry.
@param[in] sync whether we want the written log
also to be flushed to disk. */
-void
-log_buffer_flush_to_disk(
- bool sync)
+void log_buffer_flush_to_disk(bool sync)
{
- ut_ad(!srv_read_only_mode);
- log_write_up_to(log_get_lsn(), sync);
+ ut_ad(!srv_read_only_mode);
+ log_write_up_to(log_sys.get_lsn(std::memory_order_acquire), sync);
}
/********************************************************************
diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc
index 6b7b56cb24e..4898b9e4e44 100644
--- a/storage/innobase/srv/srv0srv.cc
+++ b/storage/innobase/srv/srv0srv.cc
@@ -1340,14 +1340,7 @@ void srv_monitor_task(void*)
where the lsn seems to decrease at times */
lsn_t new_lsn = log_sys.get_lsn();
- if (new_lsn < old_lsn) {
- ib::error() << "Old log sequence number " << old_lsn << " was"
- << " greater than the new log sequence number "
- << new_lsn << ". Please submit a bug report to"
- " https://jira.mariadb.org/";
- ut_ad(0);
- }
-
+ ut_a(new_lsn >= old_lsn);
old_lsn = new_lsn;
/* Update the statistics collected for deciding LRU
diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc
index 9ea554ec622..3b6a63a3251 100644
--- a/storage/innobase/srv/srv0start.cc
+++ b/storage/innobase/srv/srv0start.cc
@@ -956,7 +956,7 @@ static lsn_t srv_prepare_to_delete_redo_log_file(bool old_exists)
log_sys.log.flush();
}
- ut_ad(flushed_lsn == log_get_lsn());
+ ut_ad(flushed_lsn == log_sys.get_lsn());
/* Check if the buffer pools are clean. If not
retry till it is clean. */
@@ -1346,7 +1346,7 @@ dberr_t srv_start(bool create_new_db)
/* Suppress the message about
crash recovery. */
- flushed_lsn = log_get_lsn();
+ flushed_lsn = log_sys.get_lsn();
goto file_checked;
}
@@ -1424,7 +1424,7 @@ file_checked:
buf_flush_sync();
- flushed_lsn = log_get_lsn();
+ flushed_lsn = log_sys.get_lsn();
err = fil_write_flushed_lsn(flushed_lsn);
@@ -1600,7 +1600,7 @@ file_checked:
InnoDB files is needed. */
ut_ad(!srv_force_recovery);
ut_ad(recv_no_log_write);
- err = fil_write_flushed_lsn(log_get_lsn());
+ err = fil_write_flushed_lsn(log_sys.get_lsn());
DBUG_ASSERT(!buf_pool.any_io_pending());
log_sys.log.close_file();
if (err == DB_SUCCESS) {