author     Vlad Lesin <vlad_lesin@mail.ru>  2023-02-09 13:51:57 +0300
committer  Vlad Lesin <vlad_lesin@mail.ru>  2023-02-20 20:31:24 +0300
commit     a474e3278c202d5b429134071cedada3d525af95
tree       90c14db77fd9fe636db370fc16514eeb6b949072 /storage
parent     67a6ad0a4a36bd59fabbdd6b1cdd38de54e82c79
MDEV-27701 Race on trx->lock.wait_lock between lock_rec_move() and lock_sys_t::cancel()
The initial issue was an assertion failure in lock_sys_t::cancel(), which checked that the lock being cancelled is equal to trx->lock.wait_lock. Analysing lock_sys_t::cancel() from the perspective of races on trx->lock.wait_lock shows no error in the code itself, except that the pointer must be reloaded after the corresponding latches have been acquired. So the fix is simply to remove the assertion and to reload trx->lock.wait_lock after acquiring the necessary latches.

Reviewed by: Marko Mäkelä <marko.makela@mariadb.com>
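The following is a minimal, self-contained C++ sketch of the race and of the reload-after-relatching pattern the patch applies; it is not the actual MariaDB code. The Trx/Lock structs, the move_waiting_lock()/cancel_waiting_lock() names and the plain std::mutex/std::shared_mutex objects are hypothetical stand-ins for trx_t, lock_t, lock_rec_move(), lock_sys_t::cancel(), lock_sys.wait_mutex and lock_sys.latch.

#include <atomic>
#include <mutex>
#include <shared_mutex>

struct Lock { bool waiting= true; };                    // stand-in for lock_t
struct Trx  { std::atomic<Lock*> wait_lock{nullptr}; }; // stand-in for trx_t

std::mutex        wait_mutex;      // plays the role of lock_sys.wait_mutex
std::shared_mutex lock_sys_latch;  // plays the role of lock_sys.latch

/* A lock-moving thread (compare lock_rec_move()) holds only the exclusive
   lock_sys latch and may repoint trx.wait_lock to a new Lock object, so a
   pointer cached by a thread that holds only wait_mutex can go stale. */
void move_waiting_lock(Trx &trx, Lock *new_lock)
{
  std::unique_lock<std::shared_mutex> latch(lock_sys_latch);
  trx.wait_lock.store(new_lock);
}

/* The cancelling thread enters with wait_mutex held and a cached pointer,
   as lock_sys_t::cancel() does. Asserting trx.wait_lock == lock here would
   be racy; instead the pointer is reloaded once the latches are reacquired. */
void cancel_waiting_lock(Trx &trx, Lock *lock)
{
  wait_mutex.unlock();            // drop the weaker latch first ...
  lock_sys_latch.lock();          // ... acquire the exclusive latch ...
  wait_mutex.lock();              // ... and retake the wait mutex
  lock= trx.wait_lock.load();     // reload: the cached pointer may be stale
  if (lock && lock->waiting)
    lock->waiting= false;         // cancel the (possibly moved) waiting lock
  lock_sys_latch.unlock();
}

In the real code the same reload happens wherever lock_sys.wait_mutex is temporarily released in order to take table->lock_mutex or lock_sys.latch, as the comments added in lock0lock.cc below spell out.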
Diffstat (limited to 'storage')
-rw-r--r--  storage/innobase/include/lock0lock.h |  2
-rw-r--r--  storage/innobase/lock/lock0lock.cc   | 48
2 files changed, 46 insertions(+), 4 deletions(-)
diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h
index b67a1011f6b..16acd031177 100644
--- a/storage/innobase/include/lock0lock.h
+++ b/storage/innobase/include/lock0lock.h
@@ -891,8 +891,8 @@ public:
/** Cancel a waiting lock request.
@tparam check_victim whether to check for DB_DEADLOCK
- @param lock waiting lock request
@param trx active transaction
+ @param lock waiting lock request
@retval DB_SUCCESS if no lock existed
@retval DB_DEADLOCK if trx->lock.was_chosen_as_deadlock_victim was set
@retval DB_LOCK_WAIT if the lock was canceled */
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index 9577dfc62aa..2b30b9b1a03 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -46,12 +46,12 @@ Created 5/7/1996 Heikki Tuuri
#include "srv0mon.h"
#include "que0que.h"
#include "scope.h"
+#include <debug_sync.h>
#include <set>
#ifdef WITH_WSREP
#include <mysql/service_wsrep.h>
-#include <debug_sync.h>
#endif /* WITH_WSREP */
/** The value of innodb_deadlock_detect */
@@ -1882,6 +1882,7 @@ check_trx_error:
if (row_lock_wait)
lock_sys.wait_resume(trx->mysql_thd, suspend_time, my_hrtime_coarse());
+ /* Cache trx->lock.wait_lock to avoid unnecessary atomic variable load */
if (lock_t *lock= trx->lock.wait_lock)
{
lock_sys_t::cancel<false>(trx, lock);
@@ -1905,6 +1906,12 @@ void lock_wait_end(trx_t *trx)
ut_d(const auto state= trx->state);
ut_ad(state == TRX_STATE_COMMITTED_IN_MEMORY || state == TRX_STATE_ACTIVE ||
state == TRX_STATE_PREPARED);
+ /* lock_wait() checks the trx->lock.was_chosen_as_deadlock_victim flag before
+ requesting lock_sys.wait_mutex, and if the flag is set, it returns an error,
+ which causes a transaction rollback that can reset trx->lock.wait_thr before
+ deadlock resolution starts cancelling the victim's waiting lock. That's why
+ we don't check trx->lock.wait_thr here if this function was called from the
+ deadlock resolution function. */
ut_ad(from_deadlock || trx->lock.wait_thr);
if (trx->lock.was_chosen_as_deadlock_victim)
@@ -3193,6 +3200,8 @@ lock_rec_store_on_page_infimum(
ut_ad(block->page.frame == page_align(rec));
const page_id_t id{block->page.id()};
+ ut_d(SCOPE_EXIT(
+ []() { DEBUG_SYNC_C("lock_rec_store_on_page_infimum_end"); }));
LockGuard g{lock_sys.rec_hash, id};
lock_rec_move(g.cell(), *block, id, g.cell(), id,
@@ -5770,17 +5779,30 @@ void lock_sys_t::cancel_lock_wait_for_trx(trx_t *trx)
/** Cancel a waiting lock request.
@tparam check_victim whether to check for DB_DEADLOCK
-@param lock waiting lock request
@param trx active transaction
+@param lock waiting lock request
@retval DB_SUCCESS if no lock existed
@retval DB_DEADLOCK if trx->lock.was_chosen_as_deadlock_victim was set
@retval DB_LOCK_WAIT if the lock was canceled */
template<bool check_victim>
dberr_t lock_sys_t::cancel(trx_t *trx, lock_t *lock)
{
+ DEBUG_SYNC_C("lock_sys_t_cancel_enter");
mysql_mutex_assert_owner(&lock_sys.wait_mutex);
- ut_ad(trx->lock.wait_lock == lock);
ut_ad(trx->state == TRX_STATE_ACTIVE);
+ /* trx->lock.wait_lock may be changed by other threads as long as
+ we are not holding lock_sys.latch.
+
+ So, trx->lock.wait_lock==lock does not necessarily hold, but both
+ pointers should be valid, because other threads cannot assign
+ trx->lock.wait_lock=nullptr (or invalidate *lock) while we are
+ holding lock_sys.wait_mutex. Also, the type of trx->lock.wait_lock
+ (record or table lock) cannot be changed by other threads. So, it is
+ safe to call lock->is_table() while not holding lock_sys.latch. If
+ we have to release and reacquire lock_sys.wait_mutex, we must reread
+ trx->lock.wait_lock. We must also reread trx->lock.wait_lock after
+ acquiring lock_sys.latch, as lock-moving functions can change it to
+ another non-null value even if we hold lock_sys.wait_mutex. */
dberr_t err= DB_SUCCESS;
/* This would be too large for a memory transaction, except in the
DB_DEADLOCK case, which was already tested in lock_trx_handle_wait(). */
@@ -5802,6 +5824,15 @@ dberr_t lock_sys_t::cancel(trx_t *trx, lock_t *lock)
}
else
{
+ /* This function is invoked from the thread which executes the
+ transaction. Table locks are requested before record locks. No other
+ transaction can change trx->lock.wait_lock from a table lock to a record
+ lock for the current transaction at this point, because the current
+ transaction has not requested any record locks yet, and other threads
+ have no reason to move its table locks. And trx->lock.wait_lock can't be
+ set to null while we are holding lock_sys.wait_mutex. That's why there is
+ no need to reload trx->lock.wait_lock here. */
+ ut_ad(lock == trx->lock.wait_lock);
resolve_table_lock:
dict_table_t *table= lock->un_member.tab_lock.table;
if (!table->lock_mutex_trylock())
@@ -5812,6 +5843,7 @@ resolve_table_lock:
mysql_mutex_unlock(&lock_sys.wait_mutex);
table->lock_mutex_lock();
mysql_mutex_lock(&lock_sys.wait_mutex);
+ /* Cache trx->lock.wait_lock under the corresponding latches. */
lock= trx->lock.wait_lock;
if (!lock)
goto retreat;
@@ -5821,6 +5853,10 @@ resolve_table_lock:
goto retreat;
}
}
+ else
+ /* Cache trx->lock.wait_lock under the corresponding latches if
+ it was not cached yet */
+ lock= trx->lock.wait_lock;
if (lock->is_waiting())
lock_cancel_waiting_and_release(lock);
/* Even if lock->is_waiting() did not hold above, we must return
@@ -5844,6 +5880,7 @@ retreat:
mysql_mutex_unlock(&lock_sys.wait_mutex);
lock_sys.wr_lock(SRW_LOCK_CALL);
mysql_mutex_lock(&lock_sys.wait_mutex);
+ /* Cache trx->lock.wait_lock under the corresponding latches. */
lock= trx->lock.wait_lock;
/* Even if waiting lock was cancelled while lock_sys.wait_mutex was
unlocked, we need to return deadlock error if transaction was chosen
@@ -5855,6 +5892,9 @@ retreat:
}
else
{
+ /* Cache trx->lock.wait_lock under the corresponding latches if
+ it was not cached yet */
+ lock= trx->lock.wait_lock;
resolve_record_lock:
if (lock->is_waiting())
lock_cancel_waiting_and_release(lock);
@@ -5876,6 +5916,7 @@ resolve_record_lock:
void lock_sys_t::cancel(trx_t *trx)
{
mysql_mutex_lock(&lock_sys.wait_mutex);
+ /* Cache trx->lock.wait_lock to avoid unnecessary atomic variable load */
if (lock_t *lock= trx->lock.wait_lock)
{
/* Dictionary transactions must be immune to KILL, because they
@@ -5943,6 +5984,7 @@ dberr_t lock_trx_handle_wait(trx_t *trx)
mysql_mutex_lock(&lock_sys.wait_mutex);
if (trx->lock.was_chosen_as_deadlock_victim)
err= DB_DEADLOCK;
+ /* Cache trx->lock.wait_lock to avoid unnecessary atomic variable load */
else if (lock_t *wait_lock= trx->lock.wait_lock)
err= lock_sys_t::cancel<true>(trx, wait_lock);
lock_sys.deadlock_check();