MDEV-27701 Race on trx->lock.wait_lock between lock_rec_move() and lock_sys_t::cancel()

The initial issue was in assertion failure, which checked the equality of lock to cancel with trx->lock.wait_lock in lock_sys_t::cancel(). If we analyze lock_sys_t::cancel() code from the perspective of trx->lock.wait_lock racing, we won't find the error there, except the cases when we need to reload it after the corresponding latches acquiring. So the fix is just to remove the assertion and reload trx->lock.wait_lock after acquiring necessary latches. Reviewed by: Marko Mäkelä <marko.makela@mariadb.com>
author: Vlad Lesin <vlad_lesin@mail.ru> 2023-02-09 13:51:57 +0300
committer: Vlad Lesin <vlad_lesin@mail.ru> 2023-02-20 20:31:24 +0300
commit: a474e3278c202d5b429134071cedada3d525af95 (patch)
tree: 90c14db77fd9fe636db370fc16514eeb6b949072
parent: 67a6ad0a4a36bd59fabbdd6b1cdd38de54e82c79 (diff)
download: mariadb-git-a474e3278c202d5b429134071cedada3d525af95.tar.gz
4 files changed, 138 insertions, 4 deletions
diff --git a/mysql-test/suite/innodb/r/lock_move_wait_lock_race.result b/mysql-test/suite/innodb/r/lock_move_wait_lock_race.result
new file mode 100644
index 00000000000..572fbc9b1d1
--- /dev/null
+++ b/mysql-test/suite/innodb/r/lock_move_wait_lock_race.result
@@ -0,0 +1,34 @@
+CREATE TABLE t (pk int PRIMARY KEY, c varchar(10)) ENGINE=InnoDB;
+INSERT INTO t VALUES (10, "0123456789");
+connection default;
+BEGIN;
+SELECT * FROM t WHERE c = 10 FOR UPDATE;
+pk	c
+connect trx2, localhost,root,,;
+BEGIN;
+SET DEBUG_SYNC="lock_wait_start SIGNAL trx2_start_waiting";
+SET DEBUG_SYNC="lock_wait_end SIGNAL trx2_wait_end WAIT_FOR trx2_cont_upd";
+SET DEBUG_SYNC="lock_rec_store_on_page_infimum_end SIGNAL trx2_moved_locks WAIT_FOR trx2_cont";
+UPDATE t SET c = NULL WHERE pk = 10;
+connect trx3, localhost,root,,;
+SET DEBUG_SYNC="now WAIT_FOR trx2_start_waiting";
+SET innodb_lock_wait_timeout=1;
+BEGIN;
+SET DEBUG_SYNC="lock_wait_start SIGNAL trx3_start_waiting WAIT_FOR trx3_cont_waiting";
+SET DEBUG_SYNC="lock_sys_t_cancel_enter SIGNAL trx3_cancel_enter WAIT_FOR trx3_cont_cancel_waiting";
+UPDATE t SET c = "abcdefghij" WHERE pk = 10;
+connection default;
+SET DEBUG_SYNC="now WAIT_FOR trx3_start_waiting";
+COMMIT;
+SET DEBUG_SYNC="now WAIT_FOR trx2_wait_end";
+SET DEBUG_SYNC="now SIGNAL trx3_cont_waiting";
+SET DEBUG_SYNC="now WAIT_FOR trx3_cancel_enter";
+SET DEBUG_SYNC="now SIGNAL trx2_cont_upd";
+SET DEBUG_SYNC="now WAIT_FOR trx2_moved_locks";
+SET DEBUG_SYNC="now SIGNAL trx3_cont_cancel_waiting";
+SET DEBUG_SYNC="now SIGNAL trx2_cont";
+disconnect trx2;
+disconnect trx3;
+connection default;
+SET DEBUG_SYNC="RESET";
+DROP TABLE t;
diff --git a/mysql-test/suite/innodb/t/lock_move_wait_lock_race.test b/mysql-test/suite/innodb/t/lock_move_wait_lock_race.test
new file mode 100644
index 00000000000..3a04c7127c8
--- /dev/null
+++ b/mysql-test/suite/innodb/t/lock_move_wait_lock_race.test
@@ -0,0 +1,58 @@
+--source include/have_innodb.inc
+--source include/count_sessions.inc
+--source include/have_debug.inc
+--source include/have_debug_sync.inc
+
+CREATE TABLE t (pk int PRIMARY KEY, c varchar(10)) ENGINE=InnoDB;
+INSERT INTO t VALUES (10, "0123456789");
+
+--connection default
+BEGIN;
+SELECT * FROM t WHERE c = 10 FOR UPDATE;
+
+--connect(trx2, localhost,root,,)
+BEGIN;
+SET DEBUG_SYNC="lock_wait_start SIGNAL trx2_start_waiting";
+SET DEBUG_SYNC="lock_wait_end SIGNAL trx2_wait_end WAIT_FOR trx2_cont_upd";
+SET DEBUG_SYNC="lock_rec_store_on_page_infimum_end SIGNAL trx2_moved_locks WAIT_FOR trx2_cont";
+#################
+# We need to update clustered record without changing ordering fields and
+# changing the size of non-ordering fields to cause locks moving from deleted
+# record to infimum.
+###
+--send UPDATE t SET c = NULL WHERE pk = 10
+
+
+--connect(trx3, localhost,root,,)
+SET DEBUG_SYNC="now WAIT_FOR trx2_start_waiting";
+#################
+# The condition wariable waiting in lock_wait() must be finished by timeout
+###
+SET innodb_lock_wait_timeout=1;
+BEGIN;
+SET DEBUG_SYNC="lock_wait_start SIGNAL trx3_start_waiting WAIT_FOR trx3_cont_waiting";
+SET DEBUG_SYNC="lock_sys_t_cancel_enter SIGNAL trx3_cancel_enter WAIT_FOR trx3_cont_cancel_waiting";
+--send UPDATE t SET c = "abcdefghij" WHERE pk = 10
+
+--connection default
+SET DEBUG_SYNC="now WAIT_FOR trx3_start_waiting";
+COMMIT;
+SET DEBUG_SYNC="now WAIT_FOR trx2_wait_end";
+SET DEBUG_SYNC="now SIGNAL trx3_cont_waiting";
+SET DEBUG_SYNC="now WAIT_FOR trx3_cancel_enter";
+SET DEBUG_SYNC="now SIGNAL trx2_cont_upd";
+SET DEBUG_SYNC="now WAIT_FOR trx2_moved_locks";
+#################
+# If the bug is not fixed, there will be assertion failure here, because trx2
+# moved trx3 lock from deleted record to infimum when trx3 tried to cancel the
+# lock.
+###
+SET DEBUG_SYNC="now SIGNAL trx3_cont_cancel_waiting";
+SET DEBUG_SYNC="now SIGNAL trx2_cont";
+
+--disconnect trx2
+--disconnect trx3
+--connection default
+SET DEBUG_SYNC="RESET";
+DROP TABLE t;
+--source include/wait_until_count_sessions.inc
diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h
index b67a1011f6b..16acd031177 100644
--- a/storage/innobase/include/lock0lock.h
+++ b/storage/innobase/include/lock0lock.h
@@ -891,8 +891,8 @@ public:
 
   /** Cancel a waiting lock request.
   @tparam check_victim  whether to check for DB_DEADLOCK
-  @param lock           waiting lock request
   @param trx            active transaction
+  @param lock           waiting lock request
   @retval DB_SUCCESS    if no lock existed
   @retval DB_DEADLOCK   if trx->lock.was_chosen_as_deadlock_victim was set
   @retval DB_LOCK_WAIT  if the lock was canceled */
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index 9577dfc62aa..2b30b9b1a03 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -46,12 +46,12 @@ Created 5/7/1996 Heikki Tuuri
 #include "srv0mon.h"
 #include "que0que.h"
 #include "scope.h"
+#include <debug_sync.h>
 
 #include <set>
 
 #ifdef WITH_WSREP
 #include <mysql/service_wsrep.h>
-#include <debug_sync.h>
 #endif /* WITH_WSREP */
 
 /** The value of innodb_deadlock_detect */
@@ -1882,6 +1882,7 @@ check_trx_error:
   if (row_lock_wait)
     lock_sys.wait_resume(trx->mysql_thd, suspend_time, my_hrtime_coarse());
 
+  /* Cache trx->lock.wait_lock to avoid unnecessary atomic variable load */
   if (lock_t *lock= trx->lock.wait_lock)
   {
     lock_sys_t::cancel<false>(trx, lock);
@@ -1905,6 +1906,12 @@ void lock_wait_end(trx_t *trx)
   ut_d(const auto state= trx->state);
   ut_ad(state == TRX_STATE_COMMITTED_IN_MEMORY || state == TRX_STATE_ACTIVE ||
         state == TRX_STATE_PREPARED);
+  /* lock_wait() checks trx->lock.was_chosen_as_deadlock_victim flag before
+  requesting lock_sys.wait_mutex, and if the flag is set, it returns error,
+  what causes transaction rollback, which can reset trx->lock.wait_thr before
+  deadlock resolution starts cancelling victim's waiting lock. That's why we
+  don't check trx->lock.wait_thr here if the function was called from deadlock
+  resolution function. */
   ut_ad(from_deadlock || trx->lock.wait_thr);
 
   if (trx->lock.was_chosen_as_deadlock_victim)
@@ -3193,6 +3200,8 @@ lock_rec_store_on_page_infimum(
 
   ut_ad(block->page.frame == page_align(rec));
   const page_id_t id{block->page.id()};
+  ut_d(SCOPE_EXIT(
+      []() { DEBUG_SYNC_C("lock_rec_store_on_page_infimum_end"); }));
 
   LockGuard g{lock_sys.rec_hash, id};
   lock_rec_move(g.cell(), *block, id, g.cell(), id,
@@ -5770,17 +5779,30 @@ void lock_sys_t::cancel_lock_wait_for_trx(trx_t *trx)
 
 /** Cancel a waiting lock request.
 @tparam check_victim  whether to check for DB_DEADLOCK
-@param lock           waiting lock request
 @param trx            active transaction
+@param lock           waiting lock request
 @retval DB_SUCCESS    if no lock existed
 @retval DB_DEADLOCK   if trx->lock.was_chosen_as_deadlock_victim was set
 @retval DB_LOCK_WAIT  if the lock was canceled */
 template<bool check_victim>
 dberr_t lock_sys_t::cancel(trx_t *trx, lock_t *lock)
 {
+  DEBUG_SYNC_C("lock_sys_t_cancel_enter");
   mysql_mutex_assert_owner(&lock_sys.wait_mutex);
-  ut_ad(trx->lock.wait_lock == lock);
   ut_ad(trx->state == TRX_STATE_ACTIVE);
+  /* trx->lock.wait_lock may be changed by other threads as long as
+  we are not holding lock_sys.latch.
+
+  So, trx->lock.wait_lock==lock does not necessarily hold, but both
+  pointers should be valid, because other threads cannot assign
+  trx->lock.wait_lock=nullptr (or invalidate *lock) while we are
+  holding lock_sys.wait_mutex. Also, the type of trx->lock.wait_lock
+  (record or table lock) cannot be changed by other threads. So, it is
+  safe to call lock->is_table() while not holding lock_sys.latch. If
+  we have to release and reacquire lock_sys.wait_mutex, we must reread
+  trx->lock.wait_lock. We must also reread trx->lock.wait_lock after
+  lock_sys.latch acquiring, as it can be changed to not-null in lock moving
+  functions even if we hold lock_sys.wait_mutex. */
   dberr_t err= DB_SUCCESS;
   /* This would be too large for a memory transaction, except in the
   DB_DEADLOCK case, which was already tested in lock_trx_handle_wait(). */
@@ -5802,6 +5824,15 @@ dberr_t lock_sys_t::cancel(trx_t *trx, lock_t *lock)
     }
     else
     {
+      /* This function is invoked from the thread which executes the
+      transaction. Table locks are requested before record locks. Some other
+      transaction can't change trx->lock.wait_lock from table to record for the
+      current transaction at this point, because the current transaction has not
+      requested record locks yet. There is no need to move any table locks by
+      other threads. And trx->lock.wait_lock can't be set to null while we are
+      holding lock_sys.wait_mutex. That's why there is no need to reload
+      trx->lock.wait_lock here. */
+      ut_ad(lock == trx->lock.wait_lock);
 resolve_table_lock:
       dict_table_t *table= lock->un_member.tab_lock.table;
       if (!table->lock_mutex_trylock())
@@ -5812,6 +5843,7 @@ resolve_table_lock:
         mysql_mutex_unlock(&lock_sys.wait_mutex);
         table->lock_mutex_lock();
         mysql_mutex_lock(&lock_sys.wait_mutex);
+        /* Cache trx->lock.wait_lock under the corresponding latches. */
         lock= trx->lock.wait_lock;
         if (!lock)
           goto retreat;
@@ -5821,6 +5853,10 @@ resolve_table_lock:
           goto retreat;
         }
       }
+      else
+        /* Cache trx->lock.wait_lock under the corresponding latches if
+        it was not cached yet */
+        lock= trx->lock.wait_lock;
       if (lock->is_waiting())
         lock_cancel_waiting_and_release(lock);
       /* Even if lock->is_waiting() did not hold above, we must return
@@ -5844,6 +5880,7 @@ retreat:
       mysql_mutex_unlock(&lock_sys.wait_mutex);
       lock_sys.wr_lock(SRW_LOCK_CALL);
       mysql_mutex_lock(&lock_sys.wait_mutex);
+      /* Cache trx->lock.wait_lock under the corresponding latches. */
       lock= trx->lock.wait_lock;
       /* Even if waiting lock was cancelled while lock_sys.wait_mutex was
       unlocked, we need to return deadlock error if transaction was chosen
@@ -5855,6 +5892,9 @@ retreat:
     }
     else
     {
+      /* Cache trx->lock.wait_lock under the corresponding latches if
+      it was not cached yet */
+      lock= trx->lock.wait_lock;
 resolve_record_lock:
       if (lock->is_waiting())
         lock_cancel_waiting_and_release(lock);
@@ -5876,6 +5916,7 @@ resolve_record_lock:
 void lock_sys_t::cancel(trx_t *trx)
 {
   mysql_mutex_lock(&lock_sys.wait_mutex);
+  /* Cache trx->lock.wait_lock to avoid unnecessary atomic variable load */
   if (lock_t *lock= trx->lock.wait_lock)
   {
     /* Dictionary transactions must be immune to KILL, because they
@@ -5943,6 +5984,7 @@ dberr_t lock_trx_handle_wait(trx_t *trx)
   mysql_mutex_lock(&lock_sys.wait_mutex);
   if (trx->lock.was_chosen_as_deadlock_victim)
     err= DB_DEADLOCK;
+  /* Cache trx->lock.wait_lock to avoid unnecessary atomic variable load */
   else if (lock_t *wait_lock= trx->lock.wait_lock)
     err= lock_sys_t::cancel<true>(trx, wait_lock);
   lock_sys.deadlock_check();
author	Vlad Lesin <vlad_lesin@mail.ru>	2023-02-09 13:51:57 +0300
committer	Vlad Lesin <vlad_lesin@mail.ru>	2023-02-20 20:31:24 +0300
commit	a474e3278c202d5b429134071cedada3d525af95 (patch)
tree	90c14db77fd9fe636db370fc16514eeb6b949072
parent	67a6ad0a4a36bd59fabbdd6b1cdd38de54e82c79 (diff)
download	mariadb-git-a474e3278c202d5b429134071cedada3d525af95.tar.gz