From 68cfcf9cb6821c3d333b97b213b44627e433861c Mon Sep 17 00:00:00 2001 From: sjaakola Date: Tue, 8 Nov 2022 16:36:34 +0200 Subject: MDEV-29512 deadlock between commit monitor and THD::LOCK_thd_data mutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit contains only a mtr test for reproducing the issue in MDEV-29512 The actual fix will be pushed in wsrep-lib repository The hanging in MDEV-29512 happens when binlog purging is attempted, and there is one local BF aborted transaction waiting for commit monitor. The test will launch two node cluster and enable binlogging with expire log days, to force binlog purging to happen. A local transaction is executed so that will become BF abort victim, and has advanced to replication stage waiting for commit monitor for final cleanup (to mark position in innodb) after that, applier is released to complete the BF abort and due to binlog configuration, starting the binlog purging. This is where the hanging would occur, if code is buggy Reviewed-by: Jan Lindström --- mysql-test/suite/galera/r/galera_MDEV-29512.result | 40 ++++++++++ mysql-test/suite/galera/t/galera_MDEV-29512.cnf | 15 ++++ mysql-test/suite/galera/t/galera_MDEV-29512.test | 91 ++++++++++++++++++++++ 3 files changed, 146 insertions(+) create mode 100644 mysql-test/suite/galera/r/galera_MDEV-29512.result create mode 100644 mysql-test/suite/galera/t/galera_MDEV-29512.cnf create mode 100644 mysql-test/suite/galera/t/galera_MDEV-29512.test diff --git a/mysql-test/suite/galera/r/galera_MDEV-29512.result b/mysql-test/suite/galera/r/galera_MDEV-29512.result new file mode 100644 index 00000000000..aaf24df920e --- /dev/null +++ b/mysql-test/suite/galera/r/galera_MDEV-29512.result @@ -0,0 +1,40 @@ +connection node_2; +connection node_1; +CREATE TABLE t1 (f1 INTEGER PRIMARY KEY, f2 int, f3 varchar(2000)); +INSERT INTO t1 VALUES (1, 0, REPEAT('1234567890', 200)); +INSERT INTO t1 VALUES (3, 3, REPEAT('1234567890', 200)); +SET SESSION wsrep_sync_wait=0; +SET GLOBAL DEBUG_DBUG = "d,sync.wsrep_apply_cb"; +connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1; +connection node_1a; +SET SESSION wsrep_sync_wait=0; +connection node_1; +begin; +select f1,f2 from t1; +f1 f2 +1 0 +3 3 +connection node_2; +UPDATE t1 SET f2=2 WHERE f1=3; +connection node_1a; +SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_apply_cb_reached"; +connection node_1; +UPDATE t1 SET f2=1 WHERE f1=3; +SET GLOBAL wsrep_provider_options = 'dbug=d,commit_monitor_master_enter_sync'; +COMMIT; +connection node_1a; +SET SESSION wsrep_on = 0; +SET SESSION wsrep_on = 1; +SET GLOBAL wsrep_provider_options = 'dbug='; +SET GLOBAL wsrep_provider_options = 'signal=commit_monitor_master_enter_sync'; +SET GLOBAL DEBUG_DBUG = ""; +SET DEBUG_SYNC = "now SIGNAL signal.wsrep_apply_cb"; +SET GLOBAL debug_dbug = NULL; +SET debug_sync='RESET'; +connection node_1; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +select f1,f2 from t1; +f1 f2 +1 0 +3 2 +DROP TABLE t1; diff --git a/mysql-test/suite/galera/t/galera_MDEV-29512.cnf b/mysql-test/suite/galera/t/galera_MDEV-29512.cnf new file mode 100644 index 00000000000..bf8e0c37984 --- /dev/null +++ b/mysql-test/suite/galera/t/galera_MDEV-29512.cnf @@ -0,0 +1,15 @@ +!include ../galera_2nodes.cnf + +[mysqld] +log-bin +log-slave-updates + +[mysqld.1] +log_bin +log_slave_updates +max-binlog-size=4096 +expire-logs-days=1 + + +[mysqld.2] + diff --git a/mysql-test/suite/galera/t/galera_MDEV-29512.test b/mysql-test/suite/galera/t/galera_MDEV-29512.test new file mode 100644 index 00000000000..ffcef792f85 --- /dev/null +++ b/mysql-test/suite/galera/t/galera_MDEV-29512.test @@ -0,0 +1,91 @@ +# +# This test is for reproducing the issue in: +# https://jira.mariadb.org/browse/MDEV-29512 +# +# The hanging in MDEV-29512 happens when binlog purging is attempted, and there is +# one local BF aborted transaction waiting for commit monitor. +# +# The test will launch two node cluster and enable binlogging with expire log days, +# to force binlog purging to happen. +# A local transaction is executed so that will become BF abort victim, and has advanced +# to replication stage waiting for commit monitor for final cleanup (to mark position in innodb) +# after that, applier is released to complete the BF abort and due to binlog configuration, +# starting the binlog purging. This is where the hanging would occur, if code is buggy +# +--source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_debug_sync.inc +--source include/galera_have_debug_sync.inc + +# +# binlog size is limited to 4096 bytes, we will create enough events to +# cause binlog rotation +# +CREATE TABLE t1 (f1 INTEGER PRIMARY KEY, f2 int, f3 varchar(2000)); +INSERT INTO t1 VALUES (1, 0, REPEAT('1234567890', 200)); +INSERT INTO t1 VALUES (3, 3, REPEAT('1234567890', 200)); + +SET SESSION wsrep_sync_wait=0; + +# set sync point for replication applier +SET GLOBAL DEBUG_DBUG = "d,sync.wsrep_apply_cb"; + +# Control connection to manage sync points for appliers +--connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1 +--connection node_1a +SET SESSION wsrep_sync_wait=0; + +# starting local transaction, only select so far, +# write will happen later and this will be ordered after the transaction in node_2 +--connection node_1 +begin; +select f1,f2 from t1; + +# send from node 2 an UPDATE transaction, which will BF abort the transaction in node_1 +--connection node_2 +--let $wait_condition=select count(*)=2 from t1 +--source include/wait_condition.inc + +UPDATE t1 SET f2=2 WHERE f1=3; + +--connection node_1a +# wait to see the UPDATE from node_2 in apply_cb sync point +SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_apply_cb_reached"; + +--connection node_1 +# now issuing conflicting update +UPDATE t1 SET f2=1 WHERE f1=3; + +# Block the local commit, send final COMMIT and wait until it gets blocked +--let $galera_sync_point = commit_monitor_master_enter_sync +--source include/galera_set_sync_point.inc +--send COMMIT + +--connection node_1a +# wait for the local commit to enter in commit monitor wait state +--let $galera_sync_point = commit_monitor_master_enter_sync +--source include/galera_wait_sync_point.inc +--source include/galera_clear_sync_point.inc + +# release the local transaction to continue with commit +--let $galera_sync_point = commit_monitor_master_enter_sync +--source include/galera_signal_sync_point.inc + +# and now release the applier, it should force local trx to abort +SET GLOBAL DEBUG_DBUG = ""; +SET DEBUG_SYNC = "now SIGNAL signal.wsrep_apply_cb"; +SET GLOBAL debug_dbug = NULL; +SET debug_sync='RESET'; + +--connection node_1 +--error ER_LOCK_DEADLOCK +--reap + +# wait until applying is complete +--let $wait_condition = SELECT COUNT(*)=1 FROM t1 WHERE f2=2 +--source include/wait_condition.inc + +# final read to verify what we got +select f1,f2 from t1; + +DROP TABLE t1; -- cgit v1.2.1