diff options
-rw-r--r-- | mysql-test/suite/rpl/r/rpl_parallel.result | 40 | ||||
-rw-r--r-- | mysql-test/suite/rpl/t/rpl_parallel.test | 54 | ||||
-rw-r--r-- | sql/handler.cc | 18 | ||||
-rw-r--r-- | sql/log_event.cc | 2 | ||||
-rw-r--r-- | sql/rpl_parallel.cc | 39 |
5 files changed, 147 insertions, 6 deletions
diff --git a/mysql-test/suite/rpl/r/rpl_parallel.result b/mysql-test/suite/rpl/r/rpl_parallel.result index 459da711171..a317650b41e 100644 --- a/mysql-test/suite/rpl/r/rpl_parallel.result +++ b/mysql-test/suite/rpl/r/rpl_parallel.result @@ -1703,6 +1703,46 @@ include/stop_slave.inc SET GLOBAL debug_dbug= @old_debg; SET GLOBAL max_relay_log_size= @old_max; include/start_slave.inc +*** MDEV-8302: Duplicate key with parallel replication *** +include/stop_slave.inc +/* Inject a small sleep which makes the race easier to hit. */ +SET @old_dbug=@@GLOBAL.debug_dbug; +SET GLOBAL debug_dbug="+d,inject_mdev8302"; +INSERT INTO t7 VALUES (100,1), (101,2), (102,3), (103,4), (104,5); +SET @old_dbug= @@SESSION.debug_dbug; +SET @commit_id= 20000; +SET SESSION debug_dbug="+d,binlog_force_commit_id"; +SET SESSION debug_dbug=@old_dbug; +SELECT * FROM t7 ORDER BY a; +a b +1 1 +2 2 +3 86 +4 4 +5 5 +100 5 +101 1 +102 2 +103 3 +104 4 +include/save_master_gtid.inc +include/start_slave.inc +include/sync_with_master_gtid.inc +SELECT * FROM t7 ORDER BY a; +a b +1 1 +2 2 +3 86 +4 4 +5 5 +100 5 +101 1 +102 2 +103 3 +104 4 +include/stop_slave.inc +SET GLOBAL debug_dbug=@old_dbug; +include/start_slave.inc include/stop_slave.inc SET GLOBAL slave_parallel_threads=@old_parallel_threads; include/start_slave.inc diff --git a/mysql-test/suite/rpl/t/rpl_parallel.test b/mysql-test/suite/rpl/t/rpl_parallel.test index 6861f1bbebe..54294000b6f 100644 --- a/mysql-test/suite/rpl/t/rpl_parallel.test +++ b/mysql-test/suite/rpl/t/rpl_parallel.test @@ -2374,6 +2374,60 @@ SET GLOBAL max_relay_log_size= @old_max; --source include/start_slave.inc +--echo *** MDEV-8302: Duplicate key with parallel replication *** + +--connection server_2 +--source include/stop_slave.inc +/* Inject a small sleep which makes the race easier to hit. */ +SET @old_dbug=@@GLOBAL.debug_dbug; +SET GLOBAL debug_dbug="+d,inject_mdev8302"; + + +--connection server_1 +INSERT INTO t7 VALUES (100,1), (101,2), (102,3), (103,4), (104,5); + +# Artificially create a bunch of group commits with conflicting transactions. +# The bug happened when T1 and T2 was in one group commit, and T3 was in the +# following group commit. T2 is a DELETE of a row with same primary key as a +# row that T3 inserts. T1 and T2 can conflict, causing T2 to be deadlock +# killed after starting to commit. The bug was that T2 could roll back before +# doing unmark_start_commit(); this could allow T3 to run before the retry +# of T2, causing duplicate key violation. + +SET @old_dbug= @@SESSION.debug_dbug; +SET @commit_id= 20000; +SET SESSION debug_dbug="+d,binlog_force_commit_id"; + +--let $n = 100 +--disable_query_log +while ($n) +{ + eval UPDATE t7 SET b=b+1 WHERE a=100+($n MOD 5); + eval DELETE FROM t7 WHERE a=100+($n MOD 5); + + SET @commit_id = @commit_id + 1; + eval INSERT INTO t7 VALUES (100+($n MOD 5), $n); + SET @commit_id = @commit_id + 1; + dec $n; +} +--enable_query_log +SET SESSION debug_dbug=@old_dbug; + + +SELECT * FROM t7 ORDER BY a; +--source include/save_master_gtid.inc + + +--connection server_2 +--source include/start_slave.inc +--source include/sync_with_master_gtid.inc +SELECT * FROM t7 ORDER BY a; + +--source include/stop_slave.inc +SET GLOBAL debug_dbug=@old_dbug; +--source include/start_slave.inc + + # Clean up. --connection server_2 diff --git a/sql/handler.cc b/sql/handler.cc index 70f30580a07..4f070d3336b 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -1615,6 +1615,24 @@ int ha_rollback_trans(THD *thd, bool all) DBUG_ASSERT(thd->transaction.stmt.ha_list == NULL || trans == &thd->transaction.stmt); + if (is_real_trans) + { + /* + In parallel replication, if we need to rollback during commit, we must + first inform following transactions that we are going to abort our commit + attempt. Otherwise those following transactions can run too early, and + possibly cause replication to fail. See comments in retry_event_group(). + + There were several bugs with this in the past that were very hard to + track down (MDEV-7458, MDEV-8302). So we add here an assertion for + rollback without signalling following transactions. And in release + builds, we explicitly do the signalling before rolling back. + */ + DBUG_ASSERT(!(thd->rgi_slave && thd->rgi_slave->did_mark_start_commit)); + if (thd->rgi_slave && thd->rgi_slave->did_mark_start_commit) + thd->rgi_slave->unmark_start_commit(); + } + if (thd->in_sub_stmt) { DBUG_ASSERT(0); diff --git a/sql/log_event.cc b/sql/log_event.cc index 87fa145d730..786b249db39 100644 --- a/sql/log_event.cc +++ b/sql/log_event.cc @@ -4234,7 +4234,6 @@ int Query_log_event::do_apply_event(rpl_group_info *rgi, "mysql", rpl_gtid_slave_state_table_name.str, errcode, thd->get_stmt_da()->message()); - trans_rollback(thd); sub_id= 0; thd->is_slave_error= 1; goto end; @@ -7365,7 +7364,6 @@ int Xid_log_event::do_apply_event(rpl_group_info *rgi) "%s.%s: %d: %s", "mysql", rpl_gtid_slave_state_table_name.str, ec, thd->get_stmt_da()->message()); - trans_rollback(thd); thd->is_slave_error= 1; return err; } diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc index 29a87cc627b..6a1ef257ec5 100644 --- a/sql/rpl_parallel.cc +++ b/sql/rpl_parallel.cc @@ -225,6 +225,11 @@ static void signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi, int err) { rgi->worker_error= err; + /* + In case we get an error during commit, inform following transactions that + we aborted our commit. + */ + rgi->unmark_start_commit(); rgi->cleanup_context(thd, true); rgi->rli->abort_slave= true; rgi->rli->stop_for_until= false; @@ -377,6 +382,7 @@ do_retry: transaction we deadlocked with will not signal that it started to commit until after the unmark. */ + DBUG_EXECUTE_IF("inject_mdev8302", { my_sleep(20000);}); rgi->unmark_start_commit(); DEBUG_SYNC(thd, "rpl_parallel_retry_after_unmark"); @@ -914,9 +920,24 @@ handle_rpl_parallel_thread(void *arg) group_ending= is_group_ending(qev->ev, event_type); if (group_ending && likely(!rgi->worker_error)) { - DEBUG_SYNC(thd, "rpl_parallel_before_mark_start_commit"); - rgi->mark_start_commit(); - DEBUG_SYNC(thd, "rpl_parallel_after_mark_start_commit"); + /* + Do an extra check for (deadlock) kill here. This helps prevent a + lingering deadlock kill that occured during normal DML processing to + propagate past the mark_start_commit(). If we detect a deadlock only + after mark_start_commit(), we have to unmark, which has at least a + theoretical possibility of leaving a window where it looks like all + transactions in a GCO have started committing, while in fact one + will need to rollback and retry. This is not supposed to be possible + (since there is a deadlock, at least one transaction should be + blocked from reaching commit), but this seems a fragile ensurance, + and there were historically a number of subtle bugs in this area. + */ + if (!thd->killed) + { + DEBUG_SYNC(thd, "rpl_parallel_before_mark_start_commit"); + rgi->mark_start_commit(); + DEBUG_SYNC(thd, "rpl_parallel_after_mark_start_commit"); + } } /* @@ -941,7 +962,17 @@ handle_rpl_parallel_thread(void *arg) }); if (!err) #endif - err= rpt_handle_event(qev, rpt); + { + if (thd->check_killed()) + { + thd->clear_error(); + thd->get_stmt_da()->reset_diagnostics_area(); + thd->send_kill_message(); + err= 1; + } + else + err= rpt_handle_event(qev, rpt); + } delete_or_keep_event_post_apply(rgi, event_type, qev->ev); DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_gtid_0_x_100", err= dbug_simulate_tmp_error(rgi, thd);); |