summaryrefslogtreecommitdiff
path: root/sql/rpl_parallel.cc
diff options
context:
space:
mode:
authorKristian Nielsen <knielsen@knielsen-hq.org>2015-08-04 11:20:03 +0200
committerKristian Nielsen <knielsen@knielsen-hq.org>2015-08-04 11:40:19 +0200
commit9b9c5e890c16176eaf6b54d466708c54e2df7c9d (patch)
tree370ca46a9c0471e416b29bee0669b9ed61ee7714 /sql/rpl_parallel.cc
parenta6087e7dc1ef3561d8189c8db15e9591d0f9b520 (diff)
downloadmariadb-git-9b9c5e890c16176eaf6b54d466708c54e2df7c9d.tar.gz
MDEV-8302: Duplicate key with parallel replication
This bug is essentially another variant of MDEV-7458. If a transaction conflict caused a deadlock kill of T2 in record_gtid() during commit, the code would do a rollback _before_ running rgi->unmark_start_commit(). This creates a race where following transactions could start too early (before T2 has completed its transaction retry). This in turn could lead to replication failure, if there was a conflict that caused eg. duplicate key error or similar. The fix is to remove these rollbacks (in Query_log_event::do_apply_event() and Xid_log_event::do_apply_event(). They seem out-of-place; code in log_event.cc generally does not roll back on error, this is handled higher up. In addition, because of the extreme difficulty of reproducing bugs like MDEV-7458 and MDEV-8302, this patch adds some extra precations to try to detect (in debug builds) or prevent (in release builds) similar bugs. ha_rollback_trans() will now call unmark_start_commit() if needed (and assert in debug build when a caller does rollback without unmark first). We also add an extra check for thd->killed() so that we avoid doing mark_start_commit() if we already have a pending deadlock kill. And we add a missing unmark_start_commit() call in the error case, found by the above assertion.
Diffstat (limited to 'sql/rpl_parallel.cc')
-rw-r--r--sql/rpl_parallel.cc39
1 files changed, 35 insertions, 4 deletions
diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc
index 305e8053032..600d2ab41aa 100644
--- a/sql/rpl_parallel.cc
+++ b/sql/rpl_parallel.cc
@@ -226,6 +226,11 @@ static void
signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi, int err)
{
rgi->worker_error= err;
+ /*
+ In case we get an error during commit, inform following transactions that
+ we aborted our commit.
+ */
+ rgi->unmark_start_commit();
rgi->cleanup_context(thd, true);
rgi->rli->abort_slave= true;
rgi->rli->stop_for_until= false;
@@ -370,6 +375,7 @@ do_retry:
transaction we deadlocked with will not signal that it started to commit
until after the unmark.
*/
+ DBUG_EXECUTE_IF("inject_mdev8302", { my_sleep(20000);});
rgi->unmark_start_commit();
DEBUG_SYNC(thd, "rpl_parallel_retry_after_unmark");
@@ -884,9 +890,24 @@ handle_rpl_parallel_thread(void *arg)
group_ending= is_group_ending(qev->ev, event_type);
if (group_ending && likely(!rgi->worker_error))
{
- DEBUG_SYNC(thd, "rpl_parallel_before_mark_start_commit");
- rgi->mark_start_commit();
- DEBUG_SYNC(thd, "rpl_parallel_after_mark_start_commit");
+ /*
+ Do an extra check for (deadlock) kill here. This helps prevent a
+ lingering deadlock kill that occured during normal DML processing to
+ propagate past the mark_start_commit(). If we detect a deadlock only
+ after mark_start_commit(), we have to unmark, which has at least a
+ theoretical possibility of leaving a window where it looks like all
+ transactions in a GCO have started committing, while in fact one
+ will need to rollback and retry. This is not supposed to be possible
+ (since there is a deadlock, at least one transaction should be
+ blocked from reaching commit), but this seems a fragile ensurance,
+ and there were historically a number of subtle bugs in this area.
+ */
+ if (!thd->killed)
+ {
+ DEBUG_SYNC(thd, "rpl_parallel_before_mark_start_commit");
+ rgi->mark_start_commit();
+ DEBUG_SYNC(thd, "rpl_parallel_after_mark_start_commit");
+ }
}
/*
@@ -911,7 +932,17 @@ handle_rpl_parallel_thread(void *arg)
});
if (!err)
#endif
- err= rpt_handle_event(qev, rpt);
+ {
+ if (thd->check_killed())
+ {
+ thd->clear_error();
+ thd->get_stmt_da()->reset_diagnostics_area();
+ thd->send_kill_message();
+ err= 1;
+ }
+ else
+ err= rpt_handle_event(qev, rpt);
+ }
delete_or_keep_event_post_apply(rgi, event_type, qev->ev);
DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_gtid_0_x_100",
err= dbug_simulate_tmp_error(rgi, thd););