diff options
author | Kristian Nielsen <knielsen@knielsen-hq.org> | 2015-08-04 11:20:03 +0200 |
---|---|---|
committer | Kristian Nielsen <knielsen@knielsen-hq.org> | 2015-08-04 11:40:19 +0200 |
commit | 9b9c5e890c16176eaf6b54d466708c54e2df7c9d (patch) | |
tree | 370ca46a9c0471e416b29bee0669b9ed61ee7714 /sql/rpl_parallel.cc | |
parent | a6087e7dc1ef3561d8189c8db15e9591d0f9b520 (diff) | |
download | mariadb-git-9b9c5e890c16176eaf6b54d466708c54e2df7c9d.tar.gz |
MDEV-8302: Duplicate key with parallel replication
This bug is essentially another variant of MDEV-7458.
If a transaction conflict caused a deadlock kill of T2 in record_gtid()
during commit, the code would do a rollback _before_ running
rgi->unmark_start_commit(). This creates a race where following transactions
could start too early (before T2 has completed its transaction retry). This
in turn could lead to replication failure, if there was a conflict that
caused e.g. a duplicate key error or similar.
The fix is to remove these rollbacks (in Query_log_event::do_apply_event()
and Xid_log_event::do_apply_event()). They seem out-of-place; code in
log_event.cc generally does not roll back on error, this is handled higher
up.
In addition, because of the extreme difficulty of reproducing bugs like
MDEV-7458 and MDEV-8302, this patch adds some extra precautions to try to
detect (in debug builds) or prevent (in release builds) similar bugs.
ha_rollback_trans() will now call unmark_start_commit() if needed (and
assert in debug build when a caller does rollback without unmark first).
We also add an extra check for thd->killed() so that we avoid doing
mark_start_commit() if we already have a pending deadlock kill.
And we add a missing unmark_start_commit() call in the error case, found by
the above assertion.
Diffstat (limited to 'sql/rpl_parallel.cc')
-rw-r--r-- | sql/rpl_parallel.cc | 39 |
1 files changed, 35 insertions, 4 deletions
diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc index 305e8053032..600d2ab41aa 100644 --- a/sql/rpl_parallel.cc +++ b/sql/rpl_parallel.cc @@ -226,6 +226,11 @@ static void signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi, int err) { rgi->worker_error= err; + /* + In case we get an error during commit, inform following transactions that + we aborted our commit. + */ + rgi->unmark_start_commit(); rgi->cleanup_context(thd, true); rgi->rli->abort_slave= true; rgi->rli->stop_for_until= false; @@ -370,6 +375,7 @@ do_retry: transaction we deadlocked with will not signal that it started to commit until after the unmark. */ + DBUG_EXECUTE_IF("inject_mdev8302", { my_sleep(20000);}); rgi->unmark_start_commit(); DEBUG_SYNC(thd, "rpl_parallel_retry_after_unmark"); @@ -884,9 +890,24 @@ handle_rpl_parallel_thread(void *arg) group_ending= is_group_ending(qev->ev, event_type); if (group_ending && likely(!rgi->worker_error)) { - DEBUG_SYNC(thd, "rpl_parallel_before_mark_start_commit"); - rgi->mark_start_commit(); - DEBUG_SYNC(thd, "rpl_parallel_after_mark_start_commit"); + /* + Do an extra check for (deadlock) kill here. This helps prevent a + lingering deadlock kill that occured during normal DML processing to + propagate past the mark_start_commit(). If we detect a deadlock only + after mark_start_commit(), we have to unmark, which has at least a + theoretical possibility of leaving a window where it looks like all + transactions in a GCO have started committing, while in fact one + will need to rollback and retry. This is not supposed to be possible + (since there is a deadlock, at least one transaction should be + blocked from reaching commit), but this seems a fragile ensurance, + and there were historically a number of subtle bugs in this area. 
+ */ + if (!thd->killed) + { + DEBUG_SYNC(thd, "rpl_parallel_before_mark_start_commit"); + rgi->mark_start_commit(); + DEBUG_SYNC(thd, "rpl_parallel_after_mark_start_commit"); + } } /* @@ -911,7 +932,17 @@ handle_rpl_parallel_thread(void *arg) }); if (!err) #endif - err= rpt_handle_event(qev, rpt); + { + if (thd->check_killed()) + { + thd->clear_error(); + thd->get_stmt_da()->reset_diagnostics_area(); + thd->send_kill_message(); + err= 1; + } + else + err= rpt_handle_event(qev, rpt); + } delete_or_keep_event_post_apply(rgi, event_type, qev->ev); DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_gtid_0_x_100", err= dbug_simulate_tmp_error(rgi, thd);); |