diff options
author | Kristian Nielsen <knielsen@knielsen-hq.org> | 2015-04-23 14:09:15 +0200 |
---|---|---|
committer | Kristian Nielsen <knielsen@knielsen-hq.org> | 2015-04-23 14:09:15 +0200 |
commit | b616991a68c78733770fa4519d2f92b052932160 (patch) | |
tree | 79bd1e8c98968c9c3b9d490c5160ceb7704b9354 /sql | |
parent | 4760528754697e9963720e8006cb4f983ec011e8 (diff) | |
download | mariadb-git-b616991a68c78733770fa4519d2f92b052932160.tar.gz |
MDEV-8031: Parallel replication stops on "connection killed" error (probably incorrectly handled deadlock kill)
There was a rare race in which a deadlock error might not be handled
correctly, causing the slave to stop with something like this in the error
log:
150423 14:04:10 [ERROR] Slave SQL: Connection was killed, Gtid 0-1-2, Internal MariaDB error code: 1927
150423 14:04:10 [Warning] Slave: Connection was killed Error_code: 1927
150423 14:04:10 [Warning] Slave: Deadlock found when trying to get lock; try restarting transaction Error_code: 1213
150423 14:04:10 [Warning] Slave: Connection was killed Error_code: 1927
150423 14:04:10 [Warning] Slave: Connection was killed Error_code: 1927
150423 14:04:10 [ERROR] Error running query, slave SQL thread aborted. Fix the problem, and restart the slave SQL thread with "SLAVE START". We stopped at log 'master-bin.000001' position 1234
The problem was incorrect error handling. When a deadlock is detected, it
causes a KILL CONNECTION on the offending thread. The resulting "connection
killed" error is later converted to a deadlock error, and the transaction is
retried.
However, the deadlock error was not cleared at the start of the retry, nor
was the lingering kill signal. So it was possible to get another deadlock
kill early during retry. If this happened with particular thread
scheduling/timing, it was possible that the new KILL CONNECTION error was
masked by the earlier deadlock error, so that the second kill was not
properly converted into a deadlock error and retry.
This patch adds code that clears the old error and killed flag before
starting the retry. It also adds code to handle a deadlock kill caught in a
couple of places where it was not handled before.
Diffstat (limited to 'sql')
-rw-r--r-- | sql/rpl_parallel.cc | 79 |
1 files changed, 65 insertions, 14 deletions
diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc index b7b1ffbeeff..99ddde95689 100644 --- a/sql/rpl_parallel.cc +++ b/sql/rpl_parallel.cc @@ -2,6 +2,7 @@ #include "rpl_parallel.h" #include "slave.h" #include "rpl_mi.h" +#include "sql_parse.h" #include "debug_sync.h" /* @@ -326,7 +327,7 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt, IO_CACHE rlog; LOG_INFO linfo; File fd= (File)-1; - const char *errmsg= NULL; + const char *errmsg; inuse_relaylog *ir= rgi->relay_log; uint64 event_count; uint64 events_to_execute= rgi->retry_event_count; @@ -342,6 +343,7 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt, do_retry: event_count= 0; err= 0; + errmsg= NULL; /* If we already started committing before getting the deadlock (or other @@ -377,7 +379,16 @@ do_retry: */ if(thd->wait_for_commit_ptr) thd->wait_for_commit_ptr->unregister_wait_for_prior_commit(); + DBUG_EXECUTE_IF("inject_mdev8031", { + /* Simulate that we get deadlock killed at this exact point. */ + rgi->killed_for_retry= true; + mysql_mutex_lock(&thd->LOCK_thd_data); + thd->killed= KILL_CONNECTION; + mysql_mutex_unlock(&thd->LOCK_thd_data); + }); rgi->cleanup_context(thd, 1); + thd->reset_killed(); + thd->clear_error(); /* If we retry due to a deadlock kill that occured during the commit step, we @@ -418,10 +429,22 @@ do_retry: complete its commit. */ thd->clear_error(); + thd->reset_killed(); if(thd->wait_for_commit_ptr) thd->wait_for_commit_ptr->unregister_wait_for_prior_commit(); + DBUG_EXECUTE_IF("inject_mdev8031", { + /* Inject a small sleep to give prior transaction a chance to commit. */ + my_sleep(100000); + }); } + /* + Let us clear any lingering deadlock kill one more time, here after + wait_for_prior_commit() has completed. This should rule out any + possibility of an old deadlock kill lingering on beyond this point. 
+ */ + thd->reset_killed(); + strmake_buf(log_name, ir->name); if ((fd= open_binlog(&rlog, log_name, &errmsg)) <0) { @@ -437,6 +460,14 @@ do_retry: err= 1; goto err; } + DBUG_EXECUTE_IF("inject_mdev8031", { + /* Simulate pending KILL caught in read_relay_log_description_event(). */ + if (thd->check_killed()) { + thd->send_kill_message(); + err= 1; + goto err; + } + }); my_b_seek(&rlog, cur_offset); do @@ -459,7 +490,7 @@ do_retry: { errmsg= "slave SQL thread aborted because of I/O error"; err= 1; - goto err; + goto check_retry; } if (rlog.error > 0) { @@ -488,10 +519,25 @@ do_retry: } strmake_buf(log_name ,linfo.log_file_name); + DBUG_EXECUTE_IF("inject_retry_event_group_open_binlog_kill", { + if (retries < 2) + { + /* Simulate that we get deadlock killed during open_binlog(). */ + mysql_reset_thd_for_next_command(thd); + rgi->killed_for_retry= true; + mysql_mutex_lock(&thd->LOCK_thd_data); + thd->killed= KILL_CONNECTION; + mysql_mutex_unlock(&thd->LOCK_thd_data); + thd->send_kill_message(); + fd= (File)-1; + err= 1; + goto check_retry; + } + }); if ((fd= open_binlog(&rlog, log_name, &errmsg)) <0) { err= 1; - goto err; + goto check_retry; } /* Loop to try again on the new log file. */ } @@ -534,26 +580,31 @@ do_retry: if (retries == 0) err= dbug_simulate_tmp_error(rgi, thd);); DBUG_EXECUTE_IF("rpl_parallel_simulate_infinite_temp_err_gtid_0_x_100", err= dbug_simulate_tmp_error(rgi, thd);); - if (err) + if (!err) + continue; + +check_retry: + convert_kill_to_deadlock_error(rgi); + if (has_temporary_error(thd)) { - convert_kill_to_deadlock_error(rgi); - if (has_temporary_error(thd)) + ++retries; + if (retries < slave_trans_retries) { - ++retries; - if (retries < slave_trans_retries) + if (fd >= 0) { end_io_cache(&rlog); mysql_file_close(fd, MYF(MY_WME)); fd= (File)-1; - goto do_retry; } - sql_print_error("Slave worker thread retried transaction %lu time(s) " - "in vain, giving up. 
Consider raising the value of " - "the slave_transaction_retries variable.", - slave_trans_retries); + goto do_retry; } - goto err; + sql_print_error("Slave worker thread retried transaction %lu time(s) " + "in vain, giving up. Consider raising the value of " + "the slave_transaction_retries variable.", + slave_trans_retries); } + goto err; + } while (event_count < events_to_execute); err: |