MDEV-8031: Parallel replication stops on "connection killed" error (probably incorrectly handled deadlock kill)

There was a rare race, where a deadlock error might not be correctly handled, causing the slave to stop with something like this in the error log: 150423 14:04:10 [ERROR] Slave SQL: Connection was killed, Gtid 0-1-2, Internal MariaDB error code: 1927 150423 14:04:10 [Warning] Slave: Connection was killed Error_code: 1927 150423 14:04:10 [Warning] Slave: Deadlock found when trying to get lock; try restarting transaction Error_code: 1213 150423 14:04:10 [Warning] Slave: Connection was killed Error_code: 1927 150423 14:04:10 [Warning] Slave: Connection was killed Error_code: 1927 150423 14:04:10 [ERROR] Error running query, slave SQL thread aborted. Fix the problem, and restart the slave SQL thread with "SLAVE START". We stopped at log 'master-bin.000001 position 1234 The problem was incorrect error handling. When a deadlock is detected, it causes a KILL CONNECTION on the offending thread. This error is then later converted to a deadlock error, and the transaction is retried. However, the deadlock error was not cleared at the start of the retry, nor was the lingering kill signal. So it was possible to get another deadlock kill early during retry. If this happened with particular thread scheduling/timing, it was possible that the new KILL CONNECTION error was masked by the earlier deadlock error, so that the second kill was not properly converted into a deadlock error and retry. This patch adds code that clears the old error and killed flag before starting the retry. It also adds code to handle a deadlock kill caught in a couple of places where it was not handled before.
author: Kristian Nielsen <knielsen@knielsen-hq.org> 2015-04-23 14:09:15 +0200
committer: Kristian Nielsen <knielsen@knielsen-hq.org> 2015-04-23 14:09:15 +0200
commit: b616991a68c78733770fa4519d2f92b052932160 (patch)
tree: 79bd1e8c98968c9c3b9d490c5160ceb7704b9354 /sql
parent: 4760528754697e9963720e8006cb4f983ec011e8 (diff)
download: mariadb-git-b616991a68c78733770fa4519d2f92b052932160.tar.gz
1 files changed, 65 insertions, 14 deletions
diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc
index b7b1ffbeeff..99ddde95689 100644
--- a/sql/rpl_parallel.cc
+++ b/sql/rpl_parallel.cc
@@ -2,6 +2,7 @@
 #include "rpl_parallel.h"
 #include "slave.h"
 #include "rpl_mi.h"
+#include "sql_parse.h"
 #include "debug_sync.h"
 
 /*
@@ -326,7 +327,7 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
   IO_CACHE rlog;
   LOG_INFO linfo;
   File fd= (File)-1;
-  const char *errmsg= NULL;
+  const char *errmsg;
   inuse_relaylog *ir= rgi->relay_log;
   uint64 event_count;
   uint64 events_to_execute= rgi->retry_event_count;
@@ -342,6 +343,7 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
 do_retry:
   event_count= 0;
   err= 0;
+  errmsg= NULL;
 
   /*
     If we already started committing before getting the deadlock (or other
@@ -377,7 +379,16 @@ do_retry:
   */
   if(thd->wait_for_commit_ptr)
     thd->wait_for_commit_ptr->unregister_wait_for_prior_commit();
+  DBUG_EXECUTE_IF("inject_mdev8031", {
+      /* Simulate that we get deadlock killed at this exact point. */
+      rgi->killed_for_retry= true;
+      mysql_mutex_lock(&thd->LOCK_thd_data);
+      thd->killed= KILL_CONNECTION;
+      mysql_mutex_unlock(&thd->LOCK_thd_data);
+  });
   rgi->cleanup_context(thd, 1);
+  thd->reset_killed();
+  thd->clear_error();
 
   /*
     If we retry due to a deadlock kill that occured during the commit step, we
@@ -418,10 +429,22 @@ do_retry:
       complete its commit.
     */
     thd->clear_error();
+    thd->reset_killed();
     if(thd->wait_for_commit_ptr)
       thd->wait_for_commit_ptr->unregister_wait_for_prior_commit();
+    DBUG_EXECUTE_IF("inject_mdev8031", {
+        /* Inject a small sleep to give prior transaction a chance to commit. */
+        my_sleep(100000);
+    });
   }
 
+  /*
+    Let us clear any lingering deadlock kill one more time, here after
+    wait_for_prior_commit() has completed. This should rule out any
+    possibility of an old deadlock kill lingering on beyond this point.
+  */
+  thd->reset_killed();
+
   strmake_buf(log_name, ir->name);
   if ((fd= open_binlog(&rlog, log_name, &errmsg)) <0)
   {
@@ -437,6 +460,14 @@ do_retry:
     err= 1;
     goto err;
   }
+  DBUG_EXECUTE_IF("inject_mdev8031", {
+      /* Simulate pending KILL caught in read_relay_log_description_event(). */
+      if (thd->check_killed()) {
+        thd->send_kill_message();
+        err= 1;
+        goto err;
+      }
+  });
   my_b_seek(&rlog, cur_offset);
 
   do
@@ -459,7 +490,7 @@ do_retry:
       {
         errmsg= "slave SQL thread aborted because of I/O error";
         err= 1;
-        goto err;
+        goto check_retry;
       }
       if (rlog.error > 0)
       {
@@ -488,10 +519,25 @@ do_retry:
       }
       strmake_buf(log_name ,linfo.log_file_name);
 
+      DBUG_EXECUTE_IF("inject_retry_event_group_open_binlog_kill", {
+          if (retries < 2)
+          {
+            /* Simulate that we get deadlock killed during open_binlog(). */
+            mysql_reset_thd_for_next_command(thd);
+            rgi->killed_for_retry= true;
+            mysql_mutex_lock(&thd->LOCK_thd_data);
+            thd->killed= KILL_CONNECTION;
+            mysql_mutex_unlock(&thd->LOCK_thd_data);
+            thd->send_kill_message();
+            fd= (File)-1;
+            err= 1;
+            goto check_retry;
+          }
+      });
       if ((fd= open_binlog(&rlog, log_name, &errmsg)) <0)
       {
         err= 1;
-        goto err;
+        goto check_retry;
       }
       /* Loop to try again on the new log file. */
     }
@@ -534,26 +580,31 @@ do_retry:
                     if (retries == 0) err= dbug_simulate_tmp_error(rgi, thd););
     DBUG_EXECUTE_IF("rpl_parallel_simulate_infinite_temp_err_gtid_0_x_100",
                     err= dbug_simulate_tmp_error(rgi, thd););
-    if (err)
+    if (!err)
+      continue;
+
+check_retry:
+    convert_kill_to_deadlock_error(rgi);
+    if (has_temporary_error(thd))
     {
-      convert_kill_to_deadlock_error(rgi);
-      if (has_temporary_error(thd))
+      ++retries;
+      if (retries < slave_trans_retries)
       {
-        ++retries;
-        if (retries < slave_trans_retries)
+        if (fd >= 0)
         {
           end_io_cache(&rlog);
           mysql_file_close(fd, MYF(MY_WME));
           fd= (File)-1;
-          goto do_retry;
         }
-        sql_print_error("Slave worker thread retried transaction %lu time(s) "
-                        "in vain, giving up. Consider raising the value of "
-                        "the slave_transaction_retries variable.",
-                        slave_trans_retries);
+        goto do_retry;
       }
-      goto err;
+      sql_print_error("Slave worker thread retried transaction %lu time(s) "
+                      "in vain, giving up. Consider raising the value of "
+                      "the slave_transaction_retries variable.",
+                      slave_trans_retries);
     }
+    goto err;
+
   } while (event_count < events_to_execute);
 
 err:
author	Kristian Nielsen <knielsen@knielsen-hq.org>	2015-04-23 14:09:15 +0200
committer	Kristian Nielsen <knielsen@knielsen-hq.org>	2015-04-23 14:09:15 +0200
commit	b616991a68c78733770fa4519d2f92b052932160 (patch)
tree	79bd1e8c98968c9c3b9d490c5160ceb7704b9354 /sql
parent	4760528754697e9963720e8006cb4f983ec011e8 (diff)
download	mariadb-git-b616991a68c78733770fa4519d2f92b052932160.tar.gz