BUG#25306 (Race conditions during replication slave shutdown (valgrind stacks)):

The possibility of the race is removed by changing sequence of calls pthread_mutex_unlock(&mi->run_lock); pthread_cond_broadcast(&mi->stop_cond); into pthread_cond_broadcast(&mi->stop_cond); pthread_mutex_unlock(&mi->run_lock); at the end of I/O thread (similar change at the end of SQL thread). This ensures that no thread waiting on the condition executes between the broadcast and the unlock and thus can't delete the mi structure which caused the bug. sql/slave.cc: Change order of condition broadcast and mutex unlock at the end of slave's I/O thread and SQL thread. sql/sql_repl.cc: Add DBUG_ENTER() call
author: unknown <rafal@quant.(none)> 2007-02-03 20:14:16 +0100
committer: unknown <rafal@quant.(none)> 2007-02-03 20:14:16 +0100
commit: d51d6adca7947c3fde0263b9f8da78788dc5b3f5 (patch)
tree: 58a1b330159900c872f5e7c6cd7c4b8fa24d92c4 /sql
parent: 501bf6de184d073f6dea6bd2b5bdc6247499b4e3 (diff)
download: mariadb-git-d51d6adca7947c3fde0263b9f8da78788dc5b3f5.tar.gz
2 files changed, 19 insertions, 8 deletions
diff --git a/sql/slave.cc b/sql/slave.cc
index 8805f950d50..13793705b9a 100644
--- a/sql/slave.cc
+++ b/sql/slave.cc
@@ -3758,8 +3758,13 @@ err:
   mi->abort_slave= 0;
   mi->slave_running= 0;
   mi->io_thd= 0;
-  pthread_mutex_unlock(&mi->run_lock);
+  /*
+    Note: the order of the two following calls (first broadcast, then unlock)
+    is important. Otherwise a killer_thread can execute between the calls and
+    delete the mi structure leading to a crash! (see BUG#25306 for details)
+   */ 
   pthread_cond_broadcast(&mi->stop_cond);       // tell the world we are done
+  pthread_mutex_unlock(&mi->run_lock);
 #ifndef DBUG_OFF
   if (abort_slave_event_count && !events_till_abort)
     goto slave_begin;
@@ -3977,8 +3982,13 @@ the slave SQL thread with \"SLAVE START\". We stopped at log \
   THD_CHECK_SENTRY(thd);
   delete thd;
   pthread_mutex_unlock(&LOCK_thread_count);
-  pthread_cond_broadcast(&rli->stop_cond);
 
+ /*
+  Note: the order of the broadcast and unlock calls below (first broadcast, then unlock)
+  is important. Otherwise a killer_thread can execute between the calls and
+  delete the mi structure leading to a crash! (see BUG#25306 for details)
+ */ 
+  pthread_cond_broadcast(&rli->stop_cond);
 #ifndef DBUG_OFF
   /*
     Bug #19938 Valgrind error (race) in handle_slave_sql()
@@ -3986,9 +3996,8 @@ the slave SQL thread with \"SLAVE START\". We stopped at log \
   */
   const int eta= rli->events_till_abort;
 #endif
-
-  // tell the world we are done
-  pthread_mutex_unlock(&rli->run_lock);
+  pthread_mutex_unlock(&rli->run_lock);  // tell the world we are done
+  
 #ifndef DBUG_OFF // TODO: reconsider the code below
   if (abort_slave_event_count && !eta)
     goto slave_begin;
diff --git a/sql/sql_repl.cc b/sql/sql_repl.cc
index 88a752f7acb..004abc775bf 100644
--- a/sql/sql_repl.cc
+++ b/sql/sql_repl.cc
@@ -882,12 +882,14 @@ int start_slave(THD* thd , MASTER_INFO* mi,  bool net_report)
 
 int stop_slave(THD* thd, MASTER_INFO* mi, bool net_report )
 {
+  DBUG_ENTER("stop_slave");
+  
   int slave_errno;
   if (!thd)
     thd = current_thd;
 
   if (check_access(thd, SUPER_ACL, any_db,0,0,0,0))
-    return 1;
+    DBUG_RETURN(1);
   thd->proc_info = "Killing slave";
   int thread_mask;
   lock_slave_threads(mi);
@@ -921,12 +923,12 @@ int stop_slave(THD* thd, MASTER_INFO* mi, bool net_report )
   {
     if (net_report)
       my_message(slave_errno, ER(slave_errno), MYF(0));
-    return 1;
+    DBUG_RETURN(1);
   }
   else if (net_report)
     send_ok(thd);
 
-  return 0;
+  DBUG_RETURN(0);
 }
author	unknown <rafal@quant.(none)>	2007-02-03 20:14:16 +0100
committer	unknown <rafal@quant.(none)>	2007-02-03 20:14:16 +0100
commit	d51d6adca7947c3fde0263b9f8da78788dc5b3f5 (patch)
tree	58a1b330159900c872f5e7c6cd7c4b8fa24d92c4 /sql
parent	501bf6de184d073f6dea6bd2b5bdc6247499b4e3 (diff)
download	mariadb-git-d51d6adca7947c3fde0263b9f8da78788dc5b3f5.tar.gz