New variable rli->ignore_log_space_limit to resolve a deadlock between I/O and SQL threads in replication when relay_log_space is too small. This fixes bug #79.

sql/log.cc: New variable rli->ignore_log_space_limit to resolve a deadlock between I/O and SQL threads in replication when relay_log_space is too small.
sql/slave.cc: New variable rli->ignore_log_space_limit to resolve a deadlock between I/O and SQL threads in replication when relay_log_space is too small.
sql/slave.h: New variable rli->ignore_log_space_limit to resolve a deadlock between I/O and SQL threads in replication when relay_log_space is too small.
sql/sql_repl.cc: New variable rli->ignore_log_space_limit to resolve a deadlock between I/O and SQL threads in replication when relay_log_space is too small.
author: unknown <guilhem@mysql.com> 2003-03-17 22:51:56 +0100
committer: unknown <guilhem@mysql.com> 2003-03-17 22:51:56 +0100
commit: 40c0b2c6c8b730dec9900c3829b7babf0a5b4772 (patch)
tree: 5d9a0eaa1fe759e9fccf20738544fe2eab9bbbe8 /sql
parent: 2103479670e60d1cce4166a4cd67b704bf4743da (diff)
download: mariadb-git-40c0b2c6c8b730dec9900c3829b7babf0a5b4772.tar.gz
4 files changed, 64 insertions, 13 deletions
diff --git a/sql/log.cc b/sql/log.cc
index 27864e19c03..9befcaefb01 100644
--- a/sql/log.cc
+++ b/sql/log.cc
@@ -645,6 +645,8 @@ int MYSQL_LOG::purge_first_log(struct st_relay_log_info* rli)
   */
   pthread_mutex_lock(&rli->log_space_lock);
   rli->log_space_total -= rli->relay_log_pos;
+  //tell the I/O thread to take the relay_log_space_limit into account
+  rli->ignore_log_space_limit= 0;
   pthread_mutex_unlock(&rli->log_space_lock);
   pthread_cond_broadcast(&rli->log_space_cond);
   
diff --git a/sql/slave.cc b/sql/slave.cc
index 5ddea7501e4..771317f9431 100644
--- a/sql/slave.cc
+++ b/sql/slave.cc
@@ -238,7 +238,7 @@ int init_relay_log_pos(RELAY_LOG_INFO* rli,const char* log,
   if (log)					// If not first log
   {
     if (strcmp(log, rli->linfo.log_file_name))
-      rli->skip_log_purge=1;			// Different name; Don't purge
+      rli->skip_log_purge= 1;			// Different name; Don't purge
     if (rli->relay_log.find_log_pos(&rli->linfo, log, 1))
     {
       *errmsg="Could not find target log during relay log initialization";
@@ -273,6 +273,12 @@ int init_relay_log_pos(RELAY_LOG_INFO* rli,const char* log,
     my_b_seek(rli->cur_log,(off_t)pos);
 
 err:
+  /*
+    If we don't purge, we can't honour relay_log_space_limit ;
+    silently discard it
+  */
+  if (rli->skip_log_purge)
+    rli->log_space_limit= 0;
   pthread_cond_broadcast(&rli->data_cond);
   if (need_data_lock)
     pthread_mutex_unlock(&rli->data_lock);
@@ -1312,7 +1318,8 @@ static bool wait_for_relay_log_space(RELAY_LOG_INFO* rli)
   save_proc_info = thd->proc_info;
   thd->proc_info = "Waiting for relay log space to free";
   while (rli->log_space_limit < rli->log_space_total &&
-	 !(slave_killed=io_slave_killed(thd,mi)))
+	 !(slave_killed=io_slave_killed(thd,mi)) &&
+         !rli->ignore_log_space_limit)
   {
     pthread_cond_wait(&rli->log_space_cond, &rli->log_space_lock);
   }
@@ -1588,7 +1595,7 @@ bool flush_master_info(MASTER_INFO* mi)
 
 st_relay_log_info::st_relay_log_info()
   :info_fd(-1), cur_log_fd(-1), master_log_pos(0), save_temporary_tables(0),
-   cur_log_old_open_count(0), log_space_total(0), 
+   cur_log_old_open_count(0), log_space_total(0), ignore_log_space_limit(0),
    slave_skip_counter(0), abort_pos_wait(0), slave_run_id(0),
    sql_thd(0), last_slave_errno(0), inited(0), abort_slave(0),
    slave_running(0), skip_log_purge(0),
@@ -2296,7 +2303,8 @@ reconnect done to recover from failed read");
       }
       flush_master_info(mi);
       if (mi->rli.log_space_limit && mi->rli.log_space_limit <
-	  mi->rli.log_space_total)
+	  mi->rli.log_space_total &&
+          !mi->rli.ignore_log_space_limit)
 	if (wait_for_relay_log_space(&mi->rli))
 	{
 	  sql_print_error("Slave I/O thread aborted while waiting for relay \
@@ -2408,6 +2416,10 @@ slave_begin:
   pthread_cond_broadcast(&rli->start_cond);
   // This should always be set to 0 when the slave thread is started
   rli->pending = 0;
+
+  //tell the I/O thread to take relay_log_space_limit into account from now on
+  rli->ignore_log_space_limit= 0;
+
   if (init_relay_log_pos(rli,
 			 rli->relay_log_name,
 			 rli->relay_log_pos,
@@ -3086,11 +3098,41 @@ Log_event* next_event(RELAY_LOG_INFO* rli)
 	  update. If we do not, show slave status will block
 	*/
 	pthread_mutex_unlock(&rli->data_lock);
- 	/* Note that wait_for_update unlocks lock_log ! */
-	rli->relay_log.wait_for_update(rli->sql_thd);
-	
-	// re-acquire data lock since we released it earlier
-	pthread_mutex_lock(&rli->data_lock);
+
+        /*
+          Possible deadlock : 
+          - the I/O thread has reached log_space_limit
+          - the SQL thread has read all relay logs, but cannot purge for some
+          reason:
+            * it has already purged all logs except the current one
+            * there are other logs than the current one but they're involved in
+            a transaction that finishes in the current one (or is not finished)
+          Solution :
+          Wake up the possibly waiting I/O thread, and set a boolean asking
+          the I/O thread to temporarily ignore the log_space_limit
+          constraint, because we do not want the I/O thread to block because of
+          space (it's ok if it blocks for any other reason, e.g. because the
+          master does not send anything). Then the I/O thread stops waiting
+          and reads more events.
+          The SQL thread decides when the I/O thread should take log_space_limit
+          into account again : ignore_log_space_limit is reset to 0 
+          in purge_first_log (when the SQL thread purges the just-read relay
+          log), and also when the SQL thread starts. We should also reset
+          ignore_log_space_limit to 0 when the user does RESET SLAVE, but in
+          fact, no need as RESET SLAVE requires that the slave
+          be stopped, and when the SQL thread is later restarted
+          ignore_log_space_limit will be reset to 0.
+        */
+        pthread_mutex_lock(&rli->log_space_lock);
+        // prevent the I/O thread from blocking next times
+        rli->ignore_log_space_limit= 1; 
+        // If the I/O thread is blocked, unblock it
+        pthread_cond_broadcast(&rli->log_space_cond);
+        pthread_mutex_unlock(&rli->log_space_lock);
+        // Note that wait_for_update unlocks lock_log !
+        rli->relay_log.wait_for_update(rli->sql_thd);
+        // re-acquire data lock since we released it earlier
+        pthread_mutex_lock(&rli->data_lock);
 	continue;
       }
       /*
diff --git a/sql/slave.h b/sql/slave.h
index fe0f0b045f3..8832302056d 100644
--- a/sql/slave.h
+++ b/sql/slave.h
@@ -137,7 +137,14 @@ typedef struct st_relay_log_info
     offset. pending stored the extra offset to be added to the position.
   */
   ulonglong relay_log_pos, pending;
+
+  /*
+    Handling of the relay_log_space_limit optional constraint.
+    ignore_log_space_limit is used to resolve a deadlock between I/O and SQL
+    threads, it makes the I/O thread temporarily forget about the constraint
+  */
   ulonglong log_space_limit,log_space_total;
+  bool ignore_log_space_limit;
 
   /*
     InnoDB internally stores the master log position it has processed
diff --git a/sql/sql_repl.cc b/sql/sql_repl.cc
index 5e90bbf1b0f..d670c673b4a 100644
--- a/sql/sql_repl.cc
+++ b/sql/sql_repl.cc
@@ -858,22 +858,21 @@ int change_master(THD* thd, MASTER_INFO* mi)
 
   if (lex_mi->relay_log_name)
   {
-    need_relay_log_purge = 0;
-    mi->rli.skip_log_purge=1;
+    need_relay_log_purge= 0;
     strmake(mi->rli.relay_log_name,lex_mi->relay_log_name,
 	    sizeof(mi->rli.relay_log_name)-1);
   }
 
   if (lex_mi->relay_log_pos)
   {
-    need_relay_log_purge=0;
+    need_relay_log_purge= 0;
     mi->rli.relay_log_pos=lex_mi->relay_log_pos;
   }
 
   flush_master_info(mi);
   if (need_relay_log_purge)
   {
-    mi->rli.skip_log_purge=0;
+    mi->rli.skip_log_purge= 0;
     thd->proc_info="purging old relay logs";
     if (purge_relay_logs(&mi->rli, thd,
 			 0 /* not only reset, but also reinit */,
@@ -887,6 +886,7 @@ int change_master(THD* thd, MASTER_INFO* mi)
   else
   {
     const char* msg;
+    mi->rli.skip_log_purge= 1;
     /* Relay log is already initialized */
     if (init_relay_log_pos(&mi->rli,
 			   mi->rli.relay_log_name,
author	unknown <guilhem@mysql.com>	2003-03-17 22:51:56 +0100
committer	unknown <guilhem@mysql.com>	2003-03-17 22:51:56 +0100
commit	40c0b2c6c8b730dec9900c3829b7babf0a5b4772 (patch)
tree	5d9a0eaa1fe759e9fccf20738544fe2eab9bbbe8 /sql
parent	2103479670e60d1cce4166a4cd67b704bf4743da (diff)
download	mariadb-git-40c0b2c6c8b730dec9900c3829b7babf0a5b4772.tar.gz