When the I/O thread was stopped while copying a long transaction, and restarted,

Rotate_log_event::exec_event() believed that the relay log was corrupted. Fixed it by moving the test for corruption to Start_log_event::exec_event(). Changed Rotate_log_event::exec_event() to not increment positions when the event is seen in the middle of a transaction. I did a separate commit in 4.1 (so this should not be merged to 4.0) because code is a bit different in 4.1. A test to see if the slave detects when the master died while writing a transaction to the binlog (uses a forged truncated binlog I made). sql/log_event.cc: When the I/O thread was stopped while copying a long transaction, and restarted, Rotate_log_event::exec_event() believed that the relay log was corrupted. Fixed it by moving the test for corruption to Start_log_event::exec_event(). Changed Rotate_log_event::exec_event() to not increment positions when the event is seen in the middle of a transaction.
author: unknown <guilhem@mysql.com> 2003-10-03 22:13:01 +0200
committer: unknown <guilhem@mysql.com> 2003-10-03 22:13:01 +0200
commit: 1bd7662b896f31786b5de5dd2fcd309d148d32cf (patch)
tree: 09360947cec8862c4740eeb29ec0f1325a6084de
parent: 7a58bfee6195cdb2804a75ccd90af81c87f696e4 (diff)
download: mariadb-git-1bd7662b896f31786b5de5dd2fcd309d148d32cf.tar.gz
4 files changed, 69 insertions, 33 deletions
diff --git a/mysql-test/r/rpl_trunc_binlog.result b/mysql-test/r/rpl_trunc_binlog.result
new file mode 100644
index 00000000000..6d2158eedfe
--- /dev/null
+++ b/mysql-test/r/rpl_trunc_binlog.result
@@ -0,0 +1,14 @@
+slave stop;
+drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
+reset master;
+reset slave;
+drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
+slave start;
+stop slave;
+flush logs;
+reset slave;
+start slave;
+show slave status;
+Master_Host	Master_User	Master_Port	Connect_retry	Master_Log_File	Read_Master_Log_Pos	Relay_Log_File	Relay_Log_Pos	Relay_Master_Log_File	Slave_IO_Running	Slave_SQL_Running	Replicate_do_db	Replicate_ignore_db	Last_errno	Last_error	Skip_counter	Exec_master_log_pos	Relay_log_space
+127.0.0.1	root	MASTER_PORT	1	master-bin.002	4	slave-relay-bin.002	161	master-bin.001	Yes	No			0	there is an unfinished transaction in the relay log (could find neither COMMIT nor ROLLBACK in the relay log); it could be that the master died while writing the transaction to its binary log. Now the slave is rolling back the transaction.	0	79	317
+reset master;
diff --git a/mysql-test/std_data/trunc_binlog.001 b/mysql-test/std_data/trunc_binlog.001
new file mode 100644
index 00000000000..2c2b4ec6ce4
--- /dev/null
+++ b/mysql-test/std_data/trunc_binlog.001
diff --git a/mysql-test/t/rpl_trunc_binlog.test b/mysql-test/t/rpl_trunc_binlog.test
new file mode 100644
index 00000000000..efdc3012471
--- /dev/null
+++ b/mysql-test/t/rpl_trunc_binlog.test
@@ -0,0 +1,22 @@
+# We are testing if a binlog which contains BEGIN but not COMMIT (the master died
+# while writing the transaction to the binlog) triggers an error on slave.
+# So we use such a truncated binlog and simulate that the master restarted after
+# this.
+
+source include/master-slave.inc;
+
+connection slave;
+stop slave;
+connection master;
+flush logs;
+system mv -f var/log/master-bin.001 var/log/master-bin.002;
+system cp std_data/trunc_binlog.001 var/log/master-bin.001;
+connection slave;
+reset slave;
+start slave;
+# can't sync_with_master so we must sleep
+sleep 3;
+--replace_result $MASTER_MYPORT MASTER_PORT
+show slave status;
+connection master;
+reset master;
diff --git a/sql/log_event.cc b/sql/log_event.cc
index 292e371e4dd..18d1624dfde 100644
--- a/sql/log_event.cc
+++ b/sql/log_event.cc
@@ -2091,6 +2091,23 @@ int Start_log_event::exec_event(struct st_relay_log_info* rli)
     */
     close_temporary_tables(thd);
     cleanup_load_tmpdir();
+    /*
+      As a transaction NEVER spans 2 or more binlogs:
+      if we have an active transaction at this point, the master died while
+      writing the transaction to the binary log, i.e. while flushing the binlog
+      cache to the binlog. As the write was started, the transaction had been
+      committed on the master, so we lack the information to replay this
+      transaction on the slave; all we can do is stop with error.
+    */
+    if (rli->inside_transaction)
+    {
+      slave_print_error(rli, 0,
+                        "there is an unfinished transaction in the relay log \
+(could find neither COMMIT nor ROLLBACK in the relay log); it could be that \
+the master died while writing the transaction to its binary log. Now the slave \
+is rolling back the transaction.");
+      return(1);
+    }
     break;
   /* 
      Now the older formats; in that case load_tmpdir is cleaned up by the I/O
@@ -2166,51 +2183,34 @@ int Stop_log_event::exec_event(struct st_relay_log_info* rli)
     We can't rotate the slave as this will cause infinite rotations
     in an A -> B -> A setup.
 
-  NOTES
-    As a transaction NEVER spans on 2 or more binlogs:
-    if we have an active transaction at this point, the master died while
-    writing the transaction to the binary log, i.e. while flushing the binlog
-    cache to the binlog. As the write was started, the transaction had been
-    committed on the master, so we lack of information to replay this
-    transaction on the slave; all we can do is stop with error.
-    If we didn't detect it, then positions would start to become garbage (as we
-    are incrementing rli->relay_log_pos whereas we are in a transaction: the new
-    rli->relay_log_pos will be
-    relay_log_pos of the BEGIN + size of the Rotate event = garbage.
-
-    Since MySQL 4.0.14, the master ALWAYS sends a Rotate event when it starts
-    sending the next binlog, so we are sure to receive a Rotate event just
-    after the end of the "dead master"'s binlog; so this exec_event() is the
-    right place to catch the problem. If we would wait until
-    Start_log_event::exec_event() it would be too late, rli->relay_log_pos would
-    already be garbage.
-
   RETURN VALUES
     0	ok
 */
 
 int Rotate_log_event::exec_event(struct st_relay_log_info* rli)
 {
-  char* log_name = rli->master_log_name;
   DBUG_ENTER("Rotate_log_event::exec_event");
 
   pthread_mutex_lock(&rli->data_lock);
-
-  if (rli->inside_transaction)
+  /*
+    If we are in a transaction: the only normal case is when the I/O thread was
+    copying a big transaction, then it was stopped and restarted: we have this
+    in the relay log:
+    BEGIN
+    ...
+    ROTATE (a fake one)
+    ...
+    COMMIT or ROLLBACK
+    In that case, we don't want to touch the coordinates which correspond to the
+    beginning of the transaction.
+  */
+  if (!rli->inside_transaction)
   {
-    slave_print_error(rli, 0,
-                      "there is an unfinished transaction in the relay log \
-(could find neither COMMIT nor ROLLBACK in the relay log); it could be that \
-the master died while writing the transaction to its binary log. Now the slave \
-is rolling back the transaction.");
-    pthread_mutex_unlock(&rli->data_lock);
-    DBUG_RETURN(1);
+    memcpy(rli->master_log_name, new_log_ident, ident_len+1);
+    rli->master_log_pos= pos;
+    DBUG_PRINT("info", ("master_log_pos: %d", (ulong) rli->master_log_pos));
   }
-
-  memcpy(log_name, new_log_ident, ident_len+1);
-  rli->master_log_pos = pos;
   rli->relay_log_pos += get_event_len();
-  DBUG_PRINT("info", ("master_log_pos: %d", (ulong) rli->master_log_pos));
   pthread_mutex_unlock(&rli->data_lock);
   pthread_cond_broadcast(&rli->data_cond);
   flush_relay_log_info(rli);
author	unknown <guilhem@mysql.com>	2003-10-03 22:13:01 +0200
committer	unknown <guilhem@mysql.com>	2003-10-03 22:13:01 +0200
commit	1bd7662b896f31786b5de5dd2fcd309d148d32cf (patch)
tree	09360947cec8862c4740eeb29ec0f1325a6084de
parent	7a58bfee6195cdb2804a75ccd90af81c87f696e4 (diff)
download	mariadb-git-1bd7662b896f31786b5de5dd2fcd309d148d32cf.tar.gz