MDEV-10863: parallel replication tries to continue from wrong position

This occured when the SQL thread (but not the IO thread) stops while GTID and parallel replication are used with multiple domain ids in the GTID position, and is restarted. In this case, the SQL needs to start some way back in the relay log, applying or skipping events within each replication domain as appropriate. The SQL threads starts at the beginning of an old relay log file, and this position may be in the middle of an event group. The bug was that such partial event group could be re-applied, causing replication corruption. This patch fixes the issue, by making sure to skip any initial events that were part of an earlier (already applied) event group.
author: Kristian Nielsen <knielsen@knielsen-hq.org> 2016-11-04 12:33:42 +0100
committer: Kristian Nielsen <knielsen@knielsen-hq.org> 2016-11-04 12:33:42 +0100
commit: 717f212840a35b2da228d0cc2877b8e1b32204a3 (patch)
tree: f3da9174cf78cbdd5ae3e3117bbf216f2375968e
parent: eca8c324e9a02f530853580991b11b587f54b24a (diff)
download: mariadb-git-717f212840a35b2da228d0cc2877b8e1b32204a3.tar.gz
3 files changed, 159 insertions, 1 deletions
diff --git a/mysql-test/suite/rpl/r/rpl_mdev10863.result b/mysql-test/suite/rpl/r/rpl_mdev10863.result
new file mode 100644
index 00000000000..475057d4c29
--- /dev/null
+++ b/mysql-test/suite/rpl/r/rpl_mdev10863.result
@@ -0,0 +1,39 @@
+include/rpl_init.inc [topology=1->2]
+SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
+include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=10;
+SET @old_max_relay= @@GLOBAL.max_relay_log_size;
+SET GLOBAL max_relay_log_size = 4096;
+CHANGE MASTER TO master_use_gtid=slave_pos;
+include/start_slave.inc
+ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
+CREATE TABLE t1 (a int PRIMARY KEY, b VARCHAR(100)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, "a");
+*** Create a long transaction that will span a relay log file. ***
+SET @old_domain= @@gtid_domain_id;
+SET gtid_domain_id=10;
+INSERT INTO t1 VALUES (10000, "domain 10");
+SET gtid_domain_id=20;
+INSERT INTO t1 VALUES (20000, "domain 20");
+SET gtid_domain_id=@old_domain;
+BEGIN;
+[lots of inserts omitted]
+COMMIT;
+BEGIN;
+[lots of inserts omitted]
+COMMIT;
+include/stop_slave_sql.inc
+START SLAVE SQL_THREAD;
+include/wait_for_slave_to_start.inc
+INSERT INTO t1 VALUES (100000, "More stuffs.");
+INSERT INTO t1 VALUES (100001, "And even more");
+SELECT * FROM t1 WHERE a >= 100000 ORDER BY a;
+a	b
+100000	More stuffs.
+100001	And even more
+include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=@old_parallel_threads;
+SET GLOBAL max_relay_log_size= @old_max_relay;
+include/start_slave.inc
+DROP TABLE t1;
+include/rpl_end.inc
diff --git a/mysql-test/suite/rpl/t/rpl_mdev10863.test b/mysql-test/suite/rpl/t/rpl_mdev10863.test
new file mode 100644
index 00000000000..796e770672d
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_mdev10863.test
@@ -0,0 +1,104 @@
+--source include/have_innodb.inc
+--let $rpl_topology=1->2
+--source include/rpl_init.inc
+
+# Test various aspects of parallel replication.
+
+--connection server_2
+SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
+--source include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=10;
+SET @old_max_relay= @@GLOBAL.max_relay_log_size;
+SET GLOBAL max_relay_log_size = 4096;
+CHANGE MASTER TO master_use_gtid=slave_pos;
+--source include/start_slave.inc
+
+--connection server_1
+ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
+CREATE TABLE t1 (a int PRIMARY KEY, b VARCHAR(100)) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, "a");
+--save_master_pos
+
+--connection server_2
+--sync_with_master
+
+--echo *** Create a long transaction that will span a relay log file. ***
+--connection server_1
+
+# Add some transactions in separate domains, that will cause the need to
+# have a multi-valued restart position in the relay log for the SQL thread.
+SET @old_domain= @@gtid_domain_id;
+SET gtid_domain_id=10;
+INSERT INTO t1 VALUES (10000, "domain 10");
+SET gtid_domain_id=20;
+INSERT INTO t1 VALUES (20000, "domain 20");
+SET gtid_domain_id=@old_domain;
+
+BEGIN;
+--echo [lots of inserts omitted]
+--disable_query_log
+--let $count = 500
+while ($count) {
+  eval INSERT INTO t1 VALUES (1000+$count, REPEAT("hulubulu??!?", 8));
+  dec $count;
+}
+--enable_query_log
+COMMIT;
+
+--save_master_pos
+
+--connection server_2
+--sync_with_master
+
+--connection server_1
+# Now do another one, to make the inuse_relaylog proceed to somewhere inside
+# the first large transaction.
+
+BEGIN;
+--echo [lots of inserts omitted]
+--disable_query_log
+--let $count = 500
+while ($count) {
+  eval INSERT INTO t1 VALUES (2000+$count, REPEAT("hulubulu??!?", 8));
+  dec $count;
+}
+--enable_query_log
+COMMIT;
+
+--save_master_pos
+
+--connection server_2
+--sync_with_master
+
+
+# Stop and restart the SQL thread only.
+# The bug was that the SQL thread would restart at the start
+# of a relay log file, which could be in the middle of an event group.
+# This way, part of that event group could be wrongly re-applied.
+
+--source include/stop_slave_sql.inc
+START SLAVE SQL_THREAD;
+--source include/wait_for_slave_to_start.inc
+
+
+--connection server_1
+INSERT INTO t1 VALUES (100000, "More stuffs.");
+INSERT INTO t1 VALUES (100001, "And even more");
+--save_master_pos
+
+--connection server_2
+--sync_with_master
+SELECT * FROM t1 WHERE a >= 100000 ORDER BY a;
+
+
+# Clean up.
+--connection server_2
+--source include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=@old_parallel_threads;
+SET GLOBAL max_relay_log_size= @old_max_relay;
+--source include/start_slave.inc
+
+--connection server_1
+DROP TABLE t1;
+
+--source include/rpl_end.inc
diff --git a/sql/slave.cc b/sql/slave.cc
index a124ca6be7e..04d62749fb1 100644
--- a/sql/slave.cc
+++ b/sql/slave.cc
@@ -4546,7 +4546,22 @@ pthread_handler_t handle_slave_sql(void *arg)
 
   serial_rgi->gtid_sub_id= 0;
   serial_rgi->gtid_pending= false;
-  rli->gtid_skip_flag = GTID_SKIP_NOT;
+  if (mi->using_gtid != Master_info::USE_GTID_NO &&
+      opt_slave_parallel_threads > 0 &&
+      rli->restart_gtid_pos.count() > 0)
+  {
+    /*
+      With parallel replication in GTID mode, if we have a multi-domain GTID
+      position, we need to start some way back in the relay log and skip any
+      GTID that was already applied before. Since event groups can be split
+      across multiple relay logs, this earlier starting point may be in the
+      middle of an already applied event group, so we also need to skip any
+      remaining part of such group.
+    */
+    rli->gtid_skip_flag = GTID_SKIP_TRANSACTION;
+  }
+  else
+    rli->gtid_skip_flag = GTID_SKIP_NOT;
   if (init_relay_log_pos(rli,
                          rli->group_relay_log_name,
                          rli->group_relay_log_pos,
author	Kristian Nielsen <knielsen@knielsen-hq.org>	2016-11-04 12:33:42 +0100
committer	Kristian Nielsen <knielsen@knielsen-hq.org>	2016-11-04 12:33:42 +0100
commit	717f212840a35b2da228d0cc2877b8e1b32204a3 (patch)
tree	f3da9174cf78cbdd5ae3e3117bbf216f2375968e
parent	eca8c324e9a02f530853580991b11b587f54b24a (diff)
download	mariadb-git-717f212840a35b2da228d0cc2877b8e1b32204a3.tar.gz