diff options
author | Kristian Nielsen <knielsen@knielsen-hq.org> | 2016-11-04 12:33:42 +0100 |
---|---|---|
committer | Kristian Nielsen <knielsen@knielsen-hq.org> | 2016-11-04 12:33:42 +0100 |
commit | 717f212840a35b2da228d0cc2877b8e1b32204a3 (patch) | |
tree | f3da9174cf78cbdd5ae3e3117bbf216f2375968e | |
parent | eca8c324e9a02f530853580991b11b587f54b24a (diff) | |
download | mariadb-git-717f212840a35b2da228d0cc2877b8e1b32204a3.tar.gz |
MDEV-10863: parallel replication tries to continue from wrong position
This occured when the SQL thread (but not the IO thread) stops while
GTID and parallel replication are used with multiple domain ids in the
GTID position, and is restarted.
In this case, the SQL needs to start some way back in the relay log,
applying or skipping events within each replication domain as
appropriate.
The SQL threads starts at the beginning of an old relay log file, and
this position may be in the middle of an event group. The bug was that
such partial event group could be re-applied, causing replication
corruption.
This patch fixes the issue, by making sure to skip any initial events
that were part of an earlier (already applied) event group.
-rw-r--r-- | mysql-test/suite/rpl/r/rpl_mdev10863.result | 39 | ||||
-rw-r--r-- | mysql-test/suite/rpl/t/rpl_mdev10863.test | 104 | ||||
-rw-r--r-- | sql/slave.cc | 17 |
3 files changed, 159 insertions, 1 deletions
diff --git a/mysql-test/suite/rpl/r/rpl_mdev10863.result b/mysql-test/suite/rpl/r/rpl_mdev10863.result new file mode 100644 index 00000000000..475057d4c29 --- /dev/null +++ b/mysql-test/suite/rpl/r/rpl_mdev10863.result @@ -0,0 +1,39 @@ +include/rpl_init.inc [topology=1->2] +SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads; +include/stop_slave.inc +SET GLOBAL slave_parallel_threads=10; +SET @old_max_relay= @@GLOBAL.max_relay_log_size; +SET GLOBAL max_relay_log_size = 4096; +CHANGE MASTER TO master_use_gtid=slave_pos; +include/start_slave.inc +ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; +CREATE TABLE t1 (a int PRIMARY KEY, b VARCHAR(100)) ENGINE=InnoDB; +INSERT INTO t1 VALUES (1, "a"); +*** Create a long transaction that will span a relay log file. *** +SET @old_domain= @@gtid_domain_id; +SET gtid_domain_id=10; +INSERT INTO t1 VALUES (10000, "domain 10"); +SET gtid_domain_id=20; +INSERT INTO t1 VALUES (20000, "domain 20"); +SET gtid_domain_id=@old_domain; +BEGIN; +[lots of inserts omitted] +COMMIT; +BEGIN; +[lots of inserts omitted] +COMMIT; +include/stop_slave_sql.inc +START SLAVE SQL_THREAD; +include/wait_for_slave_to_start.inc +INSERT INTO t1 VALUES (100000, "More stuffs."); +INSERT INTO t1 VALUES (100001, "And even more"); +SELECT * FROM t1 WHERE a >= 100000 ORDER BY a; +a b +100000 More stuffs. +100001 And even more +include/stop_slave.inc +SET GLOBAL slave_parallel_threads=@old_parallel_threads; +SET GLOBAL max_relay_log_size= @old_max_relay; +include/start_slave.inc +DROP TABLE t1; +include/rpl_end.inc diff --git a/mysql-test/suite/rpl/t/rpl_mdev10863.test b/mysql-test/suite/rpl/t/rpl_mdev10863.test new file mode 100644 index 00000000000..796e770672d --- /dev/null +++ b/mysql-test/suite/rpl/t/rpl_mdev10863.test @@ -0,0 +1,104 @@ +--source include/have_innodb.inc +--let $rpl_topology=1->2 +--source include/rpl_init.inc + +# Test various aspects of parallel replication. + +--connection server_2 +SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads; +--source include/stop_slave.inc +SET GLOBAL slave_parallel_threads=10; +SET @old_max_relay= @@GLOBAL.max_relay_log_size; +SET GLOBAL max_relay_log_size = 4096; +CHANGE MASTER TO master_use_gtid=slave_pos; +--source include/start_slave.inc + +--connection server_1 +ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; +CREATE TABLE t1 (a int PRIMARY KEY, b VARCHAR(100)) ENGINE=InnoDB; +INSERT INTO t1 VALUES (1, "a"); +--save_master_pos + +--connection server_2 +--sync_with_master + +--echo *** Create a long transaction that will span a relay log file. *** +--connection server_1 + +# Add some transactions in separate domains, that will cause the need to +# have a multi-valued restart position in the relay log for the SQL thread. +SET @old_domain= @@gtid_domain_id; +SET gtid_domain_id=10; +INSERT INTO t1 VALUES (10000, "domain 10"); +SET gtid_domain_id=20; +INSERT INTO t1 VALUES (20000, "domain 20"); +SET gtid_domain_id=@old_domain; + +BEGIN; +--echo [lots of inserts omitted] +--disable_query_log +--let $count = 500 +while ($count) { + eval INSERT INTO t1 VALUES (1000+$count, REPEAT("hulubulu??!?", 8)); + dec $count; +} +--enable_query_log +COMMIT; + +--save_master_pos + +--connection server_2 +--sync_with_master + +--connection server_1 +# Now do another one, to make the inuse_relaylog proceed to somewhere inside +# the first large transaction. + +BEGIN; +--echo [lots of inserts omitted] +--disable_query_log +--let $count = 500 +while ($count) { + eval INSERT INTO t1 VALUES (2000+$count, REPEAT("hulubulu??!?", 8)); + dec $count; +} +--enable_query_log +COMMIT; + +--save_master_pos + +--connection server_2 +--sync_with_master + + +# Stop and restart the SQL thread only. +# The bug was that the SQL thread would restart at the start +# of a relay log file, which could be in the middle of an event group. +# This way, part of that event group could be wrongly re-applied. + +--source include/stop_slave_sql.inc +START SLAVE SQL_THREAD; +--source include/wait_for_slave_to_start.inc + + +--connection server_1 +INSERT INTO t1 VALUES (100000, "More stuffs."); +INSERT INTO t1 VALUES (100001, "And even more"); +--save_master_pos + +--connection server_2 +--sync_with_master +SELECT * FROM t1 WHERE a >= 100000 ORDER BY a; + + +# Clean up. +--connection server_2 +--source include/stop_slave.inc +SET GLOBAL slave_parallel_threads=@old_parallel_threads; +SET GLOBAL max_relay_log_size= @old_max_relay; +--source include/start_slave.inc + +--connection server_1 +DROP TABLE t1; + +--source include/rpl_end.inc diff --git a/sql/slave.cc b/sql/slave.cc index a124ca6be7e..04d62749fb1 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -4546,7 +4546,22 @@ pthread_handler_t handle_slave_sql(void *arg) serial_rgi->gtid_sub_id= 0; serial_rgi->gtid_pending= false; - rli->gtid_skip_flag = GTID_SKIP_NOT; + if (mi->using_gtid != Master_info::USE_GTID_NO && + opt_slave_parallel_threads > 0 && + rli->restart_gtid_pos.count() > 0) + { + /* + With parallel replication in GTID mode, if we have a multi-domain GTID + position, we need to start some way back in the relay log and skip any + GTID that was already applied before. Since event groups can be split + across multiple relay logs, this earlier starting point may be in the + middle of an already applied event group, so we also need to skip any + remaining part of such group. + */ + rli->gtid_skip_flag = GTID_SKIP_TRANSACTION; + } + else + rli->gtid_skip_flag = GTID_SKIP_NOT; if (init_relay_log_pos(rli, rli->group_relay_log_name, rli->group_relay_log_pos, |