diff options
author | Kristian Nielsen <knielsen@knielsen-hq.org> | 2016-01-15 12:42:51 +0100 |
---|---|---|
committer | Kristian Nielsen <knielsen@knielsen-hq.org> | 2016-01-15 12:48:14 +0100 |
commit | 06b2e327fcb9ee8737e66cf0893766df41c9d8f8 (patch) | |
tree | f57a045668e9d486bb9cfb5a39b072253e4fba83 | |
parent | 0403790722e3941779ccea26e85fcd818e2320b5 (diff) | |
download | mariadb-git-06b2e327fcb9ee8737e66cf0893766df41c9d8f8.tar.gz |
Fix error handling for GTID and domain-based parallel replication
This occurs when replication stops with an error, domain-based parallel
replication is used, and the GTID position contains more than one domain.
Furthermore, it relates to the case where the SQL thread is restarted
without first stopping the IO thread.
In this case, the file/offset relay-log position does not correctly
represent the slave's multi-dimensional position, because other domains may
be far ahead of, or behind, the domain with the failing event. So the code
reverts the relay log position back to the start of a relay log file that is
known to be before all active domains.
There was a bug that when the SQL thread was restarted, the
rli->relay_log_state was incorrectly initialised from @@gtid_slave_pos. This
position will likely be too far ahead, due to reverting the relay log
position. Thus, if the replication fails again after the SQL thread restart,
the rli->restart_gtid_pos might be updated incorrectly. This in turn would
cause a second SQL thread restart to replicate from the wrong position, if
the IO thread was still left running.
The fix is to initialise rli->relay_log_state from @@gtid_slave_pos only
when we actually purge and re-fetch relay logs from the master, not at every
SQL thread start.
A related problem is the use of sql_slave_skip_counter to resolve
replication failures in this kind of scenario. Since the slave position is
multi-dimensional, sql_slave_skip_counter can not work properly - it is
indeterminate exactly which event is to be skipped, and is unlikely to work
as expected for the user. So make this an error in the case where
domain-based parallel replication is used with multiple domains, suggesting
instead the user to set @@gtid_slave_pos to reliably skip the desired event.
-rw-r--r-- | mysql-test/suite/rpl/r/rpl_parallel.result | 2 | ||||
-rw-r--r-- | mysql-test/suite/rpl/t/rpl_parallel.test | 7 | ||||
-rw-r--r-- | sql/rpl_rli.cc | 2 | ||||
-rw-r--r-- | sql/share/errmsg-utf8.txt | 2 | ||||
-rw-r--r-- | sql/slave.cc | 10 | ||||
-rw-r--r-- | sql/sys_vars.cc | 23 |
6 files changed, 36 insertions, 10 deletions
diff --git a/mysql-test/suite/rpl/r/rpl_parallel.result b/mysql-test/suite/rpl/r/rpl_parallel.result index de67f5f0610..d915337725d 100644 --- a/mysql-test/suite/rpl/r/rpl_parallel.result +++ b/mysql-test/suite/rpl/r/rpl_parallel.result @@ -929,6 +929,8 @@ a 31 32 SET sql_slave_skip_counter= 1; +ERROR HY000: When using parallel replication and GTID with multiple replication domains, @@sql_slave_skip_counter can not be used. Instead, setting @@gtid_slave_pos explicitly can be used to skip to after a given GTID position. +include/stop_slave_io.inc include/start_slave.inc include/sync_with_master_gtid.inc SELECT * FROM t2 WHERE a >= 30 ORDER BY a; diff --git a/mysql-test/suite/rpl/t/rpl_parallel.test b/mysql-test/suite/rpl/t/rpl_parallel.test index e70dcfa5bb0..eec45bc0367 100644 --- a/mysql-test/suite/rpl/t/rpl_parallel.test +++ b/mysql-test/suite/rpl/t/rpl_parallel.test @@ -1423,6 +1423,7 @@ SELECT * FROM t6 ORDER BY a; --connection server_1 INSERT INTO t2 VALUES (31); +--let $gtid1= `SELECT @@LAST_GTID` --source include/save_master_gtid.inc --connection server_2 @@ -1438,6 +1439,7 @@ SET sql_log_bin= 1; --connection server_1 INSERT INTO t2 VALUES (32); +--let $gtid2= `SELECT @@LAST_GTID` # Rotate the binlog; the bug is triggered when the master binlog file changes # after the event group that causes the duplicate key error. FLUSH LOGS; @@ -1470,7 +1472,12 @@ START SLAVE SQL_THREAD; SELECT * FROM t2 WHERE a >= 30 ORDER BY a; # Skip the duplicate error, so we can proceed. +--error ER_SLAVE_SKIP_NOT_IN_GTID SET sql_slave_skip_counter= 1; +--source include/stop_slave_io.inc +--disable_query_log +eval SET GLOBAL gtid_slave_pos = REPLACE(@@gtid_slave_pos, "$gtid1", "$gtid2"); +--enable_query_log --source include/start_slave.inc --source include/sync_with_master_gtid.inc diff --git a/sql/rpl_rli.cc b/sql/rpl_rli.cc index 3207c858b20..7e0a1279e05 100644 --- a/sql/rpl_rli.cc +++ b/sql/rpl_rli.cc @@ -389,6 +389,7 @@ Failed to open the existing relay log info file '%s' (errno %d)", if (rli->is_relay_log_recovery && init_recovery(rli->mi, &msg)) goto err; + rli->relay_log_state.load(rpl_global_gtid_slave_state); if (init_relay_log_pos(rli, rli->group_relay_log_name, rli->group_relay_log_pos, @@ -1137,6 +1138,7 @@ int purge_relay_logs(Relay_log_info* rli, THD *thd, bool just_reset, error=1; goto err; } + rli->relay_log_state.load(rpl_global_gtid_slave_state); if (!just_reset) { /* Save name of used relay log file */ diff --git a/sql/share/errmsg-utf8.txt b/sql/share/errmsg-utf8.txt index 5fed6b10425..4394e796a1a 100644 --- a/sql/share/errmsg-utf8.txt +++ b/sql/share/errmsg-utf8.txt @@ -7108,7 +7108,7 @@ ER_PRIOR_COMMIT_FAILED ER_IT_IS_A_VIEW 42S02 eng "'%-.192s' is a view" ER_SLAVE_SKIP_NOT_IN_GTID - eng "When using GTID, @@sql_slave_skip_counter can not be used. Instead, setting @@gtid_slave_pos explicitly can be used to skip to after a given GTID position." + eng "When using parallel replication and GTID with multiple replication domains, @@sql_slave_skip_counter can not be used. Instead, setting @@gtid_slave_pos explicitly can be used to skip to after a given GTID position." ER_TABLE_DEFINITION_TOO_BIG eng "The definition for table %`s is too big" ER_PLUGIN_INSTALLED diff --git a/sql/slave.cc b/sql/slave.cc index 0243272c443..7b493c4c0be 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -4527,15 +4527,6 @@ pthread_handler_t handle_slave_sql(void *arg) serial_rgi->gtid_sub_id= 0; serial_rgi->gtid_pending= false; - if (mi->using_gtid != Master_info::USE_GTID_NO) - { - /* - We initialize the relay log state from the know starting position. - It will then be updated as required by GTID and GTID_LIST events found - while applying events read from relay logs. - */ - rli->relay_log_state.load(&rpl_global_gtid_slave_state); - } rli->gtid_skip_flag = GTID_SKIP_NOT; if (init_relay_log_pos(rli, rli->group_relay_log_name, @@ -4791,6 +4782,7 @@ log '%s' at position %s, relay log '%s' position: %s%s", RPL_LOG_NAME, } strmake_buf(rli->group_relay_log_name, ir->name); rli->group_relay_log_pos= BIN_LOG_HEADER_SIZE; + rli->relay_log_state.load(ir->relay_log_state, ir->relay_log_state_count); } } } diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc index d0ee489aa82..81c71ce23e3 100644 --- a/sql/sys_vars.cc +++ b/sql/sys_vars.cc @@ -4319,6 +4319,29 @@ static bool update_slave_skip_counter(sys_var *self, THD *thd, Master_info *mi) mi->connection_name.str); return true; } + if (mi->using_gtid != Master_info::USE_GTID_NO && + opt_slave_parallel_threads > 0) + { + ulong domain_count; + mysql_mutex_lock(&rpl_global_gtid_slave_state->LOCK_slave_state); + domain_count= rpl_global_gtid_slave_state->count(); + mysql_mutex_unlock(&rpl_global_gtid_slave_state->LOCK_slave_state); + if (domain_count > 1) + { + /* + With domain-based parallel replication, the slave position is + multi-dimensional, so the relay log position is not very meaningful. + It might not even correspond to the next GTID to execute in _any_ + domain (the case after error stop). So slave_skip_counter will most + likely not do what the user intends. Instead give an error, with a + suggestion to instead set @@gtid_slave_pos past the point of error; + this works reliably also in the case of multiple domains. + */ + my_error(ER_SLAVE_SKIP_NOT_IN_GTID, MYF(0)); + return true; + } + } + /* The value was stored temporarily in thd */ mi->rli.slave_skip_counter= thd->variables.slave_skip_counter; return false; |