summaryrefslogtreecommitdiff
path: root/sql
diff options
context:
space:
mode:
author: unknown <guilhem@mysql.com> 2005-03-02 11:29:48 +0100
committer: unknown <guilhem@mysql.com> 2005-03-02 11:29:48 +0100
commit: d7e0784c65b44f3fa36f602e5f1d7579eed6b88b (patch)
tree: a63b632fd76de92b2e46c444930abb8d50336b2d /sql
parent: 3087b2f1fde43ea192bcf7a2fa217aadc7bf7d3a (diff)
download: mariadb-git-d7e0784c65b44f3fa36f602e5f1d7579eed6b88b.tar.gz
Fix for BUG#8325 "Deadlock in replication thread stops replication":
In slave SQL thread: if a transaction fails because of InnoDB deadlock or innodb_lock_wait_timeout exceeded, optionally retry the transaction a certain number of times (new variable --slave_transaction_retries).
sql/mysql_priv.h: new var slave_transaction_retries
sql/mysqld.cc: new variable slave_transaction_retries. Plus fixing a typo.
sql/set_var.cc: new global variable slave_transaction_retries (will be one per subslave, when we have multimaster).
sql/slave.cc: Slave SQL thread: if a transaction fails because of InnoDB deadlock or innodb_lock_wait_timeout exceeded, optionally retry the transaction a certain number of times (--slave_transaction_retries).
sql/slave.h: new RELAY_LOG_INFO::trans_retries.
Diffstat (limited to 'sql')
-rw-r--r-- sql/mysql_priv.h 2
-rw-r--r-- sql/mysqld.cc 12
-rw-r--r-- sql/set_var.cc 4
-rw-r--r-- sql/slave.cc 57
-rw-r--r-- sql/slave.h 4
5 files changed, 74 insertions, 5 deletions
diff --git a/sql/mysql_priv.h b/sql/mysql_priv.h
index 6c77a8934dd..9e395aa37da 100644
--- a/sql/mysql_priv.h
+++ b/sql/mysql_priv.h
@@ -890,7 +890,7 @@ extern ulong ha_read_first_count, ha_read_last_count;
extern ulong ha_read_rnd_count, ha_read_rnd_next_count, ha_discover_count;
extern ulong ha_commit_count, ha_rollback_count,table_cache_size;
extern ulong max_connections,max_connect_errors, connect_timeout;
-extern ulong slave_net_timeout;
+extern ulong slave_net_timeout, slave_trans_retries;
extern ulong max_user_connections;
extern ulong long_query_count, what_to_log,flush_time;
extern ulong query_buff_size, thread_stack,thread_stack_min;
diff --git a/sql/mysqld.cc b/sql/mysqld.cc
index 1591b205d46..dba39299907 100644
--- a/sql/mysqld.cc
+++ b/sql/mysqld.cc
@@ -311,7 +311,7 @@ ulong server_id, thd_startup_options;
ulong table_cache_size, thread_stack, thread_stack_min, what_to_log;
ulong query_buff_size, slow_launch_time, slave_open_temp_tables;
ulong open_files_limit, max_binlog_size, max_relay_log_size;
-ulong slave_net_timeout;
+ulong slave_net_timeout, slave_trans_retries;
ulong thread_cache_size=0, binlog_cache_size=0, max_binlog_cache_size=0;
ulong query_cache_size=0;
ulong com_stat[(uint) SQLCOM_END], com_other;
@@ -4096,7 +4096,7 @@ enum options_mysqld
OPT_QUERY_CACHE_TYPE, OPT_QUERY_CACHE_WLOCK_INVALIDATE, OPT_RECORD_BUFFER,
OPT_RECORD_RND_BUFFER, OPT_RELAY_LOG_SPACE_LIMIT, OPT_RELAY_LOG_PURGE,
OPT_SLAVE_NET_TIMEOUT, OPT_SLAVE_COMPRESSED_PROTOCOL, OPT_SLOW_LAUNCH_TIME,
- OPT_READONLY, OPT_DEBUGGING,
+ OPT_SLAVE_TRANS_RETRIES, OPT_READONLY, OPT_DEBUGGING,
OPT_SORT_BUFFER, OPT_TABLE_CACHE,
OPT_THREAD_CONCURRENCY, OPT_THREAD_CACHE_SIZE,
OPT_TMP_TABLE_SIZE, OPT_THREAD_STACK,
@@ -5049,7 +5049,7 @@ The minimum value for this variable is 4096.",
(gptr*) &max_system_variables.max_length_for_sort_data, 0, GET_ULONG,
REQUIRED_ARG, 1024, 4, 8192*1024L, 0, 1, 0},
{"max_relay_log_size", OPT_MAX_RELAY_LOG_SIZE,
- "If non-zero: relay log will be rotated automatically when the size exceeds this value; if zero (the default): when the size exceeds max_binlog_size. 0 expected, the minimum value for this variable is 4096.",
+ "If non-zero: relay log will be rotated automatically when the size exceeds this value; if zero (the default): when the size exceeds max_binlog_size. 0 excepted, the minimum value for this variable is 4096.",
(gptr*) &max_relay_log_size, (gptr*) &max_relay_log_size, 0, GET_ULONG,
REQUIRED_ARG, 0L, 0L, 1024*1024L*1024L, 0, IO_SIZE, 0},
{ "max_seeks_for_key", OPT_MAX_SEEKS_FOR_KEY,
@@ -5220,6 +5220,12 @@ The minimum value for this variable is 4096.",
"Number of seconds to wait for more data from a master/slave connection before aborting the read.",
(gptr*) &slave_net_timeout, (gptr*) &slave_net_timeout, 0,
GET_ULONG, REQUIRED_ARG, SLAVE_NET_TIMEOUT, 1, LONG_TIMEOUT, 0, 1, 0},
+ {"slave_transaction_retries", OPT_SLAVE_TRANS_RETRIES,
+ "Number of times the slave SQL thread will retry a transaction in case "
+ "it failed with a deadlock or elapsed lock wait timeout, "
+ "before giving up and stopping.",
+ (gptr*) &slave_trans_retries, (gptr*) &slave_trans_retries, 0,
+ GET_ULONG, REQUIRED_ARG, 0L, 0L, (longlong) ULONG_MAX, 0, 1, 0},
#endif /* HAVE_REPLICATION */
{"slow_launch_time", OPT_SLOW_LAUNCH_TIME,
"If creating the thread takes longer than this value (in seconds), the Slow_launch_threads counter will be incremented.",
diff --git a/sql/set_var.cc b/sql/set_var.cc
index 29ebb2c8817..02b53f701cc 100644
--- a/sql/set_var.cc
+++ b/sql/set_var.cc
@@ -320,6 +320,8 @@ sys_var_bool_ptr sys_slave_compressed_protocol("slave_compressed_protocol",
#ifdef HAVE_REPLICATION
sys_var_long_ptr sys_slave_net_timeout("slave_net_timeout",
&slave_net_timeout);
+sys_var_long_ptr sys_slave_trans_retries("slave_transaction_retries",
+ &slave_trans_retries);
#endif
sys_var_long_ptr sys_slow_launch_time("slow_launch_time",
&slow_launch_time);
@@ -604,6 +606,7 @@ sys_var *sys_variables[]=
#ifdef HAVE_REPLICATION
&sys_slave_compressed_protocol,
&sys_slave_net_timeout,
+ &sys_slave_trans_retries,
&sys_slave_skip_counter,
#endif
&sys_slow_launch_time,
@@ -855,6 +858,7 @@ struct show_var_st init_vars[]= {
{"skip_show_database", (char*) &opt_skip_show_db, SHOW_BOOL},
#ifdef HAVE_REPLICATION
{sys_slave_net_timeout.name,(char*) &sys_slave_net_timeout, SHOW_SYS},
+ {sys_slave_trans_retries.name,(char*) &sys_slave_trans_retries, SHOW_SYS},
#endif
{sys_slow_launch_time.name, (char*) &sys_slow_launch_time, SHOW_SYS},
#ifdef HAVE_SYS_UN_H
diff --git a/sql/slave.cc b/sql/slave.cc
index a39cbdbe14b..145f4295075 100644
--- a/sql/slave.cc
+++ b/sql/slave.cc
@@ -2959,6 +2959,62 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli)
exec_res = ev->exec_event(rli);
DBUG_ASSERT(rli->sql_thd==thd);
delete ev;
+ if (slave_trans_retries)
+ {
+ if (exec_res &&
+ (thd->net.last_errno == ER_LOCK_DEADLOCK ||
+ thd->net.last_errno == ER_LOCK_WAIT_TIMEOUT) &&
+ !thd->is_fatal_error)
+ {
+ const char *errmsg;
+ /*
+ We were in a transaction which has been rolled back because of a
+ deadlock (currently, InnoDB deadlock detected by InnoDB) or lock
+ wait timeout (innodb_lock_wait_timeout exceeded); let's seek back to
+ BEGIN log event and retry it all again.
+ We have to not only seek but also
+ a) init_master_info(), to seek back to hot relay log's start for later
+ (for when we will come back to this hot log after re-processing the
+ possibly existing old logs where BEGIN is: check_binlog_magic() will
+ then need the cache to be at position 0; see comments at beginning of
+ init_master_info()).
+ b) init_relay_log_pos(), because the BEGIN may be an older relay log.
+ */
+ if (rli->trans_retries--)
+ {
+ sql_print_information("Slave SQL thread retries transaction");
+ if (init_master_info(rli->mi, 0, 0, 0, SLAVE_SQL))
+ sql_print_error("Failed to initialize the master info structure");
+ else if (init_relay_log_pos(rli,
+ rli->group_relay_log_name,
+ rli->group_relay_log_pos,
+ 1, &errmsg))
+ sql_print_error("Error initializing relay log position: %s",
+ errmsg);
+ else
+ {
+ exec_res= 0;
+ sleep(2); // chance for concurrent connection to get more locks
+ }
+ }
+ else
+ sql_print_error("Slave SQL thread retried transaction %lu time(s) "
+ "in vain, giving up. Consider raising the value of "
+ "the slave_transaction_retries variable.",
+ slave_trans_retries);
+ }
+ if (!((thd->options & OPTION_BEGIN) && opt_using_transactions))
+ {
+ rli->trans_retries= slave_trans_retries; // restart from fresh
+ /*
+ TODO: when merged into 5.0, when slave does auto-rollback if
+ corrupted binlog, this should reset the retry counter too
+ (any rollback should). In fact it will work, as here we are just out
+ of a Format_description_log_event::exec_event() which rolled back.
+ But check repl code in 5.0 for new ha_rollback calls, just in case.
+ */
+ }
+ }
return exec_res;
}
else
@@ -3370,6 +3426,7 @@ slave_begin:
pthread_mutex_lock(&rli->log_space_lock);
rli->ignore_log_space_limit= 0;
pthread_mutex_unlock(&rli->log_space_lock);
+ rli->trans_retries= slave_trans_retries; // start from "no error"
if (init_relay_log_pos(rli,
rli->group_relay_log_name,
diff --git a/sql/slave.h b/sql/slave.h
index bcd79dd4a39..fb7560ec738 100644
--- a/sql/slave.h
+++ b/sql/slave.h
@@ -294,7 +294,9 @@ typedef struct st_relay_log_info
UNTIL_LOG_NAMES_CMP_UNKNOWN= -2, UNTIL_LOG_NAMES_CMP_LESS= -1,
UNTIL_LOG_NAMES_CMP_EQUAL= 0, UNTIL_LOG_NAMES_CMP_GREATER= 1
} until_log_names_cmp_result;
-
+
+ ulong trans_retries;
+
st_relay_log_info();
~st_relay_log_info();