diff options
author | unknown <guilhem@mysql.com> | 2004-12-16 18:12:22 +0100 |
---|---|---|
committer | unknown <guilhem@mysql.com> | 2004-12-16 18:12:22 +0100 |
commit | 4869f022fd4761445843135102790691e8d48861 (patch) | |
tree | 747ea547f2e4eb2f3ee55054dac260feb2f91c5b /sql/slave.cc | |
parent | 730e2722fffd9cb7086770e7652cebc932c4c261 (diff) | |
download | mariadb-git-4869f022fd4761445843135102790691e8d48861.tar.gz |
Change the behaviour of Seconds_Behind_Master in SHOW SLAVE STATUS. This goes into 4.1
because the old behaviour was somewhat nonsensical (arguably a bug). The changes: if the replication threads are
down or disconnected, the column is NULL; and if the master is idle, the column no longer grows indefinitely.
sql/slave.cc:
mi->slave_running and rli->slave_running are now uints (this was needed only for mi, but because of how
start_slave_thread() is used, both had to change).
So mi->slave_running can now take 3 values: not running, running & not connected, running & connected.
The last value is used to compute Seconds_Behind_Master in SHOW SLAVE STATUS.
This column's behaviour changes: if the SQL or I/O thread is not running, or if the I/O thread is not connected
(for example if it is reconnecting), it is NULL (meaning "unknown"). And if the master is idle, the column no
longer grows indefinitely like it used to (that was meaningless); this is fixed by forcing a value of 0
when the slave SQL thread has hit EOF of the relay log (this has only a limited number of caveats, explained
in comments in the code).
sql/slave.h:
slave_running used to be bool but we need to distinguish, for the I/O slave thread, between
"running & connected" and "running & not connected" ("running" means the thread exists).
sql/sql_repl.cc:
we no longer need to set rli->last_master_timestamp to 0 (we used that to make Seconds_Behind_Master
be NULL) in RESET SLAVE and CHANGE MASTER, as these commands imply that the slave threads are not running,
so Seconds_Behind_Master is already NULL.
Diffstat (limited to 'sql/slave.cc')
-rw-r--r-- | sql/slave.cc | 61 |
1 files changed, 48 insertions, 13 deletions
diff --git a/sql/slave.cc b/sql/slave.cc index bd9650ed369..90c75243f3d 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -545,7 +545,7 @@ int terminate_slave_threads(MASTER_INFO* mi,int thread_mask,bool skip_lock) int terminate_slave_thread(THD* thd, pthread_mutex_t* term_lock, pthread_mutex_t *cond_lock, pthread_cond_t* term_cond, - volatile bool* slave_running) + volatile uint *slave_running) { if (term_lock) { @@ -583,7 +583,7 @@ int terminate_slave_thread(THD* thd, pthread_mutex_t* term_lock, int start_slave_thread(pthread_handler h_func, pthread_mutex_t *start_lock, pthread_mutex_t *cond_lock, pthread_cond_t *start_cond, - volatile bool *slave_running, + volatile uint *slave_running, volatile ulong *slave_run_id, MASTER_INFO* mi, bool high_priority) @@ -963,7 +963,7 @@ void end_slave() static bool io_slave_killed(THD* thd, MASTER_INFO* mi) { DBUG_ASSERT(mi->io_thd == thd); - DBUG_ASSERT(mi->slave_running == 1); // tracking buffer overrun + DBUG_ASSERT(mi->slave_running); // tracking buffer overrun return mi->abort_slave || abort_loop || thd->killed; } @@ -1767,19 +1767,13 @@ void init_master_info_with_options(MASTER_INFO* mi) strmake(mi->ssl_key, master_ssl_key, sizeof(mi->ssl_key)-1); } -static void clear_slave_error(RELAY_LOG_INFO* rli) +void clear_slave_error(RELAY_LOG_INFO* rli) { /* Clear the errors displayed by SHOW SLAVE STATUS */ rli->last_slave_error[0]= 0; rli->last_slave_errno= 0; } -void clear_slave_error_timestamp(RELAY_LOG_INFO* rli) -{ - rli->last_master_timestamp= 0; - clear_slave_error(rli); -} - /* Reset UNTIL condition for RELAY_LOG_INFO SYNOPSYS @@ -2166,6 +2160,11 @@ int show_master_info(THD* thd, MASTER_INFO* mi) String *packet= &thd->packet; protocol->prepare_for_resend(); + /* + TODO: we read slave_running without run_lock, whereas these variables + are updated under run_lock and not data_lock. In 5.0 we should lock + run_lock on top of data_lock (with good order). 
+ */ pthread_mutex_lock(&mi->data_lock); pthread_mutex_lock(&mi->rli.data_lock); @@ -2226,7 +2225,12 @@ int show_master_info(THD* thd, MASTER_INFO* mi) protocol->store(mi->ssl_cipher, &my_charset_bin); protocol->store(mi->ssl_key, &my_charset_bin); - if (mi->rli.last_master_timestamp) + /* + Seconds_Behind_Master: if SQL thread is running and I/O thread is + connected, we can compute it otherwise show NULL (i.e. unknown). + */ + if ((mi->slave_running == MYSQL_SLAVE_RUN_CONNECT) && + mi->rli.slave_running) { long tmp= (long)((time_t)time((time_t*) 0) - mi->rli.last_master_timestamp) @@ -2246,9 +2250,13 @@ int show_master_info(THD* thd, MASTER_INFO* mi) slave is 2. At SHOW SLAVE STATUS time, assume that the difference between timestamp of slave and rli->last_master_timestamp is 0 (i.e. they are in the same second), then we get 0-(2-1)=-1 as a result. - This confuses users, so we don't go below 0. + This confuses users, so we don't go below 0: hence the max(). + + last_master_timestamp == 0 (an "impossible" timestamp 1970) is a + special marker to say "consider we have caught up". */ - protocol->store((longlong)(max(0, tmp))); + protocol->store((longlong)(mi->rli.last_master_timestamp ? 
max(0, tmp) + : 0)); } else protocol->store_null(); @@ -3041,6 +3049,8 @@ slave_begin: connected: + // TODO: the assignment below should be under mutex (5.0) + mi->slave_running= MYSQL_SLAVE_RUN_CONNECT; thd->slave_net = &mysql->net; thd->proc_info = "Checking master version"; if (get_master_version_and_clock(mysql, mi)) @@ -3072,6 +3082,7 @@ dump"); goto err; } + mi->slave_running= MYSQL_SLAVE_RUN_NOT_CONNECT; thd->proc_info= "Waiting to reconnect after a failed binlog dump request"; #ifdef SIGNAL_WITH_VIO_CLOSE thd->clear_active_vio(); @@ -3148,6 +3159,7 @@ max_allowed_packet", mysql_error(mysql)); goto err; } + mi->slave_running= MYSQL_SLAVE_RUN_NOT_CONNECT; thd->proc_info = "Waiting to reconnect after a failed master event read"; #ifdef SIGNAL_WITH_VIO_CLOSE thd->clear_active_vio(); @@ -3323,6 +3335,14 @@ slave_begin: pthread_mutex_lock(&LOCK_thread_count); threads.append(thd); pthread_mutex_unlock(&LOCK_thread_count); + /* + We are going to set slave_running to 1. Assuming slave I/O thread is + alive and connected, this is going to make Seconds_Behind_Master be 0 + i.e. "caught up". Even if we're just at start of thread. Well it's ok, at + the moment we start we can think we are caught up, and the next second we + start receiving data so we realize we are not caught up and + Seconds_Behind_Master grows. No big deal. + */ rli->slave_running = 1; rli->abort_slave = 0; pthread_mutex_unlock(&rli->run_lock); @@ -4211,10 +4231,25 @@ Before assert, my_b_tell(cur_log)=%s rli->event_relay_log_pos=%s", */ pthread_mutex_unlock(&rli->log_space_lock); pthread_cond_broadcast(&rli->log_space_cond); + /* + We say in Seconds_Behind_Master that we have "caught up". Note that + for example if network link is broken but I/O slave thread hasn't + noticed it (slave_net_timeout not elapsed), then we'll say "caught + up" whereas we're not really caught up. Fixing that would require + internally cutting timeout in smaller pieces in network read, no + thanks. 
Another example: SQL has caught up on I/O, now I/O has read + a new event and is queuing it; the false "0" will exist until SQL + finishes executing the new event; it will be look abnormal only if + the events have old timestamps (then you get "many", 0, "many"). + Transient phases like this can't really be fixed. + */ + time_t save_timestamp= rli->last_master_timestamp; + rli->last_master_timestamp= 0; // Note that wait_for_update unlocks lock_log ! rli->relay_log.wait_for_update(rli->sql_thd, 1); // re-acquire data lock since we released it earlier pthread_mutex_lock(&rli->data_lock); + rli->last_master_timestamp= save_timestamp; continue; } /* |