diff options
author | Jonas Oreland <jonaso@google.com> | 2014-12-23 13:38:00 +0100 |
---|---|---|
committer | Kristian Nielsen <knielsen@knielsen-hq.org> | 2014-12-23 14:16:32 +0100 |
commit | 0b87de124d716cee7c1aa56f30c7f80c2c2bfcce (patch) | |
tree | 5557407d03402bdf28daf1b64e438362358f6188 /sql | |
parent | 4d8b346e079a27960dbe49e4d0ec4364bed8d30e (diff) | |
download | mariadb-git-0b87de124d716cee7c1aa56f30c7f80c2c2bfcce.tar.gz |
MDEV-162 Enhanced semisync replication
Implement --semi-sync-master-wait-point=AFTER_SYNC|AFTER_COMMIT.
With AFTER_SYNC, the semi-sync wait is done earlier: after the binlog has been
synced but before the storage engine commit, rather than after the commit. This
means that a transaction will not be visible on the master until at least one
slave has received it.
Diffstat (limited to 'sql')
-rw-r--r-- | sql/handler.cc | 7 | ||||
-rw-r--r-- | sql/log.cc | 118 | ||||
-rw-r--r-- | sql/log.h | 4 | ||||
-rw-r--r-- | sql/mysqld.cc | 5 | ||||
-rw-r--r-- | sql/replication.h | 33 | ||||
-rw-r--r-- | sql/rpl_handler.cc | 29 | ||||
-rw-r--r-- | sql/rpl_handler.h | 5 | ||||
-rw-r--r-- | sql/transaction.cc | 14 |
8 files changed, 192 insertions, 23 deletions
diff --git a/sql/handler.cc b/sql/handler.cc index c1363cfcaf1..4bb8e0b4397 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -32,6 +32,7 @@ #include "sql_acl.h" // SUPER_ACL #include "sql_base.h" // free_io_cache #include "discover.h" // extension_based_table_discovery, etc +#include "log.h" // for assert_LOCK_log_owner #include "log_event.h" // *_rows_log_event #include "create_options.h" #include "rpl_filter.h" @@ -1479,6 +1480,12 @@ int ha_commit_trans(THD *thd, bool all) done: DBUG_EXECUTE_IF("crash_commit_after", DBUG_SUICIDE();); + + /* documentation of which mutexes are (not) owned */ + mysql_mutex_assert_not_owner(&LOCK_prepare_ordered); + assert_LOCK_log_owner(false); + mysql_mutex_assert_not_owner(&LOCK_after_binlog_sync); + mysql_mutex_assert_not_owner(&LOCK_commit_ordered); RUN_HOOK(transaction, after_commit, (thd, FALSE)); goto end; diff --git a/sql/log.cc b/sql/log.cc index edb4c07c8cc..66e142668a9 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -93,6 +93,7 @@ ulong opt_binlog_dbug_fsync_sleep= 0; mysql_mutex_t LOCK_prepare_ordered; mysql_cond_t COND_prepare_ordered; +mysql_mutex_t LOCK_after_binlog_sync; mysql_mutex_t LOCK_commit_ordered; static ulonglong binlog_status_var_num_commits; @@ -3938,7 +3939,8 @@ bool MYSQL_BIN_LOG::reset_logs(THD* thd, bool create_new_log, Without binlog, we cannot XA recover prepared-but-not-committed transactions in engines. So force a commit checkpoint first. - Note that we take and immediately release LOCK_commit_ordered. This has + Note that we take and immediately + release LOCK_after_binlog_sync/LOCK_commit_ordered. This has the effect to ensure that any on-going group commit (in trx_group_commit_leader()) has completed before we request the checkpoint, due to the chaining of LOCK_log and LOCK_commit_ordered in that function. 
@@ -3949,7 +3951,10 @@ bool MYSQL_BIN_LOG::reset_logs(THD* thd, bool create_new_log, commit_ordered() in the engine of some transaction, and then a crash later would leave such transaction not recoverable. */ + + mysql_mutex_lock(&LOCK_after_binlog_sync); mysql_mutex_lock(&LOCK_commit_ordered); + mysql_mutex_unlock(&LOCK_after_binlog_sync); mysql_mutex_unlock(&LOCK_commit_ordered); mark_xids_active(current_binlog_id, 1); @@ -6035,11 +6040,6 @@ err: if ((error= flush_and_sync(&synced))) { } - else if ((error= RUN_HOOK(binlog_storage, after_flush, - (thd, log_file_name, file->pos_in_file, synced)))) - { - sql_print_error("Failed to run 'after_flush' hooks"); - } else { /* update binlog_end_pos so it can be read by dump thread @@ -6050,23 +6050,58 @@ err: */ update_binlog_end_pos(offset); - signal_update(); - if ((error= rotate(false, &check_purge))) - check_purge= false; + /* documentation of which mutexes are (not) owned */ + mysql_mutex_assert_not_owner(&LOCK_prepare_ordered); + mysql_mutex_assert_owner(&LOCK_log); + mysql_mutex_assert_not_owner(&LOCK_after_binlog_sync); + mysql_mutex_assert_not_owner(&LOCK_commit_ordered); + bool first= true; + bool last= true; + if ((error= RUN_HOOK(binlog_storage, after_flush, + (thd, log_file_name, file->pos_in_file, + synced, first, last)))) + { + sql_print_error("Failed to run 'after_flush' hooks"); + error= 1; + } + else + { + signal_update(); + if ((error= rotate(false, &check_purge))) + check_purge= false; + } } } status_var_add(thd->status_var.binlog_bytes_written, offset - my_org_b_tell); + mysql_mutex_lock(&LOCK_after_binlog_sync); + mysql_mutex_unlock(&LOCK_log); + + /* documentation of which mutexes are (not) owned */ + mysql_mutex_assert_not_owner(&LOCK_prepare_ordered); + mysql_mutex_assert_not_owner(&LOCK_log); + mysql_mutex_assert_owner(&LOCK_after_binlog_sync); + mysql_mutex_assert_not_owner(&LOCK_commit_ordered); + bool first= true; + bool last= true; + if (RUN_HOOK(binlog_storage, after_sync, + (thd, 
log_file_name, file->pos_in_file, + first, last))) + { + error=1; + /* error is already printed inside hook */ + } + /* Take mutex to protect against a reader seeing partial writes of 64-bit offset on 32-bit CPUs. */ mysql_mutex_lock(&LOCK_commit_ordered); + mysql_mutex_unlock(&LOCK_after_binlog_sync); last_commit_pos_offset= offset; mysql_mutex_unlock(&LOCK_commit_ordered); - mysql_mutex_unlock(&LOCK_log); if (check_purge) checkpoint_and_purge(prev_binlog_id); @@ -7374,13 +7409,22 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader) { bool any_error= false; bool all_error= true; + + /* documentation of which mutexes are (not) owned */ + mysql_mutex_assert_not_owner(&LOCK_prepare_ordered); + mysql_mutex_assert_owner(&LOCK_log); + mysql_mutex_assert_not_owner(&LOCK_after_binlog_sync); + mysql_mutex_assert_not_owner(&LOCK_commit_ordered); + bool first= true, last; for (current= queue; current != NULL; current= current->next) { + last= current->next == NULL; if (!current->error && RUN_HOOK(binlog_storage, after_flush, (current->thd, current->cache_mngr->last_commit_pos_file, - current->cache_mngr->last_commit_pos_offset, synced))) + current->cache_mngr->last_commit_pos_offset, synced, + first, last))) { current->error= ER_ERROR_ON_WRITE; current->commit_errno= -1; @@ -7389,6 +7433,7 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader) } else all_error= false; + first= false; } /* update binlog_end_pos so it can be read by dump thread @@ -7437,22 +7482,55 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader) } } - DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_commit_ordered"); - mysql_mutex_lock(&LOCK_commit_ordered); - /** - * TODO(jonaso): Check with Kristian, - * if we rotate:d above, this offset is "wrong" - */ - last_commit_pos_offset= commit_offset; + DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_after_binlog_sync"); + mysql_mutex_lock(&LOCK_after_binlog_sync); /* - We cannot unlock LOCK_log until we have 
locked LOCK_commit_ordered; + We cannot unlock LOCK_log until we have locked LOCK_after_binlog_sync; otherwise scheduling could allow the next group commit to run ahead of us, messing up the order of commit_ordered() calls. But as soon as - LOCK_commit_ordered is obtained, we can let the next group commit start. + LOCK_after_binlog_sync is obtained, we can let the next group commit start. */ mysql_mutex_unlock(&LOCK_log); DEBUG_SYNC(leader->thd, "commit_after_release_LOCK_log"); + + /* + Loop through threads and run the binlog_sync hook + */ + { + /* documentation of which mutexes are (not) owned */ + mysql_mutex_assert_not_owner(&LOCK_prepare_ordered); + mysql_mutex_assert_not_owner(&LOCK_log); + mysql_mutex_assert_owner(&LOCK_after_binlog_sync); + mysql_mutex_assert_not_owner(&LOCK_commit_ordered); + + bool first= true, last; + for (current= queue; current != NULL; current= current->next) + { + last= current->next == NULL; + if (!current->error && + RUN_HOOK(binlog_storage, after_sync, + (current->thd, log_file_name, + current->cache_mngr->last_commit_pos_offset, + first, last))) + { + /* error is already printed inside hook */ + } + first= false; + } + } + + DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_commit_ordered"); + mysql_mutex_lock(&LOCK_commit_ordered); + last_commit_pos_offset= commit_offset; + + /* + Unlock LOCK_after_binlog_sync only *after* LOCK_commit_ordered has been + acquired so that groups can not reorder for the different stages of + the group commit procedure. 
+ */ + mysql_mutex_unlock(&LOCK_after_binlog_sync); + DEBUG_SYNC(leader->thd, "commit_after_release_LOCK_after_binlog_sync"); ++num_group_commits; if (!opt_optimize_thread_scheduling) diff --git a/sql/log.h b/sql/log.h index 21af7d08959..9b6f365eea7 100644 --- a/sql/log.h +++ b/sql/log.h @@ -88,9 +88,11 @@ protected: */ extern mysql_mutex_t LOCK_prepare_ordered; extern mysql_cond_t COND_prepare_ordered; +extern mysql_mutex_t LOCK_after_binlog_sync; extern mysql_mutex_t LOCK_commit_ordered; #ifdef HAVE_PSI_INTERFACE extern PSI_mutex_key key_LOCK_prepare_ordered, key_LOCK_commit_ordered; +extern PSI_mutex_key key_LOCK_after_binlog_sync; extern PSI_cond_key key_COND_prepare_ordered; #endif @@ -1157,4 +1159,6 @@ static inline TC_LOG *get_tc_log_implementation() void assert_LOCK_log_owner(bool owner); +void assert_LOCK_log_owner(bool owner); + #endif /* LOG_H */ diff --git a/sql/mysqld.cc b/sql/mysqld.cc index 1bf91a9f965..07b4f1da3b1 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -890,6 +890,7 @@ PSI_mutex_key key_LOCK_stats, key_LOCK_wakeup_ready, key_LOCK_wait_commit; PSI_mutex_key key_LOCK_gtid_waiting; +PSI_mutex_key key_LOCK_after_binlog_sync; PSI_mutex_key key_LOCK_prepare_ordered, key_LOCK_commit_ordered, key_LOCK_slave_init; PSI_mutex_key key_TABLE_SHARE_LOCK_share; @@ -954,6 +955,7 @@ static PSI_mutex_info all_server_mutexes[]= { &key_TABLE_SHARE_LOCK_share, "TABLE_SHARE::LOCK_share", 0}, { &key_LOCK_error_messages, "LOCK_error_messages", PSI_FLAG_GLOBAL}, { &key_LOCK_prepare_ordered, "LOCK_prepare_ordered", PSI_FLAG_GLOBAL}, + { &key_LOCK_after_binlog_sync, "LOCK_after_binlog_sync", PSI_FLAG_GLOBAL}, { &key_LOCK_commit_ordered, "LOCK_commit_ordered", PSI_FLAG_GLOBAL}, { &key_LOCK_slave_init, "LOCK_slave_init", PSI_FLAG_GLOBAL}, { &key_LOG_INFO_lock, "LOG_INFO::lock", 0}, @@ -2243,6 +2245,7 @@ static void clean_up_mutexes() mysql_cond_destroy(&COND_server_started); mysql_mutex_destroy(&LOCK_prepare_ordered); mysql_cond_destroy(&COND_prepare_ordered); + 
mysql_mutex_destroy(&LOCK_after_binlog_sync); mysql_mutex_destroy(&LOCK_commit_ordered); mysql_mutex_destroy(&LOCK_slave_init); mysql_cond_destroy(&COND_slave_init); @@ -4535,6 +4538,8 @@ static int init_thread_environment() mysql_mutex_init(key_LOCK_prepare_ordered, &LOCK_prepare_ordered, MY_MUTEX_INIT_SLOW); mysql_cond_init(key_COND_prepare_ordered, &COND_prepare_ordered, NULL); + mysql_mutex_init(key_LOCK_after_binlog_sync, &LOCK_after_binlog_sync, + MY_MUTEX_INIT_SLOW); mysql_mutex_init(key_LOCK_commit_ordered, &LOCK_commit_ordered, MY_MUTEX_INIT_SLOW); mysql_mutex_init(key_LOCK_slave_init, &LOCK_slave_init, diff --git a/sql/replication.h b/sql/replication.h index 9f9cc9eadfc..4731c2246ef 100644 --- a/sql/replication.h +++ b/sql/replication.h @@ -81,6 +81,7 @@ typedef struct Trans_observer { succeeded. @note The return value is currently ignored by the server. + @note This hook is called wo/ any global mutex held @param param The parameter for transaction observers @@ -103,6 +104,8 @@ typedef struct Trans_observer { @param param The parameter for transaction observers + @note This hook is called wo/ any global mutex held + @retval 0 Sucess @retval 1 Failure */ @@ -114,7 +117,13 @@ typedef struct Trans_observer { */ enum Binlog_storage_flags { /** Binary log was sync:ed */ - BINLOG_STORAGE_IS_SYNCED = 1 + BINLOG_STORAGE_IS_SYNCED = 1, + + /** First(or alone) in a group commit */ + BINLOG_GROUP_COMMIT_LEADER = 2, + + /** Last(or alone) in a group commit */ + BINLOG_GROUP_COMMIT_TRAILER = 4 }; /** @@ -137,6 +146,8 @@ typedef struct Binlog_storage_observer { binary log file. Whether the binary log file is synchronized to disk is indicated by the bit BINLOG_STORAGE_IS_SYNCED in @a flags. 
+ @note: this hook is called with LOCK_log mutex held + @param param Observer common parameter @param log_file Binlog file name been updated @param log_pos Binlog position after update @@ -148,6 +159,26 @@ typedef struct Binlog_storage_observer { int (*after_flush)(Binlog_storage_param *param, const char *log_file, my_off_t log_pos, uint32 flags); + + /** + This callback is called after binlog has been synced + + This callback is called after events flushed to disk has been sync:ed + ("group committed"). + + @note: this hook is called with LOCK_after_binlog_sync mutex held + + @param param Observer common parameter + @param log_file Binlog file name been updated + @param log_pos Binlog position after update + @param flags flags for binlog storage + + @retval 0 Sucess + @retval 1 Failure + */ + int (*after_sync)(Binlog_storage_param *param, + const char *log_file, my_off_t log_pos, + uint32 flags); } Binlog_storage_observer; /** diff --git a/sql/rpl_handler.cc b/sql/rpl_handler.cc index 09e221e9bd5..2c388e572f9 100644 --- a/sql/rpl_handler.cc +++ b/sql/rpl_handler.cc @@ -252,12 +252,18 @@ int Trans_delegate::after_rollback(THD *thd, bool all) int Binlog_storage_delegate::after_flush(THD *thd, const char *log_file, my_off_t log_pos, - bool synced) + bool synced, + bool first_in_group, + bool last_in_group) { Binlog_storage_param param; uint32 flags=0; if (synced) flags |= BINLOG_STORAGE_IS_SYNCED; + if (first_in_group) + flags|= BINLOG_GROUP_COMMIT_LEADER; + if (last_in_group) + flags|= BINLOG_GROUP_COMMIT_TRAILER; Trans_binlog_info *log_info= my_pthread_getspecific_ptr(Trans_binlog_info*, RPL_TRANS_BINLOG_INFO); @@ -279,6 +285,27 @@ int Binlog_storage_delegate::after_flush(THD *thd, return ret; } +int Binlog_storage_delegate::after_sync(THD *thd, + const char *log_file, + my_off_t log_pos, + bool first_in_group, + bool last_in_group) +{ + Binlog_storage_param param; + uint32 flags=0; + + if (first_in_group) + flags|= BINLOG_GROUP_COMMIT_LEADER; + if (last_in_group) 
+ flags|= BINLOG_GROUP_COMMIT_TRAILER; + + int ret= 0; + FOREACH_OBSERVER(ret, after_sync, thd, + (&param, log_file+dirname_length(log_file), log_pos, flags)); + + return ret; +} + #ifdef HAVE_REPLICATION int Binlog_transmit_delegate::transmit_start(THD *thd, ushort flags, const char *log_file, diff --git a/sql/rpl_handler.h b/sql/rpl_handler.h index e262ebdbd6b..afcfd9d55b1 100644 --- a/sql/rpl_handler.h +++ b/sql/rpl_handler.h @@ -153,7 +153,10 @@ class Binlog_storage_delegate public: typedef Binlog_storage_observer Observer; int after_flush(THD *thd, const char *log_file, - my_off_t log_pos, bool synced); + my_off_t log_pos, bool synced, + bool first_in_group, bool last_in_group); + int after_sync(THD *thd, const char *log_file, my_off_t log_pos, + bool first_in_group, bool last_in_group); }; #ifdef HAVE_REPLICATION diff --git a/sql/transaction.cc b/sql/transaction.cc index 5127d241e85..3628790aa63 100644 --- a/sql/transaction.cc +++ b/sql/transaction.cc @@ -24,6 +24,7 @@ #include "rpl_handler.h" #include "debug_sync.h" // DEBUG_SYNC #include "sql_acl.h" +#include "log.h" // for assert_LOCK_log_owner /* Conditions under which the transaction state must not change. 
*/ static bool trans_check(THD *thd) @@ -232,6 +233,13 @@ bool trans_commit(THD *thd) ~(SERVER_STATUS_IN_TRANS | SERVER_STATUS_IN_TRANS_READONLY); DBUG_PRINT("info", ("clearing SERVER_STATUS_IN_TRANS")); res= ha_commit_trans(thd, TRUE); + + /* documentation of which mutexes are (not) owned */ + mysql_mutex_assert_not_owner(&LOCK_prepare_ordered); + assert_LOCK_log_owner(false); + mysql_mutex_assert_not_owner(&LOCK_after_binlog_sync); + mysql_mutex_assert_not_owner(&LOCK_commit_ordered); + if (WSREP_ON) wsrep_post_commit(thd, TRUE); /* @@ -433,6 +441,12 @@ bool trans_commit_stmt(THD *thd) } } + /* documentation of which mutexes are (not) owned */ + mysql_mutex_assert_not_owner(&LOCK_prepare_ordered); + assert_LOCK_log_owner(false); + mysql_mutex_assert_not_owner(&LOCK_after_binlog_sync); + mysql_mutex_assert_not_owner(&LOCK_commit_ordered); + /* if res is non-zero, then ha_commit_trans has rolled back the transaction, so the hooks for rollback will be called. |