summaryrefslogtreecommitdiff
path: root/sql
diff options
context:
space:
mode:
author Jonas Oreland <jonaso@google.com> 2014-12-23 13:38:00 +0100
committer Kristian Nielsen <knielsen@knielsen-hq.org> 2014-12-23 14:16:32 +0100
commit 0b87de124d716cee7c1aa56f30c7f80c2c2bfcce (patch)
tree 5557407d03402bdf28daf1b64e438362358f6188 /sql
parent 4d8b346e079a27960dbe49e4d0ec4364bed8d30e (diff)
download mariadb-git-0b87de124d716cee7c1aa56f30c7f80c2c2bfcce.tar.gz
MDEV-162 Enhanced semisync replication
Implement --semi-sync-master-wait-point=AFTER_SYNC|AFTER_COMMIT. When AFTER_SYNC, the semi-sync wait will be done earlier, before the storage engine commit rather than after. This means that a transaction will not be visible on the master until at least one slave has received it.
Diffstat (limited to 'sql')
-rw-r--r-- sql/handler.cc 7
-rw-r--r-- sql/log.cc 118
-rw-r--r-- sql/log.h 4
-rw-r--r-- sql/mysqld.cc 5
-rw-r--r-- sql/replication.h 33
-rw-r--r-- sql/rpl_handler.cc 29
-rw-r--r-- sql/rpl_handler.h 5
-rw-r--r-- sql/transaction.cc 14
8 files changed, 192 insertions, 23 deletions
diff --git a/sql/handler.cc b/sql/handler.cc
index c1363cfcaf1..4bb8e0b4397 100644
--- a/sql/handler.cc
+++ b/sql/handler.cc
@@ -32,6 +32,7 @@
#include "sql_acl.h" // SUPER_ACL
#include "sql_base.h" // free_io_cache
#include "discover.h" // extension_based_table_discovery, etc
+#include "log.h" // for assert_LOCK_log_owner
#include "log_event.h" // *_rows_log_event
#include "create_options.h"
#include "rpl_filter.h"
@@ -1479,6 +1480,12 @@ int ha_commit_trans(THD *thd, bool all)
done:
DBUG_EXECUTE_IF("crash_commit_after", DBUG_SUICIDE(););
+
+ /* documentation of which mutexes are (not) owned */
+ mysql_mutex_assert_not_owner(&LOCK_prepare_ordered);
+ assert_LOCK_log_owner(false);
+ mysql_mutex_assert_not_owner(&LOCK_after_binlog_sync);
+ mysql_mutex_assert_not_owner(&LOCK_commit_ordered);
RUN_HOOK(transaction, after_commit, (thd, FALSE));
goto end;
diff --git a/sql/log.cc b/sql/log.cc
index edb4c07c8cc..66e142668a9 100644
--- a/sql/log.cc
+++ b/sql/log.cc
@@ -93,6 +93,7 @@ ulong opt_binlog_dbug_fsync_sleep= 0;
mysql_mutex_t LOCK_prepare_ordered;
mysql_cond_t COND_prepare_ordered;
+mysql_mutex_t LOCK_after_binlog_sync;
mysql_mutex_t LOCK_commit_ordered;
static ulonglong binlog_status_var_num_commits;
@@ -3938,7 +3939,8 @@ bool MYSQL_BIN_LOG::reset_logs(THD* thd, bool create_new_log,
Without binlog, we cannot XA recover prepared-but-not-committed
transactions in engines. So force a commit checkpoint first.
- Note that we take and immediately release LOCK_commit_ordered. This has
+ Note that we take and immediately
+ release LOCK_after_binlog_sync/LOCK_commit_ordered. This has
the effect to ensure that any on-going group commit (in
trx_group_commit_leader()) has completed before we request the checkpoint,
due to the chaining of LOCK_log and LOCK_commit_ordered in that function.
@@ -3949,7 +3951,10 @@ bool MYSQL_BIN_LOG::reset_logs(THD* thd, bool create_new_log,
commit_ordered() in the engine of some transaction, and then a crash
later would leave such transaction not recoverable.
*/
+
+ mysql_mutex_lock(&LOCK_after_binlog_sync);
mysql_mutex_lock(&LOCK_commit_ordered);
+ mysql_mutex_unlock(&LOCK_after_binlog_sync);
mysql_mutex_unlock(&LOCK_commit_ordered);
mark_xids_active(current_binlog_id, 1);
@@ -6035,11 +6040,6 @@ err:
if ((error= flush_and_sync(&synced)))
{
}
- else if ((error= RUN_HOOK(binlog_storage, after_flush,
- (thd, log_file_name, file->pos_in_file, synced))))
- {
- sql_print_error("Failed to run 'after_flush' hooks");
- }
else
{
/* update binlog_end_pos so it can be read by dump thread
@@ -6050,23 +6050,58 @@ err:
*/
update_binlog_end_pos(offset);
- signal_update();
- if ((error= rotate(false, &check_purge)))
- check_purge= false;
+ /* documentation of which mutexes are (not) owned */
+ mysql_mutex_assert_not_owner(&LOCK_prepare_ordered);
+ mysql_mutex_assert_owner(&LOCK_log);
+ mysql_mutex_assert_not_owner(&LOCK_after_binlog_sync);
+ mysql_mutex_assert_not_owner(&LOCK_commit_ordered);
+ bool first= true;
+ bool last= true;
+ if ((error= RUN_HOOK(binlog_storage, after_flush,
+ (thd, log_file_name, file->pos_in_file,
+ synced, first, last))))
+ {
+ sql_print_error("Failed to run 'after_flush' hooks");
+ error= 1;
+ }
+ else
+ {
+ signal_update();
+ if ((error= rotate(false, &check_purge)))
+ check_purge= false;
+ }
}
}
status_var_add(thd->status_var.binlog_bytes_written,
offset - my_org_b_tell);
+ mysql_mutex_lock(&LOCK_after_binlog_sync);
+ mysql_mutex_unlock(&LOCK_log);
+
+ /* documentation of which mutexes are (not) owned */
+ mysql_mutex_assert_not_owner(&LOCK_prepare_ordered);
+ mysql_mutex_assert_not_owner(&LOCK_log);
+ mysql_mutex_assert_owner(&LOCK_after_binlog_sync);
+ mysql_mutex_assert_not_owner(&LOCK_commit_ordered);
+ bool first= true;
+ bool last= true;
+ if (RUN_HOOK(binlog_storage, after_sync,
+ (thd, log_file_name, file->pos_in_file,
+ first, last)))
+ {
+ error=1;
+ /* error is already printed inside hook */
+ }
+
/*
Take mutex to protect against a reader seeing partial writes of 64-bit
offset on 32-bit CPUs.
*/
mysql_mutex_lock(&LOCK_commit_ordered);
+ mysql_mutex_unlock(&LOCK_after_binlog_sync);
last_commit_pos_offset= offset;
mysql_mutex_unlock(&LOCK_commit_ordered);
- mysql_mutex_unlock(&LOCK_log);
if (check_purge)
checkpoint_and_purge(prev_binlog_id);
@@ -7374,13 +7409,22 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
{
bool any_error= false;
bool all_error= true;
+
+ /* documentation of which mutexes are (not) owned */
+ mysql_mutex_assert_not_owner(&LOCK_prepare_ordered);
+ mysql_mutex_assert_owner(&LOCK_log);
+ mysql_mutex_assert_not_owner(&LOCK_after_binlog_sync);
+ mysql_mutex_assert_not_owner(&LOCK_commit_ordered);
+ bool first= true, last;
for (current= queue; current != NULL; current= current->next)
{
+ last= current->next == NULL;
if (!current->error &&
RUN_HOOK(binlog_storage, after_flush,
(current->thd,
current->cache_mngr->last_commit_pos_file,
- current->cache_mngr->last_commit_pos_offset, synced)))
+ current->cache_mngr->last_commit_pos_offset, synced,
+ first, last)))
{
current->error= ER_ERROR_ON_WRITE;
current->commit_errno= -1;
@@ -7389,6 +7433,7 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
}
else
all_error= false;
+ first= false;
}
/* update binlog_end_pos so it can be read by dump thread
@@ -7437,22 +7482,55 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
}
}
- DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_commit_ordered");
- mysql_mutex_lock(&LOCK_commit_ordered);
- /**
- * TODO(jonaso): Check with Kristian,
- * if we rotate:d above, this offset is "wrong"
- */
- last_commit_pos_offset= commit_offset;
+ DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_after_binlog_sync");
+ mysql_mutex_lock(&LOCK_after_binlog_sync);
/*
- We cannot unlock LOCK_log until we have locked LOCK_commit_ordered;
+ We cannot unlock LOCK_log until we have locked LOCK_after_binlog_sync;
otherwise scheduling could allow the next group commit to run ahead of us,
messing up the order of commit_ordered() calls. But as soon as
- LOCK_commit_ordered is obtained, we can let the next group commit start.
+ LOCK_after_binlog_sync is obtained, we can let the next group commit start.
*/
mysql_mutex_unlock(&LOCK_log);
DEBUG_SYNC(leader->thd, "commit_after_release_LOCK_log");
+
+ /*
+ Loop through the queued threads and run the after_sync hook for each
+ */
+ {
+ /* documentation of which mutexes are (not) owned */
+ mysql_mutex_assert_not_owner(&LOCK_prepare_ordered);
+ mysql_mutex_assert_not_owner(&LOCK_log);
+ mysql_mutex_assert_owner(&LOCK_after_binlog_sync);
+ mysql_mutex_assert_not_owner(&LOCK_commit_ordered);
+
+ bool first= true, last;
+ for (current= queue; current != NULL; current= current->next)
+ {
+ last= current->next == NULL;
+ if (!current->error &&
+ RUN_HOOK(binlog_storage, after_sync,
+ (current->thd, log_file_name,
+ current->cache_mngr->last_commit_pos_offset,
+ first, last)))
+ {
+ /* error is already printed inside hook */
+ }
+ first= false;
+ }
+ }
+
+ DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_commit_ordered");
+ mysql_mutex_lock(&LOCK_commit_ordered);
+ last_commit_pos_offset= commit_offset;
+
+ /*
+ Unlock LOCK_after_binlog_sync only *after* LOCK_commit_ordered has been
+ acquired so that groups can not reorder for the different stages of
+ the group commit procedure.
+ */
+ mysql_mutex_unlock(&LOCK_after_binlog_sync);
+ DEBUG_SYNC(leader->thd, "commit_after_release_LOCK_after_binlog_sync");
++num_group_commits;
if (!opt_optimize_thread_scheduling)
diff --git a/sql/log.h b/sql/log.h
index 21af7d08959..9b6f365eea7 100644
--- a/sql/log.h
+++ b/sql/log.h
@@ -88,9 +88,11 @@ protected:
*/
extern mysql_mutex_t LOCK_prepare_ordered;
extern mysql_cond_t COND_prepare_ordered;
+extern mysql_mutex_t LOCK_after_binlog_sync;
extern mysql_mutex_t LOCK_commit_ordered;
#ifdef HAVE_PSI_INTERFACE
extern PSI_mutex_key key_LOCK_prepare_ordered, key_LOCK_commit_ordered;
+extern PSI_mutex_key key_LOCK_after_binlog_sync;
extern PSI_cond_key key_COND_prepare_ordered;
#endif
@@ -1157,4 +1159,6 @@ static inline TC_LOG *get_tc_log_implementation()
void assert_LOCK_log_owner(bool owner);
+void assert_LOCK_log_owner(bool owner);
+
#endif /* LOG_H */
diff --git a/sql/mysqld.cc b/sql/mysqld.cc
index 1bf91a9f965..07b4f1da3b1 100644
--- a/sql/mysqld.cc
+++ b/sql/mysqld.cc
@@ -890,6 +890,7 @@ PSI_mutex_key key_LOCK_stats,
key_LOCK_wakeup_ready, key_LOCK_wait_commit;
PSI_mutex_key key_LOCK_gtid_waiting;
+PSI_mutex_key key_LOCK_after_binlog_sync;
PSI_mutex_key key_LOCK_prepare_ordered, key_LOCK_commit_ordered,
key_LOCK_slave_init;
PSI_mutex_key key_TABLE_SHARE_LOCK_share;
@@ -954,6 +955,7 @@ static PSI_mutex_info all_server_mutexes[]=
{ &key_TABLE_SHARE_LOCK_share, "TABLE_SHARE::LOCK_share", 0},
{ &key_LOCK_error_messages, "LOCK_error_messages", PSI_FLAG_GLOBAL},
{ &key_LOCK_prepare_ordered, "LOCK_prepare_ordered", PSI_FLAG_GLOBAL},
+ { &key_LOCK_after_binlog_sync, "LOCK_after_binlog_sync", PSI_FLAG_GLOBAL},
{ &key_LOCK_commit_ordered, "LOCK_commit_ordered", PSI_FLAG_GLOBAL},
{ &key_LOCK_slave_init, "LOCK_slave_init", PSI_FLAG_GLOBAL},
{ &key_LOG_INFO_lock, "LOG_INFO::lock", 0},
@@ -2243,6 +2245,7 @@ static void clean_up_mutexes()
mysql_cond_destroy(&COND_server_started);
mysql_mutex_destroy(&LOCK_prepare_ordered);
mysql_cond_destroy(&COND_prepare_ordered);
+ mysql_mutex_destroy(&LOCK_after_binlog_sync);
mysql_mutex_destroy(&LOCK_commit_ordered);
mysql_mutex_destroy(&LOCK_slave_init);
mysql_cond_destroy(&COND_slave_init);
@@ -4535,6 +4538,8 @@ static int init_thread_environment()
mysql_mutex_init(key_LOCK_prepare_ordered, &LOCK_prepare_ordered,
MY_MUTEX_INIT_SLOW);
mysql_cond_init(key_COND_prepare_ordered, &COND_prepare_ordered, NULL);
+ mysql_mutex_init(key_LOCK_after_binlog_sync, &LOCK_after_binlog_sync,
+ MY_MUTEX_INIT_SLOW);
mysql_mutex_init(key_LOCK_commit_ordered, &LOCK_commit_ordered,
MY_MUTEX_INIT_SLOW);
mysql_mutex_init(key_LOCK_slave_init, &LOCK_slave_init,
diff --git a/sql/replication.h b/sql/replication.h
index 9f9cc9eadfc..4731c2246ef 100644
--- a/sql/replication.h
+++ b/sql/replication.h
@@ -81,6 +81,7 @@ typedef struct Trans_observer {
succeeded.
@note The return value is currently ignored by the server.
+ @note This hook is called without any global mutex held
@param param The parameter for transaction observers
@@ -103,6 +104,8 @@ typedef struct Trans_observer {
@param param The parameter for transaction observers
+ @note This hook is called without any global mutex held
+
@retval 0 Sucess
@retval 1 Failure
*/
@@ -114,7 +117,13 @@ typedef struct Trans_observer {
*/
enum Binlog_storage_flags {
/** Binary log was sync:ed */
- BINLOG_STORAGE_IS_SYNCED = 1
+ BINLOG_STORAGE_IS_SYNCED = 1,
+
+ /** First (or alone) in a group commit */
+ BINLOG_GROUP_COMMIT_LEADER = 2,
+
+ /** Last (or alone) in a group commit */
+ BINLOG_GROUP_COMMIT_TRAILER = 4
};
/**
@@ -137,6 +146,8 @@ typedef struct Binlog_storage_observer {
binary log file. Whether the binary log file is synchronized to
disk is indicated by the bit BINLOG_STORAGE_IS_SYNCED in @a flags.
+ @note: this hook is called with LOCK_log mutex held
+
@param param Observer common parameter
@param log_file Binlog file name been updated
@param log_pos Binlog position after update
@@ -148,6 +159,26 @@ typedef struct Binlog_storage_observer {
int (*after_flush)(Binlog_storage_param *param,
const char *log_file, my_off_t log_pos,
uint32 flags);
+
+ /**
+ This callback is called after binlog has been synced
+
+ This callback is called after events flushed to disk have been synced
+ ("group committed").
+
+ @note: this hook is called with LOCK_after_binlog_sync mutex held
+
+ @param param Observer common parameter
+ @param log_file Binlog file name been updated
+ @param log_pos Binlog position after update
+ @param flags flags for binlog storage
+
+ @retval 0 Success
+ @retval 1 Failure
+ */
+ int (*after_sync)(Binlog_storage_param *param,
+ const char *log_file, my_off_t log_pos,
+ uint32 flags);
} Binlog_storage_observer;
/**
diff --git a/sql/rpl_handler.cc b/sql/rpl_handler.cc
index 09e221e9bd5..2c388e572f9 100644
--- a/sql/rpl_handler.cc
+++ b/sql/rpl_handler.cc
@@ -252,12 +252,18 @@ int Trans_delegate::after_rollback(THD *thd, bool all)
int Binlog_storage_delegate::after_flush(THD *thd,
const char *log_file,
my_off_t log_pos,
- bool synced)
+ bool synced,
+ bool first_in_group,
+ bool last_in_group)
{
Binlog_storage_param param;
uint32 flags=0;
if (synced)
flags |= BINLOG_STORAGE_IS_SYNCED;
+ if (first_in_group)
+ flags|= BINLOG_GROUP_COMMIT_LEADER;
+ if (last_in_group)
+ flags|= BINLOG_GROUP_COMMIT_TRAILER;
Trans_binlog_info *log_info=
my_pthread_getspecific_ptr(Trans_binlog_info*, RPL_TRANS_BINLOG_INFO);
@@ -279,6 +285,27 @@ int Binlog_storage_delegate::after_flush(THD *thd,
return ret;
}
+int Binlog_storage_delegate::after_sync(THD *thd,
+ const char *log_file,
+ my_off_t log_pos,
+ bool first_in_group,
+ bool last_in_group)
+{
+ Binlog_storage_param param;
+ uint32 flags=0;
+
+ if (first_in_group)
+ flags|= BINLOG_GROUP_COMMIT_LEADER;
+ if (last_in_group)
+ flags|= BINLOG_GROUP_COMMIT_TRAILER;
+
+ int ret= 0;
+ FOREACH_OBSERVER(ret, after_sync, thd,
+ (&param, log_file+dirname_length(log_file), log_pos, flags));
+
+ return ret;
+}
+
#ifdef HAVE_REPLICATION
int Binlog_transmit_delegate::transmit_start(THD *thd, ushort flags,
const char *log_file,
diff --git a/sql/rpl_handler.h b/sql/rpl_handler.h
index e262ebdbd6b..afcfd9d55b1 100644
--- a/sql/rpl_handler.h
+++ b/sql/rpl_handler.h
@@ -153,7 +153,10 @@ class Binlog_storage_delegate
public:
typedef Binlog_storage_observer Observer;
int after_flush(THD *thd, const char *log_file,
- my_off_t log_pos, bool synced);
+ my_off_t log_pos, bool synced,
+ bool first_in_group, bool last_in_group);
+ int after_sync(THD *thd, const char *log_file, my_off_t log_pos,
+ bool first_in_group, bool last_in_group);
};
#ifdef HAVE_REPLICATION
diff --git a/sql/transaction.cc b/sql/transaction.cc
index 5127d241e85..3628790aa63 100644
--- a/sql/transaction.cc
+++ b/sql/transaction.cc
@@ -24,6 +24,7 @@
#include "rpl_handler.h"
#include "debug_sync.h" // DEBUG_SYNC
#include "sql_acl.h"
+#include "log.h" // for assert_LOCK_log_owner
/* Conditions under which the transaction state must not change. */
static bool trans_check(THD *thd)
@@ -232,6 +233,13 @@ bool trans_commit(THD *thd)
~(SERVER_STATUS_IN_TRANS | SERVER_STATUS_IN_TRANS_READONLY);
DBUG_PRINT("info", ("clearing SERVER_STATUS_IN_TRANS"));
res= ha_commit_trans(thd, TRUE);
+
+ /* documentation of which mutexes are (not) owned */
+ mysql_mutex_assert_not_owner(&LOCK_prepare_ordered);
+ assert_LOCK_log_owner(false);
+ mysql_mutex_assert_not_owner(&LOCK_after_binlog_sync);
+ mysql_mutex_assert_not_owner(&LOCK_commit_ordered);
+
if (WSREP_ON)
wsrep_post_commit(thd, TRUE);
/*
@@ -433,6 +441,12 @@ bool trans_commit_stmt(THD *thd)
}
}
+ /* documentation of which mutexes are (not) owned */
+ mysql_mutex_assert_not_owner(&LOCK_prepare_ordered);
+ assert_LOCK_log_owner(false);
+ mysql_mutex_assert_not_owner(&LOCK_after_binlog_sync);
+ mysql_mutex_assert_not_owner(&LOCK_commit_ordered);
+
/*
if res is non-zero, then ha_commit_trans has rolled back the
transaction, so the hooks for rollback will be called.