summaryrefslogtreecommitdiff
path: root/sql/rpl_gtid.cc
diff options
context:
space:
mode:
authorKristian Nielsen <knielsen@knielsen-hq.org>2015-04-08 11:01:18 +0200
committerKristian Nielsen <knielsen@knielsen-hq.org>2015-04-08 11:01:18 +0200
commit3b961347db2b2ad1d31cf64829a6d0e31795e158 (patch)
tree8790bb8d47eec70ce414064db808ee64656320d1 /sql/rpl_gtid.cc
parent880f2273fdc39cd1a2ab28f448cdfbf3d6581af2 (diff)
downloadmariadb-git-3b961347db2b2ad1d31cf64829a6d0e31795e158.tar.gz
MDEV-7888, MDEV-7929: Parallel replication hangs sometimes on ANALYZE TABLE or DDL
The hangs occur when the group_commit_orderer object is freed before the last mark_start_commit() call on it - this loses the wakeup to other waiting worker threads, causing them to hang until killed manually. The object was freed because wakeup_subsequent_commits() was called too early in two places: for MDEV-7888, during ANALYZE TABLE, and for MDEV-7929, during record_gtid() after processing a DDL event. The group_commit_orderer object can be freed when its last transaction has called wait_for_prior_commit(). Fix by implementing a suspend/resume mechanism for wakeup_subsequent_commits() that can be used in places where a transaction is committed without this being the commit of the actual replication event group. Also add a protection mechanism (that asserts in debug builds) which can prevent the too-early free and hang if other similar bugs should remain in other parts of the code.
Diffstat (limited to 'sql/rpl_gtid.cc')
-rw-r--r--sql/rpl_gtid.cc29
1 files changed, 29 insertions, 0 deletions
diff --git a/sql/rpl_gtid.cc b/sql/rpl_gtid.cc
index e5620ec41a2..08df804ac4c 100644
--- a/sql/rpl_gtid.cc
+++ b/sql/rpl_gtid.cc
@@ -515,6 +515,7 @@ rpl_slave_state::record_gtid(THD *thd, const rpl_gtid *gtid, uint64 sub_id,
element *elem;
ulonglong thd_saved_option= thd->variables.option_bits;
Query_tables_list lex_backup;
+ wait_for_commit* suspended_wfc;
DBUG_ENTER("record_gtid");
if (unlikely(!loaded))
@@ -538,6 +539,28 @@ rpl_slave_state::record_gtid(THD *thd, const rpl_gtid *gtid, uint64 sub_id,
DBUG_RETURN(1);
} );
+ /*
+ If we are applying a non-transactional event group, we will be committing
+ a transaction here, but that does not imply that the event group has
+ completed or has been binlogged. So we should not trigger
+ wakeup_subsequent_commits() here.
+
+ Note: An alternative here could be to put a call to mark_start_commit() in
+ stmt_done() before the call to record_and_update_gtid(). This would
+ prevent later calling mark_start_commit() after we have run
+ wakeup_subsequent_commits() from committing the GTID update transaction
+ (which must be avoided to avoid accessing freed group_commit_orderer
+ object). It would also allow following event groups to start slightly
+ earlier. And in the cases where record_gtid() is called without an active
+ transaction, the current statement should have been binlogged already, so
+ binlog order is preserved.
+
+ But this is rather subtle, and potentially fragile. And it does not really
+ seem worth it; non-transactional loads are unlikely to benefit much from
+ parallel replication in any case. So for now, we go with the simple
+ suspend/resume of wakeup_subsequent_commits() here in record_gtid().
+ */
+ suspended_wfc= thd->suspend_subsequent_commits();
thd->lex->reset_n_backup_query_tables_list(&lex_backup);
tlist.init_one_table(STRING_WITH_LEN("mysql"),
rpl_gtid_slave_state_table_name.str,
@@ -689,6 +712,12 @@ end:
}
thd->lex->restore_backup_query_tables_list(&lex_backup);
thd->variables.option_bits= thd_saved_option;
+ thd->resume_subsequent_commits(suspended_wfc);
+ DBUG_EXECUTE_IF("inject_record_gtid_serverid_100_sleep",
+ {
+ if (gtid->server_id == 100)
+ my_sleep(500000);
+ });
DBUG_RETURN(err);
}