summaryrefslogtreecommitdiff
path: root/sql/rpl_parallel.cc
diff options
context:
space:
mode:
author: Kristian Nielsen <knielsen@knielsen-hq.org> 2015-04-08 11:01:18 +0200
committer: Kristian Nielsen <knielsen@knielsen-hq.org> 2015-04-08 11:01:18 +0200
commit: 3b961347db2b2ad1d31cf64829a6d0e31795e158 (patch)
tree: 8790bb8d47eec70ce414064db808ee64656320d1 /sql/rpl_parallel.cc
parent: 880f2273fdc39cd1a2ab28f448cdfbf3d6581af2 (diff)
download: mariadb-git-3b961347db2b2ad1d31cf64829a6d0e31795e158.tar.gz
MDEV-7888, MDEV-7929: Parallel replication hangs sometimes on ANALYZE TABLE or DDL
The hangs occur when the group_commit_orderer object is freed before the last mark_start_commit() call on it - this loses the wakeup to other waiting worker threads, causing them to hang until killed manually. The object was freed because wakeup_subsequent_commits() was called too early in two places: for MDEV-7888, during ANALYZE TABLE, and for MDEV-7929, during record_gtid() after processing a DDL event. The group_commit_orderer object can be freed when its last transaction has called wait_for_prior_commit(). Fix by implementing a suspend/resume mechanism for wakeup_subsequent_commits() that can be used in places where a transaction is committed without this being the commit of the actual replication event group. Also add a protection mechanism (that asserts in debug builds) which can prevent the too-early free and hang if other similar bugs should remain in other parts of the code.
Diffstat (limited to 'sql/rpl_parallel.cc')
-rw-r--r-- sql/rpl_parallel.cc 18
1 files changed, 17 insertions, 1 deletions
diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc
index 7df1ea3bd4b..48b6ad0ef89 100644
--- a/sql/rpl_parallel.cc
+++ b/sql/rpl_parallel.cc
@@ -171,8 +171,24 @@ finish_event_group(rpl_parallel_thread *rpt, uint64 sub_id,
/* Now free any GCOs in which all transactions have committed. */
group_commit_orderer *tmp_gco= rgi->gco;
while (tmp_gco &&
- (!tmp_gco->next_gco || tmp_gco->last_sub_id > sub_id))
+ (!tmp_gco->next_gco || tmp_gco->last_sub_id > sub_id ||
+ tmp_gco->next_gco->wait_count > entry->count_committing_event_groups))
+ {
+ /*
+ We must not free a GCO before the wait_count of the following GCO has
+ been reached and wakeup has been sent. Otherwise we will lose the
+ wakeup and hang (there were several such bugs in the past).
+
+ The intention is that this is ensured already since we only free when
+ the last event group in the GCO has committed
+ (tmp_gco->last_sub_id <= sub_id). However, if we have a bug, we have
+ extra check on next_gco->wait_count to hopefully avoid hanging; we
+ have here an assertion in debug builds that this check does not in
+ fact trigger.
+ */
+ DBUG_ASSERT(!tmp_gco->next_gco || tmp_gco->last_sub_id > sub_id);
tmp_gco= tmp_gco->prev_gco;
+ }
while (tmp_gco)
{
group_commit_orderer *prev_gco= tmp_gco->prev_gco;