summaryrefslogtreecommitdiff
path: root/sql/rpl_parallel.h
diff options
context:
space:
mode:
author: Kristian Nielsen <knielsen@knielsen-hq.org> 2015-01-07 14:45:39 +0100
committer: Kristian Nielsen <knielsen@knielsen-hq.org> 2015-01-07 14:45:39 +0100
commit: f27817c1d0e6d81392470e9086624e88ae08b11f (patch)
tree: 07143fafd819462ef1baf0d451d5537f1a60610b /sql/rpl_parallel.h
parent: 4a3251595cc697bfdb15b67c07514bd3c4779e37 (diff)
download: mariadb-git-f27817c1d0e6d81392470e9086624e88ae08b11f.tar.gz
MDEV-7326: Server deadlock in connection with parallel replication
The bug occurs when a transaction does a retry after all transactions have done mark_start_commit() in a batch of group commit from the master. In this case, the retrying transaction can unmark_start_commit() after the following batch has already started running and de-allocated the GCO. Then after retry, the transaction will re-do mark_start_commit() on a de-allocated GCO, and also wakeup of later GCOs can be lost. This was seen "in the wild" by a user, even though it is not known exactly what circumstances can lead to retry of one transaction after all transactions in a group have reached the commit phase. The lifetime around GCO was somewhat clunky anyway. With this patch, a GCO lives until rpl_parallel_entry::last_committed_sub_id has reached the last transaction in the GCO. This guarantees that the GCO will still be alive when a transaction does mark_start_commit(). Also, we now loop over the list of active GCOs for wakeup, to ensure we do not lose a wakeup even in the problematic case.
Diffstat (limited to 'sql/rpl_parallel.h')
-rw-r--r-- sql/rpl_parallel.h 22
1 files changed, 18 insertions, 4 deletions
diff --git a/sql/rpl_parallel.h b/sql/rpl_parallel.h
index 239818855b8..2604cd98527 100644
--- a/sql/rpl_parallel.h
+++ b/sql/rpl_parallel.h
@@ -39,9 +39,12 @@ struct inuse_relaylog;
rpl_parallel_entry::count_committing_event_groups has reached
gco->next_gco->wait_count.
- - When gco->wait_count is reached for a worker and the wait completes,
- the worker frees gco->prev_gco; at this point it is guaranteed not to
- be needed any longer.
+ - The gco lives until all its event groups have completed their commit.
+ This is detected by rpl_parallel_entry::last_committed_sub_id being
+ greater than or equal to gco->last_sub_id. Once this happens, the gco is
+ freed. Note that since update of last_committed_sub_id can happen
+ out-of-order, the thread that frees a given gco can be for any later
+ event group, not necessarily an event group from the gco being freed.
*/
struct group_commit_orderer {
/* Wakeup condition, used with rpl_parallel_entry::LOCK_parallel_entry. */
@@ -49,6 +52,16 @@ struct group_commit_orderer {
uint64 wait_count;
group_commit_orderer *prev_gco;
group_commit_orderer *next_gco;
+ /*
+ The sub_id of the last event group in the previous GCO.
+ Only valid if prev_gco != NULL.
+ */
+ uint64 prior_sub_id;
+ /*
+ The sub_id of the last event group in this GCO. Only valid when next_gco
+ is non-NULL.
+ */
+ uint64 last_sub_id;
bool installed;
};
@@ -168,7 +181,8 @@ struct rpl_parallel_thread {
LOCK_rpl_thread mutex.
*/
void free_rgi(rpl_group_info *rgi);
- group_commit_orderer *get_gco(uint64 wait_count, group_commit_orderer *prev);
+ group_commit_orderer *get_gco(uint64 wait_count, group_commit_orderer *prev,
+ uint64 first_sub_id);
/*
Put a gco on the local free list, to be later released to the global free
list by batch_free().