summaryrefslogtreecommitdiff
path: root/sql
diff options
context:
space:
mode:
authorSachin <sachinsetia1001@gmail.com>2020-07-16 14:24:30 +0530
committerSachin <sachinsetia1001@gmail.com>2020-08-04 11:28:26 +0530
commite3c18b8e849373821b9c009b285ae13ef0fcc1a8 (patch)
treeb8f61f0467fe47681f2d428b075bcceac92c6213 /sql
parent8bca92c8845212ea96be404e664f4cbb45f93e2d (diff)
downloadmariadb-git-e3c18b8e849373821b9c009b285ae13ef0fcc1a8.tar.gz
MDEV-23089 rpl_parallel2 fails in 10.5
Problem:- rpl_parallel2 was failing non-deterministically. Analysis:- When FLUSH TABLES WITH READ LOCK is executed, it will allow all worker threads to complete their ongoing transactions and then it will pause them. In this state FTWRL will proceed to acquire the global read lock. FTWRL first blocks threads from starting new commits, then upgrades the lock to block commit of existing transactions. Step1: FLUSH TABLES WITH READ LOCK - Blocks new commits. Step2: * STOP SLAVE command enables 'force_abort=1' which unblocks workers, so they continue to execute events. * T1: Waits in 'record_gtid' call to update 'gtid_slave_pos' table with its current GTID, but it is blocked because of Step1. * T2: Holds COMMIT lock and waits for T1 to commit. Step3: FLUSH TABLES WITH READ LOCK - Waiting to get BLOCK_COMMIT. This results in deadlock. When STOP SLAVE command allows paused workers to proceed, workers should skip the execution of all further events, similar to 'conservative' parallel mode. Solution:- We will assign 1 to skip_event_group when we are aborted in do_ftwrl_wait. rpl_parallel_entry->pause_sub_id is only reset when force_abort is off in rpl_unpause_after_ftwrl.
Diffstat (limited to 'sql')
-rw-r--r--sql/rpl_parallel.cc27
1 files changed, 23 insertions, 4 deletions
diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc
index fb6f23af295..e58729ebbf3 100644
--- a/sql/rpl_parallel.cc
+++ b/sql/rpl_parallel.cc
@@ -369,13 +369,14 @@ do_gco_wait(rpl_group_info *rgi, group_commit_orderer *gco,
}
-static void
+static bool
do_ftwrl_wait(rpl_group_info *rgi,
bool *did_enter_cond, PSI_stage_info *old_stage)
{
THD *thd= rgi->thd;
rpl_parallel_entry *entry= rgi->parallel_entry;
uint64 sub_id= rgi->gtid_sub_id;
+ bool aborted= false;
DBUG_ENTER("do_ftwrl_wait");
mysql_mutex_assert_owner(&entry->LOCK_parallel_entry);
@@ -397,7 +398,10 @@ do_ftwrl_wait(rpl_group_info *rgi,
do
{
if (entry->force_abort || rgi->worker_error)
+ {
+ aborted= true;
break;
+ }
if (thd->check_killed())
{
thd->send_kill_message();
@@ -417,7 +421,7 @@ do_ftwrl_wait(rpl_group_info *rgi,
if (sub_id > entry->largest_started_sub_id)
entry->largest_started_sub_id= sub_id;
- DBUG_VOID_RETURN;
+ DBUG_RETURN(aborted);
}
@@ -500,7 +504,22 @@ rpl_unpause_after_ftwrl(THD *thd)
mysql_mutex_lock(&e->LOCK_parallel_entry);
rpt->pause_for_ftwrl = false;
mysql_mutex_unlock(&rpt->LOCK_rpl_thread);
- e->pause_sub_id= (uint64)ULONGLONG_MAX;
+ /*
+ Do not change pause_sub_id if force_abort is set.
+ force_abort is set in case of STOP SLAVE.
+
+ Reason: If pause_sub_id is not changed and force_abort is set,
+ any parallel slave thread waiting in do_ftwrl_wait() will
+ on wakeup return from do_ftwrl_wait() with 1. This will set
+ skip_event_group to 1 in handle_rpl_parallel_thread() and the
+ parallel thread will abort at once.
+
+ If pause_sub_id is changed, the code in handle_rpl_parallel_thread()
+ would continue to execute the transaction in the queue, which would
+ cause some transactions to be lost.
+ */
+ if (!e->force_abort)
+ e->pause_sub_id= (uint64)ULONGLONG_MAX;
mysql_cond_broadcast(&e->COND_parallel_entry);
mysql_mutex_unlock(&e->LOCK_parallel_entry);
}
@@ -1155,7 +1174,7 @@ handle_rpl_parallel_thread(void *arg)
rgi->worker_error= 1;
}
if (likely(!skip_event_group))
- do_ftwrl_wait(rgi, &did_enter_cond, &old_stage);
+ skip_event_group= do_ftwrl_wait(rgi, &did_enter_cond, &old_stage);
/*
Register ourself to wait for the previous commit, if we need to do