summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrei <andrei.elkin@mariadb.com>2021-12-14 20:33:52 +0200
committerAndrei <andrei.elkin@mariadb.com>2021-12-15 22:44:23 +0200
commitec1e3301e377295a9cbc5c2fdac14f2fd9b519d5 (patch)
tree02116a8e6680d7fd1902b33601f53651117b2596
parent51cc5fbb6d2b5454215c32c283da2604d1101204 (diff)
downloadmariadb-git-ec1e3301e377295a9cbc5c2fdac14f2fd9b519d5.tar.gz
MDEV-11675. Fixes FTWRL in between SA and CA, and some cleanup
A FTWRL hang reported in the previous commit's message is resolved. SA is notified by FTWRL that set it to a state start_alter_state::COMPLETED && direct_commit_alter <- true forcing a "graceful" no-error rollback. After FTWRL has done CA gets scheduled and behaves by general rules. That is it's first to wait for UNLOCK TABLES. That's what an included test demonstrates. Note, start_alter_state::COMPLETED is also reachable by another source of benign error, e.g SA deadlock. (Then SA retries and sets to the state after unsuccessfully running out of all attempts). To facilitate the above A. `rpl_parallel_entry` is augmented with `rli` member so FTWRL can find the SA states; (I picked `rli` not `mi` for some future) B. `Master_info` receives mem_root for itself, now merely for allocating links of sa_info list associated with an `mi`. Some insignificant cleanup is done around places involved in above.
-rw-r--r--sql/log.cc10
-rw-r--r--sql/log.h3
-rw-r--r--sql/log_event_server.cc53
-rw-r--r--sql/rpl_mi.cc2
-rw-r--r--sql/rpl_mi.h1
-rw-r--r--sql/rpl_parallel.cc37
-rw-r--r--sql/rpl_parallel.h4
-rw-r--r--sql/rpl_rli.h8
-rw-r--r--sql/sql_table.cc23
9 files changed, 105 insertions, 36 deletions
diff --git a/sql/log.cc b/sql/log.cc
index ad78f40d00f..0627bea45a8 100644
--- a/sql/log.cc
+++ b/sql/log.cc
@@ -597,8 +597,7 @@ private:
has been completed
*/
bool write_bin_log_start_alter(THD *thd, bool& partial_alter,
- uint64 start_alter_id, bool if_exists,
- MEM_ROOT *mem)
+ uint64 start_alter_id, bool if_exists)
{
#if defined(HAVE_REPLICATION)
if (start_alter_id)
@@ -606,6 +605,10 @@ bool write_bin_log_start_alter(THD *thd, bool& partial_alter,
if (thd->rgi_slave->get_finish_event_group_called())
return false; // can get here through retrying
+ DBUG_EXECUTE_IF("at_write_start_alter", {
+ debug_sync_set_action(thd,
+ STRING_WITH_LEN("now wait_for alter_cont"));
+ });
Master_info *mi= thd->rgi_slave->rli->mi;
start_alter_info *info= thd->rgi_slave->sa_info;
@@ -614,7 +617,7 @@ bool write_bin_log_start_alter(THD *thd, bool& partial_alter,
info->domain_id= thd->variables.gtid_domain_id;
info->state= start_alter_state::REGISTERED;
mysql_mutex_lock(&mi->start_alter_list_lock);
- mi->start_alter_list.push_back(info, mem);
+ mi->start_alter_list.push_back(info, &mi->mem_root);
mysql_mutex_unlock(&mi->start_alter_list_lock);
thd->rgi_slave->commit_orderer.wait_for_prior_commit(thd);
thd->rgi_slave->start_alter_ev->update_pos(thd->rgi_slave);
@@ -6569,6 +6572,7 @@ MYSQL_BIN_LOG::lookup_domain_in_binlog_state(uint32 domain_id,
return false;
}
+
int
MYSQL_BIN_LOG::bump_seq_no_counter_if_needed(uint32 domain_id, uint64 seq_no)
{
diff --git a/sql/log.h b/sql/log.h
index b55ed0f31f6..b4c82e0aed4 100644
--- a/sql/log.h
+++ b/sql/log.h
@@ -1266,5 +1266,8 @@ get_gtid_list_event(IO_CACHE *cache, Gtid_list_log_event **out_gtid_list);
int binlog_commit(THD *thd, bool all, bool is_ro_1pc= false);
int binlog_commit_by_xid(handlerton *hton, XID *xid);
int binlog_rollback_by_xid(handlerton *hton, XID *xid);
+bool write_bin_log_start_alter(THD *thd, bool& partial_alter,
+ uint64 start_alter_id, bool log_if_exists);
+
#endif /* LOG_H */
diff --git a/sql/log_event_server.cc b/sql/log_event_server.cc
index af4898dde58..8df37731ab8 100644
--- a/sql/log_event_server.cc
+++ b/sql/log_event_server.cc
@@ -1730,6 +1730,7 @@ int Query_log_event::handle_split_alter_query_log_event(rpl_group_info *rgi,
}
start_alter_info *info=NULL;
Master_info *mi= NULL;
+ bool is_direct= false;
rgi->gtid_ev_sa_seq_no= sa_seq_no;
// is set for both the direct execution and the write to binlog
@@ -1743,6 +1744,10 @@ int Query_log_event::handle_split_alter_query_log_event(rpl_group_info *rgi,
if(info->sa_seq_no == rgi->gtid_ev_sa_seq_no &&
info->domain_id == rgi->current_gtid.domain_id)
{
+ is_direct= info->direct_commit_alter;
+
+ DBUG_ASSERT(!is_direct || info->state == start_alter_state::COMPLETED);
+
info_iterator.remove();
break;
}
@@ -1750,7 +1755,7 @@ int Query_log_event::handle_split_alter_query_log_event(rpl_group_info *rgi,
}
mysql_mutex_unlock(&mi->start_alter_list_lock);
- if (!info )
+ if (!info || is_direct)
{
if (is_CA)
{
@@ -1759,7 +1764,7 @@ int Query_log_event::handle_split_alter_query_log_event(rpl_group_info *rgi,
wait for master reply in mysql_alter_table (in wait_for_master)
*/
rgi->direct_commit_alter= true;
- return rc;
+ goto cleanup;
}
else
{
@@ -1769,28 +1774,34 @@ int Query_log_event::handle_split_alter_query_log_event(rpl_group_info *rgi,
}
mysql_mutex_lock(&mi->start_alter_lock);
+ if (info->state != start_alter_state::COMPLETED)
+ {
+ if (is_CA)
+ info->state= start_alter_state::COMMIT_ALTER;
+ else
+ info->state= start_alter_state::ROLLBACK_ALTER;
+ mysql_cond_broadcast(&info->start_alter_cond);
+ mysql_mutex_unlock(&mi->start_alter_lock);
+ /*
+ Wait till Start Alter worker has changed the state to ::COMPLETED
+ when start alter worker reaches the old code write_bin_log(), it will
+ change state to COMMITTED.
+ COMMITTED and `direct_commit_alter == true` at the same time indicates
+ the query needs re-execution by the CA running thread.
+ */
+ mysql_mutex_lock(&mi->start_alter_lock);
- DBUG_ASSERT(info->state == start_alter_state::REGISTERED);
+ DBUG_ASSERT(info->state == start_alter_state::COMPLETED ||
+ !info->direct_commit_alter);
- if (is_CA)
- info->state= start_alter_state::COMMIT_ALTER;
+ while(info->state != start_alter_state::COMPLETED)
+ mysql_cond_wait(&info->start_alter_cond, &mi->start_alter_lock);
+ }
else
- info->state= start_alter_state::ROLLBACK_ALTER;
- mysql_cond_broadcast(&info->start_alter_cond);
- mysql_mutex_unlock(&mi->start_alter_lock);
-
- /*
- Wait till Start Alter worker has changed the state to ::COMPLETED
- when start alter worker reaches the old code write_bin_log(), it will
- change state to COMMITTED
- */
- mysql_mutex_lock(&mi->start_alter_lock);
-
- DBUG_ASSERT(info->state == start_alter_state::COMPLETED ||
- !info->direct_commit_alter);
-
- while(info->state != start_alter_state::COMPLETED)
- mysql_cond_wait(&info->start_alter_cond, &mi->start_alter_lock);
+ {
+ // SA has completed and left being kicked out by deadlock or ftwrl
+ DBUG_ASSERT(info->direct_commit_alter);
+ }
mysql_mutex_unlock(&mi->start_alter_lock);
if (info->direct_commit_alter)
diff --git a/sql/rpl_mi.cc b/sql/rpl_mi.cc
index 00ea7743810..99c7242d400 100644
--- a/sql/rpl_mi.cc
+++ b/sql/rpl_mi.cc
@@ -100,6 +100,7 @@ Master_info::Master_info(LEX_CSTRING *connection_name_arg,
mysql_cond_init(key_master_info_start_cond, &start_cond, NULL);
mysql_cond_init(key_master_info_stop_cond, &stop_cond, NULL);
mysql_cond_init(key_master_info_sleep_cond, &sleep_cond, NULL);
+ init_sql_alloc(PSI_INSTRUMENT_ME, &mem_root, MEM_ROOT_BLOCK_SIZE, 0, MYF(0));
}
@@ -135,6 +136,7 @@ Master_info::~Master_info()
mysql_cond_destroy(&start_cond);
mysql_cond_destroy(&stop_cond);
mysql_cond_destroy(&sleep_cond);
+ free_root(&mem_root, MYF(0));
}
/**
diff --git a/sql/rpl_mi.h b/sql/rpl_mi.h
index 44508e48c0d..be275558625 100644
--- a/sql/rpl_mi.h
+++ b/sql/rpl_mi.h
@@ -353,6 +353,7 @@ class Master_info : public Slave_reporting_capability
*/
int semi_ack;
List <start_alter_info> start_alter_list;
+ MEM_ROOT mem_root;
};
struct start_alter_thd_args
diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc
index 0f86404c799..591446a3309 100644
--- a/sql/rpl_parallel.cc
+++ b/sql/rpl_parallel.cc
@@ -576,6 +576,7 @@ rpl_pause_for_ftwrl(THD *thd)
uint32 i;
rpl_parallel_thread_pool *pool= &global_rpl_thread_pool;
int err;
+ Dynamic_array<Master_info*> mi_arr(4, 4); // array of replication source mi:s
DBUG_ENTER("rpl_pause_for_ftwrl");
/*
@@ -627,6 +628,33 @@ rpl_pause_for_ftwrl(THD *thd)
mysql_cond_wait(&e->COND_parallel_entry, &e->LOCK_parallel_entry);
};
--e->need_sub_id_signal;
+
+ /*
+ Notify any source any domain waiting-for-master Start-Alter to give way.
+ */
+ Master_info *mi= e->rli->mi;
+ bool found= false;
+ for (uint i= 0; i < mi_arr.elements() && !found; i++)
+ found= mi_arr.at(i) == mi;
+ if (!found)
+ {
+ mi_arr.append(mi);
+ start_alter_info *info=NULL;
+ mysql_mutex_lock(&mi->start_alter_list_lock);
+ List_iterator<start_alter_info> info_iterator(mi->start_alter_list);
+ while ((info= info_iterator++))
+ {
+ mysql_mutex_lock(&mi->start_alter_lock);
+
+ DBUG_ASSERT(info->state == start_alter_state::REGISTERED);
+
+ info->state= start_alter_state::ROLLBACK_ALTER;
+ info->direct_commit_alter= true;
+ mysql_cond_broadcast(&info->start_alter_cond);
+ mysql_mutex_unlock(&mi->start_alter_lock);
+ }
+ mysql_mutex_unlock(&mi->start_alter_list_lock);
+ }
thd->EXIT_COND(&old_stage);
if (err)
break;
@@ -2494,7 +2522,7 @@ rpl_parallel::~rpl_parallel()
rpl_parallel_entry *
-rpl_parallel::find(uint32 domain_id)
+rpl_parallel::find(uint32 domain_id, Relay_log_info *rli)
{
struct rpl_parallel_entry *e;
@@ -2521,6 +2549,7 @@ rpl_parallel::find(uint32 domain_id)
e->stop_on_error_sub_id= (uint64)ULONGLONG_MAX;
e->pause_sub_id= (uint64)ULONGLONG_MAX;
e->pending_start_alters= 0;
+ e->rli= rli;
if (my_hash_insert(&domain_hash, (uchar *)e))
{
my_free(e);
@@ -2531,7 +2560,11 @@ rpl_parallel::find(uint32 domain_id)
mysql_cond_init(key_COND_parallel_entry, &e->COND_parallel_entry, NULL);
}
else
+ {
+ DBUG_ASSERT(rli == e->rli);
+
e->force_abort= false;
+ }
return e;
}
@@ -2965,7 +2998,7 @@ rpl_parallel::do_event(rpl_group_info *serial_rgi, Log_event *ev,
uint32 domain_id= (rli->mi->using_gtid == Master_info::USE_GTID_NO ||
rli->mi->parallel_mode <= SLAVE_PARALLEL_MINIMAL ?
0 : gtid_ev->domain_id);
- if (!(e= find(domain_id)))
+ if (!(e= find(domain_id, rli)))
{
my_error(ER_OUT_OF_RESOURCES, MYF(MY_WME));
delete ev;
diff --git a/sql/rpl_parallel.h b/sql/rpl_parallel.h
index e54fd2abb48..66c7fc9f316 100644
--- a/sql/rpl_parallel.h
+++ b/sql/rpl_parallel.h
@@ -429,6 +429,8 @@ struct rpl_parallel_entry {
uint64 count_committing_event_groups;
/* The group_commit_orderer object for the events currently being queued. */
group_commit_orderer *current_gco;
+ /* Relay log info of replication source for this entry. */
+ Relay_log_info *rli;
rpl_parallel_thread * choose_thread(rpl_group_info *rgi, bool *did_enter_cond,
PSI_stage_info *old_stage,
@@ -447,7 +449,7 @@ struct rpl_parallel {
rpl_parallel();
~rpl_parallel();
void reset();
- rpl_parallel_entry *find(uint32 domain_id);
+ rpl_parallel_entry *find(uint32 domain_id, Relay_log_info *rli);
void wait_for_done(THD *thd, Relay_log_info *rli);
void stop_during_until();
bool workers_idle();
diff --git a/sql/rpl_rli.h b/sql/rpl_rli.h
index 9f3de7f6be7..80ee143a8e8 100644
--- a/sql/rpl_rli.h
+++ b/sql/rpl_rli.h
@@ -653,11 +653,11 @@ enum start_alter_state
struct start_alter_info
{
/*
- Unique among replication channel at one point of time
- */
- uint64 sa_seq_no; //key for searching
+ ALTER id is defined as a pair of GTID's seq_no and domain_id.
+ */
+ decltype(rpl_gtid::seq_no) sa_seq_no; // key for searching (SA's id)
uint32 domain_id;
- bool direct_commit_alter;
+ bool direct_commit_alter; // when true CA thread executes the whole query
/*
0 prepared and not error from commit and rollback
>0 error expected in commit/rollback
diff --git a/sql/sql_table.cc b/sql/sql_table.cc
index 9b01e9c6985..2a85b7b4949 100644
--- a/sql/sql_table.cc
+++ b/sql/sql_table.cc
@@ -94,12 +94,12 @@ static int mysql_prepare_create_table(THD *, HA_CREATE_INFO *, Alter_info *,
static uint blob_length_by_type(enum_field_types type);
static bool fix_constraints_names(THD *, List<Virtual_column_info> *,
const HA_CREATE_INFO *);
-bool write_bin_log_start_alter(THD *thd, bool& partial_alter,
- uint64 start_alter_id, bool log_if_exists,
- MEM_ROOT *mem);
static bool wait_for_master(THD *thd);
static int process_master_state(THD *thd, int alter_result,
uint64 &start_alter_id);
+static bool
+write_bin_log_start_alter_rollback(THD *thd, uint64 &start_alter_id,
+ bool &partial_alter, bool if_exists);
/**
@brief Helper function for explain_filename
@@ -7328,6 +7328,19 @@ write_bin_log_start_alter_rollback(THD *thd, uint64 &start_alter_id,
*/
return true;
}
+ if (info->direct_commit_alter)
+ {
+ DBUG_ASSERT(info->state == start_alter_state::ROLLBACK_ALTER);
+
+ /*
+ SA may end up in the rollback state through FTWRL that breaks
+ SA's waiting for a master decision.
+ Then it completes "officially", and `direct_commit_alter` true status
+ will affect the future of CA to re-execute the whole query.
+ */
+ info->state= start_alter_state::COMPLETED;
+ return true; // not really an error to be handled by caller specifically
+ }
/*
We have to call wait for master here because in main calculation
we can error out before calling wait for master
@@ -7489,7 +7502,7 @@ static bool mysql_inplace_alter_table(THD *thd,
if (table->s->tmp_table == NO_TMP_TABLE)
if (write_bin_log_start_alter(thd, partial_alter, start_alter_id,
- if_exists, &table->s->mem_root))
+ if_exists))
goto cleanup;
DBUG_EXECUTE_IF("start_alter_kill_after_binlog", {
@@ -10630,7 +10643,7 @@ do_continue:;
if (table->s->tmp_table == NO_TMP_TABLE)
if (write_bin_log_start_alter(thd, partial_alter, start_alter_id,
- if_exists, &table->s->mem_root))
+ if_exists))
goto err_new_table_cleanup;
DBUG_EXECUTE_IF("start_alter_delay_master", {