diff options
author | sjaakola <seppo.jaakola@iki.fi> | 2021-10-21 14:49:51 +0300 |
---|---|---|
committer | Jan Lindström <jan.lindstrom@mariadb.com> | 2021-10-29 07:57:18 +0300 |
commit | db50ea3ad324c98e5adf144842304f2095954ab3 (patch) | |
tree | b100bf0faf642477be6f756d24136406538b43c9 /storage | |
parent | c8b39f7ee2a9910fae2f03563b3ea337c45132c2 (diff) | |
download | mariadb-git-db50ea3ad324c98e5adf144842304f2095954ab3.tar.gz |
MDEV-23328 Server hang due to Galera lock conflict resolution
Mutex order violation when wsrep bf thread kills a conflicting trx,
the stack is
wsrep_thd_LOCK()
wsrep_kill_victim()
lock_rec_other_has_conflicting()
lock_clust_rec_read_check_and_lock()
row_search_mvcc()
ha_innobase::index_read()
ha_innobase::rnd_pos()
handler::ha_rnd_pos()
handler::rnd_pos_by_record()
handler::ha_rnd_pos_by_record()
Rows_log_event::find_row()
Update_rows_log_event::do_exec_row()
Rows_log_event::do_apply_event()
Log_event::apply_event()
wsrep_apply_events()
and mutexes are taken in the order
lock_sys->mutex -> victim_trx->mutex -> victim_thread->LOCK_thd_data
When a normal KILL statement is executed, the stack is
innobase_kill_query()
kill_handlerton()
plugin_foreach_with_mask()
ha_kill_query()
THD::awake()
kill_one_thread()
and mutexes are
victim_thread->LOCK_thd_data -> lock_sys->mutex -> victim_trx->mutex
This patch is the plan D variant for fixing potetial mutex locking
order exercised by BF aborting and KILL command execution.
In this approach, KILL command is replicated as TOI operation.
This guarantees total isolation for the KILL command execution
in the first node: there is no concurrent replication applying
and no concurrent DDL executing. Therefore there is no risk of
BF aborting to happen in parallel with KILL command execution
either. Potential mutex deadlocks between the different mutex
access paths with KILL command execution and BF aborting cannot
therefore happen.
TOI replication is used, in this approach, purely as means
to provide isolated KILL command execution in the first node.
KILL command should not (and must not) be applied in secondary
nodes. In this patch, we make this sure by skipping KILL
execution in secondary nodes, in applying phase, where we
bail out if applier thread is trying to execute KILL command.
This is effective, but skipping the applying of KILL command
could happen much earlier as well.
This also fixed unprotected calls to wsrep_thd_abort
that will use wsrep_abort_transaction. This is fixed
by holding THD::LOCK_thd_data while we abort transaction.
Reviewed-by: Jan Lindström <jan.lindstrom@mariadb.com>
Diffstat (limited to 'storage')
-rw-r--r-- | storage/innobase/handler/ha_innodb.cc | 325 | ||||
-rw-r--r-- | storage/innobase/include/ha_prototypes.h | 3 | ||||
-rw-r--r-- | storage/innobase/lock/lock0wait.cc | 18 |
3 files changed, 196 insertions, 150 deletions
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 57e7ec236c0..8fb52b211c6 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -5233,17 +5233,18 @@ UNIV_INTERN void lock_cancel_waiting_and_release(lock_t* lock); @sa THD::awake() @sa ha_kill_query() */ static void innobase_kill_query(handlerton*, THD* thd, enum thd_kill_levels) { - DBUG_ENTER("innobase_kill_query"); + DBUG_ENTER("innobase_kill_query"); #ifdef WITH_WSREP - if (wsrep_thd_get_conflict_state(thd) != NO_CONFLICT) { - /* if victim has been signaled by BF thread and/or aborting - is already progressing, following query aborting is not necessary - any more. - Also, BF thread should own trx mutex for the victim, which would - conflict with trx_mutex_enter() below - */ - DBUG_VOID_RETURN; - } + if (wsrep_thd_get_conflict_state(thd) != NO_CONFLICT) + { + /* if victim has been signaled by BF thread and/or aborting + is already progressing, following query aborting is not necessary + any more. */ + WSREP_DEBUG("Victim thread %ld bail out conflict_state %s query %s", + thd_get_thread_id(thd), + wsrep_thd_conflict_state_str(thd), wsrep_thd_query(thd)); + DBUG_VOID_RETURN; + } #endif /* WITH_WSREP */ if (trx_t* trx= thd_to_trx(thd)) @@ -19497,70 +19498,66 @@ static struct st_mysql_storage_engine innobase_storage_engine= { MYSQL_HANDLERTON_INTERFACE_VERSION }; #ifdef WITH_WSREP +static void wsrep_abort_slave_trx( -/*==================*/ - wsrep_seqno_t bf_seqno, - wsrep_seqno_t victim_seqno) -{ - WSREP_ERROR("Trx %lld tries to abort slave trx %lld. This could be " - "caused by:\n\t" - "1) unsupported configuration options combination, please check documentation.\n\t" - "2) a bug in the code.\n\t" - "3) a database corruption.\n Node consistency compromized, " - "need to abort. Restart the node to resync with cluster.", - (long long)bf_seqno, (long long)victim_seqno); - abort(); -} -/*******************************************************************//** -This function is used to kill one transaction in BF. */ -UNIV_INTERN + THD* bf_thd, + THD* victim_thd) +{ + wsrep_seqno_t bf_seqno= wsrep_thd_trx_seqno(bf_thd); + wsrep_seqno_t victim_seqno= wsrep_thd_trx_seqno(victim_thd); + + WSREP_ERROR("wsrep_abort_slave_trx: BF Aborter %s thread: %ld " + "seqno: %lld query_state: %s conflict_state: %s " + "exec mode %s query: %s", + wsrep_thd_is_BF(bf_thd, false) ? "BF" : "normal", + thd_get_thread_id(bf_thd), + bf_seqno, + wsrep_thd_query_state_str(bf_thd), + wsrep_thd_conflict_state_str(bf_thd), + wsrep_thd_exec_mode_str(bf_thd), + wsrep_thd_query(bf_thd)); + + WSREP_ERROR("wsrep_abort_slave_trx: Victim %s thread: %ld " + "seqno: %lld query_state: %s conflict_state: %s " + "exec mode %s query: %s", + wsrep_thd_is_BF(victim_thd, false) ? "BF" : "normal", + thd_get_thread_id(victim_thd), + wsrep_thd_trx_seqno(victim_thd), + wsrep_thd_query_state_str(victim_thd), + wsrep_thd_conflict_state_str(victim_thd), + wsrep_thd_exec_mode_str(victim_thd), + wsrep_thd_query(victim_thd)); + + WSREP_ERROR("Trx %lld tries to abort slave trx %lld. This could be " + "caused by:\n\t" + "1) unsupported configuration options combination, please check documentation.\n\t" + "2) a bug in the code.\n\t" + "3) a database corruption.\n Node consistency compromized, " + "need to abort. Restart the node to resync with cluster.", + (long long)bf_seqno, (long long)victim_seqno); + abort(); +} + +/** This function is used to kill one transaction in BF. */ +static void -wsrep_innobase_kill_one_trx( -/*========================*/ +wsrep_kill_victim( MYSQL_THD const bf_thd, - const trx_t * const bf_trx, - trx_t *victim_trx, - ibool signal) + const trx_t* const bf_trx, + MYSQL_THD thd, + trx_t* victim_trx, + my_bool signal) { - ut_ad(bf_thd); - ut_ad(victim_trx); - ut_ad(lock_mutex_own()); - ut_ad(trx_mutex_own(victim_trx)); - - DBUG_ENTER("wsrep_innobase_kill_one_trx"); - THD *thd = (THD *) victim_trx->mysql_thd; - int64_t bf_seqno = wsrep_thd_trx_seqno(bf_thd); - - if (!thd) { - DBUG_PRINT("wsrep", ("no thd for conflicting lock")); - WSREP_WARN("no THD for trx: " TRX_ID_FMT, victim_trx->id); - DBUG_VOID_RETURN; - } - - WSREP_LOG_CONFLICT(bf_thd, thd, TRUE); - - WSREP_DEBUG("BF kill (" ULINTPF ", seqno: " INT64PF - "), victim: (%lu) trx: " TRX_ID_FMT, - signal, bf_seqno, - thd_get_thread_id(thd), - victim_trx->id); - - WSREP_DEBUG("Aborting query: %s conf %d trx: %" PRId64, - (thd && wsrep_thd_query(thd)) ? wsrep_thd_query(thd) : "void", - wsrep_thd_conflict_state(thd, FALSE), - wsrep_thd_ws_handle(thd)->trx_id); + ut_ad(bf_thd); + ut_ad(thd); + ut_ad(victim_trx); + ut_ad(lock_mutex_own()); + ut_ad(trx_mutex_own(victim_trx)); - wsrep_thd_LOCK(thd); - DBUG_EXECUTE_IF("sync.wsrep_after_BF_victim_lock", - { - const char act[]= - "now " - "wait_for signal.wsrep_after_BF_victim_lock"; - DBUG_ASSERT(!debug_sync_set_action(bf_thd, - STRING_WITH_LEN(act))); - };); + DBUG_ENTER("wsrep_kill_victim"); + const int64_t bf_seqno= wsrep_thd_trx_seqno(bf_thd); if (wsrep_thd_query_state(thd) == QUERY_EXITING) { WSREP_DEBUG("kill trx EXITING for " TRX_ID_FMT, @@ -19570,27 +19567,32 @@ wsrep_innobase_kill_one_trx( } if (wsrep_thd_exec_mode(thd) != LOCAL_STATE) { - WSREP_DEBUG("withdraw for BF trx: " TRX_ID_FMT ", state: %d", + WSREP_DEBUG("withdraw for BF trx: " TRX_ID_FMT + ", state: %s exec %s", victim_trx->id, - wsrep_thd_get_conflict_state(thd)); + wsrep_thd_conflict_state_str(thd), + wsrep_thd_exec_mode_str(thd)); } switch (wsrep_thd_get_conflict_state(thd)) { case NO_CONFLICT: + /* This will cause any call to innobase_kill_query() + for this thd to bail out. */ wsrep_thd_set_conflict_state(thd, MUST_ABORT); break; case MUST_ABORT: WSREP_DEBUG("victim " TRX_ID_FMT " in MUST ABORT state", victim_trx->id); - wsrep_thd_UNLOCK(thd); wsrep_thd_awake(thd, signal); + wsrep_thd_UNLOCK(thd); DBUG_VOID_RETURN; break; case ABORTED: case ABORTING: // fall through default: - WSREP_DEBUG("victim " TRX_ID_FMT " in state %d", - victim_trx->id, wsrep_thd_get_conflict_state(thd)); + WSREP_DEBUG("victim " TRX_ID_FMT " in state %s", + victim_trx->id, + wsrep_thd_conflict_state_str(thd)); wsrep_thd_UNLOCK(thd); DBUG_VOID_RETURN; break; @@ -19598,6 +19600,7 @@ wsrep_innobase_kill_one_trx( switch (wsrep_thd_query_state(thd)) { case QUERY_COMMITTING: + { enum wsrep_status rcode; WSREP_DEBUG("kill query for: %ld", @@ -19606,8 +19609,7 @@ wsrep_innobase_kill_one_trx( victim_trx->id); if (wsrep_thd_exec_mode(thd) == REPL_RECV) { - wsrep_abort_slave_trx(bf_seqno, - wsrep_thd_trx_seqno(thd)); + wsrep_abort_slave_trx(bf_thd, thd); } else { wsrep_t *wsrep= get_wsrep(); rcode = wsrep->abort_pre_commit( @@ -19620,8 +19622,8 @@ wsrep_innobase_kill_one_trx( WSREP_DEBUG("cancel commit warning: " TRX_ID_FMT, victim_trx->id); - wsrep_thd_UNLOCK(thd); wsrep_thd_awake(thd, signal); + wsrep_thd_UNLOCK(thd); DBUG_VOID_RETURN; break; case WSREP_OK: @@ -19639,21 +19641,21 @@ wsrep_innobase_kill_one_trx( break; } } - wsrep_thd_UNLOCK(thd); wsrep_thd_awake(thd, signal); + wsrep_thd_UNLOCK(thd); break; + } case QUERY_EXEC: + { /* it is possible that victim trx is itself waiting for some * other lock. We need to cancel this waiting */ WSREP_DEBUG("kill trx QUERY_EXEC for " TRX_ID_FMT, victim_trx->id); - victim_trx->lock.was_chosen_as_deadlock_victim= TRUE; - if (victim_trx->lock.wait_lock) { WSREP_DEBUG("victim has wait flag: %ld", - thd_get_thread_id(thd)); + thd_get_thread_id(thd)); lock_t* wait_lock = victim_trx->lock.wait_lock; if (wait_lock) { @@ -19662,107 +19664,166 @@ wsrep_innobase_kill_one_trx( lock_cancel_waiting_and_release(wait_lock); } - wsrep_thd_UNLOCK(thd); wsrep_thd_awake(thd, signal); + wsrep_thd_UNLOCK(thd); } else { /* abort currently executing query */ - DBUG_PRINT("wsrep",("sending KILL_QUERY to: %lu", - thd_get_thread_id(thd))); WSREP_DEBUG("kill query for: %ld", - thd_get_thread_id(thd)); - /* Note that innobase_kill_query will take lock_mutex - and trx_mutex */ - wsrep_thd_UNLOCK(thd); - wsrep_thd_awake(thd, signal); + thd_get_thread_id(thd)); /* for BF thd, we need to prevent him from committing */ if (wsrep_thd_exec_mode(thd) == REPL_RECV) { - wsrep_abort_slave_trx(bf_seqno, - wsrep_thd_trx_seqno(thd)); + wsrep_abort_slave_trx(bf_thd, thd); } + + /* Note that innobase_kill_query will take lock_mutex + and trx_mutex */ + wsrep_thd_awake(thd, signal); + wsrep_thd_UNLOCK(thd); } break; + } case QUERY_IDLE: { WSREP_DEBUG("kill IDLE for " TRX_ID_FMT, victim_trx->id); if (wsrep_thd_exec_mode(thd) == REPL_RECV) { WSREP_DEBUG("kill BF IDLE, seqno: %lld", - (long long)wsrep_thd_trx_seqno(thd)); - wsrep_thd_UNLOCK(thd); - wsrep_abort_slave_trx(bf_seqno, - wsrep_thd_trx_seqno(thd)); - DBUG_VOID_RETURN; + wsrep_thd_trx_seqno(thd)); + wsrep_abort_slave_trx(bf_thd, thd); } - /* This will lock thd from proceeding after net_read() */ - wsrep_thd_set_conflict_state(thd, ABORTING); + /* This will lock thd from proceeding after net_read() and + will cause any call to innobase_kill_query() for this + thd to bail out. */ + wsrep_thd_set_conflict_state(thd, ABORTING); wsrep_lock_rollback(); if (wsrep_aborting_thd_contains(thd)) { WSREP_WARN("duplicate thd aborter %lu", - (ulong) thd_get_thread_id(thd)); + thd_get_thread_id(thd)); } else { wsrep_aborting_thd_enqueue(thd); - DBUG_PRINT("wsrep",("enqueuing trx abort for %lu", - thd_get_thread_id(thd))); WSREP_DEBUG("enqueuing trx abort for (%lu)", - thd_get_thread_id(thd)); + thd_get_thread_id(thd)); } - DBUG_PRINT("wsrep",("signalling wsrep rollbacker")); WSREP_DEBUG("signaling aborter"); wsrep_unlock_rollback(); wsrep_thd_UNLOCK(thd); - break; } default: WSREP_WARN("bad wsrep query state: %d", wsrep_thd_query_state(thd)); - wsrep_thd_UNLOCK(thd); - break; + ut_error; } - DBUG_VOID_RETURN; } +/******************************************************************* +This function is used to kill one transaction in BF. */ +void +wsrep_innobase_kill_one_trx( + MYSQL_THD const bf_thd, + const trx_t * const bf_trx, + trx_t *victim_trx, + my_bool signal) +{ + ut_ad(bf_thd); + ut_ad(victim_trx); + ut_ad(lock_mutex_own()); + ut_ad(trx_mutex_own(victim_trx)); + + DBUG_ENTER("wsrep_innobase_kill_one_trx"); + THD *thd= (THD *) victim_trx->mysql_thd; + + /* Here we need to lock THD::LOCK_thd_data to protect from + concurrent usage or disconnect or delete. */ + DEBUG_SYNC(bf_thd, "wsrep_before_BF_victim_lock"); + wsrep_thd_LOCK(thd); + DEBUG_SYNC(bf_thd, "wsrep_after_BF_victim_lock"); + + WSREP_LOG_CONFLICT(bf_thd, thd, TRUE); + + WSREP_DEBUG("wsrep_innobase_kill_one_trx: Aborter %s " + "trx_id: " TRX_ID_FMT " thread: %ld " + "seqno: %lld query_state: %s conflict_state: %s " + "exec mode %s query: %s", + wsrep_thd_is_BF(bf_thd, false) ? "BF" : "normal", + bf_trx ? bf_trx->id : TRX_ID_MAX, + thd_get_thread_id(bf_thd), + wsrep_thd_trx_seqno(bf_thd), + wsrep_thd_query_state_str(bf_thd), + wsrep_thd_conflict_state_str(bf_thd), + wsrep_thd_exec_mode_str(bf_thd), + wsrep_thd_query(bf_thd)); + + WSREP_DEBUG("wsrep_innobase_kill_one_trx: Victim %s " + "trx_id: " TRX_ID_FMT " thread: %ld " + "seqno: %lld query_state: %s conflict_state: %s " + "exec mode %s query: %s", + wsrep_thd_is_BF(thd, false) ? "BF" : "normal", + victim_trx->id, + thd_get_thread_id(thd), + wsrep_thd_trx_seqno(thd), + wsrep_thd_query_state_str(thd), + wsrep_thd_conflict_state_str(thd), + wsrep_thd_exec_mode_str(thd), + wsrep_thd_query(thd)); + + wsrep_kill_victim(bf_thd, bf_trx, thd, victim_trx, signal); + DBUG_VOID_RETURN; +} + static void wsrep_abort_transaction( -/*====================*/ handlerton* hton, THD *bf_thd, THD *victim_thd, my_bool signal) { - DBUG_ENTER("wsrep_abort_transaction"); - - trx_t* victim_trx = thd_to_trx(victim_thd); - trx_t* bf_trx = (bf_thd) ? thd_to_trx(bf_thd) : NULL; - - WSREP_DEBUG("abort transaction: BF: %s victim: %s victim conf: %d", - wsrep_thd_query(bf_thd), - wsrep_thd_query(victim_thd), - wsrep_thd_conflict_state(victim_thd, FALSE)); - - if (victim_trx) { - lock_mutex_enter(); - trx_mutex_enter(victim_trx); - wsrep_innobase_kill_one_trx(bf_thd, bf_trx, victim_trx, signal); - lock_mutex_exit(); - trx_mutex_exit(victim_trx); - wsrep_srv_conc_cancel_wait(victim_trx); - DBUG_VOID_RETURN; - } else { - WSREP_DEBUG("victim does not have transaction"); - wsrep_thd_LOCK(victim_thd); - wsrep_thd_set_conflict_state(victim_thd, MUST_ABORT); - wsrep_thd_UNLOCK(victim_thd); - wsrep_thd_awake(victim_thd, signal); - } + DBUG_ENTER("wsrep_abort_transaction"); + /* Note that victim thd is protected with + THD::LOCK_thd_data here. */ + trx_t* victim_trx= thd_to_trx(victim_thd); + trx_t* bf_trx= thd_to_trx(bf_thd); + + WSREP_DEBUG("wsrep_abort_transaction: BF:" + " thread %ld query_state %s conflict_state %s" + " exec %s query %s trx " TRX_ID_FMT, + thd_get_thread_id(bf_thd), + wsrep_thd_query_state_str(bf_thd), + wsrep_thd_conflict_state_str(bf_thd), + wsrep_thd_exec_mode_str(bf_thd), + wsrep_thd_query(bf_thd), + bf_trx ? bf_trx->id : 0); + + WSREP_DEBUG("wsrep_abort_transaction: victim:" + " thread %ld query_state %s conflict_state %s" + " exec %s query %s trx " TRX_ID_FMT, + thd_get_thread_id(victim_thd), + wsrep_thd_query_state_str(victim_thd), + wsrep_thd_conflict_state_str(victim_thd), + wsrep_thd_exec_mode_str(victim_thd), + wsrep_thd_query(victim_thd), + victim_trx ? victim_trx->id : 0); + + if (victim_trx) { + lock_mutex_enter(); + trx_mutex_enter(victim_trx); + wsrep_kill_victim(bf_thd, bf_trx, victim_thd, victim_trx, signal); + lock_mutex_exit(); + trx_mutex_exit(victim_trx); + wsrep_srv_conc_cancel_wait(victim_trx); + } else { + wsrep_thd_set_conflict_state(victim_thd, MUST_ABORT); + wsrep_thd_awake(victim_thd, signal); + wsrep_thd_UNLOCK(victim_thd); + } - DBUG_VOID_RETURN; + DBUG_VOID_RETURN; } static diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h index 3eab2135969..427e57f09d2 100644 --- a/storage/innobase/include/ha_prototypes.h +++ b/storage/innobase/include/ha_prototypes.h @@ -233,12 +233,11 @@ innobase_casedn_str( char* a); /*!< in/out: string to put in lower case */ #ifdef WITH_WSREP -UNIV_INTERN void wsrep_innobase_kill_one_trx(MYSQL_THD const thd_ptr, const trx_t * const bf_trx, trx_t *victim_trx, - ibool signal); + my_bool signal); int wsrep_innobase_mysql_sort(int mysql_type, uint charset_number, unsigned char* str, unsigned int str_length, unsigned int buf_length); diff --git a/storage/innobase/lock/lock0wait.cc b/storage/innobase/lock/lock0wait.cc index df1488b9df3..7da5e16a762 100644 --- a/storage/innobase/lock/lock0wait.cc +++ b/storage/innobase/lock/lock0wait.cc @@ -184,13 +184,11 @@ lock_wait_table_reserve_slot( check if lock timeout was for priority thread, as a side effect trigger lock monitor @param[in] trx transaction owning the lock -@param[in] locked true if trx and lock_sys_mutex is ownd @return false for regular lock timeout */ static bool wsrep_is_BF_lock_timeout( - const trx_t* trx, - bool locked = true) + const trx_t* trx) { bool long_wait= (trx->error_state != DB_DEADLOCK && trx->is_wsrep() && @@ -204,18 +202,6 @@ wsrep_is_BF_lock_timeout( ib::info() << "WSREP: BF lock wait long for trx:" << trx->id << " query: " << wsrep_thd_query(trx->mysql_thd); - if (!locked) - lock_mutex_enter(); - - ut_ad(lock_mutex_own()); - - wsrep_trx_print_locking(stderr, trx, 3000); - /* Note this will release lock_sys mutex */ - lock_print_info_all_transactions(stderr); - - if (locked) - lock_mutex_enter(); - return was_wait; } else return false; @@ -407,7 +393,7 @@ lock_wait_suspend_thread( && wait_time > (double) lock_wait_timeout #ifdef WITH_WSREP && (!trx->is_wsrep() - || (!wsrep_is_BF_lock_timeout(trx, false) + || (!wsrep_is_BF_lock_timeout(trx) && trx->error_state != DB_DEADLOCK)) #endif /* WITH_WSREP */ ) { |