diff options
author | Kristian Nielsen <knielsen@knielsen-hq.org> | 2015-06-29 11:41:06 +0200 |
---|---|---|
committer | Kristian Nielsen <knielsen@knielsen-hq.org> | 2015-06-29 11:52:03 +0200 |
commit | 888b6cd31c256785f6c4dfc97d3107332e900aee (patch) | |
tree | a6bd6606e8130a230c81a19adeaf901a73f24cba | |
parent | a6087e7dc1ef3561d8189c8db15e9591d0f9b520 (diff) | |
download | mariadb-git-10.0-custombld.tar.gz |
MDEV-8302: Duplicate key with parallel replication (branch: 10.0-custombld)
Intermediate patch to try to track down the root cause of the problem by
running a custom binary on the user's actual load.
This patch adds some printouts in the error log, all prefixed with
"MDEV8302:", that will hopefully trigger when the error condition occurs and
give more information about what is happening.
In addition, this patch implements what might or might not be a fix for the
error (along with a printout to show if the fix actually triggers).
The fix is to add a check for thd->killed just before a transaction does
mark_start_commit(). This could help reduce the chance of a transaction
detecting a deadlock kill only after running mark_start_commit(), which
might be a source of the original duplicate key problem (though no concrete
sequence of events is currently known to be able to cause this condition).
Finally, this patch marks the generated binaries with a -mdev8302a version
postfix to help identify them.
-rw-r--r-- | VERSION | 1 | ||||
-rw-r--r-- | sql/rpl_parallel.cc | 56 | ||||
-rw-r--r-- | sql/sql_class.cc | 8 |
3 files changed, 61 insertions, 4 deletions
@@ -1,3 +1,4 @@ MYSQL_VERSION_MAJOR=10 MYSQL_VERSION_MINOR=0 MYSQL_VERSION_PATCH=20 +MYSQL_VERSION_EXTRA=mdev8302a diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc index 305e8053032..9fcd6b48d9f 100644 --- a/sql/rpl_parallel.cc +++ b/sql/rpl_parallel.cc @@ -190,6 +190,17 @@ finish_event_group(rpl_parallel_thread *rpt, uint64 sub_id, fact trigger. */ DBUG_ASSERT(!tmp_gco->next_gco || tmp_gco->last_sub_id > sub_id); + if (!(!tmp_gco->next_gco || tmp_gco->last_sub_id > sub_id)) + fprintf(stderr, "MDEV8302: GTID %u-%u-%lu subid=%lu skipping free of " + "GCO(wait_count=%lu prior=%lu last=%lu installed=%d) due to " + "next->wait_count %lu > count_committing %lu\n", + rgi->current_gtid.domain_id, rgi->current_gtid.server_id, + (ulong)rgi->current_gtid.seq_no, (ulong)rgi->gtid_sub_id, + (ulong)tmp_gco->wait_count, (ulong)tmp_gco->prior_sub_id, + (ulong)tmp_gco->last_sub_id, (int)tmp_gco->installed, + (ulong)tmp_gco->next_gco->wait_count, + (ulong)entry->count_committing_event_groups); + tmp_gco= tmp_gco->prev_gco; } while (tmp_gco) @@ -304,6 +315,11 @@ convert_kill_to_deadlock_error(rpl_group_info *rgi) if ((err_code == ER_QUERY_INTERRUPTED || err_code == ER_CONNECTION_KILLED) && rgi->killed_for_retry) { + fprintf(stderr, "MDEV8302: Got deadlock kill in GTID %u-%u-%lu " + "(subid %lu in_commit=%d)\n", rgi->current_gtid.domain_id, + rgi->current_gtid.server_id, + (ulong)rgi->current_gtid.seq_no, (ulong)rgi->gtid_sub_id, + (int)rgi->did_mark_start_commit); thd->clear_error(); my_error(ER_LOCK_DEADLOCK, MYF(0)); rgi->killed_for_retry= false; @@ -343,6 +359,9 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt, Format_description_log_event *description_event= NULL; do_retry: + fprintf(stderr, "MDEV8302: Retry #%lu of GTID %u-%u-%lu\n", retries+1, + rgi->current_gtid.domain_id, rgi->current_gtid.server_id, + (ulong)rgi->current_gtid.seq_no); event_count= 0; err= 0; errmsg= NULL; @@ -884,9 +903,28 @@ handle_rpl_parallel_thread(void *arg) group_ending= 
is_group_ending(qev->ev, event_type); if (group_ending && likely(!rgi->worker_error)) { - DEBUG_SYNC(thd, "rpl_parallel_before_mark_start_commit"); - rgi->mark_start_commit(); - DEBUG_SYNC(thd, "rpl_parallel_after_mark_start_commit"); + /* + Do an extra check for (deadlock) kill here. This helps prevent a + lingering deadlock kill that occured during normal DML processing to + propagate past the mark_start_commit(). If we detect a deadlock only + after mark_start_commit(), we have to unmark, which has at least a + theoretical possibility of leaving a window where it looks like all + transactions in a GCO have started committing, while in fact one + will need to rollback and retry. This is not supposed to be possible + (since there is a deadlock, at least one transaction should be + blocked from reaching commit), but this seems a fragile ensurance, + and there were historically a number of subtle bugs in this area. + */ + if (!thd->killed) + { + DEBUG_SYNC(thd, "rpl_parallel_before_mark_start_commit"); + rgi->mark_start_commit(); + DEBUG_SYNC(thd, "rpl_parallel_after_mark_start_commit"); + } + else + fprintf(stderr, "MDEV8302: Skip mark_start_commit(GTID %u-%u-%lu) " + "due to killed\n", rgi->current_gtid.domain_id, + rgi->current_gtid.server_id, (ulong)rgi->current_gtid.seq_no); } /* @@ -911,7 +949,17 @@ handle_rpl_parallel_thread(void *arg) }); if (!err) #endif - err= rpt_handle_event(qev, rpt); + { + if (thd->check_killed()) + { + thd->clear_error(); + thd->get_stmt_da()->reset_diagnostics_area(); + thd->send_kill_message(); + err= 1; + } + else + err= rpt_handle_event(qev, rpt); + } delete_or_keep_event_post_apply(rgi, event_type, qev->ev); DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_gtid_0_x_100", err= dbug_simulate_tmp_error(rgi, thd);); diff --git a/sql/sql_class.cc b/sql/sql_class.cc index cf0c4a1b84f..f7416fcce6d 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -4309,6 +4309,14 @@ thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd) cause 
replication to rollback (and later re-try) the other transaction, releasing the lock for this transaction so replication can proceed. */ + fprintf(stderr, "MDEV8302: Deadlock kill GTID %u-%u-%lu (subid %lu " + "in_commit=%d) due to blocking GTID %u-%u-%lu (subid %lu " + "in_commit=%d)\n", other_rgi->current_gtid.domain_id, + other_rgi->current_gtid.server_id, + (ulong)other_rgi->current_gtid.seq_no, (ulong)other_rgi->gtid_sub_id, + (int)other_rgi->did_mark_start_commit, rgi->current_gtid.domain_id, + rgi->current_gtid.server_id, (ulong)rgi->current_gtid.seq_no, + (ulong)rgi->gtid_sub_id, (int)rgi->did_mark_start_commit); other_rgi->killed_for_retry= true; mysql_mutex_lock(&other_thd->LOCK_thd_data); other_thd->awake(KILL_CONNECTION); |