diff options
author | unknown <knielsen@knielsen-hq.org> | 2012-06-22 11:46:28 +0200 |
---|---|---|
committer | unknown <knielsen@knielsen-hq.org> | 2012-06-22 11:46:28 +0200 |
commit | 0697ee265fa4fe4d617dc96194fcf2532fd73402 (patch) | |
tree | 2da975668ffb4a36601eccb1b852a10dc9650bca /sql | |
parent | 9fe317ffd6e3545e7896389641b74c2977a5e4b4 (diff) | |
download | mariadb-git-0697ee265fa4fe4d617dc96194fcf2532fd73402.tar.gz |
MDEV-181: XID crash recovery across binlog boundaries
Keep track of how many pending XIDs (transactions that are prepared in
storage engine and written into binlog, but not yet durably committed
on disk in the engine) there are in each binlog.
When the count of one binlog drops to zero, write a new binlog checkpoint
event, telling which is the oldest binlog with pending XIDs.
When doing XA recovery after a crash, check the last binlog checkpoint
event, and scan all binlog files from that point onwards for XIDs that
must be committed if found in prepared state inside engine.
Remove the code in binlog rotation that waits for all prepared XIDs to
be committed before writing a new binlog file (this is no longer necessary
when recovery can scan multiple binlog files).
Diffstat (limited to 'sql')
-rw-r--r-- | sql/handler.cc | 1 | ||||
-rw-r--r-- | sql/log.cc | 526 | ||||
-rw-r--r-- | sql/log.h | 53 | ||||
-rw-r--r-- | sql/log_event.cc | 88 | ||||
-rw-r--r-- | sql/log_event.h | 35 | ||||
-rw-r--r-- | sql/mysqld.cc | 8 | ||||
-rw-r--r-- | sql/mysqld.h | 4 | ||||
-rw-r--r-- | sql/sql_repl.cc | 24 |
8 files changed, 635 insertions, 104 deletions
diff --git a/sql/handler.cc b/sql/handler.cc index 58d37532796..e7eddef55ed 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -1286,6 +1286,7 @@ int ha_commit_trans(THD *thd, bool all) if (!cookie) goto err; + DEBUG_SYNC(thd, "ha_commit_trans_after_log_and_order"); DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_SUICIDE();); error= commit_one_phase_2(thd, all, trans, is_real_trans) ? 2 : 0; diff --git a/sql/log.cc b/sql/log.cc index 56c07f81c9e..430f0a0bf60 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -479,6 +479,7 @@ public: */ bool using_xa; my_xid xa_xid; + ulong cookie; private: @@ -1665,6 +1666,21 @@ binlog_flush_cache(THD *thd, binlog_cache_mngr *cache_mngr, end_ev, all, using_stmt, using_trx); } + else + { + /* + This can happen in row-format binlog with something like + BEGIN; INSERT INTO nontrans_table; INSERT IGNORE INTO trans_table; + The nontrans_table is written directly into the binlog before commit, + and if the trans_table is ignored there will be no rows to write when + we get here. + + So there is no work to do. Therefore, we will not increment any XID + count, so we must not decrement any XID count in unlog(). + */ + if (cache_mngr->using_xa && cache_mngr->xa_xid) + cache_mngr->cookie= BINLOG_COOKIE_DUMMY; + } cache_mngr->reset(using_stmt, using_trx); DBUG_ASSERT((!using_stmt || cache_mngr->stmt_cache.empty()) && @@ -2888,7 +2904,8 @@ const char *MYSQL_LOG::generate_name(const char *log_name, MYSQL_BIN_LOG::MYSQL_BIN_LOG(uint *sync_period) - :bytes_written(0), prepared_xids(0), file_id(1), open_count(1), + :current_binlog_id(BINLOG_COOKIE_START), reset_master_pending(false), + bytes_written(0), file_id(1), open_count(1), need_start_event(TRUE), group_commit_queue(0), group_commit_queue_busy(FALSE), num_commits(0), num_group_commits(0), @@ -2916,13 +2933,30 @@ void MYSQL_BIN_LOG::cleanup() DBUG_ENTER("cleanup"); if (inited) { + xid_count_per_binlog *b; + inited= 0; close(LOG_CLOSE_INDEX|LOG_CLOSE_STOP_EVENT); delete description_event_for_queue; delete description_event_for_exec; + + while ((b= binlog_xid_count_list.get())) + { + /* + There should be no pending XIDs at shutdown, and only one entry (for + the active binlog file) in the list. + */ + DBUG_ASSERT(b->xid_count == 0); + DBUG_ASSERT(!binlog_xid_count_list.head()); + my_free(b); + } + mysql_mutex_destroy(&LOCK_log); mysql_mutex_destroy(&LOCK_index); + mysql_mutex_destroy(&LOCK_xid_list); mysql_cond_destroy(&update_cond); + mysql_cond_destroy(&COND_queue_busy); + mysql_cond_destroy(&COND_xid_list); } DBUG_VOID_RETURN; } @@ -2944,8 +2978,11 @@ void MYSQL_BIN_LOG::init_pthread_objects() MYSQL_LOG::init_pthread_objects(); mysql_mutex_init(m_key_LOCK_index, &LOCK_index, MY_MUTEX_INIT_SLOW); mysql_mutex_setflags(&LOCK_index, MYF_NO_DEADLOCK_DETECTION); + mysql_mutex_init(key_BINLOG_LOCK_xid_list, + &LOCK_xid_list, MY_MUTEX_INIT_FAST); mysql_cond_init(m_key_update_cond, &update_cond, 0); mysql_cond_init(m_key_COND_queue_busy, &COND_queue_busy, 0); + mysql_cond_init(key_BINLOG_COND_xid_list, &COND_xid_list, 0); } @@ -3149,6 +3186,51 @@ bool MYSQL_BIN_LOG::open(const char *log_name, if (s.write(&log_file)) goto err; bytes_written+= s.data_written; + + if (!is_relay_log) + { + char buf[FN_REFLEN]; + /* + Put this one into the list of active binlogs. + Write the current binlog checkpoint into the log, so XA recovery will + know from where to start recovery. + */ + uint off= dirname_length(log_file_name); + uint len= strlen(log_file_name) - off; + char *entry_mem, *name_mem; + xid_count_per_binlog *b, *b2; + if (!(b = (xid_count_per_binlog *) + my_multi_malloc(MYF(MY_WME), + &entry_mem, sizeof(xid_count_per_binlog), + &name_mem, len, + NULL))) + goto err; + memcpy(name_mem, log_file_name+off, len); + b->binlog_name= name_mem; + b->binlog_name_len= len; + b->xid_count= 0; + + mysql_mutex_lock(&LOCK_xid_list); + b->binlog_id= ++current_binlog_id; + + /* + Remove any initial entries with no pending XIDs. + Normally this will be done in unlog(), but if there are no + transactions with an XA-capable engine at all in a given binlog + file, unlog() will never be used and we will remove the entry here. + */ + while ((b2= binlog_xid_count_list.head()) && b2->xid_count == 0) + my_free(binlog_xid_count_list.get()); + + binlog_xid_count_list.push_back(b); + b2= binlog_xid_count_list.head(); + strmake(buf, b2->binlog_name, b2->binlog_name_len); + mysql_mutex_unlock(&LOCK_xid_list); + Binlog_checkpoint_log_event ev(buf, len); + if (ev.write(&log_file)) + goto err; + bytes_written+= ev.data_written; + } } if (description_event_for_queue && description_event_for_queue->binlog_version>=4) @@ -3514,6 +3596,42 @@ bool MYSQL_BIN_LOG::reset_logs(THD* thd) mysql_mutex_lock(&LOCK_log); mysql_mutex_lock(&LOCK_index); + if (!is_relay_log) + { + /* + We are going to nuke all binary log files. + So first wait until all pending binlog checkpoints have completed. + */ + mysql_mutex_lock(&LOCK_xid_list); + xid_count_per_binlog *b; + reset_master_pending= true; + for (;;) + { + I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list); + while ((b= it++)) + { + if (b->xid_count > 0) + break; + } + if (!b) + break; /* No more pending XIDs */ + /* + Wait until signalled that one more binlog dropped to zero, then check + again. + */ + mysql_cond_wait(&COND_xid_list, &LOCK_xid_list); + } + + /* + Now all XIDs are fully flushed to disk, and we are holding LOCK_log so + no new ones will be written. So we can proceed to delete the logs. + */ + while ((b= binlog_xid_count_list.get())) + my_free(b); + reset_master_pending= false; + mysql_mutex_unlock(&LOCK_xid_list); + } + /* The following mutex is needed to ensure that no threads call 'delete thd' as we would then risk missing a 'rollback' from this @@ -3824,8 +3942,7 @@ int MYSQL_BIN_LOG::purge_logs(const char *to_log, if ((error=find_log_pos(&log_info, NullS, 0 /*no mutex*/))) goto err; while ((strcmp(to_log,log_info.log_file_name) || (exit_loop=included)) && - !is_active(log_info.log_file_name) && - !log_in_use(log_info.log_file_name)) + can_purge_log(log_info.log_file_name)) { if ((error= register_purge_index_entry(log_info.log_file_name))) { @@ -4175,8 +4292,7 @@ int MYSQL_BIN_LOG::purge_logs_before_date(time_t purge_time) goto err; while (strcmp(log_file_name, log_info.log_file_name) && - !is_active(log_info.log_file_name) && - !log_in_use(log_info.log_file_name)) + can_purge_log(log_info.log_file_name)) { if (!mysql_file_stat(m_key_file_log, log_info.log_file_name, &stat_area, MYF(0))) @@ -4231,6 +4347,28 @@ err: mysql_mutex_unlock(&LOCK_index); DBUG_RETURN(error); } + + +bool +MYSQL_BIN_LOG::can_purge_log(const char *log_file_name) +{ + xid_count_per_binlog *b; + + if (is_active(log_file_name)) + return false; + mysql_mutex_lock(&LOCK_xid_list); + { + I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list); + while ((b= it++) && + 0 != strncmp(log_file_name+dirname_length(log_file_name), + b->binlog_name, b->binlog_name_len)) + ; + } + mysql_mutex_unlock(&LOCK_xid_list); + if (b) + return false; + return !log_in_use(log_file_name); +} #endif /* HAVE_REPLICATION */ @@ -4324,26 +4462,6 @@ int MYSQL_BIN_LOG::new_file_impl(bool need_lock) mysql_mutex_assert_owner(&LOCK_log); mysql_mutex_assert_owner(&LOCK_index); - /* - if binlog is used as tc log, be sure all xids are "unlogged", - so that on recover we only need to scan one - latest - binlog file - for prepared xids. As this is expected to be a rare event, - simple wait strategy is enough. We're locking LOCK_log to be sure no - new Xid_log_event's are added to the log (and prepared_xids is not - increased), and waiting on COND_prep_xids for late threads to - catch up. - */ - if (prepared_xids) - { - tc_log_page_waits++; - mysql_mutex_lock(&LOCK_prep_xids); - while (prepared_xids) { - DBUG_PRINT("info", ("prepared_xids=%lu", prepared_xids)); - mysql_cond_wait(&COND_prep_xids, &LOCK_prep_xids); - } - mysql_mutex_unlock(&LOCK_prep_xids); - } - /* Reuse old name if not binlog and not update log */ new_name_ptr= name; @@ -5792,6 +5910,37 @@ bool MYSQL_BIN_LOG::write_incident(THD *thd) DBUG_RETURN(error); } +void +MYSQL_BIN_LOG::write_binlog_checkpoint_event_already_locked(const char *name, + uint len) +{ + Binlog_checkpoint_log_event ev(name, len); + /* + Note that we must sync the binlog checkpoint to disk. + Otherwise a subsequent log purge could delete binlogs that XA recovery + thinks are needed (even though they are not really). + */ + if (!ev.write(&log_file) && !flush_and_sync(0)) + { + bool check_purge= false; + signal_update(); + rotate(false, &check_purge); + if (check_purge) + purge(); + return; + } + + /* + If we fail to write the checkpoint event, something is probably really + bad with the binlog. We complain in the error log. + Note that failure to write binlog checkpoint does not compromise the + ability to do crash recovery - crash recovery will just have to scan a + bit more of the binlog than strictly necessary. + */ + sql_print_error("Failed to write binlog checkpoint event to binary log\n"); +} + + /** Write a cached log entry to the binary log. - To support transaction over replication, we wrap the transaction @@ -5951,7 +6100,7 @@ MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry) for a transaction if log_xid() fails). */ if (entry->cache_mngr->using_xa && entry->cache_mngr->xa_xid) - mark_xid_done(); + mark_xid_done(entry->cache_mngr->cookie); return 1; } @@ -5972,7 +6121,6 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader) uint xid_count= 0; my_off_t UNINIT_VAR(commit_offset); group_commit_entry *current; - group_commit_entry *last_in_queue; group_commit_entry *queue= NULL; bool check_purge= false; DBUG_ENTER("MYSQL_BIN_LOG::trx_group_commit_leader"); @@ -5993,7 +6141,6 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader) mysql_mutex_unlock(&LOCK_prepare_ordered); /* As the queue is in reverse order of entering, reverse it. */ - last_in_queue= current; while (current) { group_commit_entry *next= current->next; @@ -6032,7 +6179,10 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader) commit_offset= my_b_write_tell(&log_file); cache_mngr->last_commit_pos_offset= commit_offset; if (cache_mngr->using_xa && cache_mngr->xa_xid) + { xid_count++; + cache_mngr->cookie= current_binlog_id; + } } bool synced= 0; @@ -6075,16 +6225,14 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader) } /* - if any commit_events are Xid_log_event, increase the number of - prepared_xids (it's decreased in ::unlog()). Binlog cannot be rotated - if there're prepared xids in it - see the comment in new_file() for - an explanation. - If no Xid_log_events (then it's all Query_log_event) rotate binlog, - if necessary. + If any commit_events are Xid_log_event, increase the number of pending + XIDs in current binlog (it's decreased in ::unlog()). When the count in + a (not active) binlog file reaches zero, we know that it is no longer + needed in XA recovery, and we can log a new binlog checkpoint event. */ if (xid_count > 0) { - mark_xids_active(xid_count); + mark_xids_active(current_binlog_id, xid_count); } else { @@ -6092,11 +6240,11 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader) { /* If we fail to rotate, which thread should get the error? - We give the error to the *last* transaction thread; that seems to - make the most sense, as it was the last to write to the log. + We give the error to the leader, as any my_error() thrown inside + rotate() will have been registered for the leader THD. */ - last_in_queue->error= ER_ERROR_ON_WRITE; - last_in_queue->commit_errno= errno; + leader->error= ER_ERROR_ON_WRITE; + leader->commit_errno= errno; check_purge= false; } } @@ -6113,9 +6261,6 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader) */ mysql_mutex_unlock(&LOCK_log); - if (check_purge) - purge(); - DEBUG_SYNC(leader->thd, "commit_after_release_LOCK_log"); ++num_group_commits; @@ -6148,7 +6293,8 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader) DEBUG_SYNC(leader->thd, "commit_loop_entry_commit_ordered"); ++num_commits; - if (current->cache_mngr->using_xa && !current->error) + if (current->cache_mngr->using_xa && !current->error && + DBUG_EVALUATE_IF("skip_commit_ordered", 0, 1)) run_commit_ordered(current->thd, current->all); /* @@ -6163,6 +6309,9 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader) DEBUG_SYNC(leader->thd, "commit_after_group_run_commit_ordered"); mysql_mutex_unlock(&LOCK_commit_ordered); + if (check_purge) + purge(); + DBUG_VOID_RETURN; } @@ -7351,14 +7500,6 @@ int TC_LOG::using_heuristic_recover() /****** transaction coordinator log for 2pc - binlog() based solution ******/ #define TC_LOG_BINLOG MYSQL_BIN_LOG -/** - @todo - keep in-memory list of prepared transactions - (add to list in log(), remove on unlog()) - and copy it to the new binlog if rotated - but let's check the behaviour of tc_log_page_waits first! -*/ - int TC_LOG_BINLOG::open(const char *opt_name) { LOG_INFO log_info; @@ -7367,10 +7508,6 @@ int TC_LOG_BINLOG::open(const char *opt_name) DBUG_ASSERT(total_ha_2pc > 1); DBUG_ASSERT(opt_name && opt_name[0]); - mysql_mutex_init(key_BINLOG_LOCK_prep_xids, - &LOCK_prep_xids, MY_MUTEX_INIT_FAST); - mysql_cond_init(key_BINLOG_COND_prep_xids, &COND_prep_xids, 0); - if (!my_b_inited(&index_file)) { /* There was a failure to open the index file, can't open the binlog */ @@ -7429,7 +7566,8 @@ int TC_LOG_BINLOG::open(const char *opt_name) ev->flags & LOG_EVENT_BINLOG_IN_USE_F) { sql_print_information("Recovering after a crash using %s", opt_name); - error= recover(&log, (Format_description_log_event *)ev); + error= recover(&log_info, log_name, &log, + (Format_description_log_event *)ev); } else error=0; @@ -7449,9 +7587,6 @@ err: /** This is called on shutdown, after ha_panic. */ void TC_LOG_BINLOG::close() { - DBUG_ASSERT(prepared_xids==0); - mysql_mutex_destroy(&LOCK_prep_xids); - mysql_cond_destroy(&COND_prep_xids); } /* @@ -7471,11 +7606,23 @@ TC_LOG_BINLOG::log_and_order(THD *thd, my_xid xid, bool all, cache_mngr->using_xa= TRUE; cache_mngr->xa_xid= xid; +#ifndef DBUG_OFF + cache_mngr->cookie= 0; +#endif err= binlog_commit_flush_xid_caches(thd, cache_mngr, all, xid); DEBUG_SYNC(thd, "binlog_after_log_and_order"); - DBUG_RETURN(!err); + if (err) + DBUG_RETURN(0); + /* + If using explicit user XA, we will not have XID. We must still return a + non-zero cookie (as zero cookie signals error). + */ + if (!xid) + DBUG_RETURN(BINLOG_COOKIE_DUMMY); + DBUG_ASSERT(cache_mngr->cookie != 0); + DBUG_RETURN(cache_mngr->cookie); } /* @@ -7490,40 +7637,134 @@ TC_LOG_BINLOG::log_and_order(THD *thd, my_xid xid, bool all, binary log. */ void -TC_LOG_BINLOG::mark_xids_active(uint xid_count) +TC_LOG_BINLOG::mark_xids_active(ulong cookie, uint xid_count) { + xid_count_per_binlog *b; + DBUG_ENTER("TC_LOG_BINLOG::mark_xids_active"); - DBUG_PRINT("info", ("xid_count=%u", xid_count)); - mysql_mutex_lock(&LOCK_prep_xids); - prepared_xids+= xid_count; - mysql_mutex_unlock(&LOCK_prep_xids); + DBUG_PRINT("info", ("cookie=%lu xid_count=%u", cookie, xid_count)); + DBUG_ASSERT(cookie != 0 && cookie != BINLOG_COOKIE_DUMMY); + + mysql_mutex_lock(&LOCK_xid_list); + I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list); + while ((b= it++)) + { + if (b->binlog_id == cookie) + { + b->xid_count += xid_count; + break; + } + } + /* + As we do not delete elements until count reach zero, elements should always + be found. + */ + DBUG_ASSERT(b); + mysql_mutex_unlock(&LOCK_xid_list); DBUG_VOID_RETURN; } /* - Once an XID is committed, it is safe to rotate the binary log, as it can no - longer be needed during crash recovery. + Once an XID is committed, it can no longer be needed during crash recovery, + as it has been durably recorded on disk as "committed". This function is called to mark an XID this way. It needs to decrease the - count of pending XIDs, and signal the log rotator thread when it reaches zero. + count of pending XIDs in the corresponding binlog. When the count reaches + zero (for an "old" binlog that is not the active one), that binlog file no + longer need to be scanned during crash recovery, so we can log a new binlog + checkpoint. */ void -TC_LOG_BINLOG::mark_xid_done() +TC_LOG_BINLOG::mark_xid_done(ulong cookie) { - my_bool send_signal; + xid_count_per_binlog *b; + bool first; + ulong current; DBUG_ENTER("TC_LOG_BINLOG::mark_xid_done"); - mysql_mutex_lock(&LOCK_prep_xids); - // prepared_xids can be 0 if the transaction had ignorable errors. - DBUG_ASSERT(prepared_xids >= 0); - if (prepared_xids > 0) - prepared_xids--; - send_signal= (prepared_xids == 0); - mysql_mutex_unlock(&LOCK_prep_xids); - if (send_signal) { - DBUG_PRINT("info", ("prepared_xids=%lu", prepared_xids)); - mysql_cond_signal(&COND_prep_xids); + if (cookie == BINLOG_COOKIE_DUMMY) + DBUG_VOID_RETURN; /* Nothing to do. */ + + mysql_mutex_lock(&LOCK_xid_list); + current= current_binlog_id; + I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list); + first= true; + while ((b= it++)) + { + if (b->binlog_id == cookie) + { + --b->xid_count; + break; + } + first= false; + } + /* Binlog is always found, as we do not remove until count reaches 0 */ + DBUG_ASSERT(b); + if (likely(cookie == current && !reset_master_pending) || + b->xid_count != 0 || !first) + { + /* No new binlog checkpoint reached yet. */ + mysql_mutex_unlock(&LOCK_xid_list); + DBUG_VOID_RETURN; + } + + /* + Now log a binlog checkpoint for the first binlog file with a non-zero count. + + Note that it is possible (though perhaps unlikely) that when count of + binlog (N-2) drops to zero, binlog (N-1) is already at zero. So we may + need to skip several entries before we find the one to log in the binlog + checkpoint event. + + We chain the locking of LOCK_xid_list and LOCK_log, so that we ensure that + Binlog_checkpoint_events are logged in order. This simplifies recovery a + bit, as it can just take the last binlog checkpoint in the log, rather + than compare all found against each other to find the one pointing to the + most recent binlog. + + Note also that we need to first release LOCK_xid_list, then aquire + LOCK_log, then re-aquire LOCK_xid_list. If we were to take LOCK_log while + holding LOCK_xid_list, we might deadlock with other threads that take the + locks in the opposite order. + + If a RESET MASTER is pending, we are about to remove all log files, and + the RESET MASTER thread is waiting for all pending unlog() calls to + complete while holding LOCK_log. In this case we should not log a binlog + checkpoint event (it would be deleted immediately anywat and we would + deadlock on LOCK_log) but just signal the thread. + */ + if (!reset_master_pending) + { + mysql_mutex_unlock(&LOCK_xid_list); + mysql_mutex_lock(&LOCK_log); + mysql_mutex_lock(&LOCK_xid_list); + } + for (;;) + { + /* Remove initial element(s) with zero count. */ + b= binlog_xid_count_list.head(); + /* + Normally, we must not remove all elements in the list. + Only if a RESET MASTER is in progress may we delete everything - RESET + MASTER has LOCK_log held, and will create a new initial element before + releasing the lock. + */ + DBUG_ASSERT(b || reset_master_pending); + if (unlikely(!b) || b->binlog_id == current || b->xid_count > 0) + break; + my_free(binlog_xid_count_list.get()); } + if (reset_master_pending) + { + mysql_cond_signal(&COND_xid_list); + mysql_mutex_unlock(&LOCK_xid_list); + DBUG_VOID_RETURN; + } + + mysql_mutex_unlock(&LOCK_xid_list); + write_binlog_checkpoint_event_already_locked(b->binlog_name, + b->binlog_name_len); + mysql_mutex_unlock(&LOCK_log); DBUG_VOID_RETURN; } @@ -7531,16 +7772,24 @@ int TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid) { DBUG_ENTER("TC_LOG_BINLOG::unlog"); if (xid) - mark_xid_done(); + mark_xid_done(cookie); /* As ::write_transaction_to_binlog() did not rotate, do it here. */ DBUG_RETURN(rotate_and_purge(0)); } -int TC_LOG_BINLOG::recover(IO_CACHE *log, Format_description_log_event *fdle) +int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name, + IO_CACHE *first_log, + Format_description_log_event *fdle) { Log_event *ev; HASH xids; MEM_ROOT mem_root; + char binlog_checkpoint_name[FN_REFLEN]; + bool binlog_checkpoint_found; + bool first_round; + IO_CACHE log; + File file= -1; + const char *errmsg; if (! fdle->is_valid() || my_hash_init(&xids, &my_charset_bin, TC_LOG_PAGE_SIZE/3, 0, @@ -7551,19 +7800,109 @@ int TC_LOG_BINLOG::recover(IO_CACHE *log, Format_description_log_event *fdle) fdle->flags&= ~LOG_EVENT_BINLOG_IN_USE_F; // abort on the first error - while ((ev= Log_event::read_log_event(log, 0, fdle, - opt_master_verify_checksum)) - && ev->is_valid()) + /* + Scan the binlog for XIDs that need to be committed if still in the + prepared stage. + + Start with the latest binlog file, then continue with any other binlog + files if the last found binlog checkpoint indicates it is needed. + */ + + binlog_checkpoint_found= false; + first_round= true; + for (;;) { - if (ev->get_type_code() == XID_EVENT) + while ((ev= Log_event::read_log_event(first_round ? first_log : &log, + 0, fdle, opt_master_verify_checksum)) + && ev->is_valid()) + { + switch (ev->get_type_code()) + { + case XID_EVENT: + { + Xid_log_event *xev=(Xid_log_event *)ev; + uchar *x= (uchar *) memdup_root(&mem_root, (uchar*) &xev->xid, + sizeof(xev->xid)); + if (!x || my_hash_insert(&xids, x)) + { + delete ev; + goto err2; + } + break; + } + case BINLOG_CHECKPOINT_EVENT: + if (first_round) + { + uint dir_len; + Binlog_checkpoint_log_event *cev= (Binlog_checkpoint_log_event *)ev; + if (cev->binlog_file_len >= FN_REFLEN) + sql_print_warning("Incorrect binlog checkpoint event with too " + "long file name found."); + else + { + /* + Note that we cannot use make_log_name() here, as we have not yet + initialised MYSQL_BIN_LOG::log_file_name. + */ + dir_len= dirname_length(last_log_name); + strmake(strnmov(binlog_checkpoint_name, last_log_name, dir_len), + cev->binlog_file_name, FN_REFLEN - 1 - dir_len); + binlog_checkpoint_found= true; + } + break; + } + default: + /* Nothing. */ + break; + } + delete ev; + } + + /* + If the last binlog checkpoint event points to an older log, we have to + scan all logs from there also, to get all possible XIDs to recover. + + If there was no binlog checkpoint event at all, this means the log was + written by an older version of MariaDB (or MySQL) - these always have an + (implicit) binlog checkpoint event at the start of the last binlog file. + */ + if (first_round) { - Xid_log_event *xev=(Xid_log_event *)ev; - uchar *x= (uchar *) memdup_root(&mem_root, (uchar*) &xev->xid, - sizeof(xev->xid)); - if (!x || my_hash_insert(&xids, x)) + if (!binlog_checkpoint_found) + break; + first_round= false; + if (find_log_pos(linfo, binlog_checkpoint_name, 1)) + { + sql_print_error("Binlog file '%s' not found in binlog index, needed " + "for recovery. Aborting.", binlog_checkpoint_name); goto err2; + } + } + else + { + end_io_cache(&log); + mysql_file_close(file, MYF(MY_WME)); + file= -1; + } + + if (0 == strcmp(linfo->log_file_name, last_log_name)) + break; // No more files to do + if ((file= open_binlog(&log, linfo->log_file_name, &errmsg)) < 0) + { + sql_print_error("%s", errmsg); + goto err2; + } + /* + We do not need to read the Format_description_log_event of other binlog + files. It is not possible for a binlog checkpoint to span multiple + binlog files written by different versions of the server. So we can use + the first one read for reading from all binlog files. + */ + if (find_next_log(linfo, 1)) + { + sql_print_error("Error reading binlog files during recovery. Aborting."); + goto err2; } - delete ev; } if (ha_recover(&xids)) @@ -7574,6 +7913,11 @@ int TC_LOG_BINLOG::recover(IO_CACHE *log, Format_description_log_event *fdle) return 0; err2: + if (file >= 0) + { + end_io_cache(&log); + mysql_file_close(file, MYF(MY_WME)); + } free_root(&mem_root, MYF(0)); my_hash_free(&xids); err1: diff --git a/sql/log.h b/sql/log.h index 9b9bed1262a..179d302d2cc 100644 --- a/sql/log.h +++ b/sql/log.h @@ -354,6 +354,15 @@ private: time_t last_time; }; +/* + We assign each binlog file an internal ID, used to identify them for unlog(). + Ids start from BINLOG_COOKIE_START; the value BINLOG_COOKIE_DUMMY is special + meaning "no binlog" (we cannot use zero as that is reserved for error return + from log_and_order). +*/ +#define BINLOG_COOKIE_DUMMY 1 +#define BINLOG_COOKIE_START 2 + class binlog_cache_mngr; class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG { @@ -394,10 +403,40 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG bool all; }; + /* + A list of struct xid_count_per_binlog is used to keep track of how many + XIDs are in prepared, but not committed, state in each binlog. + + When count drops to zero in a binlog after rotation, it means that there + are no more XIDs in prepared state, so that binlog is no longer needed + for XA crash recovery, and we can log a new binlog checkpoint event. + + The list is protected against simultaneous access from multiple + threads by LOCK_xid_list. + */ + struct xid_count_per_binlog : public ilink { + char *binlog_name; + uint binlog_name_len; + ulong binlog_id; + long xid_count; + xid_count_per_binlog(); /* Give link error if constructor used. */ + }; + ulong current_binlog_id; + I_List<xid_count_per_binlog> binlog_xid_count_list; + /* + When this is set, a RESET MASTER is in progress. + + Then we should not write any binlog checkpoints into the binlog (that + could result in deadlock on LOCK_log, and we will delete all binlog files + anyway). Instead we should signal COND_xid_list whenever a new binlog + checkpoint arrives - when all have arrived, RESET MASTER will complete. + */ + bool reset_master_pending; + /* LOCK_log and LOCK_index are inited by init_pthread_objects() */ mysql_mutex_t LOCK_index; - mysql_mutex_t LOCK_prep_xids; - mysql_cond_t COND_prep_xids; + mysql_mutex_t LOCK_xid_list; + mysql_cond_t COND_xid_list; mysql_cond_t update_cond; ulonglong bytes_written; IO_CACHE index_file; @@ -421,7 +460,6 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG fix_max_relay_log_size). */ ulong max_size; - long prepared_xids; /* for tc log - number of xids to remember */ // current file sequence number for load data infile binary logging uint file_id; uint open_count; // For replication @@ -473,8 +511,8 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG int write_transaction_or_stmt(group_commit_entry *entry); bool write_transaction_to_binlog_events(group_commit_entry *entry); void trx_group_commit_leader(group_commit_entry *leader); - void mark_xid_done(); - void mark_xids_active(uint xid_count); + void mark_xid_done(ulong cookie); + void mark_xids_active(ulong cookie, uint xid_count); public: using MYSQL_LOG::generate_name; @@ -562,7 +600,8 @@ public: int log_and_order(THD *thd, my_xid xid, bool all, bool need_prepare_ordered, bool need_commit_ordered); int unlog(ulong cookie, my_xid xid); - int recover(IO_CACHE *log, Format_description_log_event *fdle); + int recover(LOG_INFO *linfo, const char *last_log_name, IO_CACHE *first_log, + Format_description_log_event *fdle); #if !defined(MYSQL_CLIENT) int flush_and_set_pending_rows_event(THD *thd, Rows_log_event* event, @@ -614,6 +653,7 @@ public: bool write_incident_already_locked(THD *thd); bool write_incident(THD *thd); + void write_binlog_checkpoint_event_already_locked(const char *name, uint len); int write_cache(THD *thd, IO_CACHE *cache); void set_write_error(THD *thd, bool is_transactional); bool check_write_error(THD *thd); @@ -631,6 +671,7 @@ public: void make_log_name(char* buf, const char* log_ident); bool is_active(const char* log_file_name); + bool can_purge_log(const char *log_file_name); int update_log_index(LOG_INFO* linfo, bool need_update_threads); int rotate(bool force_rotate, bool* check_purge); void purge(); diff --git a/sql/log_event.cc b/sql/log_event.cc index f9849fa0ff5..6cefdb4fa59 100644 --- a/sql/log_event.cc +++ b/sql/log_event.cc @@ -731,6 +731,7 @@ const char* Log_event::get_type_str(Log_event_type type) case EXECUTE_LOAD_QUERY_EVENT: return "Execute_load_query"; case INCIDENT_EVENT: return "Incident"; case ANNOTATE_ROWS_EVENT: return "Annotate_rows"; + case BINLOG_CHECKPOINT_EVENT: return "Binlog_checkpoint"; default: return "Unknown"; /* impossible */ } } @@ -1387,7 +1388,7 @@ err: DBUG_ASSERT(error != 0); sql_print_error("Error in Log_event::read_log_event(): " "'%s', data_len: %d, event_type: %d", - error,data_len,head[EVENT_TYPE_OFFSET]); + error,data_len,(uchar)(head[EVENT_TYPE_OFFSET])); my_free(buf); /* The SQL slave thread will check if file->error<0 to know @@ -1536,6 +1537,9 @@ Log_event* Log_event::read_log_event(const char* buf, uint event_len, case ROTATE_EVENT: ev = new Rotate_log_event(buf, event_len, description_event); break; + case BINLOG_CHECKPOINT_EVENT: + ev = new Binlog_checkpoint_log_event(buf, event_len, description_event); + break; #ifdef HAVE_REPLICATION case SLAVE_EVENT: /* can never happen (unused event) */ ev = new Slave_log_event(buf, event_len, description_event); @@ -4405,6 +4409,8 @@ Format_description_log_event(uint8 binlog_ver, const char* server_ver) // Set header lengths of Maria events post_header_len[ANNOTATE_ROWS_EVENT-1]= ANNOTATE_ROWS_HEADER_LEN; + post_header_len[BINLOG_CHECKPOINT_EVENT-1]= + BINLOG_CHECKPOINT_HEADER_LEN; // Sanity-check that all post header lengths are initialized. int i; @@ -5864,6 +5870,86 @@ Rotate_log_event::do_shall_skip(Relay_log_info *rli) /************************************************************************** + Binlog_checkpoint_log_event methods +**************************************************************************/ + +#if defined(HAVE_REPLICATION) && !defined(MYSQL_CLIENT) +void Binlog_checkpoint_log_event::pack_info(Protocol *protocol) +{ + protocol->store(binlog_file_name, binlog_file_len, &my_charset_bin); +} +#endif + + +#ifdef MYSQL_CLIENT +void Binlog_checkpoint_log_event::print(FILE *file, + PRINT_EVENT_INFO *print_event_info) +{ + Write_on_release_cache cache(&print_event_info->head_cache, file, + Write_on_release_cache::FLUSH_F); + + if (print_event_info->short_form) + return; + print_header(&cache, print_event_info, FALSE); + my_b_printf(&cache, "\tBinlog checkpoint "); + my_b_write(&cache, (uchar*)binlog_file_name, binlog_file_len); + my_b_printf(&cache, "\n"); +} +#endif /* MYSQL_CLIENT */ + + +#ifdef MYSQL_SERVER +Binlog_checkpoint_log_event::Binlog_checkpoint_log_event( + const char *binlog_file_name_arg, + uint binlog_file_len_arg) + :Log_event(), + binlog_file_name(my_strndup(binlog_file_name_arg, binlog_file_len_arg, + MYF(MY_WME))), + binlog_file_len(binlog_file_len_arg) +{ + cache_type= EVENT_NO_CACHE; +} +#endif /* MYSQL_SERVER */ + + +Binlog_checkpoint_log_event::Binlog_checkpoint_log_event( + const char *buf, uint event_len, + const Format_description_log_event *description_event) + :Log_event(buf, description_event), binlog_file_name(0) +{ + uint8 header_size= description_event->common_header_len; + uint8 post_header_len= + description_event->post_header_len[BINLOG_CHECKPOINT_EVENT-1]; + if (event_len < header_size + post_header_len || + post_header_len < BINLOG_CHECKPOINT_HEADER_LEN) + return; + buf+= header_size; + /* See uint4korr and int4store below */ + compile_time_assert(BINLOG_CHECKPOINT_HEADER_LEN == 4); + binlog_file_len= uint4korr(buf); + if (event_len - (header_size + post_header_len) < binlog_file_len) + return; + binlog_file_name= my_strndup(buf + post_header_len, binlog_file_len, + MYF(MY_WME)); + return; +} + + +#ifndef MYSQL_CLIENT +bool Binlog_checkpoint_log_event::write(IO_CACHE *file) +{ + uchar buf[BINLOG_CHECKPOINT_HEADER_LEN]; + int4store(buf, binlog_file_len); + return write_header(file, BINLOG_CHECKPOINT_HEADER_LEN + binlog_file_len) || + wrapper_my_b_safe_write(file, buf, BINLOG_CHECKPOINT_HEADER_LEN) || + wrapper_my_b_safe_write(file, (const uchar *)binlog_file_name, + binlog_file_len) || + write_footer(file); +} +#endif /* MYSQL_CLIENT */ + + +/************************************************************************** Intvar_log_event methods **************************************************************************/ diff --git a/sql/log_event.h b/sql/log_event.h index 12d5e1417f4..c76e538618b 100644 --- a/sql/log_event.h +++ b/sql/log_event.h @@ -259,6 +259,7 @@ struct sql_ex_info #define INCIDENT_HEADER_LEN 2 #define HEARTBEAT_HEADER_LEN 0 #define ANNOTATE_ROWS_HEADER_LEN 0 +#define BINLOG_CHECKPOINT_HEADER_LEN 4 /* Max number of possible extra bytes in a replication event compared to a @@ -685,6 +686,14 @@ enum Log_event_type MARIA_EVENTS_BEGIN= 160, /* New Maria event numbers start from here */ ANNOTATE_ROWS_EVENT= 160, + /* + Binlog checkpoint event. Used for XA crash recovery on the master, not used + in replication. + A binlog checkpoint event specifies a binlog file such that XA crash + recovery can start from that file - and it is guaranteed to find all XIDs + that are prepared in storage engines but not yet committed. + */ + BINLOG_CHECKPOINT_EVENT= 161, /* Add new MariaDB events here - right above this comment! */ @@ -2892,6 +2901,32 @@ private: }; +class Binlog_checkpoint_log_event: public Log_event +{ +public: + char *binlog_file_name; + uint binlog_file_len; + +#ifdef MYSQL_SERVER + Binlog_checkpoint_log_event(const char *binlog_file_name_arg, + uint binlog_file_len_arg); +#ifdef HAVE_REPLICATION + void pack_info(Protocol *protocol); +#endif +#else + void print(FILE *file, PRINT_EVENT_INFO *print_event_info); +#endif + Binlog_checkpoint_log_event(const char *buf, uint event_len, + const Format_description_log_event *description_event); + ~Binlog_checkpoint_log_event() { my_free(binlog_file_name); } + Log_event_type get_type_code() { return BINLOG_CHECKPOINT_EVENT;} + int get_data_size() { return binlog_file_len + BINLOG_CHECKPOINT_HEADER_LEN;} + bool is_valid() const { return binlog_file_name != 0; } +#ifdef MYSQL_SERVER + bool write(IO_CACHE* file); +#endif +}; + /* the classes below are for the new LOAD DATA INFILE logging */ /** diff --git a/sql/mysqld.cc b/sql/mysqld.cc index 91bc2bfe226..3c0f209235a 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -722,7 +722,7 @@ PSI_mutex_key key_PAGE_lock, key_LOCK_sync, key_LOCK_active, key_LOCK_pool; PSI_mutex_key key_LOCK_des_key_file; #endif /* HAVE_OPENSSL */ -PSI_mutex_key key_BINLOG_LOCK_index, key_BINLOG_LOCK_prep_xids, +PSI_mutex_key key_BINLOG_LOCK_index, key_BINLOG_LOCK_xid_list, key_delayed_insert_mutex, key_hash_filo_lock, key_LOCK_active_mi, key_LOCK_connection_count, key_LOCK_crypt, key_LOCK_delayed_create, key_LOCK_delayed_insert, key_LOCK_delayed_status, key_LOCK_error_log, @@ -763,7 +763,7 @@ static PSI_mutex_info all_server_mutexes[]= #endif /* HAVE_OPENSSL */ { &key_BINLOG_LOCK_index, "MYSQL_BIN_LOG::LOCK_index", 0}, - { &key_BINLOG_LOCK_prep_xids, "MYSQL_BIN_LOG::LOCK_prep_xids", 0}, + { &key_BINLOG_LOCK_xid_list, "MYSQL_BIN_LOG::LOCK_xid_list", 0}, { &key_RELAYLOG_LOCK_index, "MYSQL_RELAY_LOG::LOCK_index", 0}, { &key_delayed_insert_mutex, "Delayed_insert::mutex", 0}, { &key_hash_filo_lock, "hash_filo::lock", 0}, @@ -831,7 +831,7 @@ static PSI_rwlock_info all_server_rwlocks[]= PSI_cond_key key_PAGE_cond, key_COND_active, key_COND_pool; #endif /* HAVE_MMAP */ -PSI_cond_key key_BINLOG_COND_prep_xids, key_BINLOG_update_cond, +PSI_cond_key key_BINLOG_COND_xid_list, key_BINLOG_update_cond, key_COND_cache_status_changed, key_COND_manager, key_COND_rpl_status, key_COND_server_started, key_delayed_insert_cond, key_delayed_insert_cond_client, @@ -859,7 +859,7 @@ static PSI_cond_info all_server_conds[]= { &key_COND_pool, "TC_LOG_MMAP::COND_pool", 0}, { &key_TC_LOG_MMAP_COND_queue_busy, "TC_LOG_MMAP::COND_queue_busy", 0}, #endif /* HAVE_MMAP */ - { &key_BINLOG_COND_prep_xids, "MYSQL_BIN_LOG::COND_prep_xids", 0}, + { &key_BINLOG_COND_xid_list, "MYSQL_BIN_LOG::COND_xid_list", 0}, { &key_BINLOG_update_cond, "MYSQL_BIN_LOG::update_cond", 0}, { &key_BINLOG_COND_queue_busy, "MYSQL_BIN_LOG::COND_queue_busy", 0}, { &key_RELAYLOG_update_cond, "MYSQL_RELAY_LOG::update_cond", 0}, diff --git a/sql/mysqld.h b/sql/mysqld.h index 56419acdcd4..28c4d771a48 100644 --- a/sql/mysqld.h +++ b/sql/mysqld.h @@ -225,7 +225,7 @@ extern PSI_mutex_key key_PAGE_lock, key_LOCK_sync, key_LOCK_active, extern PSI_mutex_key key_LOCK_des_key_file; #endif -extern PSI_mutex_key key_BINLOG_LOCK_index, key_BINLOG_LOCK_prep_xids, +extern PSI_mutex_key key_BINLOG_LOCK_index, key_BINLOG_LOCK_xid_list, key_delayed_insert_mutex, key_hash_filo_lock, key_LOCK_active_mi, key_LOCK_connection_count, key_LOCK_crypt, key_LOCK_delayed_create, key_LOCK_delayed_insert, key_LOCK_delayed_status, key_LOCK_error_log, @@ -256,7 +256,7 @@ extern PSI_rwlock_key key_rwlock_LOCK_grant, key_rwlock_LOCK_logger, extern PSI_cond_key key_PAGE_cond, key_COND_active, key_COND_pool; #endif /* HAVE_MMAP */ -extern PSI_cond_key key_BINLOG_COND_prep_xids, key_BINLOG_update_cond, +extern PSI_cond_key key_BINLOG_COND_xid_list, key_BINLOG_update_cond, key_COND_cache_status_changed, key_COND_manager, key_COND_rpl_status, key_COND_server_started, key_delayed_insert_cond, key_delayed_insert_cond_client, diff --git a/sql/sql_repl.cc b/sql/sql_repl.cc index d7308eddb7f..079aab27101 100644 --- a/sql/sql_repl.cc +++ b/sql/sql_repl.cc @@ -624,6 +624,30 @@ send_event_to_slave(THD *thd, NET *net, String* const packet, ushort flags, } /* + Do not send binlog checkpoint events to a slave that does not understand it. + */ + if (unlikely(event_type == BINLOG_CHECKPOINT_EVENT) && + mariadb_slave_capability < MARIA_SLAVE_CAPABILITY_BINLOG_CHECKPOINT) + { + if (mariadb_slave_capability >= MARIA_SLAVE_CAPABILITY_TOLERATE_HOLES) + { + /* This slave can tolerate events omitted from the binlog stream. */ + return NULL; + } + else + { + /* + The slave does not understand BINLOG_CHECKPOINT_EVENT. Send a dummy + event instead, with same length so slave does not get confused about + binlog positions. + */ + if (Query_log_event::dummy_event(packet, ev_offset, current_checksum_alg)) + return "Failed to replace binlog checkpoint event with dummy: " + "too small event."; + } + } + + /* Skip events with the @@skip_replication flag set, if slave requested skipping of such events. */ |