diff options
author | Nirbhay Choubey <nirbhay@mariadb.com> | 2014-12-05 12:33:02 -0500 |
---|---|---|
committer | Nirbhay Choubey <nirbhay@mariadb.com> | 2014-12-05 12:33:02 -0500 |
commit | 3bb02f3e6d70687b02c6902fd6478e8c328608f8 (patch) | |
tree | 7962068cae78450246a484cf95cbe5cf2e37382a /sql/log.cc | |
parent | a4cdd20d49a5a6ca0eb7346b7b9e898ac5e7995b (diff) | |
parent | ffc0ef63164e9a121d8a02323afe32a82299171e (diff) | |
download | mariadb-git-3bb02f3e6d70687b02c6902fd6478e8c328608f8.tar.gz |
bzr merge -rtag:mariadb-10.0.15 maria/10.0
Diffstat (limited to 'sql/log.cc')
-rw-r--r-- | sql/log.cc | 269 |
1 files changed, 128 insertions, 141 deletions
diff --git a/sql/log.cc b/sql/log.cc index a5fda53ccea..7a8531b5f46 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -25,7 +25,7 @@ Abort logging when we get an error in reading or writing log files */ -#include "my_global.h" /* NO_EMBEDDED_ACCESS_CHECKS */ +#include <my_global.h> /* NO_EMBEDDED_ACCESS_CHECKS */ #include "sql_priv.h" #include "log.h" #include "sql_base.h" // open_log_table @@ -2651,6 +2651,7 @@ bool MYSQL_LOG::open( const char *new_name, enum cache_type io_cache_type_arg) { char buff[FN_REFLEN]; + MY_STAT f_stat; File file= -1; int open_flags= O_CREAT | O_BINARY; DBUG_ENTER("MYSQL_LOG::open"); @@ -2668,6 +2669,10 @@ bool MYSQL_LOG::open( log_type_arg, io_cache_type_arg)) goto err; + /* File is regular writable file */ + if (my_stat(log_file_name, &f_stat, MYF(0)) && !MY_S_ISREG(f_stat.st_mode)) + goto err; + if (io_cache_type == SEQ_READ_APPEND) open_flags |= O_RDWR | O_APPEND; else @@ -6028,7 +6033,10 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info, my_bool *with_annotate) if (direct) { + int res; DBUG_PRINT("info", ("direct is set")); + if ((res= thd->wait_for_prior_commit())) + DBUG_RETURN(res); file= &log_file; my_org_b_tell= my_b_tell(file); mysql_mutex_lock(&LOCK_log); @@ -6961,6 +6969,10 @@ MYSQL_BIN_LOG::write_transaction_to_binlog(THD *thd, to commit. If so, we add those to the queue as well, transitively for all waiters. + And if a transaction is marked to wait for a prior transaction, but that + prior transaction is already queued for group commit, then we can queue the + new transaction directly to participate in the group commit. + @retval < 0 Error @retval > 0 If queued as the first entry in the queue (meaning this is the leader) @@ -6970,8 +6982,8 @@ MYSQL_BIN_LOG::write_transaction_to_binlog(THD *thd, int MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry) { - group_commit_entry *entry, *orig_queue; - wait_for_commit *cur, *last; + group_commit_entry *entry, *orig_queue, *last; + wait_for_commit *cur; wait_for_commit *wfc; DBUG_ENTER("MYSQL_BIN_LOG::queue_for_group_commit"); @@ -6988,8 +7000,17 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry) if (wfc && wfc->waitee) { mysql_mutex_lock(&wfc->LOCK_wait_commit); - /* Do an extra check here, this time safely under lock. */ - if (wfc->waitee) + /* + Do an extra check here, this time safely under lock. + + If waitee->commit_started is set, it means that the transaction we need + to wait for has already queued up for group commit. In this case it is + safe for us to queue up immediately as well, increasing the opprtunities + for group commit. Because waitee has taken the LOCK_prepare_ordered + before setting the flag, so there is no risk that we can queue ahead of + it. + */ + if (wfc->waitee && !wfc->waitee->commit_started) { PSI_stage_info old_stage; wait_for_commit *loc_waitee; @@ -7002,6 +7023,11 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry) This other transaction may then take over the commit process for us to get us included in its own group commit. If this happens, the queued_by_other flag is set. + + Setting this flag may or may not be seen by the other thread, but we + are safe in any case: The other thread will set queued_by_other under + its LOCK_wait_commit, and we will not check queued_by_other only after + we have been woken up. */ wfc->opaque_pointer= orig_entry; DEBUG_SYNC(orig_entry->thd, "group_commit_waiting_for_prior"); @@ -7074,41 +7100,41 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry) /* Iteratively process everything added to the queue, looking for waiters, and their waiters, and so on. If a waiter is ready to commit, we - immediately add it to the queue; if not we just wake it up. + immediately add it to the queue, and mark it as queued_by_other. This would be natural to do with recursion, but we want to avoid potentially unbounded recursion blowing the C stack, so we use the list approach instead. - We keep a list of all the waiters that need to be processed in `list', - linked through the next_subsequent_commit pointer. Initially this list - contains only the entry passed into this function. + We keep a list of the group_commit_entry of all the waiters that need to + be processed. Initially this list contains only the entry passed into this + function. We process entries in the list one by one. The element currently being - processed is pointed to by `cur`, and the element at the end of the list + processed is pointed to by `entry`, and the element at the end of the list is pointed to by `last` (we do not use NULL to terminate the list). - As we process an element, it is first added to the group_commit_queue. - Then any waiters for that element are added at the end of the list, to - be processed in subsequent iterations. This continues until the list - is exhausted, with all elements ever added eventually processed. + As we process an entry, any waiters for that entry are added at the end of + the list, to be processed in subsequent iterations. The the entry is added + to the group_commit_queue. This continues until the list is exhausted, + with all entries ever added eventually processed. The end result is a breath-first traversal of the tree of waiters, - re-using the next_subsequent_commit pointers in place of extra stack - space in a recursive traversal. + re-using the `next' pointers of the group_commit_entry objects in place of + extra stack space in a recursive traversal. - The temporary list created in next_subsequent_commit is not - used by the caller or any other function. + The temporary list linked through these `next' pointers is not used by the + caller or any other function; it only exists while doing the iterative + tree traversal. After, all the processed entries are linked into the + group_commit_queue. */ cur= wfc; - last= wfc; + last= orig_entry; entry= orig_entry; for (;;) { - /* Add the entry to the group commit queue. */ - entry->next= group_commit_queue; - group_commit_queue= entry; + group_commit_entry *next_entry; if (entry->cache_mngr->using_xa) { @@ -7117,135 +7143,95 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry) DEBUG_SYNC(entry->thd, "commit_after_prepare_ordered"); } - if (!cur) - break; // Can happen if initial entry has no wait_for_commit - - /* - Check if this transaction has other transaction waiting for it to commit. - - If so, process the waiting transactions, and their waiters and so on, - transitively. - */ - if (cur->subsequent_commits_list) + if (cur) { - wait_for_commit *waiter; - wait_for_commit *wakeup_list= NULL; - wait_for_commit **wakeup_next_ptr= &wakeup_list; - - mysql_mutex_lock(&cur->LOCK_wait_commit); /* - Grab the list, now safely under lock, and process it if still - non-empty. + Now that we have taken LOCK_prepare_ordered and will queue up in the + group commit queue, it is safe for following transactions to queue + themselves. We will grab here any transaction that is now ready to + queue up, but after that, more transactions may become ready while the + leader is waiting to start the group commit. So set the flag + `commit_started', so that later transactions can still participate in + the group commit.. */ - waiter= cur->subsequent_commits_list; - cur->subsequent_commits_list= NULL; - while (waiter) - { - wait_for_commit *next= waiter->next_subsequent_commit; - group_commit_entry *entry2= - (group_commit_entry *)waiter->opaque_pointer; - if (entry2) - { - /* - This is another transaction ready to be written to the binary - log. We can put it into the queue directly, without needing a - separate context switch to the other thread. We just set a flag - so that the other thread will know when it wakes up that it was - already processed. - - So put it at the end of the list to be processed in a subsequent - iteration of the outer loop. - */ - entry2->queued_by_other= true; - last->next_subsequent_commit= waiter; - last= waiter; - /* - As a small optimisation, we do not actually need to set - waiter->next_subsequent_commit to NULL, as we can use the - pointer `last' to check for end-of-list. - */ - } - else - { - /* - Wake up the waiting transaction. + cur->commit_started= true; - For this, we need to set the "wakeup running" flag and release - the waitee lock to avoid a deadlock, see comments on - THD::wakeup_subsequent_commits2() for details. + /* + Check if this transaction has other transaction waiting for it to + commit. - So we need to put these on a list and delay the wakeup until we - have released the lock. - */ - *wakeup_next_ptr= waiter; - wakeup_next_ptr= &waiter->next_subsequent_commit; - } - waiter= next; - } - if (wakeup_list) + If so, process the waiting transactions, and their waiters and so on, + transitively. + */ + if (cur->subsequent_commits_list) { - /* Now release our lock and do the wakeups that were delayed above. */ - cur->wakeup_subsequent_commits_running= true; - mysql_mutex_unlock(&cur->LOCK_wait_commit); - for (;;) - { - wait_for_commit *next; + wait_for_commit *waiter, **waiter_ptr; - /* - ToDo: We wakeup the waiter here, so that it can have the chance to - reach its own commit state and queue up for this same group commit, - if it is still pending. - - One problem with this is that if the waiter does not reach its own - commit state before this group commit starts, and then the group - commit fails (binlog write failure), we do not get to propagate - the error to the waiter. - - A solution for this could be to delay the wakeup until commit is - successful. But then we need to set a flag in the waitee that it is - already queued for group commit, so that the waiter can check this - flag and queue itself if it _does_ reach the commit state in time. - - (But error handling in case of binlog write failure is currently - broken in other ways, as well). - */ - if (&wakeup_list->next_subsequent_commit == wakeup_next_ptr) + mysql_mutex_lock(&cur->LOCK_wait_commit); + /* + Grab the list, now safely under lock, and process it if still + non-empty. + */ + waiter= cur->subsequent_commits_list; + waiter_ptr= &cur->subsequent_commits_list; + while (waiter) + { + wait_for_commit *next_waiter= waiter->next_subsequent_commit; + group_commit_entry *entry2= + (group_commit_entry *)waiter->opaque_pointer; + if (entry2) { - /* The last one in the list. */ - wakeup_list->wakeup(0); - break; + /* + This is another transaction ready to be written to the binary + log. We can put it into the queue directly, without needing a + separate context switch to the other thread. We just set a flag + so that the other thread will know when it wakes up that it was + already processed. + + So remove it from the list of our waiters, and instead put it at + the end of the list to be processed in a subsequent iteration of + the outer loop. + */ + *waiter_ptr= next_waiter; + entry2->queued_by_other= true; + last->next= entry2; + last= entry2; + /* + As a small optimisation, we do not actually need to set + entry2->next to NULL, as we can use the pointer `last' to check + for end-of-list. + */ } - /* - Important: don't access wakeup_list->next after the wakeup() call, - it may be invalidated by the other thread. - */ - next= wakeup_list->next_subsequent_commit; - wakeup_list->wakeup(0); - wakeup_list= next; + else + { + /* + This transaction is not ready to participate in the group commit + yet, so leave it in the waiter list. It might join the group + commit later, if it completes soon enough to do so (it will see + our wfc->commit_started flag set), or it might commit later in a + later group commit. + */ + waiter_ptr= &waiter->next_subsequent_commit; + } + waiter= next_waiter; } - /* - We need a full memory barrier between walking the list and clearing - the flag wakeup_subsequent_commits_running. This barrier is needed - to ensure that no other thread will start to modify the list - pointers before we are done traversing the list. - - But wait_for_commit::wakeup(), which was called above, does a full - memory barrier already (it locks a mutex). - */ - cur->wakeup_subsequent_commits_running= false; - } - else mysql_mutex_unlock(&cur->LOCK_wait_commit); + } } - if (cur == last) + + /* Add the entry to the group commit queue. */ + next_entry= entry->next; + entry->next= group_commit_queue; + group_commit_queue= entry; + if (entry == last) break; /* Move to the next entry in the flattened list of waiting transactions that still need to be processed transitively. */ - cur= cur->next_subsequent_commit; - entry= (group_commit_entry *)cur->opaque_pointer; + entry= next_entry; DBUG_ASSERT(entry != NULL); + cur= entry->thd->wait_for_commit_ptr; } if (opt_binlog_commit_wait_count > 0) @@ -7301,6 +7287,7 @@ MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry) DEBUG_SYNC(entry->thd, "commit_after_group_run_commit_ordered"); } mysql_mutex_unlock(&LOCK_commit_ordered); + entry->thd->wakeup_subsequent_commits(entry->error); if (next) { @@ -7334,7 +7321,7 @@ MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry) } if (likely(!entry->error)) - return 0; + return entry->thd->wait_for_prior_commit(); switch (entry->error) { @@ -7392,10 +7379,15 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader) LINT_INIT(binlog_id); { + DBUG_EXECUTE_IF("inject_binlog_commit_before_get_LOCK_log", + DBUG_ASSERT(!debug_sync_set_action(leader->thd, STRING_WITH_LEN + ("commit_before_get_LOCK_log SIGNAL waiting WAIT_FOR cont TIMEOUT 1"))); + ); /* Lock the LOCK_log(), and once we get it, collect any additional writes that queued up while we were waiting. */ + DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_log"); mysql_mutex_lock(&LOCK_log); DEBUG_SYNC(leader->thd, "commit_after_get_LOCK_log"); @@ -7602,6 +7594,7 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader) if (current->cache_mngr->using_xa && !current->error && DBUG_EVALUATE_IF("skip_commit_ordered", 0, 1)) run_commit_ordered(current->thd, current->all); + current->thd->wakeup_subsequent_commits(current->error); /* Careful not to access current->next after waking up the other thread! As @@ -7642,7 +7635,6 @@ MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry, write_cache(entry->thd, mngr->get_binlog_cache_log(FALSE))) { entry->error_cache= &mngr->stmt_cache.cache_log; - entry->commit_errno= errno; DBUG_RETURN(ER_ERROR_ON_WRITE); } @@ -7663,7 +7655,6 @@ MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry, if (write_cache(entry->thd, mngr->get_binlog_cache_log(TRUE))) { entry->error_cache= &mngr->trx_cache.cache_log; - entry->commit_errno= errno; DBUG_RETURN(ER_ERROR_ON_WRITE); } } @@ -7671,14 +7662,13 @@ MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry, DBUG_EXECUTE_IF("inject_error_writing_xid", { entry->error_cache= NULL; - entry->commit_errno= 28; + errno= 28; DBUG_RETURN(ER_ERROR_ON_WRITE); }); if (entry->end_event->write(&log_file)) { entry->error_cache= NULL; - entry->commit_errno= errno; DBUG_RETURN(ER_ERROR_ON_WRITE); } status_var_add(entry->thd->status_var.binlog_bytes_written, @@ -7689,7 +7679,6 @@ MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry, if (entry->incident_event->write(&log_file)) { entry->error_cache= NULL; - entry->commit_errno= errno; DBUG_RETURN(ER_ERROR_ON_WRITE); } } @@ -7697,13 +7686,11 @@ MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry, if (mngr->get_binlog_cache_log(FALSE)->error) // Error on read { entry->error_cache= &mngr->stmt_cache.cache_log; - entry->commit_errno= errno; DBUG_RETURN(ER_ERROR_ON_WRITE); } if (mngr->get_binlog_cache_log(TRUE)->error) // Error on read { entry->error_cache= &mngr->trx_cache.cache_log; - entry->commit_errno= errno; DBUG_RETURN(ER_ERROR_ON_WRITE); } |