summaryrefslogtreecommitdiff
path: root/sql/log.cc
diff options
context:
space:
mode:
authorNirbhay Choubey <nirbhay@mariadb.com>2014-12-05 12:33:02 -0500
committerNirbhay Choubey <nirbhay@mariadb.com>2014-12-05 12:33:02 -0500
commit3bb02f3e6d70687b02c6902fd6478e8c328608f8 (patch)
tree7962068cae78450246a484cf95cbe5cf2e37382a /sql/log.cc
parenta4cdd20d49a5a6ca0eb7346b7b9e898ac5e7995b (diff)
parentffc0ef63164e9a121d8a02323afe32a82299171e (diff)
downloadmariadb-git-3bb02f3e6d70687b02c6902fd6478e8c328608f8.tar.gz
bzr merge -rtag:mariadb-10.0.15 maria/10.0
Diffstat (limited to 'sql/log.cc')
-rw-r--r--sql/log.cc269
1 files changed, 128 insertions, 141 deletions
diff --git a/sql/log.cc b/sql/log.cc
index a5fda53ccea..7a8531b5f46 100644
--- a/sql/log.cc
+++ b/sql/log.cc
@@ -25,7 +25,7 @@
Abort logging when we get an error in reading or writing log files
*/
-#include "my_global.h" /* NO_EMBEDDED_ACCESS_CHECKS */
+#include <my_global.h> /* NO_EMBEDDED_ACCESS_CHECKS */
#include "sql_priv.h"
#include "log.h"
#include "sql_base.h" // open_log_table
@@ -2651,6 +2651,7 @@ bool MYSQL_LOG::open(
const char *new_name, enum cache_type io_cache_type_arg)
{
char buff[FN_REFLEN];
+ MY_STAT f_stat;
File file= -1;
int open_flags= O_CREAT | O_BINARY;
DBUG_ENTER("MYSQL_LOG::open");
@@ -2668,6 +2669,10 @@ bool MYSQL_LOG::open(
log_type_arg, io_cache_type_arg))
goto err;
+ /* File is regular writable file */
+ if (my_stat(log_file_name, &f_stat, MYF(0)) && !MY_S_ISREG(f_stat.st_mode))
+ goto err;
+
if (io_cache_type == SEQ_READ_APPEND)
open_flags |= O_RDWR | O_APPEND;
else
@@ -6028,7 +6033,10 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info, my_bool *with_annotate)
if (direct)
{
+ int res;
DBUG_PRINT("info", ("direct is set"));
+ if ((res= thd->wait_for_prior_commit()))
+ DBUG_RETURN(res);
file= &log_file;
my_org_b_tell= my_b_tell(file);
mysql_mutex_lock(&LOCK_log);
@@ -6961,6 +6969,10 @@ MYSQL_BIN_LOG::write_transaction_to_binlog(THD *thd,
to commit. If so, we add those to the queue as well, transitively for all
waiters.
+ And if a transaction is marked to wait for a prior transaction, but that
+ prior transaction is already queued for group commit, then we can queue the
+ new transaction directly to participate in the group commit.
+
@retval < 0 Error
@retval > 0 If queued as the first entry in the queue (meaning this
is the leader)
@@ -6970,8 +6982,8 @@ MYSQL_BIN_LOG::write_transaction_to_binlog(THD *thd,
int
MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry)
{
- group_commit_entry *entry, *orig_queue;
- wait_for_commit *cur, *last;
+ group_commit_entry *entry, *orig_queue, *last;
+ wait_for_commit *cur;
wait_for_commit *wfc;
DBUG_ENTER("MYSQL_BIN_LOG::queue_for_group_commit");
@@ -6988,8 +7000,17 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry)
if (wfc && wfc->waitee)
{
mysql_mutex_lock(&wfc->LOCK_wait_commit);
- /* Do an extra check here, this time safely under lock. */
- if (wfc->waitee)
+ /*
+ Do an extra check here, this time safely under lock.
+
+ If waitee->commit_started is set, it means that the transaction we need
+ to wait for has already queued up for group commit. In this case it is
+ safe for us to queue up immediately as well, increasing the opprtunities
+ for group commit. Because waitee has taken the LOCK_prepare_ordered
+ before setting the flag, so there is no risk that we can queue ahead of
+ it.
+ */
+ if (wfc->waitee && !wfc->waitee->commit_started)
{
PSI_stage_info old_stage;
wait_for_commit *loc_waitee;
@@ -7002,6 +7023,11 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry)
This other transaction may then take over the commit process for us to
get us included in its own group commit. If this happens, the
queued_by_other flag is set.
+
+ Setting this flag may or may not be seen by the other thread, but we
+ are safe in any case: The other thread will set queued_by_other under
+ its LOCK_wait_commit, and we will not check queued_by_other only after
+ we have been woken up.
*/
wfc->opaque_pointer= orig_entry;
DEBUG_SYNC(orig_entry->thd, "group_commit_waiting_for_prior");
@@ -7074,41 +7100,41 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry)
/*
Iteratively process everything added to the queue, looking for waiters,
and their waiters, and so on. If a waiter is ready to commit, we
- immediately add it to the queue; if not we just wake it up.
+ immediately add it to the queue, and mark it as queued_by_other.
This would be natural to do with recursion, but we want to avoid
potentially unbounded recursion blowing the C stack, so we use the list
approach instead.
- We keep a list of all the waiters that need to be processed in `list',
- linked through the next_subsequent_commit pointer. Initially this list
- contains only the entry passed into this function.
+ We keep a list of the group_commit_entry of all the waiters that need to
+ be processed. Initially this list contains only the entry passed into this
+ function.
We process entries in the list one by one. The element currently being
- processed is pointed to by `cur`, and the element at the end of the list
+ processed is pointed to by `entry`, and the element at the end of the list
is pointed to by `last` (we do not use NULL to terminate the list).
- As we process an element, it is first added to the group_commit_queue.
- Then any waiters for that element are added at the end of the list, to
- be processed in subsequent iterations. This continues until the list
- is exhausted, with all elements ever added eventually processed.
+ As we process an entry, any waiters for that entry are added at the end of
+ the list, to be processed in subsequent iterations. The the entry is added
+ to the group_commit_queue. This continues until the list is exhausted,
+ with all entries ever added eventually processed.
The end result is a breath-first traversal of the tree of waiters,
- re-using the next_subsequent_commit pointers in place of extra stack
- space in a recursive traversal.
+ re-using the `next' pointers of the group_commit_entry objects in place of
+ extra stack space in a recursive traversal.
- The temporary list created in next_subsequent_commit is not
- used by the caller or any other function.
+ The temporary list linked through these `next' pointers is not used by the
+ caller or any other function; it only exists while doing the iterative
+ tree traversal. After, all the processed entries are linked into the
+ group_commit_queue.
*/
cur= wfc;
- last= wfc;
+ last= orig_entry;
entry= orig_entry;
for (;;)
{
- /* Add the entry to the group commit queue. */
- entry->next= group_commit_queue;
- group_commit_queue= entry;
+ group_commit_entry *next_entry;
if (entry->cache_mngr->using_xa)
{
@@ -7117,135 +7143,95 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry)
DEBUG_SYNC(entry->thd, "commit_after_prepare_ordered");
}
- if (!cur)
- break; // Can happen if initial entry has no wait_for_commit
-
- /*
- Check if this transaction has other transaction waiting for it to commit.
-
- If so, process the waiting transactions, and their waiters and so on,
- transitively.
- */
- if (cur->subsequent_commits_list)
+ if (cur)
{
- wait_for_commit *waiter;
- wait_for_commit *wakeup_list= NULL;
- wait_for_commit **wakeup_next_ptr= &wakeup_list;
-
- mysql_mutex_lock(&cur->LOCK_wait_commit);
/*
- Grab the list, now safely under lock, and process it if still
- non-empty.
+ Now that we have taken LOCK_prepare_ordered and will queue up in the
+ group commit queue, it is safe for following transactions to queue
+ themselves. We will grab here any transaction that is now ready to
+ queue up, but after that, more transactions may become ready while the
+ leader is waiting to start the group commit. So set the flag
+ `commit_started', so that later transactions can still participate in
+ the group commit..
*/
- waiter= cur->subsequent_commits_list;
- cur->subsequent_commits_list= NULL;
- while (waiter)
- {
- wait_for_commit *next= waiter->next_subsequent_commit;
- group_commit_entry *entry2=
- (group_commit_entry *)waiter->opaque_pointer;
- if (entry2)
- {
- /*
- This is another transaction ready to be written to the binary
- log. We can put it into the queue directly, without needing a
- separate context switch to the other thread. We just set a flag
- so that the other thread will know when it wakes up that it was
- already processed.
-
- So put it at the end of the list to be processed in a subsequent
- iteration of the outer loop.
- */
- entry2->queued_by_other= true;
- last->next_subsequent_commit= waiter;
- last= waiter;
- /*
- As a small optimisation, we do not actually need to set
- waiter->next_subsequent_commit to NULL, as we can use the
- pointer `last' to check for end-of-list.
- */
- }
- else
- {
- /*
- Wake up the waiting transaction.
+ cur->commit_started= true;
- For this, we need to set the "wakeup running" flag and release
- the waitee lock to avoid a deadlock, see comments on
- THD::wakeup_subsequent_commits2() for details.
+ /*
+ Check if this transaction has other transaction waiting for it to
+ commit.
- So we need to put these on a list and delay the wakeup until we
- have released the lock.
- */
- *wakeup_next_ptr= waiter;
- wakeup_next_ptr= &waiter->next_subsequent_commit;
- }
- waiter= next;
- }
- if (wakeup_list)
+ If so, process the waiting transactions, and their waiters and so on,
+ transitively.
+ */
+ if (cur->subsequent_commits_list)
{
- /* Now release our lock and do the wakeups that were delayed above. */
- cur->wakeup_subsequent_commits_running= true;
- mysql_mutex_unlock(&cur->LOCK_wait_commit);
- for (;;)
- {
- wait_for_commit *next;
+ wait_for_commit *waiter, **waiter_ptr;
- /*
- ToDo: We wakeup the waiter here, so that it can have the chance to
- reach its own commit state and queue up for this same group commit,
- if it is still pending.
-
- One problem with this is that if the waiter does not reach its own
- commit state before this group commit starts, and then the group
- commit fails (binlog write failure), we do not get to propagate
- the error to the waiter.
-
- A solution for this could be to delay the wakeup until commit is
- successful. But then we need to set a flag in the waitee that it is
- already queued for group commit, so that the waiter can check this
- flag and queue itself if it _does_ reach the commit state in time.
-
- (But error handling in case of binlog write failure is currently
- broken in other ways, as well).
- */
- if (&wakeup_list->next_subsequent_commit == wakeup_next_ptr)
+ mysql_mutex_lock(&cur->LOCK_wait_commit);
+ /*
+ Grab the list, now safely under lock, and process it if still
+ non-empty.
+ */
+ waiter= cur->subsequent_commits_list;
+ waiter_ptr= &cur->subsequent_commits_list;
+ while (waiter)
+ {
+ wait_for_commit *next_waiter= waiter->next_subsequent_commit;
+ group_commit_entry *entry2=
+ (group_commit_entry *)waiter->opaque_pointer;
+ if (entry2)
{
- /* The last one in the list. */
- wakeup_list->wakeup(0);
- break;
+ /*
+ This is another transaction ready to be written to the binary
+ log. We can put it into the queue directly, without needing a
+ separate context switch to the other thread. We just set a flag
+ so that the other thread will know when it wakes up that it was
+ already processed.
+
+ So remove it from the list of our waiters, and instead put it at
+ the end of the list to be processed in a subsequent iteration of
+ the outer loop.
+ */
+ *waiter_ptr= next_waiter;
+ entry2->queued_by_other= true;
+ last->next= entry2;
+ last= entry2;
+ /*
+ As a small optimisation, we do not actually need to set
+ entry2->next to NULL, as we can use the pointer `last' to check
+ for end-of-list.
+ */
}
- /*
- Important: don't access wakeup_list->next after the wakeup() call,
- it may be invalidated by the other thread.
- */
- next= wakeup_list->next_subsequent_commit;
- wakeup_list->wakeup(0);
- wakeup_list= next;
+ else
+ {
+ /*
+ This transaction is not ready to participate in the group commit
+ yet, so leave it in the waiter list. It might join the group
+ commit later, if it completes soon enough to do so (it will see
+ our wfc->commit_started flag set), or it might commit later in a
+ later group commit.
+ */
+ waiter_ptr= &waiter->next_subsequent_commit;
+ }
+ waiter= next_waiter;
}
- /*
- We need a full memory barrier between walking the list and clearing
- the flag wakeup_subsequent_commits_running. This barrier is needed
- to ensure that no other thread will start to modify the list
- pointers before we are done traversing the list.
-
- But wait_for_commit::wakeup(), which was called above, does a full
- memory barrier already (it locks a mutex).
- */
- cur->wakeup_subsequent_commits_running= false;
- }
- else
mysql_mutex_unlock(&cur->LOCK_wait_commit);
+ }
}
- if (cur == last)
+
+ /* Add the entry to the group commit queue. */
+ next_entry= entry->next;
+ entry->next= group_commit_queue;
+ group_commit_queue= entry;
+ if (entry == last)
break;
/*
Move to the next entry in the flattened list of waiting transactions
that still need to be processed transitively.
*/
- cur= cur->next_subsequent_commit;
- entry= (group_commit_entry *)cur->opaque_pointer;
+ entry= next_entry;
DBUG_ASSERT(entry != NULL);
+ cur= entry->thd->wait_for_commit_ptr;
}
if (opt_binlog_commit_wait_count > 0)
@@ -7301,6 +7287,7 @@ MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry)
DEBUG_SYNC(entry->thd, "commit_after_group_run_commit_ordered");
}
mysql_mutex_unlock(&LOCK_commit_ordered);
+ entry->thd->wakeup_subsequent_commits(entry->error);
if (next)
{
@@ -7334,7 +7321,7 @@ MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry)
}
if (likely(!entry->error))
- return 0;
+ return entry->thd->wait_for_prior_commit();
switch (entry->error)
{
@@ -7392,10 +7379,15 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
LINT_INIT(binlog_id);
{
+ DBUG_EXECUTE_IF("inject_binlog_commit_before_get_LOCK_log",
+ DBUG_ASSERT(!debug_sync_set_action(leader->thd, STRING_WITH_LEN
+ ("commit_before_get_LOCK_log SIGNAL waiting WAIT_FOR cont TIMEOUT 1")));
+ );
/*
Lock the LOCK_log(), and once we get it, collect any additional writes
that queued up while we were waiting.
*/
+ DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_log");
mysql_mutex_lock(&LOCK_log);
DEBUG_SYNC(leader->thd, "commit_after_get_LOCK_log");
@@ -7602,6 +7594,7 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
if (current->cache_mngr->using_xa && !current->error &&
DBUG_EVALUATE_IF("skip_commit_ordered", 0, 1))
run_commit_ordered(current->thd, current->all);
+ current->thd->wakeup_subsequent_commits(current->error);
/*
Careful not to access current->next after waking up the other thread! As
@@ -7642,7 +7635,6 @@ MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry,
write_cache(entry->thd, mngr->get_binlog_cache_log(FALSE)))
{
entry->error_cache= &mngr->stmt_cache.cache_log;
- entry->commit_errno= errno;
DBUG_RETURN(ER_ERROR_ON_WRITE);
}
@@ -7663,7 +7655,6 @@ MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry,
if (write_cache(entry->thd, mngr->get_binlog_cache_log(TRUE)))
{
entry->error_cache= &mngr->trx_cache.cache_log;
- entry->commit_errno= errno;
DBUG_RETURN(ER_ERROR_ON_WRITE);
}
}
@@ -7671,14 +7662,13 @@ MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry,
DBUG_EXECUTE_IF("inject_error_writing_xid",
{
entry->error_cache= NULL;
- entry->commit_errno= 28;
+ errno= 28;
DBUG_RETURN(ER_ERROR_ON_WRITE);
});
if (entry->end_event->write(&log_file))
{
entry->error_cache= NULL;
- entry->commit_errno= errno;
DBUG_RETURN(ER_ERROR_ON_WRITE);
}
status_var_add(entry->thd->status_var.binlog_bytes_written,
@@ -7689,7 +7679,6 @@ MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry,
if (entry->incident_event->write(&log_file))
{
entry->error_cache= NULL;
- entry->commit_errno= errno;
DBUG_RETURN(ER_ERROR_ON_WRITE);
}
}
@@ -7697,13 +7686,11 @@ MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry,
if (mngr->get_binlog_cache_log(FALSE)->error) // Error on read
{
entry->error_cache= &mngr->stmt_cache.cache_log;
- entry->commit_errno= errno;
DBUG_RETURN(ER_ERROR_ON_WRITE);
}
if (mngr->get_binlog_cache_log(TRUE)->error) // Error on read
{
entry->error_cache= &mngr->trx_cache.cache_log;
- entry->commit_errno= errno;
DBUG_RETURN(ER_ERROR_ON_WRITE);
}