summaryrefslogtreecommitdiff
path: root/storage/xtradb
diff options
context:
space:
mode:
authorunknown <knielsen@knielsen-hq.org>2012-12-14 15:38:07 +0100
committerunknown <knielsen@knielsen-hq.org>2012-12-14 15:38:07 +0100
commit40bbf697aad7d923fc1bd995bc5f547e45461cbe (patch)
tree1563a1c99589bb5853d35faee4ae49e22b3fdd0a /storage/xtradb
parente97d6232f366c474f6eba4013bfbd6dacc01d544 (diff)
downloadmariadb-git-40bbf697aad7d923fc1bd995bc5f547e45461cbe.tar.gz
MDEV-532: Async InnoDB commit checkpoint.
Make the commit checkpoint inside InnoDB be asynchronous. Implement a background thread in binlog to do the writing and flushing of binlog checkpoint events to disk.
Diffstat (limited to 'storage/xtradb')
-rw-r--r--storage/xtradb/handler/ha_innodb.cc143
-rw-r--r--storage/xtradb/include/ha_prototypes.h11
-rw-r--r--storage/xtradb/include/log0log.h7
-rw-r--r--storage/xtradb/include/log0log.ic19
-rw-r--r--storage/xtradb/log/log0log.c7
5 files changed, 183 insertions, 4 deletions
diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc
index 653607c9381..a5873bc05d3 100644
--- a/storage/xtradb/handler/ha_innodb.cc
+++ b/storage/xtradb/handler/ha_innodb.cc
@@ -121,6 +121,7 @@ static ulong commit_threads = 0;
static mysql_mutex_t commit_threads_m;
static mysql_cond_t commit_cond;
static mysql_mutex_t commit_cond_m;
+static mysql_mutex_t pending_checkpoint_mutex;
static bool innodb_inited = 0;
@@ -254,11 +255,13 @@ static mysql_pfs_key_t innobase_share_mutex_key;
static mysql_pfs_key_t commit_threads_m_key;
static mysql_pfs_key_t commit_cond_mutex_key;
static mysql_pfs_key_t commit_cond_key;
+static mysql_pfs_key_t pending_checkpoint_mutex_key;
static PSI_mutex_info all_pthread_mutexes[] = {
{&commit_threads_m_key, "commit_threads_m", 0},
{&commit_cond_mutex_key, "commit_cond_mutex", 0},
- {&innobase_share_mutex_key, "innobase_share_mutex", 0}
+ {&innobase_share_mutex_key, "innobase_share_mutex", 0},
+ {&pending_checkpoint_mutex_key, "pending_checkpoint_mutex", 0}
};
static PSI_cond_info all_innodb_conds[] = {
@@ -3088,6 +3091,9 @@ skip_overwrite:
mysql_mutex_init(commit_cond_mutex_key,
&commit_cond_m, MY_MUTEX_INIT_FAST);
mysql_cond_init(commit_cond_key, &commit_cond, NULL);
+ mysql_mutex_init(pending_checkpoint_mutex_key,
+ &pending_checkpoint_mutex,
+ MY_MUTEX_INIT_FAST);
innodb_inited= 1;
#ifdef MYSQL_DYNAMIC_PLUGIN
if (innobase_hton != p) {
@@ -3135,6 +3141,7 @@ innobase_end(
mysql_mutex_destroy(&commit_threads_m);
mysql_mutex_destroy(&commit_cond_m);
mysql_cond_destroy(&commit_cond);
+ mysql_mutex_destroy(&pending_checkpoint_mutex);
}
DBUG_RETURN(err);
@@ -3530,17 +3537,145 @@ innobase_rollback_trx(
DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
}
+
+struct pending_checkpoint {
+ struct pending_checkpoint *next;
+ handlerton *hton;
+ void *cookie;
+ ib_uint64_t lsn;
+};
+static struct pending_checkpoint *pending_checkpoint_list;
+static struct pending_checkpoint *pending_checkpoint_list_end;
+
/*****************************************************************//**
Handle a commit checkpoint request from server layer.
-We simply flush the redo log immediately and do the notify call.*/
+We put the request in a queue, so that we can notify the upper layer that
+the checkpoint is complete once we have flushed the redo log.
+If we have already flushed all relevant redo log, we notify immediately.*/
static
void
innobase_checkpoint_request(
handlerton *hton,
void *cookie)
{
- log_buffer_flush_to_disk();
- commit_checkpoint_notify_ha(hton, cookie);
+ ib_uint64_t lsn;
+ ib_uint64_t flush_lsn;
+ struct pending_checkpoint * entry;
+
+ /* Do the allocation outside of lock to reduce contention. The normal
+ case is that not everything is flushed, so we will need to enqueue. */
+ entry = static_cast<struct pending_checkpoint *>
+ (my_malloc(sizeof(*entry), MYF(MY_WME)));
+ if (!entry) {
+ sql_print_error("Failed to allocate %u bytes."
+ " Commit checkpoint will be skipped.",
+ static_cast<unsigned>(sizeof(*entry)));
+ return;
+ }
+
+ entry->next = NULL;
+ entry->hton = hton;
+ entry->cookie = cookie;
+
+ mysql_mutex_lock(&pending_checkpoint_mutex);
+ lsn = log_get_lsn();
+ flush_lsn = log_get_flush_lsn();
+ if (lsn > flush_lsn) {
+ /* Put the request in queue.
+ When the log gets flushed past the lsn, we will remove the
+ entry from the queue and notify the upper layer. */
+ entry->lsn = lsn;
+ if (pending_checkpoint_list_end) {
+ pending_checkpoint_list_end->next = entry;
+ /* There is no need to order the entries in the list
+ by lsn. The upper layer can accept notifications in
+ any order, and short delays in notifications do not
+ significantly impact performance. */
+ } else {
+ pending_checkpoint_list = entry;
+ }
+ pending_checkpoint_list_end = entry;
+ entry = NULL;
+ }
+ mysql_mutex_unlock(&pending_checkpoint_mutex);
+
+ if (entry) {
+ /* We are already flushed. Notify the checkpoint immediately. */
+ commit_checkpoint_notify_ha(entry->hton, entry->cookie);
+ my_free(entry);
+ }
+}
+
+/*****************************************************************//**
+Log code calls this whenever log has been written and/or flushed up
+to a new position. We use this to notify upper layer of a new commit
+checkpoint when necessary.*/
+extern "C" UNIV_INTERN
+void
+innobase_mysql_log_notify(
+/*===============*/
+ ib_uint64_t write_lsn, /*!< in: LSN written to log file */
+ ib_uint64_t flush_lsn) /*!< in: LSN flushed to disk */
+{
+ struct pending_checkpoint * pending;
+ struct pending_checkpoint * entry;
+ struct pending_checkpoint * last_ready;
+
+ /* It is safe to do a quick check for NULL first without lock.
+ Even if we should race, we will at most skip one checkpoint and
+ take the next one, which is harmless. */
+ if (!pending_checkpoint_list)
+ return;
+
+ mysql_mutex_lock(&pending_checkpoint_mutex);
+ pending = pending_checkpoint_list;
+ if (!pending)
+ {
+ mysql_mutex_unlock(&pending_checkpoint_mutex);
+ return;
+ }
+
+ last_ready = NULL;
+ for (entry = pending; entry != NULL; entry = entry -> next)
+ {
+ /* Notify checkpoints up until the first entry that has not
+ been fully flushed to the redo log. Since we do not maintain
+ the list ordered, in principle there could be more entries
+ later that were also flushed. But there is no harm in
+ delaying notifications for those a bit. And in practice, the
+ list is unlikely to have more than one element anyway, as we
+ flush the redo log at least once every second. */
+ if (entry->lsn > flush_lsn)
+ break;
+ last_ready = entry;
+ }
+
+ if (last_ready)
+ {
+ /* We found some pending checkpoints that are now flushed to
+ disk. So remove them from the list. */
+ pending_checkpoint_list = entry;
+ if (!entry)
+ pending_checkpoint_list_end = NULL;
+ }
+
+ mysql_mutex_unlock(&pending_checkpoint_mutex);
+
+ if (!last_ready)
+ return;
+
+ /* Now that we have released the lock, notify upper layer about all
+ commit checkpoints that have now completed. */
+ for (;;) {
+ entry = pending;
+ pending = pending->next;
+
+ commit_checkpoint_notify_ha(entry->hton, entry->cookie);
+
+ my_free(entry);
+ if (entry == last_ready)
+ break;
+ }
}
/*****************************************************************//**
diff --git a/storage/xtradb/include/ha_prototypes.h b/storage/xtradb/include/ha_prototypes.h
index 2907365a32a..890bf33ac02 100644
--- a/storage/xtradb/include/ha_prototypes.h
+++ b/storage/xtradb/include/ha_prototypes.h
@@ -136,6 +136,17 @@ innobase_mysql_print_thd(
uint max_query_len); /*!< in: max query length to print, or 0 to
use the default max length */
+/*****************************************************************//**
+Log code calls this whenever log has been written and/or flushed up
+to a new position. We use this to notify upper layer of a new commit
+checkpoint when necessary.*/
+UNIV_INTERN
+void
+innobase_mysql_log_notify(
+/*===============*/
+ ib_uint64_t write_lsn, /*!< in: LSN written to log file */
+ ib_uint64_t flush_lsn); /*!< in: LSN flushed to disk */
+
/**************************************************************//**
Converts a MySQL type to an InnoDB type. Note that this function returns
the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
diff --git a/storage/xtradb/include/log0log.h b/storage/xtradb/include/log0log.h
index 857ec0946c2..8a6430fb105 100644
--- a/storage/xtradb/include/log0log.h
+++ b/storage/xtradb/include/log0log.h
@@ -151,6 +151,13 @@ UNIV_INLINE
ib_uint64_t
log_get_lsn(void);
/*=============*/
+/************************************************************//**
+Gets the last lsn that is fully flushed to disk.
+@return last flushed lsn */
+UNIV_INLINE
+ib_uint64_t
+log_get_flush_lsn(void);
+/*=============*/
/****************************************************************
Gets the log group capacity. It is OK to read the value without
holding log_sys->mutex because it is constant.
diff --git a/storage/xtradb/include/log0log.ic b/storage/xtradb/include/log0log.ic
index 67db6695cab..b54697637b0 100644
--- a/storage/xtradb/include/log0log.ic
+++ b/storage/xtradb/include/log0log.ic
@@ -411,6 +411,25 @@ log_get_lsn(void)
return(lsn);
}
+/************************************************************//**
+Gets the last lsn that is fully flushed to disk.
+@return last flushed lsn */
+UNIV_INLINE
+ib_uint64_t
+log_get_flush_lsn(void)
+/*=============*/
+{
+ ib_uint64_t lsn;
+
+ mutex_enter(&(log_sys->mutex));
+
+ lsn = log_sys->flushed_to_disk_lsn;
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(lsn);
+}
+
/****************************************************************
Gets the log group capacity. It is OK to read the value without
holding log_sys->mutex because it is constant.
diff --git a/storage/xtradb/log/log0log.c b/storage/xtradb/log/log0log.c
index dcaf951a0ed..4f8133b3036 100644
--- a/storage/xtradb/log/log0log.c
+++ b/storage/xtradb/log/log0log.c
@@ -1390,6 +1390,8 @@ log_write_up_to(
ulint loop_count = 0;
#endif /* UNIV_DEBUG */
ulint unlock;
+ ib_uint64_t write_lsn;
+ ib_uint64_t flush_lsn;
if (recv_no_ibuf_operations) {
/* Recovery is running and no operations on the log files are
@@ -1568,8 +1570,13 @@ loop:
log_flush_do_unlocks(unlock);
+ write_lsn = log_sys->write_lsn;
+ flush_lsn = log_sys->flushed_to_disk_lsn;
+
mutex_exit(&(log_sys->mutex));
+ innobase_mysql_log_notify(write_lsn, flush_lsn);
+
return;
do_waits: