summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorunknown <knielsen@knielsen-hq.org>2012-12-14 15:38:07 +0100
committerunknown <knielsen@knielsen-hq.org>2012-12-14 15:38:07 +0100
commit40bbf697aad7d923fc1bd995bc5f547e45461cbe (patch)
tree1563a1c99589bb5853d35faee4ae49e22b3fdd0a
parente97d6232f366c474f6eba4013bfbd6dacc01d544 (diff)
downloadmariadb-git-40bbf697aad7d923fc1bd995bc5f547e45461cbe.tar.gz
MDEV-532: Async InnoDB commit checkpoint.
Make the commit checkpoint inside InnoDB be asynchroneous. Implement a background thread in binlog to do the writing and flushing of binlog checkpoint events to disk.
-rw-r--r--mysql-test/suite/binlog/r/binlog_checkpoint.result6
-rw-r--r--mysql-test/suite/binlog/r/binlog_xa_recover.result4
-rw-r--r--mysql-test/suite/binlog/t/binlog_checkpoint.test12
-rw-r--r--mysql-test/suite/binlog/t/binlog_xa_recover.test38
-rw-r--r--mysql-test/suite/perfschema/r/all_instances.result3
-rw-r--r--mysql-test/suite/perfschema/r/relaylog.result8
-rw-r--r--sql/debug_sync.cc1
-rw-r--r--sql/log.cc167
-rw-r--r--sql/log.h55
-rw-r--r--sql/mysqld.cc6
-rw-r--r--sql/mysqld.h3
-rw-r--r--sql/rpl_rli.cc3
-rw-r--r--sql/sql_class.h3
-rw-r--r--storage/innobase/handler/ha_innodb.cc143
-rw-r--r--storage/innobase/include/ha_prototypes.h11
-rw-r--r--storage/innobase/include/log0log.h7
-rw-r--r--storage/innobase/include/log0log.ic19
-rw-r--r--storage/innobase/log/log0log.c7
-rw-r--r--storage/xtradb/handler/ha_innodb.cc143
-rw-r--r--storage/xtradb/include/ha_prototypes.h11
-rw-r--r--storage/xtradb/include/log0log.h7
-rw-r--r--storage/xtradb/include/log0log.ic19
-rw-r--r--storage/xtradb/log/log0log.c7
23 files changed, 639 insertions, 44 deletions
diff --git a/mysql-test/suite/binlog/r/binlog_checkpoint.result b/mysql-test/suite/binlog/r/binlog_checkpoint.result
index 7532e33367e..7bfa66e4770 100644
--- a/mysql-test/suite/binlog/r/binlog_checkpoint.result
+++ b/mysql-test/suite/binlog/r/binlog_checkpoint.result
@@ -70,8 +70,14 @@ show binlog events in 'master-bin.000003' from <binlog_start>;
Log_name Pos Event_type Server_id End_log_pos Info
master-bin.000003 # Format_desc # # SERVER_VERSION, BINLOG_VERSION
master-bin.000003 # Binlog_checkpoint # # master-bin.000001
+SET DEBUG_SYNC= "RESET";
+SET @old_dbug= @@global.DEBUG_DBUG;
+SET GLOBAL debug_dbug="+d,binlog_background_checkpoint_processed";
SET DEBUG_SYNC= "now SIGNAL con2_continue";
con1 is still pending, no new binlog checkpoint should have been logged.
+SET DEBUG_SYNC= "now WAIT_FOR binlog_background_checkpoint_processed";
+SET GLOBAL debug_dbug= @old_dbug;
+SET DEBUG_SYNC= "RESET";
show binlog events in 'master-bin.000003' from <binlog_start>;
Log_name Pos Event_type Server_id End_log_pos Info
master-bin.000003 # Format_desc # # SERVER_VERSION, BINLOG_VERSION
diff --git a/mysql-test/suite/binlog/r/binlog_xa_recover.result b/mysql-test/suite/binlog/r/binlog_xa_recover.result
index 0ac14fd7f7d..4f57bd20690 100644
--- a/mysql-test/suite/binlog/r/binlog_xa_recover.result
+++ b/mysql-test/suite/binlog/r/binlog_xa_recover.result
@@ -118,7 +118,11 @@ master-bin.00000<binlog_start> # Table_map # # table_id: # (test.t1)
master-bin.00000<binlog_start> # Write_rows # # table_id: # flags: STMT_END_F
master-bin.00000<binlog_start> # Xid # # COMMIT /* XID */
SET DEBUG_SYNC= "now SIGNAL con10_cont";
+SET @old_dbug= @@global.DEBUG_DBUG;
+SET GLOBAL debug_dbug="+d,binlog_background_checkpoint_processed";
SET DEBUG_SYNC= "now SIGNAL con12_cont";
+SET DEBUG_SYNC= "now WAIT_FOR binlog_background_checkpoint_processed";
+SET GLOBAL debug_dbug= @old_dbug;
SET DEBUG_SYNC= "now SIGNAL con11_cont";
Checking that master-bin.000004 is the last binlog checkpoint
show binlog events in 'master-bin.00000<binlog_start>' from <binlog_start>;
diff --git a/mysql-test/suite/binlog/t/binlog_checkpoint.test b/mysql-test/suite/binlog/t/binlog_checkpoint.test
index 557791c77e5..8c84e51c4df 100644
--- a/mysql-test/suite/binlog/t/binlog_checkpoint.test
+++ b/mysql-test/suite/binlog/t/binlog_checkpoint.test
@@ -71,6 +71,12 @@ SET DEBUG_SYNC= "now WAIT_FOR con2_ready";
--let $binlog_file= master-bin.000003
--source include/show_binlog_events.inc
+# We need to sync the test case with the background processing of the
+# commit checkpoint, otherwise we get nondeterministic results.
+SET DEBUG_SYNC= "RESET";
+SET @old_dbug= @@global.DEBUG_DBUG;
+SET GLOBAL debug_dbug="+d,binlog_background_checkpoint_processed";
+
SET DEBUG_SYNC= "now SIGNAL con2_continue";
connection con2;
@@ -78,6 +84,12 @@ reap;
connection default;
--echo con1 is still pending, no new binlog checkpoint should have been logged.
+# Make sure commit checkpoint is processed before we check that no checkpoint
+# event has been binlogged.
+SET DEBUG_SYNC= "now WAIT_FOR binlog_background_checkpoint_processed";
+SET GLOBAL debug_dbug= @old_dbug;
+SET DEBUG_SYNC= "RESET";
+
--let $binlog_file= master-bin.000003
--source include/show_binlog_events.inc
diff --git a/mysql-test/suite/binlog/t/binlog_xa_recover.test b/mysql-test/suite/binlog/t/binlog_xa_recover.test
index 36b2ddecb4f..e46857b265c 100644
--- a/mysql-test/suite/binlog/t/binlog_xa_recover.test
+++ b/mysql-test/suite/binlog/t/binlog_xa_recover.test
@@ -14,8 +14,24 @@ CREATE TABLE t1 (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Innodb;
# Insert some data to force a couple binlog rotations (3), so we get some
# normal binlog checkpoints before starting the test.
INSERT INTO t1 VALUES (100, REPEAT("x", 4100));
+# Wait for the master-bin.000002 binlog checkpoint to appear.
+--let $wait_for_all= 0
+--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000002"
+--let $field= Info
+--let $condition= = "master-bin.000002"
+--source include/wait_show_condition.inc
INSERT INTO t1 VALUES (101, REPEAT("x", 4100));
+--let $wait_for_all= 0
+--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000003"
+--let $field= Info
+--let $condition= = "master-bin.000003"
+--source include/wait_show_condition.inc
INSERT INTO t1 VALUES (102, REPEAT("x", 4100));
+--let $wait_for_all= 0
+--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000004"
+--let $field= Info
+--let $condition= = "master-bin.000004"
+--source include/wait_show_condition.inc
# Now start a bunch of transactions that span multiple binlog
# files. Leave then in the state prepared-but-not-committed in the engine
@@ -153,10 +169,19 @@ SET DEBUG_SYNC= "now SIGNAL con10_cont";
connection con10;
reap;
connection default;
+
+# We need to sync the test case with the background processing of the
+# commit checkpoint, otherwise we get nondeterministic results.
+SET @old_dbug= @@global.DEBUG_DBUG;
+SET GLOBAL debug_dbug="+d,binlog_background_checkpoint_processed";
+
SET DEBUG_SYNC= "now SIGNAL con12_cont";
connection con12;
reap;
connection default;
+SET DEBUG_SYNC= "now WAIT_FOR binlog_background_checkpoint_processed";
+SET GLOBAL debug_dbug= @old_dbug;
+
SET DEBUG_SYNC= "now SIGNAL con11_cont";
connection con11;
reap;
@@ -210,7 +235,20 @@ RESET MASTER;
# crash recovery fails due to the error insert used for previous test.
INSERT INTO t1 VALUES (21, REPEAT("x", 4100));
INSERT INTO t1 VALUES (22, REPEAT("x", 4100));
+# Wait for the master-bin.000003 binlog checkpoint to appear.
+--let $wait_for_all= 0
+--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000003"
+--let $field= Info
+--let $condition= = "master-bin.000003"
+--source include/wait_show_condition.inc
INSERT INTO t1 VALUES (23, REPEAT("x", 4100));
+# Wait for the last (master-bin.000004) binlog checkpoint to appear.
+--let $wait_for_all= 0
+--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000004"
+--let $field= Info
+--let $condition= = "master-bin.000004"
+--source include/wait_show_condition.inc
+
--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
wait-binlog_xa_recover.test
EOF
diff --git a/mysql-test/suite/perfschema/r/all_instances.result b/mysql-test/suite/perfschema/r/all_instances.result
index 7d3484fc887..d59f17847e2 100644
--- a/mysql-test/suite/perfschema/r/all_instances.result
+++ b/mysql-test/suite/perfschema/r/all_instances.result
@@ -76,6 +76,7 @@ wait/synch/mutex/sql/Master_info::run_lock
wait/synch/mutex/sql/Master_info::sleep_lock
wait/synch/mutex/sql/MDL_map::mutex
wait/synch/mutex/sql/MDL_wait::LOCK_wait_status
+wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_binlog_background_thread
wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_index
wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_xid_list
wait/synch/mutex/sql/MYSQL_RELAY_LOG::LOCK_index
@@ -129,6 +130,8 @@ wait/synch/cond/sql/Master_info::sleep_cond
wait/synch/cond/sql/Master_info::start_cond
wait/synch/cond/sql/Master_info::stop_cond
wait/synch/cond/sql/MDL_context::COND_wait_status
+wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_background_thread
+wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_background_thread_end
wait/synch/cond/sql/MYSQL_BIN_LOG::COND_queue_busy
wait/synch/cond/sql/MYSQL_BIN_LOG::COND_xid_list
wait/synch/cond/sql/MYSQL_BIN_LOG::update_cond
diff --git a/mysql-test/suite/perfschema/r/relaylog.result b/mysql-test/suite/perfschema/r/relaylog.result
index f8ff902245c..7de32ef07d0 100644
--- a/mysql-test/suite/perfschema/r/relaylog.result
+++ b/mysql-test/suite/perfschema/r/relaylog.result
@@ -56,8 +56,11 @@ where event_name like "%MYSQL_BIN_LOG%"
and event_name not like "%MYSQL_BIN_LOG::update_cond"
order by event_name;
EVENT_NAME COUNT_STAR
+wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_background_thread NONE
+wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_background_thread_end NONE
wait/synch/cond/sql/MYSQL_BIN_LOG::COND_queue_busy NONE
wait/synch/cond/sql/MYSQL_BIN_LOG::COND_xid_list NONE
+wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_binlog_background_thread MANY
wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_index MANY
wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_xid_list MANY
"Expect no slave relay log"
@@ -131,8 +134,11 @@ where event_name like "%MYSQL_BIN_LOG%"
and event_name not like "%MYSQL_BIN_LOG::update_cond"
order by event_name;
EVENT_NAME COUNT_STAR
+wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_background_thread MANY
+wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_background_thread_end NONE
wait/synch/cond/sql/MYSQL_BIN_LOG::COND_queue_busy NONE
-wait/synch/cond/sql/MYSQL_BIN_LOG::COND_xid_list NONE
+wait/synch/cond/sql/MYSQL_BIN_LOG::COND_xid_list MANY
+wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_binlog_background_thread MANY
wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_index MANY
wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_xid_list MANY
"Expect a slave relay log"
diff --git a/sql/debug_sync.cc b/sql/debug_sync.cc
index 4097d7fe6e1..1c95869987d 100644
--- a/sql/debug_sync.cc
+++ b/sql/debug_sync.cc
@@ -984,6 +984,7 @@ static bool debug_sync_eval_action(THD *thd, char *action_str)
DBUG_ENTER("debug_sync_eval_action");
DBUG_ASSERT(thd);
DBUG_ASSERT(action_str);
+ DBUG_PRINT("debug_sync", ("action_str='%s'", action_str));
/*
Get debug sync point name. Or a special command.
diff --git a/sql/log.cc b/sql/log.cc
index e664540f0d6..da9d576d1da 100644
--- a/sql/log.cc
+++ b/sql/log.cc
@@ -54,6 +54,7 @@
#include "rpl_handler.h"
#include "debug_sync.h"
#include "sql_show.h"
+#include "my_pthread.h"
/* max size of the log message */
#define MAX_LOG_BUFFER_SIZE 1024
@@ -107,6 +108,17 @@ static SHOW_VAR binlog_status_vars_detail[]=
{NullS, NullS, SHOW_LONG}
};
+/*
+ Variables for the binlog background thread.
+ Protected by the MYSQL_BIN_LOG::LOCK_binlog_background_thread mutex.
+ */
+static bool binlog_background_thread_started= false;
+static bool binlog_background_thread_stop= false;
+static MYSQL_BIN_LOG::xid_count_per_binlog *
+ binlog_background_thread_queue= NULL;
+
+static bool start_binlog_background_thread();
+
/**
purge logs, master and slave sides both, related error code
@@ -2958,12 +2970,28 @@ void MYSQL_BIN_LOG::cleanup()
my_free(b);
}
+ /* Wait for the binlog background thread to stop. */
+ if (!is_relay_log && binlog_background_thread_started)
+ {
+ mysql_mutex_lock(&LOCK_binlog_background_thread);
+ binlog_background_thread_stop= true;
+ mysql_cond_signal(&COND_binlog_background_thread);
+ while (binlog_background_thread_stop)
+ mysql_cond_wait(&COND_binlog_background_thread_end,
+ &LOCK_binlog_background_thread);
+ mysql_mutex_unlock(&LOCK_binlog_background_thread);
+ binlog_background_thread_started= false;
+ }
+
mysql_mutex_destroy(&LOCK_log);
mysql_mutex_destroy(&LOCK_index);
mysql_mutex_destroy(&LOCK_xid_list);
+ mysql_mutex_destroy(&LOCK_binlog_background_thread);
mysql_cond_destroy(&update_cond);
mysql_cond_destroy(&COND_queue_busy);
mysql_cond_destroy(&COND_xid_list);
+ mysql_cond_destroy(&COND_binlog_background_thread);
+ mysql_cond_destroy(&COND_binlog_background_thread_end);
}
DBUG_VOID_RETURN;
}
@@ -2989,6 +3017,13 @@ void MYSQL_BIN_LOG::init_pthread_objects()
mysql_cond_init(m_key_update_cond, &update_cond, 0);
mysql_cond_init(m_key_COND_queue_busy, &COND_queue_busy, 0);
mysql_cond_init(key_BINLOG_COND_xid_list, &COND_xid_list, 0);
+
+ mysql_mutex_init(key_BINLOG_LOCK_binlog_background_thread,
+ &LOCK_binlog_background_thread, MY_MUTEX_INIT_FAST);
+ mysql_cond_init(key_BINLOG_COND_binlog_background_thread,
+ &COND_binlog_background_thread, 0);
+ mysql_cond_init(key_BINLOG_COND_binlog_background_thread_end,
+ &COND_binlog_background_thread_end, 0);
}
@@ -3086,6 +3121,10 @@ bool MYSQL_BIN_LOG::open(const char *log_name,
DBUG_ENTER("MYSQL_BIN_LOG::open");
DBUG_PRINT("enter",("log_type: %d",(int) log_type_arg));
+ if (!is_relay_log && !binlog_background_thread_started &&
+ start_binlog_background_thread())
+ DBUG_RETURN(1);
+
if (init_and_set_log_file_name(log_name, new_name, log_type_arg,
io_cache_type_arg))
{
@@ -5541,11 +5580,7 @@ bool general_log_write(THD *thd, enum enum_server_command command,
}
-/*
- I would like to make this function static, but this causes compiler warnings
- when it is declared as friend function in log.h.
-*/
-void
+static void
binlog_checkpoint_callback(void *cookie)
{
MYSQL_BIN_LOG::xid_count_per_binlog *entry=
@@ -8135,9 +8170,129 @@ int TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid)
void
TC_LOG_BINLOG::commit_checkpoint_notify(void *cookie)
{
- mark_xid_done(((xid_count_per_binlog *)cookie)->binlog_id, true);
+ xid_count_per_binlog *entry= static_cast<xid_count_per_binlog *>(cookie);
+ mysql_mutex_lock(&LOCK_binlog_background_thread);
+ entry->next_in_queue= binlog_background_thread_queue;
+ binlog_background_thread_queue= entry;
+ mysql_cond_signal(&COND_binlog_background_thread);
+ mysql_mutex_unlock(&LOCK_binlog_background_thread);
}
+/*
+ Binlog background thread.
+
+ This thread is used to log binlog checkpoints in the background, rather than
+ in the context of random storage engine threads that happen to call
+ commit_checkpoint_notify_ha() and may not like the delays while syncing
+ binlog to disk or may not be setup with all my_thread_init() and other
+ necessary stuff.
+
+ In the future, this thread could also be used to do log rotation in the
+ background, which could elimiate all stalls around binlog rotations.
+*/
+pthread_handler_t
+binlog_background_thread(void *arg __attribute__((unused)))
+{
+ bool stop;
+ MYSQL_BIN_LOG::xid_count_per_binlog *queue, *next;
+ THD *thd;
+
+ my_thread_init();
+ thd= new THD;
+ thd->system_thread= SYSTEM_THREAD_BINLOG_BACKGROUND;
+ thd->thread_stack= (char*) &thd; /* Set approximate stack start */
+ mysql_mutex_lock(&LOCK_thread_count);
+ thd->thread_id= thread_id++;
+ mysql_mutex_unlock(&LOCK_thread_count);
+ thd->store_globals();
+
+ for (;;)
+ {
+ /*
+ Wait until there is something in the queue to process, or we are asked
+ to shut down.
+ */
+ thd_proc_info(thd, "Waiting for background binlog tasks");
+ mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
+ for (;;)
+ {
+ stop= binlog_background_thread_stop;
+ queue= binlog_background_thread_queue;
+ if (stop || queue)
+ break;
+ mysql_cond_wait(&mysql_bin_log.COND_binlog_background_thread,
+ &mysql_bin_log.LOCK_binlog_background_thread);
+ }
+ /* Grab the queue, if any. */
+ binlog_background_thread_queue= NULL;
+ mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
+
+ /* Process any incoming commit_checkpoint_notify() calls. */
+ while (queue)
+ {
+ thd_proc_info(thd, "Processing binlog checkpoint notification");
+ /* Grab next pointer first, as mark_xid_done() may free the element. */
+ next= queue->next_in_queue;
+ mysql_bin_log.mark_xid_done(queue->binlog_id, true);
+ queue= next;
+
+ DBUG_EXECUTE_IF("binlog_background_checkpoint_processed",
+ DBUG_ASSERT(!debug_sync_set_action(
+ thd,
+ STRING_WITH_LEN("now SIGNAL binlog_background_checkpoint_processed")));
+ );
+ }
+
+ if (stop)
+ break;
+ }
+
+ thd_proc_info(thd, "Stopping binlog background thread");
+
+ mysql_mutex_lock(&LOCK_thread_count);
+ delete thd;
+ mysql_mutex_unlock(&LOCK_thread_count);
+
+ my_thread_end();
+
+ /* Signal that we are (almost) stopped. */
+ mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
+ binlog_background_thread_stop= false;
+ mysql_cond_signal(&mysql_bin_log.COND_binlog_background_thread_end);
+ mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
+
+ return 0;
+}
+
+#ifdef HAVE_PSI_INTERFACE
+static PSI_thread_key key_thread_binlog;
+
+static PSI_thread_info all_binlog_threads[]=
+{
+ { &key_thread_binlog, "binlog_background", PSI_FLAG_GLOBAL},
+};
+#endif /* HAVE_PSI_INTERFACE */
+
+static bool
+start_binlog_background_thread()
+{
+ pthread_t th;
+
+#ifdef HAVE_PSI_INTERFACE
+ if (PSI_server)
+ PSI_server->register_thread("sql", all_binlog_threads,
+ array_elements(all_binlog_threads));
+#endif
+
+ if (mysql_thread_create(key_thread_binlog, &th, NULL,
+ binlog_background_thread, NULL))
+ return 1;
+
+ binlog_background_thread_started= true;
+ return 0;
+}
+
+
int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
IO_CACHE *first_log,
Format_description_log_event *fdle)
diff --git a/sql/log.h b/sql/log.h
index 3959411d83a..9facf309279 100644
--- a/sql/log.h
+++ b/sql/log.h
@@ -395,8 +395,6 @@ private:
#define BINLOG_COOKIE_IS_DUMMY(c) \
( ((ulong)(c)>>1) == BINLOG_COOKIE_DUMMY_ID )
-void binlog_checkpoint_callback(void *cookie);
-
class binlog_cache_mngr;
class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
{
@@ -451,27 +449,6 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
};
/*
- A list of struct xid_count_per_binlog is used to keep track of how many
- XIDs are in prepared, but not committed, state in each binlog. And how
- many commit_checkpoint_request()'s are pending.
-
- When count drops to zero in a binlog after rotation, it means that there
- are no more XIDs in prepared state, so that binlog is no longer needed
- for XA crash recovery, and we can log a new binlog checkpoint event.
-
- The list is protected against simultaneous access from multiple
- threads by LOCK_xid_list.
- */
- struct xid_count_per_binlog : public ilink {
- char *binlog_name;
- uint binlog_name_len;
- ulong binlog_id;
- /* Total prepared XIDs and pending checkpoint requests in this binlog. */
- long xid_count;
- xid_count_per_binlog(); /* Give link error if constructor used. */
- };
- I_List<xid_count_per_binlog> binlog_xid_count_list;
- /*
When this is set, a RESET MASTER is in progress.
Then we should not write any binlog checkpoints into the binlog (that
@@ -480,7 +457,6 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
checkpoint arrives - when all have arrived, RESET MASTER will complete.
*/
bool reset_master_pending;
- friend void binlog_checkpoint_callback(void *cookie);
/* LOCK_log and LOCK_index are inited by init_pthread_objects() */
mysql_mutex_t LOCK_index;
@@ -550,10 +526,35 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
int write_transaction_or_stmt(group_commit_entry *entry);
bool write_transaction_to_binlog_events(group_commit_entry *entry);
void trx_group_commit_leader(group_commit_entry *leader);
- void mark_xid_done(ulong cookie, bool write_checkpoint);
- void mark_xids_active(ulong cookie, uint xid_count);
public:
+ /*
+ A list of struct xid_count_per_binlog is used to keep track of how many
+ XIDs are in prepared, but not committed, state in each binlog. And how
+ many commit_checkpoint_request()'s are pending.
+
+ When count drops to zero in a binlog after rotation, it means that there
+ are no more XIDs in prepared state, so that binlog is no longer needed
+ for XA crash recovery, and we can log a new binlog checkpoint event.
+
+ The list is protected against simultaneous access from multiple
+ threads by LOCK_xid_list.
+ */
+ struct xid_count_per_binlog : public ilink {
+ char *binlog_name;
+ uint binlog_name_len;
+ ulong binlog_id;
+ /* Total prepared XIDs and pending checkpoint requests in this binlog. */
+ long xid_count;
+ /* For linking in requests to the binlog background thread. */
+ xid_count_per_binlog *next_in_queue;
+ xid_count_per_binlog(); /* Give link error if constructor used. */
+ };
+ I_List<xid_count_per_binlog> binlog_xid_count_list;
+ mysql_mutex_t LOCK_binlog_background_thread;
+ mysql_cond_t COND_binlog_background_thread;
+ mysql_cond_t COND_binlog_background_thread_end;
+
using MYSQL_LOG::generate_name;
using MYSQL_LOG::is_open;
@@ -709,6 +710,8 @@ public:
bool appendv(const char* buf,uint len,...);
bool append(Log_event* ev);
+ void mark_xids_active(ulong cookie, uint xid_count);
+ void mark_xid_done(ulong cookie, bool write_checkpoint);
void make_log_name(char* buf, const char* log_ident);
bool is_active(const char* log_file_name);
bool can_purge_log(const char *log_file_name);
diff --git a/sql/mysqld.cc b/sql/mysqld.cc
index 57f7ae49ecb..a3874bb49a6 100644
--- a/sql/mysqld.cc
+++ b/sql/mysqld.cc
@@ -726,6 +726,7 @@ PSI_mutex_key key_LOCK_des_key_file;
#endif /* HAVE_OPENSSL */
PSI_mutex_key key_BINLOG_LOCK_index, key_BINLOG_LOCK_xid_list,
+ key_BINLOG_LOCK_binlog_background_thread,
key_delayed_insert_mutex, key_hash_filo_lock, key_LOCK_active_mi,
key_LOCK_connection_count, key_LOCK_crypt, key_LOCK_delayed_create,
key_LOCK_delayed_insert, key_LOCK_delayed_status, key_LOCK_error_log,
@@ -768,6 +769,7 @@ static PSI_mutex_info all_server_mutexes[]=
{ &key_BINLOG_LOCK_index, "MYSQL_BIN_LOG::LOCK_index", 0},
{ &key_BINLOG_LOCK_xid_list, "MYSQL_BIN_LOG::LOCK_xid_list", 0},
+ { &key_BINLOG_LOCK_binlog_background_thread, "MYSQL_BIN_LOG::LOCK_binlog_background_thread", 0},
{ &key_RELAYLOG_LOCK_index, "MYSQL_RELAY_LOG::LOCK_index", 0},
{ &key_delayed_insert_mutex, "Delayed_insert::mutex", 0},
{ &key_hash_filo_lock, "hash_filo::lock", 0},
@@ -836,6 +838,8 @@ PSI_cond_key key_PAGE_cond, key_COND_active, key_COND_pool;
#endif /* HAVE_MMAP */
PSI_cond_key key_BINLOG_COND_xid_list, key_BINLOG_update_cond,
+ key_BINLOG_COND_binlog_background_thread,
+ key_BINLOG_COND_binlog_background_thread_end,
key_COND_cache_status_changed, key_COND_manager,
key_COND_rpl_status, key_COND_server_started,
key_delayed_insert_cond, key_delayed_insert_cond_client,
@@ -865,6 +869,8 @@ static PSI_cond_info all_server_conds[]=
#endif /* HAVE_MMAP */
{ &key_BINLOG_COND_xid_list, "MYSQL_BIN_LOG::COND_xid_list", 0},
{ &key_BINLOG_update_cond, "MYSQL_BIN_LOG::update_cond", 0},
+ { &key_BINLOG_COND_binlog_background_thread, "MYSQL_BIN_LOG::COND_binlog_background_thread", 0},
+ { &key_BINLOG_COND_binlog_background_thread_end, "MYSQL_BIN_LOG::COND_binlog_background_thread_end", 0},
{ &key_BINLOG_COND_queue_busy, "MYSQL_BIN_LOG::COND_queue_busy", 0},
{ &key_RELAYLOG_update_cond, "MYSQL_RELAY_LOG::update_cond", 0},
{ &key_RELAYLOG_COND_queue_busy, "MYSQL_RELAY_LOG::COND_queue_busy", 0},
diff --git a/sql/mysqld.h b/sql/mysqld.h
index 430811a956c..d346defad8e 100644
--- a/sql/mysqld.h
+++ b/sql/mysqld.h
@@ -228,6 +228,7 @@ extern PSI_mutex_key key_LOCK_des_key_file;
#endif
extern PSI_mutex_key key_BINLOG_LOCK_index, key_BINLOG_LOCK_xid_list,
+ key_BINLOG_LOCK_binlog_background_thread,
key_delayed_insert_mutex, key_hash_filo_lock, key_LOCK_active_mi,
key_LOCK_connection_count, key_LOCK_crypt, key_LOCK_delayed_create,
key_LOCK_delayed_insert, key_LOCK_delayed_status, key_LOCK_error_log,
@@ -259,6 +260,8 @@ extern PSI_cond_key key_PAGE_cond, key_COND_active, key_COND_pool;
#endif /* HAVE_MMAP */
extern PSI_cond_key key_BINLOG_COND_xid_list, key_BINLOG_update_cond,
+ key_BINLOG_COND_binlog_background_thread,
+ key_BINLOG_COND_binlog_background_thread_end,
key_COND_cache_status_changed, key_COND_manager,
key_COND_rpl_status, key_COND_server_started,
key_delayed_insert_cond, key_delayed_insert_cond_client,
diff --git a/sql/rpl_rli.cc b/sql/rpl_rli.cc
index e74831464fa..17919d3c1f1 100644
--- a/sql/rpl_rli.cc
+++ b/sql/rpl_rli.cc
@@ -58,6 +58,7 @@ Relay_log_info::Relay_log_info(bool is_slave_recovery)
{
DBUG_ENTER("Relay_log_info::Relay_log_info");
+ relay_log.is_relay_log= TRUE;
#ifdef HAVE_PSI_INTERFACE
relay_log.set_psi_keys(key_RELAYLOG_LOCK_index,
key_RELAYLOG_update_cond,
@@ -216,8 +217,6 @@ a file name for --relay-log-index option", opt_relaylog_index_name);
&mi->connection_name);
}
- rli->relay_log.is_relay_log= TRUE;
-
/*
note, that if open() fails, we'll still have index file open
but a destructor will take care of that
diff --git a/sql/sql_class.h b/sql/sql_class.h
index 34dcd0ff11d..7f941220140 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -1255,7 +1255,8 @@ enum enum_thread_type
SYSTEM_THREAD_SLAVE_SQL= 4,
SYSTEM_THREAD_NDBCLUSTER_BINLOG= 8,
SYSTEM_THREAD_EVENT_SCHEDULER= 16,
- SYSTEM_THREAD_EVENT_WORKER= 32
+ SYSTEM_THREAD_EVENT_WORKER= 32,
+ SYSTEM_THREAD_BINLOG_BACKGROUND= 64
};
inline char const *
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 911e1165ae3..f8450cb89f0 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -106,6 +106,7 @@ static ulong commit_threads = 0;
static mysql_mutex_t commit_threads_m;
static mysql_cond_t commit_cond;
static mysql_mutex_t commit_cond_m;
+static mysql_mutex_t pending_checkpoint_mutex;
static bool innodb_inited = 0;
#define INSIDE_HA_INNOBASE_CC
@@ -222,11 +223,13 @@ static mysql_pfs_key_t innobase_share_mutex_key;
static mysql_pfs_key_t commit_threads_m_key;
static mysql_pfs_key_t commit_cond_mutex_key;
static mysql_pfs_key_t commit_cond_key;
+static mysql_pfs_key_t pending_checkpoint_mutex_key;
static PSI_mutex_info all_pthread_mutexes[] = {
{&commit_threads_m_key, "commit_threads_m", 0},
{&commit_cond_mutex_key, "commit_cond_mutex", 0},
- {&innobase_share_mutex_key, "innobase_share_mutex", 0}
+ {&innobase_share_mutex_key, "innobase_share_mutex", 0},
+ {&pending_checkpoint_mutex_key, "pending_checkpoint_mutex", 0}
};
static PSI_cond_info all_innodb_conds[] = {
@@ -2601,6 +2604,9 @@ innobase_change_buffering_inited_ok:
mysql_mutex_init(commit_cond_mutex_key,
&commit_cond_m, MY_MUTEX_INIT_FAST);
mysql_cond_init(commit_cond_key, &commit_cond, NULL);
+ mysql_mutex_init(pending_checkpoint_mutex_key,
+ &pending_checkpoint_mutex,
+ MY_MUTEX_INIT_FAST);
innodb_inited= 1;
#ifdef MYSQL_DYNAMIC_PLUGIN
if (innobase_hton != p) {
@@ -2648,6 +2654,7 @@ innobase_end(
mysql_mutex_destroy(&commit_threads_m);
mysql_mutex_destroy(&commit_cond_m);
mysql_cond_destroy(&commit_cond);
+ mysql_mutex_destroy(&pending_checkpoint_mutex);
}
DBUG_RETURN(err);
@@ -3017,17 +3024,145 @@ innobase_rollback_trx(
DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
}
+
+struct pending_checkpoint {
+ struct pending_checkpoint *next;
+ handlerton *hton;
+ void *cookie;
+ ib_uint64_t lsn;
+};
+static struct pending_checkpoint *pending_checkpoint_list;
+static struct pending_checkpoint *pending_checkpoint_list_end;
+
/*****************************************************************//**
Handle a commit checkpoint request from server layer.
-We simply flush the redo log immediately and do the notify call.*/
+We put the request in a queue, so that we can notify upper layer about
+checkpoint complete when we have flushed the redo log.
+If we have already flushed all relevant redo log, we notify immediately.*/
static
void
innobase_checkpoint_request(
handlerton *hton,
void *cookie)
{
- log_buffer_flush_to_disk();
- commit_checkpoint_notify_ha(hton, cookie);
+ ib_uint64_t lsn;
+ ib_uint64_t flush_lsn;
+ struct pending_checkpoint * entry;
+
+ /* Do the allocation outside of lock to reduce contention. The normal
+ case is that not everything is flushed, so we will need to enqueue. */
+ entry = static_cast<struct pending_checkpoint *>
+ (my_malloc(sizeof(*entry), MYF(MY_WME)));
+ if (!entry) {
+ sql_print_error("Failed to allocate %u bytes."
+ " Commit checkpoint will be skipped.",
+ static_cast<unsigned>(sizeof(*entry)));
+ return;
+ }
+
+ entry->next = NULL;
+ entry->hton = hton;
+ entry->cookie = cookie;
+
+ mysql_mutex_lock(&pending_checkpoint_mutex);
+ lsn = log_get_lsn();
+ flush_lsn = log_get_flush_lsn();
+ if (lsn > flush_lsn) {
+ /* Put the request in queue.
+ When the log gets flushed past the lsn, we will remove the
+ entry from the queue and notify the upper layer. */
+ entry->lsn = lsn;
+ if (pending_checkpoint_list_end) {
+ pending_checkpoint_list_end->next = entry;
+ /* There is no need to order the entries in the list
+ by lsn. The upper layer can accept notifications in
+ any order, and short delays in notifications do not
+ significantly impact performance. */
+ } else {
+ pending_checkpoint_list = entry;
+ }
+ pending_checkpoint_list_end = entry;
+ entry = NULL;
+ }
+ mysql_mutex_unlock(&pending_checkpoint_mutex);
+
+ if (entry) {
+ /* We are already flushed. Notify the checkpoint immediately. */
+ commit_checkpoint_notify_ha(entry->hton, entry->cookie);
+ my_free(entry);
+ }
+}
+
+/*****************************************************************//**
+Log code calls this whenever log has been written and/or flushed up
+to a new position. We use this to notify upper layer of a new commit
+checkpoint when necessary.*/
+extern "C" UNIV_INTERN
+void
+innobase_mysql_log_notify(
+/*===============*/
+ ib_uint64_t write_lsn, /*!< in: LSN written to log file */
+ ib_uint64_t flush_lsn) /*!< in: LSN flushed to disk */
+{
+ struct pending_checkpoint * pending;
+ struct pending_checkpoint * entry;
+ struct pending_checkpoint * last_ready;
+
+ /* It is safe to do a quick check for NULL first without lock.
+ Even if we should race, we will at most skip one checkpoint and
+ take the next one, which is harmless. */
+ if (!pending_checkpoint_list)
+ return;
+
+ mysql_mutex_lock(&pending_checkpoint_mutex);
+ pending = pending_checkpoint_list;
+ if (!pending)
+ {
+ mysql_mutex_unlock(&pending_checkpoint_mutex);
+ return;
+ }
+
+ last_ready = NULL;
+ for (entry = pending; entry != NULL; entry = entry -> next)
+ {
+ /* Notify checkpoints up until the first entry that has not
+ been fully flushed to the redo log. Since we do not maintain
+ the list ordered, in principle there could be more entries
+ later than were also flushed. But there is no harm in
+ delaying notifications for those a bit. And in practise, the
+ list is unlikely to have more than one element anyway, as we
+ flush the redo log at least once every second. */
+ if (entry->lsn > flush_lsn)
+ break;
+ last_ready = entry;
+ }
+
+ if (last_ready)
+ {
+ /* We found some pending checkpoints that are now flushed to
+ disk. So remove them from the list. */
+ pending_checkpoint_list = entry;
+ if (!entry)
+ pending_checkpoint_list_end = NULL;
+ }
+
+ mysql_mutex_unlock(&pending_checkpoint_mutex);
+
+ if (!last_ready)
+ return;
+
+ /* Now that we have released the lock, notify upper layer about all
+ commit checkpoints that have now completed. */
+ for (;;) {
+ entry = pending;
+ pending = pending->next;
+
+ commit_checkpoint_notify_ha(entry->hton, entry->cookie);
+
+ my_free(entry);
+ if (entry == last_ready)
+ break;
+ }
}
/*****************************************************************//**
diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h
index edf7a1a28c1..a0ec09c892f 100644
--- a/storage/innobase/include/ha_prototypes.h
+++ b/storage/innobase/include/ha_prototypes.h
@@ -136,6 +136,17 @@ innobase_mysql_print_thd(
uint max_query_len); /*!< in: max query length to print, or 0 to
use the default max length */
+/*****************************************************************//**
+Log code calls this whenever log has been written and/or flushed up
+to a new position. We use this to notify upper layer of a new commit
+checkpoint when necessary.*/
+UNIV_INTERN
+void
+innobase_mysql_log_notify(
+/*===============*/
+ ib_uint64_t write_lsn, /*!< in: LSN written to log file */
+ ib_uint64_t flush_lsn); /*!< in: LSN flushed to disk */
+
/**************************************************************//**
Converts a MySQL type to an InnoDB type. Note that this function returns
the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
index 30250d4fd27..0b8d7b6fa03 100644
--- a/storage/innobase/include/log0log.h
+++ b/storage/innobase/include/log0log.h
@@ -151,6 +151,13 @@ UNIV_INLINE
ib_uint64_t
log_get_lsn(void);
/*=============*/
+/************************************************************//**
+Gets the last lsn that is fully flushed to disk.
+@return last flushed lsn */
+UNIV_INLINE
+ib_uint64_t
+log_get_flush_lsn(void);
+/*=============*/
/****************************************************************
Gets the log group capacity. It is OK to read the value without
holding log_sys->mutex because it is constant.
diff --git a/storage/innobase/include/log0log.ic b/storage/innobase/include/log0log.ic
index 67db6695cab..b54697637b0 100644
--- a/storage/innobase/include/log0log.ic
+++ b/storage/innobase/include/log0log.ic
@@ -411,6 +411,25 @@ log_get_lsn(void)
return(lsn);
}
+/************************************************************//**
+Gets the last lsn that is fully flushed to disk.
+@return last flushed lsn */
+UNIV_INLINE
+ib_uint64_t
+log_get_flush_lsn(void)
+/*=============*/
+{
+ ib_uint64_t lsn;
+
+ mutex_enter(&(log_sys->mutex));
+
+ lsn = log_sys->flushed_to_disk_lsn;
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(lsn);
+}
+
/****************************************************************
Gets the log group capacity. It is OK to read the value without
holding log_sys->mutex because it is constant.
diff --git a/storage/innobase/log/log0log.c b/storage/innobase/log/log0log.c
index 8bae95f0a5d..3914e34ed7d 100644
--- a/storage/innobase/log/log0log.c
+++ b/storage/innobase/log/log0log.c
@@ -1353,6 +1353,8 @@ log_write_up_to(
ulint loop_count = 0;
#endif /* UNIV_DEBUG */
ulint unlock;
+ ib_uint64_t write_lsn;
+ ib_uint64_t flush_lsn;
if (recv_no_ibuf_operations) {
/* Recovery is running and no operations on the log files are
@@ -1530,8 +1532,13 @@ loop:
log_flush_do_unlocks(unlock);
+ write_lsn = log_sys->write_lsn;
+ flush_lsn = log_sys->flushed_to_disk_lsn;
+
mutex_exit(&(log_sys->mutex));
+ innobase_mysql_log_notify(write_lsn, flush_lsn);
+
return;
do_waits:
diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc
index 653607c9381..a5873bc05d3 100644
--- a/storage/xtradb/handler/ha_innodb.cc
+++ b/storage/xtradb/handler/ha_innodb.cc
@@ -121,6 +121,7 @@ static ulong commit_threads = 0;
static mysql_mutex_t commit_threads_m;
static mysql_cond_t commit_cond;
static mysql_mutex_t commit_cond_m;
+static mysql_mutex_t pending_checkpoint_mutex;
static bool innodb_inited = 0;
@@ -254,11 +255,13 @@ static mysql_pfs_key_t innobase_share_mutex_key;
static mysql_pfs_key_t commit_threads_m_key;
static mysql_pfs_key_t commit_cond_mutex_key;
static mysql_pfs_key_t commit_cond_key;
+static mysql_pfs_key_t pending_checkpoint_mutex_key;
static PSI_mutex_info all_pthread_mutexes[] = {
{&commit_threads_m_key, "commit_threads_m", 0},
{&commit_cond_mutex_key, "commit_cond_mutex", 0},
- {&innobase_share_mutex_key, "innobase_share_mutex", 0}
+ {&innobase_share_mutex_key, "innobase_share_mutex", 0},
+ {&pending_checkpoint_mutex_key, "pending_checkpoint_mutex", 0}
};
static PSI_cond_info all_innodb_conds[] = {
@@ -3088,6 +3091,9 @@ skip_overwrite:
mysql_mutex_init(commit_cond_mutex_key,
&commit_cond_m, MY_MUTEX_INIT_FAST);
mysql_cond_init(commit_cond_key, &commit_cond, NULL);
+ mysql_mutex_init(pending_checkpoint_mutex_key,
+ &pending_checkpoint_mutex,
+ MY_MUTEX_INIT_FAST);
innodb_inited= 1;
#ifdef MYSQL_DYNAMIC_PLUGIN
if (innobase_hton != p) {
@@ -3135,6 +3141,7 @@ innobase_end(
mysql_mutex_destroy(&commit_threads_m);
mysql_mutex_destroy(&commit_cond_m);
mysql_cond_destroy(&commit_cond);
+ mysql_mutex_destroy(&pending_checkpoint_mutex);
}
DBUG_RETURN(err);
@@ -3530,17 +3537,145 @@ innobase_rollback_trx(
DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
}
+
+struct pending_checkpoint {
+ struct pending_checkpoint *next;
+ handlerton *hton;
+ void *cookie;
+ ib_uint64_t lsn;
+};
+static struct pending_checkpoint *pending_checkpoint_list;
+static struct pending_checkpoint *pending_checkpoint_list_end;
+
/*****************************************************************//**
Handle a commit checkpoint request from server layer.
-We simply flush the redo log immediately and do the notify call.*/
+We put the request in a queue, so that we can notify upper layer about
+checkpoint complete when we have flushed the redo log.
+If we have already flushed all relevant redo log, we notify immediately.*/
static
void
innobase_checkpoint_request(
handlerton *hton,
void *cookie)
{
- log_buffer_flush_to_disk();
- commit_checkpoint_notify_ha(hton, cookie);
+ ib_uint64_t lsn;
+ ib_uint64_t flush_lsn;
+ struct pending_checkpoint * entry;
+
+ /* Do the allocation outside of lock to reduce contention. The normal
+ case is that not everything is flushed, so we will need to enqueue. */
+ entry = static_cast<struct pending_checkpoint *>
+ (my_malloc(sizeof(*entry), MYF(MY_WME)));
+ if (!entry) {
+ sql_print_error("Failed to allocate %u bytes."
+ " Commit checkpoint will be skipped.",
+ static_cast<unsigned>(sizeof(*entry)));
+ return;
+ }
+
+ entry->next = NULL;
+ entry->hton = hton;
+ entry->cookie = cookie;
+
+ mysql_mutex_lock(&pending_checkpoint_mutex);
+ lsn = log_get_lsn();
+ flush_lsn = log_get_flush_lsn();
+ if (lsn > flush_lsn) {
+ /* Put the request in queue.
+ When the log gets flushed past the lsn, we will remove the
+ entry from the queue and notify the upper layer. */
+ entry->lsn = lsn;
+ if (pending_checkpoint_list_end) {
+ pending_checkpoint_list_end->next = entry;
+ /* There is no need to order the entries in the list
+ by lsn. The upper layer can accept notifications in
+ any order, and short delays in notifications do not
+ significantly impact performance. */
+ } else {
+ pending_checkpoint_list = entry;
+ }
+ pending_checkpoint_list_end = entry;
+ entry = NULL;
+ }
+ mysql_mutex_unlock(&pending_checkpoint_mutex);
+
+ if (entry) {
+ /* We are already flushed. Notify the checkpoint immediately. */
+ commit_checkpoint_notify_ha(entry->hton, entry->cookie);
+ my_free(entry);
+ }
+}
+
+/*****************************************************************//**
+Log code calls this whenever log has been written and/or flushed up
+to a new position. We use this to notify upper layer of a new commit
+checkpoint when necessary.*/
+extern "C" UNIV_INTERN
+void
+innobase_mysql_log_notify(
+/*===============*/
+ ib_uint64_t write_lsn, /*!< in: LSN written to log file */
+ ib_uint64_t flush_lsn) /*!< in: LSN flushed to disk */
+{
+ struct pending_checkpoint * pending;
+ struct pending_checkpoint * entry;
+ struct pending_checkpoint * last_ready;
+
+ /* It is safe to do a quick check for NULL first without lock.
+ Even if we should race, we will at most skip one checkpoint and
+ take the next one, which is harmless. */
+ if (!pending_checkpoint_list)
+ return;
+
+ mysql_mutex_lock(&pending_checkpoint_mutex);
+ pending = pending_checkpoint_list;
+ if (!pending)
+ {
+ mysql_mutex_unlock(&pending_checkpoint_mutex);
+ return;
+ }
+
+ last_ready = NULL;
+ for (entry = pending; entry != NULL; entry = entry -> next)
+ {
+ /* Notify checkpoints up until the first entry that has not
+ been fully flushed to the redo log. Since we do not maintain
+ the list ordered, in principle there could be more entries
+ later than were also flushed. But there is no harm in
+ delaying notifications for those a bit. And in practise, the
+ list is unlikely to have more than one element anyway, as we
+ flush the redo log at least once every second. */
+ if (entry->lsn > flush_lsn)
+ break;
+ last_ready = entry;
+ }
+
+ if (last_ready)
+ {
+ /* We found some pending checkpoints that are now flushed to
+ disk. So remove them from the list. */
+ pending_checkpoint_list = entry;
+ if (!entry)
+ pending_checkpoint_list_end = NULL;
+ }
+
+ mysql_mutex_unlock(&pending_checkpoint_mutex);
+
+ if (!last_ready)
+ return;
+
+ /* Now that we have released the lock, notify upper layer about all
+ commit checkpoints that have now completed. */
+ for (;;) {
+ entry = pending;
+ pending = pending->next;
+
+ commit_checkpoint_notify_ha(entry->hton, entry->cookie);
+
+ my_free(entry);
+ if (entry == last_ready)
+ break;
+ }
}
/*****************************************************************//**
diff --git a/storage/xtradb/include/ha_prototypes.h b/storage/xtradb/include/ha_prototypes.h
index 2907365a32a..890bf33ac02 100644
--- a/storage/xtradb/include/ha_prototypes.h
+++ b/storage/xtradb/include/ha_prototypes.h
@@ -136,6 +136,17 @@ innobase_mysql_print_thd(
uint max_query_len); /*!< in: max query length to print, or 0 to
use the default max length */
+/*****************************************************************//**
+Log code calls this whenever log has been written and/or flushed up
+to a new position. We use this to notify upper layer of a new commit
+checkpoint when necessary.*/
+UNIV_INTERN
+void
+innobase_mysql_log_notify(
+/*===============*/
+ ib_uint64_t write_lsn, /*!< in: LSN written to log file */
+ ib_uint64_t flush_lsn); /*!< in: LSN flushed to disk */
+
/**************************************************************//**
Converts a MySQL type to an InnoDB type. Note that this function returns
the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
diff --git a/storage/xtradb/include/log0log.h b/storage/xtradb/include/log0log.h
index 857ec0946c2..8a6430fb105 100644
--- a/storage/xtradb/include/log0log.h
+++ b/storage/xtradb/include/log0log.h
@@ -151,6 +151,13 @@ UNIV_INLINE
ib_uint64_t
log_get_lsn(void);
/*=============*/
+/************************************************************//**
+Gets the last lsn that is fully flushed to disk.
+@return last flushed lsn */
+UNIV_INLINE
+ib_uint64_t
+log_get_flush_lsn(void);
+/*=============*/
/****************************************************************
Gets the log group capacity. It is OK to read the value without
holding log_sys->mutex because it is constant.
diff --git a/storage/xtradb/include/log0log.ic b/storage/xtradb/include/log0log.ic
index 67db6695cab..b54697637b0 100644
--- a/storage/xtradb/include/log0log.ic
+++ b/storage/xtradb/include/log0log.ic
@@ -411,6 +411,25 @@ log_get_lsn(void)
return(lsn);
}
+/************************************************************//**
+Gets the last lsn that is fully flushed to disk.
+@return last flushed lsn */
+UNIV_INLINE
+ib_uint64_t
+log_get_flush_lsn(void)
+/*=============*/
+{
+ ib_uint64_t lsn;
+
+ mutex_enter(&(log_sys->mutex));
+
+ lsn = log_sys->flushed_to_disk_lsn;
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(lsn);
+}
+
/****************************************************************
Gets the log group capacity. It is OK to read the value without
holding log_sys->mutex because it is constant.
diff --git a/storage/xtradb/log/log0log.c b/storage/xtradb/log/log0log.c
index dcaf951a0ed..4f8133b3036 100644
--- a/storage/xtradb/log/log0log.c
+++ b/storage/xtradb/log/log0log.c
@@ -1390,6 +1390,8 @@ log_write_up_to(
ulint loop_count = 0;
#endif /* UNIV_DEBUG */
ulint unlock;
+ ib_uint64_t write_lsn;
+ ib_uint64_t flush_lsn;
if (recv_no_ibuf_operations) {
/* Recovery is running and no operations on the log files are
@@ -1568,8 +1570,13 @@ loop:
log_flush_do_unlocks(unlock);
+ write_lsn = log_sys->write_lsn;
+ flush_lsn = log_sys->flushed_to_disk_lsn;
+
mutex_exit(&(log_sys->mutex));
+ innobase_mysql_log_notify(write_lsn, flush_lsn);
+
return;
do_waits: