Diffstat (limited to 'sql/log.cc')
-rw-r--r--  sql/log.cc | 1653
1 files changed, 1281 insertions, 372 deletions
diff --git a/sql/log.cc b/sql/log.cc
index 21ad951f3c9..c28357c3f78 100644
--- a/sql/log.cc
+++ b/sql/log.cc
@@ -38,6 +38,7 @@
#endif
#include <mysql/plugin.h>
+#include "debug_sync.h"
/* max size of the log message */
#define MAX_LOG_BUFFER_SIZE 1024
@@ -61,6 +62,39 @@ static int binlog_savepoint_rollback(handlerton *hton, THD *thd, void *sv);
static int binlog_commit(handlerton *hton, THD *thd, bool all);
static int binlog_rollback(handlerton *hton, THD *thd, bool all);
static int binlog_prepare(handlerton *hton, THD *thd, bool all);
+static int binlog_start_consistent_snapshot(handlerton *hton, THD *thd);
+
+static LEX_STRING const write_error_msg=
+ { C_STRING_WITH_LEN("error writing to the binary log") };
+
+static my_bool opt_optimize_thread_scheduling= TRUE;
+ulong binlog_checksum_options;
+#ifndef DBUG_OFF
+ulong opt_binlog_dbug_fsync_sleep= 0;
+#endif
+
+static my_bool mutexes_inited;
+pthread_mutex_t LOCK_prepare_ordered;
+pthread_mutex_t LOCK_commit_ordered;
+
+static ulonglong binlog_status_var_num_commits;
+static ulonglong binlog_status_var_num_group_commits;
+static char binlog_snapshot_file[FN_REFLEN];
+static ulonglong binlog_snapshot_position;
+
+static SHOW_VAR binlog_status_vars_detail[]=
+{
+ {"commits",
+ (char *)&binlog_status_var_num_commits, SHOW_LONGLONG},
+ {"group_commits",
+ (char *)&binlog_status_var_num_group_commits, SHOW_LONGLONG},
+ {"snapshot_file",
+ (char *)&binlog_snapshot_file, SHOW_CHAR},
+ {"snapshot_position",
+ (char *)&binlog_snapshot_position, SHOW_LONGLONG},
+ {NullS, NullS, SHOW_LONG}
+};
+
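The counters and snapshot coordinates above surface through SHOW GLOBAL STATUS once the detail array is hooked into the server's status-variable list. That registration is not shown in this hunk; a minimal sketch of the usual pattern (the top-level array name is an assumption) is:

/* Sketch: a SHOW_ARRAY entry prefixes the detail array with "binlog",
   so the values appear as e.g. Binlog_commits, Binlog_group_commits,
   Binlog_snapshot_file and Binlog_snapshot_position. */
static SHOW_VAR binlog_status_vars_top[]=
{
  {"binlog", (char *) &binlog_status_vars_detail, SHOW_ARRAY},
  {NullS, NullS, SHOW_LONG}
};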
/**
Silence all errors and warnings reported when performing a write
@@ -134,50 +168,17 @@ char *make_once_alloced_filename(const char *basename, const char *ext)
/*
- Helper class to hold a mutex for the duration of the
- block.
-
- Eliminates the need for explicit unlocking of mutexes on, e.g.,
- error returns. On passing a null pointer, the sentry will not do
- anything.
- */
-class Mutex_sentry
-{
-public:
- Mutex_sentry(pthread_mutex_t *mutex)
- : m_mutex(mutex)
- {
- if (m_mutex)
- pthread_mutex_lock(mutex);
- }
-
- ~Mutex_sentry()
- {
- if (m_mutex)
- pthread_mutex_unlock(m_mutex);
-#ifndef DBUG_OFF
- m_mutex= 0;
-#endif
- }
-
-private:
- pthread_mutex_t *m_mutex;
-
- // It's not allowed to copy this object in any way
- Mutex_sentry(Mutex_sentry const&);
- void operator=(Mutex_sentry const&);
-};
-
-/*
Helper class to store binary log transaction data.
*/
class binlog_trx_data {
public:
binlog_trx_data()
: at_least_one_stmt_committed(0), incident(FALSE), m_pending(0),
- before_stmt_pos(MY_OFF_T_UNDEF)
+ before_stmt_pos(MY_OFF_T_UNDEF), last_commit_pos_offset(0),
+ using_xa(FALSE), xa_xid(0)
{
trans_log.end_of_file= max_binlog_cache_size;
+ last_commit_pos_file[0]= 0;
}
~binlog_trx_data()
@@ -229,11 +230,14 @@ public:
completely.
*/
void reset() {
- if (!empty())
+ if (trans_log.type != WRITE_CACHE || !empty())
truncate(0);
before_stmt_pos= MY_OFF_T_UNDEF;
incident= FALSE;
trans_log.end_of_file= max_binlog_cache_size;
+ using_xa= FALSE;
+ last_commit_pos_file[0]= 0;
+ last_commit_pos_offset= 0;
DBUG_ASSERT(empty());
}
@@ -278,6 +282,22 @@ public:
Binlog position before the start of the current statement.
*/
my_off_t before_stmt_pos;
+ /*
+ Binlog position for current transaction.
+ For START TRANSACTION WITH CONSISTENT SNAPSHOT, this is the binlog
+ position corresponding to the snapshot taken. During (and after) commit,
+ this is set to the binlog position corresponding to just after the
+ commit (so storage engines can store it in their transaction log).
+ */
+ char last_commit_pos_file[FN_REFLEN];
+ my_off_t last_commit_pos_offset;
+
+ /*
+ Flag set true if this transaction is committed with log_xid() as part of
+ XA, false if not.
+ */
+ bool using_xa;
+ my_xid xa_xid;
};
handlerton *binlog_hton;
@@ -376,7 +396,7 @@ void Log_to_csv_event_handler::cleanup()
*/
bool Log_to_csv_event_handler::
- log_general(THD *thd, time_t event_time, const char *user_host,
+ log_general(THD *thd, my_hrtime_t event_time, const char *user_host,
uint user_host_len, int thread_id,
const char *command_type, uint command_type_len,
const char *sql_text, uint sql_text_len,
@@ -458,8 +478,8 @@ bool Log_to_csv_event_handler::
DBUG_ASSERT(table->field[0]->type() == MYSQL_TYPE_TIMESTAMP);
- ((Field_timestamp*) table->field[0])->store_timestamp((my_time_t)
- event_time);
+ ((Field_timestamp*) table->field[0])->store_TIME(
+ hrtime_to_my_time(event_time), hrtime_sec_part(event_time));
/* do a write */
if (table->field[1]->store(user_host, user_host_len, client_cs) ||
@@ -523,7 +543,6 @@ err:
log_slow()
thd THD of the query
current_time current timestamp
- query_start_arg command start timestamp
user_host the pointer to the string with user@host info
user_host_len length of the user_host string. this is computed once
and passed to all general log event handlers
@@ -546,7 +565,7 @@ err:
*/
bool Log_to_csv_event_handler::
- log_slow(THD *thd, time_t current_time, time_t query_start_arg,
+ log_slow(THD *thd, my_hrtime_t current_time,
const char *user_host, uint user_host_len,
ulonglong query_utime, ulonglong lock_utime, bool is_command,
const char *sql_text, uint sql_text_len)
@@ -560,6 +579,11 @@ bool Log_to_csv_event_handler::
Open_tables_state open_tables_backup;
CHARSET_INFO *client_cs= thd->variables.character_set_client;
bool save_time_zone_used;
+ long query_time= (long) min(query_utime/1000000, TIME_MAX_VALUE_SECONDS);
+ long lock_time= (long) min(lock_utime/1000000, TIME_MAX_VALUE_SECONDS);
+ long query_time_micro= (long) (query_utime % 1000000);
+ long lock_time_micro= (long) (lock_utime % 1000000);
+
DBUG_ENTER("Log_to_csv_event_handler::log_slow");
thd->push_internal_handler(& error_handler);
@@ -601,45 +625,34 @@ bool Log_to_csv_event_handler::
/* store the time and user values */
DBUG_ASSERT(table->field[0]->type() == MYSQL_TYPE_TIMESTAMP);
- ((Field_timestamp*) table->field[0])->store_timestamp((my_time_t)
- current_time);
+ ((Field_timestamp*) table->field[0])->store_TIME(
+ hrtime_to_my_time(current_time), hrtime_sec_part(current_time));
if (table->field[1]->store(user_host, user_host_len, client_cs))
goto err;
- if (query_start_arg)
- {
- longlong query_time= (longlong) (query_utime/1000000);
- longlong lock_time= (longlong) (lock_utime/1000000);
- /*
- A TIME field can not hold the full longlong range; query_time or
- lock_time may be truncated without warning here, if greater than
- 839 hours (~35 days)
- */
- MYSQL_TIME t;
- t.neg= 0;
+ /*
+ A TIME field can not hold the full longlong range; query_time or
+ lock_time may be truncated without warning here, if greater than
+ 839 hours (~35 days)
+ */
+ MYSQL_TIME t;
+ t.neg= 0;
+
+ /* fill in query_time field */
+ calc_time_from_sec(&t, query_time, query_time_micro);
+ if (table->field[2]->store_time(&t))
+ goto err;
+ /* lock_time */
+ calc_time_from_sec(&t, lock_time, lock_time_micro);
+ if (table->field[3]->store_time(&t))
+ goto err;
+ /* rows_sent */
+ if (table->field[4]->store((longlong) thd->sent_row_count, TRUE))
+ goto err;
+ /* rows_examined */
+ if (table->field[5]->store((longlong) thd->examined_row_count, TRUE))
+ goto err;
- /* fill in query_time field */
- calc_time_from_sec(&t, (long) min(query_time, (longlong) TIME_MAX_VALUE_SECONDS), 0);
- if (table->field[2]->store_time(&t, MYSQL_TIMESTAMP_TIME))
- goto err;
- /* lock_time */
- calc_time_from_sec(&t, (long) min(lock_time, (longlong) TIME_MAX_VALUE_SECONDS), 0);
- if (table->field[3]->store_time(&t, MYSQL_TIMESTAMP_TIME))
- goto err;
- /* rows_sent */
- if (table->field[4]->store((longlong) thd->sent_row_count, TRUE))
- goto err;
- /* rows_examined */
- if (table->field[5]->store((longlong) thd->examined_row_count, TRUE))
- goto err;
- }
- else
- {
- table->field[2]->set_null();
- table->field[3]->set_null();
- table->field[4]->set_null();
- table->field[5]->set_null();
- }
/* fill database field */
if (thd->db)
{
@@ -776,14 +789,14 @@ void Log_to_file_event_handler::init_pthread_objects()
/** Wrapper around MYSQL_LOG::write() for slow log. */
bool Log_to_file_event_handler::
- log_slow(THD *thd, time_t current_time, time_t query_start_arg,
+ log_slow(THD *thd, my_hrtime_t current_time,
const char *user_host, uint user_host_len,
ulonglong query_utime, ulonglong lock_utime, bool is_command,
const char *sql_text, uint sql_text_len)
{
Silence_log_table_errors error_handler;
thd->push_internal_handler(&error_handler);
- bool retval= mysql_slow_log.write(thd, current_time, query_start_arg,
+ bool retval= mysql_slow_log.write(thd, hrtime_to_my_time(current_time),
user_host, user_host_len,
query_utime, lock_utime, is_command,
sql_text, sql_text_len);
@@ -798,7 +811,7 @@ bool Log_to_file_event_handler::
*/
bool Log_to_file_event_handler::
- log_general(THD *thd, time_t event_time, const char *user_host,
+ log_general(THD *thd, my_hrtime_t event_time, const char *user_host,
uint user_host_len, int thread_id,
const char *command_type, uint command_type_len,
const char *sql_text, uint sql_text_len,
@@ -806,7 +819,8 @@ bool Log_to_file_event_handler::
{
Silence_log_table_errors error_handler;
thd->push_internal_handler(&error_handler);
- bool retval= mysql_log.write(event_time, user_host, user_host_len,
+ bool retval= mysql_log.write(hrtime_to_time(event_time), user_host,
+ user_host_len,
thread_id, command_type, command_type_len,
sql_text, sql_text_len);
thd->pop_internal_handler();
@@ -1015,8 +1029,6 @@ bool LOGGER::slow_log_print(THD *thd, const char *query, uint query_length,
if (*slow_log_handler_list)
{
- time_t current_time;
-
/* do not log slow queries from replication threads */
if (thd->slave_thread && !opt_log_slow_slave_statements)
return 0;
@@ -1036,16 +1048,12 @@ bool LOGGER::slow_log_print(THD *thd, const char *query, uint query_length,
sctx->ip ? sctx->ip : "", "]", NullS) -
user_host_buff);
- current_time= my_time_possible_from_micro(current_utime);
- if (thd->start_utime)
- {
- query_utime= (current_utime - thd->start_utime);
- lock_utime= (thd->utime_after_lock - thd->start_utime);
- }
- else
- {
- query_utime= lock_utime= 0;
- }
+ DBUG_ASSERT(thd->start_utime);
+ DBUG_ASSERT(thd->start_time);
+ query_utime= (current_utime - thd->start_utime);
+ lock_utime= (thd->utime_after_lock - thd->start_utime);
+ my_hrtime_t current_time= { hrtime_from_time(thd->start_time) +
+ thd->start_time_sec_part + query_utime };
if (!query)
{
@@ -1054,19 +1062,8 @@ bool LOGGER::slow_log_print(THD *thd, const char *query, uint query_length,
query_length= command_name[thd->command].length;
}
- if (!query_length)
- {
- /*
- Not a real query; Reset counts for slow query logging
- (QQ: Wonder if this is really needed)
- */
- thd->sent_row_count= thd->examined_row_count= 0;
- thd->query_plan_flags= QPLAN_INIT;
- thd->query_plan_fsort_passes= 0;
- }
-
for (current_handler= slow_log_handler_list; *current_handler ;)
- error= (*current_handler++)->log_slow(thd, current_time, thd->start_time,
+ error= (*current_handler++)->log_slow(thd, current_time,
user_host_buff, user_host_len,
query_utime, lock_utime, is_command,
query, query_length) || error;
@@ -1084,7 +1081,7 @@ bool LOGGER::general_log_write(THD *thd, enum enum_server_command command,
char user_host_buff[MAX_USER_HOST_SIZE + 1];
Security_context *sctx= thd->security_ctx;
uint user_host_len= 0;
- time_t current_time;
+ my_hrtime_t current_time;
DBUG_ASSERT(thd);
@@ -1101,7 +1098,7 @@ bool LOGGER::general_log_write(THD *thd, enum enum_server_command command,
sctx->ip ? sctx->ip : "", "]", NullS) -
user_host_buff;
- current_time= my_time(0);
+ current_time= my_hrtime();
while (*current_handler)
error|= (*current_handler++)->
log_general(thd, current_time, user_host_buff,
@@ -1421,6 +1418,7 @@ int binlog_init(void *p)
binlog_hton->commit= binlog_commit;
binlog_hton->rollback= binlog_rollback;
binlog_hton->prepare= binlog_prepare;
+ binlog_hton->start_consistent_snapshot= binlog_start_consistent_snapshot;
binlog_hton->flags= HTON_NOT_USER_SELECTABLE | HTON_HIDDEN;
return 0;
}
@@ -1437,91 +1435,118 @@ static int binlog_close_connection(handlerton *hton, THD *thd)
}
/*
- End a transaction.
+ End a transaction, writing events to the binary log.
SYNOPSIS
- binlog_end_trans()
+ binlog_flush_trx_cache()
thd The thread whose transaction should be ended
trx_data Pointer to the transaction data to use
- end_ev The end event to use, or NULL
all True if the entire transaction should be ended, false if
only the statement transaction should be ended.
+ end_ev The end event to use (COMMIT, ROLLBACK, or commit XID)
DESCRIPTION
End the currently open transaction. The transaction can be either
- a real transaction (if 'all' is true) or a statement transaction
- (if 'all' is false).
+ a real transaction or a statement transaction.
- If 'end_ev' is NULL, the transaction is a rollback of only
- transactional tables, so the transaction cache will be truncated
- to either just before the last opened statement transaction (if
- 'all' is false), or reset completely (if 'all' is true).
+ This can be to commit a transaction, with a COMMIT query event or an XA
+ commit XID event. But it can also be to rollback a transaction with a
+ ROLLBACK query event, used for rolling back transactions which also
+ contain updates to non-transactional tables.
*/
static int
-binlog_end_trans(THD *thd, binlog_trx_data *trx_data,
- Log_event *end_ev, bool all)
+binlog_flush_trx_cache(THD *thd, binlog_trx_data *trx_data,
+ Log_event *end_ev, bool all)
{
- DBUG_ENTER("binlog_end_trans");
- int error=0;
+ DBUG_ENTER("binlog_flush_trx_cache");
IO_CACHE *trans_log= &trx_data->trans_log;
- DBUG_PRINT("enter", ("transaction: %s end_ev: 0x%lx",
- all ? "all" : "stmt", (long) end_ev));
DBUG_PRINT("info", ("thd->options={ %s%s}",
FLAGSTR(thd->options, OPTION_NOT_AUTOCOMMIT),
FLAGSTR(thd->options, OPTION_BEGIN)));
+ if (thd->binlog_flush_pending_rows_event(TRUE))
+ DBUG_RETURN(1);
+
/*
- NULL denotes ROLLBACK with nothing to replicate: i.e., rollback of
- only transactional tables. If the transaction contain changes to
- any non-transactiona tables, we need write the transaction and log
- a ROLLBACK last.
- */
- if (end_ev != NULL)
- {
- if (thd->binlog_flush_pending_rows_event(TRUE))
- DBUG_RETURN(1);
- /*
- Doing a commit or a rollback including non-transactional tables,
- i.e., ending a transaction where we might write the transaction
- cache to the binary log.
-
- We can always end the statement when ending a transaction since
- transactions are not allowed inside stored functions. If they
- were, we would have to ensure that we're not ending a statement
- inside a stored function.
- */
- error= mysql_bin_log.write(thd, &trx_data->trans_log, end_ev,
- trx_data->has_incident());
- trx_data->reset();
+ Doing a commit or a rollback including non-transactional tables,
+ i.e., ending a transaction where we might write the transaction
+ cache to the binary log.
+
+ We can always end the statement when ending a transaction since
+ transactions are not allowed inside stored functions. If they
+ were, we would have to ensure that we're not ending a statement
+ inside a stored function.
+ */
+ int error= mysql_bin_log.write_transaction_to_binlog(thd, trx_data,
+ end_ev, all);
- statistic_increment(binlog_cache_use, &LOCK_status);
- if (trans_log->disk_writes != 0)
- {
- statistic_increment(binlog_cache_disk_use, &LOCK_status);
- trans_log->disk_writes= 0;
- }
- }
- else
+ trx_data->reset();
+
+ statistic_increment(binlog_cache_use, &LOCK_status);
+ if (trans_log->disk_writes != 0)
{
- /*
- If rolling back an entire transaction or a single statement not
- inside a transaction, we reset the transaction cache.
+ statistic_increment(binlog_cache_disk_use, &LOCK_status);
+ trans_log->disk_writes= 0;
+ }
- If rolling back a statement in a transaction, we truncate the
- transaction cache to remove the statement.
- */
- thd->binlog_remove_pending_rows_event(TRUE);
- if (all || !(thd->options & (OPTION_BEGIN | OPTION_NOT_AUTOCOMMIT)))
- {
- if (trx_data->has_incident())
- error= mysql_bin_log.write_incident(thd, TRUE);
- trx_data->reset();
- }
- else // ...statement
- trx_data->truncate(trx_data->before_stmt_pos);
+ DBUG_ASSERT(thd->binlog_get_pending_rows_event() == NULL);
+ DBUG_RETURN(error);
+}
+
+/*
+ Discard a transaction, ie. ROLLBACK with only transactional table updates.
+
+ SYNOPSIS
+ binlog_truncate_trx_cache()
+
+ thd The thread whose transaction should be ended
+ trx_data Pointer to the transaction data to use
+ all True if the entire transaction should be ended, false if
+ only the statement transaction should be ended.
+
+ DESCRIPTION
+
+ Rollback (and end) a transaction that only modifies transactional
+ tables. The transaction can be either a real transaction (if 'all' is
+ true) or a statement transaction (if 'all' is false).
+
+ The transaction cache will be truncated to either just before the last
+ opened statement transaction (if 'all' is false), or reset completely (if
+ 'all' is true).
+ */
+static int
+binlog_truncate_trx_cache(THD *thd, binlog_trx_data *trx_data, bool all)
+{
+ DBUG_ENTER("binlog_truncate_trx_cache");
+ int error= 0;
+ DBUG_PRINT("enter", ("transaction: %s", all ? "all" : "stmt"));
+ DBUG_PRINT("info", ("thd->options={ %s%s}",
+ FLAGSTR(thd->options, OPTION_NOT_AUTOCOMMIT),
+ FLAGSTR(thd->options, OPTION_BEGIN)));
+
+ /*
+ ROLLBACK with nothing to replicate: i.e., rollback of only transactional
+ tables.
+ */
+
+ /*
+ If rolling back an entire transaction or a single statement not
+ inside a transaction, we reset the transaction cache.
+
+ If rolling back a statement in a transaction, we truncate the
+ transaction cache to remove the statement.
+ */
+ thd->binlog_remove_pending_rows_event(TRUE);
+ if (all || !(thd->options & (OPTION_BEGIN | OPTION_NOT_AUTOCOMMIT)))
+ {
+ if (trx_data->has_incident())
+ error= mysql_bin_log.write_incident(thd);
+ trx_data->reset();
}
+ else // ...statement
+ trx_data->truncate(trx_data->before_stmt_pos);
DBUG_ASSERT(thd->binlog_get_pending_rows_event() == NULL);
DBUG_RETURN(error);
@@ -1533,7 +1558,7 @@ static int binlog_prepare(handlerton *hton, THD *thd, bool all)
do nothing.
just pretend we can do 2pc, so that MySQL won't
switch to 1pc.
- real work will be done in MYSQL_BIN_LOG::log_xid()
+ real work will be done in MYSQL_BIN_LOG::log_and_order()
*/
return 0;
}
@@ -1584,8 +1609,8 @@ static int binlog_commit(handlerton *hton, THD *thd, bool all)
(trans_has_no_stmt_committed(thd, all) &&
!stmt_has_updated_trans_table(thd) && stmt_has_updated_non_trans_table(thd)))
{
- Query_log_event qev(thd, STRING_WITH_LEN("COMMIT"), TRUE, TRUE, 0);
- error= binlog_end_trans(thd, trx_data, &qev, all);
+ Query_log_event end_ev(thd, STRING_WITH_LEN("COMMIT"), TRUE, TRUE, 0);
+ error= binlog_flush_trx_cache(thd, trx_data, &end_ev, all);
}
trx_data->at_least_one_stmt_committed = my_b_tell(&trx_data->trans_log) > 0;
@@ -1649,7 +1674,7 @@ static int binlog_rollback(handlerton *hton, THD *thd, bool all)
(thd->options & OPTION_KEEP_LOG)) &&
mysql_bin_log.check_write_error(thd))
trx_data->set_incident();
- error= binlog_end_trans(thd, trx_data, 0, all);
+ error= binlog_truncate_trx_cache(thd, trx_data, all);
}
else
{
@@ -1668,8 +1693,8 @@ static int binlog_rollback(handlerton *hton, THD *thd, bool all)
stmt_has_updated_non_trans_table(thd) &&
thd->current_stmt_binlog_row_based))
{
- Query_log_event qev(thd, STRING_WITH_LEN("ROLLBACK"), TRUE, TRUE, 0);
- error= binlog_end_trans(thd, trx_data, &qev, all);
+ Query_log_event end_ev(thd, STRING_WITH_LEN("ROLLBACK"), TRUE, TRUE, 0);
+ error= binlog_flush_trx_cache(thd, trx_data, &end_ev, all);
}
/*
Otherwise, we simply truncate the cache as there is no change on
@@ -1677,7 +1702,7 @@ static int binlog_rollback(handlerton *hton, THD *thd, bool all)
*/
else if (ending_trans(thd, all) ||
(!(thd->options & OPTION_KEEP_LOG) && !stmt_has_updated_non_trans_table(thd)))
- error= binlog_end_trans(thd, trx_data, 0, all);
+ error= binlog_truncate_trx_cache(thd, trx_data, all);
}
if (!all)
trx_data->before_stmt_pos = MY_OFF_T_UNDEF; // part of the stmt rollback
@@ -1773,7 +1798,7 @@ static int binlog_savepoint_set(handlerton *hton, THD *thd, void *sv)
log_query.append(thd->lex->ident.str, thd->lex->ident.length) ||
log_query.append("`"))
DBUG_RETURN(1);
- int errcode= query_error_code(thd, thd->killed == THD::NOT_KILLED);
+ int errcode= query_error_code(thd, thd->killed == NOT_KILLED);
Query_log_event qinfo(thd, log_query.ptr(), log_query.length(),
TRUE, TRUE, errcode);
DBUG_RETURN(mysql_bin_log.write(&qinfo));
@@ -1797,7 +1822,7 @@ static int binlog_savepoint_rollback(handlerton *hton, THD *thd, void *sv)
log_query.append(thd->lex->ident.str, thd->lex->ident.length) ||
log_query.append("`"))
DBUG_RETURN(1);
- int errcode= query_error_code(thd, thd->killed == THD::NOT_KILLED);
+ int errcode= query_error_code(thd, thd->killed == NOT_KILLED);
Query_log_event qinfo(thd, log_query.ptr(), log_query.length(),
TRUE, TRUE, errcode);
DBUG_RETURN(mysql_bin_log.write(&qinfo));
@@ -2319,7 +2344,6 @@ err:
thd THD of the query
current_time current timestamp
- query_start_arg command start timestamp
user_host the pointer to the string with user@host info
user_host_len length of the user_host string. this is computed once
and passed to all general log event handlers
@@ -2341,7 +2365,7 @@ err:
*/
bool MYSQL_QUERY_LOG::write(THD *thd, time_t current_time,
- time_t query_start_arg, const char *user_host,
+ const char *user_host,
uint user_host_len, ulonglong query_utime,
ulonglong lock_utime, bool is_command,
const char *sql_text, uint sql_text_len)
@@ -2521,7 +2545,11 @@ const char *MYSQL_LOG::generate_name(const char *log_name,
MYSQL_BIN_LOG::MYSQL_BIN_LOG()
:bytes_written(0), prepared_xids(0), file_id(1), open_count(1),
need_start_event(TRUE),
+ group_commit_queue(0), group_commit_queue_busy(FALSE),
+ num_commits(0), num_group_commits(0),
is_relay_log(0),
+ checksum_alg_reset(BINLOG_CHECKSUM_ALG_UNDEF),
+ relay_log_checksum_alg(BINLOG_CHECKSUM_ALG_UNDEF),
description_event_for_exec(0), description_event_for_queue(0)
{
/*
@@ -2577,6 +2605,7 @@ void MYSQL_BIN_LOG::init_pthread_objects()
(void) my_pthread_mutex_init(&LOCK_index, MY_MUTEX_INIT_SLOW, "LOCK_index",
MYF_NO_DEADLOCK_DETECTION);
(void) pthread_cond_init(&update_cond, 0);
+ (void) pthread_cond_init(&COND_queue_busy, 0);
}
@@ -2756,10 +2785,23 @@ bool MYSQL_BIN_LOG::open(const char *log_name,
as we won't be able to reset it later
*/
if (io_cache_type == WRITE_CACHE)
- s.flags|= LOG_EVENT_BINLOG_IN_USE_F;
+ s.flags |= LOG_EVENT_BINLOG_IN_USE_F;
+ s.checksum_alg= is_relay_log ?
+ /* relay-log */
+ /* inherit the checksum algorithm from the master's FD descriptor, if one has been received */
+ (relay_log_checksum_alg=
+ (relay_log_checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF) ?
+ relay_log_checksum_alg :
+ /* otherwise use the slave's local preference for relay-log event verification */
+ (opt_slave_sql_verify_checksum == 0) ?
+ (uint8) BINLOG_CHECKSUM_ALG_OFF : (uint8) binlog_checksum_options):
+ /* binlog */
+ (uint8) binlog_checksum_options;
+ DBUG_ASSERT(s.checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF);
if (!s.is_valid())
goto err;
s.dont_set_created= null_created_arg;
+ s.pre_55_writing_direct();
if (s.write(&log_file))
goto err;
bytes_written+= s.data_written;
@@ -2791,6 +2833,7 @@ bool MYSQL_BIN_LOG::open(const char *log_name,
/* Don't set log_pos in event header */
description_event_for_queue->set_artificial_event();
+ description_event_for_queue->pre_55_writing_direct();
if (description_event_for_queue->write(&log_file))
goto err;
bytes_written+= description_event_for_queue->data_written;
@@ -2798,6 +2841,11 @@ bool MYSQL_BIN_LOG::open(const char *log_name,
if (flush_io_cache(&log_file) ||
my_sync(log_file.file, MYF(MY_WME)))
goto err;
+ pthread_mutex_lock(&LOCK_commit_ordered);
+ strmake(last_commit_pos_file, log_file_name,
+ sizeof(last_commit_pos_file)-1);
+ last_commit_pos_offset= my_b_tell(&log_file);
+ pthread_mutex_unlock(&LOCK_commit_ordered);
if (write_file_name_to_index_file)
{
@@ -3923,8 +3971,16 @@ int MYSQL_BIN_LOG::new_file_impl(bool need_lock)
We log the whole file name for log file as the user may decide
to change base names at some point.
*/
- Rotate_log_event r(new_name+dirname_length(new_name),
- 0, LOG_EVENT_OFFSET, is_relay_log ? Rotate_log_event::RELAY_LOG : 0);
+ Rotate_log_event r(new_name+dirname_length(new_name), 0, LOG_EVENT_OFFSET,
+ is_relay_log ? Rotate_log_event::RELAY_LOG : 0);
+ /*
+ The current relay-log's closing Rotate event must have checksum
+ value computed with an algorithm of the last relay-logged FD event.
+ */
+ if (is_relay_log)
+ r.checksum_alg= relay_log_checksum_alg;
+ r.pre_55_writing_direct();
+ DBUG_ASSERT(!is_relay_log || relay_log_checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF);
if(DBUG_EVALUATE_IF("fault_injection_new_file_rotate_event", (error=close_on_error=TRUE), FALSE) ||
(error= r.write(&log_file)))
{
@@ -3945,7 +4001,12 @@ int MYSQL_BIN_LOG::new_file_impl(bool need_lock)
old_name=name;
name=0; // Don't free name
close(LOG_CLOSE_TO_BE_OPENED | LOG_CLOSE_INDEX);
-
+ if (log_type == LOG_BIN && checksum_alg_reset != BINLOG_CHECKSUM_ALG_UNDEF)
+ {
+ DBUG_ASSERT(!is_relay_log);
+ DBUG_ASSERT(binlog_checksum_options != checksum_alg_reset);
+ binlog_checksum_options= checksum_alg_reset;
+ }
/*
Note that at this point, log_state != LOG_CLOSED (important for is_open()).
*/
@@ -4024,6 +4085,7 @@ bool MYSQL_BIN_LOG::append(Log_event* ev)
Log_event::write() is smart enough to use my_b_write() or
my_b_append() depending on the kind of cache we have.
*/
+ ev->pre_55_writing_direct();
if (ev->write(&log_file))
{
error=1;
@@ -4079,6 +4141,10 @@ bool MYSQL_BIN_LOG::flush_and_sync()
{
sync_binlog_counter= 0;
err=my_sync(fd, MYF(MY_WME));
+#ifndef DBUG_OFF
+ if (opt_binlog_dbug_fsync_sleep > 0)
+ my_sleep(opt_binlog_dbug_fsync_sleep);
+#endif
}
return err;
}
@@ -4270,12 +4336,33 @@ void THD::binlog_set_stmt_begin() {
trx_data->before_stmt_pos= pos;
}
+static int
+binlog_start_consistent_snapshot(handlerton *hton, THD *thd)
+{
+ int err= 0;
+ binlog_trx_data *trx_data;
+ DBUG_ENTER("binlog_start_consistent_snapshot");
+
+ thd->binlog_setup_trx_data();
+ trx_data= (binlog_trx_data*) thd_get_ha_data(thd, binlog_hton);
+
+ /* Server layer calls us with LOCK_commit_ordered locked, so this is safe. */
+ strmake(trx_data->last_commit_pos_file, mysql_bin_log.last_commit_pos_file,
+ sizeof(trx_data->last_commit_pos_file)-1);
+ trx_data->last_commit_pos_offset= mysql_bin_log.last_commit_pos_offset;
+
+ trans_register_ha(thd, TRUE, hton);
+
+ DBUG_RETURN(err);
+}
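For context, the snapshot hook installed in binlog_init() is reached from the server layer when a client runs START TRANSACTION WITH CONSISTENT SNAPSHOT. A rough sketch of the calling side, which lives in handler.cc rather than in this patch (names and details are assumptions based on the comment above about LOCK_commit_ordered being held):

/* Sketch, not part of this patch: the server layer walks all handlertons
   while holding LOCK_commit_ordered, so each engine and the binlog record
   a mutually consistent snapshot position. */
static my_bool snapshot_handlerton(THD *thd, plugin_ref plugin, void *arg)
{
  handlerton *hton= plugin_data(plugin, handlerton *);
  if (hton->state == SHOW_OPTION_YES && hton->start_consistent_snapshot)
    hton->start_consistent_snapshot(hton, thd);
  return FALSE;
}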
/*
- Write a table map to the binary log.
+ Write a table map to the binary log. If with_annotate != NULL and
+ *with_annotate = TRUE write also Annotate_rows before the table map.
*/
-int THD::binlog_write_table_map(TABLE *table, bool is_trans)
+int THD::binlog_write_table_map(TABLE *table, bool is_trans,
+ my_bool *with_annotate)
{
int error;
DBUG_ENTER("THD::binlog_write_table_map");
@@ -4293,7 +4380,7 @@ int THD::binlog_write_table_map(TABLE *table, bool is_trans)
if (is_trans && binlog_table_maps == 0)
binlog_start_trans_and_stmt();
- if ((error= mysql_bin_log.write(&the_event)))
+ if ((error= mysql_bin_log.write(&the_event, with_annotate)))
DBUG_RETURN(error);
binlog_table_maps++;
@@ -4377,44 +4464,49 @@ MYSQL_BIN_LOG::flush_and_set_pending_rows_event(THD *thd,
if (Rows_log_event* pending= trx_data->pending())
{
- IO_CACHE *file= &log_file;
-
/*
Decide if we should write to the log file directly or to the
transaction log.
*/
if (pending->get_cache_stmt() || my_b_tell(&trx_data->trans_log))
- file= &trx_data->trans_log;
-
- /*
- If we are not writing to the log file directly, we could avoid
- locking the log.
- */
- pthread_mutex_lock(&LOCK_log);
-
- /*
- Write pending event to log file or transaction cache
- */
- if (pending->write(file))
{
- pthread_mutex_unlock(&LOCK_log);
- set_write_error(thd);
- DBUG_RETURN(1);
+ /* Write to transaction log/cache. */
+ if (pending->write(&trx_data->trans_log))
+ {
+ set_write_error(thd);
+ DBUG_RETURN(1);
+ }
}
-
- delete pending;
-
- if (file == &log_file)
+ else
{
+ /* Write directly to log file. */
+ pthread_mutex_lock(&LOCK_log);
+ pending->pre_55_writing_direct();
+ if (pending->write(&log_file))
+ {
+ pthread_mutex_unlock(&LOCK_log);
+ set_write_error(thd);
+ DBUG_RETURN(1);
+ }
+
error= flush_and_sync();
if (!error)
{
signal_update();
error= rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED);
}
+
+ /*
+ Take mutex to protect against a reader seeing partial writes of 64-bit
+ offset on 32-bit CPUs.
+ */
+ pthread_mutex_lock(&LOCK_commit_ordered);
+ last_commit_pos_offset= my_b_tell(&log_file);
+ pthread_mutex_unlock(&LOCK_commit_ordered);
+ pthread_mutex_unlock(&LOCK_log);
}
- pthread_mutex_unlock(&LOCK_log);
+ delete pending;
}
thd->binlog_set_pending_rows_event(event);
@@ -4423,13 +4515,16 @@ MYSQL_BIN_LOG::flush_and_set_pending_rows_event(THD *thd,
}
/**
- Write an event to the binary log.
+ Write an event to the binary log. If with_annotate != NULL and
+ *with_annotate = TRUE write also Annotate_rows before the event
+ (this should happen only if the event is a Table_map).
*/
-bool MYSQL_BIN_LOG::write(Log_event *event_info)
+bool MYSQL_BIN_LOG::write(Log_event *event_info, my_bool *with_annotate)
{
THD *thd= event_info->thd;
bool error= 1;
+ uint16 cache_type;
DBUG_ENTER("MYSQL_BIN_LOG::write(Log_event *)");
if (thd->binlog_evt_union.do_union)
@@ -4444,11 +4539,6 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info)
}
/*
- Flush the pending rows event to the transaction cache or to the
- log file. Since this function potentially aquire the LOCK_log
- mutex, we do this before aquiring the LOCK_log mutex in this
- function.
-
We only end the statement if we are in a top-level statement. If
we are inside a stored function, we do not end the statement since
this will close all tables on the slave.
@@ -4458,8 +4548,6 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info)
if (thd->binlog_flush_pending_rows_event(end_stmt))
DBUG_RETURN(error);
- pthread_mutex_lock(&LOCK_log);
-
/*
In most cases this is only called if 'is_open()' is true; in fact this is
mostly called if is_open() *was* true a few instructions before, but it
@@ -4481,7 +4569,6 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info)
thd->lex->sql_command != SQLCOM_SAVEPOINT &&
!binlog_filter->db_ok(local_db)))
{
- VOID(pthread_mutex_unlock(&LOCK_log));
DBUG_RETURN(0);
}
#endif /* HAVE_REPLICATION */
@@ -4525,15 +4612,26 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info)
thd->binlog_start_trans_and_stmt();
file= trans_log;
}
- /*
- TODO as Mats suggested, for all the cases above where we write to
- trans_log, it sounds unnecessary to lock LOCK_log. We should rather
- test first if we want to write to trans_log, and if not, lock
- LOCK_log.
- */
}
#endif /* USING_TRANSACTIONS */
DBUG_PRINT("info",("event type: %d",event_info->get_type_code()));
+ if (file == &log_file)
+ {
+ pthread_mutex_lock(&LOCK_log);
+ /*
+ We did not want to take LOCK_log unless really necessary.
+ However, now that we hold LOCK_log, we must check is_open() again, lest
+ the log was closed just before.
+ */
+ if (unlikely(!is_open()))
+ {
+ pthread_mutex_unlock(&LOCK_log);
+ DBUG_RETURN(error);
+ }
+ event_info->pre_55_writing_direct();
+ }
+
+ cache_type= event_info->cache_type;
/*
No check for auto events flag here - this write method should
@@ -4545,6 +4643,16 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info)
of the SQL command
*/
+ if (with_annotate && *with_annotate)
+ {
+ DBUG_ASSERT(event_info->get_type_code() == TABLE_MAP_EVENT);
+ Annotate_rows_log_event anno(thd, cache_type);
+ /* Annotate event should be written not more than once */
+ *with_annotate= 0;
+ if (anno.write(file))
+ goto err;
+ }
+
/*
If row-based binlogging, Insert_id, Rand and other kind of "setting
context" events are not needed.
@@ -4552,12 +4660,14 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info)
{
if (!thd->current_stmt_binlog_row_based)
{
+
if (thd->stmt_depends_on_first_successful_insert_id_in_prev_stmt)
{
Intvar_log_event e(thd,(uchar) LAST_INSERT_ID_EVENT,
- thd->first_successful_insert_id_in_prev_stmt_for_binlog);
+ thd->first_successful_insert_id_in_prev_stmt_for_binlog,
+ cache_type);
if (e.write(file))
- goto err;
+ goto err_unlock;
}
if (thd->auto_inc_intervals_in_cur_stmt_for_binlog.nb_elements() > 0)
{
@@ -4566,15 +4676,16 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info)
nb_elements()));
Intvar_log_event e(thd, (uchar) INSERT_ID_EVENT,
thd->auto_inc_intervals_in_cur_stmt_for_binlog.
- minimum());
+ minimum(), cache_type);
if (e.write(file))
- goto err;
+ goto err_unlock;
}
if (thd->rand_used)
{
- Rand_log_event e(thd,thd->rand_saved_seed1,thd->rand_saved_seed2);
+ Rand_log_event e(thd,thd->rand_saved_seed1,thd->rand_saved_seed2,
+ cache_type);
if (e.write(file))
- goto err;
+ goto err_unlock;
}
if (thd->user_var_events.elements)
{
@@ -4587,9 +4698,10 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info)
user_var_event->value,
user_var_event->length,
user_var_event->type,
- user_var_event->charset_number);
+ user_var_event->charset_number,
+ cache_type);
if (e.write(file))
- goto err;
+ goto err_unlock;
}
}
}
@@ -4598,7 +4710,7 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info)
/* Write the SQL command */
if (event_info->write(file) ||
DBUG_EVALUATE_IF("injecting_fault_writing", 1, 0))
- goto err;
+ goto err_unlock;
if (file == &log_file) // we are writing to the real log (disk)
{
@@ -4606,20 +4718,33 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info)
status_var_add(thd->status_var.binlog_bytes_written, data_written);
if (flush_and_sync())
- goto err;
+ goto err_unlock;
signal_update();
if ((error= rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED)))
- goto err;
+ goto err_unlock;
}
error=0;
+err_unlock:
+ if (file == &log_file)
+ {
+ my_off_t offset= my_b_tell(&log_file);
+ /*
+ Take mutex to protect against a reader seeing partial writes of 64-bit
+ offset on 32-bit CPUs.
+ */
+ pthread_mutex_lock(&LOCK_commit_ordered);
+ last_commit_pos_offset= offset;
+ pthread_mutex_unlock(&LOCK_commit_ordered);
+ pthread_mutex_unlock(&LOCK_log);
+ }
+
err:
if (error)
set_write_error(thd);
}
- pthread_mutex_unlock(&LOCK_log);
DBUG_RETURN(error);
}
@@ -4724,12 +4849,14 @@ int MYSQL_BIN_LOG::rotate_and_purge(uint flags)
We give it a shot and try to write an incident event anyway
to the current log.
*/
- if (!write_incident(current_thd, FALSE))
+ if (!write_incident_already_locked(current_thd))
flush_and_sync();
#ifdef HAVE_REPLICATION
check_purge= true;
#endif
+ if (flags & RP_BINLOG_CHECKSUM_ALG_CHANGE)
+ checksum_alg_reset= BINLOG_CHECKSUM_ALG_UNDEF; // done
}
if (!(flags & RP_LOCK_LOG_IS_ALREADY_LOCKED))
pthread_mutex_unlock(&LOCK_log);
@@ -4758,6 +4885,33 @@ uint MYSQL_BIN_LOG::next_file_id()
}
+/**
+ Calculate checksum of possibly a part of an event containing at least
+ the whole common header.
+
+ @param buf the pointer to trans cache's buffer
+ @param off the offset of the beginning of the event in the buffer
+ @param event_len no-checksum length of the event
+ @param length the current size of the buffer
+
+ @param crc [in-out] the checksum
+
+ The event size is incremented by @c BINLOG_CHECKSUM_LEN.
+
+ @return 0, or the number of not-yet-processed bytes of the event,
+ excluding the checksum part.
+*/
+ static ulong fix_log_event_crc(uchar *buf, uint off, uint event_len,
+ uint length, ha_checksum *crc)
+{
+ ulong ret;
+ uchar *event_begin= buf + off;
+
+ ret= length >= off + event_len ? 0 : off + event_len - length;
+ *crc= my_checksum(*crc, event_begin, event_len - ret);
+ return ret;
+}
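As a worked example of the return value, assume for illustration a 4096-byte buffer and an event with event_len = 500 starting at off = 4000: only 96 bytes of the event are present, so the checksum is advanced over those 96 bytes and 4000 + 500 - 4096 = 404 is returned; the caller finishes the remaining 404 bytes (and then appends the 4-byte checksum) from the next buffer fill. If the whole event fits in the buffer, the checksum covers all event_len bytes and 0 is returned.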
+
/*
Write the contents of a cache to the binary log.
@@ -4765,24 +4919,33 @@ uint MYSQL_BIN_LOG::next_file_id()
write_cache()
thd Current_thread
cache Cache to write to the binary log
- lock_log True if the LOCK_log mutex should be aquired, false otherwise
- sync_log True if the log should be flushed and sync:ed
DESCRIPTION
Write the contents of the cache to the binary log. The cache will
be reset as a READ_CACHE to be able to read the contents from it.
- */
-int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache, bool lock_log,
- bool sync_log)
-{
- Mutex_sentry sentry(lock_log ? &LOCK_log : NULL);
+ Reading from the trans cache with possible (per @c binlog_checksum_options)
+ adding checksum value and then fixing the length and the end_log_pos of
+ events prior to fill in the binlog cache.
+*/
+int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache)
+{
+ safe_mutex_assert_owner(&LOCK_log);
if (reinit_io_cache(cache, READ_CACHE, 0, 0, 0))
return ER_ERROR_ON_WRITE;
uint length= my_b_bytes_in_cache(cache), group, carry, hdr_offs;
+ ulong remains= 0; // not-yet-processed part of the event's netto (pre-checksum) length
long val;
+ ulong end_log_pos_inc= 0; // each processed event adds BINLOG_CHECKSUM_LEN to later end_log_pos values
uchar header[LOG_EVENT_HEADER_LEN];
+ ha_checksum crc= 0, crc_0= 0; // assignments to keep compiler happy
+ my_bool do_checksum= (binlog_checksum_options != BINLOG_CHECKSUM_ALG_OFF);
+ uchar buf[BINLOG_CHECKSUM_LEN];
+
+ // while there is just one alg the following must hold:
+ DBUG_ASSERT(!do_checksum ||
+ binlog_checksum_options == BINLOG_CHECKSUM_ALG_CRC32);
/*
The events in the buffer have incorrect end_log_pos data
@@ -4800,6 +4963,8 @@ int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache, bool lock_log,
group= (uint)my_b_tell(&log_file);
hdr_offs= carry= 0;
+ if (do_checksum)
+ crc= crc_0= my_checksum(0L, NULL, 0);
do
{
@@ -4813,12 +4978,21 @@ int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache, bool lock_log,
DBUG_ASSERT(carry < LOG_EVENT_HEADER_LEN);
/* assemble both halves */
- memcpy(&header[carry], (char *)cache->read_pos, LOG_EVENT_HEADER_LEN - carry);
+ memcpy(&header[carry], (char *)cache->read_pos,
+ LOG_EVENT_HEADER_LEN - carry);
/* fix end_log_pos */
- val= uint4korr(&header[LOG_POS_OFFSET]) + group;
+ val= uint4korr(&header[LOG_POS_OFFSET]) + group +
+ (end_log_pos_inc+= (do_checksum ? BINLOG_CHECKSUM_LEN : 0));
int4store(&header[LOG_POS_OFFSET], val);
+ if (do_checksum)
+ {
+ ulong len= uint4korr(&header[EVENT_LEN_OFFSET]);
+ /* fix len */
+ int4store(&header[EVENT_LEN_OFFSET], len + BINLOG_CHECKSUM_LEN);
+ }
+
/* write the first half of the split header */
if (my_b_write(&log_file, header, carry))
return ER_ERROR_ON_WRITE;
@@ -4828,11 +5002,20 @@ int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache, bool lock_log,
copy fixed second half of header to cache so the correct
version will be written later.
*/
- memcpy((char *)cache->read_pos, &header[carry], LOG_EVENT_HEADER_LEN - carry);
+ memcpy((char *)cache->read_pos, &header[carry],
+ LOG_EVENT_HEADER_LEN - carry);
/* next event header at ... */
- hdr_offs = uint4korr(&header[EVENT_LEN_OFFSET]) - carry;
+ hdr_offs= uint4korr(&header[EVENT_LEN_OFFSET]) - carry -
+ (do_checksum ? BINLOG_CHECKSUM_LEN : 0);
+ if (do_checksum)
+ {
+ DBUG_ASSERT(crc == crc_0 && remains == 0);
+ crc= my_checksum(crc, header, carry);
+ remains= uint4korr(header + EVENT_LEN_OFFSET) - carry -
+ BINLOG_CHECKSUM_LEN;
+ }
carry= 0;
}
@@ -4847,6 +5030,25 @@ int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache, bool lock_log,
very next iteration, just "eventually").
*/
+ /* crc-calc the whole buffer */
+ if (do_checksum && hdr_offs >= length)
+ {
+
+ DBUG_ASSERT(remains != 0 && crc != crc_0);
+
+ crc= my_checksum(crc, cache->read_pos, length);
+ remains -= length;
+ if (my_b_write(&log_file, cache->read_pos, length))
+ return ER_ERROR_ON_WRITE;
+ if (remains == 0)
+ {
+ int4store(buf, crc);
+ if (my_b_write(&log_file, buf, BINLOG_CHECKSUM_LEN))
+ return ER_ERROR_ON_WRITE;
+ crc= crc_0;
+ }
+ }
+
while (hdr_offs < length)
{
/*
@@ -4854,6 +5056,26 @@ int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache, bool lock_log,
we get the rest.
*/
+ if (do_checksum)
+ {
+ if (remains != 0)
+ {
+ /*
+ finish off with remains of the last event that crawls
+ from previous into the current buffer
+ */
+ DBUG_ASSERT(crc != crc_0);
+ crc= my_checksum(crc, cache->read_pos, hdr_offs);
+ int4store(buf, crc);
+ remains -= hdr_offs;
+ DBUG_ASSERT(remains == 0);
+ if (my_b_write(&log_file, cache->read_pos, hdr_offs) ||
+ my_b_write(&log_file, buf, BINLOG_CHECKSUM_LEN))
+ return ER_ERROR_ON_WRITE;
+ crc= crc_0;
+ }
+ }
+
if (hdr_offs + LOG_EVENT_HEADER_LEN > length)
{
carry= length - hdr_offs;
@@ -4863,17 +5085,38 @@ int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache, bool lock_log,
else
{
/* we've got a full event-header, and it came in one piece */
-
- uchar *log_pos= (uchar *)cache->read_pos + hdr_offs + LOG_POS_OFFSET;
+ uchar *ev= (uchar *)cache->read_pos + hdr_offs;
+ uint event_len= uint4korr(ev + EVENT_LEN_OFFSET); // netto len
+ uchar *log_pos= ev + LOG_POS_OFFSET;
/* fix end_log_pos */
- val= uint4korr(log_pos) + group;
+ val= uint4korr(log_pos) + group +
+ (end_log_pos_inc += (do_checksum ? BINLOG_CHECKSUM_LEN : 0));
int4store(log_pos, val);
+ /* fix CRC */
+ if (do_checksum)
+ {
+ /* fix length */
+ int4store(ev + EVENT_LEN_OFFSET, event_len + BINLOG_CHECKSUM_LEN);
+ remains= fix_log_event_crc(cache->read_pos, hdr_offs, event_len,
+ length, &crc);
+ if (my_b_write(&log_file, ev,
+ remains == 0 ? event_len : length - hdr_offs))
+ return ER_ERROR_ON_WRITE;
+ if (remains == 0)
+ {
+ int4store(buf, crc);
+ if (my_b_write(&log_file, buf, BINLOG_CHECKSUM_LEN))
+ return ER_ERROR_ON_WRITE;
+ crc= crc_0; // crc is complete
+ }
+ }
+
/* next event header at ... */
- log_pos= (uchar *)cache->read_pos + hdr_offs + EVENT_LEN_OFFSET;
- hdr_offs += uint4korr(log_pos);
+ hdr_offs += event_len; // incr by the netto len
+ DBUG_ASSERT(!do_checksum || remains == 0 || hdr_offs >= length);
}
}
@@ -4889,17 +5132,19 @@ int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache, bool lock_log,
}
/* Write data to the binary log file */
- if (my_b_write(&log_file, cache->read_pos, length))
- return ER_ERROR_ON_WRITE;
+ DBUG_EXECUTE_IF("fail_binlog_write_1",
+ errno= 28; return ER_ERROR_ON_WRITE;);
+ if (!do_checksum)
+ if (my_b_write(&log_file, cache->read_pos, length))
+ return ER_ERROR_ON_WRITE;
status_var_add(thd->status_var.binlog_bytes_written, length);
cache->read_pos=cache->read_end; // Mark buffer used up
} while ((length= my_b_fill(cache)));
DBUG_ASSERT(carry == 0);
-
- if (sync_log)
- flush_and_sync();
+ DBUG_ASSERT(!do_checksum || remains == 0);
+ DBUG_ASSERT(!do_checksum || crc == crc_0);
return 0; // All OK
}
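To make the end_log_pos and length fix-ups concrete, assume for illustration that the group starts at offset group = 4000 and the cache holds two events of 100 and 150 bytes with CRC32 enabled (BINLOG_CHECKSUM_LEN = 4). The first event's cached end_log_pos of 100 becomes 100 + 4000 + 1*4 = 4104 and its length becomes 104; the second event's cached 250 becomes 250 + 4000 + 2*4 = 4258 and its length 154. Each value matches where the event, including its appended checksum, actually ends in the log file.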
@@ -4911,7 +5156,7 @@ int query_error_code(THD *thd, bool not_killed)
{
int error;
- if (not_killed || (thd->killed == THD::KILL_BAD_DATA))
+ if (not_killed || (killed_mask_hard(thd->killed) == KILL_BAD_DATA))
{
error= thd->is_error() ? thd->main_da.sql_errno() : 0;
@@ -4921,7 +5166,7 @@ int query_error_code(THD *thd, bool not_killed)
caller.
*/
if (error == ER_SERVER_SHUTDOWN || error == ER_QUERY_INTERRUPTED ||
- error == ER_NEW_ABORTING_CONNECTION)
+ error == ER_NEW_ABORTING_CONNECTION || error == ER_CONNECTION_KILLED)
error= 0;
}
else
@@ -4934,31 +5179,51 @@ int query_error_code(THD *thd, bool not_killed)
return error;
}
-bool MYSQL_BIN_LOG::write_incident(THD *thd, bool lock)
+
+bool MYSQL_BIN_LOG::write_incident_already_locked(THD *thd)
{
uint error= 0;
- DBUG_ENTER("MYSQL_BIN_LOG::write_incident");
-
- if (!is_open())
- DBUG_RETURN(error);
-
- LEX_STRING const write_error_msg=
- { C_STRING_WITH_LEN("error writing to the binary log") };
+ DBUG_ENTER("MYSQL_BIN_LOG::write_incident_already_locked");
Incident incident= INCIDENT_LOST_EVENTS;
Incident_log_event ev(thd, incident, write_error_msg);
- if (lock)
- pthread_mutex_lock(&LOCK_log);
- error= ev.write(&log_file);
- status_var_add(thd->status_var.binlog_bytes_written, ev.data_written);
- if (lock)
+
+ if (likely(is_open()))
{
- if (!error && !(error= flush_and_sync()))
+ ev.pre_55_writing_direct();
+ error= ev.write(&log_file);
+ status_var_add(thd->status_var.binlog_bytes_written, ev.data_written);
+ }
+
+ DBUG_RETURN(error);
+}
+
+
+bool MYSQL_BIN_LOG::write_incident(THD *thd)
+{
+ uint error= 0;
+ my_off_t offset;
+ DBUG_ENTER("MYSQL_BIN_LOG::write_incident");
+
+ pthread_mutex_lock(&LOCK_log);
+ if (likely(is_open()))
+ {
+ if (!(error= write_incident_already_locked(thd)) &&
+ !(error= flush_and_sync()))
{
signal_update();
error= rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED);
}
- pthread_mutex_unlock(&LOCK_log);
+ offset= my_b_tell(&log_file);
+ /*
+ Take mutex to protect against a reader seeing partial writes of 64-bit
+ offset on 32-bit CPUs.
+ */
+ pthread_mutex_lock(&LOCK_commit_ordered);
+ last_commit_pos_offset= offset;
+ pthread_mutex_unlock(&LOCK_commit_ordered);
}
+ pthread_mutex_unlock(&LOCK_log);
+
DBUG_RETURN(error);
}
@@ -4986,111 +5251,373 @@ bool MYSQL_BIN_LOG::write_incident(THD *thd, bool lock)
'cache' needs to be reinitialized after this functions returns.
*/
-bool MYSQL_BIN_LOG::write(THD *thd, IO_CACHE *cache, Log_event *commit_event,
- bool incident)
+bool
+MYSQL_BIN_LOG::write_transaction_to_binlog(THD *thd, binlog_trx_data *trx_data,
+ Log_event *end_ev, bool all)
{
- DBUG_ENTER("MYSQL_BIN_LOG::write(THD *, IO_CACHE *, Log_event *)");
+ group_commit_entry entry;
+ DBUG_ENTER("MYSQL_BIN_LOG::write_transaction_to_binlog");
+
+ entry.thd= thd;
+ entry.trx_data= trx_data;
+ entry.error= 0;
+ entry.all= all;
+
+ /*
+ Log "BEGIN" at the beginning of every transaction. Here, a transaction is
+ either a BEGIN..COMMIT block or a single statement in autocommit mode.
+
+ Create the necessary events here, where we have the correct THD (and
+ thread context).
+
+ Due to group commit the actual writing to binlog may happen in a different
+ thread.
+ */
+ Query_log_event qinfo(thd, STRING_WITH_LEN("BEGIN"), TRUE, TRUE, 0);
+ entry.begin_event= &qinfo;
+ entry.end_event= end_ev;
+ if (trx_data->has_incident())
+ {
+ Incident_log_event inc_ev(thd, INCIDENT_LOST_EVENTS, write_error_msg);
+ entry.incident_event= &inc_ev;
+ DBUG_RETURN(write_transaction_to_binlog_events(&entry));
+ }
+ else
+ {
+ entry.incident_event= NULL;
+ DBUG_RETURN(write_transaction_to_binlog_events(&entry));
+ }
+}
+
+bool
+MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry)
+{
+ /*
+ To facilitate group commit for the binlog, we first queue up ourselves in
+ the group commit queue. Then the first thread to enter the queue waits for
+ the LOCK_log mutex, and commits for everyone in the queue once it gets the
+ lock. Any other threads in the queue just wait for the first one to finish
+ the commit and wake them up.
+ */
+
+ entry->thd->clear_wakeup_ready();
+ pthread_mutex_lock(&LOCK_prepare_ordered);
+ group_commit_entry *orig_queue= group_commit_queue;
+ entry->next= orig_queue;
+ group_commit_queue= entry;
+
+ if (entry->trx_data->using_xa)
+ {
+ DEBUG_SYNC(entry->thd, "commit_before_prepare_ordered");
+ run_prepare_ordered(entry->thd, entry->all);
+ DEBUG_SYNC(entry->thd, "commit_after_prepare_ordered");
+ }
+ pthread_mutex_unlock(&LOCK_prepare_ordered);
+ DEBUG_SYNC(entry->thd, "commit_after_release_LOCK_prepare_ordered");
+
+ /*
+ The first in the queue handle group commit for all; the others just wait
+ to be signalled when group commit is done.
+ */
+ if (orig_queue != NULL)
+ entry->thd->wait_for_wakeup_ready();
+ else
+ trx_group_commit_leader(entry);
+
+ if (!opt_optimize_thread_scheduling)
+ {
+ /* For the leader, trx_group_commit_leader() already took the lock. */
+ if (orig_queue != NULL)
+ pthread_mutex_lock(&LOCK_commit_ordered);
+
+ DEBUG_SYNC(entry->thd, "commit_loop_entry_commit_ordered");
+ ++num_commits;
+ if (entry->trx_data->using_xa && !entry->error)
+ run_commit_ordered(entry->thd, entry->all);
+
+ group_commit_entry *next= entry->next;
+ if (!next)
+ {
+ group_commit_queue_busy= FALSE;
+ pthread_cond_signal(&COND_queue_busy);
+ DEBUG_SYNC(entry->thd, "commit_after_group_run_commit_ordered");
+ }
+ pthread_mutex_unlock(&LOCK_commit_ordered);
+
+ if (next)
+ {
+ next->thd->signal_wakeup_ready();
+ }
+ }
+
+ if (likely(!entry->error))
+ return 0;
+
+ switch (entry->error)
+ {
+ case ER_ERROR_ON_WRITE:
+ my_error(ER_ERROR_ON_WRITE, MYF(ME_NOREFRESH), name, entry->commit_errno);
+ break;
+ case ER_ERROR_ON_READ:
+ my_error(ER_ERROR_ON_READ, MYF(ME_NOREFRESH),
+ entry->trx_data->trans_log.file_name, entry->commit_errno);
+ break;
+ default:
+ /*
+ There are not (and should not be) any errors thrown not covered above.
+ But just in case one is added later without updating the above switch
+ statement, include a catch-all.
+ */
+ my_printf_error(entry->error,
+ "Error writing transaction to binary log: %d",
+ MYF(ME_NOREFRESH), entry->error);
+ }
+
+ /*
+ Since we return error, this transaction XID will not be committed, so
+ we need to mark it as not needed for recovery (unlog() is not called
+ for a transaction if log_xid() fails).
+ */
+ if (entry->trx_data->using_xa && entry->trx_data->xa_xid)
+ mark_xid_done();
+
+ return 1;
+}
+
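The queue-then-elect-a-leader scheme used above can be shown in isolation. The following stand-alone sketch is simplified and uses invented names; it is not taken from the patch, but it illustrates the shape: enqueue under a cheap mutex, the thread that finds the queue empty becomes leader, performs the expensive write/sync once for the whole batch, and wakes the followers.

/* Stand-alone sketch of the leader/follower group-commit pattern. */
#include <pthread.h>

struct entry
{
  entry *next;
  pthread_cond_t cond;
  bool done;
};

static pthread_mutex_t queue_lock= PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t wake_lock=  PTHREAD_MUTEX_INITIALIZER;
static entry *commit_queue= NULL;

static void group_do(entry *e)
{
  pthread_cond_init(&e->cond, NULL);
  e->done= false;

  /* Enqueue ourselves; the first entry into an empty queue becomes leader. */
  pthread_mutex_lock(&queue_lock);
  entry *prev= commit_queue;
  e->next= prev;
  commit_queue= e;
  pthread_mutex_unlock(&queue_lock);

  if (prev != NULL)
  {
    /* Follower: the leader commits on our behalf; wait to be woken. */
    pthread_mutex_lock(&wake_lock);
    while (!e->done)
      pthread_cond_wait(&e->cond, &wake_lock);
    pthread_mutex_unlock(&wake_lock);
  }
  else
  {
    /* Leader: take the whole batch (it may have grown since we enqueued). */
    pthread_mutex_lock(&queue_lock);
    entry *batch= commit_queue;
    commit_queue= NULL;
    pthread_mutex_unlock(&queue_lock);

    /* ... write and fsync every transaction in 'batch' exactly once ... */

    /* Wake the followers (everyone in the batch except ourselves). */
    pthread_mutex_lock(&wake_lock);
    for (entry *cur= batch; cur != NULL; )
    {
      entry *next= cur->next;          /* read before signalling cur */
      if (cur != e)
      {
        cur->done= true;
        pthread_cond_signal(&cur->cond);
      }
      cur= next;
    }
    pthread_mutex_unlock(&wake_lock);
  }
  pthread_cond_destroy(&e->cond);
}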
+/*
+ Do binlog group commit as the lead thread.
+
+ This must be called when this thread/transaction is queued at the start of
+ the group_commit_queue. It will wait to obtain the LOCK_log mutex, then group
+ commit all the transactions in the queue (more may have entered while waiting
+ for LOCK_log). After commit is done, all other threads in the queue will be
+ signalled.
+
+ */
+void
+MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
+{
+ uint xid_count= 0;
+ uint write_count= 0;
+ my_off_t commit_offset;
+ group_commit_entry *current;
+ group_commit_entry *last_in_queue;
+ DBUG_ENTER("MYSQL_BIN_LOG::trx_group_commit_leader");
+ LINT_INIT(commit_offset);
+
+ /*
+ Lock the LOCK_log(), and once we get it, collect any additional writes
+ that queued up while we were waiting.
+ */
VOID(pthread_mutex_lock(&LOCK_log));
+ DEBUG_SYNC(leader->thd, "commit_after_get_LOCK_log");
- /* NULL would represent nothing to replicate after ROLLBACK */
- DBUG_ASSERT(commit_event != NULL);
+ pthread_mutex_lock(&LOCK_prepare_ordered);
+ current= group_commit_queue;
+ group_commit_queue= NULL;
+ pthread_mutex_unlock(&LOCK_prepare_ordered);
+ /* As the queue is in reverse order of entering, reverse it. */
+ group_commit_entry *queue= NULL;
+ last_in_queue= current;
+ while (current)
+ {
+ group_commit_entry *next= current->next;
+ current->next= queue;
+ queue= current;
+ current= next;
+ }
+ DBUG_ASSERT(leader == queue /* the leader should be first in queue */);
+
+ /* Now we have in queue the list of transactions to be committed in order. */
DBUG_ASSERT(is_open());
if (likely(is_open())) // Should always be true
{
/*
- We only bother to write to the binary log if there is anything
- to write.
- */
- if (my_b_tell(cache) > 0)
+ Commit every transaction in the queue.
+
+ Note that we are doing this in a different thread than the one running
+ the transaction! So we are limited in the operations we can do. In
+ particular, we cannot call my_error() on behalf of a transaction, as
+ that obtains the THD from thread local storage. Instead, we must set
+ current->error and let the thread do the error reporting itself once
+ we wake it up.
+ */
+ for (current= queue; current != NULL; current= current->next)
{
- /*
- Log "BEGIN" at the beginning of every transaction. Here, a
- transaction is either a BEGIN..COMMIT block or a single
- statement in autocommit mode.
- */
- Query_log_event qinfo(thd, STRING_WITH_LEN("BEGIN"), TRUE, TRUE, 0);
+ binlog_trx_data *trx_data= current->trx_data;
+ IO_CACHE *cache= &trx_data->trans_log;
/*
- Now this Query_log_event has artificial log_pos 0. It must be
- adjusted to reflect the real position in the log. Not doing it
- would confuse the slave: it would prevent this one from
- knowing where he is in the master's binlog, which would result
- in wrong positions being shown to the user, MASTER_POS_WAIT
- undue waiting etc.
+ We only bother to write to the binary log if there is anything
+ to write.
*/
- if (qinfo.write(&log_file))
- goto err;
- status_var_add(thd->status_var.binlog_bytes_written, qinfo.data_written);
-
- DBUG_EXECUTE_IF("crash_before_writing_xid",
- {
- if ((write_error= write_cache(thd, cache, FALSE,
- TRUE)))
- DBUG_PRINT("info", ("error writing binlog cache: %d",
- write_error));
- DBUG_PRINT("info", ("crashing before writing xid"));
- DBUG_SUICIDE();
- });
-
- if ((write_error= write_cache(thd, cache, FALSE, FALSE)))
- goto err;
-
- if (commit_event)
+ if (my_b_tell(cache) > 0)
{
- if (commit_event->write(&log_file))
- goto err;
- status_var_add(thd->status_var.binlog_bytes_written,
- commit_event->data_written);
+ if ((current->error= write_transaction(current)))
+ current->commit_errno= errno;
+
+ write_count++;
}
- if (incident && write_incident(thd, FALSE))
- goto err;
+ strmake(trx_data->last_commit_pos_file, log_file_name,
+ sizeof(trx_data->last_commit_pos_file)-1);
+ commit_offset= my_b_write_tell(&log_file);
+ trx_data->last_commit_pos_offset= commit_offset;
+ if (trx_data->using_xa && trx_data->xa_xid)
+ xid_count++;
+ }
+ if (write_count > 0)
+ {
if (flush_and_sync())
- goto err;
- DBUG_EXECUTE_IF("half_binlogged_transaction", DBUG_SUICIDE(););
- if (cache->error) // Error on read
{
- sql_print_error(ER(ER_ERROR_ON_READ), cache->file_name, errno);
- write_error=1; // Don't give more errors
- goto err;
+ for (current= queue; current != NULL; current= current->next)
+ {
+ if (!current->error)
+ {
+ current->error= ER_ERROR_ON_WRITE;
+ current->commit_errno= errno;
+ }
+ }
+ }
+ else
+ {
+ signal_update();
}
- signal_update();
}
/*
- if commit_event is Xid_log_event, increase the number of
- prepared_xids (it's decreasd in ::unlog()). Binlog cannot be rotated
+ if any commit_events are Xid_log_event, increase the number of
+ prepared_xids (it's decreased in ::unlog()). Binlog cannot be rotated
if there're prepared xids in it - see the comment in new_file() for
an explanation.
- If the commit_event is not Xid_log_event (then it's a Query_log_event)
- rotate binlog, if necessary.
+ If no Xid_log_events (then it's all Query_log_event) rotate binlog,
+ if necessary.
*/
- if (commit_event && commit_event->get_type_code() == XID_EVENT)
+ if (xid_count > 0)
{
- pthread_mutex_lock(&LOCK_prep_xids);
- prepared_xids++;
- pthread_mutex_unlock(&LOCK_prep_xids);
+ mark_xids_active(xid_count);
}
else
+ {
if (rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED))
- goto err;
+ {
+ /*
+ If we fail to rotate, which thread should get the error?
+ We give the error to the *last* transaction thread; that seems to
+ make the most sense, as it was the last to write to the log.
+ */
+ last_in_queue->error= ER_ERROR_ON_WRITE;
+ last_in_queue->commit_errno= errno;
+ }
+ }
}
- VOID(pthread_mutex_unlock(&LOCK_log));
- DBUG_RETURN(0);
+ DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_commit_ordered");
+ pthread_mutex_lock(&LOCK_commit_ordered);
+ last_commit_pos_offset= commit_offset;
+ /*
+ We cannot unlock LOCK_log until we have locked LOCK_commit_ordered;
+ otherwise scheduling could allow the next group commit to run ahead of us,
+ messing up the order of commit_ordered() calls. But as soon as
+ LOCK_commit_ordered is obtained, we can let the next group commit start.
+ */
+ pthread_mutex_unlock(&LOCK_log);
+ DEBUG_SYNC(leader->thd, "commit_after_release_LOCK_log");
+ ++num_group_commits;
-err:
- if (!write_error)
+ if (!opt_optimize_thread_scheduling)
{
- write_error= 1;
- sql_print_error(ER(ER_ERROR_ON_WRITE), name, errno);
+ /*
+ If we want to run commit_ordered() each in the transaction's own thread
+ context, then we need to mark the queue reserved; we need to finish all
+ threads in one group commit before the next group commit can be allowed
+ to proceed, and we cannot unlock a simple pthreads mutex in a different
+ thread from the one that locked it.
+ */
+
+ while (group_commit_queue_busy)
+ pthread_cond_wait(&COND_queue_busy, &LOCK_commit_ordered);
+ group_commit_queue_busy= TRUE;
+
+ /* Note that we return with LOCK_commit_ordered locked! */
+ DBUG_VOID_RETURN;
}
- VOID(pthread_mutex_unlock(&LOCK_log));
- DBUG_RETURN(1);
+
+ /*
+ Wakeup each participant waiting for our group commit, first calling the
+ commit_ordered() methods for any transactions doing 2-phase commit.
+ */
+ current= queue;
+ while (current != NULL)
+ {
+ group_commit_entry *next;
+
+ DEBUG_SYNC(leader->thd, "commit_loop_entry_commit_ordered");
+ ++num_commits;
+ if (current->trx_data->using_xa && !current->error)
+ run_commit_ordered(current->thd, current->all);
+
+ /*
+ Careful not to access current->next after waking up the other thread! As
+ it may change immediately after wakeup.
+ */
+ next= current->next;
+ if (current != leader) // Don't wake up ourself
+ current->thd->signal_wakeup_ready();
+ current= next;
+ }
+ DEBUG_SYNC(leader->thd, "commit_after_group_run_commit_ordered");
+ pthread_mutex_unlock(&LOCK_commit_ordered);
+
+ DBUG_VOID_RETURN;
}
+int
+MYSQL_BIN_LOG::write_transaction(group_commit_entry *entry)
+{
+ binlog_trx_data *trx_data= entry->trx_data;
+ IO_CACHE *cache= &trx_data->trans_log;
+
+ entry->begin_event->pre_55_writing_direct();
+ if (entry->begin_event->write(&log_file))
+ return ER_ERROR_ON_WRITE;
+ status_var_add(entry->thd->status_var.binlog_bytes_written,
+ entry->begin_event->data_written);
+
+ DBUG_EXECUTE_IF("crash_before_writing_xid",
+ {
+ if ((write_cache(entry->thd, cache)))
+ DBUG_PRINT("info", ("error writing binlog cache"));
+ else
+ flush_and_sync();
+
+ DBUG_PRINT("info", ("crashing before writing xid"));
+ DBUG_SUICIDE();
+ });
+
+ if (write_cache(entry->thd, cache))
+ return ER_ERROR_ON_WRITE;
+
+ entry->end_event->pre_55_writing_direct();
+ if (entry->end_event->write(&log_file))
+ return ER_ERROR_ON_WRITE;
+ status_var_add(entry->thd->status_var.binlog_bytes_written,
+ entry->end_event->data_written);
+
+ if (entry->incident_event)
+ {
+ entry->incident_event->pre_55_writing_direct();
+ if (entry->incident_event->write(&log_file))
+ return ER_ERROR_ON_WRITE;
+ }
+
+ if (cache->error) // Error on read
+ return ER_ERROR_ON_READ;
+
+ return 0;
+}
/**
Wait until we get a signal that the binary log has been updated.
@@ -5148,6 +5675,12 @@ void MYSQL_BIN_LOG::close(uint exiting)
(exiting & LOG_CLOSE_STOP_EVENT))
{
Stop_log_event s;
+ // the checksumming rule for relay-log case is similar to Rotate
+ s.checksum_alg= is_relay_log ?
+ (uint8) relay_log_checksum_alg : (uint8) binlog_checksum_options;
+ DBUG_ASSERT(!is_relay_log ||
+ relay_log_checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF);
+ s.pre_55_writing_direct();
s.write(&log_file);
bytes_written+= s.data_written;
signal_update();
@@ -5483,6 +6016,171 @@ void sql_print_information(const char *format, ...)
}
+void
+TC_init()
+{
+ my_pthread_mutex_init(&LOCK_prepare_ordered, MY_MUTEX_INIT_SLOW,
+ "LOCK_prepare_ordered", MYF(0));
+ my_pthread_mutex_init(&LOCK_commit_ordered, MY_MUTEX_INIT_SLOW,
+ "LOCK_commit_ordered", MYF(0));
+ mutexes_inited= TRUE;
+}
+
+
+void
+TC_destroy()
+{
+ if (mutexes_inited)
+ {
+ pthread_mutex_destroy(&LOCK_prepare_ordered);
+ pthread_mutex_destroy(&LOCK_commit_ordered);
+ mutexes_inited= FALSE;
+ }
+}
+
+
+void
+TC_LOG::run_prepare_ordered(THD *thd, bool all)
+{
+ Ha_trx_info *ha_info=
+ all ? thd->transaction.all.ha_list : thd->transaction.stmt.ha_list;
+
+ safe_mutex_assert_owner(&LOCK_prepare_ordered);
+ for (; ha_info; ha_info= ha_info->next())
+ {
+ handlerton *ht= ha_info->ht();
+ if (!ht->prepare_ordered)
+ continue;
+ ht->prepare_ordered(ht, thd, all);
+ }
+}
+
+
+void
+TC_LOG::run_commit_ordered(THD *thd, bool all)
+{
+ Ha_trx_info *ha_info=
+ all ? thd->transaction.all.ha_list : thd->transaction.stmt.ha_list;
+
+ safe_mutex_assert_owner(&LOCK_commit_ordered);
+ for (; ha_info; ha_info= ha_info->next())
+ {
+ handlerton *ht= ha_info->ht();
+ if (!ht->commit_ordered)
+ continue;
+ ht->commit_ordered(ht, thd, all);
+ DEBUG_SYNC(thd, "commit_after_run_commit_ordered");
+ }
+}
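
run_prepare_ordered() and run_commit_ordered() simply walk the transaction's handlerton list and invoke the optional hooks under LOCK_prepare_ordered / LOCK_commit_ordered. A hedged sketch of what an engine-side hook could look like, with the signature inferred from the ht->commit_ordered(ht, thd, all) call above; the engine name and its internals are hypothetical:

/* Hypothetical engine-side hooks; only the handlerton fields are from the patch. */
static unsigned long long myengine_commit_seq= 0;

static void myengine_commit_ordered(handlerton *hton, THD *thd, bool all)
{
  /*
    Runs under LOCK_commit_ordered, in binlog commit order, so only the cheap,
    ordering-sensitive step belongs here (e.g. assigning a commit sequence
    number); the expensive work stays in the engine's normal commit() call.
  */
  unsigned long long seq= ++myengine_commit_seq;  /* serialised by the caller */
  (void) seq; (void) hton; (void) thd; (void) all;
}

static int myengine_init(void *p)
{
  handlerton *hton= (handlerton *)p;
  hton->commit_ordered= myengine_commit_ordered;  /* optional; may stay NULL */
  /* hton->prepare_ordered can be installed the same way if the engine needs it. */
  return 0;
}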
+
+
+int TC_LOG_MMAP::log_and_order(THD *thd, my_xid xid, bool all,
+ bool need_prepare_ordered,
+ bool need_commit_ordered)
+{
+ int cookie;
+ struct commit_entry entry;
+ bool is_group_commit_leader;
+ LINT_INIT(is_group_commit_leader);
+
+ if (need_prepare_ordered)
+ {
+ pthread_mutex_lock(&LOCK_prepare_ordered);
+ run_prepare_ordered(thd, all);
+ if (need_commit_ordered)
+ {
+ /*
+        We must put ourselves in the queue, so that we can call
+        run_commit_ordered() in the same sequence as we called
+        run_prepare_ordered().
+ */
+ thd->clear_wakeup_ready();
+ entry.thd= thd;
+ commit_entry *previous_queue= commit_ordered_queue;
+ entry.next= previous_queue;
+ commit_ordered_queue= &entry;
+ is_group_commit_leader= (previous_queue == NULL);
+ }
+ pthread_mutex_unlock(&LOCK_prepare_ordered);
+ }
+
+ cookie= 0;
+ if (xid)
+ cookie= log_one_transaction(xid);
+
+ if (need_commit_ordered)
+ {
+ if (need_prepare_ordered)
+ {
+ /*
+        We ran run_prepare_ordered() serialised, then log_xid() in parallel.
+        Now we have to run run_commit_ordered() serialised, in the same
+        sequence as run_prepare_ordered().
+
+ We do this starting from the head of the queue, each thread doing
+ run_commit_ordered() and signalling the next in queue.
+ */
+ if (is_group_commit_leader)
+ {
+ /* The first in queue starts the ball rolling. */
+ pthread_mutex_lock(&LOCK_prepare_ordered);
+ while (commit_ordered_queue_busy)
+ pthread_cond_wait(&COND_queue_busy, &LOCK_prepare_ordered);
+ commit_entry *queue= commit_ordered_queue;
+ commit_ordered_queue= NULL;
+ /*
+ Mark the queue busy while we bounce it from one thread to the
+ next.
+ */
+ commit_ordered_queue_busy= true;
+ pthread_mutex_unlock(&LOCK_prepare_ordered);
+
+ /* Reverse the queue list so we get correct order. */
+ commit_entry *prev= NULL;
+ while (queue)
+ {
+ commit_entry *next= queue->next;
+ queue->next= prev;
+ prev= queue;
+ queue= next;
+ }
+ DBUG_ASSERT(prev == &entry && prev->thd == thd);
+ }
+ else
+ {
+ /* Not first in queue; just wait until previous thread wakes us up. */
+ thd->wait_for_wakeup_ready();
+ }
+ }
+
+ /* Only run commit_ordered() if log_xid was successful. */
+ if (cookie)
+ {
+ pthread_mutex_lock(&LOCK_commit_ordered);
+ run_commit_ordered(thd, all);
+ pthread_mutex_unlock(&LOCK_commit_ordered);
+ }
+
+ if (need_prepare_ordered)
+ {
+ commit_entry *next= entry.next;
+ if (next)
+ {
+ next->thd->signal_wakeup_ready();
+ }
+ else
+ {
+ pthread_mutex_lock(&LOCK_prepare_ordered);
+ commit_ordered_queue_busy= false;
+ pthread_cond_signal(&COND_queue_busy);
+ pthread_mutex_unlock(&LOCK_prepare_ordered);
+ }
+ }
+ }
+
+ return cookie;
+}
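
The queue in log_and_order() is built by pushing each entry at the head under LOCK_prepare_ordered, which is cheap but yields reverse (LIFO) order; the leader therefore reverses the list once before handing the commit_ordered() baton down it. A small stand-alone sketch of that push/reverse idiom (hypothetical node type, not part of the patch):

struct qnode { qnode *next; /* per-thread data would go here */ };

/* Push at the head under the mutex: O(1), but produces reverse (LIFO) order. */
static qnode *queue_push_lifo(qnode **head, qnode *e)
{
  qnode *prev= *head;
  e->next= prev;
  *head= e;
  return prev;            /* NULL means this thread became the group leader */
}

/* The leader reverses the detached list once to restore prepare order. */
static qnode *queue_reverse_to_fifo(qnode *lifo)
{
  qnode *fifo= NULL;
  while (lifo)
  {
    qnode *next= lifo->next;
    lifo->next= fifo;
    fifo= lifo;
    lifo= next;
  }
  return fifo;            /* first element is the earliest-queued thread */
}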
+
+
/********* transaction coordinator log for 2pc - mmap() based solution *******/
/*
@@ -5619,6 +6317,7 @@ int TC_LOG_MMAP::open(const char *opt_name)
pthread_mutex_init(&LOCK_pool, MY_MUTEX_INIT_FAST);
pthread_cond_init(&COND_active, 0);
pthread_cond_init(&COND_pool, 0);
+ pthread_cond_init(&COND_queue_busy, 0);
inited=6;
@@ -5626,6 +6325,8 @@ int TC_LOG_MMAP::open(const char *opt_name)
active=pages;
pool=pages+1;
pool_last=pages+npages-1;
+ commit_ordered_queue= NULL;
+ commit_ordered_queue_busy= false;
return 0;
@@ -5731,7 +6432,7 @@ int TC_LOG_MMAP::overflow()
to the position in memory where xid was logged to.
*/
-int TC_LOG_MMAP::log_xid(THD *thd, my_xid xid)
+int TC_LOG_MMAP::log_one_transaction(my_xid xid)
{
int err;
PAGE *p;
@@ -5901,6 +6602,8 @@ void TC_LOG_MMAP::close()
pthread_mutex_destroy(&LOCK_active);
pthread_mutex_destroy(&LOCK_pool);
pthread_cond_destroy(&COND_pool);
+ pthread_cond_destroy(&COND_active);
+ pthread_cond_destroy(&COND_queue_busy);
case 5:
data[0]='A'; // garble the first (signature) byte, in case my_delete fails
case 4:
@@ -6079,7 +6782,8 @@ int TC_LOG_BINLOG::open(const char *opt_name)
goto err;
}
- if ((ev= Log_event::read_log_event(&log, 0, &fdle)) &&
+ if ((ev= Log_event::read_log_event(&log, 0, &fdle,
+ opt_master_verify_checksum)) &&
ev->get_type_code() == FORMAT_DESCRIPTION_EVENT &&
ev->flags & LOG_EVENT_BINLOG_IN_USE_F)
{
@@ -6109,39 +6813,98 @@ void TC_LOG_BINLOG::close()
pthread_cond_destroy (&COND_prep_xids);
}
-/**
- @todo
- group commit
-
- @retval
- 0 error
- @retval
- 1 success
+/*
+ Do a binlog log_xid() for a group of transactions, linked through
+ thd->next_commit_ordered.
*/
-int TC_LOG_BINLOG::log_xid(THD *thd, my_xid xid)
+int
+TC_LOG_BINLOG::log_and_order(THD *thd, my_xid xid, bool all,
+ bool need_prepare_ordered __attribute__((unused)),
+ bool need_commit_ordered __attribute__((unused)))
{
- DBUG_ENTER("TC_LOG_BINLOG::log");
- Xid_log_event xle(thd, xid);
- binlog_trx_data *trx_data=
+ int err;
+ DBUG_ENTER("TC_LOG_BINLOG::log_and_order");
+
+ binlog_trx_data *const trx_data=
(binlog_trx_data*) thd_get_ha_data(thd, binlog_hton);
- /*
- We always commit the entire transaction when writing an XID. Also
- note that the return value is inverted.
- */
- DBUG_RETURN(!binlog_end_trans(thd, trx_data, &xle, TRUE));
+
+ trx_data->using_xa= TRUE;
+ trx_data->xa_xid= xid;
+ if (xid)
+ {
+ Xid_log_event xid_event(thd, xid);
+ err= binlog_flush_trx_cache(thd, trx_data, &xid_event, all);
+ }
+ else
+ {
+ /*
+ Empty xid occurs in XA COMMIT ... ONE PHASE.
+ In this case, we do not have a MySQL xid for the transaction, and the
+ external XA transaction coordinator will have to handle recovery if
+ needed. So we end the transaction with a plain COMMIT query event.
+ */
+ Query_log_event end_event(thd, STRING_WITH_LEN("COMMIT"), TRUE, TRUE, 0);
+ err= binlog_flush_trx_cache(thd, trx_data, &end_event, all);
+ }
+
+ DEBUG_SYNC(thd, "binlog_after_log_and_order");
+
+ DBUG_RETURN(!err);
}
-int TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid)
+/*
+ After an XID is logged, we need to hold on to the current binlog file until
+ it is fully committed in the storage engine. The reason is that crash
+ recovery only looks at the latest binlog, so we must make sure there are no
+ outstanding prepared (but not committed) transactions before rotating the
+ binlog.
+
+ To handle this, we keep a count of outstanding XIDs. This function is used
+ to increase this count when committing one or more transactions to the
+ binary log.
+*/
+void
+TC_LOG_BINLOG::mark_xids_active(uint xid_count)
{
- DBUG_ENTER("TC_LOG_BINLOG::unlog");
+ DBUG_ENTER("TC_LOG_BINLOG::mark_xids_active");
+ DBUG_PRINT("info", ("xid_count=%u", xid_count));
+ pthread_mutex_lock(&LOCK_prep_xids);
+ prepared_xids+= xid_count;
+ pthread_mutex_unlock(&LOCK_prep_xids);
+ DBUG_VOID_RETURN;
+}
+
+/*
+  Once an XID has been committed in the storage engine, it is safe to rotate
+  the binary log away from it, as that XID can no longer be needed during
+  crash recovery.
+
+  This function marks an XID as done. It decreases the count of pending XIDs
+  and signals any thread waiting to rotate the log once the count reaches zero.
+*/
+void
+TC_LOG_BINLOG::mark_xid_done()
+{
+ my_bool send_signal;
+
+ DBUG_ENTER("TC_LOG_BINLOG::mark_xid_done");
pthread_mutex_lock(&LOCK_prep_xids);
DBUG_ASSERT(prepared_xids > 0);
- if (--prepared_xids == 0) {
+ send_signal= !--prepared_xids;
+ pthread_mutex_unlock(&LOCK_prep_xids);
+ if (send_signal) {
DBUG_PRINT("info", ("prepared_xids=%lu", prepared_xids));
pthread_cond_signal(&COND_prep_xids);
}
- pthread_mutex_unlock(&LOCK_prep_xids);
- DBUG_RETURN(rotate_and_purge(0)); // as ::write() did not rotate
+ DBUG_VOID_RETURN;
+}
+
+int TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid)
+{
+ DBUG_ENTER("TC_LOG_BINLOG::unlog");
+ if (xid)
+ mark_xid_done();
+ /* As ::write_transaction_to_binlog() did not rotate, do it here. */
+ DBUG_RETURN(rotate_and_purge(0));
}
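
mark_xids_active() and mark_xid_done() implement a counter of XIDs that have been written to the current binlog but not yet committed in the engine; rotation must wait until it drops to zero. The waiting side is not part of this hunk, but it presumably amounts to a standard condition-variable wait on COND_prep_xids, sketched here with hypothetical parameter names:

#include <pthread.h>

static void wait_until_no_prepared_xids(pthread_mutex_t *lock_prep_xids,
                                        pthread_cond_t  *cond_prep_xids,
                                        unsigned long   *prepared_xids)
{
  pthread_mutex_lock(lock_prep_xids);
  while (*prepared_xids > 0)            /* still needed for crash recovery */
    pthread_cond_wait(cond_prep_xids, lock_prep_xids);
  pthread_mutex_unlock(lock_prep_xids);
  /* Only now is it safe to switch to a new binlog file. */
}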
int TC_LOG_BINLOG::recover(IO_CACHE *log, Format_description_log_event *fdle)
@@ -6159,7 +6922,9 @@ int TC_LOG_BINLOG::recover(IO_CACHE *log, Format_description_log_event *fdle)
fdle->flags&= ~LOG_EVENT_BINLOG_IN_USE_F; // abort on the first error
- while ((ev= Log_event::read_log_event(log,0,fdle)) && ev->is_valid())
+ while ((ev= Log_event::read_log_event(log, 0, fdle,
+ opt_master_verify_checksum))
+ && ev->is_valid())
{
if (ev->get_type_code() == XID_EVENT)
{
@@ -6210,9 +6975,153 @@ ulonglong mysql_bin_log_file_pos(void)
{
return (ulonglong) mysql_bin_log.get_log_file()->pos_in_file;
}
+/*
+  Get the current position of the MySQL binlog for the transaction currently
+  being committed.
+
+  This is only valid to call from within the storage engine commit_ordered()
+  and commit() methods.
+
+  Since the position is stored inside the THD, it is safe to call without any
+  locking.
+*/
+void
+mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file)
+{
+ binlog_trx_data *const trx_data=
+ (binlog_trx_data*) thd_get_ha_data(thd, binlog_hton);
+ if (trx_data)
+ {
+ *out_file= trx_data->last_commit_pos_file;
+ *out_pos= (ulonglong)(trx_data->last_commit_pos_offset);
+ }
+ else
+ {
+ *out_file= NULL;
+ *out_pos= 0;
+ }
+}
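
As the comment notes, mysql_bin_log_commit_pos() is intended to be called from a storage engine's commit_ordered() or commit() method. A hedged sketch of such a caller, reusing the hypothetical myengine hook shape from earlier; only the mysql_bin_log_commit_pos() call itself comes from this patch:

/* Hypothetical helper that persists the coordinates inside the engine. */
static void myengine_store_binlog_coords(const char *file, ulonglong pos);

static void myengine_commit_ordered(handlerton *hton, THD *thd, bool all)
{
  const char *binlog_file;
  ulonglong binlog_pos;

  mysql_bin_log_commit_pos(thd, &binlog_pos, &binlog_file);
  if (binlog_file && binlog_file[0])    /* NULL/empty when there is no position */
    myengine_store_binlog_coords(binlog_file, binlog_pos);
}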
#endif /* INNODB_COMPATIBILITY_HOOKS */
+static void
+binlog_checksum_update(MYSQL_THD thd, struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ ulong value= *((ulong *)save);
+
+ pthread_mutex_lock(mysql_bin_log.get_log_lock());
+ if(mysql_bin_log.is_open())
+ {
+ uint flags= RP_FORCE_ROTATE | RP_LOCK_LOG_IS_ALREADY_LOCKED |
+ (binlog_checksum_options != (uint) value?
+ RP_BINLOG_CHECKSUM_ALG_CHANGE : 0);
+ if (flags & RP_BINLOG_CHECKSUM_ALG_CHANGE)
+ mysql_bin_log.checksum_alg_reset= (uint8) value;
+ mysql_bin_log.rotate_and_purge(flags);
+ }
+ else
+ {
+ binlog_checksum_options= value;
+ }
+ DBUG_ASSERT((ulong) binlog_checksum_options == value);
+ DBUG_ASSERT(mysql_bin_log.checksum_alg_reset == BINLOG_CHECKSUM_ALG_UNDEF);
+ pthread_mutex_unlock(mysql_bin_log.get_log_lock());
+}
+
+
+static int show_binlog_vars(THD *thd, SHOW_VAR *var, char *buff)
+{
+ mysql_bin_log.set_status_variables(thd);
+ var->type= SHOW_ARRAY;
+ var->value= (char *)&binlog_status_vars_detail;
+ return 0;
+}
+
+static SHOW_VAR binlog_status_vars_top[]= {
+ {"binlog", (char *) &show_binlog_vars, SHOW_FUNC},
+ {NullS, NullS, SHOW_LONG}
+};
+
+static MYSQL_SYSVAR_BOOL(
+ optimize_thread_scheduling,
+ opt_optimize_thread_scheduling,
+ PLUGIN_VAR_READONLY,
+ "Run fast part of group commit in a single thread, to optimize kernel "
+ "thread scheduling. On by default. Disable to run each transaction in group "
+ "commit in its own thread, which can be slower at very high concurrency. "
+ "This option is mostly for testing one algorithm versus the other, and it "
+ "should not normally be necessary to change it.",
+ NULL,
+ NULL,
+ 1);
+
+static MYSQL_SYSVAR_ENUM(
+ checksum,
+ binlog_checksum_options,
+ PLUGIN_VAR_RQCMDARG,
+ "Type of BINLOG_CHECKSUM_ALG. Include checksum for "
+ "log events in the binary log. Possible values are NONE and CRC32; "
+ "default is NONE.",
+ NULL,
+ binlog_checksum_update,
+ BINLOG_CHECKSUM_ALG_OFF,
+ &binlog_checksum_typelib);
+
+static struct st_mysql_sys_var *binlog_sys_vars[]=
+{
+ MYSQL_SYSVAR(optimize_thread_scheduling),
+ MYSQL_SYSVAR(checksum),
+ NULL
+};
+
+
+/*
+  Copy out the non-directory part of the binlog position filename for the
+  `binlog_snapshot_file' status variable, the same way as is done for
+  SHOW MASTER STATUS.
+*/
+static void
+set_binlog_snapshot_file(const char *src)
+{
+ int dir_len = dirname_length(src);
+ strmake(binlog_snapshot_file, src + dir_len, sizeof(binlog_snapshot_file)-1);
+}
+
+/*
+ Copy out current values of status variables, for SHOW STATUS or
+ information_schema.global_status.
+
+ This is called only under LOCK_status, so we can fill in a static array.
+*/
+void
+TC_LOG_BINLOG::set_status_variables(THD *thd)
+{
+ binlog_trx_data *trx_data;
+
+ if (thd)
+ trx_data= (binlog_trx_data*) thd_get_ha_data(thd, binlog_hton);
+ else
+ trx_data= NULL;
+
+ bool have_snapshot= (trx_data && trx_data->last_commit_pos_file[0] != 0);
+ pthread_mutex_lock(&LOCK_commit_ordered);
+ binlog_status_var_num_commits= this->num_commits;
+ binlog_status_var_num_group_commits= this->num_group_commits;
+ if (!have_snapshot)
+ {
+ set_binlog_snapshot_file(last_commit_pos_file);
+ binlog_snapshot_position= last_commit_pos_offset;
+ }
+ pthread_mutex_unlock(&LOCK_commit_ordered);
+
+ if (have_snapshot)
+ {
+ set_binlog_snapshot_file(trx_data->last_commit_pos_file);
+ binlog_snapshot_position= trx_data->last_commit_pos_offset;
+ }
+}
+
struct st_mysql_storage_engine binlog_storage_engine=
{ MYSQL_HANDLERTON_INTERFACE_VERSION };
@@ -6227,8 +7136,8 @@ mysql_declare_plugin(binlog)
binlog_init, /* Plugin Init */
NULL, /* Plugin Deinit */
0x0100 /* 1.0 */,
- NULL, /* status variables */
- NULL, /* system variables */
+ binlog_status_vars_top, /* status variables */
+ binlog_sys_vars, /* system variables */
NULL /* config options */
}
mysql_declare_plugin_end;
@@ -6243,8 +7152,8 @@ maria_declare_plugin(binlog)
binlog_init, /* Plugin Init */
NULL, /* Plugin Deinit */
0x0100 /* 1.0 */,
- NULL, /* status variables */
- NULL, /* system variables */
+ binlog_status_vars_top, /* status variables */
+ binlog_sys_vars, /* system variables */
"1.0", /* string version */
MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
}