summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--innobase/buf/buf0flu.c2
-rw-r--r--innobase/include/log0log.h65
-rw-r--r--innobase/include/trx0trx.h14
-rw-r--r--innobase/log/log0log.c158
-rw-r--r--innobase/os/os0file.c7
-rw-r--r--innobase/row/row0mysql.c2
-rw-r--r--innobase/srv/srv0srv.c17
-rw-r--r--innobase/trx/trx0trx.c48
-rw-r--r--sql/ha_innodb.cc64
-rw-r--r--sql/ha_innodb.h2
-rw-r--r--sql/handler.cc30
-rw-r--r--sql/handler.h1
-rw-r--r--sql/log.cc20
13 files changed, 276 insertions, 154 deletions
diff --git a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c
index 516056b5174..4d998f8306f 100644
--- a/innobase/buf/buf0flu.c
+++ b/innobase/buf/buf0flu.c
@@ -398,7 +398,7 @@ buf_flush_write_block_low(
"Warning: cannot force log to disk in the log debug version!\n");
#else
/* Force the log to the disk before writing the modified block */
- log_flush_up_to(block->newest_modification, LOG_WAIT_ALL_GROUPS);
+ log_write_up_to(block->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
#endif
buf_flush_init_for_writing(block->frame, block->newest_modification,
block->space, block->offset);
diff --git a/innobase/include/log0log.h b/innobase/include/log0log.h
index f200371de9d..4e1404b15fe 100644
--- a/innobase/include/log0log.h
+++ b/innobase/include/log0log.h
@@ -20,7 +20,7 @@ typedef struct log_group_struct log_group_t;
extern ibool log_do_write;
extern ibool log_debug_writes;
-/* Wait modes for log_flush_up_to */
+/* Wait modes for log_write_up_to */
#define LOG_NO_WAIT 91
#define LOG_WAIT_ONE_GROUP 92
#define LOG_WAIT_ALL_GROUPS 93
@@ -157,26 +157,21 @@ log_io_complete(
/*============*/
log_group_t* group); /* in: log group */
/**********************************************************
-Flushes the log files to the disk, using, for example, the Unix fsync.
-This function does the flush even if the user has set
-srv_flush_log_at_trx_commit = FALSE. */
-
-void
-log_flush_to_disk(void);
-/*===================*/
-/**********************************************************
This function is called, e.g., when a transaction wants to commit. It checks
-that the log has been flushed to disk up to the last log entry written by the
-transaction. If there is a flush running, it waits and checks if the flush
-flushed enough. If not, starts a new flush. */
+that the log has been written to the log file up to the last log entry written
+by the transaction. If there is a flush running, it waits and checks if the
+flush flushed enough. If not, starts a new flush. */
void
-log_flush_up_to(
+log_write_up_to(
/*============*/
dulint lsn, /* in: log sequence number up to which the log should
- be flushed, ut_dulint_max if not specified */
- ulint wait); /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
+ be written, ut_dulint_max if not specified */
+ ulint wait, /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
or LOG_WAIT_ALL_GROUPS */
+ ibool flush_to_disk);
+ /* in: TRUE if we want the written log also to be
+ flushed to disk */
/********************************************************************
Advances the smallest lsn for which there are unflushed dirty blocks in the
buffer pool and also may make a new checkpoint. NOTE: this function may only
@@ -741,27 +736,37 @@ struct log_struct{
be advanced, it is enough that the
write i/o has been completed for all
log groups */
- dulint flush_lsn; /* end lsn for the current flush */
- ulint flush_end_offset;/* the data in buffer has been flushed
+ dulint write_lsn; /* end lsn for the current running
+ write */
+ ulint write_end_offset;/* the data in buffer has been written
up to this offset when the current
- flush ends: this field will then
+ write ends: this field will then
be copied to buf_next_to_write */
- ulint n_pending_writes;/* number of currently pending flush
- writes */
+ dulint current_flush_lsn;/* end lsn for the current running
+ write + flush operation */
+ dulint flushed_to_disk_lsn;
+ /* how far we have written the log
+ AND flushed to disk */
+ ulint n_pending_writes;/* number of currently pending flushes
+ or writes */
+ /* NOTE on the 'flush' in names of the fields below: starting from
+ 4.0.14, we separate the write of the log file and the actual fsync()
+ or other method to flush it to disk. The names below should really
+ be 'flush_or_write'! */
os_event_t no_flush_event; /* this event is in the reset state
- when a flush is running; a thread
- should wait for this without owning
- the log mutex, but NOTE that to set or
- reset this event, the thread MUST own
- the log mutex! */
+ when a flush or a write is running;
+ a thread should wait for this without
+ owning the log mutex, but NOTE that
+ to set or reset this event, the
+ thread MUST own the log mutex! */
ibool one_flushed; /* during a flush, this is first FALSE
and becomes TRUE when one log group
- has been flushed */
+ has been written or flushed */
os_event_t one_flushed_event;/* this event is reset when the
- flush has not yet completed for any
- log group; e.g., this means that a
- transaction has been committed when
- this is set; a thread should wait
+ flush or write has not yet completed
+ for any log group; e.g., this means
+ that a transaction has been committed
+ when this is set; a thread should wait
for this without owning the log mutex,
but NOTE that to set or reset this
event, the thread MUST own the log
diff --git a/innobase/include/trx0trx.h b/innobase/include/trx0trx.h
index be96519c4ea..39229923375 100644
--- a/innobase/include/trx0trx.h
+++ b/innobase/include/trx0trx.h
@@ -157,6 +157,15 @@ trx_commit_for_mysql(
/* out: 0 or error number */
trx_t* trx); /* in: trx handle */
/**************************************************************************
+If required, flushes the log to disk if we called trx_commit_for_mysql()
+with trx->flush_log_later == TRUE. */
+
+ulint
+trx_commit_complete_for_mysql(
+/*==========================*/
+ /* out: 0 or error number */
+ trx_t* trx); /* in: trx handle */
+/**************************************************************************
Marks the latest SQL statement ended. */
void
@@ -343,6 +352,11 @@ struct trx_struct{
dulint no; /* transaction serialization number ==
max trx id when the transaction is
moved to COMMITTED_IN_MEMORY state */
+ ibool flush_log_later;/* when we commit the transaction
+ in MySQL's binlog write, we will
+ flush the log to disk later in
+ a separate call */
+ dulint commit_lsn; /* lsn at the time of the commit */
ibool dict_operation; /* TRUE if the trx is used to create
a table, create an index, or drop a
table */
diff --git a/innobase/log/log0log.c b/innobase/log/log0log.c
index 539cde337bd..25cc666e802 100644
--- a/innobase/log/log0log.c
+++ b/innobase/log/log0log.c
@@ -178,7 +178,7 @@ loop:
/* Not enough free space, do a syncronous flush of the log
buffer */
- log_flush_up_to(ut_dulint_max, LOG_WAIT_ALL_GROUPS);
+ log_write_up_to(ut_dulint_max, LOG_WAIT_ALL_GROUPS, TRUE);
count++;
@@ -675,7 +675,9 @@ log_init(void)
log_sys->buf_next_to_write = 0;
- log_sys->flush_lsn = ut_dulint_zero;
+ log_sys->write_lsn = ut_dulint_zero;
+ log_sys->current_flush_lsn = ut_dulint_zero;
+ log_sys->flushed_to_disk_lsn = ut_dulint_zero;
log_sys->written_to_some_lsn = log_sys->lsn;
log_sys->written_to_all_lsn = log_sys->lsn;
@@ -867,7 +869,7 @@ log_group_check_flush_completion(
printf("Log flushed first to group %lu\n", group->id);
}
- log_sys->written_to_some_lsn = log_sys->flush_lsn;
+ log_sys->written_to_some_lsn = log_sys->write_lsn;
log_sys->one_flushed = TRUE;
return(LOG_UNLOCK_NONE_FLUSHED_LOCK);
@@ -896,15 +898,15 @@ log_sys_check_flush_completion(void)
if (log_sys->n_pending_writes == 0) {
- log_sys->written_to_all_lsn = log_sys->flush_lsn;
- log_sys->buf_next_to_write = log_sys->flush_end_offset;
+ log_sys->written_to_all_lsn = log_sys->write_lsn;
+ log_sys->buf_next_to_write = log_sys->write_end_offset;
- if (log_sys->flush_end_offset > log_sys->max_buf_free / 2) {
+ if (log_sys->write_end_offset > log_sys->max_buf_free / 2) {
/* Move the log buffer content to the start of the
buffer */
move_start = ut_calc_align_down(
- log_sys->flush_end_offset,
+ log_sys->write_end_offset,
OS_FILE_LOG_BLOCK_SIZE);
move_end = ut_calc_align(log_sys->buf_free,
OS_FILE_LOG_BLOCK_SIZE);
@@ -982,57 +984,6 @@ log_io_complete(
}
/**********************************************************
-Flushes the log files to the disk, using, for example, the Unix fsync.
-This function does the flush even if the user has set
-srv_flush_log_at_trx_commit = FALSE. */
-
-void
-log_flush_to_disk(void)
-/*===================*/
-{
- log_group_t* group;
-loop:
- mutex_enter(&(log_sys->mutex));
-
- if (log_sys->n_pending_writes > 0) {
- /* A log file write is running */
-
- mutex_exit(&(log_sys->mutex));
-
- /* Wait for the log file write to complete and try again */
-
- os_event_wait(log_sys->no_flush_event);
-
- goto loop;
- }
-
- group = UT_LIST_GET_FIRST(log_sys->log_groups);
-
- log_sys->n_pending_writes++;
- group->n_pending_writes++;
-
- os_event_reset(log_sys->no_flush_event);
- os_event_reset(log_sys->one_flushed_event);
-
- mutex_exit(&(log_sys->mutex));
-
- fil_flush(group->space_id);
-
- mutex_enter(&(log_sys->mutex));
-
- ut_a(group->n_pending_writes == 1);
- ut_a(log_sys->n_pending_writes == 1);
-
- group->n_pending_writes--;
- log_sys->n_pending_writes--;
-
- os_event_set(log_sys->no_flush_event);
- os_event_set(log_sys->one_flushed_event);
-
- mutex_exit(&(log_sys->mutex));
-}
-
-/**********************************************************
Writes a log file header to a log file space. */
static
void
@@ -1205,12 +1156,15 @@ by the transaction. If there is a flush running, it waits and checks if the
flush flushed enough. If not, starts a new flush. */
void
-log_flush_up_to(
+log_write_up_to(
/*============*/
dulint lsn, /* in: log sequence number up to which the log should
be written, ut_dulint_max if not specified */
- ulint wait) /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
+ ulint wait, /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
or LOG_WAIT_ALL_GROUPS */
+ ibool flush_to_disk)
+ /* in: TRUE if we want the written log also to be
+ flushed to disk */
{
log_group_t* group;
ulint start_offset;
@@ -1239,9 +1193,18 @@ loop:
mutex_enter(&(log_sys->mutex));
- if ((ut_dulint_cmp(log_sys->written_to_all_lsn, lsn) >= 0)
- || ((ut_dulint_cmp(log_sys->written_to_some_lsn, lsn) >= 0)
- && (wait != LOG_WAIT_ALL_GROUPS))) {
+ if (flush_to_disk
+ && ut_dulint_cmp(log_sys->flushed_to_disk_lsn, lsn) >= 0) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ return;
+ }
+
+ if (!flush_to_disk
+ && (ut_dulint_cmp(log_sys->written_to_all_lsn, lsn) >= 0
+ || (ut_dulint_cmp(log_sys->written_to_some_lsn, lsn) >= 0
+ && wait != LOG_WAIT_ALL_GROUPS))) {
mutex_exit(&(log_sys->mutex));
@@ -1249,10 +1212,19 @@ loop:
}
if (log_sys->n_pending_writes > 0) {
- /* A flush is running */
+ /* A write (+ possibly flush to disk) is running */
+
+ if (flush_to_disk
+ && ut_dulint_cmp(log_sys->current_flush_lsn, lsn) >= 0) {
+ /* The write + flush will write enough: wait for it to
+ complete */
+
+ goto do_waits;
+ }
- if (ut_dulint_cmp(log_sys->flush_lsn, lsn) >= 0) {
- /* The flush will flush enough: wait for it to
+ if (!flush_to_disk
+ && ut_dulint_cmp(log_sys->write_lsn, lsn) >= 0) {
+ /* The write will write enough: wait for it to
complete */
goto do_waits;
@@ -1260,16 +1232,17 @@ loop:
mutex_exit(&(log_sys->mutex));
- /* Wait for the flush to complete and try to start a new
- flush */
+ /* Wait for the write to complete and try to start a new
+ write */
os_event_wait(log_sys->no_flush_event);
goto loop;
}
- if (log_sys->buf_free == log_sys->buf_next_to_write) {
- /* Nothing to flush */
+ if (!flush_to_disk
+ && log_sys->buf_free == log_sys->buf_next_to_write) {
+ /* Nothing to write and no flush to disk requested */
mutex_exit(&(log_sys->mutex));
@@ -1277,7 +1250,7 @@ loop:
}
if (log_debug_writes) {
- printf("Flushing log from %lu %lu up to lsn %lu %lu\n",
+ printf("Writing log from %lu %lu up to lsn %lu %lu\n",
ut_dulint_get_high(log_sys->written_to_all_lsn),
ut_dulint_get_low(log_sys->written_to_all_lsn),
ut_dulint_get_high(log_sys->lsn),
@@ -1301,7 +1274,12 @@ loop:
ut_ad(area_end - area_start > 0);
- log_sys->flush_lsn = log_sys->lsn;
+ log_sys->write_lsn = log_sys->lsn;
+
+ if (flush_to_disk) {
+ log_sys->current_flush_lsn = log_sys->lsn;
+ }
+
log_sys->one_flushed = FALSE;
log_block_set_flush_bit(log_sys->buf + area_start, TRUE);
@@ -1318,10 +1296,12 @@ loop:
OS_FILE_LOG_BLOCK_SIZE);
log_sys->buf_free += OS_FILE_LOG_BLOCK_SIZE;
- log_sys->flush_end_offset = log_sys->buf_free;
+ log_sys->write_end_offset = log_sys->buf_free;
group = UT_LIST_GET_FIRST(log_sys->log_groups);
+ /* Do the write to the log files */
+
while (group) {
log_group_write_buf(LOG_FLUSH, group,
log_sys->buf + area_start,
@@ -1330,20 +1310,25 @@ loop:
OS_FILE_LOG_BLOCK_SIZE),
start_offset - area_start);
- log_group_set_fields(group, log_sys->flush_lsn);
+ log_group_set_fields(group, log_sys->write_lsn);
group = UT_LIST_GET_NEXT(log_groups, group);
}
mutex_exit(&(log_sys->mutex));
- if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
- && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
- && srv_flush_log_at_trx_commit != 2) {
+ if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
+ /* O_DSYNC means the OS did not buffer the log file at all:
+ so we have also flushed to disk what we have written */
+
+ log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
+
+ } else if (flush_to_disk) {
group = UT_LIST_GET_FIRST(log_sys->log_groups);
fil_flush(group->space_id);
+ log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
}
mutex_enter(&(log_sys->mutex));
@@ -1403,7 +1388,7 @@ log_flush_margin(void)
mutex_exit(&(log->mutex));
if (do_flush) {
- log_flush_up_to(ut_dulint_max, LOG_NO_WAIT);
+ log_write_up_to(ut_dulint_max, LOG_NO_WAIT, FALSE);
}
}
@@ -1555,7 +1540,8 @@ log_group_checkpoint(
buf = group->checkpoint_buf;
mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys->next_checkpoint_no);
- mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys->next_checkpoint_lsn);
+ mach_write_to_8(buf + LOG_CHECKPOINT_LSN,
+ log_sys->next_checkpoint_lsn);
mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET,
log_group_calc_lsn_offset(
@@ -1664,8 +1650,10 @@ log_reset_first_header_and_checkpoint(
lsn = ut_dulint_add(start, LOG_BLOCK_HDR_SIZE);
/* Write the label of ibbackup --restore */
- sprintf((char*) hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, "ibbackup ");
- ut_sprintf_timestamp((char*) hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP
+ sprintf((char*) hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP,
+ "ibbackup ");
+ ut_sprintf_timestamp(
+ (char*) hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP
+ strlen("ibbackup "));
buf = hdr_buf + LOG_CHECKPOINT_1;
@@ -1773,7 +1761,7 @@ log_checkpoint(
write-ahead-logging algorithm ensures that the log has been flushed
up to oldest_lsn. */
- log_flush_up_to(oldest_lsn, LOG_WAIT_ALL_GROUPS);
+ log_write_up_to(oldest_lsn, LOG_WAIT_ALL_GROUPS, TRUE);
mutex_enter(&(log_sys->mutex));
@@ -2466,7 +2454,7 @@ loop:
mutex_exit(&(log_sys->mutex));
- log_flush_up_to(limit_lsn, LOG_WAIT_ALL_GROUPS);
+ log_write_up_to(limit_lsn, LOG_WAIT_ALL_GROUPS, TRUE);
calc_new_limit = FALSE;
@@ -3104,8 +3092,8 @@ log_print(
"Last checkpoint at %lu %lu\n",
ut_dulint_get_high(log_sys->lsn),
ut_dulint_get_low(log_sys->lsn),
- ut_dulint_get_high(log_sys->written_to_some_lsn),
- ut_dulint_get_low(log_sys->written_to_some_lsn),
+ ut_dulint_get_high(log_sys->flushed_to_disk_lsn),
+ ut_dulint_get_low(log_sys->flushed_to_disk_lsn),
ut_dulint_get_high(log_sys->last_checkpoint_lsn),
ut_dulint_get_low(log_sys->last_checkpoint_lsn));
diff --git a/innobase/os/os0file.c b/innobase/os/os0file.c
index 1d1d84adda7..46129e3de79 100644
--- a/innobase/os/os0file.c
+++ b/innobase/os/os0file.c
@@ -521,10 +521,11 @@ try_again:
}
#endif
#ifdef UNIV_NON_BUFFERED_IO
- if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
+ if (type == OS_LOG_FILE) {
/* Do not use unbuffered i/o to log files because
- value 2 denotes that we do not flush the log at every
- commit, but only once per second */
+ to allow group commit to work when MySQL binlogging
+ is used we must separate log file write and log
+ file flush to disk. */
} else {
if (srv_win_file_flush_method ==
SRV_WIN_IO_UNBUFFERED) {
diff --git a/innobase/row/row0mysql.c b/innobase/row/row0mysql.c
index db1119a2abc..428e4d568f3 100644
--- a/innobase/row/row0mysql.c
+++ b/innobase/row/row0mysql.c
@@ -1664,7 +1664,7 @@ row_drop_table_for_mysql_in_background(
the InnoDB data dictionary get out-of-sync if the user runs
with innodb_flush_log_at_trx_commit = 0 */
- log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
+ log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
trx_commit_for_mysql(trx);
diff --git a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c
index d90b818ad4b..07df708e5fb 100644
--- a/innobase/srv/srv0srv.c
+++ b/innobase/srv/srv0srv.c
@@ -2812,8 +2812,7 @@ loop:
at transaction commit */
srv_main_thread_op_info = (char*)"flushing log";
- log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
- log_flush_to_disk();
+ log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
/* If there were less than 10 i/os during the
one second sleep, we assume that there is free
@@ -2831,8 +2830,8 @@ loop:
srv_main_thread_op_info =
(char*)"flushing log";
- log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
- log_flush_to_disk();
+ log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP,
+ TRUE);
}
if (srv_activity_count == old_activity_count) {
@@ -2867,8 +2866,7 @@ loop:
buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max);
srv_main_thread_op_info = (char*) "flushing log";
- log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
- log_flush_to_disk();
+ log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
}
/* We run a batch of insert buffer merge every 10 seconds,
@@ -2878,8 +2876,7 @@ loop:
ibuf_contract_for_n_pages(TRUE, 5);
srv_main_thread_op_info = (char*)"flushing log";
- log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
- log_flush_to_disk();
+ log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
/* We run a full purge every 10 seconds, even if the server
were active */
@@ -2903,8 +2900,8 @@ loop:
if (difftime(current_time, last_flush_time) > 1) {
srv_main_thread_op_info = (char*) "flushing log";
- log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
- log_flush_to_disk();
+ log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP,
+ TRUE);
last_flush_time = current_time;
}
}
diff --git a/innobase/trx/trx0trx.c b/innobase/trx/trx0trx.c
index 4ce2236f78a..e6ef400bb40 100644
--- a/innobase/trx/trx0trx.c
+++ b/innobase/trx/trx0trx.c
@@ -89,6 +89,8 @@ trx_create(
trx->check_foreigns = TRUE;
trx->check_unique_secondary = TRUE;
+ trx->flush_log_later = FALSE;
+
trx->dict_operation = FALSE;
trx->mysql_thd = NULL;
@@ -780,13 +782,26 @@ trx_commit_off_kernel(
/*-------------------------------------*/
- /* Most MySQL users run with srv_flush_.. set to FALSE: */
+ /* Most MySQL users run with srv_flush_.. set to 0: */
- if (srv_flush_log_at_trx_commit) {
-
- log_flush_up_to(lsn, LOG_WAIT_ONE_GROUP);
+ if (srv_flush_log_at_trx_commit != 0) {
+ if (srv_unix_file_flush_method != SRV_UNIX_NOSYNC
+ && srv_flush_log_at_trx_commit != 2
+ && !trx->flush_log_later) {
+
+ /* Write the log to the log files AND flush
+ them to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
+ } else {
+ /* Write the log but do not flush it to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+ }
}
+ trx->commit_lsn = lsn;
+
/*-------------------------------------*/
mutex_enter(&kernel_mutex);
@@ -1468,6 +1483,31 @@ trx_commit_for_mysql(
}
/**************************************************************************
+If required, flushes the log to disk if we called trx_commit_for_mysql()
+with trx->flush_log_later == TRUE. */
+
+ulint
+trx_commit_complete_for_mysql(
+/*==========================*/
+			/* out: 0 or error number; this implementation
+			always returns 0 */
+	trx_t*	trx)	/* in: trx handle */
+{
+	ut_a(trx);
+
+	/* Only flush when the log is to be flushed to disk at every
+	commit (srv_flush_log_at_trx_commit == 1) and the flush method
+	is not nosync; with any other setting this call is a no-op */
+
+	if (srv_flush_log_at_trx_commit == 1
+	    && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
+
+		trx->op_info = (char *) "flushing log";
+
+		/* Flush the log files to disk up to the lsn stored in
+		trx->commit_lsn */
+
+		log_write_up_to(trx->commit_lsn, LOG_WAIT_ONE_GROUP, TRUE);
+
+		trx->op_info = (char *) "";
+	}
+
+	/* The function is declared to return ulint: return an explicit
+	success code instead of falling off the end of the function */
+
+	return(0);
+}
+
+/**************************************************************************
Marks the latest SQL statement ended. */
void
diff --git a/sql/ha_innodb.cc b/sql/ha_innodb.cc
index 73654536083..50bb4275eaa 100644
--- a/sql/ha_innodb.cc
+++ b/sql/ha_innodb.cc
@@ -872,8 +872,7 @@ innobase_flush_logs(void)
DBUG_ENTER("innobase_flush_logs");
- log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
- log_flush_to_disk();
+ log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
DBUG_RETURN(result);
}
@@ -920,7 +919,7 @@ Commits a transaction in an InnoDB database. */
int
innobase_commit(
/*============*/
- /* out: 0 or error number */
+ /* out: 0 */
THD* thd, /* in: MySQL thread handle of the user for whom
the transaction should be committed */
void* trx_handle)/* in: InnoDB trx handle or
@@ -928,7 +927,6 @@ innobase_commit(
that the current SQL statement ended, and we should
mark the start of a new statement with a savepoint */
{
- int error = 0;
trx_t* trx;
DBUG_ENTER("innobase_commit");
@@ -955,29 +953,27 @@ innobase_commit(
innobase_release_stat_resources(trx);
trx_mark_sql_stat_end(trx);
-#ifndef DBUG_OFF
- if (error) {
- DBUG_PRINT("error", ("error: %d", error));
- }
-#endif
/* Tell InnoDB server that there might be work for
utility threads: */
srv_active_wake_master_thread();
- DBUG_RETURN(error);
+ DBUG_RETURN(0);
}
/*********************************************************************
This is called when MySQL writes the binlog entry for the current
transaction. Writes to the InnoDB tablespace info which tells where the
MySQL binlog entry for the current transaction ended. Also commits the
-transaction inside InnoDB. */
+transaction inside InnoDB but does NOT flush InnoDB log files to disk.
+To flush you have to call innobase_flush_log_to_disk. We have separated
+flushing to eliminate the bottleneck of LOCK_log in log.cc which disabled
+InnoDB's group commit capability. */
int
innobase_report_binlog_offset_and_commit(
/*=====================================*/
- /* out: 0 or error code */
+ /* out: 0 */
THD* thd, /* in: user thread */
void* trx_handle, /* in: InnoDB trx handle */
char* log_file_name, /* in: latest binlog file name */
@@ -993,7 +989,39 @@ innobase_report_binlog_offset_and_commit(
trx->mysql_log_file_name = log_file_name;
trx->mysql_log_offset = (ib_longlong)end_offset;
- return(innobase_commit(thd, trx_handle));
+ trx->flush_log_later = TRUE;
+
+ innobase_commit(thd, trx_handle);
+
+ trx->flush_log_later = FALSE;
+
+ return(0);
+}
+
+/*********************************************************************
+Called once MySQL has written the binlog entry for the current
+transaction. Unless srv_flush_log_at_trx_commit is 0, hands the
+transaction to trx_commit_complete_for_mysql(), which flushes the InnoDB
+log files to disk if that is required. */
+
+int
+innobase_commit_complete(
+/*=====================*/
+				/* out: 0 */
+	void*	trx_handle)	/* in: InnoDB trx handle */
+{
+	if (srv_flush_log_at_trx_commit != 0) {
+		/* The handle is only inspected when there may be
+		something to flush */
+
+		trx_t*	committed_trx = (trx_t*)trx_handle;
+
+		ut_a(committed_trx != NULL);
+
+		trx_commit_complete_for_mysql(committed_trx);
+	}
+
+	return(0);
+}
/*********************************************************************
@@ -3202,7 +3230,7 @@ ha_innobase::create(
the InnoDB data dictionary get out-of-sync if the user runs
with innodb_flush_log_at_trx_commit = 0 */
- log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
+ log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
innobase_table = dict_table_get(norm_name, NULL);
@@ -3277,7 +3305,7 @@ ha_innobase::delete_table(
the InnoDB data dictionary get out-of-sync if the user runs
with innodb_flush_log_at_trx_commit = 0 */
- log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
+ log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
/* Tell the InnoDB server that there might be work for
utility threads: */
@@ -3347,7 +3375,7 @@ innobase_drop_database(
the InnoDB data dictionary get out-of-sync if the user runs
with innodb_flush_log_at_trx_commit = 0 */
- log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
+ log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
/* Tell the InnoDB server that there might be work for
utility threads: */
@@ -3419,7 +3447,7 @@ ha_innobase::rename_table(
the InnoDB data dictionary get out-of-sync if the user runs
with innodb_flush_log_at_trx_commit = 0 */
- log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
+ log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
/* Tell the InnoDB server that there might be work for
utility threads: */
@@ -3936,7 +3964,7 @@ ha_innobase::extra(
case HA_EXTRA_RESET:
case HA_EXTRA_RESET_STATE:
prebuilt->read_just_key = 0;
- break;
+ break;
case HA_EXTRA_NO_KEYREAD:
prebuilt->read_just_key = 0;
break;
diff --git a/sql/ha_innodb.h b/sql/ha_innodb.h
index 5677d22a2ca..8309c5eb440 100644
--- a/sql/ha_innodb.h
+++ b/sql/ha_innodb.h
@@ -211,6 +211,8 @@ int innobase_report_binlog_offset_and_commit(
void* trx_handle,
char* log_file_name,
my_off_t end_offset);
+int innobase_commit_complete(
+ void* trx_handle);
int innobase_rollback(THD *thd, void* trx_handle);
int innobase_close_connection(THD *thd);
int innobase_drop_database(char *path);
diff --git a/sql/handler.cc b/sql/handler.cc
index 6e3f8486b45..ba7799fef4a 100644
--- a/sql/handler.cc
+++ b/sql/handler.cc
@@ -243,6 +243,9 @@ int ha_autocommit_or_rollback(THD *thd, int error)
replication. This function also calls the commit of the table
handler, because the order of transactions in the log of the table
handler must be the same as in the binlog.
+ NOTE that to eliminate the bottleneck of the group commit, we do not
+ flush the handler log files here, but only later in a call of
+ ha_commit_complete().
arguments:
thd: the thread handle of the current connection
@@ -269,13 +272,38 @@ int ha_report_binlog_offset_and_commit(THD *thd,
my_error(ER_ERROR_DURING_COMMIT, MYF(0), error);
error=1;
}
- trans->innodb_active_trans=0;
}
#endif
return error;
}
/*
+ Flushes the handler log files (if my.cnf settings do not free us from it)
+ after we have called ha_report_binlog_offset_and_commit(). To eliminate
+ the bottleneck from the group commit, this should be called when
+ LOCK_log has been released in log.cc.
+
+ arguments:
+ thd: the thread handle of the current connection
+ return value: always 0
+*/
+
+int ha_commit_complete(THD *thd)
+{
+#ifdef HAVE_INNOBASE_DB
+  THD_TRANS *all_trans= &thd->transaction.all;
+
+  /* Without an InnoDB transaction handle there is nothing to complete */
+  if (!all_trans->innobase_tid)
+    return 0;
+
+  innobase_commit_complete(all_trans->innobase_tid);
+
+  /* Clear the active-transaction flag only after the handler call above */
+  all_trans->innodb_active_trans=0;
+#endif
+  return 0;
+}
+
+/*
This function should be called when MySQL sends rows of a SELECT result set
or the EOF mark to the client. It releases a possible adaptive hash index
S-latch held by thd in InnoDB and also releases a possible InnoDB query
diff --git a/sql/handler.h b/sql/handler.h
index 72a05d7ebee..fbad36bffdd 100644
--- a/sql/handler.h
+++ b/sql/handler.h
@@ -372,6 +372,7 @@ void ha_resize_key_cache(void);
int ha_start_stmt(THD *thd);
int ha_report_binlog_offset_and_commit(THD *thd, char *log_file_name,
my_off_t end_offset);
+int ha_commit_complete(THD *thd);
int ha_release_temporary_latches(THD *thd);
int ha_commit_trans(THD *thd, THD_TRANS *trans);
int ha_rollback_trans(THD *thd, THD_TRANS *trans);
diff --git a/sql/log.cc b/sql/log.cc
index 8a5aba5cd34..f4c78b9c50d 100644
--- a/sql/log.cc
+++ b/sql/log.cc
@@ -1033,6 +1033,8 @@ bool MYSQL_LOG::write(THD *thd,enum enum_server_command command,
bool MYSQL_LOG::write(Log_event* event_info)
{
+ THD *thd=event_info->thd;
+ bool called_handler_commit=0;
bool error=0;
DBUG_ENTER("MYSQL_LOG::write(event)");
@@ -1047,7 +1049,6 @@ bool MYSQL_LOG::write(Log_event* event_info)
if (is_open())
{
bool should_rotate = 0;
- THD *thd=event_info->thd;
const char *local_db = event_info->get_db();
#ifdef USING_TRANSACTIONS
IO_CACHE *file = ((event_info->get_cache_stmt()) ?
@@ -1147,6 +1148,7 @@ bool MYSQL_LOG::write(Log_event* event_info)
{
error = ha_report_binlog_offset_and_commit(thd, log_file_name,
file->pos_in_file);
+ called_handler_commit=1;
}
should_rotate= (my_b_tell(file) >= (my_off_t) max_binlog_size);
@@ -1172,6 +1174,15 @@ err:
}
pthread_mutex_unlock(&LOCK_log);
+
+ /* Flush the transactional handler log file now that we have released
+ LOCK_log; the flush is placed here to eliminate the bottleneck on the
+ group commit */
+
+ if (called_handler_commit) {
+ ha_commit_complete(thd);
+ }
+
DBUG_RETURN(error);
}
@@ -1277,6 +1288,13 @@ bool MYSQL_LOG::write(THD *thd, IO_CACHE *cache)
}
VOID(pthread_mutex_unlock(&LOCK_log));
+
+ /* Flush the transactional handler log file now that we have released
+ LOCK_log; the flush is placed here to eliminate the bottleneck on the
+ group commit */
+
+ ha_commit_complete(thd);
+
DBUG_RETURN(0);
err: