diff options
Diffstat (limited to 'innobase')
-rw-r--r-- | innobase/buf/buf0flu.c | 2 | ||||
-rw-r--r-- | innobase/include/log0log.h | 65 | ||||
-rw-r--r-- | innobase/include/trx0trx.h | 14 | ||||
-rw-r--r-- | innobase/log/log0log.c | 158 | ||||
-rw-r--r-- | innobase/os/os0file.c | 7 | ||||
-rw-r--r-- | innobase/row/row0mysql.c | 2 | ||||
-rw-r--r-- | innobase/srv/srv0srv.c | 17 | ||||
-rw-r--r-- | innobase/trx/trx0trx.c | 48 |
8 files changed, 179 insertions, 134 deletions
diff --git a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c index 516056b5174..4d998f8306f 100644 --- a/innobase/buf/buf0flu.c +++ b/innobase/buf/buf0flu.c @@ -398,7 +398,7 @@ buf_flush_write_block_low( "Warning: cannot force log to disk in the log debug version!\n"); #else /* Force the log to the disk before writing the modified block */ - log_flush_up_to(block->newest_modification, LOG_WAIT_ALL_GROUPS); + log_write_up_to(block->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE); #endif buf_flush_init_for_writing(block->frame, block->newest_modification, block->space, block->offset); diff --git a/innobase/include/log0log.h b/innobase/include/log0log.h index f200371de9d..4e1404b15fe 100644 --- a/innobase/include/log0log.h +++ b/innobase/include/log0log.h @@ -20,7 +20,7 @@ typedef struct log_group_struct log_group_t; extern ibool log_do_write; extern ibool log_debug_writes; -/* Wait modes for log_flush_up_to */ +/* Wait modes for log_write_up_to */ #define LOG_NO_WAIT 91 #define LOG_WAIT_ONE_GROUP 92 #define LOG_WAIT_ALL_GROUPS 93 @@ -157,26 +157,21 @@ log_io_complete( /*============*/ log_group_t* group); /* in: log group */ /********************************************************** -Flushes the log files to the disk, using, for example, the Unix fsync. -This function does the flush even if the user has set -srv_flush_log_at_trx_commit = FALSE. */ - -void -log_flush_to_disk(void); -/*===================*/ -/********************************************************** This function is called, e.g., when a transaction wants to commit. It checks -that the log has been flushed to disk up to the last log entry written by the -transaction. If there is a flush running, it waits and checks if the flush -flushed enough. If not, starts a new flush. */ +that the log has been written to the log file up to the last log entry written +by the transaction. If there is a flush running, it waits and checks if the +flush flushed enough. If not, starts a new flush. 
*/ void -log_flush_up_to( +log_write_up_to( /*============*/ dulint lsn, /* in: log sequence number up to which the log should - be flushed, ut_dulint_max if not specified */ - ulint wait); /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP, + be written, ut_dulint_max if not specified */ + ulint wait, /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP, or LOG_WAIT_ALL_GROUPS */ + ibool flush_to_disk); + /* in: TRUE if we want the written log also to be + flushed to disk */ /******************************************************************** Advances the smallest lsn for which there are unflushed dirty blocks in the buffer pool and also may make a new checkpoint. NOTE: this function may only @@ -741,27 +736,37 @@ struct log_struct{ be advanced, it is enough that the write i/o has been completed for all log groups */ - dulint flush_lsn; /* end lsn for the current flush */ - ulint flush_end_offset;/* the data in buffer has been flushed + dulint write_lsn; /* end lsn for the current running + write */ + ulint write_end_offset;/* the data in buffer has been written up to this offset when the current - flush ends: this field will then + write ends: this field will then be copied to buf_next_to_write */ - ulint n_pending_writes;/* number of currently pending flush - writes */ + dulint current_flush_lsn;/* end lsn for the current running + write + flush operation */ + dulint flushed_to_disk_lsn; + /* how far we have written the log + AND flushed to disk */ + ulint n_pending_writes;/* number of currently pending flushes + or writes */ + /* NOTE on the 'flush' in names of the fields below: starting from + 4.0.14, we separate the write of the log file and the actual fsync() + or other method to flush it to disk. The names below should really + be 'flush_or_write'! 
*/ os_event_t no_flush_event; /* this event is in the reset state - when a flush is running; a thread - should wait for this without owning - the log mutex, but NOTE that to set or - reset this event, the thread MUST own - the log mutex! */ + when a flush or a write is running; + a thread should wait for this without + owning the log mutex, but NOTE that + to set or reset this event, the + thread MUST own the log mutex! */ ibool one_flushed; /* during a flush, this is first FALSE and becomes TRUE when one log group - has been flushed */ + has been written or flushed */ os_event_t one_flushed_event;/* this event is reset when the - flush has not yet completed for any - log group; e.g., this means that a - transaction has been committed when - this is set; a thread should wait + flush or write has not yet completed + for any log group; e.g., this means + that a transaction has been committed + when this is set; a thread should wait for this without owning the log mutex, but NOTE that to set or reset this event, the thread MUST own the log diff --git a/innobase/include/trx0trx.h b/innobase/include/trx0trx.h index be96519c4ea..39229923375 100644 --- a/innobase/include/trx0trx.h +++ b/innobase/include/trx0trx.h @@ -157,6 +157,15 @@ trx_commit_for_mysql( /* out: 0 or error number */ trx_t* trx); /* in: trx handle */ /************************************************************************** +If required, flushes the log to disk if we called trx_commit_for_mysql() +with trx->flush_log_later == TRUE. */ + +ulint +trx_commit_complete_for_mysql( +/*==========================*/ + /* out: 0 or error number */ + trx_t* trx); /* in: trx handle */ +/************************************************************************** Marks the latest SQL statement ended. 
*/ void @@ -343,6 +352,11 @@ struct trx_struct{ dulint no; /* transaction serialization number == max trx id when the transaction is moved to COMMITTED_IN_MEMORY state */ + ibool flush_log_later;/* when we commit the transaction + in MySQL's binlog write, we will + flush the log to disk later in + a separate call */ + dulint commit_lsn; /* lsn at the time of the commit */ ibool dict_operation; /* TRUE if the trx is used to create a table, create an index, or drop a table */ diff --git a/innobase/log/log0log.c b/innobase/log/log0log.c index 539cde337bd..25cc666e802 100644 --- a/innobase/log/log0log.c +++ b/innobase/log/log0log.c @@ -178,7 +178,7 @@ loop: /* Not enough free space, do a syncronous flush of the log buffer */ - log_flush_up_to(ut_dulint_max, LOG_WAIT_ALL_GROUPS); + log_write_up_to(ut_dulint_max, LOG_WAIT_ALL_GROUPS, TRUE); count++; @@ -675,7 +675,9 @@ log_init(void) log_sys->buf_next_to_write = 0; - log_sys->flush_lsn = ut_dulint_zero; + log_sys->write_lsn = ut_dulint_zero; + log_sys->current_flush_lsn = ut_dulint_zero; + log_sys->flushed_to_disk_lsn = ut_dulint_zero; log_sys->written_to_some_lsn = log_sys->lsn; log_sys->written_to_all_lsn = log_sys->lsn; @@ -867,7 +869,7 @@ log_group_check_flush_completion( printf("Log flushed first to group %lu\n", group->id); } - log_sys->written_to_some_lsn = log_sys->flush_lsn; + log_sys->written_to_some_lsn = log_sys->write_lsn; log_sys->one_flushed = TRUE; return(LOG_UNLOCK_NONE_FLUSHED_LOCK); @@ -896,15 +898,15 @@ log_sys_check_flush_completion(void) if (log_sys->n_pending_writes == 0) { - log_sys->written_to_all_lsn = log_sys->flush_lsn; - log_sys->buf_next_to_write = log_sys->flush_end_offset; + log_sys->written_to_all_lsn = log_sys->write_lsn; + log_sys->buf_next_to_write = log_sys->write_end_offset; - if (log_sys->flush_end_offset > log_sys->max_buf_free / 2) { + if (log_sys->write_end_offset > log_sys->max_buf_free / 2) { /* Move the log buffer content to the start of the buffer */ move_start = 
ut_calc_align_down( - log_sys->flush_end_offset, + log_sys->write_end_offset, OS_FILE_LOG_BLOCK_SIZE); move_end = ut_calc_align(log_sys->buf_free, OS_FILE_LOG_BLOCK_SIZE); @@ -982,57 +984,6 @@ log_io_complete( } /********************************************************** -Flushes the log files to the disk, using, for example, the Unix fsync. -This function does the flush even if the user has set -srv_flush_log_at_trx_commit = FALSE. */ - -void -log_flush_to_disk(void) -/*===================*/ -{ - log_group_t* group; -loop: - mutex_enter(&(log_sys->mutex)); - - if (log_sys->n_pending_writes > 0) { - /* A log file write is running */ - - mutex_exit(&(log_sys->mutex)); - - /* Wait for the log file write to complete and try again */ - - os_event_wait(log_sys->no_flush_event); - - goto loop; - } - - group = UT_LIST_GET_FIRST(log_sys->log_groups); - - log_sys->n_pending_writes++; - group->n_pending_writes++; - - os_event_reset(log_sys->no_flush_event); - os_event_reset(log_sys->one_flushed_event); - - mutex_exit(&(log_sys->mutex)); - - fil_flush(group->space_id); - - mutex_enter(&(log_sys->mutex)); - - ut_a(group->n_pending_writes == 1); - ut_a(log_sys->n_pending_writes == 1); - - group->n_pending_writes--; - log_sys->n_pending_writes--; - - os_event_set(log_sys->no_flush_event); - os_event_set(log_sys->one_flushed_event); - - mutex_exit(&(log_sys->mutex)); -} - -/********************************************************** Writes a log file header to a log file space. */ static void @@ -1205,12 +1156,15 @@ by the transaction. If there is a flush running, it waits and checks if the flush flushed enough. If not, starts a new flush. 
*/ void -log_flush_up_to( +log_write_up_to( /*============*/ dulint lsn, /* in: log sequence number up to which the log should be written, ut_dulint_max if not specified */ - ulint wait) /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP, + ulint wait, /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP, or LOG_WAIT_ALL_GROUPS */ + ibool flush_to_disk) + /* in: TRUE if we want the written log also to be + flushed to disk */ { log_group_t* group; ulint start_offset; @@ -1239,9 +1193,18 @@ loop: mutex_enter(&(log_sys->mutex)); - if ((ut_dulint_cmp(log_sys->written_to_all_lsn, lsn) >= 0) - || ((ut_dulint_cmp(log_sys->written_to_some_lsn, lsn) >= 0) - && (wait != LOG_WAIT_ALL_GROUPS))) { + if (flush_to_disk + && ut_dulint_cmp(log_sys->flushed_to_disk_lsn, lsn) >= 0) { + + mutex_exit(&(log_sys->mutex)); + + return; + } + + if (!flush_to_disk + && (ut_dulint_cmp(log_sys->written_to_all_lsn, lsn) >= 0 + || (ut_dulint_cmp(log_sys->written_to_some_lsn, lsn) >= 0 + && wait != LOG_WAIT_ALL_GROUPS))) { mutex_exit(&(log_sys->mutex)); @@ -1249,10 +1212,19 @@ loop: } if (log_sys->n_pending_writes > 0) { - /* A flush is running */ + /* A write (+ possibly flush to disk) is running */ + + if (flush_to_disk + && ut_dulint_cmp(log_sys->current_flush_lsn, lsn) >= 0) { + /* The write + flush will write enough: wait for it to + complete */ + + goto do_waits; + } - if (ut_dulint_cmp(log_sys->flush_lsn, lsn) >= 0) { - /* The flush will flush enough: wait for it to + if (!flush_to_disk + && ut_dulint_cmp(log_sys->write_lsn, lsn) >= 0) { + /* The write will write enough: wait for it to complete */ goto do_waits; @@ -1260,16 +1232,17 @@ loop: mutex_exit(&(log_sys->mutex)); - /* Wait for the flush to complete and try to start a new - flush */ + /* Wait for the write to complete and try to start a new + write */ os_event_wait(log_sys->no_flush_event); goto loop; } - if (log_sys->buf_free == log_sys->buf_next_to_write) { - /* Nothing to flush */ + if (!flush_to_disk + && log_sys->buf_free == log_sys->buf_next_to_write) 
{ + /* Nothing to write and no flush to disk requested */ mutex_exit(&(log_sys->mutex)); @@ -1277,7 +1250,7 @@ loop: } if (log_debug_writes) { - printf("Flushing log from %lu %lu up to lsn %lu %lu\n", + printf("Writing log from %lu %lu up to lsn %lu %lu\n", ut_dulint_get_high(log_sys->written_to_all_lsn), ut_dulint_get_low(log_sys->written_to_all_lsn), ut_dulint_get_high(log_sys->lsn), @@ -1301,7 +1274,12 @@ loop: ut_ad(area_end - area_start > 0); - log_sys->flush_lsn = log_sys->lsn; + log_sys->write_lsn = log_sys->lsn; + + if (flush_to_disk) { + log_sys->current_flush_lsn = log_sys->lsn; + } + log_sys->one_flushed = FALSE; log_block_set_flush_bit(log_sys->buf + area_start, TRUE); @@ -1318,10 +1296,12 @@ loop: OS_FILE_LOG_BLOCK_SIZE); log_sys->buf_free += OS_FILE_LOG_BLOCK_SIZE; - log_sys->flush_end_offset = log_sys->buf_free; + log_sys->write_end_offset = log_sys->buf_free; group = UT_LIST_GET_FIRST(log_sys->log_groups); + /* Do the write to the log files */ + while (group) { log_group_write_buf(LOG_FLUSH, group, log_sys->buf + area_start, @@ -1330,20 +1310,25 @@ loop: OS_FILE_LOG_BLOCK_SIZE), start_offset - area_start); - log_group_set_fields(group, log_sys->flush_lsn); + log_group_set_fields(group, log_sys->write_lsn); group = UT_LIST_GET_NEXT(log_groups, group); } mutex_exit(&(log_sys->mutex)); - if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC - && srv_unix_file_flush_method != SRV_UNIX_NOSYNC - && srv_flush_log_at_trx_commit != 2) { + if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) { + /* O_DSYNC means the OS did not buffer the log file at all: + so we have also flushed to disk what we have written */ + + log_sys->flushed_to_disk_lsn = log_sys->write_lsn; + + } else if (flush_to_disk) { group = UT_LIST_GET_FIRST(log_sys->log_groups); fil_flush(group->space_id); + log_sys->flushed_to_disk_lsn = log_sys->write_lsn; } mutex_enter(&(log_sys->mutex)); @@ -1403,7 +1388,7 @@ log_flush_margin(void) mutex_exit(&(log->mutex)); if (do_flush) { - 
log_flush_up_to(ut_dulint_max, LOG_NO_WAIT); + log_write_up_to(ut_dulint_max, LOG_NO_WAIT, FALSE); } } @@ -1555,7 +1540,8 @@ log_group_checkpoint( buf = group->checkpoint_buf; mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys->next_checkpoint_no); - mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys->next_checkpoint_lsn); + mach_write_to_8(buf + LOG_CHECKPOINT_LSN, + log_sys->next_checkpoint_lsn); mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET, log_group_calc_lsn_offset( @@ -1664,8 +1650,10 @@ log_reset_first_header_and_checkpoint( lsn = ut_dulint_add(start, LOG_BLOCK_HDR_SIZE); /* Write the label of ibbackup --restore */ - sprintf((char*) hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, "ibbackup "); - ut_sprintf_timestamp((char*) hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP + sprintf((char*) hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, + "ibbackup "); + ut_sprintf_timestamp( + (char*) hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP + strlen("ibbackup ")); buf = hdr_buf + LOG_CHECKPOINT_1; @@ -1773,7 +1761,7 @@ log_checkpoint( write-ahead-logging algorithm ensures that the log has been flushed up to oldest_lsn. 
*/ - log_flush_up_to(oldest_lsn, LOG_WAIT_ALL_GROUPS); + log_write_up_to(oldest_lsn, LOG_WAIT_ALL_GROUPS, TRUE); mutex_enter(&(log_sys->mutex)); @@ -2466,7 +2454,7 @@ loop: mutex_exit(&(log_sys->mutex)); - log_flush_up_to(limit_lsn, LOG_WAIT_ALL_GROUPS); + log_write_up_to(limit_lsn, LOG_WAIT_ALL_GROUPS, TRUE); calc_new_limit = FALSE; @@ -3104,8 +3092,8 @@ log_print( "Last checkpoint at %lu %lu\n", ut_dulint_get_high(log_sys->lsn), ut_dulint_get_low(log_sys->lsn), - ut_dulint_get_high(log_sys->written_to_some_lsn), - ut_dulint_get_low(log_sys->written_to_some_lsn), + ut_dulint_get_high(log_sys->flushed_to_disk_lsn), + ut_dulint_get_low(log_sys->flushed_to_disk_lsn), ut_dulint_get_high(log_sys->last_checkpoint_lsn), ut_dulint_get_low(log_sys->last_checkpoint_lsn)); diff --git a/innobase/os/os0file.c b/innobase/os/os0file.c index 1d1d84adda7..46129e3de79 100644 --- a/innobase/os/os0file.c +++ b/innobase/os/os0file.c @@ -521,10 +521,11 @@ try_again: } #endif #ifdef UNIV_NON_BUFFERED_IO - if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) { + if (type == OS_LOG_FILE) { /* Do not use unbuffered i/o to log files because - value 2 denotes that we do not flush the log at every - commit, but only once per second */ + to allow group commit to work when MySQL binlogging + is used we must separate log file write and log + file flush to disk. 
*/ } else { if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) { diff --git a/innobase/row/row0mysql.c b/innobase/row/row0mysql.c index db1119a2abc..428e4d568f3 100644 --- a/innobase/row/row0mysql.c +++ b/innobase/row/row0mysql.c @@ -1664,7 +1664,7 @@ row_drop_table_for_mysql_in_background( the InnoDB data dictionary get out-of-sync if the user runs with innodb_flush_log_at_trx_commit = 0 */ - log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP); + log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE); trx_commit_for_mysql(trx); diff --git a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c index d90b818ad4b..07df708e5fb 100644 --- a/innobase/srv/srv0srv.c +++ b/innobase/srv/srv0srv.c @@ -2812,8 +2812,7 @@ loop: at transaction commit */ srv_main_thread_op_info = (char*)"flushing log"; - log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP); - log_flush_to_disk(); + log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE); /* If there were less than 10 i/os during the one second sleep, we assume that there is free @@ -2831,8 +2830,8 @@ loop: srv_main_thread_op_info = (char*)"flushing log"; - log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP); - log_flush_to_disk(); + log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, + TRUE); } if (srv_activity_count == old_activity_count) { @@ -2867,8 +2866,7 @@ loop: buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max); srv_main_thread_op_info = (char*) "flushing log"; - log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP); - log_flush_to_disk(); + log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE); } /* We run a batch of insert buffer merge every 10 seconds, @@ -2878,8 +2876,7 @@ loop: ibuf_contract_for_n_pages(TRUE, 5); srv_main_thread_op_info = (char*)"flushing log"; - log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP); - log_flush_to_disk(); + log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE); /* We run a full purge every 10 seconds, even if the server were active */ @@ -2903,8 +2900,8 @@ loop: if 
(difftime(current_time, last_flush_time) > 1) { srv_main_thread_op_info = (char*) "flushing log"; - log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP); - log_flush_to_disk(); + log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, + TRUE); last_flush_time = current_time; } } diff --git a/innobase/trx/trx0trx.c b/innobase/trx/trx0trx.c index 4ce2236f78a..e6ef400bb40 100644 --- a/innobase/trx/trx0trx.c +++ b/innobase/trx/trx0trx.c @@ -89,6 +89,8 @@ trx_create( trx->check_foreigns = TRUE; trx->check_unique_secondary = TRUE; + trx->flush_log_later = FALSE; + trx->dict_operation = FALSE; trx->mysql_thd = NULL; @@ -780,13 +782,26 @@ trx_commit_off_kernel( /*-------------------------------------*/ - /* Most MySQL users run with srv_flush_.. set to FALSE: */ + /* Most MySQL users run with srv_flush_.. set to 0: */ - if (srv_flush_log_at_trx_commit) { - - log_flush_up_to(lsn, LOG_WAIT_ONE_GROUP); + if (srv_flush_log_at_trx_commit != 0) { + if (srv_unix_file_flush_method != SRV_UNIX_NOSYNC + && srv_flush_log_at_trx_commit != 2 + && !trx->flush_log_later) { + + /* Write the log to the log files AND flush + them to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); + } else { + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + } } + trx->commit_lsn = lsn; + /*-------------------------------------*/ mutex_enter(&kernel_mutex); @@ -1468,6 +1483,31 @@ trx_commit_for_mysql( } /************************************************************************** +If required, flushes the log to disk if we called trx_commit_for_mysql() +with trx->flush_log_later == TRUE. 
*/ +ulint +trx_commit_complete_for_mysql( +/*==========================*/ + /* out: 0 or error number */ + trx_t* trx) /* in: trx handle */ +{ + ut_a(trx); + + if (srv_flush_log_at_trx_commit == 1 + && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) { + + trx->op_info = (char *) "flushing log"; + + /* Flush the log files to disk up to the lsn recorded in + trx->commit_lsn */ + + log_write_up_to(trx->commit_lsn, LOG_WAIT_ONE_GROUP, TRUE); + + trx->op_info = (char *) ""; + } + + /* The function is declared to return 0 or an error number: + return 0 explicitly instead of falling off the end, which + would hand the caller an indeterminate value */ + + return(0); +} + +/************************************************************************** Marks the latest SQL statement ended. */ void |