diff options
author | unknown <knielsen@knielsen-hq.org> | 2012-08-30 10:53:49 +0200 |
---|---|---|
committer | unknown <knielsen@knielsen-hq.org> | 2012-08-30 10:53:49 +0200 |
commit | 10802c4d9046fd54bf1a27cb7611c182ecde93fb (patch) | |
tree | 7dd0f391ecde085d8f47990da25869298d14969c | |
parent | 0536c506ff7c3ed261abc3d02fb787bfdd228abb (diff) | |
download | mariadb-git-10802c4d9046fd54bf1a27cb7611c182ecde93fb.tar.gz |
MDEV-381: fdatasync() does not correctly flush growing binlog file.
When we append data to the binlog file, we use fdatasync() to ensure
the data gets to disk so that crash recovery can work.
Unfortunately, there seems to be a bug in ext3/ext4 on Linux such that
fdatasync() does not correctly sync all data when the size of a file
is increased. This causes crash recovery to not work correctly (it
loses transactions from the binlog).
As a work-around, use fsync() for the binlog, not fdatasync(). Since
we are increasing the file size, even a correctly working fdatasync()
will most likely not be faster than fsync() on any file system, and fsync()
does work correctly on ext3/ext4. This avoids the need to try to
detect if we are running on buggy ext3/ext4.
-rw-r--r-- | include/my_sys.h | 1 | ||||
-rw-r--r-- | mysys/my_sync.c | 18 | ||||
-rw-r--r-- | sql/log.cc | 10 |
3 files changed, 22 insertions, 7 deletions
diff --git a/include/my_sys.h b/include/my_sys.h index db22f55f492..ebe643abce5 100644 --- a/include/my_sys.h +++ b/include/my_sys.h @@ -70,6 +70,7 @@ extern int NEAR my_errno; /* Last error in mysys */ #define MY_THREADSAFE 2048 /* my_seek(): lock fd mutex */ #define MY_SYNC 4096 /* my_copy(): sync dst file */ #define MY_SYNC_DIR 32768 /* my_create/delete/rename: sync directory */ +#define MY_SYNC_FILESIZE 65536 /* my_sync(): safe sync when file is extended */ #define MY_CHECK_ERROR 1 /* Params to my_end; Check open-close */ #define MY_GIVE_INFO 2 /* Give time info about process*/ diff --git a/mysys/my_sync.c b/mysys/my_sync.c index d8973244620..33033ff1045 100644 --- a/mysys/my_sync.c +++ b/mysys/my_sync.c @@ -39,6 +39,13 @@ ulong my_sync_count; /* Count number of sync calls */ (which is correct behaviour, if we know that the other thread synced the file before closing) + MY_SYNC_FILESIZE is useful when syncing a file after it has been extended. + On Linux, fdatasync() on ext3/ext4 file systems does not properly flush + to disk the inode data required to preserve the added data across a crash + (this looks to be a bug). But when a file is extended, inode data will most + likely need flushing in any case, so passing MY_SYNC_FILESIZE as flags + is not likely to be any slower, and will be crash safe on Linux ext3/ext4. 
+ RETURN 0 ok -1 error @@ -67,8 +74,12 @@ int my_sync(File fd, myf my_flags) DBUG_PRINT("info",("fcntl(F_FULLFSYNC) failed, falling back")); #endif #if defined(HAVE_FDATASYNC) && HAVE_DECL_FDATASYNC - res= fdatasync(fd); -#elif defined(HAVE_FSYNC) + if (!(my_flags & MY_SYNC_FILESIZE)) + res= fdatasync(fd); + else + { +#endif +#if defined(HAVE_FSYNC) res= fsync(fd); if (res == -1 && errno == ENOLCK) res= 0; /* Result Bug in Old FreeBSD */ @@ -78,6 +89,9 @@ int my_sync(File fd, myf my_flags) #error Cannot find a way to sync a file, durability in danger res= 0; /* No sync (strange OS) */ #endif +#if defined(HAVE_FDATASYNC) && HAVE_DECL_FDATASYNC + } +#endif } while (res == -1 && errno == EINTR); if (res) diff --git a/sql/log.cc b/sql/log.cc index ddb12457fcd..05e8a66ed04 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -2838,7 +2838,7 @@ bool MYSQL_BIN_LOG::open(const char *log_name, bytes_written+= description_event_for_queue->data_written; } if (flush_io_cache(&log_file) || - my_sync(log_file.file, MYF(MY_WME))) + my_sync(log_file.file, MYF(MY_WME|MY_SYNC_FILESIZE))) goto err; pthread_mutex_lock(&LOCK_commit_ordered); strmake(last_commit_pos_file, log_file_name, @@ -2864,7 +2864,7 @@ bool MYSQL_BIN_LOG::open(const char *log_name, strlen(log_file_name)) || my_b_write(&index_file, (uchar*) "\n", 1) || flush_io_cache(&index_file) || - my_sync(index_file.file, MYF(MY_WME))) + my_sync(index_file.file, MYF(MY_WME|MY_SYNC_FILESIZE))) goto err; #ifdef HAVE_REPLICATION @@ -2956,7 +2956,7 @@ static bool copy_up_file_and_fill(IO_CACHE *index_file, my_off_t offset) } /* The following will either truncate the file or fill the end with \n' */ if (my_chsize(file, offset - init_offset, '\n', MYF(MY_WME)) || - my_sync(file, MYF(MY_WME))) + my_sync(file, MYF(MY_WME|MY_SYNC_FILESIZE))) goto err; /* Reset data in old index cache */ @@ -3549,7 +3549,7 @@ int MYSQL_BIN_LOG::sync_purge_index_file() DBUG_ENTER("MYSQL_BIN_LOG::sync_purge_index_file"); if ((error= 
flush_io_cache(&purge_index_file)) || - (error= my_sync(purge_index_file.file, MYF(MY_WME)))) + (error= my_sync(purge_index_file.file, MYF(MY_WME|MY_SYNC_FILESIZE)))) DBUG_RETURN(error); DBUG_RETURN(error); @@ -4139,7 +4139,7 @@ bool MYSQL_BIN_LOG::flush_and_sync() if (++sync_binlog_counter >= sync_binlog_period && sync_binlog_period) { sync_binlog_counter= 0; - err=my_sync(fd, MYF(MY_WME)); + err=my_sync(fd, MYF(MY_WME|MY_SYNC_FILESIZE)); #ifndef DBUG_OFF if (opt_binlog_dbug_fsync_sleep > 0) my_sleep(opt_binlog_dbug_fsync_sleep); |