summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Keith Bostic <keith.bostic@mongodb.com> 2016-07-14 20:36:33 -0400
committer Alex Gorrod <alexander.gorrod@mongodb.com> 2016-07-15 10:36:33 +1000
commit 11f018322c3f84552f2b1fe79f2ca9d6585577fb (patch)
tree 04400eefdedfd08dfb5b833c08fc3147de0b3bfd /src
parent 55f4584c9f3616571b77bf3876cae8c9ae0b1f93 (diff)
download mongo-11f018322c3f84552f2b1fe79f2ca9d6585577fb.tar.gz
WT-2760 Fix a bug in backup related to directory sync. Change the filesystem API to make durable the default (#2867)
Change the default remove/rename calls to flush the enclosing directory. Simplify the pluggable file system API by replacing the directory-sync method with a "durable" boolean argument to the remove, rename and open-file methods.

* Add "durable" arguments to relevant functions so that each remove or rename call specifies its durability requirements.
* Switch the WT_FILE_SYSTEM::fs_open_file type enum from WT_OPEN_FILE_TYPE, with WT_OPEN_XXX names, to the WT_FS_OPEN_FILE_TYPE, with WT_FS_OPEN_XXX names. Switch the WT_FILE_SYSTEM::fs_open_file flags from WT_OPEN_XXX names to WT_FS_OPEN_XXX names.
* Replace the "bool durable" argument to WT_FILE_SYSTEM.fs_remove and WT_FILE_SYSTEM.fs_rename with a "uint32_t flags" argument, and the WT_FS_DURABLE flag.
* Remove a stray bracket.
Diffstat (limited to 'src')
-rw-r--r-- src/block/block_open.c 27
-rw-r--r-- src/btree/bt_huffman.c 3
-rw-r--r-- src/conn/conn_api.c 17
-rw-r--r-- src/conn/conn_stat.c 2
-rw-r--r-- src/cursor/cur_backup.c 14
-rw-r--r-- src/include/extern.h 7
-rw-r--r-- src/include/os_fs.i 75
-rw-r--r-- src/include/os_fstream.i 2
-rw-r--r-- src/include/wiredtiger.in 75
-rw-r--r-- src/log/log.c 14
-rw-r--r-- src/lsm/lsm_work_unit.c 2
-rw-r--r-- src/meta/meta_track.c 13
-rw-r--r-- src/meta/meta_turtle.c 11
-rw-r--r-- src/os_common/filename.c 54
-rw-r--r-- src/os_common/os_fhandle.c 32
-rw-r--r-- src/os_common/os_fs_inmemory.c 12
-rw-r--r-- src/os_common/os_fstream.c 2
-rw-r--r-- src/os_posix/os_fs.c 119
-rw-r--r-- src/os_win/os_fs.c 28
-rw-r--r-- src/schema/schema_rename.c 2
20 files changed, 244 insertions, 267 deletions
diff --git a/src/block/block_open.c b/src/block/block_open.c
index 1603b1574e7..7cff7eab629 100644
--- a/src/block/block_open.c
+++ b/src/block/block_open.c
@@ -15,9 +15,10 @@ static int __desc_read(WT_SESSION_IMPL *, WT_BLOCK *);
* Drop a file.
*/
int
-__wt_block_manager_drop(WT_SESSION_IMPL *session, const char *filename)
+__wt_block_manager_drop(
+ WT_SESSION_IMPL *session, const char *filename, bool durable)
{
- return (__wt_remove_if_exists(session, filename));
+ return (__wt_remove_if_exists(session, filename, durable));
}
/*
@@ -43,8 +44,9 @@ __wt_block_manager_create(
* in our space. Move any existing files out of the way and complain.
*/
for (;;) {
- if ((ret = __wt_open(session, filename, WT_OPEN_FILE_TYPE_DATA,
- WT_OPEN_CREATE | WT_OPEN_EXCLUSIVE, &fh)) == 0)
+ if ((ret = __wt_open(session, filename,
+ WT_FS_OPEN_FILE_TYPE_DATA, WT_FS_OPEN_CREATE |
+ WT_FS_OPEN_DURABLE | WT_FS_OPEN_EXCLUSIVE, &fh)) == 0)
break;
WT_ERR_TEST(ret != EEXIST, ret);
@@ -56,7 +58,7 @@ __wt_block_manager_create(
WT_ERR(__wt_fs_exist(session, tmp->data, &exists));
if (!exists) {
WT_ERR(__wt_fs_rename(
- session, filename, tmp->data));
+ session, filename, tmp->data, false));
WT_ERR(__wt_msg(session,
"unexpected file %s found, renamed to %s",
filename, (const char *)tmp->data));
@@ -77,16 +79,9 @@ __wt_block_manager_create(
/* Close the file handle. */
WT_TRET(__wt_close(session, &fh));
- /*
- * Some filesystems require that we sync the directory to be confident
- * that the file will appear.
- */
- if (ret == 0)
- WT_TRET(__wt_fs_directory_sync(session, filename));
-
/* Undo any create on error. */
if (ret != 0)
- WT_TRET(__wt_fs_remove(session, filename));
+ WT_TRET(__wt_fs_remove(session, filename, false));
err: __wt_scr_free(session, &tmp);
@@ -207,11 +202,11 @@ __wt_block_open(WT_SESSION_IMPL *session,
*/
flags = 0;
if (readonly && FLD_ISSET(conn->direct_io, WT_DIRECT_IO_CHECKPOINT))
- LF_SET(WT_OPEN_DIRECTIO);
+ LF_SET(WT_FS_OPEN_DIRECTIO);
if (!readonly && FLD_ISSET(conn->direct_io, WT_DIRECT_IO_DATA))
- LF_SET(WT_OPEN_DIRECTIO);
+ LF_SET(WT_FS_OPEN_DIRECTIO);
WT_ERR(__wt_open(
- session, filename, WT_OPEN_FILE_TYPE_DATA, flags, &block->fh));
+ session, filename, WT_FS_OPEN_FILE_TYPE_DATA, flags, &block->fh));
/* Set the file's size. */
WT_ERR(__wt_filesize(session, block->fh, &block->size));
diff --git a/src/btree/bt_huffman.c b/src/btree/bt_huffman.c
index 9e9d69c342e..918791d9c6e 100644
--- a/src/btree/bt_huffman.c
+++ b/src/btree/bt_huffman.c
@@ -157,7 +157,8 @@ __huffman_confchk_file(WT_SESSION_IMPL *session,
/* Check the file exists. */
WT_RET(__wt_strndup(session, v->str + len, v->len - len, &fname));
- WT_ERR(__wt_fopen(session, fname, WT_OPEN_FIXED, WT_STREAM_READ, &fs));
+ WT_ERR(__wt_fopen(
+ session, fname, WT_FS_OPEN_FIXED, WT_STREAM_READ, &fs));
/* Optionally return the file handle. */
if (fsp == NULL)
diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c
index e2ba297337e..1c6b0c2b500 100644
--- a/src/conn/conn_api.c
+++ b/src/conn/conn_api.c
@@ -1217,7 +1217,8 @@ __conn_config_file(WT_SESSION_IMPL *session,
return (0);
/* Open the configuration file. */
- WT_RET(__wt_open(session, filename, WT_OPEN_FILE_TYPE_REGULAR, 0, &fh));
+ WT_RET(__wt_open(
+ session, filename, WT_FS_OPEN_FILE_TYPE_REGULAR, 0, &fh));
WT_ERR(__wt_filesize(session, fh, &size));
if (size == 0)
goto err;
@@ -1510,8 +1511,8 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[])
exist = false;
if (!is_create)
WT_ERR(__wt_fs_exist(session, WT_WIREDTIGER, &exist));
- ret = __wt_open(session, WT_SINGLETHREAD, WT_OPEN_FILE_TYPE_REGULAR,
- is_create || exist ? WT_OPEN_CREATE : 0, &conn->lock_fh);
+ ret = __wt_open(session, WT_SINGLETHREAD, WT_FS_OPEN_FILE_TYPE_REGULAR,
+ is_create || exist ? WT_FS_OPEN_CREATE : 0, &conn->lock_fh);
/*
* If this is a read-only connection and we cannot grab the lock
@@ -1563,7 +1564,8 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[])
/* We own the lock file, optionally create the WiredTiger file. */
ret = __wt_open(session, WT_WIREDTIGER,
- WT_OPEN_FILE_TYPE_REGULAR, is_create ? WT_OPEN_CREATE : 0, &fh);
+ WT_FS_OPEN_FILE_TYPE_REGULAR, is_create ? WT_FS_OPEN_CREATE : 0,
+ &fh);
/*
* If we're read-only, check for handled errors. Even if able to open
@@ -1784,7 +1786,7 @@ __conn_write_base_config(WT_SESSION_IMPL *session, const char *cfg[])
* runs. This doesn't matter for correctness, it's just cleaning up
* random files.
*/
- WT_RET(__wt_remove_if_exists(session, WT_BASECONFIG_SET));
+ WT_RET(__wt_remove_if_exists(session, WT_BASECONFIG_SET, false));
/*
* The base configuration file is only written if creating the database,
@@ -1809,7 +1811,7 @@ __conn_write_base_config(WT_SESSION_IMPL *session, const char *cfg[])
return (0);
WT_RET(__wt_fopen(session, WT_BASECONFIG_SET,
- WT_OPEN_CREATE | WT_OPEN_EXCLUSIVE, WT_STREAM_WRITE, &fs));
+ WT_FS_OPEN_CREATE | WT_FS_OPEN_EXCLUSIVE, WT_STREAM_WRITE, &fs));
WT_ERR(__wt_fprintf(session, fs, "%s\n\n",
"# Do not modify this file.\n"
@@ -1870,7 +1872,8 @@ __conn_write_base_config(WT_SESSION_IMPL *session, const char *cfg[])
if (0) {
/* Close open file handle, remove any temporary file. */
err: WT_TRET(__wt_fclose(session, &fs));
- WT_TRET(__wt_remove_if_exists(session, WT_BASECONFIG_SET));
+ WT_TRET(
+ __wt_remove_if_exists(session, WT_BASECONFIG_SET, false));
}
__wt_free(session, base_config);
diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c
index 566a5397a43..4e7cac59c4a 100644
--- a/src/conn/conn_stat.c
+++ b/src/conn/conn_stat.c
@@ -429,7 +429,7 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp)
if (path != NULL)
(void)strcpy(path->mem, tmp->mem);
WT_RET(__wt_fopen(session, tmp->mem,
- WT_OPEN_CREATE | WT_OPEN_FIXED, WT_STREAM_APPEND,
+ WT_FS_OPEN_CREATE | WT_FS_OPEN_FIXED, WT_STREAM_APPEND,
&log_stream));
}
conn->stat_fs = log_stream;
diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c
index 9b7a93e6bfe..63952169566 100644
--- a/src/cursor/cur_backup.c
+++ b/src/cursor/cur_backup.c
@@ -243,7 +243,7 @@ __backup_start(
* doesn't confuse restarting in the source database.
*/
WT_ERR(__wt_fopen(session, WT_BACKUP_TMP,
- WT_OPEN_CREATE, WT_STREAM_WRITE, &cb->bfs));
+ WT_FS_OPEN_CREATE, WT_STREAM_WRITE, &cb->bfs));
/*
* If a list of targets was specified, work our way through them.
* Else, generate a list of all database objects.
@@ -269,7 +269,7 @@ __backup_start(
*/
dest = WT_INCREMENTAL_BACKUP;
WT_ERR(__wt_fopen(session, WT_INCREMENTAL_SRC,
- WT_OPEN_CREATE, WT_STREAM_WRITE, &srcfs));
+ WT_FS_OPEN_CREATE, WT_STREAM_WRITE, &srcfs));
WT_ERR(__backup_list_append(
session, cb, WT_INCREMENTAL_BACKUP));
} else {
@@ -292,7 +292,7 @@ err: /* Close the hot backup file. */
WT_TRET(__wt_fclose(session, &srcfs));
if (ret == 0) {
WT_ASSERT(session, dest != NULL);
- WT_TRET(__wt_fs_rename(session, WT_BACKUP_TMP, dest));
+ WT_TRET(__wt_fs_rename(session, WT_BACKUP_TMP, dest, false));
}
return (ret);
@@ -449,10 +449,10 @@ __wt_backup_file_remove(WT_SESSION_IMPL *session)
* always know we were a source directory while there's any chance of
* an incremental backup file existing.
*/
- WT_TRET(__wt_remove_if_exists(session, WT_BACKUP_TMP));
- WT_TRET(__wt_remove_if_exists(session, WT_INCREMENTAL_BACKUP));
- WT_TRET(__wt_remove_if_exists(session, WT_INCREMENTAL_SRC));
- WT_TRET(__wt_remove_if_exists(session, WT_METADATA_BACKUP));
+ WT_TRET(__wt_remove_if_exists(session, WT_BACKUP_TMP, true));
+ WT_TRET(__wt_remove_if_exists(session, WT_INCREMENTAL_BACKUP, true));
+ WT_TRET(__wt_remove_if_exists(session, WT_INCREMENTAL_SRC, true));
+ WT_TRET(__wt_remove_if_exists(session, WT_METADATA_BACKUP, true));
return (ret);
}
diff --git a/src/include/extern.h b/src/include/extern.h
index 0cfcb12fdf4..881bac505ae 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -44,7 +44,7 @@ extern void __wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el);
extern int __wt_block_map(WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapped_regionp, size_t *lengthp, void *mapped_cookiep);
extern int __wt_block_unmap(WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapped_region, size_t length, void *mapped_cookie);
extern int __wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], bool forced_salvage, bool readonly, uint32_t allocsize, WT_BM **bmp);
-extern int __wt_block_manager_drop(WT_SESSION_IMPL *session, const char *filename);
+extern int __wt_block_manager_drop( WT_SESSION_IMPL *session, const char *filename, bool durable);
extern int __wt_block_manager_create( WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize);
extern void __wt_block_configure_first_fit(WT_BLOCK *block, bool on);
extern int __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], bool forced_salvage, bool readonly, uint32_t allocsize, WT_BLOCK **blockp);
@@ -486,8 +486,7 @@ extern int __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **va
extern int __wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value);
extern int __wt_filename(WT_SESSION_IMPL *session, const char *name, char **path);
extern int __wt_nfilename( WT_SESSION_IMPL *session, const char *name, size_t namelen, char **path);
-extern int __wt_remove_if_exists(WT_SESSION_IMPL *session, const char *name);
-extern int __wt_rename_and_sync_directory( WT_SESSION_IMPL *session, const char *from, const char *to);
+extern int __wt_remove_if_exists(WT_SESSION_IMPL *session, const char *name, bool durable);
extern int __wt_copy_and_sync(WT_SESSION *wt_session, const char *from, const char *to);
extern void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn));
extern int __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp);
@@ -501,7 +500,7 @@ extern int __wt_errno(void);
extern const char *__wt_strerror(WT_SESSION_IMPL *session, int error, char *errbuf, size_t errlen);
extern int __wt_ext_map_windows_error( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, uint32_t windows_error);
extern bool __wt_handle_is_open(WT_SESSION_IMPL *session, const char *name);
-extern int __wt_open(WT_SESSION_IMPL *session, const char *name, WT_OPEN_FILE_TYPE file_type, u_int flags, WT_FH **fhp);
+extern int __wt_open(WT_SESSION_IMPL *session, const char *name, WT_FS_OPEN_FILE_TYPE file_type, u_int flags, WT_FH **fhp);
extern int __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp);
extern int __wt_close_connection_close(WT_SESSION_IMPL *session);
extern int __wt_os_inmemory(WT_SESSION_IMPL *session);
diff --git a/src/include/os_fs.i b/src/include/os_fs.i
index 88ee71d953a..a3a2fe29b65 100644
--- a/src/include/os_fs.i
+++ b/src/include/os_fs.i
@@ -8,7 +8,7 @@
/*
* __wt_fs_directory_list --
- * Get a list of files from a directory.
+ * Return a list of files from a directory.
*/
static inline int
__wt_fs_directory_list(WT_SESSION_IMPL *session,
@@ -61,61 +61,6 @@ __wt_fs_directory_list_free(
}
/*
- * __wt_fs_directory_sync --
- * Flush a directory to ensure file creation is durable.
- */
-static inline int
-__wt_fs_directory_sync(WT_SESSION_IMPL *session, const char *name)
-{
- WT_DECL_RET;
- WT_FILE_SYSTEM *file_system;
- WT_SESSION *wt_session;
- char *copy, *dir;
-
- WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY));
-
- WT_RET(__wt_verbose(
- session, WT_VERB_FILEOPS, "%s: directory-sync", name));
-
- /*
- * POSIX 1003.1 does not require that fsync of a file handle ensures the
- * entry in the directory containing the file has also reached disk (and
- * there are historic Linux filesystems requiring it). If the underlying
- * filesystem method is set, do an explicit fsync on a file descriptor
- * for the directory to be sure.
- *
- * directory-sync is not a required call, no method means the call isn't
- * needed.
- */
- file_system = S2C(session)->file_system;
- if (file_system->fs_directory_sync == NULL)
- return (0);
-
- copy = NULL;
- if (name == NULL || strchr(name, '/') == NULL)
- name = S2C(session)->home;
- else {
- /*
- * File name construction should not return a path without any
- * slash separator, but caution isn't unreasonable.
- */
- WT_RET(__wt_filename(session, name, &copy));
- if ((dir = strrchr(copy, '/')) == NULL)
- name = S2C(session)->home;
- else {
- dir[1] = '\0';
- name = copy;
- }
- }
-
- wt_session = (WT_SESSION *)session;
- ret = file_system->fs_directory_sync(file_system, wt_session, name);
-
- __wt_free(session, copy);
- return (ret);
-}
-
-/*
* __wt_fs_exist --
* Return if the file exists.
*/
@@ -141,10 +86,10 @@ __wt_fs_exist(WT_SESSION_IMPL *session, const char *name, bool *existp)
/*
* __wt_fs_remove --
- * POSIX remove.
+ * Remove the file.
*/
static inline int
-__wt_fs_remove(WT_SESSION_IMPL *session, const char *name)
+__wt_fs_remove(WT_SESSION_IMPL *session, const char *name, bool durable)
{
WT_DECL_RET;
WT_FILE_SYSTEM *file_system;
@@ -169,7 +114,8 @@ __wt_fs_remove(WT_SESSION_IMPL *session, const char *name)
file_system = S2C(session)->file_system;
wt_session = (WT_SESSION *)session;
- ret = file_system->fs_remove(file_system, wt_session, path);
+ ret = file_system->fs_remove(
+ file_system, wt_session, path, durable ? WT_FS_DURABLE : 0);
__wt_free(session, path);
return (ret);
@@ -177,10 +123,11 @@ __wt_fs_remove(WT_SESSION_IMPL *session, const char *name)
/*
* __wt_fs_rename --
- * POSIX rename.
+ * Rename the file.
*/
static inline int
-__wt_fs_rename(WT_SESSION_IMPL *session, const char *from, const char *to)
+__wt_fs_rename(
+ WT_SESSION_IMPL *session, const char *from, const char *to, bool durable)
{
WT_DECL_RET;
WT_FILE_SYSTEM *file_system;
@@ -211,8 +158,8 @@ __wt_fs_rename(WT_SESSION_IMPL *session, const char *from, const char *to)
file_system = S2C(session)->file_system;
wt_session = (WT_SESSION *)session;
- ret = file_system->fs_rename(
- file_system, wt_session, from_path, to_path);
+ ret = file_system->fs_rename(file_system,
+ wt_session, from_path, to_path, durable ? WT_FS_DURABLE : 0);
err: __wt_free(session, from_path);
__wt_free(session, to_path);
@@ -221,7 +168,7 @@ err: __wt_free(session, from_path);
/*
* __wt_fs_size --
- * Get the size of a file in bytes, by file name.
+ * Return the size of a file in bytes, by file name.
*/
static inline int
__wt_fs_size(WT_SESSION_IMPL *session, const char *name, wt_off_t *sizep)
diff --git a/src/include/os_fstream.i b/src/include/os_fstream.i
index 8c0fdadbdb0..92274431011 100644
--- a/src/include/os_fstream.i
+++ b/src/include/os_fstream.i
@@ -93,5 +93,5 @@ __wt_sync_and_rename(WT_SESSION_IMPL *session,
WT_TRET(__wt_fclose(session, &fstr));
WT_RET(ret);
- return (__wt_rename_and_sync_directory(session, from, to));
+ return (__wt_fs_rename(session, from, to, true));
}
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index 3de1333a0c5..d15916b3e32 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -3685,24 +3685,34 @@ struct __wt_extractor {
#if !defined(SWIG)
/*! WT_FILE_SYSTEM::open_file file types */
typedef enum {
- WT_OPEN_FILE_TYPE_CHECKPOINT, /*!< open a data file checkpoint */
- WT_OPEN_FILE_TYPE_DATA, /*!< open a data file */
- WT_OPEN_FILE_TYPE_DIRECTORY, /*!< open a directory */
- WT_OPEN_FILE_TYPE_LOG, /*!< open a log file */
- WT_OPEN_FILE_TYPE_REGULAR /*!< open a regular file */
-} WT_OPEN_FILE_TYPE;
+ WT_FS_OPEN_FILE_TYPE_CHECKPOINT,/*!< open a data file checkpoint */
+ WT_FS_OPEN_FILE_TYPE_DATA, /*!< open a data file */
+ WT_FS_OPEN_FILE_TYPE_DIRECTORY, /*!< open a directory */
+ WT_FS_OPEN_FILE_TYPE_LOG, /*!< open a log file */
+ WT_FS_OPEN_FILE_TYPE_REGULAR /*!< open a regular file */
+} WT_FS_OPEN_FILE_TYPE;
/*! WT_FILE_SYSTEM::open_file flags: create if does not exist */
-#define WT_OPEN_CREATE 0x001
+#define WT_FS_OPEN_CREATE 0x001
/*! WT_FILE_SYSTEM::open_file flags: direct I/O requested */
-#define WT_OPEN_DIRECTIO 0x002
-/*! WT_FILE_SYSTEM::open_file flags: error if exclusive use not available */
-#define WT_OPEN_EXCLUSIVE 0x004
+#define WT_FS_OPEN_DIRECTIO 0x002
+/*! WT_FILE_SYSTEM::open_file flags: file creation must be durable */
+#define WT_FS_OPEN_DURABLE 0x004
+/*!
+ * WT_FILE_SYSTEM::open_file flags: return EBUSY if exclusive use not available
+ */
+#define WT_FS_OPEN_EXCLUSIVE 0x008
#ifndef DOXYGEN
-#define WT_OPEN_FIXED 0x008 /* Path not home relative (internal) */
+#define WT_FS_OPEN_FIXED 0x010 /* Path not home relative (internal) */
#endif
/*! WT_FILE_SYSTEM::open_file flags: open is read-only */
-#define WT_OPEN_READONLY 0x010
+#define WT_FS_OPEN_READONLY 0x020
+
+/*!
+ * WT_FILE_SYSTEM::remove or WT_FILE_SYSTEM::rename flags: the remove or rename
+ * operation must be durable
+ */
+#define WT_FS_DURABLE 0x001
/*!
* The interface implemented by applications to provide a custom file system
@@ -3752,23 +3762,6 @@ struct __wt_file_system {
WT_SESSION *session, char **dirlist, uint32_t count);
/*!
- * Flush the named directory.
- *
- * This method is not required for readonly file systems or file systems
- * where it is not necessary to flush a file's directory to ensure the
- * durability of file system operations, and should be set to NULL when
- * not required by the file system.
- *
- * @errors
- *
- * @param file_system the WT_FILE_SYSTEM
- * @param session the current WiredTiger session
- * @param directory the name of the directory
- */
- int (*fs_directory_sync)(WT_FILE_SYSTEM *file_system,
- WT_SESSION *session, const char *directory);
-
- /*!
* Return if the named file system object exists.
*
* @errors
@@ -3784,11 +3777,16 @@ struct __wt_file_system {
/*!
* Open a handle for a named file system object
*
- * The method should return ENOENT if the file does not exist.
+ * The method should return ENOENT if the file is not being created and
+ * does not exist.
+ *
* The method should return EACCES if the file cannot be opened in the
* requested mode (for example, a file opened for writing in a readonly
* file system).
*
+ * The method should return EBUSY if ::WT_FS_OPEN_EXCLUSIVE is set and
+ * the file is in use.
+ *
* @errors
*
* @param file_system the WT_FILE_SYSTEM
@@ -3798,8 +3796,8 @@ struct __wt_file_system {
* The file type is provided to allow optimization for different file
* access patterns.
* @param flags flags indicating how to open the file, one or more of
- * ::WT_OPEN_CREATE, ::WT_OPEN_DIRECTIO, ::WT_OPEN_EXCLUSIVE or
- * ::WT_OPEN_READONLY.
+ * ::WT_FS_OPEN_CREATE, ::WT_FS_OPEN_DIRECTIO, ::WT_FS_OPEN_DURABLE,
+ * ::WT_FS_OPEN_EXCLUSIVE or ::WT_FS_OPEN_READONLY.
* @param[out] file_handlep the handle to the newly opened file. File
* system implementations must allocate memory for the handle and
* the WT_FILE_HANDLE::name field, and fill in the WT_FILE_HANDLE::
@@ -3808,7 +3806,7 @@ struct __wt_file_system {
* their own structure as a superset of a WT_FILE_HANDLE:: structure.
*/
int (*fs_open_file)(WT_FILE_SYSTEM *file_system, WT_SESSION *session,
- const char *name, WT_OPEN_FILE_TYPE file_type, uint32_t flags,
+ const char *name, WT_FS_OPEN_FILE_TYPE file_type, uint32_t flags,
WT_FILE_HANDLE **file_handlep);
/*!
@@ -3822,9 +3820,11 @@ struct __wt_file_system {
* @param file_system the WT_FILE_SYSTEM
* @param session the current WiredTiger session
* @param name the name of the file system object
+ * @param flags 0 or ::WT_FS_DURABLE; set ::WT_FS_DURABLE if the
+ * operation requires durability
*/
- int (*fs_remove)(
- WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *name);
+ int (*fs_remove)(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *session, const char *name, uint32_t flags);
/*!
* Rename a named file system object
@@ -3838,9 +3838,10 @@ struct __wt_file_system {
* @param session the current WiredTiger session
* @param from the original name of the object
* @param to the new name for the object
+ * @param flags 0 or ::WT_FS_DURABLE
*/
- int (*fs_rename)(WT_FILE_SYSTEM *file_system,
- WT_SESSION *session, const char *from, const char *to);
+ int (*fs_rename)(WT_FILE_SYSTEM *file_system, WT_SESSION *session,
+ const char *from, const char *to, uint32_t flags);
/*!
* Return the size of a named file system object
diff --git a/src/log/log.c b/src/log/log.c
index b684717288e..b9d757bbde8 100644
--- a/src/log/log.c
+++ b/src/log/log.c
@@ -701,11 +701,11 @@ __log_openfile(WT_SESSION_IMPL *session,
"opening log %s", (const char *)buf->data));
wtopen_flags = 0;
if (LF_ISSET(WT_LOG_OPEN_CREATE_OK))
- FLD_SET(wtopen_flags, WT_OPEN_CREATE);
+ FLD_SET(wtopen_flags, WT_FS_OPEN_CREATE);
if (FLD_ISSET(conn->direct_io, WT_DIRECT_IO_LOG))
- FLD_SET(wtopen_flags, WT_OPEN_DIRECTIO);
+ FLD_SET(wtopen_flags, WT_FS_OPEN_DIRECTIO);
WT_ERR(__wt_open(
- session, buf->data, WT_OPEN_FILE_TYPE_LOG, wtopen_flags, fhp));
+ session, buf->data, WT_FS_OPEN_FILE_TYPE_LOG, wtopen_flags, fhp));
/*
* If we are not creating the log file but opening it for reading,
@@ -777,7 +777,7 @@ __log_alloc_prealloc(WT_SESSION_IMPL *session, uint32_t to_num)
* All file setup, writing the header and pre-allocation was done
* before. We only need to rename it.
*/
- WT_ERR(__wt_fs_rename(session, from_path->data, to_path->data));
+ WT_ERR(__wt_fs_rename(session, from_path->data, to_path->data, false));
err: __wt_scr_free(session, &from_path);
__wt_scr_free(session, &to_path);
@@ -1063,7 +1063,7 @@ __wt_log_allocfile(
/*
* Rename it into place and make it available.
*/
- WT_ERR(__wt_fs_rename(session, from_path->data, to_path->data));
+ WT_ERR(__wt_fs_rename(session, from_path->data, to_path->data, false));
err: __wt_scr_free(session, &from_path);
__wt_scr_free(session, &to_path);
@@ -1086,7 +1086,7 @@ __wt_log_remove(WT_SESSION_IMPL *session,
WT_ERR(__log_filename(session, lognum, file_prefix, path));
WT_ERR(__wt_verbose(session, WT_VERB_LOG,
"log_remove: remove log %s", (char *)path->data));
- WT_ERR(__wt_fs_remove(session, path->data));
+ WT_ERR(__wt_fs_remove(session, path->data, false));
err: __wt_scr_free(session, &path);
return (ret);
}
@@ -1122,7 +1122,7 @@ __wt_log_open(WT_SESSION_IMPL *session)
WT_RET(__wt_verbose(session, WT_VERB_LOG,
"log_open: open fh to directory %s", conn->log_path));
WT_RET(__wt_open(session, conn->log_path,
- WT_OPEN_FILE_TYPE_DIRECTORY, 0, &log->log_dir_fh));
+ WT_FS_OPEN_FILE_TYPE_DIRECTORY, 0, &log->log_dir_fh));
}
if (!F_ISSET(conn, WT_CONN_READONLY)) {
diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c
index c19f42327be..0f2a407c70d 100644
--- a/src/lsm/lsm_work_unit.c
+++ b/src/lsm/lsm_work_unit.c
@@ -526,7 +526,7 @@ __lsm_drop_file(WT_SESSION_IMPL *session, const char *uri)
ret = __wt_schema_drop(session, uri, drop_cfg));
if (ret == 0)
- ret = __wt_fs_remove(session, uri + strlen("file:"));
+ ret = __wt_fs_remove(session, uri + strlen("file:"), false);
WT_RET(__wt_verbose(session, WT_VERB_LSM, "Dropped %s", uri));
if (ret == EBUSY || ret == ENOENT)
diff --git a/src/meta/meta_track.c b/src/meta/meta_track.c
index eb06b2bed66..3d8b7c46500 100644
--- a/src/meta/meta_track.c
+++ b/src/meta/meta_track.c
@@ -141,7 +141,8 @@ __meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk)
ret = bm->checkpoint_resolve(bm, session));
break;
case WT_ST_DROP_COMMIT:
- if ((ret = __wt_block_manager_drop(session, trk->a)) != 0)
+ if ((ret =
+ __wt_block_manager_drop(session, trk->a, false)) != 0)
__wt_err(session, ret,
"metadata remove dropped file %s", trk->a);
break;
@@ -188,13 +189,15 @@ __meta_track_unroll(WT_SESSION_IMPL *session, WT_META_TRACK *trk)
* For removes, b is NULL.
*/
if (trk->a != NULL && trk->b != NULL &&
- (ret = __wt_rename_and_sync_directory(session,
- trk->b + strlen("file:"), trk->a + strlen("file:"))) != 0)
+ (ret = __wt_fs_rename(session,
+ trk->b + strlen("file:"), trk->a + strlen("file:"),
+ true)) != 0)
__wt_err(session, ret,
"metadata unroll rename %s to %s", trk->b, trk->a);
- if (trk->a == NULL && (ret =
- __wt_fs_remove(session, trk->b + strlen("file:"))) != 0)
+ if (trk->a == NULL &&
+ (ret = __wt_fs_remove(session,
+ trk->b + strlen("file:"), false)) != 0)
__wt_err(session, ret,
"metadata unroll create %s", trk->b);
diff --git a/src/meta/meta_turtle.c b/src/meta/meta_turtle.c
index 4d2b359bbed..ace0fabab48 100644
--- a/src/meta/meta_turtle.c
+++ b/src/meta/meta_turtle.c
@@ -158,7 +158,7 @@ __wt_turtle_init(WT_SESSION_IMPL *session)
* Discard any turtle setup file left-over from previous runs. This
* doesn't matter for correctness, it's just cleaning up random files.
*/
- WT_RET(__wt_remove_if_exists(session, WT_METADATA_TURTLE_SET));
+ WT_RET(__wt_remove_if_exists(session, WT_METADATA_TURTLE_SET, false));
/*
* We could die after creating the turtle file and before creating the
@@ -197,9 +197,10 @@ __wt_turtle_init(WT_SESSION_IMPL *session)
"Both %s and %s exist; recreating metadata from "
"backup",
WT_METADATA_TURTLE, WT_METADATA_BACKUP));
- WT_RET(__wt_remove_if_exists(session, WT_METAFILE));
+ WT_RET(
+ __wt_remove_if_exists(session, WT_METAFILE, false));
WT_RET(__wt_remove_if_exists(
- session, WT_METADATA_TURTLE));
+ session, WT_METADATA_TURTLE, false));
load = true;
}
} else
@@ -305,7 +306,7 @@ __wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value)
* every time.
*/
WT_RET(__wt_fopen(session, WT_METADATA_TURTLE_SET,
- WT_OPEN_CREATE | WT_OPEN_EXCLUSIVE, WT_STREAM_WRITE, &fs));
+ WT_FS_OPEN_CREATE | WT_FS_OPEN_EXCLUSIVE, WT_STREAM_WRITE, &fs));
version = wiredtiger_version(&vmajor, &vminor, &vpatch);
WT_ERR(__wt_fprintf(session, fs,
@@ -320,7 +321,7 @@ __wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value)
/* Close any file handle left open, remove any temporary file. */
err: WT_TRET(__wt_fclose(session, &fs));
- WT_TRET(__wt_remove_if_exists(session, WT_METADATA_TURTLE_SET));
+ WT_TRET(__wt_remove_if_exists(session, WT_METADATA_TURTLE_SET, false));
return (ret);
}
diff --git a/src/os_common/filename.c b/src/os_common/filename.c
index 5f174288350..8b6c1269829 100644
--- a/src/os_common/filename.c
+++ b/src/os_common/filename.c
@@ -56,55 +56,17 @@ __wt_nfilename(
* Remove a file if it exists.
*/
int
-__wt_remove_if_exists(WT_SESSION_IMPL *session, const char *name)
+__wt_remove_if_exists(WT_SESSION_IMPL *session, const char *name, bool durable)
{
bool exist;
WT_RET(__wt_fs_exist(session, name, &exist));
if (exist)
- WT_RET(__wt_fs_remove(session, name));
+ WT_RET(__wt_fs_remove(session, name, durable));
return (0);
}
/*
- * __wt_rename_and_sync_directory --
- * Rename a file and sync the enclosing directory.
- */
-int
-__wt_rename_and_sync_directory(
- WT_SESSION_IMPL *session, const char *from, const char *to)
-{
- const char *fp, *tp;
- bool same_directory;
-
- /* Rename the source file to the target. */
- WT_RET(__wt_fs_rename(session, from, to));
-
- /*
- * Flush the backing directory to guarantee the rename. My reading of
- * POSIX 1003.1 is there's no guarantee flushing only one of the from
- * or to directories, or flushing a common parent, is sufficient, and
- * even if POSIX were to make that guarantee, existing filesystems are
- * known to not provide the guarantee or only provide the guarantee
- * with specific mount options. Flush both of the from/to directories
- * until it's a performance problem.
- */
- WT_RET(__wt_fs_directory_sync(session, from));
-
- /*
- * In almost all cases, we're going to be renaming files in the same
- * directory, we can at least fast-path that.
- */
- fp = strrchr(from, '/');
- tp = strrchr(to, '/');
- same_directory = (fp == NULL && tp == NULL) ||
- (fp != NULL && tp != NULL &&
- fp - from == tp - to && memcmp(from, to, (size_t)(fp - from)) == 0);
-
- return (same_directory ? 0 : __wt_fs_directory_sync(session, to));
-}
-
-/*
* __wt_copy_and_sync --
* Copy a file safely; here to support the wt utility.
*/
@@ -134,13 +96,13 @@ __wt_copy_and_sync(WT_SESSION *wt_session, const char *from, const char *to)
WT_ERR(__wt_scr_alloc(session, 0, &tmp));
WT_ERR(__wt_buf_fmt(session, tmp, "%s.copy", to));
- WT_ERR(__wt_remove_if_exists(session, to));
- WT_ERR(__wt_remove_if_exists(session, tmp->data));
+ WT_ERR(__wt_remove_if_exists(session, to, false));
+ WT_ERR(__wt_remove_if_exists(session, tmp->data, false));
/* Open the from and temporary file handles. */
- WT_ERR(__wt_open(session, from, WT_OPEN_FILE_TYPE_REGULAR, 0, &ffh));
- WT_ERR(__wt_open(session, tmp->data, WT_OPEN_FILE_TYPE_REGULAR,
- WT_OPEN_CREATE | WT_OPEN_EXCLUSIVE, &tfh));
+ WT_ERR(__wt_open(session, from, WT_FS_OPEN_FILE_TYPE_REGULAR, 0, &ffh));
+ WT_ERR(__wt_open(session, tmp->data, WT_FS_OPEN_FILE_TYPE_REGULAR,
+ WT_FS_OPEN_CREATE | WT_FS_OPEN_EXCLUSIVE, &tfh));
/*
* Allocate a copy buffer. Don't use a scratch buffer, this thing is
@@ -162,7 +124,7 @@ __wt_copy_and_sync(WT_SESSION *wt_session, const char *from, const char *to)
WT_ERR(__wt_fsync(session, tfh, true));
WT_ERR(__wt_close(session, &tfh));
- ret = __wt_rename_and_sync_directory(session, tmp->data, to);
+ ret = __wt_fs_rename(session, tmp->data, to, true);
err: WT_TRET(__wt_close(session, &ffh));
WT_TRET(__wt_close(session, &tfh));
diff --git a/src/os_common/os_fhandle.c b/src/os_common/os_fhandle.c
index 81e4cc14ccb..184a9df0e72 100644
--- a/src/os_common/os_fhandle.c
+++ b/src/os_common/os_fhandle.c
@@ -150,19 +150,19 @@ __open_verbose(
*/
switch (file_type) {
- case WT_OPEN_FILE_TYPE_CHECKPOINT:
+ case WT_FS_OPEN_FILE_TYPE_CHECKPOINT:
file_type_tag = "checkpoint";
break;
- case WT_OPEN_FILE_TYPE_DATA:
+ case WT_FS_OPEN_FILE_TYPE_DATA:
file_type_tag = "data";
break;
- case WT_OPEN_FILE_TYPE_DIRECTORY:
+ case WT_FS_OPEN_FILE_TYPE_DIRECTORY:
file_type_tag = "directory";
break;
- case WT_OPEN_FILE_TYPE_LOG:
+ case WT_FS_OPEN_FILE_TYPE_LOG:
file_type_tag = "log";
break;
- case WT_OPEN_FILE_TYPE_REGULAR:
+ case WT_FS_OPEN_FILE_TYPE_REGULAR:
file_type_tag = "regular";
break;
default:
@@ -172,18 +172,18 @@ __open_verbose(
WT_RET(__wt_scr_alloc(session, 0, &tmp));
sep = " (";
-#define WT_OPEN_VERBOSE_FLAG(f, name) \
+#define WT_FS_OPEN_VERBOSE_FLAG(f, name) \
if (LF_ISSET(f)) { \
WT_ERR(__wt_buf_catfmt( \
session, tmp, "%s%s", sep, name)); \
sep = ", "; \
}
- WT_OPEN_VERBOSE_FLAG(WT_OPEN_CREATE, "create");
- WT_OPEN_VERBOSE_FLAG(WT_OPEN_DIRECTIO, "direct-IO");
- WT_OPEN_VERBOSE_FLAG(WT_OPEN_EXCLUSIVE, "exclusive");
- WT_OPEN_VERBOSE_FLAG(WT_OPEN_FIXED, "fixed");
- WT_OPEN_VERBOSE_FLAG(WT_OPEN_READONLY, "readonly");
+ WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_CREATE, "create");
+ WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_DIRECTIO, "direct-IO");
+ WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_EXCLUSIVE, "exclusive");
+ WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_FIXED, "fixed");
+ WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_READONLY, "readonly");
if (tmp->size != 0)
WT_ERR(__wt_buf_catfmt(session, tmp, ")"));
@@ -209,7 +209,7 @@ err: __wt_scr_free(session, &tmp);
*/
int
__wt_open(WT_SESSION_IMPL *session,
- const char *name, WT_OPEN_FILE_TYPE file_type, u_int flags, WT_FH **fhp)
+ const char *name, WT_FS_OPEN_FILE_TYPE file_type, u_int flags, WT_FH **fhp)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
@@ -247,12 +247,12 @@ __wt_open(WT_SESSION_IMPL *session,
if (F_ISSET(conn, WT_CONN_READONLY)) {
lock_file = strcmp(name, WT_SINGLETHREAD) == 0;
if (!lock_file)
- LF_SET(WT_OPEN_READONLY);
- WT_ASSERT(session, lock_file || !LF_ISSET(WT_OPEN_CREATE));
+ LF_SET(WT_FS_OPEN_READONLY);
+ WT_ASSERT(session, lock_file || !LF_ISSET(WT_FS_OPEN_CREATE));
}
/* Create the path to the file. */
- if (!LF_ISSET(WT_OPEN_FIXED))
+ if (!LF_ISSET(WT_FS_OPEN_FIXED))
WT_ERR(__wt_filename(session, name, &path));
/* Call the underlying open function. */
@@ -261,7 +261,7 @@ __wt_open(WT_SESSION_IMPL *session,
open_called = true;
WT_ERR(__fhandle_method_finalize(
- session, fh->handle, LF_ISSET(WT_OPEN_READONLY)));
+ session, fh->handle, LF_ISSET(WT_FS_OPEN_READONLY)));
/*
* Repeat the check for a match: if there's no match, link our newly
diff --git a/src/os_common/os_fs_inmemory.c b/src/os_common/os_fs_inmemory.c
index 09c2e08db83..178adc1dac8 100644
--- a/src/os_common/os_fs_inmemory.c
+++ b/src/os_common/os_fs_inmemory.c
@@ -188,14 +188,16 @@ __im_fs_exist(WT_FILE_SYSTEM *file_system,
* POSIX remove.
*/
static int
-__im_fs_remove(
- WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, const char *name)
+__im_fs_remove(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *wt_session, const char *name, uint32_t flags)
{
WT_DECL_RET;
WT_FILE_HANDLE_INMEM *im_fh;
WT_FILE_SYSTEM_INMEM *im_fs;
WT_SESSION_IMPL *session;
+ WT_UNUSED(flags);
+
im_fs = (WT_FILE_SYSTEM_INMEM *)file_system;
session = (WT_SESSION_IMPL *)wt_session;
@@ -215,7 +217,7 @@ __im_fs_remove(
*/
static int
__im_fs_rename(WT_FILE_SYSTEM *file_system,
- WT_SESSION *wt_session, const char *from, const char *to)
+ WT_SESSION *wt_session, const char *from, const char *to, uint32_t flags)
{
WT_DECL_RET;
WT_FILE_HANDLE_INMEM *im_fh;
@@ -224,6 +226,8 @@ __im_fs_rename(WT_FILE_SYSTEM *file_system,
uint64_t bucket;
char *copy;
+ WT_UNUSED(flags);
+
im_fs = (WT_FILE_SYSTEM_INMEM *)file_system;
session = (WT_SESSION_IMPL *)wt_session;
@@ -463,7 +467,7 @@ err: __wt_spin_unlock(session, &im_fs->lock);
*/
static int
__im_file_open(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
- const char *name, WT_OPEN_FILE_TYPE file_type, uint32_t flags,
+ const char *name, WT_FS_OPEN_FILE_TYPE file_type, uint32_t flags,
WT_FILE_HANDLE **file_handlep)
{
WT_DECL_RET;
diff --git a/src/os_common/os_fstream.c b/src/os_common/os_fstream.c
index 0b199529e19..5a368ea75e6 100644
--- a/src/os_common/os_fstream.c
+++ b/src/os_common/os_fstream.c
@@ -187,7 +187,7 @@ __wt_fopen(WT_SESSION_IMPL *session,
fstr = NULL;
WT_RET(__wt_open(
- session, name, WT_OPEN_FILE_TYPE_REGULAR, open_flags, &fh));
+ session, name, WT_FS_OPEN_FILE_TYPE_REGULAR, open_flags, &fh));
WT_ERR(__wt_calloc_one(session, &fstr));
fstr->fh = fh;
diff --git a/src/os_posix/os_fs.c b/src/os_posix/os_fs.c
index 86fa2e8f117..11f38ec063b 100644
--- a/src/os_posix/os_fs.c
+++ b/src/os_posix/os_fs.c
@@ -30,7 +30,7 @@
/*
* __posix_sync --
- * Underlying support function to flush a file handle.
+ * Underlying support function to flush a file descriptor.
*/
static int
__posix_sync(
@@ -77,33 +77,42 @@ __posix_sync(
#ifdef __linux__
/*
* __posix_directory_sync --
- * Flush a directory to ensure file creation is durable.
+ * Flush a directory to ensure file creation, remove or rename is durable.
*/
static int
-__posix_directory_sync(
- WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, const char *path)
+__posix_directory_sync(WT_SESSION_IMPL *session, const char *path)
{
+ WT_DECL_ITEM(tmp);
WT_DECL_RET;
- WT_SESSION_IMPL *session;
int fd, tret;
+ char *dir;
- WT_UNUSED(file_system);
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_buf_setstr(session, tmp, path));
- session = (WT_SESSION_IMPL *)wt_session;
+ /*
+ * This layer should never see a path that doesn't include a trailing
+ * path separator, this code asserts that fact.
+ */
+ dir = tmp->mem;
+ strrchr(dir, '/')[1] = '\0';
+ fd = -1; /* -Wconditional-uninitialized */
WT_SYSCALL_RETRY((
- (fd = open(path, O_RDONLY, 0444)) == -1 ? -1 : 0), ret);
+ (fd = open(dir, O_RDONLY, 0444)) == -1 ? -1 : 0), ret);
if (ret != 0)
- WT_RET_MSG(session, ret, "%s: directory-sync: open", path);
+ WT_ERR_MSG(session, ret, "%s: directory-sync: open", dir);
- ret = __posix_sync(session, fd, path, "directory-sync");
+ ret = __posix_sync(session, fd, dir, "directory-sync");
WT_SYSCALL(close(fd), tret);
if (tret != 0) {
- __wt_err(session, tret, "%s: directory-sync: close", path);
+ __wt_err(session, tret, "%s: directory-sync: close", dir);
if (ret == 0)
ret = tret;
}
+
+err: __wt_scr_free(session, &tmp);
return (ret);
}
#endif
@@ -141,8 +150,8 @@ __posix_fs_exist(WT_FILE_SYSTEM *file_system,
* Remove a file.
*/
static int
-__posix_fs_remove(
- WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, const char *name)
+__posix_fs_remove(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *wt_session, const char *name, uint32_t flags)
{
WT_DECL_RET;
WT_SESSION_IMPL *session;
@@ -159,9 +168,17 @@ __posix_fs_remove(
* using unlink may be marginally safer.
*/
WT_SYSCALL(unlink(name), ret);
- if (ret == 0)
+ if (ret != 0)
+ WT_RET_MSG(session, ret, "%s: file-remove: unlink", name);
+
+ if (!LF_ISSET(WT_FS_DURABLE))
return (0);
- WT_RET_MSG(session, ret, "%s: file-remove: unlink", name);
+
+#ifdef __linux__
+ /* Flush the backing directory to guarantee the remove. */
+ WT_RET (__posix_directory_sync(session, name));
+#endif
+ return (0);
}
/*
@@ -170,7 +187,7 @@ __posix_fs_remove(
*/
static int
__posix_fs_rename(WT_FILE_SYSTEM *file_system,
- WT_SESSION *wt_session, const char *from, const char *to)
+ WT_SESSION *wt_session, const char *from, const char *to, uint32_t flags)
{
WT_DECL_RET;
WT_SESSION_IMPL *session;
@@ -187,9 +204,43 @@ __posix_fs_rename(WT_FILE_SYSTEM *file_system,
* return (if errno is 0), but we've done the best we can.
*/
WT_SYSCALL(rename(from, to) != 0 ? -1 : 0, ret);
- if (ret == 0)
+ if (ret != 0)
+ WT_RET_MSG(
+ session, ret, "%s to %s: file-rename: rename", from, to);
+
+ if (!LF_ISSET(WT_FS_DURABLE))
return (0);
- WT_RET_MSG(session, ret, "%s to %s: file-rename: rename", from, to);
+#ifdef __linux__
+ /*
+ * Flush the backing directory to guarantee the rename. My reading of
+ * POSIX 1003.1 is there's no guarantee flushing only one of the from
+ * or to directories, or flushing a common parent, is sufficient, and
+ * even if POSIX were to make that guarantee, existing filesystems are
+ * known to not provide the guarantee or only provide the guarantee
+ * with specific mount options. Flush both of the from/to directories
+ * until it's a performance problem.
+ */
+ WT_RET(__posix_directory_sync(session, from));
+
+ /*
+ * In almost all cases, we're going to be renaming files in the same
+ * directory, we can at least fast-path that.
+ */
+ {
+ bool same_directory;
+ const char *fp, *tp;
+
+ fp = strrchr(from, '/');
+ tp = strrchr(to, '/');
+ same_directory = (fp == NULL && tp == NULL) ||
+ (fp != NULL && tp != NULL &&
+ fp - from == tp - to && memcmp(from, to, (size_t)(fp - from)) == 0);
+
+ if (!same_directory)
+ WT_RET(__posix_directory_sync(session, to));
+ }
+#endif
+ return (0);
}
/*
@@ -513,7 +564,7 @@ __posix_open_file_cloexec(WT_SESSION_IMPL *session, int fd, const char *name)
*/
static int
__posix_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
- const char *name, WT_OPEN_FILE_TYPE file_type, uint32_t flags,
+ const char *name, WT_FS_OPEN_FILE_TYPE file_type, uint32_t flags,
WT_FILE_HANDLE **file_handlep)
{
WT_CONNECTION_IMPL *conn;
@@ -536,7 +587,7 @@ __posix_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
/* Set up error handling. */
pfh->fd = -1;
- if (file_type == WT_OPEN_FILE_TYPE_DIRECTORY) {
+ if (file_type == WT_FS_OPEN_FILE_TYPE_DIRECTORY) {
f = O_RDONLY;
#ifdef O_CLOEXEC
/*
@@ -554,10 +605,10 @@ __posix_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
goto directory_open;
}
- f = LF_ISSET(WT_OPEN_READONLY) ? O_RDONLY : O_RDWR;
- if (LF_ISSET(WT_OPEN_CREATE)) {
+ f = LF_ISSET(WT_FS_OPEN_READONLY) ? O_RDONLY : O_RDWR;
+ if (LF_ISSET(WT_FS_OPEN_CREATE)) {
f |= O_CREAT;
- if (LF_ISSET(WT_OPEN_EXCLUSIVE))
+ if (LF_ISSET(WT_FS_OPEN_EXCLUSIVE))
f |= O_EXCL;
mode = 0666;
} else
@@ -577,7 +628,7 @@ __posix_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
#endif
#ifdef O_DIRECT
/* Direct I/O. */
- if (LF_ISSET(WT_OPEN_DIRECTIO)) {
+ if (LF_ISSET(WT_FS_OPEN_DIRECTIO)) {
f |= O_DIRECT;
pfh->direct_io = true;
} else
@@ -585,11 +636,11 @@ __posix_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
#endif
#ifdef O_NOATIME
/* Avoid updating metadata for read-only workloads. */
- if (file_type == WT_OPEN_FILE_TYPE_DATA)
+ if (file_type == WT_FS_OPEN_FILE_TYPE_DATA)
f |= O_NOATIME;
#endif
- if (file_type == WT_OPEN_FILE_TYPE_LOG &&
+ if (file_type == WT_FS_OPEN_FILE_TYPE_LOG &&
FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC)) {
#ifdef O_DSYNC
f |= O_DSYNC;
@@ -601,6 +652,7 @@ __posix_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
#endif
}
+ /* Create/Open the file. */
WT_SYSCALL_RETRY(((pfh->fd = open(name, f, mode)) == -1 ? -1 : 0), ret);
if (ret != 0)
WT_ERR_MSG(session, ret,
@@ -608,6 +660,16 @@ __posix_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
"%s: handle-open: open: failed with direct I/O configured, "
"some filesystem types do not support direct I/O" :
"%s: handle-open: open", name);
+
+#ifdef __linux__
+ /*
+ * Durability: some filesystems require a directory sync to be confident
+ * the file will appear.
+ */
+ if (LF_ISSET(WT_FS_OPEN_DURABLE))
+ WT_ERR(__posix_directory_sync(session, name));
+#endif
+
WT_ERR(__posix_open_file_cloexec(session, pfh->fd, name));
#if defined(HAVE_POSIX_FADVISE)
@@ -616,7 +678,7 @@ __posix_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
* Ignore fadvise when doing direct I/O, the kernel cache isn't
* interesting.
*/
- if (!pfh->direct_io && file_type == WT_OPEN_FILE_TYPE_DATA) {
+ if (!pfh->direct_io && file_type == WT_FS_OPEN_FILE_TYPE_DATA) {
WT_SYSCALL(
posix_fadvise(pfh->fd, 0, 0, POSIX_FADV_RANDOM), ret);
if (ret != 0)
@@ -705,9 +767,6 @@ __wt_os_posix(WT_SESSION_IMPL *session)
/* Initialize the POSIX jump table. */
file_system->fs_directory_list = __wt_posix_directory_list;
file_system->fs_directory_list_free = __wt_posix_directory_list_free;
-#ifdef __linux__
- file_system->fs_directory_sync = __posix_directory_sync;
-#endif
file_system->fs_exist = __posix_fs_exist;
file_system->fs_open_file = __posix_open_file;
file_system->fs_remove = __posix_fs_remove;
diff --git a/src/os_win/os_fs.c b/src/os_win/os_fs.c
index 5daba124e90..fc03e0a2595 100644
--- a/src/os_win/os_fs.c
+++ b/src/os_win/os_fs.c
@@ -36,13 +36,14 @@ __win_fs_exist(WT_FILE_SYSTEM *file_system,
* Remove a file.
*/
static int
-__win_fs_remove(
- WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, const char *name)
+__win_fs_remove(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *wt_session, const char *name, uint32_t flags)
{
DWORD windows_error;
WT_SESSION_IMPL *session;
WT_UNUSED(file_system);
+ WT_UNUSED(flags);
session = (WT_SESSION_IMPL *)wt_session;
@@ -62,12 +63,13 @@ __win_fs_remove(
*/
static int
__win_fs_rename(WT_FILE_SYSTEM *file_system,
- WT_SESSION *wt_session, const char *from, const char *to)
+ WT_SESSION *wt_session, const char *from, const char *to, uint32_t flags)
{
DWORD windows_error;
WT_SESSION_IMPL *session;
WT_UNUSED(file_system);
+ WT_UNUSED(flags);
session = (WT_SESSION_IMPL *)wt_session;
@@ -426,7 +428,7 @@ __win_file_write(WT_FILE_HANDLE *file_handle,
*/
static int
__win_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
- const char *name, WT_OPEN_FILE_TYPE file_type, uint32_t flags,
+ const char *name, WT_FS_OPEN_FILE_TYPE file_type, uint32_t flags,
WT_FILE_HANDLE **file_handlep)
{
DWORD dwCreationDisposition, windows_error;
@@ -458,11 +460,11 @@ __win_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
* require that functionality: create an empty WT_FH structure with
* invalid handles.
*/
- if (file_type == WT_OPEN_FILE_TYPE_DIRECTORY)
+ if (file_type == WT_FS_OPEN_FILE_TYPE_DIRECTORY)
goto directory_open;
desired_access = GENERIC_READ;
- if (!LF_ISSET(WT_OPEN_READONLY))
+ if (!LF_ISSET(WT_FS_OPEN_READONLY))
desired_access |= GENERIC_WRITE;
/*
@@ -476,15 +478,15 @@ __win_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
f = FILE_ATTRIBUTE_NORMAL;
dwCreationDisposition = 0;
- if (LF_ISSET(WT_OPEN_CREATE)) {
+ if (LF_ISSET(WT_FS_OPEN_CREATE)) {
dwCreationDisposition = CREATE_NEW;
- if (LF_ISSET(WT_OPEN_EXCLUSIVE))
+ if (LF_ISSET(WT_FS_OPEN_EXCLUSIVE))
dwCreationDisposition = CREATE_ALWAYS;
} else
dwCreationDisposition = OPEN_EXISTING;
/* Direct I/O. */
- if (LF_ISSET(WT_OPEN_DIRECTIO)) {
+ if (LF_ISSET(WT_FS_OPEN_DIRECTIO)) {
f |= FILE_FLAG_NO_BUFFERING;
win_fh->direct_io = true;
}
@@ -493,19 +495,19 @@ __win_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
if (FLD_ISSET(conn->write_through, file_type))
f |= FILE_FLAG_WRITE_THROUGH;
- if (file_type == WT_OPEN_FILE_TYPE_LOG &&
+ if (file_type == WT_FS_OPEN_FILE_TYPE_LOG &&
FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC))
f |= FILE_FLAG_WRITE_THROUGH;
/* Disable read-ahead on trees: it slows down random read workloads. */
- if (file_type == WT_OPEN_FILE_TYPE_DATA)
+ if (file_type == WT_FS_OPEN_FILE_TYPE_DATA)
f |= FILE_FLAG_RANDOM_ACCESS;
win_fh->filehandle = CreateFileA(name, desired_access,
FILE_SHARE_READ | FILE_SHARE_WRITE,
NULL, dwCreationDisposition, f, NULL);
if (win_fh->filehandle == INVALID_HANDLE_VALUE) {
- if (LF_ISSET(WT_OPEN_CREATE) &&
+ if (LF_ISSET(WT_FS_OPEN_CREATE) &&
GetLastError() == ERROR_FILE_EXISTS)
win_fh->filehandle = CreateFileA(name, desired_access,
FILE_SHARE_READ | FILE_SHARE_WRITE,
@@ -528,7 +530,7 @@ __win_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
* concurrently with reads on the file. Writes would also move the file
* pointer.
*/
- if (!LF_ISSET(WT_OPEN_READONLY)) {
+ if (!LF_ISSET(WT_FS_OPEN_READONLY)) {
win_fh->filehandle_secondary = CreateFileA(name, desired_access,
FILE_SHARE_READ | FILE_SHARE_WRITE,
NULL, OPEN_EXISTING, f, NULL);
diff --git a/src/schema/schema_rename.c b/src/schema/schema_rename.c
index 8f4d374fd22..bc92c882117 100644
--- a/src/schema/schema_rename.c
+++ b/src/schema/schema_rename.c
@@ -64,7 +64,7 @@ __rename_file(
WT_ERR(__wt_metadata_insert(session, newuri, oldvalue));
/* Rename the underlying file. */
- WT_ERR(__wt_fs_rename(session, filename, newfile));
+ WT_ERR(__wt_fs_rename(session, filename, newfile, false));
if (WT_META_TRACKING(session))
WT_ERR(__wt_meta_track_fileop(session, uri, newuri));