summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Luke Chen <luke.chen@mongodb.com> 2018-08-23 10:31:08 +1000
committer Luke Chen <luke.chen@mongodb.com> 2018-08-23 10:32:12 +1000
commit 1ab1010737145ba3761318508ff65ba74dfe8155 (patch)
tree 65ce8e71f66740f515a57fbc72bd87333138f9f6
parent 1711789a79b3e9c15865a3817d2b22c0f53b7c0f (diff)
download mongo-1ab1010737145ba3761318508ff65ba74dfe8155.tar.gz
Import wiredtiger: 264bc732b0b25f84e4c3af70220d3cedb34bc551 from branch mongodb-3.2 (tags: r3.2.21-rc0, r3.2.21)
ref: 4babae8093..264bc732b0
for: 3.2.21

WT-3637 Fix a heap use after free from evicting of a page that just split.
WT-3710 Fix a race condition between concurrent page splits
WT-4045 Don't retry fsync calls after EIO failure
-rw-r--r-- src/third_party/wiredtiger/dist/s_string.ok 3
-rw-r--r-- src/third_party/wiredtiger/import.data 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_split.c 5
-rw-r--r-- src/third_party/wiredtiger/src/os_posix/os_fs.c 78
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_write.c 12
5 files changed, 83 insertions, 17 deletions
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
index 99abc3e9ad1..fd8854a32e5 100644
--- a/src/third_party/wiredtiger/dist/s_string.ok
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -143,6 +143,7 @@ Fprintf
FreeBSD
FreeBSD's
FreeLibrary
+Fsync
Fuerst
GCC
GIDs
@@ -207,6 +208,7 @@ LSM
LSN
LSNs
LTE
+LWN
LZ
LZO
LeafGreen
@@ -285,6 +287,7 @@ Pandis
Phong
PlatformSDK
Posix
+PostgreSQL
PowerPC
Pre
Preload
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index b613d8d45e1..fdb9f216469 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -1,5 +1,5 @@
{
- "commit": "4babae8093f7a3cb05226abe959e10e0bb6b2716",
+ "commit": "264bc732b0b25f84e4c3af70220d3cedb34bc551",
"github": "wiredtiger/wiredtiger.git",
"vendor": "wiredtiger",
"branch": "mongodb-3.2"
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 92be2125b88..6c2d1c06000 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -462,6 +462,9 @@ __split_ref_prepare(WT_SESSION_IMPL *session,
* ascend into the created children, but eventually fail as that parent
* page won't yet know about the created children pages. That's OK, we
* spin there until the parent's page index is updated.
+ *
+ * Lock the newly created page to ensure it doesn't split until all
+ * child pages have been updated.
*/
for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) {
ref = pindex->index[i];
@@ -495,12 +498,14 @@ __split_ref_prepare(WT_SESSION_IMPL *session,
* reading the child's page index structure is safe.
*/
j = 0;
+ WT_PAGE_LOCK(session, child);
WT_ENTER_PAGE_INDEX(session);
WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
child_ref->home = child;
child_ref->pindex_hint = j++;
} WT_INTL_FOREACH_END;
WT_LEAVE_PAGE_INDEX(session);
+ WT_PAGE_UNLOCK(session, child);
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
diff --git a/src/third_party/wiredtiger/src/os_posix/os_fs.c b/src/third_party/wiredtiger/src/os_posix/os_fs.c
index bc8cbf67025..3c90183caf2 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_fs.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_fs.c
@@ -31,6 +31,23 @@
/*
* __posix_sync --
* Underlying support function to flush a file descriptor.
+ *
+ * Fsync calls (or fsync-style calls, for example, fdatasync) are not retried
+ * on failure, and failure halts the system.
+ *
+ * Excerpted from the LWN.net article https://lwn.net/Articles/752063/:
+ * In short, PostgreSQL assumes that a successful call to fsync() indicates
+ * that all data written since the last successful call made it safely to
+ * persistent storage. But that is not what the kernel actually does. When
+ * a buffered I/O write fails due to a hardware-level error, filesystems
+ * will respond differently, but that behavior usually includes discarding
+ * the data in the affected pages and marking them as being clean. So a read
+ * of the blocks that were just written will likely return something other
+ * than the data that was written.
+ *
+ * Given the shared history of UNIX filesystems, and the difficulty of knowing
+ * what specific error will be returned under specific circumstances, we don't
+ * retry fsync-style calls and panic if a flush operation fails.
*/
static int
__posix_sync(
@@ -52,25 +69,49 @@ __posix_sync(
* OS X F_FULLFSYNC fcntl documentation:
* "This is currently implemented on HFS, MS-DOS (FAT), and Universal
* Disk Format (UDF) file systems."
+ *
+ * See comment in __posix_sync(): sync cannot be retried or fail.
*/
- WT_SYSCALL_RETRY(fcntl(fd, F_FULLFSYNC, 0) == -1 ? -1 : 0, ret);
- if (ret == 0)
- return (0);
- /*
- * Assume F_FULLFSYNC failed because the file system doesn't support it
- * and fallback to fsync.
- */
+ static enum { FF_NOTSET, FF_IGNORE, FF_OK } ff_status = FF_NOTSET;
+ switch (ff_status) {
+ case FF_NOTSET:
+ WT_SYSCALL(fcntl(fd, F_FULLFSYNC, 0) == -1 ? -1 : 0, ret);
+ if (ret == 0) {
+ ff_status = FF_OK;
+ return (0);
+ }
+
+ /*
+ * If the first F_FULLFSYNC fails, assume the file system
+ * doesn't support it and fallback to fdatasync or fsync.
+ */
+ ff_status = FF_IGNORE;
+ __wt_err(session, ret,
+ "fcntl(F_FULLFSYNC) failed, falling back to fdatasync "
+ "or fsync");
+ break;
+ case FF_IGNORE:
+ break;
+ case FF_OK:
+ WT_SYSCALL(fcntl(fd, F_FULLFSYNC, 0) == -1 ? -1 : 0, ret);
+ if (ret == 0)
+ return (0);
+ WT_PANIC_RET(session,
+ ret, "%s: %s: fcntl(F_FULLFSYNC)", name, func);
+ }
#endif
#if defined(HAVE_FDATASYNC)
- WT_SYSCALL_RETRY(fdatasync(fd), ret);
+ /* See comment in __posix_sync(): sync cannot be retried or fail. */
+ WT_SYSCALL(fdatasync(fd), ret);
if (ret == 0)
return (0);
- WT_RET_MSG(session, ret, "%s: %s: fdatasync", name, func);
+ WT_PANIC_RET(session, ret, "%s: %s: fdatasync", name, func);
#else
- WT_SYSCALL_RETRY(fsync(fd), ret);
+ /* See comment in __posix_sync(): sync cannot be retried or fail. */
+ WT_SYSCALL(fsync(fd), ret);
if (ret == 0)
return (0);
- WT_RET_MSG(session, ret, "%s: %s: fsync", name, func);
+ WT_PANIC_RET(session, ret, "%s: %s: fsync", name, func);
#endif
}
@@ -108,12 +149,15 @@ __posix_directory_sync(WT_SESSION_IMPL *session, const char *path)
WT_SYSCALL(close(fd), tret);
if (tret != 0) {
__wt_err(session, tret, "%s: directory-sync: close", dir);
- if (ret == 0)
- ret = tret;
+ WT_TRET(tret);
}
err: __wt_scr_free(session, &tmp);
- return (ret);
+ if (ret == 0)
+ return (ret);
+
+ /* See comment in __posix_sync(): sync cannot be retried or fail. */
+ WT_PANIC_RET(session, ret, "%s: directory-sync", path);
}
#endif
@@ -460,11 +504,13 @@ __posix_file_sync_nowait(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session)
session = (WT_SESSION_IMPL *)wt_session;
pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
- WT_SYSCALL_RETRY(sync_file_range(pfh->fd,
+ /* See comment in __posix_sync(): sync cannot be retried or fail. */
+ WT_SYSCALL(sync_file_range(pfh->fd,
(off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE), ret);
if (ret == 0)
return (0);
- WT_RET_MSG(session, ret,
+
+ WT_PANIC_RET(session, ret,
"%s: handle-sync-nowait: sync_file_range", file_handle->name);
}
#endif
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index e59d9796352..1c17a90df2f 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -388,6 +388,18 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
*/
WT_PAGE_LOCK(session, page);
+ /*
+ * Now that the page is locked, if attempting to evict it, check again
+ * whether eviction is permitted. The page's state could have changed
+ * while we were waiting to acquire the lock (e.g., the page could have
+ * split).
+ */
+ if (LF_ISSET(WT_EVICTING) &&
+ !__wt_page_can_evict(session, ref, NULL)) {
+ WT_PAGE_UNLOCK(session, page);
+ return (EBUSY);
+ }
+
oldest_id = __wt_txn_oldest_id(session);
if (LF_ISSET(WT_EVICTING))
mod->last_eviction_id = oldest_id;