From 1ab1010737145ba3761318508ff65ba74dfe8155 Mon Sep 17 00:00:00 2001 From: Luke Chen Date: Thu, 23 Aug 2018 10:31:08 +1000 Subject: Import wiredtiger: 264bc732b0b25f84e4c3af70220d3cedb34bc551 from branch mongodb-3.2 ref: 4babae8093..264bc732b0 for: 3.2.21 WT-3637 Fix a heap use after free from evicting of a page that just split. WT-3710 Fix a race condition between concurrent page splits WT-4045 Don't retry fsync calls after EIO failure --- src/third_party/wiredtiger/dist/s_string.ok | 3 + src/third_party/wiredtiger/import.data | 2 +- src/third_party/wiredtiger/src/btree/bt_split.c | 5 ++ src/third_party/wiredtiger/src/os_posix/os_fs.c | 78 +++++++++++++++++----- .../wiredtiger/src/reconcile/rec_write.c | 12 ++++ 5 files changed, 83 insertions(+), 17 deletions(-) diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index 99abc3e9ad1..fd8854a32e5 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -143,6 +143,7 @@ Fprintf FreeBSD FreeBSD's FreeLibrary +Fsync Fuerst GCC GIDs @@ -207,6 +208,7 @@ LSM LSN LSNs LTE +LWN LZ LZO LeafGreen @@ -285,6 +287,7 @@ Pandis Phong PlatformSDK Posix +PostgreSQL PowerPC Pre Preload diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index b613d8d45e1..fdb9f216469 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "4babae8093f7a3cb05226abe959e10e0bb6b2716", + "commit": "264bc732b0b25f84e4c3af70220d3cedb34bc551", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-3.2" diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 92be2125b88..6c2d1c06000 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -462,6 +462,9 @@ __split_ref_prepare(WT_SESSION_IMPL *session, * ascend into the created children, but eventually fail as that parent * page won't yet know about the created children pages. That's OK, we * spin there until the parent's page index is updated. + * + * Lock the newly created page to ensure it doesn't split until all + * child pages have been updated. */ for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) { ref = pindex->index[i]; @@ -495,12 +498,14 @@ __split_ref_prepare(WT_SESSION_IMPL *session, * reading the child's page index structure is safe. */ j = 0; + WT_PAGE_LOCK(session, child); WT_ENTER_PAGE_INDEX(session); WT_INTL_FOREACH_BEGIN(session, child, child_ref) { child_ref->home = child; child_ref->pindex_hint = j++; } WT_INTL_FOREACH_END; WT_LEAVE_PAGE_INDEX(session); + WT_PAGE_UNLOCK(session, child); #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, diff --git a/src/third_party/wiredtiger/src/os_posix/os_fs.c b/src/third_party/wiredtiger/src/os_posix/os_fs.c index bc8cbf67025..3c90183caf2 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_fs.c +++ b/src/third_party/wiredtiger/src/os_posix/os_fs.c @@ -31,6 +31,23 @@ /* * __posix_sync -- * Underlying support function to flush a file descriptor. + * + * Fsync calls (or fsync-style calls, for example, fdatasync) are not retried + * on failure, and failure halts the system. + * + * Excerpted from the LWN.net article https://lwn.net/Articles/752063/: + * In short, PostgreSQL assumes that a successful call to fsync() indicates + * that all data written since the last successful call made it safely to + * persistent storage. But that is not what the kernel actually does. When + * a buffered I/O write fails due to a hardware-level error, filesystems + * will respond differently, but that behavior usually includes discarding + * the data in the affected pages and marking them as being clean. So a read + * of the blocks that were just written will likely return something other + * than the data that was written. + * + * Given the shared history of UNIX filesystems, and the difficulty of knowing + * what specific error will be returned under specific circumstances, we don't + * retry fsync-style calls and panic if a flush operation fails. */ static int __posix_sync( @@ -52,25 +69,49 @@ __posix_sync( * OS X F_FULLFSYNC fcntl documentation: * "This is currently implemented on HFS, MS-DOS (FAT), and Universal * Disk Format (UDF) file systems." + * + * See comment in __posix_sync(): sync cannot be retried or fail. */ - WT_SYSCALL_RETRY(fcntl(fd, F_FULLFSYNC, 0) == -1 ? -1 : 0, ret); - if (ret == 0) - return (0); - /* - * Assume F_FULLFSYNC failed because the file system doesn't support it - * and fallback to fsync. - */ + static enum { FF_NOTSET, FF_IGNORE, FF_OK } ff_status = FF_NOTSET; + switch (ff_status) { + case FF_NOTSET: + WT_SYSCALL(fcntl(fd, F_FULLFSYNC, 0) == -1 ? -1 : 0, ret); + if (ret == 0) { + ff_status = FF_OK; + return (0); + } + + /* + * If the first F_FULLFSYNC fails, assume the file system + * doesn't support it and fallback to fdatasync or fsync. + */ + ff_status = FF_IGNORE; + __wt_err(session, ret, + "fcntl(F_FULLFSYNC) failed, falling back to fdatasync " + "or fsync"); + break; + case FF_IGNORE: + break; + case FF_OK: + WT_SYSCALL(fcntl(fd, F_FULLFSYNC, 0) == -1 ? -1 : 0, ret); + if (ret == 0) + return (0); + WT_PANIC_RET(session, + ret, "%s: %s: fcntl(F_FULLFSYNC)", name, func); + } #endif #if defined(HAVE_FDATASYNC) - WT_SYSCALL_RETRY(fdatasync(fd), ret); + /* See comment in __posix_sync(): sync cannot be retried or fail. */ + WT_SYSCALL(fdatasync(fd), ret); if (ret == 0) return (0); - WT_RET_MSG(session, ret, "%s: %s: fdatasync", name, func); + WT_PANIC_RET(session, ret, "%s: %s: fdatasync", name, func); #else - WT_SYSCALL_RETRY(fsync(fd), ret); + /* See comment in __posix_sync(): sync cannot be retried or fail. */ + WT_SYSCALL(fsync(fd), ret); if (ret == 0) return (0); - WT_RET_MSG(session, ret, "%s: %s: fsync", name, func); + WT_PANIC_RET(session, ret, "%s: %s: fsync", name, func); #endif } @@ -108,12 +149,15 @@ __posix_directory_sync(WT_SESSION_IMPL *session, const char *path) WT_SYSCALL(close(fd), tret); if (tret != 0) { __wt_err(session, tret, "%s: directory-sync: close", dir); - if (ret == 0) - ret = tret; + WT_TRET(tret); } err: __wt_scr_free(session, &tmp); - return (ret); + if (ret == 0) + return (ret); + + /* See comment in __posix_sync(): sync cannot be retried or fail. */ + WT_PANIC_RET(session, ret, "%s: directory-sync", path); } #endif @@ -460,11 +504,13 @@ __posix_file_sync_nowait(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session) session = (WT_SESSION_IMPL *)wt_session; pfh = (WT_FILE_HANDLE_POSIX *)file_handle; - WT_SYSCALL_RETRY(sync_file_range(pfh->fd, + /* See comment in __posix_sync(): sync cannot be retried or fail. */ + WT_SYSCALL(sync_file_range(pfh->fd, (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE), ret); if (ret == 0) return (0); - WT_RET_MSG(session, ret, + + WT_PANIC_RET(session, ret, "%s: handle-sync-nowait: sync_file_range", file_handle->name); } #endif diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index e59d9796352..1c17a90df2f 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -388,6 +388,18 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, */ WT_PAGE_LOCK(session, page); + /* + * Now that the page is locked, if attempting to evict it, check again + * whether eviction is permitted. The page's state could have changed + * while we were waiting to acquire the lock (e.g., the page could have + * split). + */ + if (LF_ISSET(WT_EVICTING) && + !__wt_page_can_evict(session, ref, NULL)) { + WT_PAGE_UNLOCK(session, page); + return (EBUSY); + } + oldest_id = __wt_txn_oldest_id(session); if (LF_ISSET(WT_EVICTING)) mod->last_eviction_id = oldest_id; -- cgit v1.2.1