author    Luke Chen <luke.chen@mongodb.com>  2018-05-01 12:15:54 +1000
committer Luke Chen <luke.chen@mongodb.com>  2018-05-01 12:15:54 +1000
commit    b5b11cc858e1da6d8c451e0e9be17122cf5968de (patch)
tree      a1cc3479914f771ed2e5a89edf5cef37f7d0aad9 /src
parent    e88d29393fa22c857af21e107a3bc20402625b10 (diff)
Import wiredtiger: aa6646fd0a1394793edfcf799f5f41f1d073bc5d from branch mongodb-3.8
ref: abf0651a81..aa6646fd0a for: 4.0.0-rc0

WT-4045 Don't retry fsync calls after EIO failure
WT-4051 Test format configures too-small LSM caches.
WT-4061 Don't rollback during recovery due to cache pressure
Diffstat (limited to 'src')
 src/third_party/wiredtiger/dist/s_string.ok       |   3
 src/third_party/wiredtiger/import.data            |   2
 src/third_party/wiredtiger/src/evict/evict_lru.c  |  11
 src/third_party/wiredtiger/src/include/lsm.h      |   7
 src/third_party/wiredtiger/src/lsm/lsm_tree.c     |  16
 src/third_party/wiredtiger/src/os_posix/os_fs.c   |  80
 src/third_party/wiredtiger/test/format/config.c   | 139
 src/third_party/wiredtiger/test/format/format.h   |   3
 src/third_party/wiredtiger/test/format/wts.c      |  29
 9 files changed, 193 insertions(+), 97 deletions(-)
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
index 6ceb4c2013b..877f14c6b8c 100644
--- a/src/third_party/wiredtiger/dist/s_string.ok
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -146,6 +146,7 @@ Fprintf
FreeBSD
FreeBSD's
FreeLibrary
+Fsync
Fuerst
GCC
GIDs
@@ -213,6 +214,7 @@ LSM
LSN
LSNs
LTE
+LWN
LZ
LZO
LeafGreen
@@ -293,6 +295,7 @@ Pandis
Phong
PlatformSDK
Posix
+PostgreSQL
PowerPC
Pre
Preload
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 08ac7c3b053..3dcc44b3f0a 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -1,5 +1,5 @@
{
- "commit": "abf0651a814a01169c21a3cbe689ad2534134701",
+ "commit": "aa6646fd0a1394793edfcf799f5f41f1d073bc5d",
"github": "wiredtiger/wiredtiger.git",
"vendor": "wiredtiger",
"branch": "mongodb-3.8"
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index 3c1bdee2ef3..8396612b7ca 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -2375,11 +2375,14 @@ __wt_cache_eviction_worker(
for (initial_progress = cache->eviction_progress;; ret = 0) {
/*
* A pathological case: if we're the oldest transaction in the
- * system and the eviction server is stuck trying to find space,
- * abort the transaction to give up all hazard pointers before
- * trying again.
+ * system and the eviction server is stuck trying to find space
+ * (and we're not in recovery, because those transactions can't
+ * be rolled back), abort the transaction to give up all hazard
+ * pointers before trying again.
*/
- if (__wt_cache_stuck(session) && __wt_txn_am_oldest(session)) {
+ if (__wt_cache_stuck(session) &&
+ __wt_txn_am_oldest(session) &&
+ !F_ISSET(conn, WT_CONN_RECOVERING)) {
--cache->evict_aggressive_score;
WT_STAT_CONN_INCR(session, txn_fail_cache);
WT_ERR(__wt_txn_rollback_required(session,
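
The WT-4061 change above only skips the cache-pressure rollback while the
connection is recovering (recovery transactions cannot be rolled back); in
normal operation the oldest transaction can still be chosen, and the
application sees WT_ROLLBACK and retries. A minimal application-side sketch
against the public WiredTiger API (insert_with_retry is a hypothetical helper,
error paths trimmed):

#include <wiredtiger.h>

/*
 * insert_with_retry --
 *     Insert a key/value pair, retrying if the transaction is selected for
 *     rollback (for example, under cache pressure).
 */
static int
insert_with_retry(WT_SESSION *session, WT_CURSOR *cursor,
    const char *key, const char *value)
{
    int ret;

    for (;;) {
        if ((ret = session->begin_transaction(session, NULL)) != 0)
            return (ret);
        cursor->set_key(cursor, key);
        cursor->set_value(cursor, value);
        if ((ret = cursor->insert(cursor)) == 0)
            return (session->commit_transaction(session, NULL));

        /* Give up the snapshot and hazard pointers, then retry. */
        (void)session->rollback_transaction(session, NULL);
        if (ret != WT_ROLLBACK)
            return (ret);
    }
    /* NOTREACHED */
}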
diff --git a/src/third_party/wiredtiger/src/include/lsm.h b/src/third_party/wiredtiger/src/include/lsm.h
index 3102f1754cf..f515e03519a 100644
--- a/src/third_party/wiredtiger/src/include/lsm.h
+++ b/src/third_party/wiredtiger/src/include/lsm.h
@@ -193,6 +193,13 @@ struct __wt_lsm_manager {
#define WT_LSM_AGGRESSIVE_THRESHOLD 2
/*
+ * The minimum size for opening a tree: three chunks, plus one page for each
+ * participant in up to three concurrent merges.
+ */
+#define WT_LSM_TREE_MINIMUM_SIZE(chunk_size, merge_max, maxleafpage) \
+ (3 * (chunk_size) + 3 * ((merge_max) * (maxleafpage)))
+
+/*
* WT_LSM_TREE --
* An LSM tree.
*/
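
To make WT-4051's macro concrete, a standalone sketch with illustrative values
(a 10MB chunk size, merge_max of 15 and 4KB leaf pages; a real tree's
configuration may differ):

#include <inttypes.h>
#include <stdio.h>

/* Local copies of the WiredTiger definitions, for a self-contained example. */
#define WT_MEGABYTE (1024 * 1024)
#define WT_LSM_TREE_MINIMUM_SIZE(chunk_size, merge_max, maxleafpage) \
    (3 * (chunk_size) + 3 * ((merge_max) * (maxleafpage)))

int
main(void)
{
    uint64_t required;

    /* 3 * 10MB + 3 * (15 * 4KB): 30MB of chunks plus 180KB of merge pages. */
    required = WT_LSM_TREE_MINIMUM_SIZE(
        (uint64_t)10 * WT_MEGABYTE, 15, 4 * 1024);
    printf("minimum LSM cache: %" PRIu64 " bytes (%" PRIu64 "MB rounded up)\n",
        required, (required + WT_MEGABYTE - 1) / WT_MEGABYTE);
    return (0);
}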
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
index 9a7ab20f18f..16b28a1aecc 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
@@ -467,25 +467,23 @@ static int
__lsm_tree_open_check(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
uint64_t maxleafpage, required;
const char *cfg[] = { WT_CONFIG_BASE(
session, WT_SESSION_create), lsm_tree->file_config, NULL };
+ conn = S2C(session);
+
WT_RET(__wt_config_gets(session, cfg, "leaf_page_max", &cval));
maxleafpage = (uint64_t)cval.val;
- /*
- * Three chunks, plus one page for each participant in up to three
- * concurrent merges.
- */
- required = 3 * lsm_tree->chunk_size +
- 3 * (lsm_tree->merge_max * maxleafpage);
- if (S2C(session)->cache_size < required)
+ required = WT_LSM_TREE_MINIMUM_SIZE(
+ lsm_tree->chunk_size, lsm_tree->merge_max, maxleafpage);
+ if (conn->cache_size < required)
WT_RET_MSG(session, EINVAL,
"LSM cache size %" PRIu64 " (%" PRIu64 "MB) too small, "
"must be at least %" PRIu64 " (%" PRIu64 "MB)",
- S2C(session)->cache_size,
- S2C(session)->cache_size / WT_MEGABYTE,
+ conn->cache_size, conn->cache_size / WT_MEGABYTE,
required, (required + (WT_MEGABYTE - 1))/ WT_MEGABYTE);
return (0);
}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_fs.c b/src/third_party/wiredtiger/src/os_posix/os_fs.c
index 7875a6be028..0af67ad38c5 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_fs.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_fs.c
@@ -31,6 +31,23 @@
/*
* __posix_sync --
* Underlying support function to flush a file descriptor.
+ *
+ * Fsync calls (or fsync-style calls, for example, fdatasync) are not retried
+ * on failure, and failure halts the system.
+ *
+ * Excerpted from the LWN.net article https://lwn.net/Articles/752063/:
+ * In short, PostgreSQL assumes that a successful call to fsync() indicates
+ * that all data written since the last successful call made it safely to
+ * persistent storage. But that is not what the kernel actually does. When
+ * a buffered I/O write fails due to a hardware-level error, filesystems
+ * will respond differently, but that behavior usually includes discarding
+ * the data in the affected pages and marking them as being clean. So a read
+ * of the blocks that were just written will likely return something other
+ * than the data that was written.
+ *
+ * Given the shared history of UNIX filesystems, and the difficulty of knowing
+ * what specific error will be returned under specific circumstances, we don't
+ * retry fsync-style calls and panic if a flush operation fails.
*/
static int
__posix_sync(
@@ -39,8 +56,6 @@ __posix_sync(
WT_DECL_RET;
#if defined(F_FULLFSYNC)
- static bool fullfsync_error_logged = false;
-
/*
* OS X fsync documentation:
* "Note that while fsync() will flush all data from the host to the
@@ -54,31 +69,49 @@ __posix_sync(
* OS X F_FULLFSYNC fcntl documentation:
* "This is currently implemented on HFS, MS-DOS (FAT), and Universal
* Disk Format (UDF) file systems."
+ *
+ * See comment in __posix_sync(): sync cannot be retried or fail.
*/
- WT_SYSCALL_RETRY(fcntl(fd, F_FULLFSYNC, 0) == -1 ? -1 : 0, ret);
- if (ret == 0)
- return (0);
+ static enum { FF_NOTSET, FF_IGNORE, FF_OK } ff_status = FF_NOTSET;
+ switch (ff_status) {
+ case FF_NOTSET:
+ WT_SYSCALL(fcntl(fd, F_FULLFSYNC, 0) == -1 ? -1 : 0, ret);
+ if (ret == 0) {
+ ff_status = FF_OK;
+ return (0);
+ }
- /*
- * Assume F_FULLFSYNC failed because the file system doesn't support it
- * and fallback to fsync.
- */
- if (!fullfsync_error_logged) {
- fullfsync_error_logged = true;
+ /*
+ * If the first F_FULLFSYNC fails, assume the file system
+ * doesn't support it and fallback to fdatasync or fsync.
+ */
+ ff_status = FF_IGNORE;
__wt_err(session, ret,
- "fcntl(F_FULLFSYNC) failed, falling back to fsync");
+ "fcntl(F_FULLFSYNC) failed, falling back to fdatasync "
+ "or fsync");
+ break;
+ case FF_IGNORE:
+ break;
+ case FF_OK:
+ WT_SYSCALL(fcntl(fd, F_FULLFSYNC, 0) == -1 ? -1 : 0, ret);
+ if (ret == 0)
+ return (0);
+ WT_PANIC_RET(session,
+ ret, "%s: %s: fcntl(F_FULLFSYNC)", name, func);
}
#endif
#if defined(HAVE_FDATASYNC)
- WT_SYSCALL_RETRY(fdatasync(fd), ret);
+ /* See comment in __posix_sync(): sync cannot be retried or fail. */
+ WT_SYSCALL(fdatasync(fd), ret);
if (ret == 0)
return (0);
- WT_RET_MSG(session, ret, "%s: %s: fdatasync", name, func);
+ WT_PANIC_RET(session, ret, "%s: %s: fdatasync", name, func);
#else
- WT_SYSCALL_RETRY(fsync(fd), ret);
+ /* See comment in __posix_sync(): sync cannot be retried or fail. */
+ WT_SYSCALL(fsync(fd), ret);
if (ret == 0)
return (0);
- WT_RET_MSG(session, ret, "%s: %s: fsync", name, func);
+ WT_PANIC_RET(session, ret, "%s: %s: fsync", name, func);
#endif
}
@@ -116,12 +149,15 @@ __posix_directory_sync(WT_SESSION_IMPL *session, const char *path)
WT_SYSCALL(close(fd), tret);
if (tret != 0) {
__wt_err(session, tret, "%s: directory-sync: close", dir);
- if (ret == 0)
- ret = tret;
+ WT_TRET(tret);
}
err: __wt_scr_free(session, &tmp);
- return (ret);
+ if (ret == 0)
+ return (ret);
+
+ /* See comment in __posix_sync(): sync cannot be retried or fail. */
+ WT_PANIC_RET(session, ret, "%s: directory-sync", path);
}
#endif
@@ -468,11 +504,13 @@ __posix_file_sync_nowait(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session)
session = (WT_SESSION_IMPL *)wt_session;
pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
- WT_SYSCALL_RETRY(sync_file_range(pfh->fd,
+ /* See comment in __posix_sync(): sync cannot be retried or fail. */
+ WT_SYSCALL(sync_file_range(pfh->fd,
(off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE), ret);
if (ret == 0)
return (0);
- WT_RET_MSG(session, ret,
+
+ WT_PANIC_RET(session, ret,
"%s: handle-sync-nowait: sync_file_range", file_handle->name);
}
#endif
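
Taken together, the os_fs.c changes implement one policy: issue each flush
exactly once (WT_SYSCALL instead of WT_SYSCALL_RETRY) and treat any failure as
fatal, because after a failed fsync the kernel may already have discarded the
dirty pages and marked them clean, so a retry can "succeed" without the data
being durable. Outside WiredTiger the same idea looks roughly like this
(sync_or_die is an illustrative name, not a WiredTiger function):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/*
 * sync_or_die --
 *     Flush a file descriptor once; on failure, halt rather than retry.
 */
static void
sync_or_die(int fd, const char *name)
{
    /* A single fsync call: no retry loop, even for transient errors. */
    if (fsync(fd) == 0)
        return;

    fprintf(stderr, "%s: fsync: %s\n", name, strerror(errno));
    abort();    /* Stop before further writes build on possibly-lost data. */
}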
diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c
index c77d51f5df0..e2ecda66827 100644
--- a/src/third_party/wiredtiger/test/format/config.c
+++ b/src/third_party/wiredtiger/test/format/config.c
@@ -29,6 +29,7 @@
#include "format.h"
#include "config.h"
+static void config_cache(void);
static void config_checkpoint(void);
static void config_checksum(void);
static void config_compression(const char *);
@@ -59,6 +60,7 @@ void
config_setup(void)
{
CONFIG *cp;
+ char buf[128];
/* Clear any temporary values. */
config_reset();
@@ -135,14 +137,20 @@ config_setup(void)
continue;
/*
- * Boolean flags are 0 or 1, but only set N in 100 where the
- * variable's min value is N. Set the flag if we rolled >=
- * the min, 0 otherwise.
+ * Boolean flags are 0 or 1, where the variable's "min" value
+ * is the percent chance the flag is "on" (so "on" if random
+ * rolled <= N, otherwise "off").
*/
if (F_ISSET(cp, C_BOOL))
- *cp->v = mmrand(NULL, 1, 100) <= cp->min ? 1 : 0;
+ testutil_check(__wt_snprintf(buf, sizeof(buf),
+ "%s=%s",
+ cp->name,
+ mmrand(NULL, 1, 100) <= cp->min ? "on" : "off"));
else
- *cp->v = mmrand(NULL, cp->min, cp->maxrand);
+ testutil_check(__wt_snprintf(buf, sizeof(buf),
+ "%s=%" PRIu32,
+ cp->name, mmrand(NULL, cp->min, cp->maxrand)));
+ config_single(buf, 0);
}
/* Required shared libraries. */
@@ -174,21 +182,7 @@ config_setup(void)
config_lrt();
config_pct();
config_prepare();
-
- /*
- * If this is an LSM run, ensure cache size sanity.
- * Ensure there is at least 1MB of cache per thread.
- */
- if (!config_is_perm("cache")) {
- if (DATASOURCE("lsm"))
- g.c_cache = 30 * g.c_chunk_size;
- if (g.c_cache < g.c_threads)
- g.c_cache = g.c_threads;
- }
-
- /* Check if a minimum cache size has been specified. */
- if (g.c_cache_minimum != 0 && g.c_cache < g.c_cache_minimum)
- g.c_cache = g.c_cache_minimum;
+ config_cache();
/*
* Turn off truncate for LSM runs (some configurations with truncate
@@ -253,6 +247,77 @@ config_setup(void)
}
/*
+ * config_cache --
+ * Cache configuration.
+ */
+static void
+config_cache(void)
+{
+ uint32_t max_dirty_bytes, required;
+
+ /* Page sizes are powers-of-two for bad historic reasons. */
+ g.intl_page_max = 1U << g.c_intl_page_max;
+ g.leaf_page_max = 1U << g.c_leaf_page_max;
+
+ if (config_is_perm("cache")) {
+ if (config_is_perm("cache_minimum") &&
+ g.c_cache_minimum != 0 && g.c_cache < g.c_cache_minimum)
+ testutil_die(EINVAL,
+ "minimum cache set larger than cache "
+ "(%" PRIu32 " > %" PRIu32 ")",
+ g.c_cache_minimum, g.c_cache);
+ return;
+ }
+
+ /* Check if a minimum cache size has been specified. */
+ if (g.c_cache_minimum != 0 && g.c_cache < g.c_cache_minimum)
+ g.c_cache = g.c_cache_minimum;
+
+ /* Ensure there is at least 1MB of cache per thread. */
+ if (g.c_cache < g.c_threads)
+ g.c_cache = g.c_threads;
+
+ /*
+ * Maximum internal/leaf page size sanity.
+ *
+ * Ensure we can service at least one operation per-thread concurrently
+ * without filling the cache with pinned pages, that is, every thread
+ * consuming an internal page and a leaf page. Page-size configurations
+ * control on-disk sizes and in-memory pages are often larger than their
+ * disk counterparts, so it's hard to translate from one to the other.
+ * Use a size-adjustment multiplier as an estimate.
+ *
+ * Assuming all of those pages are dirty, don't let the maximum dirty
+ * bytes exceed 40% of the cache (the default eviction trigger is 20%).
+ */
+#define SIZE_ADJUSTMENT 3
+ for (;;) {
+ max_dirty_bytes = ((g.c_cache * WT_MEGABYTE) / 10) * 4;
+ if (SIZE_ADJUSTMENT * g.c_threads *
+ (g.intl_page_max + g.leaf_page_max) <= max_dirty_bytes)
+ break;
+ ++g.c_cache;
+ }
+
+ /*
+ * Ensure cache size sanity for LSM runs. An LSM tree open requires 3
+ * chunks plus a page for each participant in up to three concurrent
+ * merges. Integrate a thread count into that calculation by requiring
+ * 3 chunks/pages per configured thread. That might be overkill, but
+ * LSM runs are more sensitive to small caches than other runs, and a
+ * generous cache avoids stalls we're not interested in chasing.
+ */
+ if (DATASOURCE("lsm")) {
+ required = WT_LSM_TREE_MINIMUM_SIZE(
+ g.c_chunk_size * WT_MEGABYTE,
+ g.c_threads * g.c_merge_max, g.c_threads * g.leaf_page_max);
+ required = (required + (WT_MEGABYTE - 1)) / WT_MEGABYTE;
+ if (g.c_cache < required)
+ g.c_cache = required;
+ }
+}
+
+/*
* config_checkpoint --
* Checkpoint configuration.
*/
@@ -900,17 +965,6 @@ config_single(const char *s, int perm)
++ep;
if (F_ISSET(cp, C_STRING)) {
- if (strncmp(s, "data_source", strlen("data_source")) == 0 &&
- strncmp("file", ep, strlen("file")) != 0 &&
- strncmp("helium", ep, strlen("helium")) != 0 &&
- strncmp("kvsbdb", ep, strlen("kvsbdb")) != 0 &&
- strncmp("lsm", ep, strlen("lsm")) != 0 &&
- strncmp("table", ep, strlen("table")) != 0) {
- fprintf(stderr,
- "Invalid data source option: %s\n", ep);
- exit(EXIT_FAILURE);
- }
-
/*
* Free the previous setting if a configuration has been
* passed in twice.
@@ -926,12 +980,22 @@ config_single(const char *s, int perm)
} else if (strncmp(s, "checksum", strlen("checksum")) == 0) {
config_map_checksum(ep, &g.c_checksum_flag);
*cp->vstr = dstrdup(ep);
- } else if (strncmp(
- s, "compression", strlen("compression")) == 0) {
+ } else if (strncmp(s,
+ "compression", strlen("compression")) == 0) {
config_map_compression(ep, &g.c_compression_flag);
*cp->vstr = dstrdup(ep);
- } else if (strncmp(
- s, "encryption", strlen("encryption")) == 0) {
+ } else if (strncmp(s,
+ "data_source", strlen("data_source")) == 0 &&
+ strncmp("file", ep, strlen("file")) != 0 &&
+ strncmp("helium", ep, strlen("helium")) != 0 &&
+ strncmp("kvsbdb", ep, strlen("kvsbdb")) != 0 &&
+ strncmp("lsm", ep, strlen("lsm")) != 0 &&
+ strncmp("table", ep, strlen("table")) != 0) {
+ fprintf(stderr,
+ "Invalid data source option: %s\n", ep);
+ exit(EXIT_FAILURE);
+ } else if (strncmp(s,
+ "encryption", strlen("encryption")) == 0) {
config_map_encryption(ep, &g.c_encryption_flag);
*cp->vstr = dstrdup(ep);
} else if (strncmp(s, "file_type", strlen("file_type")) == 0) {
@@ -945,10 +1009,8 @@ config_single(const char *s, int perm)
config_map_compression(ep,
&g.c_logging_compression_flag);
*cp->vstr = dstrdup(ep);
- } else {
- free((void *)*cp->vstr);
+ } else
*cp->vstr = dstrdup(ep);
- }
return;
}
@@ -981,6 +1043,7 @@ config_single(const char *s, int perm)
progname, s, cp->min, cp->maxset);
exit(EXIT_FAILURE);
}
+
*cp->v = v;
}
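
The sizing loop added in config_cache() grows the cache until the pages every
worker thread can pin at once fit within 40% of it, using a multiplier of 3 to
estimate the in-memory footprint of an on-disk page. Extracted into a
standalone sketch (min_cache_mb is an illustrative helper; the constants are
copied from the diff):

#include <inttypes.h>
#include <stdio.h>

#define WT_MEGABYTE     (1024 * 1024)
#define SIZE_ADJUSTMENT 3

/*
 * min_cache_mb --
 *     Smallest cache, in MB, that keeps every thread's pinned internal and
 *     leaf page within 40% of the cache (the dirty-byte bound).
 */
static uint32_t
min_cache_mb(uint32_t threads, uint32_t intl_page_max, uint32_t leaf_page_max)
{
    uint32_t cache_mb, max_dirty_bytes;

    for (cache_mb = 1;; ++cache_mb) {
        max_dirty_bytes = ((cache_mb * WT_MEGABYTE) / 10) * 4;
        if (SIZE_ADJUSTMENT * threads *
            (intl_page_max + leaf_page_max) <= max_dirty_bytes)
            return (cache_mb);
    }
    /* NOTREACHED */
}

int
main(void)
{
    /* Example: 4 threads, 16KB internal pages, 128KB leaf pages. */
    printf("minimum cache: %" PRIu32 "MB\n",
        min_cache_mb(4, 16 * 1024, 128 * 1024));
    return (0);
}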
diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h
index e929eb3207d..eb3ce376423 100644
--- a/src/third_party/wiredtiger/test/format/format.h
+++ b/src/third_party/wiredtiger/test/format/format.h
@@ -264,6 +264,9 @@ typedef struct {
#define ISOLATION_SNAPSHOT 4
u_int c_isolation_flag; /* Isolation flag value */
+ uint32_t intl_page_max; /* Maximum page sizes */
+ uint32_t leaf_page_max;
+
uint64_t key_cnt; /* Keys loaded so far */
uint64_t rows; /* Total rows */
diff --git a/src/third_party/wiredtiger/test/format/wts.c b/src/third_party/wiredtiger/test/format/wts.c
index 4f1d99b78fb..92558ec1f54 100644
--- a/src/third_party/wiredtiger/test/format/wts.c
+++ b/src/third_party/wiredtiger/test/format/wts.c
@@ -355,51 +355,32 @@ wts_init(void)
WT_CONNECTION *conn;
WT_SESSION *session;
size_t max;
- uint32_t maxintlpage, maxintlkey, maxleafpage, maxleafkey, maxleafvalue;
+ uint32_t maxintlkey, maxleafkey, maxleafvalue;
char config[4096], *p;
conn = g.wts_conn;
p = config;
max = sizeof(config);
- /*
- * Ensure that we can service at least one operation per-thread
- * concurrently without filling the cache with pinned pages. We choose
- * a multiplier of three because the max configurations control on disk
- * size and in memory pages are often significantly larger than their
- * disk counterparts. We also apply the default eviction_dirty_trigger
- * of 20% so that workloads don't get stuck with dirty pages in cache.
- */
- maxintlpage = 1U << g.c_intl_page_max;
- maxleafpage = 1U << g.c_leaf_page_max;
- while (3 * g.c_threads * (maxintlpage + maxleafpage) >
- (g.c_cache << 20) / 5) {
- if (maxleafpage <= 512 && maxintlpage <= 512)
- break;
- if (maxintlpage > 512)
- maxintlpage >>= 1;
- if (maxleafpage > 512)
- maxleafpage >>= 1;
- }
CONFIG_APPEND(p,
"key_format=%s,"
"allocation_size=512,%s"
"internal_page_max=%" PRIu32 ",leaf_page_max=%" PRIu32,
(g.type == ROW) ? "u" : "r",
g.c_firstfit ? "block_allocation=first," : "",
- maxintlpage, maxleafpage);
+ g.intl_page_max, g.leaf_page_max);
/*
* Configure the maximum key/value sizes, but leave it as the default
* if we come up with something crazy.
*/
- maxintlkey = mmrand(NULL, maxintlpage / 50, maxintlpage / 40);
+ maxintlkey = mmrand(NULL, g.intl_page_max / 50, g.intl_page_max / 40);
if (maxintlkey > 20)
CONFIG_APPEND(p, ",internal_key_max=%" PRIu32, maxintlkey);
- maxleafkey = mmrand(NULL, maxleafpage / 50, maxleafpage / 40);
+ maxleafkey = mmrand(NULL, g.leaf_page_max / 50, g.leaf_page_max / 40);
if (maxleafkey > 20)
CONFIG_APPEND(p, ",leaf_key_max=%" PRIu32, maxleafkey);
- maxleafvalue = mmrand(NULL, maxleafpage * 10, maxleafpage / 40);
+ maxleafvalue = mmrand(NULL, g.leaf_page_max * 10, g.leaf_page_max / 40);
if (maxleafvalue > 40 && maxleafvalue < 100 * 1024)
CONFIG_APPEND(p, ",leaf_value_max=%" PRIu32, maxleafvalue);