summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Alex Gorrod <alexander.gorrod@mongodb.com> 2017-10-03 17:14:59 +1100
committer: Alex Gorrod <alexander.gorrod@mongodb.com> 2017-10-03 17:14:59 +1100
commit: dafae2d745493a1f6a753ba1f6874fd4d3eacd80 (patch)
tree: 73d639c27c97681dab946ece232b6e2bfb129930 /src
parent: 2164f83d6342428f522ba36e8eed0abdd064014f (diff)
download: mongo-dafae2d745493a1f6a753ba1f6874fd4d3eacd80.tar.gz
Import wiredtiger: 2d781c8cfeb2a1db8bd93e03ba35b302436e4ff3 from branch mongodb-3.4
ref: 31af5d70a8..2d781c8cfe
for: 3.4.10

WT-3263 Allow archive on restart/recovery if clean shutdown
WT-3264 Permanent change to disable logging should eventually remove all logs
WT-3284 tree-walk restart bug
WT-3308 Add statistics tracking around yield loops
WT-3351 Recovery assertion failure: old_lognum < lognum
WT-3406 Reconciliation is choosing reserved records for writing.
WT-3461 Avoid long sleeps when the system clock is adjusted
WT-3470 Avoid a metadata cursor open for table open/drop
WT-3533 eviction handle walk can race with handle re-open
WT-3590 Keep data consistent if writes fail during a clean shutdown
Diffstat (limited to 'src')
-rw-r--r--src/third_party/wiredtiger/build_posix/configure.ac.in38
-rw-r--r--src/third_party/wiredtiger/build_win/wiredtiger_config.h3
-rw-r--r--src/third_party/wiredtiger/dist/filelist1
-rw-r--r--src/third_party/wiredtiger/dist/flags.py1
-rw-r--r--src/third_party/wiredtiger/dist/s_string.ok1
-rw-r--r--src/third_party/wiredtiger/dist/stat_data.py8
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_cursor.c4
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_delete.c9
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_handle.c35
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_random.c3
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_walk.c121
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_api.c35
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_dhandle.c15
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_log.c6
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_open.c3
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_lru.c30
-rw-r--r--src/third_party/wiredtiger/src/include/btmem.h4
-rw-r--r--src/third_party/wiredtiger/src/include/btree.h25
-rw-r--r--src/third_party/wiredtiger/src/include/connection.h7
-rw-r--r--src/third_party/wiredtiger/src/include/extern.h7
-rw-r--r--src/third_party/wiredtiger/src/include/extern_posix.h2
-rw-r--r--src/third_party/wiredtiger/src/include/extern_win.h2
-rw-r--r--src/third_party/wiredtiger/src/include/flags.h5
-rw-r--r--src/third_party/wiredtiger/src/include/misc.i39
-rw-r--r--src/third_party/wiredtiger/src/include/stat.h8
-rw-r--r--src/third_party/wiredtiger/src/include/txn.i10
-rw-r--r--src/third_party/wiredtiger/src/include/wiredtiger.in76
-rw-r--r--src/third_party/wiredtiger/src/log/log.c54
-rw-r--r--src/third_party/wiredtiger/src/log/log_slot.c20
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_manager.c4
-rw-r--r--src/third_party/wiredtiger/src/meta/meta_table.c13
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c33
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_time.c27
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_time.c15
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_write.c28
-rw-r--r--src/third_party/wiredtiger/src/schema/schema_open.c39
-rw-r--r--src/third_party/wiredtiger/src/session/session_dhandle.c5
-rw-r--r--src/third_party/wiredtiger/src/support/err.c13
-rw-r--r--src/third_party/wiredtiger/src/support/stat.c29
-rw-r--r--src/third_party/wiredtiger/src/support/time.c89
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_log.c22
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_recover.c28
-rw-r--r--src/third_party/wiredtiger/test/recovery/random-abort.c4
-rw-r--r--src/third_party/wiredtiger/test/suite/test_bug018.py98
-rw-r--r--src/third_party/wiredtiger/test/suite/test_txn02.py1
-rw-r--r--src/third_party/wiredtiger/test/suite/test_txn05.py11
-rw-r--r--src/third_party/wiredtiger/test/suite/test_txn09.py4
-rw-r--r--src/third_party/wiredtiger/test/suite/test_txn16.py140
49 files changed, 897 insertions, 280 deletions
diff --git a/src/third_party/wiredtiger/build_posix/configure.ac.in b/src/third_party/wiredtiger/build_posix/configure.ac.in
index 0fef587b4b8..415545a0d56 100644
--- a/src/third_party/wiredtiger/build_posix/configure.ac.in
+++ b/src/third_party/wiredtiger/build_posix/configure.ac.in
@@ -160,6 +160,44 @@ AS_CASE([$host_os], [darwin*], [], [AC_CHECK_FUNCS([fdatasync])])
# the generic declaration in AC_CHECK_FUNCS is incompatible.
AX_FUNC_POSIX_MEMALIGN
+# Check for POSIX condition variables with monotonic clock support
+AC_CACHE_CHECK([for condition waits with monotonic clock support],
+ [wt_cv_pthread_cond_monotonic],
+ [AC_RUN_IFELSE([AC_LANG_SOURCE([[
+#include <errno.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <time.h>
+
+int main()
+{
+ int ret;
+ pthread_condattr_t condattr;
+ pthread_cond_t cond;
+ pthread_mutex_t mtx;
+ struct timespec ts;
+
+ if ((ret = pthread_condattr_init(&condattr)) != 0) exit(1);
+ if ((ret = pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC)) != 0) exit(1);
+ if ((ret = pthread_cond_init(&cond, &condattr)) != 0) exit(1);
+ if ((ret = pthread_mutex_init(&mtx, NULL)) != 0) exit(1);
+ if ((ret = clock_gettime(CLOCK_MONOTONIC, &ts)) != 0) exit(1);
+ ts.tv_sec += 1;
+ if ((ret = pthread_mutex_lock(&mtx)) != 0) exit(1);
+ if ((ret = pthread_cond_timedwait(&cond, &mtx, &ts)) != 0 && ret != EINTR && ret != ETIMEDOUT) exit(1);
+
+ exit(0);
+}
+ ]])],
+ [wt_pthread_cond_monotonic=yes],
+ [wt_pthread_cond_monotonic=no],
+ [wt_pthread_cond_monotonic=no])])
+AC_MSG_RESULT($wt_pthread_cond_monotonic)
+if test "$wt_pthread_cond_monotonic" = "yes" ; then
+ AC_DEFINE([HAVE_PTHREAD_COND_MONOTONIC], [1],
+ [Define to 1 if pthread condition variables support monotonic clocks.])
+fi
+
AC_SYS_LARGEFILE
AC_C_BIGENDIAN
diff --git a/src/third_party/wiredtiger/build_win/wiredtiger_config.h b/src/third_party/wiredtiger/build_win/wiredtiger_config.h
index 78d2784cb70..8babdbfdc1b 100644
--- a/src/third_party/wiredtiger/build_win/wiredtiger_config.h
+++ b/src/third_party/wiredtiger/build_win/wiredtiger_config.h
@@ -79,6 +79,9 @@
/* Define to 1 if you have the <memory.h> header file. */
/* #undef HAVE_MEMORY_H */
+/* Define to 1 if pthread condition variables support monotonic clocks. */
+/* #undef HAVE_PTHREAD_COND_MONOTONIC */
+
/* Define to 1 if you have the `posix_fadvise' function. */
/* #undef HAVE_POSIX_FADVISE */
diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist
index 5a3348b940a..f53509e96ec 100644
--- a/src/third_party/wiredtiger/dist/filelist
+++ b/src/third_party/wiredtiger/dist/filelist
@@ -191,6 +191,7 @@ src/support/rand.c
src/support/scratch.c
src/support/stat.c
src/support/thread_group.c
+src/support/time.c
src/txn/txn.c
src/txn/txn_ckpt.c
src/txn/txn_ext.c
diff --git a/src/third_party/wiredtiger/dist/flags.py b/src/third_party/wiredtiger/dist/flags.py
index 64b5d789e72..d80c80a37ce 100644
--- a/src/third_party/wiredtiger/dist/flags.py
+++ b/src/third_party/wiredtiger/dist/flags.py
@@ -32,7 +32,6 @@ flags = {
'READ_PREV',
'READ_RESTART_OK',
'READ_SKIP_INTL',
- 'READ_SKIP_LEAF',
'READ_TRUNCATE',
'READ_WONT_NEED',
],
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
index f3852d00ac8..99abc3e9ad1 100644
--- a/src/third_party/wiredtiger/dist/s_string.ok
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -1060,6 +1060,7 @@ rebalancing
recno
recnos
reconfig
+reconfigures
reconfiguring
recsize
rectype
diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py
index 512892eb44d..b66e95ce49b 100644
--- a/src/third_party/wiredtiger/dist/stat_data.py
+++ b/src/third_party/wiredtiger/dist/stat_data.py
@@ -431,11 +431,19 @@ connection_stats = [
##########################################
YieldStat('application_cache_time', 'application thread time waiting for cache (usecs)'),
YieldStat('application_evict_time', 'application thread time evicting (usecs)'),
+ YieldStat('child_modify_blocked_page', 'page reconciliation yielded due to child modification'),
+ YieldStat('conn_close_blocked_lsm', 'connection close yielded for lsm manager shutdown'),
+ YieldStat('dhandle_lock_blocked', 'data handle lock yielded'),
+ YieldStat('log_server_sync_blocked', 'log server sync yielded for log write'),
YieldStat('page_busy_blocked', 'page acquire busy blocked'),
+ YieldStat('page_del_rollback_blocked', 'page delete rollback yielded for instantiation'),
YieldStat('page_forcible_evict_blocked', 'page acquire eviction blocked'),
+ YieldStat('page_index_slot_blocked', 'reference for page index and slot yielded'),
YieldStat('page_locked_blocked', 'page acquire locked blocked'),
YieldStat('page_read_blocked', 'page acquire read blocked'),
YieldStat('page_sleep', 'page acquire time sleeping (usecs)'),
+ YieldStat('tree_descend_blocked', 'tree descend one level yielded for split page index update'),
+ YieldStat('txn_release_blocked', 'connection close blocked waiting for transaction state stabilization'),
]
connection_stats = sorted(connection_stats, key=attrgetter('desc'))
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 8dfbc774419..d063f48ef9d 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -1,5 +1,5 @@
{
- "commit": "31af5d70a87cf1d99c7275bc8bc01d29e2cb0d2a",
+ "commit": "2d781c8cfeb2a1db8bd93e03ba35b302436e4ff3",
"github": "wiredtiger/wiredtiger.git",
"vendor": "wiredtiger",
"branch": "mongodb-3.4"
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index f0aa632551b..95d817850ef 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -156,8 +156,10 @@ __cursor_disable_bulk(WT_SESSION_IMPL *session, WT_BTREE *btree)
* into a tree. Eviction is disabled when an empty tree is opened, and
* it must only be enabled once.
*/
- if (__wt_atomic_cas8(&btree->original, 1, 0))
+ if (__wt_atomic_cas8(&btree->original, 1, 0)) {
+ btree->evict_disabled_open = false;
__wt_evict_file_exclusive_off(session);
+ }
}
/*
diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c
index b55ad291c5e..5c4625044d3 100644
--- a/src/third_party/wiredtiger/src/btree/bt_delete.c
+++ b/src/third_party/wiredtiger/src/btree/bt_delete.c
@@ -153,6 +153,7 @@ void
__wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
{
WT_UPDATE **upd;
+ uint64_t yield_count;
/*
* If the page is still "deleted", it's as we left it, reset the state
@@ -160,7 +161,7 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
* instantiated or being instantiated. Loop because it's possible for
* the page to return to the deleted state if instantiation fails.
*/
- for (;; __wt_yield())
+ for (yield_count = 0;; yield_count++, __wt_yield())
switch (ref->state) {
case WT_REF_DISK:
case WT_REF_READING:
@@ -173,7 +174,7 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
*/
if (__wt_atomic_casv32(
&ref->state, WT_REF_DELETED, WT_REF_DISK))
- return;
+ goto done;
break;
case WT_REF_LOCKED:
/*
@@ -203,8 +204,10 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
*/
__wt_free(session, ref->page_del->update_list);
__wt_free(session, ref->page_del);
- return;
+ goto done;
}
+
+done: WT_STAT_CONN_INCRV(session, page_del_rollback_blocked, yield_count);
}
/*
diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c
index a0da7df0998..1e6405272f7 100644
--- a/src/third_party/wiredtiger/src/btree/bt_handle.c
+++ b/src/third_party/wiredtiger/src/btree/bt_handle.c
@@ -66,7 +66,6 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
size_t root_addr_size;
- uint32_t mask;
uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE];
const char *filename;
bool creation, forced_salvage, readonly;
@@ -75,15 +74,14 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
dhandle = session->dhandle;
/*
- * This may be a re-open of an underlying object and we have to clean
- * up. We can't clear the operation flags, however, they're set by the
- * connection handle software that called us.
+ * This may be a re-open, clean up the btree structure.
+ * Clear the fields that don't persist across a re-open.
+ * Clear all flags other than the operation flags (which are set by the
+ * connection handle software that called us).
*/
WT_RET(__btree_clear(session));
-
- mask = F_MASK(btree, WT_BTREE_SPECIAL_FLAGS);
- memset(btree, 0, sizeof(*btree));
- btree->flags = mask;
+ memset(btree, 0, WT_BTREE_CLEAR_SIZE);
+ F_CLR(btree, ~WT_BTREE_SPECIAL_FLAGS);
/* Set the data handle first, our called functions reasonably use it. */
btree->dhandle = dhandle;
@@ -185,13 +183,19 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
*
* Files that can still be bulk-loaded cannot be evicted.
* Permanently cache-resident files can never be evicted.
- * Special operations don't enable eviction. (The underlying commands
- * may turn on eviction, but it's their decision.)
+ * Special operations don't enable eviction. The underlying commands may
+ * turn on eviction (for example, verify turns on eviction while working
+ * a file to keep from consuming the cache), but it's their decision. If
+ * an underlying command reconfigures eviction, it must either clear the
+ * evict-disabled-open flag or restore the eviction configuration when
+ * finished so that handle close behaves correctly.
*/
if (btree->original ||
F_ISSET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_REBALANCE |
- WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
+ WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) {
WT_ERR(__wt_evict_file_exclusive_on(session));
+ btree->evict_disabled_open = true;
+ }
if (0) {
err: WT_TRET(__wt_btree_close(session));
@@ -228,6 +232,15 @@ __wt_btree_close(WT_SESSION_IMPL *session)
return (0);
F_SET(btree, WT_BTREE_CLOSED);
+ /*
+ * If we turned eviction off and never turned it back on, do that now,
+ * otherwise the counter will be off.
+ */
+ if (btree->evict_disabled_open) {
+ btree->evict_disabled_open = false;
+ __wt_evict_file_exclusive_off(session);
+ }
+
/* Discard any underlying block manager resources. */
if ((bm = btree->bm) != NULL) {
btree->bm = NULL;
diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c
index c5948ec4ab5..b4f05c440ba 100644
--- a/src/third_party/wiredtiger/src/btree/bt_random.c
+++ b/src/third_party/wiredtiger/src/btree/bt_random.c
@@ -395,8 +395,7 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
*/
for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) {
n = skip;
- WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip,
- WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED));
+ WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip));
if (n == skip) {
if (skip == 0)
break;
diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c
index 86484feb7c9..c22b99c55d0 100644
--- a/src/third_party/wiredtiger/src/btree/bt_walk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_walk.c
@@ -18,9 +18,16 @@ __ref_index_slot(WT_SESSION_IMPL *session,
{
WT_PAGE_INDEX *pindex;
WT_REF **start, **stop, **p, **t;
+ uint64_t yield_count;
uint32_t entries, slot;
- for (;;) {
+ /*
+ * If we don't find our reference, the page split and our home
+ * pointer references the wrong page. When internal pages
+ * split, their WT_REF structure home values are updated; yield
+ * and wait for that to happen.
+ */
+ for (yield_count = 0;; yield_count++, __wt_yield()) {
/*
* Copy the parent page's index value: the page can split at
* any time, but the index's value is always valid, even if
@@ -59,18 +66,13 @@ __ref_index_slot(WT_SESSION_IMPL *session,
}
}
- /*
- * If we don't find our reference, the page split and our home
- * pointer references the wrong page. When internal pages
- * split, their WT_REF structure home values are updated; yield
- * and wait for that to happen.
- */
- __wt_yield();
}
found: WT_ASSERT(session, pindex->index[slot] == ref);
*pindexp = pindex;
*slotp = slot;
+
+ WT_STAT_CONN_INCRV(session, page_index_slot_blocked, yield_count);
}
/*
@@ -177,12 +179,13 @@ __ref_descend_prev(
WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp)
{
WT_PAGE_INDEX *pindex;
+ uint64_t yield_count;
/*
* We're passed a child page into which we're descending, and on which
* we have a hazard pointer.
*/
- for (;; __wt_yield()) {
+ for (yield_count = 0;; yield_count++, __wt_yield()) {
/*
* There's a split race when a cursor moving backwards through
* the tree descends the tree. If we're splitting an internal
@@ -242,6 +245,7 @@ __ref_descend_prev(
break;
}
*pindexp = pindex;
+ WT_STAT_CONN_INCRV(session, tree_descend_blocked, yield_count);
}
/*
@@ -497,29 +501,21 @@ restart: /*
}
/*
- * Optionally skip leaf pages: skip all leaf pages if
- * WT_READ_SKIP_LEAF is set, when the skip-leaf-count
- * variable is non-zero, skip some count of leaf pages.
- * If this page is disk-based, crack the cell to figure
- * out it's a leaf page without reading it.
+ * Optionally skip leaf pages: when the skip-leaf-count
+ * variable is non-zero, skip some count of leaf pages,
+ * then take the next leaf page we can.
*
- * If skipping some number of leaf pages, decrement the
- * count of pages to zero, and then take the next leaf
- * page we can. Be cautious around the page decrement,
- * if for some reason don't take this particular page,
- * we can take the next one, and, there are additional
- * tests/decrements when we're about to return a leaf
- * page.
+ * The reason to do some of this work here (rather than
+ * in our caller), is because we can look at the cell
+ * and know it's a leaf page without reading it into
+ * memory. If this page is disk-based, crack the cell
+ * to figure out it's a leaf page without reading it.
*/
- if (skipleafcntp != NULL || LF_ISSET(WT_READ_SKIP_LEAF))
- if (__ref_is_leaf(ref)) {
- if (LF_ISSET(WT_READ_SKIP_LEAF))
- break;
- if (*skipleafcntp > 0) {
- --*skipleafcntp;
- break;
- }
- }
+ if (skipleafcntp != NULL &&
+ *skipleafcntp > 0 && __ref_is_leaf(ref)) {
+ --*skipleafcntp;
+ break;
+ }
ret = __wt_page_swap(session, couple, ref,
WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags);
@@ -626,34 +622,18 @@ descend: empty_internal = true;
session, ref, &pindex);
slot = pindex->entries - 1;
}
- } else {
- /*
- * At the lowest tree level (considering a leaf
- * page), turn off the initial-descent state.
- * Descent race tests are different when moving
- * through the tree vs. the initial descent.
- */
- initial_descent = false;
-
- /*
- * Optionally skip leaf pages, the second half.
- * We didn't have an on-page cell to figure out
- * if it was a leaf page, we had to acquire the
- * hazard pointer and look at the page.
- */
- if (skipleafcntp != NULL ||
- LF_ISSET(WT_READ_SKIP_LEAF)) {
- if (LF_ISSET(WT_READ_SKIP_LEAF))
- break;
- if (*skipleafcntp > 0) {
- --*skipleafcntp;
- break;
- }
- }
-
- *refp = ref;
- goto done;
+ continue;
}
+
+ /*
+ * The tree-walk restart code knows we return any leaf
+ * page we acquire (never hazard-pointer coupling on
+ * after acquiring a leaf page), and asserts no restart
+ * happens while holding a leaf page. This page must be
+ * returned to our caller.
+ */
+ *refp = ref;
+ goto done;
}
}
@@ -690,8 +670,29 @@ __wt_tree_walk_count(WT_SESSION_IMPL *session,
* of leaf pages before returning.
*/
int
-__wt_tree_walk_skip(WT_SESSION_IMPL *session,
- WT_REF **refp, uint64_t *skipleafcntp, uint32_t flags)
+__wt_tree_walk_skip(
+ WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp)
{
- return (__tree_walk_internal(session, refp, NULL, skipleafcntp, flags));
+ /*
+ * Optionally skip leaf pages, the second half. The tree-walk function
+ * didn't have an on-page cell it could use to figure out if the page
+ * was a leaf page or not, it had to acquire the hazard pointer and look
+ * at the page. The tree-walk code never acquires a hazard pointer on a
+ * leaf page without returning it, and it's not trivial to change that.
+ * So, the tree-walk code returns all leaf pages here and we deal with
+ * decrementing the count.
+ */
+ do {
+ WT_RET(__tree_walk_internal(session, refp, NULL, skipleafcntp,
+ WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED));
+
+ /*
+ * The walk skipped internal pages, any page returned must be a
+ * leaf page.
+ */
+ if (*skipleafcntp > 0)
+ --*skipleafcntp;
+ } while (*skipleafcntp > 0);
+
+ return (0);
}
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
index 68d45678965..00d559881dc 100644
--- a/src/third_party/wiredtiger/src/conn/conn_api.c
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -1086,6 +1086,41 @@ err: /*
WT_TRET(wt_session->close(wt_session, config));
}
+ /*
+ * Perform a system-wide checkpoint so that all tables are consistent
+ * with each other. Do this before shutting down all the subsystems.
+ * We have shut down all user sessions, but send in true for waiting
+ * for internal races.
+ */
+ if (!F_ISSET(conn, WT_CONN_IN_MEMORY | WT_CONN_READONLY)) {
+ s = NULL;
+ WT_TRET(__wt_open_internal_session(
+ conn, "close_ckpt", true, 0, &s));
+ if (s != NULL) {
+ const char *checkpoint_cfg[] = {
+ WT_CONFIG_BASE(session, WT_SESSION_checkpoint),
+ NULL
+ };
+ wt_session = &s->iface;
+ WT_TRET(__wt_txn_checkpoint(s, checkpoint_cfg, true));
+
+ /*
+ * Mark the metadata dirty so we flush it on close,
+ * allowing recovery to be skipped.
+ */
+ WT_WITH_DHANDLE(s, WT_SESSION_META_DHANDLE(s),
+ __wt_tree_modify_set(s));
+
+ WT_TRET(wt_session->close(wt_session, config));
+ }
+ }
+
+ if (ret != 0) {
+ __wt_err(session, ret,
+ "failure during close, disabling further writes");
+ F_SET(conn, WT_CONN_PANIC);
+ }
+
WT_TRET(__wt_connection_close(conn));
/* We no longer have a session, don't try to update it. */
diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
index 1816e66b0b7..2560ca47268 100644
--- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c
+++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
@@ -317,6 +317,9 @@ __wt_conn_btree_open(
WT_ASSERT(session,
!F_ISSET(S2C(session), WT_CONN_CLOSING_NO_MORE_OPENS));
+ /* Turn off eviction. */
+ WT_RET(__wt_evict_file_exclusive_on(session));
+
/*
* If the handle is already open, it has to be closed so it can be
* reopened with a new configuration.
@@ -330,7 +333,7 @@ __wt_conn_btree_open(
* in the tree that can block the close.
*/
if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
- WT_RET(__wt_conn_btree_sync_and_close(session, false, false));
+ WT_ERR(__wt_conn_btree_sync_and_close(session, false, false));
/* Discard any previous configuration, set up the new configuration. */
__conn_btree_config_clear(session);
@@ -374,6 +377,8 @@ __wt_conn_btree_open(
err: F_CLR(btree, WT_BTREE_SPECIAL_FLAGS);
}
+ __wt_evict_file_exclusive_off(session);
+
return (ret);
}
@@ -673,8 +678,8 @@ restart:
continue;
WT_WITH_DHANDLE(session, dhandle,
- WT_TRET(__wt_conn_dhandle_discard_single(
- session, true, F_ISSET(conn, WT_CONN_IN_MEMORY))));
+ WT_TRET(__wt_conn_dhandle_discard_single(session, true,
+ F_ISSET(conn, WT_CONN_IN_MEMORY | WT_CONN_PANIC))));
goto restart;
}
@@ -699,8 +704,8 @@ restart:
/* Close the metadata file handle. */
while ((dhandle = TAILQ_FIRST(&conn->dhqh)) != NULL)
WT_WITH_DHANDLE(session, dhandle,
- WT_TRET(__wt_conn_dhandle_discard_single(
- session, true, F_ISSET(conn, WT_CONN_IN_MEMORY))));
+ WT_TRET(__wt_conn_dhandle_discard_single(session, true,
+ F_ISSET(conn, WT_CONN_IN_MEMORY | WT_CONN_PANIC))));
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c
index d2ed314fd2e..bee1b0443f5 100644
--- a/src/third_party/wiredtiger/src/conn/conn_log.c
+++ b/src/third_party/wiredtiger/src/conn/conn_log.c
@@ -375,6 +375,7 @@ __log_file_server(void *arg)
WT_LOG *log;
WT_LSN close_end_lsn, min_lsn;
WT_SESSION_IMPL *session;
+ uint64_t yield_count;
uint32_t filenum;
bool locked;
@@ -382,6 +383,7 @@ __log_file_server(void *arg)
conn = S2C(session);
log = conn->log;
locked = false;
+ yield_count = 0;
while (F_ISSET(conn, WT_CONN_SERVER_LOG)) {
/*
* If there is a log file to close, make sure any outstanding
@@ -512,6 +514,7 @@ __log_file_server(void *arg)
* thread a chance to run and try again in
* this case.
*/
+ yield_count++;
__wt_yield();
continue;
}
@@ -524,6 +527,7 @@ __log_file_server(void *arg)
if (0) {
err: __wt_err(session, ret, "log close server error");
}
+ WT_STAT_CONN_INCRV(session, log_server_sync_blocked, yield_count);
if (locked)
__wt_spin_unlock(session, &log->log_sync_lock);
return (WT_THREAD_RET_VALUE);
@@ -902,7 +906,7 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET(__wt_cond_alloc(session, "log sync", &log->log_sync_cond));
WT_RET(__wt_cond_alloc(session, "log write", &log->log_write_cond));
WT_RET(__wt_log_open(session));
- WT_RET(__wt_log_slot_init(session));
+ WT_RET(__wt_log_slot_init(session, true));
return (0);
}
diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c
index eb3c79422a0..649bfa7c81f 100644
--- a/src/third_party/wiredtiger/src/conn/conn_open.c
+++ b/src/third_party/wiredtiger/src/conn/conn_open.c
@@ -91,6 +91,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
if (txn_global->oldest_id == txn_global->current &&
txn_global->metadata_pinned == txn_global->current)
break;
+ WT_STAT_CONN_INCR(session, txn_release_blocked);
__wt_yield();
}
@@ -143,7 +144,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
* conditional because we allocate the log path so that printlog can
* run without running logging or recovery.
*/
- if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
+ if (ret == 0 && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE))
WT_TRET(__wt_txn_checkpoint_log(
session, true, WT_TXN_LOG_CKPT_STOP, NULL));
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index fdf68841b85..b9b1a7783fe 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -1452,26 +1452,23 @@ retry: while (slot < max_entries) {
/*
* Re-check the "no eviction" flag, used to enforce exclusive
- * access when a handle is being closed. If not set, remember
- * the file to visit first, next loop.
+ * access when a handle is being closed.
*
* Only try to acquire the lock and simply continue if we fail;
* the lock is held while the thread turning off eviction clears
* the tree's current eviction point, and part of the process is
* waiting on this thread to acknowledge that action.
+ *
+ * If a handle is being discarded, it will still be marked open,
+ * but won't have a root page.
*/
if (btree->evict_disabled == 0 &&
!__wt_spin_trylock(session, &cache->evict_walk_lock)) {
- if (btree->evict_disabled == 0) {
+ if (btree->evict_disabled == 0 &&
+ btree->root.page != NULL) {
/*
- * Assert the handle has a root page: eviction
- * should have been locked out if the tree is
- * being discarded or the root page is changing.
- * As this has not always been the case, assert
- * to debug that change.
+ * Remember the file to visit first, next loop.
*/
- WT_ASSERT(session, btree->root.page != NULL);
-
cache->evict_file_next = dhandle;
WT_WITH_DHANDLE(session, dhandle,
ret = __evict_walk_file(
@@ -1860,6 +1857,10 @@ fast: /* If the page can't be evicted, give up. */
WT_STAT_CONN_INCRV(
session, cache_eviction_pages_queued, (u_int)(evict - start));
+ __wt_verbose(session, WT_VERB_EVICTSERVER,
+ "%s walk: seen %" PRIu64 ", queued %" PRIu64,
+ session->dhandle->name, pages_seen, pages_queued);
+
/*
* If we couldn't find the number of pages we were looking for, skip
* the tree next time.
@@ -2442,14 +2443,23 @@ __wt_verbose_dump_cache(WT_SESSION_IMPL *session)
WT_CONNECTION_IMPL *conn;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
+ u_int pct;
uint64_t total_bytes, total_dirty_bytes;
conn = S2C(session);
total_bytes = total_dirty_bytes = 0;
+ pct = 0; /* [-Werror=uninitialized] */
WT_RET(__wt_msg(session, "%s", WT_DIVIDER));
WT_RET(__wt_msg(session, "cache dump"));
+ WT_RET(__wt_msg(session,
+ "cache full: %s", __wt_cache_full(session) ? "yes" : "no"));
+ WT_RET(__wt_msg(session, "cache clean check: %s (%u%%)",
+ __wt_eviction_clean_needed(session, &pct) ? "yes" : "no", pct));
+ WT_RET(__wt_msg(session, "cache dirty check: %s (%u%%)",
+ __wt_eviction_dirty_needed(session, &pct) ? "yes" : "no", pct));
+
for (dhandle = NULL;;) {
WT_WITH_HANDLE_LIST_READ_LOCK(session,
WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q));
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index d0b21b17965..e965724dffe 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -714,7 +714,7 @@ struct __wt_page {
* Related information for fast-delete, on-disk pages.
*/
struct __wt_page_deleted {
- uint64_t txnid; /* Transaction ID */
+ volatile uint64_t txnid; /* Transaction ID */
WT_UPDATE **update_list; /* List of updates for abort */
};
@@ -904,7 +904,7 @@ struct __wt_ikey {
* list.
*/
WT_PACKED_STRUCT_BEGIN(__wt_update)
- uint64_t txnid; /* update transaction */
+ volatile uint64_t txnid; /* Transaction ID */
WT_UPDATE *next; /* forward-linked list */
diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h
index 28fe1b94b23..8712a404b13 100644
--- a/src/third_party/wiredtiger/src/include/btree.h
+++ b/src/third_party/wiredtiger/src/include/btree.h
@@ -142,12 +142,30 @@ struct __wt_btree {
uint64_t bytes_dirty_intl; /* Bytes in dirty internal pages. */
uint64_t bytes_dirty_leaf; /* Bytes in dirty leaf pages. */
+ /*
+ * We flush pages from the tree (in order to make checkpoint faster),
+ * without a high-level lock. To avoid multiple threads flushing at
+ * the same time, lock the tree.
+ */
+ WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */
+
+ /*
+ * All of the following fields live at the end of the structure so it's
+ * easier to clear everything but the fields that persist.
+ */
+#define WT_BTREE_CLEAR_SIZE (offsetof(WT_BTREE, evict_ref))
+
+ /*
+ * Eviction information is maintained in the btree handle, but owned by
+ * eviction, not the btree code.
+ */
WT_REF *evict_ref; /* Eviction thread's location */
uint64_t evict_priority; /* Relative priority of cached pages */
u_int evict_walk_period; /* Skip this many LRU walks */
u_int evict_walk_saved; /* Saved walk skips for checkpoints */
u_int evict_walk_skips; /* Number of walks skipped */
int evict_disabled; /* Eviction disabled count */
+ bool evict_disabled_open;/* Eviction disabled on open */
volatile uint32_t evict_busy; /* Count of threads in eviction */
int evict_start_type; /* Start position for eviction walk
(see WT_EVICT_WALK_START). */
@@ -155,13 +173,6 @@ struct __wt_btree {
WT_CKPT_OFF, WT_CKPT_PREPARE, WT_CKPT_RUNNING
} checkpointing; /* Checkpoint in progress */
- /*
- * We flush pages from the tree (in order to make checkpoint faster),
- * without a high-level lock. To avoid multiple threads flushing at
- * the same time, lock the tree.
- */
- WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */
-
/* Flags values up to 0xff are reserved for WT_DHANDLE_* */
#define WT_BTREE_ALLOW_SPLITS 0x000100 /* Allow splits, even with no evict */
#define WT_BTREE_BULK 0x000200 /* Bulk-load handle */
diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h
index f74732684f5..74611de1131 100644
--- a/src/third_party/wiredtiger/src/include/connection.h
+++ b/src/third_party/wiredtiger/src/include/connection.h
@@ -314,9 +314,10 @@ struct __wt_connection_impl {
#define WT_CONN_LOG_ARCHIVE 0x01 /* Archive is enabled */
#define WT_CONN_LOG_ENABLED 0x02 /* Logging is enabled */
#define WT_CONN_LOG_EXISTED 0x04 /* Log files found */
-#define WT_CONN_LOG_RECOVER_DONE 0x08 /* Recovery completed */
-#define WT_CONN_LOG_RECOVER_ERR 0x10 /* Error if recovery required */
-#define WT_CONN_LOG_ZERO_FILL 0x20 /* Manually zero files */
+#define WT_CONN_LOG_RECOVER_DIRTY 0x08 /* Recovering unclean */
+#define WT_CONN_LOG_RECOVER_DONE 0x10 /* Recovery completed */
+#define WT_CONN_LOG_RECOVER_ERR 0x20 /* Error if recovery required */
+#define WT_CONN_LOG_ZERO_FILL 0x40 /* Manually zero files */
uint32_t log_flags; /* Global logging configuration */
WT_CONDVAR *log_cond; /* Log server wait mutex */
WT_SESSION_IMPL *log_session; /* Log server session */
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 12233c0247a..e77de41344c 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -181,7 +181,7 @@ extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, cons
extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *tag, WT_ITEM *buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_tree_walk_count(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_tree_walk_skip(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_tree_walk_skip( WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove, bool exclusive) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -369,6 +369,7 @@ extern int __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, bo
extern void __wt_log_written_reset(WT_SESSION_IMPL *session);
extern int __wt_log_get_all_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp, uint32_t *maxid, bool active_only) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_log_extract_lognum( WT_SESSION_IMPL *session, const char *name, uint32_t *id) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_log_reset(WT_SESSION_IMPL *session, uint32_t lognum) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_log_allocfile( WT_SESSION_IMPL *session, uint32_t lognum, const char *dest) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_log_remove(WT_SESSION_IMPL *session, const char *file_prefix, uint32_t lognum) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -405,7 +406,7 @@ extern int __wt_logop_row_truncate_print(WT_SESSION_IMPL *session, const uint8_t
extern int __wt_txn_op_printlog(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
extern int __wt_log_slot_switch(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool retry, bool forced, bool *did_work) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_log_slot_init(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_log_slot_init(WT_SESSION_IMPL *session, bool alloc) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_log_slot_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int64_t __wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size);
@@ -732,6 +733,8 @@ extern int __wt_thread_group_create( WT_SESSION_IMPL *session, WT_THREAD_GROUP *
extern int __wt_thread_group_destroy(WT_SESSION_IMPL *session, WT_THREAD_GROUP *group) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_thread_group_start_one( WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, bool wait) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_thread_group_stop_one( WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, bool wait) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
+extern void __wt_seconds(WT_SESSION_IMPL *session, time_t *timep);
extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session);
extern void __wt_txn_get_snapshot(WT_SESSION_IMPL *session);
extern int __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
diff --git a/src/third_party/wiredtiger/src/include/extern_posix.h b/src/third_party/wiredtiger/src/include/extern_posix.h
index c0ed056c7b6..9e32e86e64c 100644
--- a/src/third_party/wiredtiger/src/include/extern_posix.h
+++ b/src/third_party/wiredtiger/src/include/extern_posix.h
@@ -28,5 +28,5 @@ extern int __wt_vsnprintf_len_incr( char *buf, size_t size, size_t *retsizep, co
extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_thread_id(char *buf, size_t buflen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
+extern void __wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp);
extern void __wt_yield(void) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
diff --git a/src/third_party/wiredtiger/src/include/extern_win.h b/src/third_party/wiredtiger/src/include/extern_win.h
index d548ee0b2ec..85db8175615 100644
--- a/src/third_party/wiredtiger/src/include/extern_win.h
+++ b/src/third_party/wiredtiger/src/include/extern_win.h
@@ -26,7 +26,7 @@ extern int __wt_vsnprintf_len_incr( char *buf, size_t size, size_t *retsizep, co
extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_thread_id(char *buf, size_t buflen) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp);
+extern void __wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp);
extern int __wt_to_utf16_string( WT_SESSION_IMPL *session, const char*utf8, WT_ITEM **outbuf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_to_utf8_string( WT_SESSION_IMPL *session, const wchar_t*wide, WT_ITEM **outbuf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern DWORD __wt_getlasterror(void);
diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h
index f26a45c68f5..d7c0e0f9472 100644
--- a/src/third_party/wiredtiger/src/include/flags.h
+++ b/src/third_party/wiredtiger/src/include/flags.h
@@ -47,9 +47,8 @@
#define WT_READ_PREV 0x00000080
#define WT_READ_RESTART_OK 0x00000100
#define WT_READ_SKIP_INTL 0x00000200
-#define WT_READ_SKIP_LEAF 0x00000400
-#define WT_READ_TRUNCATE 0x00000800
-#define WT_READ_WONT_NEED 0x00001000
+#define WT_READ_TRUNCATE 0x00000400
+#define WT_READ_WONT_NEED 0x00000800
#define WT_SESSION_CAN_WAIT 0x00000001
#define WT_SESSION_INTERNAL 0x00000002
#define WT_SESSION_LOCKED_CHECKPOINT 0x00000004
diff --git a/src/third_party/wiredtiger/src/include/misc.i b/src/third_party/wiredtiger/src/include/misc.i
index fad10f01103..eb99de3dcab 100644
--- a/src/third_party/wiredtiger/src/include/misc.i
+++ b/src/third_party/wiredtiger/src/include/misc.i
@@ -41,45 +41,6 @@ __wt_strdup(WT_SESSION_IMPL *session, const char *str, void *retp)
}
/*
- * __wt_seconds --
- * Return the seconds since the Epoch.
- */
-static inline void
-__wt_seconds(WT_SESSION_IMPL *session, time_t *timep)
-{
- struct timespec t;
-
- __wt_epoch(session, &t);
-
- *timep = t.tv_sec;
-}
-
-/*
- * __wt_time_check_monotonic --
- * Check and prevent time running backward. If we detect that it has, we
- * set the time structure to the previous values, making time stand still
- * until we see a time in the future of the highest value seen so far.
- */
-static inline void
-__wt_time_check_monotonic(WT_SESSION_IMPL *session, struct timespec *tsp)
-{
- /*
- * Detect time going backward. If so, use the last
- * saved timestamp.
- */
- if (session == NULL)
- return;
-
- if (tsp->tv_sec < session->last_epoch.tv_sec ||
- (tsp->tv_sec == session->last_epoch.tv_sec &&
- tsp->tv_nsec < session->last_epoch.tv_nsec)) {
- WT_STAT_CONN_INCR(session, time_travel);
- *tsp = session->last_epoch;
- } else
- session->last_epoch = *tsp;
-}
-
-/*
* __wt_verbose --
* Verbose message.
*
diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h
index db48a841571..01e622a5695 100644
--- a/src/third_party/wiredtiger/src/include/stat.h
+++ b/src/third_party/wiredtiger/src/include/stat.h
@@ -475,11 +475,19 @@ struct __wt_connection_stats {
int64_t thread_write_active;
int64_t application_evict_time;
int64_t application_cache_time;
+ int64_t txn_release_blocked;
+ int64_t conn_close_blocked_lsm;
+ int64_t dhandle_lock_blocked;
+ int64_t log_server_sync_blocked;
int64_t page_busy_blocked;
int64_t page_forcible_evict_blocked;
int64_t page_locked_blocked;
int64_t page_read_blocked;
int64_t page_sleep;
+ int64_t page_del_rollback_blocked;
+ int64_t child_modify_blocked_page;
+ int64_t page_index_slot_blocked;
+ int64_t tree_descend_blocked;
int64_t txn_snapshots_created;
int64_t txn_snapshots_dropped;
int64_t txn_begin;
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
index 314c948e4d1..39273a1995c 100644
--- a/src/third_party/wiredtiger/src/include/txn.i
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -148,16 +148,6 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session)
}
/*
- * __wt_txn_committed --
- * Return if a transaction has been committed.
- */
-static inline bool
-__wt_txn_committed(WT_SESSION_IMPL *session, uint64_t id)
-{
- return (WT_TXNID_LT(id, S2C(session)->txn_global.last_running));
-}
-
-/*
* __wt_txn_visible_all --
* Check if a given transaction ID is "globally visible". This is, if
* all sessions in the system will see the transaction ID including the
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index 821efdf5fa1..b2f49ef030d 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -4806,72 +4806,94 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1216
/*! thread-yield: application thread time waiting for cache (usecs) */
#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1217
+/*!
+ * thread-yield: connection close blocked waiting for transaction state
+ * stabilization
+ */
+#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1218
+/*! thread-yield: connection close yielded for lsm manager shutdown */
+#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1219
+/*! thread-yield: data handle lock yielded */
+#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1220
+/*! thread-yield: log server sync yielded for log write */
+#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1221
/*! thread-yield: page acquire busy blocked */
-#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1218
+#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1222
/*! thread-yield: page acquire eviction blocked */
-#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1219
+#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1223
/*! thread-yield: page acquire locked blocked */
-#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1220
+#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1224
/*! thread-yield: page acquire read blocked */
-#define WT_STAT_CONN_PAGE_READ_BLOCKED 1221
+#define WT_STAT_CONN_PAGE_READ_BLOCKED 1225
/*! thread-yield: page acquire time sleeping (usecs) */
-#define WT_STAT_CONN_PAGE_SLEEP 1222
+#define WT_STAT_CONN_PAGE_SLEEP 1226
+/*! thread-yield: page delete rollback yielded for instantiation */
+#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1227
+/*! thread-yield: page reconciliation yielded due to child modification */
+#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1228
+/*! thread-yield: reference for page index and slot yielded */
+#define WT_STAT_CONN_PAGE_INDEX_SLOT_BLOCKED 1229
+/*!
+ * thread-yield: tree descend one level yielded for split page index
+ * update
+ */
+#define WT_STAT_CONN_TREE_DESCEND_BLOCKED 1230
/*! transaction: number of named snapshots created */
-#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1223
+#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1231
/*! transaction: number of named snapshots dropped */
-#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1224
+#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1232
/*! transaction: transaction begins */
-#define WT_STAT_CONN_TXN_BEGIN 1225
+#define WT_STAT_CONN_TXN_BEGIN 1233
/*! transaction: transaction checkpoint currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1226
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1234
/*! transaction: transaction checkpoint generation */
-#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1227
+#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1235
/*! transaction: transaction checkpoint max time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1228
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1236
/*! transaction: transaction checkpoint min time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1229
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1237
/*! transaction: transaction checkpoint most recent time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1230
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1238
/*! transaction: transaction checkpoint scrub dirty target */
-#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1231
+#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1239
/*! transaction: transaction checkpoint scrub time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1232
+#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1240
/*! transaction: transaction checkpoint total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1233
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1241
/*! transaction: transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1234
+#define WT_STAT_CONN_TXN_CHECKPOINT 1242
/*!
* transaction: transaction checkpoints skipped because database was
* clean
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1235
+#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1243
/*! transaction: transaction failures due to cache overflow */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1236
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1244
/*!
* transaction: transaction fsync calls for checkpoint after allocating
* the transaction ID
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1237
+#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1245
/*!
* transaction: transaction fsync duration for checkpoint after
* allocating the transaction ID (usecs)
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1238
+#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1246
/*! transaction: transaction range of IDs currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_RANGE 1239
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1247
/*! transaction: transaction range of IDs currently pinned by a checkpoint */
-#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1240
+#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1248
/*!
* transaction: transaction range of IDs currently pinned by named
* snapshots
*/
-#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1241
+#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1249
/*! transaction: transaction sync calls */
-#define WT_STAT_CONN_TXN_SYNC 1242
+#define WT_STAT_CONN_TXN_SYNC 1250
/*! transaction: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1243
+#define WT_STAT_CONN_TXN_COMMIT 1251
/*! transaction: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1244
+#define WT_STAT_CONN_TXN_ROLLBACK 1252
/*!
* @}
diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c
index 803d3e8dfab..486744d2e7f 100644
--- a/src/third_party/wiredtiger/src/log/log.c
+++ b/src/third_party/wiredtiger/src/log/log.c
@@ -8,6 +8,7 @@
#include "wt_internal.h"
+static int __log_newfile(WT_SESSION_IMPL *, bool, bool *);
static int __log_openfile(
WT_SESSION_IMPL *, WT_FH **, const char *, uint32_t, uint32_t);
static int __log_write_internal(
@@ -442,6 +443,66 @@ __wt_log_extract_lognum(
}
/*
+ * __wt_log_reset --
+ * Reset the existing log file to after the given file number. Called
+ * from recovery when toggling logging back on: logging was off for the
+ * previous open but had been on at some point before that. Returns
+ * zero without doing anything if logging is not enabled or the current
+ * log file number is already greater than the given one.
+ */
+int
+__wt_log_reset(WT_SESSION_IMPL *session, uint32_t lognum)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ uint32_t old_lognum;
+ u_int i, logcount;
+ char **logfiles;
+
+ conn = S2C(session);
+ log = conn->log;
+
+ if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) ||
+ log->fileid > lognum)
+ return (0);
+
+ WT_ASSERT(session, F_ISSET(conn, WT_CONN_RECOVERING));
+ WT_ASSERT(session, !F_ISSET(conn, WT_CONN_READONLY));
+ /*
+ * We know we're single threaded and called from recovery only when
+ * toggling logging back on. Therefore the only log files we have are
+ * old and outdated and the new one created when logging opened before
+ * recovery. We have to remove all old log files first and then create
+ * the new one so that log file numbers are contiguous in the file
+ * system.
+ */
+ WT_RET(__wt_close(session, &log->log_fh));
+ WT_RET(__log_get_files(session,
+ WT_LOG_FILENAME, &logfiles, &logcount));
+ for (i = 0; i < logcount; i++) {
+ WT_ERR(__wt_log_extract_lognum(
+ session, logfiles[i], &old_lognum));
+ WT_ASSERT(session, old_lognum < lognum || lognum == 1);
+ WT_ERR(__wt_log_remove(session, WT_LOG_FILENAME, old_lognum));
+ }
+ log->fileid = lognum;
+
+ /*
+ * Send in true to update connection creation LSNs. Check the result
+ * before going on: the next WT_ERR call assigns to ret and would
+ * otherwise discard a failure to create the new log file.
+ */
+ WT_WITH_SLOT_LOCK(session, log,
+ ret = __log_newfile(session, true, NULL));
+ WT_ERR(ret);
+ WT_ERR(__wt_log_slot_init(session, false));
+err: WT_TRET(
+ __wt_fs_directory_list_free(session, &logfiles, logcount));
+ return (ret);
+}
+
+/*
* __log_zero --
* Zero a log file.
*/
diff --git a/src/third_party/wiredtiger/src/log/log_slot.c b/src/third_party/wiredtiger/src/log/log_slot.c
index 97e317ce68c..b23d589c8e2 100644
--- a/src/third_party/wiredtiger/src/log/log_slot.c
+++ b/src/third_party/wiredtiger/src/log/log_slot.c
@@ -401,7 +401,7 @@ __wt_log_slot_switch(WT_SESSION_IMPL *session,
* Initialize the slot array.
*/
int
-__wt_log_slot_init(WT_SESSION_IMPL *session)
+__wt_log_slot_init(WT_SESSION_IMPL *session, bool alloc)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
@@ -423,15 +423,17 @@ __wt_log_slot_init(WT_SESSION_IMPL *session)
* switch log files very aggressively. Scale back the buffer for
* small log file sizes.
*/
- log->slot_buf_size = (uint32_t)WT_MIN(
- (size_t)conn->log_file_max / 10, WT_LOG_SLOT_BUF_SIZE);
- for (i = 0; i < WT_SLOT_POOL; i++) {
- WT_ERR(__wt_buf_init(session,
- &log->slot_pool[i].slot_buf, log->slot_buf_size));
- F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS);
+ if (alloc) {
+ log->slot_buf_size = (uint32_t)WT_MIN(
+ (size_t)conn->log_file_max / 10, WT_LOG_SLOT_BUF_SIZE);
+ for (i = 0; i < WT_SLOT_POOL; i++) {
+ WT_ERR(__wt_buf_init(session,
+ &log->slot_pool[i].slot_buf, log->slot_buf_size));
+ F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS);
+ }
+ WT_STAT_CONN_SET(session,
+ log_buffer_size, log->slot_buf_size * WT_SLOT_POOL);
}
- WT_STAT_CONN_SET(session,
- log_buffer_size, log->slot_buf_size * WT_SLOT_POOL);
/*
* Set up the available slot from the pool the first time.
*/
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c
index b7d9086d10e..62da094b5f7 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_manager.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c
@@ -295,8 +295,10 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session)
manager->lsm_workers == 0);
if (manager->lsm_workers > 0) {
/* Wait for the main LSM manager thread to finish. */
- while (!F_ISSET(manager, WT_LSM_MANAGER_SHUTDOWN))
+ while (!F_ISSET(manager, WT_LSM_MANAGER_SHUTDOWN)) {
+ WT_STAT_CONN_INCR(session, conn_close_blocked_lsm);
__wt_yield();
+ }
/* Clean up open LSM handles. */
ret = __wt_lsm_tree_close_all(session);
diff --git a/src/third_party/wiredtiger/src/meta/meta_table.c b/src/third_party/wiredtiger/src/meta/meta_table.c
index aca69d0e6a2..895b8a9c565 100644
--- a/src/third_party/wiredtiger/src/meta/meta_table.c
+++ b/src/third_party/wiredtiger/src/meta/meta_table.c
@@ -230,12 +230,23 @@ __wt_metadata_remove(WT_SESSION_IMPL *session, const char *key)
WT_RET_MSG(session, EINVAL,
"%s: remove not supported on the turtle file", key);
+ /*
+ * Take, release, and reacquire the metadata cursor. It's complicated,
+ * but that way the underlying meta-tracking function doesn't have to
+ * open a second metadata cursor, it can use the session's cached one.
+ */
WT_RET(__wt_metadata_cursor(session, &cursor));
cursor->set_key(cursor, key);
WT_ERR(cursor->search(cursor));
+ WT_ERR(__wt_metadata_cursor_release(session, &cursor));
+
if (WT_META_TRACKING(session))
WT_ERR(__wt_meta_track_update(session, key));
- WT_ERR(cursor->remove(cursor));
+
+ WT_ERR(__wt_metadata_cursor(session, &cursor));
+ cursor->set_key(cursor, key);
+ ret = cursor->remove(cursor);
+
err: WT_TRET(__wt_metadata_cursor_release(session, &cursor));
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
index fe010b62305..e4a6683dee9 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
@@ -19,11 +19,19 @@ __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp)
WT_DECL_RET;
WT_RET(__wt_calloc_one(session, &cond));
-
WT_ERR(pthread_mutex_init(&cond->mtx, NULL));
- /* Initialize the condition variable to permit self-blocking. */
+#ifdef HAVE_PTHREAD_COND_MONOTONIC
+ {
+ pthread_condattr_t condattr;
+
+ WT_ERR(pthread_condattr_init(&condattr));
+ WT_ERR(pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC));
+ WT_ERR(pthread_cond_init(&cond->cond, &condattr));
+ }
+#else
WT_ERR(pthread_cond_init(&cond->cond, NULL));
+#endif
cond->name = name;
cond->waiters = 0;
@@ -79,7 +87,26 @@ __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond,
goto skipping;
if (usecs > 0) {
- __wt_epoch(session, &ts);
+ /*
+ * Get the current time as the basis for calculating when the
+ * wait should end. Prefer a monotonic clock source to avoid
+ * unexpectedly long sleeps when the system clock is adjusted.
+ *
+ * Failing that, query the time directly and don't attempt to
+ * correct for the clock moving backwards, which would result
+ * in a sleep that is too long by however much the clock is
+ * updated. This isn't as good as a monotonic clock source but
+ * makes the window of vulnerability smaller (i.e., the
+ * calculated time is only incorrect if the system clock
+ * changes in between us querying it and waiting).
+ */
+#ifdef HAVE_PTHREAD_COND_MONOTONIC
+ WT_SYSCALL_RETRY(clock_gettime(CLOCK_MONOTONIC, &ts), ret);
+ if (ret != 0)
+ WT_PANIC_MSG(session, ret, "clock_gettime");
+#else
+ __wt_epoch_raw(session, &ts);
+#endif
ts.tv_sec += (time_t)
(((uint64_t)ts.tv_nsec + WT_THOUSAND * usecs) / WT_BILLION);
ts.tv_nsec = (long)
diff --git a/src/third_party/wiredtiger/src/os_posix/os_time.c b/src/third_party/wiredtiger/src/os_posix/os_time.c
index fe337fea7cf..25a08d62355 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_time.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_time.c
@@ -9,14 +9,12 @@
#include "wt_internal.h"
/*
- * __wt_epoch --
- * Return the time since the Epoch.
+ * __wt_epoch_raw --
+ * Return the time since the Epoch as reported by a system call.
*/
void
-__wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp)
- WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
+__wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp)
{
- struct timespec tmp;
WT_DECL_RET;
/*
@@ -28,19 +26,10 @@ __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp)
tsp->tv_sec = 0;
tsp->tv_nsec = 0;
- /*
- * Read into a local variable so that we're comparing the correct
- * value when we check for monotonic increasing time. There are
- * many places we read into an unlocked global variable.
- */
#if defined(HAVE_CLOCK_GETTIME)
- WT_SYSCALL_RETRY(clock_gettime(CLOCK_REALTIME, &tmp), ret);
- if (ret == 0) {
- __wt_time_check_monotonic(session, &tmp);
- tsp->tv_sec = tmp.tv_sec;
- tsp->tv_nsec = tmp.tv_nsec;
+ WT_SYSCALL_RETRY(clock_gettime(CLOCK_REALTIME, tsp), ret);
+ if (ret == 0)
return;
- }
WT_PANIC_MSG(session, ret, "clock_gettime");
#elif defined(HAVE_GETTIMEOFDAY)
{
@@ -48,10 +37,8 @@ __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp)
WT_SYSCALL_RETRY(gettimeofday(&v, NULL), ret);
if (ret == 0) {
- tmp.tv_sec = v.tv_sec;
- tmp.tv_nsec = v.tv_usec * WT_THOUSAND;
- __wt_time_check_monotonic(session, &tmp);
- *tsp = tmp;
+ tsp->tv_sec = v.tv_sec;
+ tsp->tv_nsec = v.tv_usec * WT_THOUSAND;
return;
}
WT_PANIC_MSG(session, ret, "gettimeofday");
diff --git a/src/third_party/wiredtiger/src/os_win/os_time.c b/src/third_party/wiredtiger/src/os_win/os_time.c
index ba71341ab22..84c06bed6e5 100644
--- a/src/third_party/wiredtiger/src/os_win/os_time.c
+++ b/src/third_party/wiredtiger/src/os_win/os_time.c
@@ -9,24 +9,23 @@
#include "wt_internal.h"
/*
- * __wt_epoch --
- * Return the time since the Epoch.
+ * __wt_epoch_raw --
+ * Return the time since the Epoch as reported by the system.
*/
void
-__wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp)
+__wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp)
{
- struct timespec tmp;
FILETIME time;
uint64_t ns100;
+ WT_UNUSED(session);
+
GetSystemTimeAsFileTime(&time);
ns100 = (((int64_t)time.dwHighDateTime << 32) + time.dwLowDateTime)
- 116444736000000000LL;
- tmp.tv_sec = ns100 / 10000000;
- tmp.tv_nsec = (long)((ns100 % 10000000) * 100);
- __wt_time_check_monotonic(session, &tmp);
- *tsp = tmp;
+ tsp->tv_sec = ns100 / 10000000;
+ tsp->tv_nsec = (long)((ns100 % 10000000) * 100);
}
/*
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index e59d9796352..55e2c62ac01 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -45,7 +45,9 @@ typedef struct {
uint64_t orig_btree_checkpoint_gen;
uint64_t orig_txn_checkpoint_gen;
- /* Track the page's maximum transaction ID. */
+ /* Track the oldest transaction running when reconciliation starts. */
+ uint64_t last_running;
+
uint64_t max_txn;
/* Track if all updates were skipped. */
@@ -849,6 +851,16 @@ __rec_write_init(WT_SESSION_IMPL *session,
WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen);
/*
+ * Cache the oldest running transaction ID. This is used to check
+ * whether updates seen by reconciliation have committed. We keep a
+ * cached copy to avoid races where a concurrent transaction could
+ * abort while reconciliation is examining its updates. This way, any
+ * transaction running when reconciliation starts is considered
+ * uncommitted.
+ */
+ WT_ORDERED_READ(r->last_running, S2C(session)->txn_global.last_running);
+
+ /*
* Lookaside table eviction is configured when eviction gets aggressive,
* adjust the flags for cases we don't support.
*/
@@ -1159,11 +1171,13 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
* When reconciling for eviction, track whether any
* uncommitted updates are found.
*/
- if (__wt_txn_committed(session, txnid)) {
- if (*updp == NULL)
- *updp = upd;
- } else
+ if (WT_TXNID_LE(r->last_running, txnid)) {
skipped = true;
+ continue;
+ }
+
+ if (*updp == NULL)
+ *updp = upd;
} else {
/*
* Checkpoint can only write updates visible as of its
@@ -1562,7 +1576,7 @@ __rec_child_modify(WT_SESSION_IMPL *session,
* not reserved for our exclusive use, there are other page states that
* must be considered.
*/
- for (;; __wt_yield())
+ for (;; __wt_yield()) {
switch (r->tested_ref_state = ref->state) {
case WT_REF_DISK:
/* On disk, not modified by definition. */
@@ -1673,6 +1687,8 @@ __rec_child_modify(WT_SESSION_IMPL *session,
WT_ILLEGAL_VALUE(session);
}
+ WT_STAT_CONN_INCR(session, child_modify_blocked_page);
+ }
in_memory:
/*
diff --git a/src/third_party/wiredtiger/src/schema/schema_open.c b/src/third_party/wiredtiger/src/schema/schema_open.c
index 44bd66e011a..081650d74a8 100644
--- a/src/third_party/wiredtiger/src/schema/schema_open.c
+++ b/src/third_party/wiredtiger/src/schema/schema_open.c
@@ -425,37 +425,40 @@ __schema_open_table(WT_SESSION_IMPL *session,
WT_DECL_RET;
WT_TABLE *table;
const char *tconfig;
- char *tablename;
*tablep = NULL;
cursor = NULL;
table = NULL;
- tablename = NULL;
WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_TABLE));
+ WT_ERR(__wt_calloc_one(session, &table));
+ table->name_hash = __wt_hash_city64(name, namelen);
+
WT_ERR(__wt_scr_alloc(session, 0, &buf));
WT_ERR(__wt_buf_fmt(session, buf, "table:%.*s", (int)namelen, name));
- WT_ERR(__wt_strndup(session, buf->data, buf->size, &tablename));
+ WT_ERR(__wt_strndup(session, buf->data, buf->size, &table->name));
+ /*
+ * Don't hold the metadata cursor pinned, we call functions that use it
+ * to retrieve column group information.
+ */
WT_ERR(__wt_metadata_cursor(session, &cursor));
- cursor->set_key(cursor, tablename);
- WT_ERR(cursor->search(cursor));
- WT_ERR(cursor->get_value(cursor, &tconfig));
-
- WT_ERR(__wt_calloc_one(session, &table));
- table->name = tablename;
- tablename = NULL;
- table->name_hash = __wt_hash_city64(name, namelen);
-
- WT_ERR(__wt_config_getones(session, tconfig, "columns", &cval));
+ cursor->set_key(cursor, table->name);
+ if ((ret = cursor->search(cursor)) == 0 &&
+ (ret = cursor->get_value(cursor, &tconfig)) == 0)
+ ret = __wt_strdup(session, tconfig, &table->config);
+ WT_TRET(__wt_metadata_cursor_release(session, &cursor));
+ WT_ERR(ret);
- WT_ERR(__wt_config_getones(session, tconfig, "key_format", &cval));
+ WT_ERR(__wt_config_getones(session, table->config, "columns", &cval));
+ WT_ERR(__wt_config_getones(
+ session, table->config, "key_format", &cval));
WT_ERR(__wt_strndup(session, cval.str, cval.len, &table->key_format));
- WT_ERR(__wt_config_getones(session, tconfig, "value_format", &cval));
+ WT_ERR(__wt_config_getones(
+ session, table->config, "value_format", &cval));
WT_ERR(__wt_strndup(session, cval.str, cval.len, &table->value_format));
- WT_ERR(__wt_strdup(session, tconfig, &table->config));
/* Point to some items in the copy to save re-parsing. */
WT_ERR(__wt_config_getones(session, table->config,
@@ -491,7 +494,7 @@ __schema_open_table(WT_SESSION_IMPL *session,
if (table->ncolgroups > 0 && table->is_simple)
WT_ERR_MSG(session, EINVAL,
- "%s requires a table with named columns", tablename);
+ "%s requires a table with named columns", table->name);
WT_ERR(__wt_calloc_def(session, WT_COLGROUPS(table), &table->cgroups));
WT_ERR(__wt_schema_open_colgroups(session, table));
@@ -509,9 +512,7 @@ __schema_open_table(WT_SESSION_IMPL *session,
if (0) {
err: WT_TRET(__wt_schema_destroy_table(session, &table));
}
- WT_TRET(__wt_metadata_cursor_release(session, &cursor));
- __wt_free(session, tablename);
__wt_scr_free(session, &buf);
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/session/session_dhandle.c b/src/third_party/wiredtiger/src/session/session_dhandle.c
index ffeb6137766..707e07ac11f 100644
--- a/src/third_party/wiredtiger/src/session/session_dhandle.c
+++ b/src/third_party/wiredtiger/src/session/session_dhandle.c
@@ -235,6 +235,7 @@ __wt_session_lock_dhandle(
lock_busy = true;
/* Give other threads a chance to make progress. */
+ WT_STAT_CONN_INCR(session, dhandle_lock_blocked);
__wt_yield();
}
}
@@ -597,7 +598,9 @@ __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint)
* the underlying file are visible to the in-memory pages.
*/
WT_ERR(__wt_evict_file_exclusive_on(session));
- WT_ERR(__wt_cache_op(session, WT_SYNC_DISCARD));
+ ret = __wt_cache_op(session, WT_SYNC_DISCARD);
+ __wt_evict_file_exclusive_off(session);
+ WT_ERR(ret);
/*
* We lock checkpoint handles that we are overwriting, so the handle
diff --git a/src/third_party/wiredtiger/src/support/err.c b/src/third_party/wiredtiger/src/support/err.c
index 57efde72b23..f98b1943449 100644
--- a/src/third_party/wiredtiger/src/support/err.c
+++ b/src/third_party/wiredtiger/src/support/err.c
@@ -494,7 +494,18 @@ __wt_panic(WT_SESSION_IMPL *session)
WT_GCC_FUNC_ATTRIBUTE((cold))
WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
{
- F_SET(S2C(session), WT_CONN_PANIC);
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ /*
+ * If the connection has already been marked for panic, just return
+ * the error.
+ */
+ if (F_ISSET(conn, WT_CONN_PANIC))
+ return (WT_PANIC);
+
+ F_SET(conn, WT_CONN_PANIC);
__wt_err(session, WT_PANIC, "the process must exit and restart");
#if defined(HAVE_DIAGNOSTIC)
diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c
index 8b72e653658..c9e577ac3b6 100644
--- a/src/third_party/wiredtiger/src/support/stat.c
+++ b/src/third_party/wiredtiger/src/support/stat.c
@@ -842,11 +842,19 @@ static const char * const __stats_connection_desc[] = {
"thread-state: active filesystem write calls",
"thread-yield: application thread time evicting (usecs)",
"thread-yield: application thread time waiting for cache (usecs)",
+ "thread-yield: connection close blocked waiting for transaction state stabilization",
+ "thread-yield: connection close yielded for lsm manager shutdown",
+ "thread-yield: data handle lock yielded",
+ "thread-yield: log server sync yielded for log write",
"thread-yield: page acquire busy blocked",
"thread-yield: page acquire eviction blocked",
"thread-yield: page acquire locked blocked",
"thread-yield: page acquire read blocked",
"thread-yield: page acquire time sleeping (usecs)",
+ "thread-yield: page delete rollback yielded for instantiation",
+ "thread-yield: page reconciliation yielded due to child modification",
+ "thread-yield: reference for page index and slot yielded",
+ "thread-yield: tree descend one level yielded for split page index update",
"transaction: number of named snapshots created",
"transaction: number of named snapshots dropped",
"transaction: transaction begins",
@@ -1129,11 +1137,19 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
/* not clearing thread_write_active */
stats->application_evict_time = 0;
stats->application_cache_time = 0;
+ stats->txn_release_blocked = 0;
+ stats->conn_close_blocked_lsm = 0;
+ stats->dhandle_lock_blocked = 0;
+ stats->log_server_sync_blocked = 0;
stats->page_busy_blocked = 0;
stats->page_forcible_evict_blocked = 0;
stats->page_locked_blocked = 0;
stats->page_read_blocked = 0;
stats->page_sleep = 0;
+ stats->page_del_rollback_blocked = 0;
+ stats->child_modify_blocked_page = 0;
+ stats->page_index_slot_blocked = 0;
+ stats->tree_descend_blocked = 0;
stats->txn_snapshots_created = 0;
stats->txn_snapshots_dropped = 0;
stats->txn_begin = 0;
@@ -1475,12 +1491,25 @@ __wt_stat_connection_aggregate(
WT_STAT_READ(from, application_evict_time);
to->application_cache_time +=
WT_STAT_READ(from, application_cache_time);
+ to->txn_release_blocked += WT_STAT_READ(from, txn_release_blocked);
+ to->conn_close_blocked_lsm +=
+ WT_STAT_READ(from, conn_close_blocked_lsm);
+ to->dhandle_lock_blocked += WT_STAT_READ(from, dhandle_lock_blocked);
+ to->log_server_sync_blocked +=
+ WT_STAT_READ(from, log_server_sync_blocked);
to->page_busy_blocked += WT_STAT_READ(from, page_busy_blocked);
to->page_forcible_evict_blocked +=
WT_STAT_READ(from, page_forcible_evict_blocked);
to->page_locked_blocked += WT_STAT_READ(from, page_locked_blocked);
to->page_read_blocked += WT_STAT_READ(from, page_read_blocked);
to->page_sleep += WT_STAT_READ(from, page_sleep);
+ to->page_del_rollback_blocked +=
+ WT_STAT_READ(from, page_del_rollback_blocked);
+ to->child_modify_blocked_page +=
+ WT_STAT_READ(from, child_modify_blocked_page);
+ to->page_index_slot_blocked +=
+ WT_STAT_READ(from, page_index_slot_blocked);
+ to->tree_descend_blocked += WT_STAT_READ(from, tree_descend_blocked);
to->txn_snapshots_created +=
WT_STAT_READ(from, txn_snapshots_created);
to->txn_snapshots_dropped +=
diff --git a/src/third_party/wiredtiger/src/support/time.c b/src/third_party/wiredtiger/src/support/time.c
new file mode 100644
index 00000000000..0e4562c0234
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/time.c
@@ -0,0 +1,89 @@
+/*-
+ * Public Domain 2014-2017 MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __time_check_monotonic --
+ * Check and prevent time running backward. If we detect that it has, we
+ * set the time structure to the previous values, making time stand still
+ * until we see a time in the future of the highest value seen so far.
+ */
+static void
+__time_check_monotonic(WT_SESSION_IMPL *session, struct timespec *tsp)
+{
+ /*
+ * Detect time going backward. If so, use the last
+ * saved timestamp.
+ */
+ if (session == NULL)
+ return;
+
+ if (tsp->tv_sec < session->last_epoch.tv_sec ||
+ (tsp->tv_sec == session->last_epoch.tv_sec &&
+ tsp->tv_nsec < session->last_epoch.tv_nsec)) {
+ WT_STAT_CONN_INCR(session, time_travel);
+ *tsp = session->last_epoch;
+ } else
+ session->last_epoch = *tsp;
+}
+
+/*
+ * __wt_epoch --
+ * Return the time since the Epoch, adjusted so it never appears to go
+ * backwards.
+ */
+void
+__wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp)
+ WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
+{
+ struct timespec tmp;
+
+ /*
+ * Read into a local variable so that we're comparing the correct
+ * value when we check for monotonic increasing time. There are
+ * many places we read into an unlocked global variable.
+ */
+ __wt_epoch_raw(session, &tmp);
+ __time_check_monotonic(session, &tmp);
+ *tsp = tmp;
+}
+
+/*
+ * __wt_seconds --
+ * Return the seconds since the Epoch.
+ */
+void
+__wt_seconds(WT_SESSION_IMPL *session, time_t *timep)
+{
+ struct timespec t;
+
+ __wt_epoch(session, &t);
+
+ *timep = t.tv_sec;
+}
diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c
index cb3b3436786..09a8c4d9663 100644
--- a/src/third_party/wiredtiger/src/txn/txn_log.c
+++ b/src/third_party/wiredtiger/src/txn/txn_log.c
@@ -289,6 +289,7 @@ int
__wt_txn_checkpoint_log(
WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_LSN *lsnp)
{
+ WT_CONNECTION_IMPL *conn;
WT_DECL_ITEM(logrec);
WT_DECL_RET;
WT_ITEM *ckpt_snapshot, empty;
@@ -300,7 +301,8 @@ __wt_txn_checkpoint_log(
uint32_t i, rectype = WT_LOGREC_CHECKPOINT;
const char *fmt = WT_UNCHECKED_STRING(IIIIu);
- txn_global = &S2C(session)->txn_global;
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
txn = &session->txn;
ckpt_lsn = &txn->ckpt_lsn;
@@ -374,20 +376,20 @@ __wt_txn_checkpoint_log(
txn->ckpt_nsnapshot, ckpt_snapshot));
logrec->size += (uint32_t)recsize;
WT_ERR(__wt_log_write(session, logrec, lsnp,
- F_ISSET(S2C(session), WT_CONN_CKPT_SYNC) ?
+ F_ISSET(conn, WT_CONN_CKPT_SYNC) ?
WT_LOG_FSYNC : 0));
/*
* If this full checkpoint completed successfully and there is
- * no hot backup in progress and this is not recovery, tell
- * the logging subsystem the checkpoint LSN so that it can
- * archive. Do not update the logging checkpoint LSN if this
- * is during a clean connection close, only during a full
- * checkpoint. A clean close may not update any metadata LSN
- * and we do not want to archive in that case.
+ * no hot backup in progress and this is not an unclean
+ * recovery, tell the logging subsystem the checkpoint LSN so
+ * that it can archive. Do not update the logging checkpoint
+ * LSN if this is during a clean connection close, only during
+ * a full checkpoint. A clean close may not update any
+ * metadata LSN and we do not want to archive in that case.
*/
- if (!S2C(session)->hot_backup &&
- !F_ISSET(S2C(session), WT_CONN_RECOVERING) &&
+ if (!conn->hot_backup &&
+ !FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY) &&
txn->full_ckpt)
__wt_log_ckpt(session, ckpt_lsn);
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index 30932195b1e..fbef0ad4a5f 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -20,6 +20,7 @@ typedef struct {
} *files;
size_t file_alloc; /* Allocated size of files array. */
u_int max_fileid; /* Maximum file ID seen. */
+ WT_LSN max_lsn; /* Maximum checkpoint LSN seen. */
u_int nfiles; /* Number of files in the metadata. */
WT_LSN ckpt_lsn; /* Start LSN for main recovery loop. */
@@ -342,6 +343,10 @@ __recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config)
"Recovering %s with id %" PRIu32 " @ (%" PRIu32 ", %" PRIu32 ")",
uri, fileid, lsn.l.file, lsn.l.offset);
+ if ((!WT_IS_MAX_LSN(&lsn) && !WT_IS_INIT_LSN(&lsn)) &&
+ (WT_IS_MAX_LSN(&r->max_lsn) || __wt_log_cmp(&lsn, &r->max_lsn) > 0))
+ r->max_lsn = lsn;
+
return (0);
}
@@ -428,6 +433,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
WT_RET(__wt_open_internal_session(conn, "txn-recover",
false, WT_SESSION_NO_LOGGING, &session));
r.session = session;
+ WT_MAX_LSN(&r.max_lsn);
F_SET(conn, WT_CONN_RECOVERING);
WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config));
@@ -443,9 +449,24 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
*/
if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_EXISTED) ||
WT_IS_MAX_LSN(&metafile->ckpt_lsn)) {
+ /*
+ * Detect if we're going from logging disabled to enabled.
+ * We need to know this to verify LSNs and start at the correct
+ * log file later. If someone ran with logging, then disabled
+ * it and removed all the log files and then turned logging back
+ * on, we have to start logging in the log file number that is
+ * larger than any checkpoint LSN we have from the earlier time.
+ */
WT_ERR(__recovery_file_scan(&r));
conn->next_file_id = r.max_fileid;
- goto done;
+
+ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
+ WT_IS_MAX_LSN(&metafile->ckpt_lsn) &&
+ !WT_IS_MAX_LSN(&r.max_lsn)) {
+ WT_ERR(__wt_log_reset(session, r.max_lsn.l.file));
+ goto ckpt;
+ } else
+ goto done;
}
/*
@@ -535,6 +556,8 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
* this is not a read-only connection.
* We can consider skipping it in the future.
*/
+ if (needs_rec)
+ FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY);
if (WT_IS_INIT_LSN(&r.ckpt_lsn))
WT_ERR(__wt_log_scan(session, NULL,
WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER,
@@ -554,11 +577,12 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
* open is fast and keep the metadata up to date with the checkpoint
* LSN and archiving.
*/
- WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));
+ckpt: WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));
done: FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE);
err: WT_TRET(__recovery_free(&r));
__wt_free(session, config);
+ FLD_CLR(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY);
if (ret != 0)
__wt_err(session, ret, "Recovery failed");
diff --git a/src/third_party/wiredtiger/test/recovery/random-abort.c b/src/third_party/wiredtiger/test/recovery/random-abort.c
index febe6530534..b53383e5730 100644
--- a/src/third_party/wiredtiger/test/recovery/random-abort.c
+++ b/src/third_party/wiredtiger/test/recovery/random-abort.c
@@ -47,9 +47,9 @@ static bool inmem;
#define RECORDS_FILE "records-%" PRIu32
#define ENV_CONFIG_DEF \
- "create,log=(file_max=10M,archive=false,enabled)"
+ "create,log=(file_max=10M,enabled)"
#define ENV_CONFIG_TXNSYNC \
- "create,log=(file_max=10M,archive=false,enabled)," \
+ "create,log=(file_max=10M,enabled)," \
"transaction_sync=(enabled,method=none)"
#define ENV_CONFIG_REC "log=(recover=on)"
#define MAX_VAL 4096
diff --git a/src/third_party/wiredtiger/test/suite/test_bug018.py b/src/third_party/wiredtiger/test/suite/test_bug018.py
new file mode 100644
index 00000000000..7d20ebcaacb
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_bug018.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2017 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+from helper import copy_wiredtiger_home
+import os
+import wiredtiger, wttest
+
+# test_bug018.py
+# JIRA WT-3590: if writing table data fails during close then tables
+# that were updated within the same transaction could get out of sync with
+# each other.
+class test_bug018(wttest.WiredTigerTestCase):
+ '''Test closing/reopening/recovering tables when writes fail'''
+
+ conn_config = 'log=(enabled)'
+
+ def setUp(self):
+ # This test uses Linux-specific code so skip on any other system.
+ if os.name != 'posix' or os.uname()[0] != 'Linux':
+ self.skipTest('Linux-specific test skipped on ' + os.name)
+ super(test_bug018, self).setUp()
+
+ def create_table(self, uri):
+ self.session.create(uri, 'key_format=S,value_format=S')
+ return self.session.open_cursor(uri)
+
+ def test_bug018(self):
+ '''Test closing multiple tables'''
+ basename = 'bug018.'
+ baseuri = 'file:' + basename
+ c1 = self.create_table(baseuri + '01.wt')
+ c2 = self.create_table(baseuri + '02.wt')
+
+ self.session.begin_transaction()
+ c1['key'] = 'value'
+ c2['key'] = 'value'
+ self.session.commit_transaction()
+
+ # Simulate a write failure by closing the file descriptor for the second
+ # table out from underneath WiredTiger. We do this right before
+ # closing the connection so that the write error happens during close
+ # when writing out the final data. Allow table 1 to succeed and force
+ # an error writing out table 2.
+ #
+ # This is Linux-specific code to figure out the file descriptor.
+ for f in os.listdir('/proc/self/fd'):
+ try:
+ if os.readlink('/proc/self/fd/' + f).endswith(basename + '02.wt'):
+ os.close(int(f))
+ except OSError:
+ pass
+
+ # Expect an error and messages, so turn off stderr checking.
+ with self.expectedStderrPattern(''):
+ try:
+ self.close_conn()
+ except wiredtiger.WiredTigerError:
+ self.conn = None
+
+ # Make a backup for forensics in case something goes wrong.
+ backup_dir = 'BACKUP'
+ copy_wiredtiger_home('.', backup_dir, True)
+
+ # After reopening and running recovery both tables should be in
+ # sync even though table 1 was successfully written and table 2
+ # had an error on close.
+ self.open_conn()
+ c1 = self.session.open_cursor(baseuri + '01.wt')
+ c2 = self.session.open_cursor(baseuri + '02.wt')
+ self.assertEqual(list(c1), list(c2))
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_txn02.py b/src/third_party/wiredtiger/test/suite/test_txn02.py
index 01626057b9e..76a325743e9 100644
--- a/src/third_party/wiredtiger/test/suite/test_txn02.py
+++ b/src/third_party/wiredtiger/test/suite/test_txn02.py
@@ -169,7 +169,6 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess):
try:
session = backup_conn.open_session()
finally:
- session.checkpoint("force")
self.check(backup_conn.open_session(), None, committed)
# Sleep long enough so that the archive thread is guaranteed
# to run before we close the connection.
diff --git a/src/third_party/wiredtiger/test/suite/test_txn05.py b/src/third_party/wiredtiger/test/suite/test_txn05.py
index 7aaff221ba4..7099bc972aa 100644
--- a/src/third_party/wiredtiger/test/suite/test_txn05.py
+++ b/src/third_party/wiredtiger/test/suite/test_txn05.py
@@ -134,12 +134,12 @@ class test_txn05(wttest.WiredTigerTestCase, suite_subprocess):
session = backup_conn.open_session()
finally:
self.check(session, None, committed)
- # Force a checkpoint because we don't record the recovery
- # checkpoint as available for archiving.
- session.checkpoint("force")
# Sleep long enough so that the archive thread is guaranteed
# to run before we close the connection.
time.sleep(1.0)
+ if count == 0:
+ first_logs = \
+ fnmatch.filter(os.listdir(self.backup_dir), "*Log*")
backup_conn.close()
count += 1
#
@@ -149,6 +149,11 @@ class test_txn05(wttest.WiredTigerTestCase, suite_subprocess):
#
cur_logs = fnmatch.filter(os.listdir(self.backup_dir), "*Log*")
for o in orig_logs:
+ # Creating the backup was effectively an unclean shutdown so
+ # even after sleeping, we should never archive log files
+ # because a checkpoint has not run. Later opens and runs of
+ # recovery will detect a clean shutdown and allow archiving.
+ self.assertEqual(True, o in first_logs)
if self.archive == 'true':
self.assertEqual(False, o in cur_logs)
else:
diff --git a/src/third_party/wiredtiger/test/suite/test_txn09.py b/src/third_party/wiredtiger/test/suite/test_txn09.py
index 768d714e248..b8a3d7f38ae 100644
--- a/src/third_party/wiredtiger/test/suite/test_txn09.py
+++ b/src/third_party/wiredtiger/test/suite/test_txn09.py
@@ -26,8 +26,8 @@
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
-# test_txn02.py
-# Transactions: commits and rollbacks
+# test_txn09.py
+# Transactions: recovery toggling logging
#
import fnmatch, os, shutil, time
diff --git a/src/third_party/wiredtiger/test/suite/test_txn16.py b/src/third_party/wiredtiger/test/suite/test_txn16.py
new file mode 100644
index 00000000000..929da2291c7
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_txn16.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2017 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_txn16.py
+# Recovery: Test that toggling between logging and not logging does not
+# continue to generate more log files.
+#
+
+import fnmatch, os, shutil, time
+from suite_subprocess import suite_subprocess
+import wttest
+
+class test_txn16(wttest.WiredTigerTestCase, suite_subprocess):
+ t1 = 'table:test_txn16_1'
+ t2 = 'table:test_txn16_2'
+ t3 = 'table:test_txn16_3'
+ nentries = 1000
+ create_params = 'key_format=i,value_format=i'
+ # Set the log file size small so we generate checkpoints
+ # with LSNs in different files.
+ conn_config = 'config_base=false,' + \
+ 'log=(archive=false,enabled,file_max=100K),' + \
+ 'transaction_sync=(method=dsync,enabled)'
+ conn_on = 'config_base=false,' + \
+ 'log=(archive=false,enabled,file_max=100K),' + \
+ 'transaction_sync=(method=dsync,enabled)'
+ conn_off = 'config_base=false,log=(enabled=false)'
+
+ def populate_table(self, uri):
+ self.session.create(uri, self.create_params)
+ c = self.session.open_cursor(uri, None, None)
+ # Populate with an occasional checkpoint to generate
+ # some varying LSNs.
+ for i in range(self.nentries):
+ c[i] = i + 1
+ if i % 900 == 0:
+ self.session.checkpoint()
+ c.close()
+
+ def copy_dir(self, olddir, newdir):
+ ''' Simulate a crash from olddir and restart in newdir. '''
+ # with the connection still open, copy files to new directory
+ shutil.rmtree(newdir, ignore_errors=True)
+ os.mkdir(newdir)
+ for fname in os.listdir(olddir):
+ fullname = os.path.join(olddir, fname)
+ # Skip lock file on Windows since it is locked
+ if os.path.isfile(fullname) and \
+ "WiredTiger.lock" not in fullname and \
+ "Tmplog" not in fullname and \
+ "Preplog" not in fullname:
+ shutil.copy(fullname, newdir)
+ # close the original connection.
+ self.close_conn()
+
+ def run_toggle(self, homedir):
+ loop = 0
+ # Record original log files. There should never be overlap
+ # with these even after they're removed.
+ orig_logs = fnmatch.filter(os.listdir(homedir), "*Log*")
+ while loop < 3:
+ # Reopen with logging on to run recovery first time
+ on_conn = self.wiredtiger_open(homedir, self.conn_on)
+ on_conn.close()
+ if loop > 0:
+ # Get current log files.
+ cur_logs = fnmatch.filter(os.listdir(homedir), "*Log*")
+ scur = set(cur_logs)
+ sorig = set(orig_logs)
+ # There should never be overlap with the log files that
+ # were there originally. Mostly this checks that after
+ # opening with logging disabled and then re-enabled, we
+ # don't see log file 1.
+ self.assertEqual(scur.isdisjoint(sorig), True)
+ if loop > 1:
+ # We should be creating the same log files each time.
+ for l in cur_logs:
+ self.assertEqual(l in last_logs, True)
+ for l in last_logs:
+ self.assertEqual(l in cur_logs, True)
+ last_logs = cur_logs
+ loop += 1
+ # Remove all log files before opening without logging.
+ cur_logs = fnmatch.filter(os.listdir(homedir), "*Log*")
+ for l in cur_logs:
+ path=homedir + "/" + l
+ os.remove(path)
+ off_conn = self.wiredtiger_open(homedir, self.conn_off)
+ off_conn.close()
+
+ def test_recovery(self):
+ ''' Check log file creation when toggling. '''
+
+ # Here's the strategy:
+ # - With logging populate 3 tables. Checkpoint
+ # them at different times.
+ # - Copy to a new directory to simulate a crash.
+ # - Close the original connection.
+ # On both a "copy" to simulate a crash and the original (3x):
+ # - Record log files existing.
+ # - Reopen with logging to run recovery. Close connection.
+ # - Record log files existing.
+ # - Remove all log files.
+ # - Open connection with logging disabled.
+ # - Record log files existing. Verify we don't keep adding.
+ #
+ self.populate_table(self.t1)
+ self.populate_table(self.t2)
+ self.populate_table(self.t3)
+ self.copy_dir(".", "RESTART")
+ self.run_toggle(".")
+ self.run_toggle("RESTART")
+
+if __name__ == '__main__':
+ wttest.run()