diff options
author | Alex Gorrod <alexander.gorrod@mongodb.com> | 2017-10-03 17:14:59 +1100 |
---|---|---|
committer | Alex Gorrod <alexander.gorrod@mongodb.com> | 2017-10-03 17:14:59 +1100 |
commit | dafae2d745493a1f6a753ba1f6874fd4d3eacd80 (patch) | |
tree | 73d639c27c97681dab946ece232b6e2bfb129930 /src | |
parent | 2164f83d6342428f522ba36e8eed0abdd064014f (diff) | |
download | mongo-dafae2d745493a1f6a753ba1f6874fd4d3eacd80.tar.gz |
Import wiredtiger: 2d781c8cfeb2a1db8bd93e03ba35b302436e4ff3 from branch mongodb-3.4
ref: 31af5d70a8..2d781c8cfe
for: 3.4.10
WT-3263 Allow archive on restart/recovery if clean shutdown
WT-3264 Permanent change to disable logging should eventually remove all logs
WT-3284 tree-walk restart bug
WT-3308 Add statistics tracking around yield loops
WT-3351 Recovery assertion failure: old_lognum < lognum
WT-3406 Reconciliation is choosing reserved records for writing.
WT-3461 Avoid long sleeps when the system clock is adjusted
WT-3470 Avoid a metadata cursor open for table open/drop
WT-3533 eviction handle walk can race with handle re-open
WT-3590 Keep data consistent if writes fail during a clean shutdown
Diffstat (limited to 'src')
49 files changed, 897 insertions, 280 deletions
diff --git a/src/third_party/wiredtiger/build_posix/configure.ac.in b/src/third_party/wiredtiger/build_posix/configure.ac.in index 0fef587b4b8..415545a0d56 100644 --- a/src/third_party/wiredtiger/build_posix/configure.ac.in +++ b/src/third_party/wiredtiger/build_posix/configure.ac.in @@ -160,6 +160,44 @@ AS_CASE([$host_os], [darwin*], [], [AC_CHECK_FUNCS([fdatasync])]) # the generic declaration in AC_CHECK_FUNCS is incompatible. AX_FUNC_POSIX_MEMALIGN +# Check for POSIX condition variables with monotonic clock support +AC_CACHE_CHECK([for condition waits with monotonic clock support], + [wt_cv_pthread_cond_monotonic], + [AC_RUN_IFELSE([AC_LANG_SOURCE([[ +#include <errno.h> +#include <pthread.h> +#include <stdlib.h> +#include <time.h> + +int main() +{ + int ret; + pthread_condattr_t condattr; + pthread_cond_t cond; + pthread_mutex_t mtx; + struct timespec ts; + + if ((ret = pthread_condattr_init(&condattr)) != 0) exit(1); + if ((ret = pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC)) != 0) exit(1); + if ((ret = pthread_cond_init(&cond, &condattr)) != 0) exit(1); + if ((ret = pthread_mutex_init(&mtx, NULL)) != 0) exit(1); + if ((ret = clock_gettime(CLOCK_MONOTONIC, &ts)) != 0) exit(1); + ts.tv_sec += 1; + if ((ret = pthread_mutex_lock(&mtx)) != 0) exit(1); + if ((ret = pthread_cond_timedwait(&cond, &mtx, &ts)) != 0 && ret != EINTR && ret != ETIMEDOUT) exit(1); + + exit(0); +} + ]])], + [wt_pthread_cond_monotonic=yes], + [wt_pthread_cond_monotonic=no], + [wt_pthread_cond_monotonic=no])]) +AC_MSG_RESULT($wt_pthread_cond_monotonic) +if test "$wt_pthread_cond_monotonic" = "yes" ; then + AC_DEFINE([HAVE_PTHREAD_COND_MONOTONIC], [1], + [Define to 1 if pthread condition variables support monotonic clocks.]) +fi + AC_SYS_LARGEFILE AC_C_BIGENDIAN diff --git a/src/third_party/wiredtiger/build_win/wiredtiger_config.h b/src/third_party/wiredtiger/build_win/wiredtiger_config.h index 78d2784cb70..8babdbfdc1b 100644 --- 
a/src/third_party/wiredtiger/build_win/wiredtiger_config.h +++ b/src/third_party/wiredtiger/build_win/wiredtiger_config.h @@ -79,6 +79,9 @@ /* Define to 1 if you have the <memory.h> header file. */ /* #undef HAVE_MEMORY_H */ +/* Define to 1 if pthread condition variables support monotonic clocks. */ +/* #undef HAVE_PTHREAD_COND_MONOTONIC */ + /* Define to 1 if you have the `posix_fadvise' function. */ /* #undef HAVE_POSIX_FADVISE */ diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist index 5a3348b940a..f53509e96ec 100644 --- a/src/third_party/wiredtiger/dist/filelist +++ b/src/third_party/wiredtiger/dist/filelist @@ -191,6 +191,7 @@ src/support/rand.c src/support/scratch.c src/support/stat.c src/support/thread_group.c +src/support/time.c src/txn/txn.c src/txn/txn_ckpt.c src/txn/txn_ext.c diff --git a/src/third_party/wiredtiger/dist/flags.py b/src/third_party/wiredtiger/dist/flags.py index 64b5d789e72..d80c80a37ce 100644 --- a/src/third_party/wiredtiger/dist/flags.py +++ b/src/third_party/wiredtiger/dist/flags.py @@ -32,7 +32,6 @@ flags = { 'READ_PREV', 'READ_RESTART_OK', 'READ_SKIP_INTL', - 'READ_SKIP_LEAF', 'READ_TRUNCATE', 'READ_WONT_NEED', ], diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index f3852d00ac8..99abc3e9ad1 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -1060,6 +1060,7 @@ rebalancing recno recnos reconfig +reconfigures reconfiguring recsize rectype diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index 512892eb44d..b66e95ce49b 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -431,11 +431,19 @@ connection_stats = [ ########################################## YieldStat('application_cache_time', 'application thread time waiting for cache (usecs)'), 
YieldStat('application_evict_time', 'application thread time evicting (usecs)'), + YieldStat('child_modify_blocked_page', 'page reconciliation yielded due to child modification'), + YieldStat('conn_close_blocked_lsm', 'connection close yielded for lsm manager shutdown'), + YieldStat('dhandle_lock_blocked', 'data handle lock yielded'), + YieldStat('log_server_sync_blocked', 'log server sync yielded for log write'), YieldStat('page_busy_blocked', 'page acquire busy blocked'), + YieldStat('page_del_rollback_blocked', 'page delete rollback yielded for instantiation'), YieldStat('page_forcible_evict_blocked', 'page acquire eviction blocked'), + YieldStat('page_index_slot_blocked', 'reference for page index and slot yielded'), YieldStat('page_locked_blocked', 'page acquire locked blocked'), YieldStat('page_read_blocked', 'page acquire read blocked'), YieldStat('page_sleep', 'page acquire time sleeping (usecs)'), + YieldStat('tree_descend_blocked', 'tree descend one level yielded for split page index update'), + YieldStat('txn_release_blocked', 'connection close blocked waiting for transaction state stabilization'), ] connection_stats = sorted(connection_stats, key=attrgetter('desc')) diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 8dfbc774419..d063f48ef9d 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "31af5d70a87cf1d99c7275bc8bc01d29e2cb0d2a", + "commit": "2d781c8cfeb2a1db8bd93e03ba35b302436e4ff3", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-3.4" diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index f0aa632551b..95d817850ef 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -156,8 +156,10 @@ __cursor_disable_bulk(WT_SESSION_IMPL *session, WT_BTREE *btree) * into a tree. 
Eviction is disabled when an empty tree is opened, and * it must only be enabled once. */ - if (__wt_atomic_cas8(&btree->original, 1, 0)) + if (__wt_atomic_cas8(&btree->original, 1, 0)) { + btree->evict_disabled_open = false; __wt_evict_file_exclusive_off(session); + } } /* diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index b55ad291c5e..5c4625044d3 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -153,6 +153,7 @@ void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) { WT_UPDATE **upd; + uint64_t yield_count; /* * If the page is still "deleted", it's as we left it, reset the state @@ -160,7 +161,7 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) * instantiated or being instantiated. Loop because it's possible for * the page to return to the deleted state if instantiation fails. */ - for (;; __wt_yield()) + for (yield_count = 0;; yield_count++, __wt_yield()) switch (ref->state) { case WT_REF_DISK: case WT_REF_READING: @@ -173,7 +174,7 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) */ if (__wt_atomic_casv32( &ref->state, WT_REF_DELETED, WT_REF_DISK)) - return; + goto done; break; case WT_REF_LOCKED: /* @@ -203,8 +204,10 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) */ __wt_free(session, ref->page_del->update_list); __wt_free(session, ref->page_del); - return; + goto done; } + +done: WT_STAT_CONN_INCRV(session, page_del_rollback_blocked, yield_count); } /* diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c index a0da7df0998..1e6405272f7 100644 --- a/src/third_party/wiredtiger/src/btree/bt_handle.c +++ b/src/third_party/wiredtiger/src/btree/bt_handle.c @@ -66,7 +66,6 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) WT_DATA_HANDLE *dhandle; WT_DECL_RET; size_t root_addr_size; - 
uint32_t mask; uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE]; const char *filename; bool creation, forced_salvage, readonly; @@ -75,15 +74,14 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) dhandle = session->dhandle; /* - * This may be a re-open of an underlying object and we have to clean - * up. We can't clear the operation flags, however, they're set by the - * connection handle software that called us. + * This may be a re-open, clean up the btree structure. + * Clear the fields that don't persist across a re-open. + * Clear all flags other than the operation flags (which are set by the + * connection handle software that called us). */ WT_RET(__btree_clear(session)); - - mask = F_MASK(btree, WT_BTREE_SPECIAL_FLAGS); - memset(btree, 0, sizeof(*btree)); - btree->flags = mask; + memset(btree, 0, WT_BTREE_CLEAR_SIZE); + F_CLR(btree, ~WT_BTREE_SPECIAL_FLAGS); /* Set the data handle first, our called functions reasonably use it. */ btree->dhandle = dhandle; @@ -185,13 +183,19 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) * * Files that can still be bulk-loaded cannot be evicted. * Permanently cache-resident files can never be evicted. - * Special operations don't enable eviction. (The underlying commands - * may turn on eviction, but it's their decision.) + * Special operations don't enable eviction. The underlying commands may + * turn on eviction (for example, verify turns on eviction while working + * a file to keep from consuming the cache), but it's their decision. If + * an underlying command reconfigures eviction, it must either clear the + * evict-disabled-open flag or restore the eviction configuration when + * finished so that handle close behaves correctly. 
*/ if (btree->original || F_ISSET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_REBALANCE | - WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) + WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) { WT_ERR(__wt_evict_file_exclusive_on(session)); + btree->evict_disabled_open = true; + } if (0) { err: WT_TRET(__wt_btree_close(session)); @@ -228,6 +232,15 @@ __wt_btree_close(WT_SESSION_IMPL *session) return (0); F_SET(btree, WT_BTREE_CLOSED); + /* + * If we turned eviction off and never turned it back on, do that now, + * otherwise the counter will be off. + */ + if (btree->evict_disabled_open) { + btree->evict_disabled_open = false; + __wt_evict_file_exclusive_off(session); + } + /* Discard any underlying block manager resources. */ if ((bm = btree->bm) != NULL) { btree->bm = NULL; diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c index c5948ec4ab5..b4f05c440ba 100644 --- a/src/third_party/wiredtiger/src/btree/bt_random.c +++ b/src/third_party/wiredtiger/src/btree/bt_random.c @@ -395,8 +395,7 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) */ for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) { n = skip; - WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip, - WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED)); + WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip)); if (n == skip) { if (skip == 0) break; diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index 86484feb7c9..c22b99c55d0 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -18,9 +18,16 @@ __ref_index_slot(WT_SESSION_IMPL *session, { WT_PAGE_INDEX *pindex; WT_REF **start, **stop, **p, **t; + uint64_t yield_count; uint32_t entries, slot; - for (;;) { + /* + * If we don't find our reference, the page split and our home + * pointer references the wrong page. 
When internal pages + * split, their WT_REF structure home values are updated; yield + * and wait for that to happen. + */ + for (yield_count = 0;; yield_count++, __wt_yield()) { /* * Copy the parent page's index value: the page can split at * any time, but the index's value is always valid, even if @@ -59,18 +66,13 @@ __ref_index_slot(WT_SESSION_IMPL *session, } } - /* - * If we don't find our reference, the page split and our home - * pointer references the wrong page. When internal pages - * split, their WT_REF structure home values are updated; yield - * and wait for that to happen. - */ - __wt_yield(); } found: WT_ASSERT(session, pindex->index[slot] == ref); *pindexp = pindex; *slotp = slot; + + WT_STAT_CONN_INCRV(session, page_index_slot_blocked, yield_count); } /* @@ -177,12 +179,13 @@ __ref_descend_prev( WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp) { WT_PAGE_INDEX *pindex; + uint64_t yield_count; /* * We're passed a child page into which we're descending, and on which * we have a hazard pointer. */ - for (;; __wt_yield()) { + for (yield_count = 0;; yield_count++, __wt_yield()) { /* * There's a split race when a cursor moving backwards through * the tree descends the tree. If we're splitting an internal @@ -242,6 +245,7 @@ __ref_descend_prev( break; } *pindexp = pindex; + WT_STAT_CONN_INCRV(session, tree_descend_blocked, yield_count); } /* @@ -497,29 +501,21 @@ restart: /* } /* - * Optionally skip leaf pages: skip all leaf pages if - * WT_READ_SKIP_LEAF is set, when the skip-leaf-count - * variable is non-zero, skip some count of leaf pages. - * If this page is disk-based, crack the cell to figure - * out it's a leaf page without reading it. + * Optionally skip leaf pages: when the skip-leaf-count + * variable is non-zero, skip some count of leaf pages, + * then take the next leaf page we can. * - * If skipping some number of leaf pages, decrement the - * count of pages to zero, and then take the next leaf - * page we can. 
Be cautious around the page decrement, - * if for some reason don't take this particular page, - * we can take the next one, and, there are additional - * tests/decrements when we're about to return a leaf - * page. + * The reason to do some of this work here (rather than + * in our caller), is because we can look at the cell + * and know it's a leaf page without reading it into + * memory. If this page is disk-based, crack the cell + * to figure out it's a leaf page without reading it. */ - if (skipleafcntp != NULL || LF_ISSET(WT_READ_SKIP_LEAF)) - if (__ref_is_leaf(ref)) { - if (LF_ISSET(WT_READ_SKIP_LEAF)) - break; - if (*skipleafcntp > 0) { - --*skipleafcntp; - break; - } - } + if (skipleafcntp != NULL && + *skipleafcntp > 0 && __ref_is_leaf(ref)) { + --*skipleafcntp; + break; + } ret = __wt_page_swap(session, couple, ref, WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags); @@ -626,34 +622,18 @@ descend: empty_internal = true; session, ref, &pindex); slot = pindex->entries - 1; } - } else { - /* - * At the lowest tree level (considering a leaf - * page), turn off the initial-descent state. - * Descent race tests are different when moving - * through the tree vs. the initial descent. - */ - initial_descent = false; - - /* - * Optionally skip leaf pages, the second half. - * We didn't have an on-page cell to figure out - * if it was a leaf page, we had to acquire the - * hazard pointer and look at the page. - */ - if (skipleafcntp != NULL || - LF_ISSET(WT_READ_SKIP_LEAF)) { - if (LF_ISSET(WT_READ_SKIP_LEAF)) - break; - if (*skipleafcntp > 0) { - --*skipleafcntp; - break; - } - } - - *refp = ref; - goto done; + continue; } + + /* + * The tree-walk restart code knows we return any leaf + * page we acquire (never hazard-pointer coupling on + * after acquiring a leaf page), and asserts no restart + * happens while holding a leaf page. This page must be + * returned to our caller. 
+ */ + *refp = ref; + goto done; } } @@ -690,8 +670,29 @@ __wt_tree_walk_count(WT_SESSION_IMPL *session, * of leaf pages before returning. */ int -__wt_tree_walk_skip(WT_SESSION_IMPL *session, - WT_REF **refp, uint64_t *skipleafcntp, uint32_t flags) +__wt_tree_walk_skip( + WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp) { - return (__tree_walk_internal(session, refp, NULL, skipleafcntp, flags)); + /* + * Optionally skip leaf pages, the second half. The tree-walk function + * didn't have an on-page cell it could use to figure out if the page + * was a leaf page or not, it had to acquire the hazard pointer and look + * at the page. The tree-walk code never acquires a hazard pointer on a + * leaf page without returning it, and it's not trivial to change that. + * So, the tree-walk code returns all leaf pages here and we deal with + * decrementing the count. + */ + do { + WT_RET(__tree_walk_internal(session, refp, NULL, skipleafcntp, + WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED)); + + /* + * The walk skipped internal pages, any page returned must be a + * leaf page. + */ + if (*skipleafcntp > 0) + --*skipleafcntp; + } while (*skipleafcntp > 0); + + return (0); } diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index 68d45678965..00d559881dc 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -1086,6 +1086,41 @@ err: /* WT_TRET(wt_session->close(wt_session, config)); } + /* + * Perform a system-wide checkpoint so that all tables are consistent + * with each other. Do this before shutting down all the subsystems. + * We have shut down all user sessions, but send in true for waiting + * for internal races. 
+ */ + if (!F_ISSET(conn, WT_CONN_IN_MEMORY | WT_CONN_READONLY)) { + s = NULL; + WT_TRET(__wt_open_internal_session( + conn, "close_ckpt", true, 0, &s)); + if (s != NULL) { + const char *checkpoint_cfg[] = { + WT_CONFIG_BASE(session, WT_SESSION_checkpoint), + NULL + }; + wt_session = &s->iface; + WT_TRET(__wt_txn_checkpoint(s, checkpoint_cfg, true)); + + /* + * Mark the metadata dirty so we flush it on close, + * allowing recovery to be skipped. + */ + WT_WITH_DHANDLE(s, WT_SESSION_META_DHANDLE(s), + __wt_tree_modify_set(s)); + + WT_TRET(wt_session->close(wt_session, config)); + } + } + + if (ret != 0) { + __wt_err(session, ret, + "failure during close, disabling further writes"); + F_SET(conn, WT_CONN_PANIC); + } + WT_TRET(__wt_connection_close(conn)); /* We no longer have a session, don't try to update it. */ diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c index 1816e66b0b7..2560ca47268 100644 --- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c +++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c @@ -317,6 +317,9 @@ __wt_conn_btree_open( WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_CLOSING_NO_MORE_OPENS)); + /* Turn off eviction. */ + WT_RET(__wt_evict_file_exclusive_on(session)); + /* * If the handle is already open, it has to be closed so it can be * reopened with a new configuration. @@ -330,7 +333,7 @@ __wt_conn_btree_open( * in the tree that can block the close. */ if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) - WT_RET(__wt_conn_btree_sync_and_close(session, false, false)); + WT_ERR(__wt_conn_btree_sync_and_close(session, false, false)); /* Discard any previous configuration, set up the new configuration. 
*/ __conn_btree_config_clear(session); @@ -374,6 +377,8 @@ __wt_conn_btree_open( err: F_CLR(btree, WT_BTREE_SPECIAL_FLAGS); } + __wt_evict_file_exclusive_off(session); + return (ret); } @@ -673,8 +678,8 @@ restart: continue; WT_WITH_DHANDLE(session, dhandle, - WT_TRET(__wt_conn_dhandle_discard_single( - session, true, F_ISSET(conn, WT_CONN_IN_MEMORY)))); + WT_TRET(__wt_conn_dhandle_discard_single(session, true, + F_ISSET(conn, WT_CONN_IN_MEMORY | WT_CONN_PANIC)))); goto restart; } @@ -699,8 +704,8 @@ restart: /* Close the metadata file handle. */ while ((dhandle = TAILQ_FIRST(&conn->dhqh)) != NULL) WT_WITH_DHANDLE(session, dhandle, - WT_TRET(__wt_conn_dhandle_discard_single( - session, true, F_ISSET(conn, WT_CONN_IN_MEMORY)))); + WT_TRET(__wt_conn_dhandle_discard_single(session, true, + F_ISSET(conn, WT_CONN_IN_MEMORY | WT_CONN_PANIC)))); return (ret); } diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c index d2ed314fd2e..bee1b0443f5 100644 --- a/src/third_party/wiredtiger/src/conn/conn_log.c +++ b/src/third_party/wiredtiger/src/conn/conn_log.c @@ -375,6 +375,7 @@ __log_file_server(void *arg) WT_LOG *log; WT_LSN close_end_lsn, min_lsn; WT_SESSION_IMPL *session; + uint64_t yield_count; uint32_t filenum; bool locked; @@ -382,6 +383,7 @@ __log_file_server(void *arg) conn = S2C(session); log = conn->log; locked = false; + yield_count = 0; while (F_ISSET(conn, WT_CONN_SERVER_LOG)) { /* * If there is a log file to close, make sure any outstanding @@ -512,6 +514,7 @@ __log_file_server(void *arg) * thread a chance to run and try again in * this case. 
*/ + yield_count++; __wt_yield(); continue; } @@ -524,6 +527,7 @@ __log_file_server(void *arg) if (0) { err: __wt_err(session, ret, "log close server error"); } + WT_STAT_CONN_INCRV(session, log_server_sync_blocked, yield_count); if (locked) __wt_spin_unlock(session, &log->log_sync_lock); return (WT_THREAD_RET_VALUE); @@ -902,7 +906,7 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_cond_alloc(session, "log sync", &log->log_sync_cond)); WT_RET(__wt_cond_alloc(session, "log write", &log->log_write_cond)); WT_RET(__wt_log_open(session)); - WT_RET(__wt_log_slot_init(session)); + WT_RET(__wt_log_slot_init(session, true)); return (0); } diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c index eb3c79422a0..649bfa7c81f 100644 --- a/src/third_party/wiredtiger/src/conn/conn_open.c +++ b/src/third_party/wiredtiger/src/conn/conn_open.c @@ -91,6 +91,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) if (txn_global->oldest_id == txn_global->current && txn_global->metadata_pinned == txn_global->current) break; + WT_STAT_CONN_INCR(session, txn_release_blocked); __wt_yield(); } @@ -143,7 +144,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) * conditional because we allocate the log path so that printlog can * run without running logging or recovery. 
*/ - if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && + if (ret == 0 && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE)) WT_TRET(__wt_txn_checkpoint_log( session, true, WT_TXN_LOG_CKPT_STOP, NULL)); diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index fdf68841b85..b9b1a7783fe 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -1452,26 +1452,23 @@ retry: while (slot < max_entries) { /* * Re-check the "no eviction" flag, used to enforce exclusive - * access when a handle is being closed. If not set, remember - * the file to visit first, next loop. + * access when a handle is being closed. * * Only try to acquire the lock and simply continue if we fail; * the lock is held while the thread turning off eviction clears * the tree's current eviction point, and part of the process is * waiting on this thread to acknowledge that action. + * + * If a handle is being discarded, it will still be marked open, + * but won't have a root page. */ if (btree->evict_disabled == 0 && !__wt_spin_trylock(session, &cache->evict_walk_lock)) { - if (btree->evict_disabled == 0) { + if (btree->evict_disabled == 0 && + btree->root.page != NULL) { /* - * Assert the handle has a root page: eviction - * should have been locked out if the tree is - * being discarded or the root page is changing. - * As this has not always been the case, assert - * to debug that change. + * Remember the file to visit first, next loop. */ - WT_ASSERT(session, btree->root.page != NULL); - cache->evict_file_next = dhandle; WT_WITH_DHANDLE(session, dhandle, ret = __evict_walk_file( @@ -1860,6 +1857,10 @@ fast: /* If the page can't be evicted, give up. 
*/ WT_STAT_CONN_INCRV( session, cache_eviction_pages_queued, (u_int)(evict - start)); + __wt_verbose(session, WT_VERB_EVICTSERVER, + "%s walk: seen %" PRIu64 ", queued %" PRIu64, + session->dhandle->name, pages_seen, pages_queued); + /* * If we couldn't find the number of pages we were looking for, skip * the tree next time. @@ -2442,14 +2443,23 @@ __wt_verbose_dump_cache(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; + u_int pct; uint64_t total_bytes, total_dirty_bytes; conn = S2C(session); total_bytes = total_dirty_bytes = 0; + pct = 0; /* [-Werror=uninitialized] */ WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); WT_RET(__wt_msg(session, "cache dump")); + WT_RET(__wt_msg(session, + "cache full: %s", __wt_cache_full(session) ? "yes" : "no")); + WT_RET(__wt_msg(session, "cache clean check: %s (%u%%)", + __wt_eviction_clean_needed(session, &pct) ? "yes" : "no", pct)); + WT_RET(__wt_msg(session, "cache dirty check: %s (%u%%)", + __wt_eviction_dirty_needed(session, &pct) ? "yes" : "no", pct)); + for (dhandle = NULL;;) { WT_WITH_HANDLE_LIST_READ_LOCK(session, WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q)); diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index d0b21b17965..e965724dffe 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -714,7 +714,7 @@ struct __wt_page { * Related information for fast-delete, on-disk pages. */ struct __wt_page_deleted { - uint64_t txnid; /* Transaction ID */ + volatile uint64_t txnid; /* Transaction ID */ WT_UPDATE **update_list; /* List of updates for abort */ }; @@ -904,7 +904,7 @@ struct __wt_ikey { * list. 
*/ WT_PACKED_STRUCT_BEGIN(__wt_update) - uint64_t txnid; /* update transaction */ + volatile uint64_t txnid; /* Transaction ID */ WT_UPDATE *next; /* forward-linked list */ diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h index 28fe1b94b23..8712a404b13 100644 --- a/src/third_party/wiredtiger/src/include/btree.h +++ b/src/third_party/wiredtiger/src/include/btree.h @@ -142,12 +142,30 @@ struct __wt_btree { uint64_t bytes_dirty_intl; /* Bytes in dirty internal pages. */ uint64_t bytes_dirty_leaf; /* Bytes in dirty leaf pages. */ + /* + * We flush pages from the tree (in order to make checkpoint faster), + * without a high-level lock. To avoid multiple threads flushing at + * the same time, lock the tree. + */ + WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */ + + /* + * All of the following fields live at the end of the structure so it's + * easier to clear everything but the fields that persist. + */ +#define WT_BTREE_CLEAR_SIZE (offsetof(WT_BTREE, evict_ref)) + + /* + * Eviction information is maintained in the btree handle, but owned by + * eviction, not the btree code. + */ WT_REF *evict_ref; /* Eviction thread's location */ uint64_t evict_priority; /* Relative priority of cached pages */ u_int evict_walk_period; /* Skip this many LRU walks */ u_int evict_walk_saved; /* Saved walk skips for checkpoints */ u_int evict_walk_skips; /* Number of walks skipped */ int evict_disabled; /* Eviction disabled count */ + bool evict_disabled_open;/* Eviction disabled on open */ volatile uint32_t evict_busy; /* Count of threads in eviction */ int evict_start_type; /* Start position for eviction walk (see WT_EVICT_WALK_START). */ @@ -155,13 +173,6 @@ struct __wt_btree { WT_CKPT_OFF, WT_CKPT_PREPARE, WT_CKPT_RUNNING } checkpointing; /* Checkpoint in progress */ - /* - * We flush pages from the tree (in order to make checkpoint faster), - * without a high-level lock. 
To avoid multiple threads flushing at - * the same time, lock the tree. - */ - WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */ - /* Flags values up to 0xff are reserved for WT_DHANDLE_* */ #define WT_BTREE_ALLOW_SPLITS 0x000100 /* Allow splits, even with no evict */ #define WT_BTREE_BULK 0x000200 /* Bulk-load handle */ diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index f74732684f5..74611de1131 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -314,9 +314,10 @@ struct __wt_connection_impl { #define WT_CONN_LOG_ARCHIVE 0x01 /* Archive is enabled */ #define WT_CONN_LOG_ENABLED 0x02 /* Logging is enabled */ #define WT_CONN_LOG_EXISTED 0x04 /* Log files found */ -#define WT_CONN_LOG_RECOVER_DONE 0x08 /* Recovery completed */ -#define WT_CONN_LOG_RECOVER_ERR 0x10 /* Error if recovery required */ -#define WT_CONN_LOG_ZERO_FILL 0x20 /* Manually zero files */ +#define WT_CONN_LOG_RECOVER_DIRTY 0x08 /* Recovering unclean */ +#define WT_CONN_LOG_RECOVER_DONE 0x10 /* Recovery completed */ +#define WT_CONN_LOG_RECOVER_ERR 0x20 /* Error if recovery required */ +#define WT_CONN_LOG_ZERO_FILL 0x40 /* Manually zero files */ uint32_t log_flags; /* Global logging configuration */ WT_CONDVAR *log_cond; /* Log server wait mutex */ WT_SESSION_IMPL *log_session; /* Log server session */ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 12233c0247a..e77de41344c 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -181,7 +181,7 @@ extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, cons extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *tag, WT_ITEM *buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF 
**refp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_tree_walk_count(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_tree_walk_skip(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_tree_walk_skip( WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove, bool exclusive) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -369,6 +369,7 @@ extern int __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, bo extern void __wt_log_written_reset(WT_SESSION_IMPL *session); extern int __wt_log_get_all_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp, uint32_t *maxid, bool active_only) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_log_extract_lognum( WT_SESSION_IMPL *session, const char *name, uint32_t *id) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_log_reset(WT_SESSION_IMPL *session, uint32_t lognum) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_log_allocfile( WT_SESSION_IMPL *session, uint32_t lognum, const char *dest) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_log_remove(WT_SESSION_IMPL *session, const char *file_prefix, 
uint32_t lognum) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -405,7 +406,7 @@ extern int __wt_logop_row_truncate_print(WT_SESSION_IMPL *session, const uint8_t extern int __wt_txn_op_printlog(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int __wt_log_slot_switch(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool retry, bool forced, bool *did_work) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_log_slot_init(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_log_slot_init(WT_SESSION_IMPL *session, bool alloc) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_log_slot_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int64_t __wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size); @@ -732,6 +733,8 @@ extern int __wt_thread_group_create( WT_SESSION_IMPL *session, WT_THREAD_GROUP * extern int __wt_thread_group_destroy(WT_SESSION_IMPL *session, WT_THREAD_GROUP *group) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_thread_group_start_one( WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, bool wait) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_thread_group_stop_one( WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, bool wait) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); +extern void __wt_seconds(WT_SESSION_IMPL *session, time_t *timep); extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session); extern void 
__wt_txn_get_snapshot(WT_SESSION_IMPL *session); extern int __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/third_party/wiredtiger/src/include/extern_posix.h b/src/third_party/wiredtiger/src/include/extern_posix.h index c0ed056c7b6..9e32e86e64c 100644 --- a/src/third_party/wiredtiger/src/include/extern_posix.h +++ b/src/third_party/wiredtiger/src/include/extern_posix.h @@ -28,5 +28,5 @@ extern int __wt_vsnprintf_len_incr( char *buf, size_t size, size_t *retsizep, co extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_thread_id(char *buf, size_t buflen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); +extern void __wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp); extern void __wt_yield(void) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); diff --git a/src/third_party/wiredtiger/src/include/extern_win.h b/src/third_party/wiredtiger/src/include/extern_win.h index d548ee0b2ec..85db8175615 100644 --- a/src/third_party/wiredtiger/src/include/extern_win.h +++ b/src/third_party/wiredtiger/src/include/extern_win.h @@ -26,7 +26,7 @@ extern int __wt_vsnprintf_len_incr( char *buf, size_t size, size_t *retsizep, co extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_thread_id(char *buf, size_t 
buflen) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp); +extern void __wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp); extern int __wt_to_utf16_string( WT_SESSION_IMPL *session, const char*utf8, WT_ITEM **outbuf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_to_utf8_string( WT_SESSION_IMPL *session, const wchar_t*wide, WT_ITEM **outbuf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern DWORD __wt_getlasterror(void); diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h index f26a45c68f5..d7c0e0f9472 100644 --- a/src/third_party/wiredtiger/src/include/flags.h +++ b/src/third_party/wiredtiger/src/include/flags.h @@ -47,9 +47,8 @@ #define WT_READ_PREV 0x00000080 #define WT_READ_RESTART_OK 0x00000100 #define WT_READ_SKIP_INTL 0x00000200 -#define WT_READ_SKIP_LEAF 0x00000400 -#define WT_READ_TRUNCATE 0x00000800 -#define WT_READ_WONT_NEED 0x00001000 +#define WT_READ_TRUNCATE 0x00000400 +#define WT_READ_WONT_NEED 0x00000800 #define WT_SESSION_CAN_WAIT 0x00000001 #define WT_SESSION_INTERNAL 0x00000002 #define WT_SESSION_LOCKED_CHECKPOINT 0x00000004 diff --git a/src/third_party/wiredtiger/src/include/misc.i b/src/third_party/wiredtiger/src/include/misc.i index fad10f01103..eb99de3dcab 100644 --- a/src/third_party/wiredtiger/src/include/misc.i +++ b/src/third_party/wiredtiger/src/include/misc.i @@ -41,45 +41,6 @@ __wt_strdup(WT_SESSION_IMPL *session, const char *str, void *retp) } /* - * __wt_seconds -- - * Return the seconds since the Epoch. - */ -static inline void -__wt_seconds(WT_SESSION_IMPL *session, time_t *timep) -{ - struct timespec t; - - __wt_epoch(session, &t); - - *timep = t.tv_sec; -} - -/* - * __wt_time_check_monotonic -- - * Check and prevent time running backward. 
If we detect that it has, we - * set the time structure to the previous values, making time stand still - * until we see a time in the future of the highest value seen so far. - */ -static inline void -__wt_time_check_monotonic(WT_SESSION_IMPL *session, struct timespec *tsp) -{ - /* - * Detect time going backward. If so, use the last - * saved timestamp. - */ - if (session == NULL) - return; - - if (tsp->tv_sec < session->last_epoch.tv_sec || - (tsp->tv_sec == session->last_epoch.tv_sec && - tsp->tv_nsec < session->last_epoch.tv_nsec)) { - WT_STAT_CONN_INCR(session, time_travel); - *tsp = session->last_epoch; - } else - session->last_epoch = *tsp; -} - -/* * __wt_verbose -- * Verbose message. * diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index db48a841571..01e622a5695 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -475,11 +475,19 @@ struct __wt_connection_stats { int64_t thread_write_active; int64_t application_evict_time; int64_t application_cache_time; + int64_t txn_release_blocked; + int64_t conn_close_blocked_lsm; + int64_t dhandle_lock_blocked; + int64_t log_server_sync_blocked; int64_t page_busy_blocked; int64_t page_forcible_evict_blocked; int64_t page_locked_blocked; int64_t page_read_blocked; int64_t page_sleep; + int64_t page_del_rollback_blocked; + int64_t child_modify_blocked_page; + int64_t page_index_slot_blocked; + int64_t tree_descend_blocked; int64_t txn_snapshots_created; int64_t txn_snapshots_dropped; int64_t txn_begin; diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index 314c948e4d1..39273a1995c 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -148,16 +148,6 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session) } /* - * __wt_txn_committed -- - * Return if a transaction has been committed. 
- */ -static inline bool -__wt_txn_committed(WT_SESSION_IMPL *session, uint64_t id) -{ - return (WT_TXNID_LT(id, S2C(session)->txn_global.last_running)); -} - -/* * __wt_txn_visible_all -- * Check if a given transaction ID is "globally visible". This is, if * all sessions in the system will see the transaction ID including the diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index 821efdf5fa1..b2f49ef030d 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -4806,72 +4806,94 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_APPLICATION_EVICT_TIME 1216 /*! thread-yield: application thread time waiting for cache (usecs) */ #define WT_STAT_CONN_APPLICATION_CACHE_TIME 1217 +/*! + * thread-yield: connection close blocked waiting for transaction state + * stabilization + */ +#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1218 +/*! thread-yield: connection close yielded for lsm manager shutdown */ +#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1219 +/*! thread-yield: data handle lock yielded */ +#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1220 +/*! thread-yield: log server sync yielded for log write */ +#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1221 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1218 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1222 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1219 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1223 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1220 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1224 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1221 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1225 /*! 
thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1222 +#define WT_STAT_CONN_PAGE_SLEEP 1226 +/*! thread-yield: page delete rollback yielded for instantiation */ +#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1227 +/*! thread-yield: page reconciliation yielded due to child modification */ +#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1228 +/*! thread-yield: reference for page index and slot yielded */ +#define WT_STAT_CONN_PAGE_INDEX_SLOT_BLOCKED 1229 +/*! + * thread-yield: tree descend one level yielded for split page index + * update + */ +#define WT_STAT_CONN_TREE_DESCEND_BLOCKED 1230 /*! transaction: number of named snapshots created */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1223 +#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1231 /*! transaction: number of named snapshots dropped */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1224 +#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1232 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1225 +#define WT_STAT_CONN_TXN_BEGIN 1233 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1226 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1234 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1227 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1235 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1228 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1236 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1229 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1237 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1230 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1238 /*! 
transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1231 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1239 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1232 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1240 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1233 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1241 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1234 +#define WT_STAT_CONN_TXN_CHECKPOINT 1242 /*! * transaction: transaction checkpoints skipped because database was * clean */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1235 +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1243 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1236 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1244 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1237 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1245 /*! * transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1238 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1246 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1239 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1247 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1240 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1248 /*! * transaction: transaction range of IDs currently pinned by named * snapshots */ -#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1241 +#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1249 /*! 
transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1242 +#define WT_STAT_CONN_TXN_SYNC 1250 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1243 +#define WT_STAT_CONN_TXN_COMMIT 1251 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1244 +#define WT_STAT_CONN_TXN_ROLLBACK 1252 /*! * @} diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index 803d3e8dfab..486744d2e7f 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -8,6 +8,7 @@ #include "wt_internal.h" +static int __log_newfile(WT_SESSION_IMPL *, bool, bool *); static int __log_openfile( WT_SESSION_IMPL *, WT_FH **, const char *, uint32_t, uint32_t); static int __log_write_internal( @@ -442,6 +443,59 @@ __wt_log_extract_lognum( } /* + * __wt_log_reset -- + * Reset the existing log file to after the given file number. + * Called from recovery when toggling logging back on, it was off + * the previous open but it was on earlier before that toggle. + */ +int +__wt_log_reset(WT_SESSION_IMPL *session, uint32_t lognum) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LOG *log; + uint32_t old_lognum; + u_int i, logcount; + char **logfiles; + + conn = S2C(session); + log = conn->log; + + if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) || + log->fileid > lognum) + return (0); + + WT_ASSERT(session, F_ISSET(conn, WT_CONN_RECOVERING)); + WT_ASSERT(session, !F_ISSET(conn, WT_CONN_READONLY)); + /* + * We know we're single threaded and called from recovery only when + * toggling logging back on. Therefore the only log files we have are + * old and outdated and the new one created when logging opened before + * recovery. We have to remove all old log files first and then create + * the new one so that log file numbers are contiguous in the file + * system. 
+ */ + WT_RET(__wt_close(session, &log->log_fh)); + WT_RET(__log_get_files(session, + WT_LOG_FILENAME, &logfiles, &logcount)); + for (i = 0; i < logcount; i++) { + WT_ERR(__wt_log_extract_lognum( + session, logfiles[i], &old_lognum)); + WT_ASSERT(session, old_lognum < lognum || lognum == 1); + WT_ERR(__wt_log_remove(session, WT_LOG_FILENAME, old_lognum)); + } + log->fileid = lognum; + + /* Send in true to update connection creation LSNs. */ + WT_WITH_SLOT_LOCK(session, log, + ret = __log_newfile(session, true, NULL)); + WT_ERR(__wt_log_slot_init(session, false)); +err: WT_TRET( + __wt_fs_directory_list_free(session, &logfiles, logcount)); + return (ret); +} + +/* * __log_zero -- * Zero a log file. */ diff --git a/src/third_party/wiredtiger/src/log/log_slot.c b/src/third_party/wiredtiger/src/log/log_slot.c index 97e317ce68c..b23d589c8e2 100644 --- a/src/third_party/wiredtiger/src/log/log_slot.c +++ b/src/third_party/wiredtiger/src/log/log_slot.c @@ -401,7 +401,7 @@ __wt_log_slot_switch(WT_SESSION_IMPL *session, * Initialize the slot array. */ int -__wt_log_slot_init(WT_SESSION_IMPL *session) +__wt_log_slot_init(WT_SESSION_IMPL *session, bool alloc) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; @@ -423,15 +423,17 @@ __wt_log_slot_init(WT_SESSION_IMPL *session) * switch log files very aggressively. Scale back the buffer for * small log file sizes. 
*/ - log->slot_buf_size = (uint32_t)WT_MIN( - (size_t)conn->log_file_max / 10, WT_LOG_SLOT_BUF_SIZE); - for (i = 0; i < WT_SLOT_POOL; i++) { - WT_ERR(__wt_buf_init(session, - &log->slot_pool[i].slot_buf, log->slot_buf_size)); - F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS); + if (alloc) { + log->slot_buf_size = (uint32_t)WT_MIN( + (size_t)conn->log_file_max / 10, WT_LOG_SLOT_BUF_SIZE); + for (i = 0; i < WT_SLOT_POOL; i++) { + WT_ERR(__wt_buf_init(session, + &log->slot_pool[i].slot_buf, log->slot_buf_size)); + F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS); + } + WT_STAT_CONN_SET(session, + log_buffer_size, log->slot_buf_size * WT_SLOT_POOL); } - WT_STAT_CONN_SET(session, - log_buffer_size, log->slot_buf_size * WT_SLOT_POOL); /* * Set up the available slot from the pool the first time. */ diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c index b7d9086d10e..62da094b5f7 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_manager.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c @@ -295,8 +295,10 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session) manager->lsm_workers == 0); if (manager->lsm_workers > 0) { /* Wait for the main LSM manager thread to finish. */ - while (!F_ISSET(manager, WT_LSM_MANAGER_SHUTDOWN)) + while (!F_ISSET(manager, WT_LSM_MANAGER_SHUTDOWN)) { + WT_STAT_CONN_INCR(session, conn_close_blocked_lsm); __wt_yield(); + } /* Clean up open LSM handles. */ ret = __wt_lsm_tree_close_all(session); diff --git a/src/third_party/wiredtiger/src/meta/meta_table.c b/src/third_party/wiredtiger/src/meta/meta_table.c index aca69d0e6a2..895b8a9c565 100644 --- a/src/third_party/wiredtiger/src/meta/meta_table.c +++ b/src/third_party/wiredtiger/src/meta/meta_table.c @@ -230,12 +230,23 @@ __wt_metadata_remove(WT_SESSION_IMPL *session, const char *key) WT_RET_MSG(session, EINVAL, "%s: remove not supported on the turtle file", key); + /* + * Take, release, and reacquire the metadata cursor. 
It's complicated, + * but that way the underlying meta-tracking function doesn't have to + * open a second metadata cursor, it can use the session's cached one. + */ WT_RET(__wt_metadata_cursor(session, &cursor)); cursor->set_key(cursor, key); WT_ERR(cursor->search(cursor)); + WT_ERR(__wt_metadata_cursor_release(session, &cursor)); + if (WT_META_TRACKING(session)) WT_ERR(__wt_meta_track_update(session, key)); - WT_ERR(cursor->remove(cursor)); + + WT_ERR(__wt_metadata_cursor(session, &cursor)); + cursor->set_key(cursor, key); + ret = cursor->remove(cursor); + err: WT_TRET(__wt_metadata_cursor_release(session, &cursor)); return (ret); } diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c index fe010b62305..e4a6683dee9 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c +++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c @@ -19,11 +19,19 @@ __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) WT_DECL_RET; WT_RET(__wt_calloc_one(session, &cond)); - WT_ERR(pthread_mutex_init(&cond->mtx, NULL)); - /* Initialize the condition variable to permit self-blocking. */ +#ifdef HAVE_PTHREAD_COND_MONOTONIC + { + pthread_condattr_t condattr; + + WT_ERR(pthread_condattr_init(&condattr)); + WT_ERR(pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC)); + WT_ERR(pthread_cond_init(&cond->cond, &condattr)); + } +#else WT_ERR(pthread_cond_init(&cond->cond, NULL)); +#endif cond->name = name; cond->waiters = 0; @@ -79,7 +87,26 @@ __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, goto skipping; if (usecs > 0) { - __wt_epoch(session, &ts); + /* + * Get the current time as the basis for calculating when the + * wait should end. Prefer a monotonic clock source to avoid + * unexpectedly long sleeps when the system clock is adjusted. 
+ * + * Failing that, query the time directly and don't attempt to + * correct for the clock moving backwards, which would result + * in a sleep that is too long by however much the clock is + * updated. This isn't as good as a monotonic clock source but + * makes the window of vulnerability smaller (i.e., the + * calculated time is only incorrect if the system clock + * changes in between us querying it and waiting). + */ +#ifdef HAVE_PTHREAD_COND_MONOTONIC + WT_SYSCALL_RETRY(clock_gettime(CLOCK_MONOTONIC, &ts), ret); + if (ret != 0) + WT_PANIC_MSG(session, ret, "clock_gettime"); +#else + __wt_epoch_raw(session, &ts); +#endif ts.tv_sec += (time_t) (((uint64_t)ts.tv_nsec + WT_THOUSAND * usecs) / WT_BILLION); ts.tv_nsec = (long) diff --git a/src/third_party/wiredtiger/src/os_posix/os_time.c b/src/third_party/wiredtiger/src/os_posix/os_time.c index fe337fea7cf..25a08d62355 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_time.c +++ b/src/third_party/wiredtiger/src/os_posix/os_time.c @@ -9,14 +9,12 @@ #include "wt_internal.h" /* - * __wt_epoch -- - * Return the time since the Epoch. + * __wt_epoch_raw -- + * Return the time since the Epoch as reported by a system call. */ void -__wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) - WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) +__wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp) { - struct timespec tmp; WT_DECL_RET; /* @@ -28,19 +26,10 @@ __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) tsp->tv_sec = 0; tsp->tv_nsec = 0; - /* - * Read into a local variable so that we're comparing the correct - * value when we check for monotonic increasing time. There are - * many places we read into an unlocked global variable. 
- */ #if defined(HAVE_CLOCK_GETTIME) - WT_SYSCALL_RETRY(clock_gettime(CLOCK_REALTIME, &tmp), ret); - if (ret == 0) { - __wt_time_check_monotonic(session, &tmp); - tsp->tv_sec = tmp.tv_sec; - tsp->tv_nsec = tmp.tv_nsec; + WT_SYSCALL_RETRY(clock_gettime(CLOCK_REALTIME, tsp), ret); + if (ret == 0) return; - } WT_PANIC_MSG(session, ret, "clock_gettime"); #elif defined(HAVE_GETTIMEOFDAY) { @@ -48,10 +37,8 @@ __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) WT_SYSCALL_RETRY(gettimeofday(&v, NULL), ret); if (ret == 0) { - tmp.tv_sec = v.tv_sec; - tmp.tv_nsec = v.tv_usec * WT_THOUSAND; - __wt_time_check_monotonic(session, &tmp); - *tsp = tmp; + tsp->tv_sec = v.tv_sec; + tsp->tv_nsec = v.tv_usec * WT_THOUSAND; return; } WT_PANIC_MSG(session, ret, "gettimeofday"); diff --git a/src/third_party/wiredtiger/src/os_win/os_time.c b/src/third_party/wiredtiger/src/os_win/os_time.c index ba71341ab22..84c06bed6e5 100644 --- a/src/third_party/wiredtiger/src/os_win/os_time.c +++ b/src/third_party/wiredtiger/src/os_win/os_time.c @@ -9,24 +9,23 @@ #include "wt_internal.h" /* - * __wt_epoch -- - * Return the time since the Epoch. + * __wt_epoch_raw -- + * Return the time since the Epoch as reported by the system. 
*/ void -__wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) +__wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp) { - struct timespec tmp; FILETIME time; uint64_t ns100; + WT_UNUSED(session); + GetSystemTimeAsFileTime(&time); ns100 = (((int64_t)time.dwHighDateTime << 32) + time.dwLowDateTime) - 116444736000000000LL; - tmp.tv_sec = ns100 / 10000000; - tmp.tv_nsec = (long)((ns100 % 10000000) * 100); - __wt_time_check_monotonic(session, &tmp); - *tsp = tmp; + tsp->tv_sec = ns100 / 10000000; + tsp->tv_nsec = (long)((ns100 % 10000000) * 100); } /* diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index e59d9796352..55e2c62ac01 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -45,7 +45,9 @@ typedef struct { uint64_t orig_btree_checkpoint_gen; uint64_t orig_txn_checkpoint_gen; - /* Track the page's maximum transaction ID. */ + /* Track the oldest transaction running when reconciliation starts. */ + uint64_t last_running; + uint64_t max_txn; /* Track if all updates were skipped. */ @@ -849,6 +851,16 @@ __rec_write_init(WT_SESSION_IMPL *session, WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen); /* + * Cache the oldest running transaction ID. This is used to check + * whether updates seen by reconciliation have committed. We keep a + * cached copy to avoid races where a concurrent transaction could + * abort while reconciliation is examining its updates. This way, any + * transaction running when reconciliation starts is considered + * uncommitted. + */ + WT_ORDERED_READ(r->last_running, S2C(session)->txn_global.last_running); + + /* * Lookaside table eviction is configured when eviction gets aggressive, * adjust the flags for cases we don't support. 
*/ @@ -1159,11 +1171,13 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * When reconciling for eviction, track whether any * uncommitted updates are found. */ - if (__wt_txn_committed(session, txnid)) { - if (*updp == NULL) - *updp = upd; - } else + if (WT_TXNID_LE(r->last_running, txnid)) { skipped = true; + continue; + } + + if (*updp == NULL) + *updp = upd; } else { /* * Checkpoint can only write updates visible as of its @@ -1562,7 +1576,7 @@ __rec_child_modify(WT_SESSION_IMPL *session, * not reserved for our exclusive use, there are other page states that * must be considered. */ - for (;; __wt_yield()) + for (;; __wt_yield()) { switch (r->tested_ref_state = ref->state) { case WT_REF_DISK: /* On disk, not modified by definition. */ @@ -1673,6 +1687,8 @@ __rec_child_modify(WT_SESSION_IMPL *session, WT_ILLEGAL_VALUE(session); } + WT_STAT_CONN_INCR(session, child_modify_blocked_page); + } in_memory: /* diff --git a/src/third_party/wiredtiger/src/schema/schema_open.c b/src/third_party/wiredtiger/src/schema/schema_open.c index 44bd66e011a..081650d74a8 100644 --- a/src/third_party/wiredtiger/src/schema/schema_open.c +++ b/src/third_party/wiredtiger/src/schema/schema_open.c @@ -425,37 +425,40 @@ __schema_open_table(WT_SESSION_IMPL *session, WT_DECL_RET; WT_TABLE *table; const char *tconfig; - char *tablename; *tablep = NULL; cursor = NULL; table = NULL; - tablename = NULL; WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_TABLE)); + WT_ERR(__wt_calloc_one(session, &table)); + table->name_hash = __wt_hash_city64(name, namelen); + WT_ERR(__wt_scr_alloc(session, 0, &buf)); WT_ERR(__wt_buf_fmt(session, buf, "table:%.*s", (int)namelen, name)); - WT_ERR(__wt_strndup(session, buf->data, buf->size, &tablename)); + WT_ERR(__wt_strndup(session, buf->data, buf->size, &table->name)); + /* + * Don't hold the metadata cursor pinned, we call functions that use it + * to retrieve column group information. 
+ */ WT_ERR(__wt_metadata_cursor(session, &cursor)); - cursor->set_key(cursor, tablename); - WT_ERR(cursor->search(cursor)); - WT_ERR(cursor->get_value(cursor, &tconfig)); - - WT_ERR(__wt_calloc_one(session, &table)); - table->name = tablename; - tablename = NULL; - table->name_hash = __wt_hash_city64(name, namelen); - - WT_ERR(__wt_config_getones(session, tconfig, "columns", &cval)); + cursor->set_key(cursor, table->name); + if ((ret = cursor->search(cursor)) == 0 && + (ret = cursor->get_value(cursor, &tconfig)) == 0) + ret = __wt_strdup(session, tconfig, &table->config); + WT_TRET(__wt_metadata_cursor_release(session, &cursor)); + WT_ERR(ret); - WT_ERR(__wt_config_getones(session, tconfig, "key_format", &cval)); + WT_ERR(__wt_config_getones(session, table->config, "columns", &cval)); + WT_ERR(__wt_config_getones( + session, table->config, "key_format", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &table->key_format)); - WT_ERR(__wt_config_getones(session, tconfig, "value_format", &cval)); + WT_ERR(__wt_config_getones( + session, table->config, "value_format", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &table->value_format)); - WT_ERR(__wt_strdup(session, tconfig, &table->config)); /* Point to some items in the copy to save re-parsing. 
*/ WT_ERR(__wt_config_getones(session, table->config, @@ -491,7 +494,7 @@ __schema_open_table(WT_SESSION_IMPL *session, if (table->ncolgroups > 0 && table->is_simple) WT_ERR_MSG(session, EINVAL, - "%s requires a table with named columns", tablename); + "%s requires a table with named columns", table->name); WT_ERR(__wt_calloc_def(session, WT_COLGROUPS(table), &table->cgroups)); WT_ERR(__wt_schema_open_colgroups(session, table)); @@ -509,9 +512,7 @@ __schema_open_table(WT_SESSION_IMPL *session, if (0) { err: WT_TRET(__wt_schema_destroy_table(session, &table)); } - WT_TRET(__wt_metadata_cursor_release(session, &cursor)); - __wt_free(session, tablename); __wt_scr_free(session, &buf); return (ret); } diff --git a/src/third_party/wiredtiger/src/session/session_dhandle.c b/src/third_party/wiredtiger/src/session/session_dhandle.c index ffeb6137766..707e07ac11f 100644 --- a/src/third_party/wiredtiger/src/session/session_dhandle.c +++ b/src/third_party/wiredtiger/src/session/session_dhandle.c @@ -235,6 +235,7 @@ __wt_session_lock_dhandle( lock_busy = true; /* Give other threads a chance to make progress. */ + WT_STAT_CONN_INCR(session, dhandle_lock_blocked); __wt_yield(); } } @@ -597,7 +598,9 @@ __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint) * the underlying file are visible to the in-memory pages. 
*/ WT_ERR(__wt_evict_file_exclusive_on(session)); - WT_ERR(__wt_cache_op(session, WT_SYNC_DISCARD)); + ret = __wt_cache_op(session, WT_SYNC_DISCARD); + __wt_evict_file_exclusive_off(session); + WT_ERR(ret); /* * We lock checkpoint handles that we are overwriting, so the handle diff --git a/src/third_party/wiredtiger/src/support/err.c b/src/third_party/wiredtiger/src/support/err.c index 57efde72b23..f98b1943449 100644 --- a/src/third_party/wiredtiger/src/support/err.c +++ b/src/third_party/wiredtiger/src/support/err.c @@ -494,7 +494,18 @@ __wt_panic(WT_SESSION_IMPL *session) WT_GCC_FUNC_ATTRIBUTE((cold)) WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { - F_SET(S2C(session), WT_CONN_PANIC); + WT_CONNECTION_IMPL *conn; + + conn = S2C(session); + + /* + * If the connection has already be marked for panic, just return the + * error. + */ + if (F_ISSET(conn, WT_CONN_PANIC)) + return (WT_PANIC); + + F_SET(conn, WT_CONN_PANIC); __wt_err(session, WT_PANIC, "the process must exit and restart"); #if defined(HAVE_DIAGNOSTIC) diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 8b72e653658..c9e577ac3b6 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -842,11 +842,19 @@ static const char * const __stats_connection_desc[] = { "thread-state: active filesystem write calls", "thread-yield: application thread time evicting (usecs)", "thread-yield: application thread time waiting for cache (usecs)", + "thread-yield: connection close blocked waiting for transaction state stabilization", + "thread-yield: connection close yielded for lsm manager shutdown", + "thread-yield: data handle lock yielded", + "thread-yield: log server sync yielded for log write", "thread-yield: page acquire busy blocked", "thread-yield: page acquire eviction blocked", "thread-yield: page acquire locked blocked", "thread-yield: page acquire read blocked", "thread-yield: page acquire time 
sleeping (usecs)", + "thread-yield: page delete rollback yielded for instantiation", + "thread-yield: page reconciliation yielded due to child modification", + "thread-yield: reference for page index and slot yielded", + "thread-yield: tree descend one level yielded for split page index update", "transaction: number of named snapshots created", "transaction: number of named snapshots dropped", "transaction: transaction begins", @@ -1129,11 +1137,19 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) /* not clearing thread_write_active */ stats->application_evict_time = 0; stats->application_cache_time = 0; + stats->txn_release_blocked = 0; + stats->conn_close_blocked_lsm = 0; + stats->dhandle_lock_blocked = 0; + stats->log_server_sync_blocked = 0; stats->page_busy_blocked = 0; stats->page_forcible_evict_blocked = 0; stats->page_locked_blocked = 0; stats->page_read_blocked = 0; stats->page_sleep = 0; + stats->page_del_rollback_blocked = 0; + stats->child_modify_blocked_page = 0; + stats->page_index_slot_blocked = 0; + stats->tree_descend_blocked = 0; stats->txn_snapshots_created = 0; stats->txn_snapshots_dropped = 0; stats->txn_begin = 0; @@ -1475,12 +1491,25 @@ __wt_stat_connection_aggregate( WT_STAT_READ(from, application_evict_time); to->application_cache_time += WT_STAT_READ(from, application_cache_time); + to->txn_release_blocked += WT_STAT_READ(from, txn_release_blocked); + to->conn_close_blocked_lsm += + WT_STAT_READ(from, conn_close_blocked_lsm); + to->dhandle_lock_blocked += WT_STAT_READ(from, dhandle_lock_blocked); + to->log_server_sync_blocked += + WT_STAT_READ(from, log_server_sync_blocked); to->page_busy_blocked += WT_STAT_READ(from, page_busy_blocked); to->page_forcible_evict_blocked += WT_STAT_READ(from, page_forcible_evict_blocked); to->page_locked_blocked += WT_STAT_READ(from, page_locked_blocked); to->page_read_blocked += WT_STAT_READ(from, page_read_blocked); to->page_sleep += WT_STAT_READ(from, page_sleep); + 
to->page_del_rollback_blocked += + WT_STAT_READ(from, page_del_rollback_blocked); + to->child_modify_blocked_page += + WT_STAT_READ(from, child_modify_blocked_page); + to->page_index_slot_blocked += + WT_STAT_READ(from, page_index_slot_blocked); + to->tree_descend_blocked += WT_STAT_READ(from, tree_descend_blocked); to->txn_snapshots_created += WT_STAT_READ(from, txn_snapshots_created); to->txn_snapshots_dropped += diff --git a/src/third_party/wiredtiger/src/support/time.c b/src/third_party/wiredtiger/src/support/time.c new file mode 100644 index 00000000000..0e4562c0234 --- /dev/null +++ b/src/third_party/wiredtiger/src/support/time.c @@ -0,0 +1,89 @@ +/*- + * Public Domain 2014-2017 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include "wt_internal.h" + +/* + * __time_check_monotonic -- + * Check and prevent time running backward. If we detect that it has, we + * set the time structure to the previous values, making time stand still + * until we see a time in the future of the highest value seen so far. + */ +static void +__time_check_monotonic(WT_SESSION_IMPL *session, struct timespec *tsp) +{ + /* + * Detect time going backward. If so, use the last + * saved timestamp. + */ + if (session == NULL) + return; + + if (tsp->tv_sec < session->last_epoch.tv_sec || + (tsp->tv_sec == session->last_epoch.tv_sec && + tsp->tv_nsec < session->last_epoch.tv_nsec)) { + WT_STAT_CONN_INCR(session, time_travel); + *tsp = session->last_epoch; + } else + session->last_epoch = *tsp; +} + +/* + * __wt_epoch -- + * Return the time since the Epoch, adjusted so it never appears to go + * backwards. + */ +void +__wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) + WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) +{ + struct timespec tmp; + + /* + * Read into a local variable so that we're comparing the correct + * value when we check for monotonic increasing time. There are + * many places we read into an unlocked global variable. + */ + __wt_epoch_raw(session, &tmp); + __time_check_monotonic(session, &tmp); + *tsp = tmp; +} + +/* + * __wt_seconds -- + * Return the seconds since the Epoch. 
+ */ +void +__wt_seconds(WT_SESSION_IMPL *session, time_t *timep) +{ + struct timespec t; + + __wt_epoch(session, &t); + + *timep = t.tv_sec; +} diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c index cb3b3436786..09a8c4d9663 100644 --- a/src/third_party/wiredtiger/src/txn/txn_log.c +++ b/src/third_party/wiredtiger/src/txn/txn_log.c @@ -289,6 +289,7 @@ int __wt_txn_checkpoint_log( WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_LSN *lsnp) { + WT_CONNECTION_IMPL *conn; WT_DECL_ITEM(logrec); WT_DECL_RET; WT_ITEM *ckpt_snapshot, empty; @@ -300,7 +301,8 @@ __wt_txn_checkpoint_log( uint32_t i, rectype = WT_LOGREC_CHECKPOINT; const char *fmt = WT_UNCHECKED_STRING(IIIIu); - txn_global = &S2C(session)->txn_global; + conn = S2C(session); + txn_global = &conn->txn_global; txn = &session->txn; ckpt_lsn = &txn->ckpt_lsn; @@ -374,20 +376,20 @@ __wt_txn_checkpoint_log( txn->ckpt_nsnapshot, ckpt_snapshot)); logrec->size += (uint32_t)recsize; WT_ERR(__wt_log_write(session, logrec, lsnp, - F_ISSET(S2C(session), WT_CONN_CKPT_SYNC) ? + F_ISSET(conn, WT_CONN_CKPT_SYNC) ? WT_LOG_FSYNC : 0)); /* * If this full checkpoint completed successfully and there is - * no hot backup in progress and this is not recovery, tell - * the logging subsystem the checkpoint LSN so that it can - * archive. Do not update the logging checkpoint LSN if this - * is during a clean connection close, only during a full - * checkpoint. A clean close may not update any metadata LSN - * and we do not want to archive in that case. + * no hot backup in progress and this is not an unclean + * recovery, tell the logging subsystem the checkpoint LSN so + * that it can archive. Do not update the logging checkpoint + * LSN if this is during a clean connection close, only during + * a full checkpoint. A clean close may not update any + * metadata LSN and we do not want to archive in that case. 
*/ - if (!S2C(session)->hot_backup && - !F_ISSET(S2C(session), WT_CONN_RECOVERING) && + if (!conn->hot_backup && + !FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY) && txn->full_ckpt) __wt_log_ckpt(session, ckpt_lsn); diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c index 30932195b1e..fbef0ad4a5f 100644 --- a/src/third_party/wiredtiger/src/txn/txn_recover.c +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -20,6 +20,7 @@ typedef struct { } *files; size_t file_alloc; /* Allocated size of files array. */ u_int max_fileid; /* Maximum file ID seen. */ + WT_LSN max_lsn; /* Maximum checkpoint LSN seen. */ u_int nfiles; /* Number of files in the metadata. */ WT_LSN ckpt_lsn; /* Start LSN for main recovery loop. */ @@ -342,6 +343,10 @@ __recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config) "Recovering %s with id %" PRIu32 " @ (%" PRIu32 ", %" PRIu32 ")", uri, fileid, lsn.l.file, lsn.l.offset); + if ((!WT_IS_MAX_LSN(&lsn) && !WT_IS_INIT_LSN(&lsn)) && + (WT_IS_MAX_LSN(&r->max_lsn) || __wt_log_cmp(&lsn, &r->max_lsn) > 0)) + r->max_lsn = lsn; + return (0); } @@ -428,6 +433,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session) WT_RET(__wt_open_internal_session(conn, "txn-recover", false, WT_SESSION_NO_LOGGING, &session)); r.session = session; + WT_MAX_LSN(&r.max_lsn); F_SET(conn, WT_CONN_RECOVERING); WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config)); @@ -443,9 +449,24 @@ __wt_txn_recover(WT_SESSION_IMPL *session) */ if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_EXISTED) || WT_IS_MAX_LSN(&metafile->ckpt_lsn)) { + /* + * Detect if we're going from logging disabled to enabled. + * We need to know this to verify LSNs and start at the correct + * log file later. 
If someone ran with logging, then disabled + * it and removed all the log files and then turned logging back + * on, we have to start logging in the log file number that is + * larger than any checkpoint LSN we have from the earlier time. + */ WT_ERR(__recovery_file_scan(&r)); conn->next_file_id = r.max_fileid; - goto done; + + if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && + WT_IS_MAX_LSN(&metafile->ckpt_lsn) && + !WT_IS_MAX_LSN(&r.max_lsn)) { + WT_ERR(__wt_log_reset(session, r.max_lsn.l.file)); + goto ckpt; + } else + goto done; } /* @@ -535,6 +556,8 @@ __wt_txn_recover(WT_SESSION_IMPL *session) * this is not a read-only connection. * We can consider skipping it in the future. */ + if (needs_rec) + FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY); if (WT_IS_INIT_LSN(&r.ckpt_lsn)) WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER, @@ -554,11 +577,12 @@ __wt_txn_recover(WT_SESSION_IMPL *session) * open is fast and keep the metadata up to date with the checkpoint * LSN and archiving. 
*/ - WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); +ckpt: WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); done: FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE); err: WT_TRET(__recovery_free(&r)); __wt_free(session, config); + FLD_CLR(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY); if (ret != 0) __wt_err(session, ret, "Recovery failed"); diff --git a/src/third_party/wiredtiger/test/recovery/random-abort.c b/src/third_party/wiredtiger/test/recovery/random-abort.c index febe6530534..b53383e5730 100644 --- a/src/third_party/wiredtiger/test/recovery/random-abort.c +++ b/src/third_party/wiredtiger/test/recovery/random-abort.c @@ -47,9 +47,9 @@ static bool inmem; #define RECORDS_FILE "records-%" PRIu32 #define ENV_CONFIG_DEF \ - "create,log=(file_max=10M,archive=false,enabled)" + "create,log=(file_max=10M,enabled)" #define ENV_CONFIG_TXNSYNC \ - "create,log=(file_max=10M,archive=false,enabled)," \ + "create,log=(file_max=10M,enabled)," \ "transaction_sync=(enabled,method=none)" #define ENV_CONFIG_REC "log=(recover=on)" #define MAX_VAL 4096 diff --git a/src/third_party/wiredtiger/test/suite/test_bug018.py b/src/third_party/wiredtiger/test/suite/test_bug018.py new file mode 100644 index 00000000000..7d20ebcaacb --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_bug018.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2017 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. 
We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +from helper import copy_wiredtiger_home +import os +import wiredtiger, wttest + +# test_bug018.py +# JIRA WT-3590: if writing table data fails during close then tables +# that were updated within the same transaction could get out of sync with +# each other. +class test_bug018(wttest.WiredTigerTestCase): + '''Test closing/reopening/recovering tables when writes fail''' + + conn_config = 'log=(enabled)' + + def setUp(self): + # This test uses Linux-specific code so skip on any other system. + if os.name != 'posix' or os.uname()[0] != 'Linux': + self.skipTest('Linux-specific test skipped on ' + os.name) + super(test_bug018, self).setUp() + + def create_table(self, uri): + self.session.create(uri, 'key_format=S,value_format=S') + return self.session.open_cursor(uri) + + def test_bug018(self): + '''Test closing multiple tables''' + basename = 'bug018.' + baseuri = 'file:' + basename + c1 = self.create_table(baseuri + '01.wt') + c2 = self.create_table(baseuri + '02.wt') + + self.session.begin_transaction() + c1['key'] = 'value' + c2['key'] = 'value' + self.session.commit_transaction() + + # Simulate a write failure by closing the file descriptor for the second + # table out from underneath WiredTiger. 
We do this right before + # closing the connection so that the write error happens during close + # when writing out the final data. Allow table 1 to succeed and force + # an error writing out table 2. + # + # This is Linux-specific code to figure out the file descriptor. + for f in os.listdir('/proc/self/fd'): + try: + if os.readlink('/proc/self/fd/' + f).endswith(basename + '02.wt'): + os.close(int(f)) + except OSError: + pass + + # Expect an error and messages, so turn off stderr checking. + with self.expectedStderrPattern(''): + try: + self.close_conn() + except wiredtiger.WiredTigerError: + self.conn = None + + # Make a backup for forensics in case something goes wrong. + backup_dir = 'BACKUP' + copy_wiredtiger_home('.', backup_dir, True) + + # After reopening and running recovery both tables should be in + # sync even though table 1 was successfully written and table 2 + # had an error on close. + self.open_conn() + c1 = self.session.open_cursor(baseuri + '01.wt') + c2 = self.session.open_cursor(baseuri + '02.wt') + self.assertEqual(list(c1), list(c2)) + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_txn02.py b/src/third_party/wiredtiger/test/suite/test_txn02.py index 01626057b9e..76a325743e9 100644 --- a/src/third_party/wiredtiger/test/suite/test_txn02.py +++ b/src/third_party/wiredtiger/test/suite/test_txn02.py @@ -169,7 +169,6 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess): try: session = backup_conn.open_session() finally: - session.checkpoint("force") self.check(backup_conn.open_session(), None, committed) # Sleep long enough so that the archive thread is guaranteed # to run before we close the connection. 
diff --git a/src/third_party/wiredtiger/test/suite/test_txn05.py b/src/third_party/wiredtiger/test/suite/test_txn05.py index 7aaff221ba4..7099bc972aa 100644 --- a/src/third_party/wiredtiger/test/suite/test_txn05.py +++ b/src/third_party/wiredtiger/test/suite/test_txn05.py @@ -134,12 +134,12 @@ class test_txn05(wttest.WiredTigerTestCase, suite_subprocess): session = backup_conn.open_session() finally: self.check(session, None, committed) - # Force a checkpoint because we don't record the recovery - # checkpoint as available for archiving. - session.checkpoint("force") # Sleep long enough so that the archive thread is guaranteed # to run before we close the connection. time.sleep(1.0) + if count == 0: + first_logs = \ + fnmatch.filter(os.listdir(self.backup_dir), "*Log*") backup_conn.close() count += 1 # @@ -149,6 +149,11 @@ class test_txn05(wttest.WiredTigerTestCase, suite_subprocess): # cur_logs = fnmatch.filter(os.listdir(self.backup_dir), "*Log*") for o in orig_logs: + # Creating the backup was effectively an unclean shutdown so + # even after sleeping, we should never archive log files + # because a checkpoint has not run. Later opens and runs of + # recovery will detect a clean shutdown and allow archiving. + self.assertEqual(True, o in first_logs) if self.archive == 'true': self.assertEqual(False, o in cur_logs) else: diff --git a/src/third_party/wiredtiger/test/suite/test_txn09.py b/src/third_party/wiredtiger/test/suite/test_txn09.py index 768d714e248..b8a3d7f38ae 100644 --- a/src/third_party/wiredtiger/test/suite/test_txn09.py +++ b/src/third_party/wiredtiger/test/suite/test_txn09.py @@ -26,8 +26,8 @@ # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR # OTHER DEALINGS IN THE SOFTWARE. 
# -# test_txn02.py -# Transactions: commits and rollbacks +# test_txn09.py +# Transactions: recovery toggling logging # import fnmatch, os, shutil, time diff --git a/src/third_party/wiredtiger/test/suite/test_txn16.py b/src/third_party/wiredtiger/test/suite/test_txn16.py new file mode 100644 index 00000000000..929da2291c7 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_txn16.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2017 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# test_txn16.py +# Recovery: Test that toggling between logging and not logging does not +# continue to generate more log files. 
+# + +import fnmatch, os, shutil, time +from suite_subprocess import suite_subprocess +import wttest + +class test_txn16(wttest.WiredTigerTestCase, suite_subprocess): + t1 = 'table:test_txn16_1' + t2 = 'table:test_txn16_2' + t3 = 'table:test_txn16_3' + nentries = 1000 + create_params = 'key_format=i,value_format=i' + # Set the log file size small so we generate checkpoints + # with LSNs in different files. + conn_config = 'config_base=false,' + \ + 'log=(archive=false,enabled,file_max=100K),' + \ + 'transaction_sync=(method=dsync,enabled)' + conn_on = 'config_base=false,' + \ + 'log=(archive=false,enabled,file_max=100K),' + \ + 'transaction_sync=(method=dsync,enabled)' + conn_off = 'config_base=false,log=(enabled=false)' + + def populate_table(self, uri): + self.session.create(uri, self.create_params) + c = self.session.open_cursor(uri, None, None) + # Populate with an occasional checkpoint to generate + # some varying LSNs. + for i in range(self.nentries): + c[i] = i + 1 + if i % 900 == 0: + self.session.checkpoint() + c.close() + + def copy_dir(self, olddir, newdir): + ''' Simulate a crash from olddir and restart in newdir. ''' + # with the connection still open, copy files to new directory + shutil.rmtree(newdir, ignore_errors=True) + os.mkdir(newdir) + for fname in os.listdir(olddir): + fullname = os.path.join(olddir, fname) + # Skip lock file on Windows since it is locked + if os.path.isfile(fullname) and \ + "WiredTiger.lock" not in fullname and \ + "Tmplog" not in fullname and \ + "Preplog" not in fullname: + shutil.copy(fullname, newdir) + # close the original connection. + self.close_conn() + + def run_toggle(self, homedir): + loop = 0 + # Record original log files. There should never be overlap + # with these even after they're removed. 
+ orig_logs = fnmatch.filter(os.listdir(homedir), "*Log*") + while loop < 3: + # Reopen with logging on to run recovery first time + on_conn = self.wiredtiger_open(homedir, self.conn_on) + on_conn.close() + if loop > 0: + # Get current log files. + cur_logs = fnmatch.filter(os.listdir(homedir), "*Log*") + scur = set(cur_logs) + sorig = set(orig_logs) + # There should never be overlap with the log files that + # were there originally. Mostly this checks that after + # opening with logging disabled and then re-enabled, we + # don't see log file 1. + self.assertEqual(scur.isdisjoint(sorig), True) + if loop > 1: + # We should be creating the same log files each time. + for l in cur_logs: + self.assertEqual(l in last_logs, True) + for l in last_logs: + self.assertEqual(l in cur_logs, True) + last_logs = cur_logs + loop += 1 + # Remove all log files before opening without logging. + cur_logs = fnmatch.filter(os.listdir(homedir), "*Log*") + for l in cur_logs: + path=homedir + "/" + l + os.remove(path) + off_conn = self.wiredtiger_open(homedir, self.conn_off) + off_conn.close() + + def test_recovery(self): + ''' Check log file creation when toggling. ''' + + # Here's the strategy: + # - With logging populate 4 tables. Checkpoint + # them at different times. + # - Copy to a new directory to simulate a crash. + # - Close the original connection. + # On both a "copy" to simulate a crash and the original (3x): + # - Record log files existing. + # - Reopen with logging to run recovery. Close connection. + # - Record log files existing. + # - Remove all log files. + # - Open connection with logging disabled. + # - Record log files existing. Verify we don't keep adding. + # + self.populate_table(self.t1) + self.populate_table(self.t2) + self.populate_table(self.t3) + self.copy_dir(".", "RESTART") + self.run_toggle(".") + self.run_toggle("RESTART") + +if __name__ == '__main__': + wttest.run() |