diff options
author | Luke Chen <luke.chen@mongodb.com> | 2017-11-13 09:31:33 +1100 |
---|---|---|
committer | Luke Chen <luke.chen@mongodb.com> | 2017-11-13 09:31:33 +1100 |
commit | 6554a50f2810b9e233f8cb6077d831e43ff2f8aa (patch) | |
tree | 29dcbe7164e0236caa56b342868f97b24320aa13 /src | |
parent | 13f29c918109df1dbbc3b561beb422650b8f3873 (diff) | |
download | mongo-6554a50f2810b9e233f8cb6077d831e43ff2f8aa.tar.gz |
Import wiredtiger: 3a8316e86e9c7cd379679d8530ecc54ad9bdf5c1 from branch mongodb-3.6
ref: 0a2f8f6ad7..3a8316e86e
for: 3.6.0-rc4
WT-3637 Fix a heap use after free from evicting of a page that just split.
WT-3648 Fix timestamp_abort test calculation of the oldest timestamp
WT-3696 Add diagnostic code to detect when sessions are in use by multiple threads
WT-3710 Fix a race condition between concurrent page splits
WT-3715 Performance tuning for cache overflow mechanism
WT-3717 Add a diagnostic verbose lookaside mode
WT-3730 For simple tables, do not use table dhandle after it is released
Diffstat (limited to 'src')
58 files changed, 1014 insertions, 366 deletions
diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index 3d6d4712413..a8b1a30a333 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -561,6 +561,7 @@ connection_runtime_config = [ 'fileops', 'handleops', 'log', + 'lookaside', 'lookaside_activity', 'lsm', 'lsm_manager', @@ -575,7 +576,6 @@ connection_runtime_config = [ 'salvage', 'shared_cache', 'split', - 'temporary', 'thread_group', 'timestamp', 'transaction', diff --git a/src/third_party/wiredtiger/dist/flags.py b/src/third_party/wiredtiger/dist/flags.py index 21fd0756435..28c91486e1a 100644 --- a/src/third_party/wiredtiger/dist/flags.py +++ b/src/third_party/wiredtiger/dist/flags.py @@ -23,11 +23,12 @@ flags = { ], 'page_read' : [ 'READ_CACHE', + 'READ_IGNORE_CACHE_SIZE', 'READ_LOOKASIDE', 'READ_NOTFOUND_OK', 'READ_NO_EMPTY', - 'READ_NO_EVICT', 'READ_NO_GEN', + 'READ_NO_SPLIT', 'READ_NO_WAIT', 'READ_PREV', 'READ_RESTART_OK', @@ -74,6 +75,7 @@ flags = { 'VERB_HANDLEOPS', 'VERB_LOG', 'VERB_LOOKASIDE', + 'VERB_LOOKASIDE_ACTIVITY', 'VERB_LSM', 'VERB_LSM_MANAGER', 'VERB_METADATA', @@ -87,7 +89,6 @@ flags = { 'VERB_SALVAGE', 'VERB_SHARED_CACHE', 'VERB_SPLIT', - 'VERB_TEMPORARY', 'VERB_THREAD_GROUP', 'VERB_TIMESTAMP', 'VERB_TRANSACTION', @@ -124,6 +125,7 @@ flags = { 'session' : [ 'SESSION_CAN_WAIT', 'SESSION_INTERNAL', + 'SESSION_IGNORE_CACHE_SIZE', 'SESSION_LOCKED_CHECKPOINT', 'SESSION_LOCKED_HANDLE_LIST_READ', 'SESSION_LOCKED_HANDLE_LIST_WRITE', @@ -136,12 +138,12 @@ flags = { 'SESSION_LOCKED_TURTLE', 'SESSION_LOGGING_INMEM', 'SESSION_LOOKASIDE_CURSOR', - 'SESSION_NO_CACHE', 'SESSION_NO_DATA_HANDLES', - 'SESSION_NO_EVICTION', + 'SESSION_NO_RECONCILE', 'SESSION_NO_LOGGING', 'SESSION_NO_SCHEMA_LOCK', 'SESSION_QUIET_CORRUPT_FILE', + 'SESSION_READ_WONT_NEED', 'SESSION_SERVER_ASYNC', ], 'stat' : [ diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list index 
dcaf975434f..fb0162079d9 100644 --- a/src/third_party/wiredtiger/dist/s_define.list +++ b/src/third_party/wiredtiger/dist/s_define.list @@ -22,6 +22,8 @@ WT_CACHE_LINE_PAD_END WT_CONN_CHECK_PANIC WT_DEADLOCK WT_DEBUG_BYTE +WT_SINGLE_THREAD_CHECK_START +WT_SINGLE_THREAD_CHECK_STOP WT_ERR_ERROR_OK WT_EXT_FOREACH_OFF WT_HANDLE_CLOSED diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index 64d3d46818b..44eb743479d 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -453,6 +453,8 @@ connection_stats = [ TxnStat('txn_pinned_checkpoint_range', 'transaction range of IDs currently pinned by a checkpoint', 'no_clear,no_scale'), TxnStat('txn_pinned_range', 'transaction range of IDs currently pinned', 'no_clear,no_scale'), TxnStat('txn_pinned_snapshot_range', 'transaction range of IDs currently pinned by named snapshots', 'no_clear,no_scale'), + TxnStat('txn_pinned_timestamp', 'transaction range of timestamps currently pinned', 'no_clear,no_scale'), + TxnStat('txn_pinned_timestamp_oldest', 'transaction range of timestamps pinned by the oldest timestamp', 'no_clear,no_scale'), TxnStat('txn_read_queue_head', 'transactions read timestamp queue inserts to head'), TxnStat('txn_read_queue_inserts', 'transactions read timestamp queue inserts total'), TxnStat('txn_read_queue_len', 'transactions read timestamp queue length'), diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 1af1a5d50ec..409b0b2906a 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "0a2f8f6ad756189263d050b29f69bc57b45b9816", + "commit": "3a8316e86e9c7cd379679d8530ecc54ad9bdf5c1", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-3.6" diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 
fe6be6517a2..fc4afc7f9b1 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -116,7 +116,7 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t btree_id) cursor, btree_id, ref->page_las->las_pageid); for (; ret == 0; ret = cursor->next(cursor)) { WT_ERR(cursor->get_key(cursor, - &las_id, &las_pageid, &las_counter, &las_key)); + &las_pageid, &las_id, &las_counter, &las_key)); /* * Confirm the search using the unique prefix; if not a match, @@ -314,6 +314,11 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) /* * Attempt to set the state to WT_REF_READING for normal reads, or * WT_REF_LOCKED, for deleted pages or pages with lookaside entries. + * The difference is that checkpoints can skip over clean pages that + * are being read into cache, but need to wait for deletes or lookaside + * updates to be resolved (in order for checkpoint to write the correct + * version of the page). + * * If successful, we've won the race, read the page. */ switch (previous_state = ref->state) { @@ -368,8 +373,7 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) */ page_flags = WT_DATA_IN_ITEM(&tmp) ? WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED; - if (LF_ISSET(WT_READ_NO_EVICT) || - F_ISSET(session, WT_SESSION_NO_EVICTION)) + if (LF_ISSET(WT_READ_IGNORE_CACHE_SIZE)) FLD_SET(page_flags, WT_PAGE_READ_NO_EVICT); WT_ERR(__wt_page_inmem(session, ref, tmp.data, page_flags, &page)); tmp.mem = NULL; @@ -518,6 +522,9 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags btree = S2BT(session); + if (F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE)) + LF_SET(WT_READ_IGNORE_CACHE_SIZE); + /* * Ignore reads of pages already known to be in cache, otherwise the * eviction server can dominate these statistics. @@ -554,7 +561,7 @@ read: /* * allowed to do eviction work, check for space in the * cache. 
*/ - if (!LF_ISSET(WT_READ_NO_EVICT)) + if (!LF_ISSET(WT_READ_IGNORE_CACHE_SIZE)) WT_RET(__wt_cache_eviction_check( session, 1, NULL)); WT_RET(__page_read(session, ref, flags)); @@ -574,7 +581,7 @@ read: /* * we "acquire" it. */ wont_need = LF_ISSET(WT_READ_WONT_NEED) || - F_ISSET(session, WT_SESSION_NO_CACHE); + F_ISSET(session, WT_SESSION_READ_WONT_NEED); continue; case WT_REF_READING: if (LF_ISSET(WT_READ_CACHE)) @@ -623,17 +630,22 @@ read: /* } /* - * If eviction is configured for this file, check to see - * if the page qualifies for forced eviction and update - * the page's generation number. If eviction isn't being - * done on this file, we're done. + * Check if the page requires forced eviction. */ - if (did_read || LF_ISSET(WT_READ_NO_EVICT) || - F_ISSET(session, WT_SESSION_NO_EVICTION) || + if (did_read || LF_ISSET(WT_READ_NO_SPLIT) || btree->evict_disabled > 0 || btree->lsm_primary) goto skip_evict; /* + * If reconciliation is disabled (e.g., when inserting + * into the lookaside table), skip forced eviction if + * the page can't split. + */ + if (F_ISSET(session, WT_SESSION_NO_RECONCILE) && + !__wt_leaf_page_can_split(session, ref->page)) + goto skip_evict; + + /* * Forcibly evict pages that are too big. */ if (force_attempts < 10 && @@ -684,9 +696,19 @@ skip_evict: /* * Check if we need an autocommit transaction. * Starting a transaction can trigger eviction, so skip * it if eviction isn't permitted. + * + * The logic here is a little weird: some code paths do + * a blanket ban on checking the cache size in + * sessions, but still require a transaction (e.g., + * when updating metadata or lookaside). If + * WT_READ_IGNORE_CACHE_SIZE was passed in explicitly, + * we're done. If we set WT_READ_IGNORE_CACHE_SIZE + * because it was set in the session then make sure we + * start a transaction. */ - return (LF_ISSET(WT_READ_NO_EVICT) ? 
0 : - __wt_txn_autocommit_check(session)); + return (LF_ISSET(WT_READ_IGNORE_CACHE_SIZE) && + !F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE) ? + 0 : __wt_txn_autocommit_check(session)); WT_ILLEGAL_VALUE(session); } @@ -707,7 +729,7 @@ skip_evict: /* * check if the cache needs help. If we do work for the cache, * substitute that for a sleep. */ - if (!LF_ISSET(WT_READ_NO_EVICT)) { + if (!LF_ISSET(WT_READ_IGNORE_CACHE_SIZE)) { WT_RET( __wt_cache_eviction_check(session, 1, &cache_work)); if (cache_work) @@ -728,30 +750,33 @@ __btree_verbose_lookaside_read( WT_SESSION_IMPL *session, uint32_t las_id, uint64_t las_pageid) { #ifdef HAVE_VERBOSE - WT_CONNECTION_IMPL *conn; + WT_CACHE *cache; uint64_t ckpt_gen_current, ckpt_gen_last; - if (!WT_VERBOSE_ISSET(session, WT_VERB_LOOKASIDE)) + if (!WT_VERBOSE_ISSET(session, + WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY)) return; - conn = S2C(session); + cache = S2C(session)->cache; ckpt_gen_current = __wt_gen(session, WT_GEN_CHECKPOINT); - ckpt_gen_last = conn->las_verb_gen_read; + ckpt_gen_last = cache->las_verb_gen_read; /* * This message is throttled to one per checkpoint. To do this we * track the generation of the last checkpoint for which the message * was printed and check against the current checkpoint generation. */ - if (ckpt_gen_current > ckpt_gen_last) { + if (WT_VERBOSE_ISSET(session, WT_VERB_LOOKASIDE) || + ckpt_gen_current > ckpt_gen_last) { /* * Attempt to atomically replace the last checkpoint generation * for which this message was printed. If the atomic swap fails * we have raced and the winning thread will print the message. 
*/ - if (__wt_atomic_casv64(&conn->las_verb_gen_read, + if (__wt_atomic_casv64(&cache->las_verb_gen_read, ckpt_gen_last, ckpt_gen_current)) { - __wt_verbose(session, WT_VERB_LOOKASIDE, + __wt_verbose(session, + WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY, "Read from lookaside file triggered for " "file ID %" PRIu32 ", page ID %" PRIu64, las_id, las_pageid); diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index dc699a6b23b..021788919d0 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -141,6 +141,9 @@ __split_verify_root(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_DECL_RET; WT_REF *ref; + uint32_t read_flags; + + read_flags = WT_READ_CACHE | WT_READ_NO_EVICT; /* The split is complete and live, verify all of the pages involved. */ __split_verify_intl_key_order(session, page); @@ -156,14 +159,14 @@ __split_verify_root(WT_SESSION_IMPL *session, WT_PAGE *page) * Ignore pages not in-memory (deleted, on-disk, being read), * there's no in-memory structure to check. */ - if ((ret = __wt_page_in(session, - ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND) + if ((ret = + __wt_page_in(session, ref, read_flags)) == WT_NOTFOUND) continue; WT_ERR(ret); __split_verify_intl_key_order(session, ref->page); - WT_ERR(__wt_page_release(session, ref, WT_READ_NO_EVICT)); + WT_ERR(__wt_page_release(session, ref, read_flags)); } WT_INTL_FOREACH_END; return (0); @@ -345,6 +348,9 @@ __split_ref_prepare( * ascend into the created children, but eventually fail as that parent * page won't yet know about the created children pages. That's OK, we * spin there until the parent's page index is updated. + * + * Lock the newly created page to ensure it doesn't split until all + * child pages have been updated. */ for (i = skip_first ? 
1 : 0; i < pindex->entries; ++i) { ref = pindex->index[i]; @@ -352,10 +358,12 @@ __split_ref_prepare( /* Switch the WT_REF's to their new page. */ j = 0; + WT_PAGE_LOCK(session, child); WT_INTL_FOREACH_BEGIN(session, child, child_ref) { child_ref->home = child; child_ref->pindex_hint = j++; } WT_INTL_FOREACH_END; + WT_PAGE_UNLOCK(session, child); #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, @@ -1643,6 +1651,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_RET(__wt_calloc_one(session, &ref->page_las)); *ref->page_las = multi->page_las; + WT_ASSERT(session, ref->page_las->las_max_txn != WT_TXN_NONE); ref->state = WT_REF_LOOKASIDE; } diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index d15852af935..2338d5be8ed 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -58,6 +58,7 @@ __sync_checkpoint_can_skip(WT_SESSION_IMPL *session, WT_PAGE *page) i = 0; i < mod->mod_multi_entries; ++multi, ++i) if (multi->addr.addr == NULL) return (false); + return (true); } diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c index e2ebd38e82f..deed37517bb 100644 --- a/src/third_party/wiredtiger/src/cache/cache_las.c +++ b/src/third_party/wiredtiger/src/cache/cache_las.c @@ -9,18 +9,44 @@ #include "wt_internal.h" /* + * When an operation is accessing the lookaside table, it should ignore the + * cache size (since the cache is already full), any pages it reads should be + * evicted before application data, and the operation can't reenter + * reconciliation. + */ +#define WT_LAS_SESSION_FLAGS \ + (WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_READ_WONT_NEED | \ + WT_SESSION_NO_RECONCILE) + +/* + * __wt_las_nonempty -- + * Return when there are entries in the lookaside table. 
+ */ +bool +__wt_las_nonempty(WT_SESSION_IMPL *session) +{ + WT_CACHE *cache; + + cache = S2C(session)->cache; + + return (cache->las_entry_count > 0); +} + +/* * __wt_las_stats_update -- * Update the lookaside table statistics for return to the application. */ void __wt_las_stats_update(WT_SESSION_IMPL *session) { + WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_CONNECTION_STATS **cstats; WT_DSRC_STATS **dstats; int64_t v; conn = S2C(session); + cache = conn->cache; /* * Lookaside table statistics are copied from the underlying lookaside @@ -36,7 +62,7 @@ __wt_las_stats_update(WT_SESSION_IMPL *session) */ cstats = conn->stats; dstats = ((WT_CURSOR_BTREE *) - conn->las_session->las_cursor)->btree->dhandle->stats; + cache->las_session[0]->las_cursor)->btree->dhandle->stats; v = WT_STAT_READ(dstats, cursor_insert); WT_STAT_SET(session, cstats, cache_lookaside_insert, v); @@ -62,13 +88,15 @@ __wt_las_stats_update(WT_SESSION_IMPL *session) int __wt_las_create(WT_SESSION_IMPL *session) { + WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; - uint32_t session_flags; + int i; const char *drop_cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_drop), "force=true", NULL }; conn = S2C(session); + cache = conn->cache; /* Read-only and in-memory configurations don't need the LAS table. */ if (F_ISSET(conn, WT_CONN_IN_MEMORY | WT_CONN_READONLY)) @@ -86,16 +114,17 @@ __wt_las_create(WT_SESSION_IMPL *session) WT_RET(ret); /* Re-create the table. */ - WT_RET(__wt_session_create(session, WT_LAS_URI, WT_LAS_FORMAT)); + WT_RET(__wt_session_create(session, WT_LAS_URI, WT_LAS_CONFIG)); /* * Open a shared internal session and cursor used for the lookaside - * table. This session should never be tapped for eviction. + * table. This session should never perform reconciliation. 
*/ - session_flags = WT_SESSION_NO_EVICTION; - WT_RET(__wt_open_internal_session( - conn, "lookaside table", true, session_flags, &conn->las_session)); - WT_RET(__wt_las_cursor_open(conn->las_session)); + for (i = 0; i < WT_LAS_NUM_SESSIONS; i++) { + WT_RET(__wt_open_internal_session(conn, "lookaside table", + true, WT_LAS_SESSION_FLAGS, &cache->las_session[i])); + WT_RET(__wt_las_cursor_open(cache->las_session[i])); + } /* The statistics server is already running, make sure we don't race. */ WT_WRITE_BARRIER(); @@ -111,20 +140,31 @@ __wt_las_create(WT_SESSION_IMPL *session) int __wt_las_destroy(WT_SESSION_IMPL *session) { + WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION *wt_session; + int i; conn = S2C(session); + cache = conn->cache; F_CLR(conn, WT_CONN_LOOKASIDE_OPEN); - if (conn->las_session == NULL) + if (cache == NULL) return (0); - wt_session = &conn->las_session->iface; - ret = wt_session->close(wt_session, NULL); + for (i = 0; i < WT_LAS_NUM_SESSIONS; i++) { + if (cache->las_session[i] == NULL) + continue; + + wt_session = &cache->las_session[i]->iface; + WT_TRET(wt_session->close(wt_session, NULL)); + cache->las_session[i] = NULL; + } - conn->las_session = NULL; + __wt_buf_free(session, &cache->las_sweep_key); + __wt_free(session, cache->las_dropped); + __wt_free(session, cache->las_sweep_dropmap); return (ret); } @@ -154,8 +194,8 @@ __wt_las_cursor_open(WT_SESSION_IMPL *session) btree = ((WT_CURSOR_BTREE *)cursor)->btree; /* Track the lookaside file ID. 
*/ - if (S2C(session)->las_fileid == 0) - S2C(session)->las_fileid = btree->id; + if (S2C(session)->cache->las_fileid == 0) + S2C(session)->cache->las_fileid = btree->id; /* * Set special flags for the lookaside table: the lookaside flag (used, @@ -187,7 +227,8 @@ void __wt_las_cursor( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags) { - WT_CONNECTION_IMPL *conn; + WT_CACHE *cache; + int i; *cursorp = NULL; @@ -200,10 +241,9 @@ __wt_las_cursor( * problems and there's no reason to believe lookaside pages will be * useful more than once. */ - *session_flags = - F_MASK(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); + *session_flags = F_MASK(session, WT_LAS_SESSION_FLAGS); - conn = S2C(session); + cache = S2C(session)->cache; /* * Some threads have their own lookaside table cursors, else lock the @@ -212,12 +252,30 @@ __wt_las_cursor( if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) *cursorp = session->las_cursor; else { - __wt_spin_lock(session, &conn->las_lock); - *cursorp = conn->las_session->las_cursor; + for (;;) { + __wt_spin_lock(session, &cache->las_lock); + for (i = 0; i < WT_LAS_NUM_SESSIONS; i++) { + if (!cache->las_session_inuse[i]) { + *cursorp = + cache->las_session[i]->las_cursor; + cache->las_session_inuse[i] = true; + break; + } + } + __wt_spin_unlock(session, &cache->las_lock); + if (*cursorp != NULL) + break; + /* + * If all the lookaside sessions are busy, stall. + * + * XXX better as a condition variable. + */ + __wt_sleep(0, 1000); + } } - /* Turn caching and eviction off. */ - F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); + /* Configure session to access the lookaside table. 
*/ + F_SET(session, WT_LAS_SESSION_FLAGS); } /* @@ -226,13 +284,14 @@ __wt_las_cursor( */ int __wt_las_cursor_close( - WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags) + WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags) { - WT_CONNECTION_IMPL *conn; + WT_CACHE *cache; WT_CURSOR *cursor; WT_DECL_RET; + int i; - conn = S2C(session); + cache = S2C(session)->cache; if ((cursor = *cursorp) == NULL) return (0); @@ -245,15 +304,23 @@ __wt_las_cursor_close( * We turned off caching and eviction while the lookaside cursor was in * use, restore the session's flags. */ - F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); + F_CLR(session, WT_LAS_SESSION_FLAGS); F_SET(session, session_flags); /* * Some threads have their own lookaside table cursors, else unlock the * shared lookaside cursor. */ - if (!F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) - __wt_spin_unlock(session, &conn->las_lock); + if (!F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) { + __wt_spin_lock(session, &cache->las_lock); + for (i = 0; i < WT_LAS_NUM_SESSIONS; i++) + if (cursor->session == &cache->las_session[i]->iface) { + cache->las_session_inuse[i] = false; + break; + } + __wt_spin_unlock(session, &cache->las_lock); + WT_ASSERT(session, i != WT_LAS_NUM_SESSIONS); + } return (ret); } @@ -263,54 +330,74 @@ __wt_las_cursor_close( * Display a verbose message once per checkpoint with details about the * cache state when performing a lookaside table write. 
*/ -static void -__las_insert_block_verbose( - WT_SESSION_IMPL *session, uint32_t btree_id, uint64_t las_pageid) +static int +__las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi) { #ifdef HAVE_VERBOSE + WT_CACHE *cache; WT_CONNECTION_IMPL *conn; +#ifdef HAVE_TIMESTAMPS + char hex_timestamp[2 * WT_TIMESTAMP_SIZE + 1]; +#else + char hex_timestamp[9]; /* Enough for disabled string */ +#endif uint64_t ckpt_gen_current, ckpt_gen_last; - uint32_t pct_dirty, pct_full; + uint32_t btree_id, pct_dirty, pct_full; - if (!WT_VERBOSE_ISSET(session, WT_VERB_LOOKASIDE)) - return; + btree_id = S2BT(session)->id; + + if (!WT_VERBOSE_ISSET(session, + WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY)) + return (0); conn = S2C(session); + cache = conn->cache; ckpt_gen_current = __wt_gen(session, WT_GEN_CHECKPOINT); - ckpt_gen_last = conn->las_verb_gen_write; + ckpt_gen_last = cache->las_verb_gen_write; /* - * This message is throttled to one per checkpoint. To do this we - * track the generation of the last checkpoint for which the message - * was printed and check against the current checkpoint generation. + * Print a message if verbose lookaside, or once per checkpoint if + * only reporting activity. Avoid an expensive atomic operation as + * often as possible when the message rate is limited. */ - if (ckpt_gen_current > ckpt_gen_last) { - /* - * Attempt to atomically replace the last checkpoint generation - * for which this message was printed. If the atomic swap fails - * we have raced and the winning thread will print the message. - */ - if (__wt_atomic_casv64(&conn->las_verb_gen_write, - ckpt_gen_last, ckpt_gen_current)) { - (void)__wt_eviction_clean_needed(session, &pct_full); - (void)__wt_eviction_dirty_needed(session, &pct_dirty); - - __wt_verbose(session, WT_VERB_LOOKASIDE, - "Page reconciliation triggered lookaside write" - "file ID %" PRIu32 ", page ID %" PRIu64 ". 
" - "Entries now in lookaside file: %" PRId64 ", " - "cache dirty: %" PRIu32 "%% , " - "cache use: %" PRIu32 "%%", - btree_id, las_pageid, - WT_STAT_READ(conn->stats, cache_lookaside_entries), - pct_dirty, pct_full); - } + if (WT_VERBOSE_ISSET(session, WT_VERB_LOOKASIDE) || + (ckpt_gen_current > ckpt_gen_last && + __wt_atomic_casv64(&cache->las_verb_gen_write, + ckpt_gen_last, ckpt_gen_current))) { + (void)__wt_eviction_clean_needed(session, &pct_full); + (void)__wt_eviction_dirty_needed(session, &pct_dirty); + +#ifdef HAVE_TIMESTAMPS + WT_RET(__wt_timestamp_to_hex_string( + session, hex_timestamp, &multi->page_las.min_timestamp)); +#else + WT_RET(__wt_snprintf( + hex_timestamp, sizeof(hex_timestamp), "disabled")); +#endif + __wt_verbose(session, + WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY, + "Page reconciliation triggered lookaside write " + "file ID %" PRIu32 ", page ID %" PRIu64 ". " + "Max txn ID %" PRIu64 ", min timestamp %s, skewed %s. " + "Entries now in lookaside file: %" PRId64 ", " + "cache dirty: %" PRIu32 "%% , " + "cache use: %" PRIu32 "%%", + btree_id, multi->page_las.las_pageid, + multi->page_las.las_max_txn, + hex_timestamp, + multi->page_las.las_skew_oldest? "oldest" : "youngest", + WT_STAT_READ(conn->stats, cache_lookaside_entries), + pct_dirty, pct_full); } + + /* Never skip updating the tracked generation */ + if (WT_VERBOSE_ISSET(session, WT_VERB_LOOKASIDE)) + cache->las_verb_gen_write = ckpt_gen_current; #else WT_UNUSED(session); - WT_UNUSED(btree_id); - WT_UNUSED(las_pageid); + WT_UNUSED(multi); #endif + return (0); } /* @@ -318,11 +405,14 @@ __las_insert_block_verbose( * Copy one set of saved updates into the database's lookaside buffer. 
*/ int -__wt_las_insert_block(WT_SESSION_IMPL *session, - WT_PAGE *page, WT_CURSOR *cursor, WT_MULTI *multi, WT_ITEM *key) +__wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, + WT_PAGE *page, WT_MULTI *multi, WT_ITEM *key) { + WT_BTREE *btree; + WT_DECL_RET; WT_ITEM las_timestamp, las_value; WT_SAVE_UPD *list; + WT_SESSION_IMPL *las_session; WT_UPDATE *upd; uint64_t insert_cnt, las_counter, las_pageid; uint32_t btree_id, i, slot; @@ -332,15 +422,23 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CLEAR(las_value); insert_cnt = 0; - btree_id = S2BT(session)->id; + btree = S2BT(session); + btree_id = btree->id; las_pageid = multi->page_las.las_pageid = - __wt_atomic_add64(&S2BT(session)->las_pageid, 1); + __wt_atomic_add64(&S2C(session)->cache->las_pageid, 1); + + if (!btree->lookaside_entries) + btree->lookaside_entries = true; + + /* Wrap all the updates in a transaction. */ + las_session = (WT_SESSION_IMPL *)cursor->session; + WT_RET(__wt_txn_begin(las_session, NULL)); /* * Make sure there are no leftover entries (e.g., from a handle * reopen). */ - WT_RET(__wt_las_remove_block(session, cursor, btree_id, las_pageid)); + WT_ERR(__wt_las_remove_block(session, cursor, btree_id, las_pageid)); /* Enter each update in the boundary's list into the lookaside store. 
*/ for (las_counter = 0, i = 0, @@ -350,20 +448,20 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: p = key->mem; - WT_RET( + WT_ERR( __wt_vpack_uint(&p, 0, WT_INSERT_RECNO(list->ins))); key->size = WT_PTRDIFF(p, key->data); break; case WT_PAGE_ROW_LEAF: if (list->ins == NULL) - WT_RET(__wt_row_leaf_key( + WT_ERR(__wt_row_leaf_key( session, page, list->ripcip, key, false)); else { key->data = WT_INSERT_KEY(list->ins); key->size = WT_INSERT_KEY_SIZE(list->ins); } break; - WT_ILLEGAL_VALUE(session); + WT_ILLEGAL_VALUE_ERR(session); } /* @@ -411,7 +509,7 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, } cursor->set_key(cursor, - btree_id, las_pageid, ++las_counter, key); + las_pageid, btree_id, ++las_counter, key); #ifdef HAVE_TIMESTAMPS las_timestamp.data = &upd->timestamp; @@ -420,20 +518,27 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, cursor->set_value(cursor, upd->txnid, &las_timestamp, upd->type, &las_value); - WT_RET(cursor->insert(cursor)); + WT_ERR(cursor->insert(cursor)); ++insert_cnt; } while ((upd = upd->next) != NULL); } - __wt_free(session, multi->supd); - multi->supd_entries = 0; - if (insert_cnt > 0) { WT_STAT_CONN_INCRV( session, cache_lookaside_entries, insert_cnt); - __las_insert_block_verbose(session, btree_id, las_pageid); + __wt_atomic_add64( + &S2C(session)->cache->las_entry_count, insert_cnt); + WT_ERR(__las_insert_block_verbose(session, multi)); } - return (0); + +err: /* Resolve the transaction. */ + if (ret == 0) + ret = __wt_txn_commit(las_session, NULL); + else + WT_TRET(__wt_txn_rollback(las_session, NULL)); + __wt_free(session, multi->supd); + multi->supd_entries = 0; + return (ret); } /* @@ -452,6 +557,15 @@ __wt_las_cursor_position(WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) int exact; /* + * When scanning for all pages, start at the beginning of the lookaside + * table. 
+ */ + if (pageid == 0) { + WT_RET(cursor->reset(cursor)); + return (cursor->next(cursor)); + } + + /* * Because of the special visibility rules for lookaside, a new block * can appear in between our search and the block of interest. Keep * trying until we find it. @@ -459,7 +573,7 @@ __wt_las_cursor_position(WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) for (;;) { WT_CLEAR(las_key); cursor->set_key(cursor, - btree_id, pageid, (uint64_t)0, &las_key); + pageid, btree_id, (uint64_t)0, &las_key); WT_RET(cursor->search_near(cursor, &exact)); if (exact < 0) { WT_RET(cursor->next(cursor)); @@ -475,9 +589,9 @@ __wt_las_cursor_position(WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) * WT_CONNECTION::rollback_to_stable. */ WT_RET(cursor->get_key(cursor, - &las_id, &las_pageid, &las_counter, &las_key)); - if (las_id < btree_id || (las_id == btree_id && - pageid != 0 && las_pageid < pageid)) + &las_pageid, &las_id, &las_counter, &las_key)); + if (las_pageid < pageid || (las_pageid == pageid && + las_id < btree_id)) continue; } @@ -489,7 +603,7 @@ __wt_las_cursor_position(WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) /* * __wt_las_remove_block -- - * Remove all records matching a key prefix from the lookaside store. + * Remove all records for a given page from the lookaside store. 
*/ int __wt_las_remove_block(WT_SESSION_IMPL *session, @@ -497,18 +611,29 @@ __wt_las_remove_block(WT_SESSION_IMPL *session, { WT_DECL_RET; WT_ITEM las_key; + WT_SESSION_IMPL *las_session; uint64_t las_counter, las_pageid, remove_cnt; uint32_t las_id, session_flags; - bool local_cursor; + bool local_cursor, local_txn; remove_cnt = 0; session_flags = 0; /* [-Wconditional-uninitialized] */ - local_cursor = false; + local_cursor = local_txn = false; if (cursor == NULL) { __wt_las_cursor(session, &cursor, &session_flags); local_cursor = true; } + las_session = (WT_SESSION_IMPL *)cursor->session; + + /* + * Wrap all of the removes in a transaction, unless this remove is part + * of a larger operation. + */ + if (local_cursor) { + WT_ERR(__wt_txn_begin(las_session, NULL)); + local_txn = true; + } /* * Search for the block's unique prefix and step through all matching @@ -517,16 +642,13 @@ __wt_las_remove_block(WT_SESSION_IMPL *session, ret = __wt_las_cursor_position(cursor, btree_id, pageid); for (; ret == 0; ret = cursor->next(cursor)) { WT_ERR(cursor->get_key(cursor, - &las_id, &las_pageid, &las_counter, &las_key)); + &las_pageid, &las_id, &las_counter, &las_key)); /* * Confirm the search using the unique prefix; if not a match, - * we're done searching for records for this page. Note that - * page ID zero is special: it is a wild card indicating that - * all pages in the tree should be removed. + * we're done searching for records for this page. 
*/ - if (las_id != btree_id || - (pageid != 0 && las_pageid != pageid)) + if (las_pageid != pageid || las_id != btree_id) break; WT_ERR(cursor->remove(cursor)); @@ -534,9 +656,218 @@ __wt_las_remove_block(WT_SESSION_IMPL *session, } WT_ERR_NOTFOUND_OK(ret); -err: if (local_cursor) - WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); +err: if (local_txn) { + if (ret == 0) + ret = __wt_txn_commit(las_session, NULL); + else + WT_TRET(__wt_txn_rollback(las_session, NULL)); + } + if (local_cursor) + WT_TRET(__wt_las_cursor_close( + session, &cursor, session_flags)); WT_STAT_CONN_DECRV(session, cache_lookaside_entries, remove_cnt); + __wt_cache_decr_check_uint64(session, + &S2C(session)->cache->las_entry_count, remove_cnt, + "lookaside entry count"); + return (ret); +} + +/* + * __wt_las_save_dropped -- + * Save a dropped btree ID to be swept from the lookaside table. + */ +int +__wt_las_save_dropped(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_CACHE *cache; + WT_DECL_RET; + + btree = S2BT(session); + cache = S2C(session)->cache; + + __wt_spin_lock(session, &cache->las_sweep_lock); + WT_ERR(__wt_realloc_def(session, &cache->las_dropped_alloc, + cache->las_dropped_next + 1, &cache->las_dropped)); + cache->las_dropped[cache->las_dropped_next++] = btree->id; +err: __wt_spin_unlock(session, &cache->las_sweep_lock); + return (ret); +} + +/* + * __las_sweep_init -- + * Prepare to start a lookaside sweep. + */ +static int +__las_sweep_init(WT_SESSION_IMPL *session) +{ + WT_CACHE *cache; + WT_DECL_RET; + u_int i; + + cache = S2C(session)->cache; + + __wt_spin_lock(session, &cache->las_sweep_lock); + /* If no files have been dropped, there's nothing to do. */ + if (cache->las_dropped_next == 0) + WT_ERR(WT_NOTFOUND); + + /* Scan the btree IDs to find min/max. 
*/ + cache->las_sweep_dropmin = UINT32_MAX; + cache->las_sweep_dropmax = 0; + for (i = 0; i < cache->las_dropped_next; i++) { + cache->las_sweep_dropmin = WT_MIN( + cache->las_sweep_dropmin, + cache->las_dropped[i]); + cache->las_sweep_dropmax = WT_MAX( + cache->las_sweep_dropmax, + cache->las_dropped[i]); + } + + /* Initialize the bitmap. */ + __wt_free(session, cache->las_sweep_dropmap); + WT_ERR(__bit_alloc(session, + 1 + cache->las_sweep_dropmax - cache->las_sweep_dropmin, + &cache->las_sweep_dropmap)); + for (i = 0; i < cache->las_dropped_next; i++) + __bit_set(cache->las_sweep_dropmap, + cache->las_dropped[i] - cache->las_sweep_dropmin); + + /* Clear the list of btree IDs. */ + cache->las_dropped_next = 0; + +err: __wt_spin_unlock(session, &cache->las_sweep_lock); + return (ret); +} + +/* + * __wt_las_sweep -- + * Sweep the lookaside table. + */ +int +__wt_las_sweep(WT_SESSION_IMPL *session) +{ + WT_CACHE *cache; + WT_CURSOR *cursor; + WT_DECL_RET; + WT_ITEM *key, las_key; + uint64_t cnt, las_counter, las_pageid, remove_cnt; + uint32_t las_id, session_flags; + int notused; + + cache = S2C(session)->cache; + cursor = NULL; + key = &cache->las_sweep_key; + remove_cnt = 0; + session_flags = 0; /* [-Werror=maybe-uninitialized] */ + + __wt_las_cursor(session, &cursor, &session_flags); + + /* We should have our own session. */ + WT_ASSERT(session, cursor->session == &session->iface); + + /* + * When continuing a sweep, position the cursor using the key from the + * last call (we don't care if we're before or after the key, either + * side is fine). + * + * Otherwise, we're starting a new sweep, gather the list of trees to + * sweep. + */ + if (key->size != 0) { + __wt_cursor_set_raw_key(cursor, key); + ret = cursor->search_near(cursor, ¬used); + + /* + * Don't search for the same key twice; if we don't set a new + * key below, it's because we've reached the end of the table + * and we want the next pass to start at the beginning of the + * table. 
Searching for the same key could leave us stuck at + * the end of the table, repeatedly checking the same rows. + */ + key->size = 0; + } else + ret = __las_sweep_init(session); + + if (ret != 0) + goto srch_notfound; + + /* + * The sweep server wakes up every 10 seconds (by default), it's a slow + * moving thread. Try to review the entire lookaside table once every 5 + * minutes, or every 30 calls. + * + * The reason is because the lookaside table exists because we're seeing + * cache/eviction pressure (it allows us to trade performance and disk + * space for cache space), and it's likely lookaside blocks are being + * evicted, and reading them back in doesn't help things. A trickier, + * but possibly better, alternative might be to review all lookaside + * blocks in the cache in order to get rid of them, and slowly review + * lookaside blocks that have already been evicted. + */ + cnt = (uint64_t)WT_MAX(100, cache->las_entry_count / 30); + + /* Walk the file. */ + for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) { + /* + * Give up if the cache is stuck: we are ignoring the cache + * size while scanning the lookaside table, so we're making + * things worse. + */ + if (__wt_cache_stuck(session)) + cnt = 1; + + /* + * If the loop terminates after completing a work unit, we will + * continue the table sweep next time. Get a local copy of the + * sweep key, we're going to reset the cursor; do so before + * calling cursor.remove, cursor.remove can discard our hazard + * pointer and the page could be evicted from underneath us. + */ + if (cnt == 1) { + WT_ERR(__wt_cursor_get_raw_key(cursor, key)); + if (!WT_DATA_IN_ITEM(key)) + WT_ERR(__wt_buf_set( + session, key, key->data, key->size)); + } + + WT_ERR(cursor->get_key(cursor, + &las_pageid, &las_id, &las_counter, &las_key)); + + /* + * If the entry belongs to a dropped tree, discard it. 
+ * + * Cursor opened overwrite=true: won't return WT_NOTFOUND + * should another thread remove the record before we do (not + * expected for dropped trees), and the cursor remains + * positioned in that case. + * + * TODO it would also be good to remove entries in lookaside + * from live files that have aged out. If we track for each + * entry whether it was the on-page value chosen by + * reconciliation, we can safely remove entries from that point + * on (for the given key) that are visible to all readers. + */ + if (__bit_test(cache->las_sweep_dropmap, + las_id - cache->las_sweep_dropmin)) { + WT_ERR(cursor->remove(cursor)); + ++remove_cnt; + } + } + +srch_notfound: + WT_ERR_NOTFOUND_OK(ret); + + if (0) { +err: __wt_buf_free(session, key); + } + + WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); + + __wt_cache_decr_check_uint64(session, + &S2C(session)->cache->las_entry_count, remove_cnt, + "lookaside entry count"); + return (ret); } diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index e7ead608672..f0e1dc1f701 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -179,12 +179,12 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\"," "\"checkpoint_progress\",\"compact\",\"evict\",\"evict_stuck\"," - "\"evictserver\",\"fileops\",\"handleops\",\"log\"," + "\"evictserver\",\"fileops\",\"handleops\",\"log\",\"lookaside\"," "\"lookaside_activity\",\"lsm\",\"lsm_manager\",\"metadata\"," "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," - "\"split\",\"temporary\",\"thread_group\",\"timestamp\"," - "\"transaction\",\"verify\",\"version\",\"write\"]", + "\"split\",\"thread_group\",\"timestamp\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", 
NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } }; @@ -834,12 +834,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\"," "\"checkpoint_progress\",\"compact\",\"evict\",\"evict_stuck\"," - "\"evictserver\",\"fileops\",\"handleops\",\"log\"," + "\"evictserver\",\"fileops\",\"handleops\",\"log\",\"lookaside\"," "\"lookaside_activity\",\"lsm\",\"lsm_manager\",\"metadata\"," "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," - "\"split\",\"temporary\",\"thread_group\",\"timestamp\"," - "\"transaction\",\"verify\",\"version\",\"write\"]", + "\"split\",\"thread_group\",\"timestamp\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "write_through", "list", NULL, "choices=[\"data\",\"log\"]", @@ -929,12 +929,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\"," "\"checkpoint_progress\",\"compact\",\"evict\",\"evict_stuck\"," - "\"evictserver\",\"fileops\",\"handleops\",\"log\"," + "\"evictserver\",\"fileops\",\"handleops\",\"log\",\"lookaside\"," "\"lookaside_activity\",\"lsm\",\"lsm_manager\",\"metadata\"," "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," - "\"split\",\"temporary\",\"thread_group\",\"timestamp\"," - "\"transaction\",\"verify\",\"version\",\"write\"]", + "\"split\",\"thread_group\",\"timestamp\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "version", "string", NULL, NULL, NULL, 0 }, { "write_through", "list", @@ -1019,12 +1019,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\"," "\"checkpoint_progress\",\"compact\",\"evict\",\"evict_stuck\"," - "\"evictserver\",\"fileops\",\"handleops\",\"log\"," + 
"\"evictserver\",\"fileops\",\"handleops\",\"log\",\"lookaside\"," "\"lookaside_activity\",\"lsm\",\"lsm_manager\",\"metadata\"," "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," - "\"split\",\"temporary\",\"thread_group\",\"timestamp\"," - "\"transaction\",\"verify\",\"version\",\"write\"]", + "\"split\",\"thread_group\",\"timestamp\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "version", "string", NULL, NULL, NULL, 0 }, { "write_through", "list", @@ -1109,12 +1109,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\"," "\"checkpoint_progress\",\"compact\",\"evict\",\"evict_stuck\"," - "\"evictserver\",\"fileops\",\"handleops\",\"log\"," + "\"evictserver\",\"fileops\",\"handleops\",\"log\",\"lookaside\"," "\"lookaside_activity\",\"lsm\",\"lsm_manager\",\"metadata\"," "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," - "\"split\",\"temporary\",\"thread_group\",\"timestamp\"," - "\"transaction\",\"verify\",\"version\",\"write\"]", + "\"split\",\"thread_group\",\"timestamp\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "write_through", "list", NULL, "choices=[\"data\",\"log\"]", diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index 5f77f27ee3f..fd8fd6763db 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -1816,7 +1816,8 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) { "fileops", WT_VERB_FILEOPS }, { "handleops", WT_VERB_HANDLEOPS }, { "log", WT_VERB_LOG }, - { "lookaside_activity", WT_VERB_LOOKASIDE }, + { "lookaside", WT_VERB_LOOKASIDE }, + { "lookaside_activity", WT_VERB_LOOKASIDE_ACTIVITY }, { "lsm", WT_VERB_LSM }, { 
"lsm_manager", WT_VERB_LSM_MANAGER }, { "metadata", WT_VERB_METADATA }, @@ -1830,7 +1831,6 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) { "salvage", WT_VERB_SALVAGE }, { "shared_cache", WT_VERB_SHARED_CACHE }, { "split", WT_VERB_SPLIT }, - { "temporary", WT_VERB_TEMPORARY }, { "thread_group", WT_VERB_THREAD_GROUP }, { "timestamp", WT_VERB_TIMESTAMP }, { "transaction", WT_VERB_TRANSACTION }, diff --git a/src/third_party/wiredtiger/src/conn/conn_cache.c b/src/third_party/wiredtiger/src/conn/conn_cache.c index 007aa8757da..76106b3592f 100644 --- a/src/third_party/wiredtiger/src/conn/conn_cache.c +++ b/src/third_party/wiredtiger/src/conn/conn_cache.c @@ -198,6 +198,10 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET_MSG(NULL, ret, "Failed to create session for eviction walks"); + WT_RET(__wt_spin_init(session, &cache->las_lock, "lookaside table")); + WT_RET(__wt_spin_init( + session, &cache->las_sweep_lock, "lookaside sweep")); + /* Allocate the LRU eviction queue. 
*/ cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR; for (i = 0; i < WT_EVICT_QUEUE_MAX; ++i) { @@ -334,6 +338,8 @@ __wt_cache_destroy(WT_SESSION_IMPL *session) __wt_spin_destroy(session, &cache->evict_pass_lock); __wt_spin_destroy(session, &cache->evict_queue_lock); __wt_spin_destroy(session, &cache->evict_walk_lock); + __wt_spin_destroy(session, &cache->las_lock); + __wt_spin_destroy(session, &cache->las_sweep_lock); wt_session = &cache->walk_session->iface; if (wt_session != NULL) WT_TRET(wt_session->close(wt_session, NULL)); diff --git a/src/third_party/wiredtiger/src/conn/conn_handle.c b/src/third_party/wiredtiger/src/conn/conn_handle.c index 2f3f9488b58..42ae866b329 100644 --- a/src/third_party/wiredtiger/src/conn/conn_handle.c +++ b/src/third_party/wiredtiger/src/conn/conn_handle.c @@ -55,7 +55,6 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_SPIN_INIT_TRACKED(session, &conn->checkpoint_lock, checkpoint); WT_RET(__wt_spin_init(session, &conn->encryptor_lock, "encryptor")); WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list")); - WT_RET(__wt_spin_init(session, &conn->las_lock, "lookaside table")); WT_SPIN_INIT_TRACKED(session, &conn->metadata_lock, metadata); WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure")); WT_SPIN_INIT_TRACKED(session, &conn->schema_lock, schema); @@ -125,7 +124,6 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_destroy(session, &conn->encryptor_lock); __wt_spin_destroy(session, &conn->fh_lock); __wt_rwlock_destroy(session, &conn->hot_backup_lock); - __wt_spin_destroy(session, &conn->las_lock); __wt_spin_destroy(session, &conn->metadata_lock); __wt_spin_destroy(session, &conn->reconfig_lock); __wt_spin_destroy(session, &conn->schema_lock); diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c index 9b64c7a0f77..06e441a3037 100644 --- a/src/third_party/wiredtiger/src/conn/conn_sweep.c +++ 
b/src/third_party/wiredtiger/src/conn/conn_sweep.c @@ -278,10 +278,12 @@ __sweep_server(void *arg) WT_DECL_RET; WT_SESSION_IMPL *session; time_t now; + uint64_t last_las_sweep_id, oldest_id; u_int dead_handles; session = arg; conn = S2C(session); + last_las_sweep_id = WT_TXN_NONE; /* * Sweep for dead and excess handles. @@ -300,6 +302,26 @@ __sweep_server(void *arg) WT_STAT_CONN_INCR(session, dh_sweeps); /* + * Sweep the lookaside table. If the lookaside table hasn't yet + * been written, there's no work to do. + * + * Don't sweep the lookaside table if the cache is stuck full. + * The sweep uses the cache and can exacerbate the problem. + * If we try to sweep when the cache is full or we aren't + * making progress in eviction, sweeping can wind up constantly + * bringing in and evicting pages from the lookaside table, + * which will stop the cache from moving into the stuck state. + */ + if (__wt_las_nonempty(session) && + !__wt_cache_stuck(session)) { + oldest_id = __wt_txn_oldest_id(session); + if (WT_TXNID_LT(last_las_sweep_id, oldest_id)) { + WT_ERR(__wt_las_sweep(session)); + last_las_sweep_id = oldest_id; + } + } + + /* * Mark handles with a time of death, and report whether any * handles are marked dead. If sweep_idle_time is 0, handles * never become idle. @@ -379,15 +401,21 @@ __wt_sweep_create(WT_SESSION_IMPL *session) /* * Handle sweep does enough I/O it may be called upon to perform slow - * operations for the block manager. - * - * Don't tap the sweep thread for eviction. + * operations for the block manager. Sweep should not block due to the + * cache being full. */ - session_flags = WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION; + session_flags = WT_SESSION_CAN_WAIT | WT_SESSION_IGNORE_CACHE_SIZE; WT_RET(__wt_open_internal_session( conn, "sweep-server", true, session_flags, &conn->sweep_session)); session = conn->sweep_session; + /* + * Sweep should have it's own lookaside cursor to avoid blocking reads + * and eviction when processing drops. 
+ */ + if (F_ISSET(conn, WT_CONN_LOOKASIDE_OPEN)) + WT_RET(__wt_las_cursor_open(session)); + WT_RET(__wt_cond_alloc( session, "handle sweep server", &conn->sweep_cond)); diff --git a/src/third_party/wiredtiger/src/cursor/cur_join.c b/src/third_party/wiredtiger/src/cursor/cur_join.c index e1fbb63178f..bcd3943122d 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_join.c +++ b/src/third_party/wiredtiger/src/cursor/cur_join.c @@ -532,7 +532,8 @@ typedef struct { * Handle a key produced by a custom extractor. */ static int -__curjoin_extract_insert(WT_CURSOR *cursor) { +__curjoin_extract_insert(WT_CURSOR *cursor) +{ WT_CURJOIN_EXTRACTOR *cextract; WT_DECL_RET; WT_ITEM ikey; diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c index 9296038bd96..9cfa3203aec 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_std.c +++ b/src/third_party/wiredtiger/src/cursor/cur_std.c @@ -570,8 +570,7 @@ __wt_cursor_close(WT_CURSOR *cursor) __wt_buf_free(session, &cursor->value); __wt_free(session, cursor->internal_uri); - if (!F_ISSET(cursor, WT_CURSTD_URI_SHARED)) - __wt_free(session, cursor->uri); + __wt_free(session, cursor->uri); __wt_overwrite_and_free(session, cursor); return (0); } diff --git a/src/third_party/wiredtiger/src/cursor/cur_table.c b/src/third_party/wiredtiger/src/cursor/cur_table.c index 78d508a4e9d..429f75208f2 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_table.c +++ b/src/third_party/wiredtiger/src/cursor/cur_table.c @@ -33,7 +33,8 @@ typedef struct { * Handle a key produced by a custom extractor. */ static int -__curextract_insert(WT_CURSOR *cursor) { +__curextract_insert(WT_CURSOR *cursor) +{ WT_CURSOR_EXTRACTOR *cextract; WT_ITEM *key, ikey, pkey; WT_SESSION_IMPL *session; @@ -135,12 +136,13 @@ __wt_apply_single_idx(WT_SESSION_IMPL *session, WT_INDEX *idx, * Apply an operation to all indices of a table. 
*/ static int -__apply_idx(WT_CURSOR_TABLE *ctable, size_t func_off, bool skip_immutable) { +__apply_idx(WT_CURSOR_TABLE *ctable, size_t func_off, bool skip_immutable) +{ WT_CURSOR **cp; WT_INDEX *idx; WT_SESSION_IMPL *session; - int (*f)(WT_CURSOR *); u_int i; + int (*f)(WT_CURSOR *); cp = ctable->idx_cursors; session = (WT_SESSION_IMPL *)ctable->iface.session; @@ -987,22 +989,15 @@ __wt_curtable_open(WT_SESSION_IMPL *session, if (table->is_simple) { /* Just return a cursor on the underlying data source. */ - if (table->is_simple_file) - ret = __wt_curfile_open(session, - table->cgroups[0]->source, NULL, cfg, cursorp); - else - ret = __wt_open_cursor(session, - table->cgroups[0]->source, NULL, cfg, cursorp); + ret = __wt_open_cursor(session, + table->cgroups[0]->source, NULL, cfg, cursorp); WT_TRET(__wt_schema_release_table(session, table)); if (ret == 0) { /* Fix up the public URI to match what was passed in. */ cursor = *cursorp; - if (!F_ISSET(cursor, WT_CURSTD_URI_SHARED)) - __wt_free(session, cursor->uri); - cursor->uri = table->iface.name; - WT_ASSERT(session, strcmp(uri, cursor->uri) == 0); - F_SET(cursor, WT_CURSTD_URI_SHARED); + __wt_free(session, cursor->uri); + WT_TRET(__wt_strdup(session, uri, &cursor->uri)); } return (ret); } diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c index 147b615c0ab..13e2823d234 100644 --- a/src/third_party/wiredtiger/src/evict/evict_file.c +++ b/src/third_party/wiredtiger/src/evict/evict_file.c @@ -54,10 +54,11 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) */ if (F_ISSET(dhandle, WT_DHANDLE_DEAD) && F_ISSET(S2C(session), WT_CONN_LOOKASIDE_OPEN) && - !F_ISSET(btree, WT_BTREE_LOOKASIDE)) { - WT_ASSERT(session, !WT_IS_METADATA(dhandle)); + btree->lookaside_entries) { + WT_ASSERT(session, !WT_IS_METADATA(dhandle) && + !F_ISSET(btree, WT_BTREE_LOOKASIDE)); - WT_RET(__wt_las_remove_block(session, NULL, btree->id, 0)); + 
WT_RET(__wt_las_save_dropped(session)); } else FLD_SET(walk_flags, WT_READ_LOOKASIDE); diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 02851492039..3af5338d73f 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -75,7 +75,8 @@ __evict_entry_priority(WT_SESSION_IMPL *session, WT_REF *ref) return (WT_READGEN_OLDEST); /* Any page from a dead tree is a great choice. */ - if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD)) + if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD) || + F_ISSET(btree, WT_BTREE_LOOKASIDE)) return (WT_READGEN_OLDEST); /* Any empty page (leaf or internal), is a good choice. */ @@ -606,6 +607,21 @@ __evict_update_work(WT_SESSION_IMPL *session) F_SET(cache, WT_CACHE_EVICT_SCRUB); /* + * Try lookaside evict when: + * (1) the cache is stuck; OR + * (2) the lookaside score goes over 80; and + * (3) the cache is more than half way from the dirty target to the + * dirty trigger. + */ + if (!F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE) && + (__wt_cache_stuck(session) || + (__wt_cache_lookaside_score(cache) > 80 && + dirty_inuse > (uint64_t) + ((cache->eviction_dirty_target + cache->eviction_dirty_trigger) * + bytes_max) / 200))) + F_SET(cache, WT_CACHE_EVICT_LOOKASIDE); + + /* * With an in-memory cache, we only do dirty eviction in order to scrub * pages. */ @@ -1632,6 +1648,28 @@ __evict_walk_file(WT_SESSION_IMPL *session, QUEUE_FILLS_PER_PASS; /* + * If the tree is dead or we're near the end of the queue, fill the + * remaining slots. + */ + if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) + target_pages = remaining_slots; + + /* + * Lookaside pages don't count toward the cache's dirty limit. + * + * Preferentially evict lookaside pages unless applications are stalled + * on the dirty limit. 
Once application threads are stalled by the + * dirty limit, don't take any lookaside pages unless we're also up + * against the total cache size limit. + */ + if (F_ISSET(btree, WT_BTREE_LOOKASIDE)) { + if (!F_ISSET(cache, WT_CACHE_EVICT_DIRTY_HARD)) + target_pages = remaining_slots; + else if (!F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD)) + target_pages = 0; + } + + /* * Walk trees with a small fraction of the cache in case there are so * many trees that none of them use enough of the cache to be allocated * slots. Only skip a tree if it has no bytes of interest. @@ -1652,12 +1690,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, if (target_pages < MIN_PAGES_PER_TREE) target_pages = MIN_PAGES_PER_TREE; - /* - * If the tree is dead or we're near the end of the queue, fill the - * remaining slots. - */ - if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || - target_pages > remaining_slots) + if (target_pages > remaining_slots) target_pages = remaining_slots; /* @@ -1993,8 +2026,8 @@ fast: /* If the page can't be evicted, give up. */ if (restarts == 0) WT_STAT_CONN_INCR( session, cache_eviction_walks_abandoned); - WT_RET(__wt_page_release(cache->walk_session, - ref, WT_READ_NO_EVICT)); + WT_RET(__wt_page_release( + cache->walk_session, ref, walk_flags)); ref = NULL; } else if (WT_READGEN_EVICT_SOON(ref->page->read_gen)) WT_RET_NOTFOUND_OK(__wt_tree_walk_count( @@ -2315,8 +2348,9 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) /* See if eviction is still needed. 
*/ if (!__wt_eviction_needed(session, busy, &pct_full) || - (pct_full < 100 && cache->eviction_progress > - initial_progress + max_progress)) + ((pct_full < 100 || cache->eviction_scrub_limit > 0.0) && + (cache->eviction_progress > + initial_progress + max_progress))) break; /* diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index 103c93a075b..65009dc3449 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -522,6 +522,13 @@ __evict_review( return (0); /* + * If reconciliation is disabled for this thread (e.g., during an + * eviction that writes to lookaside), give up. + */ + if (F_ISSET(session, WT_SESSION_NO_RECONCILE)) + return (EBUSY); + + /* * If the page is dirty, reconcile it to decide if we can evict it. * * If we have an exclusive lock (we're discarding the tree), assert @@ -575,9 +582,7 @@ __evict_review( * that can't be evicted, check if reconciliation * suggests trying the lookaside table. */ - if (!F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE) && - (__wt_cache_lookaside_score(cache) > 50 || - __wt_cache_stuck(session))) + if (F_ISSET(cache, WT_CACHE_EVICT_LOOKASIDE)) lookaside_retryp = &lookaside_retry; } } diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h index aa080d2bcca..2e3a2fe220f 100644 --- a/src/third_party/wiredtiger/src/include/api.h +++ b/src/third_party/wiredtiger/src/include/api.h @@ -6,12 +6,41 @@ * See the file LICENSE for redistribution information. */ +#ifdef HAVE_DIAGNOSTIC +/* + * Capture cases where a single session handle is used by multiple threads + * in parallel. The check isn't trivial because some API calls re-enter + * via public API entry points and the session with ID 0 is the default + * session in the connection handle which can be used across multiple threads. 
+ * It is safe to use the reference count without atomic operations because the + * reference count is only tracking a thread re-entering the API. + */ +#define WT_SINGLE_THREAD_CHECK_START(s) \ + { \ + uintmax_t __tmp_api_tid; \ + __wt_thread_id(&__tmp_api_tid); \ + WT_ASSERT(session, (s)->id == 0 || (s)->api_tid == 0 || \ + (s)->api_tid == __tmp_api_tid); \ + if ((s)->api_tid == 0) \ + WT_PUBLISH((s)->api_tid, __tmp_api_tid); \ + ++(s)->api_enter_refcnt; \ + } + +#define WT_SINGLE_THREAD_CHECK_STOP(s) \ + if (--(s)->api_enter_refcnt == 0) \ + WT_PUBLISH((s)->api_tid, 0); +#else +#define WT_SINGLE_THREAD_CHECK_START(s) +#define WT_SINGLE_THREAD_CHECK_STOP(s) +#endif + /* Standard entry points to the API: declares/initializes local variables. */ #define API_SESSION_INIT(s, h, n, dh) \ WT_DATA_HANDLE *__olddh = (s)->dhandle; \ const char *__oldname = (s)->name; \ (s)->dhandle = (dh); \ (s)->name = (s)->lastop = #h "." #n; \ + WT_SINGLE_THREAD_CHECK_START(s); \ WT_ERR(WT_SESSION_CHECK_PANIC(s)); \ __wt_verbose((s), WT_VERB_API, "%s", "CALL: " #h ":" #n) @@ -28,6 +57,7 @@ #define API_END(s, ret) \ if ((s) != NULL) { \ + WT_SINGLE_THREAD_CHECK_STOP(s); \ (s)->dhandle = __olddh; \ (s)->name = __oldname; \ if (F_ISSET(&(s)->txn, WT_TXN_RUNNING) && \ diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index c3646a2ae59..abb7cc19972 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -167,11 +167,12 @@ struct __wt_ovfl_reuse { * are written into a lookaside table, and restored as necessary if the page is * read. * - * The key is a unique marker for the page (a file ID plus a page ID), a - * counter (used to ensure the update records remain in the original order), - * and the record's key (byte-string for row-store, record number for - * column-store). The value is the WT_UPDATE structure's transaction ID, - * timestamp, update type and value. 
+ * The key is a unique marker for the page (a page ID plus a file ID, ordered + * this way so that overall the lookaside table is append-mostly), a counter + * (used to ensure the update records remain in the original order), and the + * record's key (byte-string for row-store, record number for column-store). + * The value is the WT_UPDATE structure's transaction ID, timestamp, update + * type and value. * * As the key for the lookaside table is different for row- and column-store, we * store both key types in a WT_ITEM, building/parsing them in the code, because @@ -181,8 +182,8 @@ struct __wt_ovfl_reuse { * makes the lookaside table's value more likely to overflow the page size when * the row-store key is relatively large. */ -#define WT_LAS_FORMAT \ - "key_format=" WT_UNCHECKED_STRING(IQQu) \ +#define WT_LAS_CONFIG \ + "key_format=" WT_UNCHECKED_STRING(QIQu) \ ",value_format=" WT_UNCHECKED_STRING(QuBu) /* diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h index 7dc9b4a11a7..8a3273d1b6b 100644 --- a/src/third_party/wiredtiger/src/include/btree.h +++ b/src/third_party/wiredtiger/src/include/btree.h @@ -134,13 +134,13 @@ struct __wt_btree { u_int rec_multiblock_max; /* Maximum blocks written for a page */ uint64_t last_recno; /* Column-store last record number */ - uint64_t las_pageid; /* Lookaside table page ID counter */ WT_REF root; /* Root page reference */ bool modified; /* If the tree ever modified */ uint8_t original; /* Newly created: bulk-load possible (want a bool but needs atomic cas) */ + bool lookaside_entries; /* Has entries in the lookaside table */ bool lsm_primary; /* Handle is/was the LSM primary */ WT_BM *bm; /* Block manager reference */ diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index f2948bfc90f..edc0973ee6f 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i 
@@ -149,7 +149,8 @@ __wt_cache_page_inmem_incr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size) if (WT_PAGE_IS_INTERNAL(page)) { (void)__wt_atomic_add64(&btree->bytes_dirty_intl, size); (void)__wt_atomic_add64(&cache->bytes_dirty_intl, size); - } else if (!btree->lsm_primary) { + } else if (!btree->lsm_primary && + !F_ISSET(btree, WT_BTREE_LOOKASIDE)) { (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size); (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size); } @@ -189,7 +190,7 @@ __wt_cache_decr_check_size( */ static inline void __wt_cache_decr_check_uint64( - WT_SESSION_IMPL *session, uint64_t *vp, size_t v, const char *fld) + WT_SESSION_IMPL *session, uint64_t *vp, uint64_t v, const char *fld) { if (__wt_atomic_sub64(vp, v) < WT_EXABYTE) return; @@ -200,7 +201,7 @@ __wt_cache_decr_check_uint64( */ *vp = 0; __wt_errx(session, - "%s went negative with decrement of %" WT_SIZET_FMT, fld, v); + "%s went negative with decrement of %" PRIu64, fld, v); #ifdef HAVE_DIAGNOSTIC __wt_abort(session); @@ -261,7 +262,7 @@ __wt_cache_page_byte_dirty_decr( decr, "WT_BTREE.bytes_dirty_intl"); __wt_cache_decr_check_uint64(session, &cache->bytes_dirty_intl, decr, "WT_CACHE.bytes_dirty_intl"); - } else if (!btree->lsm_primary) { + } else if (!btree->lsm_primary && !F_ISSET(btree, WT_BTREE_LOOKASIDE)) { __wt_cache_decr_check_uint64(session, &btree->bytes_dirty_leaf, decr, "WT_BTREE.bytes_dirty_leaf"); __wt_cache_decr_check_uint64(session, &cache->bytes_dirty_leaf, @@ -321,7 +322,8 @@ __wt_cache_dirty_incr(WT_SESSION_IMPL *session, WT_PAGE *page) (void)__wt_atomic_add64(&cache->bytes_dirty_intl, size); (void)__wt_atomic_add64(&cache->pages_dirty_intl, 1); } else { - if (!btree->lsm_primary) { + if (!btree->lsm_primary && + !F_ISSET(btree, WT_BTREE_LOOKASIDE)) { (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size); (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size); } @@ -420,7 +422,8 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page, bool rewrite) 
__wt_cache_decr_check_uint64(session, &cache->bytes_dirty_intl, modify->bytes_dirty, "WT_CACHE.bytes_dirty_intl"); - } else if (!btree->lsm_primary) { + } else if (!btree->lsm_primary && + !F_ISSET(btree, WT_BTREE_LOOKASIDE)) { __wt_cache_decr_check_uint64(session, &btree->bytes_dirty_leaf, modify->bytes_dirty, "WT_BTREE.bytes_dirty_leaf"); @@ -1359,6 +1362,7 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) { WT_BTREE *btree; WT_PAGE *page; + bool inmem_split; btree = S2BT(session); @@ -1387,10 +1391,10 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) */ page = ref->page; if (!WT_READGEN_EVICT_SOON(page->read_gen) || - LF_ISSET(WT_READ_NO_EVICT) || - F_ISSET(session, WT_SESSION_NO_EVICTION) || + LF_ISSET(WT_READ_NO_SPLIT) || btree->evict_disabled > 0 || - !__wt_page_can_evict(session, ref, NULL)) + !__wt_page_can_evict(session, ref, &inmem_split) || + (F_ISSET(session, WT_SESSION_NO_RECONCILE) && !inmem_split)) return (__wt_hazard_clear(session, ref)); WT_RET_BUSY_OK(__wt_page_release_evict(session, ref)); @@ -1622,6 +1626,6 @@ __wt_ref_state_yield_sleep(uint64_t *yield_count, uint64_t *sleep_count) return; } - (*sleep_count) = WT_MIN((*sleep_count) + WT_THOUSAND, 10 * WT_THOUSAND); + (*sleep_count) = WT_MIN((*sleep_count) + 100, WT_THOUSAND); __wt_sleep(0, (*sleep_count)); } diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h index 0a42853b95b..f9ce4316e29 100644 --- a/src/third_party/wiredtiger/src/include/cache.h +++ b/src/third_party/wiredtiger/src/include/cache.h @@ -7,6 +7,12 @@ */ /* + * Helper: in order to read without any calls to eviction, we have to ignore + * the cache size and disable splits. + */ +#define WT_READ_NO_EVICT (WT_READ_IGNORE_CACHE_SIZE | WT_READ_NO_SPLIT) + +/* * Tuning constants: I hesitate to call this tuning, but we want to review some * number of pages from each file's in-memory tree for each page we evict. 
*/ @@ -176,6 +182,38 @@ struct __wt_cache { int32_t evict_lookaside_score; /* + * Shared lookaside lock, session and cursor, used by threads accessing + * the lookaside table (other than eviction server and worker threads + * and the sweep thread, all of which have their own lookaside cursors). + */ +#define WT_LAS_NUM_SESSIONS 5 + WT_SPINLOCK las_lock; + WT_SESSION_IMPL *las_session[WT_LAS_NUM_SESSIONS]; + bool las_session_inuse[WT_LAS_NUM_SESSIONS]; + + uint32_t las_fileid; /* Lookaside table file ID */ + uint64_t las_entry_count; /* Count of entries in lookaside */ + uint64_t las_pageid; /* Lookaside table page ID counter */ + + WT_SPINLOCK las_sweep_lock; + WT_ITEM las_sweep_key; /* Track sweep position. */ + uint32_t las_sweep_dropmin; /* Minimum btree ID in current set. */ + uint8_t *las_sweep_dropmap; /* Bitmap of dropped btree IDs. */ + uint32_t las_sweep_dropmax; /* Maximum btree ID in current set. */ + + uint32_t *las_dropped; /* List of dropped btree IDs. */ + size_t las_dropped_next; /* Next index into drop list. */ + size_t las_dropped_alloc; /* Allocated size of drop list. */ + + /* + * The "lookaside_activity" verbose messages are throttled to once per + * checkpoint. To accomplish this we track the checkpoint generation + * for the most recent read and write verbose messages. + */ + uint64_t las_verb_gen_read; + uint64_t las_verb_gen_write; + + /* * Cache pool information. 
*/ uint64_t cp_pass_pressure; /* Calculated pressure from this pass */ @@ -200,8 +238,9 @@ struct __wt_cache { #define WT_CACHE_EVICT_CLEAN_HARD 0x002 /* Clean % blocking app threads */ #define WT_CACHE_EVICT_DIRTY 0x004 /* Evict dirty pages */ #define WT_CACHE_EVICT_DIRTY_HARD 0x008 /* Dirty % blocking app threads */ -#define WT_CACHE_EVICT_SCRUB 0x010 /* Scrub dirty pages */ -#define WT_CACHE_EVICT_URGENT 0x020 /* Pages are in the urgent queue */ +#define WT_CACHE_EVICT_LOOKASIDE 0x010 /* Try lookaside eviction */ +#define WT_CACHE_EVICT_SCRUB 0x020 /* Scrub dirty pages */ +#define WT_CACHE_EVICT_URGENT 0x040 /* Pages are in the urgent queue */ #define WT_CACHE_EVICT_ALL (WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_DIRTY) uint32_t flags; }; diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i index e160dbf4d64..c7d802f8a5f 100644 --- a/src/third_party/wiredtiger/src/include/cache.i +++ b/src/third_party/wiredtiger/src/include/cache.i @@ -241,12 +241,12 @@ __wt_session_can_wait(WT_SESSION_IMPL *session) return (false); /* - * LSM sets the no-eviction flag when holding the LSM tree lock, in that - * case, or when holding the schema lock, we don't want to highjack the - * thread for eviction. + * LSM sets the "ignore cache size" flag when holding the LSM tree + * lock, in that case, or when holding the schema lock, we don't want + * this thread to block for eviction. 
*/ - return (!F_ISSET( - session, WT_SESSION_NO_EVICTION | WT_SESSION_LOCKED_SCHEMA)); + return (!F_ISSET(session, + WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_LOCKED_SCHEMA)); } /* @@ -395,12 +395,12 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool *didworkp) txn_global->current != txn_global->oldest_id); /* - * LSM sets the no-cache-check flag when holding the LSM tree lock, in - * that case, or when holding the handle list, schema or table locks - * (which can block checkpoints and eviction), don't block the thread - * for eviction. + * LSM sets the "ignore cache size" flag when holding the LSM tree + * lock, in that case, or when holding the handle list, schema or table + * locks (which can block checkpoints and eviction), don't block the + * thread for eviction. */ - if (F_ISSET(session, WT_SESSION_NO_EVICTION | + if (F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_LOCKED_HANDLE_LIST | WT_SESSION_LOCKED_SCHEMA | WT_SESSION_LOCKED_TABLE)) return (0); diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index c1d1921bdcc..9288618c87e 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -358,23 +358,6 @@ struct __wt_connection_impl { uint64_t sweep_interval; /* Handle sweep interval */ uint64_t sweep_handles_min;/* Handle sweep minimum open */ - /* - * Shared lookaside lock, session and cursor, used by threads accessing - * the lookaside table (other than eviction server and worker threads - * and the sweep thread, all of which have their own lookaside cursors). - */ - WT_SPINLOCK las_lock; /* Lookaside table spinlock */ - WT_SESSION_IMPL *las_session; /* Lookaside table session */ - uint32_t las_fileid; /* Lookaside table file ID */ - - /* - * The "lookaside_activity" verbose messages are throttled to once per - * checkpoint. 
To accomplish this we track the checkpoint generation - * for the most recent read and write verbose messages. - */ - uint64_t las_verb_gen_read; - uint64_t las_verb_gen_write; - /* Set of btree IDs not being rolled back */ uint8_t *stable_rollback_bitstring; uint32_t stable_rollback_maxfile; diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index bbe66abf753..17afb48bda6 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -200,15 +200,18 @@ extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd); extern int __wt_search_insert(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert, bool restore) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern bool __wt_las_nonempty(WT_SESSION_IMPL *session); extern void __wt_las_stats_update(WT_SESSION_IMPL *session); extern int __wt_las_create(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_cursor_open(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_las_cursor( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags); -extern int __wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_las_insert_block(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CURSOR *cursor, WT_MULTI *multi, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int 
__wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_PAGE *page, WT_MULTI *multi, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_cursor_position(WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_las_save_dropped(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_las_sweep(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern uint32_t __wt_checksum_sw(const void *chunk, size_t len) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_checksum_init(void); extern void __wt_config_initn( WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str, size_t len); diff --git a/src/third_party/wiredtiger/src/include/extern_posix.h b/src/third_party/wiredtiger/src/include/extern_posix.h index 864a40aa325..624cd815dad 100644 --- a/src/third_party/wiredtiger/src/include/extern_posix.h +++ b/src/third_party/wiredtiger/src/include/extern_posix.h @@ -27,6 +27,7 @@ extern void __wt_sleep(uint64_t seconds, uint64_t micro_seconds) WT_GCC_FUNC_DEC extern int __wt_vsnprintf_len_incr( char *buf, size_t size, size_t *retsizep, const char *fmt, va_list ap) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_thread_id(char *buf, size_t buflen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern void __wt_thread_id(uintmax_t *id) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); +extern int __wt_thread_str(char *buf, size_t buflen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp); extern void __wt_yield(void) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); diff --git a/src/third_party/wiredtiger/src/include/extern_win.h b/src/third_party/wiredtiger/src/include/extern_win.h index 85db8175615..ae4195b20a1 100644 --- a/src/third_party/wiredtiger/src/include/extern_win.h +++ b/src/third_party/wiredtiger/src/include/extern_win.h @@ -25,7 +25,8 @@ extern void __wt_sleep(uint64_t seconds, uint64_t micro_seconds); extern int __wt_vsnprintf_len_incr( char *buf, size_t size, size_t *retsizep, const char *fmt, va_list ap) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_thread_id(char *buf, size_t buflen) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern void __wt_thread_id(uintmax_t *id); +extern int __wt_thread_str(char *buf, size_t buflen) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp); extern int __wt_to_utf16_string( WT_SESSION_IMPL *session, const char*utf8, WT_ITEM **outbuf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_to_utf8_string( WT_SESSION_IMPL 
*session, const wchar_t*wide, WT_ITEM **outbuf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h index 23be5fd2e14..b191e8fe01d 100644 --- a/src/third_party/wiredtiger/src/include/flags.h +++ b/src/third_party/wiredtiger/src/include/flags.h @@ -32,17 +32,18 @@ #define WT_LOG_FSYNC 0x00000008 #define WT_LOG_SYNC_ENABLED 0x00000010 #define WT_READ_CACHE 0x00000001 -#define WT_READ_LOOKASIDE 0x00000002 -#define WT_READ_NOTFOUND_OK 0x00000004 -#define WT_READ_NO_EMPTY 0x00000008 -#define WT_READ_NO_EVICT 0x00000010 +#define WT_READ_IGNORE_CACHE_SIZE 0x00000002 +#define WT_READ_LOOKASIDE 0x00000004 +#define WT_READ_NOTFOUND_OK 0x00000008 +#define WT_READ_NO_EMPTY 0x00000010 #define WT_READ_NO_GEN 0x00000020 -#define WT_READ_NO_WAIT 0x00000040 -#define WT_READ_PREV 0x00000080 -#define WT_READ_RESTART_OK 0x00000100 -#define WT_READ_SKIP_INTL 0x00000200 -#define WT_READ_TRUNCATE 0x00000400 -#define WT_READ_WONT_NEED 0x00000800 +#define WT_READ_NO_SPLIT 0x00000040 +#define WT_READ_NO_WAIT 0x00000080 +#define WT_READ_PREV 0x00000100 +#define WT_READ_RESTART_OK 0x00000200 +#define WT_READ_SKIP_INTL 0x00000400 +#define WT_READ_TRUNCATE 0x00000800 +#define WT_READ_WONT_NEED 0x00001000 #define WT_REC_CHECKPOINT 0x00000001 #define WT_REC_EVICT 0x00000002 #define WT_REC_IN_MEMORY 0x00000004 @@ -52,26 +53,27 @@ #define WT_REC_VISIBILITY_ERR 0x00000040 #define WT_REC_VISIBLE_ALL 0x00000080 #define WT_SESSION_CAN_WAIT 0x00000001 -#define WT_SESSION_INTERNAL 0x00000002 -#define WT_SESSION_LOCKED_CHECKPOINT 0x00000004 -#define WT_SESSION_LOCKED_HANDLE_LIST_READ 0x00000008 -#define WT_SESSION_LOCKED_HANDLE_LIST_WRITE 0x00000010 -#define WT_SESSION_LOCKED_METADATA 0x00000020 -#define WT_SESSION_LOCKED_PASS 0x00000040 -#define WT_SESSION_LOCKED_SCHEMA 0x00000080 -#define WT_SESSION_LOCKED_SLOT 0x00000100 -#define WT_SESSION_LOCKED_TABLE_READ 0x00000200 -#define 
WT_SESSION_LOCKED_TABLE_WRITE 0x00000400 -#define WT_SESSION_LOCKED_TURTLE 0x00000800 -#define WT_SESSION_LOGGING_INMEM 0x00001000 -#define WT_SESSION_LOOKASIDE_CURSOR 0x00002000 -#define WT_SESSION_NO_CACHE 0x00004000 +#define WT_SESSION_IGNORE_CACHE_SIZE 0x00000002 +#define WT_SESSION_INTERNAL 0x00000004 +#define WT_SESSION_LOCKED_CHECKPOINT 0x00000008 +#define WT_SESSION_LOCKED_HANDLE_LIST_READ 0x00000010 +#define WT_SESSION_LOCKED_HANDLE_LIST_WRITE 0x00000020 +#define WT_SESSION_LOCKED_METADATA 0x00000040 +#define WT_SESSION_LOCKED_PASS 0x00000080 +#define WT_SESSION_LOCKED_SCHEMA 0x00000100 +#define WT_SESSION_LOCKED_SLOT 0x00000200 +#define WT_SESSION_LOCKED_TABLE_READ 0x00000400 +#define WT_SESSION_LOCKED_TABLE_WRITE 0x00000800 +#define WT_SESSION_LOCKED_TURTLE 0x00001000 +#define WT_SESSION_LOGGING_INMEM 0x00002000 +#define WT_SESSION_LOOKASIDE_CURSOR 0x00004000 #define WT_SESSION_NO_DATA_HANDLES 0x00008000 -#define WT_SESSION_NO_EVICTION 0x00010000 -#define WT_SESSION_NO_LOGGING 0x00020000 +#define WT_SESSION_NO_LOGGING 0x00010000 +#define WT_SESSION_NO_RECONCILE 0x00020000 #define WT_SESSION_NO_SCHEMA_LOCK 0x00040000 #define WT_SESSION_QUIET_CORRUPT_FILE 0x00080000 -#define WT_SESSION_SERVER_ASYNC 0x00100000 +#define WT_SESSION_READ_WONT_NEED 0x00100000 +#define WT_SESSION_SERVER_ASYNC 0x00200000 #define WT_STAT_CLEAR 0x00000001 #define WT_STAT_JSON 0x00000002 #define WT_STAT_ON_CLOSE 0x00000004 @@ -102,20 +104,20 @@ #define WT_VERB_HANDLEOPS 0x00000200 #define WT_VERB_LOG 0x00000400 #define WT_VERB_LOOKASIDE 0x00000800 -#define WT_VERB_LSM 0x00001000 -#define WT_VERB_LSM_MANAGER 0x00002000 -#define WT_VERB_METADATA 0x00004000 -#define WT_VERB_MUTEX 0x00008000 -#define WT_VERB_OVERFLOW 0x00010000 -#define WT_VERB_READ 0x00020000 -#define WT_VERB_REBALANCE 0x00040000 -#define WT_VERB_RECONCILE 0x00080000 -#define WT_VERB_RECOVERY 0x00100000 -#define WT_VERB_RECOVERY_PROGRESS 0x00200000 -#define WT_VERB_SALVAGE 0x00400000 -#define WT_VERB_SHARED_CACHE 
0x00800000 -#define WT_VERB_SPLIT 0x01000000 -#define WT_VERB_TEMPORARY 0x02000000 +#define WT_VERB_LOOKASIDE_ACTIVITY 0x00001000 +#define WT_VERB_LSM 0x00002000 +#define WT_VERB_LSM_MANAGER 0x00004000 +#define WT_VERB_METADATA 0x00008000 +#define WT_VERB_MUTEX 0x00010000 +#define WT_VERB_OVERFLOW 0x00020000 +#define WT_VERB_READ 0x00040000 +#define WT_VERB_REBALANCE 0x00080000 +#define WT_VERB_RECONCILE 0x00100000 +#define WT_VERB_RECOVERY 0x00200000 +#define WT_VERB_RECOVERY_PROGRESS 0x00400000 +#define WT_VERB_SALVAGE 0x00800000 +#define WT_VERB_SHARED_CACHE 0x01000000 +#define WT_VERB_SPLIT 0x02000000 #define WT_VERB_THREAD_GROUP 0x04000000 #define WT_VERB_TIMESTAMP 0x08000000 #define WT_VERB_TRANSACTION 0x10000000 diff --git a/src/third_party/wiredtiger/src/include/schema.h b/src/third_party/wiredtiger/src/include/schema.h index bae5fc8cc04..80513f1174b 100644 --- a/src/third_party/wiredtiger/src/include/schema.h +++ b/src/third_party/wiredtiger/src/include/schema.h @@ -63,7 +63,7 @@ struct __wt_table { WT_INDEX **indices; size_t idx_alloc; - bool cg_complete, idx_complete, is_simple, is_simple_file; + bool cg_complete, idx_complete, is_simple; u_int ncolgroups, nindices, nkey_columns; }; diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h index bea436e05e2..23cf136d0aa 100644 --- a/src/third_party/wiredtiger/src/include/session.h +++ b/src/third_party/wiredtiger/src/include/session.h @@ -96,6 +96,12 @@ struct __wt_session_impl { size_t scratch_cached; /* Scratch bytes cached */ #ifdef HAVE_DIAGNOSTIC /* + * Variables used to look for violations of the contract that a + * session is only used by a single thread at once.
+ */ + volatile uintmax_t api_tid; + volatile uint32_t api_enter_refcnt; + /* * It's hard to figure out from where a buffer was allocated after it's * leaked, so in diagnostic mode we track them; DIAGNOSTIC can't simply * add additional fields to WT_ITEM structures because they are visible diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index 12a7d532496..2477079a2a8 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -536,6 +536,8 @@ struct __wt_connection_stats { int64_t txn_pinned_range; int64_t txn_pinned_checkpoint_range; int64_t txn_pinned_snapshot_range; + int64_t txn_pinned_timestamp; + int64_t txn_pinned_timestamp_oldest; int64_t txn_sync; int64_t txn_commit_queue_head; int64_t txn_commit_queue_inserts; diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index 41dd970d3ba..5d3b0c52cbd 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -687,9 +687,8 @@ struct __wt_cursor { #define WT_CURSTD_OVERWRITE 0x00400 #define WT_CURSTD_RAW 0x00800 #define WT_CURSTD_RAW_SEARCH 0x01000 -#define WT_CURSTD_URI_SHARED 0x02000 -#define WT_CURSTD_VALUE_EXT 0x04000 /* Value points out of the tree. */ -#define WT_CURSTD_VALUE_INT 0x08000 /* Value points into the tree. */ +#define WT_CURSTD_VALUE_EXT 0x02000 /* Value points out of the tree. */ +#define WT_CURSTD_VALUE_INT 0x04000 /* Value points into the tree. 
*/ #define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT) uint32_t flags; #endif @@ -2203,11 +2202,11 @@ struct __wt_connection { * list\, with values chosen from the following options: \c "api"\, \c * "block"\, \c "checkpoint"\, \c "checkpoint_progress"\, \c "compact"\, * \c "evict"\, \c "evict_stuck"\, \c "evictserver"\, \c "fileops"\, \c - * "handleops"\, \c "log"\, \c "lookaside_activity"\, \c "lsm"\, \c - * "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c - * "read"\, \c "rebalance"\, \c "reconcile"\, \c "recovery"\, \c - * "recovery_progress"\, \c "salvage"\, \c "shared_cache"\, \c "split"\, - * \c "temporary"\, \c "thread_group"\, \c "timestamp"\, \c + * "handleops"\, \c "log"\, \c "lookaside"\, \c "lookaside_activity"\, + * \c "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c + * "overflow"\, \c "read"\, \c "rebalance"\, \c "reconcile"\, \c + * "recovery"\, \c "recovery_progress"\, \c "salvage"\, \c + * "shared_cache"\, \c "split"\, \c "thread_group"\, \c "timestamp"\, \c * "transaction"\, \c "verify"\, \c "version"\, \c "write"; default * empty.} * @configend @@ -2843,10 +2842,10 @@ struct __wt_connection { * values chosen from the following options: \c "api"\, \c "block"\, \c * "checkpoint"\, \c "checkpoint_progress"\, \c "compact"\, \c "evict"\, \c * "evict_stuck"\, \c "evictserver"\, \c "fileops"\, \c "handleops"\, \c "log"\, - * \c "lookaside_activity"\, \c "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c - * "mutex"\, \c "overflow"\, \c "read"\, \c "rebalance"\, \c "reconcile"\, \c - * "recovery"\, \c "recovery_progress"\, \c "salvage"\, \c "shared_cache"\, \c - * "split"\, \c "temporary"\, \c "thread_group"\, \c "timestamp"\, \c + * \c "lookaside"\, \c "lookaside_activity"\, \c "lsm"\, \c "lsm_manager"\, \c + * "metadata"\, \c "mutex"\, \c "overflow"\, \c "read"\, \c "rebalance"\, \c + * "reconcile"\, \c "recovery"\, \c "recovery_progress"\, \c "salvage"\, \c + * "shared_cache"\, \c "split"\, \c "thread_group"\, 
\c "timestamp"\, \c * "transaction"\, \c "verify"\, \c "version"\, \c "write"; default empty.} * @config{write_through, Use \c FILE_FLAG_WRITE_THROUGH on Windows to write to * files. Ignored on non-Windows systems. Options are given as a list\, such @@ -5286,26 +5285,33 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); * snapshots */ #define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1278 +/*! transaction: transaction range of timestamps currently pinned */ +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1279 +/*! + * transaction: transaction range of timestamps pinned by the oldest + * timestamp + */ +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1280 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1279 +#define WT_STAT_CONN_TXN_SYNC 1281 /*! transaction: transactions commit timestamp queue inserts to head */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_HEAD 1280 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_HEAD 1282 /*! transaction: transactions commit timestamp queue inserts total */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1281 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1283 /*! transaction: transactions commit timestamp queue length */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1282 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1284 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1283 +#define WT_STAT_CONN_TXN_COMMIT 1285 /*! transaction: transactions read timestamp queue inserts to head */ -#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1284 +#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1286 /*! transaction: transactions read timestamp queue inserts total */ -#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1285 +#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1287 /*! transaction: transactions read timestamp queue length */ -#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1286 +#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1288 /*! 
transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1287 +#define WT_STAT_CONN_TXN_ROLLBACK 1289 /*! transaction: update conflicts */ -#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1288 +#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1290 /*! * @} diff --git a/src/third_party/wiredtiger/src/lsm/lsm_merge.c b/src/third_party/wiredtiger/src/lsm/lsm_merge.c index d159005ee11..7a20686fb97 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_merge.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_merge.c @@ -446,7 +446,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) } /* Discard pages we read as soon as we're done with them. */ - F_SET(session, WT_SESSION_NO_CACHE); + F_SET(session, WT_SESSION_READ_WONT_NEED); cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor); cfg[1] = "bulk,raw,skip_sort_check"; @@ -498,14 +498,14 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) WT_TRET(dest->close(dest)); src = dest = NULL; - F_CLR(session, WT_SESSION_NO_CACHE); + F_CLR(session, WT_SESSION_READ_WONT_NEED); /* * We're doing advisory reads to fault the new trees into cache. * Don't block if the cache is full: our next unit of work may be to * discard some trees to free space. 
*/ - F_SET(session, WT_SESSION_NO_EVICTION); + F_SET(session, WT_SESSION_IGNORE_CACHE_SIZE); if (create_bloom) { if (ret == 0) @@ -626,6 +626,7 @@ err: if (locked) "Merge failed with %s", __wt_strerror(session, ret, NULL, 0)); } - F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); + F_CLR(session, + WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_READ_WONT_NEED); return (ret); } diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c index 6195726ec67..6927fe909f8 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c @@ -1068,7 +1068,8 @@ __wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) * Diagnostic: avoid deadlocks with the schema lock: if we need it for * an operation, we should already have it. */ - F_SET(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK); + F_SET(session, + WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK); } /* @@ -1078,7 +1079,8 @@ __wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) void __wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { - F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK); + F_CLR(session, + WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK); __wt_readunlock(session, &lsm_tree->rwlock); } @@ -1096,7 +1098,8 @@ __wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) * Diagnostic: avoid deadlocks with the schema lock: if we need it for * an operation, we should already have it. 
*/ - F_SET(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK); + F_SET(session, + WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK); } /* @@ -1106,7 +1109,8 @@ __wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) void __wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { - F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK); + F_CLR(session, + WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK); __wt_writeunlock(session, &lsm_tree->rwlock); } diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c index f6aea02e20d..76827f7888c 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c @@ -503,7 +503,8 @@ __lsm_bloom_create(WT_SESSION_IMPL *session, * ourselves to get stuck creating bloom filters, the entire tree * can stall since there may be no worker threads available to flush. */ - F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); + F_SET(session, + WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_READ_WONT_NEED); for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) { WT_ERR(src->get_key(src, &key)); __wt_bloom_insert(bloom, &key); @@ -514,7 +515,7 @@ __lsm_bloom_create(WT_SESSION_IMPL *session, WT_TRET(__wt_bloom_finalize(bloom)); WT_ERR(ret); - F_CLR(session, WT_SESSION_NO_CACHE); + F_CLR(session, WT_SESSION_READ_WONT_NEED); /* Load the new Bloom filter into cache. 
*/ WT_CLEAR(key); @@ -537,7 +538,8 @@ __lsm_bloom_create(WT_SESSION_IMPL *session, err: if (bloom != NULL) WT_TRET(__wt_bloom_close(bloom)); - F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); + F_CLR(session, + WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_READ_WONT_NEED); return (ret); } diff --git a/src/third_party/wiredtiger/src/os_posix/os_map.c b/src/third_party/wiredtiger/src/os_posix/os_map.c index 3d06461a9ba..5e625a49bac 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_map.c +++ b/src/third_party/wiredtiger/src/os_posix/os_map.c @@ -88,7 +88,7 @@ __wt_posix_map_preload(WT_FILE_HANDLE *fh, length += WT_PTRDIFF(map, blk); /* XXX proxy for "am I doing a scan?" -- manual read-ahead */ - if (F_ISSET(session, WT_SESSION_NO_CACHE)) { + if (F_ISSET(session, WT_SESSION_READ_WONT_NEED)) { /* Read in 2MB blocks every 1MB of data. */ if (((uintptr_t)((uint8_t *)blk + length) & (uintptr_t)((1<<20) - 1)) < (uintptr_t)blk) diff --git a/src/third_party/wiredtiger/src/os_posix/os_thread.c b/src/third_party/wiredtiger/src/os_posix/os_thread.c index 8af672dd0d4..dc4d49ad493 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_thread.c +++ b/src/third_party/wiredtiger/src/os_posix/os_thread.c @@ -67,10 +67,32 @@ __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) /* * __wt_thread_id -- + * Return an arithmetic representation of a thread ID on POSIX. + */ +void +__wt_thread_id(uintmax_t *id) + WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) +{ + pthread_t self; + + /* + * POSIX 1003.1 allows pthread_t to be an opaque type; on systems where + * it's a pointer, print the pointer to match gdb output. + */ + self = pthread_self(); +#ifdef __sun + *id = (uintmax_t)self; +#else + *id = (uintmax_t)(void *)self; +#endif +} + +/* + * __wt_thread_str -- * Fill in a printable version of the process and thread IDs. 
*/ int -__wt_thread_id(char *buf, size_t buflen) +__wt_thread_str(char *buf, size_t buflen) WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { pthread_t self; diff --git a/src/third_party/wiredtiger/src/os_win/os_thread.c b/src/third_party/wiredtiger/src/os_win/os_thread.c index 1ecf53e382e..1d549cf4712 100644 --- a/src/third_party/wiredtiger/src/os_win/os_thread.c +++ b/src/third_party/wiredtiger/src/os_win/os_thread.c @@ -77,10 +77,20 @@ __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) /* * __wt_thread_id -- + * Return an arithmetic representation of a thread ID on POSIX. + */ +void +__wt_thread_id(uintmax_t *id) +{ + *id = (uintmax_t)GetCurrentThreadId(); +} + +/* + * __wt_thread_str -- * Fill in a printable version of the process and thread IDs. */ int -__wt_thread_id(char *buf, size_t buflen) +__wt_thread_str(char *buf, size_t buflen) { return (__wt_snprintf(buf, buflen, "%" PRIu64 ":%" PRIu64, diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 3e857fef324..b509c49cbbc 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -407,6 +407,18 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, */ WT_PAGE_LOCK(session, page); + /* + * Now that the page is locked, if attempting to evict it, check again + * whether eviction is permitted. The page's state could have changed + * while we were waiting to acquire the lock (e.g., the page could have + * split). 
+ */ + if (LF_ISSET(WT_REC_EVICT) && + !__wt_page_can_evict(session, ref, NULL)) { + WT_PAGE_UNLOCK(session, page); + return (EBUSY); + } + oldest_id = __wt_txn_oldest_id(session); if (LF_ISSET(WT_REC_EVICT)) mod->last_eviction_id = oldest_id; @@ -1449,6 +1461,8 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, if (uncommitted && !F_ISSET(r, WT_REC_UPDATE_RESTORE)) return (EBUSY); + WT_ASSERT(session, r->max_txn != WT_TXN_NONE); + /* * The order of the updates on the list matters, we can't move only the * unresolved updates, move the entire update list. @@ -6050,7 +6064,7 @@ __rec_las_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r) for (multi = r->multi, i = 0; i < r->multi_next; ++multi, ++i) if (multi->supd != NULL) WT_ERR(__wt_las_insert_block( - session, r->page, cursor, multi, key)); + session, cursor, r->page, multi, key)); err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); diff --git a/src/third_party/wiredtiger/src/schema/schema_open.c b/src/third_party/wiredtiger/src/schema/schema_open.c index 420fab63d68..3b27c8300e2 100644 --- a/src/third_party/wiredtiger/src/schema/schema_open.c +++ b/src/third_party/wiredtiger/src/schema/schema_open.c @@ -106,8 +106,6 @@ __wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table) } table->cg_complete = true; - table->is_simple_file = (table->is_simple && - WT_PREFIX_MATCH(table->cgroups[0]->source, "file:")); err: __wt_scr_free(session, &buf); __wt_schema_destroy_colgroup(session, &colgroup); diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index fa33b55c936..d81735234a0 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -259,9 +259,9 @@ __session_reconfigure(WT_SESSION *wt_session, const char *config) ret = __wt_config_getones(session, config, "ignore_cache_size", &cval); if (ret == 0) { if (cval.val) - F_SET(session, 
WT_SESSION_NO_EVICTION); + F_SET(session, WT_SESSION_IGNORE_CACHE_SIZE); else - F_CLR(session, WT_SESSION_NO_EVICTION); + F_CLR(session, WT_SESSION_IGNORE_CACHE_SIZE); } WT_ERR_NOTFOUND_OK(ret); @@ -1489,7 +1489,12 @@ __session_timestamp_transaction(WT_SESSION *wt_session, const char *config) WT_SESSION_IMPL *session; session = (WT_SESSION_IMPL *)wt_session; +#ifdef HAVE_DIAGNOSTIC SESSION_API_CALL(session, timestamp_transaction, config, cfg); +#else + SESSION_API_CALL(session, timestamp_transaction, NULL, cfg); + cfg[1] = config; +#endif WT_TRET(__wt_txn_set_timestamp(session, cfg)); err: API_END_RET(session, ret); } diff --git a/src/third_party/wiredtiger/src/session/session_compact.c b/src/third_party/wiredtiger/src/session/session_compact.c index 6ccf3161229..aa2f1bc3bd8 100644 --- a/src/third_party/wiredtiger/src/session/session_compact.c +++ b/src/third_party/wiredtiger/src/session/session_compact.c @@ -349,23 +349,21 @@ __wt_session_compact( WT_DECL_RET; WT_SESSION_IMPL *session; u_int i; - bool no_eviction_set; + bool ignore_cache_size_set; - no_eviction_set = false; + ignore_cache_size_set = false; session = (WT_SESSION_IMPL *)wt_session; SESSION_API_CALL(session, compact, config, cfg); /* - * Don't highjack the compaction thread for eviction; it's holding locks - * blocking checkpoints and once an application is tapped for eviction, - * it can spend a long time doing nothing else. (And, if we're tapping - * application threads for eviction, compaction should quit, it's not - * making anything better.) + * The compaction thread should not block when the cache is full: it is + * holding locks blocking checkpoints and once the cache is full, it can + * spend a long time doing eviction. 
*/ - if (!F_ISSET(session, WT_SESSION_NO_EVICTION)) { - no_eviction_set = true; - F_SET(session, WT_SESSION_NO_EVICTION); + if (!F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE)) { + ignore_cache_size_set = true; + F_SET(session, WT_SESSION_IGNORE_CACHE_SIZE); } /* In-memory ignores compaction operations. */ @@ -437,8 +435,8 @@ err: session->compact = NULL; */ WT_TRET(__wt_session_release_resources(session)); - if (no_eviction_set) - F_CLR(session, WT_SESSION_NO_EVICTION); + if (ignore_cache_size_set) + F_CLR(session, WT_SESSION_IGNORE_CACHE_SIZE); if (ret != 0) WT_STAT_CONN_INCR(session, session_table_compact_fail); diff --git a/src/third_party/wiredtiger/src/support/err.c b/src/third_party/wiredtiger/src/support/err.c index ffbdba0b25b..a3b87b6edde 100644 --- a/src/third_party/wiredtiger/src/support/err.c +++ b/src/third_party/wiredtiger/src/support/err.c @@ -217,7 +217,7 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error, * followed by a colon. */ __wt_epoch(session, &ts); - WT_ERR(__wt_thread_id(tid, sizeof(tid))); + WT_ERR(__wt_thread_str(tid, sizeof(tid))); WT_ERROR_APPEND(p, remain, "[%" PRIuMAX ":%" PRIuMAX "][%s]", (uintmax_t)ts.tv_sec, (uintmax_t)ts.tv_nsec / WT_THOUSAND, tid); diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 924afaa21d6..b4533841ec6 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -1004,6 +1004,8 @@ static const char * const __stats_connection_desc[] = { "transaction: transaction range of IDs currently pinned", "transaction: transaction range of IDs currently pinned by a checkpoint", "transaction: transaction range of IDs currently pinned by named snapshots", + "transaction: transaction range of timestamps currently pinned", + "transaction: transaction range of timestamps pinned by the oldest timestamp", "transaction: transaction sync calls", "transaction: transactions commit timestamp queue 
inserts to head", "transaction: transactions commit timestamp queue inserts total", @@ -1335,6 +1337,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) /* not clearing txn_pinned_range */ /* not clearing txn_pinned_checkpoint_range */ /* not clearing txn_pinned_snapshot_range */ + /* not clearing txn_pinned_timestamp */ + /* not clearing txn_pinned_timestamp_oldest */ stats->txn_sync = 0; stats->txn_commit_queue_head = 0; stats->txn_commit_queue_inserts = 0; @@ -1769,6 +1773,9 @@ __wt_stat_connection_aggregate( WT_STAT_READ(from, txn_pinned_checkpoint_range); to->txn_pinned_snapshot_range += WT_STAT_READ(from, txn_pinned_snapshot_range); + to->txn_pinned_timestamp += WT_STAT_READ(from, txn_pinned_timestamp); + to->txn_pinned_timestamp_oldest += + WT_STAT_READ(from, txn_pinned_timestamp_oldest); to->txn_sync += WT_STAT_READ(from, txn_sync); to->txn_commit_queue_head += WT_STAT_READ(from, txn_commit_queue_head); diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 3d45ff8a88c..8b4a7fc7936 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -612,7 +612,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) WT_TXN_GLOBAL *txn_global; WT_TXN_OP *op; u_int i; - bool did_update, locked; + bool locked; #ifdef HAVE_TIMESTAMPS wt_timestamp_t prev_commit_timestamp, ts; bool update_timestamp; @@ -621,11 +621,11 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) txn = &session->txn; conn = S2C(session); txn_global = &conn->txn_global; - did_update = txn->mod_count != 0; locked = false; WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING)); - WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || !did_update); + WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || + txn->mod_count == 0); /* * Look for a commit timestamp. @@ -716,7 +716,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) } /* If we are logging, write a commit log record. 
*/ - if (did_update && + if (txn->logrec != NULL && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && !F_ISSET(session, WT_SESSION_NO_LOGGING)) { /* @@ -757,8 +757,8 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) * Writes to the lookaside file can be evicted as soon * as they commit. */ - if (conn->las_fileid != 0 && - op->fileid == conn->las_fileid) { + if (conn->cache->las_fileid != 0 && + op->fileid == conn->cache->las_fileid) { op->u.upd->txnid = WT_TXN_NONE; break; } @@ -823,6 +823,20 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) * write lock and re-check. */ if (update_timestamp) { +#if WT_TIMESTAMP_SIZE == 8 + while (__wt_timestamp_cmp( + &txn->commit_timestamp, &prev_commit_timestamp) > 0) { + if (__wt_atomic_cas64( + &txn_global->commit_timestamp.val, + prev_commit_timestamp.val, + txn->commit_timestamp.val)) { + txn_global->has_commit_timestamp = true; + break; + } + __wt_timestamp_set( + &prev_commit_timestamp, &txn_global->commit_timestamp); + } +#else __wt_writelock(session, &txn_global->rwlock); if (__wt_timestamp_cmp(&txn->commit_timestamp, &txn_global->commit_timestamp) > 0) { @@ -831,6 +845,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) txn_global->has_commit_timestamp = true; } __wt_writeunlock(session, &txn_global->rwlock); +#endif } #endif @@ -881,8 +896,9 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]) case WT_TXN_OP_BASIC_TS: case WT_TXN_OP_INMEM: WT_ASSERT(session, op->u.upd->txnid == txn->id); - WT_ASSERT(session, S2C(session)->las_fileid == 0 || - op->fileid != S2C(session)->las_fileid); + WT_ASSERT(session, + S2C(session)->cache->las_fileid == 0 || + op->fileid != S2C(session)->cache->las_fileid); op->u.upd->txnid = WT_TXN_ABORTED; break; case WT_TXN_OP_REF: @@ -962,6 +978,15 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session) WT_STAT_SET(session, stats, txn_pinned_range, txn_global->current - txn_global->oldest_id); +#if WT_TIMESTAMP_SIZE == 8 + WT_STAT_SET(session, 
stats, txn_pinned_timestamp, + txn_global->commit_timestamp.val - + txn_global->pinned_timestamp.val); + WT_STAT_SET(session, stats, txn_pinned_timestamp_oldest, + txn_global->commit_timestamp.val - + txn_global->oldest_timestamp.val); +#endif + WT_STAT_SET(session, stats, txn_pinned_snapshot_range, snapshot_pinned == WT_TXN_NONE ? 0 : txn_global->current - snapshot_pinned); diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index eb32ef2d06a..c82187daf85 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -122,7 +122,7 @@ __checkpoint_update_generation(WT_SESSION_IMPL *session) */ static int __checkpoint_apply_all(WT_SESSION_IMPL *session, const char *cfg[], - int (*op)(WT_SESSION_IMPL *, const char *[]), bool *fullp) + int (*op)(WT_SESSION_IMPL *, const char *[]), bool *fullp) { WT_CONFIG targetconf; WT_CONFIG_ITEM cval, k, v; @@ -205,7 +205,7 @@ err: __wt_scr_free(session, &tmp); */ static int __checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[], - int (*op)(WT_SESSION_IMPL *, const char *[])) + int (*op)(WT_SESSION_IMPL *, const char *[])) { WT_DECL_RET; u_int i; @@ -440,6 +440,13 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) if (current_dirty <= (double)cache->eviction_checkpoint_target) break; + /* + * Don't scrub when the lookaside table is in use: scrubbing is + * counter-productive in that case. 
+ */ + if (F_ISSET(cache, WT_CACHE_EVICT_LOOKASIDE)) + break; + __wt_sleep(0, stepdown_us / 10); __wt_epoch(session, &stop); current_us = WT_TIMEDIFF_US(stop, last); @@ -1080,7 +1087,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[], bool waiting) */ #undef WT_CHECKPOINT_SESSION_FLAGS #define WT_CHECKPOINT_SESSION_FLAGS \ - (WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION) + (WT_SESSION_CAN_WAIT | WT_SESSION_IGNORE_CACHE_SIZE) #undef WT_CHECKPOINT_SESSION_FLAGS_OFF #define WT_CHECKPOINT_SESSION_FLAGS_OFF \ (WT_SESSION_LOOKASIDE_CURSOR) diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index 929aba30155..c68d00d7503 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -46,12 +46,12 @@ __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session) __wt_las_cursor(session, &cursor, &session_flags); /* Discard pages we read as soon as we're done with them. */ - F_SET(session, WT_SESSION_NO_CACHE); + F_SET(session, WT_SESSION_READ_WONT_NEED); /* Walk the file. 
*/ for (; (ret = cursor->next(cursor)) == 0; ) { WT_ERR(cursor->get_key(cursor, - &las_id, &las_pageid, &las_counter, &las_key)); + &las_pageid, &las_id, &las_counter, &las_key)); /* Check the file ID so we can skip durable tables */ if (las_id >= conn->stable_rollback_maxfile) @@ -79,7 +79,7 @@ __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session) err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); WT_STAT_CONN_SET(session, cache_lookaside_entries, las_total); - F_CLR(session, WT_SESSION_NO_CACHE); + F_CLR(session, WT_SESSION_READ_WONT_NEED); return (ret); } diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c index 98887627bfc..5a39a6d84dc 100644 --- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c +++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c @@ -210,6 +210,10 @@ __txn_global_query_timestamp( __wt_timestamp_set(&ts, &txn_global->commit_timestamp)); WT_ASSERT(session, !__wt_timestamp_iszero(&ts)); + /* Skip the lock if there are no running transactions. */ + if (TAILQ_EMPTY(&txn_global->commit_timestamph)) + goto done; + /* Compare with the oldest running transaction. 
*/ __wt_readlock(session, &txn_global->commit_timestamp_rwlock); txn = TAILQ_FIRST(&txn_global->commit_timestamph); @@ -254,7 +258,7 @@ __txn_global_query_timestamp( WT_RET_MSG(session, EINVAL, "unknown timestamp query %.*s", (int)cval.len, cval.str); - __wt_timestamp_set(tsp, &ts); +done: __wt_timestamp_set(tsp, &ts); return (0); } #endif @@ -292,7 +296,8 @@ __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session) { WT_DECL_RET; WT_TXN_GLOBAL *txn_global; - wt_timestamp_t active_timestamp, oldest_timestamp, pinned_timestamp; + wt_timestamp_t active_timestamp, last_pinned_timestamp; + wt_timestamp_t oldest_timestamp, pinned_timestamp; const char *query_cfg[] = { WT_CONFIG_BASE(session, WT_CONNECTION_query_timestamp), "get=pinned", NULL }; @@ -316,6 +321,16 @@ __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session) } else __wt_timestamp_set(&pinned_timestamp, &active_timestamp); + if (txn_global->has_pinned_timestamp) { + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + __wt_timestamp_set( + &last_pinned_timestamp, &txn_global->pinned_timestamp)); + + if (__wt_timestamp_cmp( + &pinned_timestamp, &last_pinned_timestamp) <= 0) + return (0); + } + __wt_writelock(session, &txn_global->rwlock); if (!txn_global->has_pinned_timestamp || __wt_timestamp_cmp( &txn_global->pinned_timestamp, &pinned_timestamp) < 0) { @@ -364,6 +379,7 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) { WT_TXN_GLOBAL *txn_global; wt_timestamp_t commit_ts, oldest_ts, stable_ts; + wt_timestamp_t last_oldest_ts, last_stable_ts; txn_global = &S2C(session)->txn_global; /* @@ -376,7 +392,11 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) session, "oldest", &oldest_ts, &oldest_cval)); WT_RET(__wt_txn_parse_timestamp( session, "stable", &stable_ts, &stable_cval)); - __wt_writelock(session, &txn_global->rwlock); + + __wt_readlock(session, &txn_global->rwlock); + + __wt_timestamp_set(&last_oldest_ts, &txn_global->oldest_timestamp); + 
__wt_timestamp_set(&last_stable_ts, &txn_global->stable_timestamp); /* * First do error checking on the timestamp values. The @@ -388,9 +408,9 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) if (!has_commit && txn_global->has_commit_timestamp) __wt_timestamp_set(&commit_ts, &txn_global->commit_timestamp); if (!has_oldest && txn_global->has_oldest_timestamp) - __wt_timestamp_set(&oldest_ts, &txn_global->oldest_timestamp); - if (!has_stable && txn_global->has_oldest_timestamp) - __wt_timestamp_set(&stable_ts, &txn_global->stable_timestamp); + __wt_timestamp_set(&oldest_ts, &last_oldest_ts); + if (!has_stable && txn_global->has_stable_timestamp) + __wt_timestamp_set(&stable_ts, &last_stable_ts); /* * If a commit timestamp was supplied, check that it is no older than @@ -398,7 +418,7 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) */ if (has_commit && (has_oldest || txn_global->has_oldest_timestamp) && __wt_timestamp_cmp(&oldest_ts, &commit_ts) > 0) { - __wt_writeunlock(session, &txn_global->rwlock); + __wt_readunlock(session, &txn_global->rwlock); WT_RET_MSG(session, EINVAL, "set_timestamp: oldest timestamp must not be later than " "commit timestamp"); @@ -406,7 +426,7 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) if (has_commit && (has_stable || txn_global->has_stable_timestamp) && __wt_timestamp_cmp(&stable_ts, &commit_ts) > 0) { - __wt_writeunlock(session, &txn_global->rwlock); + __wt_readunlock(session, &txn_global->rwlock); WT_RET_MSG(session, EINVAL, "set_timestamp: stable timestamp must not be later than " "commit timestamp"); @@ -420,12 +440,27 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) (has_oldest || txn_global->has_oldest_timestamp) && (has_stable || txn_global->has_stable_timestamp) && __wt_timestamp_cmp(&oldest_ts, &stable_ts) > 0) { - __wt_writeunlock(session, &txn_global->rwlock); + __wt_readunlock(session, &txn_global->rwlock); 
WT_RET_MSG(session, EINVAL, "set_timestamp: oldest timestamp must not be later than " "stable timestamp"); } + __wt_readunlock(session, &txn_global->rwlock); + + /* Check if we are actually updating anything. */ + if (has_oldest && txn_global->has_oldest_timestamp && + __wt_timestamp_cmp(&oldest_ts, &last_oldest_ts) <= 0) + has_oldest = false; + + if (has_stable && txn_global->has_stable_timestamp && + __wt_timestamp_cmp(&stable_ts, &last_stable_ts) <= 0) + has_stable = false; + + if (!has_commit && !has_oldest && !has_stable) + return (0); + + __wt_writelock(session, &txn_global->rwlock); /* * This method can be called from multiple threads, check that we are * moving the global timestamps forwards. @@ -543,7 +578,7 @@ __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) /* * Look for a commit timestamp. */ - ret = __wt_config_gets(session, cfg, "commit_timestamp", &cval); + ret = __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval); if (ret == 0 && cval.len != 0) { #ifdef HAVE_TIMESTAMPS WT_TXN *txn = &session->txn; diff --git a/src/third_party/wiredtiger/test/checkpoint/checkpointer.c b/src/third_party/wiredtiger/test/checkpoint/checkpointer.c index 7299784ab35..ae499c2e96b 100644 --- a/src/third_party/wiredtiger/test/checkpoint/checkpointer.c +++ b/src/third_party/wiredtiger/test/checkpoint/checkpointer.c @@ -67,7 +67,7 @@ checkpointer(void *arg) WT_UNUSED(arg); - testutil_check(__wt_thread_id(tid, sizeof(tid))); + testutil_check(__wt_thread_str(tid, sizeof(tid))); printf("checkpointer thread starting: tid: %s\n", tid); (void)real_checkpointer(); diff --git a/src/third_party/wiredtiger/test/checkpoint/workers.c b/src/third_party/wiredtiger/test/checkpoint/workers.c index e75f86f141a..cd32db6746f 100644 --- a/src/third_party/wiredtiger/test/checkpoint/workers.c +++ b/src/third_party/wiredtiger/test/checkpoint/workers.c @@ -148,7 +148,7 @@ worker(void *arg) WT_UNUSED(arg); - testutil_check(__wt_thread_id(tid, sizeof(tid))); + 
testutil_check(__wt_thread_str(tid, sizeof(tid))); printf("worker thread starting: tid: %s\n", tid); (void)real_worker(); diff --git a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c index ca5fa10c2db..79b232b532a 100644 --- a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c +++ b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c @@ -75,7 +75,7 @@ static const char * const ckpt_file = "checkpoint_done"; static bool compat, inmem, use_ts; static volatile uint64_t global_ts = 1; -static uint64_t th_ts[MAX_TH]; +static volatile uint64_t th_ts[MAX_TH]; #define ENV_CONFIG_COMPAT ",compatibility=(release=\"2.9\")" #define ENV_CONFIG_DEF \ @@ -121,7 +121,7 @@ thread_ts_run(void *arg) WT_CURSOR *cur_stable; WT_SESSION *session; THREAD_DATA *td; - uint64_t i, last_ts, oldest_ts; + uint64_t i, last_ts, oldest_ts, this_ts; char tscfg[64]; td = (THREAD_DATA *)arg; @@ -148,10 +148,11 @@ thread_ts_run(void *arg) * any thread still with a zero timestamp we go to * sleep. */ - if (th_ts[i] == 0) + this_ts = th_ts[i]; + if (this_ts == 0) goto ts_wait; - if (th_ts[i] != 0 && th_ts[i] < oldest_ts) - oldest_ts = th_ts[i]; + else if (this_ts < oldest_ts) + oldest_ts = this_ts; } if (oldest_ts != UINT64_MAX && @@ -638,7 +639,9 @@ main(int argc, char *argv[]) } /* * !!! If we wanted to take a copy of the directory before recovery, - * this is the place to do it. + * this is the place to do it. Don't do it all the time because + * it can use a lot of disk space, which can cause test machine + * issues. 
*/ if (chdir(home) != 0) testutil_die(errno, "parent chdir: %s", home); diff --git a/src/third_party/wiredtiger/test/csuite/wt2719_reconfig/main.c b/src/third_party/wiredtiger/test/csuite/wt2719_reconfig/main.c index 10824aec744..58e2a0bc113 100644 --- a/src/third_party/wiredtiger/test/csuite/wt2719_reconfig/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt2719_reconfig/main.c @@ -171,7 +171,6 @@ static const char * const list[] = { ",verbose=(\"salvage\")", ",verbose=(\"shared_cache\")", ",verbose=(\"split\")", - ",verbose=(\"temporary\")", ",verbose=(\"transaction\")", ",verbose=(\"verify\")", ",verbose=(\"version\")", diff --git a/src/third_party/wiredtiger/test/cursor_order/cursor_order_ops.c b/src/third_party/wiredtiger/test/cursor_order/cursor_order_ops.c index 810bf895b42..576860483ba 100644 --- a/src/third_party/wiredtiger/test/cursor_order/cursor_order_ops.c +++ b/src/third_party/wiredtiger/test/cursor_order/cursor_order_ops.c @@ -221,7 +221,7 @@ reverse_scan(void *arg) id = (uintmax_t)arg; s = &run_info[id]; cfg = s->cfg; - testutil_check(__wt_thread_id(tid, sizeof(tid))); + testutil_check(__wt_thread_str(tid, sizeof(tid))); __wt_random_init(&s->rnd); printf(" reverse scan thread %2" PRIuMAX @@ -305,7 +305,7 @@ append_insert(void *arg) id = (uintmax_t)arg; s = &run_info[id]; cfg = s->cfg; - testutil_check(__wt_thread_id(tid, sizeof(tid))); + testutil_check(__wt_thread_str(tid, sizeof(tid))); __wt_random_init(&s->rnd); printf("write thread %2" PRIuMAX " starting: tid: %s, file: %s\n", diff --git a/src/third_party/wiredtiger/test/format/util.c b/src/third_party/wiredtiger/test/format/util.c index 83ddf307cc9..3c61ab5a66b 100644 --- a/src/third_party/wiredtiger/test/format/util.c +++ b/src/third_party/wiredtiger/test/format/util.c @@ -591,7 +591,7 @@ timestamp(void *arg) WT_SESSION *session; TINFO **tinfo_list, *tinfo; time_t last, now; - uint64_t oldest_timestamp, usecs; + uint64_t oldest_timestamp, this_ts, usecs; uint32_t i; char config_buf[64]; @@ 
-614,9 +614,10 @@ timestamp(void *arg) oldest_timestamp = UINT64_MAX; for (i = 0; i < g.c_threads; ++i) { tinfo = tinfo_list[i]; - if (tinfo->timestamp != 0 && - tinfo->timestamp < oldest_timestamp) - oldest_timestamp = tinfo->timestamp; + this_ts = tinfo->timestamp; + if (this_ts != 0 && + this_ts < oldest_timestamp) + oldest_timestamp = this_ts; } if (oldest_timestamp == UINT64_MAX) { __wt_sleep(1, 0); diff --git a/src/third_party/wiredtiger/test/thread/rw.c b/src/third_party/wiredtiger/test/thread/rw.c index 49af7c782b5..bda54d388b5 100644 --- a/src/third_party/wiredtiger/test/thread/rw.c +++ b/src/third_party/wiredtiger/test/thread/rw.c @@ -191,7 +191,7 @@ reader(void *arg) id = (int)(uintptr_t)arg; s = &run_info[id]; - testutil_check(__wt_thread_id(tid, sizeof(tid))); + testutil_check(__wt_thread_str(tid, sizeof(tid))); __wt_random_init(&s->rnd); printf(" read thread %2d starting: tid: %s, file: %s\n", @@ -287,7 +287,7 @@ writer(void *arg) id = (int)(uintptr_t)arg; s = &run_info[id]; - testutil_check(__wt_thread_id(tid, sizeof(tid))); + testutil_check(__wt_thread_str(tid, sizeof(tid))); __wt_random_init(&s->rnd); printf("write thread %2d starting: tid: %s, file: %s\n", |