diff options
47 files changed, 1261 insertions, 938 deletions
diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index 7d8a58c83bb..d29e9655fb3 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -417,6 +417,11 @@ connection_runtime_config = [ maximum heap memory to allocate for the cache. A database should configure either \c cache_size or \c shared_cache but not both''', min='1MB', max='10TB'), + Config('cache_max_wait_ms', '0', r''' + the maximum number of milliseconds an application thread will wait + for space to be available in cache before giving up. Default will + wait forever''', + min=0), Config('cache_overhead', '8', r''' assume the heap allocator overhead is the specified percentage, and adjust the cache usage by that amount (for example, if there is 10GB @@ -460,7 +465,7 @@ connection_runtime_config = [ vary depending on the current eviction load''', min=1, max=20), ]), - Config('eviction_checkpoint_target', '5', r''' + Config('eviction_checkpoint_target', '1', r''' perform eviction at the beginning of checkpoints to bring the dirty content in cache to this level. It is a percentage of the cache size if the value is within the range of 0 to 100 or an absolute size when @@ -585,7 +590,8 @@ connection_runtime_config = [ type='list', undoc=True, choices=[ 'checkpoint_slow', 'lookaside_sweep_race', 'split_1', 'split_2', - 'split_3', 'split_4', 'split_5', 'split_6', 'split_7']), + 'split_3', 'split_4', 'split_5', 'split_6', 'split_7', 'split_8', + 'split_9']), Config('verbose', '', r''' enable messages for various events. Options are given as a list, such as <code>"verbose=[evictserver,read]"</code>''', diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index 135c930b306..f4f8f61ee1e 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -279,6 +279,7 @@ connection_stats = [ CacheStat('cache_read_lookaside_delay_checkpoint', 'pages read into cache with skipped lookaside entries needed later by checkpoint'), CacheStat('cache_read_lookaside_skipped', 'pages read into cache skipping older lookaside entries'), CacheStat('cache_read_overflow', 'overflow pages read into cache'), + CacheStat('cache_timed_out_ops', 'operations timed out waiting for space in cache'), CacheStat('cache_write', 'pages written from cache'), CacheStat('cache_write_app_count', 'application threads page write from cache to disk count'), CacheStat('cache_write_app_time', 'application threads page write from cache to disk time (usecs)'), @@ -515,6 +516,7 @@ connection_stats = [ TxnStat('txn_pinned_range', 'transaction range of IDs currently pinned', 'no_clear,no_scale'), TxnStat('txn_pinned_snapshot_range', 'transaction range of IDs currently pinned by named snapshots', 'no_clear,no_scale'), TxnStat('txn_pinned_timestamp', 'transaction range of timestamps currently pinned', 'no_clear,no_scale'), + TxnStat('txn_pinned_timestamp_checkpoint', 'transaction range of timestamps pinned by a checkpoint', 'no_clear,no_scale'), TxnStat('txn_pinned_timestamp_oldest', 'transaction range of timestamps pinned by the oldest timestamp', 'no_clear,no_scale'), TxnStat('txn_prepare', 'prepared transactions'), TxnStat('txn_prepare_active', 'prepared transactions currently active'), diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 770e5ed12ec..1e72dbc56ae 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,6 +1,6 @@ { - "commit": "a8a6314182ccf7dc6625b9c24891a355b07faa8e", + "commit": "27f8e047911ff31500fecf4ea760e688ec541b97", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", - "branch": "mongodb-4.0" + "branch": "mongodb-4.2" } diff --git a/src/third_party/wiredtiger/src/bloom/bloom.c b/src/third_party/wiredtiger/src/bloom/bloom.c index cf4743009ee..d506af89ab7 100644 --- a/src/third_party/wiredtiger/src/bloom/bloom.c +++ b/src/third_party/wiredtiger/src/bloom/bloom.c @@ -302,7 +302,16 @@ __wt_bloom_hash_get(WT_BLOOM *bloom, WT_BLOOM_HASH *bhash) err: if (c != NULL) WT_TRET(c->reset(c)); - /* Don't return WT_NOTFOUND from a failed cursor open or search. */ + /* + * Error handling from this function is complex. A search in the + * backing bit field should never return WT_NOTFOUND - so translate + * that into a different error code and report an error. If we got a + * WT_ROLLBACK it may be because there is a lot of cache pressure and + * the transaction is being killed - don't report an error message in + * that case. + */ + if (ret == WT_ROLLBACK || ret == WT_CACHE_FULL) + return (ret); WT_RET_MSG(bloom->session, ret == WT_NOTFOUND ? WT_ERROR : ret, "Failed lookup in bloom filter"); diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c index 02cceab3123..3a031b49db5 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curnext.c +++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c @@ -429,7 +429,8 @@ __cursor_key_order_check_row( WT_ERR(__wt_scr_alloc(session, 512, &b)); WT_PANIC_ERR(session, EINVAL, - "WT_CURSOR.%s out-of-order returns: returned key %s then key %s", + "WT_CURSOR.%s out-of-order returns: returned key %.1024s then " + "key %.1024s", next ? "next" : "prev", __wt_buf_set_printable_format(session, cbt->lastkey->data, cbt->lastkey->size, btree->key_format, a), diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index 566157abd61..16e25c1fe25 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -805,11 +805,13 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref) WT_PAGE_INDEX *pindex; WT_PAGE_MODIFY *mod; WT_SESSION_IMPL *session; + uint64_t split_gen; uint32_t entries; session = ds->session; page = ref->page; mod = page->modify; + split_gen = 0; WT_RET(ds->f(ds, "%p", (void *)ref)); @@ -818,6 +820,7 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref) WT_RET(ds->f(ds, " recno %" PRIu64, ref->ref_recno)); WT_INTL_INDEX_GET(session, page, pindex); entries = pindex->entries; + split_gen = page->pg_intl_split_gen; break; case WT_PAGE_COL_FIX: WT_RET(ds->f(ds, " recno %" PRIu64, ref->ref_recno)); @@ -830,6 +833,7 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref) case WT_PAGE_ROW_INT: WT_INTL_INDEX_GET(session, page, pindex); entries = pindex->entries; + split_gen = page->pg_intl_split_gen; break; case WT_PAGE_ROW_LEAF: entries = page->entries; @@ -845,8 +849,6 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref) WT_RET(ds->f(ds, ", entries %" PRIu32, entries)); WT_RET(ds->f(ds, ", %s", __wt_page_is_modified(page) ? "dirty" : "clean")); - WT_RET(ds->f(ds, - ", memory_size %" WT_SIZET_FMT, page->memory_footprint)); if (F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS)) WT_RET(ds->f(ds, ", keys-built")); @@ -878,9 +880,12 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref) break; WT_ILLEGAL_VALUE(session); } + if (split_gen != 0) + WT_RET(ds->f(ds, ", split-gen=%" PRIu64, split_gen)); if (mod != NULL) - WT_RET( - ds->f(ds, ", write generation=%" PRIu32, mod->write_gen)); + WT_RET(ds->f(ds, ", write-gen=%" PRIu32, mod->write_gen)); + WT_RET(ds->f(ds, + ", memory-size %" WT_SIZET_FMT, page->memory_footprint)); WT_RET(ds->f(ds, "\n")); return (0); diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c index 17497561248..4f310b27237 100644 --- a/src/third_party/wiredtiger/src/btree/bt_random.c +++ b/src/third_party/wiredtiger/src/btree/bt_random.c @@ -262,7 +262,7 @@ restart: /* * holding nothing on failure. */ descend: if ((ret = __wt_page_swap( - session, current, descent, false, flags)) == 0) { + session, current, descent, flags)) == 0) { current = descent; continue; } diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 9e530be4f0e..c8368624d3c 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -276,13 +276,15 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) */ page->modify->first_dirty_txn = WT_TXN_FIRST; - if (ref->page_las->las_skew_newest && + FLD_SET(page->modify->restore_state, WT_PAGE_RS_LOOKASIDE); + + if (ref->page_las->skew_newest && !S2C(session)->txn_global.has_stable_timestamp && - __wt_txn_visible_all(session, ref->page_las->las_max_txn, - WT_TIMESTAMP_NULL(&ref->page_las->onpage_timestamp))) { - page->modify->rec_max_txn = ref->page_las->las_max_txn; + __wt_txn_visible_all(session, ref->page_las->unstable_txn, + WT_TIMESTAMP_NULL(&ref->page_las->unstable_timestamp))) { + page->modify->rec_max_txn = ref->page_las->max_txn; __wt_timestamp_set(&page->modify->rec_max_timestamp, - &ref->page_las->onpage_timestamp); + &ref->page_las->max_timestamp); __wt_page_modify_clear(session, page); } } diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 62212607f18..a98de6c6c9f 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -1414,6 +1414,7 @@ __split_multi_inmem( WT_DECL_ITEM(key); WT_DECL_RET; WT_PAGE *page; + WT_PAGE_MODIFY *mod; WT_SAVE_UPD *supd; WT_UPDATE *upd; uint64_t recno; @@ -1520,17 +1521,26 @@ __split_multi_inmem( * might be older than that. Set the first dirty transaction to an * impossibly old value so this page is never skipped in a checkpoint. */ - page->modify->first_dirty_txn = WT_TXN_FIRST; + mod = page->modify; + mod->first_dirty_txn = WT_TXN_FIRST; /* * If the new page is modified, save the eviction generation to avoid * repeatedly attempting eviction on the same page. */ - page->modify->last_evict_pass_gen = orig->modify->last_evict_pass_gen; - page->modify->last_eviction_id = orig->modify->last_eviction_id; - __wt_timestamp_set(&page->modify->last_eviction_timestamp, + mod->last_evict_pass_gen = orig->modify->last_evict_pass_gen; + mod->last_eviction_id = orig->modify->last_eviction_id; + __wt_timestamp_set(&mod->last_eviction_timestamp, &orig->modify->last_eviction_timestamp); - page->modify->update_restored = 1; + + /* Add the update/restore flag to any previous state. */ + __wt_timestamp_set(&mod->last_stable_timestamp, + &orig->modify->last_stable_timestamp); + mod->rec_max_txn = orig->modify->rec_max_txn; + __wt_timestamp_set(&mod->rec_max_timestamp, + &orig->modify->rec_max_timestamp); + mod->restore_state = orig->modify->restore_state; + FLD_SET(mod->restore_state, WT_PAGE_RS_RESTORED); err: /* Free any resources that may have been cached in the cursor. */ WT_TRET(__wt_btcur_close(&cbt, true)); @@ -1684,7 +1694,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_RET(__wt_calloc_one(session, &ref->page_las)); *ref->page_las = multi->page_las; - WT_ASSERT(session, ref->page_las->las_max_txn != WT_TXN_NONE); + WT_ASSERT(session, ref->page_las->max_txn != WT_TXN_NONE); ref->state = WT_REF_LOOKASIDE; } diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index d445184b7dd..a2386d907c7 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -176,44 +176,96 @@ __ref_ascend(WT_SESSION_IMPL *session, } /* - * __ref_initial_descent_prev -- - * Descend the tree one level, when setting up the initial cursor position - * for a previous-cursor walk. + * __split_prev_race -- + * Check for races when descending the tree during a previous-cursor walk. */ static inline bool -__ref_initial_descent_prev( +__split_prev_race( WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp) { WT_PAGE_INDEX *pindex; /* - * When splitting an internal page into its parent, we move the WT_REF - * structures and update the parent's page index before updating the - * split page's page index, and it's not an atomic update. A thread can - * read the parent page's replacement page index, then read the split - * page's original index, or the parent page's original and the split - * page's replacement. + * Handle a cursor moving backwards through the tree or setting up at + * the end of the tree. We're passed the child page into which we're + * descending, and the parent page's page-index we used to find that + * child page. * - * This isn't a problem for a cursor setting up at the start of the tree - * because we do right-hand splits on internal pages and the initial - * part of the split page's namespace won't change as part of a split. - * A thread reading the parent page's and split page's indexes will move - * to the same slot no matter what order of indexes are read. - * - * Handle a cursor setting up at the end of the tree. + * When splitting an internal page into its parent, we move the split + * pages WT_REF structures, then update the parent's page index, then + * update the split page's page index, and nothing is atomic. A thread + * can read the parent page's replacement page index and then the split + * page's original index, or vice-versa, and either change can cause a + * cursor moving backwards through the tree to skip pages. * - * We're passed a child page into which we're descending, and on which - * we have a hazard pointer. + * This isn't a problem for a cursor setting up at the start of the tree + * or moving forward through the tree because we do right-hand splits on + * internal pages and the initial part of the split page's namespace + * won't change as part of a split (in other words, a thread reading the + * parent page's and split page's indexes will move to the same slot no + * matter what order of indexes are read. * - * Acquire a page index for the child page and then confirm we haven't - * raced with a parent split. + * Acquire the child's page index, then confirm the parent's page index + * hasn't changed, to check for reading an old version of the parent's + * page index and then reading a new version of the child's page index. */ WT_INTL_INDEX_GET(session, ref->page, pindex); if (__wt_split_descent_race(session, ref, *pindexp)) - return (false); + return (true); + + /* + * That doesn't check if we read a new version of parent's page index + * and then an old version of the child's page index. For example, if + * a thread were in a newly created split page subtree, the split + * completes into the parent before the thread reads it and descends + * into the child (where the split hasn't yet completed). + * + * Imagine an internal page with 3 child pages, with the namespaces a-f, + * g-h and i-j; the first child page splits. The parent starts out with + * the following page-index: + * + * | ... | a | g | i | ... | + * + * The split page starts out with the following page-index: + * + * | a | b | c | d | e | f | + * + * The first step is to move the c-f ranges into a new subtree, so, for + * example we might have two new internal pages 'c' and 'e', where the + * new 'c' page references the c-d namespace and the new 'e' page + * references the e-f namespace. The top of the subtree references the + * parent page, but until the parent's page index is updated, threads in + * the subtree won't be able to ascend out of the subtree. However, once + * the parent page's page index is updated to this: + * + * | ... | a | c | e | g | i | ... | + * + * threads in the subtree can ascend into the parent. Imagine a cursor + * in the c-d part of the namespace that ascends to the parent's 'c' + * slot. It would then decrement to the slot before the 'c' slot, the + * 'a' slot. + * + * The previous-cursor movement selects the last slot in the 'a' page; + * if the split page's page-index hasn't been updated yet, it selects + * the 'f' slot, which is incorrect. Once the split page's page index is + * updated to this: + * + * | a | b | + * + * the previous-cursor movement will select the 'b' slot, which is + * correct. + * + * If the last slot on the page no longer points to the current page as + * its "home", the page is being split and part of its namespace moved, + * restart. (We probably don't have to restart, I think we could spin + * until the page-index is updated, but I'm not willing to debug that + * one if I'm wrong.) + */ + if (pindex->index[pindex->entries - 1]->home != ref->page) + return (true); *pindexp = pindex; - return (true); + return (false); } /* @@ -229,22 +281,21 @@ __tree_walk_internal(WT_SESSION_IMPL *session, WT_BTREE *btree; WT_DECL_RET; WT_PAGE_INDEX *pindex; - WT_REF *couple, *couple_orig, *ref; + WT_REF *couple, *ref, *ref_orig; uint64_t sleep_usecs, yield_count; uint32_t current_state, slot; - bool empty_internal, initial_descent, prev, skip; + bool empty_internal, prev, skip; btree = S2BT(session); pindex = NULL; sleep_usecs = yield_count = 0; - empty_internal = initial_descent = false; + empty_internal = false; /* - * Tree walks are special: they look inside page structures that splits - * may want to free. Publish that the tree is active during this - * window. + * We're not supposed to walk trees without root pages. As this has not + * always been the case, assert to debug that change. */ - WT_ENTER_PAGE_INDEX(session); + WT_ASSERT(session, btree->root.page != NULL); /* Check whether deleted pages can be skipped. */ if (!LF_ISSET(WT_READ_DELETED_SKIP)) @@ -284,36 +335,41 @@ __tree_walk_internal(WT_SESSION_IMPL *session, * new leaf, couple to the next page to which we're descending, it * saves a hazard-pointer swap for each cursor page movement. * - * !!! - * NOTE: we depend on the fact it's OK to release a page we don't hold, - * that is, it's OK to release couple when couple is set to NULL. - * - * Take a copy of any held page and clear the return value. Remember - * the hazard pointer we're currently holding. - * - * Clear the returned value, it makes future error handling easier. + * The hazard pointer on the original location is held until the end of + * the movement, in case we have to restart the movement. Take a copy + * of any held page and clear the return value (it makes future error + * handling easier). */ - couple = couple_orig = ref = *refp; + couple = NULL; + ref_orig = *refp; *refp = NULL; + /* + * Tree walks are special: they look inside page structures that splits + * may want to free. Publish the tree is active during this window. + */ + WT_ENTER_PAGE_INDEX(session); + /* If no page is active, begin a walk from the start/end of the tree. */ - if (ref == NULL) { -restart: /* - * We can be here with a NULL or root WT_REF; the page release - * function handles them internally, don't complicate this code - * by calling them out. - */ - WT_ERR(__wt_page_release(session, couple, flags)); + if ((ref = ref_orig) == NULL) { + if (0) { +restart: /* + * Yield before retrying, and if we've yielded enough + * times, start sleeping so we don't burn CPU to no + * purpose. + */ + __wt_spin_backoff(&yield_count, &sleep_usecs); - /* - * We're not supposed to walk trees without root pages. As this - * has not always been the case, assert to debug that change. - */ - WT_ASSERT(session, btree->root.page != NULL); + WT_ERR(__wt_page_release(session, couple, flags)); + couple = NULL; + } - couple = couple_orig = ref = &btree->root; - initial_descent = true; - goto descend; + if ((ref = ref_orig) == NULL) { + ref = &btree->root; + WT_INTL_INDEX_GET(session, ref->page, pindex); + slot = prev ? pindex->entries - 1 : 0; + goto descend; + } } /* @@ -340,12 +396,9 @@ restart: /* /* * If at the root and returning internal pages, return - * the root page, otherwise we're done. Regardless, no - * hazard pointer is required, release the one we hold. + * the root page, otherwise we're done. */ if (__wt_ref_is_root(ref)) { - WT_ERR(__wt_page_release( - session, couple, flags)); if (!LF_ISSET(WT_READ_SKIP_INTL)) *refp = ref; goto done; @@ -356,7 +409,7 @@ restart: /* * all of the child pages were deleted, mark it for * eviction. */ - if (empty_internal && pindex->entries > 1) { + if (empty_internal) { __wt_page_evict_soon(session, ref); empty_internal = false; } @@ -367,17 +420,18 @@ restart: /* * handle restart or not-found returns, it would require * additional complexity and is not a possible return: * we're moving to the parent of the current child page, - * the parent can't have been evicted. (This is why we - * don't pass "prev" to the page-swap function, we can't - * handle the restart error returned if the parent page - * is currently splitting.) + * the parent can't have been evicted. */ if (!LF_ISSET(WT_READ_SKIP_INTL)) { WT_ERR(__wt_page_swap( - session, couple, ref, false, flags)); + session, couple, ref, flags)); + couple = NULL; *refp = ref; goto done; } + + /* Encourage races. */ + __wt_timing_stress(session, WT_TIMING_STRESS_SPLIT_8); } if (prev) @@ -389,9 +443,9 @@ restart: /* ++*walkcntp; for (;;) { - /* - * Move to the next slot, and set the reference hint if - * it's wrong (used when we continue the walk). We don't +descend: /* + * Get a reference, setting the reference hint if it's + * wrong (used when we continue the walk). We don't * always update the hints when splitting, it's expected * for them to be incorrect in some workloads. */ @@ -452,12 +506,41 @@ restart: /* break; } - ret = __wt_page_swap(session, couple, ref, prev, + ret = __wt_page_swap(session, couple, ref, WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags); + if (ret == 0) { + /* Success, so "couple" has been released. */ + couple = NULL; + + /* Return leaf pages to our caller. */ + if (!WT_PAGE_IS_INTERNAL(ref->page)) { + *refp = ref; + goto done; + } + + /* Set the new "couple" value. */ + couple = ref; + + /* Configure traversal of any internal page. */ + empty_internal = true; + if (prev) { + if (__split_prev_race( + session, ref, &pindex)) + goto restart; + slot = pindex->entries - 1; + } else { + WT_INTL_INDEX_GET( + session, ref->page, pindex); + slot = 0; + } + continue; + } /* - * Not-found is an expected return when only walking + * Not-found is an expected return when walking only * in-cache pages, or if we see a deleted page. + * + * An expected error, so "couple" is unchanged. */ if (ret == WT_NOTFOUND) { WT_NOT_READ(ret, 0); @@ -466,94 +549,24 @@ restart: /* /* * The page we're moving to might have split, in which - * case move to the last position we held. - */ - if (ret == WT_RESTART) { - ret = 0; - - /* - * Yield before retrying, and if we've yielded - * enough times, start sleeping so we don't burn - * CPU to no purpose. - */ - __wt_spin_backoff( - &yield_count, &sleep_usecs); - - /* - * If a cursor is setting up at the end of the - * tree, we can't use our parent page's index, - * because it may have already split; restart - * the walk. - */ - if (prev && initial_descent) - goto restart; - - /* - * If a new walk that never coupled from the - * root to a new saved position in the tree, - * restart the walk. - */ - if (couple == &btree->root) - goto restart; - - /* - * If restarting from some original position, - * repeat the increment or decrement we made at - * that time. Otherwise, couple is an internal - * page we've acquired after moving from that - * starting position and we can treat it as a - * new page. This works because we never acquire - * a hazard pointer on a leaf page we're not - * going to return to our caller, this will quit - * working if that ever changes. - */ - WT_ASSERT(session, - couple == couple_orig || - WT_PAGE_IS_INTERNAL(couple->page)); - ref = couple; - __ref_index_slot(session, ref, &pindex, &slot); - if (couple == couple_orig) - break; - } - WT_ERR(ret); - couple = ref; - - /* - * A new page: configure for traversal of any internal - * page's children, else return the leaf page. + * case restart the movement. + * + * An expected error, so "couple" is unchanged. */ - if (WT_PAGE_IS_INTERNAL(ref->page)) { -descend: empty_internal = true; - - /* - * There's a split race when a cursor is setting - * up at the end of the tree. - */ - if (prev && initial_descent) { - if (!__ref_initial_descent_prev( - session, ref, &pindex)) - goto restart; - } else - WT_INTL_INDEX_GET( - session, ref->page, pindex); - slot = prev ? pindex->entries - 1 : 0; - continue; - } + if (ret == WT_RESTART) + goto restart; - /* - * The tree-walk restart code knows we return any leaf - * page we acquire (never hazard-pointer coupling on - * after acquiring a leaf page), and asserts no restart - * happens while holding a leaf page. This page must be - * returned to our caller. - */ - *refp = ref; - goto done; + /* Unexpected error, so "couple" was released. */ + couple = NULL; + goto err; } } done: -err: WT_LEAVE_PAGE_INDEX(session); +err: + WT_TRET(__wt_page_release(session, couple, flags)); + WT_TRET(__wt_page_release(session, ref_orig, flags)); + WT_LEAVE_PAGE_INDEX(session); return (ret); } diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c index 8cc6630599b..123b640cdf4 100644 --- a/src/third_party/wiredtiger/src/btree/col_srch.c +++ b/src/third_party/wiredtiger/src/btree/col_srch.c @@ -179,6 +179,9 @@ descend: /* descent = pindex->index[base - 1]; } + /* Encourage races. */ + __wt_timing_stress(session, WT_TIMING_STRESS_SPLIT_9); + /* * Swap the current page for the child page. If the page splits * while we're retrieving it, restart the search at the root. @@ -192,7 +195,7 @@ descend: /* * holding nothing on failure. */ if ((ret = __wt_page_swap(session, - current, descent, false, WT_READ_RESTART_OK)) == 0) { + current, descent, WT_READ_RESTART_OK)) == 0) { current = descent; continue; } diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c index 20acda8a1ab..a3f05a2700f 100644 --- a/src/third_party/wiredtiger/src/btree/row_srch.c +++ b/src/third_party/wiredtiger/src/btree/row_srch.c @@ -431,7 +431,10 @@ append: if (__wt_split_descent_race( goto restart; } -descend: /* +descend: /* Encourage races. */ + __wt_timing_stress(session, WT_TIMING_STRESS_SPLIT_9); + + /* * Swap the current page for the child page. If the page splits * while we're retrieving it, restart the search at the root. * We cannot restart in the "current" page; for example, if a @@ -444,7 +447,7 @@ descend: /* * holding nothing on failure. */ if ((ret = __wt_page_swap(session, - current, descent, false, WT_READ_RESTART_OK)) == 0) { + current, descent, WT_READ_RESTART_OK)) == 0) { current = descent; continue; } diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c index 54bf8c78171..cd11a3793c5 100644 --- a/src/third_party/wiredtiger/src/cache/cache_las.c +++ b/src/third_party/wiredtiger/src/cache/cache_las.c @@ -404,43 +404,34 @@ __wt_las_page_skip_locked(WT_SESSION_IMPL *session, WT_REF *ref) return (false); /* - * If page image has the newest version of data and includes data newer - * than the reader's snapshot then we should read the history. + * If some of the page's history overlaps with the reader's snapshot + * then we have to read it. This is only relevant if we chose versions + * that were unstable when the page was written. */ - if (ref->page_las->las_skew_newest && - WT_TXNID_LE(txn->snap_min, ref->page_las->las_max_txn)) + if (ref->page_las->skew_newest && + WT_TXNID_LE(txn->snap_min, ref->page_las->unstable_txn)) return (false); - /* - * If page image has the oldest version of data and some of the history - * overlaps with the reader's snapshot then we should read the history. - */ - if (!ref->page_las->las_skew_newest && - WT_TXNID_LE(ref->page_las->las_min_txn, txn->snap_max)) - return (false); - - if (!F_ISSET(txn, WT_TXN_HAS_TS_READ) && ref->page_las->las_skew_newest) - return (true); + if (!F_ISSET(txn, WT_TXN_HAS_TS_READ)) + return (ref->page_las->skew_newest); #ifdef HAVE_TIMESTAMPS /* * Skip lookaside pages if reading as of a timestamp, we evicted new * versions of data and all the updates are in the past. */ - if (F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) && - ref->page_las->las_skew_newest && + if (ref->page_las->skew_newest && __wt_timestamp_cmp( - &ref->page_las->onpage_timestamp, &session->txn.read_timestamp) < 0) + &txn->read_timestamp, &ref->page_las->unstable_timestamp) > 0) return (true); /* * Skip lookaside pages if reading as of a timestamp, we evicted old - * versions of data and all the updates are in the future. + * versions of data and all the unstable updates are in the future. */ - if (F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) && - !ref->page_las->las_skew_newest && + if (!ref->page_las->skew_newest && __wt_timestamp_cmp( - &ref->page_las->min_timestamp, &session->txn.read_timestamp) > 0) + &txn->read_timestamp, &ref->page_las->unstable_timestamp) < 0) return (true); #endif @@ -563,8 +554,8 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi) (void)__wt_eviction_dirty_needed(session, &pct_dirty); #ifdef HAVE_TIMESTAMPS - WT_RET(__wt_timestamp_to_hex_string( - session, hex_timestamp, &multi->page_las.min_timestamp)); + WT_RET(__wt_timestamp_to_hex_string(session, hex_timestamp, + &multi->page_las.unstable_timestamp)); ts = hex_timestamp; #else ts = "disabled"; @@ -573,14 +564,14 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi) WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY, "Page reconciliation triggered lookaside write " "file ID %" PRIu32 ", page ID %" PRIu64 ". " - "Max txn ID %" PRIu64 ", min timestamp %s, skewed %s. " + "Max txn ID %" PRIu64 ", unstable timestamp %s, %s. " "Entries now in lookaside file: %" PRId64 ", " "cache dirty: %2.3f%% , " "cache use: %2.3f%%", btree_id, multi->page_las.las_pageid, - multi->page_las.las_max_txn, + multi->page_las.max_txn, ts, - multi->page_las.las_skew_newest ? "newest" : "oldest", + multi->page_las.skew_newest ? "newest" : "not newest", WT_STAT_READ(conn->stats, cache_lookaside_entries), pct_dirty, pct_full); } @@ -724,8 +715,7 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, * table. (We check the length because row-store doesn't * write zero-length data items.) */ - if (multi->page_las.las_skew_newest && - upd == list->onpage_upd && + if (upd == list->onpage_upd && upd->size > 0 && (upd->type == WT_UPDATE_STANDARD || upd->type == WT_UPDATE_MODIFY)) { diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index 7dee7a5e756..0945d768ce2 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -138,6 +138,7 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { { "async", "category", NULL, NULL, confchk_wiredtiger_open_async_subconfigs, 3 }, + { "cache_max_wait_ms", "int", NULL, "min=0", NULL, 0 }, { "cache_overhead", "int", NULL, "min=0,max=30", NULL, 0 }, { "cache_size", "int", NULL, "min=1MB,max=10TB", NULL, 0 }, { "checkpoint", "category", @@ -189,7 +190,7 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { { "timing_stress_for_test", "list", NULL, "choices=[\"checkpoint_slow\",\"lookaside_sweep_race\"," "\"split_1\",\"split_2\",\"split_3\",\"split_4\",\"split_5\"," - "\"split_6\",\"split_7\"]", + "\"split_6\",\"split_7\",\"split_8\",\"split_9\"]", NULL, 0 }, { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\"," @@ -806,6 +807,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { { "buffer_alignment", "int", NULL, "min=-1,max=1MB", NULL, 0 }, { "builtin_extension_config", "string", NULL, NULL, NULL, 0 }, { "cache_cursors", "boolean", NULL, NULL, NULL, 0 }, + { "cache_max_wait_ms", "int", NULL, "min=0", NULL, 0 }, { "cache_overhead", "int", NULL, "min=0,max=30", NULL, 0 }, { "cache_size", "int", NULL, "min=1MB,max=10TB", NULL, 0 }, { "checkpoint", "category", @@ -879,7 +881,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { { "timing_stress_for_test", "list", NULL, "choices=[\"checkpoint_slow\",\"lookaside_sweep_race\"," "\"split_1\",\"split_2\",\"split_3\",\"split_4\",\"split_5\"," - "\"split_6\",\"split_7\"]", + "\"split_6\",\"split_7\",\"split_8\",\"split_9\"]", NULL, 0 }, { "transaction_sync", "category", NULL, NULL, @@ -909,6 +911,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { { "buffer_alignment", "int", NULL, "min=-1,max=1MB", NULL, 0 }, { "builtin_extension_config", "string", NULL, NULL, NULL, 0 }, { "cache_cursors", "boolean", NULL, NULL, NULL, 0 }, + { "cache_max_wait_ms", "int", NULL, "min=0", NULL, 0 }, { "cache_overhead", "int", NULL, "min=0,max=30", NULL, 0 }, { "cache_size", "int", NULL, "min=1MB,max=10TB", NULL, 0 }, { "checkpoint", "category", @@ -982,7 +985,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { { "timing_stress_for_test", "list", NULL, "choices=[\"checkpoint_slow\",\"lookaside_sweep_race\"," "\"split_1\",\"split_2\",\"split_3\",\"split_4\",\"split_5\"," - "\"split_6\",\"split_7\"]", + "\"split_6\",\"split_7\",\"split_8\",\"split_9\"]", NULL, 0 }, { "transaction_sync", "category", NULL, NULL, @@ -1013,6 +1016,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { { "buffer_alignment", "int", NULL, "min=-1,max=1MB", NULL, 0 }, { "builtin_extension_config", "string", NULL, NULL, NULL, 0 }, { "cache_cursors", "boolean", NULL, NULL, NULL, 0 }, + { "cache_max_wait_ms", "int", NULL, "min=0", NULL, 0 }, { "cache_overhead", "int", NULL, "min=0,max=30", NULL, 0 }, { "cache_size", "int", NULL, "min=1MB,max=10TB", NULL, 0 }, { "checkpoint", "category", @@ -1082,7 +1086,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { { "timing_stress_for_test", "list", NULL, "choices=[\"checkpoint_slow\",\"lookaside_sweep_race\"," "\"split_1\",\"split_2\",\"split_3\",\"split_4\",\"split_5\"," - "\"split_6\",\"split_7\"]", + "\"split_6\",\"split_7\",\"split_8\",\"split_9\"]", NULL, 0 }, { "transaction_sync", "category", NULL, NULL, @@ -1111,6 +1115,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { { "buffer_alignment", "int", NULL, "min=-1,max=1MB", NULL, 0 }, { "builtin_extension_config", "string", NULL, NULL, NULL, 0 }, { "cache_cursors", "boolean", NULL, NULL, NULL, 0 }, + { "cache_max_wait_ms", "int", NULL, "min=0", NULL, 0 }, { "cache_overhead", "int", NULL, "min=0,max=30", NULL, 0 }, { "cache_size", "int", NULL, "min=1MB,max=10TB", NULL, 0 }, { "checkpoint", "category", @@ -1180,7 +1185,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { { "timing_stress_for_test", "list", NULL, "choices=[\"checkpoint_slow\",\"lookaside_sweep_race\"," "\"split_1\",\"split_2\",\"split_3\",\"split_4\",\"split_5\"," - "\"split_6\",\"split_7\"]", + "\"split_6\",\"split_7\",\"split_8\",\"split_9\"]", NULL, 0 }, { "transaction_sync", "category", NULL, NULL, @@ -1250,13 +1255,13 @@ static const WT_CONFIG_ENTRY config_entries[] = { confchk_WT_CONNECTION_query_timestamp, 1 }, { "WT_CONNECTION.reconfigure", - "async=(enabled=false,ops_max=1024,threads=2),cache_overhead=8," - "cache_size=100MB,checkpoint=(log_size=0,wait=0)," - "compatibility=(release=),error_prefix=,eviction=(threads_max=8," - "threads_min=1),eviction_checkpoint_target=5," - "eviction_dirty_target=5,eviction_dirty_trigger=20," - "eviction_target=80,eviction_trigger=95," - "file_manager=(close_handle_minimum=250,close_idle_time=30," + "async=(enabled=false,ops_max=1024,threads=2),cache_max_wait_ms=0" + ",cache_overhead=8,cache_size=100MB,checkpoint=(log_size=0," + "wait=0),compatibility=(release=),error_prefix=," + "eviction=(threads_max=8,threads_min=1)," + "eviction_checkpoint_target=1,eviction_dirty_target=5," + "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" + ",file_manager=(close_handle_minimum=250,close_idle_time=30," "close_scan_interval=10),log=(archive=true,prealloc=true," "zero_fill=false),lsm_manager=(merge=true,worker_thread_max=4)," "lsm_merge=true,operation_tracking=(enabled=false,path=\".\")," @@ -1264,7 +1269,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "statistics=none,statistics_log=(json=false,on_close=false," "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," "timing_stress_for_test=,verbose=", - confchk_WT_CONNECTION_reconfigure, 22 + confchk_WT_CONNECTION_reconfigure, 23 }, { "WT_CONNECTION.rollback_to_stable", "", @@ -1489,66 +1494,67 @@ static const WT_CONFIG_ENTRY config_entries[] = { }, { "wiredtiger_open", "async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1" - ",builtin_extension_config=,cache_cursors=true,cache_overhead=8," - "cache_size=100MB,checkpoint=(log_size=0,wait=0)," - "checkpoint_sync=true,compatibility=(release=,require_max=," - "require_min=),config_base=true,create=false,direct_io=," - "encryption=(keyid=,name=,secretkey=),error_prefix=," - "eviction=(threads_max=8,threads_min=1)," - "eviction_checkpoint_target=5,eviction_dirty_target=5," - "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" - ",exclusive=false,extensions=,file_extend=," - "file_manager=(close_handle_minimum=250,close_idle_time=30," - "close_scan_interval=10),hazard_max=1000,in_memory=false," - "log=(archive=true,compressor=,enabled=false,file_max=100MB," - "path=\".\",prealloc=true,recover=on,zero_fill=false)," - "lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true," - "mmap=true,multiprocess=false,operation_tracking=(enabled=false," - "path=\".\"),readonly=false,session_max=100," - "session_scratch_max=2MB,session_table_cache=true," - "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB)," + ",builtin_extension_config=,cache_cursors=true," + "cache_max_wait_ms=0,cache_overhead=8,cache_size=100MB," + "checkpoint=(log_size=0,wait=0),checkpoint_sync=true," + "compatibility=(release=,require_max=,require_min=)," + "config_base=true,create=false,direct_io=,encryption=(keyid=," + "name=,secretkey=),error_prefix=,eviction=(threads_max=8," + "threads_min=1),eviction_checkpoint_target=1," + "eviction_dirty_target=5,eviction_dirty_trigger=20," + "eviction_target=80,eviction_trigger=95,exclusive=false," + "extensions=,file_extend=,file_manager=(close_handle_minimum=250," + "close_idle_time=30,close_scan_interval=10),hazard_max=1000," + "in_memory=false,log=(archive=true,compressor=,enabled=false," + "file_max=100MB,path=\".\",prealloc=true,recover=on," + "zero_fill=false),lsm_manager=(merge=true,worker_thread_max=4)," + "lsm_merge=true,mmap=true,multiprocess=false," + "operation_tracking=(enabled=false,path=\".\"),readonly=false," + "session_max=100,session_scratch_max=2MB,session_table_cache=true" + ",shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB)," "statistics=none,statistics_log=(json=false,on_close=false," "path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," "timing_stress_for_test=,transaction_sync=(enabled=false," "method=fsync),use_environment=true,use_environment_priv=false," "verbose=,write_through=", - confchk_wiredtiger_open, 45 + confchk_wiredtiger_open, 46 }, { "wiredtiger_open_all", "async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1" - ",builtin_extension_config=,cache_cursors=true,cache_overhead=8," - "cache_size=100MB,checkpoint=(log_size=0,wait=0)," - "checkpoint_sync=true,compatibility=(release=,require_max=," - "require_min=),config_base=true,create=false,direct_io=," - "encryption=(keyid=,name=,secretkey=),error_prefix=," - "eviction=(threads_max=8,threads_min=1)," - "eviction_checkpoint_target=5,eviction_dirty_target=5," - "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" - ",exclusive=false,extensions=,file_extend=," - "file_manager=(close_handle_minimum=250,close_idle_time=30," - "close_scan_interval=10),hazard_max=1000,in_memory=false," - "log=(archive=true,compressor=,enabled=false,file_max=100MB," - "path=\".\",prealloc=true,recover=on,zero_fill=false)," - "lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true," - "mmap=true,multiprocess=false,operation_tracking=(enabled=false," - "path=\".\"),readonly=false,session_max=100," - "session_scratch_max=2MB,session_table_cache=true," - "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB)," + ",builtin_extension_config=,cache_cursors=true," + "cache_max_wait_ms=0,cache_overhead=8,cache_size=100MB," + "checkpoint=(log_size=0,wait=0),checkpoint_sync=true," + "compatibility=(release=,require_max=,require_min=)," + "config_base=true,create=false,direct_io=,encryption=(keyid=," + "name=,secretkey=),error_prefix=,eviction=(threads_max=8," + "threads_min=1),eviction_checkpoint_target=1," + "eviction_dirty_target=5,eviction_dirty_trigger=20," + "eviction_target=80,eviction_trigger=95,exclusive=false," + "extensions=,file_extend=,file_manager=(close_handle_minimum=250," + "close_idle_time=30,close_scan_interval=10),hazard_max=1000," + "in_memory=false,log=(archive=true,compressor=,enabled=false," + "file_max=100MB,path=\".\",prealloc=true,recover=on," + "zero_fill=false),lsm_manager=(merge=true,worker_thread_max=4)," + "lsm_merge=true,mmap=true,multiprocess=false," + "operation_tracking=(enabled=false,path=\".\"),readonly=false," + "session_max=100,session_scratch_max=2MB,session_table_cache=true" + ",shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB)," "statistics=none,statistics_log=(json=false,on_close=false," "path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," "timing_stress_for_test=,transaction_sync=(enabled=false," "method=fsync),use_environment=true,use_environment_priv=false," "verbose=,version=(major=0,minor=0),write_through=", - confchk_wiredtiger_open_all, 46 + confchk_wiredtiger_open_all, 47 }, { "wiredtiger_open_basecfg", "async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1" - ",builtin_extension_config=,cache_cursors=true,cache_overhead=8," - "cache_size=100MB,checkpoint=(log_size=0,wait=0)," - "checkpoint_sync=true,compatibility=(release=,require_max=," - "require_min=),direct_io=,encryption=(keyid=,name=,secretkey=)," - "error_prefix=,eviction=(threads_max=8,threads_min=1)," - "eviction_checkpoint_target=5,eviction_dirty_target=5," + ",builtin_extension_config=,cache_cursors=true," + "cache_max_wait_ms=0,cache_overhead=8,cache_size=100MB," + "checkpoint=(log_size=0,wait=0),checkpoint_sync=true," + "compatibility=(release=,require_max=,require_min=),direct_io=," + "encryption=(keyid=,name=,secretkey=),error_prefix=," + "eviction=(threads_max=8,threads_min=1)," + "eviction_checkpoint_target=1,eviction_dirty_target=5," "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" ",extensions=,file_extend=,file_manager=(close_handle_minimum=250" ",close_idle_time=30,close_scan_interval=10),hazard_max=1000," @@ -1563,16 +1569,17 @@ static const WT_CONFIG_ENTRY config_entries[] = { "path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," "timing_stress_for_test=,transaction_sync=(enabled=false," "method=fsync),verbose=,version=(major=0,minor=0),write_through=", - confchk_wiredtiger_open_basecfg, 40 + confchk_wiredtiger_open_basecfg, 41 }, { "wiredtiger_open_usercfg", "async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1" - ",builtin_extension_config=,cache_cursors=true,cache_overhead=8," - "cache_size=100MB,checkpoint=(log_size=0,wait=0)," - "checkpoint_sync=true,compatibility=(release=,require_max=," - "require_min=),direct_io=,encryption=(keyid=,name=,secretkey=)," - "error_prefix=,eviction=(threads_max=8,threads_min=1)," - "eviction_checkpoint_target=5,eviction_dirty_target=5," + ",builtin_extension_config=,cache_cursors=true," + "cache_max_wait_ms=0,cache_overhead=8,cache_size=100MB," + "checkpoint=(log_size=0,wait=0),checkpoint_sync=true," + "compatibility=(release=,require_max=,require_min=),direct_io=," + "encryption=(keyid=,name=,secretkey=),error_prefix=," + "eviction=(threads_max=8,threads_min=1)," + "eviction_checkpoint_target=1,eviction_dirty_target=5," "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" ",extensions=,file_extend=,file_manager=(close_handle_minimum=250" ",close_idle_time=30,close_scan_interval=10),hazard_max=1000," @@ -1587,7 +1594,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," "timing_stress_for_test=,transaction_sync=(enabled=false," "method=fsync),verbose=,write_through=", - confchk_wiredtiger_open_usercfg, 39 + confchk_wiredtiger_open_usercfg, 40 }, { NULL, NULL, NULL, 0 } }; diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index d322caac04a..c1e6e1eb6cf 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -2024,6 +2024,8 @@ __wt_timing_stress_config(WT_SESSION_IMPL *session, const char *cfg[]) { "split_5", WT_TIMING_STRESS_SPLIT_5 }, { "split_6", WT_TIMING_STRESS_SPLIT_6 }, { "split_7", WT_TIMING_STRESS_SPLIT_7 }, + { "split_8", WT_TIMING_STRESS_SPLIT_8 }, + { "split_9", WT_TIMING_STRESS_SPLIT_9 }, { NULL, 0 } }; WT_CONFIG_ITEM cval, sval; diff --git a/src/third_party/wiredtiger/src/conn/conn_cache.c b/src/third_party/wiredtiger/src/conn/conn_cache.c index 00de16e6c21..dbb602921a8 100644 --- a/src/third_party/wiredtiger/src/conn/conn_cache.c +++ b/src/third_party/wiredtiger/src/conn/conn_cache.c @@ -143,6 +143,10 @@ __cache_config_local(WT_SESSION_IMPL *session, bool shared, const char *cfg[]) conn->evict_threads_max = evict_threads_max; conn->evict_threads_min = evict_threads_min; + /* Retrieve the wait time and convert from milliseconds */ + WT_RET(__wt_config_gets(session, cfg, "cache_max_wait_ms", &cval)); + cache->cache_max_wait_us = (uint64_t)(cval.val * WT_THOUSAND); + return (0); } diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 8396612b7ca..05397843fc7 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -563,7 +563,7 @@ __evict_update_work(WT_SESSION_IMPL *session) conn = S2C(session); cache = conn->cache; - dirty_target = cache->eviction_dirty_target; + dirty_target = __wt_eviction_dirty_target(cache); dirty_trigger = cache->eviction_dirty_trigger; target = cache->eviction_target; trigger = cache->eviction_trigger; @@ -2345,7 +2345,8 @@ __wt_cache_eviction_worker( WT_TRACK_OP_DECL; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *txn_state; - uint64_t initial_progress, max_progress, time_start, time_stop; + uint64_t elapsed, time_start, time_stop; + uint64_t initial_progress, max_progress; bool timer; WT_TRACK_OP_INIT(session); @@ -2367,8 +2368,7 @@ __wt_cache_eviction_worker( __wt_evict_server_wake(session); /* Track how long application threads spend doing eviction. */ - timer = - WT_STAT_ENABLED(session) && !F_ISSET(session, WT_SESSION_INTERNAL); + timer = !F_ISSET(session, WT_SESSION_INTERNAL); if (timer) time_start = __wt_clock(session); @@ -2405,22 +2405,10 @@ __wt_cache_eviction_worker( /* See if eviction is still needed. */ if (!__wt_eviction_needed(session, busy, readonly, &pct_full) || - ((pct_full < 100.0 || cache->eviction_scrub_limit > 0.0) && - (cache->eviction_progress > + (pct_full < 100.0 && (cache->eviction_progress > initial_progress + max_progress))) break; - /* - * Don't make application threads participate in scrubbing for - * checkpoints. Just throttle updates instead. - */ - if (WT_EVICT_HAS_WORKERS(session) && - cache->eviction_scrub_limit > 0.0 && - !F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD)) { - __wt_yield(); - continue; - } - /* Evict a page. */ switch (ret = __evict_page(session, false)) { case 0: @@ -2438,13 +2426,26 @@ __wt_cache_eviction_worker( default: goto err; } + /* Stop if we've exceeded the time out. */ + if (timer && cache->cache_max_wait_us != 0) { + time_stop = __wt_clock(session); + if (session->cache_wait_us + + WT_CLOCKDIFF_US(time_stop, time_start) > + cache->cache_max_wait_us) + goto err; + } } err: if (timer) { time_stop = __wt_clock(session); - WT_STAT_CONN_INCRV(session, - application_cache_time, - WT_CLOCKDIFF_US(time_stop, time_start)); + elapsed = WT_CLOCKDIFF_US(time_stop, time_start); + WT_STAT_CONN_INCRV(session, application_cache_time, elapsed); + session->cache_wait_us += elapsed; + if (cache->cache_max_wait_us != 0 && + session->cache_wait_us > cache->cache_max_wait_us) { + WT_TRET(WT_CACHE_FULL); + WT_STAT_CONN_INCR(session, cache_timed_out_ops); + } } done: WT_TRACK_OP_END(session); diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h index ca2176fcf0e..aabb19c86aa 100644 --- a/src/third_party/wiredtiger/src/include/api.h +++ b/src/third_party/wiredtiger/src/include/api.h @@ -48,6 +48,9 @@ WT_TRACK_OP_INIT(s); \ WT_SINGLE_THREAD_CHECK_START(s); \ WT_ERR(WT_SESSION_CHECK_PANIC(s)); \ + /* Reset wait time if this isn't an API re entry. */ \ + if (__oldname == NULL) \ + (s)->cache_wait_us = 0; \ __wt_verbose((s), WT_VERB_API, "%s", "CALL: " #h ":" #n) #define API_CALL_NOCONF(s, h, n, dh) do { \ diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 33e382feba2..64e84e59d36 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -234,14 +234,13 @@ struct __wt_ovfl_reuse { */ struct __wt_page_lookaside { uint64_t las_pageid; /* Page ID in lookaside */ - uint64_t las_max_txn; /* Max transaction ID in lookaside */ - uint64_t las_min_txn; /* Min transaction ID in lookaside */ - WT_DECL_TIMESTAMP(min_timestamp)/* Min timestamp in lookaside */ - /* Max timestamp on page */ - WT_DECL_TIMESTAMP(onpage_timestamp) + uint64_t max_txn; /* Maximum transaction ID */ + uint64_t unstable_txn; /* First transaction ID not on page */ + WT_DECL_TIMESTAMP(max_timestamp)/* Maximum timestamp */ + WT_DECL_TIMESTAMP(unstable_timestamp)/* First timestamp not on page */ bool eviction_to_lookaside; /* Revert to lookaside on eviction */ - bool las_skew_newest; /* On-page skewed to newest */ bool invalid; /* History is required correct reads */ + bool skew_newest; /* Page image has newest versions */ }; /* @@ -270,6 +269,9 @@ struct __wt_page_modify { uint64_t rec_max_txn; WT_DECL_TIMESTAMP(rec_max_timestamp) + /* Stable timestamp at last reconciliation. */ + WT_DECL_TIMESTAMP(last_stable_timestamp) + /* The largest update transaction ID (approximate). */ uint64_t update_txn; @@ -481,7 +483,9 @@ struct __wt_page_modify { #define WT_PM_REC_REPLACE 3 /* Reconciliation: single block */ uint8_t rec_result; /* Reconciliation state */ - uint8_t update_restored; /* Page created by restoring updates */ +#define WT_PAGE_RS_LOOKASIDE 0x1 +#define WT_PAGE_RS_RESTORED 0x2 + uint8_t restore_state; /* Created by restoring updates */ }; /* diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 81c166eb0e4..d7edcd00d5a 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -1190,10 +1190,10 @@ __wt_page_las_active(WT_SESSION_IMPL *session, WT_REF *ref) if ((page_las = ref->page_las) == NULL) return (false); - if (page_las->invalid || !ref->page_las->las_skew_newest) + if (page_las->invalid || !ref->page_las->skew_newest) return (true); - if (__wt_txn_visible_all(session, page_las->las_max_txn, - WT_TIMESTAMP_NULL(&page_las->onpage_timestamp))) + if (__wt_txn_visible_all(session, page_las->max_txn, + WT_TIMESTAMP_NULL(&page_las->max_timestamp))) return (false); return (true); @@ -1329,6 +1329,7 @@ __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) static inline bool __wt_page_evict_retry(WT_SESSION_IMPL *session, WT_PAGE *page) { + WT_DECL_TIMESTAMP(pinned_ts) WT_PAGE_MODIFY *mod; WT_TXN_GLOBAL *txn_global; @@ -1338,7 +1339,8 @@ __wt_page_evict_retry(WT_SESSION_IMPL *session, WT_PAGE *page) * If the page hasn't been through one round of update/restore, give it * a try. */ - if ((mod = page->modify) == NULL || !mod->update_restored) + if ((mod = page->modify) == NULL || + !FLD_ISSET(mod->restore_state, WT_PAGE_RS_RESTORED)) return (true); /* @@ -1356,17 +1358,12 @@ __wt_page_evict_retry(WT_SESSION_IMPL *session, WT_PAGE *page) return (true); #ifdef HAVE_TIMESTAMPS - { - bool same_timestamp; - - same_timestamp = false; - if (!__wt_timestamp_iszero(&mod->last_eviction_timestamp)) - WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, - same_timestamp = __wt_timestamp_cmp( + if (!__wt_timestamp_iszero(&mod->last_eviction_timestamp)) { + __wt_txn_pinned_timestamp(session, &pinned_ts); + if (__wt_timestamp_cmp( &mod->last_eviction_timestamp, - &txn_global->pinned_timestamp) == 0); - if (!same_timestamp) - return (true); + &txn_global->pinned_timestamp) != 0) + return (true); } #endif @@ -1605,6 +1602,8 @@ __wt_split_descent_race( * update. A thread can read the parent page's original page index and * then read the split page's replacement index. * + * For example, imagine a search descending the tree. + * * Because internal page splits work by truncating the original page to * the initial part of the original page, the result of this race is we * will have a search key that points past the end of the current page. @@ -1649,73 +1648,17 @@ __wt_split_descent_race( * work by truncating the split page, so the split page search is for * content the split page retains after the split, and we ignore this * race. - */ - WT_INTL_INDEX_GET(session, ref->home, pindex); - return (pindex != saved_pindex); -} - -/* - * __wt_split_prev_race -- - * Return if we raced with an internal page split when moving backwards - * through the tree. - */ -static inline bool -__wt_split_prev_race(WT_SESSION_IMPL *session, WT_REF *ref) -{ - WT_PAGE_INDEX *pindex; - - /* - * There's a split race when a cursor moving backwards through the tree - * descends the tree. If we're splitting an internal page into its - * parent, we move the WT_REF structures and update the parent's page - * index before updating the split page's page index, and it's not an - * atomic update. A thread can read the parent and split page's original - * indexes during a split, or read the parent page's replacement page - * index and then read the split page's original index, either of which - * can lead to skipping pages. * - * For example, imagine an internal page with 3 child pages, with the - * namespaces a-f, g-h and i-j; the first child page splits. The parent - * starts out with the following page-index: + * This code is a general purpose check for a descent race and we call + * it in other cases, for example, a cursor traversing backwards through + * the tree. * - * | ... | a | g | i | ... | - * - * The split page starts out with the following page-index: - * - * | a | b | c | d | e | f | - * - * The first step is to move the c-f ranges into a new subtree, so, for - * example we might have two new internal pages 'c' and 'e', where the - * new 'c' page references the c-d namespace and the new 'e' page - * references the e-f namespace. The top of the subtree references the - * parent page, but until the parent's page index is updated, threads in - * the subtree won't be able to ascend out of the subtree. However, once - * the parent page's page index is updated to this: - * - * | ... | a | c | e | g | i | ... | - * - * threads in the subtree can ascend into the parent. Imagine a cursor - * in the c-d part of the namespace that ascends to the parent's 'c' - * slot. It would then decrement to the slot before the 'c' slot, the - * 'a' slot. - * - * The previous-cursor movement selects the last slot in the 'a' page; - * if the split page's page-index hasn't been updated yet, it selects - * the 'f' slot, which is incorrect. Once the split page's page index is - * updated to this: - * - * | a | b | - * - * the previous-cursor movement will select the 'b' slot, which is - * correct. - * - * This function takes an argument which is the internal page into which - * we're coupling. If the last slot on the page no longer points to - * the current page as its "home", the page is being split and part of - * its namespace moved, we have to restart. + * Presumably we acquired a page index on the child page before calling + * this code, don't re-order that acquisition with this check. */ - WT_INTL_INDEX_GET(session, ref->page, pindex); - return (pindex->index[pindex->entries - 1]->home != ref->page); + WT_BARRIER(); + WT_INTL_INDEX_GET(session, ref->home, pindex); + return (pindex != saved_pindex); } /* @@ -1724,8 +1667,8 @@ __wt_split_prev_race(WT_SESSION_IMPL *session, WT_REF *ref) * coupling up/down the tree. */ static inline int -__wt_page_swap_func(WT_SESSION_IMPL *session, - WT_REF *held, WT_REF *want, bool prev_race, uint32_t flags +__wt_page_swap_func( + WT_SESSION_IMPL *session, WT_REF *held, WT_REF *want, uint32_t flags #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif @@ -1755,18 +1698,6 @@ __wt_page_swap_func(WT_SESSION_IMPL *session, ); /* - * We can race when descending into an internal page as part of moving - * backwards through the tree, and we have to detect that race before - * releasing the page from which we are coupling, else we can't restart - * the movement. - */ - if (ret == 0 && prev_race && WT_PAGE_IS_INTERNAL(want->page) && - __wt_split_prev_race(session, want)) { - ret = WT_RESTART; - WT_TRET(__wt_page_release(session, want, flags)); - } - - /* * Expected failures: page not found or restart. Our callers list the * errors they're expecting to handle. */ diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h index 1299d3e90e3..7d07e6dfd98 100644 --- a/src/third_party/wiredtiger/src/include/cache.h +++ b/src/third_party/wiredtiger/src/include/cache.h @@ -120,11 +120,11 @@ struct __wt_cache { double eviction_checkpoint_target;/* Percent to reduce dirty to during checkpoint scrubs */ - double eviction_scrub_limit; /* Percent of cache to trigger - dirty eviction during checkpoint - scrubs */ + double eviction_scrub_target; /* Current scrub target */ u_int overhead_pct; /* Cache percent adjustment */ + uint64_t cache_max_wait_us; /* Maximum time an operation waits for + * space in cache */ /* * Eviction thread tuning information. diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i index fc127942d02..7f12949e162 100644 --- a/src/third_party/wiredtiger/src/include/cache.i +++ b/src/third_party/wiredtiger/src/include/cache.i @@ -276,6 +276,22 @@ __wt_eviction_clean_needed(WT_SESSION_IMPL *session, double *pct_fullp) } /* + * __wt_eviction_dirty_target -- + * Return the effective dirty target (including checkpoint scrubbing). + */ +static inline double +__wt_eviction_dirty_target(WT_CACHE *cache) +{ + double dirty_target, scrub_target; + + dirty_target = cache->eviction_dirty_target; + scrub_target = cache->eviction_scrub_target; + + return (scrub_target > 0 && scrub_target < dirty_target ? + scrub_target : dirty_target); +} + +/* * __wt_eviction_dirty_needed -- * Return if an application thread should do eviction due to the total * volume of dirty data in cache. @@ -284,7 +300,6 @@ static inline bool __wt_eviction_dirty_needed(WT_SESSION_IMPL *session, double *pct_fullp) { WT_CACHE *cache; - double dirty_trigger; uint64_t dirty_inuse, bytes_max; cache = S2C(session)->cache; @@ -299,10 +314,8 @@ __wt_eviction_dirty_needed(WT_SESSION_IMPL *session, double *pct_fullp) if (pct_fullp != NULL) *pct_fullp = ((100.0 * dirty_inuse) / bytes_max); - if ((dirty_trigger = cache->eviction_scrub_limit) < 1.0) - dirty_trigger = cache->eviction_dirty_trigger; - - return (dirty_inuse > (uint64_t)(dirty_trigger * bytes_max) / 100); + return (dirty_inuse > (uint64_t)( + cache->eviction_dirty_trigger * bytes_max) / 100); } /* diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index d0bebe8da5d..22459b0072c 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -458,6 +458,8 @@ struct __wt_connection_impl { #define WT_TIMING_STRESS_SPLIT_5 0x040u #define WT_TIMING_STRESS_SPLIT_6 0x080u #define WT_TIMING_STRESS_SPLIT_7 0x100u +#define WT_TIMING_STRESS_SPLIT_8 0x200u +#define WT_TIMING_STRESS_SPLIT_9 0x400u /* AUTOMATIC FLAG VALUE GENERATION STOP */ uint64_t timing_stress_flags; diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 7e2d4a4786d..c78c460f445 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -498,6 +498,7 @@ extern int __wt_lsm_work_switch(WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT **ent extern int __wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern bool __wt_lsm_chunk_visible_all(WT_SESSION_IMPL *session, WT_LSM_CHUNK *chunk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_lsm_work_enable_evict(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_lsm_worker_stop(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/third_party/wiredtiger/src/include/lsm.h b/src/third_party/wiredtiger/src/include/lsm.h index f515e03519a..67ef28757ef 100644 --- a/src/third_party/wiredtiger/src/include/lsm.h +++ b/src/third_party/wiredtiger/src/include/lsm.h @@ -107,7 +107,8 @@ struct __wt_lsm_chunk { uint32_t id; /* ID used to generate URIs */ uint32_t generation; /* Merge generation */ uint32_t refcnt; /* Number of worker thread references */ - uint32_t bloom_busy; /* Number of worker thread references */ + uint32_t bloom_busy; /* Currently creating bloom filter */ + uint32_t evict_enabled; /* Eviction allowed on the chunk */ int8_t empty; /* 1/0: checkpoint missing */ int8_t evicted; /* 1/0: in-memory chunk was evicted */ @@ -129,13 +130,19 @@ struct __wt_lsm_chunk { * is required. */ /* AUTOMATIC FLAG VALUE GENERATION START */ -#define WT_LSM_WORK_BLOOM 0x01u /* Create a bloom filter */ -#define WT_LSM_WORK_DROP 0x02u /* Drop unused chunks */ -#define WT_LSM_WORK_FLUSH 0x04u /* Flush a chunk to disk */ -#define WT_LSM_WORK_MERGE 0x08u /* Look for a tree merge */ -#define WT_LSM_WORK_SWITCH 0x10u /* Switch to new in-memory chunk */ +#define WT_LSM_WORK_BLOOM 0x01u /* Create a bloom filter */ +#define WT_LSM_WORK_DROP 0x02u /* Drop unused chunks */ +#define WT_LSM_WORK_ENABLE_EVICT 0x04u /* Create a bloom filter */ +#define WT_LSM_WORK_FLUSH 0x08u /* Flush a chunk to disk */ +#define WT_LSM_WORK_MERGE 0x10u /* Look for a tree merge */ +#define WT_LSM_WORK_SWITCH 0x20u /* Switch the in-memory chunk */ /* AUTOMATIC FLAG VALUE GENERATION STOP */ +/* Work units that are serviced by general worker threads. */ +#define WT_LSM_WORK_GENERAL_OPS \ + (WT_LSM_WORK_BLOOM | WT_LSM_WORK_DROP | WT_LSM_WORK_ENABLE_EVICT |\ + WT_LSM_WORK_FLUSH | WT_LSM_WORK_SWITCH) + /* * WT_LSM_WORK_UNIT -- * A definition of maintenance that an LSM tree needs done. diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h index 1507e2d07cc..d76560d26e6 100644 --- a/src/third_party/wiredtiger/src/include/misc.h +++ b/src/third_party/wiredtiger/src/include/misc.h @@ -294,16 +294,15 @@ typedef void wt_timestamp_t; __wt_scr_alloc_func(session, size, scratchp, __func__, __LINE__) #define __wt_page_in(session, ref, flags) \ __wt_page_in_func(session, ref, flags, __func__, __LINE__) -#define __wt_page_swap(session, held, want, prev_race, flags) \ - __wt_page_swap_func( \ - session, held, want, prev_race, flags, __func__, __LINE__) +#define __wt_page_swap(session, held, want, flags) \ + __wt_page_swap_func(session, held, want, flags, __func__, __LINE__) #else #define __wt_scr_alloc(session, size, scratchp) \ __wt_scr_alloc_func(session, size, scratchp) #define __wt_page_in(session, ref, flags) \ __wt_page_in_func(session, ref, flags) -#define __wt_page_swap(session, held, want, prev_race, flags) \ - __wt_page_swap_func(session, held, want, prev_race, flags) +#define __wt_page_swap(session, held, want, flags) \ + __wt_page_swap_func(session, held, want, flags) #endif /* Called on unexpected code path: locate the failure. */ diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h index e102d7f5057..cbf572f9a23 100644 --- a/src/third_party/wiredtiger/src/include/session.h +++ b/src/third_party/wiredtiger/src/include/session.h @@ -141,6 +141,8 @@ struct __wt_session_impl { u_int ckpt_handle_next; /* Next empty slot */ size_t ckpt_handle_allocated; /* Bytes allocated */ + uint64_t cache_wait_us; /* Wait time for cache for current operation */ + /* * Operations acting on handles. * diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index 92f28d88e62..1693b9baa82 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -406,6 +406,7 @@ struct __wt_connection_stats { int64_t cache_eviction_maximum_page_size; int64_t cache_eviction_dirty; int64_t cache_eviction_app_dirty; + int64_t cache_timed_out_ops; int64_t cache_read_overflow; int64_t cache_eviction_deepen; int64_t cache_write_lookaside; @@ -663,6 +664,7 @@ struct __wt_connection_stats { int64_t txn_pinned_checkpoint_range; int64_t txn_pinned_snapshot_range; int64_t txn_pinned_timestamp; + int64_t txn_pinned_timestamp_checkpoint; int64_t txn_pinned_timestamp_oldest; int64_t txn_sync; int64_t txn_commit; diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index 32234dca23e..480d31b188e 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -147,7 +147,7 @@ struct __wt_txn_global { volatile bool checkpoint_running; /* Checkpoint running */ volatile uint32_t checkpoint_id; /* Checkpoint's session ID */ WT_TXN_STATE checkpoint_state; /* Checkpoint's txn state */ - WT_TXN *checkpoint_txn; /* Checkpoint's txn structure */ + WT_DECL_TIMESTAMP(checkpoint_timestamp) /* Checkpoint's timestamp */ volatile uint64_t metadata_pinned; /* Oldest ID for metadata */ diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index 9276ca62903..0efc32811e6 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -396,6 +396,60 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session) return (checkpoint_pinned); } +#ifdef HAVE_TIMESTAMPS +/* + * __wt_txn_pinned_timestamp -- + * Get the first timestamp that has to be kept for the current tree. + */ +static inline void +__wt_txn_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *pinned_tsp) +{ + WT_BTREE *btree; + WT_TXN_GLOBAL *txn_global; + wt_timestamp_t checkpoint_ts, pinned_ts; + bool include_checkpoint_txn; + + btree = S2BT_SAFE(session); + txn_global = &S2C(session)->txn_global; + + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + __wt_timestamp_set(&pinned_ts, &txn_global->pinned_timestamp)); + __wt_timestamp_set(pinned_tsp, &pinned_ts); + + /* + * Checkpoint transactions often fall behind ordinary application + * threads. Take special effort to not keep changes pinned in cache if + * they are only required for the checkpoint and it has already seen + * them. + * + * If there is no active checkpoint or this handle is up to date with + * the active checkpoint then it's safe to ignore the checkpoint ID in + * the visibility check. + */ + include_checkpoint_txn = btree == NULL || + (!F_ISSET(btree, WT_BTREE_LOOKASIDE) && + btree->checkpoint_gen != __wt_gen(session, WT_GEN_CHECKPOINT)); + if (!include_checkpoint_txn) + return; + + /* + * The read of the timestamp pinned by a checkpoint needs to be + * carefully ordered: if a checkpoint is starting and we have to use + * the checkpoint timestamp, we take the minimum of it with the oldest + * timestamp, which is what we want. + */ + WT_READ_BARRIER(); + + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + __wt_timestamp_set(&checkpoint_ts, + &txn_global->checkpoint_timestamp)); + + if (!__wt_timestamp_iszero(&checkpoint_ts) && + __wt_timestamp_cmp(&checkpoint_ts, &pinned_ts) < 0) + __wt_timestamp_set(pinned_tsp, &checkpoint_ts); +} +#endif + /* * __txn_visible_all_id -- * Check if a given transaction ID is "globally visible". This is, if @@ -427,8 +481,7 @@ __wt_txn_visible_all( #ifdef HAVE_TIMESTAMPS { - WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global; - int cmp; + wt_timestamp_t pinned_ts; /* Timestamp check. */ if (timestamp == NULL || __wt_timestamp_iszero(timestamp)) @@ -438,20 +491,11 @@ __wt_txn_visible_all( * If no oldest timestamp has been supplied, updates have to stay in * cache until we are shutting down. */ - if (!txn_global->has_pinned_timestamp) + if (!S2C(session)->txn_global.has_pinned_timestamp) return (F_ISSET(S2C(session), WT_CONN_CLOSING)); - WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, - cmp = __wt_timestamp_cmp(timestamp, &txn_global->pinned_timestamp)); - - /* - * We can discard updates with timestamps less than or equal to the - * pinned timestamp. This is different to the situation for - * transaction IDs, because we know that updates with timestamps are - * definitely committed (and in this case, that the transaction ID is - * globally visible). - */ - return (cmp <= 0); + __wt_txn_pinned_timestamp(session, &pinned_ts); + return (__wt_timestamp_cmp(timestamp, &pinned_ts) <= 0); } #else WT_UNUSED(timestamp); diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index 159e9e2cf72..a4ba834d5ef 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -602,6 +602,14 @@ struct __wt_cursor { * * The key must first be set and the record must already exist. * + * Note that reserve works by doing a special update operation that is + * not logged and does not change the value of the record. This update + * is aborted when the enclosing transaction ends regardless of whether + * it commits or rolls back. Given that, reserve can only be used to + * detect conflicts between transactions that execute concurrently. It + * cannot detect all logical conflicts between transactions. For that, + * some update to the record must be committed. + * * @snippet ex_all.c Reserve a record * * On success, the cursor ends positioned at the specified record; to @@ -1639,6 +1647,12 @@ struct __wt_session { * the WT_CURSOR::next (WT_CURSOR::prev) method will iterate from the * beginning (end) of the table. * + * When a range truncate is in progress, and another transaction inserts + * a key into that range, the behavior is not well defined - a conflict + * may be detected or both transactions may be permitted to commit. If + * they do commit, and if there is a crash and recovery runs, the result + * may be different than what was in cache before the crash. + * * @param session the session handle * @param name the URI of the table or file to truncate * @param start optional cursor marking the first record discarded; @@ -2128,6 +2142,10 @@ struct __wt_connection { * thread uses a session from the configured session_max., an integer * between 1 and 20; default \c 2.} * @config{ ),,} + * @config{cache_max_wait_ms, the maximum number of milliseconds an + * application thread will wait for space to be available in cache + * before giving up. Default will wait forever., an integer greater + * than or equal to 0; default \c 0.} * @config{cache_overhead, assume the heap allocator overhead is the * specified percentage\, and adjust the cache usage by that amount (for * example\, if there is 10GB of data in cache\, a percentage of 10 @@ -2179,7 +2197,7 @@ struct __wt_connection { * is a percentage of the cache size if the value is within the range of * 0 to 100 or an absolute size when greater than 100. The value is not * allowed to exceed the \c cache_size. Ignored if set to zero or \c - * in_memory is \c true., an integer between 0 and 10TB; default \c 5.} + * in_memory is \c true., an integer between 0 and 10TB; default \c 1.} * @config{eviction_dirty_target, perform eviction in worker threads * when the cache contains at least this much dirty content. It is a * percentage of the cache size if the value is within the range of 1 to @@ -2708,6 +2726,10 @@ struct __wt_connection { * default value for any sessions created\, and can be overridden in configuring * \c cache_cursors in WT_CONNECTION.open_session., a boolean flag; default \c * true.} + * @config{cache_max_wait_ms, the maximum number of milliseconds an application + * thread will wait for space to be available in cache before giving up. + * Default will wait forever., an integer greater than or equal to 0; default \c + * 0.} * @config{cache_overhead, assume the heap allocator overhead is the specified * percentage\, and adjust the cache usage by that amount (for example\, if * there is 10GB of data in cache\, a percentage of 10 means WiredTiger treats @@ -2808,7 +2830,7 @@ struct __wt_connection { * percentage of the cache size if the value is within the range of 0 to 100 or * an absolute size when greater than 100. The value is not allowed to exceed * the \c cache_size. Ignored if set to zero or \c in_memory is \c true., an - * integer between 0 and 10TB; default \c 5.} + * integer between 0 and 10TB; default \c 1.} * @config{eviction_dirty_target, perform eviction in worker threads when the * cache contains at least this much dirty content. It is a percentage of the * cache size if the value is within the range of 1 to 100 or an absolute size @@ -5103,596 +5125,600 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1091 /*! cache: modified pages evicted by application threads */ #define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1092 +/*! cache: operations timed out waiting for space in cache */ +#define WT_STAT_CONN_CACHE_TIMED_OUT_OPS 1093 /*! cache: overflow pages read into cache */ -#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1093 +#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1094 /*! cache: page split during eviction deepened the tree */ -#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1094 +#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1095 /*! cache: page written requiring lookaside records */ -#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1095 +#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1096 /*! cache: pages currently held in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_INUSE 1096 +#define WT_STAT_CONN_CACHE_PAGES_INUSE 1097 /*! cache: pages evicted because they exceeded the in-memory maximum count */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1097 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1098 /*! * cache: pages evicted because they exceeded the in-memory maximum time * (usecs) */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_TIME 1098 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_TIME 1099 /*! cache: pages evicted because they had chains of deleted items count */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1099 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1100 /*! * cache: pages evicted because they had chains of deleted items time * (usecs) */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE_TIME 1100 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE_TIME 1101 /*! cache: pages evicted by application threads */ -#define WT_STAT_CONN_CACHE_EVICTION_APP 1101 +#define WT_STAT_CONN_CACHE_EVICTION_APP 1102 /*! cache: pages queued for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1102 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1103 /*! cache: pages queued for urgent eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1103 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1104 /*! cache: pages queued for urgent eviction during walk */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1104 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1105 /*! cache: pages read into cache */ -#define WT_STAT_CONN_CACHE_READ 1105 +#define WT_STAT_CONN_CACHE_READ 1106 /*! cache: pages read into cache after truncate */ -#define WT_STAT_CONN_CACHE_READ_DELETED 1106 +#define WT_STAT_CONN_CACHE_READ_DELETED 1107 /*! cache: pages read into cache after truncate in prepare state */ -#define WT_STAT_CONN_CACHE_READ_DELETED_PREPARED 1107 +#define WT_STAT_CONN_CACHE_READ_DELETED_PREPARED 1108 /*! cache: pages read into cache requiring lookaside entries */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1108 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1109 /*! cache: pages read into cache requiring lookaside for checkpoint */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_CHECKPOINT 1109 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_CHECKPOINT 1110 /*! cache: pages read into cache skipping older lookaside entries */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_SKIPPED 1110 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_SKIPPED 1111 /*! * cache: pages read into cache with skipped lookaside entries needed * later */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_DELAY 1111 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_DELAY 1112 /*! * cache: pages read into cache with skipped lookaside entries needed * later by checkpoint */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_DELAY_CHECKPOINT 1112 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_DELAY_CHECKPOINT 1113 /*! cache: pages requested from the cache */ -#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1113 +#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1114 /*! cache: pages seen by eviction walk */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1114 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1115 /*! cache: pages selected for eviction unable to be evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1115 +#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1116 /*! cache: pages walked for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK 1116 +#define WT_STAT_CONN_CACHE_EVICTION_WALK 1117 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1117 +#define WT_STAT_CONN_CACHE_WRITE 1118 /*! cache: pages written requiring in-memory restoration */ -#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1118 +#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1119 /*! cache: percentage overhead */ -#define WT_STAT_CONN_CACHE_OVERHEAD 1119 +#define WT_STAT_CONN_CACHE_OVERHEAD 1120 /*! cache: tracked bytes belonging to internal pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1120 +#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1121 /*! cache: tracked bytes belonging to leaf pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_LEAF 1121 +#define WT_STAT_CONN_CACHE_BYTES_LEAF 1122 /*! cache: tracked dirty bytes in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1122 +#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1123 /*! cache: tracked dirty pages in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1123 +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1124 /*! cache: unmodified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1124 +#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1125 /*! connection: auto adjusting condition resets */ -#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1125 +#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1126 /*! connection: auto adjusting condition wait calls */ -#define WT_STAT_CONN_COND_AUTO_WAIT 1126 +#define WT_STAT_CONN_COND_AUTO_WAIT 1127 /*! connection: detected system time went backwards */ -#define WT_STAT_CONN_TIME_TRAVEL 1127 +#define WT_STAT_CONN_TIME_TRAVEL 1128 /*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1128 +#define WT_STAT_CONN_FILE_OPEN 1129 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1129 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1130 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1130 +#define WT_STAT_CONN_MEMORY_FREE 1131 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1131 +#define WT_STAT_CONN_MEMORY_GROW 1132 /*! connection: pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 1132 +#define WT_STAT_CONN_COND_WAIT 1133 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1133 +#define WT_STAT_CONN_RWLOCK_READ 1134 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1134 +#define WT_STAT_CONN_RWLOCK_WRITE 1135 /*! connection: total fsync I/Os */ -#define WT_STAT_CONN_FSYNC_IO 1135 +#define WT_STAT_CONN_FSYNC_IO 1136 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1136 +#define WT_STAT_CONN_READ_IO 1137 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1137 +#define WT_STAT_CONN_WRITE_IO 1138 /*! cursor: cursor create calls */ -#define WT_STAT_CONN_CURSOR_CREATE 1138 +#define WT_STAT_CONN_CURSOR_CREATE 1139 /*! cursor: cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT 1139 +#define WT_STAT_CONN_CURSOR_INSERT 1140 /*! cursor: cursor modify calls */ -#define WT_STAT_CONN_CURSOR_MODIFY 1140 +#define WT_STAT_CONN_CURSOR_MODIFY 1141 /*! cursor: cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT 1141 +#define WT_STAT_CONN_CURSOR_NEXT 1142 /*! cursor: cursor operation restarted */ -#define WT_STAT_CONN_CURSOR_RESTART 1142 +#define WT_STAT_CONN_CURSOR_RESTART 1143 /*! cursor: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1143 +#define WT_STAT_CONN_CURSOR_PREV 1144 /*! cursor: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1144 +#define WT_STAT_CONN_CURSOR_REMOVE 1145 /*! cursor: cursor reserve calls */ -#define WT_STAT_CONN_CURSOR_RESERVE 1145 +#define WT_STAT_CONN_CURSOR_RESERVE 1146 /*! cursor: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1146 +#define WT_STAT_CONN_CURSOR_RESET 1147 /*! cursor: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1147 +#define WT_STAT_CONN_CURSOR_SEARCH 1148 /*! cursor: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1148 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1149 /*! cursor: cursor sweep buckets */ -#define WT_STAT_CONN_CURSOR_SWEEP_BUCKETS 1149 +#define WT_STAT_CONN_CURSOR_SWEEP_BUCKETS 1150 /*! cursor: cursor sweep cursors closed */ -#define WT_STAT_CONN_CURSOR_SWEEP_CLOSED 1150 +#define WT_STAT_CONN_CURSOR_SWEEP_CLOSED 1151 /*! cursor: cursor sweep cursors examined */ -#define WT_STAT_CONN_CURSOR_SWEEP_EXAMINED 1151 +#define WT_STAT_CONN_CURSOR_SWEEP_EXAMINED 1152 /*! cursor: cursor sweeps */ -#define WT_STAT_CONN_CURSOR_SWEEP 1152 +#define WT_STAT_CONN_CURSOR_SWEEP 1153 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1153 +#define WT_STAT_CONN_CURSOR_UPDATE 1154 /*! cursor: cursors cached on close */ -#define WT_STAT_CONN_CURSOR_CACHE 1154 +#define WT_STAT_CONN_CURSOR_CACHE 1155 /*! cursor: cursors reused from cache */ -#define WT_STAT_CONN_CURSOR_REOPEN 1155 +#define WT_STAT_CONN_CURSOR_REOPEN 1156 /*! cursor: truncate calls */ -#define WT_STAT_CONN_CURSOR_TRUNCATE 1156 +#define WT_STAT_CONN_CURSOR_TRUNCATE 1157 /*! data-handle: connection data handles currently active */ -#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1157 +#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1158 /*! data-handle: connection sweep candidate became referenced */ -#define WT_STAT_CONN_DH_SWEEP_REF 1158 +#define WT_STAT_CONN_DH_SWEEP_REF 1159 /*! data-handle: connection sweep dhandles closed */ -#define WT_STAT_CONN_DH_SWEEP_CLOSE 1159 +#define WT_STAT_CONN_DH_SWEEP_CLOSE 1160 /*! data-handle: connection sweep dhandles removed from hash list */ -#define WT_STAT_CONN_DH_SWEEP_REMOVE 1160 +#define WT_STAT_CONN_DH_SWEEP_REMOVE 1161 /*! data-handle: connection sweep time-of-death sets */ -#define WT_STAT_CONN_DH_SWEEP_TOD 1161 +#define WT_STAT_CONN_DH_SWEEP_TOD 1162 /*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_SWEEPS 1162 +#define WT_STAT_CONN_DH_SWEEPS 1163 /*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1163 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1164 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1164 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1165 /*! lock: checkpoint lock acquisitions */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1165 +#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1166 /*! lock: checkpoint lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1166 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1167 /*! lock: checkpoint lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1167 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1168 /*! * lock: commit timestamp queue lock application thread time waiting for * the dhandle lock (usecs) */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_APPLICATION 1168 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_APPLICATION 1169 /*! * lock: commit timestamp queue lock internal thread time waiting for the * dhandle lock (usecs) */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_INTERNAL 1169 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_INTERNAL 1170 /*! lock: commit timestamp queue read lock acquisitions */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_READ_COUNT 1170 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_READ_COUNT 1171 /*! lock: commit timestamp queue write lock acquisitions */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WRITE_COUNT 1171 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WRITE_COUNT 1172 /*! * lock: dhandle lock application thread time waiting for the dhandle * lock (usecs) */ -#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_APPLICATION 1172 +#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_APPLICATION 1173 /*! * lock: dhandle lock internal thread time waiting for the dhandle lock * (usecs) */ -#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_INTERNAL 1173 +#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_INTERNAL 1174 /*! lock: dhandle read lock acquisitions */ -#define WT_STAT_CONN_LOCK_DHANDLE_READ_COUNT 1174 +#define WT_STAT_CONN_LOCK_DHANDLE_READ_COUNT 1175 /*! lock: dhandle write lock acquisitions */ -#define WT_STAT_CONN_LOCK_DHANDLE_WRITE_COUNT 1175 +#define WT_STAT_CONN_LOCK_DHANDLE_WRITE_COUNT 1176 /*! lock: metadata lock acquisitions */ -#define WT_STAT_CONN_LOCK_METADATA_COUNT 1176 +#define WT_STAT_CONN_LOCK_METADATA_COUNT 1177 /*! lock: metadata lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1177 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1178 /*! lock: metadata lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1178 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1179 /*! * lock: read timestamp queue lock application thread time waiting for * the dhandle lock (usecs) */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_APPLICATION 1179 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_APPLICATION 1180 /*! * lock: read timestamp queue lock internal thread time waiting for the * dhandle lock (usecs) */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_INTERNAL 1180 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_INTERNAL 1181 /*! lock: read timestamp queue read lock acquisitions */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_READ_COUNT 1181 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_READ_COUNT 1182 /*! lock: read timestamp queue write lock acquisitions */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WRITE_COUNT 1182 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WRITE_COUNT 1183 /*! lock: schema lock acquisitions */ -#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1183 +#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1184 /*! lock: schema lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1184 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1185 /*! lock: schema lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1185 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1186 /*! * lock: table lock application thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1186 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1187 /*! * lock: table lock internal thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1187 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1188 /*! lock: table read lock acquisitions */ -#define WT_STAT_CONN_LOCK_TABLE_READ_COUNT 1188 +#define WT_STAT_CONN_LOCK_TABLE_READ_COUNT 1189 /*! lock: table write lock acquisitions */ -#define WT_STAT_CONN_LOCK_TABLE_WRITE_COUNT 1189 +#define WT_STAT_CONN_LOCK_TABLE_WRITE_COUNT 1190 /*! * lock: txn global lock application thread time waiting for the dhandle * lock (usecs) */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_APPLICATION 1190 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_APPLICATION 1191 /*! * lock: txn global lock internal thread time waiting for the dhandle * lock (usecs) */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_INTERNAL 1191 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_INTERNAL 1192 /*! lock: txn global read lock acquisitions */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_READ_COUNT 1192 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_READ_COUNT 1193 /*! lock: txn global write lock acquisitions */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WRITE_COUNT 1193 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WRITE_COUNT 1194 /*! log: busy returns attempting to switch slots */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1194 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1195 /*! log: force archive time sleeping (usecs) */ -#define WT_STAT_CONN_LOG_FORCE_ARCHIVE_SLEEP 1195 +#define WT_STAT_CONN_LOG_FORCE_ARCHIVE_SLEEP 1196 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1196 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1197 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1197 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1198 /*! log: log files manually zero-filled */ -#define WT_STAT_CONN_LOG_ZERO_FILLS 1198 +#define WT_STAT_CONN_LOG_ZERO_FILLS 1199 /*! log: log flush operations */ -#define WT_STAT_CONN_LOG_FLUSH 1199 +#define WT_STAT_CONN_LOG_FLUSH 1200 /*! log: log force write operations */ -#define WT_STAT_CONN_LOG_FORCE_WRITE 1200 +#define WT_STAT_CONN_LOG_FORCE_WRITE 1201 /*! log: log force write operations skipped */ -#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1201 +#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1202 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1202 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1203 /*! log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1203 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1204 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1204 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1205 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1205 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1206 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1206 +#define WT_STAT_CONN_LOG_SCANS 1207 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1207 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1208 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1208 +#define WT_STAT_CONN_LOG_WRITE_LSN 1209 /*! log: log server thread write LSN walk skipped */ -#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1209 +#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1210 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1210 +#define WT_STAT_CONN_LOG_SYNC 1211 /*! log: log sync time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DURATION 1211 +#define WT_STAT_CONN_LOG_SYNC_DURATION 1212 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1212 +#define WT_STAT_CONN_LOG_SYNC_DIR 1213 /*! log: log sync_dir time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1213 +#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1214 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1214 +#define WT_STAT_CONN_LOG_WRITES 1215 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1215 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1216 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1216 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1217 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1217 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1218 /*! log: pre-allocated log files not ready and missed */ -#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1218 +#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1219 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1219 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1220 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1220 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1221 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1221 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1222 /*! log: slot close lost race */ -#define WT_STAT_CONN_LOG_SLOT_CLOSE_RACE 1222 +#define WT_STAT_CONN_LOG_SLOT_CLOSE_RACE 1223 /*! log: slot close unbuffered waits */ -#define WT_STAT_CONN_LOG_SLOT_CLOSE_UNBUF 1223 +#define WT_STAT_CONN_LOG_SLOT_CLOSE_UNBUF 1224 /*! log: slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1224 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1225 /*! log: slot join atomic update races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1225 +#define WT_STAT_CONN_LOG_SLOT_RACES 1226 /*! log: slot join calls atomic updates raced */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_RACE 1226 +#define WT_STAT_CONN_LOG_SLOT_YIELD_RACE 1227 /*! log: slot join calls did not yield */ -#define WT_STAT_CONN_LOG_SLOT_IMMEDIATE 1227 +#define WT_STAT_CONN_LOG_SLOT_IMMEDIATE 1228 /*! log: slot join calls found active slot closed */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_CLOSE 1228 +#define WT_STAT_CONN_LOG_SLOT_YIELD_CLOSE 1229 /*! log: slot join calls slept */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_SLEEP 1229 +#define WT_STAT_CONN_LOG_SLOT_YIELD_SLEEP 1230 /*! log: slot join calls yielded */ -#define WT_STAT_CONN_LOG_SLOT_YIELD 1230 +#define WT_STAT_CONN_LOG_SLOT_YIELD 1231 /*! log: slot join found active slot closed */ -#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1231 +#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1232 /*! log: slot joins yield time (usecs) */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_DURATION 1232 +#define WT_STAT_CONN_LOG_SLOT_YIELD_DURATION 1233 /*! log: slot transitions unable to find free slot */ -#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1233 +#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1234 /*! log: slot unbuffered writes */ -#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1234 +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1235 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1235 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1236 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1236 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1237 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1237 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1238 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1238 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1239 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1239 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1240 /*! perf: file system read latency histogram (bucket 1) - 10-49ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT50 1240 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT50 1241 /*! perf: file system read latency histogram (bucket 2) - 50-99ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT100 1241 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT100 1242 /*! perf: file system read latency histogram (bucket 3) - 100-249ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT250 1242 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT250 1243 /*! perf: file system read latency histogram (bucket 4) - 250-499ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT500 1243 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT500 1244 /*! perf: file system read latency histogram (bucket 5) - 500-999ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT1000 1244 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT1000 1245 /*! perf: file system read latency histogram (bucket 6) - 1000ms+ */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_GT1000 1245 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_GT1000 1246 /*! perf: file system write latency histogram (bucket 1) - 10-49ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT50 1246 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT50 1247 /*! perf: file system write latency histogram (bucket 2) - 50-99ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT100 1247 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT100 1248 /*! perf: file system write latency histogram (bucket 3) - 100-249ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT250 1248 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT250 1249 /*! perf: file system write latency histogram (bucket 4) - 250-499ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT500 1249 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT500 1250 /*! perf: file system write latency histogram (bucket 5) - 500-999ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT1000 1250 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT1000 1251 /*! perf: file system write latency histogram (bucket 6) - 1000ms+ */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_GT1000 1251 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_GT1000 1252 /*! perf: operation read latency histogram (bucket 1) - 100-249us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT250 1252 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT250 1253 /*! perf: operation read latency histogram (bucket 2) - 250-499us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT500 1253 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT500 1254 /*! perf: operation read latency histogram (bucket 3) - 500-999us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT1000 1254 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT1000 1255 /*! perf: operation read latency histogram (bucket 4) - 1000-9999us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT10000 1255 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT10000 1256 /*! perf: operation read latency histogram (bucket 5) - 10000us+ */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_GT10000 1256 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_GT10000 1257 /*! perf: operation write latency histogram (bucket 1) - 100-249us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT250 1257 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT250 1258 /*! perf: operation write latency histogram (bucket 2) - 250-499us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT500 1258 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT500 1259 /*! perf: operation write latency histogram (bucket 3) - 500-999us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT1000 1259 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT1000 1260 /*! perf: operation write latency histogram (bucket 4) - 1000-9999us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT10000 1260 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT10000 1261 /*! perf: operation write latency histogram (bucket 5) - 10000us+ */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_GT10000 1261 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_GT10000 1262 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1262 +#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1263 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1263 +#define WT_STAT_CONN_REC_PAGES 1264 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1264 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1265 /*! reconciliation: pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE 1265 +#define WT_STAT_CONN_REC_PAGE_DELETE 1266 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1266 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1267 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1267 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1268 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1268 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1269 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1269 +#define WT_STAT_CONN_SESSION_OPEN 1270 /*! session: table alter failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1270 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1271 /*! session: table alter successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1271 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1272 /*! session: table alter unchanged and skipped */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1272 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1273 /*! session: table compact failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1273 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1274 /*! session: table compact successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1274 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1275 /*! session: table create failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1275 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1276 /*! session: table create successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1276 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1277 /*! session: table drop failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1277 +#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1278 /*! session: table drop successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1278 +#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1279 /*! session: table rebalance failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1279 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1280 /*! session: table rebalance successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1280 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1281 /*! session: table rename failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1281 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1282 /*! session: table rename successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1282 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1283 /*! session: table salvage failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1283 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1284 /*! session: table salvage successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1284 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1285 /*! session: table truncate failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1285 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1286 /*! session: table truncate successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1286 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1287 /*! session: table verify failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1287 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1288 /*! session: table verify successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1288 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1289 /*! thread-state: active filesystem fsync calls */ -#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1289 +#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1290 /*! thread-state: active filesystem read calls */ -#define WT_STAT_CONN_THREAD_READ_ACTIVE 1290 +#define WT_STAT_CONN_THREAD_READ_ACTIVE 1291 /*! thread-state: active filesystem write calls */ -#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1291 +#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1292 /*! thread-yield: application thread time evicting (usecs) */ -#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1292 +#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1293 /*! thread-yield: application thread time waiting for cache (usecs) */ -#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1293 +#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1294 /*! * thread-yield: connection close blocked waiting for transaction state * stabilization */ -#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1294 +#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1295 /*! thread-yield: connection close yielded for lsm manager shutdown */ -#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1295 +#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1296 /*! thread-yield: data handle lock yielded */ -#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1296 +#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1297 /*! * thread-yield: get reference for page index and slot time sleeping * (usecs) */ -#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1297 +#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1298 /*! thread-yield: log server sync yielded for log write */ -#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1298 +#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1299 /*! thread-yield: page access yielded due to prepare state change */ -#define WT_STAT_CONN_PREPARED_TRANSITION_BLOCKED_PAGE 1299 +#define WT_STAT_CONN_PREPARED_TRANSITION_BLOCKED_PAGE 1300 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1300 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1301 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1301 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1302 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1302 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1303 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1303 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1304 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1304 +#define WT_STAT_CONN_PAGE_SLEEP 1305 /*! * thread-yield: page delete rollback time sleeping for state change * (usecs) */ -#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1305 +#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1306 /*! thread-yield: page reconciliation yielded due to child modification */ -#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1306 +#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1307 /*! transaction: commit timestamp queue insert to empty */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_EMPTY 1307 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_EMPTY 1308 /*! transaction: commit timestamp queue inserts to tail */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_TAIL 1308 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_TAIL 1309 /*! transaction: commit timestamp queue inserts total */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1309 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1310 /*! transaction: commit timestamp queue length */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1310 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1311 /*! transaction: number of named snapshots created */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1311 +#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1312 /*! transaction: number of named snapshots dropped */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1312 +#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1313 /*! transaction: prepared transactions */ -#define WT_STAT_CONN_TXN_PREPARE 1313 +#define WT_STAT_CONN_TXN_PREPARE 1314 /*! transaction: prepared transactions committed */ -#define WT_STAT_CONN_TXN_PREPARE_COMMIT 1314 +#define WT_STAT_CONN_TXN_PREPARE_COMMIT 1315 /*! transaction: prepared transactions currently active */ -#define WT_STAT_CONN_TXN_PREPARE_ACTIVE 1315 +#define WT_STAT_CONN_TXN_PREPARE_ACTIVE 1316 /*! transaction: prepared transactions rolled back */ -#define WT_STAT_CONN_TXN_PREPARE_ROLLBACK 1316 +#define WT_STAT_CONN_TXN_PREPARE_ROLLBACK 1317 /*! transaction: query timestamp calls */ -#define WT_STAT_CONN_TXN_QUERY_TS 1317 +#define WT_STAT_CONN_TXN_QUERY_TS 1318 /*! transaction: read timestamp queue insert to empty */ -#define WT_STAT_CONN_TXN_READ_QUEUE_EMPTY 1318 +#define WT_STAT_CONN_TXN_READ_QUEUE_EMPTY 1319 /*! transaction: read timestamp queue inserts to head */ -#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1319 +#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1320 /*! transaction: read timestamp queue inserts total */ -#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1320 +#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1321 /*! transaction: read timestamp queue length */ -#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1321 +#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1322 /*! transaction: rollback to stable calls */ -#define WT_STAT_CONN_TXN_ROLLBACK_TO_STABLE 1322 +#define WT_STAT_CONN_TXN_ROLLBACK_TO_STABLE 1323 /*! transaction: rollback to stable updates aborted */ -#define WT_STAT_CONN_TXN_ROLLBACK_UPD_ABORTED 1323 +#define WT_STAT_CONN_TXN_ROLLBACK_UPD_ABORTED 1324 /*! transaction: rollback to stable updates removed from lookaside */ -#define WT_STAT_CONN_TXN_ROLLBACK_LAS_REMOVED 1324 +#define WT_STAT_CONN_TXN_ROLLBACK_LAS_REMOVED 1325 /*! transaction: set timestamp calls */ -#define WT_STAT_CONN_TXN_SET_TS 1325 +#define WT_STAT_CONN_TXN_SET_TS 1326 /*! transaction: set timestamp commit calls */ -#define WT_STAT_CONN_TXN_SET_TS_COMMIT 1326 +#define WT_STAT_CONN_TXN_SET_TS_COMMIT 1327 /*! transaction: set timestamp commit updates */ -#define WT_STAT_CONN_TXN_SET_TS_COMMIT_UPD 1327 +#define WT_STAT_CONN_TXN_SET_TS_COMMIT_UPD 1328 /*! transaction: set timestamp oldest calls */ -#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1328 +#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1329 /*! transaction: set timestamp oldest updates */ -#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1329 +#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1330 /*! transaction: set timestamp stable calls */ -#define WT_STAT_CONN_TXN_SET_TS_STABLE 1330 +#define WT_STAT_CONN_TXN_SET_TS_STABLE 1331 /*! transaction: set timestamp stable updates */ -#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1331 +#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1332 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1332 +#define WT_STAT_CONN_TXN_BEGIN 1333 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1333 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1334 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1334 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1335 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1335 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1336 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1336 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1337 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1337 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1338 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1338 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1339 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1339 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1340 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1340 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1341 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1341 +#define WT_STAT_CONN_TXN_CHECKPOINT 1342 /*! * transaction: transaction checkpoints skipped because database was * clean */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1342 +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1343 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1343 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1344 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1344 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1345 /*! * transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1345 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1346 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1346 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1347 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1347 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1348 /*! * transaction: transaction range of IDs currently pinned by named * snapshots */ -#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1348 +#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1349 /*! transaction: transaction range of timestamps currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1349 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1350 +/*! transaction: transaction range of timestamps pinned by a checkpoint */ +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1351 /*! * transaction: transaction range of timestamps pinned by the oldest * timestamp */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1350 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1352 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1351 +#define WT_STAT_CONN_TXN_SYNC 1353 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1352 +#define WT_STAT_CONN_TXN_COMMIT 1354 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1353 +#define WT_STAT_CONN_TXN_ROLLBACK 1355 /*! transaction: update conflicts */ -#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1354 +#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1356 /*! * @} diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c index 4d9f6f92832..13d7d857a04 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c @@ -265,6 +265,12 @@ open: WT_WITH_SCHEMA_LOCK(session, } if (!F_ISSET(clsm, WT_CLSM_ACTIVE)) { + /* + * Opening this LSM cursor has opened a number of btree + * cursors, ensure other code doesn't think this is the first + * cursor in a session. + */ + ++session->ncursors; WT_RET(__cursor_enter(session)); F_SET(clsm, WT_CLSM_ACTIVE); } @@ -284,6 +290,7 @@ __clsm_leave(WT_CURSOR_LSM *clsm) session = (WT_SESSION_IMPL *)clsm->iface.session; if (F_ISSET(clsm, WT_CLSM_ACTIVE)) { + --session->ncursors; __cursor_leave(session); F_CLR(clsm, WT_CLSM_ACTIVE); } @@ -365,12 +372,17 @@ __clsm_deleted_decode(WT_CURSOR_LSM *clsm, WT_ITEM *value) * Close any btree cursors that are not needed. */ static int -__clsm_close_cursors(WT_CURSOR_LSM *clsm, u_int start, u_int end) +__clsm_close_cursors( + WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, u_int start, u_int end) { WT_BLOOM *bloom; WT_CURSOR *c; u_int i; + __wt_verbose(session, WT_VERB_LSM, + "LSM closing cursor session(%p):clsm(%p), start: %u, end: %u", + (void *)session, (void *)clsm, start, end); + if (clsm->chunks == NULL || clsm->nchunks == 0) return (0); @@ -609,7 +621,7 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { saved_gen = lsm_tree->dsk_gen; locked = false; __wt_lsm_tree_readunlock(session, lsm_tree); - WT_ERR(__clsm_close_cursors( + WT_ERR(__clsm_close_cursors(session, clsm, close_range_start, close_range_end)); __wt_lsm_tree_readlock(session, lsm_tree); locked = true; @@ -626,6 +638,10 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { clsm->nchunks = nchunks; /* Open the cursors for chunks that have changed. */ + __wt_verbose(session, WT_VERB_LSM, + "LSM opening cursor session(%p):clsm(%p)%s, chunks: %u, good: %u", + (void *)session, (void *)clsm, + update ? ", update" : "", nchunks, ngood); for (i = ngood; i != nchunks; i++) { chunk = lsm_tree->chunk[i + start_chunk]; /* Copy the maximum transaction ID. */ @@ -1736,7 +1752,7 @@ __wt_clsm_close(WT_CURSOR *cursor) */ clsm = (WT_CURSOR_LSM *)cursor; CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL); - WT_TRET(__clsm_close_cursors(clsm, 0, clsm->nchunks)); + WT_TRET(__clsm_close_cursors(session, clsm, 0, clsm->nchunks)); __clsm_free_chunks(session, clsm); /* In case we were somehow left positioned, clear that. */ diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c index 40ff5fc0b26..1a5c60344bc 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_manager.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c @@ -72,11 +72,7 @@ __lsm_general_worker_start(WT_SESSION_IMPL *session) worker_args->type = WT_LSM_WORK_DROP | WT_LSM_WORK_SWITCH; else { - worker_args->type = - WT_LSM_WORK_BLOOM | - WT_LSM_WORK_DROP | - WT_LSM_WORK_FLUSH | - WT_LSM_WORK_SWITCH; + worker_args->type = WT_LSM_WORK_GENERAL_OPS; /* * Only allow half of the threads to run merges to * avoid all all workers getting stuck in long-running @@ -422,9 +418,10 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) fillms = 10000; /* * If the tree appears to not be triggering enough - * LSM maintenance, help it out. Additional work units - * don't hurt, and can be necessary if some work - * units aren't completed for some reason. + * LSM maintenance, help it out. Some types of + * additional work units don't hurt, and can be + * necessary if some work units aren't completed for + * some reason. * If the tree hasn't been modified, and there are * more than 1 chunks - try to get the tree smaller * so queries run faster. diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c index 6f18f4fb152..a283670eba6 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c @@ -313,6 +313,37 @@ __wt_lsm_chunk_visible_all( } /* + * __lsm_set_chunk_evictable -- + * Enable eviction in an LSM chunk. + */ +static int +__lsm_set_chunk_evictable( + WT_SESSION_IMPL *session, WT_LSM_CHUNK *chunk, bool need_handle) +{ + WT_BTREE *btree; + WT_DECL_RET; + + if (chunk->evict_enabled != 0) + return (0); + + /* See if we win the race to enable eviction. */ + if (__wt_atomic_cas32(&chunk->evict_enabled, 0, 1)) { + if (need_handle) + WT_RET(__wt_session_get_dhandle( + session, chunk->uri, NULL, NULL, 0)); + btree = session->dhandle->handle; + if (btree->evict_disabled_open) { + btree->evict_disabled_open = false; + __wt_evict_file_exclusive_off(session); + } + + if (need_handle) + WT_TRET(__wt_session_release_dhandle(session)); + } + return (ret); +} + +/* * __lsm_checkpoint_chunk -- * Checkpoint an LSM chunk, separated out to make locking easier. */ @@ -340,7 +371,6 @@ int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk) { - WT_BTREE *btree; WT_DECL_RET; WT_TXN_ISOLATION saved_isolation; bool flush_set, release_dhandle; @@ -375,6 +405,14 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_RET(__wt_txn_update_oldest( session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); if (!__wt_lsm_chunk_visible_all(session, chunk)) { + /* + * If there is cache pressure consider making a chunk evictable + * to avoid the cache getting stuck when history is required. + */ + if (__wt_eviction_needed(session, false, false, NULL)) + WT_ERR(__wt_lsm_manager_push_entry( + session, WT_LSM_WORK_ENABLE_EVICT, 0, lsm_tree)); + __wt_verbose(session, WT_VERB_LSM, "LSM worker %s: running transaction, return", chunk->uri); @@ -446,11 +484,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, * Enable eviction on the live chunk so it doesn't block the cache. * Future reads should direct to the on-disk chunk anyway. */ - btree = session->dhandle->handle; - if (btree->evict_disabled_open) { - btree->evict_disabled_open = false; - __wt_evict_file_exclusive_off(session); - } + WT_ERR(__lsm_set_chunk_evictable(session, chunk, false)); release_dhandle = false; WT_ERR(__wt_session_release_dhandle(session)); @@ -481,6 +515,54 @@ err: if (flush_set) } /* + * __wt_lsm_work_enable_evict -- + * LSM usually pins live chunks in memory - preferring to force them + * out via a checkpoint when they are no longer required. For applications + * that keep data pinned for a long time this can lead to the cache + * being pinned full. This work unit detects that case, and enables + * regular eviction in chunks that can be correctly evicted. + */ +int +__wt_lsm_work_enable_evict(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_DECL_RET; + WT_LSM_CHUNK *chunk; + WT_LSM_WORKER_COOKIE cookie; + u_int i; + + WT_CLEAR(cookie); + + /* Only do this if there is cache pressure */ + if (!__wt_eviction_needed(session, false, false, NULL)) + return (0); + + WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, false)); + + /* + * Turn on eviction in chunks that have had some chance to + * checkpoint if there is cache pressure. + */ + for (i = 0; cookie.nchunks > 2 && i < cookie.nchunks - 2; i++) { + chunk = cookie.chunk_array[i]; + + /* + * Skip if the chunk isn't on disk yet, or if it's still in + * cache for a reason other than transaction visibility. + */ + if (!F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) || + chunk->evict_enabled != 0 || + __wt_lsm_chunk_visible_all(session, chunk)) + continue; + + WT_ERR(__lsm_set_chunk_evictable(session, chunk, true)); + } + +err: __lsm_unpin_chunks(session, &cookie); + __wt_free(session, cookie.chunk_array); + return (ret); +} + +/* * __lsm_bloom_create -- * Create a bloom filter for a chunk of the LSM tree that has been * checkpointed but not yet been merged. diff --git a/src/third_party/wiredtiger/src/lsm/lsm_worker.c b/src/third_party/wiredtiger/src/lsm/lsm_worker.c index 82f72bdf355..8588737f6c3 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_worker.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_worker.c @@ -42,7 +42,9 @@ __wt_lsm_worker_stop(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args) /* * __lsm_worker_general_op -- - * Execute a single bloom, drop or flush work unit. + * Execute a single medium importance maintenance operation that should + * not be super long running. That includes bloom creation, drop or flush + * work unit types. */ static int __lsm_worker_general_op( @@ -55,11 +57,7 @@ __lsm_worker_general_op( *completed = false; - /* - * Return if this thread cannot process a bloom, drop or flush. - */ - if (!FLD_ISSET(cookie->type, - WT_LSM_WORK_BLOOM | WT_LSM_WORK_DROP | WT_LSM_WORK_FLUSH)) + if (!FLD_ISSET(cookie->type, WT_LSM_WORK_GENERAL_OPS)) return (WT_NOTFOUND); if ((ret = __wt_lsm_manager_pop_entry(session, @@ -88,6 +86,8 @@ __lsm_worker_general_op( WT_ERR(__wt_lsm_free_chunks(session, entry->lsm_tree)); else if (entry->type == WT_LSM_WORK_BLOOM) WT_ERR(__wt_lsm_work_bloom(session, entry->lsm_tree)); + else if (entry->type == WT_LSM_WORK_ENABLE_EVICT) + WT_ERR(__wt_lsm_work_enable_evict(session, entry->lsm_tree)); *completed = true; err: __wt_lsm_manager_free_work_unit(session, entry); diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index eb3b0038525..c0ce4c2d235 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -47,10 +47,11 @@ typedef struct { /* Track the page's min/maximum transactions. */ uint64_t max_txn; - uint64_t min_txn_unstable; WT_DECL_TIMESTAMP(max_timestamp) - WT_DECL_TIMESTAMP(max_onpage_timestamp) - WT_DECL_TIMESTAMP(min_saved_timestamp) + + /* Lookaside boundary tracking. */ + uint64_t unstable_txn; + WT_DECL_TIMESTAMP(unstable_timestamp) u_int updates_seen; /* Count of updates seen. */ u_int updates_unstable; /* Count of updates not visible_all. */ @@ -422,14 +423,27 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, return (EBUSY); } + /* Initialize the reconciliation structure for each new run. */ + if ((ret = __rec_init( + session, ref, flags, salvage, &session->reconcile)) != 0) { + WT_PAGE_UNLOCK(session, page); + return (ret); + } + r = session->reconcile; + oldest_id = __wt_txn_oldest_id(session); + + /* + * During eviction, save the transaction state that causes history to + * be pinned, regardless of whether reconciliation succeeds or fails. + * There is usually no point retrying eviction until this state + * changes. + */ if (LF_ISSET(WT_REC_EVICT)) { mod->last_eviction_id = oldest_id; #ifdef HAVE_TIMESTAMPS - WT_WITH_TIMESTAMP_READLOCK(session, - &S2C(session)->txn_global.rwlock, - __wt_timestamp_set(&mod->last_eviction_timestamp, - &S2C(session)->txn_global.pinned_timestamp)); + __wt_txn_pinned_timestamp( + session, &mod->last_eviction_timestamp); #endif mod->last_evict_pass_gen = S2C(session)->cache->evict_pass_gen; } @@ -444,14 +458,6 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, mod->last_oldest_id = oldest_id; #endif - /* Initialize the reconciliation structure for each new run. */ - if ((ret = __rec_init( - session, ref, flags, salvage, &session->reconcile)) != 0) { - WT_PAGE_UNLOCK(session, page); - return (ret); - } - r = session->reconcile; - /* Reconcile the page. */ switch (page->type) { case WT_PAGE_COL_FIX: @@ -495,6 +501,17 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, else WT_TRET(__rec_write_wrapup_err(session, r, page)); +#ifdef HAVE_TIMESTAMPS + /* + * If reconciliation completes successfully, save the stable timestamp. + */ + if (ret == 0 && S2C(session)->txn_global.has_stable_timestamp) + WT_WITH_TIMESTAMP_READLOCK(session, + &S2C(session)->txn_global.rwlock, + __wt_timestamp_set(&mod->last_stable_timestamp, + &S2C(session)->txn_global.stable_timestamp)); +#endif + /* Release the reconciliation lock. */ WT_PAGE_UNLOCK(session, page); @@ -681,7 +698,7 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r) } else { /* * Track the page's maximum transaction ID (used to decide if - * we're likely to be able to evict this page in the future). + * we can evict a clean page and discard its history). */ mod->rec_max_txn = r->max_txn; __wt_timestamp_set(&mod->rec_max_timestamp, &r->max_timestamp); @@ -873,7 +890,6 @@ __rec_init(WT_SESSION_IMPL *session, WT_PAGE *page; WT_RECONCILE *r; WT_TXN_GLOBAL *txn_global; - bool las_skew_oldest; btree = S2BT(session); page = ref->page; @@ -928,27 +944,24 @@ __rec_init(WT_SESSION_IMPL *session, * We usually prefer to skew to newer versions, the logic being that by * the time the next checkpoint runs, it is likely that all the updates * we choose will be stable. However, if checkpointing with a - * timestamp (indicated by a stable_timestamp being set), and the - * timestamp hasn't changed since the last time this page was - * reconciled, skew oldest instead. If a checkpoint is already running, - * the oldest version is more likely to be what it needs. + * timestamp (indicated by a stable_timestamp being set), and there is + * a checkpoint already running, or this page was read with lookaside + * history, or the stable timestamp hasn't changed since last time this + * page was successfully, skew oldest instead. */ - if (__wt_btree_immediately_durable(session)) - las_skew_oldest = false; - else { - WT_ORDERED_READ(las_skew_oldest, - txn_global->has_stable_timestamp); - if (las_skew_oldest) { - las_skew_oldest = (ref->page_las != NULL && - !__wt_txn_visible_all(session, WT_TXN_NONE, - WT_TIMESTAMP_NULL( - &ref->page_las->min_timestamp))) || - btree->checkpoint_gen != - __wt_gen(session, WT_GEN_CHECKPOINT); - } - } - r->las_skew_newest = LF_ISSET(WT_REC_LOOKASIDE) && - LF_ISSET(WT_REC_VISIBLE_ALL) && !las_skew_oldest; + r->las_skew_newest = + LF_ISSET(WT_REC_LOOKASIDE) && LF_ISSET(WT_REC_VISIBLE_ALL); +#ifdef HAVE_TIMESTAMPS + if (r->las_skew_newest && + !__wt_btree_immediately_durable(session) && + txn_global->has_stable_timestamp && + ((btree->checkpoint_gen != __wt_gen(session, WT_GEN_CHECKPOINT) && + txn_global->stable_is_pinned) || + FLD_ISSET(page->modify->restore_state, WT_PAGE_RS_LOOKASIDE) || + __wt_timestamp_cmp(&page->modify->last_stable_timestamp, + &txn_global->stable_timestamp) == 0)) + r->las_skew_newest = false; +#endif /* * When operating on the lookaside table, we should never try @@ -979,10 +992,21 @@ __rec_init(WT_SESSION_IMPL *session, /* Track the page's min/maximum transaction */ r->max_txn = WT_TXN_NONE; - r->min_txn_unstable = WT_TXN_ABORTED; __wt_timestamp_set_zero(&r->max_timestamp); - __wt_timestamp_set_zero(&r->max_onpage_timestamp); - __wt_timestamp_set_inf(&r->min_saved_timestamp); + + /* + * Track the first unstable transaction (when skewing newest this is + * the newest update, otherwise the newest update not on the page). + * This is the boundary between the on-page information and the history + * stored in the lookaside table. + */ + if (r->las_skew_newest) { + r->unstable_txn = WT_TXN_NONE; + __wt_timestamp_set_zero(&r->unstable_timestamp); + } else { + r->unstable_txn = WT_TXN_ABORTED; + __wt_timestamp_set_inf(&r->unstable_timestamp); + } /* Track if updates were used and/or uncommitted. */ r->updates_seen = r->updates_unstable = 0; @@ -1264,7 +1288,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE *first_txn_upd, *first_upd, *upd; wt_timestamp_t *timestampp; size_t upd_memsize; - uint64_t max_txn, min_txn_unstable, txnid; + uint64_t max_txn, txnid; bool all_visible, skipped_birthmark, uncommitted; #ifdef HAVE_TIMESTAMPS @@ -1280,7 +1304,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, first_txn_upd = NULL; upd_memsize = 0; max_txn = WT_TXN_NONE; - min_txn_unstable = WT_TXN_ABORTED; skipped_birthmark = uncommitted = false; /* @@ -1380,13 +1403,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, if (upd->type == WT_UPDATE_BIRTHMARK) skipped_birthmark = true; - /* - * Track minimum transaction ID for unstable updates. - */ - if (txnid != WT_TXN_NONE && - WT_TXNID_LT(txnid, min_txn_unstable)) - min_txn_unstable = txnid; - continue; } @@ -1434,23 +1450,11 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, if (WT_TXNID_LT(r->max_txn, max_txn)) r->max_txn = max_txn; - /* - * Track the oldest unstable transaction in the page. It is used to - * decide whether to or not to read the history during a page read. - */ - if (WT_TXNID_LT(min_txn_unstable, r->min_txn_unstable)) - r->min_txn_unstable = min_txn_unstable; - #ifdef HAVE_TIMESTAMPS /* Update the maximum timestamp. */ if (first_ts_upd != NULL && __wt_timestamp_cmp(&r->max_timestamp, &first_ts_upd->timestamp) < 0) __wt_timestamp_set(&r->max_timestamp, &first_ts_upd->timestamp); - - /* Update the maximum on-page timestamp. */ - if (upd != NULL && - __wt_timestamp_cmp(&upd->timestamp, &r->max_onpage_timestamp) > 0) - __wt_timestamp_set(&r->max_onpage_timestamp, &upd->timestamp); #endif /* @@ -1527,24 +1531,38 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, if (upd_savedp != NULL) *upd_savedp = true; + /* + * Track the first off-page update when saving history in the lookaside + * table. When skewing newest, we want the first (non-aborted) update + * after the one stored on the page. Otherwise, we want the update + * before the on-page update. + */ + if (F_ISSET(r, WT_REC_LOOKASIDE) && r->las_skew_newest) { + if (WT_TXNID_LT(r->unstable_txn, first_upd->txnid)) + r->unstable_txn = first_upd->txnid; #ifdef HAVE_TIMESTAMPS - /* Track the oldest saved timestamp for lookaside. */ - if (F_ISSET(r, WT_REC_LOOKASIDE)) { - /* If no updates had timestamps, we're done. */ - if (first_ts_upd == NULL) - __wt_timestamp_set_zero(&r->min_saved_timestamp); + if (first_ts_upd != NULL && + __wt_timestamp_cmp(&r->unstable_timestamp, + &first_ts_upd->timestamp) < 0) + __wt_timestamp_set(&r->unstable_timestamp, + &first_ts_upd->timestamp); +#endif + } else if (F_ISSET(r, WT_REC_LOOKASIDE)) { for (upd = first_upd; upd != *updp; upd = upd->next) { - if (upd->txnid != WT_TXN_ABORTED && - __wt_timestamp_cmp(&upd->timestamp, - &r->min_saved_timestamp) < 0) - __wt_timestamp_set(&r->min_saved_timestamp, - &upd->timestamp); + if (upd->txnid == WT_TXN_ABORTED) + continue; - WT_ASSERT(session, upd->txnid == WT_TXN_ABORTED || - WT_TXNID_LE(upd->txnid, r->max_txn)); + if (upd->txnid != WT_TXN_NONE && + WT_TXNID_LT(upd->txnid, r->unstable_txn)) + r->unstable_txn = upd->txnid; +#ifdef HAVE_TIMESTAMPS + if (__wt_timestamp_cmp(&upd->timestamp, + &r->unstable_timestamp) < 0) + __wt_timestamp_set(&r->unstable_timestamp, + &upd->timestamp); +#endif } } -#endif check_original_value: /* @@ -3429,16 +3447,15 @@ __rec_split_write_supd(WT_SESSION_IMPL *session, done: if (F_ISSET(r, WT_REC_LOOKASIDE)) { /* Track the oldest lookaside timestamp seen so far. */ - multi->page_las.las_skew_newest = r->las_skew_newest; - multi->page_las.las_max_txn = r->max_txn; - multi->page_las.las_min_txn = r->min_txn_unstable; - WT_ASSERT(session, r->max_txn != WT_TXN_NONE); - WT_ASSERT(session, r->min_txn_unstable != WT_TXN_NONE); + multi->page_las.skew_newest = r->las_skew_newest; + multi->page_las.max_txn = r->max_txn; + multi->page_las.unstable_txn = r->unstable_txn; + WT_ASSERT(session, r->unstable_txn != WT_TXN_NONE); #ifdef HAVE_TIMESTAMPS - __wt_timestamp_set(&multi->page_las.min_timestamp, - &r->min_saved_timestamp); - __wt_timestamp_set(&multi->page_las.onpage_timestamp, - &r->max_onpage_timestamp); + __wt_timestamp_set(&multi->page_las.max_timestamp, + &r->max_timestamp); + __wt_timestamp_set(&multi->page_las.unstable_timestamp, + &r->unstable_timestamp); #endif } diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 909019b3b24..0d39a5b682e 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -840,6 +840,7 @@ static const char * const __stats_connection_desc[] = { "cache: maximum page size at eviction", "cache: modified pages evicted", "cache: modified pages evicted by application threads", + "cache: operations timed out waiting for space in cache", "cache: overflow pages read into cache", "cache: page split during eviction deepened the tree", "cache: page written requiring lookaside records", @@ -1097,6 +1098,7 @@ static const char * const __stats_connection_desc[] = { "transaction: transaction range of IDs currently pinned by a checkpoint", "transaction: transaction range of IDs currently pinned by named snapshots", "transaction: transaction range of timestamps currently pinned", + "transaction: transaction range of timestamps pinned by a checkpoint", "transaction: transaction range of timestamps pinned by the oldest timestamp", "transaction: transaction sync calls", "transaction: transactions committed", @@ -1237,6 +1239,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) /* not clearing cache_eviction_maximum_page_size */ stats->cache_eviction_dirty = 0; stats->cache_eviction_app_dirty = 0; + stats->cache_timed_out_ops = 0; stats->cache_read_overflow = 0; stats->cache_eviction_deepen = 0; stats->cache_write_lookaside = 0; @@ -1494,6 +1497,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) /* not clearing txn_pinned_checkpoint_range */ /* not clearing txn_pinned_snapshot_range */ /* not clearing txn_pinned_timestamp */ + /* not clearing txn_pinned_timestamp_checkpoint */ /* not clearing txn_pinned_timestamp_oldest */ stats->txn_sync = 0; stats->txn_commit = 0; @@ -1662,6 +1666,7 @@ __wt_stat_connection_aggregate( to->cache_eviction_dirty += WT_STAT_READ(from, cache_eviction_dirty); to->cache_eviction_app_dirty += WT_STAT_READ(from, cache_eviction_app_dirty); + to->cache_timed_out_ops += WT_STAT_READ(from, cache_timed_out_ops); to->cache_read_overflow += WT_STAT_READ(from, cache_read_overflow); to->cache_eviction_deepen += WT_STAT_READ(from, cache_eviction_deepen); @@ -2047,6 +2052,8 @@ __wt_stat_connection_aggregate( to->txn_pinned_snapshot_range += WT_STAT_READ(from, txn_pinned_snapshot_range); to->txn_pinned_timestamp += WT_STAT_READ(from, txn_pinned_timestamp); + to->txn_pinned_timestamp_checkpoint += + WT_STAT_READ(from, txn_pinned_timestamp_checkpoint); to->txn_pinned_timestamp_oldest += WT_STAT_READ(from, txn_pinned_timestamp_oldest); to->txn_sync += WT_STAT_READ(from, txn_sync); diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index cf233ab9a5d..4cb780c0042 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -119,9 +119,11 @@ void __wt_txn_release_snapshot(WT_SESSION_IMPL *session) { WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *txn_state; txn = &session->txn; + txn_global = &S2C(session)->txn_global; txn_state = WT_SESSION_TXN_STATE(session); WT_ASSERT(session, @@ -131,6 +133,14 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session) txn_state->metadata_pinned = txn_state->pinned_id = WT_TXN_NONE; F_CLR(txn, WT_TXN_HAS_SNAPSHOT); + + /* Clear a checkpoint's pinned ID. */ + if (WT_SESSION_IS_CHECKPOINT(session)) { + txn_global->checkpoint_state.pinned_id = WT_TXN_NONE; + __wt_timestamp_set_zero(&txn_global->checkpoint_timestamp); + } + + __wt_txn_clear_read_timestamp(session); } /* @@ -528,8 +538,7 @@ __wt_txn_release(WT_SESSION_IMPL *session) if (WT_SESSION_IS_CHECKPOINT(session)) { WT_ASSERT(session, WT_SESSION_TXN_STATE(session)->id == WT_TXN_NONE); - txn->id = txn_global->checkpoint_state.id = - txn_global->checkpoint_state.pinned_id = WT_TXN_NONE; + txn->id = txn_global->checkpoint_state.id = WT_TXN_NONE; /* * Be extra careful to cleanup everything for checkpoints: once @@ -548,7 +557,6 @@ __wt_txn_release(WT_SESSION_IMPL *session) } __wt_txn_clear_commit_timestamp(session); - __wt_txn_clear_read_timestamp(session); /* Free the scratch buffer allocated for logging. */ __wt_logrec_free(session, &txn->logrec); @@ -1283,12 +1291,24 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session) txn_global->current - txn_global->oldest_id); #if WT_TIMESTAMP_SIZE == 8 + { + WT_DECL_TIMESTAMP(checkpoint_timestamp) + WT_DECL_TIMESTAMP(commit_timestamp) + WT_DECL_TIMESTAMP(pinned_timestamp) + + checkpoint_timestamp = txn_global->checkpoint_timestamp; + commit_timestamp = txn_global->commit_timestamp; + pinned_timestamp = txn_global->pinned_timestamp; + if (checkpoint_timestamp.val != 0 && + checkpoint_timestamp.val < pinned_timestamp.val) + pinned_timestamp = checkpoint_timestamp; WT_STAT_SET(session, stats, txn_pinned_timestamp, - txn_global->commit_timestamp.val - - txn_global->pinned_timestamp.val); + commit_timestamp.val - pinned_timestamp.val); + WT_STAT_SET(session, stats, txn_pinned_timestamp_checkpoint, + commit_timestamp.val - checkpoint_timestamp.val); WT_STAT_SET(session, stats, txn_pinned_timestamp_oldest, - txn_global->commit_timestamp.val - - txn_global->oldest_timestamp.val); + commit_timestamp.val - txn_global->oldest_timestamp.val); + } #endif WT_STAT_SET(session, stats, txn_pinned_snapshot_range, diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index b0ba6735056..ad8351923a0 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -372,12 +372,11 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; - double current_dirty, delta, scrub_min; - uint64_t bytes_written_last, bytes_written_start, bytes_written_total; + double current_dirty, prev_dirty; + uint64_t bytes_written_start, bytes_written_total; uint64_t cache_size, max_write; - uint64_t current_us, stepdown_us, total_ms, work_us; - uint64_t time_last, time_start, time_stop; - bool progress; + uint64_t time_start, time_stop; + uint64_t total_ms; conn = S2C(session); cache = conn->cache; @@ -388,61 +387,41 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) * scrubbing cannot help). */ if (F_ISSET(conn, WT_CONN_CLOSING_TIMESTAMP) || - cache->eviction_checkpoint_target < DBL_EPSILON || - cache->eviction_checkpoint_target >= cache->eviction_dirty_trigger) + cache->eviction_checkpoint_target < DBL_EPSILON) return; - time_last = time_start = __wt_clock(session); - bytes_written_last = 0; + time_start = __wt_clock(session); bytes_written_start = cache->bytes_written; - cache_size = conn->cache_size; + /* * If the cache size is zero or very small, we're done. The cache * size can briefly become zero if we're transitioning to a shared * cache via reconfigure. This avoids potential divide by zero. */ - if (cache_size < 10 * WT_MEGABYTE) + if ((cache_size = conn->cache_size) < 10 * WT_MEGABYTE) return; - /* - * Skip scrubbing if it won't perform at-least some minimum amount of - * work. Scrubbing is supposed to bring down the dirty data to eviction - * checkpoint target before the actual checkpoint starts. Do not perform - * scrubbing if the dirty data to scrub is less than a pre-configured - * size. This size is to an extent based on the configured cache size - * without being too large or too small for large cache sizes. For the - * values chosen, for instance, 100 GB cache will require at-least - * 200 MB of dirty data above eviction checkpoint target, which should - * equate to a scrub phase a few seconds long. That said, the value of - * 0.2% and 500 MB are still somewhat arbitrary. - */ - scrub_min = WT_MIN((0.2 * conn->cache_size) / 100, 500 * WT_MEGABYTE); - if (__wt_cache_dirty_leaf_inuse(cache) < - ((cache->eviction_checkpoint_target * conn->cache_size) / 100) + - scrub_min) + current_dirty = + (100.0 * __wt_cache_dirty_leaf_inuse(cache)) / cache_size; + if (current_dirty <= cache->eviction_checkpoint_target) return; - stepdown_us = 10000; - work_us = 0; - progress = false; - - /* Step down the scrub target (as a percentage) in units of 10MB. */ - delta = WT_MIN(1.0, (100 * 10.0 * WT_MEGABYTE) / cache_size); - - /* - * Start with the scrub target equal to the expected maximum percentage - * of dirty data in cache. - */ - cache->eviction_scrub_limit = cache->eviction_dirty_trigger; - /* Stop if we write as much dirty data as is currently in cache. */ max_write = __wt_cache_dirty_leaf_inuse(cache); - /* Step down the dirty target to the eviction trigger */ + /* Set the dirty trigger to the target value. */ + cache->eviction_scrub_target = cache->eviction_checkpoint_target; + WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0); + + /* Wait while the dirty level is going down. */ for (;;) { + __wt_sleep(0, 100 * WT_THOUSAND); + + prev_dirty = current_dirty; current_dirty = (100.0 * __wt_cache_dirty_leaf_inuse(cache)) / cache_size; - if (current_dirty <= cache->eviction_checkpoint_target) + if (current_dirty <= cache->eviction_checkpoint_target || + current_dirty >= prev_dirty) break; /* @@ -452,63 +431,17 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) if (F_ISSET(cache, WT_CACHE_EVICT_LOOKASIDE)) break; - __wt_sleep(0, stepdown_us / 10); - time_stop = __wt_clock(session); - current_us = WT_CLOCKDIFF_US(time_stop, time_last); - bytes_written_total = - cache->bytes_written - bytes_written_start; - - if (current_dirty > cache->eviction_scrub_limit) { - /* - * We haven't reached the current target. - * - * Don't wait indefinitely: there might be dirty pages - * that can't be evicted. If we can't meet the target, - * give up and start the checkpoint for real. - */ - if (current_us > WT_MAX(WT_MILLION, 10 * stepdown_us) || - bytes_written_total > max_write) - break; - continue; - } - /* - * Estimate how long the next step down of dirty data should - * take. - * - * The calculation here assumes that the system is writing from - * cache as fast as it can, and determines the write throughput - * based on the change in the bytes written from cache since - * the start of the call. We use that to estimate how long it - * will take to step the dirty target down by delta. + * We haven't reached the current target. * - * Take care to avoid dividing by zero. - */ - if (bytes_written_total - bytes_written_last > WT_MEGABYTE && - work_us > 0) { - stepdown_us = (uint64_t)((delta * cache_size / 100) / - ((double)bytes_written_total / work_us)); - stepdown_us = WT_MAX(1, stepdown_us); - if (!progress) - stepdown_us = WT_MIN(stepdown_us, 200000); - progress = true; - - bytes_written_last = bytes_written_total; - } - - work_us += current_us; - - /* - * Smooth out step down: try to limit the impact on - * performance to 10% by waiting once we reach the last - * level. + * Don't wait indefinitely: there might be dirty pages + * that can't be evicted. If we can't meet the target, + * give up and start the checkpoint for real. */ - __wt_sleep(0, 10 * stepdown_us); - cache->eviction_scrub_limit = - WT_MAX(cache->eviction_dirty_target, current_dirty - delta); - WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, - cache->eviction_scrub_limit); - time_last = __wt_clock(session); + bytes_written_total = + cache->bytes_written - bytes_written_start; + if (bytes_written_total > max_write) + break; } time_stop = __wt_clock(session); @@ -681,8 +614,7 @@ __checkpoint_prepare( */ __wt_writelock(session, &txn_global->rwlock); txn_global->checkpoint_state = *txn_state; - txn_global->checkpoint_txn = txn; - txn_global->checkpoint_state.pinned_id = WT_MIN(txn->id, txn->snap_min); + txn_global->checkpoint_state.pinned_id = txn->snap_min; /* * Sanity check that the oldest ID hasn't moved on before we have @@ -724,6 +656,8 @@ __checkpoint_prepare( if (txn_global->has_stable_timestamp) { __wt_timestamp_set(&txn->read_timestamp, &txn_global->stable_timestamp); + __wt_timestamp_set(&txn_global->checkpoint_timestamp, + &txn->read_timestamp); F_SET(txn, WT_TXN_HAS_TS_READ); if (!F_ISSET(conn, WT_CONN_RECOVERING)) __wt_timestamp_set( @@ -975,7 +909,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * Unblock updates -- we can figure out that any updates to clean pages * after this point are too new to be written in the checkpoint. */ - cache->eviction_scrub_limit = 0.0; + cache->eviction_scrub_target = 0.0; WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0); /* Tell logging that we have started a database checkpoint. */ @@ -1125,7 +1059,7 @@ err: /* if (tracking) WT_TRET(__wt_meta_track_off(session, false, failed)); - cache->eviction_scrub_limit = 0.0; + cache->eviction_scrub_target = 0.0; WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0); if (F_ISSET(txn, WT_TXN_RUNNING)) { diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index 35a89eeb072..e01db53fda9 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -275,8 +275,9 @@ __txn_rollback_to_stable_btree_walk( WT_READ_CACHE | WT_READ_LOOKASIDE | WT_READ_NO_EVICT)) == 0 && ref != NULL) { if (ref->page_las != NULL && + ref->page_las->skew_newest && __wt_timestamp_cmp(rollback_timestamp, - &ref->page_las->onpage_timestamp) < 0) + &ref->page_las->unstable_timestamp) < 0) ref->page_las->invalid = true; /* Review deleted page saved to the ref */ diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c index a10ff740df6..64887c9a583 100644 --- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c +++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c @@ -193,6 +193,44 @@ __wt_txn_parse_timestamp(WT_SESSION_IMPL *session, const char *name, } /* + * __txn_get_pinned_timestamp -- + * Calculate the current pinned timestamp. + */ +static int +__txn_get_pinned_timestamp( + WT_SESSION_IMPL *session, wt_timestamp_t *tsp, bool include_checkpoint) +{ + WT_CONNECTION_IMPL *conn; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + + conn = S2C(session); + txn_global = &conn->txn_global; + + if (!txn_global->has_oldest_timestamp) + return (WT_NOTFOUND); + __wt_readlock(session, &txn_global->rwlock); + __wt_timestamp_set(tsp, &txn_global->oldest_timestamp); + + /* Check for a running checkpoint */ + if (include_checkpoint && + !__wt_timestamp_iszero(&txn_global->checkpoint_timestamp) && + __wt_timestamp_cmp(&txn_global->checkpoint_timestamp, tsp) < 0) + __wt_timestamp_set(tsp, &txn_global->checkpoint_timestamp); + __wt_readunlock(session, &txn_global->rwlock); + + /* Look for the oldest ordinary reader. */ + __wt_readlock(session, &txn_global->read_timestamp_rwlock); + txn = TAILQ_FIRST(&txn_global->read_timestamph); + if (txn != NULL && + __wt_timestamp_cmp(&txn->read_timestamp, tsp) < 0) + __wt_timestamp_set(tsp, &txn->read_timestamp); + __wt_readunlock(session, &txn_global->read_timestamp_rwlock); + + return (0); +} + +/* * __txn_global_query_timestamp -- * Query a timestamp. */ @@ -248,26 +286,7 @@ __txn_global_query_timestamp( WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, __wt_timestamp_set(&ts, &txn_global->oldest_timestamp)); } else if (WT_STRING_MATCH("pinned", cval.str, cval.len)) { - if (!txn_global->has_oldest_timestamp) - return (WT_NOTFOUND); - __wt_readlock(session, &txn_global->rwlock); - __wt_timestamp_set(&ts, &txn_global->oldest_timestamp); - - /* Check for a running checkpoint */ - txn = txn_global->checkpoint_txn; - if (txn_global->checkpoint_state.pinned_id != WT_TXN_NONE && - !__wt_timestamp_iszero(&txn->read_timestamp) && - __wt_timestamp_cmp(&txn->read_timestamp, &ts) < 0) - __wt_timestamp_set(&ts, &txn->read_timestamp); - __wt_readunlock(session, &txn_global->rwlock); - - /* Look for the oldest ordinary reader. */ - __wt_readlock(session, &txn_global->read_timestamp_rwlock); - txn = TAILQ_FIRST(&txn_global->read_timestamph); - if (txn != NULL && - __wt_timestamp_cmp(&txn->read_timestamp, &ts) < 0) - __wt_timestamp_set(&ts, &txn->read_timestamp); - __wt_readunlock(session, &txn_global->read_timestamp_rwlock); + WT_RET(__txn_get_pinned_timestamp(session, &ts, true)); } else if (WT_STRING_MATCH("recovery", cval.str, cval.len)) /* Read-only value forever. No lock needed. */ __wt_timestamp_set(&ts, &txn_global->recovery_timestamp); @@ -320,8 +339,6 @@ __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session, bool force) WT_TXN_GLOBAL *txn_global; wt_timestamp_t active_timestamp, last_pinned_timestamp; wt_timestamp_t oldest_timestamp, pinned_timestamp; - const char *query_cfg[] = { WT_CONFIG_BASE(session, - WT_CONNECTION_query_timestamp), "get=pinned", NULL }; txn_global = &S2C(session)->txn_global; @@ -334,8 +351,8 @@ __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session, bool force) &oldest_timestamp, &txn_global->oldest_timestamp)); /* Scan to find the global pinned timestamp. */ - if ((ret = __txn_global_query_timestamp( - session, &active_timestamp, query_cfg)) != 0) + if ((ret = __txn_get_pinned_timestamp( + session, &active_timestamp, false)) != 0) return (ret == WT_NOTFOUND ? 0 : ret); if (__wt_timestamp_cmp(&oldest_timestamp, &active_timestamp) < 0) { @@ -362,6 +379,9 @@ __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session, bool force) txn_global->oldest_is_pinned = __wt_timestamp_cmp( &txn_global->pinned_timestamp, &txn_global->oldest_timestamp) == 0; + txn_global->stable_is_pinned = __wt_timestamp_cmp( + &txn_global->pinned_timestamp, + &txn_global->stable_timestamp) == 0; __wt_verbose_timestamp(session, &pinned_timestamp, "Updated pinned timestamp"); } diff --git a/src/third_party/wiredtiger/test/format/config.h b/src/third_party/wiredtiger/test/format/config.h index 51dc906465a..c398c1a96b2 100644 --- a/src/third_party/wiredtiger/test/format/config.h +++ b/src/third_party/wiredtiger/test/format/config.h @@ -331,41 +331,49 @@ static CONFIG c[] = { C_IGNORE, 0, 0, UINT_MAX, &g.c_timer, NULL }, { "timing_stress_checkpoint", - "configure slow checkpoints", /* 2% */ + "stress checkpoints", /* 2% */ C_BOOL, 2, 0, 0, &g.c_timing_stress_checkpoint, NULL }, { "timing_stress_lookaside_sweep", - "configure slow lookaside sweep", /* 2% */ + "stress lookaside sweep", /* 2% */ C_BOOL, 2, 0, 0, &g.c_timing_stress_lookaside_sweep, NULL }, { "timing_stress_split_1", - "configure slow splits (#1)", /* 2% */ + "stress splits (#1)", /* 2% */ C_BOOL, 2, 0, 0, &g.c_timing_stress_split_1, NULL }, { "timing_stress_split_2", - "configure slow splits (#2)", /* 2% */ + "stress splits (#2)", /* 2% */ C_BOOL, 2, 0, 0, &g.c_timing_stress_split_2, NULL }, { "timing_stress_split_3", - "configure slow splits (#3)", /* 2% */ + "stress splits (#3)", /* 2% */ C_BOOL, 2, 0, 0, &g.c_timing_stress_split_3, NULL }, { "timing_stress_split_4", - "configure slow splits (#4)", /* 2% */ + "stress splits (#4)", /* 2% */ C_BOOL, 2, 0, 0, &g.c_timing_stress_split_4, NULL }, { "timing_stress_split_5", - "configure slow splits (#5)", /* 2% */ + "stress splits (#5)", /* 2% */ C_BOOL, 2, 0, 0, &g.c_timing_stress_split_5, NULL }, { "timing_stress_split_6", - "configure slow splits (#6)", /* 2% */ + "stress splits (#6)", /* 2% */ C_BOOL, 2, 0, 0, &g.c_timing_stress_split_6, NULL }, { "timing_stress_split_7", - "configure slow splits (#7)", /* 2% */ + "stress splits (#7)", /* 2% */ C_BOOL, 2, 0, 0, &g.c_timing_stress_split_7, NULL }, + { "timing_stress_split_8", + "stress splits (#8)", /* 2% */ + C_BOOL, 2, 0, 0, &g.c_timing_stress_split_8, NULL }, + + { "timing_stress_split_9", + "stress splits (#9)", /* 2% */ + C_BOOL, 2, 0, 0, &g.c_timing_stress_split_9, NULL }, + { "transaction_timestamps", /* 10% */ "enable transaction timestamp support", C_BOOL, 10, 0, 0, &g.c_txn_timestamps, NULL }, diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h index 0eca6657dd9..1406d2b3fb5 100644 --- a/src/third_party/wiredtiger/test/format/format.h +++ b/src/third_party/wiredtiger/test/format/format.h @@ -221,6 +221,8 @@ typedef struct { uint32_t c_timing_stress_split_5; uint32_t c_timing_stress_split_6; uint32_t c_timing_stress_split_7; + uint32_t c_timing_stress_split_8; + uint32_t c_timing_stress_split_9; uint32_t c_truncate; uint32_t c_txn_freq; uint32_t c_txn_timestamps; diff --git a/src/third_party/wiredtiger/test/format/lrt.c b/src/third_party/wiredtiger/test/format/lrt.c index 9d99933ef64..31c5de93870 100644 --- a/src/third_party/wiredtiger/test/format/lrt.c +++ b/src/third_party/wiredtiger/test/format/lrt.c @@ -110,8 +110,15 @@ lrt(void *arg) */ testutil_check(session->snapshot(session, "name=test")); __wt_sleep(1, 0); - testutil_check(session->begin_transaction( - session, "snapshot=test")); + /* + * Keep trying to start a new transaction if it's + * timing out - we know there aren't any resources + * pinned so it should succeed eventually. + */ + while ((ret = session->begin_transaction( + session, "snapshot=test")) == WT_CACHE_FULL) + ; + testutil_check(ret); testutil_check(session->snapshot( session, "drop=(all)")); testutil_check(session->commit_transaction( @@ -123,8 +130,10 @@ lrt(void *arg) * positioned. As soon as the cursor loses its position * a new snapshot will be allocated. */ - testutil_check(session->begin_transaction( - session, "isolation=snapshot")); + while ((ret = session->begin_transaction( + session, "snapshot=snapshot")) == WT_CACHE_FULL) + ; + testutil_check(ret); /* Read a record at the end of the table. */ do { diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c index 7d08dbd8bd8..b3f5fd51ce2 100644 --- a/src/third_party/wiredtiger/test/format/ops.c +++ b/src/third_party/wiredtiger/test/format/ops.c @@ -498,6 +498,7 @@ static void begin_transaction(TINFO *tinfo, WT_SESSION *session, u_int *iso_configp) { u_int v; + int ret; const char *config; char config_buf[64]; bool locked; @@ -523,7 +524,15 @@ begin_transaction(TINFO *tinfo, WT_SESSION *session, u_int *iso_configp) } *iso_configp = v; - testutil_check(session->begin_transaction(session, config)); + /* + * Keep trying to start a new transaction if it's timing out - we + * know there aren't any resources pinned so it should succeed + * eventually. + */ + while ((ret = + session->begin_transaction(session, config)) == WT_CACHE_FULL) + ; + testutil_check(ret); if (v == ISOLATION_SNAPSHOT && g.c_txn_timestamps) { /* Avoid starting a new reader when a prepare is in progress. */ diff --git a/src/third_party/wiredtiger/test/format/wts.c b/src/third_party/wiredtiger/test/format/wts.c index dd87adeae56..8040142aa19 100644 --- a/src/third_party/wiredtiger/test/format/wts.c +++ b/src/third_party/wiredtiger/test/format/wts.c @@ -262,6 +262,10 @@ wts_open(const char *home, bool set_api, WT_CONNECTION **connp) CONFIG_APPEND(p, ",split_6"); if (g.c_timing_stress_split_7) CONFIG_APPEND(p, ",split_7"); + if (g.c_timing_stress_split_8) + CONFIG_APPEND(p, ",split_8"); + if (g.c_timing_stress_split_9) + CONFIG_APPEND(p, ",split_9"); CONFIG_APPEND(p, "]"); /* Extensions. */ diff --git a/src/third_party/wiredtiger/test/suite/test_las03.py b/src/third_party/wiredtiger/test/suite/test_las03.py new file mode 100644 index 00000000000..6934bd9a741 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_las03.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2018 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +from helper import copy_wiredtiger_home +import wiredtiger, wttest +from wiredtiger import stat +from wtdataset import SimpleDataSet + +def timestamp_str(t): + return '%x' % t + +# test_las03.py +# Ensure checkpoints don't read too unnecessary lookaside entries. +class test_las03(wttest.WiredTigerTestCase): + # Force a small cache. + def conn_config(self): + return 'cache_size=50MB,statistics=(fast)' + + def get_stat(self, stat): + stat_cursor = self.session.open_cursor('statistics:') + val = stat_cursor[stat][2] + stat_cursor.close() + return val + + def large_updates(self, session, uri, value, ds, nrows, nops): + # Update a large number of records, we'll hang if the lookaside table + # isn't doing its thing. + cursor = session.open_cursor(uri) + for i in range(nrows + 1, nrows + nops + 1): + session.begin_transaction() + cursor[ds.key(i)] = value + session.commit_transaction('commit_timestamp=' + timestamp_str(i)) + cursor.close() + + def test_checkpoint_las_reads(self): + if not wiredtiger.timestamp_build(): + self.skipTest('requires a timestamp build') + + # Create a small table. + uri = "table:test_las03" + nrows = 100 + ds = SimpleDataSet(self, uri, nrows, key_format="S", value_format='u') + ds.populate() + bigvalue = "aaaaa" * 100 + + # Initially load huge data + cursor = self.session.open_cursor(uri) + for i in range(1, 10000): + cursor[ds.key(nrows + i)] = bigvalue + cursor.close() + self.session.checkpoint() + + # Check to see LAS working with old timestamp + bigvalue2 = "ddddd" * 100 + self.conn.set_timestamp('stable_timestamp=' + timestamp_str(1)) + las_writes_start = self.get_stat(stat.conn.cache_write_lookaside) + self.large_updates(self.session, uri, bigvalue2, ds, nrows, 10000) + + # If the test sizing is correct, the history will overflow the cache + self.session.checkpoint() + las_writes = self.get_stat(stat.conn.cache_write_lookaside) - las_writes_start + self.assertGreaterEqual(las_writes, 0) + + for ts in range(2, 4): + self.conn.set_timestamp('stable_timestamp=' + timestamp_str(ts)) + + # Now just update one record and checkpoint again + self.large_updates(self.session, uri, bigvalue2, ds, nrows, 1) + + las_reads_start = self.get_stat(stat.conn.cache_read_lookaside) + self.session.checkpoint() + las_reads = self.get_stat(stat.conn.cache_read_lookaside) - las_reads_start + + # Since we're dealing with eviction concurrent with checkpoints + # and skewing is controlled by a heuristic, we can't put too tight + # a bound on this. + self.assertLessEqual(las_reads, 100) + +if __name__ == '__main__': + wttest.run() |