author    Susan LoVerso <sue@wiredtiger.com>  2015-11-18 14:52:02 -0500
committer Susan LoVerso <sue@wiredtiger.com>  2015-11-18 14:52:02 -0500
commit    6caf4c5674139482520753c04db1e1b779f3ac7c (patch)
tree      129e2914d85c2c7d3bb2c0174f0678fdfe7e47c8
parent    e95bff1310097caef190dbe8210ee3f59b7681ac (diff)
parent    e2f11301fe0f877225dbd102aba458b887cd40ee (diff)
Merge branch 'develop' into trunc-stats

Conflicts:
	src/include/wiredtiger.in
-rw-r--r--  bench/wtperf/wtperf.c             14
-rw-r--r--  dist/s_define.list                 5
-rw-r--r--  dist/s_string.ok                   1
-rw-r--r--  dist/stat_data.py                  8
-rw-r--r--  src/btree/bt_handle.c             17
-rw-r--r--  src/btree/bt_read.c                6
-rw-r--r--  src/btree/bt_split.c            1800
-rw-r--r--  src/btree/bt_sync.c                2
-rw-r--r--  src/btree/col_srch.c              33
-rw-r--r--  src/btree/row_srch.c              89
-rw-r--r--  src/conn/conn_cache_pool.c         2
-rw-r--r--  src/conn/conn_ckpt.c               2
-rw-r--r--  src/conn/conn_log.c                4
-rw-r--r--  src/conn/conn_stat.c               2
-rw-r--r--  src/conn/conn_sweep.c             33
-rw-r--r--  src/docs/upgrading.dox             7
-rw-r--r--  src/evict/evict_lru.c             12
-rw-r--r--  src/evict/evict_page.c            14
-rw-r--r--  src/include/btmem.h               17
-rw-r--r--  src/include/btree.i               48
-rw-r--r--  src/include/connection.h          14
-rw-r--r--  src/include/extern.h               3
-rw-r--r--  src/include/misc.h                 1
-rw-r--r--  src/include/misc.i                16
-rw-r--r--  src/include/mutex.i                4
-rw-r--r--  src/include/os.h                  11
-rw-r--r--  src/include/stat.h                 6
-rw-r--r--  src/include/wiredtiger.in        336
-rw-r--r--  src/log/log.c                      2
-rw-r--r--  src/log/log_slot.c                 2
-rw-r--r--  src/lsm/lsm_cursor.c               6
-rw-r--r--  src/lsm/lsm_manager.c              3
-rw-r--r--  src/lsm/lsm_merge.c                4
-rw-r--r--  src/lsm/lsm_tree.c                22
-rw-r--r--  src/os_posix/os_mtx_cond.c         4
-rw-r--r--  src/os_posix/os_mtx_rw.c           4
-rw-r--r--  src/os_posix/os_sleep.c            4
-rw-r--r--  src/os_posix/os_time.c            18
-rw-r--r--  src/os_win/os_sleep.c              6
-rw-r--r--  src/os_win/os_time.c              16
-rw-r--r--  src/reconcile/rec_write.c          2
-rw-r--r--  src/session/session_api.c          2
-rw-r--r--  src/session/session_compact.c      3
-rw-r--r--  src/session/session_dhandle.c      5
-rw-r--r--  src/support/err.c                  3
-rw-r--r--  src/support/stat.c                26
-rw-r--r--  src/txn/txn_ckpt.c                 4
-rw-r--r--  test/format/bulk.c                24
-rw-r--r--  test/format/config.c              57
-rw-r--r--  test/format/config.h              32
-rw-r--r--  test/format/format.h               1
-rw-r--r--  test/format/wts.c                 11

52 files changed, 1660 insertions(+), 1108 deletions(-)
diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c
index 44aff59963c..9ac96862fa1 100644
--- a/bench/wtperf/wtperf.c
+++ b/bench/wtperf/wtperf.c
@@ -653,7 +653,7 @@ op_err: if (ret == WT_ROLLBACK && ops_per_txn != 0) {
goto err;
}
++trk->latency_ops;
- usecs = ns_to_us(WT_TIMEDIFF(stop, start));
+ usecs = WT_TIMEDIFF_US(stop, start);
track_operation(trk, usecs);
}
/* Increment operation count */
@@ -936,7 +936,7 @@ populate_thread(void *arg)
goto err;
}
++trk->latency_ops;
- usecs = ns_to_us(WT_TIMEDIFF(stop, start));
+ usecs = WT_TIMEDIFF_US(stop, start);
track_operation(trk, usecs);
}
++thread->insert.ops; /* Same as trk->ops */
@@ -1068,7 +1068,7 @@ populate_async(void *arg)
goto err;
}
++trk->latency_ops;
- usecs = ns_to_us(WT_TIMEDIFF(stop, start));
+ usecs = WT_TIMEDIFF_US(stop, start);
track_operation(trk, usecs);
}
if ((ret = session->close(session, NULL)) != 0) {
@@ -1386,7 +1386,7 @@ execute_populate(CONFIG *cfg)
}
lprintf(cfg, 0, 1, "Finished load of %" PRIu32 " items", cfg->icount);
- msecs = ns_to_ms(WT_TIMEDIFF(stop, start));
+ msecs = WT_TIMEDIFF_MS(stop, start);
/*
* This is needed as the divisions will fail if the insert takes no time
@@ -1444,7 +1444,7 @@ execute_populate(CONFIG *cfg)
}
lprintf(cfg, 0, 1,
"Compact completed in %" PRIu64 " seconds",
- (uint64_t)(ns_to_sec(WT_TIMEDIFF(stop, start))));
+ (uint64_t)(WT_TIMEDIFF_SEC(stop, start)));
assert(tables == 0);
}
return (0);
@@ -2423,7 +2423,7 @@ worker_throttle(int64_t throttle, int64_t *ops, struct timespec *interval)
* If we did enough operations in less than a second, sleep for
* the rest of the second.
*/
- usecs_to_complete = ns_to_us(WT_TIMEDIFF(now, *interval));
+ usecs_to_complete = WT_TIMEDIFF_US(now, *interval);
if (usecs_to_complete < USEC_PER_SEC)
(void)usleep((useconds_t)(USEC_PER_SEC - usecs_to_complete));
@@ -2457,7 +2457,7 @@ drop_all_tables(CONFIG *cfg)
}
}
(void)__wt_epoch(NULL, &stop);
- msecs = ns_to_ms(WT_TIMEDIFF(stop, start));
+ msecs = WT_TIMEDIFF_MS(stop, start);
lprintf(cfg, 0, 1,
"Executed %" PRIu32 " drop operations average time %" PRIu64 "ms",
cfg->table_count, msecs / cfg->table_count);
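
Note: the WT_TIMEDIFF_US/WT_TIMEDIFF_MS/WT_TIMEDIFF_SEC macros adopted above
replace the old two-step ns_to_*(WT_TIMEDIFF(...)) pattern. Their definitions
land in src/include/os.h (changed by this merge but not shown here); a
plausible sketch, assuming struct timespec operands and WT_THOUSAND-style
constants for the unit conversions:

	#define	WT_TIMEDIFF_NS(end, begin)				\
		(WT_BILLION * (uint64_t)((end).tv_sec - (begin).tv_sec) + \
		    (uint64_t)(end).tv_nsec - (uint64_t)(begin).tv_nsec)
	#define	WT_TIMEDIFF_US(end, begin)				\
		(WT_TIMEDIFF_NS((end), (begin)) / WT_THOUSAND)
	#define	WT_TIMEDIFF_MS(end, begin)				\
		(WT_TIMEDIFF_NS((end), (begin)) / WT_MILLION)
	#define	WT_TIMEDIFF_SEC(end, begin)				\
		(WT_TIMEDIFF_NS((end), (begin)) / WT_BILLION)

The change is mechanical: each caller becomes a single expression in the
desired unit.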
diff --git a/dist/s_define.list b/dist/s_define.list
index d204a11835b..a2b86610755 100644
--- a/dist/s_define.list
+++ b/dist/s_define.list
@@ -36,6 +36,8 @@ WT_READ_BARRIER
WT_REF_SIZE
WT_SESSION_LOCKED_CHECKPOINT
WT_SESSION_LOCKED_TURTLE
+WT_STATS_FIELD_TO_SLOT
+WT_STATS_SLOT_ID
WT_STAT_DECR
WT_STAT_DECRV
WT_STAT_FAST_CONN_DECRV
@@ -45,9 +47,8 @@ WT_STAT_FAST_DECRV
WT_STAT_FAST_INCR
WT_STAT_FAST_INCRV
WT_STAT_FAST_SET
-WT_STATS_FIELD_TO_SLOT
-WT_STATS_SLOT_ID
WT_STAT_WRITE
+WT_TIMEDIFF_US
WT_TRET_ERROR_OK
WT_WITH_LOCK
__F
diff --git a/dist/s_string.ok b/dist/s_string.ok
index c14f4c961e6..21cd360c144 100644
--- a/dist/s_string.ok
+++ b/dist/s_string.ok
@@ -507,6 +507,7 @@ dev
dh
dhandle
dhandles
+difftime
dir
dirlist
dlclose
diff --git a/dist/stat_data.py b/dist/stat_data.py
index 9c49e20fa61..85ecdeb5be5 100644
--- a/dist/stat_data.py
+++ b/dist/stat_data.py
@@ -199,7 +199,9 @@ connection_stats = [
'eviction server populating queue, but not evicting pages'),
CacheStat('cache_eviction_slow',
'eviction server unable to reach eviction goal'),
- CacheStat('cache_eviction_split', 'pages split during eviction'),
+ CacheStat('cache_eviction_split_internal',
+ 'internal pages split during eviction'),
+ CacheStat('cache_eviction_split_leaf', 'leaf pages split during eviction'),
CacheStat('cache_eviction_walk', 'pages walked for eviction'),
CacheStat('cache_eviction_worker_evicting',
'eviction worker thread evicting pages'),
@@ -479,7 +481,9 @@ dsrc_stats = [
'data source pages selected for eviction unable to be evicted'),
CacheStat('cache_eviction_hazard', 'hazard pointer blocked page eviction'),
CacheStat('cache_eviction_internal', 'internal pages evicted'),
- CacheStat('cache_eviction_split', 'pages split during eviction'),
+ CacheStat('cache_eviction_split_internal',
+ 'internal pages split during eviction'),
+ CacheStat('cache_eviction_split_leaf', 'leaf pages split during eviction'),
CacheStat('cache_inmem_split', 'in-memory page splits'),
CacheStat('cache_inmem_splittable',
'in-memory page passed criteria to be split'),
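
The cache_eviction_split statistic is split in two above; once the dist
scripts regenerate the stat definitions, the counters surface through the
usual statistics cursor. A minimal sketch of reading one of them (the
WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL key name is an assumption about
what dist generates into wiredtiger.in):

	#include <inttypes.h>
	#include <stdio.h>
	#include <wiredtiger.h>

	static int
	print_internal_splits(WT_SESSION *session)
	{
		WT_CURSOR *stat;
		const char *desc, *pvalue;
		uint64_t value;
		int ret;

		/* Open a connection-level statistics cursor. */
		if ((ret = session->open_cursor(
		    session, "statistics:", NULL, NULL, &stat)) != 0)
			return (ret);

		/* Position on the new counter and print it. */
		stat->set_key(stat, WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL);
		if ((ret = stat->search(stat)) == 0 &&
		    (ret = stat->get_value(stat, &desc, &pvalue, &value)) == 0)
			printf("%s: %" PRIu64 "\n", desc, value);

		(void)stat->close(stat);
		return (ret);
	}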
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index 3e611a107ab..dbdf94fc1b6 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -643,11 +643,13 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
{
WT_BTREE *btree;
WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
uint64_t cache_size;
uint32_t intl_split_size, leaf_split_size;
const char **cfg;
btree = S2BT(session);
+ conn = S2C(session);
cfg = btree->dhandle->cfg;
/*
@@ -688,8 +690,8 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval));
btree->maxmempage =
WT_MAX((uint64_t)cval.val, 50 * (uint64_t)btree->maxleafpage);
- if (!F_ISSET(S2C(session), WT_CONN_CACHE_POOL)) {
- if ((cache_size = S2C(session)->cache_size) > 0)
+ if (!F_ISSET(conn, WT_CONN_CACHE_POOL)) {
+ if ((cache_size = conn->cache_size) > 0)
btree->maxmempage =
WT_MIN(btree->maxmempage, cache_size / 4);
}
@@ -723,6 +725,17 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
/*
* Get the maximum internal/leaf page key/value sizes.
*
+ * In-memory configuration overrides any key/value sizes, there's no
+ * such thing as an overflow item in an in-memory configuration.
+ */
+ if (F_ISSET(conn, WT_CONN_IN_MEMORY)) {
+ btree->maxintlkey = WT_BTREE_MAX_OBJECT_SIZE;
+ btree->maxleafkey = WT_BTREE_MAX_OBJECT_SIZE;
+ btree->maxleafvalue = WT_BTREE_MAX_OBJECT_SIZE;
+ return (0);
+ }
+
+ /*
* In historic versions of WiredTiger, the maximum internal/leaf page
* key/value sizes were set by the internal_item_max and leaf_item_max
* configuration strings. Look for those strings if we don't find the
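
Note: WT_CONN_IN_MEMORY is tested above to pin the key/value maximums for
in-memory trees, where overflow items cannot exist because there is no
backing file to spill them to. A minimal usage sketch; the exact
configuration string is an assumption, it is not part of this diff:

	#include <wiredtiger.h>

	WT_CONNECTION *conn;

	/*
	 * Sketch: open an in-memory connection (configuration string
	 * assumed). Btrees created under it get the
	 * WT_BTREE_MAX_OBJECT_SIZE limits set by __btree_page_sizes
	 * rather than the configured item maximums.
	 */
	int ret = wiredtiger_open(NULL, NULL, "create,in_memory=true", &conn);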
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index e60f7b3fb02..389ac761c5b 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -586,8 +586,8 @@ skip_evict:
* CPU to no purpose.
*/
if (stalled)
- wait_cnt += 1000;
- else if (++wait_cnt < 1000) {
+ wait_cnt += WT_THOUSAND;
+ else if (++wait_cnt < WT_THOUSAND) {
__wt_yield();
continue;
}
@@ -603,7 +603,7 @@ skip_evict:
if (cache_work)
continue;
}
- sleep_cnt = WT_MIN(sleep_cnt + 1000, 10000);
+ sleep_cnt = WT_MIN(sleep_cnt + WT_THOUSAND, 10000);
WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt);
__wt_sleep(0, sleep_cnt);
}
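
The literal 1000 becomes the named WT_THOUSAND constant (presumably the
one-line addition to src/include/misc.h in this merge). The loop above is a
two-stage backoff: spin-yield for roughly a thousand iterations, then sleep,
adding a millisecond per retry up to a 10ms cap. A condensed restatement of
the pattern (the helper name is hypothetical; only the constants and calls
come from the diff):

	/*
	 * backoff_wait --
	 *	Yield the CPU while a wait is expected to be short, then
	 * back off to sleeping with a linearly growing, capped duration.
	 */
	static void
	backoff_wait(u_int *wait_cntp, uint64_t *sleep_usecsp)
	{
		if (++*wait_cntp < WT_THOUSAND) {
			__wt_yield();
			return;
		}
		*sleep_usecsp = WT_MIN(*sleep_usecsp + WT_THOUSAND, 10000);
		__wt_sleep(0, *sleep_usecsp);	/* seconds, microseconds */
	}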
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 9e45bf10a5c..eaeac683f9a 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -169,54 +169,58 @@ __split_safe_free(WT_SESSION_IMPL *session,
return (__split_stash_add(session, split_gen, p, s));
}
+#ifdef HAVE_DIAGNOSTIC
/*
- * __split_should_deepen --
- * Return if we should deepen the tree.
+ * __split_verify_intl_key_order --
+ * Verify the key order on an internal page after a split, diagnostic only.
*/
-static bool
-__split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref)
+static void
+__split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
{
WT_BTREE *btree;
- WT_PAGE *page;
- WT_PAGE_INDEX *pindex;
+ WT_ITEM *next, _next, *last, _last, *tmp;
+ WT_REF *ref;
+ uint64_t recno;
+ int cmp;
+ bool first;
btree = S2BT(session);
- page = ref->page;
-
- /*
- * Our caller is holding the parent page locked to single-thread splits,
- * which means we can safely look at the page's index without setting a
- * split generation.
- */
- pindex = WT_INTL_INDEX_GET_SAFE(page);
-
- /*
- * Sanity check for a reasonable number of keys on-page keys. Splitting
- * with too few keys leads to excessively deep trees.
- */
- if (pindex->entries < 100)
- return (false);
-
- /*
- * Deepen the tree if the page's memory footprint is larger than the
- * maximum size for a page in memory (presumably putting eviction
- * pressure on the cache).
- */
- if (page->memory_footprint > btree->maxmempage)
- return (true);
- /*
- * Check if the page has enough keys to make it worth splitting. If
- * the number of keys is allowed to grow too large, the cost of
- * splitting into parent pages can become large enough to result
- * in slow operations.
- */
- if (!__wt_ref_is_root(ref) &&
- pindex->entries > btree->split_deepen_min_child)
- return (true);
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ recno = 0; /* Less than any valid record number. */
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ WT_ASSERT(session, ref->key.recno > recno);
+ recno = ref->key.recno;
+ } WT_INTL_FOREACH_END;
+ break;
+ case WT_PAGE_ROW_INT:
+ next = &_next;
+ WT_CLEAR(_next);
+ last = &_last;
+ WT_CLEAR(_last);
- return (false);
+ first = true;
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ __wt_ref_key(page, ref, &next->data, &next->size);
+ if (last->size == 0) {
+ if (first)
+ first = false;
+ else {
+ WT_ASSERT(session, __wt_compare(
+ session, btree->collator, last,
+ next, &cmp) == 0);
+ WT_ASSERT(session, cmp < 0);
+ }
+ }
+ tmp = last;
+ last = next;
+ next = tmp;
+ } WT_INTL_FOREACH_END;
+ break;
+ }
}
+#endif
/*
* __split_ovfl_key_cleanup --
@@ -267,47 +271,58 @@ __split_ovfl_key_cleanup(WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref)
}
/*
- * __split_ref_deepen_move --
- * Move a WT_REF from a parent to a child in service of a split to deepen
- * the tree, including updating the accounting information.
+ * __split_ref_move --
+ * Move a WT_REF from one page to another, including updating accounting
+ * information.
*/
static int
-__split_ref_deepen_move(WT_SESSION_IMPL *session,
- WT_PAGE *parent, WT_REF *ref, size_t *parent_decrp, size_t *child_incrp)
+__split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
+ WT_REF **from_refp, size_t *decrp, WT_REF **to_refp, size_t *incrp)
{
WT_ADDR *addr;
WT_CELL_UNPACK unpack;
WT_DECL_RET;
WT_IKEY *ikey;
+ WT_REF *ref;
size_t size;
void *key;
+ ref = *from_refp;
+
/*
+ * The from-home argument is the page into which the "from" WT_REF may
+ * point, for example, if there's an on-page key the "from" WT_REF
+ * references, it will be on the page "from-home".
+ *
* Instantiate row-store keys, and column- and row-store addresses in
- * the WT_REF structures referenced by a page that's being split (and
- * deepening the tree). The WT_REF structures aren't moving, but the
- * index references are moving from the page we're splitting to a set
- * of child pages, and so we can no longer reference the block image
- * that remains with the page being split.
+ * the WT_REF structures referenced by a page that's being split. The
+ * WT_REF structures aren't moving, but the index references are moving
+ * from the page we're splitting to a set of new pages, and so we can
+ * no longer reference the block image that remains with the page being
+ * split.
*
* No locking is required to update the WT_REF structure because we're
- * the only thread splitting the parent page, and there's no way for
- * readers to race with our updates of single pointers. The changes
- * have to be written before the page goes away, of course, our caller
- * owns that problem.
- *
- * Row-store keys, first.
+ * the only thread splitting the page, and there's no way for readers
+ * to race with our updates of single pointers. The changes have to be
+ * written before the page goes away, of course, our caller owns that
+ * problem.
*/
- if (parent->type == WT_PAGE_ROW_INT) {
+ if (from_home->type == WT_PAGE_ROW_INT) {
+ /*
+ * Row-store keys: if it's not yet instantiated, instantiate it.
+ * If already instantiated, check for overflow cleanup (overflow
+ * keys are always instantiated).
+ */
if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) {
- __wt_ref_key(parent, ref, &key, &size);
+ __wt_ref_key(from_home, ref, &key, &size);
WT_RET(__wt_row_ikey(session, 0, key, size, ref));
ikey = ref->key.ikey;
} else {
- WT_RET(__split_ovfl_key_cleanup(session, parent, ref));
- *parent_decrp += sizeof(WT_IKEY) + ikey->size;
+ WT_RET(
+ __split_ovfl_key_cleanup(session, from_home, ref));
+ *decrp += sizeof(WT_IKEY) + ikey->size;
}
- *child_incrp += sizeof(WT_IKEY) + ikey->size;
+ *incrp += sizeof(WT_IKEY) + ikey->size;
}
/*
@@ -316,7 +331,7 @@ __split_ref_deepen_move(WT_SESSION_IMPL *session,
* get the address from the on-page cell.
*/
addr = ref->addr;
- if (addr != NULL && !__wt_off_page(parent, addr)) {
+ if (addr != NULL && !__wt_off_page(from_home, addr)) {
__wt_cell_unpack((WT_CELL *)ref->addr, &unpack);
WT_RET(__wt_calloc_one(session, &addr));
if ((ret = __wt_strndup(
@@ -330,364 +345,1042 @@ __split_ref_deepen_move(WT_SESSION_IMPL *session,
ref->addr = addr;
}
- /* And finally, the WT_REF itself. */
- WT_MEM_TRANSFER(*parent_decrp, *child_incrp, sizeof(WT_REF));
+ /* And finally, copy the WT_REF pointer itself. */
+ *to_refp = ref;
+ WT_MEM_TRANSFER(*decrp, *incrp, sizeof(WT_REF));
return (0);
}
-#ifdef HAVE_DIAGNOSTIC
/*
- * __split_verify_intl_key_order --
- * Verify the key order on an internal page after a split, diagnostic only.
+ * __split_child_block_evict_and_split --
+ * Ensure the newly created child isn't evicted or split for now.
*/
static void
-__split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
+__split_child_block_evict_and_split(WT_PAGE *child)
{
- WT_BTREE *btree;
- WT_ITEM *next, _next, *last, _last, *tmp;
- WT_REF *ref;
- uint64_t recno;
- int cmp;
- bool first;
+ /*
+ * Once the split is live, newly created internal pages might be evicted
+ * and their WT_REF structures freed. If that happens before all threads
+ * exit the index of the page which previously "owned" the WT_REF, a
+ * thread might see a freed WT_REF. To ensure that doesn't happen, the
+ * newly created page's modify structure has a field with a transaction
+ * ID that's checked before any internal page is evicted. Unfortunately,
+ * we don't know the correct value until we update the original page's
+ * index (we need a transaction ID from after that update), but the act
+ * of updating the original page's index is what allows the eviction to
+ * happen.
+ *
+ * Once the split is live, newly created internal pages might themselves
+ * split. The split itself is not the problem: if a page splits before
+ * we fix up its WT_REF (in other words, a WT_REF we move is then moved
+ * again, before we reset the underlying page's parent reference), it's
+ * OK because the test we use to find a WT_REF and WT_PAGE that require
+ * fixing up is only that the WT_REF points to the wrong parent, not it
+ * points to a specific wrong parent. The problem is our fix up of the
+ * WT_REFs in the created page could race with the subsequent fix of the
+ * same WT_REFs (in a different created page), we'd have to acquire some
+ * lock to prevent that race, and that's going to be difficult at best.
+ *
+ * For now, block eviction and splits in newly created pages until they
+ * have been fixed up.
+ */
+ F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
+}
- btree = S2BT(session);
+/*
+ * __split_ref_move_final --
+ * Finalize the moved WT_REF structures after the split succeeds.
+ */
+static int
+__split_ref_move_final(
+ WT_SESSION_IMPL *session, WT_REF **refp, uint32_t entries)
+{
+ WT_DECL_RET;
+ WT_PAGE *child;
+ WT_REF *ref, *child_ref;
+ uint64_t txn_new_id;
+ uint32_t i;
- switch (page->type) {
- case WT_PAGE_COL_INT:
- recno = 0; /* Less than any valid record number. */
- WT_INTL_FOREACH_BEGIN(session, page, ref) {
- WT_ASSERT(session, ref->key.recno > recno);
- recno = ref->key.recno;
- } WT_INTL_FOREACH_END;
- break;
- case WT_PAGE_ROW_INT:
- next = &_next;
- WT_CLEAR(_next);
- last = &_last;
- WT_CLEAR(_last);
+ /*
+ * When creating new internal pages as part of a split, we set a field
+ * in those pages' modify structure to prevent them from being evicted
+ * until all threads are known to have exited the index of the page that
+ * previously "owned" the WT_REF. Set that field to a safe value.
+ */
+ txn_new_id = __wt_txn_new_id(session);
- first = true;
- WT_INTL_FOREACH_BEGIN(session, page, ref) {
- __wt_ref_key(page, ref, &next->data, &next->size);
- if (last->size == 0) {
- if (first)
- first = false;
- else {
- WT_ASSERT(session, __wt_compare(
- session, btree->collator, last,
- next, &cmp) == 0);
- WT_ASSERT(session, cmp < 0);
- }
+ /*
+ * The WT_REF structures moved to newly allocated child pages reference
+ * the wrong parent page and we have to fix that up. The problem is
+ * revealed when a thread of control searches for the child page's
+ * reference structure slot, and fails to find it because the parent
+ * page being searched no longer references the child. When that failure
+ * happens the thread waits for the reference's home page to be updated,
+ * which we do here: walk the children and fix them up.
+ */
+ for (i = 0; i < entries; ++i, ++refp) {
+ ref = *refp;
+
+ /*
+ * We don't hold hazard pointers on created pages, they cannot
+ * be evicted because the page-modify transaction value set as
+ * they were created prevents eviction. (See above, we reset
+ * that value as part of fixing up the page.) But, an eviction
+ * thread might be attempting to evict the page (the WT_REF may
+ * be WT_REF_LOCKED), or it may be a disk based page (the WT_REF
+ * may be WT_REF_READING), or it may be in some other state.
+ * Acquire a hazard pointer for any in-memory pages so we know
+ * the state of the page. Ignore pages not in-memory (deleted,
+ * on-disk, being read), there's no in-memory structure to fix.
+ */
+ if ((ret = __wt_page_in(session,
+ ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND)
+ continue;
+ WT_ERR(ret);
+
+ child = ref->page;
+#ifdef HAVE_DIAGNOSTIC
+ WT_WITH_PAGE_INDEX(session,
+ __split_verify_intl_key_order(session, child));
+#endif
+ /*
+ * We use a page flag to prevent the child from splitting from
+ * underneath us, but the split-generation error checks don't
+ * know about that flag; use the standard macros to ensure that
+ * reading the child's page index structure is safe.
+ */
+ WT_ENTER_PAGE_INDEX(session);
+ WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
+ /*
+ * The page's home reference may not be wrong, as we
+ * opened up access from the top of the tree already,
+ * disk pages may have been read in since then, and
+ * those pages would have correct parent references.
+ */
+ if (child_ref->home != child) {
+ child_ref->home = child;
+ child_ref->pindex_hint = 0;
+
+ child->modify->mod_split_txn = txn_new_id;
}
- tmp = last;
- last = next;
- next = tmp;
} WT_INTL_FOREACH_END;
- break;
+ WT_LEAVE_PAGE_INDEX(session);
+
+ /* The child can now be evicted or split. */
+ F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
+
+ WT_ERR(__wt_hazard_clear(session, child));
}
+
+ /*
+ * Push out the changes: not required for correctness, but don't let
+ * threads spin on incorrect page references longer than necessary.
+ */
+ WT_FULL_BARRIER();
+ return (0);
+
+err: /* Something really bad just happened. */
+ WT_PANIC_RET(session, ret, "fatal error resolving a split");
}
-#endif
/*
- * __split_deepen --
- * Split an internal page in-memory, deepening the tree.
+ * __split_root --
+ * Split the root page in-memory, deepening the tree.
*/
static int
-__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
+__split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
{
WT_BTREE *btree;
WT_DECL_RET;
WT_PAGE *child;
WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex;
WT_REF **alloc_refp;
- WT_REF *child_ref, **child_refp, *parent_ref, **parent_refp, *ref;
- size_t child_incr, parent_decr, parent_incr, size;
+ WT_REF **child_refp, *ref, **root_refp;
+ size_t child_incr, root_decr, root_incr, size;
uint64_t split_gen;
- uint32_t children, chunk, i, j, moved_entries, new_entries, remain;
- uint32_t skip_leading, slots;
+ uint32_t children, chunk, i, j, remain;
+ uint32_t slots;
bool complete;
void *p;
WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen);
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal);
btree = S2BT(session);
alloc_index = NULL;
- parent_incr = parent_decr = 0;
+ root_decr = root_incr = 0;
complete = false;
+ /* The root page will be marked dirty, make sure that will succeed. */
+ WT_RET(__wt_page_modify_init(session, root));
+
/*
- * Our caller is holding the parent page locked to single-thread splits,
+ * Our caller is holding the root page locked to single-thread splits,
* which means we can safely look at the page's index without setting a
* split generation.
*/
- pindex = WT_INTL_INDEX_GET_SAFE(parent);
-
- /*
- * A prepending/appending workload will repeatedly deepen parts of the
- * tree that aren't changing, and appending workloads are not uncommon.
- * First, keep the first/last pages of the tree at their current level,
- * to catch simple workloads. Second, track the number of entries which
- * resulted from the last time we deepened this page, and if we refilled
- * this page without splitting into those slots, ignore them for this
- * split. It's not exact because an eviction might split into any part
- * of the page: if 80% of the splits are at the end of the page, assume
- * an append-style workload. Of course, the plan eventually fails: when
- * repeatedly deepening this page for an append-only workload, we will
- * progressively ignore more and more of the slots. When ignoring 90% of
- * the slots, deepen the entire page again.
- *
- * Figure out how many slots we're leaving at this level and how many
- * child pages we're creating.
- */
-#undef skip_trailing
-#define skip_trailing 1
- skip_leading = 1;
- new_entries = pindex->entries - parent->pg_intl_deepen_split_last;
- if (parent->pg_intl_deepen_split_append > (new_entries * 8) / 10)
- skip_leading = parent->pg_intl_deepen_split_last;
- if (skip_leading > (pindex->entries * 9) * 10)
- skip_leading = 1;
+ pindex = WT_INTL_INDEX_GET_SAFE(root);
/*
- * In a few (rare) cases we split pages with only a few entries, and in
- * those cases we keep it simple, 10 children, skip only first and last
- * entries. Otherwise, split into a lot of child pages.
+ * Decide how many child pages to create, then calculate the standard
+ * chunk and whatever remains. Sanity check the number of children:
+ * the decision to split matched to the deepen-per-child configuration
+ * might get it wrong.
*/
- moved_entries = pindex->entries - (skip_leading + skip_trailing);
- children = moved_entries / btree->split_deepen_per_child;
+ children = pindex->entries / btree->split_deepen_per_child;
if (children < 10) {
+ if (pindex->entries < 100)
+ return (EBUSY);
children = 10;
- skip_leading = 1;
- moved_entries =
- pindex->entries - (skip_leading + skip_trailing);
}
+ chunk = pindex->entries / children;
+ remain = pindex->entries - chunk * (children - 1);
WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
- "%p: %" PRIu32 " elements, splitting into %" PRIu32 " children",
- parent, pindex->entries, children));
+ "%p: %" PRIu32 " root page elements, splitting into %" PRIu32
+ " children",
+ root, pindex->entries, children));
/*
- * Allocate a new WT_PAGE_INDEX and set of WT_REF objects. Initialize
- * the slots of the allocated WT_PAGE_INDEX to point to the pages we're
- * keeping at the current level, and the rest of the slots to point to
- * new WT_REF objects.
+ * Allocate a new WT_PAGE_INDEX and set of WT_REF objects to be inserted
+ * into the root page, replacing the root's page-index.
*/
- size = sizeof(WT_PAGE_INDEX) +
- (children + skip_leading + skip_trailing) * sizeof(WT_REF *);
+ size = sizeof(WT_PAGE_INDEX) + children * sizeof(WT_REF *);
WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
- parent_incr += size;
+ root_incr += size;
alloc_index->index = (WT_REF **)(alloc_index + 1);
- alloc_index->entries = children + skip_leading + skip_trailing;
- for (alloc_refp = alloc_index->index,
- i = 0; i < skip_leading; ++alloc_refp, ++i)
- alloc_index->index[i] = pindex->index[i];
- for (i = 0; i < children; ++alloc_refp, ++i)
+ alloc_index->entries = children;
+ alloc_refp = alloc_index->index;
+ for (i = 0; i < children; alloc_refp++, ++i)
WT_ERR(__wt_calloc_one(session, alloc_refp));
- parent_incr += children * sizeof(WT_REF);
- alloc_index->index[alloc_index->entries - 1] =
- pindex->index[pindex->entries - 1];
+ root_incr += children * sizeof(WT_REF);
/* Allocate child pages, and connect them into the new page index. */
- chunk = moved_entries / children;
- remain = moved_entries - chunk * (children - 1);
- for (parent_refp = pindex->index + skip_leading,
- alloc_refp = alloc_index->index + skip_leading,
- i = 0; i < children; ++i) {
+ for (root_refp = pindex->index,
+ alloc_refp = alloc_index->index, i = 0; i < children; ++i) {
slots = i == children - 1 ? remain : chunk;
WT_ERR(__wt_page_alloc(
- session, parent->type, 0, slots, false, &child));
+ session, root->type, 0, slots, false, &child));
/*
- * Initialize the parent page's child reference; we need a copy
- * of the page's key.
+ * Initialize the page's child reference; we need a copy of the
+ * page's key.
*/
ref = *alloc_refp++;
- ref->home = parent;
+ ref->home = root;
ref->page = child;
ref->addr = NULL;
- if (parent->type == WT_PAGE_ROW_INT) {
- __wt_ref_key(parent, *parent_refp, &p, &size);
+ if (root->type == WT_PAGE_ROW_INT) {
+ __wt_ref_key(root, *root_refp, &p, &size);
WT_ERR(__wt_row_ikey(session, 0, p, size, ref));
- parent_incr += sizeof(WT_IKEY) + size;
+ root_incr += sizeof(WT_IKEY) + size;
} else
- ref->key.recno = (*parent_refp)->key.recno;
+ ref->key.recno = (*root_refp)->key.recno;
ref->state = WT_REF_MEM;
/* Initialize the child page. */
- if (parent->type == WT_PAGE_COL_INT)
- child->pg_intl_recno = (*parent_refp)->key.recno;
+ if (root->type == WT_PAGE_COL_INT)
+ child->pg_intl_recno = (*root_refp)->key.recno;
child->pg_intl_parent_ref = ref;
/* Mark it dirty. */
WT_ERR(__wt_page_modify_init(session, child));
__wt_page_modify_set(session, child);
- /*
- * Once the split goes live, the newly created internal pages
- * might be evicted and their WT_REF structures freed. If those
- * pages are evicted before threads exit the previous page index
- * array, a thread might see a freed WT_REF. Set the eviction
- * transaction requirement for the newly created internal pages.
- */
- child->modify->mod_split_txn = __wt_txn_new_id(session);
+ /* Ensure the page isn't evicted or split for now. */
+ __split_child_block_evict_and_split(child);
/*
* The newly allocated child's page index references the same
- * structures as the parent. (We cannot move WT_REF structures,
+ * structures as the root. (We cannot move WT_REF structures,
* threads may be underneath us right now changing the structure
* state.) However, if the WT_REF structures reference on-page
* information, we have to fix that, because the disk image for
* the page that has a page index entry for the WT_REF is about
* to change.
*/
- child_incr = 0;
child_pindex = WT_INTL_INDEX_GET_SAFE(child);
- for (child_refp = child_pindex->index, j = 0; j < slots; ++j) {
- WT_ERR(__split_ref_deepen_move(session,
- parent, *parent_refp, &parent_decr, &child_incr));
- *child_refp++ = *parent_refp++;
- }
+ child_incr = 0;
+ for (child_refp = child_pindex->index,
+ j = 0; j < slots; ++child_refp, ++root_refp, ++j)
+ WT_ERR(__split_ref_move(session, root,
+ root_refp, &root_decr, child_refp, &child_incr));
+
__wt_cache_page_inmem_incr(session, child, child_incr);
}
WT_ASSERT(session,
- alloc_refp - alloc_index->index ==
- (ptrdiff_t)(alloc_index->entries - skip_trailing));
- WT_ASSERT(session, parent_refp - pindex->index ==
- (ptrdiff_t)(pindex->entries - skip_trailing));
+ alloc_refp - alloc_index->index == (ptrdiff_t)alloc_index->entries);
+ WT_ASSERT(session,
+ root_refp - pindex->index == (ptrdiff_t)pindex->entries);
/*
- * Confirm the parent page's index hasn't moved, then update it, which
+ * Confirm the root page's index hasn't moved, then update it, which
* makes the split visible to threads descending the tree. From this
* point on, we're committed to the split.
*
* A note on error handling: until this point, there's no problem with
* unwinding on error. We allocated a new page index, a new set of
* WT_REFs and a new set of child pages -- if an error occurred, the
- * parent remained unchanged, although it may have an incorrect memory
- * footprint. From now on we've modified the parent page, attention
+ * root remained unchanged, although it may have an incorrect memory
+ * footprint. From now on we've modified the root page, attention
* needs to be paid. However, subsequent failures are relatively benign,
* the split is OK and complete. For that reason, we ignore errors past
* this point unless there's a panic.
*/
+ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(root) == pindex);
+ WT_INTL_INDEX_SET(root, alloc_index);
+ complete = true;
+
+#ifdef HAVE_DIAGNOSTIC
+ WT_WITH_PAGE_INDEX(session,
+ __split_verify_intl_key_order(session, root));
+#endif
+ /* Fix up the moved WT_REF structures. */
+ WT_ERR(__split_ref_move_final(
+ session, alloc_index->index, alloc_index->entries));
+
+ /* We've installed the allocated page-index, ensure error handling. */
+ alloc_index = NULL;
+
+ /*
+ * We can't free the previous root's index, there may be threads using
+ * it. Add to the session's discard list, to be freed once we know no
+ * threads can still be using it.
+ *
+ * This change requires care with error handling: we have already
+ * updated the page with a new index. Even if stashing the old value
+ * fails, we don't roll back that change, because threads may already
+ * be using the new index.
+ */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
+ WT_TRET(__split_safe_free(session, split_gen, false, pindex, size));
+ root_decr += size;
+
+ /* Adjust the root's memory footprint and mark it dirty. */
+ __wt_cache_page_inmem_incr(session, root, root_incr);
+ __wt_cache_page_inmem_decr(session, root, root_decr);
+ __wt_page_modify_set(session, root);
+
+err: /*
+ * If complete is true, we saw an error after opening up the tree to
+ * descent through the root page's new index. There is nothing we
+ * can do, there are threads potentially active in both versions of
+ * the tree.
+ *
+ * A note on error handling: if we completed the split, return success,
+ * nothing really bad can have happened, and our caller has to proceed
+ * with the split.
+ */
+ if (!complete)
+ __wt_free_ref_index(session, root, alloc_index, true);
+
+ if (ret != 0 && ret != WT_PANIC)
+ __wt_err(session, ret,
+ "ignoring not-fatal error during root page split to "
+ "deepen the tree");
+ return (ret == WT_PANIC || !complete ? ret : 0);
+}
+
+/*
+ * __split_parent --
+ * Resolve a multi-page split, inserting new information into the parent.
+ */
+static int
+__split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
+ uint32_t new_entries, size_t parent_incr, bool exclusive, bool discard)
+{
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ WT_PAGE *parent;
+ WT_PAGE_INDEX *alloc_index, *pindex;
+ WT_REF **alloc_refp, *next_ref;
+ size_t parent_decr, size;
+ uint64_t split_gen;
+ uint32_t i, j;
+ uint32_t deleted_entries, parent_entries, result_entries;
+ bool complete, empty_parent;
+
+ parent = ref->home;
+
+ alloc_index = pindex = NULL;
+ parent_decr = 0;
+ parent_entries = 0;
+ complete = empty_parent = false;
+
+ /* The parent page will be marked dirty, make sure that will succeed. */
+ WT_RET(__wt_page_modify_init(session, parent));
+
+ /*
+ * We've locked the parent, which means it cannot split (which is the
+ * only reason to worry about split generation values).
+ */
+ pindex = WT_INTL_INDEX_GET_SAFE(parent);
+ parent_entries = pindex->entries;
+
+ /*
+ * Remove any refs to deleted pages while we are splitting, we have
+ * the internal page locked down, and are copying the refs into a new
+ * array anyway. Switch them to the special split state, so that any
+ * reading thread will restart.
+ */
+ for (deleted_entries = 0, i = 0; i < parent_entries; ++i) {
+ next_ref = pindex->index[i];
+ WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
+ if (next_ref->state == WT_REF_DELETED &&
+ __wt_delete_page_skip(session, next_ref, true) &&
+ __wt_atomic_casv32(
+ &next_ref->state, WT_REF_DELETED, WT_REF_SPLIT))
+ deleted_entries++;
+ }
+
+ /*
+ * The final entry count consists of the original count, plus any new
+ * pages, less any WT_REFs we're removing (deleted entries plus the
+ * entry we're replacing).
+ */
+ result_entries = (parent_entries + new_entries) - (deleted_entries + 1);
+
+ /*
+ * If there are no remaining entries on the parent, give up, we can't
+ * leave an empty internal page. Mark it to be evicted soon and clean
+ * up any references that have changed state.
+ */
+ if (result_entries == 0) {
+ empty_parent = true;
+ __wt_page_evict_soon(parent);
+ goto err;
+ }
+
+ /*
+ * Allocate and initialize a new page index array for the parent, then
+ * copy references from the original index array, plus references from
+ * the newly created split array, into place.
+ */
+ size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *);
+ WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
+ parent_incr += size;
+ alloc_index->index = (WT_REF **)(alloc_index + 1);
+ alloc_index->entries = result_entries;
+ for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) {
+ next_ref = pindex->index[i];
+ if (next_ref == ref)
+ for (j = 0; j < new_entries; ++j) {
+ ref_new[j]->home = parent;
+ *alloc_refp++ = ref_new[j];
+ }
+ else if (next_ref->state != WT_REF_SPLIT)
+ /* Skip refs we have marked for deletion. */
+ *alloc_refp++ = next_ref;
+ }
+
+ /* Check that we filled in all the entries. */
+ WT_ASSERT(session,
+ alloc_refp - alloc_index->index == (ptrdiff_t)result_entries);
+
+ /*
+ * Confirm the parent page's index hasn't moved, then update it, which
+ * makes the split visible to threads descending the tree.
+ */
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex);
WT_INTL_INDEX_SET(parent, alloc_index);
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
- complete = true;
+ alloc_index = NULL;
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
__split_verify_intl_key_order(session, parent));
#endif
+
+ /*
+ * If discarding the page's original WT_REF field, reset it to split and
+ * increment the number of entries being discarded. Threads cursoring
+ * through the tree were blocked because that WT_REF state was set to
+ * locked. Changing the locked state to split unblocks those threads and
+ * causes them to re-calculate their position based on the just-updated
+ * parent page's index.
+ */
+ if (discard) {
+ ++deleted_entries;
+ WT_PUBLISH(ref->state, WT_REF_SPLIT);
+ }
+
+ /*
+ * Push out the changes: not required for correctness, but don't let
+ * threads spin on incorrect page references longer than necessary.
+ */
+ WT_FULL_BARRIER();
+
/*
- * Save the number of entries created by deepening the tree and reset
- * the count of splits into this page after that point.
+ * A note on error handling: failures before we swapped the new page
+ * index into the parent can be resolved by freeing allocated memory
+ * because the original page is unchanged, we can continue to use it
+ * and we have not yet modified the parent. Failures after we swap
+ * the new page index into the parent are also relatively benign, the
+ * split is OK and complete. For those reasons, we ignore errors past
+ * this point unless there's a panic.
*/
- parent->pg_intl_deepen_split_append = 0;
- parent->pg_intl_deepen_split_last = alloc_index->entries;
+ complete = true;
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
+ "%p: %s %s" "split into parent %p, %" PRIu32 " -> %" PRIu32
+ " (%s%" PRIu32 ")",
+ ref->page, ref->page == NULL ?
+ "unknown page type" : __wt_page_type_string(ref->page->type),
+ ref->page == NULL ? "reverse " : "", parent,
+ parent_entries, result_entries,
+ ref->page == NULL ? "-" : "+",
+ ref->page == NULL ?
+ parent_entries - result_entries : result_entries - parent_entries));
/*
- * The moved reference structures now reference the wrong parent page,
- * and we have to fix that up. The problem is revealed when a thread
- * of control searches for a page's reference structure slot, and fails
- * to find it because the page it's searching no longer references it.
- * When that failure happens, the thread waits for the reference's home
- * page to be updated, which we do here: walk the children and fix them
- * up.
+ * The new page index is in place, free the WT_REF we were splitting and
+ * any deleted WT_REFs we found, modulo the usual safe free semantics.
*
- * We're not acquiring hazard pointers on these pages, they cannot be
- * evicted because of the eviction transaction value set above.
- */
- for (parent_refp = alloc_index->index,
- i = alloc_index->entries; i > 0; ++parent_refp, --i) {
- parent_ref = *parent_refp;
- WT_ASSERT(session, parent_ref->home == parent);
- if (parent_ref->state != WT_REF_MEM)
+ * Acquire a new split generation.
+ */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ for (i = 0; deleted_entries > 0 && i < parent_entries; ++i) {
+ next_ref = pindex->index[i];
+ if (next_ref->state != WT_REF_SPLIT)
continue;
+ --deleted_entries;
/*
- * We left the first/last children of the parent at the current
- * level to avoid bad split patterns, they might be leaf pages;
- * check the page type before we continue.
- */
- child = parent_ref->page;
- if (!WT_PAGE_IS_INTERNAL(child))
- continue;
-#ifdef HAVE_DIAGNOSTIC
- WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, child));
-#endif
- /*
- * We have the parent locked, but there's nothing to prevent
- * this child from splitting beneath us; ensure that reading
- * the child's page index structure is safe.
+ * We set the WT_REF to split, discard it, freeing any resources
+ * it holds.
+ *
+ * Row-store trees where the old version of the page is being
+ * discarded: the previous parent page's key for this child page
+ * may have been an on-page overflow key. In that case, if the
+ * key hasn't been deleted, delete it now, including its backing
+ * blocks. We are exchanging the WT_REF that referenced it for
+ * the split page WT_REFs and their keys, and there's no longer
+ * any reference to it. Done after completing the split (if we
+ * failed, we'd leak the underlying blocks, but the parent page
+ * would be unaffected).
*/
- WT_ENTER_PAGE_INDEX(session);
- WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
+ if (parent->type == WT_PAGE_ROW_INT) {
+ WT_TRET(__split_ovfl_key_cleanup(
+ session, parent, next_ref));
+ ikey = __wt_ref_key_instantiated(next_ref);
+ if (ikey != NULL) {
+ size = sizeof(WT_IKEY) + ikey->size;
+ WT_TRET(__split_safe_free(
+ session, split_gen, exclusive, ikey, size));
+ parent_decr += size;
+ }
/*
- * The page's parent reference may not be wrong, as we
- * opened up access from the top of the tree already,
- * pages may have been read in since then. Check and
- * only update pages that reference the original page,
- * they must be wrong.
+ * The page_del structure can be freed immediately: it
+ * is only read when the ref state is WT_REF_DELETED.
+ * The size of the structure wasn't added to the parent,
+ * don't decrement.
*/
- if (child_ref->home == parent) {
- child_ref->home = child;
- child_ref->pindex_hint = 0;
+ if (next_ref->page_del != NULL) {
+ __wt_free(session,
+ next_ref->page_del->update_list);
+ __wt_free(session, next_ref->page_del);
}
- } WT_INTL_FOREACH_END;
- WT_LEAVE_PAGE_INDEX(session);
+ }
+
+ WT_TRET(__split_safe_free(
+ session, split_gen, exclusive, next_ref, sizeof(WT_REF)));
+ parent_decr += sizeof(WT_REF);
}
+ /* We freed the reference that was split in the loop above. */
+ ref = NULL;
+
/*
- * Push out the changes: not required for correctness, but don't let
- * threads spin on incorrect page references longer than necessary.
+ * We can't free the previous page index, there may be threads using it.
+ * Add it to the session discard list, to be freed when it's safe.
*/
- WT_FULL_BARRIER();
- alloc_index = NULL;
+ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
+ WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size));
+ parent_decr += size;
+
+ /* Adjust the parent's memory footprint and mark it dirty. */
+ __wt_cache_page_inmem_incr(session, parent, parent_incr);
+ __wt_cache_page_inmem_decr(session, parent, parent_decr);
+ __wt_page_modify_set(session, parent);
+
+err: /*
+ * A note on error handling: if we completed the split, return success,
+ * nothing really bad can have happened, and our caller has to proceed
+ * with the split.
+ */
+ if (!complete) {
+ for (i = 0; i < parent_entries; ++i) {
+ next_ref = pindex->index[i];
+ if (next_ref->state == WT_REF_SPLIT)
+ next_ref->state = WT_REF_DELETED;
+ }
+
+ __wt_free_ref_index(session, NULL, alloc_index, false);
+
+ /*
+ * The split couldn't proceed because the parent would be empty,
+ * return EBUSY so our caller knows to unlock the WT_REF that's
+ * being deleted, but don't be noisy, there's nothing wrong.
+ */
+ if (empty_parent)
+ return (EBUSY);
+ }
+
+ if (ret != 0 && ret != WT_PANIC)
+ __wt_err(session, ret,
+ "ignoring not-fatal error during parent page split");
+ return (ret == WT_PANIC || !complete ? ret : 0);
+}
+
+/*
+ * __split_internal --
+ * Split an internal page into its parent.
+ */
+static int
+__split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *child;
+ WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex, *replace_index;
+ WT_REF **alloc_refp;
+ WT_REF **child_refp, *page_ref, **page_refp, *ref;
+ size_t child_incr, page_decr, page_incr, parent_incr, size;
+ uint64_t split_gen;
+ uint32_t children, chunk, i, j, remain;
+ uint32_t slots;
+ bool complete;
+ void *p;
+
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal);
+
+ /* The page will be marked dirty, make sure that will succeed. */
+ WT_RET(__wt_page_modify_init(session, page));
+
+ btree = S2BT(session);
+ alloc_index = replace_index = NULL;
+ page_ref = page->pg_intl_parent_ref;
+ page_decr = page_incr = parent_incr = 0;
+ complete = false;
/*
- * We can't free the previous parent's index, there may be threads using
- * it. Add to the session's discard list, to be freed once we know no
- * threads can still be using it.
+ * Our caller is holding the page locked to single-thread splits, which
+ * means we can safely look at the page's index without setting a split
+ * generation.
+ */
+ pindex = WT_INTL_INDEX_GET_SAFE(page);
+
+ /*
+ * Decide how many child pages to create, then calculate the standard
+ * chunk and whatever remains. Sanity check the number of children:
+ * the decision to split matched to the deepen-per-child configuration
+ * might get it wrong.
+ */
+ children = pindex->entries / btree->split_deepen_per_child;
+ if (children < 10) {
+ if (pindex->entries < 100)
+ return (EBUSY);
+ children = 10;
+ }
+ chunk = pindex->entries / children;
+ remain = pindex->entries - chunk * (children - 1);
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
+ "%p: %" PRIu32 " internal page elements, splitting %" PRIu32
+ " children into parent %p",
+ page, pindex->entries, children, parent));
+
+ /*
+ * Ideally, we'd discard the original page, but that's hard since other
+ * threads of control are using it (for example, if eviction is walking
+ * the tree and looking at the page.) Instead, perform a right-split,
+ * moving all except the first chunk of the page's WT_REF objects to new
+ * pages.
*
- * This change requires care with error handling: we have already
- * updated the page with a new index. Even if stashing the old value
- * fails, we don't roll back that change, because threads may already
- * be using the new index.
+ * Create and initialize a replacement WT_PAGE_INDEX for the original
+ * page.
*/
- size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
- WT_TRET(__split_safe_free(session, split_gen, 0, pindex, size));
- parent_decr += size;
+ size = sizeof(WT_PAGE_INDEX) + chunk * sizeof(WT_REF *);
+ WT_ERR(__wt_calloc(session, 1, size, &replace_index));
+ page_incr += size;
+ replace_index->index = (WT_REF **)(replace_index + 1);
+ replace_index->entries = chunk;
+ for (page_refp = pindex->index, i = 0; i < chunk; ++i)
+ replace_index->index[i] = *page_refp++;
/*
- * Adjust the parent's memory footprint.
+ * Allocate a new WT_PAGE_INDEX and set of WT_REF objects to be inserted
+ * into the page's parent, replacing the page's page-index.
+ *
+ * The first slot of the new WT_PAGE_INDEX is the original page WT_REF.
+ * The remainder of the slots are allocated WT_REFs.
*/
- __wt_cache_page_inmem_incr(session, parent, parent_incr);
- __wt_cache_page_inmem_decr(session, parent, parent_decr);
+ size = sizeof(WT_PAGE_INDEX) + children * sizeof(WT_REF *);
+ WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
+ parent_incr += size;
+ alloc_index->index = (WT_REF **)(alloc_index + 1);
+ alloc_index->entries = children;
+ alloc_refp = alloc_index->index;
+ *alloc_refp++ = page_ref;
+ for (i = 1; i < children; ++alloc_refp, ++i)
+ WT_ERR(__wt_calloc_one(session, alloc_refp));
+ parent_incr += children * sizeof(WT_REF);
+
+ /* Allocate child pages, and connect them into the new page index. */
+ WT_ASSERT(session, page_refp == pindex->index + chunk);
+ for (alloc_refp = alloc_index->index + 1, i = 1; i < children; ++i) {
+ slots = i == children - 1 ? remain : chunk;
+ WT_ERR(__wt_page_alloc(
+ session, page->type, 0, slots, false, &child));
+
+ /*
+ * Initialize the page's child reference; we need a copy of the
+ * page's key.
+ */
+ ref = *alloc_refp++;
+ ref->home = parent;
+ ref->page = child;
+ ref->addr = NULL;
+ if (page->type == WT_PAGE_ROW_INT) {
+ __wt_ref_key(page, *page_refp, &p, &size);
+ WT_ERR(__wt_row_ikey(session, 0, p, size, ref));
+ parent_incr += sizeof(WT_IKEY) + size;
+ } else
+ ref->key.recno = (*page_refp)->key.recno;
+ ref->state = WT_REF_MEM;
+
+ /* Initialize the child page. */
+ if (page->type == WT_PAGE_COL_INT)
+ child->pg_intl_recno = (*page_refp)->key.recno;
+ child->pg_intl_parent_ref = ref;
+
+ /* Mark it dirty. */
+ WT_ERR(__wt_page_modify_init(session, child));
+ __wt_page_modify_set(session, child);
+
+ /* Ensure the page isn't evicted or split for now. */
+ __split_child_block_evict_and_split(child);
+
+ /*
+ * The newly allocated child's page index references the same
+ * structures as the parent. (We cannot move WT_REF structures,
+ * threads may be underneath us right now changing the structure
+ * state.) However, if the WT_REF structures reference on-page
+ * information, we have to fix that, because the disk image for
+ * the page that has a page index entry for the WT_REF is about
+ * to be discarded.
+ */
+ child_pindex = WT_INTL_INDEX_GET_SAFE(child);
+ child_incr = 0;
+ for (child_refp = child_pindex->index,
+ j = 0; j < slots; ++child_refp, ++page_refp, ++j)
+ WT_ERR(__split_ref_move(session, page,
+ page_refp, &page_decr, child_refp, &child_incr));
+
+ __wt_cache_page_inmem_incr(session, child, child_incr);
+ }
+ WT_ASSERT(session, alloc_refp -
+ alloc_index->index == (ptrdiff_t)alloc_index->entries);
+ WT_ASSERT(session,
+ page_refp - pindex->index == (ptrdiff_t)pindex->entries);
+
+ /* Split into the parent. */
+ WT_ERR(__split_parent(session, page_ref, alloc_index->index,
+ alloc_index->entries, parent_incr, false, false));
+
+ /*
+ * A note on error handling: until this point, there's no problem with
+ * unwinding on error. We allocated a new page index, a new set of
+ * WT_REFs and a new set of child pages -- if an error occurred, the
+ * page remained unchanged, although it may have an incorrect memory
+ * footprint. From now on we've modified the parent page, attention
+ * needs to be paid. However, subsequent failures are relatively benign,
+ * the split is OK and complete. For that reason, we ignore errors past
+ * this point unless there's a panic.
+ */
+ complete = true;
+
+ /* Confirm the page's index hasn't moved, then update it. */
+ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
+ WT_INTL_INDEX_SET(page, replace_index);
+
+#ifdef HAVE_DIAGNOSTIC
+ WT_WITH_PAGE_INDEX(session,
+ __split_verify_intl_key_order(session, parent));
+ WT_WITH_PAGE_INDEX(session,
+ __split_verify_intl_key_order(session, page));
+#endif
+
+ /* Fix up the moved WT_REF structures. */
+ WT_ERR(__split_ref_move_final(
+ session, alloc_index->index + 1, alloc_index->entries - 1));
+
+ /*
+ * We don't care about the page-index we allocated, all we needed was
+ * the array of WT_REF structures, which has now been split into the
+ * parent page.
+ */
+ __wt_free(session, alloc_index);
+
+ /*
+ * We can't free the previous page's index, there may be threads using
+ * it. Add to the session's discard list, to be freed once we know no
+ * threads can still be using it.
+ *
+ * This change requires care with error handling, we've already updated
+ * the parent page. Even if stashing the old value fails, we don't roll
+ * back that change, because threads may already be using the new parent
+ * page.
+ */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
+ WT_TRET(__split_safe_free(session, split_gen, false, pindex, size));
+ page_decr += size;
+
+ /* Adjust the page's memory footprint, and mark it dirty. */
+ __wt_cache_page_inmem_incr(session, page, page_incr);
+ __wt_cache_page_inmem_decr(session, page, page_decr);
+ __wt_page_modify_set(session, page);
err: /*
* If complete is true, we saw an error after opening up the tree to
- * descent through the parent page's new index. There is nothing we
- * can do, there are threads potentially active in both versions of
- * the tree.
+ * descent through the page's new index. There is nothing we can do,
+ * there are threads potentially active in both versions of the tree.
*
* A note on error handling: if we completed the split, return success,
* nothing really bad can have happened, and our caller has to proceed
* with the split.
*/
- if (!complete)
- __wt_free_ref_index(session, parent, alloc_index, true);
+ if (!complete) {
+ __wt_free_ref_index(session, page, alloc_index, true);
+ __wt_free_ref_index(session, page, replace_index, false);
+ }
if (ret != 0 && ret != WT_PANIC)
__wt_err(session, ret,
- "ignoring not-fatal error during parent page split to "
- "deepen the tree");
+ "ignoring not-fatal error during internal page split");
return (ret == WT_PANIC || !complete ? ret : 0);
}
/*
+ * __split_internal_lock --
+ * Lock an internal page.
+ */
+static int
+__split_internal_lock(
+ WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE **parentp, bool *hazardp)
+{
+ WT_DECL_RET;
+ WT_PAGE *parent;
+ WT_REF *parent_ref;
+
+ *hazardp = false;
+ *parentp = NULL;
+
+ /*
+ * A checkpoint reconciling this parent page can deadlock with
+ * our split. We have an exclusive page lock on the child before
+ * we acquire the page's reconciliation lock, and reconciliation
+ * acquires the page's reconciliation lock before it encounters
+ * the child's exclusive lock (which causes reconciliation to
+ * loop until the exclusive lock is resolved). If we want to split
+ * the parent, give up to avoid that deadlock.
+ */
+ if (S2BT(session)->checkpointing != WT_CKPT_OFF)
+ return (EBUSY);
+
+ /*
+ * Get a page-level lock on the parent to single-thread splits into the
+ * page because we need to single-thread sizing/growing the page index.
+ * It's OK to queue up multiple splits as the child pages split, but the
+ * actual split into the parent has to be serialized. Note we allocate
+ * memory inside of the lock and may want to invest effort in making the
+ * locked period shorter.
+ *
+ * We use the reconciliation lock here because not only do we have to
+ * single-thread the split, we have to lock out reconciliation of the
+ * parent because reconciliation of the parent can't deal with finding
+ * a split child during internal page traversal. Basically, there's no
+ * reason to use a different lock if we have to block reconciliation
+ * anyway.
+ */
+ for (;;) {
+ parent = ref->home;
+
+ /* Skip pages that aren't ready to split. */
+ if (F_ISSET_ATOMIC(parent, WT_PAGE_SPLIT_BLOCK))
+ return (EBUSY);
+
+ WT_RET(__wt_fair_lock(session, &parent->page_lock));
+ if (parent == ref->home)
+ break;
+ WT_RET(__wt_fair_unlock(session, &parent->page_lock));
+ }
+
+ /*
+ * We have exclusive access to split the parent, and at this point, the
+ * child prevents the parent from being evicted. However, once we
+ * update the parent's index, it may no longer refer to the child, and
+ * could conceivably be evicted. Get a hazard pointer on the parent
+ * now, so that we can safely access it after updating the index.
+ *
+ * Take care getting the page doesn't trigger eviction work: we could
+ * block trying to split a different child of our parent and deadlock
+ * or we could be the eviction server relied upon by other threads to
+ * populate the eviction queue.
+ */
+ if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) {
+ WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT));
+ *hazardp = true;
+ }
+
+ *parentp = parent;
+ return (0);
+
+err: WT_TRET(__wt_fair_unlock(session, &parent->page_lock));
+ return (ret);
+}
+
+/*
+ * __split_internal_unlock --
+ * Unlock the parent page.
+ */
+static int
+__split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard)
+{
+ WT_DECL_RET;
+
+ if (hazard)
+ ret = __wt_hazard_clear(session, parent);
+
+ WT_TRET(__wt_fair_unlock(session, &parent->page_lock));
+ return (ret);
+}
+
+/*
+ * __split_internal_should_split --
+ * Return if we should split an internal page.
+ */
+static bool
+__split_internal_should_split(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_BTREE *btree;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+
+ btree = S2BT(session);
+ page = ref->page;
+
+ /*
+ * Our caller is holding the parent page locked to single-thread splits,
+ * which means we can safely look at the page's index without setting a
+ * split generation.
+ */
+ pindex = WT_INTL_INDEX_GET_SAFE(page);
+
+ /* Sanity check for a reasonable number of on-page keys. */
+ if (pindex->entries < 100)
+ return (false);
+
+ /*
+ * Deepen the tree if the page's memory footprint is larger than the
+ * maximum size for a page in memory (presumably putting eviction
+ * pressure on the cache).
+ */
+ if (page->memory_footprint > btree->maxmempage)
+ return (true);
+
+ /*
+ * Check if the page has enough keys to make it worth splitting. If
+ * the number of keys is allowed to grow too large, the cost of
+ * splitting into parent pages can become large enough to result
+ * in slow operations.
+ */
+ if (pindex->entries > btree->split_deepen_min_child)
+ return (true);
+
+ return (false);
+}
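/*
 * Editor's sketch, not part of the patch: the decision above reduces to
 * a sanity floor plus two thresholds. A standalone restatement, with
 * hypothetical numbers (the real values come from btree configuration):
 */
#include <stdbool.h>
#include <stdint.h>

static bool
should_split(uint32_t entries, uint64_t footprint,
    uint64_t maxmempage, uint32_t deepen_min_child)
{
	if (entries < 100)			/* Sanity floor */
		return (false);
	if (footprint > maxmempage)		/* Memory pressure */
		return (true);
	return (entries > deepen_min_child);	/* Too many children */
}

/*
 * For example, should_split(150, 6 << 20, 5 << 20, 10000) returns true:
 * a 6MB footprint exceeds the hypothetical 5MB in-memory maximum even
 * though the entry count is small.
 */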
+
+/*
+ * __split_parent_climb --
+ * Check if we should split up the tree.
+ */
+static int
+__split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard)
+{
+ WT_DECL_RET;
+ WT_PAGE *parent;
+ WT_REF *ref;
+ bool parent_hazard;
+
+ /*
+ * Page splits trickle up the tree, that is, as leaf pages grow large
+ * enough and are evicted, they'll split into their parent. And, as
+ * that parent page grows large enough and is evicted, it splits into
+ * its parent and so on. When the page split wave reaches the root,
+ * the tree will permanently deepen as multiple root pages are written.
+ *
+ * However, this only helps if internal pages are evicted (and we resist
+	 * evicting internal pages for obvious reasons), or if the tree is
+	 * closed and re-opened from a disk image, which may be a rare event.
+ *
+ * To avoid internal pages becoming too large absent eviction, check
+ * parent pages each time pages are split into them. If the page is big
+ * enough, either split the page into its parent or, in the case of the
+ * root, deepen the tree.
+ *
+ * Split up the tree.
+ */
+ for (;;) {
+ parent = NULL;
+ parent_hazard = false;
+ ref = page->pg_intl_parent_ref;
+
+ /* If we don't need to split the page, we're done. */
+ if (!__split_internal_should_split(session, ref))
+ break;
+
+ /*
+ * If we've reached the root page, there are no subsequent pages
+		 * to review; deepen the tree and quit.
+ */
+ if (__wt_ref_is_root(ref)) {
+ ret = __split_root(session, page);
+ break;
+ }
+
+ /*
+ * Lock the parent and split into it, then swap the parent/page
+ * locks, lock-coupling up the tree.
+ */
+ WT_ERR(__split_internal_lock(
+ session, ref, &parent, &parent_hazard));
+ ret = __split_internal(session, parent, page);
+ WT_TRET(__split_internal_unlock(session, page, page_hazard));
+
+ page = parent;
+ page_hazard = parent_hazard;
+ parent = NULL;
+ parent_hazard = false;
+ WT_ERR(ret);
+ }
+
+err: if (parent != NULL)
+ WT_TRET(
+ __split_internal_unlock(session, parent, parent_hazard));
+ WT_TRET(__split_internal_unlock(session, page, page_hazard));
+
+ /* A page may have been busy, in which case return without error. */
+ WT_RET_BUSY_OK(ret);
+ return (0);
+}
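/*
 * Editor's sketch, not part of the patch: __split_parent_climb's loop is
 * hand-over-hand ("lock-coupling") locking, walking toward the root:
 * take the parent's lock before dropping the child's, so the path is
 * never unlocked mid-walk. A minimal standalone shape, with hypothetical
 * types (the real code also carries hazard pointers):
 */
#include <pthread.h>
#include <stddef.h>

struct node {
	struct node *parent;
	pthread_mutex_t lock;
};

static void
climb_lock_coupled(struct node *n)
{
	struct node *p;

	pthread_mutex_lock(&n->lock);
	while ((p = n->parent) != NULL) {
		pthread_mutex_lock(&p->lock);	/* Couple: lock parent... */
		pthread_mutex_unlock(&n->lock);	/* ...then drop the child */
		n = p;
		/* ... decide here whether this level needs splitting ... */
	}
	pthread_mutex_unlock(&n->lock);
}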
+
+/*
* __split_multi_inmem --
* Instantiate a page in a multi-block set.
*/
@@ -901,369 +1594,6 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
}
/*
- * __split_parent_lock --
- * Lock the parent page.
- */
-static int
-__split_parent_lock(
- WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE **parentp, bool *hazardp)
-{
- WT_DECL_RET;
- WT_PAGE *parent;
- WT_REF *parent_ref;
-
- *hazardp = false;
- *parentp = NULL;
-
- /*
- * A checkpoint reconciling this parent page can deadlock with
- * our split. We have an exclusive page lock on the child before
- * we acquire the page's reconciliation lock, and reconciliation
- * acquires the page's reconciliation lock before it encounters
- * the child's exclusive lock (which causes reconciliation to
- * loop until the exclusive lock is resolved). If we want to split
- * the parent, give up to avoid that deadlock.
- */
- if (S2BT(session)->checkpointing != WT_CKPT_OFF)
- return (EBUSY);
-
- /*
- * Get a page-level lock on the parent to single-thread splits into the
- * page because we need to single-thread sizing/growing the page index.
- * It's OK to queue up multiple splits as the child pages split, but the
- * actual split into the parent has to be serialized. Note we allocate
- * memory inside of the lock and may want to invest effort in making the
- * locked period shorter.
- *
- * We use the reconciliation lock here because not only do we have to
- * single-thread the split, we have to lock out reconciliation of the
- * parent because reconciliation of the parent can't deal with finding
- * a split child during internal page traversal. Basically, there's no
- * reason to use a different lock if we have to block reconciliation
- * anyway.
- */
- for (;;) {
- parent = ref->home;
- WT_RET(__wt_fair_lock(session, &parent->page_lock));
- if (parent == ref->home)
- break;
- /* Try again if the page deepened while we were waiting */
- WT_RET(__wt_fair_unlock(session, &parent->page_lock));
- }
-
- /*
- * We have exclusive access to split the parent, and at this point, the
- * child prevents the parent from being evicted. However, once we
- * update the parent's index, it will no longer refer to the child, and
- * could conceivably be evicted. Get a hazard pointer on the parent
- * now, so that we can safely access it after updating the index.
- *
- * Take care getting the page doesn't trigger eviction work: we could
- * block trying to split a different child of our parent and deadlock
- * or we could be the eviction server relied upon by other threads to
- * populate the eviction queue.
- */
- if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) {
- WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT));
- *hazardp = true;
- }
-
- *parentp = parent;
- return (0);
-
-err: WT_TRET(__wt_fair_unlock(session, &parent->page_lock));
- return (ret);
-}
-
-/*
- * __split_parent_unlock --
- * Unlock the parent page.
- */
-static int
-__split_parent_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard)
-{
- WT_DECL_RET;
-
- if (hazard)
- ret = __wt_hazard_clear(session, parent);
-
- WT_TRET(__wt_fair_unlock(session, &parent->page_lock));
- return (ret);
-}
-
-/*
- * __split_parent --
- * Resolve a multi-page split, inserting new information into the parent.
- */
-static int
-__split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
- WT_REF **ref_new, uint32_t new_entries, size_t parent_incr, int exclusive)
-{
- WT_DECL_RET;
- WT_IKEY *ikey;
- WT_PAGE *parent;
- WT_PAGE_INDEX *alloc_index, *pindex;
- WT_REF **alloc_refp, *next_ref, *parent_ref;
- size_t parent_decr, size;
- uint64_t split_gen;
- uint32_t i, j;
- uint32_t deleted_entries, parent_entries, result_entries;
- bool complete;
-
- parent = ref->home;
- parent_ref = parent->pg_intl_parent_ref;
-
- alloc_index = pindex = NULL;
- parent_decr = 0;
- parent_entries = 0;
- complete = false;
-
- /*
- * We've locked the parent, which means it cannot split (which is the
- * only reason to worry about split generation values).
- */
- pindex = WT_INTL_INDEX_GET_SAFE(parent);
- parent_entries = pindex->entries;
-
- /*
- * Remove any refs to deleted pages while we are splitting, we have
- * the internal page locked down, and are copying the refs into a new
- * array anyway. Switch them to the special split state, so that any
- * reading thread will restart. Include the ref we are splitting in
- * the count to be deleted.
- */
- for (deleted_entries = 1, i = 0; i < parent_entries; ++i) {
- next_ref = pindex->index[i];
- WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
- if (next_ref->state == WT_REF_DELETED &&
- __wt_delete_page_skip(session, next_ref, true) &&
- __wt_atomic_casv32(
- &next_ref->state, WT_REF_DELETED, WT_REF_SPLIT))
- deleted_entries++;
- }
-
- /*
- * The final entry count consists of the original count, plus any new
- * pages, less any WT_REFs we're removing.
- */
- result_entries = (parent_entries + new_entries) - deleted_entries;
-
- /*
- * If the entire (sub)tree is empty, give up: we can't leave an empty
- * internal page. Mark it to be evicted soon and clean up any
- * references that have changed state.
- */
- if (result_entries == 0) {
- __wt_page_evict_soon(parent);
- goto err;
- }
-
- /*
- * Allocate and initialize a new page index array for the parent, then
- * copy references from the original index array, plus references from
- * the newly created split array, into place.
- */
- size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *);
- WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
- parent_incr += size;
- alloc_index->index = (WT_REF **)(alloc_index + 1);
- alloc_index->entries = result_entries;
- for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) {
- next_ref = pindex->index[i];
- if (next_ref == ref) {
- for (j = 0; j < new_entries; ++j) {
- ref_new[j]->home = parent;
- *alloc_refp++ = ref_new[j];
-
- /*
- * Clear the split reference as it moves to the
- * allocated page index, so it never appears on
- * both after an error.
- */
- ref_new[j] = NULL;
- }
-
- /*
- * We detect append-style workloads to avoid repeatedly
- * deepening parts of the tree where no work is being
- * done by tracking if we're splitting after the slots
- * created by the last split to deepen this parent.
- *
- * Note the calculation: i is a 0-based array offset and
- * split-last is a count of entries, also either or both
- * i and split-last might be unsigned 0, don't decrement
- * either one.
- */
- if (i > parent->pg_intl_deepen_split_last)
- parent->
- pg_intl_deepen_split_append += new_entries;
- } else if (next_ref->state != WT_REF_SPLIT)
- /* Skip refs we have marked for deletion. */
- *alloc_refp++ = next_ref;
- }
-
- /* Check that we filled in all the entries. */
- WT_ASSERT(session,
- alloc_refp - alloc_index->index == (ptrdiff_t)result_entries);
-
- /*
- * Confirm the parent page's index hasn't moved then update it, which
- * makes the split visible to threads descending the tree.
- */
- WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex);
- WT_INTL_INDEX_SET(parent, alloc_index);
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
- alloc_index = NULL;
-
-#ifdef HAVE_DIAGNOSTIC
- WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, parent));
-#endif
-
- /*
- * Reset the page's original WT_REF field to split. Threads cursoring
- * through the tree were blocked because that WT_REF state was set to
- * locked. This update changes the locked state to split, unblocking
- * those threads and causing them to re-calculate their position based
- * on the updated parent page's index.
- */
- WT_PUBLISH(ref->state, WT_REF_SPLIT);
-
- /*
- * A note on error handling: failures before we swapped the new page
- * index into the parent can be resolved by freeing allocated memory
- * because the original page is unchanged, we can continue to use it
- * and we have not yet modified the parent. Failures after we swap
- * the new page index into the parent are also relatively benign, the
- * split is OK and complete. For those reasons, we ignore errors past
- * this point unless there's a panic.
- */
- complete = true;
-
- WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
- "%s split into parent %" PRIu32 " -> %" PRIu32
- " (%" PRIu32 ")", ref->page == NULL ?
- "reverse" : __wt_page_type_string(ref->page->type),
- parent_entries, result_entries, result_entries - parent_entries));
-
- /*
- * The new page index is in place, free the WT_REF we were splitting
- * and any deleted WT_REFs we found, modulo the usual safe free
- * semantics.
- */
- for (i = 0; deleted_entries > 0 && i < parent_entries; ++i) {
- next_ref = pindex->index[i];
- if (next_ref->state != WT_REF_SPLIT)
- continue;
- --deleted_entries;
-
- /*
- * We set the WT_REF to split, discard it, freeing any resources
- * it holds.
- *
- * Row-store trees where the old version of the page is being
- * discarded: the previous parent page's key for this child page
- * may have been an on-page overflow key. In that case, if the
- * key hasn't been deleted, delete it now, including its backing
- * blocks. We are exchanging the WT_REF that referenced it for
- * the split page WT_REFs and their keys, and there's no longer
- * any reference to it. Done after completing the split (if we
- * failed, we'd leak the underlying blocks, but the parent page
- * would be unaffected).
- */
- if (parent->type == WT_PAGE_ROW_INT) {
- WT_TRET(__split_ovfl_key_cleanup(
- session, parent, next_ref));
- ikey = __wt_ref_key_instantiated(next_ref);
- if (ikey != NULL) {
- size = sizeof(WT_IKEY) + ikey->size;
- WT_TRET(__split_safe_free(
- session, split_gen, 0, ikey, size));
- parent_decr += size;
- }
- /*
- * The page_del structure can be freed immediately: it
- * is only read when the ref state is WT_REF_DELETED.
- * The size of the structure wasn't added to the parent,
- * don't decrement.
- */
- if (next_ref->page_del != NULL) {
- __wt_free(session,
- next_ref->page_del->update_list);
- __wt_free(session, next_ref->page_del);
- }
- }
-
- WT_TRET(__split_safe_free(
- session, split_gen, 0, next_ref, sizeof(WT_REF)));
- parent_decr += sizeof(WT_REF);
- }
-
- /* We freed the reference that was split in the loop above. */
- ref = NULL;
-
- /*
- * We can't free the previous page index, there may be threads using it.
- * Add it to the session discard list, to be freed when it's safe.
- */
- size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
- WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size));
- parent_decr += size;
-
- /*
- * Adjust the parent's memory footprint.
- */
- __wt_cache_page_inmem_incr(session, parent, parent_incr);
- __wt_cache_page_inmem_decr(session, parent, parent_decr);
-
- /*
- * Simple page splits trickle up the tree, that is, as leaf pages grow
- * large enough and are evicted, they'll split into their parent. And,
- * as that parent grows large enough and is evicted, it will split into
- * its parent and so on. When the page split wave reaches the root,
- * the tree will permanently deepen as multiple root pages are written.
- * However, this only helps if first, the pages are evicted (and
- * we resist evicting internal pages for obvious reasons), and second,
- * if the tree is closed and re-opened from a disk image, which may be
- * a rare event.
- * To avoid the case of internal pages becoming too large when they
- * aren't being evicted, check internal pages each time a leaf page is
- * split into them. If it's big enough, deepen the tree at that point.
- * Do the check here because we've just grown the parent page and
- * are holding it locked.
- */
- if (ret == 0 && !exclusive &&
- __split_should_deepen(session, parent_ref))
- ret = __split_deepen(session, parent);
-
-err: /*
- * A note on error handling: if we completed the split, return success,
- * nothing really bad can have happened, and our caller has to proceed
- * with the split.
- */
- if (!complete) {
- for (i = 0; i < parent_entries; ++i) {
- next_ref = pindex->index[i];
- if (next_ref->state == WT_REF_SPLIT)
- next_ref->state = WT_REF_DELETED;
- }
-
- /* If we gave up on a reverse split, unlock the child. */
- if (ref_new == NULL) {
- WT_ASSERT(session, ref->state == WT_REF_LOCKED);
- ref->state = WT_REF_DELETED;
- }
-
- __wt_free_ref_index(session, NULL, alloc_index, false);
- }
-
- if (ret != 0 && ret != WT_PANIC)
- __wt_err(session, ret,
- "ignoring not-fatal error during parent page split");
- return (ret == WT_PANIC || !complete ? ret : 0);
-}
-
-/*
* __split_insert --
* Split a page's last insert list entries into a separate page.
*/
@@ -1279,6 +1609,9 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
size_t page_decr, parent_incr, right_incr;
int i;
+ WT_STAT_FAST_CONN_INCR(session, cache_inmem_split);
+ WT_STAT_FAST_DATA_INCR(session, cache_inmem_split);
+
page = ref->page;
right = NULL;
page_decr = parent_incr = right_incr = 0;
@@ -1491,7 +1824,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
*/
page = NULL;
if ((ret = __split_parent(
- session, ref, split_ref, 2, parent_incr, false)) != 0) {
+ session, ref, split_ref, 2, parent_incr, false, true)) != 0) {
/*
* Move the insert list element back to the original page list.
* For simplicity, the previous skip list pointers originally
@@ -1513,9 +1846,6 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
WT_ERR(ret);
}
- WT_STAT_FAST_CONN_INCR(session, cache_inmem_split);
- WT_STAT_FAST_DATA_INCR(session, cache_inmem_split);
-
return (0);
err: if (split_ref[0] != NULL) {
@@ -1543,83 +1873,21 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
WT_PAGE *parent;
bool hazard;
- WT_RET(__split_parent_lock(session, ref, &parent, &hazard));
- ret = __split_insert(session, ref);
- WT_TRET(__split_parent_unlock(session, parent, hazard));
- return (ret);
-}
-
-/*
- * __wt_split_reverse --
- * We have a locked ref that is empty and we want to rewrite the index in
- * its parent.
- */
-int
-__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
-{
- WT_DECL_RET;
- WT_PAGE *parent;
- bool hazard;
-
- WT_RET(__split_parent_lock(session, ref, &parent, &hazard));
- ret = __split_parent(session, ref, NULL, 0, 0, 0);
- WT_TRET(__split_parent_unlock(session, parent, hazard));
- return (ret);
-}
-
-/*
- * __wt_split_rewrite --
- * Rewrite an in-memory page with a new version.
- */
-int
-__wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref)
-{
- WT_DECL_RET;
- WT_PAGE *page;
- WT_PAGE_MODIFY *mod;
- WT_REF new;
-
- page = ref->page;
- mod = page->modify;
-
- /*
- * This isn't a split: a reconciliation failed because we couldn't write
- * something, and in the case of forced eviction, we need to stop this
- * page from being such a problem. We have exclusive access, rewrite the
- * page in memory. The code lives here because the split code knows how
- * to re-create a page in memory after it's been reconciled, and that's
- * exactly what we want to do.
- *
- * Build the new page.
- */
- memset(&new, 0, sizeof(new));
- WT_ERR(__split_multi_inmem(session, page, &new, &mod->mod_multi[0]));
+ WT_RET(__wt_verbose(
+ session, WT_VERB_SPLIT, "%p: split-insert", ref->page));
- /*
- * The rewrite succeeded, we can no longer fail.
- *
- * Finalize the move, discarding moved update lists from the original
- * page.
- */
- __split_multi_inmem_final(page, &mod->mod_multi[0]);
+ WT_RET(__split_internal_lock(session, ref, &parent, &hazard));
+ if ((ret = __split_insert(session, ref)) != 0) {
+ WT_TRET(__split_internal_unlock(session, parent, hazard));
+ return (ret);
+ }
/*
- * Discard the original page.
- *
- * Pages with unresolved changes are not marked clean during
- * reconciliation, do it now.
+ * Split up through the tree as necessary; we're holding the original
+	 * parent page locked; note that the functions we call are responsible
+	 * for
+ * releasing that lock.
*/
- __wt_page_modify_clear(session, page);
- __wt_ref_out(session, ref);
-
- /* Swap the new page into place. */
- ref->page = new.page;
- WT_PUBLISH(ref->state, WT_REF_MEM);
-
- return (0);
-
-err: __split_multi_inmem_fail(session, &new);
- return (ret);
+ return (__split_parent_climb(session, parent, hazard));
}
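/*
 * Editor's sketch, not part of the patch: the comment above describes a
 * lock-ownership transfer -- once __wt_split_insert calls down, the
 * callee must release the parent lock on every path. The general shape,
 * with hypothetical names:
 */
#include <pthread.h>

static int
work_then_release(pthread_mutex_t *lock)
{
	int ret = 0;

	/* ... work while holding *lock; set ret on failure ... */

	/* The callee releases on success and error alike. */
	pthread_mutex_unlock(lock);
	return (ret);
}

static int
caller(pthread_mutex_t *lock)
{
	pthread_mutex_lock(lock);
	/* Ownership transfers here: the caller must not unlock again. */
	return (work_then_release(lock));
}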
/*
@@ -1636,6 +1904,9 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
size_t parent_incr;
uint32_t i, new_entries;
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_leaf);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_leaf);
+
page = ref->page;
mod = page->modify;
new_entries = mod->mod_multi_entries;
@@ -1656,10 +1927,7 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
* exclusively.
*/
WT_ERR(__split_parent(
- session, ref, ref_new, new_entries, parent_incr, closing));
-
- WT_STAT_FAST_CONN_INCR(session, cache_eviction_split);
- WT_STAT_FAST_DATA_INCR(session, cache_eviction_split);
+ session, ref, ref_new, new_entries, parent_incr, closing, true));
/*
* The split succeeded, we can no longer fail.
@@ -1697,8 +1965,98 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
WT_PAGE *parent;
bool hazard;
- WT_RET(__split_parent_lock(session, ref, &parent, &hazard));
- ret = __split_multi(session, ref, closing);
- WT_TRET(__split_parent_unlock(session, parent, hazard));
+ WT_RET(__wt_verbose(
+ session, WT_VERB_SPLIT, "%p: split-multi", ref->page));
+
+ WT_RET(__split_internal_lock(session, ref, &parent, &hazard));
+ if ((ret = __split_multi(session, ref, closing)) != 0 || closing) {
+ WT_TRET(__split_internal_unlock(session, parent, hazard));
+ return (ret);
+ }
+
+ /*
+ * Split up through the tree as necessary; we're holding the original
+	 * parent page locked; note that the functions we call are responsible
+	 * for
+ * releasing that lock.
+ */
+ return (__split_parent_climb(session, parent, hazard));
+}
+
+/*
+ * __wt_split_reverse --
+ * We have a locked ref that is empty and we want to rewrite the index in
+ * its parent.
+ */
+int
+__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_DECL_RET;
+ WT_PAGE *parent;
+ bool hazard;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_SPLIT, "%p: reverse-split", ref->page));
+
+ WT_RET(__split_internal_lock(session, ref, &parent, &hazard));
+ ret = __split_parent(session, ref, NULL, 0, 0, false, true);
+ WT_TRET(__split_internal_unlock(session, parent, hazard));
+ return (ret);
+}
+
+/*
+ * __wt_split_rewrite --
+ * Rewrite an in-memory page with a new version.
+ */
+int
+__wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ WT_REF new;
+
+ page = ref->page;
+ mod = page->modify;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_SPLIT, "%p: split-rewrite", ref->page));
+
+ /*
+ * This isn't a split: a reconciliation failed because we couldn't write
+ * something, and in the case of forced eviction, we need to stop this
+ * page from being such a problem. We have exclusive access, rewrite the
+ * page in memory. The code lives here because the split code knows how
+ * to re-create a page in memory after it's been reconciled, and that's
+ * exactly what we want to do.
+ *
+ * Build the new page.
+ */
+ memset(&new, 0, sizeof(new));
+ WT_ERR(__split_multi_inmem(session, page, &new, &mod->mod_multi[0]));
+
+ /*
+ * The rewrite succeeded, we can no longer fail.
+ *
+ * Finalize the move, discarding moved update lists from the original
+ * page.
+ */
+ __split_multi_inmem_final(page, &mod->mod_multi[0]);
+
+ /*
+ * Discard the original page.
+ *
+ * Pages with unresolved changes are not marked clean during
+ * reconciliation, do it now.
+ */
+ __wt_page_modify_clear(session, page);
+ __wt_ref_out(session, ref);
+
+ /* Swap the new page into place. */
+ ref->page = new.page;
+ WT_PUBLISH(ref->state, WT_REF_MEM);
+
+ return (0);
+
+err: __split_multi_inmem_fail(session, &new);
return (ret);
}
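/*
 * Editor's sketch, not part of the patch: the __split_safe_free calls in
 * the parent-split code above defer frees until no reader can still hold
 * a pointer, keyed by the global split generation. A single-threaded,
 * heavily simplified epoch-reclamation shape (the real code tracks
 * per-session generations and protects the stash):
 */
#include <stdint.h>
#include <stdlib.h>

struct stash {
	uint64_t gen;		/* Generation when the memory was retired */
	void *p;
	struct stash *next;
};

static struct stash *stash_list;

static void
safe_free(uint64_t current_gen, void *p)
{
	struct stash *s;

	if ((s = malloc(sizeof(*s))) == NULL)
		return;		/* Sketch only; real code reports failure */
	s->gen = current_gen;
	s->p = p;
	s->next = stash_list;
	stash_list = s;
}

static void
stash_discard(uint64_t oldest_reader_gen)
{
	struct stash *s;

	/* Free entries retired before every active reader's generation. */
	while ((s = stash_list) != NULL && s->gen < oldest_reader_gen) {
		stash_list = s->next;
		free(s->p);
		free(s);
	}
}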
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index 7395cce11e1..07bb2eb3a01 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -191,7 +191,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
syncop == WT_SYNC_WRITE_LEAVES ?
"WRITE_LEAVES" : "CHECKPOINT",
leaf_bytes, leaf_pages, internal_bytes, internal_pages,
- WT_TIMEDIFF(end, start) / WT_MILLION));
+ WT_TIMEDIFF_MS(end, start)));
}
err: /* On error, clear any left-over tree walk. */
diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c
index d02f23ed164..e9fa570f97b 100644
--- a/src/btree/col_srch.c
+++ b/src/btree/col_srch.c
@@ -22,7 +22,7 @@ __wt_col_search(WT_SESSION_IMPL *session,
WT_INSERT *ins;
WT_INSERT_HEAD *ins_head;
WT_PAGE *page;
- WT_PAGE_INDEX *pindex;
+ WT_PAGE_INDEX *pindex, *parent_pindex;
WT_REF *current, *descent;
uint32_t base, indx, limit;
int depth;
@@ -37,10 +37,12 @@ __wt_col_search(WT_SESSION_IMPL *session,
goto leaf_only;
}
+restart_root:
/* Search the internal pages of the tree. */
current = &btree->root;
- for (depth = 2;; ++depth) {
-restart: page = current->page;
+ for (depth = 2, pindex = NULL;; ++depth) {
+ parent_pindex = pindex;
+restart_page: page = current->page;
if (page->type != WT_PAGE_COL_INT)
break;
@@ -51,8 +53,19 @@ restart: page = current->page;
descent = pindex->index[base - 1];
/* Fast path appends. */
- if (recno >= descent->key.recno)
+ if (recno >= descent->key.recno) {
+ /*
+ * If on the last slot (the key is larger than any key
+ * on the page), check for an internal page split race.
+ */
+ if (parent_pindex != NULL &&
+ __wt_split_intl_race(
+ session, current->home, parent_pindex)) {
+ WT_RET(__wt_page_release(session, current, 0));
+ goto restart_root;
+ }
goto descend;
+ }
/* Binary search of internal pages. */
for (base = 0,
@@ -90,15 +103,13 @@ descend: /*
* page; otherwise return on error, the swap call ensures we're
* holding nothing on failure.
*/
- switch (ret = __wt_page_swap(session, current, descent, 0)) {
- case 0:
+ if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) {
current = descent;
- break;
- case WT_RESTART:
- goto restart;
- default:
- return (ret);
+ continue;
}
+ if (ret == WT_RESTART)
+ goto restart_page;
+ return (ret);
}
/* Track how deep the tree gets. */
diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c
index 7b21f1e40bb..d2d8a4640ca 100644
--- a/src/btree/row_srch.c
+++ b/src/btree/row_srch.c
@@ -144,7 +144,7 @@ __wt_row_search(WT_SESSION_IMPL *session,
WT_DECL_RET;
WT_ITEM *item;
WT_PAGE *page;
- WT_PAGE_INDEX *pindex;
+ WT_PAGE_INDEX *pindex, *parent_pindex;
WT_REF *current, *descent;
WT_ROW *rip;
size_t match, skiphigh, skiplow;
@@ -155,16 +155,16 @@ __wt_row_search(WT_SESSION_IMPL *session,
btree = S2BT(session);
collator = btree->collator;
item = cbt->tmp;
+ current = NULL;
__cursor_pos_clear(cbt);
/*
- * The row-store search routine uses a different comparison API.
- * The assumption is we're comparing more than a few keys with
- * matching prefixes, and it's a win to avoid the memory fetches
- * by skipping over those prefixes. That's done by tracking the
- * length of the prefix match for the lowest and highest keys we
- * compare as we descend the tree.
+ * In some cases we expect we're comparing more than a few keys with
+ * matching prefixes, so it's faster to avoid the memory fetches by
+ * skipping over those prefixes. That's done by tracking the length of
+ * the prefix match for the lowest and highest keys we compare as we
+ * descend the tree.
*/
skiphigh = skiplow = 0;
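/*
 * Editor's sketch, not part of the patch: the prefix-skipping comparison
 * described above. If the probe key already matched `*matchp` leading
 * bytes of both bounding keys, any key between them shares those bytes,
 * so the comparison can start past them (standalone illustration; the
 * real routine is __wt_compare in a skip-aware variant):
 */
#include <stddef.h>

static int
compare_skip(const unsigned char *a, size_t alen,
    const unsigned char *b, size_t blen, size_t *matchp)
{
	size_t i, len;

	len = alen < blen ? alen : blen;
	for (i = *matchp; i < len; ++i)		/* Start past known match */
		if (a[i] != b[i]) {
			*matchp = i;		/* Remember the new match */
			return (a[i] < b[i] ? -1 : 1);
		}
	*matchp = len;
	return (alen == blen ? 0 : (alen < blen ? -1 : 1));
}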
@@ -186,10 +186,11 @@ __wt_row_search(WT_SESSION_IMPL *session,
}
/* Search the internal pages of the tree. */
- cmp = -1;
+restart_root:
current = &btree->root;
- for (depth = 2;; ++depth) {
-restart: page = current->page;
+ for (depth = 2, pindex = NULL;; ++depth) {
+ parent_pindex = pindex;
+restart_page: page = current->page;
if (page->type != WT_PAGE_ROW_INT)
break;
@@ -211,7 +212,7 @@ restart: page = current->page;
WT_ERR(__wt_compare(
session, collator, srch_key, item, &cmp));
if (cmp >= 0)
- goto descend;
+ goto append;
/* A failed append check turns off append checks. */
append_check = false;
@@ -252,7 +253,26 @@ restart: page = current->page;
} else if (cmp == 0)
goto descend;
}
- else if (collator == NULL)
+ else if (collator == NULL) {
+ /*
+ * Reset the skipped prefix counts; we'd normally expect
+ * the parent's skipped prefix values to be larger than
+ * the child's values and so we'd only increase them as
+ * we walk down the tree (in other words, if we can skip
+ * N bytes on the parent, we can skip at least N bytes
+ * on the child). However, if a child internal page was
+ * split up into the parent, the child page's key space
+ * will have been truncated, and the values from the
+ * parent's search may be wrong for the child. We only
+ * need to reset the high count because the split-page
+ * algorithm truncates the end of the internal page's
+ * key space, the low count is still correct. We also
+ * don't need to clear either count when transitioning
+ * to a leaf page, a leaf page's key space can't change
+ * in flight.
+ */
+ skiphigh = 0;
+
for (; limit != 0; limit >>= 1) {
indx = base + (limit >> 1);
descent = pindex->index[indx];
@@ -271,7 +291,7 @@ restart: page = current->page;
else
goto descend;
}
- else
+ } else
for (; limit != 0; limit >>= 1) {
indx = base + (limit >> 1);
descent = pindex->index[indx];
@@ -288,9 +308,10 @@ restart: page = current->page;
}
/*
- * Set the slot to descend the tree: descent is already set if
- * there was an exact match on the page, otherwise, base is
- * the smallest index greater than key, possibly (last + 1).
+ * Set the slot to descend the tree: descent was already set if
+ * there was an exact match on the page, otherwise, base is the
+ * smallest index greater than key, possibly one past the last
+ * slot.
*/
descent = pindex->index[base - 1];
@@ -298,25 +319,41 @@ restart: page = current->page;
* If we end up somewhere other than the last slot, it's not a
* right-side descent.
*/
- if (pindex->entries != base - 1)
+ if (pindex->entries != base)
descend_right = false;
+ /*
+ * If on the last slot (the key is larger than any key on the
+ * page), check for an internal page split race.
+ */
+ if (pindex->entries == base) {
+append: if (parent_pindex != NULL &&
+ __wt_split_intl_race(
+ session, current->home, parent_pindex)) {
+ if ((ret = __wt_page_release(
+ session, current, 0)) != 0)
+ return (ret);
+
+ skiplow = skiphigh = 0;
+ goto restart_root;
+ }
+ }
+
descend: /*
* Swap the current page for the child page. If the page splits
* while we're retrieving it, restart the search in the current
* page; otherwise return on error, the swap call ensures we're
* holding nothing on failure.
*/
- switch (ret = __wt_page_swap(session, current, descent, 0)) {
- case 0:
+ if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) {
current = descent;
- break;
- case WT_RESTART:
+ continue;
+ }
+ if (ret == WT_RESTART) {
skiphigh = skiplow = 0;
- goto restart;
- default:
- return (ret);
+ goto restart_page;
}
+ return (ret);
}
/* Track how deep the tree gets. */
@@ -517,7 +554,7 @@ __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
__cursor_pos_clear(cbt);
-restart:
+restart_root:
/* Walk the internal pages of the tree. */
current = &btree->root;
for (;;) {
@@ -544,7 +581,7 @@ restart:
*/
if (ret == WT_RESTART &&
(ret = __wt_page_release(session, current, 0)) == 0)
- goto restart;
+ goto restart_root;
return (ret);
}
diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c
index aa14e9aadde..8d16f94c092 100644
--- a/src/conn/conn_cache_pool.c
+++ b/src/conn/conn_cache_pool.c
@@ -734,7 +734,7 @@ __wt_cache_pool_server(void *arg)
F_ISSET(cache, WT_CACHE_POOL_RUN)) {
if (cp->currently_used <= cp->size)
WT_ERR(__wt_cond_wait(session,
- cp->cache_pool_cond, 1000000));
+ cp->cache_pool_cond, WT_MILLION));
/*
* Re-check pool run flag - since we want to avoid getting the
diff --git a/src/conn/conn_ckpt.c b/src/conn/conn_ckpt.c
index 8f039e61654..b47e2550b23 100644
--- a/src/conn/conn_ckpt.c
+++ b/src/conn/conn_ckpt.c
@@ -31,7 +31,7 @@ __ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, bool *startp)
* Checkpoints based on log size also require logging be enabled.
*/
WT_RET(__wt_config_gets(session, cfg, "checkpoint.wait", &cval));
- conn->ckpt_usecs = (uint64_t)cval.val * 1000000;
+ conn->ckpt_usecs = (uint64_t)cval.val * WT_MILLION;
WT_RET(__wt_config_gets(session, cfg, "checkpoint.log_size", &cval));
conn->ckpt_logsize = (wt_off_t)cval.val;
diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c
index 007d4273e72..1d44d816467 100644
--- a/src/conn/conn_log.c
+++ b/src/conn/conn_log.c
@@ -538,8 +538,8 @@ restart:
while (i < WT_SLOT_POOL) {
save_i = i;
slot = &log->slot_pool[i++];
- WT_ASSERT(session, slot->slot_state != 0 ||
- slot->slot_release_lsn.file >= log->write_lsn.file);
+ WT_ASSERT(session, slot->slot_state != 0 ||
+ slot->slot_release_lsn.file >= log->write_lsn.file);
if (slot->slot_state != WT_LOG_SLOT_WRITTEN)
continue;
written[written_i].slot_index = save_i;
diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c
index ec3a630581a..455ec9514f0 100644
--- a/src/conn/conn_stat.c
+++ b/src/conn/conn_stat.c
@@ -83,7 +83,7 @@ __statlog_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp)
WT_RET(__wt_config_gets(session, cfg, "statistics_log.wait", &cval));
/* Only start the server if wait time is non-zero */
*runp = cval.val != 0;
- conn->stat_usecs = (uint64_t)cval.val * 1000000;
+ conn->stat_usecs = (uint64_t)cval.val * WT_MILLION;
WT_RET(__wt_config_gets(
session, cfg, "statistics_log.on_close", &cval));
diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c
index a8620ebaa99..b9b46f3211c 100644
--- a/src/conn/conn_sweep.c
+++ b/src/conn/conn_sweep.c
@@ -136,7 +136,8 @@ __sweep_expire(WT_SESSION_IMPL *session, time_t now)
!F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
dhandle->session_inuse != 0 ||
dhandle->timeofdeath == 0 ||
- now <= dhandle->timeofdeath + conn->sweep_idle_time)
+ difftime(now, dhandle->timeofdeath) <=
+ conn->sweep_idle_time)
continue;
WT_WITH_DHANDLE(session, dhandle,
@@ -276,8 +277,8 @@ __sweep_server(void *arg)
while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
F_ISSET(conn, WT_CONN_SERVER_SWEEP)) {
/* Wait until the next event. */
- WT_ERR(__wt_cond_wait(session, conn->sweep_cond,
- (uint64_t)conn->sweep_interval * WT_MILLION));
+ WT_ERR(__wt_cond_wait(session,
+ conn->sweep_cond, conn->sweep_interval * WT_MILLION));
WT_ERR(__wt_seconds(session, &now));
WT_STAT_FAST_CONN_INCR(session, dh_sweeps);
@@ -329,27 +330,25 @@ __wt_sweep_config(WT_SESSION_IMPL *session, const char *cfg[])
conn = S2C(session);
- /* Pull out the sweep configurations. */
- WT_RET(__wt_config_gets(session,
- cfg, "file_manager.close_idle_time", &cval));
- conn->sweep_idle_time = (time_t)cval.val;
-
- /* Non-zero sweep idle time is incompatible with in-memory */
- if (conn->sweep_idle_time != 0) {
- WT_RET(__wt_config_gets(session, cfg, "in_memory", &cval));
- if (cval.val != 0)
- WT_RET_MSG(session, EINVAL,
- "In memory configuration incompatible with "
- "non zero file_manager=(close_idle_time)");
+ /*
+ * A non-zero idle time is incompatible with in-memory, and the default
+ * is non-zero; set the in-memory configuration idle time to zero.
+ */
+ conn->sweep_idle_time = 0;
+ WT_RET(__wt_config_gets(session, cfg, "in_memory", &cval));
+ if (cval.val == 0) {
+ WT_RET(__wt_config_gets(session,
+ cfg, "file_manager.close_idle_time", &cval));
+ conn->sweep_idle_time = (uint64_t)cval.val;
}
WT_RET(__wt_config_gets(session,
cfg, "file_manager.close_scan_interval", &cval));
- conn->sweep_interval = (time_t)cval.val;
+ conn->sweep_interval = (uint64_t)cval.val;
WT_RET(__wt_config_gets(session,
cfg, "file_manager.close_handle_minimum", &cval));
- conn->sweep_handles_min = (u_int)cval.val;
+ conn->sweep_handles_min = (uint64_t)cval.val;
return (0);
}
diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox
index 3b227d00198..c40e764e2f6 100644
--- a/src/docs/upgrading.dox
+++ b/src/docs/upgrading.dox
@@ -11,6 +11,13 @@ was relying on this behavior, a connection will be opened with different
settings after upgrading, which could lead to errors or unexpected behavior.
</dd>
+<dt>Statistic change</dt>
+<dd>
+The statistic "pages split during eviction" was replaced. It has been
+replaced by a pair of statistics "internal pages split during eviction" and
+"leaf pages split during eviction".
+</dd>
+
<dt>Change to WT_CURSOR::insert</dt>
<dd>
The WT_CURSOR::insert method in this release has slightly different semantics
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index 18335d6fb5e..fa6c4f4313f 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -183,10 +183,10 @@ __evict_server(void *arg)
session, &conn->dhandle_lock)) == EBUSY &&
!F_ISSET(cache, WT_CACHE_CLEAR_WALKS);
spins++) {
- if (spins < 1000)
+ if (spins < WT_THOUSAND)
__wt_yield();
else
- __wt_sleep(0, 1000);
+ __wt_sleep(0, WT_THOUSAND);
}
/*
* If we gave up acquiring the lock, that indicates a
@@ -210,7 +210,7 @@ __evict_server(void *arg)
else {
/* After being stuck for 5 minutes, give up. */
WT_ERR(__wt_epoch(session, &now));
- if (WT_TIMEDIFF(now, stuck_ts) / WT_BILLION > 300) {
+ if (WT_TIMEDIFF_SEC(now, stuck_ts) > 300) {
__wt_errx(session,
"Cache stuck for too long, giving up");
(void)__wt_cache_dump(session, NULL);
@@ -601,7 +601,7 @@ __evict_pass(WT_SESSION_IMPL *session)
* that can free space in cache, such as LSM discarding
* handles.
*/
- __wt_sleep(0, 1000 * (uint64_t)loop);
+ __wt_sleep(0, WT_THOUSAND * (uint64_t)loop);
if (loop == 100) {
/*
* Mark the cache as stuck if we need space
@@ -992,10 +992,10 @@ retry: while (slot < max_entries && ret == 0) {
session, &conn->dhandle_lock)) == EBUSY &&
!F_ISSET(cache, WT_CACHE_CLEAR_WALKS);
spins++) {
- if (spins < 1000)
+ if (spins < WT_THOUSAND)
__wt_yield();
else
- __wt_sleep(0, 1000);
+ __wt_sleep(0, WT_THOUSAND);
}
if (ret != 0)
break;
diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c
index e49098e90db..94c969fa5bb 100644
--- a/src/evict/evict_page.c
+++ b/src/evict/evict_page.c
@@ -179,9 +179,17 @@ __evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
* something is busy, be sure that the page still ends up
* marked deleted.
*/
- if (ndeleted > pindex->entries / 10 && pindex->entries > 1 &&
- (ret = __wt_split_reverse(session, ref)) != EBUSY)
- return (ret);
+ if (ndeleted > pindex->entries / 10 && pindex->entries > 1) {
+ if ((ret = __wt_split_reverse(session, ref)) == 0)
+ return (0);
+ WT_RET_BUSY_OK(ret);
+
+ /*
+ * The child must be locked after a failed reverse
+ * split.
+ */
+ WT_ASSERT(session, ref->state == WT_REF_LOCKED);
+ }
}
WT_PUBLISH(ref->state, WT_REF_DELETED);
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 02819237c13..ae29dc68003 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -437,24 +437,10 @@ struct __wt_page {
uint32_t deleted_entries;
WT_REF **index;
} * volatile __index; /* Collated children */
-
- /*
- * When splitting to deepen the tree, track the number
- * of entries in the newly created parent, and how many
- * subsequent splits follow the initial set of entries.
- * If future splits into the page are generally after
- * the initial set of items, perform future deepening
- * splits in this page to optimize for an append-style
- * workload.
- */
- uint32_t deepen_split_append;
- uint32_t deepen_split_last;
} intl;
#undef pg_intl_recno
#define pg_intl_recno u.intl.recno
#define pg_intl_parent_ref u.intl.parent_ref
-#define pg_intl_deepen_split_append u.intl.deepen_split_append
-#define pg_intl_deepen_split_last u.intl.deepen_split_last
/*
* Macros to copy/set the index because the name is obscured to ensure
@@ -581,7 +567,8 @@ struct __wt_page {
#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
#define WT_PAGE_OVERFLOW_KEYS 0x10 /* Page has overflow keys */
#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */
-#define WT_PAGE_UPDATE_IGNORE 0x40 /* Ignore updates on page discard */
+#define WT_PAGE_SPLIT_BLOCK 0x40 /* Split blocking eviction and splits */
+#define WT_PAGE_UPDATE_IGNORE 0x80 /* Ignore updates on page discard */
uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
uint8_t unused[2]; /* Unused padding */
diff --git a/src/include/btree.i b/src/include/btree.i
index 23e212eb772..a92d52e784a 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -1101,16 +1101,17 @@ __wt_page_can_evict(WT_SESSION_IMPL *session,
return (false);
/*
- * If the tree was deepened, there's a requirement that newly created
- * internal pages not be evicted until all threads are known to have
- * exited the original page index array, because evicting an internal
- * page discards its WT_REF array, and a thread traversing the original
- * page index array might see a freed WT_REF. During the split we set
- * a transaction value, once that's globally visible, we know we can
- * evict the created page.
+ * If a split created new internal pages, those newly created internal
+ * pages cannot be evicted until all threads are known to have exited
+ * the original parent page's index, because evicting an internal page
+ * discards its WT_REF array, and a thread traversing the original
+ * parent page index might see a freed WT_REF. During the split we set
+	 * a transaction value; we can evict the created page as soon as that
+ * transaction value is globally visible.
*/
if (check_splits && WT_PAGE_IS_INTERNAL(page) &&
- !__wt_txn_visible_all(session, mod->mod_split_txn))
+ (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK) ||
+ !__wt_txn_visible_all(session, mod->mod_split_txn)))
return (false);
/*
@@ -1374,3 +1375,34 @@ __wt_btree_lsm_over_size(WT_SESSION_IMPL *session, uint64_t maxsize)
return (child->memory_footprint > maxsize);
}
+
+/*
+ * __wt_split_intl_race --
+ * Return if we raced with an internal page split when descending the tree.
+ */
+static inline bool
+__wt_split_intl_race(
+ WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE_INDEX *saved_pindex)
+{
+ WT_PAGE_INDEX *pindex;
+
+ /*
+ * A place to hang this comment...
+ *
+ * There's a page-split race when we walk the tree: if we're splitting
+ * an internal page into its parent, we update the parent's page index
+ * and then update the page being split, and it's not an atomic update.
+ * A thread could read the parent page's original page index, and then
+ * read the page's replacement index. Because internal page splits work
+ * by replacing the original page with the initial part of the original
+ * page, the result of this race is we will have a key that's past the
+ * end of the current page, and the parent's page index will have moved.
+ *
+ * It's also possible a thread could read the parent page's replacement
+ * page index, and then read the page's original index. Because internal
+ * splits work by truncating the original page, the original page's old
+ * content is compatible, this isn't a problem and we ignore this race.
+ */
+ WT_INTL_INDEX_GET(session, parent, pindex);
+ return (pindex != saved_pindex);
+}
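/*
 * Editor's sketch, not part of the patch: the shape of the check above
 * in a reader -- snapshot the index pointer before searching, and if a
 * split swapped in a replacement while we sat on the last slot, restart
 * from the root. Standalone C11 illustration with hypothetical types:
 */
#include <stdatomic.h>
#include <stdbool.h>

struct page_index { int entries; };

struct page {
	_Atomic(struct page_index *) index;	/* Swapped on split */
};

static bool
raced_with_split(struct page *parent, struct page_index *saved)
{
	/*
	 * If the parent's index moved, our right-most position may be
	 * past the truncated page's key space: tell the caller to
	 * restart the descent.
	 */
	return (atomic_load(&parent->index) != saved);
}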
diff --git a/src/include/connection.h b/src/include/connection.h
index 35a83d7c50f..3e8d3705373 100644
--- a/src/include/connection.h
+++ b/src/include/connection.h
@@ -365,13 +365,13 @@ struct __wt_connection_impl {
WT_SESSION_IMPL *meta_ckpt_session;/* Metadata checkpoint session */
- WT_SESSION_IMPL *sweep_session; /* Handle sweep session */
- wt_thread_t sweep_tid; /* Handle sweep thread */
- int sweep_tid_set; /* Handle sweep thread set */
- WT_CONDVAR *sweep_cond; /* Handle sweep wait mutex */
- time_t sweep_idle_time;/* Handle sweep idle time */
- time_t sweep_interval;/* Handle sweep interval */
- u_int sweep_handles_min;/* Handle sweep minimum open */
+ WT_SESSION_IMPL *sweep_session; /* Handle sweep session */
+ wt_thread_t sweep_tid; /* Handle sweep thread */
+ int sweep_tid_set; /* Handle sweep thread set */
+ WT_CONDVAR *sweep_cond; /* Handle sweep wait mutex */
+ uint64_t sweep_idle_time; /* Handle sweep idle time */
+ uint64_t sweep_interval; /* Handle sweep interval */
+ uint64_t sweep_handles_min;/* Handle sweep minimum open */
/*
* Shared lookaside lock, session and cursor, used by threads accessing
diff --git a/src/include/extern.h b/src/include/extern.h
index 032b94b7040..a6ccc526f8c 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -155,9 +155,9 @@ extern void __wt_split_stash_discard(WT_SESSION_IMPL *session);
extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session);
extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp);
extern int __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref);
+extern int __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing);
extern int __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref);
-extern int __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing);
extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst);
extern int __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_CACHE_OP op);
extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]);
@@ -523,7 +523,6 @@ extern uint64_t __wt_strtouq(const char *nptr, char **endptr, int base);
extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg);
extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid);
extern void __wt_thread_id(char *buf, size_t buflen);
-extern int __wt_seconds(WT_SESSION_IMPL *session, time_t *timep);
extern int __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp);
extern void __wt_yield(void);
extern int __wt_ext_struct_pack(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, void *buffer, size_t size, const char *fmt, ...);
diff --git a/src/include/misc.h b/src/include/misc.h
index eca77214b47..e542baec642 100644
--- a/src/include/misc.h
+++ b/src/include/misc.h
@@ -13,6 +13,7 @@
#define WT_UNUSED(var) (void)(var)
/* Basic constants. */
+#define WT_THOUSAND (1000)
#define WT_MILLION (1000000)
#define WT_BILLION (1000000000)
diff --git a/src/include/misc.i b/src/include/misc.i
index 80096d0cf72..75068706b70 100644
--- a/src/include/misc.i
+++ b/src/include/misc.i
@@ -30,6 +30,22 @@ __wt_strdup(WT_SESSION_IMPL *session, const char *str, void *retp)
}
/*
+ * __wt_seconds --
+ * Return the seconds since the Epoch.
+ */
+static inline int
+__wt_seconds(WT_SESSION_IMPL *session, time_t *timep)
+{
+ struct timespec t;
+
+ WT_RET(__wt_epoch(session, &t));
+
+ *timep = t.tv_sec;
+
+ return (0);
+}
+
+/*
* __wt_verbose --
* Verbose message.
*/
diff --git a/src/include/mutex.i b/src/include/mutex.i
index 843c4ad9350..7eb042dd79f 100644
--- a/src/include/mutex.i
+++ b/src/include/mutex.i
@@ -18,7 +18,7 @@
/* Default to spinning 1000 times before yielding. */
#ifndef WT_SPIN_COUNT
-#define WT_SPIN_COUNT 1000
+#define WT_SPIN_COUNT WT_THOUSAND
#endif
/*
@@ -300,7 +300,7 @@ __wt_fair_lock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock)
* situation happens if there are more threads than cores in the
* system and we're thrashing on shared resources.
*/
- if (++pause_cnt < 1000)
+ if (++pause_cnt < WT_THOUSAND)
WT_PAUSE();
else
__wt_sleep(0, 10);
diff --git a/src/include/os.h b/src/include/os.h
index 4ba588111b8..d135fd9eb1f 100644
--- a/src/include/os.h
+++ b/src/include/os.h
@@ -65,9 +65,16 @@ typedef enum {
} \
} while (0)
-#define WT_TIMEDIFF(end, begin) \
- (1000000000 * (uint64_t)((end).tv_sec - (begin).tv_sec) + \
+#define WT_TIMEDIFF_NS(end, begin) \
+ (WT_BILLION * (uint64_t)((end).tv_sec - (begin).tv_sec) + \
(uint64_t)(end).tv_nsec - (uint64_t)(begin).tv_nsec)
+#define WT_TIMEDIFF_US(end, begin) \
+ (WT_TIMEDIFF_NS((end), (begin)) / WT_THOUSAND)
+#define WT_TIMEDIFF_MS(end, begin) \
+ (WT_TIMEDIFF_NS((end), (begin)) / WT_MILLION)
+#define WT_TIMEDIFF_SEC(end, begin) \
+ (WT_TIMEDIFF_NS((end), (begin)) / WT_BILLION)
+
#define WT_TIMECMP(t1, t2) \
((t1).tv_sec < (t2).tv_sec ? -1 : \
(t1).tv_sec == (t2).tv_sec ? \
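/*
 * Editor's sketch, not part of the patch: typical use of the new
 * WT_TIMEDIFF_* macros -- capture two timespecs, then convert the delta
 * once, in the unit needed. A standalone equivalent:
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define TIMEDIFF_NS(end, begin)						\
	(1000000000ULL * (uint64_t)((end).tv_sec - (begin).tv_sec) +	\
	    (uint64_t)(end).tv_nsec - (uint64_t)(begin).tv_nsec)

int
main(void)
{
	struct timespec start, stop;

	(void)clock_gettime(CLOCK_MONOTONIC, &start);
	/* ... the work being timed ... */
	(void)clock_gettime(CLOCK_MONOTONIC, &stop);

	printf("elapsed: %llu us\n",
	    (unsigned long long)(TIMEDIFF_NS(stop, start) / 1000));
	return (0);
}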
diff --git a/src/include/stat.h b/src/include/stat.h
index 2a8552fded1..0ad872d11da 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -272,7 +272,8 @@ struct __wt_connection_stats {
int64_t cache_eviction_server_evicting;
int64_t cache_eviction_server_not_evicting;
int64_t cache_eviction_slow;
- int64_t cache_eviction_split;
+ int64_t cache_eviction_split_internal;
+ int64_t cache_eviction_split_leaf;
int64_t cache_eviction_walk;
int64_t cache_eviction_worker_evicting;
int64_t cache_inmem_split;
@@ -434,7 +435,8 @@ struct __wt_dsrc_stats {
int64_t cache_eviction_fail;
int64_t cache_eviction_hazard;
int64_t cache_eviction_internal;
- int64_t cache_eviction_split;
+ int64_t cache_eviction_split_internal;
+ int64_t cache_eviction_split_leaf;
int64_t cache_inmem_split;
int64_t cache_inmem_splittable;
int64_t cache_overflow_value;
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index 8503f5918e9..1b8c345e32b 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -3713,228 +3713,230 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1043
/*! cache: eviction server unable to reach eviction goal */
#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1044
-/*! cache: pages split during eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_SPLIT 1045
+/*! cache: internal pages split during eviction */
+#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1045
+/*! cache: leaf pages split during eviction */
+#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1046
/*! cache: pages walked for eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_WALK 1046
+#define WT_STAT_CONN_CACHE_EVICTION_WALK 1047
/*! cache: eviction worker thread evicting pages */
-#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1047
+#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1048
/*! cache: in-memory page splits */
-#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1048
+#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1049
/*! cache: in-memory page passed criteria to be split */
-#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1049
+#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1050
/*! cache: lookaside table insert calls */
-#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1050
+#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1051
/*! cache: lookaside table remove calls */
-#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1051
+#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1052
/*! cache: percentage overhead */
-#define WT_STAT_CONN_CACHE_OVERHEAD 1052
+#define WT_STAT_CONN_CACHE_OVERHEAD 1053
/*! cache: tracked dirty pages in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1053
+#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1054
/*! cache: pages currently held in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_INUSE 1054
+#define WT_STAT_CONN_CACHE_PAGES_INUSE 1055
/*! cache: pages read into cache */
-#define WT_STAT_CONN_CACHE_READ 1055
+#define WT_STAT_CONN_CACHE_READ 1056
/*! cache: pages read into cache requiring lookaside entries */
-#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1056
+#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1057
/*! cache: pages written from cache */
-#define WT_STAT_CONN_CACHE_WRITE 1057
+#define WT_STAT_CONN_CACHE_WRITE 1058
/*! cache: page written requiring lookaside records */
-#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1058
+#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1059
/*! cache: pages written requiring in-memory restoration */
-#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1059
+#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1060
/*! connection: pthread mutex condition wait calls */
-#define WT_STAT_CONN_COND_WAIT 1060
+#define WT_STAT_CONN_COND_WAIT 1061
/*! cursor: cursor create calls */
-#define WT_STAT_CONN_CURSOR_CREATE 1061
+#define WT_STAT_CONN_CURSOR_CREATE 1062
/*! cursor: cursor insert calls */
-#define WT_STAT_CONN_CURSOR_INSERT 1062
+#define WT_STAT_CONN_CURSOR_INSERT 1063
/*! cursor: cursor next calls */
-#define WT_STAT_CONN_CURSOR_NEXT 1063
+#define WT_STAT_CONN_CURSOR_NEXT 1064
/*! cursor: cursor prev calls */
-#define WT_STAT_CONN_CURSOR_PREV 1064
+#define WT_STAT_CONN_CURSOR_PREV 1065
/*! cursor: cursor remove calls */
-#define WT_STAT_CONN_CURSOR_REMOVE 1065
+#define WT_STAT_CONN_CURSOR_REMOVE 1066
/*! cursor: cursor reset calls */
-#define WT_STAT_CONN_CURSOR_RESET 1066
+#define WT_STAT_CONN_CURSOR_RESET 1067
/*! cursor: cursor restarted searches */
-#define WT_STAT_CONN_CURSOR_RESTART 1067
+#define WT_STAT_CONN_CURSOR_RESTART 1068
/*! cursor: cursor search calls */
-#define WT_STAT_CONN_CURSOR_SEARCH 1068
+#define WT_STAT_CONN_CURSOR_SEARCH 1069
/*! cursor: cursor search near calls */
-#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1069
+#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1070
/*! cursor: truncate calls */
-#define WT_STAT_CONN_CURSOR_TRUNCATE 1070
+#define WT_STAT_CONN_CURSOR_TRUNCATE 1071
/*! cursor: cursor update calls */
-#define WT_STAT_CONN_CURSOR_UPDATE 1071
+#define WT_STAT_CONN_CURSOR_UPDATE 1072
/*! data-handle: connection data handles currently active */
-#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1072
+#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1073
/*! data-handle: session dhandles swept */
-#define WT_STAT_CONN_DH_SESSION_HANDLES 1073
+#define WT_STAT_CONN_DH_SESSION_HANDLES 1074
/*! data-handle: session sweep attempts */
-#define WT_STAT_CONN_DH_SESSION_SWEEPS 1074
+#define WT_STAT_CONN_DH_SESSION_SWEEPS 1075
/*! data-handle: connection sweep dhandles closed */
-#define WT_STAT_CONN_DH_SWEEP_CLOSE 1075
+#define WT_STAT_CONN_DH_SWEEP_CLOSE 1076
/*! data-handle: connection sweep candidate became referenced */
-#define WT_STAT_CONN_DH_SWEEP_REF 1076
+#define WT_STAT_CONN_DH_SWEEP_REF 1077
/*! data-handle: connection sweep dhandles removed from hash list */
-#define WT_STAT_CONN_DH_SWEEP_REMOVE 1077
+#define WT_STAT_CONN_DH_SWEEP_REMOVE 1078
/*! data-handle: connection sweep time-of-death sets */
-#define WT_STAT_CONN_DH_SWEEP_TOD 1078
+#define WT_STAT_CONN_DH_SWEEP_TOD 1079
/*! data-handle: connection sweeps */
-#define WT_STAT_CONN_DH_SWEEPS 1079
+#define WT_STAT_CONN_DH_SWEEPS 1080
/*! connection: files currently open */
-#define WT_STAT_CONN_FILE_OPEN 1080
+#define WT_STAT_CONN_FILE_OPEN 1081
/*! log: total log buffer size */
-#define WT_STAT_CONN_LOG_BUFFER_SIZE 1081
+#define WT_STAT_CONN_LOG_BUFFER_SIZE 1082
/*! log: log bytes of payload data */
-#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1082
+#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1083
/*! log: log bytes written */
-#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1083
+#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1084
/*! log: yields waiting for previous log file close */
-#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1084
+#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1085
/*! log: total size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_LEN 1085
+#define WT_STAT_CONN_LOG_COMPRESS_LEN 1086
/*! log: total in-memory size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_MEM 1086
+#define WT_STAT_CONN_LOG_COMPRESS_MEM 1087
/*! log: log records too small to compress */
-#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1087
+#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1088
/*! log: log records not compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1088
+#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1089
/*! log: log records compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1089
+#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1090
/*! log: log flush operations */
-#define WT_STAT_CONN_LOG_FLUSH 1090
+#define WT_STAT_CONN_LOG_FLUSH 1091
/*! log: maximum log file size */
-#define WT_STAT_CONN_LOG_MAX_FILESIZE 1091
+#define WT_STAT_CONN_LOG_MAX_FILESIZE 1092
/*! log: pre-allocated log files prepared */
-#define WT_STAT_CONN_LOG_PREALLOC_FILES 1092
+#define WT_STAT_CONN_LOG_PREALLOC_FILES 1093
/*! log: number of pre-allocated log files to create */
-#define WT_STAT_CONN_LOG_PREALLOC_MAX 1093
+#define WT_STAT_CONN_LOG_PREALLOC_MAX 1094
/*! log: pre-allocated log files not ready and missed */
-#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1094
+#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1095
/*! log: pre-allocated log files used */
-#define WT_STAT_CONN_LOG_PREALLOC_USED 1095
+#define WT_STAT_CONN_LOG_PREALLOC_USED 1096
/*! log: log release advances write LSN */
-#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1096
+#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1097
/*! log: records processed by log scan */
-#define WT_STAT_CONN_LOG_SCAN_RECORDS 1097
+#define WT_STAT_CONN_LOG_SCAN_RECORDS 1098
/*! log: log scan records requiring two reads */
-#define WT_STAT_CONN_LOG_SCAN_REREADS 1098
+#define WT_STAT_CONN_LOG_SCAN_REREADS 1099
/*! log: log scan operations */
-#define WT_STAT_CONN_LOG_SCANS 1099
+#define WT_STAT_CONN_LOG_SCANS 1100
/*! log: consolidated slot closures */
-#define WT_STAT_CONN_LOG_SLOT_CLOSES 1100
+#define WT_STAT_CONN_LOG_SLOT_CLOSES 1101
/*! log: written slots coalesced */
-#define WT_STAT_CONN_LOG_SLOT_COALESCED 1101
+#define WT_STAT_CONN_LOG_SLOT_COALESCED 1102
/*! log: logging bytes consolidated */
-#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1102
+#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1103
/*! log: consolidated slot joins */
-#define WT_STAT_CONN_LOG_SLOT_JOINS 1103
+#define WT_STAT_CONN_LOG_SLOT_JOINS 1104
/*! log: consolidated slot join races */
-#define WT_STAT_CONN_LOG_SLOT_RACES 1104
+#define WT_STAT_CONN_LOG_SLOT_RACES 1105
/*! log: busy returns attempting to switch slots */
-#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1105
+#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1106
/*! log: consolidated slot join transitions */
-#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1106
+#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1107
/*! log: consolidated slot unbuffered writes */
-#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1107
+#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1108
/*! log: log sync operations */
-#define WT_STAT_CONN_LOG_SYNC 1108
+#define WT_STAT_CONN_LOG_SYNC 1109
/*! log: log sync_dir operations */
-#define WT_STAT_CONN_LOG_SYNC_DIR 1109
+#define WT_STAT_CONN_LOG_SYNC_DIR 1110
/*! log: log server thread advances write LSN */
-#define WT_STAT_CONN_LOG_WRITE_LSN 1110
+#define WT_STAT_CONN_LOG_WRITE_LSN 1111
/*! log: log write operations */
-#define WT_STAT_CONN_LOG_WRITES 1111
+#define WT_STAT_CONN_LOG_WRITES 1112
/*! log: log files manually zero-filled */
-#define WT_STAT_CONN_LOG_ZERO_FILLS 1112
+#define WT_STAT_CONN_LOG_ZERO_FILLS 1113
/*! LSM: sleep for LSM checkpoint throttle */
-#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1113
+#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1114
/*! LSM: sleep for LSM merge throttle */
-#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1114
+#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1115
/*! LSM: rows merged in an LSM tree */
-#define WT_STAT_CONN_LSM_ROWS_MERGED 1115
+#define WT_STAT_CONN_LSM_ROWS_MERGED 1116
/*! LSM: application work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1116
+#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1117
/*! LSM: merge work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1117
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1118
/*! LSM: tree queue hit maximum */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1118
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1119
/*! LSM: switch work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1119
+#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1120
/*! LSM: tree maintenance operations scheduled */
-#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1120
+#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1121
/*! LSM: tree maintenance operations discarded */
-#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1121
+#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1122
/*! LSM: tree maintenance operations executed */
-#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1122
+#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1123
/*! connection: memory allocations */
-#define WT_STAT_CONN_MEMORY_ALLOCATION 1123
+#define WT_STAT_CONN_MEMORY_ALLOCATION 1124
/*! connection: memory frees */
-#define WT_STAT_CONN_MEMORY_FREE 1124
+#define WT_STAT_CONN_MEMORY_FREE 1125
/*! connection: memory re-allocations */
-#define WT_STAT_CONN_MEMORY_GROW 1125
+#define WT_STAT_CONN_MEMORY_GROW 1126
/*! thread-yield: page acquire busy blocked */
-#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1126
+#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1127
/*! thread-yield: page acquire eviction blocked */
-#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1127
+#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1128
/*! thread-yield: page acquire locked blocked */
-#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1128
+#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1129
/*! thread-yield: page acquire read blocked */
-#define WT_STAT_CONN_PAGE_READ_BLOCKED 1129
+#define WT_STAT_CONN_PAGE_READ_BLOCKED 1130
/*! thread-yield: page acquire time sleeping (usecs) */
-#define WT_STAT_CONN_PAGE_SLEEP 1130
+#define WT_STAT_CONN_PAGE_SLEEP 1131
/*! connection: total read I/Os */
-#define WT_STAT_CONN_READ_IO 1131
+#define WT_STAT_CONN_READ_IO 1132
/*! reconciliation: pages deleted */
-#define WT_STAT_CONN_REC_PAGE_DELETE 1132
+#define WT_STAT_CONN_REC_PAGE_DELETE 1133
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_CONN_REC_PAGES 1133
+#define WT_STAT_CONN_REC_PAGES 1134
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_CONN_REC_PAGES_EVICTION 1134
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1135
/*! reconciliation: split bytes currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1135
+#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1136
/*! reconciliation: split objects currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1136
+#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1137
/*! connection: pthread mutex shared lock read-lock calls */
-#define WT_STAT_CONN_RWLOCK_READ 1137
+#define WT_STAT_CONN_RWLOCK_READ 1138
/*! connection: pthread mutex shared lock write-lock calls */
-#define WT_STAT_CONN_RWLOCK_WRITE 1138
+#define WT_STAT_CONN_RWLOCK_WRITE 1139
/*! session: open cursor count */
-#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1139
+#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1140
/*! session: open session count */
-#define WT_STAT_CONN_SESSION_OPEN 1140
+#define WT_STAT_CONN_SESSION_OPEN 1141
/*! transaction: transaction begins */
-#define WT_STAT_CONN_TXN_BEGIN 1141
+#define WT_STAT_CONN_TXN_BEGIN 1142
/*! transaction: transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1142
+#define WT_STAT_CONN_TXN_CHECKPOINT 1143
/*! transaction: transaction checkpoint generation */
-#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1143
+#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1144
/*! transaction: transaction checkpoint currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1144
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1145
/*! transaction: transaction checkpoint max time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1145
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1146
/*! transaction: transaction checkpoint min time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1146
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1147
/*! transaction: transaction checkpoint most recent time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1147
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1148
/*! transaction: transaction checkpoint total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1148
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1149
/*! transaction: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1149
+#define WT_STAT_CONN_TXN_COMMIT 1150
/*! transaction: transaction failures due to cache overflow */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1150
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1151
/*! transaction: transaction range of IDs currently pinned by a checkpoint */
-#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1151
+#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1152
/*! transaction: transaction range of IDs currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_RANGE 1152
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1153
/*! transaction: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1153
+#define WT_STAT_CONN_TXN_ROLLBACK 1154
/*! transaction: transaction sync calls */
-#define WT_STAT_CONN_TXN_SYNC 1154
+#define WT_STAT_CONN_TXN_SYNC 1155
/*! connection: total write I/Os */
-#define WT_STAT_CONN_WRITE_IO 1155
+#define WT_STAT_CONN_WRITE_IO 1156
/*!
* @}
@@ -4030,114 +4032,116 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2042
/*! cache: internal pages evicted */
#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2043
-/*! cache: pages split during eviction */
-#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT 2044
+/*! cache: internal pages split during eviction */
+#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT_INTERNAL 2044
+/*! cache: leaf pages split during eviction */
+#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT_LEAF 2045
/*! cache: in-memory page splits */
-#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2045
+#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2046
/*! cache: in-memory page passed criteria to be split */
-#define WT_STAT_DSRC_CACHE_INMEM_SPLITTABLE 2046
+#define WT_STAT_DSRC_CACHE_INMEM_SPLITTABLE 2047
/*! cache: overflow values cached in memory */
-#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2047
+#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2048
/*! cache: pages read into cache */
-#define WT_STAT_DSRC_CACHE_READ 2048
+#define WT_STAT_DSRC_CACHE_READ 2049
/*! cache: pages read into cache requiring lookaside entries */
-#define WT_STAT_DSRC_CACHE_READ_LOOKASIDE 2049
+#define WT_STAT_DSRC_CACHE_READ_LOOKASIDE 2050
/*! cache: overflow pages read into cache */
-#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2050
+#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2051
/*! cache: pages written from cache */
-#define WT_STAT_DSRC_CACHE_WRITE 2051
+#define WT_STAT_DSRC_CACHE_WRITE 2052
/*! cache: page written requiring lookaside records */
-#define WT_STAT_DSRC_CACHE_WRITE_LOOKASIDE 2052
+#define WT_STAT_DSRC_CACHE_WRITE_LOOKASIDE 2053
/*! cache: pages written requiring in-memory restoration */
-#define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2053
+#define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2054
/*! compression: raw compression call failed, no additional data available */
-#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2054
+#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2055
/*! compression: raw compression call failed, additional data available */
-#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2055
+#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2056
/*! compression: raw compression call succeeded */
-#define WT_STAT_DSRC_COMPRESS_RAW_OK 2056
+#define WT_STAT_DSRC_COMPRESS_RAW_OK 2057
/*! compression: compressed pages read */
-#define WT_STAT_DSRC_COMPRESS_READ 2057
+#define WT_STAT_DSRC_COMPRESS_READ 2058
/*! compression: compressed pages written */
-#define WT_STAT_DSRC_COMPRESS_WRITE 2058
+#define WT_STAT_DSRC_COMPRESS_WRITE 2059
/*! compression: page written failed to compress */
-#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2059
+#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2060
/*! compression: page written was too small to compress */
-#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2060
+#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2061
/*! cursor: create calls */
-#define WT_STAT_DSRC_CURSOR_CREATE 2061
+#define WT_STAT_DSRC_CURSOR_CREATE 2062
/*! cursor: insert calls */
-#define WT_STAT_DSRC_CURSOR_INSERT 2062
+#define WT_STAT_DSRC_CURSOR_INSERT 2063
/*! cursor: bulk-loaded cursor-insert calls */
-#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2063
+#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2064
/*! cursor: cursor-insert key and value bytes inserted */
-#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2064
+#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2065
/*! cursor: next calls */
-#define WT_STAT_DSRC_CURSOR_NEXT 2065
+#define WT_STAT_DSRC_CURSOR_NEXT 2066
/*! cursor: prev calls */
-#define WT_STAT_DSRC_CURSOR_PREV 2066
+#define WT_STAT_DSRC_CURSOR_PREV 2067
/*! cursor: remove calls */
-#define WT_STAT_DSRC_CURSOR_REMOVE 2067
+#define WT_STAT_DSRC_CURSOR_REMOVE 2068
/*! cursor: cursor-remove key bytes removed */
-#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2068
+#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2069
/*! cursor: reset calls */
-#define WT_STAT_DSRC_CURSOR_RESET 2069
+#define WT_STAT_DSRC_CURSOR_RESET 2070
/*! cursor: restarted searches */
-#define WT_STAT_DSRC_CURSOR_RESTART 2070
+#define WT_STAT_DSRC_CURSOR_RESTART 2071
/*! cursor: search calls */
-#define WT_STAT_DSRC_CURSOR_SEARCH 2071
+#define WT_STAT_DSRC_CURSOR_SEARCH 2072
/*! cursor: search near calls */
-#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2072
+#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2073
/*! cursor: truncate calls */
-#define WT_STAT_DSRC_CURSOR_TRUNCATE 2073
+#define WT_STAT_DSRC_CURSOR_TRUNCATE 2074
/*! cursor: update calls */
-#define WT_STAT_DSRC_CURSOR_UPDATE 2074
+#define WT_STAT_DSRC_CURSOR_UPDATE 2075
/*! cursor: cursor-update value bytes updated */
-#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2075
+#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2076
/*! LSM: sleep for LSM checkpoint throttle */
-#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2076
+#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2077
/*! LSM: chunks in the LSM tree */
-#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2077
+#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2078
/*! LSM: highest merge generation in the LSM tree */
-#define WT_STAT_DSRC_LSM_GENERATION_MAX 2078
+#define WT_STAT_DSRC_LSM_GENERATION_MAX 2079
/*! LSM: queries that could have benefited from a Bloom filter that did
* not exist */
-#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2079
+#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2080
/*! LSM: sleep for LSM merge throttle */
-#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2080
+#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2081
/*! reconciliation: dictionary matches */
-#define WT_STAT_DSRC_REC_DICTIONARY 2081
+#define WT_STAT_DSRC_REC_DICTIONARY 2082
/*! reconciliation: internal page multi-block writes */
-#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2082
+#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2083
/*! reconciliation: leaf page multi-block writes */
-#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2083
+#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2084
/*! reconciliation: maximum blocks required for a page */
-#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2084
+#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2085
/*! reconciliation: internal-page overflow keys */
-#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2085
+#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2086
/*! reconciliation: leaf-page overflow keys */
-#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2086
+#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2087
/*! reconciliation: overflow values written */
-#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2087
+#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2088
/*! reconciliation: pages deleted */
-#define WT_STAT_DSRC_REC_PAGE_DELETE 2088
+#define WT_STAT_DSRC_REC_PAGE_DELETE 2089
/*! reconciliation: page checksum matches */
-#define WT_STAT_DSRC_REC_PAGE_MATCH 2089
+#define WT_STAT_DSRC_REC_PAGE_MATCH 2090
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_DSRC_REC_PAGES 2090
+#define WT_STAT_DSRC_REC_PAGES 2091
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_DSRC_REC_PAGES_EVICTION 2091
+#define WT_STAT_DSRC_REC_PAGES_EVICTION 2092
/*! reconciliation: leaf page key bytes discarded using prefix compression */
-#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2092
+#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2093
/*! reconciliation: internal page key bytes discarded using suffix
* compression */
-#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2093
+#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2094
/*! session: object compaction */
-#define WT_STAT_DSRC_SESSION_COMPACT 2094
+#define WT_STAT_DSRC_SESSION_COMPACT 2095
/*! session: open cursor count */
-#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2095
+#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2096
/*! transaction: update conflicts */
-#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2096
+#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2097
/*! @} */
/*
* Statistics section: END
diff --git a/src/log/log.c b/src/log/log.c
index 44dc7dc30a7..3106094e7e3 100644
--- a/src/log/log.c
+++ b/src/log/log.c
@@ -1313,7 +1313,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep)
if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
__wt_spin_unlock(session, &log->log_slot_lock);
WT_ERR(__wt_cond_signal(session, conn->log_wrlsn_cond));
- if (++yield_count < 1000)
+ if (++yield_count < WT_THOUSAND)
__wt_yield();
else
ret = __wt_cond_wait(session, log->log_write_cond, 200);
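The WT_THOUSAND name these hunks substitute for the literal 1000 is new in this merge (note the one-line changes to src/include/misc.h and dist/s_define.list in the diffstat); WT_MILLION and WT_BILLION already existed. A minimal sketch of the presumed definition, not the verbatim hunk:

    /* Presumed one-line addition to src/include/misc.h (sketch). */
    #define	WT_THOUSAND	(1000)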
diff --git a/src/log/log_slot.c b/src/log/log_slot.c
index b3790412536..255551f99a4 100644
--- a/src/log/log_slot.c
+++ b/src/log/log_slot.c
@@ -380,7 +380,7 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
* There should almost always be a slot open.
*/
#ifdef HAVE_DIAGNOSTIC
- unbuf_force = (++log->write_calls % 1000) == 0;
+ unbuf_force = (++log->write_calls % WT_THOUSAND) == 0;
#endif
for (;;) {
WT_BARRIER();
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
index dbd6a105475..953698476ef 100644
--- a/src/lsm/lsm_cursor.c
+++ b/src/lsm/lsm_cursor.c
@@ -81,7 +81,7 @@ __wt_clsm_await_switch(WT_CURSOR_LSM *clsm)
lsm_tree->nchunks == 0 ||
clsm->dsk_gen == lsm_tree->dsk_gen;
++waited) {
- if (waited % 1000 == 0)
+ if (waited % WT_THOUSAND == 0)
WT_RET(__wt_lsm_manager_push_entry(
session, WT_LSM_WORK_SWITCH, 0, lsm_tree));
__wt_sleep(0, 10);
@@ -1530,6 +1530,10 @@ __wt_clsm_open(WT_SESSION_IMPL *session,
if (!WT_PREFIX_MATCH(uri, "lsm:"))
return (EINVAL);
+ if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
+ WT_RET_MSG(session, EINVAL,
+ "LSM trees not supported by in-memory configurations");
+
WT_RET(__wt_config_gets_def(session, cfg, "checkpoint", 0, &cval));
if (cval.len != 0)
WT_RET_MSG(session, EINVAL,
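With this check, LSM cursor opens (and, further down, LSM tree creation) are rejected for in-memory databases. A hypothetical caller would hit the new EINVAL path as sketched below; the URI and configuration strings are illustrative, only the in_memory/LSM combination comes from the diff:

    /* Sketch: attempting LSM against an in-memory connection. */
    #include <wiredtiger.h>

    int
    lsm_in_memory_check(void)
    {
    	WT_CONNECTION *conn;
    	WT_SESSION *session;
    	int ret;

    	if ((ret = wiredtiger_open(
    	    "WT_HOME", NULL, "create,in_memory=true", &conn)) != 0)
    		return (ret);
    	if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
    		return (ret);

    	/* Expected to fail with EINVAL after this change. */
    	ret = session->create(
    	    session, "lsm:bucket", "key_format=S,value_format=S");

    	(void)conn->close(conn, NULL);
    	return (ret);
    }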
diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c
index 1c5124c32af..d8cf36f2cc1 100644
--- a/src/lsm/lsm_manager.c
+++ b/src/lsm/lsm_manager.c
@@ -388,8 +388,7 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session)
continue;
WT_ERR(__wt_epoch(session, &now));
pushms = lsm_tree->work_push_ts.tv_sec == 0 ? 0 :
- WT_TIMEDIFF(
- now, lsm_tree->work_push_ts) / WT_MILLION;
+ WT_TIMEDIFF_MS(now, lsm_tree->work_push_ts);
fillms = 3 * lsm_tree->chunk_fill_ms;
if (fillms == 0)
fillms = 10000;
diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c
index dd1419fe67d..1a2608803e4 100644
--- a/src/lsm/lsm_merge.c
+++ b/src/lsm/lsm_merge.c
@@ -94,7 +94,7 @@ __lsm_merge_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
WT_RET(__wt_epoch(session, &now));
msec_since_last_merge =
- WT_TIMEDIFF(now, lsm_tree->merge_aggressive_ts) / WT_MILLION;
+ WT_TIMEDIFF_MS(now, lsm_tree->merge_aggressive_ts);
/*
* If there is no estimate for how long it's taking to fill chunks
@@ -457,7 +457,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
cfg[2] = NULL;
WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));
-#define LSM_MERGE_CHECK_INTERVAL 1000
+#define LSM_MERGE_CHECK_INTERVAL WT_THOUSAND
for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) {
if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c
index 30af051bbcf..0c3642e70e8 100644
--- a/src/lsm/lsm_tree.c
+++ b/src/lsm/lsm_tree.c
@@ -111,7 +111,7 @@ __lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
* other schema level operations will return EBUSY, even though
* we're dropping the schema lock here.
*/
- if (i % 1000 == 0) {
+ if (i % WT_THOUSAND == 0) {
WT_WITHOUT_LOCKS(session, ret =
__wt_lsm_manager_clear_tree(session, lsm_tree));
WT_RET(ret);
@@ -336,6 +336,11 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session,
}
WT_RET_NOTFOUND_OK(ret);
+ /* In-memory configurations don't make sense for LSM. */
+ if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
+ WT_RET_MSG(session, EINVAL,
+ "LSM trees not supported by in-memory configurations");
+
WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
if (WT_STRING_MATCH("r", cval.str, cval.len))
WT_RET_MSG(session, EINVAL,
@@ -747,7 +752,7 @@ __wt_lsm_tree_throttle(
WT_ASSERT(session,
WT_TIMECMP(last_chunk->create_ts, ondisk->create_ts) >= 0);
timediff =
- WT_TIMEDIFF(last_chunk->create_ts, ondisk->create_ts);
+ WT_TIMEDIFF_NS(last_chunk->create_ts, ondisk->create_ts);
lsm_tree->ckpt_throttle =
(in_memory - 2) * timediff / (20 * record_count);
@@ -783,8 +788,8 @@ __wt_lsm_tree_throttle(
}
/* Put an upper bound of 1s on both throttle calculations. */
- lsm_tree->ckpt_throttle = WT_MIN(1000000, lsm_tree->ckpt_throttle);
- lsm_tree->merge_throttle = WT_MIN(1000000, lsm_tree->merge_throttle);
+ lsm_tree->ckpt_throttle = WT_MIN(WT_MILLION, lsm_tree->ckpt_throttle);
+ lsm_tree->merge_throttle = WT_MIN(WT_MILLION, lsm_tree->merge_throttle);
/*
* Update our estimate of how long each in-memory chunk stays active.
@@ -798,15 +803,16 @@ __wt_lsm_tree_throttle(
WT_ASSERT(session, prev_chunk->generation == 0);
WT_ASSERT(session, WT_TIMECMP(
last_chunk->create_ts, prev_chunk->create_ts) >= 0);
- timediff =
- WT_TIMEDIFF(last_chunk->create_ts, prev_chunk->create_ts);
+ timediff = WT_TIMEDIFF_NS(
+ last_chunk->create_ts, prev_chunk->create_ts);
WT_ASSERT(session,
WT_TIMECMP(prev_chunk->create_ts, ondisk->create_ts) >= 0);
- oldtime = WT_TIMEDIFF(prev_chunk->create_ts, ondisk->create_ts);
+ oldtime = WT_TIMEDIFF_NS(
+ prev_chunk->create_ts, ondisk->create_ts);
if (timediff < 10 * oldtime)
lsm_tree->chunk_fill_ms =
(3 * lsm_tree->chunk_fill_ms +
- timediff / 1000000) / 4;
+ timediff / WT_MILLION) / 4;
}
}
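These WT_TIMEDIFF_NS/WT_TIMEDIFF_MS calls (and the WT_TIMEDIFF_US/WT_TIMEDIFF_SEC variants used elsewhere in the merge) replace hand-written divisions of the old nanosecond-granularity WT_TIMEDIFF. The definitions land in src/include/os.h per the diffstat; a reconstruction of what they presumably look like, all derived from a nanosecond difference of two struct timespec values:

    /* Sketch of the WT_TIMEDIFF_* family assumed by these hunks. */
    #define	WT_TIMEDIFF_NS(end, begin)					\
    	(WT_BILLION * (uint64_t)((end).tv_sec - (begin).tv_sec) +	\
    	(uint64_t)(end).tv_nsec - (uint64_t)(begin).tv_nsec)
    #define	WT_TIMEDIFF_US(end, begin)					\
    	(WT_TIMEDIFF_NS((end), (begin)) / WT_THOUSAND)
    #define	WT_TIMEDIFF_MS(end, begin)					\
    	(WT_TIMEDIFF_NS((end), (begin)) / WT_MILLION)
    #define	WT_TIMEDIFF_SEC(end, begin)					\
    	(WT_TIMEDIFF_NS((end), (begin)) / WT_BILLION)

Because the subtraction is done once in nanoseconds and only then divided, the unsigned arithmetic stays correct even when end.tv_nsec is smaller than begin.tv_nsec.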
diff --git a/src/os_posix/os_mtx_cond.c b/src/os_posix/os_mtx_cond.c
index fac2c06957d..d5fc86b648b 100644
--- a/src/os_posix/os_mtx_cond.c
+++ b/src/os_posix/os_mtx_cond.c
@@ -76,9 +76,9 @@ __wt_cond_wait_signal(
if (usecs > 0) {
WT_ERR(__wt_epoch(session, &ts));
ts.tv_sec += (time_t)
- (((uint64_t)ts.tv_nsec + 1000 * usecs) / WT_BILLION);
+ (((uint64_t)ts.tv_nsec + WT_THOUSAND * usecs) / WT_BILLION);
ts.tv_nsec = (long)
- (((uint64_t)ts.tv_nsec + 1000 * usecs) % WT_BILLION);
+ (((uint64_t)ts.tv_nsec + WT_THOUSAND * usecs) % WT_BILLION);
ret = pthread_cond_timedwait(&cond->cond, &cond->mtx, &ts);
} else
ret = pthread_cond_wait(&cond->cond, &cond->mtx);
diff --git a/src/os_posix/os_mtx_rw.c b/src/os_posix/os_mtx_rw.c
index d47ab197643..46f134feabb 100644
--- a/src/os_posix/os_mtx_rw.c
+++ b/src/os_posix/os_mtx_rw.c
@@ -201,7 +201,7 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
* Don't sleep long when waiting on a read lock, hopefully we're
* waiting on another read thread to increment the reader count.
*/
- if (++pause_cnt < 1000)
+ if (++pause_cnt < WT_THOUSAND)
WT_PAUSE();
else
__wt_sleep(0, 10);
@@ -300,7 +300,7 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
* situation happens if there are more threads than cores in the
* system and we're thrashing on shared resources.
*/
- if (++pause_cnt < 1000)
+ if (++pause_cnt < WT_THOUSAND)
WT_PAUSE();
else
__wt_sleep(0, 10);
diff --git a/src/os_posix/os_sleep.c b/src/os_posix/os_sleep.c
index f888e51bf7f..4e90edabc53 100644
--- a/src/os_posix/os_sleep.c
+++ b/src/os_posix/os_sleep.c
@@ -17,8 +17,8 @@ __wt_sleep(uint64_t seconds, uint64_t micro_seconds)
{
struct timeval t;
- t.tv_sec = (time_t)(seconds + micro_seconds / 1000000);
- t.tv_usec = (suseconds_t)(micro_seconds % 1000000);
+ t.tv_sec = (time_t)(seconds + micro_seconds / WT_MILLION);
+ t.tv_usec = (suseconds_t)(micro_seconds % WT_MILLION);
(void)select(0, NULL, NULL, NULL, &t);
}
diff --git a/src/os_posix/os_time.c b/src/os_posix/os_time.c
index c52772e77e1..c3052df62e7 100644
--- a/src/os_posix/os_time.c
+++ b/src/os_posix/os_time.c
@@ -9,22 +9,6 @@
#include "wt_internal.h"
/*
- * __wt_seconds --
- * Return the seconds since the Epoch.
- */
-int
-__wt_seconds(WT_SESSION_IMPL *session, time_t *timep)
-{
- struct timespec t;
-
- WT_RET(__wt_epoch(session, &t));
-
- *timep = t.tv_sec;
-
- return (0);
-}
-
-/*
* __wt_epoch --
* Return the time since the Epoch.
*/
@@ -44,7 +28,7 @@ __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp)
WT_SYSCALL_RETRY(gettimeofday(&v, NULL), ret);
if (ret == 0) {
tsp->tv_sec = v.tv_sec;
- tsp->tv_nsec = v.tv_usec * 1000;
+ tsp->tv_nsec = v.tv_usec * WT_THOUSAND;
return (0);
}
WT_RET_MSG(session, ret, "gettimeofday");
diff --git a/src/os_win/os_sleep.c b/src/os_win/os_sleep.c
index 484cf218f26..33e04c1d8a9 100644
--- a/src/os_win/os_sleep.c
+++ b/src/os_win/os_sleep.c
@@ -19,7 +19,7 @@ __wt_sleep(uint64_t seconds, uint64_t micro_seconds)
* If the caller wants a small pause, set to our
* smallest granularity.
*/
- if (seconds == 0 && micro_seconds < 1000)
- micro_seconds = 1000;
- Sleep(seconds * 1000 + micro_seconds / 1000);
+ if (seconds == 0 && micro_seconds < WT_THOUSAND)
+ micro_seconds = WT_THOUSAND;
+ Sleep(seconds * WT_THOUSAND + micro_seconds / WT_THOUSAND);
}
diff --git a/src/os_win/os_time.c b/src/os_win/os_time.c
index c51db118ce1..2292c317a64 100644
--- a/src/os_win/os_time.c
+++ b/src/os_win/os_time.c
@@ -9,22 +9,6 @@
#include "wt_internal.h"
/*
- * __wt_seconds --
- * Return the seconds since the Epoch.
- */
-int
-__wt_seconds(WT_SESSION_IMPL *session, time_t *timep)
-{
- struct timespec t;
-
- WT_RET(__wt_epoch(session, &t));
-
- *timep = t.tv_sec;
-
- return (0);
-}
-
-/*
* __wt_epoch --
* Return the time since the Epoch.
*/
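Both the POSIX and Windows copies of __wt_seconds are deleted in this merge; given the src/include/misc.i change in the diffstat, the function presumably becomes a single shared inline. A sketch under that assumption:

    /* Sketch: shared __wt_seconds, assumed to move to src/include/misc.i. */
    static inline int
    __wt_seconds(WT_SESSION_IMPL *session, time_t *timep)
    {
    	struct timespec t;

    	WT_RET(__wt_epoch(session, &t));

    	*timep = t.tv_sec;

    	return (0);
    }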
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index e57e18f4e87..6d53230e9e0 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -960,7 +960,7 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, bool destroy)
* than 10,000 boundary structure elements, discard the boundary array
* entirely and start over next time.
*/
- if (destroy || r->bnd_entries > 10 * 1000) {
+ if (destroy || r->bnd_entries > 10 * WT_THOUSAND) {
for (bnd = r->bnd, i = 0; i < r->bnd_entries; ++bnd, ++i) {
__wt_free(session, bnd->addr.addr);
__wt_free(session, bnd->disk_image);
diff --git a/src/session/session_api.c b/src/session/session_api.c
index f1599320675..7d718c38c4f 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -1010,7 +1010,7 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config)
while (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) > 0) {
WT_ERR(__wt_cond_signal(session, conn->log_file_cond));
WT_ERR(__wt_epoch(session, &now));
- waited_ms = WT_TIMEDIFF(now, start) / WT_MILLION;
+ waited_ms = WT_TIMEDIFF_MS(now, start);
if (forever || waited_ms < timeout_ms)
/*
* Note, we will wait an increasing amount of time
diff --git a/src/session/session_compact.c b/src/session/session_compact.c
index bd503cd7826..456fcd3ce03 100644
--- a/src/session/session_compact.c
+++ b/src/session/session_compact.c
@@ -133,8 +133,7 @@ __session_compact_check_timeout(
return (0);
WT_RET(__wt_epoch(session, &end));
- if (session->compact->max_time <
- WT_TIMEDIFF(end, begin) / WT_BILLION)
+ if (session->compact->max_time < WT_TIMEDIFF_SEC(end, begin))
WT_RET(ETIMEDOUT);
return (0);
}
diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c
index ec2f0921ef2..dd5094fb480 100644
--- a/src/session/session_dhandle.c
+++ b/src/session/session_dhandle.c
@@ -390,7 +390,7 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session)
* do it again.
*/
WT_RET(__wt_seconds(session, &now));
- if (now - session->last_sweep < conn->sweep_interval)
+ if (difftime(now, session->last_sweep) < conn->sweep_interval)
return (0);
session->last_sweep = now;
@@ -404,7 +404,8 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session)
dhandle->session_inuse == 0 &&
(WT_DHANDLE_INACTIVE(dhandle) ||
(dhandle->timeofdeath != 0 &&
- now - dhandle->timeofdeath > conn->sweep_idle_time))) {
+ difftime(now, dhandle->timeofdeath) >
+ conn->sweep_idle_time))) {
WT_STAT_FAST_CONN_INCR(session, dh_session_handles);
WT_ASSERT(session, !WT_IS_METADATA(dhandle));
__session_discard_dhandle(session, dhandle_cache);
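The sweep comparisons switch from raw time_t subtraction to difftime(), which returns the elapsed time as a double and so stays correct for whatever arithmetic type the platform chooses for time_t, including unsigned ones. The pattern in isolation, with hypothetical names:

    /* Portable "has the interval elapsed yet?" check for time_t values. */
    if (difftime(now, last) < (double)interval_secs)
    	return (0);	/* Too soon, skip the sweep. */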
diff --git a/src/support/err.c b/src/support/err.c
index c4bf4e8946a..de518cbf08b 100644
--- a/src/support/err.c
+++ b/src/support/err.c
@@ -199,7 +199,8 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error,
remain = WT_PTRDIFF(end, p);
wlen = (size_t)snprintf(p, remain,
"[%" PRIuMAX ":%" PRIuMAX "][%s]",
- (uintmax_t)ts.tv_sec, (uintmax_t)ts.tv_nsec / 1000, tid);
+ (uintmax_t)ts.tv_sec,
+ (uintmax_t)ts.tv_nsec / WT_THOUSAND, tid);
p = wlen >= remain ? end : p + wlen;
prefix_cnt = 1;
}
diff --git a/src/support/stat.c b/src/support/stat.c
index 7a84a7b39da..82377d843ee 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -47,7 +47,8 @@ static const char * const __stats_dsrc_desc[] = {
"cache: data source pages selected for eviction unable to be evicted",
"cache: hazard pointer blocked page eviction",
"cache: internal pages evicted",
- "cache: pages split during eviction",
+ "cache: internal pages split during eviction",
+ "cache: leaf pages split during eviction",
"cache: in-memory page splits",
"cache: in-memory page passed criteria to be split",
"cache: overflow values cached in memory",
@@ -164,6 +165,8 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats)
stats->cache_inmem_splittable = 0;
stats->cache_inmem_split = 0;
stats->cache_eviction_internal = 0;
+ stats->cache_eviction_split_internal = 0;
+ stats->cache_eviction_split_leaf = 0;
stats->cache_eviction_dirty = 0;
stats->cache_read_overflow = 0;
stats->cache_overflow_value = 0;
@@ -171,7 +174,6 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats)
stats->cache_write_lookaside = 0;
stats->cache_read = 0;
stats->cache_read_lookaside = 0;
- stats->cache_eviction_split = 0;
stats->cache_write = 0;
stats->cache_write_restore = 0;
stats->cache_eviction_clean = 0;
@@ -282,6 +284,9 @@ __wt_stat_dsrc_aggregate_single(
to->cache_inmem_splittable += from->cache_inmem_splittable;
to->cache_inmem_split += from->cache_inmem_split;
to->cache_eviction_internal += from->cache_eviction_internal;
+ to->cache_eviction_split_internal +=
+ from->cache_eviction_split_internal;
+ to->cache_eviction_split_leaf += from->cache_eviction_split_leaf;
to->cache_eviction_dirty += from->cache_eviction_dirty;
to->cache_read_overflow += from->cache_read_overflow;
to->cache_overflow_value += from->cache_overflow_value;
@@ -289,7 +294,6 @@ __wt_stat_dsrc_aggregate_single(
to->cache_write_lookaside += from->cache_write_lookaside;
to->cache_read += from->cache_read;
to->cache_read_lookaside += from->cache_read_lookaside;
- to->cache_eviction_split += from->cache_eviction_split;
to->cache_write += from->cache_write;
to->cache_write_restore += from->cache_write_restore;
to->cache_eviction_clean += from->cache_eviction_clean;
@@ -410,6 +414,10 @@ __wt_stat_dsrc_aggregate(
to->cache_inmem_split += WT_STAT_READ(from, cache_inmem_split);
to->cache_eviction_internal +=
WT_STAT_READ(from, cache_eviction_internal);
+ to->cache_eviction_split_internal +=
+ WT_STAT_READ(from, cache_eviction_split_internal);
+ to->cache_eviction_split_leaf +=
+ WT_STAT_READ(from, cache_eviction_split_leaf);
to->cache_eviction_dirty += WT_STAT_READ(from, cache_eviction_dirty);
to->cache_read_overflow += WT_STAT_READ(from, cache_read_overflow);
to->cache_overflow_value += WT_STAT_READ(from, cache_overflow_value);
@@ -419,7 +427,6 @@ __wt_stat_dsrc_aggregate(
WT_STAT_READ(from, cache_write_lookaside);
to->cache_read += WT_STAT_READ(from, cache_read);
to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside);
- to->cache_eviction_split += WT_STAT_READ(from, cache_eviction_split);
to->cache_write += WT_STAT_READ(from, cache_write);
to->cache_write_restore += WT_STAT_READ(from, cache_write_restore);
to->cache_eviction_clean += WT_STAT_READ(from, cache_eviction_clean);
@@ -533,7 +540,8 @@ static const char * const __stats_connection_desc[] = {
"cache: eviction server evicting pages",
"cache: eviction server populating queue, but not evicting pages",
"cache: eviction server unable to reach eviction goal",
- "cache: pages split during eviction",
+ "cache: internal pages split during eviction",
+ "cache: leaf pages split during eviction",
"cache: pages walked for eviction",
"cache: eviction worker thread evicting pages",
"cache: in-memory page splits",
@@ -707,6 +715,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->cache_inmem_splittable = 0;
stats->cache_inmem_split = 0;
stats->cache_eviction_internal = 0;
+ stats->cache_eviction_split_internal = 0;
+ stats->cache_eviction_split_leaf = 0;
stats->cache_lookaside_insert = 0;
stats->cache_lookaside_remove = 0;
/* not clearing cache_bytes_max */
@@ -721,7 +731,6 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->cache_read = 0;
stats->cache_read_lookaside = 0;
stats->cache_eviction_fail = 0;
- stats->cache_eviction_split = 0;
stats->cache_eviction_walk = 0;
stats->cache_write = 0;
stats->cache_write_restore = 0;
@@ -888,6 +897,10 @@ __wt_stat_connection_aggregate(
to->cache_inmem_split += WT_STAT_READ(from, cache_inmem_split);
to->cache_eviction_internal +=
WT_STAT_READ(from, cache_eviction_internal);
+ to->cache_eviction_split_internal +=
+ WT_STAT_READ(from, cache_eviction_split_internal);
+ to->cache_eviction_split_leaf +=
+ WT_STAT_READ(from, cache_eviction_split_leaf);
to->cache_lookaside_insert +=
WT_STAT_READ(from, cache_lookaside_insert);
to->cache_lookaside_remove +=
@@ -908,7 +921,6 @@ __wt_stat_connection_aggregate(
to->cache_read += WT_STAT_READ(from, cache_read);
to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside);
to->cache_eviction_fail += WT_STAT_READ(from, cache_eviction_fail);
- to->cache_eviction_split += WT_STAT_READ(from, cache_eviction_split);
to->cache_eviction_walk += WT_STAT_READ(from, cache_eviction_walk);
to->cache_write += WT_STAT_READ(from, cache_write);
to->cache_write_restore += WT_STAT_READ(from, cache_write_restore);
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index 066abc9ed0f..bc1537ca878 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -297,7 +297,7 @@ __checkpoint_stats(
/*
* Get time diff in milliseconds.
*/
- msec = WT_TIMEDIFF(*stop, *start) / WT_MILLION;
+ msec = WT_TIMEDIFF_MS(*stop, *start);
if (msec > conn->ckpt_time_max)
conn->ckpt_time_max = msec;
@@ -327,7 +327,7 @@ __checkpoint_verbose_track(WT_SESSION_IMPL *session,
/*
* Get time diff in milliseconds.
*/
- msec = WT_TIMEDIFF(stop, *start) / WT_MILLION;
+ msec = WT_TIMEDIFF_MS(stop, *start);
WT_RET(__wt_verbose(session,
WT_VERB_CHECKPOINT, "time: %" PRIu64 " us, gen: %" PRIu64
": Full database checkpoint %s",
diff --git a/test/format/bulk.c b/test/format/bulk.c
index 203043166a4..75a158af741 100644
--- a/test/format/bulk.c
+++ b/test/format/bulk.c
@@ -36,7 +36,8 @@ wts_load(void)
WT_ITEM key, value;
WT_SESSION *session;
uint8_t *keybuf, *valbuf;
- int is_bulk, ret;
+ bool is_bulk;
+ int ret;
conn = g.wts_conn;
keybuf = valbuf = NULL;
@@ -49,12 +50,23 @@ wts_load(void)
"=============== bulk load start ===============");
/*
- * Avoid bulk load with KVS (there's no bulk load support for a
- * data-source); avoid bulk load with a custom collator, because
- * the order of insertion will not match the collation order.
+ * No bulk load with data-sources.
+ *
+ * XXX
+ * No bulk load with in-memory configurations (currently, WiredTiger
+ * fails in the column-store case unless you specify the key).
+ *
+ * No bulk load with custom collators: the order of insertion will not
+ * match the collation order.
*/
- is_bulk = !g.c_reverse &&
- !DATASOURCE("kvsbdb") && !DATASOURCE("helium");
+ is_bulk = true;
+ if (DATASOURCE("kvsbdb") || DATASOURCE("helium"))
+ is_bulk = false;
+ if (g.c_in_memory)
+ is_bulk = false;
+ if (g.c_reverse)
+ is_bulk = false;
+
if ((ret = session->open_cursor(session, g.uri, NULL,
is_bulk ? "bulk" : NULL, &cursor)) != 0)
die(ret, "session.open_cursor");
diff --git a/test/format/config.c b/test/format/config.c
index 45310bf1dab..b9d0e437765 100644
--- a/test/format/config.c
+++ b/test/format/config.c
@@ -34,6 +34,7 @@ static void config_compression(const char *);
static void config_encryption(void);
static const char *config_file_type(u_int);
static CONFIG *config_find(const char *, size_t);
+static void config_in_memory(void);
static int config_is_perm(const char *);
static void config_isolation(void);
static void config_lrt(void);
@@ -56,6 +57,13 @@ config_setup(void)
config_clear();
/*
+ * Periodically run in-memory; don't do it on the first run, since all
+ * our smoke tests would hit it.
+ */
+ if (!config_is_perm("in_memory") && g.run_cnt % 20 == 19)
+ g.c_in_memory = 1;
+
+ /*
* Choose a data source type and a file type: they're interrelated (LSM
* trees are only compatible with row-store) and other items depend on
* them.
@@ -66,8 +74,11 @@ config_setup(void)
config_single("data_source=file", 0);
break;
case 2:
- config_single("data_source=lsm", 0);
- break;
+ if (!g.c_in_memory) {
+ config_single("data_source=lsm", 0);
+ break;
+ }
+ /* FALLTHROUGH */
case 3:
config_single("data_source=table", 0);
break;
@@ -147,6 +158,7 @@ config_setup(void)
config_compression("compression");
config_compression("logging_compression");
config_encryption();
+ config_in_memory();
config_isolation();
config_lrt();
@@ -301,6 +313,43 @@ config_encryption(void)
}
/*
+ * config_in_memory --
+ * In-memory configuration.
+ */
+static void
+config_in_memory(void)
+{
+ if (g.c_in_memory == 0)
+ return;
+
+ /* Turn off a lot of stuff. */
+ if (!config_is_perm("backups"))
+ g.c_backups = 0;
+ if (!config_is_perm("checkpoints"))
+ g.c_checkpoints = 0;
+ if (!config_is_perm("compression"))
+ g.c_compression = 0;
+ if (!config_is_perm("logging"))
+ g.c_logging = 0;
+ if (!config_is_perm("salvage"))
+ g.c_salvage = 0;
+ if (!config_is_perm("verify"))
+ g.c_verify = 0;
+
+ /*
+ * Ensure there is 250MB of cache per thread, and keep keys/values
+ * small: overflow items aren't an issue for in-memory configurations,
+ * and small items keep us from overflowing the cache.
+ */
+ if (!config_is_perm("cache"))
+ g.c_cache = g.c_threads * 250;
+ if (!config_is_perm("key_max"))
+ g.c_key_max = 64;
+ if (!config_is_perm("value_max"))
+ g.c_value_max = 128;
+}
+
+/*
* config_isolation --
* Isolation configuration.
*/
@@ -341,8 +390,8 @@ static void
config_lrt(void)
{
/*
- * The underlying engine doesn't support a lookaside file for
- * fixed-length column stores.
+ * WiredTiger doesn't support a lookaside file for fixed-length column
+ * stores.
*/
if (g.type == FIX) {
if (config_is_perm("long_running_txn"))
diff --git a/test/format/config.h b/test/format/config.h
index 7574cd38882..30fcf038439 100644
--- a/test/format/config.h
+++ b/test/format/config.h
@@ -70,9 +70,9 @@ static CONFIG c[] = {
"if LSM inserts are throttled", /* 90% */
C_BOOL, 90, 0, 0, &g.c_auto_throttle, NULL },
- { "firstfit",
- "if allocation is firstfit", /* 10% */
- C_BOOL, 10, 0, 0, &g.c_firstfit, NULL },
+ { "backups",
+ "if backups are enabled", /* 5% */
+ C_BOOL, 5, 0, 0, &g.c_backups, NULL },
{ "bitcnt",
"number of bits for fixed-length column-store files",
@@ -146,9 +146,9 @@ static CONFIG c[] = {
"type of store to create (fix | var | row)",
C_IGNORE|C_STRING, 1, 3, 3, NULL, &g.c_file_type },
- { "backups",
- "if backups are enabled", /* 5% */
- C_BOOL, 5, 0, 0, &g.c_backups, NULL },
+ { "firstfit",
+ "if allocation is firstfit", /* 10% */
+ C_BOOL, 10, 0, 0, &g.c_firstfit, NULL },
{ "huffman_key",
"if keys are huffman encoded", /* 20% */
@@ -158,6 +158,10 @@ static CONFIG c[] = {
"if values are huffman encoded", /* 20% */
C_BOOL, 20, 0, 0, &g.c_huffman_value, NULL },
+ { "in_memory",
+ "if in-memory configured",
+ C_IGNORE, 0, 0, 1, &g.c_in_memory, NULL },
+
{ "insert_pct",
"percent operations that are inserts",
0x0, 0, 45, 90, &g.c_insert_pct, NULL },
@@ -187,26 +191,26 @@ static CONFIG c[] = {
"minimum size of keys",
0x0, 10, 32, 256, &g.c_key_min, NULL },
- { "leak_memory",
- "if memory should be leaked on close",
- C_BOOL, 0, 0, 0, &g.c_leak_memory, NULL },
-
{ "leaf_page_max",
"maximum size of Btree leaf nodes",
0x0, 9, 17, 27, &g.c_leaf_page_max, NULL },
+ { "leak_memory",
+ "if memory should be leaked on close",
+ C_BOOL, 0, 0, 0, &g.c_leak_memory, NULL },
+
{ "logging",
"if logging configured", /* 30% */
C_BOOL, 30, 0, 0, &g.c_logging, NULL },
- { "logging_compression",
- "type of logging compression " COMPRESSION_LIST,
- C_IGNORE|C_STRING, 0, 0, 0, NULL, &g.c_logging_compression },
-
{ "logging_archive",
"if log file archival configured", /* 50% */
C_BOOL, 50, 0, 0, &g.c_logging_archive, NULL },
+ { "logging_compression",
+ "type of logging compression " COMPRESSION_LIST,
+ C_IGNORE|C_STRING, 0, 0, 0, NULL, &g.c_logging_compression },
+
{ "logging_prealloc",
"if log file pre-allocation configured", /* 50% */
C_BOOL, 50, 0, 0, &g.c_logging_prealloc, NULL },
diff --git a/test/format/format.h b/test/format/format.h
index 76e8bc5e43c..c996e98ea3e 100644
--- a/test/format/format.h
+++ b/test/format/format.h
@@ -190,6 +190,7 @@ typedef struct {
char *c_file_type;
uint32_t c_huffman_key;
uint32_t c_huffman_value;
+ uint32_t c_in_memory;
uint32_t c_insert_pct;
uint32_t c_internal_key_truncation;
uint32_t c_intl_page_max;
diff --git a/test/format/wts.c b/test/format/wts.c
index 23823c20184..cab004cc91d 100644
--- a/test/format/wts.c
+++ b/test/format/wts.c
@@ -149,6 +149,10 @@ wts_open(const char *home, int set_api, WT_CONNECTION **connp)
p += snprintf(p, REMAIN(p, end), ",error_prefix=\"%s\"", g.progname);
#endif
+ /* In-memory configuration. */
+ if (g.c_in_memory != 0)
+ p += snprintf(p, REMAIN(p, end), ",in_memory=1");
+
/* LSM configuration. */
if (DATASOURCE("lsm"))
p += snprintf(p, REMAIN(p, end),
@@ -455,7 +459,12 @@ wts_dump(const char *tag, int dump_bdb)
int ret;
char *cmd;
- /* Some data-sources don't support dump through the wt utility. */
+ /*
+ * In-memory configurations and data-sources don't support dump through
+ * the wt utility.
+ */
+ if (g.c_in_memory != 0)
+ return;
if (DATASOURCE("helium") || DATASOURCE("kvsbdb"))
return;