1 files changed, 377 insertions, 403 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c
index 255c08e0b60..525728b73dc 100644
--- a/src/third_party/wiredtiger/src/btree/bt_random.c
+++ b/src/third_party/wiredtiger/src/btree/bt_random.c
@@ -10,433 +10,407 @@
 
 /*
  * __wt_row_random_leaf --
- *	Return a random key from a row-store leaf page.
+ *     Return a random key from a row-store leaf page.
  */
 int
 __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
 {
-	WT_INSERT *ins, **start, **stop;
-	WT_INSERT_HEAD *ins_head;
-	WT_PAGE *page;
-	uint64_t samples;
-	uint32_t choice, entries, i;
-	int level;
-
-	page = cbt->ref->page;
-	start = stop = NULL;		/* [-Wconditional-uninitialized] */
-	entries = 0;			/* [-Wconditional-uninitialized] */
-
-	__cursor_pos_clear(cbt);
-
-	/* If the page has disk-based entries, select from them. */
-	if (page->entries != 0) {
-		cbt->compare = 0;
-		cbt->slot = __wt_random(&session->rnd) % page->entries;
-
-		/*
-		 * The real row-store search function builds the key, so we
-		 * have to as well.
-		 */
-		return (__wt_row_leaf_key(session,
-		    page, page->pg_row + cbt->slot, cbt->tmp, false));
-	}
-
-	/*
-	 * If the tree is new (and not empty), it might have a large insert
-	 * list.
-	 *
-	 * Walk down the list until we find a level with at least 50 entries,
-	 * that's where we'll start rolling random numbers. The value 50 is
-	 * used to ignore levels with only a few entries, that is, levels which
-	 * are potentially badly skewed.
-	 */
-	F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
-	if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
-		return (WT_NOTFOUND);
-	for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) {
-		start = &ins_head->head[level];
-		for (entries = 0, stop = start;
-		    *stop != NULL; stop = &(*stop)->next[level])
-			++entries;
-
-		if (entries > 50)
-			break;
-	}
-
-	/*
-	 * If it's a tiny list and we went all the way to level 0, correct the
-	 * level; entries is correctly set.
-	 */
-	if (level < 0)
-		level = 0;
-
-	/*
-	 * Step down the skip list levels, selecting a random chunk of the name
-	 * space at each level.
-	 */
-	for (samples = entries; level > 0; samples += entries) {
-		/*
-		 * There are (entries) or (entries + 1) chunks of the name space
-		 * considered at each level. They are: between start and the 1st
-		 * element, between the 1st and 2nd elements, and so on to the
-		 * last chunk which is the name space after the stop element on
-		 * the current level. This last chunk of name space may or may
-		 * not be there: as we descend the levels of the skip list, this
-		 * chunk may appear, depending if the next level down has
-		 * entries logically after the stop point in the current level.
-		 * We can't ignore those entries: because of the algorithm used
-		 * to determine the depth of a skiplist, there may be a large
-		 * number of entries "revealed" by descending a level.
-		 *
-		 * If the next level down has more items after the current stop
-		 * point, there are (entries + 1) chunks to consider, else there
-		 * are (entries) chunks.
-		 */
-		if (*(stop - 1) == NULL)
-			choice = __wt_random(&session->rnd) % entries;
-		else
-			choice = __wt_random(&session->rnd) % (entries + 1);
-
-		if (choice == entries) {
-			/*
-			 * We selected the name space after the stop element on
-			 * this level. Set the start point to the current stop
-			 * point, descend a level and move the stop element to
-			 * the end of the list, that is, the end of the newly
-			 * discovered name space, counting entries as we go.
-			 */
-			start = stop;
-			--start;
-			--level;
-			for (entries = 0, stop = start;
-			    *stop != NULL; stop = &(*stop)->next[level])
-				++entries;
-		} else {
-			/*
-			 * We selected another name space on the level. Move the
-			 * start pointer the selected number of entries forward
-			 * to the start of the selected chunk (if the selected
-			 * number is 0, start won't move). Set the stop pointer
-			 * to the next element in the list and drop both start
-			 * and stop down a level.
-			 */
-			for (i = 0; i < choice; ++i)
-				start = &(*start)->next[level];
-			stop = &(*start)->next[level];
-
-			--start;
-			--stop;
-			--level;
-
-			/* Count the entries in the selected name space. */
-			for (entries = 0,
-			    ins = *start; ins != *stop; ins = ins->next[level])
-				++entries;
-		}
-	}
-
-	/*
-	 * When we reach the bottom level, entries will already be set. Select
-	 * a random entry from the name space and return it.
-	 *
-	 * It should be impossible for the entries count to be 0 at this point,
-	 * but check for it out of paranoia and to quiet static testing tools.
-	 */
-	if (entries > 0)
-		entries = __wt_random(&session->rnd) % entries;
-	for (ins = *start; entries > 0; --entries)
-		ins = ins->next[0];
-
-	cbt->ins = ins;
-	cbt->ins_head = ins_head;
-	cbt->compare = 0;
-
-	/*
-	 * Random lookups in newly created collections can be slow if a page
-	 * consists of a large skiplist. Schedule the page for eviction if we
-	 * encounter a large skiplist. This worthwhile because applications
-	 * that take a sample often take many samples, so the overhead of
-	 * traversing the skip list each time accumulates to real time.
-	 */
-	if (samples > 5000)
-		__wt_page_evict_soon(session, cbt->ref);
-
-	return (0);
+    WT_INSERT *ins, **start, **stop;
+    WT_INSERT_HEAD *ins_head;
+    WT_PAGE *page;
+    uint64_t samples;
+    uint32_t choice, entries, i;
+    int level;
+
+    page = cbt->ref->page;
+    start = stop = NULL; /* [-Wconditional-uninitialized] */
+    entries = 0;         /* [-Wconditional-uninitialized] */
+
+    __cursor_pos_clear(cbt);
+
+    /* If the page has disk-based entries, select from them. */
+    if (page->entries != 0) {
+        cbt->compare = 0;
+        cbt->slot = __wt_random(&session->rnd) % page->entries;
+
+        /*
+         * The real row-store search function builds the key, so we have to as well.
+         */
+        return (__wt_row_leaf_key(session, page, page->pg_row + cbt->slot, cbt->tmp, false));
+    }
+
+    /*
+     * If the tree is new (and not empty), it might have a large insert
+     * list.
+     *
+     * Walk down the list until we find a level with at least 50 entries,
+     * that's where we'll start rolling random numbers. The value 50 is
+     * used to ignore levels with only a few entries, that is, levels which
+     * are potentially badly skewed.
+     */
+    F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
+    if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
+        return (WT_NOTFOUND);
+    for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) {
+        start = &ins_head->head[level];
+        for (entries = 0, stop = start; *stop != NULL; stop = &(*stop)->next[level])
+            ++entries;
+
+        if (entries > 50)
+            break;
+    }
+
+    /*
+     * If it's a tiny list and we went all the way to level 0, correct the level; entries is
+     * correctly set.
+     */
+    if (level < 0)
+        level = 0;
+
+    /*
+     * Step down the skip list levels, selecting a random chunk of the name space at each level.
+     */
+    for (samples = entries; level > 0; samples += entries) {
+        /*
+         * There are (entries) or (entries + 1) chunks of the name space
+         * considered at each level. They are: between start and the 1st
+         * element, between the 1st and 2nd elements, and so on to the
+         * last chunk which is the name space after the stop element on
+         * the current level. This last chunk of name space may or may
+         * not be there: as we descend the levels of the skip list, this
+         * chunk may appear, depending if the next level down has
+         * entries logically after the stop point in the current level.
+         * We can't ignore those entries: because of the algorithm used
+         * to determine the depth of a skiplist, there may be a large
+         * number of entries "revealed" by descending a level.
+         *
+         * If the next level down has more items after the current stop
+         * point, there are (entries + 1) chunks to consider, else there
+         * are (entries) chunks.
+         */
+        if (*(stop - 1) == NULL)
+            choice = __wt_random(&session->rnd) % entries;
+        else
+            choice = __wt_random(&session->rnd) % (entries + 1);
+
+        if (choice == entries) {
+            /*
+             * We selected the name space after the stop element on this level. Set the start point
+             * to the current stop point, descend a level and move the stop element to the end of
+             * the list, that is, the end of the newly discovered name space, counting entries as we
+             * go.
+             */
+            start = stop;
+            --start;
+            --level;
+            for (entries = 0, stop = start; *stop != NULL; stop = &(*stop)->next[level])
+                ++entries;
+        } else {
+            /*
+             * We selected another name space on the level. Move the start pointer the selected
+             * number of entries forward to the start of the selected chunk (if the selected number
+             * is 0, start won't move). Set the stop pointer to the next element in the list and
+             * drop both start and stop down a level.
+             */
+            for (i = 0; i < choice; ++i)
+                start = &(*start)->next[level];
+            stop = &(*start)->next[level];
+
+            --start;
+            --stop;
+            --level;
+
+            /* Count the entries in the selected name space. */
+            for (entries = 0, ins = *start; ins != *stop; ins = ins->next[level])
+                ++entries;
+        }
+    }
+
+    /*
+     * When we reach the bottom level, entries will already be set. Select
+     * a random entry from the name space and return it.
+     *
+     * It should be impossible for the entries count to be 0 at this point,
+     * but check for it out of paranoia and to quiet static testing tools.
+     */
+    if (entries > 0)
+        entries = __wt_random(&session->rnd) % entries;
+    for (ins = *start; entries > 0; --entries)
+        ins = ins->next[0];
+
+    cbt->ins = ins;
+    cbt->ins_head = ins_head;
+    cbt->compare = 0;
+
+    /*
+     * Random lookups in newly created collections can be slow if a page consists of a large
+     * skiplist. Schedule the page for eviction if we encounter a large skiplist. This worthwhile
+     * because applications that take a sample often take many samples, so the overhead of
+     * traversing the skip list each time accumulates to real time.
+     */
+    if (samples > 5000)
+        __wt_page_evict_soon(session, cbt->ref);
+
+    return (0);
 }
 
 /*
  * __wt_random_descent --
- *	Find a random page in a tree for either sampling or eviction.
+ *     Find a random page in a tree for either sampling or eviction.
  */
 int
 __wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags)
 {
-	WT_BTREE *btree;
-	WT_DECL_RET;
-	WT_PAGE *page;
-	WT_PAGE_INDEX *pindex;
-	WT_REF *current, *descent;
-	uint32_t i, entries, retry;
-	bool eviction;
-
-	*refp = NULL;
-
-	btree = S2BT(session);
-	current = NULL;
-	retry = 100;
-	/*
-	 * This function is called by eviction to find a random page in the
-	 * cache. That case is indicated by the WT_READ_CACHE flag. Ordinary
-	 * lookups in a tree will read pages into cache as needed.
-	 */
-	eviction = LF_ISSET(WT_READ_CACHE);
-
-	if (0) {
+    WT_BTREE *btree;
+    WT_DECL_RET;
+    WT_PAGE *page;
+    WT_PAGE_INDEX *pindex;
+    WT_REF *current, *descent;
+    uint32_t i, entries, retry;
+    bool eviction;
+
+    *refp = NULL;
+
+    btree = S2BT(session);
+    current = NULL;
+    retry = 100;
+    /*
+     * This function is called by eviction to find a random page in the cache. That case is
+     * indicated by the WT_READ_CACHE flag. Ordinary lookups in a tree will read pages into cache as
+     * needed.
+     */
+    eviction = LF_ISSET(WT_READ_CACHE);
+
+    if (0) {
 restart:
-		/*
-		 * Discard the currently held page and restart the search from
-		 * the root.
-		 */
-		WT_RET(__wt_page_release(session, current, flags));
-	}
-
-	/* Search the internal pages of the tree. */
-	current = &btree->root;
-	for (;;) {
-		page = current->page;
-		if (!WT_PAGE_IS_INTERNAL(page))
-			break;
-
-		WT_INTL_INDEX_GET(session, page, pindex);
-		entries = pindex->entries;
-
-		/* Eviction just wants any random child. */
-		if (eviction) {
-			descent = pindex->index[
-			    __wt_random(&session->rnd) % entries];
-			goto descend;
-		}
-
-		/*
-		 * There may be empty pages in the tree, and they're useless to
-		 * us. If we don't find a non-empty page in "entries" random
-		 * guesses, take the first non-empty page in the tree. If the
-		 * search page contains nothing other than empty pages, restart
-		 * from the root some number of times before giving up.
-		 *
-		 * Random sampling is looking for a key/value pair on a random
-		 * leaf page, and so will accept any page that contains a valid
-		 * key/value pair, so on-disk is fine, but deleted is not.
-		 */
-		descent = NULL;
-		for (i = 0; i < entries; ++i) {
-			descent =
-			    pindex->index[__wt_random(&session->rnd) % entries];
-			if (descent->state == WT_REF_DISK ||
-			    descent->state == WT_REF_LIMBO ||
-			    descent->state == WT_REF_LOOKASIDE ||
-			    descent->state == WT_REF_MEM)
-				break;
-		}
-		if (i == entries)
-			for (i = 0; i < entries; ++i) {
-				descent = pindex->index[i];
-				if (descent->state == WT_REF_DISK ||
-				    descent->state == WT_REF_LIMBO ||
-				    descent->state == WT_REF_LOOKASIDE ||
-				    descent->state == WT_REF_MEM)
-					break;
-			}
-		if (i == entries || descent == NULL) {
-			if (--retry > 0)
-				goto restart;
-
-			WT_RET(__wt_page_release(session, current, flags));
-			return (WT_NOTFOUND);
-		}
-
-		/*
-		 * Swap the current page for the child page. If the page splits
-		 * while we're retrieving it, restart the search at the root.
-		 *
-		 * On other error, simply return, the swap call ensures we're
-		 * holding nothing on failure.
-		 */
-descend:	if ((ret = __wt_page_swap(
-		    session, current, descent, flags)) == 0) {
-			current = descent;
-			continue;
-		}
-		if (eviction && (ret == WT_NOTFOUND || ret == WT_RESTART))
-			break;
-		if (ret == WT_RESTART)
-			goto restart;
-		return (ret);
-	}
-
-	/*
-	 * There is no point starting with the root page: the walk will exit
-	 * immediately.  In that case we aren't holding a hazard pointer so
-	 * there is nothing to release.
-	 */
-	if (!eviction || !__wt_ref_is_root(current))
-		*refp = current;
-	return (0);
+        /*
+         * Discard the currently held page and restart the search from the root.
+         */
+        WT_RET(__wt_page_release(session, current, flags));
+    }
+
+    /* Search the internal pages of the tree. */
+    current = &btree->root;
+    for (;;) {
+        page = current->page;
+        if (!WT_PAGE_IS_INTERNAL(page))
+            break;
+
+        WT_INTL_INDEX_GET(session, page, pindex);
+        entries = pindex->entries;
+
+        /* Eviction just wants any random child. */
+        if (eviction) {
+            descent = pindex->index[__wt_random(&session->rnd) % entries];
+            goto descend;
+        }
+
+        /*
+         * There may be empty pages in the tree, and they're useless to
+         * us. If we don't find a non-empty page in "entries" random
+         * guesses, take the first non-empty page in the tree. If the
+         * search page contains nothing other than empty pages, restart
+         * from the root some number of times before giving up.
+         *
+         * Random sampling is looking for a key/value pair on a random
+         * leaf page, and so will accept any page that contains a valid
+         * key/value pair, so on-disk is fine, but deleted is not.
+         */
+        descent = NULL;
+        for (i = 0; i < entries; ++i) {
+            descent = pindex->index[__wt_random(&session->rnd) % entries];
+            if (descent->state == WT_REF_DISK || descent->state == WT_REF_LIMBO ||
+              descent->state == WT_REF_LOOKASIDE || descent->state == WT_REF_MEM)
+                break;
+        }
+        if (i == entries)
+            for (i = 0; i < entries; ++i) {
+                descent = pindex->index[i];
+                if (descent->state == WT_REF_DISK || descent->state == WT_REF_LIMBO ||
+                  descent->state == WT_REF_LOOKASIDE || descent->state == WT_REF_MEM)
+                    break;
+            }
+        if (i == entries || descent == NULL) {
+            if (--retry > 0)
+                goto restart;
+
+            WT_RET(__wt_page_release(session, current, flags));
+            return (WT_NOTFOUND);
+        }
+
+    /*
+     * Swap the current page for the child page. If the page splits
+     * while we're retrieving it, restart the search at the root.
+     *
+     * On other error, simply return, the swap call ensures we're
+     * holding nothing on failure.
+     */
+descend:
+        if ((ret = __wt_page_swap(session, current, descent, flags)) == 0) {
+            current = descent;
+            continue;
+        }
+        if (eviction && (ret == WT_NOTFOUND || ret == WT_RESTART))
+            break;
+        if (ret == WT_RESTART)
+            goto restart;
+        return (ret);
+    }
+
+    /*
+     * There is no point starting with the root page: the walk will exit immediately. In that case
+     * we aren't holding a hazard pointer so there is nothing to release.
+     */
+    if (!eviction || !__wt_ref_is_root(current))
+        *refp = current;
+    return (0);
 }
 
 /*
  * __wt_btcur_next_random --
- *	Move to a random record in the tree. There are two algorithms, one
- *	where we select a record at random from the whole tree on each
- *	retrieval and one where we first select a record at random from the
- *	whole tree, and then subsequently sample forward from that location.
- *	The sampling approach allows us to select reasonably uniform random
- *	points from unbalanced trees.
+ *     Move to a random record in the tree. There are two algorithms, one where we select a record
+ *     at random from the whole tree on each retrieval and one where we first select a record at
+ *     random from the whole tree, and then subsequently sample forward from that location. The
+ *     sampling approach allows us to select reasonably uniform random points from unbalanced trees.
  */
 int
 __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
 {
-	WT_BTREE *btree;
-	WT_CURSOR *cursor;
-	WT_DECL_RET;
-	WT_SESSION_IMPL *session;
-	WT_UPDATE *upd;
-	wt_off_t size;
-	uint64_t n, skip;
-	uint32_t read_flags;
-	bool valid;
-
-	btree = cbt->btree;
-	cursor = &cbt->iface;
-	session = (WT_SESSION_IMPL *)cbt->iface.session;
-	read_flags = WT_READ_RESTART_OK;
-	if (F_ISSET(cbt, WT_CBT_READ_ONCE))
-		FLD_SET(read_flags, WT_READ_WONT_NEED);
-
-	/*
-	 * Only supports row-store: applications can trivially select a random
-	 * value from a column-store, if there were any reason to do so.
-	 */
-	if (btree->type != BTREE_ROW)
-		WT_RET_MSG(session, ENOTSUP,
-		    "WT_CURSOR.next_random only supported by row-store tables");
-
-	WT_STAT_CONN_INCR(session, cursor_next);
-	WT_STAT_DATA_INCR(session, cursor_next);
-
-	F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+    WT_BTREE *btree;
+    WT_CURSOR *cursor;
+    WT_DECL_RET;
+    WT_SESSION_IMPL *session;
+    WT_UPDATE *upd;
+    wt_off_t size;
+    uint64_t n, skip;
+    uint32_t read_flags;
+    bool valid;
+
+    btree = cbt->btree;
+    cursor = &cbt->iface;
+    session = (WT_SESSION_IMPL *)cbt->iface.session;
+    read_flags = WT_READ_RESTART_OK;
+    if (F_ISSET(cbt, WT_CBT_READ_ONCE))
+        FLD_SET(read_flags, WT_READ_WONT_NEED);
+
+    /*
+     * Only supports row-store: applications can trivially select a random value from a
+     * column-store, if there were any reason to do so.
+     */
+    if (btree->type != BTREE_ROW)
+        WT_RET_MSG(session, ENOTSUP, "WT_CURSOR.next_random only supported by row-store tables");
+
+    WT_STAT_CONN_INCR(session, cursor_next);
+    WT_STAT_DATA_INCR(session, cursor_next);
+
+    F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
 
 #ifdef HAVE_DIAGNOSTIC
-	/*
-	 * Under some conditions we end up using the underlying cursor.next to
-	 * walk through the object. Since there are multiple calls, we can hit
-	 * the cursor-order checks, turn them off.
-	 */
-	__wt_cursor_key_order_reset(cbt);
+    /*
+     * Under some conditions we end up using the underlying cursor.next to walk through the object.
+     * Since there are multiple calls, we can hit the cursor-order checks, turn them off.
+     */
+    __wt_cursor_key_order_reset(cbt);
 #endif
-	/*
-	 * If we don't have a current position in the tree, or if retrieving
-	 * random values without sampling, pick a roughly random leaf page in
-	 * the tree and return an entry from it.
-	 */
-	if (cbt->ref == NULL || cbt->next_random_sample_size == 0) {
-		WT_ERR(__cursor_func_init(cbt, true));
-		WT_WITH_PAGE_INDEX(session,
-		    ret = __wt_random_descent(session, &cbt->ref, read_flags));
-		if (ret == 0)
-			goto random_page_entry;
-
-		/*
-		 * Random descent may return not-found: the tree might be empty
-		 * or have so many deleted items we didn't find any valid pages.
-		 * We can't return WT_NOTFOUND to the application unless a tree
-		 * is really empty, fallback to skipping through tree pages.
-		 */
-		WT_ERR_NOTFOUND_OK(ret);
-	}
-
-	/*
-	 * Cursor through the tree, skipping past the sample size of the leaf
-	 * pages in the tree between each random key return to compensate for
-	 * unbalanced trees.
-	 *
-	 * If the random descent attempt failed, we don't have a configured
-	 * sample size, use 100 for no particular reason.
-	 */
-	if (cbt->next_random_sample_size == 0)
-		cbt->next_random_sample_size = 100;
-
-	/*
-	 * If the random descent attempt failed, or it's our first skip attempt,
-	 * we haven't yet set the pages to skip, do it now.
-	 *
-	 * Use the underlying file size divided by its block allocation size as
-	 * our guess of leaf pages in the file (this can be entirely wrong, as
-	 * it depends on how many pages are in this particular checkpoint, how
-	 * large the leaf and internal pages really are, and other factors).
-	 * Then, divide that value by the configured sample size and increment
-	 * the final result to make sure tiny files don't leave us with a skip
-	 * value of 0.
-	 *
-	 * !!!
-	 * Ideally, the number would be prime to avoid restart issues.
-	 */
-	if (cbt->next_random_leaf_skip == 0) {
-		WT_ERR(btree->bm->size(btree->bm, session, &size));
-		cbt->next_random_leaf_skip = (uint64_t)
-		    ((size / btree->allocsize) /
-		    cbt->next_random_sample_size) + 1;
-	}
-
-	/*
-	 * Be paranoid about loop termination: first, if the last leaf page
-	 * skipped was also the last leaf page in the tree, skip may be set to
-	 * zero on return along with the NULL WT_REF end-of-walk condition.
-	 * Second, if a tree has no valid pages at all (the condition after
-	 * initial creation), we might make no progress at all, or finally, if
-	 * a tree has only deleted pages, we'll make progress, but never get a
-	 * useful WT_REF. And, of course, the tree can switch from one of these
-	 * states to another without warning. Decrement skip regardless of what
-	 * is happening in the search, guarantee we eventually quit.
-	 *
-	 * Pages read for data sampling aren't "useful"; don't update the read
-	 * generation of pages already in memory, and if a page is read, set
-	 * its generation to a low value so it is evicted quickly.
-	 */
-	for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) {
-		n = skip;
-		WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip));
-		if (n == skip) {
-			if (skip == 0)
-				break;
-			--skip;
-		}
-	}
-
-	/*
-	 * We can't return WT_NOTFOUND to the application unless a tree is
-	 * really empty, fallback to a random entry from the first page in the
-	 * tree that has anything at all.
-	 */
-	if (cbt->ref == NULL)
-		WT_ERR(__wt_btcur_next(cbt, false));
+    /*
+     * If we don't have a current position in the tree, or if retrieving random values without
+     * sampling, pick a roughly random leaf page in the tree and return an entry from it.
+     */
+    if (cbt->ref == NULL || cbt->next_random_sample_size == 0) {
+        WT_ERR(__cursor_func_init(cbt, true));
+        WT_WITH_PAGE_INDEX(session, ret = __wt_random_descent(session, &cbt->ref, read_flags));
+        if (ret == 0)
+            goto random_page_entry;
+
+        /*
+         * Random descent may return not-found: the tree might be empty or have so many deleted
+         * items we didn't find any valid pages. We can't return WT_NOTFOUND to the application
+         * unless a tree is really empty, fallback to skipping through tree pages.
+         */
+        WT_ERR_NOTFOUND_OK(ret);
+    }
+
+    /*
+     * Cursor through the tree, skipping past the sample size of the leaf
+     * pages in the tree between each random key return to compensate for
+     * unbalanced trees.
+     *
+     * If the random descent attempt failed, we don't have a configured
+     * sample size, use 100 for no particular reason.
+     */
+    if (cbt->next_random_sample_size == 0)
+        cbt->next_random_sample_size = 100;
+
+    /*
+     * If the random descent attempt failed, or it's our first skip attempt,
+     * we haven't yet set the pages to skip, do it now.
+     *
+     * Use the underlying file size divided by its block allocation size as
+     * our guess of leaf pages in the file (this can be entirely wrong, as
+     * it depends on how many pages are in this particular checkpoint, how
+     * large the leaf and internal pages really are, and other factors).
+     * Then, divide that value by the configured sample size and increment
+     * the final result to make sure tiny files don't leave us with a skip
+     * value of 0.
+     *
+     * !!!
+     * Ideally, the number would be prime to avoid restart issues.
+     */
+    if (cbt->next_random_leaf_skip == 0) {
+        WT_ERR(btree->bm->size(btree->bm, session, &size));
+        cbt->next_random_leaf_skip =
+          (uint64_t)((size / btree->allocsize) / cbt->next_random_sample_size) + 1;
+    }
+
+    /*
+     * Be paranoid about loop termination: first, if the last leaf page
+     * skipped was also the last leaf page in the tree, skip may be set to
+     * zero on return along with the NULL WT_REF end-of-walk condition.
+     * Second, if a tree has no valid pages at all (the condition after
+     * initial creation), we might make no progress at all, or finally, if
+     * a tree has only deleted pages, we'll make progress, but never get a
+     * useful WT_REF. And, of course, the tree can switch from one of these
+     * states to another without warning. Decrement skip regardless of what
+     * is happening in the search, guarantee we eventually quit.
+     *
+     * Pages read for data sampling aren't "useful"; don't update the read
+     * generation of pages already in memory, and if a page is read, set
+     * its generation to a low value so it is evicted quickly.
+     */
+    for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) {
+        n = skip;
+        WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip));
+        if (n == skip) {
+            if (skip == 0)
+                break;
+            --skip;
+        }
+    }
+
+    /*
+     * We can't return WT_NOTFOUND to the application unless a tree is really empty, fallback to a
+     * random entry from the first page in the tree that has anything at all.
+     */
+    if (cbt->ref == NULL)
+        WT_ERR(__wt_btcur_next(cbt, false));
 
 random_page_entry:
-	/*
-	 * Select a random entry from the leaf page. If it's not valid, move to
-	 * the next entry, if that doesn't work, move to the previous entry.
-	 */
-	WT_ERR(__wt_row_random_leaf(session, cbt));
-	WT_ERR(__wt_cursor_valid(cbt, &upd, &valid));
-	if (valid)
-		WT_ERR(__cursor_kv_return(session, cbt, upd));
-	else {
-		if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND)
-			ret = __wt_btcur_prev(cbt, false);
-		WT_ERR(ret);
-	}
-	return (0);
-
-err:	WT_TRET(__cursor_reset(cbt));
-	return (ret);
+    /*
+     * Select a random entry from the leaf page. If it's not valid, move to the next entry, if that
+     * doesn't work, move to the previous entry.
+     */
+    WT_ERR(__wt_row_random_leaf(session, cbt));
+    WT_ERR(__wt_cursor_valid(cbt, &upd, &valid));
+    if (valid)
+        WT_ERR(__cursor_kv_return(session, cbt, upd));
+    else {
+        if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND)
+            ret = __wt_btcur_prev(cbt, false);
+        WT_ERR(ret);
+    }
+    return (0);
+
+err:
+    WT_TRET(__cursor_reset(cbt));
+    return (ret);
 }