1 files changed, 427 insertions, 0 deletions
diff --git a/src/btree/bt_random.c b/src/btree/bt_random.c
new file mode 100644
index 00000000000..44de511f787
--- /dev/null
+++ b/src/btree/bt_random.c
@@ -0,0 +1,427 @@
+/*-
+ * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_row_random_leaf --
+ *	Return a random key from a row-store leaf page.
+ */
+int
+__wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+	WT_INSERT *ins, **start, **stop;
+	WT_INSERT_HEAD *ins_head;
+	WT_PAGE *page;
+	uint64_t samples;
+	uint32_t choice, entries, i;
+	int level;
+
+	page = cbt->ref->page;
+	start = stop = NULL;		/* [-Wconditional-uninitialized] */
+	entries = 0;			/* [-Wconditional-uninitialized] */
+
+	__cursor_pos_clear(cbt);
+
+	/* If the page has disk-based entries, select from them. */
+	if (page->entries != 0) {
+		cbt->compare = 0;
+		cbt->slot = __wt_random(&session->rnd) % page->entries;
+
+		/*
+		 * The real row-store search function builds the key, so we
+		 * have to as well.
+		 */
+		return (__wt_row_leaf_key(session,
+		    page, page->pg_row + cbt->slot, cbt->tmp, false));
+	}
+
+	/*
+	 * If the tree is new (and not empty), it might have a large insert
+	 * list.
+	 *
+	 * Walk down the list until we find a level with at least 50 entries,
+	 * that's where we'll start rolling random numbers. The value 50 is
+	 * used to ignore levels with only a few entries, that is, levels which
+	 * are potentially badly skewed.
+	 */
+	F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
+	if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
+		return (WT_NOTFOUND);
+	for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) {
+		start = &ins_head->head[level];
+		for (entries = 0, stop = start;
+		    *stop != NULL; stop = &(*stop)->next[level])
+			++entries;
+
+		if (entries > 50)
+			break;
+	}
+
+	/*
+	 * If it's a tiny list and we went all the way to level 0, correct the
+	 * level; entries is correctly set.
+	 */
+	if (level < 0)
+		level = 0;
+
+	/*
+	 * Step down the skip list levels, selecting a random chunk of the name
+	 * space at each level.
+	 */
+	for (samples = entries; level > 0; samples += entries) {
+		/*
+		 * There are (entries) or (entries + 1) chunks of the name space
+		 * considered at each level. They are: between start and the 1st
+		 * element, between the 1st and 2nd elements, and so on to the
+		 * last chunk which is the name space after the stop element on
+		 * the current level. This last chunk of name space may or may
+		 * not be there: as we descend the levels of the skip list, this
+		 * chunk may appear, depending if the next level down has
+		 * entries logically after the stop point in the current level.
+		 * We can't ignore those entries: because of the algorithm used
+		 * to determine the depth of a skiplist, there may be a large
+		 * number of entries "revealed" by descending a level.
+		 *
+		 * If the next level down has more items after the current stop
+		 * point, there are (entries + 1) chunks to consider, else there
+		 * are (entries) chunks.
+		 */
+		if (*(stop - 1) == NULL)
+			choice = __wt_random(&session->rnd) % entries;
+		else
+			choice = __wt_random(&session->rnd) % (entries + 1);
+
+		if (choice == entries) {
+			/*
+			 * We selected the name space after the stop element on
+			 * this level. Set the start point to the current stop
+			 * point, descend a level and move the stop element to
+			 * the end of the list, that is, the end of the newly
+			 * discovered name space, counting entries as we go.
+			 */
+			start = stop;
+			--start;
+			--level;
+			for (entries = 0, stop = start;
+			    *stop != NULL; stop = &(*stop)->next[level])
+				++entries;
+		} else {
+			/*
+			 * We selected another name space on the level. Move the
+			 * start pointer the selected number of entries forward
+			 * to the start of the selected chunk (if the selected
+			 * number is 0, start won't move). Set the stop pointer
+			 * to the next element in the list and drop both start
+			 * and stop down a level.
+			 */
+			for (i = 0; i < choice; ++i)
+				start = &(*start)->next[level];
+			stop = &(*start)->next[level];
+
+			--start;
+			--stop;
+			--level;
+
+			/* Count the entries in the selected name space. */
+			for (entries = 0,
+			    ins = *start; ins != *stop; ins = ins->next[level])
+				++entries;
+		}
+	}
+
+	/*
+	 * When we reach the bottom level, entries will already be set. Select
+	 * a random entry from the name space and return it.
+	 *
+	 * It should be impossible for the entries count to be 0 at this point,
+	 * but check for it out of paranoia and to quiet static testing tools.
+	 */
+	if (entries > 0)
+		entries = __wt_random(&session->rnd) % entries;
+	for (ins = *start; entries > 0; --entries)
+		ins = ins->next[0];
+
+	cbt->ins = ins;
+	cbt->ins_head = ins_head;
+	cbt->compare = 0;
+
+	/*
+	 * Random lookups in newly created collections can be slow if a page
+	 * consists of a large skiplist. Schedule the page for eviction if we
+	 * encounter a large skiplist. This worthwhile because applications
+	 * that take a sample often take many samples, so the overhead of
+	 * traversing the skip list each time accumulates to real time.
+	 */
+	if (samples > 5000)
+		__wt_page_evict_soon(session, cbt->ref);
+
+	return (0);
+}
+
+/*
+ * __wt_random_descent --
+ *	Find a random page in a tree for either sampling or eviction.
+ */
+int
+__wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction)
+{
+	WT_BTREE *btree;
+	WT_DECL_RET;
+	WT_PAGE *page;
+	WT_PAGE_INDEX *pindex;
+	WT_REF *current, *descent;
+	uint32_t flags, i, entries, retry;
+
+	btree = S2BT(session);
+	current = NULL;
+	retry = 100;
+
+	/* Eviction should not be tapped to do eviction. */
+	if (eviction)
+		flags = WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN |
+		    WT_READ_NO_WAIT | WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK;
+	else
+		flags = WT_READ_RESTART_OK;
+
+	if (0) {
+restart:	/*
+		 * Discard the currently held page and restart the search from
+		 * the root.
+		 */
+		WT_RET(__wt_page_release(session, current, flags));
+	}
+
+	/* Search the internal pages of the tree. */
+	current = &btree->root;
+	for (;;) {
+		page = current->page;
+		if (!WT_PAGE_IS_INTERNAL(page))
+			break;
+
+		WT_INTL_INDEX_GET(session, page, pindex);
+		entries = pindex->entries;
+
+		/* Eviction just wants any random child. */
+		if (eviction) {
+			descent = pindex->index[
+			    __wt_random(&session->rnd) % entries];
+			goto descend;
+		}
+
+		/*
+		 * There may be empty pages in the tree, and they're useless to
+		 * us. If we don't find a non-empty page in "entries" random
+		 * guesses, take the first non-empty page in the tree. If the
+		 * search page contains nothing other than empty pages, restart
+		 * from the root some number of times before giving up.
+		 *
+		 * Random sampling is looking for a key/value pair on a random
+		 * leaf page, and so will accept any page that contains a valid
+		 * key/value pair, so on-disk is fine, but deleted is not.
+		 */
+		descent = NULL;
+		for (i = 0; i < entries; ++i) {
+			descent =
+			    pindex->index[__wt_random(&session->rnd) % entries];
+			if (descent->state == WT_REF_MEM ||
+			    descent->state == WT_REF_DISK)
+				break;
+		}
+		if (i == entries)
+			for (i = 0; i < entries; ++i) {
+				descent = pindex->index[i];
+				if (descent->state == WT_REF_MEM ||
+				    descent->state == WT_REF_DISK)
+					break;
+			}
+		if (i == entries || descent == NULL) {
+			if (--retry > 0)
+				goto restart;
+
+			WT_RET(__wt_page_release(session, current, flags));
+			return (WT_NOTFOUND);
+		}
+
+		/*
+		 * Swap the current page for the child page. If the page splits
+		 * while we're retrieving it, restart the search at the root.
+		 *
+		 * On other error, simply return, the swap call ensures we're
+		 * holding nothing on failure.
+		 */
+descend:	if ((ret =
+		    __wt_page_swap(session, current, descent, flags)) == 0) {
+			current = descent;
+			continue;
+		}
+		if (eviction && (ret == WT_NOTFOUND || ret == WT_RESTART))
+			break;
+		if (ret == WT_RESTART)
+			goto restart;
+		return (ret);
+	}
+
+	/*
+	 * There is no point starting with the root page: the walk will exit
+	 * immediately.  In that case we aren't holding a hazard pointer so
+	 * there is nothing to release.
+	 */
+	if (!eviction || !__wt_ref_is_root(current))
+		*refp = current;
+	return (0);
+}
+
+/*
+ * __wt_btcur_next_random --
+ *	Move to a random record in the tree. There are two algorithms, one
+ *	where we select a record at random from the whole tree on each
+ *	retrieval and one where we first select a record at random from the
+ *	whole tree, and then subsequently sample forward from that location.
+ *	The sampling approach allows us to select reasonably uniform random
+ *	points from unbalanced trees.
+ */
+int
+__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
+{
+	WT_BTREE *btree;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+	WT_UPDATE *upd;
+	wt_off_t size;
+	uint64_t n, skip;
+
+	session = (WT_SESSION_IMPL *)cbt->iface.session;
+	btree = cbt->btree;
+
+	/*
+	 * Only supports row-store: applications can trivially select a random
+	 * value from a column-store, if there were any reason to do so.
+	 */
+	if (btree->type != BTREE_ROW)
+		WT_RET_MSG(session, ENOTSUP,
+		    "WT_CURSOR.next_random only supported by row-store tables");
+
+	WT_STAT_CONN_INCR(session, cursor_next);
+	WT_STAT_DATA_INCR(session, cursor_next);
+
+#ifdef HAVE_DIAGNOSTIC
+	/*
+	 * Under some conditions we end up using the underlying cursor.next to
+	 * walk through the object. Since there are multiple calls, we can hit
+	 * the cursor-order checks, turn them off.
+	 */
+	__wt_cursor_key_order_reset(cbt);
+#endif
+
+	/*
+	 * If we don't have a current position in the tree, or if retrieving
+	 * random values without sampling, pick a roughly random leaf page in
+	 * the tree and return an entry from it.
+	 */
+	if (cbt->ref == NULL || cbt->next_random_sample_size == 0) {
+		WT_ERR(__cursor_func_init(cbt, true));
+		WT_WITH_PAGE_INDEX(session,
+		    ret = __wt_random_descent(session, &cbt->ref, false));
+		if (ret == 0)
+			goto random_page_entry;
+
+		/*
+		 * Random descent may return not-found: the tree might be empty
+		 * or have so many deleted items we didn't find any valid pages.
+		 * We can't return WT_NOTFOUND to the application unless a tree
+		 * is really empty, fallback to skipping through tree pages.
+		 */
+		WT_ERR_NOTFOUND_OK(ret);
+	}
+
+	/*
+	 * Cursor through the tree, skipping past the sample size of the leaf
+	 * pages in the tree between each random key return to compensate for
+	 * unbalanced trees.
+	 *
+	 * If the random descent attempt failed, we don't have a configured
+	 * sample size, use 100 for no particular reason.
+	 */
+	if (cbt->next_random_sample_size == 0)
+		cbt->next_random_sample_size = 100;
+
+	/*
+	 * If the random descent attempt failed, or it's our first skip attempt,
+	 * we haven't yet set the pages to skip, do it now.
+	 *
+	 * Use the underlying file size divided by its block allocation size as
+	 * our guess of leaf pages in the file (this can be entirely wrong, as
+	 * it depends on how many pages are in this particular checkpoint, how
+	 * large the leaf and internal pages really are, and other factors).
+	 * Then, divide that value by the configured sample size and increment
+	 * the final result to make sure tiny files don't leave us with a skip
+	 * value of 0.
+	 *
+	 * !!!
+	 * Ideally, the number would be prime to avoid restart issues.
+	 */
+	if (cbt->next_random_leaf_skip == 0) {
+		WT_ERR(btree->bm->size(btree->bm, session, &size));
+		cbt->next_random_leaf_skip = (uint64_t)
+		    ((size / btree->allocsize) /
+		    cbt->next_random_sample_size) + 1;
+	}
+
+	/*
+	 * Be paranoid about loop termination: first, if the last leaf page
+	 * skipped was also the last leaf page in the tree, skip may be set to
+	 * zero on return along with the NULL WT_REF end-of-walk condition.
+	 * Second, if a tree has no valid pages at all (the condition after
+	 * initial creation), we might make no progress at all, or finally, if
+	 * a tree has only deleted pages, we'll make progress, but never get a
+	 * useful WT_REF. And, of course, the tree can switch from one of these
+	 * states to another without warning. Decrement skip regardless of what
+	 * is happening in the search, guarantee we eventually quit.
+	 *
+	 * Pages read for data sampling aren't "useful"; don't update the read
+	 * generation of pages already in memory, and if a page is read, set
+	 * its generation to a low value so it is evicted quickly.
+	 */
+	for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) {
+		n = skip;
+		WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip,
+		    WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED));
+		if (n == skip) {
+			if (skip == 0)
+				break;
+			--skip;
+		}
+	}
+
+	/*
+	 * We can't return WT_NOTFOUND to the application unless a tree is
+	 * really empty, fallback to a random entry from the first page in the
+	 * tree that has anything at all.
+	 */
+	if (cbt->ref == NULL)
+		WT_ERR(__wt_btcur_next(cbt, false));
+
+random_page_entry:
+	/*
+	 * Select a random entry from the leaf page. If it's not valid, move to
+	 * the next entry, if that doesn't work, move to the previous entry.
+	 */
+	WT_ERR(__wt_row_random_leaf(session, cbt));
+	if (__wt_cursor_valid(cbt, &upd))
+		WT_ERR(__wt_kv_return(session, cbt, upd));
+	else {
+		if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND)
+			ret = __wt_btcur_prev(cbt, false);
+		WT_ERR(ret);
+	}
+	return (0);
+
+err:	WT_TRET(__cursor_reset(cbt));
+	return (ret);
+}