summaryrefslogtreecommitdiff
path: root/src/btree/bt_random.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/btree/bt_random.c')
-rw-r--r--src/btree/bt_random.c427
1 files changed, 427 insertions, 0 deletions
diff --git a/src/btree/bt_random.c b/src/btree/bt_random.c
new file mode 100644
index 00000000000..44de511f787
--- /dev/null
+++ b/src/btree/bt_random.c
@@ -0,0 +1,427 @@
+/*-
+ * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_row_random_leaf --
+ * Return a random key from a row-store leaf page.
+ */
+int
+__wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+ WT_INSERT *ins, **start, **stop;
+ WT_INSERT_HEAD *ins_head;
+ WT_PAGE *page;
+ uint64_t samples;
+ uint32_t choice, entries, i;
+ int level;
+
+ page = cbt->ref->page;
+ start = stop = NULL; /* [-Wconditional-uninitialized] */
+ entries = 0; /* [-Wconditional-uninitialized] */
+
+ __cursor_pos_clear(cbt);
+
+ /* If the page has disk-based entries, select from them. */
+ if (page->entries != 0) {
+ cbt->compare = 0;
+ cbt->slot = __wt_random(&session->rnd) % page->entries;
+
+ /*
+ * The real row-store search function builds the key, so we
+ * have to as well.
+ */
+ return (__wt_row_leaf_key(session,
+ page, page->pg_row + cbt->slot, cbt->tmp, false));
+ }
+
+ /*
+ * If the tree is new (and not empty), it might have a large insert
+ * list.
+ *
+ * Walk down the list until we find a level with at least 50 entries,
+ * that's where we'll start rolling random numbers. The value 50 is
+ * used to ignore levels with only a few entries, that is, levels which
+ * are potentially badly skewed.
+ */
+ F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
+ if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
+ return (WT_NOTFOUND);
+ for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) {
+ start = &ins_head->head[level];
+ for (entries = 0, stop = start;
+ *stop != NULL; stop = &(*stop)->next[level])
+ ++entries;
+
+ if (entries > 50)
+ break;
+ }
+
+ /*
+ * If it's a tiny list and we went all the way to level 0, correct the
+ * level; entries is correctly set.
+ */
+ if (level < 0)
+ level = 0;
+
+ /*
+ * Step down the skip list levels, selecting a random chunk of the name
+ * space at each level.
+ */
+ for (samples = entries; level > 0; samples += entries) {
+ /*
+ * There are (entries) or (entries + 1) chunks of the name space
+ * considered at each level. They are: between start and the 1st
+ * element, between the 1st and 2nd elements, and so on to the
+ * last chunk which is the name space after the stop element on
+ * the current level. This last chunk of name space may or may
+ * not be there: as we descend the levels of the skip list, this
+ * chunk may appear, depending if the next level down has
+ * entries logically after the stop point in the current level.
+ * We can't ignore those entries: because of the algorithm used
+ * to determine the depth of a skiplist, there may be a large
+ * number of entries "revealed" by descending a level.
+ *
+ * If the next level down has more items after the current stop
+ * point, there are (entries + 1) chunks to consider, else there
+ * are (entries) chunks.
+ */
+ if (*(stop - 1) == NULL)
+ choice = __wt_random(&session->rnd) % entries;
+ else
+ choice = __wt_random(&session->rnd) % (entries + 1);
+
+ if (choice == entries) {
+ /*
+ * We selected the name space after the stop element on
+ * this level. Set the start point to the current stop
+ * point, descend a level and move the stop element to
+ * the end of the list, that is, the end of the newly
+ * discovered name space, counting entries as we go.
+ */
+ start = stop;
+ --start;
+ --level;
+ for (entries = 0, stop = start;
+ *stop != NULL; stop = &(*stop)->next[level])
+ ++entries;
+ } else {
+ /*
+ * We selected another name space on the level. Move the
+ * start pointer the selected number of entries forward
+ * to the start of the selected chunk (if the selected
+ * number is 0, start won't move). Set the stop pointer
+ * to the next element in the list and drop both start
+ * and stop down a level.
+ */
+ for (i = 0; i < choice; ++i)
+ start = &(*start)->next[level];
+ stop = &(*start)->next[level];
+
+ --start;
+ --stop;
+ --level;
+
+ /* Count the entries in the selected name space. */
+ for (entries = 0,
+ ins = *start; ins != *stop; ins = ins->next[level])
+ ++entries;
+ }
+ }
+
+ /*
+ * When we reach the bottom level, entries will already be set. Select
+ * a random entry from the name space and return it.
+ *
+ * It should be impossible for the entries count to be 0 at this point,
+ * but check for it out of paranoia and to quiet static testing tools.
+ */
+ if (entries > 0)
+ entries = __wt_random(&session->rnd) % entries;
+ for (ins = *start; entries > 0; --entries)
+ ins = ins->next[0];
+
+ cbt->ins = ins;
+ cbt->ins_head = ins_head;
+ cbt->compare = 0;
+
+ /*
+ * Random lookups in newly created collections can be slow if a page
+ * consists of a large skiplist. Schedule the page for eviction if we
+ * encounter a large skiplist. This worthwhile because applications
+ * that take a sample often take many samples, so the overhead of
+ * traversing the skip list each time accumulates to real time.
+ */
+ if (samples > 5000)
+ __wt_page_evict_soon(session, cbt->ref);
+
+ return (0);
+}
+
+/*
+ * __wt_random_descent --
+ * Find a random page in a tree for either sampling or eviction.
+ */
+int
+__wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *current, *descent;
+ uint32_t flags, i, entries, retry;
+
+ btree = S2BT(session);
+ current = NULL;
+ retry = 100;
+
+ /* Eviction should not be tapped to do eviction. */
+ if (eviction)
+ flags = WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN |
+ WT_READ_NO_WAIT | WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK;
+ else
+ flags = WT_READ_RESTART_OK;
+
+ if (0) {
+restart: /*
+ * Discard the currently held page and restart the search from
+ * the root.
+ */
+ WT_RET(__wt_page_release(session, current, flags));
+ }
+
+ /* Search the internal pages of the tree. */
+ current = &btree->root;
+ for (;;) {
+ page = current->page;
+ if (!WT_PAGE_IS_INTERNAL(page))
+ break;
+
+ WT_INTL_INDEX_GET(session, page, pindex);
+ entries = pindex->entries;
+
+ /* Eviction just wants any random child. */
+ if (eviction) {
+ descent = pindex->index[
+ __wt_random(&session->rnd) % entries];
+ goto descend;
+ }
+
+ /*
+ * There may be empty pages in the tree, and they're useless to
+ * us. If we don't find a non-empty page in "entries" random
+ * guesses, take the first non-empty page in the tree. If the
+ * search page contains nothing other than empty pages, restart
+ * from the root some number of times before giving up.
+ *
+ * Random sampling is looking for a key/value pair on a random
+ * leaf page, and so will accept any page that contains a valid
+ * key/value pair, so on-disk is fine, but deleted is not.
+ */
+ descent = NULL;
+ for (i = 0; i < entries; ++i) {
+ descent =
+ pindex->index[__wt_random(&session->rnd) % entries];
+ if (descent->state == WT_REF_MEM ||
+ descent->state == WT_REF_DISK)
+ break;
+ }
+ if (i == entries)
+ for (i = 0; i < entries; ++i) {
+ descent = pindex->index[i];
+ if (descent->state == WT_REF_MEM ||
+ descent->state == WT_REF_DISK)
+ break;
+ }
+ if (i == entries || descent == NULL) {
+ if (--retry > 0)
+ goto restart;
+
+ WT_RET(__wt_page_release(session, current, flags));
+ return (WT_NOTFOUND);
+ }
+
+ /*
+ * Swap the current page for the child page. If the page splits
+ * while we're retrieving it, restart the search at the root.
+ *
+ * On other error, simply return, the swap call ensures we're
+ * holding nothing on failure.
+ */
+descend: if ((ret =
+ __wt_page_swap(session, current, descent, flags)) == 0) {
+ current = descent;
+ continue;
+ }
+ if (eviction && (ret == WT_NOTFOUND || ret == WT_RESTART))
+ break;
+ if (ret == WT_RESTART)
+ goto restart;
+ return (ret);
+ }
+
+ /*
+ * There is no point starting with the root page: the walk will exit
+ * immediately. In that case we aren't holding a hazard pointer so
+ * there is nothing to release.
+ */
+ if (!eviction || !__wt_ref_is_root(current))
+ *refp = current;
+ return (0);
+}
+
+/*
+ * __wt_btcur_next_random --
+ * Move to a random record in the tree. There are two algorithms, one
+ * where we select a record at random from the whole tree on each
+ * retrieval and one where we first select a record at random from the
+ * whole tree, and then subsequently sample forward from that location.
+ * The sampling approach allows us to select reasonably uniform random
+ * points from unbalanced trees.
+ */
+int
+__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+ wt_off_t size;
+ uint64_t n, skip;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ btree = cbt->btree;
+
+ /*
+ * Only supports row-store: applications can trivially select a random
+ * value from a column-store, if there were any reason to do so.
+ */
+ if (btree->type != BTREE_ROW)
+ WT_RET_MSG(session, ENOTSUP,
+ "WT_CURSOR.next_random only supported by row-store tables");
+
+ WT_STAT_CONN_INCR(session, cursor_next);
+ WT_STAT_DATA_INCR(session, cursor_next);
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * Under some conditions we end up using the underlying cursor.next to
+ * walk through the object. Since there are multiple calls, we can hit
+ * the cursor-order checks, turn them off.
+ */
+ __wt_cursor_key_order_reset(cbt);
+#endif
+
+ /*
+ * If we don't have a current position in the tree, or if retrieving
+ * random values without sampling, pick a roughly random leaf page in
+ * the tree and return an entry from it.
+ */
+ if (cbt->ref == NULL || cbt->next_random_sample_size == 0) {
+ WT_ERR(__cursor_func_init(cbt, true));
+ WT_WITH_PAGE_INDEX(session,
+ ret = __wt_random_descent(session, &cbt->ref, false));
+ if (ret == 0)
+ goto random_page_entry;
+
+ /*
+ * Random descent may return not-found: the tree might be empty
+ * or have so many deleted items we didn't find any valid pages.
+ * We can't return WT_NOTFOUND to the application unless a tree
+ * is really empty, fallback to skipping through tree pages.
+ */
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+
+ /*
+ * Cursor through the tree, skipping past the sample size of the leaf
+ * pages in the tree between each random key return to compensate for
+ * unbalanced trees.
+ *
+ * If the random descent attempt failed, we don't have a configured
+ * sample size, use 100 for no particular reason.
+ */
+ if (cbt->next_random_sample_size == 0)
+ cbt->next_random_sample_size = 100;
+
+ /*
+ * If the random descent attempt failed, or it's our first skip attempt,
+ * we haven't yet set the pages to skip, do it now.
+ *
+ * Use the underlying file size divided by its block allocation size as
+ * our guess of leaf pages in the file (this can be entirely wrong, as
+ * it depends on how many pages are in this particular checkpoint, how
+ * large the leaf and internal pages really are, and other factors).
+ * Then, divide that value by the configured sample size and increment
+ * the final result to make sure tiny files don't leave us with a skip
+ * value of 0.
+ *
+ * !!!
+ * Ideally, the number would be prime to avoid restart issues.
+ */
+ if (cbt->next_random_leaf_skip == 0) {
+ WT_ERR(btree->bm->size(btree->bm, session, &size));
+ cbt->next_random_leaf_skip = (uint64_t)
+ ((size / btree->allocsize) /
+ cbt->next_random_sample_size) + 1;
+ }
+
+ /*
+ * Be paranoid about loop termination: first, if the last leaf page
+ * skipped was also the last leaf page in the tree, skip may be set to
+ * zero on return along with the NULL WT_REF end-of-walk condition.
+ * Second, if a tree has no valid pages at all (the condition after
+ * initial creation), we might make no progress at all, or finally, if
+ * a tree has only deleted pages, we'll make progress, but never get a
+ * useful WT_REF. And, of course, the tree can switch from one of these
+ * states to another without warning. Decrement skip regardless of what
+ * is happening in the search, guarantee we eventually quit.
+ *
+ * Pages read for data sampling aren't "useful"; don't update the read
+ * generation of pages already in memory, and if a page is read, set
+ * its generation to a low value so it is evicted quickly.
+ */
+ for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) {
+ n = skip;
+ WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip,
+ WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED));
+ if (n == skip) {
+ if (skip == 0)
+ break;
+ --skip;
+ }
+ }
+
+ /*
+ * We can't return WT_NOTFOUND to the application unless a tree is
+ * really empty, fallback to a random entry from the first page in the
+ * tree that has anything at all.
+ */
+ if (cbt->ref == NULL)
+ WT_ERR(__wt_btcur_next(cbt, false));
+
+random_page_entry:
+ /*
+ * Select a random entry from the leaf page. If it's not valid, move to
+ * the next entry, if that doesn't work, move to the previous entry.
+ */
+ WT_ERR(__wt_row_random_leaf(session, cbt));
+ if (__wt_cursor_valid(cbt, &upd))
+ WT_ERR(__wt_kv_return(session, cbt, upd));
+ else {
+ if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND)
+ ret = __wt_btcur_prev(cbt, false);
+ WT_ERR(ret);
+ }
+ return (0);
+
+err: WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}