summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDan Pasette <dan@mongodb.com>2015-12-16 13:15:31 -0500
committerDan Pasette <dan@mongodb.com>2015-12-16 13:15:46 -0500
commit33831818603c3c00dee06dd8bfa2bc9bea06a8af (patch)
tree3ea23b289efc57d2de1d11208fd179a1494ebdca
parenta014a946bdea5013883bff963ae6fae9fe39e2a7 (diff)
downloadmongo-33831818603c3c00dee06dd8bfa2bc9bea06a8af.tar.gz
Import wiredtiger-wiredtiger-mongodb-3.2-rc4-75-gdecd916.tar.gz from wiredtiger branch mongodb-3.2
ref: 197eef0..decd916 48e1343 WT-2262 Have random sampling walk the tree so it isn't biased in skewed trees. eb838c7 WT-2260 Avoid adding internal pages to the eviction queue. a695751 WT-2258 WiredTiger preloads pages even when direct-IO is configured.
-rw-r--r--src/third_party/wiredtiger/dist/api_data.py46
-rw-r--r--src/third_party/wiredtiger/dist/flags.py1
-rw-r--r--src/third_party/wiredtiger/src/block/block_read.c40
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_compact.c6
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_curnext.c2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_curprev.c2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_cursor.c72
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_debug.c16
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_handle.c4
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_misc.c2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_read.c2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_stat.c4
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_sync.c4
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_walk.c113
-rw-r--r--src/third_party/wiredtiger/src/btree/row_srch.c94
-rw-r--r--src/third_party/wiredtiger/src/config/config_def.c8
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_file.c14
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_std.c8
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_file.c8
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_lru.c20
-rw-r--r--src/third_party/wiredtiger/src/include/btree.i12
-rw-r--r--src/third_party/wiredtiger/src/include/cache.h2
-rw-r--r--src/third_party/wiredtiger/src/include/cursor.h8
-rw-r--r--src/third_party/wiredtiger/src/include/extern.h9
-rw-r--r--src/third_party/wiredtiger/src/include/flags.h5
-rw-r--r--src/third_party/wiredtiger/src/include/wiredtiger.in40
26 files changed, 378 insertions, 164 deletions
diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py
index f58a48b4a0b..ff6d3f3ccb5 100644
--- a/src/third_party/wiredtiger/dist/api_data.py
+++ b/src/third_party/wiredtiger/dist/api_data.py
@@ -814,21 +814,19 @@ methods = {
'WT_SESSION.open_cursor' : Method(cursor_runtime_config + [
Config('bulk', 'false', r'''
- configure the cursor for bulk-loading, a fast, initial load
- path (see @ref tune_bulk_load for more information). Bulk-load
- may only be used for newly created objects and cursors
- configured for bulk-load only support the WT_CURSOR::insert
- and WT_CURSOR::close methods. When bulk-loading row-store
- objects, keys must be loaded in sorted order. The value is
- usually a true/false flag; when bulk-loading fixed-length
- column store objects, the special value \c bitmap allows
- chunks of a memory resident bitmap to be loaded directly into
- a file by passing a \c WT_ITEM to WT_CURSOR::set_value where
- the \c size field indicates the number of records in the
- bitmap (as specified by the object's \c value_format
- configuration). Bulk-loaded bitmap values must end on a byte
- boundary relative to the bit count (except for the last set
- of values loaded)'''),
+ configure the cursor for bulk-loading, a fast, initial load path
+ (see @ref tune_bulk_load for more information). Bulk-load may
+ only be used for newly created objects and applications should
+ use the WT_CURSOR::insert method to insert rows. When
+ bulk-loading, rows must be loaded in sorted order. The value
+ is usually a true/false flag; when bulk-loading fixed-length
+ column store objects, the special value \c bitmap allows chunks
+ of a memory resident bitmap to be loaded directly into a file
+ by passing a \c WT_ITEM to WT_CURSOR::set_value where the \c
+ size field indicates the number of records in the bitmap (as
+ specified by the object's \c value_format configuration).
+ Bulk-loaded bitmap values must end on a byte boundary relative
+ to the bit count (except for the last set of values loaded)'''),
Config('checkpoint', '', r'''
the name of a checkpoint to open (the reserved name
"WiredTigerCheckpoint" opens the most recent internal
@@ -843,12 +841,20 @@ methods = {
with the @ref util_dump and @ref util_load commands''',
choices=['hex', 'json', 'print']),
Config('next_random', 'false', r'''
- configure the cursor to return a pseudo-random record from
- the object; valid only for row-store cursors. Cursors
- configured with \c next_random=true only support the
- WT_CURSOR::next and WT_CURSOR::close methods. See @ref
- cursor_random for details''',
+ configure the cursor to return a pseudo-random record from the
+ object when the WT_CURSOR::next method is called; valid only for
+ row-store cursors. See @ref cursor_random for details''',
type='boolean'),
+ Config('next_random_sample_size', '0', r'''
+ cursors configured by \c next_random to return pseudo-random
+ records from the object randomly select from the entire object,
+ by default. Setting \c next_random_sample_size to a non-zero
+ value sets the number of samples the application expects to take
+ using the \c next_random cursor. A cursor configured with both
+ \c next_random and \c next_random_sample_size attempts to divide
+ the object into \c next_random_sample_size equal-sized pieces,
+ and each retrieval returns a record from one of those pieces. See
+ @ref cursor_random for details'''),
Config('raw', 'false', r'''
ignore the encodings for the key and value, manage data as if
the formats were \c "u". See @ref cursor_raw for details''',
diff --git a/src/third_party/wiredtiger/dist/flags.py b/src/third_party/wiredtiger/dist/flags.py
index 1965dfb7dbe..7d237dd39a4 100644
--- a/src/third_party/wiredtiger/dist/flags.py
+++ b/src/third_party/wiredtiger/dist/flags.py
@@ -37,6 +37,7 @@ flags = {
'READ_NO_WAIT',
'READ_PREV',
'READ_SKIP_INTL',
+ 'READ_SKIP_LEAF',
'READ_TRUNCATE',
'READ_WONT_NEED',
],
diff --git a/src/third_party/wiredtiger/src/block/block_read.c b/src/third_party/wiredtiger/src/block/block_read.c
index 4b32e02a73a..ca7797f17af 100644
--- a/src/third_party/wiredtiger/src/block/block_read.c
+++ b/src/third_party/wiredtiger/src/block/block_read.c
@@ -13,10 +13,11 @@
* Pre-load a page.
*/
int
-__wt_bm_preload(WT_BM *bm,
- WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+__wt_bm_preload(
+ WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
{
WT_BLOCK *block;
+ WT_DECL_ITEM(tmp);
WT_DECL_RET;
wt_off_t offset;
uint32_t cksum, size;
@@ -24,7 +25,15 @@ __wt_bm_preload(WT_BM *bm,
WT_UNUSED(addr_size);
block = bm->block;
- ret = EINVAL; /* Play games due to conditional compilation */
+
+ /*
+ * Turn off pre-load when direct I/O is configured for the file,
+ * the kernel cache isn't interesting.
+ */
+ if (block->fh->direct_io)
+ return (0);
+
+ WT_STAT_FAST_CONN_INCR(session, block_preload);
/* Crack the cookie. */
WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
@@ -32,26 +41,19 @@ __wt_bm_preload(WT_BM *bm,
/* Check for a mapped block. */
mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen;
if (mapped)
- WT_RET(__wt_mmap_preload(
+ return (__wt_mmap_preload(
session, (uint8_t *)bm->map + offset, size));
- else {
+
#ifdef HAVE_POSIX_FADVISE
- ret = posix_fadvise(block->fh->fd,
- (wt_off_t)offset, (wt_off_t)size, POSIX_FADV_WILLNEED);
+ if (posix_fadvise(block->fh->fd,
+ (wt_off_t)offset, (wt_off_t)size, POSIX_FADV_WILLNEED) == 0)
+ return (0);
#endif
- if (ret != 0) {
- WT_DECL_ITEM(tmp);
- WT_RET(__wt_scr_alloc(session, size, &tmp));
- ret = __wt_block_read_off(
- session, block, tmp, offset, size, cksum);
- __wt_scr_free(session, &tmp);
- WT_RET(ret);
- }
- }
- WT_STAT_FAST_CONN_INCR(session, block_preload);
-
- return (0);
+ WT_RET(__wt_scr_alloc(session, size, &tmp));
+ ret = __wt_block_read_off(session, block, tmp, offset, size, cksum);
+ __wt_scr_free(session, &tmp);
+ return (ret);
}
/*
diff --git a/src/third_party/wiredtiger/src/btree/bt_compact.c b/src/third_party/wiredtiger/src/btree/bt_compact.c
index b2c9e4b67f8..8044d4f852d 100644
--- a/src/third_party/wiredtiger/src/btree/bt_compact.c
+++ b/src/third_party/wiredtiger/src/btree/bt_compact.c
@@ -45,7 +45,7 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
* Ignore empty pages, they get merged into the parent.
*/
if (mod == NULL || mod->rec_result == 0) {
- WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ __wt_ref_info(ref, &addr, &addr_size, NULL);
if (addr == NULL)
return (0);
WT_RET(
@@ -130,7 +130,7 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
* read, set its generation to a low value so it is evicted
* quickly.
*/
- WT_ERR(__wt_tree_walk(session, &ref, NULL,
+ WT_ERR(__wt_tree_walk(session, &ref,
WT_READ_COMPACT | WT_READ_NO_GEN | WT_READ_WONT_NEED));
if (ref == NULL)
break;
@@ -182,7 +182,7 @@ __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
* address, the page isn't on disk, but we have to read internal pages
* to walk the tree regardless; throw up our hands and read it.
*/
- WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, &type));
+ __wt_ref_info(ref, &addr, &addr_size, &type);
if (addr == NULL)
return (0);
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c
index 3c96bad39d7..55843d1cae5 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curnext.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c
@@ -527,7 +527,7 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating)
__wt_page_evict_soon(page);
cbt->page_deleted_count = 0;
- WT_ERR(__wt_tree_walk(session, &cbt->ref, NULL, flags));
+ WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c
index b7cea561b48..1d23b976edd 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curprev.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c
@@ -615,7 +615,7 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
__wt_page_evict_soon(page);
cbt->page_deleted_count = 0;
- WT_ERR(__wt_tree_walk(session, &cbt->ref, NULL, flags));
+ WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index 69512f45933..f2bf2978320 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -816,7 +816,12 @@ err: if (ret == WT_RESTART) {
/*
* __wt_btcur_next_random --
- * Move to a random record in the tree.
+ * Move to a random record in the tree. There are two algorithms, one
+ * where we select a record at random from the whole tree on each
+ * retrieval and one where we first select a record at random from the
+ * whole tree, and then subsequently sample forward from that location.
+ * The sampling approach allows us to select reasonably uniform random
+ * points from unbalanced trees.
*/
int
__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
@@ -825,6 +830,7 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
WT_DECL_RET;
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
+ uint64_t skip;
session = (WT_SESSION_IMPL *)cbt->iface.session;
btree = cbt->btree;
@@ -839,11 +845,63 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
WT_STAT_FAST_CONN_INCR(session, cursor_next);
WT_STAT_FAST_DATA_INCR(session, cursor_next);
- WT_RET(__cursor_func_init(cbt, true));
+ /*
+ * If retrieving random values without sampling, or we don't have a
+ * page reference, pick a roughly random leaf page in the tree.
+ */
+ if (cbt->ref == NULL || cbt->next_random_sample_size == 0) {
+ /*
+ * Skip past the sample size of the leaf pages in the tree
+ * between each random key return to compensate for unbalanced
+ * trees.
+ *
+ * Use the underlying file size divided by its block allocation
+ * size as our guess of leaf pages in the file (this can be
+ * entirely wrong, as it depends on how many pages are in this
+ * particular checkpoint, how large the leaf and internal pages
+ * really are, and other factors). Then, divide that value by
+ * the configured sample size and increment the final result to
+ * make sure tiny files don't leave us with a skip value of 0.
+ *
+ * !!!
+ * Ideally, the number would be prime to avoid restart issues.
+ */
+ if (cbt->next_random_sample_size != 0)
+ cbt->next_random_leaf_skip = (uint64_t)
+ ((btree->bm->block->fh->size / btree->allocsize) /
+ cbt->next_random_sample_size) + 1;
- WT_WITH_PAGE_INDEX(session,
- ret = __wt_row_random(session, cbt));
- WT_ERR(ret);
+ /*
+ * Choose a leaf page from the tree.
+ */
+ WT_ERR(__cursor_func_init(cbt, true));
+ WT_WITH_PAGE_INDEX(
+ session, ret = __wt_row_random_descent(session, cbt));
+ WT_ERR(ret);
+ } else {
+ /*
+ * Read through the tree, skipping leaf pages. Be cautious about
+ * the skip count: if the last leaf page skipped was also the
+ * last leaf page in the tree, it may be set to zero on return
+ * with the end-of-walk condition.
+ *
+ * Pages read for data sampling aren't "useful"; don't update
+ * the read generation of pages already in memory, and if a page
+ * is read, set its generation to a low value so it is evicted
+ * quickly.
+ */
+ for (skip =
+ cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;)
+ WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip,
+ WT_READ_NO_GEN |
+ WT_READ_SKIP_INTL | WT_READ_WONT_NEED));
+ }
+
+ /*
+ * Select a random entry from the leaf page. If it's not valid, move to
+ * the next entry; if that doesn't work, move to the previous entry.
+ */
+ WT_ERR(__wt_row_random_leaf(session, cbt));
if (__cursor_valid(cbt, &upd))
WT_ERR(__wt_kv_return(session, cbt, upd));
else {
@@ -851,9 +909,9 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
ret = __wt_btcur_prev(cbt, false);
WT_ERR(ret);
}
+ return (0);
-err: if (ret != 0)
- WT_TRET(__cursor_reset(cbt));
+err: WT_TRET(__cursor_reset(cbt));
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
index 0f47c060daf..d52a94a6da2 100644
--- a/src/third_party/wiredtiger/src/btree/bt_debug.c
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -43,7 +43,7 @@ static int __debug_page_col_var(WT_DBG *, WT_PAGE *);
static int __debug_page_metadata(WT_DBG *, WT_PAGE *);
static int __debug_page_row_int(WT_DBG *, WT_PAGE *, uint32_t);
static int __debug_page_row_leaf(WT_DBG *, WT_PAGE *);
-static int __debug_ref(WT_DBG *, WT_REF *);
+static void __debug_ref(WT_DBG *, WT_REF *);
static void __debug_row_skip(WT_DBG *, WT_INSERT_HEAD *);
static int __debug_tree(
WT_SESSION_IMPL *, WT_BTREE *, WT_PAGE *, const char *, uint32_t);
@@ -769,7 +769,7 @@ __debug_page_col_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
WT_INTL_FOREACH_BEGIN(session, page, ref) {
__dmsg(ds, "\trecno %" PRIu64 "\n", ref->key.recno);
- WT_RET(__debug_ref(ds, ref));
+ __debug_ref(ds, ref);
} WT_INTL_FOREACH_END;
if (LF_ISSET(WT_DEBUG_TREE_WALK))
@@ -843,7 +843,7 @@ __debug_page_row_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
WT_INTL_FOREACH_BEGIN(session, page, ref) {
__wt_ref_key(page, ref, &p, &len);
__debug_item(ds, "K", p, len);
- WT_RET(__debug_ref(ds, ref));
+ __debug_ref(ds, ref);
} WT_INTL_FOREACH_END;
if (LF_ISSET(WT_DEBUG_TREE_WALK))
@@ -965,7 +965,7 @@ __debug_update(WT_DBG *ds, WT_UPDATE *upd, bool hexbyte)
* __debug_ref --
* Dump a WT_REF structure.
*/
-static int
+static void
__debug_ref(WT_DBG *ds, WT_REF *ref)
{
WT_SESSION_IMPL *session;
@@ -994,14 +994,14 @@ __debug_ref(WT_DBG *ds, WT_REF *ref)
case WT_REF_SPLIT:
__dmsg(ds, "split");
break;
- WT_ILLEGAL_VALUE(session);
+ default:
+ __dmsg(ds, "INVALID");
+ break;
}
- WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ __wt_ref_info(ref, &addr, &addr_size, NULL);
__dmsg(ds, " %s\n",
__wt_addr_string(session, addr, addr_size, ds->tmp));
-
- return (0);
}
/*
diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c
index 294cc399d65..a6330326954 100644
--- a/src/third_party/wiredtiger/src/btree/bt_handle.c
+++ b/src/third_party/wiredtiger/src/btree/bt_handle.c
@@ -601,7 +601,7 @@ __btree_preload(WT_SESSION_IMPL *session)
/* Pre-load the second-level internal pages. */
WT_INTL_FOREACH_BEGIN(session, btree->root.page, ref) {
- WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ __wt_ref_info(ref, &addr, &addr_size, NULL);
if (addr != NULL)
WT_RET(bm->preload(bm, session, addr, addr_size));
} WT_INTL_FOREACH_END;
@@ -622,7 +622,7 @@ __btree_get_last_recno(WT_SESSION_IMPL *session)
btree = S2BT(session);
next_walk = NULL;
- WT_RET(__wt_tree_walk(session, &next_walk, NULL, WT_READ_PREV));
+ WT_RET(__wt_tree_walk(session, &next_walk, WT_READ_PREV));
if (next_walk == NULL)
return (WT_NOTFOUND);
diff --git a/src/third_party/wiredtiger/src/btree/bt_misc.c b/src/third_party/wiredtiger/src/btree/bt_misc.c
index d2b16bb5d21..a60499ef8b7 100644
--- a/src/third_party/wiredtiger/src/btree/bt_misc.c
+++ b/src/third_party/wiredtiger/src/btree/bt_misc.c
@@ -101,7 +101,7 @@ __wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf)
return (buf->data);
}
- (void)__wt_ref_info(session, ref, &addr, &addr_size, NULL);
+ __wt_ref_info(ref, &addr, &addr_size, NULL);
return (__wt_addr_string(session, addr, addr_size, buf));
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c
index 77215474359..c50f97bbe14 100644
--- a/src/third_party/wiredtiger/src/btree/bt_read.c
+++ b/src/third_party/wiredtiger/src/btree/bt_read.c
@@ -375,7 +375,7 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref)
* Get the address: if there is no address, the page was deleted, but a
* subsequent search or insert is forcing re-creation of the name space.
*/
- WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ __wt_ref_info(ref, &addr, &addr_size, NULL);
if (addr == NULL) {
WT_ASSERT(session, previous_state == WT_REF_DELETED);
diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c
index 2f8759b9d82..5dd75835b0b 100644
--- a/src/third_party/wiredtiger/src/btree/bt_stat.c
+++ b/src/third_party/wiredtiger/src/btree/bt_stat.c
@@ -59,8 +59,8 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
WT_STAT_SET(session, stats, btree_row_leaf, 0);
next_walk = NULL;
- while ((ret = __wt_tree_walk(session, &next_walk, NULL, 0)) == 0 &&
- next_walk != NULL) {
+ while ((ret = __wt_tree_walk(
+ session, &next_walk, 0)) == 0 && next_walk != NULL) {
WT_WITH_PAGE_INDEX(session,
ret = __stat_page(session, next_walk->page, stats));
WT_RET(ret);
diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c
index 07bb2eb3a01..86607d8f187 100644
--- a/src/third_party/wiredtiger/src/btree/bt_sync.c
+++ b/src/third_party/wiredtiger/src/btree/bt_sync.c
@@ -58,7 +58,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
for (walk = NULL;;) {
- WT_ERR(__wt_tree_walk(session, &walk, NULL, flags));
+ WT_ERR(__wt_tree_walk(session, &walk, flags));
if (walk == NULL)
break;
@@ -124,7 +124,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
/* Write all dirty in-cache pages. */
flags |= WT_READ_NO_EVICT;
for (walk = NULL;;) {
- WT_ERR(__wt_tree_walk(session, &walk, NULL, flags));
+ WT_ERR(__wt_tree_walk(session, &walk, flags));
if (walk == NULL)
break;
diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c
index c7d83d8dfff..b46c9a03dcf 100644
--- a/src/third_party/wiredtiger/src/btree/bt_walk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_walk.c
@@ -69,16 +69,36 @@ retry: WT_INTL_INDEX_GET(session, ref->home, pindex);
}
/*
- * __wt_tree_walk --
+ * __ref_is_leaf --
+ * Check if a reference is for a leaf page.
+ */
+static inline bool
+__ref_is_leaf(WT_REF *ref)
+{
+ size_t addr_size;
+ u_int type;
+ const uint8_t *addr;
+
+ /*
+ * If the page has a disk address, we can crack it to figure out if
+ * this page is a leaf page or not. If there's no address, the page
+ * isn't on disk and we don't know the page type.
+ */
+ __wt_ref_info(ref, &addr, &addr_size, &type);
+ return (addr == NULL ?
+ false : type == WT_CELL_ADDR_LEAF || type == WT_CELL_ADDR_LEAF_NO);
+}
+
+/*
+ * __tree_walk_internal --
* Move to the next/previous page in the tree.
*/
-int
-__wt_tree_walk(WT_SESSION_IMPL *session,
- WT_REF **refp, uint64_t *walkcntp, uint32_t flags)
+static inline int
+__tree_walk_internal(WT_SESSION_IMPL *session,
+ WT_REF **refp, uint64_t *walkcntp, uint64_t *skipleafcntp, uint32_t flags)
{
WT_BTREE *btree;
WT_DECL_RET;
- WT_PAGE *page;
WT_PAGE_INDEX *pindex;
WT_REF *couple, *couple_orig, *ref;
bool empty_internal, prev, skip;
@@ -304,6 +324,31 @@ ascend: /*
break;
}
+ /*
+ * Optionally skip leaf pages: skip all leaf pages if
+ * WT_READ_SKIP_LEAF is set; when the skip-leaf-count
+ * variable is non-zero, skip some count of leaf pages.
+ * If this page is disk-based, crack the cell to figure
+ * out if it's a leaf page without reading it.
+ *
+ * If skipping some number of leaf pages, decrement the
+ * count of pages to zero, and then take the next leaf
+ * page we can. Be cautious around the page decrement:
+ * if for some reason we don't take this particular
+ * page, we can take the next one, and there are
+ * additional tests/decrements when we're about to
+ * return a leaf page.
+ */
+ if (skipleafcntp != NULL || LF_ISSET(WT_READ_SKIP_LEAF))
+ if (__ref_is_leaf(ref)) {
+ if (LF_ISSET(WT_READ_SKIP_LEAF))
+ break;
+ if (*skipleafcntp > 0) {
+ --*skipleafcntp;
+ break;
+ }
+ }
+
ret = __wt_page_swap(session, couple, ref, flags);
/*
@@ -359,13 +404,29 @@ ascend: /*
* A new page: configure for traversal of any internal
* page's children, else return the leaf page.
*/
-descend: couple = ref;
- page = ref->page;
- if (WT_PAGE_IS_INTERNAL(page)) {
- WT_INTL_INDEX_GET(session, page, pindex);
+ if (WT_PAGE_IS_INTERNAL(ref->page)) {
+descend: couple = ref;
+ WT_INTL_INDEX_GET(session, ref->page, pindex);
slot = prev ? pindex->entries - 1 : 0;
empty_internal = true;
} else {
+ /*
+ * Optionally skip leaf pages, the second half.
+ * We didn't have an on-page cell to figure out
+ * if it was a leaf page, we had to acquire the
+ * hazard pointer and look at the page.
+ */
+ if (skipleafcntp != NULL ||
+ LF_ISSET(WT_READ_SKIP_LEAF)) {
+ couple = ref;
+ if (LF_ISSET(WT_READ_SKIP_LEAF))
+ break;
+ if (*skipleafcntp > 0) {
+ --*skipleafcntp;
+ break;
+ }
+ }
+
*refp = ref;
goto done;
}
@@ -376,3 +437,37 @@ done:
err: WT_LEAVE_PAGE_INDEX(session);
return (ret);
}
+
+/*
+ * __wt_tree_walk --
+ * Move to the next/previous page in the tree.
+ */
+int
+__wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags)
+{
+ return (__tree_walk_internal(session, refp, NULL, NULL, flags));
+}
+
+/*
+ * __wt_tree_walk_count --
+ * Move to the next/previous page in the tree, tracking how many
+ * references were visited to get there.
+ */
+int
+__wt_tree_walk_count(WT_SESSION_IMPL *session,
+ WT_REF **refp, uint64_t *walkcntp, uint32_t flags)
+{
+ return (__tree_walk_internal(session, refp, walkcntp, NULL, flags));
+}
+
+/*
+ * __wt_tree_walk_skip --
+ * Move to the next/previous page in the tree, skipping a certain number
+ * of leaf pages before returning.
+ */
+int
+__wt_tree_walk_skip(WT_SESSION_IMPL *session,
+ WT_REF **refp, uint64_t *skipleafcntp, uint32_t flags)
+{
+ return (__tree_walk_internal(session, refp, NULL, skipleafcntp, flags));
+}
diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c
index d2d8a4640ca..079f9d3bad1 100644
--- a/src/third_party/wiredtiger/src/btree/row_srch.c
+++ b/src/third_party/wiredtiger/src/btree/row_srch.c
@@ -536,19 +536,66 @@ err: /*
}
/*
- * __wt_row_random --
- * Return a random key from a row-store tree.
+ * __wt_row_random_leaf --
+ * Return a random key from a row-store leaf page.
*/
int
-__wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+__wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+ WT_INSERT *p, *t;
+ WT_PAGE *page;
+ uint32_t cnt;
+
+ page = cbt->ref->page;
+
+ if (page->pg_row_entries != 0) {
+ cbt->compare = 0;
+ cbt->slot = __wt_random(&session->rnd) % page->pg_row_entries;
+
+ /*
+ * The real row-store search function builds the key, so we
+ * have to as well.
+ */
+ return (__wt_row_leaf_key(session,
+ page, page->pg_row_d + cbt->slot, cbt->tmp, false));
+ }
+
+ /*
+ * If the tree is new (and not empty), it might have a large insert
+ * list. Count how many records are in the list.
+ */
+ F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
+ if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
+ return (WT_NOTFOUND);
+ for (cnt = 1, p = WT_SKIP_FIRST(cbt->ins_head);; ++cnt)
+ if ((p = WT_SKIP_NEXT(p)) == NULL)
+ break;
+
+ /*
+ * Select a random number from 0 to (N - 1), return that record.
+ */
+ cnt = __wt_random(&session->rnd) % cnt;
+ for (p = t = WT_SKIP_FIRST(cbt->ins_head);; t = p)
+ if (cnt-- == 0 || (p = WT_SKIP_NEXT(p)) == NULL)
+ break;
+ cbt->compare = 0;
+ cbt->ins = t;
+
+ return (0);
+}
+
+/*
+ * __wt_row_random_descent --
+ * Find a random leaf page in a row-store tree.
+ */
+int
+__wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
WT_BTREE *btree;
WT_DECL_RET;
- WT_INSERT *p, *t;
WT_PAGE *page;
WT_PAGE_INDEX *pindex;
WT_REF *current, *descent;
- uint32_t cnt;
btree = S2BT(session);
@@ -585,43 +632,6 @@ restart_root:
return (ret);
}
- if (page->pg_row_entries != 0) {
- cbt->ref = current;
- cbt->compare = 0;
- cbt->slot = __wt_random(&session->rnd) % page->pg_row_entries;
-
- /*
- * The real row-store search function builds the key, so we
- * have to as well.
- */
- return (__wt_row_leaf_key(session,
- page, page->pg_row_d + cbt->slot, cbt->tmp, false));
- }
-
- /*
- * If the tree is new (and not empty), it might have a large insert
- * list. Count how many records are in the list.
- */
- F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
- if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
- WT_ERR(WT_NOTFOUND);
- for (cnt = 1, p = WT_SKIP_FIRST(cbt->ins_head);; ++cnt)
- if ((p = WT_SKIP_NEXT(p)) == NULL)
- break;
-
- /*
- * Select a random number from 0 to (N - 1), return that record.
- */
- cnt = __wt_random(&session->rnd) % cnt;
- for (p = t = WT_SKIP_FIRST(cbt->ins_head);; t = p)
- if (cnt-- == 0 || (p = WT_SKIP_NEXT(p)) == NULL)
- break;
cbt->ref = current;
- cbt->compare = 0;
- cbt->ins = t;
-
return (0);
-
-err: WT_TRET(__wt_page_release(session, current, 0));
- return (ret);
}
diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c
index d79ce6853e6..9d12e953498 100644
--- a/src/third_party/wiredtiger/src/config/config_def.c
+++ b/src/third_party/wiredtiger/src/config/config_def.c
@@ -323,6 +323,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_open_cursor[] = {
NULL, "choices=[\"hex\",\"json\",\"print\"]",
NULL, 0 },
{ "next_random", "boolean", NULL, NULL, NULL, 0 },
+ { "next_random_sample_size", "string", NULL, NULL, NULL, 0 },
{ "overwrite", "boolean", NULL, NULL, NULL, 0 },
{ "raw", "boolean", NULL, NULL, NULL, 0 },
{ "readonly", "boolean", NULL, NULL, NULL, 0 },
@@ -920,9 +921,10 @@ static const WT_CONFIG_ENTRY config_entries[] = {
NULL, 0
},
{ "WT_SESSION.open_cursor",
- "append=0,bulk=0,checkpoint=,dump=,next_random=0,overwrite=,raw=0"
- ",readonly=0,skip_sort_check=0,statistics=,target=",
- confchk_WT_SESSION_open_cursor, 11
+ "append=0,bulk=0,checkpoint=,dump=,next_random=0,"
+ "next_random_sample_size=0,overwrite=,raw=0,readonly=0,"
+ "skip_sort_check=0,statistics=,target=",
+ confchk_WT_SESSION_open_cursor, 12
},
{ "WT_SESSION.reconfigure",
"isolation=read-committed",
diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c
index 63f77248ca8..b955b292292 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_file.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_file.c
@@ -455,14 +455,24 @@ __wt_curfile_create(WT_SESSION_IMPL *session,
}
/*
- * random_retrieval
- * Random retrieval cursors only support next, reset and close.
+ * Random retrieval, row-store only.
+ * Random retrieval cursors support a limited set of methods.
*/
WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
if (cval.val != 0) {
+ if (WT_CURSOR_RECNO(cursor))
+ WT_ERR_MSG(session, ENOTSUP,
+ "next_random configuration not supported for "
+ "column-store objects");
+
__wt_cursor_set_notsup(cursor);
cursor->next = __curfile_next_random;
cursor->reset = __curfile_reset;
+
+ WT_ERR(__wt_config_gets_def(
+ session, cfg, "next_random_sample_size", 0, &cval));
+ if (cval.val != 0)
+ cbt->next_random_sample_size = (u_int)cval.val;
}
/* Underlying btree initialization. */
diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c
index f92426355ef..da38988b6c2 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_std.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_std.c
@@ -40,11 +40,11 @@ void
__wt_cursor_set_notsup(WT_CURSOR *cursor)
{
/*
- * Set all of the cursor methods (except for close and reset), to fail.
- * Close is unchanged so the cursor can be discarded, reset defaults to
+ * Set cursor methods other than close, reconfigure and reset, to fail.
+ * Close is unchanged so the cursor can be discarded; reset is set to
* a no-op because session transactional operations reset all of the
- * cursors in a session, and random cursors shouldn't block transactions
- * or checkpoints.
+ * cursors in a session. Reconfigure is left open in case it's possible
+ * in the future to change these configurations.
*/
cursor->compare =
(int (*)(WT_CURSOR *, WT_CURSOR *, int *))__wt_cursor_notsup;
diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c
index 2b2117ad9fd..c5f6ae3d4d1 100644
--- a/src/third_party/wiredtiger/src/evict/evict_file.c
+++ b/src/third_party/wiredtiger/src/evict/evict_file.c
@@ -31,8 +31,8 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
/* Walk the tree, discarding pages. */
next_ref = NULL;
- WT_ERR(__wt_tree_walk(session, &next_ref, NULL,
- WT_READ_CACHE | WT_READ_NO_EVICT));
+ WT_ERR(__wt_tree_walk(
+ session, &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT));
while ((ref = next_ref) != NULL) {
page = ref->page;
@@ -68,8 +68,8 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
* the reconciliation, the next walk call could miss a page in
* the tree.
*/
- WT_ERR(__wt_tree_walk(session, &next_ref, NULL,
- WT_READ_CACHE | WT_READ_NO_EVICT));
+ WT_ERR(__wt_tree_walk(session,
+ &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT));
switch (syncop) {
case WT_SYNC_CLOSE:
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index b5cb850d83c..ac481581c23 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -473,6 +473,15 @@ __evict_update_work(WT_SESSION_IMPL *session)
return (false);
/*
+ * Set up the number of refs to consider in each handle, depending
+ * on how many handles are open. We want to consider fewer candidates
+ * from each file as more files are open. Handle the case where there
+ * are no files open by adding 1.
+ */
+ cache->evict_max_refs_per_file =
+ WT_MAX(100, WT_MILLION / (conn->open_file_count + 1));
+
+ /*
* Page eviction overrides the dirty target and other types of eviction,
* that is, we don't care where we are with respect to the dirty target
* if page eviction is configured.
@@ -1214,9 +1223,9 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
*/
for (evict = start, pages_walked = 0;
evict < end && !enough && (ret == 0 || ret == WT_NOTFOUND);
- ret = __wt_tree_walk(
+ ret = __wt_tree_walk_count(
session, &btree->evict_ref, &pages_walked, walk_flags)) {
- enough = pages_walked > WT_EVICT_MAX_PER_FILE;
+ enough = pages_walked > cache->evict_max_refs_per_file;
if ((ref = btree->evict_ref) == NULL) {
if (++restarts == 2 || enough)
break;
@@ -1321,8 +1330,9 @@ fast: /* If the page can't be evicted, give up. */
if (__wt_ref_is_root(ref))
WT_RET(__evict_clear_walk(session));
else if (ref->page->read_gen == WT_READGEN_OLDEST)
- WT_RET_NOTFOUND_OK(__wt_tree_walk(session,
- &btree->evict_ref, &pages_walked, walk_flags));
+ WT_RET_NOTFOUND_OK(__wt_tree_walk_count(
+ session, &btree->evict_ref,
+ &pages_walked, walk_flags));
}
WT_STAT_FAST_CONN_INCRV(session, cache_eviction_walk, pages_walked);
@@ -1602,7 +1612,7 @@ __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile)
next_walk = NULL;
session->dhandle = dhandle;
- while (__wt_tree_walk(session, &next_walk, NULL,
+ while (__wt_tree_walk(session, &next_walk,
WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 &&
next_walk != NULL) {
page = next_walk->page;
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
index 3e2e7158e04..23e0dfea2cd 100644
--- a/src/third_party/wiredtiger/src/include/btree.i
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -948,9 +948,8 @@ __wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value)
* __wt_ref_info --
* Return the addr/size and type triplet for a reference.
*/
-static inline int
-__wt_ref_info(WT_SESSION_IMPL *session,
- WT_REF *ref, const uint8_t **addrp, size_t *sizep, u_int *typep)
+static inline void
+__wt_ref_info(WT_REF *ref, const uint8_t **addrp, size_t *sizep, u_int *typep)
{
WT_ADDR *addr;
WT_CELL_UNPACK *unpack, _unpack;
@@ -984,7 +983,9 @@ __wt_ref_info(WT_SESSION_IMPL *session,
case WT_ADDR_LEAF_NO:
*typep = WT_CELL_ADDR_LEAF_NO;
break;
- WT_ILLEGAL_VALUE(session);
+ default:
+ *typep = 0;
+ break;
}
} else {
__wt_cell_unpack((WT_CELL *)addr, unpack);
@@ -993,7 +994,6 @@ __wt_ref_info(WT_SESSION_IMPL *session,
if (typep != NULL)
*typep = unpack->type;
}
- return (0);
}
/*
@@ -1009,7 +1009,7 @@ __wt_ref_block_free(WT_SESSION_IMPL *session, WT_REF *ref)
if (ref->addr == NULL)
return (0);
- WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ __wt_ref_info(ref, &addr, &addr_size, NULL);
WT_RET(__wt_btree_block_free(session, addr, addr_size));
/* Clear the address (so we don't free it twice). */
diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h
index d8a3829863f..a0440f23a00 100644
--- a/src/third_party/wiredtiger/src/include/cache.h
+++ b/src/third_party/wiredtiger/src/include/cache.h
@@ -14,7 +14,6 @@
pages by this many increments of the
read generation. */
#define WT_EVICT_WALK_PER_FILE 10 /* Pages to queue per file */
-#define WT_EVICT_MAX_PER_FILE 100 /* Max pages to visit per file */
#define WT_EVICT_WALK_BASE 300 /* Pages tracked across file visits */
#define WT_EVICT_WALK_INCR 100 /* Pages added each walk */
@@ -107,6 +106,7 @@ struct __wt_cache {
uint32_t evict_slots; /* LRU list eviction slots */
WT_DATA_HANDLE
*evict_file_next; /* LRU next file to search */
+ uint32_t evict_max_refs_per_file;/* LRU pages per file per pass */
/*
* Cache pool information.
diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h
index 54787d2227b..275e2f2db46 100644
--- a/src/third_party/wiredtiger/src/include/cursor.h
+++ b/src/third_party/wiredtiger/src/include/cursor.h
@@ -104,6 +104,14 @@ struct __wt_cursor_btree {
uint64_t recno; /* Record number */
/*
+ * Next-random cursors can optionally be configured to step through a
+ * percentage of the total leaf pages to their next value. Note the
+ * configured value and the calculated number of leaf pages to skip.
+ */
+ uint64_t next_random_leaf_skip;
+ u_int next_random_sample_size;
+
+ /*
* The search function sets compare to:
* < 1 if the found key is less than the specified key
* 0 if the found key matches the specified key
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index af8a7aa70e9..d84403cc16d 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -51,7 +51,7 @@ extern int __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block);
extern int __wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh, uint32_t allocsize);
extern void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats);
extern int __wt_block_manager_size( WT_SESSION_IMPL *session, const char *filename, WT_DSRC_STATS *stats);
-extern int __wt_bm_preload(WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size);
+extern int __wt_bm_preload( WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size);
extern int __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size);
extern int __wt_block_read_off_blind( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset);
extern int __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t cksum);
@@ -166,7 +166,9 @@ extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk, size_t size, bool empty_page_ok);
extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *tag, WT_ITEM *buf);
-extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags);
+extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags);
+extern int __wt_tree_walk_count(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags);
+extern int __wt_tree_walk_skip(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp, uint32_t flags);
extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove);
extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt);
extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page);
@@ -183,7 +185,8 @@ extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE
extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd);
extern int __wt_search_insert( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key);
extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert);
-extern int __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
+extern int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
+extern int __wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
extern void __wt_las_stats_update(WT_SESSION_IMPL *session);
extern int __wt_las_create(WT_SESSION_IMPL *session);
extern int __wt_las_destroy(WT_SESSION_IMPL *session);
diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h
index 064349125cc..bafff92fbc0 100644
--- a/src/third_party/wiredtiger/src/include/flags.h
+++ b/src/third_party/wiredtiger/src/include/flags.h
@@ -45,8 +45,9 @@
#define WT_READ_NO_WAIT 0x00000020
#define WT_READ_PREV 0x00000040
#define WT_READ_SKIP_INTL 0x00000080
-#define WT_READ_TRUNCATE 0x00000100
-#define WT_READ_WONT_NEED 0x00000200
+#define WT_READ_SKIP_LEAF 0x00000100
+#define WT_READ_TRUNCATE 0x00000200
+#define WT_READ_WONT_NEED 0x00000400
#define WT_SESSION_CAN_WAIT 0x00000001
#define WT_SESSION_CLEAR_EVICT_WALK 0x00000002
#define WT_SESSION_INTERNAL 0x00000004
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index 08f73386090..bdd8bb65910 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -896,18 +896,17 @@ struct __wt_session {
* boolean flag; default \c false.}
* @config{bulk, configure the cursor for bulk-loading\, a fast\,
* initial load path (see @ref tune_bulk_load for more information).
- * Bulk-load may only be used for newly created objects and cursors
- * configured for bulk-load only support the WT_CURSOR::insert and
- * WT_CURSOR::close methods. When bulk-loading row-store objects\, keys
- * must be loaded in sorted order. The value is usually a true/false
- * flag; when bulk-loading fixed-length column store objects\, the
- * special value \c bitmap allows chunks of a memory resident bitmap to
- * be loaded directly into a file by passing a \c WT_ITEM to
- * WT_CURSOR::set_value where the \c size field indicates the number of
- * records in the bitmap (as specified by the object's \c value_format
- * configuration). Bulk-loaded bitmap values must end on a byte boundary
- * relative to the bit count (except for the last set of values
- * loaded)., a string; default \c false.}
+ * Bulk-load may only be used for newly created objects and applications
+ * should use the WT_CURSOR::insert method to insert rows. When
+ * bulk-loading\, rows must be loaded in sorted order. The value is
+ * usually a true/false flag; when bulk-loading fixed-length column
+ * store objects\, the special value \c bitmap allows chunks of a memory
+ * resident bitmap to be loaded directly into a file by passing a \c
+ * WT_ITEM to WT_CURSOR::set_value where the \c size field indicates the
+ * number of records in the bitmap (as specified by the object's \c
+ * value_format configuration). Bulk-loaded bitmap values must end on a
+ * byte boundary relative to the bit count (except for the last set of
+ * values loaded)., a string; default \c false.}
* @config{checkpoint, the name of a checkpoint to open (the reserved
* name "WiredTigerCheckpoint" opens the most recent internal checkpoint
* taken for the object). The cursor does not support data
@@ -921,10 +920,19 @@ struct __wt_session {
* string\, chosen from the following options: \c "hex"\, \c "json"\, \c
* "print"; default empty.}
* @config{next_random, configure the cursor to return a pseudo-random
- * record from the object; valid only for row-store cursors. Cursors
- * configured with \c next_random=true only support the WT_CURSOR::next
- * and WT_CURSOR::close methods. See @ref cursor_random for details., a
- * boolean flag; default \c false.}
+ * record from the object when the WT_CURSOR::next method is called;
+ * valid only for row-store cursors. See @ref cursor_random for
+ * details., a boolean flag; default \c false.}
+ * @config{next_random_sample_size, cursors configured by \c next_random
+ * to return pseudo-random records from the object randomly select from
+ * the entire object\, by default. Setting \c next_random_sample_size
+ * to a non-zero value sets the number of samples the application
+ * expects to take using the \c next_random cursor. A cursor configured
+ * with both \c next_random and \c next_random_sample_size attempts to
+ * divide the object into \c next_random_sample_size equal-sized
+ * pieces\, and each retrieval returns a record from one of those
+ * pieces. See @ref cursor_random for details., a string; default \c
+ * 0.}
* @config{overwrite, configures whether the cursor's insert\, update
* and remove methods check the existing state of the record. If \c
* overwrite is \c false\, WT_CURSOR::insert fails with