summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuke Chen <luke.chen@mongodb.com>2021-05-04 16:46:29 +1000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2021-05-04 07:11:56 +0000
commitc712d5ddbd852b03331284ab9e08bae3addb4aa0 (patch)
treee5f3be8f18fa2ad415ae245cc84eb56dcf599457
parent3174b0ca6254e805983e7e03eae0295d3c48806c (diff)
downloadmongo-c712d5ddbd852b03331284ab9e08bae3addb4aa0.tar.gz
Import wiredtiger: 652def8ae433a2001145d34baec9ef9f5610a97e from branch mongodb-5.0
ref: 859f83a6cc..652def8ae4 for: 5.0.0 WT-7264 Creating a new configuration for search near that allows it to exit quickly when searching for prefixes
-rw-r--r--src/third_party/wiredtiger/dist/api_data.py8
-rw-r--r--src/third_party/wiredtiger/dist/stat_data.py1
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_curnext.c39
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_curprev.c39
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_cursor.c9
-rw-r--r--src/third_party/wiredtiger/src/config/config_def.c17
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_std.c19
-rw-r--r--src/third_party/wiredtiger/src/include/btree_cmp_inline.h21
-rw-r--r--src/third_party/wiredtiger/src/include/extern.h8
-rw-r--r--src/third_party/wiredtiger/src/include/stat.h2
-rw-r--r--src/third_party/wiredtiger/src/include/wiredtiger.in200
-rw-r--r--src/third_party/wiredtiger/src/support/stat.c9
-rw-r--r--src/third_party/wiredtiger/src/txn/txn.c2
-rw-r--r--src/third_party/wiredtiger/test/suite/test_search_near01.py330
15 files changed, 581 insertions, 125 deletions
diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py
index 945414cf3e5..953474d404f 100644
--- a/src/third_party/wiredtiger/dist/api_data.py
+++ b/src/third_party/wiredtiger/dist/api_data.py
@@ -1245,6 +1245,14 @@ cursor_runtime_config = [
if the record exists, WT_CURSOR::update fails with ::WT_NOTFOUND
if the record does not exist''',
type='boolean'),
+ Config('prefix_search', 'false', r'''
+ when performing a search near for a prefix, if set to true this
+ configuration will allow the search near to exit early if it has left
+ the key range defined by the prefix. This is relevant when the table
+ contains a large number of records which potentially aren't visible to
+ the caller of search near, as such a large number of records could be skipped.
+ The prefix_search configuration provides a fast exit in this scenario.''', type='boolean',
+ undoc=True),
]
methods = {
diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py
index 16cd9553968..34e5b020a4a 100644
--- a/src/third_party/wiredtiger/dist/stat_data.py
+++ b/src/third_party/wiredtiger/dist/stat_data.py
@@ -813,6 +813,7 @@ conn_dsrc_stats = [
CursorStat('cursor_prev_skip_ge_100', 'cursor prev calls that skip greater than or equal to 100 entries'),
CursorStat('cursor_prev_skip_lt_100', 'cursor prev calls that skip less than 100 entries'),
CursorStat('cursor_prev_skip_total', 'Total number of entries skipped by cursor prev calls'),
+ CursorStat('cursor_search_near_prefix_fast_paths', 'Total number of times a search near has exited due to prefix config'),
CursorStat('cursor_skip_hs_cur_position', 'Total number of entries skipped to position the history store cursor'),
##########################################
# Checkpoint cleanup statistics
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 19ed0ad74b7..aa12ed897f8 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-5.0",
- "commit": "859f83a6ccf9bf1361f8aa75c1ba59e5624ea02d"
+ "commit": "652def8ae433a2001145d34baec9ef9f5610a97e"
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c
index 18fdef73315..a2b7f161d3e 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curnext.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c
@@ -305,7 +305,8 @@ restart_read:
* Move to the next row-store item.
*/
static inline int
-__cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart, size_t *skippedp)
+__cursor_row_next(
+ WT_CURSOR_BTREE *cbt, bool newpage, bool restart, size_t *skippedp, WT_ITEM *prefix)
{
WT_CELL_UNPACK_KV kpack;
WT_INSERT *ins;
@@ -402,6 +403,17 @@ restart_read_insert:
restart_read_page:
rip = &page->pg_row[cbt->slot];
WT_RET(__cursor_row_slot_key_return(cbt, rip, &kpack, &kpack_used));
+ /*
+ * If the cursor has prefix search configured we can early exit here if the key that we are
+ * visiting is after our prefix.
+ */
+ if (F_ISSET(&cbt->iface, WT_CURSTD_PREFIX_SEARCH) && prefix != NULL &&
+ __wt_prefix_match(prefix, &cbt->iface.key) < 0) {
+ /* It is not okay for the user to have a custom collator. */
+ WT_ASSERT(session, CUR2BT(cbt)->collator == NULL);
+ WT_STAT_CONN_DATA_INCR(session, cursor_search_near_prefix_fast_paths);
+ return (WT_NOTFOUND);
+ }
WT_RET(__wt_txn_read(
session, cbt, &cbt->iface.key, WT_RECNO_OOB, WT_ROW_UPDATE(page, rip), NULL));
if (cbt->upd_value->type == WT_UPDATE_INVALID) {
@@ -622,11 +634,12 @@ __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt)
}
/*
- * __wt_btcur_next --
- * Move to the next record in the tree.
+ * __wt_btcur_next_prefix --
+ * Move to the next record in the tree. Taking an optional prefix item for a special case of
+ * search near.
*/
int
-__wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating)
+__wt_btcur_next_prefix(WT_CURSOR_BTREE *cbt, WT_ITEM *prefix, bool truncating)
{
WT_CURSOR *cursor;
WT_DECL_RET;
@@ -692,8 +705,14 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating)
total_skipped += skipped;
break;
case WT_PAGE_ROW_LEAF:
- ret = __cursor_row_next(cbt, newpage, restart, &skipped);
+ ret = __cursor_row_next(cbt, newpage, restart, &skipped, prefix);
total_skipped += skipped;
+ /*
+ * We can directly return WT_NOTFOUND here as the caller expects the cursor to be
+ * positioned when traversing keys for prefix search near.
+ */
+ if (ret == WT_NOTFOUND && F_ISSET(&cbt->iface, WT_CURSTD_PREFIX_SEARCH))
+ return (WT_NOTFOUND);
break;
default:
WT_ERR(__wt_illegal_value(session, page->type));
@@ -774,3 +793,13 @@ err:
F_CLR(cbt, WT_CBT_ITERATE_RETRY_PREV);
return (ret);
}
+
+/*
+ * __wt_btcur_next --
+ * Move to the next record in the tree, passing a NULL prefix so no prefix comparison is made.
+ */
+int
+__wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating)
+{
+ return (__wt_btcur_next_prefix(cbt, NULL, truncating));
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c
index 7517eac77d8..867a46201a4 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curprev.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c
@@ -441,10 +441,12 @@ restart_read:
/*
* __cursor_row_prev --
- * Move to the previous row-store item.
+ * Move to the previous row-store item. Taking an optional prefix item for a special case of
+ * search near.
*/
static inline int
-__cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart, size_t *skippedp)
+__cursor_row_prev(
+ WT_CURSOR_BTREE *cbt, bool newpage, bool restart, size_t *skippedp, WT_ITEM *prefix)
{
WT_CELL_UNPACK_KV kpack;
WT_INSERT *ins;
@@ -553,6 +555,17 @@ restart_read_insert:
restart_read_page:
rip = &page->pg_row[cbt->slot];
WT_RET(__cursor_row_slot_key_return(cbt, rip, &kpack, &kpack_used));
+ /*
+ * If the cursor has prefix search configured we can early exit here if the key we are
+ * visiting is before our prefix.
+ */
+ if (F_ISSET(&cbt->iface, WT_CURSTD_PREFIX_SEARCH) && prefix != NULL &&
+ __wt_prefix_match(prefix, &cbt->iface.key) > 0) {
+ /* It is not okay for the user to have a custom collator. */
+ WT_ASSERT(session, CUR2BT(cbt)->collator == NULL);
+ WT_STAT_CONN_DATA_INCR(session, cursor_search_near_prefix_fast_paths);
+ return (WT_NOTFOUND);
+ }
WT_RET(__wt_txn_read(
session, cbt, &cbt->iface.key, WT_RECNO_OOB, WT_ROW_UPDATE(page, rip), NULL));
if (cbt->upd_value->type == WT_UPDATE_INVALID) {
@@ -572,11 +585,11 @@ restart_read_page:
}
/*
- * __wt_btcur_prev --
+ * __wt_btcur_prev_prefix --
* Move to the previous record in the tree.
*/
int
-__wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
+__wt_btcur_prev_prefix(WT_CURSOR_BTREE *cbt, WT_ITEM *prefix, bool truncating)
{
WT_CURSOR *cursor;
WT_DECL_RET;
@@ -653,8 +666,14 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
total_skipped += skipped;
break;
case WT_PAGE_ROW_LEAF:
- ret = __cursor_row_prev(cbt, newpage, restart, &skipped);
+ ret = __cursor_row_prev(cbt, newpage, restart, &skipped, prefix);
total_skipped += skipped;
+ /*
+ * We can directly return WT_NOTFOUND here as the caller will reset the cursor for
+ * us, this way we don't leave the cursor positioned after returning WT_NOTFOUND.
+ */
+ if (ret == WT_NOTFOUND && F_ISSET(&cbt->iface, WT_CURSTD_PREFIX_SEARCH))
+ return (WT_NOTFOUND);
break;
default:
WT_ERR(__wt_illegal_value(session, page->type));
@@ -726,3 +745,13 @@ err:
F_CLR(cbt, WT_CBT_ITERATE_RETRY_NEXT);
return (ret);
}
+
+/*
+ * __wt_btcur_prev --
+ * Move to the previous record in the tree, passing a NULL prefix so no prefix comparison is made.
+ */
+int
+__wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
+{
+ return (__wt_btcur_prev_prefix(cbt, NULL, truncating));
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index 3c8786b91cb..481a1632a08 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -690,7 +690,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
* here because at low isolation levels, new records could appear as we are stepping through
* the tree.
*/
- while ((ret = __wt_btcur_next(cbt, false)) != WT_NOTFOUND) {
+ while ((ret = __wt_btcur_next_prefix(cbt, &state.key, false)) != WT_NOTFOUND) {
WT_ERR(ret);
if (btree->type == BTREE_ROW)
WT_ERR(__wt_compare(session, btree->collator, &cursor->key, &state.key, &exact));
@@ -703,7 +703,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
/*
* We walked to the end of the tree without finding a match. Walk backwards instead.
*/
- while ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND) {
+ while ((ret = __wt_btcur_prev_prefix(cbt, &state.key, false)) != WT_NOTFOUND) {
WT_ERR(ret);
if (btree->type == BTREE_ROW)
WT_ERR(__wt_compare(session, btree->collator, &cursor->key, &state.key, &exact));
@@ -725,6 +725,11 @@ err:
#endif
if (ret != 0) {
+ /*
+ * It is important that this reset is kept as the cursor state is modified in the above prev
+ * and next loops. Those internally do reset the cursor but not when performing a prefix
+ * search near.
+ */
WT_TRET(__cursor_reset(cbt));
__cursor_state_restore(cursor, &state);
}
diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c
index 585dec9806a..25cbb0e8b33 100644
--- a/src/third_party/wiredtiger/src/config/config_def.c
+++ b/src/third_party/wiredtiger/src/config/config_def.c
@@ -164,7 +164,7 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_set_timestamp[] = {
static const WT_CONFIG_CHECK confchk_WT_CURSOR_reconfigure[] = {
{"append", "boolean", NULL, NULL, NULL, 0}, {"overwrite", "boolean", NULL, NULL, NULL, 0},
- {NULL, NULL, NULL, NULL, NULL, 0}};
+ {"prefix_search", "boolean", NULL, NULL, NULL, 0}, {NULL, NULL, NULL, NULL, NULL, 0}};
static const WT_CONFIG_CHECK confchk_assert_subconfigs[] = {
{"commit_timestamp", "string", NULL,
@@ -357,9 +357,9 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_open_cursor[] = {
{"incremental", "category", NULL, NULL, confchk_WT_SESSION_open_cursor_incremental_subconfigs, 7},
{"next_random", "boolean", NULL, NULL, NULL, 0},
{"next_random_sample_size", "string", NULL, NULL, NULL, 0},
- {"overwrite", "boolean", NULL, NULL, NULL, 0}, {"raw", "boolean", NULL, NULL, NULL, 0},
- {"read_once", "boolean", NULL, NULL, NULL, 0}, {"readonly", "boolean", NULL, NULL, NULL, 0},
- {"skip_sort_check", "boolean", NULL, NULL, NULL, 0},
+ {"overwrite", "boolean", NULL, NULL, NULL, 0}, {"prefix_search", "boolean", NULL, NULL, NULL, 0},
+ {"raw", "boolean", NULL, NULL, NULL, 0}, {"read_once", "boolean", NULL, NULL, NULL, 0},
+ {"readonly", "boolean", NULL, NULL, NULL, 0}, {"skip_sort_check", "boolean", NULL, NULL, NULL, 0},
{"statistics", "list", NULL,
"choices=[\"all\",\"cache_walk\",\"fast\",\"clear\","
"\"size\",\"tree_walk\"]",
@@ -1092,7 +1092,8 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator",
"oldest_timestamp=,stable_timestamp=",
confchk_WT_CONNECTION_set_timestamp, 5},
{"WT_CURSOR.close", "", NULL, 0},
- {"WT_CURSOR.reconfigure", "append=false,overwrite=true", confchk_WT_CURSOR_reconfigure, 2},
+ {"WT_CURSOR.reconfigure", "append=false,overwrite=true,prefix_search=false",
+ confchk_WT_CURSOR_reconfigure, 3},
{"WT_SESSION.alter",
"access_pattern_hint=none,app_metadata=,"
"assert=(commit_timestamp=none,durable_timestamp=none,"
@@ -1155,9 +1156,9 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator",
"debug=(release_evict=false),dump=,incremental=(consolidate=false"
",enabled=false,file=,force_stop=false,granularity=16MB,src_id=,"
"this_id=),next_random=false,next_random_sample_size=0,"
- "overwrite=true,raw=false,read_once=false,readonly=false,"
- "skip_sort_check=false,statistics=,target=",
- confchk_WT_SESSION_open_cursor, 16},
+ "overwrite=true,prefix_search=false,raw=false,read_once=false,"
+ "readonly=false,skip_sort_check=false,statistics=,target=",
+ confchk_WT_SESSION_open_cursor, 17},
{"WT_SESSION.prepare_transaction", "prepare_timestamp=", confchk_WT_SESSION_prepare_transaction,
1},
{"WT_SESSION.query_timestamp", "get=read", confchk_WT_SESSION_query_timestamp, 1},
diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c
index 3b42302aefc..19a50939a7a 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_std.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_std.c
@@ -855,7 +855,8 @@ __wt_cursor_cache_get(WT_SESSION_IMPL *session, const char *uri, uint64_t hash_v
* For these configuration values, there is no difference in the resulting cursor other
* than flag values, so fix them up according to the given configuration.
*/
- F_CLR(cursor, WT_CURSTD_APPEND | WT_CURSTD_RAW | WT_CURSTD_OVERWRITE);
+ F_CLR(cursor,
+ WT_CURSTD_APPEND | WT_CURSTD_PREFIX_SEARCH | WT_CURSTD_RAW | WT_CURSTD_OVERWRITE);
F_SET(cursor, overwrite_flag);
/*
* If this is a btree cursor, clear its read_once flag.
@@ -1059,6 +1060,22 @@ __wt_cursor_reconfigure(WT_CURSOR *cursor, const char *config)
} else
WT_ERR_NOTFOUND_OK(ret, false);
+ /* Set the prefix search near flag from the "prefix_search" configuration. */
+ if ((ret = __wt_config_getones(session, config, "prefix_search", &cval)) == 0) {
+ if (cval.val) {
+ /* Prefix search near configuration can only be used for row-store. */
+ if (WT_CURSOR_RECNO(cursor))
+ WT_ERR_MSG(
+ session, EINVAL, "cannot use prefix key search near for column store formats");
+ if (CUR2BT(cursor)->collator != NULL)
+ WT_ERR_MSG(
+ session, EINVAL, "cannot use prefix key search near with a custom collator");
+ F_SET(cursor, WT_CURSTD_PREFIX_SEARCH);
+ } else
+ F_CLR(cursor, WT_CURSTD_PREFIX_SEARCH);
+ } else
+ WT_ERR_NOTFOUND_OK(ret, false);
+
WT_ERR(__cursor_config_debug(cursor, cfg));
err:
diff --git a/src/third_party/wiredtiger/src/include/btree_cmp_inline.h b/src/third_party/wiredtiger/src/include/btree_cmp_inline.h
index 18d8a8e5158..0c7eaf9fdb9 100644
--- a/src/third_party/wiredtiger/src/include/btree_cmp_inline.h
+++ b/src/third_party/wiredtiger/src/include/btree_cmp_inline.h
@@ -23,11 +23,12 @@
* __wt_lex_compare --
* Lexicographic comparison routine. Returns: < 0 if user_item is lexicographically < tree_item
* = 0 if user_item is lexicographically = tree_item > 0 if user_item is lexicographically >
- * tree_item We use the names "user" and "tree" so it's clear in the btree code which the
- * application is looking at when we call its comparison function.
+ * tree_item. We use the names "user" and "tree" so it's clear in the btree code which the
+ * application is looking at when we call its comparison function. If prefix is specified, 0 can
+ * be returned when the user_item is equal to the tree_item for the minimum size.
*/
static inline int
-__wt_lex_compare(const WT_ITEM *user_item, const WT_ITEM *tree_item)
+__wt_lex_compare(const WT_ITEM *user_item, const WT_ITEM *tree_item, bool prefix)
{
size_t len, usz, tsz;
const uint8_t *userp, *treep;
@@ -92,7 +93,7 @@ __wt_lex_compare(const WT_ITEM *user_item, const WT_ITEM *tree_item)
return (*userp < *treep ? -1 : 1);
/* Contents are equal up to the smallest length. */
- return ((usz == tsz) ? 0 : (usz < tsz) ? -1 : 1);
+ return ((usz == tsz || prefix) ? 0 : (usz < tsz) ? -1 : 1);
}
/*
@@ -104,13 +105,23 @@ __wt_compare(WT_SESSION_IMPL *session, WT_COLLATOR *collator, const WT_ITEM *use
const WT_ITEM *tree_item, int *cmpp)
{
if (collator == NULL) {
- *cmpp = __wt_lex_compare(user_item, tree_item);
+ *cmpp = __wt_lex_compare(user_item, tree_item, false);
return (0);
}
return (collator->compare(collator, &session->iface, user_item, tree_item, cmpp));
}
/*
+ * __wt_prefix_match --
+ * Compare prefix to tree item over the shorter length: 0 if equal, otherwise < 0 or > 0.
+ */
+static inline int
+__wt_prefix_match(const WT_ITEM *prefix, const WT_ITEM *tree_item)
+{
+ return (__wt_lex_compare(prefix, tree_item, true));
+}
+
+/*
* __wt_lex_compare_skip --
* Lexicographic comparison routine, skipping leading bytes. Returns: < 0 if user_item is
* lexicographically < tree_item = 0 if user_item is lexicographically = tree_item > 0 if
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index a349bdcf84f..948c9a5befc 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -262,10 +262,14 @@ extern int __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentr
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_btcur_next_prefix(WT_CURSOR_BTREE *cbt, WT_ITEM *prefix, bool truncating)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_btcur_prev_prefix(WT_CURSOR_BTREE *cbt, WT_ITEM *prefix, bool truncating)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_btcur_remove(WT_CURSOR_BTREE *cbt, bool positioned)
@@ -2003,7 +2007,7 @@ static inline int __wt_getline(WT_SESSION_IMPL *session, WT_FSTREAM *fstr, WT_IT
static inline int __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT **new_insp, size_t new_ins_size,
u_int skipdepth, bool exclusive) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-static inline int __wt_lex_compare(const WT_ITEM *user_item, const WT_ITEM *tree_item)
+static inline int __wt_lex_compare(const WT_ITEM *user_item, const WT_ITEM *tree_item, bool prefix)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline int __wt_lex_compare_short(const WT_ITEM *user_item, const WT_ITEM *tree_item)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -2028,6 +2032,8 @@ static inline int __wt_page_swap_func(
const char *func, int line
#endif
) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+static inline int __wt_prefix_match(const WT_ITEM *prefix, const WT_ITEM *tree_item)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline int __wt_read(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len,
void *buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline int __wt_rec_cell_build_val(WT_SESSION_IMPL *session, WT_RECONCILE *r,
diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h
index 706e6fee492..534d4a1cf40 100644
--- a/src/third_party/wiredtiger/src/include/stat.h
+++ b/src/third_party/wiredtiger/src/include/stat.h
@@ -742,6 +742,7 @@ struct __wt_connection_stats {
int64_t cursor_next_skip_total;
int64_t cursor_prev_skip_total;
int64_t cursor_skip_hs_cur_position;
+ int64_t cursor_search_near_prefix_fast_paths;
int64_t cursor_next_hs_tombstone;
int64_t cursor_next_skip_ge_100;
int64_t cursor_next_skip_lt_100;
@@ -960,6 +961,7 @@ struct __wt_dsrc_stats {
int64_t cursor_next_skip_total;
int64_t cursor_prev_skip_total;
int64_t cursor_skip_hs_cur_position;
+ int64_t cursor_search_near_prefix_fast_paths;
int64_t cursor_next_hs_tombstone;
int64_t cursor_next_skip_ge_100;
int64_t cursor_next_skip_lt_100;
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index fc45cf31e8d..a4a1b584b35 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -715,10 +715,11 @@ struct __wt_cursor {
#define WT_CURSTD_META_INUSE 0x0040000u
#define WT_CURSTD_OPEN 0x0080000u
#define WT_CURSTD_OVERWRITE 0x0100000u
-#define WT_CURSTD_RAW 0x0200000u
-#define WT_CURSTD_RAW_SEARCH 0x0400000u
-#define WT_CURSTD_VALUE_EXT 0x0800000u /* Value points out of tree. */
-#define WT_CURSTD_VALUE_INT 0x1000000u /* Value points into tree. */
+#define WT_CURSTD_PREFIX_SEARCH 0x0200000u
+#define WT_CURSTD_RAW 0x0400000u
+#define WT_CURSTD_RAW_SEARCH 0x0800000u
+#define WT_CURSTD_VALUE_EXT 0x1000000u /* Value points out of tree. */
+#define WT_CURSTD_VALUE_INT 0x2000000u /* Value points into tree. */
/* AUTOMATIC FLAG VALUE GENERATION STOP */
#define WT_CURSTD_KEY_SET (WT_CURSTD_KEY_EXT | WT_CURSTD_KEY_INT)
#define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT)
@@ -4782,10 +4783,7 @@ struct __wt_storage_source {
* objects with this prefix will be visible, and the prefix will be removed when
* listed. Prefixes may contain '/' as a separator.
* @param auth_token the authorization identifier.
- * @configstart{WT_STORAGE_SOURCE.customize_file_system, manually maintained}
- * @config{cache_directory, name of directory holding cached objects and other objects
- * not yet flushed\, directory must already exist, a string; default \c ".".}
- * @configend
+ * @param config additional configuration, currently must be NULL.
* @param[out] file_system the customized file system returned
*/
int (*ss_customize_file_system)(WT_STORAGE_SOURCE *storage_source, WT_SESSION *session,
@@ -5920,141 +5918,146 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
*/
#define WT_STAT_CONN_CURSOR_SKIP_HS_CUR_POSITION 1428
/*!
+ * cursor: Total number of times a search near has exited due to prefix
+ * config
+ */
+#define WT_STAT_CONN_CURSOR_SEARCH_NEAR_PREFIX_FAST_PATHS 1429
+/*!
* cursor: cursor next calls that skip due to a globally visible history
* store tombstone
*/
-#define WT_STAT_CONN_CURSOR_NEXT_HS_TOMBSTONE 1429
+#define WT_STAT_CONN_CURSOR_NEXT_HS_TOMBSTONE 1430
/*!
* cursor: cursor next calls that skip greater than or equal to 100
* entries
*/
-#define WT_STAT_CONN_CURSOR_NEXT_SKIP_GE_100 1430
+#define WT_STAT_CONN_CURSOR_NEXT_SKIP_GE_100 1431
/*! cursor: cursor next calls that skip less than 100 entries */
-#define WT_STAT_CONN_CURSOR_NEXT_SKIP_LT_100 1431
+#define WT_STAT_CONN_CURSOR_NEXT_SKIP_LT_100 1432
/*!
* cursor: cursor prev calls that skip due to a globally visible history
* store tombstone
*/
-#define WT_STAT_CONN_CURSOR_PREV_HS_TOMBSTONE 1432
+#define WT_STAT_CONN_CURSOR_PREV_HS_TOMBSTONE 1433
/*!
* cursor: cursor prev calls that skip greater than or equal to 100
* entries
*/
-#define WT_STAT_CONN_CURSOR_PREV_SKIP_GE_100 1433
+#define WT_STAT_CONN_CURSOR_PREV_SKIP_GE_100 1434
/*! cursor: cursor prev calls that skip less than 100 entries */
-#define WT_STAT_CONN_CURSOR_PREV_SKIP_LT_100 1434
+#define WT_STAT_CONN_CURSOR_PREV_SKIP_LT_100 1435
/*! cursor: open cursor count */
-#define WT_STAT_CONN_CURSOR_OPEN_COUNT 1435
+#define WT_STAT_CONN_CURSOR_OPEN_COUNT 1436
/*! reconciliation: approximate byte size of timestamps in pages written */
-#define WT_STAT_CONN_REC_TIME_WINDOW_BYTES_TS 1436
+#define WT_STAT_CONN_REC_TIME_WINDOW_BYTES_TS 1437
/*!
* reconciliation: approximate byte size of transaction IDs in pages
* written
*/
-#define WT_STAT_CONN_REC_TIME_WINDOW_BYTES_TXN 1437
+#define WT_STAT_CONN_REC_TIME_WINDOW_BYTES_TXN 1438
/*! reconciliation: fast-path pages deleted */
-#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1438
+#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1439
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_CONN_REC_PAGES 1439
+#define WT_STAT_CONN_REC_PAGES 1440
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_CONN_REC_PAGES_EVICTION 1440
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1441
/*! reconciliation: pages deleted */
-#define WT_STAT_CONN_REC_PAGE_DELETE 1441
+#define WT_STAT_CONN_REC_PAGE_DELETE 1442
/*!
* reconciliation: pages written including an aggregated newest start
* durable timestamp
*/
-#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_START_DURABLE_TS 1442
+#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_START_DURABLE_TS 1443
/*!
* reconciliation: pages written including an aggregated newest stop
* durable timestamp
*/
-#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_DURABLE_TS 1443
+#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_DURABLE_TS 1444
/*!
* reconciliation: pages written including an aggregated newest stop
* timestamp
*/
-#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_TS 1444
+#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_TS 1445
/*!
* reconciliation: pages written including an aggregated newest stop
* transaction ID
*/
-#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_TXN 1445
+#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_TXN 1446
/*!
* reconciliation: pages written including an aggregated newest
* transaction ID
*/
-#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_TXN 1446
+#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_TXN 1447
/*!
* reconciliation: pages written including an aggregated oldest start
* timestamp
*/
-#define WT_STAT_CONN_REC_TIME_AGGR_OLDEST_START_TS 1447
+#define WT_STAT_CONN_REC_TIME_AGGR_OLDEST_START_TS 1448
/*! reconciliation: pages written including an aggregated prepare */
-#define WT_STAT_CONN_REC_TIME_AGGR_PREPARED 1448
+#define WT_STAT_CONN_REC_TIME_AGGR_PREPARED 1449
/*!
* reconciliation: pages written including at least one start durable
* timestamp
*/
-#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_DURABLE_START_TS 1449
+#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_DURABLE_START_TS 1450
/*!
* reconciliation: pages written including at least one start transaction
* ID
*/
-#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_START_TXN 1450
+#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_START_TXN 1451
/*!
* reconciliation: pages written including at least one stop durable
* timestamp
*/
-#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_DURABLE_STOP_TS 1451
+#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_DURABLE_STOP_TS 1452
/*! reconciliation: pages written including at least one stop timestamp */
-#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_STOP_TS 1452
+#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_STOP_TS 1453
/*!
* reconciliation: pages written including at least one stop transaction
* ID
*/
-#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_STOP_TXN 1453
+#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_STOP_TXN 1454
/*! reconciliation: records written including a start durable timestamp */
-#define WT_STAT_CONN_REC_TIME_WINDOW_DURABLE_START_TS 1454
+#define WT_STAT_CONN_REC_TIME_WINDOW_DURABLE_START_TS 1455
/*! reconciliation: records written including a start timestamp */
-#define WT_STAT_CONN_REC_TIME_WINDOW_START_TS 1455
+#define WT_STAT_CONN_REC_TIME_WINDOW_START_TS 1456
/*! reconciliation: records written including a start transaction ID */
-#define WT_STAT_CONN_REC_TIME_WINDOW_START_TXN 1456
+#define WT_STAT_CONN_REC_TIME_WINDOW_START_TXN 1457
/*! reconciliation: records written including a stop durable timestamp */
-#define WT_STAT_CONN_REC_TIME_WINDOW_DURABLE_STOP_TS 1457
+#define WT_STAT_CONN_REC_TIME_WINDOW_DURABLE_STOP_TS 1458
/*! reconciliation: records written including a stop timestamp */
-#define WT_STAT_CONN_REC_TIME_WINDOW_STOP_TS 1458
+#define WT_STAT_CONN_REC_TIME_WINDOW_STOP_TS 1459
/*! reconciliation: records written including a stop transaction ID */
-#define WT_STAT_CONN_REC_TIME_WINDOW_STOP_TXN 1459
+#define WT_STAT_CONN_REC_TIME_WINDOW_STOP_TXN 1460
/*! session: tiered storage local retention time (secs) */
-#define WT_STAT_CONN_TIERED_RETENTION 1460
+#define WT_STAT_CONN_TIERED_RETENTION 1461
/*! session: tiered storage object size */
-#define WT_STAT_CONN_TIERED_OBJECT_SIZE 1461
+#define WT_STAT_CONN_TIERED_OBJECT_SIZE 1462
/*! transaction: race to read prepared update retry */
-#define WT_STAT_CONN_TXN_READ_RACE_PREPARE_UPDATE 1462
+#define WT_STAT_CONN_TXN_READ_RACE_PREPARE_UPDATE 1463
/*!
* transaction: rollback to stable history store records with stop
* timestamps older than newer records
*/
-#define WT_STAT_CONN_TXN_RTS_HS_STOP_OLDER_THAN_NEWER_START 1463
+#define WT_STAT_CONN_TXN_RTS_HS_STOP_OLDER_THAN_NEWER_START 1464
/*! transaction: rollback to stable inconsistent checkpoint */
-#define WT_STAT_CONN_TXN_RTS_INCONSISTENT_CKPT 1464
+#define WT_STAT_CONN_TXN_RTS_INCONSISTENT_CKPT 1465
/*! transaction: rollback to stable keys removed */
-#define WT_STAT_CONN_TXN_RTS_KEYS_REMOVED 1465
+#define WT_STAT_CONN_TXN_RTS_KEYS_REMOVED 1466
/*! transaction: rollback to stable keys restored */
-#define WT_STAT_CONN_TXN_RTS_KEYS_RESTORED 1466
+#define WT_STAT_CONN_TXN_RTS_KEYS_RESTORED 1467
/*! transaction: rollback to stable restored tombstones from history store */
-#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_TOMBSTONES 1467
+#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_TOMBSTONES 1468
/*! transaction: rollback to stable restored updates from history store */
-#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_UPDATES 1468
+#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_UPDATES 1469
/*! transaction: rollback to stable sweeping history store keys */
-#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1469
+#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1470
/*! transaction: rollback to stable updates removed from history store */
-#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1470
+#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1471
/*! transaction: transaction checkpoints due to obsolete pages */
-#define WT_STAT_CONN_TXN_CHECKPOINT_OBSOLETE_APPLIED 1471
+#define WT_STAT_CONN_TXN_CHECKPOINT_OBSOLETE_APPLIED 1472
/*! transaction: update conflicts */
-#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1472
+#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1473
/*!
* @}
@@ -6539,141 +6542,146 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
*/
#define WT_STAT_DSRC_CURSOR_SKIP_HS_CUR_POSITION 2166
/*!
+ * cursor: Total number of times a search near has exited due to prefix
+ * config
+ */
+#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR_PREFIX_FAST_PATHS 2167
+/*!
* cursor: cursor next calls that skip due to a globally visible history
* store tombstone
*/
-#define WT_STAT_DSRC_CURSOR_NEXT_HS_TOMBSTONE 2167
+#define WT_STAT_DSRC_CURSOR_NEXT_HS_TOMBSTONE 2168
/*!
* cursor: cursor next calls that skip greater than or equal to 100
* entries
*/
-#define WT_STAT_DSRC_CURSOR_NEXT_SKIP_GE_100 2168
+#define WT_STAT_DSRC_CURSOR_NEXT_SKIP_GE_100 2169
/*! cursor: cursor next calls that skip less than 100 entries */
-#define WT_STAT_DSRC_CURSOR_NEXT_SKIP_LT_100 2169
+#define WT_STAT_DSRC_CURSOR_NEXT_SKIP_LT_100 2170
/*!
* cursor: cursor prev calls that skip due to a globally visible history
* store tombstone
*/
-#define WT_STAT_DSRC_CURSOR_PREV_HS_TOMBSTONE 2170
+#define WT_STAT_DSRC_CURSOR_PREV_HS_TOMBSTONE 2171
/*!
* cursor: cursor prev calls that skip greater than or equal to 100
* entries
*/
-#define WT_STAT_DSRC_CURSOR_PREV_SKIP_GE_100 2171
+#define WT_STAT_DSRC_CURSOR_PREV_SKIP_GE_100 2172
/*! cursor: cursor prev calls that skip less than 100 entries */
-#define WT_STAT_DSRC_CURSOR_PREV_SKIP_LT_100 2172
+#define WT_STAT_DSRC_CURSOR_PREV_SKIP_LT_100 2173
/*! cursor: open cursor count */
-#define WT_STAT_DSRC_CURSOR_OPEN_COUNT 2173
+#define WT_STAT_DSRC_CURSOR_OPEN_COUNT 2174
/*! reconciliation: approximate byte size of timestamps in pages written */
-#define WT_STAT_DSRC_REC_TIME_WINDOW_BYTES_TS 2174
+#define WT_STAT_DSRC_REC_TIME_WINDOW_BYTES_TS 2175
/*!
* reconciliation: approximate byte size of transaction IDs in pages
* written
*/
-#define WT_STAT_DSRC_REC_TIME_WINDOW_BYTES_TXN 2175
+#define WT_STAT_DSRC_REC_TIME_WINDOW_BYTES_TXN 2176
/*! reconciliation: fast-path pages deleted */
-#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2176
+#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2177
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_DSRC_REC_PAGES 2177
+#define WT_STAT_DSRC_REC_PAGES 2178
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_DSRC_REC_PAGES_EVICTION 2178
+#define WT_STAT_DSRC_REC_PAGES_EVICTION 2179
/*! reconciliation: pages deleted */
-#define WT_STAT_DSRC_REC_PAGE_DELETE 2179
+#define WT_STAT_DSRC_REC_PAGE_DELETE 2180
/*!
* reconciliation: pages written including an aggregated newest start
* durable timestamp
*/
-#define WT_STAT_DSRC_REC_TIME_AGGR_NEWEST_START_DURABLE_TS 2180
+#define WT_STAT_DSRC_REC_TIME_AGGR_NEWEST_START_DURABLE_TS 2181
/*!
* reconciliation: pages written including an aggregated newest stop
* durable timestamp
*/
-#define WT_STAT_DSRC_REC_TIME_AGGR_NEWEST_STOP_DURABLE_TS 2181
+#define WT_STAT_DSRC_REC_TIME_AGGR_NEWEST_STOP_DURABLE_TS 2182
/*!
* reconciliation: pages written including an aggregated newest stop
* timestamp
*/
-#define WT_STAT_DSRC_REC_TIME_AGGR_NEWEST_STOP_TS 2182
+#define WT_STAT_DSRC_REC_TIME_AGGR_NEWEST_STOP_TS 2183
/*!
* reconciliation: pages written including an aggregated newest stop
* transaction ID
*/
-#define WT_STAT_DSRC_REC_TIME_AGGR_NEWEST_STOP_TXN 2183
+#define WT_STAT_DSRC_REC_TIME_AGGR_NEWEST_STOP_TXN 2184
/*!
* reconciliation: pages written including an aggregated newest
* transaction ID
*/
-#define WT_STAT_DSRC_REC_TIME_AGGR_NEWEST_TXN 2184
+#define WT_STAT_DSRC_REC_TIME_AGGR_NEWEST_TXN 2185
/*!
* reconciliation: pages written including an aggregated oldest start
* timestamp
*/
-#define WT_STAT_DSRC_REC_TIME_AGGR_OLDEST_START_TS 2185
+#define WT_STAT_DSRC_REC_TIME_AGGR_OLDEST_START_TS 2186
/*! reconciliation: pages written including an aggregated prepare */
-#define WT_STAT_DSRC_REC_TIME_AGGR_PREPARED 2186
+#define WT_STAT_DSRC_REC_TIME_AGGR_PREPARED 2187
/*!
* reconciliation: pages written including at least one start durable
* timestamp
*/
-#define WT_STAT_DSRC_REC_TIME_WINDOW_PAGES_DURABLE_START_TS 2187
+#define WT_STAT_DSRC_REC_TIME_WINDOW_PAGES_DURABLE_START_TS 2188
/*!
* reconciliation: pages written including at least one start transaction
* ID
*/
-#define WT_STAT_DSRC_REC_TIME_WINDOW_PAGES_START_TXN 2188
+#define WT_STAT_DSRC_REC_TIME_WINDOW_PAGES_START_TXN 2189
/*!
* reconciliation: pages written including at least one stop durable
* timestamp
*/
-#define WT_STAT_DSRC_REC_TIME_WINDOW_PAGES_DURABLE_STOP_TS 2189
+#define WT_STAT_DSRC_REC_TIME_WINDOW_PAGES_DURABLE_STOP_TS 2190
/*! reconciliation: pages written including at least one stop timestamp */
-#define WT_STAT_DSRC_REC_TIME_WINDOW_PAGES_STOP_TS 2190
+#define WT_STAT_DSRC_REC_TIME_WINDOW_PAGES_STOP_TS 2191
/*!
* reconciliation: pages written including at least one stop transaction
* ID
*/
-#define WT_STAT_DSRC_REC_TIME_WINDOW_PAGES_STOP_TXN 2191
+#define WT_STAT_DSRC_REC_TIME_WINDOW_PAGES_STOP_TXN 2192
/*! reconciliation: records written including a start durable timestamp */
-#define WT_STAT_DSRC_REC_TIME_WINDOW_DURABLE_START_TS 2192
+#define WT_STAT_DSRC_REC_TIME_WINDOW_DURABLE_START_TS 2193
/*! reconciliation: records written including a start timestamp */
-#define WT_STAT_DSRC_REC_TIME_WINDOW_START_TS 2193
+#define WT_STAT_DSRC_REC_TIME_WINDOW_START_TS 2194
/*! reconciliation: records written including a start transaction ID */
-#define WT_STAT_DSRC_REC_TIME_WINDOW_START_TXN 2194
+#define WT_STAT_DSRC_REC_TIME_WINDOW_START_TXN 2195
/*! reconciliation: records written including a stop durable timestamp */
-#define WT_STAT_DSRC_REC_TIME_WINDOW_DURABLE_STOP_TS 2195
+#define WT_STAT_DSRC_REC_TIME_WINDOW_DURABLE_STOP_TS 2196
/*! reconciliation: records written including a stop timestamp */
-#define WT_STAT_DSRC_REC_TIME_WINDOW_STOP_TS 2196
+#define WT_STAT_DSRC_REC_TIME_WINDOW_STOP_TS 2197
/*! reconciliation: records written including a stop transaction ID */
-#define WT_STAT_DSRC_REC_TIME_WINDOW_STOP_TXN 2197
+#define WT_STAT_DSRC_REC_TIME_WINDOW_STOP_TXN 2198
/*! session: tiered storage local retention time (secs) */
-#define WT_STAT_DSRC_TIERED_RETENTION 2198
+#define WT_STAT_DSRC_TIERED_RETENTION 2199
/*! session: tiered storage object size */
-#define WT_STAT_DSRC_TIERED_OBJECT_SIZE 2199
+#define WT_STAT_DSRC_TIERED_OBJECT_SIZE 2200
/*! transaction: race to read prepared update retry */
-#define WT_STAT_DSRC_TXN_READ_RACE_PREPARE_UPDATE 2200
+#define WT_STAT_DSRC_TXN_READ_RACE_PREPARE_UPDATE 2201
/*!
* transaction: rollback to stable history store records with stop
* timestamps older than newer records
*/
-#define WT_STAT_DSRC_TXN_RTS_HS_STOP_OLDER_THAN_NEWER_START 2201
+#define WT_STAT_DSRC_TXN_RTS_HS_STOP_OLDER_THAN_NEWER_START 2202
/*! transaction: rollback to stable inconsistent checkpoint */
-#define WT_STAT_DSRC_TXN_RTS_INCONSISTENT_CKPT 2202
+#define WT_STAT_DSRC_TXN_RTS_INCONSISTENT_CKPT 2203
/*! transaction: rollback to stable keys removed */
-#define WT_STAT_DSRC_TXN_RTS_KEYS_REMOVED 2203
+#define WT_STAT_DSRC_TXN_RTS_KEYS_REMOVED 2204
/*! transaction: rollback to stable keys restored */
-#define WT_STAT_DSRC_TXN_RTS_KEYS_RESTORED 2204
+#define WT_STAT_DSRC_TXN_RTS_KEYS_RESTORED 2205
/*! transaction: rollback to stable restored tombstones from history store */
-#define WT_STAT_DSRC_TXN_RTS_HS_RESTORE_TOMBSTONES 2205
+#define WT_STAT_DSRC_TXN_RTS_HS_RESTORE_TOMBSTONES 2206
/*! transaction: rollback to stable restored updates from history store */
-#define WT_STAT_DSRC_TXN_RTS_HS_RESTORE_UPDATES 2206
+#define WT_STAT_DSRC_TXN_RTS_HS_RESTORE_UPDATES 2207
/*! transaction: rollback to stable sweeping history store keys */
-#define WT_STAT_DSRC_TXN_RTS_SWEEP_HS_KEYS 2207
+#define WT_STAT_DSRC_TXN_RTS_SWEEP_HS_KEYS 2208
/*! transaction: rollback to stable updates removed from history store */
-#define WT_STAT_DSRC_TXN_RTS_HS_REMOVED 2208
+#define WT_STAT_DSRC_TXN_RTS_HS_REMOVED 2209
/*! transaction: transaction checkpoints due to obsolete pages */
-#define WT_STAT_DSRC_TXN_CHECKPOINT_OBSOLETE_APPLIED 2209
+#define WT_STAT_DSRC_TXN_CHECKPOINT_OBSOLETE_APPLIED 2210
/*! transaction: update conflicts */
-#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2210
+#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2211
/*!
* @}
diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c
index 73aadc87b42..d622d44589e 100644
--- a/src/third_party/wiredtiger/src/support/stat.c
+++ b/src/third_party/wiredtiger/src/support/stat.c
@@ -175,6 +175,7 @@ static const char *const __stats_dsrc_desc[] = {
"cursor: Total number of entries skipped by cursor next calls",
"cursor: Total number of entries skipped by cursor prev calls",
"cursor: Total number of entries skipped to position the history store cursor",
+ "cursor: Total number of times a search near has exited due to prefix config",
"cursor: cursor next calls that skip due to a globally visible history store tombstone",
"cursor: cursor next calls that skip greater than or equal to 100 entries",
"cursor: cursor next calls that skip less than 100 entries",
@@ -427,6 +428,7 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats)
stats->cursor_next_skip_total = 0;
stats->cursor_prev_skip_total = 0;
stats->cursor_skip_hs_cur_position = 0;
+ stats->cursor_search_near_prefix_fast_paths = 0;
stats->cursor_next_hs_tombstone = 0;
stats->cursor_next_skip_ge_100 = 0;
stats->cursor_next_skip_lt_100 = 0;
@@ -665,6 +667,7 @@ __wt_stat_dsrc_aggregate_single(WT_DSRC_STATS *from, WT_DSRC_STATS *to)
to->cursor_next_skip_total += from->cursor_next_skip_total;
to->cursor_prev_skip_total += from->cursor_prev_skip_total;
to->cursor_skip_hs_cur_position += from->cursor_skip_hs_cur_position;
+ to->cursor_search_near_prefix_fast_paths += from->cursor_search_near_prefix_fast_paths;
to->cursor_next_hs_tombstone += from->cursor_next_hs_tombstone;
to->cursor_next_skip_ge_100 += from->cursor_next_skip_ge_100;
to->cursor_next_skip_lt_100 += from->cursor_next_skip_lt_100;
@@ -905,6 +908,8 @@ __wt_stat_dsrc_aggregate(WT_DSRC_STATS **from, WT_DSRC_STATS *to)
to->cursor_next_skip_total += WT_STAT_READ(from, cursor_next_skip_total);
to->cursor_prev_skip_total += WT_STAT_READ(from, cursor_prev_skip_total);
to->cursor_skip_hs_cur_position += WT_STAT_READ(from, cursor_skip_hs_cur_position);
+ to->cursor_search_near_prefix_fast_paths +=
+ WT_STAT_READ(from, cursor_search_near_prefix_fast_paths);
to->cursor_next_hs_tombstone += WT_STAT_READ(from, cursor_next_hs_tombstone);
to->cursor_next_skip_ge_100 += WT_STAT_READ(from, cursor_next_skip_ge_100);
to->cursor_next_skip_lt_100 += WT_STAT_READ(from, cursor_next_skip_lt_100);
@@ -1396,6 +1401,7 @@ static const char *const __stats_connection_desc[] = {
"cursor: Total number of entries skipped by cursor next calls",
"cursor: Total number of entries skipped by cursor prev calls",
"cursor: Total number of entries skipped to position the history store cursor",
+ "cursor: Total number of times a search near has exited due to prefix config",
"cursor: cursor next calls that skip due to a globally visible history store tombstone",
"cursor: cursor next calls that skip greater than or equal to 100 entries",
"cursor: cursor next calls that skip less than 100 entries",
@@ -1910,6 +1916,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->cursor_next_skip_total = 0;
stats->cursor_prev_skip_total = 0;
stats->cursor_skip_hs_cur_position = 0;
+ stats->cursor_search_near_prefix_fast_paths = 0;
stats->cursor_next_hs_tombstone = 0;
stats->cursor_next_skip_ge_100 = 0;
stats->cursor_next_skip_lt_100 = 0;
@@ -2430,6 +2437,8 @@ __wt_stat_connection_aggregate(WT_CONNECTION_STATS **from, WT_CONNECTION_STATS *
to->cursor_next_skip_total += WT_STAT_READ(from, cursor_next_skip_total);
to->cursor_prev_skip_total += WT_STAT_READ(from, cursor_prev_skip_total);
to->cursor_skip_hs_cur_position += WT_STAT_READ(from, cursor_skip_hs_cur_position);
+ to->cursor_search_near_prefix_fast_paths +=
+ WT_STAT_READ(from, cursor_search_near_prefix_fast_paths);
to->cursor_next_hs_tombstone += WT_STAT_READ(from, cursor_next_hs_tombstone);
to->cursor_next_skip_ge_100 += WT_STAT_READ(from, cursor_next_skip_ge_100);
to->cursor_next_skip_lt_100 += WT_STAT_READ(from, cursor_next_skip_lt_100);
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index bbefea1d64c..83e6c2b8241 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -1353,7 +1353,7 @@ __txn_mod_compare(const void *a, const void *b)
*/
if (aopt->type == WT_TXN_OP_BASIC_ROW || aopt->type == WT_TXN_OP_INMEM_ROW)
return (aopt->btree->collator == NULL ?
- __wt_lex_compare(&aopt->u.op_row.key, &bopt->u.op_row.key) :
+ __wt_lex_compare(&aopt->u.op_row.key, &bopt->u.op_row.key, false) :
0);
return (aopt->u.op_col.recno < bopt->u.op_col.recno);
}
diff --git a/src/third_party/wiredtiger/test/suite/test_search_near01.py b/src/third_party/wiredtiger/test/suite/test_search_near01.py
new file mode 100644
index 00000000000..2e54671c06c
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_search_near01.py
@@ -0,0 +1,330 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+import time, wiredtiger, wttest, unittest
+from wiredtiger import stat
+
+def timestamp_str(t):
+ return '%x' % t
+
+# test_search_near01.py
+# Test various prefix search near scenarios.
+class test_search_near01(wttest.WiredTigerTestCase):
+ conn_config = 'statistics=(all)'
+ session_config = 'isolation=snapshot'
+
+ def get_stat(self, stat, local_session = None):
+ if (local_session != None):
+ stat_cursor = local_session.open_cursor('statistics:')
+ else:
+ stat_cursor = self.session.open_cursor('statistics:')
+ val = stat_cursor[stat][2]
+ stat_cursor.close()
+ return val
+
+ def unique_insert(self, cursor, prefix, id, keys):
+ key = prefix + ',' + str(id)
+ keys.append(key)
+ cursor.set_key(prefix)
+ cursor.set_value(prefix)
+ self.assertEqual(cursor.insert(), 0)
+ cursor.set_key(prefix)
+ self.assertEqual(cursor.remove(), 0)
+ cursor.set_key(prefix)
+ cursor.search_near()
+ cursor.set_key(key)
+ cursor.set_value(key)
+ self.assertEqual(cursor.insert(), 0)
+
+ def test_base_scenario(self):
+ uri = 'table:test_base_scenario'
+ self.session.create(uri, 'key_format=u,value_format=u')
+ cursor = self.session.open_cursor(uri)
+ session2 = self.conn.open_session()
+ cursor3 = self.session.open_cursor(uri, None, "debug=(release_evict=true)")
+
+ # Basic character array.
+ l = "abcdefghijklmnopqrstuvwxyz"
+
+ # Start our older reader.
+ session2.begin_transaction()
+
+ key_count = 26*26*26
+ # Insert keys aaa -> zzz.
+ self.session.begin_transaction()
+ for i in range (0, 26):
+ for j in range (0, 26):
+ for k in range (0, 26):
+ cursor[l[i] + l[j] + l[k]] = l[i] + l[j] + l[k]
+ self.session.commit_transaction()
+
+ # Evict the whole range.
+ for i in range (0, 26):
+ for j in range(0, 26):
+ cursor3.set_key(l[i] + l[j] + 'a')
+ cursor3.search()
+ cursor3.reset()
+
+ # Search near for the "aa" part of the range.
+ cursor2 = session2.open_cursor(uri)
+ cursor2.set_key('aa')
+ cursor2.search_near()
+
+ skip_count = self.get_stat(stat.conn.cursor_next_skip_lt_100)
+ # This should be equal to roughly key_count * 2 as we're going to traverse the whole
+ # range forward, and then the whole range backwards.
+ self.assertGreater(skip_count, key_count * 2)
+
+ cursor2.reconfigure("prefix_key=true")
+ cursor2.set_key('aa')
+ cursor2.search_near()
+
+ prefix_skip_count = self.get_stat(stat.conn.cursor_next_skip_lt_100)
+ # We should've skipped ~26*2 here as we're only looking at the "aa" range * 2.
+ self.assertGreaterEqual(prefix_skip_count - skip_count, 26*2)
+ skip_count = prefix_skip_count
+
+ # The prefix code will have come into play once, as we walked to "aba". The prev
+ # traversal will go off the end of the file and as such we don't expect it to increment
+ # this statistic again.
+ self.assertEqual(self.get_stat(stat.conn.cursor_search_near_prefix_fast_paths), 1)
+
+ # Search for a key not at the start.
+ cursor2.set_key('bb')
+ cursor2.search_near()
+
+ # Assert it to have only incremented the skipped statistic ~26*2 times.
+ prefix_skip_count = self.get_stat(stat.conn.cursor_next_skip_lt_100)
+ self.assertGreaterEqual(prefix_skip_count - skip_count, 26*2)
+ skip_count = prefix_skip_count
+
+ # Here we should've hit the prefix fast path code twice. Plus the time we already did.
+ self.assertEqual(self.get_stat(stat.conn.cursor_search_near_prefix_fast_paths), 2+1)
+
+ cursor2.close()
+ cursor2 = session2.open_cursor(uri)
+ cursor2.set_key('bb')
+ cursor2.search_near()
+ # Assert that we've incremented the stat key_count times, as we closed the cursor and
+ # reopened it.
+ #
+ # This validates cursor caching logic, as if we don't clear the flag correctly this will
+ # fail.
+ #
+ # It should be closer to key_count * 2 but this is an approximation.
+ prefix_skip_count = self.get_stat(stat.conn.cursor_next_skip_lt_100)
+ self.assertGreaterEqual(prefix_skip_count - skip_count, key_count)
+
+ # This test aims to simulate a unique index insertion.
+ def test_unique_index_case(self):
+ uri = 'table:test_unique_index_case'
+ self.session.create(uri, 'key_format=u,value_format=u')
+ cursor = self.session.open_cursor(uri)
+ session2 = self.conn.open_session()
+ cursor3 = self.session.open_cursor(uri, None, "debug=(release_evict=true)")
+ l = "abcdefghijklmnopqrstuvwxyz"
+
+ # A unique index has the following insertion method:
+ # 1. Insert the prefix
+ # 2. Remove the prefix
+ # 3. Search near for the prefix
+ # 4. Insert the full value
+ # All of these operations are wrapped in the same txn, this test attempts to test scenarios
+ # that could arise from this insertion method.
+
+ # A unique index key has the format (prefix, _id), we'll insert keys that look similar.
+
+ # Start our old reader txn.
+ session2.begin_transaction()
+
+ key_count = 26*26
+ id = 0
+ cc_id = 0
+ keys = []
+
+ # Insert keys aa,1 -> zz,N
+ for i in range (0, 26):
+ for j in range (0, 26):
+ # Skip inserting the 'cc' prefix, its id is reserved in cc_id for later.
+ if (i == 2 and j == 2):
+ cc_id = id
+ id = id + 1
+ continue
+ self.session.begin_transaction()
+ prefix = l[i] + l[j]
+ self.unique_insert(cursor, prefix, id, keys)
+ id = id + 1
+ self.session.commit_transaction()
+
+ # Evict the whole range.
+ for i in keys:
+ cursor3.set_key(i)
+ cursor3.search()
+ cursor3.reset()
+
+ # Using our older reader attempt to find a value.
+ # Search near for the "cc" prefix.
+ cursor2 = session2.open_cursor(uri)
+ cursor2.set_key('cc')
+ cursor2.search_near()
+
+ skip_count = self.get_stat(stat.conn.cursor_next_skip_lt_100)
+ # This should be equal to roughly key_count * 2 as we're going to traverse most of the
+ # range forward, and then the whole range backwards.
+ self.assertGreater(skip_count, key_count * 2)
+
+ cursor2.reconfigure("prefix_key=true")
+ cursor2.set_key('cc')
+ cursor2.search_near()
+ self.assertEqual(self.get_stat(stat.conn.cursor_search_near_prefix_fast_paths), 2)
+
+ # This still isn't visible to our older reader and as such we expect this statistic to
+ # increment twice.
+ self.unique_insert(cursor2, 'cc', cc_id, keys)
+ self.assertEqual(self.get_stat(stat.conn.cursor_search_near_prefix_fast_paths), 4)
+
+ # In order for prefix key fast pathing to work we rely on some guarantees provided by row
+ # search. Test some of the guarantees.
+ def test_row_search(self):
+ uri = 'table:test_row_search'
+ self.session.create(uri, 'key_format=u,value_format=u')
+ cursor = self.session.open_cursor(uri)
+ session2 = self.conn.open_session()
+ l = "abcdefghijklmnopqrstuvwxyz"
+ # Insert keys a -> z, except c
+ self.session.begin_transaction()
+ for i in range (0, 26):
+ if (i == 2):
+ continue
+ cursor[l[i]] = l[i]
+ self.session.commit_transaction()
+ # Start our older reader transaction.
+ session2.begin_transaction()
+ # Insert a few keys in the 'c' range
+ self.session.begin_transaction()
+ cursor['c'] = 'c'
+ cursor['cc'] = 'cc'
+ cursor['ccc'] = 'ccc'
+ self.session.commit_transaction()
+ # Search_near for 'c' and assert we skip 3 entries. Internally the row search is landing on
+ # 'c'.
+ cursor2 = session2.open_cursor(uri)
+ cursor2.set_key('c')
+ cursor2.search_near()
+
+ skip_count = self.get_stat(stat.conn.cursor_next_skip_lt_100)
+ self.assertEqual(skip_count, 3)
+ session2.commit_transaction()
+
+ # Perform an insertion and removal of a key next to another key, then search for the
+ # removed key.
+ self.session.begin_transaction()
+ cursor.set_key('dd')
+ cursor.set_value('dd')
+ cursor.insert()
+ cursor.set_key('dd')
+ cursor.remove()
+ cursor.set_key('ddd')
+ cursor.set_value('ddd')
+ cursor.insert()
+ cursor.set_key('dd')
+ cursor.search_near()
+ self.session.commit_transaction()
+ skip_count = self.get_stat(stat.conn.cursor_next_skip_lt_100)
+ self.assertEqual(skip_count, 4)
+
+ # Test a basic prepared scenario.
+ def test_prepared(self):
+ uri = 'table:test_base_scenario'
+ self.session.create(uri, 'key_format=u,value_format=u')
+ cursor = self.session.open_cursor(uri)
+ session2 = self.conn.open_session()
+ cursor3 = session2.open_cursor(uri, None, "debug=(release_evict=true)")
+ # Insert an update without timestamp
+ l = "abcdefghijklmnopqrstuvwxyz"
+ session2.begin_transaction()
+
+ key_count = 26*26
+
+ # Insert 'cc'
+ self.session.begin_transaction()
+ cursor['cc'] = 'cc'
+ self.session.commit_transaction()
+
+ # Prepare keys aa -> zz, skipping every key with the 'c' prefix.
+ self.session.begin_transaction()
+ for i in range (0, 26):
+ if (i == 2):
+ continue
+ for j in range (0, 26):
+ cursor[l[i] + l[j]] = l[i] + l[j]
+
+ self.session.prepare_transaction('prepare_timestamp=2')
+
+ # Evict the whole range.
+ for i in range (0, 26):
+ for j in range(0, 26):
+ cursor3.set_key(l[i] + l[j])
+ cursor3.search()
+ cursor3.reset()
+
+ # Search near for the "c" part of the range.
+ cursor2 = session2.open_cursor(uri)
+ cursor2.set_key('c')
+ cursor2.search_near()
+
+ skip_count = self.get_stat(stat.conn.cursor_next_skip_lt_100, session2)
+ # This should be equal to roughly key_count * 2 as we're going to traverse the whole
+ # range forward, and then the whole range backwards.
+ self.assertGreater(skip_count, key_count)
+
+ cursor2.reconfigure("prefix_key=true")
+ cursor2.set_key('c')
+ cursor2.search_near()
+
+ prefix_skip_count = self.get_stat(stat.conn.cursor_next_skip_lt_100, session2)
+ self.assertEqual(prefix_skip_count - skip_count, 3)
+ skip_count = prefix_skip_count
+
+ self.assertEqual(self.get_stat(stat.conn.cursor_search_near_prefix_fast_paths, session2), 2)
+
+ session2.rollback_transaction()
+ session2.begin_transaction('ignore_prepare=true')
+ cursor4 = session2.open_cursor(uri)
+ cursor4.reconfigure("prefix_key=true")
+ cursor4.set_key('c')
+ cursor4.search_near()
+ prefix_skip_count = self.get_stat(stat.conn.cursor_next_skip_lt_100, session2)
+ self.assertEqual(prefix_skip_count - skip_count, 2)
+ skip_count = prefix_skip_count
+
+ cursor4.reconfigure("prefix_key=false")
+ cursor4.set_key('c')
+ cursor4.search_near()
+ self.assertEqual(self.get_stat(stat.conn.cursor_next_skip_lt_100, session2) - skip_count, 2)