From c712d5ddbd852b03331284ab9e08bae3addb4aa0 Mon Sep 17 00:00:00 2001 From: Luke Chen Date: Tue, 4 May 2021 16:46:29 +1000 Subject: Import wiredtiger: 652def8ae433a2001145d34baec9ef9f5610a97e from branch mongodb-5.0 ref: 859f83a6cc..652def8ae4 for: 5.0.0 WT-7264 Creating a new configuration for search near that allows it to exit quickly when searching for prefixes --- src/third_party/wiredtiger/dist/api_data.py | 8 + src/third_party/wiredtiger/dist/stat_data.py | 1 + src/third_party/wiredtiger/import.data | 2 +- src/third_party/wiredtiger/src/btree/bt_curnext.c | 39 ++- src/third_party/wiredtiger/src/btree/bt_curprev.c | 39 ++- src/third_party/wiredtiger/src/btree/bt_cursor.c | 9 +- src/third_party/wiredtiger/src/config/config_def.c | 17 +- src/third_party/wiredtiger/src/cursor/cur_std.c | 19 +- .../wiredtiger/src/include/btree_cmp_inline.h | 21 +- src/third_party/wiredtiger/src/include/extern.h | 8 +- src/third_party/wiredtiger/src/include/stat.h | 2 + .../wiredtiger/src/include/wiredtiger.in | 200 +++++++------ src/third_party/wiredtiger/src/support/stat.c | 9 + src/third_party/wiredtiger/src/txn/txn.c | 2 +- .../wiredtiger/test/suite/test_search_near01.py | 330 +++++++++++++++++++++ 15 files changed, 581 insertions(+), 125 deletions(-) create mode 100644 src/third_party/wiredtiger/test/suite/test_search_near01.py diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index 945414cf3e5..953474d404f 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -1245,6 +1245,14 @@ cursor_runtime_config = [ if the record exists, WT_CURSOR::update fails with ::WT_NOTFOUND if the record does not exist''', type='boolean'), + Config('prefix_search', 'false', r''' + when performing a search near for a prefix, if set to true this + configuration will allow the search near to exit early if it has left + the key range defined by the prefix. 
This is relevant when the table + contains a large number of records which potentially aren't visible to + the caller of search near, as such a large number of records could be skipped. + The prefix_search configuration provides a fast exit in this scenario.''', type='boolean', + undoc=True), ] methods = { diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index 16cd9553968..34e5b020a4a 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -813,6 +813,7 @@ conn_dsrc_stats = [ CursorStat('cursor_prev_skip_ge_100', 'cursor prev calls that skip greater than or equal to 100 entries'), CursorStat('cursor_prev_skip_lt_100', 'cursor prev calls that skip less than 100 entries'), CursorStat('cursor_prev_skip_total', 'Total number of entries skipped by cursor prev calls'), + CursorStat('cursor_search_near_prefix_fast_paths', 'Total number of times a search near has exited due to prefix config'), CursorStat('cursor_skip_hs_cur_position', 'Total number of entries skipped to position the history store cursor'), ########################################## # Checkpoint cleanup statistics diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 19ed0ad74b7..aa12ed897f8 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-5.0", - "commit": "859f83a6ccf9bf1361f8aa75c1ba59e5624ea02d" + "commit": "652def8ae433a2001145d34baec9ef9f5610a97e" } diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c index 18fdef73315..a2b7f161d3e 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curnext.c +++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c @@ -305,7 +305,8 @@ restart_read: * Move to the next row-store item. 
*/ static inline int -__cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart, size_t *skippedp) +__cursor_row_next( + WT_CURSOR_BTREE *cbt, bool newpage, bool restart, size_t *skippedp, WT_ITEM *prefix) { WT_CELL_UNPACK_KV kpack; WT_INSERT *ins; @@ -402,6 +403,17 @@ restart_read_insert: restart_read_page: rip = &page->pg_row[cbt->slot]; WT_RET(__cursor_row_slot_key_return(cbt, rip, &kpack, &kpack_used)); + /* + * If the cursor has prefix search configured we can early exit here if the key that we are + * visiting is after our prefix. + */ + if (F_ISSET(&cbt->iface, WT_CURSTD_PREFIX_SEARCH) && prefix != NULL && + __wt_prefix_match(prefix, &cbt->iface.key) < 0) { + /* It is not okay for the user to have a custom collator. */ + WT_ASSERT(session, CUR2BT(cbt)->collator == NULL); + WT_STAT_CONN_DATA_INCR(session, cursor_search_near_prefix_fast_paths); + return (WT_NOTFOUND); + } WT_RET(__wt_txn_read( session, cbt, &cbt->iface.key, WT_RECNO_OOB, WT_ROW_UPDATE(page, rip), NULL)); if (cbt->upd_value->type == WT_UPDATE_INVALID) { @@ -622,11 +634,12 @@ __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt) } /* - * __wt_btcur_next -- - * Move to the next record in the tree. + * __wt_btcur_next_prefix -- + * Move to the next record in the tree. Taking an optional prefix item for a special case of + * search near. */ int -__wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) +__wt_btcur_next_prefix(WT_CURSOR_BTREE *cbt, WT_ITEM *prefix, bool truncating) { WT_CURSOR *cursor; WT_DECL_RET; @@ -692,8 +705,14 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) total_skipped += skipped; break; case WT_PAGE_ROW_LEAF: - ret = __cursor_row_next(cbt, newpage, restart, &skipped); + ret = __cursor_row_next(cbt, newpage, restart, &skipped, prefix); total_skipped += skipped; + /* + * We can directly return WT_NOTFOUND here as the caller expects the cursor to be + * positioned when traversing keys for prefix search near. 
+ */ + if (ret == WT_NOTFOUND && F_ISSET(&cbt->iface, WT_CURSTD_PREFIX_SEARCH)) + return (WT_NOTFOUND); break; default: WT_ERR(__wt_illegal_value(session, page->type)); @@ -774,3 +793,13 @@ err: F_CLR(cbt, WT_CBT_ITERATE_RETRY_PREV); return (ret); } + +/* + * __wt_btcur_next -- + * Move to the next record in the tree. + */ +int +__wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) +{ + return (__wt_btcur_next_prefix(cbt, NULL, truncating)); +} diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c index 7517eac77d8..867a46201a4 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curprev.c +++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c @@ -441,10 +441,12 @@ restart_read: /* * __cursor_row_prev -- - * Move to the previous row-store item. + * Move to the previous row-store item. Taking an optional prefix item for a special case of + * search near. */ static inline int -__cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart, size_t *skippedp) +__cursor_row_prev( + WT_CURSOR_BTREE *cbt, bool newpage, bool restart, size_t *skippedp, WT_ITEM *prefix) { WT_CELL_UNPACK_KV kpack; WT_INSERT *ins; @@ -553,6 +555,17 @@ restart_read_insert: restart_read_page: rip = &page->pg_row[cbt->slot]; WT_RET(__cursor_row_slot_key_return(cbt, rip, &kpack, &kpack_used)); + /* + * If the cursor has prefix search configured we can early exit here if the key we are + * visiting is before our prefix. + */ + if (F_ISSET(&cbt->iface, WT_CURSTD_PREFIX_SEARCH) && prefix != NULL && + __wt_prefix_match(prefix, &cbt->iface.key) > 0) { + /* It is not okay for the user to have a custom collator. 
*/ + WT_ASSERT(session, CUR2BT(cbt)->collator == NULL); + WT_STAT_CONN_DATA_INCR(session, cursor_search_near_prefix_fast_paths); + return (WT_NOTFOUND); + } WT_RET(__wt_txn_read( session, cbt, &cbt->iface.key, WT_RECNO_OOB, WT_ROW_UPDATE(page, rip), NULL)); if (cbt->upd_value->type == WT_UPDATE_INVALID) { @@ -572,11 +585,11 @@ restart_read_page: } /* - * __wt_btcur_prev -- + * __wt_btcur_prev_prefix -- * Move to the previous record in the tree. */ int -__wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) +__wt_btcur_prev_prefix(WT_CURSOR_BTREE *cbt, WT_ITEM *prefix, bool truncating) { WT_CURSOR *cursor; WT_DECL_RET; @@ -653,8 +666,14 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) total_skipped += skipped; break; case WT_PAGE_ROW_LEAF: - ret = __cursor_row_prev(cbt, newpage, restart, &skipped); + ret = __cursor_row_prev(cbt, newpage, restart, &skipped, prefix); total_skipped += skipped; + /* + * We can directly return WT_NOTFOUND here as the caller will reset the cursor for + * us, this way we don't leave the cursor positioned after returning WT_NOTFOUND. + */ + if (ret == WT_NOTFOUND && F_ISSET(&cbt->iface, WT_CURSTD_PREFIX_SEARCH)) + return (WT_NOTFOUND); break; default: WT_ERR(__wt_illegal_value(session, page->type)); @@ -726,3 +745,13 @@ err: F_CLR(cbt, WT_CBT_ITERATE_RETRY_NEXT); return (ret); } + +/* + * __wt_btcur_prev -- + * Move to the previous record in the tree. + */ +int +__wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) +{ + return (__wt_btcur_prev_prefix(cbt, NULL, truncating)); +} diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 3c8786b91cb..481a1632a08 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -690,7 +690,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) * here because at low isolation levels, new records could appear as we are stepping through * the tree. 
*/ - while ((ret = __wt_btcur_next(cbt, false)) != WT_NOTFOUND) { + while ((ret = __wt_btcur_next_prefix(cbt, &state.key, false)) != WT_NOTFOUND) { WT_ERR(ret); if (btree->type == BTREE_ROW) WT_ERR(__wt_compare(session, btree->collator, &cursor->key, &state.key, &exact)); @@ -703,7 +703,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) /* * We walked to the end of the tree without finding a match. Walk backwards instead. */ - while ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND) { + while ((ret = __wt_btcur_prev_prefix(cbt, &state.key, false)) != WT_NOTFOUND) { WT_ERR(ret); if (btree->type == BTREE_ROW) WT_ERR(__wt_compare(session, btree->collator, &cursor->key, &state.key, &exact)); @@ -725,6 +725,11 @@ err: #endif if (ret != 0) { + /* + * It is important that this reset is kept as the cursor state is modified in the above prev + * and next loops. Those internally do reset the cursor but not when performing a prefix + * search near. + */ WT_TRET(__cursor_reset(cbt)); __cursor_state_restore(cursor, &state); } diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index 585dec9806a..25cbb0e8b33 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -164,7 +164,7 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_set_timestamp[] = { static const WT_CONFIG_CHECK confchk_WT_CURSOR_reconfigure[] = { {"append", "boolean", NULL, NULL, NULL, 0}, {"overwrite", "boolean", NULL, NULL, NULL, 0}, - {NULL, NULL, NULL, NULL, NULL, 0}}; + {"prefix_search", "boolean", NULL, NULL, NULL, 0}, {NULL, NULL, NULL, NULL, NULL, 0}}; static const WT_CONFIG_CHECK confchk_assert_subconfigs[] = { {"commit_timestamp", "string", NULL, @@ -357,9 +357,9 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_open_cursor[] = { {"incremental", "category", NULL, NULL, confchk_WT_SESSION_open_cursor_incremental_subconfigs, 7}, {"next_random", "boolean", 
NULL, NULL, NULL, 0}, {"next_random_sample_size", "string", NULL, NULL, NULL, 0}, - {"overwrite", "boolean", NULL, NULL, NULL, 0}, {"raw", "boolean", NULL, NULL, NULL, 0}, - {"read_once", "boolean", NULL, NULL, NULL, 0}, {"readonly", "boolean", NULL, NULL, NULL, 0}, - {"skip_sort_check", "boolean", NULL, NULL, NULL, 0}, + {"overwrite", "boolean", NULL, NULL, NULL, 0}, {"prefix_search", "boolean", NULL, NULL, NULL, 0}, + {"raw", "boolean", NULL, NULL, NULL, 0}, {"read_once", "boolean", NULL, NULL, NULL, 0}, + {"readonly", "boolean", NULL, NULL, NULL, 0}, {"skip_sort_check", "boolean", NULL, NULL, NULL, 0}, {"statistics", "list", NULL, "choices=[\"all\",\"cache_walk\",\"fast\",\"clear\"," "\"size\",\"tree_walk\"]", @@ -1092,7 +1092,8 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator", "oldest_timestamp=,stable_timestamp=", confchk_WT_CONNECTION_set_timestamp, 5}, {"WT_CURSOR.close", "", NULL, 0}, - {"WT_CURSOR.reconfigure", "append=false,overwrite=true", confchk_WT_CURSOR_reconfigure, 2}, + {"WT_CURSOR.reconfigure", "append=false,overwrite=true,prefix_search=false", + confchk_WT_CURSOR_reconfigure, 3}, {"WT_SESSION.alter", "access_pattern_hint=none,app_metadata=," "assert=(commit_timestamp=none,durable_timestamp=none," @@ -1155,9 +1156,9 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator", "debug=(release_evict=false),dump=,incremental=(consolidate=false" ",enabled=false,file=,force_stop=false,granularity=16MB,src_id=," "this_id=),next_random=false,next_random_sample_size=0," - "overwrite=true,raw=false,read_once=false,readonly=false," - "skip_sort_check=false,statistics=,target=", - confchk_WT_SESSION_open_cursor, 16}, + "overwrite=true,prefix_search=false,raw=false,read_once=false," + "readonly=false,skip_sort_check=false,statistics=,target=", + confchk_WT_SESSION_open_cursor, 17}, {"WT_SESSION.prepare_transaction", "prepare_timestamp=", confchk_WT_SESSION_prepare_transaction, 1}, 
{"WT_SESSION.query_timestamp", "get=read", confchk_WT_SESSION_query_timestamp, 1}, diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c index 3b42302aefc..19a50939a7a 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_std.c +++ b/src/third_party/wiredtiger/src/cursor/cur_std.c @@ -855,7 +855,8 @@ __wt_cursor_cache_get(WT_SESSION_IMPL *session, const char *uri, uint64_t hash_v * For these configuration values, there is no difference in the resulting cursor other * than flag values, so fix them up according to the given configuration. */ - F_CLR(cursor, WT_CURSTD_APPEND | WT_CURSTD_RAW | WT_CURSTD_OVERWRITE); + F_CLR(cursor, + WT_CURSTD_APPEND | WT_CURSTD_PREFIX_SEARCH | WT_CURSTD_RAW | WT_CURSTD_OVERWRITE); F_SET(cursor, overwrite_flag); /* * If this is a btree cursor, clear its read_once flag. @@ -1059,6 +1060,22 @@ __wt_cursor_reconfigure(WT_CURSOR *cursor, const char *config) } else WT_ERR_NOTFOUND_OK(ret, false); + /* Set the prefix search near flag. */ + if ((ret = __wt_config_getones(session, config, "prefix_search", &cval)) == 0) { + if (cval.val) { + /* Prefix search near configuration can only be used for row-store. 
*/ + if (WT_CURSOR_RECNO(cursor)) + WT_ERR_MSG( + session, EINVAL, "cannot use prefix key search near for column store formats"); + if (CUR2BT(cursor)->collator != NULL) + WT_ERR_MSG( + session, EINVAL, "cannot use prefix key search near with a custom collator"); + F_SET(cursor, WT_CURSTD_PREFIX_SEARCH); + } else + F_CLR(cursor, WT_CURSTD_PREFIX_SEARCH); + } else + WT_ERR_NOTFOUND_OK(ret, false); + WT_ERR(__cursor_config_debug(cursor, cfg)); err: diff --git a/src/third_party/wiredtiger/src/include/btree_cmp_inline.h b/src/third_party/wiredtiger/src/include/btree_cmp_inline.h index 18d8a8e5158..0c7eaf9fdb9 100644 --- a/src/third_party/wiredtiger/src/include/btree_cmp_inline.h +++ b/src/third_party/wiredtiger/src/include/btree_cmp_inline.h @@ -23,11 +23,12 @@ * __wt_lex_compare -- * Lexicographic comparison routine. Returns: < 0 if user_item is lexicographically < tree_item * = 0 if user_item is lexicographically = tree_item > 0 if user_item is lexicographically > - * tree_item We use the names "user" and "tree" so it's clear in the btree code which the - * application is looking at when we call its comparison function. + * tree_item. We use the names "user" and "tree" so it's clear in the btree code which the + * application is looking at when we call its comparison function. If prefix is specified, 0 can + * be returned when the user_item is equal to the tree_item for the minimum size. */ static inline int -__wt_lex_compare(const WT_ITEM *user_item, const WT_ITEM *tree_item) +__wt_lex_compare(const WT_ITEM *user_item, const WT_ITEM *tree_item, bool prefix) { size_t len, usz, tsz; const uint8_t *userp, *treep; @@ -92,7 +93,7 @@ __wt_lex_compare(const WT_ITEM *user_item, const WT_ITEM *tree_item) return (*userp < *treep ? -1 : 1); /* Contents are equal up to the smallest length. */ - return ((usz == tsz) ? 0 : (usz < tsz) ? -1 : 1); + return ((usz == tsz || prefix) ? 0 : (usz < tsz) ? 
-1 : 1); } /* @@ -104,12 +105,22 @@ __wt_compare(WT_SESSION_IMPL *session, WT_COLLATOR *collator, const WT_ITEM *use const WT_ITEM *tree_item, int *cmpp) { if (collator == NULL) { - *cmpp = __wt_lex_compare(user_item, tree_item); + *cmpp = __wt_lex_compare(user_item, tree_item, false); return (0); } return (collator->compare(collator, &session->iface, user_item, tree_item, cmpp)); } +/* + * __wt_prefix_match -- + * Check if the prefix item is equal to the leading bytes of the tree item. + */ +static inline int +__wt_prefix_match(const WT_ITEM *prefix, const WT_ITEM *tree_item) +{ + return (__wt_lex_compare(prefix, tree_item, true)); +} + /* * __wt_lex_compare_skip -- * Lexicographic comparison routine, skipping leading bytes. Returns: < 0 if user_item is diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index a349bdcf84f..948c9a5befc 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -262,10 +262,14 @@ extern int __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentr WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_btcur_next_prefix(WT_CURSOR_BTREE *cbt, WT_ITEM *prefix, bool truncating) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_btcur_prev_prefix(WT_CURSOR_BTREE *cbt, WT_ITEM *prefix, bool truncating) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btcur_remove(WT_CURSOR_BTREE *cbt, bool 
positioned) @@ -2003,7 +2007,7 @@ static inline int __wt_getline(WT_SESSION_IMPL *session, WT_FSTREAM *fstr, WT_IT static inline int __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT **new_insp, size_t new_ins_size, u_int skipdepth, bool exclusive) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -static inline int __wt_lex_compare(const WT_ITEM *user_item, const WT_ITEM *tree_item) +static inline int __wt_lex_compare(const WT_ITEM *user_item, const WT_ITEM *tree_item, bool prefix) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_lex_compare_short(const WT_ITEM *user_item, const WT_ITEM *tree_item) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -2028,6 +2032,8 @@ static inline int __wt_page_swap_func( const char *func, int line #endif ) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +static inline int __wt_prefix_match(const WT_ITEM *prefix, const WT_ITEM *tree_item) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_read(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_rec_cell_build_val(WT_SESSION_IMPL *session, WT_RECONCILE *r, diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index 706e6fee492..534d4a1cf40 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -742,6 +742,7 @@ struct __wt_connection_stats { int64_t cursor_next_skip_total; int64_t cursor_prev_skip_total; int64_t cursor_skip_hs_cur_position; + int64_t cursor_search_near_prefix_fast_paths; int64_t cursor_next_hs_tombstone; int64_t cursor_next_skip_ge_100; int64_t cursor_next_skip_lt_100; @@ -960,6 +961,7 @@ struct __wt_dsrc_stats { int64_t cursor_next_skip_total; int64_t cursor_prev_skip_total; int64_t cursor_skip_hs_cur_position; + int64_t 
cursor_search_near_prefix_fast_paths; int64_t cursor_next_hs_tombstone; int64_t cursor_next_skip_ge_100; int64_t cursor_next_skip_lt_100; diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index fc45cf31e8d..a4a1b584b35 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -715,10 +715,11 @@ struct __wt_cursor { #define WT_CURSTD_META_INUSE 0x0040000u #define WT_CURSTD_OPEN 0x0080000u #define WT_CURSTD_OVERWRITE 0x0100000u -#define WT_CURSTD_RAW 0x0200000u -#define WT_CURSTD_RAW_SEARCH 0x0400000u -#define WT_CURSTD_VALUE_EXT 0x0800000u /* Value points out of tree. */ -#define WT_CURSTD_VALUE_INT 0x1000000u /* Value points into tree. */ +#define WT_CURSTD_PREFIX_SEARCH 0x0200000u +#define WT_CURSTD_RAW 0x0400000u +#define WT_CURSTD_RAW_SEARCH 0x0800000u +#define WT_CURSTD_VALUE_EXT 0x1000000u /* Value points out of tree. */ +#define WT_CURSTD_VALUE_INT 0x2000000u /* Value points into tree. */ /* AUTOMATIC FLAG VALUE GENERATION STOP */ #define WT_CURSTD_KEY_SET (WT_CURSTD_KEY_EXT | WT_CURSTD_KEY_INT) #define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT) @@ -4782,10 +4783,7 @@ struct __wt_storage_source { * objects with this prefix will be visible, and the prefix will be removed when * listed. Prefixes may contain '/' as a separator. * @param auth_token the authorization identifier. - * @configstart{WT_STORAGE_SOURCE.customize_file_system, manually maintained} - * @config{cache_directory, name of directory holding cached objects and other objects - * not yet flushed\, directory must already exist, a string; default \c ".".} - * @configend + * @param config additional configuration, currently must be NULL. 
* @param[out] file_system the customized file system returned */ int (*ss_customize_file_system)(WT_STORAGE_SOURCE *storage_source, WT_SESSION *session, @@ -5919,142 +5917,147 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); * cursor */ #define WT_STAT_CONN_CURSOR_SKIP_HS_CUR_POSITION 1428 +/*! + * cursor: Total number of times a search near has exited due to prefix + * config + */ +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR_PREFIX_FAST_PATHS 1429 /*! * cursor: cursor next calls that skip due to a globally visible history * store tombstone */ -#define WT_STAT_CONN_CURSOR_NEXT_HS_TOMBSTONE 1429 +#define WT_STAT_CONN_CURSOR_NEXT_HS_TOMBSTONE 1430 /*! * cursor: cursor next calls that skip greater than or equal to 100 * entries */ -#define WT_STAT_CONN_CURSOR_NEXT_SKIP_GE_100 1430 +#define WT_STAT_CONN_CURSOR_NEXT_SKIP_GE_100 1431 /*! cursor: cursor next calls that skip less than 100 entries */ -#define WT_STAT_CONN_CURSOR_NEXT_SKIP_LT_100 1431 +#define WT_STAT_CONN_CURSOR_NEXT_SKIP_LT_100 1432 /*! * cursor: cursor prev calls that skip due to a globally visible history * store tombstone */ -#define WT_STAT_CONN_CURSOR_PREV_HS_TOMBSTONE 1432 +#define WT_STAT_CONN_CURSOR_PREV_HS_TOMBSTONE 1433 /*! * cursor: cursor prev calls that skip greater than or equal to 100 * entries */ -#define WT_STAT_CONN_CURSOR_PREV_SKIP_GE_100 1433 +#define WT_STAT_CONN_CURSOR_PREV_SKIP_GE_100 1434 /*! cursor: cursor prev calls that skip less than 100 entries */ -#define WT_STAT_CONN_CURSOR_PREV_SKIP_LT_100 1434 +#define WT_STAT_CONN_CURSOR_PREV_SKIP_LT_100 1435 /*! cursor: open cursor count */ -#define WT_STAT_CONN_CURSOR_OPEN_COUNT 1435 +#define WT_STAT_CONN_CURSOR_OPEN_COUNT 1436 /*! reconciliation: approximate byte size of timestamps in pages written */ -#define WT_STAT_CONN_REC_TIME_WINDOW_BYTES_TS 1436 +#define WT_STAT_CONN_REC_TIME_WINDOW_BYTES_TS 1437 /*! 
* reconciliation: approximate byte size of transaction IDs in pages * written */ -#define WT_STAT_CONN_REC_TIME_WINDOW_BYTES_TXN 1437 +#define WT_STAT_CONN_REC_TIME_WINDOW_BYTES_TXN 1438 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1438 +#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1439 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1439 +#define WT_STAT_CONN_REC_PAGES 1440 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1440 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1441 /*! reconciliation: pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE 1441 +#define WT_STAT_CONN_REC_PAGE_DELETE 1442 /*! * reconciliation: pages written including an aggregated newest start * durable timestamp */ -#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_START_DURABLE_TS 1442 +#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_START_DURABLE_TS 1443 /*! * reconciliation: pages written including an aggregated newest stop * durable timestamp */ -#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_DURABLE_TS 1443 +#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_DURABLE_TS 1444 /*! * reconciliation: pages written including an aggregated newest stop * timestamp */ -#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_TS 1444 +#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_TS 1445 /*! * reconciliation: pages written including an aggregated newest stop * transaction ID */ -#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_TXN 1445 +#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_TXN 1446 /*! * reconciliation: pages written including an aggregated newest * transaction ID */ -#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_TXN 1446 +#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_TXN 1447 /*! * reconciliation: pages written including an aggregated oldest start * timestamp */ -#define WT_STAT_CONN_REC_TIME_AGGR_OLDEST_START_TS 1447 +#define WT_STAT_CONN_REC_TIME_AGGR_OLDEST_START_TS 1448 /*! 
reconciliation: pages written including an aggregated prepare */ -#define WT_STAT_CONN_REC_TIME_AGGR_PREPARED 1448 +#define WT_STAT_CONN_REC_TIME_AGGR_PREPARED 1449 /*! * reconciliation: pages written including at least one start durable * timestamp */ -#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_DURABLE_START_TS 1449 +#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_DURABLE_START_TS 1450 /*! * reconciliation: pages written including at least one start transaction * ID */ -#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_START_TXN 1450 +#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_START_TXN 1451 /*! * reconciliation: pages written including at least one stop durable * timestamp */ -#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_DURABLE_STOP_TS 1451 +#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_DURABLE_STOP_TS 1452 /*! reconciliation: pages written including at least one stop timestamp */ -#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_STOP_TS 1452 +#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_STOP_TS 1453 /*! * reconciliation: pages written including at least one stop transaction * ID */ -#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_STOP_TXN 1453 +#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_STOP_TXN 1454 /*! reconciliation: records written including a start durable timestamp */ -#define WT_STAT_CONN_REC_TIME_WINDOW_DURABLE_START_TS 1454 +#define WT_STAT_CONN_REC_TIME_WINDOW_DURABLE_START_TS 1455 /*! reconciliation: records written including a start timestamp */ -#define WT_STAT_CONN_REC_TIME_WINDOW_START_TS 1455 +#define WT_STAT_CONN_REC_TIME_WINDOW_START_TS 1456 /*! reconciliation: records written including a start transaction ID */ -#define WT_STAT_CONN_REC_TIME_WINDOW_START_TXN 1456 +#define WT_STAT_CONN_REC_TIME_WINDOW_START_TXN 1457 /*! reconciliation: records written including a stop durable timestamp */ -#define WT_STAT_CONN_REC_TIME_WINDOW_DURABLE_STOP_TS 1457 +#define WT_STAT_CONN_REC_TIME_WINDOW_DURABLE_STOP_TS 1458 /*! 
reconciliation: records written including a stop timestamp */ -#define WT_STAT_CONN_REC_TIME_WINDOW_STOP_TS 1458 +#define WT_STAT_CONN_REC_TIME_WINDOW_STOP_TS 1459 /*! reconciliation: records written including a stop transaction ID */ -#define WT_STAT_CONN_REC_TIME_WINDOW_STOP_TXN 1459 +#define WT_STAT_CONN_REC_TIME_WINDOW_STOP_TXN 1460 /*! session: tiered storage local retention time (secs) */ -#define WT_STAT_CONN_TIERED_RETENTION 1460 +#define WT_STAT_CONN_TIERED_RETENTION 1461 /*! session: tiered storage object size */ -#define WT_STAT_CONN_TIERED_OBJECT_SIZE 1461 +#define WT_STAT_CONN_TIERED_OBJECT_SIZE 1462 /*! transaction: race to read prepared update retry */ -#define WT_STAT_CONN_TXN_READ_RACE_PREPARE_UPDATE 1462 +#define WT_STAT_CONN_TXN_READ_RACE_PREPARE_UPDATE 1463 /*! * transaction: rollback to stable history store records with stop * timestamps older than newer records */ -#define WT_STAT_CONN_TXN_RTS_HS_STOP_OLDER_THAN_NEWER_START 1463 +#define WT_STAT_CONN_TXN_RTS_HS_STOP_OLDER_THAN_NEWER_START 1464 /*! transaction: rollback to stable inconsistent checkpoint */ -#define WT_STAT_CONN_TXN_RTS_INCONSISTENT_CKPT 1464 +#define WT_STAT_CONN_TXN_RTS_INCONSISTENT_CKPT 1465 /*! transaction: rollback to stable keys removed */ -#define WT_STAT_CONN_TXN_RTS_KEYS_REMOVED 1465 +#define WT_STAT_CONN_TXN_RTS_KEYS_REMOVED 1466 /*! transaction: rollback to stable keys restored */ -#define WT_STAT_CONN_TXN_RTS_KEYS_RESTORED 1466 +#define WT_STAT_CONN_TXN_RTS_KEYS_RESTORED 1467 /*! transaction: rollback to stable restored tombstones from history store */ -#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_TOMBSTONES 1467 +#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_TOMBSTONES 1468 /*! transaction: rollback to stable restored updates from history store */ -#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_UPDATES 1468 +#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_UPDATES 1469 /*! 
transaction: rollback to stable sweeping history store keys */ -#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1469 +#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1470 /*! transaction: rollback to stable updates removed from history store */ -#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1470 +#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1471 /*! transaction: transaction checkpoints due to obsolete pages */ -#define WT_STAT_CONN_TXN_CHECKPOINT_OBSOLETE_APPLIED 1471 +#define WT_STAT_CONN_TXN_CHECKPOINT_OBSOLETE_APPLIED 1472 /*! transaction: update conflicts */ -#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1472 +#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1473 /*! * @} @@ -6538,142 +6541,147 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); * cursor */ #define WT_STAT_DSRC_CURSOR_SKIP_HS_CUR_POSITION 2166 +/*! + * cursor: Total number of times a search near has exited due to prefix + * config + */ +#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR_PREFIX_FAST_PATHS 2167 /*! * cursor: cursor next calls that skip due to a globally visible history * store tombstone */ -#define WT_STAT_DSRC_CURSOR_NEXT_HS_TOMBSTONE 2167 +#define WT_STAT_DSRC_CURSOR_NEXT_HS_TOMBSTONE 2168 /*! * cursor: cursor next calls that skip greater than or equal to 100 * entries */ -#define WT_STAT_DSRC_CURSOR_NEXT_SKIP_GE_100 2168 +#define WT_STAT_DSRC_CURSOR_NEXT_SKIP_GE_100 2169 /*! cursor: cursor next calls that skip less than 100 entries */ -#define WT_STAT_DSRC_CURSOR_NEXT_SKIP_LT_100 2169 +#define WT_STAT_DSRC_CURSOR_NEXT_SKIP_LT_100 2170 /*! * cursor: cursor prev calls that skip due to a globally visible history * store tombstone */ -#define WT_STAT_DSRC_CURSOR_PREV_HS_TOMBSTONE 2170 +#define WT_STAT_DSRC_CURSOR_PREV_HS_TOMBSTONE 2171 /*! * cursor: cursor prev calls that skip greater than or equal to 100 * entries */ -#define WT_STAT_DSRC_CURSOR_PREV_SKIP_GE_100 2171 +#define WT_STAT_DSRC_CURSOR_PREV_SKIP_GE_100 2172 /*! 
cursor: cursor prev calls that skip less than 100 entries */ -#define WT_STAT_DSRC_CURSOR_PREV_SKIP_LT_100 2172 +#define WT_STAT_DSRC_CURSOR_PREV_SKIP_LT_100 2173 /*! cursor: open cursor count */ -#define WT_STAT_DSRC_CURSOR_OPEN_COUNT 2173 +#define WT_STAT_DSRC_CURSOR_OPEN_COUNT 2174 /*! reconciliation: approximate byte size of timestamps in pages written */ -#define WT_STAT_DSRC_REC_TIME_WINDOW_BYTES_TS 2174 +#define WT_STAT_DSRC_REC_TIME_WINDOW_BYTES_TS 2175 /*! * reconciliation: approximate byte size of transaction IDs in pages * written */ -#define WT_STAT_DSRC_REC_TIME_WINDOW_BYTES_TXN 2175 +#define WT_STAT_DSRC_REC_TIME_WINDOW_BYTES_TXN 2176 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2176 +#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2177 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_DSRC_REC_PAGES 2177 +#define WT_STAT_DSRC_REC_PAGES 2178 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_DSRC_REC_PAGES_EVICTION 2178 +#define WT_STAT_DSRC_REC_PAGES_EVICTION 2179 /*! reconciliation: pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE 2179 +#define WT_STAT_DSRC_REC_PAGE_DELETE 2180 /*! * reconciliation: pages written including an aggregated newest start * durable timestamp */ -#define WT_STAT_DSRC_REC_TIME_AGGR_NEWEST_START_DURABLE_TS 2180 +#define WT_STAT_DSRC_REC_TIME_AGGR_NEWEST_START_DURABLE_TS 2181 /*! * reconciliation: pages written including an aggregated newest stop * durable timestamp */ -#define WT_STAT_DSRC_REC_TIME_AGGR_NEWEST_STOP_DURABLE_TS 2181 +#define WT_STAT_DSRC_REC_TIME_AGGR_NEWEST_STOP_DURABLE_TS 2182 /*! * reconciliation: pages written including an aggregated newest stop * timestamp */ -#define WT_STAT_DSRC_REC_TIME_AGGR_NEWEST_STOP_TS 2182 +#define WT_STAT_DSRC_REC_TIME_AGGR_NEWEST_STOP_TS 2183 /*! 
* reconciliation: pages written including an aggregated newest stop * transaction ID */ -#define WT_STAT_DSRC_REC_TIME_AGGR_NEWEST_STOP_TXN 2183 +#define WT_STAT_DSRC_REC_TIME_AGGR_NEWEST_STOP_TXN 2184 /*! * reconciliation: pages written including an aggregated newest * transaction ID */ -#define WT_STAT_DSRC_REC_TIME_AGGR_NEWEST_TXN 2184 +#define WT_STAT_DSRC_REC_TIME_AGGR_NEWEST_TXN 2185 /*! * reconciliation: pages written including an aggregated oldest start * timestamp */ -#define WT_STAT_DSRC_REC_TIME_AGGR_OLDEST_START_TS 2185 +#define WT_STAT_DSRC_REC_TIME_AGGR_OLDEST_START_TS 2186 /*! reconciliation: pages written including an aggregated prepare */ -#define WT_STAT_DSRC_REC_TIME_AGGR_PREPARED 2186 +#define WT_STAT_DSRC_REC_TIME_AGGR_PREPARED 2187 /*! * reconciliation: pages written including at least one start durable * timestamp */ -#define WT_STAT_DSRC_REC_TIME_WINDOW_PAGES_DURABLE_START_TS 2187 +#define WT_STAT_DSRC_REC_TIME_WINDOW_PAGES_DURABLE_START_TS 2188 /*! * reconciliation: pages written including at least one start transaction * ID */ -#define WT_STAT_DSRC_REC_TIME_WINDOW_PAGES_START_TXN 2188 +#define WT_STAT_DSRC_REC_TIME_WINDOW_PAGES_START_TXN 2189 /*! * reconciliation: pages written including at least one stop durable * timestamp */ -#define WT_STAT_DSRC_REC_TIME_WINDOW_PAGES_DURABLE_STOP_TS 2189 +#define WT_STAT_DSRC_REC_TIME_WINDOW_PAGES_DURABLE_STOP_TS 2190 /*! reconciliation: pages written including at least one stop timestamp */ -#define WT_STAT_DSRC_REC_TIME_WINDOW_PAGES_STOP_TS 2190 +#define WT_STAT_DSRC_REC_TIME_WINDOW_PAGES_STOP_TS 2191 /*! * reconciliation: pages written including at least one stop transaction * ID */ -#define WT_STAT_DSRC_REC_TIME_WINDOW_PAGES_STOP_TXN 2191 +#define WT_STAT_DSRC_REC_TIME_WINDOW_PAGES_STOP_TXN 2192 /*! reconciliation: records written including a start durable timestamp */ -#define WT_STAT_DSRC_REC_TIME_WINDOW_DURABLE_START_TS 2192 +#define WT_STAT_DSRC_REC_TIME_WINDOW_DURABLE_START_TS 2193 /*! 
reconciliation: records written including a start timestamp */ -#define WT_STAT_DSRC_REC_TIME_WINDOW_START_TS 2193 +#define WT_STAT_DSRC_REC_TIME_WINDOW_START_TS 2194 /*! reconciliation: records written including a start transaction ID */ -#define WT_STAT_DSRC_REC_TIME_WINDOW_START_TXN 2194 +#define WT_STAT_DSRC_REC_TIME_WINDOW_START_TXN 2195 /*! reconciliation: records written including a stop durable timestamp */ -#define WT_STAT_DSRC_REC_TIME_WINDOW_DURABLE_STOP_TS 2195 +#define WT_STAT_DSRC_REC_TIME_WINDOW_DURABLE_STOP_TS 2196 /*! reconciliation: records written including a stop timestamp */ -#define WT_STAT_DSRC_REC_TIME_WINDOW_STOP_TS 2196 +#define WT_STAT_DSRC_REC_TIME_WINDOW_STOP_TS 2197 /*! reconciliation: records written including a stop transaction ID */ -#define WT_STAT_DSRC_REC_TIME_WINDOW_STOP_TXN 2197 +#define WT_STAT_DSRC_REC_TIME_WINDOW_STOP_TXN 2198 /*! session: tiered storage local retention time (secs) */ -#define WT_STAT_DSRC_TIERED_RETENTION 2198 +#define WT_STAT_DSRC_TIERED_RETENTION 2199 /*! session: tiered storage object size */ -#define WT_STAT_DSRC_TIERED_OBJECT_SIZE 2199 +#define WT_STAT_DSRC_TIERED_OBJECT_SIZE 2200 /*! transaction: race to read prepared update retry */ -#define WT_STAT_DSRC_TXN_READ_RACE_PREPARE_UPDATE 2200 +#define WT_STAT_DSRC_TXN_READ_RACE_PREPARE_UPDATE 2201 /*! * transaction: rollback to stable history store records with stop * timestamps older than newer records */ -#define WT_STAT_DSRC_TXN_RTS_HS_STOP_OLDER_THAN_NEWER_START 2201 +#define WT_STAT_DSRC_TXN_RTS_HS_STOP_OLDER_THAN_NEWER_START 2202 /*! transaction: rollback to stable inconsistent checkpoint */ -#define WT_STAT_DSRC_TXN_RTS_INCONSISTENT_CKPT 2202 +#define WT_STAT_DSRC_TXN_RTS_INCONSISTENT_CKPT 2203 /*! transaction: rollback to stable keys removed */ -#define WT_STAT_DSRC_TXN_RTS_KEYS_REMOVED 2203 +#define WT_STAT_DSRC_TXN_RTS_KEYS_REMOVED 2204 /*! 
transaction: rollback to stable keys restored */ -#define WT_STAT_DSRC_TXN_RTS_KEYS_RESTORED 2204 +#define WT_STAT_DSRC_TXN_RTS_KEYS_RESTORED 2205 /*! transaction: rollback to stable restored tombstones from history store */ -#define WT_STAT_DSRC_TXN_RTS_HS_RESTORE_TOMBSTONES 2205 +#define WT_STAT_DSRC_TXN_RTS_HS_RESTORE_TOMBSTONES 2206 /*! transaction: rollback to stable restored updates from history store */ -#define WT_STAT_DSRC_TXN_RTS_HS_RESTORE_UPDATES 2206 +#define WT_STAT_DSRC_TXN_RTS_HS_RESTORE_UPDATES 2207 /*! transaction: rollback to stable sweeping history store keys */ -#define WT_STAT_DSRC_TXN_RTS_SWEEP_HS_KEYS 2207 +#define WT_STAT_DSRC_TXN_RTS_SWEEP_HS_KEYS 2208 /*! transaction: rollback to stable updates removed from history store */ -#define WT_STAT_DSRC_TXN_RTS_HS_REMOVED 2208 +#define WT_STAT_DSRC_TXN_RTS_HS_REMOVED 2209 /*! transaction: transaction checkpoints due to obsolete pages */ -#define WT_STAT_DSRC_TXN_CHECKPOINT_OBSOLETE_APPLIED 2209 +#define WT_STAT_DSRC_TXN_CHECKPOINT_OBSOLETE_APPLIED 2210 /*! transaction: update conflicts */ -#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2210 +#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2211 /*! 
* @} diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 73aadc87b42..d622d44589e 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -175,6 +175,7 @@ static const char *const __stats_dsrc_desc[] = { "cursor: Total number of entries skipped by cursor next calls", "cursor: Total number of entries skipped by cursor prev calls", "cursor: Total number of entries skipped to position the history store cursor", + "cursor: Total number of times a search near has exited due to prefix config", "cursor: cursor next calls that skip due to a globally visible history store tombstone", "cursor: cursor next calls that skip greater than or equal to 100 entries", "cursor: cursor next calls that skip less than 100 entries", @@ -427,6 +428,7 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) stats->cursor_next_skip_total = 0; stats->cursor_prev_skip_total = 0; stats->cursor_skip_hs_cur_position = 0; + stats->cursor_search_near_prefix_fast_paths = 0; stats->cursor_next_hs_tombstone = 0; stats->cursor_next_skip_ge_100 = 0; stats->cursor_next_skip_lt_100 = 0; @@ -665,6 +667,7 @@ __wt_stat_dsrc_aggregate_single(WT_DSRC_STATS *from, WT_DSRC_STATS *to) to->cursor_next_skip_total += from->cursor_next_skip_total; to->cursor_prev_skip_total += from->cursor_prev_skip_total; to->cursor_skip_hs_cur_position += from->cursor_skip_hs_cur_position; + to->cursor_search_near_prefix_fast_paths += from->cursor_search_near_prefix_fast_paths; to->cursor_next_hs_tombstone += from->cursor_next_hs_tombstone; to->cursor_next_skip_ge_100 += from->cursor_next_skip_ge_100; to->cursor_next_skip_lt_100 += from->cursor_next_skip_lt_100; @@ -905,6 +908,8 @@ __wt_stat_dsrc_aggregate(WT_DSRC_STATS **from, WT_DSRC_STATS *to) to->cursor_next_skip_total += WT_STAT_READ(from, cursor_next_skip_total); to->cursor_prev_skip_total += WT_STAT_READ(from, cursor_prev_skip_total); 
to->cursor_skip_hs_cur_position += WT_STAT_READ(from, cursor_skip_hs_cur_position); + to->cursor_search_near_prefix_fast_paths += + WT_STAT_READ(from, cursor_search_near_prefix_fast_paths); to->cursor_next_hs_tombstone += WT_STAT_READ(from, cursor_next_hs_tombstone); to->cursor_next_skip_ge_100 += WT_STAT_READ(from, cursor_next_skip_ge_100); to->cursor_next_skip_lt_100 += WT_STAT_READ(from, cursor_next_skip_lt_100); @@ -1396,6 +1401,7 @@ static const char *const __stats_connection_desc[] = { "cursor: Total number of entries skipped by cursor next calls", "cursor: Total number of entries skipped by cursor prev calls", "cursor: Total number of entries skipped to position the history store cursor", + "cursor: Total number of times a search near has exited due to prefix config", "cursor: cursor next calls that skip due to a globally visible history store tombstone", "cursor: cursor next calls that skip greater than or equal to 100 entries", "cursor: cursor next calls that skip less than 100 entries", @@ -1910,6 +1916,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->cursor_next_skip_total = 0; stats->cursor_prev_skip_total = 0; stats->cursor_skip_hs_cur_position = 0; + stats->cursor_search_near_prefix_fast_paths = 0; stats->cursor_next_hs_tombstone = 0; stats->cursor_next_skip_ge_100 = 0; stats->cursor_next_skip_lt_100 = 0; @@ -2430,6 +2437,8 @@ __wt_stat_connection_aggregate(WT_CONNECTION_STATS **from, WT_CONNECTION_STATS * to->cursor_next_skip_total += WT_STAT_READ(from, cursor_next_skip_total); to->cursor_prev_skip_total += WT_STAT_READ(from, cursor_prev_skip_total); to->cursor_skip_hs_cur_position += WT_STAT_READ(from, cursor_skip_hs_cur_position); + to->cursor_search_near_prefix_fast_paths += + WT_STAT_READ(from, cursor_search_near_prefix_fast_paths); to->cursor_next_hs_tombstone += WT_STAT_READ(from, cursor_next_hs_tombstone); to->cursor_next_skip_ge_100 += WT_STAT_READ(from, cursor_next_skip_ge_100); to->cursor_next_skip_lt_100 += 
WT_STAT_READ(from, cursor_next_skip_lt_100); diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index bbefea1d64c..83e6c2b8241 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -1353,7 +1353,7 @@ __txn_mod_compare(const void *a, const void *b) */ if (aopt->type == WT_TXN_OP_BASIC_ROW || aopt->type == WT_TXN_OP_INMEM_ROW) return (aopt->btree->collator == NULL ? - __wt_lex_compare(&aopt->u.op_row.key, &bopt->u.op_row.key) : + __wt_lex_compare(&aopt->u.op_row.key, &bopt->u.op_row.key, false) : 0); return (aopt->u.op_col.recno < bopt->u.op_col.recno); } diff --git a/src/third_party/wiredtiger/test/suite/test_search_near01.py b/src/third_party/wiredtiger/test/suite/test_search_near01.py new file mode 100644 index 00000000000..2e54671c06c --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_search_near01.py @@ -0,0 +1,330 @@ +#!/usr/bin/env python +# +# Public Domain 2014-present MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +import time, wiredtiger, wttest, unittest +from wiredtiger import stat + +def timestamp_str(t): + return '%x' % t + +# test_search_near01.py +# Test various prefix search near scenarios. +class test_search_near01(wttest.WiredTigerTestCase): + conn_config = 'statistics=(all)' + session_config = 'isolation=snapshot' + + def get_stat(self, stat, local_session = None): + if (local_session != None): + stat_cursor = local_session.open_cursor('statistics:') + else: + stat_cursor = self.session.open_cursor('statistics:') + val = stat_cursor[stat][2] + stat_cursor.close() + return val + + def unique_insert(self, cursor, prefix, id, keys): + key = prefix + ',' + str(id) + keys.append(key) + cursor.set_key(prefix) + cursor.set_value(prefix) + self.assertEqual(cursor.insert(), 0) + cursor.set_key(prefix) + self.assertEqual(cursor.remove(), 0) + cursor.set_key(prefix) + cursor.search_near() + cursor.set_key(key) + cursor.set_value(key) + self.assertEqual(cursor.insert(), 0) + + def test_base_scenario(self): + uri = 'table:test_base_scenario' + self.session.create(uri, 'key_format=u,value_format=u') + cursor = self.session.open_cursor(uri) + session2 = self.conn.open_session() + cursor3 = self.session.open_cursor(uri, None, "debug=(release_evict=true)") + + # Basic character array. + l = "abcdefghijklmnopqrstuvwxyz" + + # Start our older reader. + session2.begin_transaction() + + key_count = 26*26*26 + # Insert keys aaa -> zzz. 
+ self.session.begin_transaction() + for i in range (0, 26): + for j in range (0, 26): + for k in range (0, 26): + cursor[l[i] + l[j] + l[k]] = l[i] + l[j] + l[k] + self.session.commit_transaction() + + # Evict the whole range. + for i in range (0, 26): + for j in range(0, 26): + cursor3.set_key(l[i] + l[j] + 'a') + cursor3.search() + cursor3.reset() + + # Search near for the "aa" part of the range. + cursor2 = session2.open_cursor(uri) + cursor2.set_key('aa') + cursor2.search_near() + + skip_count = self.get_stat(stat.conn.cursor_next_skip_lt_100) + # This should be equal to roughly key_count * 2 as we're going to traverse the whole + # range forward, and then the whole range backwards. + self.assertGreater(skip_count, key_count * 2) + + cursor2.reconfigure("prefix_key=true") + cursor2.set_key('aa') + cursor2.search_near() + + prefix_skip_count = self.get_stat(stat.conn.cursor_next_skip_lt_100) + # We should've skipped ~26*2 here as we're only looking at the "aa" range * 2. + self.assertGreaterEqual(prefix_skip_count - skip_count, 26*2) + skip_count = prefix_skip_count + + # The prefix code will have come into play once, as we walked to "aba". The prev + # traversal will go off the end of the file and as such we don't expect it to increment + # this statistic again. + self.assertEqual(self.get_stat(stat.conn.cursor_search_near_prefix_fast_paths), 1) + + # Search for a key not at the start. + cursor2.set_key('bb') + cursor2.search_near() + + # Assert it to have only incremented the skipped statistic ~26*2 times. + prefix_skip_count = self.get_stat(stat.conn.cursor_next_skip_lt_100) + self.assertGreaterEqual(prefix_skip_count - skip_count, 26*2) + skip_count = prefix_skip_count + + # Here we should've hit the prefix fast path code twice. Plus the time we already did.
+ self.assertEqual(self.get_stat(stat.conn.cursor_search_near_prefix_fast_paths), 2+1) + + cursor2.close() + cursor2 = session2.open_cursor(uri) + cursor2.set_key('bb') + cursor2.search_near() + # Assert that we've incremented the stat key_count times, as we closed the cursor and + # reopened it. + # + # This validates cursor caching logic, as if we don't clear the flag correctly this will + # fail. + # + # It should be closer to key_count * 2 but this is an approximation. + prefix_skip_count = self.get_stat(stat.conn.cursor_next_skip_lt_100) + self.assertGreaterEqual(prefix_skip_count - skip_count, key_count) + + # This test aims to simulate a unique index insertion. + def test_unique_index_case(self): + uri = 'table:test_unique_index_case' + self.session.create(uri, 'key_format=u,value_format=u') + cursor = self.session.open_cursor(uri) + session2 = self.conn.open_session() + cursor3 = self.session.open_cursor(uri, None, "debug=(release_evict=true)") + l = "abcdefghijklmnopqrstuvwxyz" + + # A unique index has the following insertion method: + # 1. Insert the prefix + # 2. Remove the prefix + # 3. Search near for the prefix + # 4. Insert the full value + # All of these operations are wrapped in the same txn, this test attempts to test scenarios + # that could arise from this insertion method. + + # A unique index key has the format (prefix, _id), we'll insert keys that look similar. + + # Start our old reader txn. + session2.begin_transaction() + + key_count = 26*26 + id = 0 + cc_id = 0 + keys = [] + + # Insert keys aa,1 -> zz,N + for i in range (0, 26): + for j in range (0, 26): + # Skip inserting 'cc'. + if (i == 2 and j == 2): + cc_id = id + id = id + 1 + continue + self.session.begin_transaction() + prefix = l[i] + l[j] + self.unique_insert(cursor, prefix, id, keys) + id = id + 1 + self.session.commit_transaction() + + # Evict the whole range.
+ for i in keys: + cursor3.set_key(i) + cursor3.search() + cursor3.reset() + + # Using our older reader attempt to find a value. + # Search near for the "cc" prefix. + cursor2 = session2.open_cursor(uri) + cursor2.set_key('cc') + cursor2.search_near() + + skip_count = self.get_stat(stat.conn.cursor_next_skip_lt_100) + # This should be equal to roughly key_count * 2 as we're going to traverse most of the + # range forward, and then the whole range backwards. + self.assertGreater(skip_count, key_count * 2) + + cursor2.reconfigure("prefix_key=true") + cursor2.set_key('cc') + cursor2.search_near() + self.assertEqual(self.get_stat(stat.conn.cursor_search_near_prefix_fast_paths), 2) + + # This still isn't visible to our older reader and as such we expect this statistic to + # increment twice. + self.unique_insert(cursor2, 'cc', cc_id, keys) + self.assertEqual(self.get_stat(stat.conn.cursor_search_near_prefix_fast_paths), 4) + + # In order for prefix key fast pathing to work we rely on some guarantees provided by row + # search. Test some of the guarantees. + def test_row_search(self): + uri = 'table:test_row_search' + self.session.create(uri, 'key_format=u,value_format=u') + cursor = self.session.open_cursor(uri) + session2 = self.conn.open_session() + l = "abcdefghijklmnopqrstuvwxyz" + # Insert keys a -> z, except c + self.session.begin_transaction() + for i in range (0, 26): + if (i == 2): + continue + cursor[l[i]] = l[i] + self.session.commit_transaction() + # Start our older reader transaction. + session2.begin_transaction() + # Insert a few keys in the 'c' range + self.session.begin_transaction() + cursor['c'] = 'c' + cursor['cc'] = 'cc' + cursor['ccc'] = 'ccc' + self.session.commit_transaction() + # Search_near for 'c' and assert we skip 3 entries. Internally the row search is landing on + # 'c'. 
+ cursor2 = session2.open_cursor(uri) + cursor2.set_key('c') + cursor2.search_near() + + skip_count = self.get_stat(stat.conn.cursor_next_skip_lt_100) + self.assertEqual(skip_count, 3) + session2.commit_transaction() + + # Perform an insertion and removal of a key next to another key, then search for the + # removed key. + self.session.begin_transaction() + cursor.set_key('dd') + cursor.set_value('dd') + cursor.insert() + cursor.set_key('dd') + cursor.remove() + cursor.set_key('ddd') + cursor.set_value('ddd') + cursor.insert() + cursor.set_key('dd') + cursor.search_near() + self.session.commit_transaction() + skip_count = self.get_stat(stat.conn.cursor_next_skip_lt_100) + self.assertEqual(skip_count, 4) + + # Test a basic prepared scenario. + def test_prepared(self): + uri = 'table:test_base_scenario' + self.session.create(uri, 'key_format=u,value_format=u') + cursor = self.session.open_cursor(uri) + session2 = self.conn.open_session() + cursor3 = session2.open_cursor(uri, None, "debug=(release_evict=true)") + # Insert an update without timestamp + l = "abcdefghijklmnopqrstuvwxyz" + session2.begin_transaction() + + key_count = 26*26 + + # Insert 'cc' + self.session.begin_transaction() + cursor['cc'] = 'cc' + self.session.commit_transaction() + + # Prepare keys aa -> zz, skipping the 'c' range. + self.session.begin_transaction() + for i in range (0, 26): + if (i == 2): + continue + for j in range (0, 26): + cursor[l[i] + l[j]] = l[i] + l[j] + + self.session.prepare_transaction('prepare_timestamp=2') + + # Evict the whole range. + for i in range (0, 26): + for j in range(0, 26): + cursor3.set_key(l[i] + l[j]) + cursor3.search() + cursor3.reset() + + # Search near for the "c" part of the range. + cursor2 = session2.open_cursor(uri) + cursor2.set_key('c') + cursor2.search_near() + + skip_count = self.get_stat(stat.conn.cursor_next_skip_lt_100, session2) + # This should be equal to roughly key_count * 2 as we're going to traverse the whole + # range forward, and then the whole range backwards.
+ self.assertGreater(skip_count, key_count) + + cursor2.reconfigure("prefix_key=true") + cursor2.set_key('c') + cursor2.search_near() + + prefix_skip_count = self.get_stat(stat.conn.cursor_next_skip_lt_100, session2) + self.assertEqual(prefix_skip_count - skip_count, 3) + skip_count = prefix_skip_count + + self.assertEqual(self.get_stat(stat.conn.cursor_search_near_prefix_fast_paths, session2), 2) + + session2.rollback_transaction() + session2.begin_transaction('ignore_prepare=true') + cursor4 = session2.open_cursor(uri) + cursor4.reconfigure("prefix_key=true") + cursor4.set_key('c') + cursor4.search_near() + prefix_skip_count = self.get_stat(stat.conn.cursor_next_skip_lt_100, session2) + self.assertEqual(prefix_skip_count - skip_count, 2) + skip_count = prefix_skip_count + + cursor4.reconfigure("prefix_key=false") + cursor4.set_key('c') + cursor4.search_near() + self.assertEqual(self.get_stat(stat.conn.cursor_next_skip_lt_100, session2) - skip_count, 2) -- cgit v1.2.1