diff options
author | Luke Chen <luke.chen@mongodb.com> | 2021-03-04 14:45:58 +1100 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-03-04 04:15:21 +0000 |
commit | ff9995ed5cf2d72b67bb5520c9f71a9acfa27457 (patch) | |
tree | a54879ecd835fdbfdb0da3bcc9191bd607ad7d48 /src | |
parent | 2f11ef616efad0986a76325c624cdcc7ef65bc43 (diff) | |
download | mongo-ff9995ed5cf2d72b67bb5520c9f71a9acfa27457.tar.gz |
Import wiredtiger: a5fd80d29c69f12c01f412fb6d8d7930cecc8758 from branch mongodb-5.0
ref: 563ccc601f..a5fd80d29c
for: 4.9.0
WT-7164 Merge "HS cursor restructure" feature branch into develop
Diffstat (limited to 'src')
33 files changed, 1292 insertions, 1629 deletions
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index ebf39d669e3..2fb3a6e3f4d 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -565,6 +565,7 @@ calloc cas catfmt cb +cbt ccc ccr cd diff --git a/src/third_party/wiredtiger/dist/s_void b/src/third_party/wiredtiger/dist/s_void index 0e9890acf78..70a938da4b8 100755 --- a/src/third_party/wiredtiger/dist/s_void +++ b/src/third_party/wiredtiger/dist/s_void @@ -135,7 +135,9 @@ func_ok() -e '/int zlib_terminate$/d' \ -e '/int zstd_error$/d' \ -e '/int zstd_pre_size$/d' \ - -e '/int zstd_terminate$/d' + -e '/int zstd_terminate$/d' \ + -e '/int __wt_curhs_search_near_after$/d' \ + -e '/int __wt_curhs_search_near_before$/d' } for f in `find bench ext src test -name '*.c' -o -name '*_inline.h'`; do diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index e6eab22645d..7a25fa521a4 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -303,9 +303,7 @@ connection_stats = [ CursorStat('cursor_modify_bytes', 'cursor modify key and value bytes affected', 'size'), CursorStat('cursor_modify_bytes_touch', 'cursor modify value bytes modified', 'size'), CursorStat('cursor_next', 'cursor next calls'), - CursorStat('cursor_next_hs_tombstone_rts', 'cursor next calls that skip due to a globally visible history store tombstone in rollback to stable'), CursorStat('cursor_prev', 'cursor prev calls'), - CursorStat('cursor_prev_hs_tombstone_rts', 'cursor prev calls that skip due to a globally visible history store tombstone in rollback to stable'), CursorStat('cursor_remove', 'cursor remove calls'), CursorStat('cursor_remove_bytes', 'cursor remove key bytes removed', 'size'), CursorStat('cursor_reopen', 'cursors reused from cache'), @@ -874,7 +872,7 @@ conn_dsrc_stats = [ TxnStat('txn_rts_hs_removed', 'rollback to stable 
updates removed from history store'), TxnStat('txn_rts_hs_restore_updates', 'rollback to stable restored updates from history store'), TxnStat('txn_rts_hs_restore_tombstones', 'rollback to stable restored tombstones from history store'), - TxnStat('txn_rts_hs_stop_older_than_newer_start', 'rollback to stable hs records with stop timestamps older than newer records'), + TxnStat('txn_rts_hs_stop_older_than_newer_start', 'rollback to stable history store records with stop timestamps older than newer records'), TxnStat('txn_rts_inconsistent_ckpt', 'rollback to stable inconsistent checkpoint'), TxnStat('txn_rts_keys_removed', 'rollback to stable keys removed'), TxnStat('txn_rts_keys_restored', 'rollback to stable keys restored'), diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index b4883db4d35..ecd052a77d2 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-5.0", - "commit": "563ccc601f5689a16a3f41743398329b8a3aedf7" + "commit": "a5fd80d29c69f12c01f412fb6d8d7930cecc8758" } diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index 00e29bbbee5..c9acfff3628 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -40,18 +40,18 @@ struct __wt_dbg { static const /* Output separator */ char *const sep = "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n"; -static int __debug_col_skip(WT_DBG *, WT_INSERT_HEAD *, const char *, bool); +static int __debug_col_skip(WT_DBG *, WT_INSERT_HEAD *, const char *, bool, WT_CURSOR *); static int __debug_config(WT_SESSION_IMPL *, WT_DBG *, const char *); static int __debug_modify(WT_DBG *, const uint8_t *); static int __debug_page(WT_DBG *, WT_REF *, uint32_t); static int __debug_page_col_fix(WT_DBG *, WT_REF *); static int 
__debug_page_col_int(WT_DBG *, WT_PAGE *, uint32_t); -static int __debug_page_col_var(WT_DBG *, WT_REF *); +static int __debug_page_col_var(WT_DBG *, WT_REF *, WT_CURSOR *); static int __debug_page_metadata(WT_DBG *, WT_REF *); static int __debug_page_row_int(WT_DBG *, WT_PAGE *, uint32_t); -static int __debug_page_row_leaf(WT_DBG *, WT_PAGE *); +static int __debug_page_row_leaf(WT_DBG *, WT_PAGE *, WT_CURSOR *); static int __debug_ref(WT_DBG *, WT_REF *); -static int __debug_row_skip(WT_DBG *, WT_INSERT_HEAD *); +static int __debug_row_skip(WT_DBG *, WT_INSERT_HEAD *, WT_CURSOR *); static int __debug_tree(WT_SESSION_IMPL *, WT_REF *, const char *, uint32_t); static int __debug_update(WT_DBG *, WT_UPDATE *, bool); static int __debug_wrapup(WT_DBG *); @@ -285,9 +285,6 @@ __debug_wrapup(WT_DBG *ds) session = ds->session; msg = ds->msg; - if (session->hs_cursor != NULL) - WT_TRET(__wt_hs_cursor_close(session)); - __wt_scr_free(session, &ds->key); __wt_scr_free(session, &ds->hs_key); __wt_scr_free(session, &ds->hs_value); @@ -421,7 +418,7 @@ __debug_hs_cursor(WT_DBG *ds, WT_CURSOR *hs_cursor) uint32_t hs_btree_id; char time_string[WT_TIME_STRING_SIZE]; - cbt = (WT_CURSOR_BTREE *)hs_cursor; + cbt = __wt_curhs_get_cbt(hs_cursor); session = ds->session; WT_TIME_WINDOW_INIT(&tw); @@ -463,16 +460,12 @@ __debug_hs_cursor(WT_DBG *ds, WT_CURSOR *hs_cursor) * Dump any HS records associated with the key. */ static int -__debug_hs_key(WT_DBG *ds) +__debug_hs_key(WT_DBG *ds, WT_CURSOR *hs_cursor) { WT_BTREE *btree; - WT_CURSOR *hs_cursor; WT_DECL_RET; WT_SESSION_IMPL *session; - wt_timestamp_t older_start_ts; - uint64_t hs_counter; uint32_t hs_btree_id; - int cmp, exact; session = ds->session; btree = S2BT(session); @@ -482,26 +475,12 @@ __debug_hs_key(WT_DBG *ds) * Open a history store cursor positioned at the end of the data store key (the newest record) * and iterate backwards until we reach a different key or btree. 
*/ - hs_cursor = session->hs_cursor; - hs_cursor->set_key(hs_cursor, hs_btree_id, ds->key, WT_TS_MAX, WT_TXN_MAX); - ret = hs_cursor->search_near(hs_cursor, &exact); - - /* If we jumped to the next key, go back to the previous key. */ - if (ret == 0 && exact > 0) - ret = hs_cursor->prev(hs_cursor); - - for (; ret == 0; ret = hs_cursor->prev(hs_cursor)) { - WT_RET(hs_cursor->get_key(hs_cursor, &hs_btree_id, ds->t1, &older_start_ts, &hs_counter)); - - if (hs_btree_id != btree->id) - break; - - WT_RET(__wt_compare(session, NULL, ds->key, ds->t1, &cmp)); - if (cmp != 0) - break; + hs_cursor->set_key(hs_cursor, 4, hs_btree_id, ds->key, WT_TS_MAX, WT_TXN_MAX); + ret = __wt_curhs_search_near_before(session, hs_cursor); + for (; ret == 0; ret = hs_cursor->prev(hs_cursor)) WT_RET(__debug_hs_cursor(ds, hs_cursor)); - } + return (ret == WT_NOTFOUND ? 0 : ret); } @@ -970,19 +949,19 @@ __wt_debug_cursor_page(void *cursor_arg, const char *ofile) * Dump the history store tree given a user cursor. */ int -__wt_debug_cursor_tree_hs(void *cursor_arg, const char *ofile) +__wt_debug_cursor_tree_hs(void *session_arg, const char *ofile) WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { - WT_CURSOR_BTREE *cbt; + WT_BTREE *hs_btree; + WT_CURSOR *hs_cursor; WT_DECL_RET; WT_SESSION_IMPL *session; - session = CUR2S(cursor_arg); - - WT_RET(__wt_hs_cursor_open(session)); - cbt = (WT_CURSOR_BTREE *)session->hs_cursor; - WT_WITH_BTREE(session, CUR2BT(cbt), ret = __wt_debug_tree_all(session, NULL, NULL, ofile)); - WT_TRET(__wt_hs_cursor_close(session)); + session = (WT_SESSION_IMPL *)session_arg; + WT_RET(__wt_curhs_open(session, NULL, &hs_cursor)); + hs_btree = __wt_curhs_get_btree(hs_cursor); + WT_WITH_BTREE(session, hs_btree, ret = __wt_debug_tree_all(session, NULL, NULL, ofile)); + WT_TRET(hs_cursor->close(hs_cursor)); return (ret); } @@ -1017,9 +996,11 @@ __debug_tree(WT_SESSION_IMPL *session, WT_REF *ref, const char *ofile, uint32_t static int __debug_page(WT_DBG *ds, WT_REF *ref, uint32_t 
flags) { + WT_CURSOR *hs_cursor; WT_DECL_RET; WT_SESSION_IMPL *session; + hs_cursor = NULL; session = ds->session; WT_RET(__wt_scr_alloc(session, 100, &ds->key)); @@ -1028,43 +1009,47 @@ __debug_page(WT_DBG *ds, WT_REF *ref, uint32_t flags) * doesn't work, we may be running in-memory. */ if (!WT_IS_HS(session->dhandle)) { - if (session->hs_cursor != NULL || __wt_hs_cursor_open(session) == 0) { - WT_RET(__wt_scr_alloc(session, 0, &ds->hs_key)); - WT_RET(__wt_scr_alloc(session, 0, &ds->hs_value)); - } + WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor)); + F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); + WT_ERR(__wt_scr_alloc(session, 0, &ds->hs_key)); + WT_ERR(__wt_scr_alloc(session, 0, &ds->hs_value)); + F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); } /* Dump the page metadata. */ WT_WITH_PAGE_INDEX(session, ret = __debug_page_metadata(ds, ref)); - WT_RET(ret); + WT_ERR(ret); /* Dump the page. */ switch (ref->page->type) { case WT_PAGE_COL_FIX: if (LF_ISSET(WT_DEBUG_TREE_LEAF)) - WT_RET(__debug_page_col_fix(ds, ref)); + WT_ERR(__debug_page_col_fix(ds, ref)); break; case WT_PAGE_COL_INT: WT_WITH_PAGE_INDEX(session, ret = __debug_page_col_int(ds, ref->page, flags)); - WT_RET(ret); + WT_ERR(ret); break; case WT_PAGE_COL_VAR: if (LF_ISSET(WT_DEBUG_TREE_LEAF)) - WT_RET(__debug_page_col_var(ds, ref)); + WT_ERR(__debug_page_col_var(ds, ref, hs_cursor)); break; case WT_PAGE_ROW_INT: WT_WITH_PAGE_INDEX(session, ret = __debug_page_row_int(ds, ref->page, flags)); - WT_RET(ret); + WT_ERR(ret); break; case WT_PAGE_ROW_LEAF: if (LF_ISSET(WT_DEBUG_TREE_LEAF)) - WT_RET(__debug_page_row_leaf(ds, ref->page)); + WT_ERR(__debug_page_row_leaf(ds, ref->page, hs_cursor)); break; default: - return (__wt_illegal_value(session, ref->page->type)); + WT_ERR(__wt_illegal_value(session, ref->page->type)); } - return (0); +err: + if (hs_cursor != NULL) + WT_TRET(hs_cursor->close(hs_cursor)); + return (ret); } /* @@ -1209,11 +1194,11 @@ __debug_page_col_fix(WT_DBG *ds, WT_REF *ref) if 
(WT_COL_UPDATE_SINGLE(page) != NULL) { WT_RET(ds->f(ds, "%s", sep)); - WT_RET(__debug_col_skip(ds, WT_COL_UPDATE_SINGLE(page), "update", true)); + WT_RET(__debug_col_skip(ds, WT_COL_UPDATE_SINGLE(page), "update", true, NULL)); } if (WT_COL_APPEND(page) != NULL) { WT_RET(ds->f(ds, "%s", sep)); - WT_RET(__debug_col_skip(ds, WT_COL_APPEND(page), "append", true)); + WT_RET(__debug_col_skip(ds, WT_COL_APPEND(page), "append", true, NULL)); } return (0); } @@ -1254,7 +1239,7 @@ __debug_page_col_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags) * Dump an in-memory WT_PAGE_COL_VAR page. */ static int -__debug_page_col_var(WT_DBG *ds, WT_REF *ref) +__debug_page_col_var(WT_DBG *ds, WT_REF *ref, WT_CURSOR *hs_cursor) { WT_CELL *cell; WT_CELL_UNPACK_KV *unpack, _unpack; @@ -1283,17 +1268,17 @@ __debug_page_col_var(WT_DBG *ds, WT_REF *ref) p = ds->key->mem; WT_RET(__wt_vpack_uint(&p, 0, recno)); ds->key->size = WT_PTRDIFF(p, ds->key->mem); - WT_RET(__debug_hs_key(ds)); + WT_RET(__debug_hs_key(ds, hs_cursor)); } if ((update = WT_COL_UPDATE(page, cip)) != NULL) - WT_RET(__debug_col_skip(ds, update, "update", false)); + WT_RET(__debug_col_skip(ds, update, "update", false, hs_cursor)); recno += rle; } if (WT_COL_APPEND(page) != NULL) { WT_RET(ds->f(ds, "%s", sep)); - WT_RET(__debug_col_skip(ds, WT_COL_APPEND(page), "append", false)); + WT_RET(__debug_col_skip(ds, WT_COL_APPEND(page), "append", false, hs_cursor)); } return (0); @@ -1337,7 +1322,7 @@ __debug_page_row_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags) * Dump an in-memory WT_PAGE_ROW_LEAF page. */ static int -__debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page) +__debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page, WT_CURSOR *hs_cursor) { WT_CELL_UNPACK_KV *unpack, _unpack; WT_INSERT_HEAD *insert; @@ -1353,7 +1338,7 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page) * Dump any K/V pairs inserted into the page before the first from-disk key on the page. 
*/ if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL) - WT_RET(__debug_row_skip(ds, insert)); + WT_RET(__debug_row_skip(ds, insert, hs_cursor)); /* Dump the page's K/V pairs. */ WT_ROW_FOREACH (page, rip, i) { @@ -1366,11 +1351,11 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page) if ((upd = WT_ROW_UPDATE(page, rip)) != NULL) WT_RET(__debug_update(ds, upd, false)); - if (!WT_IS_HS(session->dhandle) && session->hs_cursor != NULL) - WT_RET(__debug_hs_key(ds)); + if (!WT_IS_HS(session->dhandle) && hs_cursor != NULL) + WT_RET(__debug_hs_key(ds, hs_cursor)); if ((insert = WT_ROW_INSERT(page, rip)) != NULL) - WT_RET(__debug_row_skip(ds, insert)); + WT_RET(__debug_row_skip(ds, insert, hs_cursor)); } return (0); } @@ -1380,7 +1365,8 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page) * Dump a column-store skiplist. */ static int -__debug_col_skip(WT_DBG *ds, WT_INSERT_HEAD *head, const char *tag, bool hexbyte) +__debug_col_skip( + WT_DBG *ds, WT_INSERT_HEAD *head, const char *tag, bool hexbyte, WT_CURSOR *hs_cursor) { WT_INSERT *ins; WT_SESSION_IMPL *session; @@ -1392,11 +1378,11 @@ __debug_col_skip(WT_DBG *ds, WT_INSERT_HEAD *head, const char *tag, bool hexbyte WT_RET(ds->f(ds, "\t%s %" PRIu64 "\n", tag, WT_INSERT_RECNO(ins))); WT_RET(__debug_update(ds, ins->upd, hexbyte)); - if (!WT_IS_HS(session->dhandle) && session->hs_cursor != NULL) { + if (!WT_IS_HS(session->dhandle) && hs_cursor != NULL) { p = ds->key->mem; WT_RET(__wt_vpack_uint(&p, 0, WT_INSERT_RECNO(ins))); ds->key->size = WT_PTRDIFF(p, ds->key->mem); - WT_RET(__debug_hs_key(ds)); + WT_RET(__debug_hs_key(ds, hs_cursor)); } } return (0); @@ -1407,7 +1393,7 @@ __debug_col_skip(WT_DBG *ds, WT_INSERT_HEAD *head, const char *tag, bool hexbyte * Dump an insert list. 
*/ static int -__debug_row_skip(WT_DBG *ds, WT_INSERT_HEAD *head) +__debug_row_skip(WT_DBG *ds, WT_INSERT_HEAD *head, WT_CURSOR *hs_cursor) { WT_INSERT *ins; WT_SESSION_IMPL *session; @@ -1418,9 +1404,9 @@ __debug_row_skip(WT_DBG *ds, WT_INSERT_HEAD *head) WT_RET(__debug_item_key(ds, "insert", WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins))); WT_RET(__debug_update(ds, ins->upd, false)); - if (!WT_IS_HS(session->dhandle) && session->hs_cursor != NULL) { + if (!WT_IS_HS(session->dhandle) && hs_cursor != NULL) { WT_RET(__wt_buf_set(session, ds->key, WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins))); - WT_RET(__debug_hs_key(ds)); + WT_RET(__debug_hs_key(ds, hs_cursor)); } } return (0); diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index 3faf1c84aa3..a4d82c3d904 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -71,7 +71,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) return (0); } - WT_RET(__wt_hs_cursor_cache(session)); + WT_RET(__wt_curhs_cache(session)); (void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1); ret = __wt_evict(session, ref, previous_state, 0); (void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1); diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c index 0bf0fb8672a..cf2aca0fc87 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c @@ -278,9 +278,7 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) */ if (ret == 0 && (ckpt + 1)->name == NULL && !skip_hs) { /* Open a history store cursor. */ - WT_ERR(__wt_hs_cursor_open(session)); WT_TRET(__wt_hs_verify_one(session)); - WT_TRET(__wt_hs_cursor_close(session)); /* * We cannot error out here. If we got an error verifying the history store, we need * to follow through with reacquiring the exclusive call below. 
We'll error out @@ -778,11 +776,12 @@ __verify_key_hs( wt_timestamp_t older_start_ts, older_stop_ts; uint64_t hs_counter; uint32_t hs_btree_id; - int cmp, exact; char ts_string[2][WT_TS_INT_STRING_SIZE]; btree = S2BT(session); hs_btree_id = btree->id; + WT_RET(__wt_curhs_open(session, NULL, &hs_cursor)); + F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); /* * Set the data store timestamp and transactions to initiate timestamp range verification. Since @@ -795,36 +794,23 @@ __verify_key_hs( * Open a history store cursor positioned at the end of the data store key (the newest record) * and iterate backwards until we reach a different key or btree. */ - hs_cursor = session->hs_cursor; - hs_cursor->set_key(hs_cursor, hs_btree_id, tmp1, WT_TS_MAX, WT_TXN_MAX); - ret = hs_cursor->search_near(hs_cursor, &exact); - - /* If we jumped to the next key, go back to the previous key. */ - if (ret == 0 && exact > 0) - ret = hs_cursor->prev(hs_cursor); + hs_cursor->set_key(hs_cursor, 4, hs_btree_id, tmp1, WT_TS_MAX, UINT64_MAX); + ret = __wt_curhs_search_near_before(session, hs_cursor); for (; ret == 0; ret = hs_cursor->prev(hs_cursor)) { - WT_RET(hs_cursor->get_key(hs_cursor, &hs_btree_id, vs->tmp2, &older_start_ts, &hs_counter)); - - if (hs_btree_id != btree->id) - break; - - WT_RET(__wt_compare(session, NULL, tmp1, vs->tmp2, &cmp)); - if (cmp != 0) - break; - + WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, vs->tmp2, &older_start_ts, &hs_counter)); /* Verify the newer record's start is later than the older record's stop. 
*/ if (newer_start_ts < older_stop_ts) { - WT_RET_MSG(session, WT_ERROR, + WT_ERR_MSG(session, WT_ERROR, "key %s has a overlap of timestamp ranges between history store stop timestamp %s " "being newer than a more recent timestamp range having start timestamp %s", __wt_buf_set_printable(session, tmp1->data, tmp1->size, vs->tmp2), - __verify_timestamp_to_pretty_string(older_stop_ts, ts_string[0]), - __verify_timestamp_to_pretty_string(newer_start_ts, ts_string[1])); + __wt_timestamp_to_string(older_stop_ts, ts_string[0]), + __wt_timestamp_to_string(newer_start_ts, ts_string[1])); } if (vs->stable_timestamp != WT_TS_NONE) - WT_RET( + WT_ERR( __verify_ts_stable_cmp(session, tmp1, NULL, 0, older_start_ts, older_stop_ts, vs)); /* @@ -833,7 +819,8 @@ __verify_key_hs( */ newer_start_ts = older_start_ts; } - +err: + WT_TRET(hs_cursor->close(hs_cursor)); return (ret == WT_NOTFOUND ? 0 : ret); #else WT_UNUSED(session); diff --git a/src/third_party/wiredtiger/src/cursor/cur_hs.c b/src/third_party/wiredtiger/src/cursor/cur_hs.c index f5574b3c1ce..d4cda2065c6 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_hs.c +++ b/src/third_party/wiredtiger/src/cursor/cur_hs.c @@ -8,15 +8,19 @@ #include "wt_internal.h" +static int __curhs_file_cursor_next(WT_SESSION_IMPL *, WT_CURSOR *); +static int __curhs_file_cursor_open(WT_SESSION_IMPL *, WT_CURSOR **); +static int __curhs_file_cursor_prev(WT_SESSION_IMPL *, WT_CURSOR *); +static int __curhs_file_cursor_search_near(WT_SESSION_IMPL *, WT_CURSOR *, int *); static int __curhs_prev_visible(WT_SESSION_IMPL *, WT_CURSOR_HS *); static int __curhs_next_visible(WT_SESSION_IMPL *, WT_CURSOR_HS *); - +static int __curhs_search_near_helper(WT_SESSION_IMPL *, WT_CURSOR *, bool); /* - * __hs_cursor_open_int -- + * __curhs_file_cursor_open -- * Open a new history store table cursor, internal function. 
*/ static int -__hs_cursor_open_int(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) +__curhs_file_cursor_open(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) { WT_CURSOR *cursor; WT_DECL_RET; @@ -34,12 +38,12 @@ __hs_cursor_open_int(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) } /* - * __wt_hs_cursor_cache -- + * __wt_curhs_cache -- * Cache a new history store table cursor. Open and then close a history store cursor without * saving it in the session. */ int -__wt_hs_cursor_cache(WT_SESSION_IMPL *session) +__wt_curhs_cache(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_CURSOR *cursor; @@ -70,45 +74,17 @@ __wt_hs_cursor_cache(WT_SESSION_IMPL *session) (session->dhandle != NULL && WT_IS_METADATA(S2BT(session)->dhandle)) || session == conn->default_session) return (0); - WT_RET(__hs_cursor_open_int(session, &cursor)); + WT_RET(__curhs_file_cursor_open(session, &cursor)); WT_RET(cursor->close(cursor)); return (0); } /* - * __wt_hs_cursor_open -- - * Open a new history store table cursor wrapper function. - */ -int -__wt_hs_cursor_open(WT_SESSION_IMPL *session) -{ - /* Not allowed to open a cursor if you already have one */ - WT_ASSERT(session, session->hs_cursor == NULL); - - return (__hs_cursor_open_int(session, &session->hs_cursor)); -} - -/* - * __wt_hs_cursor_close -- - * Discard a history store cursor. - */ -int -__wt_hs_cursor_close(WT_SESSION_IMPL *session) -{ - /* Should only be called when session has an open history store cursor */ - WT_ASSERT(session, session->hs_cursor != NULL); - - WT_RET(session->hs_cursor->close(session->hs_cursor)); - session->hs_cursor = NULL; - return (0); -} - -/* - * __wt_hs_cursor_next -- + * __curhs_file_cursor_next -- * Execute a next operation on a history store cursor with the appropriate isolation level. 
*/ -int -__wt_hs_cursor_next(WT_SESSION_IMPL *session, WT_CURSOR *cursor) +static int +__curhs_file_cursor_next(WT_SESSION_IMPL *session, WT_CURSOR *cursor) { WT_DECL_RET; @@ -117,11 +93,11 @@ __wt_hs_cursor_next(WT_SESSION_IMPL *session, WT_CURSOR *cursor) } /* - * __wt_hs_cursor_prev -- + * __curhs_file_cursor_prev -- * Execute a prev operation on a history store cursor with the appropriate isolation level. */ -int -__wt_hs_cursor_prev(WT_SESSION_IMPL *session, WT_CURSOR *cursor) +static int +__curhs_file_cursor_prev(WT_SESSION_IMPL *session, WT_CURSOR *cursor) { WT_DECL_RET; @@ -130,12 +106,12 @@ __wt_hs_cursor_prev(WT_SESSION_IMPL *session, WT_CURSOR *cursor) } /* - * __wt_hs_cursor_search_near -- + * __curhs_file_cursor_search_near -- * Execute a search near operation on a history store cursor with the appropriate isolation * level. */ -int -__wt_hs_cursor_search_near(WT_SESSION_IMPL *session, WT_CURSOR *cursor, int *exactp) +static int +__curhs_file_cursor_search_near(WT_SESSION_IMPL *session, WT_CURSOR *cursor, int *exactp) { WT_DECL_RET; @@ -145,8 +121,34 @@ __wt_hs_cursor_search_near(WT_SESSION_IMPL *session, WT_CURSOR *cursor, int *exa } /* + * __curhs_set_key_ptr -- + * Copy the key buffer pointer from file cursor to the history store cursor. + */ +static inline void +__curhs_set_key_ptr(WT_CURSOR *hs_cursor, WT_CURSOR *file_cursor) +{ + hs_cursor->key.data = file_cursor->key.data; + hs_cursor->key.size = file_cursor->key.size; + WT_ASSERT(CUR2S(file_cursor), F_ISSET(file_cursor, WT_CURSTD_KEY_SET)); + F_SET(hs_cursor, F_MASK(file_cursor, WT_CURSTD_KEY_SET)); +} + +/* + * __curhs_set_value_ptr -- + * Copy the value buffer pointer from file cursor to the history store cursor. 
+ */ +static inline void +__curhs_set_value_ptr(WT_CURSOR *hs_cursor, WT_CURSOR *file_cursor) +{ + hs_cursor->value.data = file_cursor->value.data; + hs_cursor->value.size = file_cursor->value.size; + WT_ASSERT(CUR2S(file_cursor), F_ISSET(file_cursor, WT_CURSTD_VALUE_SET)); + F_SET(hs_cursor, F_MASK(file_cursor, WT_CURSTD_VALUE_SET)); +} + +/* * __curhs_next -- - * WT_CURSOR->next method for the hs cursor type. + * WT_CURSOR->next method for the history store cursor type. */ static int __curhs_next(WT_CURSOR *cursor) @@ -160,7 +162,7 @@ __curhs_next(WT_CURSOR *cursor) file_cursor = hs_cursor->file_cursor; CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, next, CUR2BT(file_cursor)); - WT_ERR(__wt_hs_cursor_next(session, file_cursor)); + WT_ERR(__curhs_file_cursor_next(session, file_cursor)); /* * We need to check if the history store record is visible to the current session. If not, the * __curhs_next_visible() will also keep iterating forward through the records until it finds a @@ -168,6 +170,9 @@ __curhs_next(WT_CURSOR *cursor) */ WT_ERR(__curhs_next_visible(session, hs_cursor)); + __curhs_set_key_ptr(cursor, file_cursor); + __curhs_set_value_ptr(cursor, file_cursor); + if (0) { err: WT_TRET(cursor->reset(cursor)); @@ -177,7 +182,7 @@ err: /* * __curhs_prev -- - * WT_CURSOR->prev method for the hs cursor type. + * WT_CURSOR->prev method for the history store cursor type. */ static int __curhs_prev(WT_CURSOR *cursor) @@ -191,7 +196,7 @@ __curhs_prev(WT_CURSOR *cursor) file_cursor = hs_cursor->file_cursor; CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, prev, CUR2BT(file_cursor)); - WT_ERR(__wt_hs_cursor_prev(session, file_cursor)); + WT_ERR(__curhs_file_cursor_prev(session, file_cursor)); /* * We need to check if the history store record is visible to the current session. 
If not, the * __curhs_prev_visible() will also keep iterating backwards through the records until it finds @@ -199,6 +204,9 @@ __curhs_prev(WT_CURSOR *cursor) */ WT_ERR(__curhs_prev_visible(session, hs_cursor)); + __curhs_set_key_ptr(cursor, file_cursor); + __curhs_set_value_ptr(cursor, file_cursor); + if (0) { err: WT_TRET(cursor->reset(cursor)); @@ -208,7 +216,7 @@ err: /* * __curhs_close -- - * WT_CURSOR->close method for the hs cursor type. + * WT_CURSOR->close method for the history store cursor type. */ static int __curhs_close(WT_CURSOR *cursor) @@ -216,7 +224,6 @@ __curhs_close(WT_CURSOR *cursor) WT_CURSOR *file_cursor; WT_CURSOR_HS *hs_cursor; WT_DECL_RET; - WT_ITEM *datastore_key; WT_SESSION_IMPL *session; hs_cursor = (WT_CURSOR_HS *)cursor; @@ -224,11 +231,11 @@ __curhs_close(WT_CURSOR *cursor) CURSOR_API_CALL_PREPARE_ALLOWED( cursor, session, close, file_cursor == NULL ? NULL : CUR2BT(file_cursor)); err: + __wt_scr_free(session, &hs_cursor->datastore_key); if (file_cursor != NULL) WT_TRET(file_cursor->close(file_cursor)); - datastore_key = &hs_cursor->datastore_key; - __wt_scr_free(session, &datastore_key); __wt_cursor_close(cursor); + --session->hs_cursor_counter; API_END_RET(session, ret); } @@ -252,9 +259,15 @@ __curhs_reset(WT_CURSOR *cursor) ret = file_cursor->reset(file_cursor); WT_TIME_WINDOW_INIT(&hs_cursor->time_window); hs_cursor->btree_id = 0; - hs_cursor->datastore_key.data = NULL; - hs_cursor->datastore_key.size = 0; + hs_cursor->datastore_key->data = NULL; + hs_cursor->datastore_key->size = 0; hs_cursor->flags = 0; + cursor->key.data = NULL; + cursor->key.size = 0; + cursor->value.data = NULL; + cursor->value.size = 0; + F_CLR(cursor, WT_CURSTD_KEY_SET); + F_CLR(cursor, WT_CURSTD_VALUE_SET); err: API_END_RET(session, ret); @@ -262,7 +275,7 @@ err: /* * __curhs_set_key -- - * WT_CURSOR->set_key method for the hs cursor type. + * WT_CURSOR->set_key method for the history store cursor type. 
*/ static void __curhs_set_key(WT_CURSOR *cursor, ...) @@ -282,6 +295,7 @@ __curhs_set_key(WT_CURSOR *cursor, ...) start_ts = WT_TS_NONE; counter = 0; + hs_cursor->flags = 0; va_start(ap, cursor); arg_count = va_arg(ap, uint32_t); @@ -292,11 +306,11 @@ __curhs_set_key(WT_CURSOR *cursor, ...) if (arg_count > 1) { datastore_key = va_arg(ap, WT_ITEM *); WT_IGNORE_RET(__wt_buf_set( - session, &hs_cursor->datastore_key, datastore_key->data, datastore_key->size)); + session, hs_cursor->datastore_key, datastore_key->data, datastore_key->size)); F_SET(hs_cursor, WT_HS_CUR_KEY_SET); } else { - hs_cursor->datastore_key.data = NULL; - hs_cursor->datastore_key.size = 0; + hs_cursor->datastore_key->data = NULL; + hs_cursor->datastore_key->size = 0; F_CLR(hs_cursor, WT_HS_CUR_KEY_SET); } @@ -315,7 +329,9 @@ __curhs_set_key(WT_CURSOR *cursor, ...) va_end(ap); file_cursor->set_key( - file_cursor, hs_cursor->btree_id, &hs_cursor->datastore_key, start_ts, counter); + file_cursor, hs_cursor->btree_id, hs_cursor->datastore_key, start_ts, counter); + + __curhs_set_key_ptr(cursor, file_cursor); } /* @@ -342,8 +358,8 @@ __curhs_prev_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor) WT_ERR(__wt_scr_alloc(session, 0, &datastore_key)); - for (; ret == 0; ret = __wt_hs_cursor_prev(session, file_cursor)) { - WT_ERR(file_cursor->get_key(file_cursor, &btree_id, &datastore_key, &start_ts, &counter)); + for (; ret == 0; ret = __curhs_file_cursor_prev(session, file_cursor)) { + WT_ERR(file_cursor->get_key(file_cursor, &btree_id, datastore_key, &start_ts, &counter)); /* Stop before crossing over to the next btree. */ if (F_ISSET(hs_cursor, WT_HS_CUR_BTREE_ID_SET) && btree_id != hs_cursor->btree_id) { @@ -356,7 +372,7 @@ __curhs_prev_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor) * have crossed over the desired key and not found the record we are looking for. 
*/ if (F_ISSET(hs_cursor, WT_HS_CUR_KEY_SET)) { - WT_ERR(__wt_compare(session, NULL, datastore_key, &hs_cursor->datastore_key, &cmp)); + WT_ERR(__wt_compare(session, NULL, datastore_key, hs_cursor->datastore_key, &cmp)); if (cmp != 0) { ret = WT_NOTFOUND; goto err; @@ -379,6 +395,12 @@ __curhs_prev_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor) if (F_ISSET(std_cursor, WT_CURSTD_HS_READ_COMMITTED)) break; + /* + * If we are using a history store cursor and haven't set the WT_CURSTD_HS_READ_COMMITTED + * flag then we must have a snapshot, assert that we do. + */ + WT_ASSERT(session, F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT)); + if (__wt_txn_tw_stop_visible(session, &cbt->upd_value->tw)) { /* * If the stop time point of a record is visible to us, we won't be able to see anything @@ -425,8 +447,8 @@ __curhs_next_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor) WT_ERR(__wt_scr_alloc(session, 0, &datastore_key)); - for (; ret == 0; ret = __wt_hs_cursor_next(session, file_cursor)) { - WT_ERR(file_cursor->get_key(file_cursor, &btree_id, &datastore_key, &start_ts, &counter)); + for (; ret == 0; ret = __curhs_file_cursor_next(session, file_cursor)) { + WT_ERR(file_cursor->get_key(file_cursor, &btree_id, datastore_key, &start_ts, &counter)); /* Stop before crossing over to the next btree. */ if (F_ISSET(hs_cursor, WT_HS_CUR_BTREE_ID_SET) && btree_id != hs_cursor->btree_id) { @@ -439,7 +461,7 @@ __curhs_next_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor) * have crossed over the desired key and not found the record we are looking for. 
*/ if (F_ISSET(hs_cursor, WT_HS_CUR_KEY_SET)) { - WT_ERR(__wt_compare(session, NULL, datastore_key, &hs_cursor->datastore_key, &cmp)); + WT_ERR(__wt_compare(session, NULL, datastore_key, hs_cursor->datastore_key, &cmp)); if (cmp != 0) { ret = WT_NOTFOUND; goto err; @@ -463,6 +485,12 @@ __curhs_next_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor) break; /* + * If we are using a history store cursor and haven't set the WT_CURSTD_HS_READ_COMMITTED + * flag then we must have a snapshot, assert that we do. + */ + WT_ASSERT(session, F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT)); + + /* * If the stop time point of a record is visible to us, check the next one. */ if (__wt_txn_tw_stop_visible(session, &cbt->upd_value->tw)) @@ -479,170 +507,267 @@ err: } /* + * __wt_curhs_search_near_before -- + * Set the cursor position at the requested position or before it. + */ +int +__wt_curhs_search_near_before(WT_SESSION_IMPL *session, WT_CURSOR *cursor) +{ + return (__curhs_search_near_helper(session, cursor, true)); +} + +/* + * __wt_curhs_search_near_after -- + * Set the cursor position at the requested position or after it. + */ +int +__wt_curhs_search_near_after(WT_SESSION_IMPL *session, WT_CURSOR *cursor) +{ + return (__curhs_search_near_helper(session, cursor, false)); +} + +/* + * __curhs_search_near_helper -- + * Helper function to set the cursor position based on search criteria. + */ +static int +__curhs_search_near_helper(WT_SESSION_IMPL *session, WT_CURSOR *cursor, bool before) +{ + WT_DECL_ITEM(srch_key); + WT_DECL_RET; + int cmp; + + WT_RET(__wt_scr_alloc(session, 0, &srch_key)); + WT_ERR(__wt_buf_set(session, srch_key, cursor->key.data, cursor->key.size)); + WT_ERR(cursor->search_near(cursor, &cmp)); + if (before) { + /* + * If we want to land on a key that is smaller or equal to the specified key, keep walking + * backwards as there may be content inserted concurrently. 
+ */ + if (cmp > 0) { + while ((ret = cursor->prev(cursor)) == 0) { + WT_STAT_CONN_INCR(session, cursor_skip_hs_cur_position); + WT_STAT_DATA_INCR(session, cursor_skip_hs_cur_position); + WT_ERR(__wt_compare(session, NULL, &cursor->key, srch_key, &cmp)); + /* + * Exit if we have found a key that is smaller than or equal to the specified key. + */ + if (cmp <= 0) + break; + } + } + } else { + /* + * If we want to land on a key that is larger or equal to the specified key, keep walking + * forwards as there may be content inserted concurrently. + */ + if (cmp < 0) { + while ((ret = cursor->next(cursor)) == 0) { + WT_STAT_CONN_INCR(session, cursor_skip_hs_cur_position); + WT_STAT_DATA_INCR(session, cursor_skip_hs_cur_position); + WT_ERR(__wt_compare(session, NULL, &cursor->key, srch_key, &cmp)); + /* Exit if we have found a key that is larger than or equal to the specified key. */ + if (cmp >= 0) + break; + } + } + } + +err: + __wt_scr_free(session, &srch_key); + return (ret); +} + +/* * __curhs_search_near -- - * WT_CURSOR->search_near method for the hs cursor type. + * WT_CURSOR->search_near method for the history store cursor type. */ static int __curhs_search_near(WT_CURSOR *cursor, int *exactp) { WT_CURSOR *file_cursor; WT_CURSOR_HS *hs_cursor; + WT_DECL_ITEM(datastore_key); WT_DECL_ITEM(srch_key); WT_DECL_RET; WT_SESSION_IMPL *session; - int cmp; - int exact; + wt_timestamp_t start_ts; + uint64_t counter; + uint32_t btree_id; + int exact, cmp; hs_cursor = (WT_CURSOR_HS *)cursor; file_cursor = hs_cursor->file_cursor; *exactp = 0; - cmp = 0; CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, search_near, CUR2BT(file_cursor)); + WT_ERR(__wt_scr_alloc(session, 0, &datastore_key)); WT_ERR(__wt_scr_alloc(session, 0, &srch_key)); /* At least we have the btree id set. */ WT_ASSERT(session, F_ISSET(hs_cursor, WT_HS_CUR_BTREE_ID_SET)); WT_ERR(__wt_buf_set(session, srch_key, file_cursor->key.data, file_cursor->key.size)); /* Reset cursor if we get WT_NOTFOUND. 
*/ - WT_ERR(__wt_hs_cursor_search_near(session, file_cursor, &exact)); + WT_ERR(__curhs_file_cursor_search_near(session, file_cursor, &exact)); - /* - * There are some key fields missing so we are searching a range of keys. Place the cursor at - * the start of the range. - */ - if (!F_ISSET(hs_cursor, WT_HS_CUR_COUNTER_SET)) { + if (exact >= 0) { /* - * If we raced with a history store insert, we may be two or more records away from our - * target. Keep iterating forwards until we are on or past our target key. - * - * We can't use the cursor positioning helper that we use for regular reads since that will - * place us at the end of a particular key/timestamp range whereas we want to be placed at - * the beginning. + * We placed the file cursor before the search key. Try first to walk forwards to see if we + * can find a visible record. If nothing is visible, try to walk backwards. */ - if (exact < 0) { - while ((ret = __wt_hs_cursor_next(session, file_cursor)) == 0) { - WT_ERR(__wt_compare(session, NULL, &file_cursor->key, srch_key, &cmp)); - if (cmp >= 0) - break; + WT_ERR_NOTFOUND_OK(__curhs_next_visible(session, hs_cursor), true); + if (ret == WT_NOTFOUND) { + /* + * When walking backwards, first ensure we walk back to the specified btree or key space + * as we may have crossed the boundary. Do that in a loop as there may be content + * inserted concurrently. + */ + while ((ret = __curhs_file_cursor_prev(session, file_cursor)) == 0) { + WT_ERR( + file_cursor->get_key(file_cursor, &btree_id, datastore_key, &start_ts, &counter)); + + /* We are back in the specified btree range. */ + if (btree_id == hs_cursor->btree_id && F_ISSET(hs_cursor, WT_HS_CUR_KEY_SET)) { + WT_ERR( + __wt_compare(session, NULL, datastore_key, hs_cursor->datastore_key, &cmp)); + + /* We are back in the specified key range. */ + if (cmp == 0) + break; + + /* + * We are now smaller than the key range, which indicates nothing is visible to + * us in the specified key range. 
+ */ + if (cmp < 0) { + ret = WT_NOTFOUND; + goto err; + } + } + + /* + * We are now smaller than the btree range, which indicates nothing is visible to us + * in the specified btree range. + */ + if (btree_id < hs_cursor->btree_id) { + ret = WT_NOTFOUND; + goto err; + } } + WT_ERR(ret); + /* + * Keeping looking for the first visible update in the specified range when walking + * backwards. + */ + WT_ERR(__curhs_prev_visible(session, hs_cursor)); /* - * No entries greater than or equal to the key we searched for. Reset cursor if we get - * WT_NOTFOUND. + * We can't find anything visible when first walking forwards so we must have found an + * update that is smaller than the specified key. */ + *exactp = -1; + } else { WT_ERR(ret); - - *exactp = cmp; - } else - *exactp = 1; - - WT_ERR(__curhs_next_visible(session, hs_cursor)); - } - /* Search the closest match that is smaller or equal to the search key. */ - else { + /* + * We find an update when walking forwards. If initially we land on the same key as the + * specified key, exact will be 0 and we should return that. If it is not visible, we + * must have found a key that is larger than the specified key. + */ + *exactp = exact; + } + } else { /* - * Because of the special visibility rules for the history store, a new key can appear in - * between our search and the set of updates that we're interested in. Keep trying until we - * find it. - * - * There may be no history store entries for the given btree id and record key if they have - * been removed by rollback to stable. - * - * Note that we need to compare the raw key off the cursor to determine where we are in the - * history store as opposed to comparing the embedded data store key since the ordering is - * not guaranteed to be the same. + * We placed the file cursor after the search key. Try first to walk backwards to see if we + * can find a visible record. If nothing is visible, try to walk forwards. 
*/ - if (exact > 0) { + WT_ERR_NOTFOUND_OK(__curhs_prev_visible(session, hs_cursor), true); + if (ret == WT_NOTFOUND) { /* - * It's possible that we may race with a history store insert for another key. So we may - * be more than one record away the end of our target key/timestamp range. Keep - * iterating backwards until we land on our key. + * When walking forwards, first ensure we walk back to the specified btree or key space + * as we may have crossed the boundary. Do that in a loop as there may be content + * inserted concurrently. */ - while ((ret = __wt_hs_cursor_prev(session, file_cursor)) == 0) { - WT_STAT_CONN_DATA_INCR(session, cursor_skip_hs_cur_position); - - WT_ERR(__wt_compare(session, NULL, &file_cursor->key, srch_key, &cmp)); - if (cmp <= 0) - break; + while ((ret = __curhs_file_cursor_next(session, file_cursor)) == 0) { + WT_ERR( + file_cursor->get_key(file_cursor, &btree_id, datastore_key, &start_ts, &counter)); + + /* We are back in the specified btree range. */ + if (btree_id == hs_cursor->btree_id && F_ISSET(hs_cursor, WT_HS_CUR_KEY_SET)) { + WT_ERR( + __wt_compare(session, NULL, datastore_key, hs_cursor->datastore_key, &cmp)); + + /* We are back in the specified key range. */ + if (cmp == 0) + break; + + /* + * We are now larger than the key range, which indicates nothing is visible to + * us in the specified key range. + */ + if (cmp > 0) { + ret = WT_NOTFOUND; + goto err; + } + } + + /* + * We are now larger than the btree range, which indicates nothing is visible to us + * in the specified btree range. + */ + if (btree_id > hs_cursor->btree_id) { + ret = WT_NOTFOUND; + goto err; + } } + WT_ERR(ret); + /* + * Keeping looking for the first visible update in the specified range when walking + * forwards. + */ + WT_ERR(__curhs_next_visible(session, hs_cursor)); + /* + * We can't find anything visible when first walking backwards so we must have found an + * update that is larger than the specified key. 
+ */ + *exactp = 1; + } else { + WT_ERR(ret); + *exactp = exact; + } + } - *exactp = cmp; - } else - *exactp = -1; #ifdef HAVE_DIAGNOSTIC - if (ret == 0) { - WT_ERR(__wt_compare(session, NULL, &file_cursor->key, srch_key, &cmp)); - WT_ASSERT(session, cmp <= 0); - } + WT_ERR(__wt_compare(session, NULL, &file_cursor->key, srch_key, &cmp)); + WT_ASSERT( + session, (cmp == 0 && *exactp == 0) || (cmp < 0 && *exactp < 0) || (cmp > 0 && *exactp > 0)); #endif - WT_ERR(__curhs_prev_visible(session, hs_cursor)); - } + __curhs_set_key_ptr(cursor, file_cursor); + __curhs_set_value_ptr(cursor, file_cursor); if (0) { err: WT_TRET(cursor->reset(cursor)); } + __wt_scr_free(session, &datastore_key); __wt_scr_free(session, &srch_key); API_END_RET(session, ret); } /* - * __curhs_get_key -- - * WT_CURSOR->get_key method for the hs cursor type. - */ -static int -__curhs_get_key(WT_CURSOR *cursor, ...) -{ - WT_CURSOR *file_cursor; - WT_CURSOR_HS *hs_cursor; - WT_DECL_RET; - va_list ap; - - hs_cursor = (WT_CURSOR_HS *)cursor; - file_cursor = hs_cursor->file_cursor; - - va_start(ap, cursor); - ret = file_cursor->get_key(file_cursor, va_arg(ap, uint32_t *), va_arg(ap, WT_ITEM **), - va_arg(ap, wt_timestamp_t *), va_arg(ap, uint64_t *)); - va_end(ap); - - return (ret); -} - -/* - * __curhs_get_value -- - * WT_CURSOR->get_value method for the hs cursor type. - */ -static int -__curhs_get_value(WT_CURSOR *cursor, ...) -{ - WT_CURSOR *file_cursor; - WT_CURSOR_HS *hs_cursor; - WT_DECL_RET; - va_list ap; - - hs_cursor = (WT_CURSOR_HS *)cursor; - file_cursor = hs_cursor->file_cursor; - - va_start(ap, cursor); - ret = file_cursor->get_value(file_cursor, va_arg(ap, wt_timestamp_t *), - va_arg(ap, wt_timestamp_t *), va_arg(ap, uint64_t *), va_arg(ap, WT_ITEM **)); - va_end(ap); - - return (ret); -} - -/* * __curhs_set_value -- - * WT_CURSOR->set_value method for the hs cursor type. + * WT_CURSOR->set_value method for the history store cursor type. 
*/ static void __curhs_set_value(WT_CURSOR *cursor, ...) { WT_CURSOR *file_cursor; WT_CURSOR_HS *hs_cursor; + WT_ITEM *hs_val; + wt_timestamp_t start_ts; + wt_timestamp_t stop_ts; + uint64_t type; va_list ap; hs_cursor = (WT_CURSOR_HS *)cursor; @@ -650,14 +775,20 @@ __curhs_set_value(WT_CURSOR *cursor, ...) va_start(ap, cursor); hs_cursor->time_window = *va_arg(ap, WT_TIME_WINDOW *); - file_cursor->set_value(file_cursor, va_arg(ap, wt_timestamp_t), va_arg(ap, wt_timestamp_t), - va_arg(ap, uint64_t), va_arg(ap, WT_ITEM *)); + stop_ts = va_arg(ap, wt_timestamp_t); + start_ts = va_arg(ap, wt_timestamp_t); + type = va_arg(ap, uint64_t); + hs_val = va_arg(ap, WT_ITEM *); + + file_cursor->set_value(file_cursor, stop_ts, start_ts, type, hs_val); va_end(ap); + + __curhs_set_value_ptr(cursor, file_cursor); } /* * __curhs_insert -- - * WT_CURSOR->insert method for the hs cursor type. + * WT_CURSOR->insert method for the history store cursor type. */ static int __curhs_insert(WT_CURSOR *cursor) @@ -676,6 +807,12 @@ __curhs_insert(WT_CURSOR *cursor) CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, insert, CUR2BT(file_cursor)); + /* + * Disable bulk loads into history store. This would normally occur when updating a record with + * a cursor however the history store doesn't use cursor update, so we do it here. + */ + __wt_cursor_disable_bulk(session); + /* Allocate a tombstone only when there is a valid stop time point. */ if (WT_TIME_WINDOW_HAS_STOP(&hs_cursor->time_window)) { /* @@ -701,7 +838,6 @@ __curhs_insert(WT_CURSOR *cursor) if (hs_tombstone != NULL) { hs_tombstone->next = hs_upd; hs_upd = hs_tombstone; - hs_tombstone = NULL; } retry: @@ -725,7 +861,7 @@ err: /* * __curhs_remove -- - * WT_CURSOR->remove method for the hs cursor type. + * WT_CURSOR->remove method for the history store cursor type. 
*/ static int __curhs_remove(WT_CURSOR *cursor) @@ -734,9 +870,14 @@ __curhs_remove(WT_CURSOR *cursor) WT_CURSOR_BTREE *cbt; WT_CURSOR_HS *hs_cursor; WT_DECL_RET; + WT_ITEM hs_key; WT_SESSION_IMPL *session; WT_UPDATE *hs_tombstone; + wt_timestamp_t hs_start_ts; + uint64_t hs_counter; + uint32_t hs_btree_id; + WT_CLEAR(hs_key); hs_cursor = (WT_CURSOR_HS *)cursor; file_cursor = hs_cursor->file_cursor; cbt = (WT_CURSOR_BTREE *)file_cursor; @@ -745,7 +886,9 @@ __curhs_remove(WT_CURSOR *cursor) CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, insert, CUR2BT(file_cursor)); /* Remove must be called with cursor positioned. */ - WT_ASSERT(session, F_ISSET(file_cursor, WT_CURSTD_KEY_INT)); + WT_ASSERT(session, F_ISSET(cursor, WT_CURSTD_KEY_INT)); + + WT_ERR(cursor->get_key(cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter)); /* * Since we're using internal functions to modify the row structure, we need to manually set the @@ -765,6 +908,7 @@ __curhs_remove(WT_CURSOR *cursor) /* Invalidate the previous value but we will hold on to the position of the key. */ F_CLR(file_cursor, WT_CURSTD_VALUE_SET); + F_CLR(cursor, WT_CURSTD_VALUE_SET); if (0) { err: @@ -777,7 +921,7 @@ err: /* * __curhs_update -- - * WT_CURSOR->update method for the hs cursor type. + * WT_CURSOR->update method for the history store cursor type. 
*/ static int __curhs_update(WT_CURSOR *cursor) @@ -785,15 +929,11 @@ __curhs_update(WT_CURSOR *cursor) WT_CURSOR *file_cursor; WT_CURSOR_BTREE *cbt; WT_CURSOR_HS *hs_cursor; - WT_DECL_ITEM(hs_value); WT_DECL_RET; WT_SESSION_IMPL *session; WT_UPDATE *hs_tombstone, *hs_upd; bool retry; - uint64_t hs_upd_type; - wt_timestamp_t hs_durable_ts, hs_stop_durable_ts; - hs_cursor = (WT_CURSOR_HS *)cursor; file_cursor = hs_cursor->file_cursor; cbt = (WT_CURSOR_BTREE *)file_cursor; @@ -814,34 +954,12 @@ __curhs_update(WT_CURSOR *cursor) WT_ASSERT(session, !WT_TIME_WINDOW_IS_EMPTY(&hs_cursor->time_window)); WT_ASSERT(session, WT_TIME_WINDOW_HAS_STOP(&hs_cursor->time_window)); - /* - * Ideally we want to check if we are positioned on the newest value for user key. However, we - * can't check if the timestamp was set to WT_TS_MAX when we searched for the key. We can can a - * next() on cursor to confirm there is no newer value but that would disturb our cursor. A more - * expensive method would be to search again and verify. - */ - /* The tombstone to represent the stop time window. */ WT_ERR(__wt_upd_alloc_tombstone(session, &hs_tombstone, NULL)); hs_tombstone->start_ts = hs_cursor->time_window.stop_ts; hs_tombstone->durable_ts = hs_cursor->time_window.durable_stop_ts; hs_tombstone->txnid = hs_cursor->time_window.stop_txn; - /* Modify the existing value with a new stop timestamp. */ - - /* Allocate a buffer for the history store value. */ - WT_ERR(__wt_scr_alloc(session, 0, &hs_value)); - - /* Retrieve the existing update value and stop timestamp. */ - WT_ERR(file_cursor->get_value( - file_cursor, &hs_stop_durable_ts, &hs_durable_ts, &hs_upd_type, hs_value)); - WT_ASSERT(session, hs_stop_durable_ts == WT_TS_MAX); - WT_ASSERT(session, (uint8_t)hs_upd_type == WT_UPDATE_STANDARD); - - /* Use set_value method to pack the new value. 
*/ - file_cursor->set_value( - file_cursor, hs_cursor->time_window.stop_ts, hs_durable_ts, hs_upd_type, hs_value); - WT_ERR(__wt_upd_alloc(session, &file_cursor->value, WT_UPDATE_STANDARD, &hs_upd, NULL)); hs_upd->start_ts = hs_cursor->time_window.start_ts; hs_upd->durable_ts = hs_cursor->time_window.durable_start_ts; @@ -850,6 +968,11 @@ __curhs_update(WT_CURSOR *cursor) /* Connect the tombstone to the update. */ hs_tombstone->next = hs_upd; + /* + * Since we're using internal functions to modify the row structure, we need to manually set the + * comparison to an exact match. + */ + cbt->compare = 0; /* Make the updates and if we fail, search and try again. */ while ((ret = __wt_hs_modify(cbt, hs_tombstone)) == WT_RESTART) { WT_WITH_PAGE_INDEX(session, ret = __wt_hs_row_search(cbt, &file_cursor->key, false)); @@ -863,11 +986,13 @@ __curhs_update(WT_CURSOR *cursor) WT_TRET(ret); } + __curhs_set_key_ptr(cursor, file_cursor); + __curhs_set_value_ptr(cursor, file_cursor); + if (0) { err: __wt_free(session, hs_tombstone); __wt_free(session, hs_upd); - __wt_scr_free(session, &hs_value); WT_TRET(cursor->reset(cursor)); } API_END_RET(session, ret); @@ -880,53 +1005,54 @@ err: int __wt_curhs_open(WT_SESSION_IMPL *session, WT_CURSOR *owner, WT_CURSOR **cursorp) { - WT_CURSOR_STATIC_INIT(iface, __curhs_get_key, /* get-key */ - __curhs_get_value, /* get-value */ - __curhs_set_key, /* set-key */ - __curhs_set_value, /* set-value */ - __wt_cursor_compare_notsup, /* compare */ - __wt_cursor_equals_notsup, /* equals */ - __curhs_next, /* next */ - __curhs_prev, /* prev */ - __curhs_reset, /* reset */ - __wt_cursor_notsup, /* search */ - __curhs_search_near, /* search-near */ - __curhs_insert, /* insert */ - __wt_cursor_modify_value_format_notsup, /* modify */ - __curhs_update, /* update */ - __curhs_remove, /* remove */ - __wt_cursor_notsup, /* reserve */ - __wt_cursor_reconfigure_notsup, /* reconfigure */ - __wt_cursor_notsup, /* cache */ - __wt_cursor_reopen_notsup, /* reopen 
*/ - __curhs_close); /* close */ + WT_CURSOR_STATIC_INIT(iface, __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __curhs_set_key, /* set-key */ + __curhs_set_value, /* set-value */ + __wt_cursor_compare_notsup, /* compare */ + __wt_cursor_equals_notsup, /* equals */ + __curhs_next, /* next */ + __curhs_prev, /* prev */ + __curhs_reset, /* reset */ + __wt_cursor_notsup, /* search */ + __curhs_search_near, /* search-near */ + __curhs_insert, /* insert */ + __wt_cursor_modify_value_format_notsup, /* modify */ + __curhs_update, /* update */ + __curhs_remove, /* remove */ + __wt_cursor_notsup, /* reserve */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ + __wt_cursor_notsup, /* cache */ + __wt_cursor_reopen_notsup, /* reopen */ + __curhs_close); /* close */ WT_CURSOR *cursor; WT_CURSOR_HS *hs_cursor; WT_DECL_RET; - WT_ITEM *datastore_key; + *cursorp = NULL; WT_RET(__wt_calloc_one(session, &hs_cursor)); + ++session->hs_cursor_counter; cursor = (WT_CURSOR *)hs_cursor; *cursor = iface; cursor->session = (WT_SESSION *)session; cursor->key_format = WT_HS_KEY_FORMAT; cursor->value_format = WT_HS_VALUE_FORMAT; + WT_ERR(__wt_strdup(session, WT_HS_URI, &cursor->uri)); /* Open the file cursor for operations on the regular history store .*/ - WT_ERR(__hs_cursor_open_int(session, &hs_cursor->file_cursor)); + WT_ERR(__curhs_file_cursor_open(session, &hs_cursor->file_cursor)); WT_ERR(__wt_cursor_init(cursor, WT_HS_URI, owner, NULL, cursorp)); WT_TIME_WINDOW_INIT(&hs_cursor->time_window); hs_cursor->btree_id = 0; - datastore_key = &hs_cursor->datastore_key; - WT_ERR(__wt_scr_alloc(session, 0, &datastore_key)); + WT_ERR(__wt_scr_alloc(session, 0, &hs_cursor->datastore_key)); hs_cursor->flags = 0; WT_TIME_WINDOW_INIT(&hs_cursor->time_window); if (0) { err: - WT_TRET(__curhs_close(cursor)); + WT_TRET(cursor->close(cursor)); *cursorp = NULL; } return (ret); diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c 
b/src/third_party/wiredtiger/src/evict/evict_lru.c index 17e559cb52b..2269a925d3d 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -285,7 +285,7 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) * busy and then opens a different file (in this case, the HS file), it can deadlock with a * thread waiting for the first file to drain from the eviction queue. See WT-5946 for details. */ - WT_RET(__wt_hs_cursor_cache(session)); + WT_RET(__wt_curhs_cache(session)); if (conn->evict_server_running && __wt_spin_trylock(session, &cache->evict_pass_lock) == 0) { /* * Cannot use WT_WITH_PASS_LOCK because this is a try lock. Fix when that is supported. We @@ -2330,7 +2330,6 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, bool readonly, d { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; - WT_CURSOR *hs_cursor_saved; WT_DECL_RET; WT_TRACK_OP_DECL; WT_TXN_GLOBAL *txn_global; @@ -2349,21 +2348,12 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, bool readonly, d txn_shared = WT_SESSION_TXN_SHARED(session); /* - * If we have a history store cursor, save it. This ensures that if eviction needs to access the - * history store, it will get its own cursor, avoiding potential problems if it were to - * reposition or reset a history store cursor that we're in the middle of using for something - * else. - */ - hs_cursor_saved = session->hs_cursor; - session->hs_cursor = NULL; - - /* * Before we enter the eviction generation, make sure this session has a cached history store * cursor, otherwise we can deadlock with a session wanting exclusive access to a handle: that * session will have a handle list write lock and will be waiting on eviction to drain, we'll be * inside eviction waiting on a handle list read lock to open a history store cursor. 
*/ - WT_ERR(__wt_hs_cursor_cache(session)); + WT_ERR(__wt_curhs_cache(session)); /* * It is not safe to proceed if the eviction server threads aren't setup yet. @@ -2464,12 +2454,6 @@ err: done: WT_TRACK_OP_END(session); - /* If the caller was using a history store cursor they should have closed it by now. */ - WT_ASSERT(session, session->hs_cursor == NULL); - - /* Restore the caller's history store cursor. */ - session->hs_cursor = hs_cursor_saved; - return (ret); } diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index 8919f3d6544..5d6954cb594 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -76,7 +76,7 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) evict_flags = LF_ISSET(WT_READ_NO_SPLIT) ? WT_EVICT_CALL_NO_SPLIT : 0; FLD_SET(evict_flags, WT_EVICT_CALL_URGENT); - WT_RET(__wt_hs_cursor_cache(session)); + WT_RET(__wt_curhs_cache(session)); (void)__wt_atomic_addv32(&btree->evict_busy, 1); ret = __wt_evict(session, ref, previous_state, evict_flags); (void)__wt_atomic_subv32(&btree->evict_busy, 1); @@ -131,7 +131,7 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint8_t previous_state, uint32 /* * Track history store pages being force evicted while holding a history store cursor open. */ - if (session->hs_cursor != NULL && WT_IS_HS(session->dhandle)) { + if (session->hs_cursor_counter > 0 && WT_IS_HS(session->dhandle)) { force_evict_hs = true; WT_STAT_CONN_INCR(session, cache_eviction_force_hs); } diff --git a/src/third_party/wiredtiger/src/history/hs_conn.c b/src/third_party/wiredtiger/src/history/hs_conn.c index 161aeec0030..6163d0042c7 100644 --- a/src/third_party/wiredtiger/src/history/hs_conn.c +++ b/src/third_party/wiredtiger/src/history/hs_conn.c @@ -55,22 +55,20 @@ __hs_cleanup_las(WT_SESSION_IMPL *session) /* * __wt_hs_get_btree -- - * Get the history store btree. 
Open a history store cursor if needed to get the btree. + * Get the history store btree by opening a history store cursor. */ int __wt_hs_get_btree(WT_SESSION_IMPL *session, WT_BTREE **hs_btreep) { + WT_CURSOR *hs_cursor; WT_DECL_RET; *hs_btreep = NULL; - WT_RET(__wt_hs_cursor_open(session)); - - *hs_btreep = CUR2BT(session->hs_cursor); + WT_RET(__wt_curhs_open(session, NULL, &hs_cursor)); + *hs_btreep = __wt_curhs_get_btree(hs_cursor); WT_ASSERT(session, *hs_btreep != NULL); - - WT_TRET(__wt_hs_cursor_close(session)); - + WT_TRET(hs_cursor->close(hs_cursor)); return (ret); } diff --git a/src/third_party/wiredtiger/src/history/hs_cursor.c b/src/third_party/wiredtiger/src/history/hs_cursor.c index 432fe116a72..31da7b2cc9b 100644 --- a/src/third_party/wiredtiger/src/history/hs_cursor.c +++ b/src/third_party/wiredtiger/src/history/hs_cursor.c @@ -87,117 +87,39 @@ __wt_hs_modify(WT_CURSOR_BTREE *hs_cbt, WT_UPDATE *hs_upd) } /* - * __hs_cursor_position_int -- - * Internal function to position a history store cursor at the end of a set of updates for a - * given btree id, record key and timestamp. + * __wt_hs_upd_time_window -- + * Get the underlying time window of the update history store cursor is positioned at. */ -static int -__hs_cursor_position_int(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, - const WT_ITEM *key, wt_timestamp_t timestamp, WT_ITEM *user_srch_key) +void +__wt_hs_upd_time_window(WT_CURSOR *hs_cursor, WT_TIME_WINDOW **twp) { - WT_DECL_ITEM(srch_key); - WT_DECL_RET; - int cmp, exact; - - /* The session should be pointing at the history store btree. */ - WT_ASSERT(session, WT_IS_HS((S2BT(session))->dhandle)); - - if (user_srch_key == NULL) - WT_RET(__wt_scr_alloc(session, 0, &srch_key)); - else - srch_key = user_srch_key; - - /* - * Because of the special visibility rules for the history store, a new key can appear in - * between our search and the set of updates that we're interested in. Keep trying until we find - * it. 
- * - * There may be no history store entries for the given btree id and record key if they have been - * removed by WT_CONNECTION::rollback_to_stable. - * - * Note that we need to compare the raw key off the cursor to determine where we are in the - * history store as opposed to comparing the embedded data store key since the ordering is not - * guaranteed to be the same. - */ - cursor->set_key(cursor, btree_id, key, timestamp, UINT64_MAX); - /* Copy the raw key before searching as a basis for comparison. */ - WT_ERR(__wt_buf_set(session, srch_key, cursor->key.data, cursor->key.size)); - WT_ERR(cursor->search_near(cursor, &exact)); - if (exact > 0) { - /* - * It's possible that we may race with a history store insert for another key. So we may be - * more than one record away the end of our target key/timestamp range. Keep iterating - * backwards until we land on our key. - */ - while ((ret = cursor->prev(cursor)) == 0) { - WT_STAT_CONN_DATA_INCR(session, cursor_skip_hs_cur_position); + WT_CURSOR_BTREE *hs_cbt; - WT_ERR(__wt_compare(session, NULL, &cursor->key, srch_key, &cmp)); - if (cmp <= 0) - break; - } - } -#ifdef HAVE_DIAGNOSTIC - if (ret == 0) { - WT_ERR(__wt_compare(session, NULL, &cursor->key, srch_key, &cmp)); - WT_ASSERT(session, cmp <= 0); - } -#endif -err: - if (user_srch_key == NULL) - __wt_scr_free(session, &srch_key); - return (ret); + hs_cbt = __wt_curhs_get_cbt(hs_cursor); + *twp = &hs_cbt->upd_value->tw; } /* - * __wt_hs_cursor_position -- - * Position a history store cursor at the end of a set of updates for a given btree id, record - * key and timestamp. There may be no history store entries for the given btree id and record - * key if they have been removed by WT_CONNECTION::rollback_to_stable. There is an optional - * argument to store the key that we used to position the cursor which can be used to assess - * where the cursor is relative to it. The function executes with isolation level set as - * WT_ISO_READ_UNCOMMITTED. 
+ * __wt_hs_find_upd -- + * Scan the history store for a record the btree cursor wants to position on. Create an update + * for the record and return to the caller. */ int -__wt_hs_cursor_position(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, - const WT_ITEM *key, wt_timestamp_t timestamp, WT_ITEM *user_srch_key) -{ - WT_DECL_RET; - - WT_WITH_BTREE(session, CUR2BT(cursor), - WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED, - ret = __hs_cursor_position_int(session, cursor, btree_id, key, timestamp, user_srch_key))); - return (ret); -} - -/* - * __hs_find_upd_int -- - * Internal helper to scan the history store for a record the btree cursor wants to position on. - * Create an update for the record and return to the caller. The caller may choose to optionally - * allow prepared updates to be returned regardless of whether prepare is being ignored - * globally. Otherwise, a prepare conflict will be returned upon reading a prepared update. - */ -static int -__hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key, - const char *value_format, uint64_t recno, WT_UPDATE_VALUE *upd_value, bool allow_prepare, - WT_ITEM *base_value_buf) +__wt_hs_find_upd(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key, + const char *value_format, uint64_t recno, WT_UPDATE_VALUE *upd_value, WT_ITEM *base_value_buf) { WT_CURSOR *hs_cursor; - WT_CURSOR_BTREE *hs_cbt; WT_DECL_ITEM(hs_value); WT_DECL_ITEM(orig_hs_value_buf); WT_DECL_RET; WT_ITEM hs_key, recno_key; WT_MODIFY_VECTOR modifies; - WT_TXN *txn; WT_TXN_SHARED *txn_shared; WT_UPDATE *mod_upd; - wt_timestamp_t durable_timestamp, durable_timestamp_tmp, hs_start_ts, hs_start_ts_tmp; + wt_timestamp_t durable_timestamp, durable_timestamp_tmp; wt_timestamp_t hs_stop_durable_ts, hs_stop_durable_ts_tmp, read_timestamp; - uint64_t hs_counter, hs_counter_tmp, upd_type_full; - uint32_t hs_btree_id; + uint64_t upd_type_full; uint8_t *p, recno_key_buf[WT_INTPACK64_MAXSIZE], upd_type; - int cmp; 
bool upd_found; hs_cursor = NULL; @@ -205,15 +127,11 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key, orig_hs_value_buf = NULL; WT_CLEAR(hs_key); __wt_modify_vector_init(session, &modifies); - txn = session->txn; txn_shared = WT_SESSION_TXN_SHARED(session); upd_found = false; WT_STAT_CONN_DATA_INCR(session, cursor_search_hs); - hs_cursor = session->hs_cursor; - hs_cbt = (WT_CURSOR_BTREE *)hs_cursor; - /* Row-store key is as passed to us, create the column-store key as needed. */ WT_ASSERT( session, (key == NULL && recno != WT_RECNO_OOB) || (key != NULL && recno == WT_RECNO_OOB)); @@ -226,70 +144,29 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key, key->size = WT_PTRDIFF(p, recno_key_buf); } - /* Allocate buffer for the history store value. */ - WT_ERR(__wt_scr_alloc(session, 0, &hs_value)); + WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor)); /* * After positioning our cursor, we're stepping backwards to find the correct update. Since the * timestamp is part of the key, our cursor needs to go from the newest record (further in the * history store) to the oldest (earlier in the history store) for a given key. - */ - read_timestamp = allow_prepare ? txn->prepare_timestamp : txn_shared->read_timestamp; - - /* + * * A reader without a timestamp should read the largest timestamp in the range, however cursor * search near if given a 0 timestamp will place at the top of the range and hide the records * below it. As such we need to adjust a 0 timestamp to the timestamp max value. */ - if (read_timestamp == WT_TS_NONE) - read_timestamp = WT_TS_MAX; + read_timestamp = + txn_shared->read_timestamp == WT_TS_NONE ? 
WT_TS_MAX : txn_shared->read_timestamp; - WT_ERR_NOTFOUND_OK( - __wt_hs_cursor_position(session, hs_cursor, btree_id, key, read_timestamp, NULL), true); + hs_cursor->set_key(hs_cursor, 4, btree_id, key, read_timestamp, UINT64_MAX); + WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, hs_cursor), true); if (ret == WT_NOTFOUND) { ret = 0; goto done; } - for (;; ret = __wt_hs_cursor_prev(session, hs_cursor)) { - WT_ERR_NOTFOUND_OK(ret, true); - /* If we hit the end of the table, let's get out of here. */ - if (ret == WT_NOTFOUND) { - ret = 0; - goto done; - } - WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter)); - - /* Stop before crossing over to the next btree */ - if (hs_btree_id != btree_id) - goto done; - - /* - * Keys are sorted in an order, skip the ones before the desired key, and bail out if we - * have crossed over the desired key and not found the record we are looking for. - */ - WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp)); - if (cmp != 0) - goto done; - - /* - * If the stop time pair on the tombstone in the history store is already globally visible - * we can skip it. - */ - if (__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw)) { - WT_STAT_CONN_DATA_INCR(session, cursor_prev_hs_tombstone); - continue; - } - /* - * If the stop time point of a record is visible to us, we won't be able to see anything for - * this entire key. Just jump straight to the end. - */ - if (__wt_txn_tw_stop_visible(session, &hs_cbt->upd_value->tw)) - goto done; - /* If the start time point is visible to us, let's return that record. */ - if (__wt_txn_tw_start_visible(session, &hs_cbt->upd_value->tw)) - break; - } + /* Allocate buffer for the history store value. 
*/ + WT_ERR(__wt_scr_alloc(session, 0, &hs_value)); WT_ERR(hs_cursor->get_value( hs_cursor, &hs_stop_durable_ts, &durable_timestamp, &upd_type_full, hs_value)); upd_type = (uint8_t)upd_type_full; @@ -320,6 +197,8 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key, * visibility checks when reading in order to construct the modify chain, so we can create * the value we expect. */ + F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); + while (upd_type == WT_UPDATE_MODIFY) { WT_ERR(__wt_upd_alloc(session, hs_value, upd_type, &mod_upd, NULL)); WT_ERR(__wt_modify_vector_push(&modifies, mod_upd)); @@ -330,7 +209,7 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key, * update here we fall back to the datastore version. If its timestamp doesn't match our * timestamp then we return not found. */ - WT_ERR_NOTFOUND_OK(__wt_hs_cursor_next(session, hs_cursor), true); + WT_ERR_NOTFOUND_OK(hs_cursor->next(hs_cursor), true); if (ret == WT_NOTFOUND) { /* * Fallback to the provided value as the base value. @@ -344,47 +223,6 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key, upd_type = WT_UPDATE_STANDARD; break; } - hs_start_ts_tmp = WT_TS_NONE; - /* - * Make sure we use the temporary variants of these variables. We need to retain the - * timestamps of the original modify we saw. - * - * We keep looking back into history store until we find a base update to apply the - * reverse deltas on top of. - */ - WT_ERR(hs_cursor->get_key( - hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts_tmp, &hs_counter_tmp)); - - if (hs_btree_id != btree_id) { - /* Fallback to the provided value as the base value. */ - orig_hs_value_buf = hs_value; - hs_value = base_value_buf; - upd_type = WT_UPDATE_STANDARD; - break; - } - - WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp)); - - if (cmp != 0) { - /* Fallback to the provided value as the base value. 
*/ - orig_hs_value_buf = hs_value; - hs_value = base_value_buf; - upd_type = WT_UPDATE_STANDARD; - break; - } - - /* - * If the stop time pair on the tombstone in the history store is already globally - * visible fall back to the base value. This is possible in scenarios where the latest - * updates are aborted by RTS according to stable timestamp. - */ - if (__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw)) { - /* Fallback to the provided value as the base value. */ - orig_hs_value_buf = hs_value; - hs_value = base_value_buf; - upd_type = WT_UPDATE_STANDARD; - break; - } WT_ERR(hs_cursor->get_value(hs_cursor, &hs_stop_durable_ts_tmp, &durable_timestamp_tmp, &upd_type_full, hs_value)); @@ -440,26 +278,8 @@ err: WT_ASSERT(session, ret != WT_NOTFOUND); - return (ret); -} - -/* - * __wt_hs_find_upd -- - * Scan the history store for a record. - */ -int -__wt_hs_find_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_format, uint64_t recno, - WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *base_value_buf) -{ - WT_BTREE *btree; - WT_DECL_RET; - - btree = S2BT(session); + if (hs_cursor != NULL) + WT_TRET(hs_cursor->close(hs_cursor)); - WT_RET(__wt_hs_cursor_open(session)); - WT_WITH_BTREE(session, CUR2BT(session->hs_cursor), - (ret = __hs_find_upd_int( - session, btree->id, key, value_format, recno, upd_value, allow_prepare, base_value_buf))); - WT_TRET(__wt_hs_cursor_close(session)); return (ret); } diff --git a/src/third_party/wiredtiger/src/history/hs_rec.c b/src/third_party/wiredtiger/src/history/hs_rec.c index 318804eb7e1..6f523d49089 100644 --- a/src/third_party/wiredtiger/src/history/hs_rec.c +++ b/src/third_party/wiredtiger/src/history/hs_rec.c @@ -11,8 +11,7 @@ static int __hs_delete_key_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_t btree_id, const WT_ITEM *key, bool reinsert); static int __hs_fixup_out_of_order_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, - WT_BTREE *btree, const WT_ITEM *key, 
wt_timestamp_t ts, uint64_t *hs_counter, - const WT_ITEM *srch_key); + WT_BTREE *btree, const WT_ITEM *key, wt_timestamp_t ts, uint64_t *hs_counter); /* * __hs_verbose_cache_stats -- @@ -61,100 +60,17 @@ __hs_verbose_cache_stats(WT_SESSION_IMPL *session, WT_BTREE *btree) } /* - * __hs_insert_record_with_btree_int -- - * Internal helper for inserting history store records. If this call is successful, the cursor - * parameter will be positioned on the newly inserted record. Otherwise, it will be reset. - */ -static int -__hs_insert_record_with_btree_int(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint64_t btree_id, - const WT_ITEM *key, const uint8_t type, const WT_ITEM *hs_value, WT_TIME_WINDOW *tw, - uint64_t counter) -{ - WT_CURSOR_BTREE *cbt; - WT_DECL_RET; - WT_UPDATE *hs_upd, *upd_local; - - cbt = (WT_CURSOR_BTREE *)cursor; - hs_upd = upd_local = NULL; - - /* The session should be pointing at the history store btree. */ - WT_ASSERT(session, WT_IS_HS((S2BT(session))->dhandle)); - - /* - * Use WT_CURSOR.set_key and WT_CURSOR.set_value to create key and value items, then use them to - * create an update chain for a direct insertion onto the history store page. - */ - cursor->set_key(cursor, btree_id, key, tw->start_ts, counter); - cursor->set_value(cursor, tw->durable_stop_ts, tw->durable_start_ts, (uint64_t)type, hs_value); - - /* Allocate a tombstone only when there is a valid stop time point. */ - if (WT_TIME_WINDOW_HAS_STOP(tw)) { - /* - * Insert a delete record to represent stop time point for the actual record to be inserted. - * Set the stop time point as the commit time point of the history store delete record. - */ - WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL)); - hs_upd->start_ts = tw->stop_ts; - hs_upd->durable_ts = tw->durable_stop_ts; - hs_upd->txnid = tw->stop_txn; - } - - /* - * Append to the delete record, the actual record to be inserted into the history store. 
Set the - * current update start time point as the commit time point to the history store record. - */ - WT_ERR(__wt_upd_alloc(session, &cursor->value, WT_UPDATE_STANDARD, &upd_local, NULL)); - upd_local->start_ts = tw->start_ts; - upd_local->durable_ts = tw->durable_start_ts; - upd_local->txnid = tw->start_txn; - - /* Insert the standard update as next update if there is a tombstone. */ - if (hs_upd != NULL) - hs_upd->next = upd_local; - else - hs_upd = upd_local; - - /* Search the page and insert the updates. */ - WT_WITH_PAGE_INDEX(session, ret = __wt_hs_row_search(cbt, &cursor->key, true)); - WT_ERR(ret); - WT_ERR(__wt_hs_modify(cbt, hs_upd)); - - /* - * Since the two updates (tombstone and the standard) will reconcile into a single entry, we are - * incrementing the history store insert statistic by one. - */ - WT_STAT_CONN_DATA_INCR(session, cache_hs_insert); - -err: - if (ret != 0) { - __wt_free_update_list(session, &hs_upd); - - /* - * We did a row search, release the cursor so that the page doesn't continue being held. - * - * If we were successful, do NOT reset the cursor. We may want to make use of its position - * later to remove timestamped entries. - */ - cursor->reset(cursor); - } - - return (ret); -} - -/* - * __hs_insert_record_with_btree -- + * __hs_insert_record -- * A helper function to insert the record into the history store including stop time point. - * Should be called with session's btree switched to the history store. 
*/ static int -__hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree, - const WT_ITEM *key, const uint8_t type, const WT_ITEM *hs_value, WT_TIME_WINDOW *tw) +__hs_insert_record(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree, const WT_ITEM *key, + const uint8_t type, const WT_ITEM *hs_value, WT_TIME_WINDOW *tw) { #ifdef HAVE_DIAGNOSTIC WT_CURSOR_BTREE *hs_cbt; #endif WT_DECL_ITEM(hs_key); - WT_DECL_ITEM(srch_key); #ifdef HAVE_DIAGNOSTIC WT_DECL_ITEM(existing_val); #endif @@ -164,37 +80,24 @@ __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BT wt_timestamp_t durable_timestamp_diag; wt_timestamp_t hs_stop_durable_ts_diag; uint64_t upd_type_full_diag; + int cmp; #endif uint64_t counter, hs_counter; uint32_t hs_btree_id; - int cmp; counter = 0; /* Allocate buffers for the history store and search key. */ WT_ERR(__wt_scr_alloc(session, 0, &hs_key)); - WT_ERR(__wt_scr_alloc(session, 0, &srch_key)); #ifdef HAVE_DIAGNOSTIC /* Allocate buffer for the existing history store value for the same key. */ WT_ERR(__wt_scr_alloc(session, 0, &existing_val)); - hs_cbt = (WT_CURSOR_BTREE *)cursor; + hs_cbt = __wt_curhs_get_cbt(cursor); #endif - /* - * The session should be pointing at the history store btree since this is the one that we'll be - * inserting into. The btree parameter that we're passing in should is the btree that the - * history store content is associated with (this is where the btree id part of the history - * store key comes from). - */ - WT_ASSERT(session, WT_IS_HS((S2BT(session))->dhandle)); - WT_ASSERT(session, !WT_IS_HS(btree->dhandle)); - - /* - * Disable bulk loads into history store. This would normally occur when updating a record with - * a cursor however the history store doesn't use cursor update, so we do it here. - */ - __wt_cursor_disable_bulk(session); + /* Sanity check that the btree is not a history store btree. 
*/ + WT_ASSERT(session, !WT_IS_HS(btree)); /* * Only deltas or full updates should be written to the history store. More specifically, we @@ -207,43 +110,33 @@ __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BT * timestamp. Otherwise the newly inserting history store record may fall behind the existing * one can lead to wrong order. */ - WT_ERR_NOTFOUND_OK( - __wt_hs_cursor_position(session, cursor, btree->id, key, tw->start_ts, srch_key), true); + cursor->set_key(cursor, 4, btree->id, key, tw->start_ts, UINT64_MAX); + WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, cursor), true); + if (ret == 0) { WT_ERR(cursor->get_key(cursor, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter)); - /* - * Check the whether the existing record is also from the same timestamp. - * - * Verify simple checks first to confirm whether the retrieved update same or not before - * performing the expensive key comparison. - */ - if (hs_btree_id == btree->id && tw->start_ts == hs_start_ts) { - WT_ERR(__wt_compare(session, NULL, hs_key, key, &cmp)); #ifdef HAVE_DIAGNOSTIC - if (cmp == 0) { - WT_ERR(cursor->get_value(cursor, &hs_stop_durable_ts_diag, &durable_timestamp_diag, - &upd_type_full_diag, existing_val)); - WT_ERR(__wt_compare(session, NULL, existing_val, hs_value, &cmp)); - /* - * Check if the existing HS value is same as the new value we are about to insert. - * We can skip this check if the existing value has a globally visible stop time, - * i.e., the value has been deleted from the HS. 
- */ - if (cmp == 0) - WT_ASSERT(session, - (WT_TIME_WINDOW_HAS_STOP(&hs_cbt->upd_value->tw) && - __wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw)) || - tw->start_txn == WT_TXN_NONE || - tw->start_txn != hs_cbt->upd_value->tw.start_txn || - tw->start_ts != hs_cbt->upd_value->tw.start_ts); - counter = hs_counter + 1; - } -#else + if (tw->start_ts == hs_start_ts) { + WT_ERR(cursor->get_value(cursor, &hs_stop_durable_ts_diag, &durable_timestamp_diag, + &upd_type_full_diag, existing_val)); + WT_ERR(__wt_compare(session, NULL, existing_val, hs_value, &cmp)); + /* + * We shouldn't be inserting the same value again for the key unless coming from a + * different transaction. If the updates are from the same transaction, the start + * timestamp for each update should be different. + */ if (cmp == 0) - counter = hs_counter + 1; -#endif + WT_ASSERT(session, + tw->start_txn == WT_TXN_NONE || + tw->start_txn != hs_cbt->upd_value->tw.start_txn || + tw->start_ts != hs_cbt->upd_value->tw.start_ts); + counter = hs_counter + 1; } +#else + if (tw->start_ts == hs_start_ts) + counter = hs_counter + 1; +#endif } /* @@ -251,10 +144,20 @@ __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BT * updates, we should remove them and reinsert them at the current timestamp. */ if (tw->start_ts != WT_TS_NONE) { - WT_ERR_NOTFOUND_OK(__wt_hs_cursor_next(session, cursor), true); + /* + * If there were no keys equal to or less than our target key, we would have received + * WT_NOTFOUND. In that case we need to search again with a higher timestamp as the cursor + * would not be positioned correctly. 
+ */ + if (ret == 0) + WT_ERR_NOTFOUND_OK(cursor->next(cursor), true); + else { + cursor->set_key(cursor, 3, btree->id, key, tw->start_ts + 1); + WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_after(session, cursor), true); + } if (ret == 0) WT_ERR(__hs_fixup_out_of_order_from_pos( - session, cursor, btree, key, tw->start_ts, &counter, srch_key)); + session, cursor, btree, key, tw->start_ts, &counter)); } #ifdef HAVE_DIAGNOSTIC @@ -270,36 +173,20 @@ __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BT } } #endif - /* The tree structure can change while we try to insert the mod list, retry if that happens. */ - while ((ret = __hs_insert_record_with_btree_int( - session, cursor, btree->id, key, type, hs_value, tw, counter)) == WT_RESTART) - WT_STAT_CONN_DATA_INCR(session, cache_hs_insert_restart); + + /* Insert the new record now. */ + cursor->set_key(cursor, 4, btree->id, key, tw->start_ts, counter); + cursor->set_value( + cursor, tw, tw->durable_stop_ts, tw->durable_start_ts, (uint64_t)type, hs_value); + WT_ERR(cursor->insert(cursor)); + WT_STAT_CONN_INCR(session, cache_hs_insert); + WT_STAT_DATA_INCR(session, cache_hs_insert); + err: #ifdef HAVE_DIAGNOSTIC __wt_scr_free(session, &existing_val); #endif __wt_scr_free(session, &hs_key); - __wt_scr_free(session, &srch_key); - /* We did a row search, release the cursor so that the page doesn't continue being held. */ - cursor->reset(cursor); - - return (ret); -} - -/* - * __hs_insert_record -- - * Temporarily switches to history store btree and calls the helper routine to insert records. 
- */ -static int -__hs_insert_record(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree, const WT_ITEM *key, - const uint8_t type, const WT_ITEM *hs_value, WT_TIME_WINDOW *tw) -{ - WT_CURSOR_BTREE *cbt; - WT_DECL_RET; - - cbt = (WT_CURSOR_BTREE *)cursor; - WT_WITH_BTREE(session, CUR2BT(cbt), - ret = __hs_insert_record_with_btree(session, cursor, btree, key, type, hs_value, tw)); return (ret); } @@ -346,8 +233,8 @@ __hs_next_upd_full_value(WT_SESSION_IMPL *session, WT_MODIFY_VECTOR *modifies, int __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) { - WT_BTREE *btree; - WT_CURSOR *cursor; + WT_BTREE *btree, *hs_btree; + WT_CURSOR *hs_cursor; WT_DECL_ITEM(full_value); WT_DECL_ITEM(key); WT_DECL_ITEM(modify_value); @@ -372,10 +259,13 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) bool enable_reverse_modify, hs_inserted, squashed, ts_updates_in_hs; btree = S2BT(session); - cursor = session->hs_cursor; prev_upd = NULL; insert_cnt = 0; WT_TIME_WINDOW_INIT(&tw); + + WT_RET(__wt_curhs_open(session, NULL, &hs_cursor)); + F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); + __wt_modify_vector_init(session, &modifies); if (!btree->hs_entries) @@ -560,13 +450,15 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) if (oldest_upd->type == WT_UPDATE_TOMBSTONE && oldest_upd == first_non_ts_upd && !F_ISSET(first_non_ts_upd, WT_UPDATE_CLEARED_HS)) { /* We can only delete history store entries that have timestamps. 
*/ - WT_ERR(__wt_hs_delete_key_from_ts(session, btree->id, key, 1, true)); - WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_non_ts); + WT_ERR(__wt_hs_delete_key_from_ts(session, hs_cursor, btree->id, key, 1, true)); + WT_STAT_CONN_INCR(session, cache_hs_key_truncate_non_ts); + WT_STAT_DATA_INCR(session, cache_hs_key_truncate_non_ts); F_SET(first_non_ts_upd, WT_UPDATE_CLEARED_HS); } else if (first_non_ts_upd != NULL && !F_ISSET(first_non_ts_upd, WT_UPDATE_CLEARED_HS) && (list->ins == NULL || ts_updates_in_hs)) { - WT_ERR(__wt_hs_delete_key_from_ts(session, btree->id, key, 1, true)); - WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_non_ts); + WT_ERR(__wt_hs_delete_key_from_ts(session, hs_cursor, btree->id, key, 1, true)); + WT_STAT_CONN_INCR(session, cache_hs_key_truncate_non_ts); + WT_STAT_DATA_INCR(session, cache_hs_key_truncate_non_ts); F_SET(first_non_ts_upd, WT_UPDATE_CLEARED_HS); } @@ -704,13 +596,13 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) enable_reverse_modify && __wt_calc_modify(session, prev_full_value, full_value, prev_full_value->size / 10, entries, &nentries) == 0) { - WT_ERR(__wt_modify_pack(cursor, entries, nentries, &modify_value)); + WT_ERR(__wt_modify_pack(hs_cursor, entries, nentries, &modify_value)); WT_ERR(__hs_insert_record( - session, cursor, btree, key, WT_UPDATE_MODIFY, modify_value, &tw)); + session, hs_cursor, btree, key, WT_UPDATE_MODIFY, modify_value, &tw)); __wt_scr_free(session, &modify_value); } else WT_ERR(__hs_insert_record( - session, cursor, btree, key, WT_UPDATE_STANDARD, full_value, &tw)); + session, hs_cursor, btree, key, WT_UPDATE_STANDARD, full_value, &tw)); /* Flag the update as now in the history store. 
*/ F_SET(upd, WT_UPDATE_HS); @@ -730,7 +622,8 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) WT_ERR(__wt_block_manager_named_size(session, WT_HS_FILE, &hs_size)); WT_STAT_CONN_SET(session, cache_hs_ondisk, hs_size); - max_hs_size = CUR2BT(cursor)->file_max; + hs_btree = __wt_curhs_get_btree(hs_cursor); + max_hs_size = hs_btree->file_max; if (max_hs_size != 0 && (uint64_t)hs_size > max_hs_size) WT_ERR_PANIC(session, WT_PANIC, "WiredTigerHS: file size of %" PRIu64 " exceeds maximum size %" PRIu64, (uint64_t)hs_size, @@ -747,95 +640,39 @@ err: __wt_modify_vector_free(&modifies); __wt_scr_free(session, &full_value); __wt_scr_free(session, &prev_full_value); + + WT_TRET(hs_cursor->close(hs_cursor)); return (ret); } /* - * __hs_delete_key_from_ts_int -- - * Internal helper for deleting history store content of a given key from a timestamp. + * __wt_hs_delete_key_from_ts -- + * Delete history store content of a given key from a timestamp. */ -static int -__hs_delete_key_from_ts_int( - WT_SESSION_IMPL *session, uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts, bool reinsert) +int +__wt_hs_delete_key_from_ts(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_t btree_id, + const WT_ITEM *key, wt_timestamp_t ts, bool reinsert) { - WT_CURSOR *hs_cursor; - WT_DECL_ITEM(srch_key); WT_DECL_RET; - WT_ITEM hs_key; - wt_timestamp_t hs_start_ts; - uint64_t hs_counter; - uint32_t hs_btree_id; - int cmp, exact; - - /* The session should be pointing at the history store btree. 
*/ - WT_ASSERT(session, WT_IS_HS((S2BT(session))->dhandle)); + bool hs_read_committed; - hs_cursor = session->hs_cursor; - WT_RET(__wt_scr_alloc(session, 0, &srch_key)); + hs_read_committed = F_ISSET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); + if (!hs_read_committed) + F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); - hs_cursor->set_key(hs_cursor, btree_id, key, ts, 0); - WT_ERR(__wt_buf_set(session, srch_key, hs_cursor->key.data, hs_cursor->key.size)); - WT_ERR_NOTFOUND_OK(__wt_hs_cursor_search_near(session, hs_cursor, &exact), true); + hs_cursor->set_key(hs_cursor, 3, btree_id, key, ts); + WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_after(session, hs_cursor), true); /* Empty history store is fine. */ - if (ret == WT_NOTFOUND) + if (ret == WT_NOTFOUND) { + ret = 0; goto done; - /* - * If we raced with a history store insert, we may be two or more records away from our target. - * Keep iterating forwards until we are on or past our target key. - * - * We can't use the cursor positioning helper that we use for regular reads since that will - * place us at the end of a particular key/timestamp range whereas we want to be placed at the - * beginning. - */ - if (exact < 0) { - while ((ret = __wt_hs_cursor_next(session, hs_cursor)) == 0) { - WT_ERR(__wt_compare(session, NULL, &hs_cursor->key, srch_key, &cmp)); - if (cmp >= 0) - break; - } - /* No entries greater than or equal to the key we searched for. */ - WT_ERR_NOTFOUND_OK(ret, true); - if (ret == WT_NOTFOUND) - goto done; } - /* Bailing out here also means we have no history store records for our key. 
*/ - WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter)); - if (hs_btree_id != btree_id) - goto done; - WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp)); - if (cmp != 0) - goto done; - WT_ASSERT(session, ts == WT_TS_NONE || hs_start_ts != WT_TS_NONE); WT_ERR(__hs_delete_key_from_pos(session, hs_cursor, btree_id, key, reinsert)); done: - ret = 0; err: - __wt_scr_free(session, &srch_key); - return (ret); -} - -/* - * __wt_hs_delete_key_from_ts -- - * Delete history store content of a given key from a timestamp. - */ -int -__wt_hs_delete_key_from_ts( - WT_SESSION_IMPL *session, uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts, bool reinsert) -{ - WT_DECL_RET; - - /* If the operation can't open new handles, it should have figured that out before here. */ - WT_ASSERT(session, !F_ISSET(session, WT_SESSION_NO_DATA_HANDLES)); - - /* The tree structure can change while we try to insert the mod list, retry if that happens. */ - do { - WT_WITH_BTREE(session, CUR2BT(session->hs_cursor), - (ret = __hs_delete_key_from_ts_int(session, btree_id, key, ts, reinsert))); - if (ret == WT_RESTART) - WT_STAT_CONN_DATA_INCR(session, cache_hs_insert_restart); - } while (ret == WT_RESTART); - + if (!hs_read_committed) + F_CLR(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); return (ret); } @@ -847,31 +684,29 @@ __wt_hs_delete_key_from_ts( */ static int __hs_fixup_out_of_order_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_BTREE *btree, - const WT_ITEM *key, wt_timestamp_t ts, uint64_t *counter, const WT_ITEM *srch_key) + const WT_ITEM *key, wt_timestamp_t ts, uint64_t *counter) { - WT_CURSOR *insert_cursor; + WT_CURSOR *hs_insert_cursor; WT_CURSOR_BTREE *hs_cbt; WT_DECL_RET; WT_ITEM hs_key, hs_value; - WT_TIME_WINDOW tw; - WT_UPDATE *tombstone; - wt_timestamp_t hs_ts, hs_start_durable_ts, hs_stop_durable_ts; + WT_TIME_WINDOW tw, hs_insert_tw; + wt_timestamp_t hs_ts; uint64_t hs_counter, hs_upd_type; uint32_t hs_btree_id; +#ifdef 
HAVE_DIAGNOSTIC int cmp; +#endif char ts_string[5][WT_TS_INT_STRING_SIZE]; - const char *open_cursor_cfg[] = {WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL}; - insert_cursor = NULL; - hs_cbt = (WT_CURSOR_BTREE *)hs_cursor; + hs_insert_cursor = NULL; + hs_cbt = __wt_curhs_get_cbt(hs_cursor); WT_CLEAR(hs_key); WT_CLEAR(hs_value); - WT_TIME_WINDOW_INIT(&tw); - tombstone = NULL; - - /* The session should be pointing at the history store btree. */ - WT_ASSERT(session, WT_IS_HS((S2BT(session))->dhandle)); +#ifndef HAVE_DIAGNOSTIC + WT_UNUSED(key); +#endif /* * Position ourselves at the beginning of the key range that we may have to fixup. Prior to * getting here, we've positioned our cursor at the end of a key/timestamp range and then done a @@ -881,15 +716,15 @@ __hs_fixup_out_of_order_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, * to keep doing "next" until we've got a key greater than the one we attempted to position * ourselves with. */ - for (; ret == 0; ret = __wt_hs_cursor_next(session, hs_cursor)) { - /* - * Prior to getting here, we've done a "search near" on our key for the timestamp we're - * inserting and then a "next". In the regular case, our cursor will be positioned on the - * next key and we'll break out of the first iteration in one of the conditions below. - */ + for (; ret == 0; ret = hs_cursor->next(hs_cursor)) { + /* We shouldn't have crossed the btree and user key search space. 
*/ WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_ts, &hs_counter)); - WT_ERR(__wt_compare(session, NULL, &hs_cursor->key, srch_key, &cmp)); - if (cmp > 0) + WT_ASSERT(session, hs_btree_id == btree->id); +#ifdef HAVE_DIAGNOSTIC + WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp)); + WT_ASSERT(session, cmp == 0); +#endif + if (hs_ts > ts) break; } if (ret == WT_NOTFOUND) @@ -916,27 +751,14 @@ __hs_fixup_out_of_order_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, * 2 foo 3 2 ccc * 2 foo 3 3 ddd */ - for (; ret == 0; ret = __wt_hs_cursor_next(session, hs_cursor)) { - /* - * Prior to getting here, we've done a "search near" on our key for the timestamp we're - * inserting and then a "next". In the regular case, our cursor will be positioned on the - * next key and we'll break out of the first iteration in one of the conditions below. - */ + for (; ret == 0; ret = hs_cursor->next(hs_cursor)) { + /* We shouldn't have crossed the btree and user key search space. */ WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_ts, &hs_counter)); - if (hs_btree_id != btree->id) - break; - + WT_ASSERT(session, hs_btree_id == btree->id); +#ifdef HAVE_DIAGNOSTIC WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp)); - if (cmp != 0) - break; - /* - * If the stop time pair on the tombstone in the history store is already globally visible - * we can skip it. - */ - if (__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw)) { - WT_STAT_CONN_DATA_INCR(session, cursor_next_hs_tombstone); - continue; - } + WT_ASSERT(session, cmp == 0); +#endif /* * If we got here, we've got out-of-order updates in the history store. * @@ -950,11 +772,8 @@ __hs_fixup_out_of_order_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, * Don't incur the overhead of opening this new cursor unless we need it. In the regular * case, we'll never get here. 
*/ - if (insert_cursor == NULL) { - WT_WITHOUT_DHANDLE(session, - ret = __wt_open_cursor(session, WT_HS_URI, NULL, open_cursor_cfg, &insert_cursor)); - WT_ERR(ret); - } + if (hs_insert_cursor == NULL) + WT_ERR(__wt_curhs_open(session, NULL, &hs_insert_cursor)); /* * If these history store records are resolved prepared updates, their durable timestamps @@ -973,47 +792,38 @@ __hs_fixup_out_of_order_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, __wt_timestamp_to_string(hs_cbt->upd_value->tw.durable_stop_ts, ts_string[3]), __wt_timestamp_to_string(ts, ts_string[4])); - tw.start_ts = tw.durable_start_ts = ts; - tw.start_txn = hs_cbt->upd_value->tw.start_txn; + hs_insert_tw.start_ts = hs_insert_tw.durable_start_ts = ts; + hs_insert_tw.start_txn = hs_cbt->upd_value->tw.start_txn; /* * We're going to be inserting something immediately after with the same timestamp. Either * another moved update OR the update itself that triggered the correction. In either case, * we should preserve the stop transaction id. */ - tw.stop_ts = tw.durable_stop_ts = ts; - tw.stop_txn = hs_cbt->upd_value->tw.stop_txn; + hs_insert_tw.stop_ts = hs_insert_tw.durable_stop_ts = ts; + hs_insert_tw.stop_txn = hs_cbt->upd_value->tw.stop_txn; /* Extract the underlying value for reinsertion. */ WT_ERR(hs_cursor->get_value( - hs_cursor, &hs_stop_durable_ts, &hs_start_durable_ts, &hs_upd_type, &hs_value)); + hs_cursor, &tw.durable_stop_ts, &tw.durable_start_ts, &hs_upd_type, &hs_value)); - /* Reinsert entry with earlier timestamp. */ - while ((ret = __hs_insert_record_with_btree_int(session, insert_cursor, btree->id, key, - (uint8_t)hs_upd_type, &hs_value, &tw, *counter)) == WT_RESTART) - ; - WT_ERR(ret); + /* Insert the value back with different timestamps. 
*/ + hs_insert_cursor->set_key(hs_insert_cursor, 4, btree->id, &hs_key, ts, *counter); + hs_insert_cursor->set_value(hs_insert_cursor, &hs_insert_tw, hs_insert_tw.durable_stop_ts, + hs_insert_tw.durable_start_ts, (uint64_t)hs_upd_type, &hs_value); + WT_ERR(hs_insert_cursor->insert(hs_insert_cursor)); ++(*counter); - /* Delete entry with higher timestamp. */ - hs_cbt->compare = 0; - WT_ERR(__wt_upd_alloc_tombstone(session, &tombstone, NULL)); - tombstone->txnid = WT_TXN_NONE; - tombstone->start_ts = tombstone->durable_ts = WT_TS_NONE; - while ((ret = __wt_hs_modify(hs_cbt, tombstone)) == WT_RESTART) { - WT_WITH_PAGE_INDEX(session, ret = __wt_hs_row_search(hs_cbt, &hs_cursor->key, false)); - WT_ERR(ret); - } - WT_ERR(ret); - tombstone = NULL; - WT_STAT_CONN_DATA_INCR(session, cache_hs_order_fixup_move); + /* Delete the entry with higher timestamp. */ + WT_ERR(hs_cursor->remove(hs_cursor)); + WT_STAT_CONN_INCR(session, cache_hs_order_fixup_move); + WT_STAT_DATA_INCR(session, cache_hs_order_fixup_move); } if (ret == WT_NOTFOUND) ret = 0; err: - __wt_free(session, tombstone); - if (insert_cursor != NULL) - insert_cursor->close(insert_cursor); + if (hs_insert_cursor != NULL) + hs_insert_cursor->close(hs_insert_cursor); return (ret); } @@ -1027,26 +837,21 @@ static int __hs_delete_key_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_t btree_id, const WT_ITEM *key, bool reinsert) { - WT_CURSOR *insert_cursor; + WT_CURSOR *hs_insert_cursor; WT_CURSOR_BTREE *hs_cbt; WT_DECL_RET; WT_ITEM hs_key, hs_value; - WT_TIME_WINDOW tw; - WT_UPDATE *upd; + WT_TIME_WINDOW hs_insert_tw; wt_timestamp_t durable_timestamp, hs_start_ts, hs_stop_durable_ts; uint64_t hs_counter, hs_insert_counter, hs_upd_type; uint32_t hs_btree_id; - int cmp; - const char *open_cursor_cfg[] = {WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL}; - hs_cbt = (WT_CURSOR_BTREE *)hs_cursor; + hs_cbt = __wt_curhs_get_cbt(hs_cursor); hs_insert_counter = 0; WT_CLEAR(hs_key); WT_CLEAR(hs_value); - 
WT_TIME_WINDOW_INIT(&tw); - upd = NULL; - insert_cursor = NULL; + hs_insert_cursor = NULL; if (reinsert) { /* * Determine the starting value of our counter, i.e. highest counter value of the timestamp @@ -1056,90 +861,60 @@ __hs_delete_key_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_ * The cursor will also be positioned at the start of the range that we wish to start * inserting. */ - WT_WITHOUT_DHANDLE(session, - ret = __wt_open_cursor(session, WT_HS_URI, NULL, open_cursor_cfg, &insert_cursor)); + WT_WITHOUT_DHANDLE(session, ret = __wt_curhs_open(session, NULL, &hs_insert_cursor)); WT_ERR(ret); - F_SET(insert_cursor, WT_CURSTD_IGNORE_TOMBSTONE); - WT_ERR_NOTFOUND_OK( - __wt_hs_cursor_position(session, insert_cursor, btree_id, key, WT_TS_NONE, NULL), true); + F_SET(hs_insert_cursor, WT_CURSTD_HS_READ_COMMITTED); + hs_insert_cursor->set_key(hs_insert_cursor, 4, btree_id, key, WT_TS_NONE, UINT64_MAX); + WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, hs_insert_cursor), true); if (ret == WT_NOTFOUND) { hs_insert_counter = 0; ret = 0; } else { - WT_ERR(insert_cursor->get_key( - insert_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_insert_counter)); + WT_ERR(hs_insert_cursor->get_key( + hs_insert_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_insert_counter)); + WT_ASSERT(session, hs_start_ts == WT_TS_NONE); /* - * Increment the hs counter that we'll be using to insert with to avoid overwriting the - * record we just found. + * Increment the history store counter that we'll be using to insert with to avoid + * overwriting the record we just found. */ hs_insert_counter++; } } /* Begin iterating over the range of entries we expect to replace. 
*/ - for (; ret == 0; ret = __wt_hs_cursor_next(session, hs_cursor)) { + for (; ret == 0; ret = hs_cursor->next(hs_cursor)) { WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter)); - /* - * If the btree id or key isn't ours, that means that we've hit the end of the key range and - * that there is no more history store content for this key. - */ - if (hs_btree_id != btree_id) - break; - WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp)); - if (cmp != 0) - break; - - /* - * If the stop time pair on the tombstone in the history store is already globally visible - * we can skip it. - */ - if (__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw)) { - WT_STAT_CONN_DATA_INCR(session, cursor_next_hs_tombstone); - continue; - } - - /* - * Once we reinsert the entry below, we're not allowed to fail otherwise we'll be leaving - * our history store an invalid state. Anything that can potentially fail, such as heap - * allocation of the tombstone that we'll be using to remove the old value, should be - * performed before reinsertion. - */ - WT_ERR(__wt_upd_alloc_tombstone(session, &upd, NULL)); if (reinsert) { WT_ERR(hs_cursor->get_value( hs_cursor, &hs_stop_durable_ts, &durable_timestamp, &hs_upd_type, &hs_value)); - tw.start_ts = tw.durable_start_ts = WT_TS_NONE; - tw.start_txn = hs_cbt->upd_value->tw.start_txn; + /* Reinsert entry with zero timestamp. 
*/ + hs_insert_tw.start_ts = hs_insert_tw.durable_start_ts = WT_TS_NONE; + hs_insert_tw.start_txn = hs_cbt->upd_value->tw.start_txn; + + hs_insert_tw.stop_ts = hs_insert_tw.durable_stop_ts = WT_TS_NONE; + hs_insert_tw.stop_txn = hs_cbt->upd_value->tw.stop_txn; - tw.stop_ts = tw.durable_stop_ts = WT_TS_NONE; - tw.stop_txn = hs_cbt->upd_value->tw.stop_txn; + hs_insert_cursor->set_key( + hs_insert_cursor, 4, btree_id, key, WT_TS_NONE, hs_insert_counter); + hs_insert_cursor->set_value(hs_insert_cursor, &hs_insert_tw, WT_TS_NONE, WT_TS_NONE, + (uint64_t)hs_upd_type, &hs_value); + WT_ERR(hs_insert_cursor->insert(hs_insert_cursor)); + WT_STAT_CONN_INCR(session, cache_hs_insert); + WT_STAT_DATA_INCR(session, cache_hs_insert); - /* Reinsert entry with zero timestamp. */ - while ( - (ret = __hs_insert_record_with_btree_int(session, insert_cursor, btree_id, &hs_key, - (uint8_t)hs_upd_type, &hs_value, &tw, hs_insert_counter)) == WT_RESTART) - ; hs_insert_counter++; - WT_ERR(ret); } + /* - * Since we're using internal functions to modify the row structure, we need to manually set - * the comparison to an exact match. - */ - hs_cbt->compare = 0; - /* - * Append a globally visible tombstone to the update list. This will effectively make the - * value invisible and the key itself will eventually get removed during reconciliation. + * Remove the key using history store cursor interface. * * If anything fails after this point and we're reinserting we need to panic as it will * leave our history store in an unexpected state with duplicate entries. 
*/ - upd->txnid = WT_TXN_NONE; - upd->start_ts = upd->durable_ts = WT_TS_NONE; - if ((ret = __wt_hs_modify(hs_cbt, upd)) != 0) { + if ((ret = hs_cursor->remove(hs_cursor)) != 0) { if (reinsert) WT_ERR_PANIC(session, WT_PANIC, "Failed to insert tombstone, history store now " @@ -1147,14 +922,13 @@ __hs_delete_key_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_ else WT_ERR(ret); } - upd = NULL; - WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate); + WT_STAT_CONN_INCR(session, cache_hs_key_truncate); + WT_STAT_DATA_INCR(session, cache_hs_key_truncate); } if (ret == WT_NOTFOUND) ret = 0; err: - __wt_free(session, upd); - if (insert_cursor != NULL) - insert_cursor->close(insert_cursor); + if (hs_insert_cursor != NULL) + hs_insert_cursor->close(hs_insert_cursor); return (ret); } diff --git a/src/third_party/wiredtiger/src/history/hs_verify.c b/src/third_party/wiredtiger/src/history/hs_verify.c index 2fc49daa643..3b73a587922 100644 --- a/src/third_party/wiredtiger/src/history/hs_verify.c +++ b/src/third_party/wiredtiger/src/history/hs_verify.c @@ -15,10 +15,9 @@ * store. 
*/ static int -__hs_verify_id(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *ds_cbt, uint32_t this_btree_id) +__hs_verify_id( + WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_CURSOR_BTREE *ds_cbt, uint32_t this_btree_id) { - WT_CURSOR *hs_cursor; - WT_CURSOR_BTREE *hs_cbt; WT_DECL_ITEM(prev_key); WT_DECL_RET; WT_ITEM key; @@ -27,12 +26,14 @@ __hs_verify_id(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *ds_cbt, uint32_t this_ uint32_t btree_id; int cmp; - hs_cursor = session->hs_cursor; - hs_cbt = (WT_CURSOR_BTREE *)hs_cursor; WT_CLEAR(key); WT_ERR(__wt_scr_alloc(session, 0, &prev_key)); +#ifndef HAVE_DIAGNOSTIC + WT_UNUSED(this_btree_id); +#endif + /* * If using standard cursors, we need to skip the non-globally visible tombstones in the data * table to verify the corresponding entries in the history store are too present in the data @@ -46,27 +47,18 @@ __hs_verify_id(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *ds_cbt, uint32_t this_ * verify. When we return after moving to a new key the caller is responsible for keeping the * cursor there or deciding they're done. */ - for (; ret == 0; ret = __wt_hs_cursor_next(session, hs_cursor)) { - WT_ERR(hs_cursor->get_key(hs_cursor, &btree_id, &key, &hs_start_ts, &hs_counter)); - + for (; ret == 0; ret = hs_cursor->next(hs_cursor)) { /* * If the btree id does not match the preview one, we're done. It is up to the caller to set * up for the next tree and call us, if they choose. For a full history store walk, the * caller sends in WT_BTREE_ID_INVALID and this function will set and use the first btree id * it finds and will return once it walks off that tree, leaving the cursor set to the first * key of that new tree. + * + * We should never cross the btree id, assert if we do so. */ - if (btree_id != this_btree_id) - break; - - /* - * If the stop time pair on the tombstone in the history store is already globally visible - * we can skip it. 
- */ - if (__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw)) { - WT_STAT_CONN_INCR(session, cursor_next_hs_tombstone); - continue; - } + WT_ERR(hs_cursor->get_key(hs_cursor, &btree_id, &key, &hs_start_ts, &hs_counter)); + WT_ASSERT(session, btree_id == this_btree_id); /* * If we have already checked against this key, keep going to the next key. We only need to @@ -114,22 +106,14 @@ __wt_hs_verify_one(WT_SESSION_IMPL *session) WT_CURSOR *hs_cursor; WT_CURSOR_BTREE ds_cbt; WT_DECL_RET; - WT_ITEM hs_key; uint32_t btree_id; - int exact; - hs_cursor = session->hs_cursor; + WT_RET(__wt_curhs_open(session, NULL, &hs_cursor)); + F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); btree_id = S2BT(session)->id; - /* - * We are required to position the history store cursor. Set it to the first record of our btree - * in the history store. - */ - memset(&hs_key, 0, sizeof(hs_key)); - hs_cursor->set_key(hs_cursor, btree_id, &hs_key, 0, 0); - ret = __wt_hs_cursor_search_near(session, hs_cursor, &exact); - if (ret == 0 && exact < 0) - ret = __wt_hs_cursor_next(session, hs_cursor); + hs_cursor->set_key(hs_cursor, 1, btree_id); + WT_ERR(__wt_curhs_search_near_after(session, hs_cursor)); /* * If we positioned the cursor there is something to verify. @@ -141,9 +125,12 @@ __wt_hs_verify_one(WT_SESSION_IMPL *session) if (ret == 0) { __wt_btcur_init(session, &ds_cbt); __wt_btcur_open(&ds_cbt); - ret = __hs_verify_id(session, &ds_cbt, btree_id); + ret = __hs_verify_id(session, hs_cursor, &ds_cbt, btree_id); WT_TRET(__wt_btcur_close(&ds_cbt, false)); } + +err: + WT_TRET(hs_cursor->close(hs_cursor)); return (ret == WT_NOTFOUND ? 
0 : ret); } @@ -173,10 +160,10 @@ __wt_hs_verify(WT_SESSION_IMPL *session) btree_id = WT_BTREE_ID_INVALID; uri_data = NULL; + WT_RET(__wt_curhs_open(session, NULL, &hs_cursor)); + F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); WT_ERR(__wt_scr_alloc(session, 0, &buf)); - WT_ERR(__wt_hs_cursor_open(session)); - hs_cursor = session->hs_cursor; - WT_ERR_NOTFOUND_OK(__wt_hs_cursor_next(session, hs_cursor), true); + WT_ERR_NOTFOUND_OK(hs_cursor->next(hs_cursor), true); stop = ret == WT_NOTFOUND ? true : false; ret = 0; @@ -198,17 +185,16 @@ __wt_hs_verify(WT_SESSION_IMPL *session) } WT_ERR(__wt_open_cursor(session, uri_data, NULL, NULL, &ds_cursor)); F_SET(ds_cursor, WT_CURSOR_RAW_OK); - ret = __hs_verify_id(session, (WT_CURSOR_BTREE *)ds_cursor, btree_id); + ret = __hs_verify_id(session, hs_cursor, (WT_CURSOR_BTREE *)ds_cursor, btree_id); if (ret == WT_NOTFOUND) stop = true; WT_TRET(ds_cursor->close(ds_cursor)); WT_ERR_NOTFOUND_OK(ret, false); } err: - WT_TRET(__wt_hs_cursor_close(session)); - __wt_scr_free(session, &buf); WT_ASSERT(session, key.mem == NULL && key.memsize == 0); __wt_free(session, uri_data); + WT_TRET(hs_cursor->close(hs_cursor)); return (ret); } diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h index a894b79eeef..2a2bd5aca2f 100644 --- a/src/third_party/wiredtiger/src/include/api.h +++ b/src/third_party/wiredtiger/src/include/api.h @@ -36,32 +36,32 @@ WT_DATA_HANDLE *__olddh = (s)->dhandle; \ const char *__oldname; \ /* If this isn't an API reentry, the name should be NULL and the counter should be 0. */ \ - WT_ASSERT(session, (s)->name != NULL || s->api_call_counter == 0); \ + WT_ASSERT(session, (s)->name != NULL || (s)->api_call_counter == 0); \ __oldname = (s)->name; \ - ++s->api_call_counter; \ + ++(s)->api_call_counter; \ (s)->dhandle = (dh); \ (s)->name = (s)->lastop = #h "." 
#n #define API_SESSION_POP(s) \ (s)->dhandle = __olddh; \ (s)->name = __oldname; \ - --s->api_call_counter + --(s)->api_call_counter /* Standard entry points to the API: declares/initializes local variables. */ -#define API_SESSION_INIT(s, h, n, dh) \ - WT_TRACK_OP_DECL; \ - API_SESSION_PUSH(s, h, n, dh); \ - /* \ - * No code before this line, otherwise error handling won't be \ - * correct. \ - */ \ - WT_ERR(WT_SESSION_CHECK_PANIC(s)); \ - WT_SINGLE_THREAD_CHECK_START(s); \ - WT_TRACK_OP_INIT(s); \ - if (s->api_call_counter == 1 && !F_ISSET(s, WT_SESSION_INTERNAL)) \ - __wt_op_timer_start(s); \ - /* Reset wait time if this isn't an API reentry. */ \ - if (s->api_call_counter == 1) \ - (s)->cache_wait_us = 0; \ +#define API_SESSION_INIT(s, h, n, dh) \ + WT_TRACK_OP_DECL; \ + API_SESSION_PUSH(s, h, n, dh); \ + /* \ + * No code before this line, otherwise error handling won't be \ + * correct. \ + */ \ + WT_ERR(WT_SESSION_CHECK_PANIC(s)); \ + WT_SINGLE_THREAD_CHECK_START(s); \ + WT_TRACK_OP_INIT(s); \ + if ((s)->api_call_counter == 1 && !F_ISSET(s, WT_SESSION_INTERNAL)) \ + __wt_op_timer_start(s); \ + /* Reset wait time if this isn't an API reentry. */ \ + if ((s)->api_call_counter == 1) \ + (s)->cache_wait_us = 0; \ __wt_verbose((s), WT_VERB_API, "%s", "CALL: " #h ":" #n) #define API_CALL_NOCONF(s, h, n, dh) \ @@ -75,21 +75,26 @@ if ((config) != NULL) \ WT_ERR(__wt_config_check((s), WT_CONFIG_REF(session, h##_##n), (config), 0)) -#define API_END(s, ret) \ - if ((s) != NULL) { \ - WT_TRACK_OP_END(s); \ - WT_SINGLE_THREAD_CHECK_STOP(s); \ - if ((ret) != 0) \ - __wt_txn_err_set(s, ret); \ - if (s->api_call_counter == 1 && !F_ISSET(session, WT_SESSION_INTERNAL)) \ - __wt_op_timer_stop(s); \ - /* \ - * No code after this line, otherwise error handling \ - * won't be correct. 
\ - */ \ - API_SESSION_POP(s); \ - } \ - } \ +#define API_END(s, ret) \ + if ((s) != NULL) { \ + WT_TRACK_OP_END(s); \ + WT_SINGLE_THREAD_CHECK_STOP(s); \ + if ((ret) != 0) \ + __wt_txn_err_set(s, ret); \ + if ((s)->api_call_counter == 1 && !F_ISSET(session, WT_SESSION_INTERNAL)) \ + __wt_op_timer_stop(s); \ + /* \ + * We should not leave any history store cursor open when return from an api call. \ + * However, we cannot do a stricter check before WT-7247 is resolved. \ + */ \ + WT_ASSERT(s, (s)->api_call_counter > 1 || (s)->hs_cursor_counter <= 2); \ + /* \ + * No code after this line, otherwise error handling \ + * won't be correct. \ + */ \ + API_SESSION_POP(s); \ + } \ + } \ while (0) /* An API call wrapped in a transaction if necessary. */ @@ -188,13 +193,15 @@ SESSION_API_PREPARE_CHECK(s, WT_SESSION, n); \ API_CALL_NOCONF(s, WT_SESSION, n, NULL) -#define SESSION_API_PREPARE_CHECK(s, h, n) \ - do { \ - int __prepare_ret; \ - API_SESSION_PUSH(s, WT_SESSION, n, NULL); \ - __prepare_ret = __wt_txn_context_prepare_check(s); \ - API_SESSION_POP(s); \ - WT_RET(__prepare_ret); \ +#define SESSION_API_PREPARE_CHECK(s, h, n) \ + do { \ + if ((s)->api_call_counter == 0) { \ + int __prepare_ret; \ + API_SESSION_PUSH(s, WT_SESSION, n, NULL); \ + __prepare_ret = __wt_txn_context_prepare_check(s); \ + API_SESSION_POP(s); \ + WT_RET(__prepare_ret); \ + } \ } while (0) #define SESSION_API_CALL(s, n, config, cfg) \ @@ -209,8 +216,7 @@ #define CURSOR_API_CALL(cur, s, n, bt) \ (s) = (WT_SESSION_IMPL *)(cur)->session; \ - if ((s)->hs_cursor == NULL) \ - SESSION_API_PREPARE_CHECK(s, WT_CURSOR, n); \ + SESSION_API_PREPARE_CHECK(s, WT_CURSOR, n); \ API_CALL_NOCONF(s, WT_CURSOR, n, ((bt) == NULL) ? 
NULL : ((WT_BTREE *)(bt))->dhandle); \ if (F_ISSET(cur, WT_CURSTD_CACHED)) \ WT_ERR(__wt_cursor_cached(cur)) diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h index d489f7aa638..82023a1573e 100644 --- a/src/third_party/wiredtiger/src/include/cursor.h +++ b/src/third_party/wiredtiger/src/include/cursor.h @@ -288,7 +288,7 @@ struct __wt_cursor_hs { WT_CURSOR *file_cursor; /* Queries of regular history store data */ WT_TIME_WINDOW time_window; uint32_t btree_id; - WT_ITEM datastore_key; + WT_ITEM *datastore_key; /* AUTOMATIC FLAG VALUE GENERATION START */ #define WT_HS_CUR_BTREE_ID_SET 0x1u diff --git a/src/third_party/wiredtiger/src/include/cursor_inline.h b/src/third_party/wiredtiger/src/include/cursor_inline.h index ef359942853..4c5889b6b9e 100644 --- a/src/third_party/wiredtiger/src/include/cursor_inline.h +++ b/src/third_party/wiredtiger/src/include/cursor_inline.h @@ -7,6 +7,32 @@ */ /* + * __wt_curhs_get_btree -- + * Convert a history store cursor to the underlying btree. + */ +static inline WT_BTREE * +__wt_curhs_get_btree(WT_CURSOR *cursor) +{ + WT_CURSOR_HS *hs_cursor; + hs_cursor = (WT_CURSOR_HS *)cursor; + + return (CUR2BT(hs_cursor->file_cursor)); +} + +/* + * __wt_curhs_get_cbt -- + * Convert a history store cursor to the underlying btree cursor. + */ +static inline WT_CURSOR_BTREE * +__wt_curhs_get_cbt(WT_CURSOR *cursor) +{ + WT_CURSOR_HS *hs_cursor; + hs_cursor = (WT_CURSOR_HS *)cursor; + + return ((WT_CURSOR_BTREE *)hs_cursor->file_cursor); +} + +/* * __cursor_set_recno -- * The cursor value in the interface has to track the value in the underlying cursor, update * them in parallel. 
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index dfefe57ba26..7fac6f5cbd4 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -495,8 +495,14 @@ extern int __wt_curfile_next_random(WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_curhs_cache(WT_SESSION_IMPL *session) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_curhs_open(WT_SESSION_IMPL *session, WT_CURSOR *owner, WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_curhs_search_near_after(WT_SESSION_IMPL *session, WT_CURSOR *cursor) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_curhs_search_near_before(WT_SESSION_IMPL *session, WT_CURSOR *cursor) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_curindex_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_INDEX *idx, @@ -596,7 +602,7 @@ extern int __wt_debug_addr_print(WT_SESSION_IMPL *session, const uint8_t *addr, WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_debug_cursor_page(void *cursor_arg, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE( (visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_debug_cursor_tree_hs(void *cursor_arg, const char *ofile) +extern int __wt_debug_cursor_tree_hs(void *session_arg, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_debug_disk(WT_SESSION_IMPL *session, const 
WT_PAGE_HEADER *dsk, const char *ofile) @@ -750,26 +756,11 @@ extern int __wt_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM * WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_hs_config(WT_SESSION_IMPL *session, const char **cfg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_hs_cursor_cache(WT_SESSION_IMPL *session) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_hs_cursor_close(WT_SESSION_IMPL *session) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_hs_cursor_next(WT_SESSION_IMPL *session, WT_CURSOR *cursor) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_hs_cursor_open(WT_SESSION_IMPL *session) +extern int __wt_hs_delete_key_from_ts(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, + uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts, bool reinsert) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_hs_cursor_position(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, - const WT_ITEM *key, wt_timestamp_t timestamp, WT_ITEM *user_srch_key) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_hs_cursor_prev(WT_SESSION_IMPL *session, WT_CURSOR *cursor) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_hs_cursor_search_near(WT_SESSION_IMPL *session, WT_CURSOR *cursor, int *exactp) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_hs_delete_key_from_ts( - WT_SESSION_IMPL *session, uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts, bool reinsert) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_hs_find_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_format, - uint64_t recno, WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *base_value_buf) +extern int __wt_hs_find_upd(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key, + const char *value_format, uint64_t recno, WT_UPDATE_VALUE *upd_value, WT_ITEM *base_value_buf) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_hs_get_btree(WT_SESSION_IMPL *session, WT_BTREE **hs_btreep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -1716,6 +1707,7 @@ extern void __wt_gen_next(WT_SESSION_IMPL *session, int which, uint64_t *genp); extern void __wt_gen_next_drain(WT_SESSION_IMPL *session, int which); extern void __wt_hazard_close(WT_SESSION_IMPL *session); extern void __wt_hs_close(WT_SESSION_IMPL *session); +extern void __wt_hs_upd_time_window(WT_CURSOR *hs_cursor, WT_TIME_WINDOW **twp); extern void __wt_huffman_close(WT_SESSION_IMPL *session, void *huffman_arg); extern void __wt_json_close(WT_SESSION_IMPL *session, WT_CURSOR *cursor); extern void __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn); @@ -1820,8 +1812,12 @@ extern void __wt_verbose_worker(WT_SESSION_IMPL *session, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format(printf, 2, 3))) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)); extern void __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l); extern void __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l); +static inline WT_BTREE *__wt_curhs_get_btree(WT_CURSOR *cursor) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline WT_CELL *__wt_cell_leaf_value_parse(WT_PAGE *page, WT_CELL *cell) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +static inline WT_CURSOR_BTREE *__wt_curhs_get_cbt(WT_CURSOR *cursor) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline WT_IKEY *__wt_ref_key_instantiated(WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline WT_VISIBLE_TYPE __wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd) diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h index f07a9bba91f..9d783cede10 100644 --- a/src/third_party/wiredtiger/src/include/session.h +++ b/src/third_party/wiredtiger/src/include/session.h @@ -92,7 +92,7 @@ struct __wt_session_impl { WT_COMPACT_STATE 
*compact; /* Compaction information */ enum { WT_COMPACT_NONE = 0, WT_COMPACT_RUNNING, WT_COMPACT_SUCCESS } compact_state; - WT_CURSOR *hs_cursor; /* History store table cursor */ + u_int hs_cursor_counter; /* Number of open history store cursors */ WT_CURSOR *meta_cursor; /* Metadata file */ void *meta_track; /* Metadata operation tracking */ diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index f7f6cb8232e..9131489d0fc 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -451,10 +451,8 @@ struct __wt_connection_stats { int64_t cursor_modify_bytes; int64_t cursor_modify_bytes_touch; int64_t cursor_next; - int64_t cursor_next_hs_tombstone_rts; int64_t cursor_restart; int64_t cursor_prev; - int64_t cursor_prev_hs_tombstone_rts; int64_t cursor_remove; int64_t cursor_remove_bytes; int64_t cursor_reserve; diff --git a/src/third_party/wiredtiger/src/include/txn_inline.h b/src/third_party/wiredtiger/src/include/txn_inline.h index b2365b1b2ac..6c89b2024bf 100644 --- a/src/third_party/wiredtiger/src/include/txn_inline.h +++ b/src/third_party/wiredtiger/src/include/txn_inline.h @@ -1044,8 +1044,8 @@ retry: /* If there's no visible update in the update chain or ondisk, check the history store file. 
*/ if (F_ISSET(S2C(session), WT_CONN_HS_OPEN) && !F_ISSET(session->dhandle, WT_DHANDLE_HS)) { __wt_timing_stress(session, WT_TIMING_STRESS_HS_SEARCH); - WT_RET(__wt_hs_find_upd(session, key, cbt->iface.value_format, recno, cbt->upd_value, false, - &cbt->upd_value->buf)); + WT_RET(__wt_hs_find_upd(session, S2BT(session)->id, key, cbt->iface.value_format, recno, + cbt->upd_value, &cbt->upd_value->buf)); } /* diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index d0584f49dc1..7878645d75e 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -5338,842 +5338,832 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_CURSOR_MODIFY_BYTES_TOUCH 1136 /*! cursor: cursor next calls */ #define WT_STAT_CONN_CURSOR_NEXT 1137 -/*! - * cursor: cursor next calls that skip due to a globally visible history - * store tombstone in rollback to stable - */ -#define WT_STAT_CONN_CURSOR_NEXT_HS_TOMBSTONE_RTS 1138 /*! cursor: cursor operation restarted */ -#define WT_STAT_CONN_CURSOR_RESTART 1139 +#define WT_STAT_CONN_CURSOR_RESTART 1138 /*! cursor: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1140 -/*! - * cursor: cursor prev calls that skip due to a globally visible history - * store tombstone in rollback to stable - */ -#define WT_STAT_CONN_CURSOR_PREV_HS_TOMBSTONE_RTS 1141 +#define WT_STAT_CONN_CURSOR_PREV 1139 /*! cursor: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1142 +#define WT_STAT_CONN_CURSOR_REMOVE 1140 /*! cursor: cursor remove key bytes removed */ -#define WT_STAT_CONN_CURSOR_REMOVE_BYTES 1143 +#define WT_STAT_CONN_CURSOR_REMOVE_BYTES 1141 /*! cursor: cursor reserve calls */ -#define WT_STAT_CONN_CURSOR_RESERVE 1144 +#define WT_STAT_CONN_CURSOR_RESERVE 1142 /*! 
cursor: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1145 +#define WT_STAT_CONN_CURSOR_RESET 1143 /*! cursor: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1146 +#define WT_STAT_CONN_CURSOR_SEARCH 1144 /*! cursor: cursor search history store calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_HS 1147 +#define WT_STAT_CONN_CURSOR_SEARCH_HS 1145 /*! cursor: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1148 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1146 /*! cursor: cursor sweep buckets */ -#define WT_STAT_CONN_CURSOR_SWEEP_BUCKETS 1149 +#define WT_STAT_CONN_CURSOR_SWEEP_BUCKETS 1147 /*! cursor: cursor sweep cursors closed */ -#define WT_STAT_CONN_CURSOR_SWEEP_CLOSED 1150 +#define WT_STAT_CONN_CURSOR_SWEEP_CLOSED 1148 /*! cursor: cursor sweep cursors examined */ -#define WT_STAT_CONN_CURSOR_SWEEP_EXAMINED 1151 +#define WT_STAT_CONN_CURSOR_SWEEP_EXAMINED 1149 /*! cursor: cursor sweeps */ -#define WT_STAT_CONN_CURSOR_SWEEP 1152 +#define WT_STAT_CONN_CURSOR_SWEEP 1150 /*! cursor: cursor truncate calls */ -#define WT_STAT_CONN_CURSOR_TRUNCATE 1153 +#define WT_STAT_CONN_CURSOR_TRUNCATE 1151 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1154 +#define WT_STAT_CONN_CURSOR_UPDATE 1152 /*! cursor: cursor update key and value bytes */ -#define WT_STAT_CONN_CURSOR_UPDATE_BYTES 1155 +#define WT_STAT_CONN_CURSOR_UPDATE_BYTES 1153 /*! cursor: cursor update value size change */ -#define WT_STAT_CONN_CURSOR_UPDATE_BYTES_CHANGED 1156 +#define WT_STAT_CONN_CURSOR_UPDATE_BYTES_CHANGED 1154 /*! cursor: cursors reused from cache */ -#define WT_STAT_CONN_CURSOR_REOPEN 1157 +#define WT_STAT_CONN_CURSOR_REOPEN 1155 /*! data-handle: connection data handle size */ -#define WT_STAT_CONN_DH_CONN_HANDLE_SIZE 1158 +#define WT_STAT_CONN_DH_CONN_HANDLE_SIZE 1156 /*! data-handle: connection data handles currently active */ -#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1159 +#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1157 /*! 
data-handle: connection sweep candidate became referenced */ -#define WT_STAT_CONN_DH_SWEEP_REF 1160 +#define WT_STAT_CONN_DH_SWEEP_REF 1158 /*! data-handle: connection sweep dhandles closed */ -#define WT_STAT_CONN_DH_SWEEP_CLOSE 1161 +#define WT_STAT_CONN_DH_SWEEP_CLOSE 1159 /*! data-handle: connection sweep dhandles removed from hash list */ -#define WT_STAT_CONN_DH_SWEEP_REMOVE 1162 +#define WT_STAT_CONN_DH_SWEEP_REMOVE 1160 /*! data-handle: connection sweep time-of-death sets */ -#define WT_STAT_CONN_DH_SWEEP_TOD 1163 +#define WT_STAT_CONN_DH_SWEEP_TOD 1161 /*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_SWEEPS 1164 +#define WT_STAT_CONN_DH_SWEEPS 1162 /*! * data-handle: connection sweeps skipped due to checkpoint gathering * handles */ -#define WT_STAT_CONN_DH_SWEEP_SKIP_CKPT 1165 +#define WT_STAT_CONN_DH_SWEEP_SKIP_CKPT 1163 /*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1166 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1164 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1167 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1165 /*! lock: checkpoint lock acquisitions */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1168 +#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1166 /*! lock: checkpoint lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1169 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1167 /*! lock: checkpoint lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1170 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1168 /*! lock: dhandle lock application thread time waiting (usecs) */ -#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_APPLICATION 1171 +#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_APPLICATION 1169 /*! lock: dhandle lock internal thread time waiting (usecs) */ -#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_INTERNAL 1172 +#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_INTERNAL 1170 /*! 
lock: dhandle read lock acquisitions */ -#define WT_STAT_CONN_LOCK_DHANDLE_READ_COUNT 1173 +#define WT_STAT_CONN_LOCK_DHANDLE_READ_COUNT 1171 /*! lock: dhandle write lock acquisitions */ -#define WT_STAT_CONN_LOCK_DHANDLE_WRITE_COUNT 1174 +#define WT_STAT_CONN_LOCK_DHANDLE_WRITE_COUNT 1172 /*! * lock: durable timestamp queue lock application thread time waiting * (usecs) */ -#define WT_STAT_CONN_LOCK_DURABLE_TIMESTAMP_WAIT_APPLICATION 1175 +#define WT_STAT_CONN_LOCK_DURABLE_TIMESTAMP_WAIT_APPLICATION 1173 /*! * lock: durable timestamp queue lock internal thread time waiting * (usecs) */ -#define WT_STAT_CONN_LOCK_DURABLE_TIMESTAMP_WAIT_INTERNAL 1176 +#define WT_STAT_CONN_LOCK_DURABLE_TIMESTAMP_WAIT_INTERNAL 1174 /*! lock: durable timestamp queue read lock acquisitions */ -#define WT_STAT_CONN_LOCK_DURABLE_TIMESTAMP_READ_COUNT 1177 +#define WT_STAT_CONN_LOCK_DURABLE_TIMESTAMP_READ_COUNT 1175 /*! lock: durable timestamp queue write lock acquisitions */ -#define WT_STAT_CONN_LOCK_DURABLE_TIMESTAMP_WRITE_COUNT 1178 +#define WT_STAT_CONN_LOCK_DURABLE_TIMESTAMP_WRITE_COUNT 1176 /*! lock: metadata lock acquisitions */ -#define WT_STAT_CONN_LOCK_METADATA_COUNT 1179 +#define WT_STAT_CONN_LOCK_METADATA_COUNT 1177 /*! lock: metadata lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1180 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1178 /*! lock: metadata lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1181 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1179 /*! * lock: read timestamp queue lock application thread time waiting * (usecs) */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_APPLICATION 1182 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_APPLICATION 1180 /*! lock: read timestamp queue lock internal thread time waiting (usecs) */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_INTERNAL 1183 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_INTERNAL 1181 /*! 
lock: read timestamp queue read lock acquisitions */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_READ_COUNT 1184 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_READ_COUNT 1182 /*! lock: read timestamp queue write lock acquisitions */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WRITE_COUNT 1185 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WRITE_COUNT 1183 /*! lock: schema lock acquisitions */ -#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1186 +#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1184 /*! lock: schema lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1187 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1185 /*! lock: schema lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1188 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1186 /*! * lock: table lock application thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1189 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1187 /*! * lock: table lock internal thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1190 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1188 /*! lock: table read lock acquisitions */ -#define WT_STAT_CONN_LOCK_TABLE_READ_COUNT 1191 +#define WT_STAT_CONN_LOCK_TABLE_READ_COUNT 1189 /*! lock: table write lock acquisitions */ -#define WT_STAT_CONN_LOCK_TABLE_WRITE_COUNT 1192 +#define WT_STAT_CONN_LOCK_TABLE_WRITE_COUNT 1190 /*! lock: txn global lock application thread time waiting (usecs) */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_APPLICATION 1193 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_APPLICATION 1191 /*! lock: txn global lock internal thread time waiting (usecs) */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_INTERNAL 1194 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_INTERNAL 1192 /*! 
lock: txn global read lock acquisitions */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_READ_COUNT 1195 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_READ_COUNT 1193 /*! lock: txn global write lock acquisitions */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WRITE_COUNT 1196 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WRITE_COUNT 1194 /*! log: busy returns attempting to switch slots */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1197 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1195 /*! log: force archive time sleeping (usecs) */ -#define WT_STAT_CONN_LOG_FORCE_ARCHIVE_SLEEP 1198 +#define WT_STAT_CONN_LOG_FORCE_ARCHIVE_SLEEP 1196 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1199 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1197 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1200 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1198 /*! log: log files manually zero-filled */ -#define WT_STAT_CONN_LOG_ZERO_FILLS 1201 +#define WT_STAT_CONN_LOG_ZERO_FILLS 1199 /*! log: log flush operations */ -#define WT_STAT_CONN_LOG_FLUSH 1202 +#define WT_STAT_CONN_LOG_FLUSH 1200 /*! log: log force write operations */ -#define WT_STAT_CONN_LOG_FORCE_WRITE 1203 +#define WT_STAT_CONN_LOG_FORCE_WRITE 1201 /*! log: log force write operations skipped */ -#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1204 +#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1202 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1205 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1203 /*! log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1206 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1204 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1207 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1205 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1208 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1206 /*! 
log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1209 +#define WT_STAT_CONN_LOG_SCANS 1207 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1210 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1208 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1211 +#define WT_STAT_CONN_LOG_WRITE_LSN 1209 /*! log: log server thread write LSN walk skipped */ -#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1212 +#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1210 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1213 +#define WT_STAT_CONN_LOG_SYNC 1211 /*! log: log sync time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DURATION 1214 +#define WT_STAT_CONN_LOG_SYNC_DURATION 1212 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1215 +#define WT_STAT_CONN_LOG_SYNC_DIR 1213 /*! log: log sync_dir time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1216 +#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1214 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1217 +#define WT_STAT_CONN_LOG_WRITES 1215 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1218 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1216 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1219 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1217 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1220 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1218 /*! log: pre-allocated log files not ready and missed */ -#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1221 +#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1219 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1222 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1220 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1223 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1221 /*! 
log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1224 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1222 /*! log: slot close lost race */ -#define WT_STAT_CONN_LOG_SLOT_CLOSE_RACE 1225 +#define WT_STAT_CONN_LOG_SLOT_CLOSE_RACE 1223 /*! log: slot close unbuffered waits */ -#define WT_STAT_CONN_LOG_SLOT_CLOSE_UNBUF 1226 +#define WT_STAT_CONN_LOG_SLOT_CLOSE_UNBUF 1224 /*! log: slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1227 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1225 /*! log: slot join atomic update races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1228 +#define WT_STAT_CONN_LOG_SLOT_RACES 1226 /*! log: slot join calls atomic updates raced */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_RACE 1229 +#define WT_STAT_CONN_LOG_SLOT_YIELD_RACE 1227 /*! log: slot join calls did not yield */ -#define WT_STAT_CONN_LOG_SLOT_IMMEDIATE 1230 +#define WT_STAT_CONN_LOG_SLOT_IMMEDIATE 1228 /*! log: slot join calls found active slot closed */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_CLOSE 1231 +#define WT_STAT_CONN_LOG_SLOT_YIELD_CLOSE 1229 /*! log: slot join calls slept */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_SLEEP 1232 +#define WT_STAT_CONN_LOG_SLOT_YIELD_SLEEP 1230 /*! log: slot join calls yielded */ -#define WT_STAT_CONN_LOG_SLOT_YIELD 1233 +#define WT_STAT_CONN_LOG_SLOT_YIELD 1231 /*! log: slot join found active slot closed */ -#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1234 +#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1232 /*! log: slot joins yield time (usecs) */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_DURATION 1235 +#define WT_STAT_CONN_LOG_SLOT_YIELD_DURATION 1233 /*! log: slot transitions unable to find free slot */ -#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1236 +#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1234 /*! log: slot unbuffered writes */ -#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1237 +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1235 /*! 
log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1238 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1236 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1239 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1237 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1240 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1238 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1241 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1239 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1242 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1240 /*! perf: file system read latency histogram (bucket 1) - 10-49ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT50 1243 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT50 1241 /*! perf: file system read latency histogram (bucket 2) - 50-99ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT100 1244 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT100 1242 /*! perf: file system read latency histogram (bucket 3) - 100-249ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT250 1245 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT250 1243 /*! perf: file system read latency histogram (bucket 4) - 250-499ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT500 1246 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT500 1244 /*! perf: file system read latency histogram (bucket 5) - 500-999ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT1000 1247 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT1000 1245 /*! perf: file system read latency histogram (bucket 6) - 1000ms+ */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_GT1000 1248 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_GT1000 1246 /*! perf: file system write latency histogram (bucket 1) - 10-49ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT50 1249 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT50 1247 /*! 
perf: file system write latency histogram (bucket 2) - 50-99ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT100 1250 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT100 1248 /*! perf: file system write latency histogram (bucket 3) - 100-249ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT250 1251 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT250 1249 /*! perf: file system write latency histogram (bucket 4) - 250-499ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT500 1252 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT500 1250 /*! perf: file system write latency histogram (bucket 5) - 500-999ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT1000 1253 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT1000 1251 /*! perf: file system write latency histogram (bucket 6) - 1000ms+ */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_GT1000 1254 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_GT1000 1252 /*! perf: operation read latency histogram (bucket 1) - 100-249us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT250 1255 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT250 1253 /*! perf: operation read latency histogram (bucket 2) - 250-499us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT500 1256 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT500 1254 /*! perf: operation read latency histogram (bucket 3) - 500-999us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT1000 1257 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT1000 1255 /*! perf: operation read latency histogram (bucket 4) - 1000-9999us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT10000 1258 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT10000 1256 /*! perf: operation read latency histogram (bucket 5) - 10000us+ */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_GT10000 1259 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_GT10000 1257 /*! 
perf: operation write latency histogram (bucket 1) - 100-249us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT250 1260 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT250 1258 /*! perf: operation write latency histogram (bucket 2) - 250-499us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT500 1261 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT500 1259 /*! perf: operation write latency histogram (bucket 3) - 500-999us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT1000 1262 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT1000 1260 /*! perf: operation write latency histogram (bucket 4) - 1000-9999us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT10000 1263 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT10000 1261 /*! perf: operation write latency histogram (bucket 5) - 10000us+ */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_GT10000 1264 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_GT10000 1262 /*! reconciliation: internal-page overflow keys */ -#define WT_STAT_CONN_REC_OVERFLOW_KEY_INTERNAL 1265 +#define WT_STAT_CONN_REC_OVERFLOW_KEY_INTERNAL 1263 /*! reconciliation: leaf-page overflow keys */ -#define WT_STAT_CONN_REC_OVERFLOW_KEY_LEAF 1266 +#define WT_STAT_CONN_REC_OVERFLOW_KEY_LEAF 1264 /*! reconciliation: maximum seconds spent in a reconciliation call */ -#define WT_STAT_CONN_REC_MAXIMUM_SECONDS 1267 +#define WT_STAT_CONN_REC_MAXIMUM_SECONDS 1265 /*! * reconciliation: page reconciliation calls that resulted in values with * prepared transaction metadata */ -#define WT_STAT_CONN_REC_PAGES_WITH_PREPARE 1268 +#define WT_STAT_CONN_REC_PAGES_WITH_PREPARE 1266 /*! * reconciliation: page reconciliation calls that resulted in values with * timestamps */ -#define WT_STAT_CONN_REC_PAGES_WITH_TS 1269 +#define WT_STAT_CONN_REC_PAGES_WITH_TS 1267 /*! 
* reconciliation: page reconciliation calls that resulted in values with * transaction ids */ -#define WT_STAT_CONN_REC_PAGES_WITH_TXN 1270 +#define WT_STAT_CONN_REC_PAGES_WITH_TXN 1268 /*! reconciliation: pages written including at least one prepare state */ -#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_PREPARED 1271 +#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_PREPARED 1269 /*! reconciliation: pages written including at least one start timestamp */ -#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_START_TS 1272 +#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_START_TS 1270 /*! reconciliation: records written including a prepare state */ -#define WT_STAT_CONN_REC_TIME_WINDOW_PREPARED 1273 +#define WT_STAT_CONN_REC_TIME_WINDOW_PREPARED 1271 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1274 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1272 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1275 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1273 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1276 +#define WT_STAT_CONN_SESSION_OPEN 1274 /*! session: session query timestamp calls */ -#define WT_STAT_CONN_SESSION_QUERY_TS 1277 +#define WT_STAT_CONN_SESSION_QUERY_TS 1275 /*! session: table alter failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1278 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1276 /*! session: table alter successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1279 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1277 /*! session: table alter unchanged and skipped */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1280 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1278 /*! session: table compact failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1281 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1279 /*! 
session: table compact successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1282 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1280 /*! session: table create failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1283 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1281 /*! session: table create successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1284 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1282 /*! session: table drop failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1285 +#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1283 /*! session: table drop successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1286 +#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1284 /*! session: table rename failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1287 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1285 /*! session: table rename successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1288 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1286 /*! session: table salvage failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1289 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1287 /*! session: table salvage successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1290 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1288 /*! session: table truncate failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1291 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1289 /*! session: table truncate successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1292 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1290 /*! session: table verify failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1293 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1291 /*! 
session: table verify successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1294 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1292 /*! thread-state: active filesystem fsync calls */ -#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1295 +#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1293 /*! thread-state: active filesystem read calls */ -#define WT_STAT_CONN_THREAD_READ_ACTIVE 1296 +#define WT_STAT_CONN_THREAD_READ_ACTIVE 1294 /*! thread-state: active filesystem write calls */ -#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1297 +#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1295 /*! thread-yield: application thread time evicting (usecs) */ -#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1298 +#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1296 /*! thread-yield: application thread time waiting for cache (usecs) */ -#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1299 +#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1297 /*! * thread-yield: connection close blocked waiting for transaction state * stabilization */ -#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1300 +#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1298 /*! thread-yield: connection close yielded for lsm manager shutdown */ -#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1301 +#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1299 /*! thread-yield: data handle lock yielded */ -#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1302 +#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1300 /*! * thread-yield: get reference for page index and slot time sleeping * (usecs) */ -#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1303 +#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1301 /*! thread-yield: log server sync yielded for log write */ -#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1304 +#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1302 /*! thread-yield: page access yielded due to prepare state change */ -#define WT_STAT_CONN_PREPARED_TRANSITION_BLOCKED_PAGE 1305 +#define WT_STAT_CONN_PREPARED_TRANSITION_BLOCKED_PAGE 1303 /*! 
thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1306 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1304 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1307 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1305 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1308 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1306 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1309 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1307 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1310 +#define WT_STAT_CONN_PAGE_SLEEP 1308 /*! * thread-yield: page delete rollback time sleeping for state change * (usecs) */ -#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1311 +#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1309 /*! thread-yield: page reconciliation yielded due to child modification */ -#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1312 +#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1310 /*! transaction: Number of prepared updates */ -#define WT_STAT_CONN_TXN_PREPARED_UPDATES_COUNT 1313 +#define WT_STAT_CONN_TXN_PREPARED_UPDATES_COUNT 1311 /*! transaction: durable timestamp queue entries walked */ -#define WT_STAT_CONN_TXN_DURABLE_QUEUE_WALKED 1314 +#define WT_STAT_CONN_TXN_DURABLE_QUEUE_WALKED 1312 /*! transaction: durable timestamp queue insert to empty */ -#define WT_STAT_CONN_TXN_DURABLE_QUEUE_EMPTY 1315 +#define WT_STAT_CONN_TXN_DURABLE_QUEUE_EMPTY 1313 /*! transaction: durable timestamp queue inserts to head */ -#define WT_STAT_CONN_TXN_DURABLE_QUEUE_HEAD 1316 +#define WT_STAT_CONN_TXN_DURABLE_QUEUE_HEAD 1314 /*! transaction: durable timestamp queue inserts total */ -#define WT_STAT_CONN_TXN_DURABLE_QUEUE_INSERTS 1317 +#define WT_STAT_CONN_TXN_DURABLE_QUEUE_INSERTS 1315 /*! 
transaction: durable timestamp queue length */ -#define WT_STAT_CONN_TXN_DURABLE_QUEUE_LEN 1318 +#define WT_STAT_CONN_TXN_DURABLE_QUEUE_LEN 1316 /*! transaction: prepared transactions */ -#define WT_STAT_CONN_TXN_PREPARE 1319 +#define WT_STAT_CONN_TXN_PREPARE 1317 /*! transaction: prepared transactions committed */ -#define WT_STAT_CONN_TXN_PREPARE_COMMIT 1320 +#define WT_STAT_CONN_TXN_PREPARE_COMMIT 1318 /*! transaction: prepared transactions currently active */ -#define WT_STAT_CONN_TXN_PREPARE_ACTIVE 1321 +#define WT_STAT_CONN_TXN_PREPARE_ACTIVE 1319 /*! transaction: prepared transactions rolled back */ -#define WT_STAT_CONN_TXN_PREPARE_ROLLBACK 1322 +#define WT_STAT_CONN_TXN_PREPARE_ROLLBACK 1320 /*! transaction: query timestamp calls */ -#define WT_STAT_CONN_TXN_QUERY_TS 1323 +#define WT_STAT_CONN_TXN_QUERY_TS 1321 /*! transaction: read timestamp queue entries walked */ -#define WT_STAT_CONN_TXN_READ_QUEUE_WALKED 1324 +#define WT_STAT_CONN_TXN_READ_QUEUE_WALKED 1322 /*! transaction: read timestamp queue insert to empty */ -#define WT_STAT_CONN_TXN_READ_QUEUE_EMPTY 1325 +#define WT_STAT_CONN_TXN_READ_QUEUE_EMPTY 1323 /*! transaction: read timestamp queue inserts to head */ -#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1326 +#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1324 /*! transaction: read timestamp queue inserts total */ -#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1327 +#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1325 /*! transaction: read timestamp queue length */ -#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1328 +#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1326 /*! transaction: rollback to stable calls */ -#define WT_STAT_CONN_TXN_RTS 1329 +#define WT_STAT_CONN_TXN_RTS 1327 /*! transaction: rollback to stable pages visited */ -#define WT_STAT_CONN_TXN_RTS_PAGES_VISITED 1330 +#define WT_STAT_CONN_TXN_RTS_PAGES_VISITED 1328 /*! 
transaction: rollback to stable tree walk skipping pages */ -#define WT_STAT_CONN_TXN_RTS_TREE_WALK_SKIP_PAGES 1331 +#define WT_STAT_CONN_TXN_RTS_TREE_WALK_SKIP_PAGES 1329 /*! transaction: rollback to stable updates aborted */ -#define WT_STAT_CONN_TXN_RTS_UPD_ABORTED 1332 +#define WT_STAT_CONN_TXN_RTS_UPD_ABORTED 1330 /*! transaction: set timestamp calls */ -#define WT_STAT_CONN_TXN_SET_TS 1333 +#define WT_STAT_CONN_TXN_SET_TS 1331 /*! transaction: set timestamp durable calls */ -#define WT_STAT_CONN_TXN_SET_TS_DURABLE 1334 +#define WT_STAT_CONN_TXN_SET_TS_DURABLE 1332 /*! transaction: set timestamp durable updates */ -#define WT_STAT_CONN_TXN_SET_TS_DURABLE_UPD 1335 +#define WT_STAT_CONN_TXN_SET_TS_DURABLE_UPD 1333 /*! transaction: set timestamp oldest calls */ -#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1336 +#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1334 /*! transaction: set timestamp oldest updates */ -#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1337 +#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1335 /*! transaction: set timestamp stable calls */ -#define WT_STAT_CONN_TXN_SET_TS_STABLE 1338 +#define WT_STAT_CONN_TXN_SET_TS_STABLE 1336 /*! transaction: set timestamp stable updates */ -#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1339 +#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1337 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1340 +#define WT_STAT_CONN_TXN_BEGIN 1338 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1341 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1339 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1342 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1340 /*! * transaction: transaction checkpoint history store file duration * (usecs) */ -#define WT_STAT_CONN_TXN_HS_CKPT_DURATION 1343 +#define WT_STAT_CONN_TXN_HS_CKPT_DURATION 1341 /*! 
transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1344 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1342 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1345 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1343 /*! * transaction: transaction checkpoint most recent duration for gathering * all handles (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_DURATION 1346 +#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_DURATION 1344 /*! * transaction: transaction checkpoint most recent duration for gathering * applied handles (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_DURATION_APPLY 1347 +#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_DURATION_APPLY 1345 /*! * transaction: transaction checkpoint most recent duration for gathering * skipped handles (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_DURATION_SKIP 1348 +#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_DURATION_SKIP 1346 /*! transaction: transaction checkpoint most recent handles applied */ -#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_APPLIED 1349 +#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_APPLIED 1347 /*! transaction: transaction checkpoint most recent handles skipped */ -#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_SKIPPED 1350 +#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_SKIPPED 1348 /*! transaction: transaction checkpoint most recent handles walked */ -#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_WALKED 1351 +#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_WALKED 1349 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1352 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1350 /*! transaction: transaction checkpoint prepare currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RUNNING 1353 +#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RUNNING 1351 /*! 
transaction: transaction checkpoint prepare max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MAX 1354 +#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MAX 1352 /*! transaction: transaction checkpoint prepare min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MIN 1355 +#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MIN 1353 /*! transaction: transaction checkpoint prepare most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RECENT 1356 +#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RECENT 1354 /*! transaction: transaction checkpoint prepare total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_TOTAL 1357 +#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_TOTAL 1355 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1358 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1356 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1359 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1357 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1360 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1358 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1361 +#define WT_STAT_CONN_TXN_CHECKPOINT 1359 /*! * transaction: transaction checkpoints skipped because database was * clean */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1362 +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1360 /*! transaction: transaction failures due to history store */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1363 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1361 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1364 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1362 /*! 
* transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1365 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1363 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1366 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1364 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1367 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1365 /*! transaction: transaction range of timestamps currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1368 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1366 /*! transaction: transaction range of timestamps pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1369 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1367 /*! * transaction: transaction range of timestamps pinned by the oldest * active read timestamp */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_READER 1370 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_READER 1368 /*! * transaction: transaction range of timestamps pinned by the oldest * timestamp */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1371 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1369 /*! transaction: transaction read timestamp of the oldest active reader */ -#define WT_STAT_CONN_TXN_TIMESTAMP_OLDEST_ACTIVE_READ 1372 +#define WT_STAT_CONN_TXN_TIMESTAMP_OLDEST_ACTIVE_READ 1370 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1373 +#define WT_STAT_CONN_TXN_SYNC 1371 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1374 +#define WT_STAT_CONN_TXN_COMMIT 1372 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1375 +#define WT_STAT_CONN_TXN_ROLLBACK 1373 /*! 
LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1376 +#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1374 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1377 +#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1375 /*! cache: bytes currently in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_INUSE 1378 +#define WT_STAT_CONN_CACHE_BYTES_INUSE 1376 /*! cache: bytes dirty in the cache cumulative */ -#define WT_STAT_CONN_CACHE_BYTES_DIRTY_TOTAL 1379 +#define WT_STAT_CONN_CACHE_BYTES_DIRTY_TOTAL 1377 /*! cache: bytes read into cache */ -#define WT_STAT_CONN_CACHE_BYTES_READ 1380 +#define WT_STAT_CONN_CACHE_BYTES_READ 1378 /*! cache: bytes written from cache */ -#define WT_STAT_CONN_CACHE_BYTES_WRITE 1381 +#define WT_STAT_CONN_CACHE_BYTES_WRITE 1379 /*! cache: checkpoint blocked page eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1382 +#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1380 /*! cache: eviction walk target pages histogram - 0-9 */ -#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT10 1383 +#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT10 1381 /*! cache: eviction walk target pages histogram - 10-31 */ -#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT32 1384 +#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT32 1382 /*! cache: eviction walk target pages histogram - 128 and higher */ -#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_GE128 1385 +#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_GE128 1383 /*! cache: eviction walk target pages histogram - 32-63 */ -#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT64 1386 +#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT64 1384 /*! cache: eviction walk target pages histogram - 64-128 */ -#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT128 1387 +#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT128 1385 /*! 
* cache: eviction walk target pages reduced due to history store cache * pressure */ -#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_REDUCED 1388 +#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_REDUCED 1386 /*! cache: eviction walks abandoned */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ABANDONED 1389 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ABANDONED 1387 /*! cache: eviction walks gave up because they restarted their walk twice */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STOPPED 1390 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STOPPED 1388 /*! * cache: eviction walks gave up because they saw too many pages and * found no candidates */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_NO_TARGETS 1391 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_NO_TARGETS 1389 /*! * cache: eviction walks gave up because they saw too many pages and * found too few candidates */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_RATIO 1392 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_RATIO 1390 /*! cache: eviction walks reached end of tree */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ENDED 1393 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ENDED 1391 /*! cache: eviction walks restarted */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK_RESTART 1394 +#define WT_STAT_CONN_CACHE_EVICTION_WALK_RESTART 1392 /*! cache: eviction walks started from root of tree */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK_FROM_ROOT 1395 +#define WT_STAT_CONN_CACHE_EVICTION_WALK_FROM_ROOT 1393 /*! cache: eviction walks started from saved location in tree */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK_SAVED_POS 1396 +#define WT_STAT_CONN_CACHE_EVICTION_WALK_SAVED_POS 1394 /*! cache: hazard pointer blocked page eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1397 +#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1395 /*! cache: history store table insert calls */ -#define WT_STAT_CONN_CACHE_HS_INSERT 1398 +#define WT_STAT_CONN_CACHE_HS_INSERT 1396 /*! 
cache: history store table insert calls that returned restart */ -#define WT_STAT_CONN_CACHE_HS_INSERT_RESTART 1399 +#define WT_STAT_CONN_CACHE_HS_INSERT_RESTART 1397 /*! * cache: history store table out-of-order resolved updates that lose * their durable timestamp */ -#define WT_STAT_CONN_CACHE_HS_ORDER_LOSE_DURABLE_TIMESTAMP 1400 +#define WT_STAT_CONN_CACHE_HS_ORDER_LOSE_DURABLE_TIMESTAMP 1398 /*! * cache: history store table out-of-order updates that were fixed up by * moving existing records */ -#define WT_STAT_CONN_CACHE_HS_ORDER_FIXUP_MOVE 1401 +#define WT_STAT_CONN_CACHE_HS_ORDER_FIXUP_MOVE 1399 /*! * cache: history store table out-of-order updates that were fixed up * during insertion */ -#define WT_STAT_CONN_CACHE_HS_ORDER_FIXUP_INSERT 1402 +#define WT_STAT_CONN_CACHE_HS_ORDER_FIXUP_INSERT 1400 /*! cache: history store table reads */ -#define WT_STAT_CONN_CACHE_HS_READ 1403 +#define WT_STAT_CONN_CACHE_HS_READ 1401 /*! cache: history store table reads missed */ -#define WT_STAT_CONN_CACHE_HS_READ_MISS 1404 +#define WT_STAT_CONN_CACHE_HS_READ_MISS 1402 /*! cache: history store table reads requiring squashed modifies */ -#define WT_STAT_CONN_CACHE_HS_READ_SQUASH 1405 +#define WT_STAT_CONN_CACHE_HS_READ_SQUASH 1403 /*! * cache: history store table truncation by rollback to stable to remove * an unstable update */ -#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE_RTS_UNSTABLE 1406 +#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE_RTS_UNSTABLE 1404 /*! * cache: history store table truncation by rollback to stable to remove * an update */ -#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE_RTS 1407 +#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE_RTS 1405 /*! cache: history store table truncation to remove an update */ -#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE 1408 +#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE 1406 /*! 
* cache: history store table truncation to remove range of updates due * to key being removed from the data page during reconciliation */ -#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE_ONPAGE_REMOVAL 1409 +#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE_ONPAGE_REMOVAL 1407 /*! * cache: history store table truncation to remove range of updates due * to non timestamped update on data page */ -#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE_NON_TS 1410 +#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE_NON_TS 1408 /*! cache: history store table writes requiring squashed modifies */ -#define WT_STAT_CONN_CACHE_HS_WRITE_SQUASH 1411 +#define WT_STAT_CONN_CACHE_HS_WRITE_SQUASH 1409 /*! cache: in-memory page passed criteria to be split */ -#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1412 +#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1410 /*! cache: in-memory page splits */ -#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1413 +#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1411 /*! cache: internal pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1414 +#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1412 /*! cache: internal pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1415 +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1413 /*! cache: leaf pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1416 +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1414 /*! cache: modified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1417 +#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1415 /*! cache: overflow pages read into cache */ -#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1418 +#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1416 /*! cache: page split during eviction deepened the tree */ -#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1419 +#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1417 /*! cache: page written requiring history store records */ -#define WT_STAT_CONN_CACHE_WRITE_HS 1420 +#define WT_STAT_CONN_CACHE_WRITE_HS 1418 /*! 
cache: pages read into cache */ -#define WT_STAT_CONN_CACHE_READ 1421 +#define WT_STAT_CONN_CACHE_READ 1419 /*! cache: pages read into cache after truncate */ -#define WT_STAT_CONN_CACHE_READ_DELETED 1422 +#define WT_STAT_CONN_CACHE_READ_DELETED 1420 /*! cache: pages read into cache after truncate in prepare state */ -#define WT_STAT_CONN_CACHE_READ_DELETED_PREPARED 1423 +#define WT_STAT_CONN_CACHE_READ_DELETED_PREPARED 1421 /*! cache: pages requested from the cache */ -#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1424 +#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1422 /*! cache: pages seen by eviction walk */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1425 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1423 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1426 +#define WT_STAT_CONN_CACHE_WRITE 1424 /*! cache: pages written requiring in-memory restoration */ -#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1427 +#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1425 /*! cache: tracked dirty bytes in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1428 +#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1426 /*! cache: unmodified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1429 +#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1427 /*! checkpoint-cleanup: pages added for eviction */ -#define WT_STAT_CONN_CC_PAGES_EVICT 1430 +#define WT_STAT_CONN_CC_PAGES_EVICT 1428 /*! checkpoint-cleanup: pages removed */ -#define WT_STAT_CONN_CC_PAGES_REMOVED 1431 +#define WT_STAT_CONN_CC_PAGES_REMOVED 1429 /*! checkpoint-cleanup: pages skipped during tree walk */ -#define WT_STAT_CONN_CC_PAGES_WALK_SKIPPED 1432 +#define WT_STAT_CONN_CC_PAGES_WALK_SKIPPED 1430 /*! checkpoint-cleanup: pages visited */ -#define WT_STAT_CONN_CC_PAGES_VISITED 1433 +#define WT_STAT_CONN_CC_PAGES_VISITED 1431 /*! cursor: Total number of entries skipped by cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT_SKIP_TOTAL 1434 +#define WT_STAT_CONN_CURSOR_NEXT_SKIP_TOTAL 1432 /*! 
cursor: Total number of entries skipped by cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV_SKIP_TOTAL 1435 +#define WT_STAT_CONN_CURSOR_PREV_SKIP_TOTAL 1433 /*! * cursor: Total number of entries skipped to position the history store * cursor */ -#define WT_STAT_CONN_CURSOR_SKIP_HS_CUR_POSITION 1436 +#define WT_STAT_CONN_CURSOR_SKIP_HS_CUR_POSITION 1434 /*! * cursor: cursor next calls that skip due to a globally visible history * store tombstone */ -#define WT_STAT_CONN_CURSOR_NEXT_HS_TOMBSTONE 1437 +#define WT_STAT_CONN_CURSOR_NEXT_HS_TOMBSTONE 1435 /*! * cursor: cursor next calls that skip greater than or equal to 100 * entries */ -#define WT_STAT_CONN_CURSOR_NEXT_SKIP_GE_100 1438 +#define WT_STAT_CONN_CURSOR_NEXT_SKIP_GE_100 1436 /*! cursor: cursor next calls that skip less than 100 entries */ -#define WT_STAT_CONN_CURSOR_NEXT_SKIP_LT_100 1439 +#define WT_STAT_CONN_CURSOR_NEXT_SKIP_LT_100 1437 /*! * cursor: cursor prev calls that skip due to a globally visible history * store tombstone */ -#define WT_STAT_CONN_CURSOR_PREV_HS_TOMBSTONE 1440 +#define WT_STAT_CONN_CURSOR_PREV_HS_TOMBSTONE 1438 /*! * cursor: cursor prev calls that skip greater than or equal to 100 * entries */ -#define WT_STAT_CONN_CURSOR_PREV_SKIP_GE_100 1441 +#define WT_STAT_CONN_CURSOR_PREV_SKIP_GE_100 1439 /*! cursor: cursor prev calls that skip less than 100 entries */ -#define WT_STAT_CONN_CURSOR_PREV_SKIP_LT_100 1442 +#define WT_STAT_CONN_CURSOR_PREV_SKIP_LT_100 1440 /*! cursor: open cursor count */ -#define WT_STAT_CONN_CURSOR_OPEN_COUNT 1443 +#define WT_STAT_CONN_CURSOR_OPEN_COUNT 1441 /*! reconciliation: approximate byte size of timestamps in pages written */ -#define WT_STAT_CONN_REC_TIME_WINDOW_BYTES_TS 1444 +#define WT_STAT_CONN_REC_TIME_WINDOW_BYTES_TS 1442 /*! * reconciliation: approximate byte size of transaction IDs in pages * written */ -#define WT_STAT_CONN_REC_TIME_WINDOW_BYTES_TXN 1445 +#define WT_STAT_CONN_REC_TIME_WINDOW_BYTES_TXN 1443 /*! 
reconciliation: fast-path pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1446 +#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1444 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1447 +#define WT_STAT_CONN_REC_PAGES 1445 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1448 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1446 /*! reconciliation: pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE 1449 +#define WT_STAT_CONN_REC_PAGE_DELETE 1447 /*! * reconciliation: pages written including an aggregated newest start * durable timestamp */ -#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_START_DURABLE_TS 1450 +#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_START_DURABLE_TS 1448 /*! * reconciliation: pages written including an aggregated newest stop * durable timestamp */ -#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_DURABLE_TS 1451 +#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_DURABLE_TS 1449 /*! * reconciliation: pages written including an aggregated newest stop * timestamp */ -#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_TS 1452 +#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_TS 1450 /*! * reconciliation: pages written including an aggregated newest stop * transaction ID */ -#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_TXN 1453 +#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_TXN 1451 /*! * reconciliation: pages written including an aggregated newest * transaction ID */ -#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_TXN 1454 +#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_TXN 1452 /*! * reconciliation: pages written including an aggregated oldest start * timestamp */ -#define WT_STAT_CONN_REC_TIME_AGGR_OLDEST_START_TS 1455 +#define WT_STAT_CONN_REC_TIME_AGGR_OLDEST_START_TS 1453 /*! reconciliation: pages written including an aggregated prepare */ -#define WT_STAT_CONN_REC_TIME_AGGR_PREPARED 1456 +#define WT_STAT_CONN_REC_TIME_AGGR_PREPARED 1454 /*! 
* reconciliation: pages written including at least one start durable * timestamp */ -#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_DURABLE_START_TS 1457 +#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_DURABLE_START_TS 1455 /*! * reconciliation: pages written including at least one start transaction * ID */ -#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_START_TXN 1458 +#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_START_TXN 1456 /*! * reconciliation: pages written including at least one stop durable * timestamp */ -#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_DURABLE_STOP_TS 1459 +#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_DURABLE_STOP_TS 1457 /*! reconciliation: pages written including at least one stop timestamp */ -#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_STOP_TS 1460 +#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_STOP_TS 1458 /*! * reconciliation: pages written including at least one stop transaction * ID */ -#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_STOP_TXN 1461 +#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_STOP_TXN 1459 /*! reconciliation: records written including a start durable timestamp */ -#define WT_STAT_CONN_REC_TIME_WINDOW_DURABLE_START_TS 1462 +#define WT_STAT_CONN_REC_TIME_WINDOW_DURABLE_START_TS 1460 /*! reconciliation: records written including a start timestamp */ -#define WT_STAT_CONN_REC_TIME_WINDOW_START_TS 1463 +#define WT_STAT_CONN_REC_TIME_WINDOW_START_TS 1461 /*! reconciliation: records written including a start transaction ID */ -#define WT_STAT_CONN_REC_TIME_WINDOW_START_TXN 1464 +#define WT_STAT_CONN_REC_TIME_WINDOW_START_TXN 1462 /*! reconciliation: records written including a stop durable timestamp */ -#define WT_STAT_CONN_REC_TIME_WINDOW_DURABLE_STOP_TS 1465 +#define WT_STAT_CONN_REC_TIME_WINDOW_DURABLE_STOP_TS 1463 /*! reconciliation: records written including a stop timestamp */ -#define WT_STAT_CONN_REC_TIME_WINDOW_STOP_TS 1466 +#define WT_STAT_CONN_REC_TIME_WINDOW_STOP_TS 1464 /*! 
reconciliation: records written including a stop transaction ID */ -#define WT_STAT_CONN_REC_TIME_WINDOW_STOP_TXN 1467 +#define WT_STAT_CONN_REC_TIME_WINDOW_STOP_TXN 1465 /*! session: flush_tier operation calls */ -#define WT_STAT_CONN_FLUSH_TIER 1468 +#define WT_STAT_CONN_FLUSH_TIER 1466 /*! session: tiered storage local retention time (secs) */ -#define WT_STAT_CONN_TIERED_RETENTION 1469 +#define WT_STAT_CONN_TIERED_RETENTION 1467 /*! transaction: race to read prepared update retry */ -#define WT_STAT_CONN_TXN_READ_RACE_PREPARE_UPDATE 1470 +#define WT_STAT_CONN_TXN_READ_RACE_PREPARE_UPDATE 1468 /*! - * transaction: rollback to stable hs records with stop timestamps older - * than newer records + * transaction: rollback to stable history store records with stop + * timestamps older than newer records */ -#define WT_STAT_CONN_TXN_RTS_HS_STOP_OLDER_THAN_NEWER_START 1471 +#define WT_STAT_CONN_TXN_RTS_HS_STOP_OLDER_THAN_NEWER_START 1469 /*! transaction: rollback to stable inconsistent checkpoint */ -#define WT_STAT_CONN_TXN_RTS_INCONSISTENT_CKPT 1472 +#define WT_STAT_CONN_TXN_RTS_INCONSISTENT_CKPT 1470 /*! transaction: rollback to stable keys removed */ -#define WT_STAT_CONN_TXN_RTS_KEYS_REMOVED 1473 +#define WT_STAT_CONN_TXN_RTS_KEYS_REMOVED 1471 /*! transaction: rollback to stable keys restored */ -#define WT_STAT_CONN_TXN_RTS_KEYS_RESTORED 1474 +#define WT_STAT_CONN_TXN_RTS_KEYS_RESTORED 1472 /*! transaction: rollback to stable restored tombstones from history store */ -#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_TOMBSTONES 1475 +#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_TOMBSTONES 1473 /*! transaction: rollback to stable restored updates from history store */ -#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_UPDATES 1476 +#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_UPDATES 1474 /*! transaction: rollback to stable sweeping history store keys */ -#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1477 +#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1475 /*! 
transaction: rollback to stable updates removed from history store */ -#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1478 +#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1476 /*! transaction: transaction checkpoints due to obsolete pages */ -#define WT_STAT_CONN_TXN_CHECKPOINT_OBSOLETE_APPLIED 1479 +#define WT_STAT_CONN_TXN_CHECKPOINT_OBSOLETE_APPLIED 1477 /*! transaction: update conflicts */ -#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1480 +#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1478 /*! * @} @@ -6771,8 +6761,8 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); /*! transaction: race to read prepared update retry */ #define WT_STAT_DSRC_TXN_READ_RACE_PREPARE_UPDATE 2200 /*! - * transaction: rollback to stable hs records with stop timestamps older - * than newer records + * transaction: rollback to stable history store records with stop + * timestamps older than newer records */ #define WT_STAT_DSRC_TXN_RTS_HS_STOP_OLDER_THAN_NEWER_START 2201 /*! transaction: rollback to stable inconsistent checkpoint */ diff --git a/src/third_party/wiredtiger/src/reconcile/rec_row.c b/src/third_party/wiredtiger/src/reconcile/rec_row.c index 26f360ca4c0..db4bb56c976 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_row.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_row.c @@ -703,6 +703,7 @@ __wt_rec_row_leaf( WT_BTREE *btree; WT_CELL *cell; WT_CELL_UNPACK_KV *kpack, _kpack, *vpack, _vpack; + WT_CURSOR *hs_cursor; WT_CURSOR_BTREE *cbt; WT_DECL_ITEM(tmpkey); WT_DECL_RET; @@ -720,6 +721,7 @@ __wt_rec_row_leaf( void *copy; btree = S2BT(session); + hs_cursor = NULL; page = pageref->page; slvg_skip = salvage == NULL ? 0 : salvage->skip; WT_TIME_WINDOW_INIT(&tw); @@ -914,11 +916,19 @@ __wt_rec_row_leaf( * ever need to blow away history store content, so we can skip this. 
*/ if (!F_ISSET(session, WT_SESSION_NO_DATA_HANDLES)) { - WT_ERR(__wt_hs_cursor_open(session)); + /* + * FIXME-WT-7053: we will hit the dhandle deadlock if we open multiple + * history store cursors in reconciliation. Once it is fixed, we can move + * the open and close of the history store cursor inside the delete key + * function. + */ + WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor)); WT_ERR(__wt_hs_delete_key_from_ts( - session, btree->id, tmpkey, WT_TS_NONE, false)); - WT_ERR(__wt_hs_cursor_close(session)); - WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_onpage_removal); + session, hs_cursor, btree->id, tmpkey, WT_TS_NONE, false)); + WT_ERR(hs_cursor->close(hs_cursor)); + hs_cursor = NULL; + WT_STAT_CONN_INCR(session, cache_hs_key_truncate_onpage_removal); + WT_STAT_DATA_INCR(session, cache_hs_key_truncate_onpage_removal); } } @@ -1034,6 +1044,8 @@ leaf_insert: ret = __wt_rec_split_finish(session, r); err: + if (hs_cursor != NULL) + WT_TRET(hs_cursor->close(hs_cursor)); __wt_scr_free(session, &tmpkey); return (ret); } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 6eec392127e..d0511459385 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -2289,8 +2289,6 @@ __rec_hs_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r) if (i == r->multi_next) return (0); - WT_RET(__wt_hs_cursor_open(session)); - for (multi = r->multi, i = 0; i < r->multi_next; ++multi, ++i) if (multi->supd != NULL) { WT_ERR(__wt_hs_insert_updates(session, r->page, multi)); @@ -2302,7 +2300,6 @@ __rec_hs_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r) } err: - WT_TRET(__wt_hs_cursor_close(session)); return (ret); } diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index cec727d8ecd..aec5e3c0f75 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c 
+++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -542,6 +542,9 @@ __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, co { WT_DECL_RET; + /* We should not open other cursors when there are open history store cursors in the session. */ + WT_ASSERT(session, strcmp(uri, WT_HS_URI) == 0 || session->hs_cursor_counter == 0); + /* We do not cache any subordinate tables/files cursors. */ if (owner == NULL) { if ((ret = __wt_cursor_cache_get(session, uri, NULL, cfg, cursorp)) == 0) diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 47605ab42f8..6723f064f17 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -208,7 +208,8 @@ static const char *const __stats_dsrc_desc[] = { "session: flush_tier operation calls", "session: tiered storage local retention time (secs)", "transaction: race to read prepared update retry", - "transaction: rollback to stable hs records with stop timestamps older than newer records", + "transaction: rollback to stable history store records with stop timestamps older than newer " + "records", "transaction: rollback to stable inconsistent checkpoint", "transaction: rollback to stable keys removed", "transaction: rollback to stable keys restored", @@ -1098,12 +1099,8 @@ static const char *const __stats_connection_desc[] = { "cursor: cursor modify key and value bytes affected", "cursor: cursor modify value bytes modified", "cursor: cursor next calls", - "cursor: cursor next calls that skip due to a globally visible history store tombstone in " - "rollback to stable", "cursor: cursor operation restarted", "cursor: cursor prev calls", - "cursor: cursor prev calls that skip due to a globally visible history store tombstone in " - "rollback to stable", "cursor: cursor remove calls", "cursor: cursor remove key bytes removed", "cursor: cursor reserve calls", @@ -1437,7 +1434,8 @@ static const char 
*const __stats_connection_desc[] = { "session: flush_tier operation calls", "session: tiered storage local retention time (secs)", "transaction: race to read prepared update retry", - "transaction: rollback to stable hs records with stop timestamps older than newer records", + "transaction: rollback to stable history store records with stop timestamps older than newer " + "records", "transaction: rollback to stable inconsistent checkpoint", "transaction: rollback to stable keys removed", "transaction: rollback to stable keys restored", @@ -1625,10 +1623,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->cursor_modify_bytes = 0; stats->cursor_modify_bytes_touch = 0; stats->cursor_next = 0; - stats->cursor_next_hs_tombstone_rts = 0; stats->cursor_restart = 0; stats->cursor_prev = 0; - stats->cursor_prev_hs_tombstone_rts = 0; stats->cursor_remove = 0; stats->cursor_remove_bytes = 0; stats->cursor_reserve = 0; @@ -2139,10 +2135,8 @@ __wt_stat_connection_aggregate(WT_CONNECTION_STATS **from, WT_CONNECTION_STATS * to->cursor_modify_bytes += WT_STAT_READ(from, cursor_modify_bytes); to->cursor_modify_bytes_touch += WT_STAT_READ(from, cursor_modify_bytes_touch); to->cursor_next += WT_STAT_READ(from, cursor_next); - to->cursor_next_hs_tombstone_rts += WT_STAT_READ(from, cursor_next_hs_tombstone_rts); to->cursor_restart += WT_STAT_READ(from, cursor_restart); to->cursor_prev += WT_STAT_READ(from, cursor_prev); - to->cursor_prev_hs_tombstone_rts += WT_STAT_READ(from, cursor_prev_hs_tombstone_rts); to->cursor_remove += WT_STAT_READ(from, cursor_remove); to->cursor_remove_bytes += WT_STAT_READ(from, cursor_remove_bytes); to->cursor_reserve += WT_STAT_READ(from, cursor_reserve); diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 0e087eb6029..bf85cf61443 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -721,76 +721,27 @@ __wt_txn_release(WT_SESSION_IMPL 
*session) * Append the update older than the prepared update to the update chain */ static int -__txn_append_hs_record(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_ITEM *key, WT_PAGE *page, +__txn_append_hs_record(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_PAGE *page, WT_UPDATE *chain, bool commit, WT_UPDATE **fix_updp, bool *upd_appended) { - WT_CURSOR_BTREE *hs_cbt; - WT_DECL_ITEM(hs_key); WT_DECL_ITEM(hs_value); WT_DECL_RET; + WT_TIME_WINDOW *hs_tw; WT_UPDATE *tombstone, *upd; - wt_timestamp_t durable_ts, hs_start_ts, hs_stop_durable_ts; + wt_timestamp_t durable_ts, hs_stop_durable_ts; size_t size, total_size; - uint64_t hs_counter, type_full; - uint32_t hs_btree_id; - int cmp; + uint64_t type_full; char ts_string[2][WT_TS_INT_STRING_SIZE]; WT_ASSERT(session, chain != NULL); - hs_cbt = (WT_CURSOR_BTREE *)hs_cursor; *fix_updp = NULL; *upd_appended = false; size = total_size = 0; tombstone = upd = NULL; - /* Allocate buffers for the data store and history store key. */ - WT_ERR(__wt_scr_alloc(session, 0, &hs_key)); WT_ERR(__wt_scr_alloc(session, 0, &hs_value)); - for (; ret == 0; ret = __wt_hs_cursor_prev(session, hs_cursor)) { - WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter)); - - /* Stop before crossing over to the next btree */ - if (hs_btree_id != S2BT(session)->id) { - ret = WT_NOTFOUND; - goto done; - } - - /* - * Keys are sorted in an order, skip the ones before the desired key, and bail out if we - * have crossed over the desired key and not found the record we are looking for. - */ - WT_ERR(__wt_compare(session, NULL, hs_key, key, &cmp)); - if (cmp != 0) { - ret = WT_NOTFOUND; - goto done; - } - - /* - * If the stop time pair on the tombstone in the history store is already globally visible - * we can skip it. 
- */ - if (!__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw)) - break; - else - WT_STAT_CONN_INCR(session, cursor_prev_hs_tombstone); - } - - /* We walked off the top of the history store. */ - if (ret == WT_NOTFOUND) - goto done; - WT_ERR(ret); - - /* - * As part of the history store search, we never get an exact match based on our search criteria - * as we always search for a maximum record for that key. Make sure that we set the comparison - * result as an exact match to remove this key as part of rollback to stable. In case if we - * don't mark the comparison result as same, later the __wt_row_modify function will not - * properly remove the update from history store. - */ - hs_cbt->compare = 0; - /* Get current value. */ WT_ERR(hs_cursor->get_value(hs_cursor, &hs_stop_durable_ts, &durable_ts, &type_full, hs_value)); @@ -799,15 +750,16 @@ __txn_append_hs_record(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_ITEM * /* * If the history update already has a stop time point and we are committing the prepared update - * there is no work to do. + * there is no work to do. This happens if a deleted key is reinserted by a prepared update. */ if (hs_stop_durable_ts != WT_TS_MAX && commit) goto done; + __wt_hs_upd_time_window(hs_cursor, &hs_tw); WT_ERR(__wt_upd_alloc(session, hs_value, WT_UPDATE_STANDARD, &upd, &size)); - upd->txnid = hs_cbt->upd_value->tw.start_txn; - upd->durable_ts = hs_cbt->upd_value->tw.durable_start_ts; - upd->start_ts = hs_cbt->upd_value->tw.start_ts; + upd->txnid = hs_tw->start_txn; + upd->durable_ts = hs_tw->durable_start_ts; + upd->start_ts = hs_tw->start_ts; *fix_updp = upd; /* @@ -831,11 +783,11 @@ __txn_append_hs_record(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_ITEM * /* If the history store record has a valid stop time point, append it. 
*/ if (hs_stop_durable_ts != WT_TS_MAX) { - WT_ASSERT(session, hs_cbt->upd_value->tw.stop_ts != WT_TS_MAX); + WT_ASSERT(session, hs_tw->stop_ts != WT_TS_MAX); WT_ERR(__wt_upd_alloc(session, NULL, WT_UPDATE_TOMBSTONE, &tombstone, &size)); - tombstone->durable_ts = hs_cbt->upd_value->tw.durable_stop_ts; - tombstone->start_ts = hs_cbt->upd_value->tw.stop_ts; - tombstone->txnid = hs_cbt->upd_value->tw.stop_txn; + tombstone->durable_ts = hs_tw->durable_stop_ts; + tombstone->start_ts = hs_tw->stop_ts; + tombstone->txnid = hs_tw->stop_txn; tombstone->next = upd; /* * Set the flag to indicate that this update has been restored from history store for the @@ -873,7 +825,6 @@ err: __wt_free_update_list(session, &upd); } done: - __wt_scr_free(session, &hs_key); __wt_scr_free(session, &hs_value); return (ret); } @@ -958,15 +909,18 @@ static int __txn_fixup_prepared_update( WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_UPDATE *fix_upd, bool commit) { - WT_CURSOR_BTREE *hs_cbt; WT_DECL_RET; WT_ITEM hs_value; + WT_TIME_WINDOW tw; WT_TXN *txn; - WT_UPDATE *hs_upd; uint32_t txn_flags; +#ifdef HAVE_DIAGNOSTIC + uint64_t hs_upd_type; + wt_timestamp_t hs_durable_ts, hs_stop_durable_ts; +#endif - hs_cbt = (WT_CURSOR_BTREE *)hs_cursor; txn = session->txn; + WT_TIME_WINDOW_INIT(&tw); /* * Transaction error and prepare are cleared temporarily as cursor functions are not allowed @@ -982,33 +936,34 @@ __txn_fixup_prepared_update( * If the history update already has a stop time point and we are committing the prepared update * there is no work to do. 
*/ - WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL)); if (commit) { - hs_upd->start_ts = txn->commit_timestamp; - hs_upd->durable_ts = txn->durable_timestamp; - hs_upd->txnid = txn->id; + tw.stop_ts = txn->commit_timestamp; + tw.durable_stop_ts = txn->durable_timestamp; + tw.stop_txn = txn->id; + WT_TIME_WINDOW_SET_START(&tw, fix_upd); - hs_value.data = fix_upd->data; - hs_value.size = fix_upd->size; +#ifdef HAVE_DIAGNOSTIC + /* Retrieve the existing update value and stop timestamp. */ + WT_ERR(hs_cursor->get_value( + hs_cursor, &hs_stop_durable_ts, &hs_durable_ts, &hs_upd_type, &hs_value)); + WT_ASSERT(session, hs_stop_durable_ts == WT_TS_MAX); + WT_ASSERT(session, (uint8_t)hs_upd_type == WT_UPDATE_STANDARD); +#endif /* * We need to update the stop durable timestamp stored in the history store value. * * Pack the value using cursor api. */ - hs_cursor->set_value(hs_cursor, txn->durable_timestamp, fix_upd->durable_ts, - (uint64_t)fix_upd->type, &hs_value); - WT_ERR(__wt_upd_alloc(session, &hs_cursor->value, WT_UPDATE_STANDARD, &hs_upd->next, NULL)); - hs_upd->next->durable_ts = fix_upd->durable_ts; - hs_upd->next->start_ts = fix_upd->start_ts; - hs_upd->next->txnid = fix_upd->txnid; + hs_value.data = fix_upd->data; + hs_value.size = fix_upd->size; + hs_cursor->set_value(hs_cursor, &tw, tw.durable_stop_ts, tw.durable_start_ts, + (uint64_t)WT_UPDATE_STANDARD, &hs_value); + WT_ERR(hs_cursor->update(hs_cursor)); + } else { + WT_ERR(hs_cursor->remove(hs_cursor)); } - WT_ERR(__wt_hs_modify(hs_cbt, hs_upd)); - - if (0) { err: - __wt_free_update_list(session, &hs_upd); - } F_SET(txn, txn_flags); return (ret); @@ -1128,22 +1083,15 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit, cbt = (WT_CURSOR_BTREE *)(*cursorp); hs_btree_id = S2BT(session)->id; /* Open a history store table cursor. 
*/ - WT_ERR(__wt_hs_cursor_open(session)); - hs_cursor = session->hs_cursor; + WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor)); + F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); /* * Scan the history store for the given btree and key with maximum start timestamp to let * the search point to the last version of the key. */ - WT_ERR_NOTFOUND_OK(__wt_hs_cursor_position( - session, hs_cursor, hs_btree_id, &op->u.op_row.key, WT_TS_MAX, NULL), - true); - - if (ret == 0) - /* Not found if we cross the tree or key boundary. */ - WT_ERR_NOTFOUND_OK(__txn_append_hs_record(session, hs_cursor, &op->u.op_row.key, - cbt->ref->page, upd, commit, &fix_upd, &upd_appended), - true); + hs_cursor->set_key(hs_cursor, 4, hs_btree_id, &op->u.op_row.key, WT_TS_MAX, UINT64_MAX); + WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, hs_cursor), true); if (ret == WT_NOTFOUND && !commit) { /* * Allocate a tombstone and prepend it to the row so when we reconcile the update chain @@ -1156,7 +1104,10 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit, __wt_row_modify(cbt, &cbt->iface.key, NULL, tombstone, WT_UPDATE_INVALID, false)); WT_ERR(ret); tombstone = NULL; - } else + } else if (ret == 0) + WT_ERR(__txn_append_hs_record( + session, hs_cursor, cbt->ref->page, upd, commit, &fix_upd, &upd_appended)); + else ret = 0; } @@ -1212,15 +1163,14 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit, * Fix the history store contents if they exist, when there are no more updates in the update * list. Only in eviction, it is possible to write an unfinished history store update when the * prepared updates are written to the data store. When the page is read back into memory, there - * will be only one uncommitted prepared update. There can be a false positive of fixing history - * store when handling prepared inserts, but it doesn't cost much. + * will be only one uncommitted prepared update. 
*/ if (fix_upd != NULL) WT_ERR(__txn_fixup_prepared_update(session, hs_cursor, fix_upd, commit)); err: if (hs_cursor != NULL) - WT_TRET(__wt_hs_cursor_close(session)); + WT_TRET(hs_cursor->close(hs_cursor)); if (!upd_appended) __wt_free(session, fix_upd); __wt_free(session, tombstone); diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index 14383dc3017..2ece30a9f0d 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -276,18 +276,17 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW { WT_CELL_UNPACK_KV *unpack, _unpack; WT_CURSOR *hs_cursor; - WT_CURSOR_BTREE *cbt; WT_DECL_ITEM(hs_key); WT_DECL_ITEM(hs_value); WT_DECL_ITEM(key); WT_DECL_RET; WT_ITEM full_value; - WT_UPDATE *hs_upd, *tombstone, *upd; + WT_TIME_WINDOW *hs_tw; + WT_UPDATE *tombstone, *upd; wt_timestamp_t hs_durable_ts, hs_start_ts, hs_stop_durable_ts, newer_hs_durable_ts; uint64_t hs_counter, type_full; uint32_t hs_btree_id; uint8_t type; - int cmp; char ts_string[4][WT_TS_INT_STRING_SIZE]; bool valid_update_found; #ifdef HAVE_DIAGNOSTIC @@ -295,7 +294,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW #endif hs_cursor = NULL; - hs_upd = tombstone = upd = NULL; + tombstone = upd = NULL; hs_durable_ts = hs_start_ts = hs_stop_durable_ts = WT_TS_NONE; hs_btree_id = S2BT(session)->id; WT_CLEAR(full_value); @@ -319,9 +318,13 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW newer_hs_durable_ts = unpack->tw.durable_start_ts; /* Open a history store table cursor. 
*/ - WT_ERR(__wt_hs_cursor_open(session)); - hs_cursor = session->hs_cursor; - cbt = (WT_CURSOR_BTREE *)hs_cursor; + WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor)); + /* + * Rollback-to-stable operates exclusively (i.e., it is the only active operation in the system) + * outside the constraints of transactions. Therefore, there is no need for snapshot based + * visibility checks. + */ + F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); /* * Scan the history store for the given btree and key with maximum start timestamp to let the @@ -330,40 +333,11 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW * into data store and removed from history store. If none of the history store records satisfy * the given timestamp, the key is removed from data store. */ - ret = __wt_hs_cursor_position(session, hs_cursor, hs_btree_id, key, WT_TS_MAX, NULL); - for (; ret == 0; ret = __wt_hs_cursor_prev(session, hs_cursor)) { + hs_cursor->set_key(hs_cursor, 4, hs_btree_id, key, WT_TS_MAX, UINT64_MAX); + ret = __wt_curhs_search_near_before(session, hs_cursor); + for (; ret == 0; ret = hs_cursor->prev(hs_cursor)) { WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter)); - /* Stop before crossing over to the next btree */ - if (hs_btree_id != S2BT(session)->id) - break; - - /* - * Keys are sorted in an order, skip the ones before the desired key, and bail out if we - * have crossed over the desired key and not found the record we are looking for. - */ - WT_ERR(__wt_compare(session, NULL, hs_key, key, &cmp)); - if (cmp != 0) - break; - - /* - * If the stop time pair on the tombstone in the history store is already globally visible - * we can skip it. 
- */ - if (__wt_txn_tw_stop_visible_all(session, &cbt->upd_value->tw)) { - WT_STAT_CONN_INCR(session, cursor_prev_hs_tombstone_rts); - continue; - } - - /* - * As part of the history store search, we never get an exact match based on our search - * criteria as we always search for a maximum record for that key. Make sure that we set the - * comparison result as an exact match to remove this key as part of rollback to stable. In - * case if we don't mark the comparison result as same, later the __wt_row_modify function - * will not properly remove the update from history store. - */ - cbt->compare = 0; - /* Get current value and convert to full update if it is a modify. */ WT_ERR(hs_cursor->get_value( hs_cursor, &hs_stop_durable_ts, &hs_durable_ts, &type_full, hs_value)); @@ -416,16 +390,17 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW * selected update to the update chain. Also it confirms that history store doesn't contains * any newer version than the current version for the key. */ + /* Retrieve the time window from the history cursor. */ + __wt_hs_upd_time_window(hs_cursor, &hs_tw); if (!replace && (hs_stop_durable_ts != WT_TS_NONE || - !__rollback_check_if_txnid_non_committed(session, cbt->upd_value->tw.stop_txn)) && + !__rollback_check_if_txnid_non_committed(session, hs_tw->stop_txn)) && (hs_stop_durable_ts <= rollback_timestamp)) { __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), "history store update valid with stop timestamp: %s, stable timestamp: %s, txnid: " "%" PRIu64 " and type: %" PRIu8, __wt_timestamp_to_string(hs_stop_durable_ts, ts_string[0]), - __wt_timestamp_to_string(rollback_timestamp, ts_string[1]), - cbt->upd_value->tw.stop_txn, type); + __wt_timestamp_to_string(rollback_timestamp, ts_string[1]), hs_tw->stop_txn, type); break; } @@ -434,7 +409,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW * transaction id. 
*/ if ((hs_durable_ts != WT_TS_NONE || - !__rollback_check_if_txnid_non_committed(session, cbt->upd_value->tw.start_txn)) && + !__rollback_check_if_txnid_non_committed(session, hs_tw->start_txn)) && (hs_durable_ts <= rollback_timestamp)) { __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), "history store update valid with start timestamp: %s, durable timestamp: %s, stop " @@ -442,8 +417,8 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW __wt_timestamp_to_string(hs_start_ts, ts_string[0]), __wt_timestamp_to_string(hs_durable_ts, ts_string[1]), __wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]), - __wt_timestamp_to_string(rollback_timestamp, ts_string[3]), - cbt->upd_value->tw.start_txn, type); + __wt_timestamp_to_string(rollback_timestamp, ts_string[3]), hs_tw->start_txn, type); + WT_ASSERT(session, hs_tw->start_ts < unpack->tw.start_ts); valid_update_found = true; break; } @@ -455,8 +430,8 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW __wt_timestamp_to_string(hs_start_ts, ts_string[0]), __wt_timestamp_to_string(hs_durable_ts, ts_string[1]), __wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]), - __wt_timestamp_to_string(rollback_timestamp, ts_string[3]), cbt->upd_value->tw.start_txn, - cbt->upd_value->tw.stop_txn, type); + __wt_timestamp_to_string(rollback_timestamp, ts_string[3]), hs_tw->start_txn, + hs_tw->stop_txn, type); /* * Start time point of the current record may be used as stop time point of the previous @@ -468,8 +443,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW first_record = false; #endif - WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL)); - WT_ERR(__wt_hs_modify(cbt, hs_upd)); + WT_ERR(hs_cursor->remove(hs_cursor)); WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed); WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts_unstable); } @@ -480,9 +454,10 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, 
WT_PAGE *page, WT_ROW * list. Otherwise remove the key by adding a tombstone. */ if (valid_update_found) { + /* Retrieve the time window from the history cursor. */ + __wt_hs_upd_time_window(hs_cursor, &hs_tw); WT_ASSERT(session, - cbt->upd_value->tw.start_ts < unpack->tw.start_ts || - cbt->upd_value->tw.start_txn < unpack->tw.start_txn); + hs_tw->start_ts < unpack->tw.start_ts || hs_tw->start_txn < unpack->tw.start_txn); WT_ERR(__wt_upd_alloc(session, &full_value, WT_UPDATE_STANDARD, &upd, NULL)); /* @@ -494,9 +469,9 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW if (F_ISSET(S2C(session), WT_CONN_RECOVERING)) upd->txnid = WT_TXN_NONE; else - upd->txnid = cbt->upd_value->tw.start_txn; - upd->durable_ts = cbt->upd_value->tw.durable_start_ts; - upd->start_ts = cbt->upd_value->tw.start_ts; + upd->txnid = hs_tw->start_txn; + upd->durable_ts = hs_tw->durable_start_ts; + upd->start_ts = hs_tw->start_ts; __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), "update restored from history store txnid: %" PRIu64 ", start_ts: %s and durable_ts: %s", @@ -527,9 +502,9 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW if (F_ISSET(S2C(session), WT_CONN_RECOVERING)) tombstone->txnid = WT_TXN_NONE; else - tombstone->txnid = cbt->upd_value->tw.stop_txn; - tombstone->durable_ts = cbt->upd_value->tw.durable_stop_ts; - tombstone->start_ts = cbt->upd_value->tw.stop_ts; + tombstone->txnid = hs_tw->stop_txn; + tombstone->durable_ts = hs_tw->durable_stop_ts; + tombstone->start_ts = hs_tw->stop_ts; __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), "tombstone restored from history store txnid: %" PRIu64 ", start_ts: %s, durable_ts: %s", @@ -557,8 +532,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW /* Finally remove that update from history store. 
*/ if (valid_update_found) { - WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL)); - WT_ERR(__wt_hs_modify(cbt, hs_upd)); + WT_ERR(hs_cursor->remove(hs_cursor)); WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed); WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts); } @@ -567,13 +541,13 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW err: WT_ASSERT(session, tombstone == NULL || upd == tombstone); __wt_free_update_list(session, &upd); - __wt_free_update_list(session, &hs_upd); } __wt_scr_free(session, &hs_key); __wt_scr_free(session, &hs_value); __wt_scr_free(session, &key); __wt_buf_free(session, &full_value); - WT_TRET(__wt_hs_cursor_close(session)); + if (hs_cursor != NULL) + WT_TRET(hs_cursor->close(hs_cursor)); return (ret); } @@ -1305,74 +1279,44 @@ static int __rollback_to_stable_btree_hs_truncate(WT_SESSION_IMPL *session, uint32_t btree_id) { WT_CURSOR *hs_cursor; - WT_CURSOR_BTREE *cbt; WT_DECL_ITEM(hs_key); WT_DECL_RET; - WT_ITEM key; - WT_UPDATE *hs_upd; wt_timestamp_t hs_start_ts; uint64_t hs_counter; uint32_t hs_btree_id; - int exact; char ts_string[WT_TS_INT_STRING_SIZE]; hs_cursor = NULL; - WT_CLEAR(key); - hs_upd = NULL; WT_RET(__wt_scr_alloc(session, 0, &hs_key)); /* Open a history store table cursor. */ - WT_ERR(__wt_hs_cursor_open(session)); - hs_cursor = session->hs_cursor; - cbt = (WT_CURSOR_BTREE *)hs_cursor; + WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor)); /* Walk the history store for the given btree. */ - hs_cursor->set_key(hs_cursor, btree_id, &key, WT_TS_NONE, 0); - ret = __wt_hs_cursor_search_near(session, hs_cursor, &exact); - - /* - * The search should always end up pointing to the start of the required btree or end of the - * previous btree on success. Move the cursor based on the result. 
- */ - WT_ASSERT(session, (ret != 0 || exact != 0)); - if (ret == 0 && exact < 0) - ret = __wt_hs_cursor_next(session, hs_cursor); + hs_cursor->set_key(hs_cursor, 1, btree_id); + ret = __wt_curhs_search_near_after(session, hs_cursor); - for (; ret == 0; ret = __wt_hs_cursor_next(session, hs_cursor)) { + for (; ret == 0; ret = hs_cursor->next(hs_cursor)) { WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter)); - /* Stop crossing into the next btree boundary. */ - if (btree_id != hs_btree_id) - break; - - /* - * If the stop time pair on the tombstone in the history store is already globally visible - * we can skip it. - */ - if (__wt_txn_tw_stop_visible_all(session, &cbt->upd_value->tw)) { - WT_STAT_CONN_INCR(session, cursor_next_hs_tombstone_rts); - continue; - } + /* We shouldn't cross the btree search space. */ + WT_ASSERT(session, btree_id == hs_btree_id); - /* Set this comparison as exact match of the search for later use. */ - cbt->compare = 0; __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), "rollback to stable history store cleanup of update with start timestamp: %s", __wt_timestamp_to_string(hs_start_ts, ts_string)); - WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL)); - WT_ERR(__wt_hs_modify(cbt, hs_upd)); + WT_ERR(hs_cursor->remove(hs_cursor)); WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed); WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts); - hs_upd = NULL; } WT_ERR_NOTFOUND_OK(ret, false); err: __wt_scr_free(session, &hs_key); - __wt_free(session, hs_upd); - WT_TRET(__wt_hs_cursor_close(session)); + if (hs_cursor != NULL) + WT_TRET(hs_cursor->close(hs_cursor)); return (ret); } diff --git a/src/third_party/wiredtiger/test/format/t.c b/src/third_party/wiredtiger/test/format/t.c index 44d14821a92..4cd239a528a 100644 --- a/src/third_party/wiredtiger/test/format/t.c +++ b/src/third_party/wiredtiger/test/format/t.c @@ -378,7 +378,7 @@ format_die(void) 
testutil_check(__wt_debug_cursor_page(g.page_dump_cursor, g.home_pagedump)); fprintf(stderr, "snapshot-isolation error: Dumping HS to %s\n", g.home_hsdump); #if WIREDTIGER_VERSION_MAJOR >= 10 - testutil_check(__wt_debug_cursor_tree_hs(g.page_dump_cursor, g.home_hsdump)); + testutil_check(__wt_debug_cursor_tree_hs(CUR2S(g.page_dump_cursor), g.home_hsdump)); #endif } #endif diff --git a/src/third_party/wiredtiger/test/suite/test_cursor13.py b/src/third_party/wiredtiger/test/suite/test_cursor13.py index c1d7254c016..75ac0b9a890 100755 --- a/src/third_party/wiredtiger/test/suite/test_cursor13.py +++ b/src/third_party/wiredtiger/test/suite/test_cursor13.py @@ -71,7 +71,7 @@ class test_cursor13_base(wttest.WiredTigerTestCase): if hs_before[0] == hs_after[0] and hs_before[1] == hs_after[1]: break - # Fail if we haven't been able to get stable hs stats after too many attempts. + # Fail if we haven't been able to get stable history store stats after too many attempts. # Seems impossible, but better to check than to have an accidental infinite loop. self.assertNotEqual(i, max_tries - 1) diff --git a/src/third_party/wiredtiger/test/suite/test_hs05.py b/src/third_party/wiredtiger/test/suite/test_hs05.py index 0e1e5a84617..5a66d04f546 100644 --- a/src/third_party/wiredtiger/test/suite/test_hs05.py +++ b/src/third_party/wiredtiger/test/suite/test_hs05.py @@ -72,7 +72,7 @@ class test_hs05(wttest.WiredTigerTestCase): score_diff = score_end - score_start self.pr("After large updates score start: " + str(score_start)) self.pr("After large updates score end: " + str(score_end)) - self.pr("After large updates hs score diff: " + str(score_diff)) + self.pr("After large updates history store score diff: " + str(score_diff)) def test_checkpoint_hs_reads(self): # Create a small table. 
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable11.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable11.py index 58d156b26b1..a6d1f8703d4 100755 --- a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable11.py +++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable11.py @@ -37,7 +37,7 @@ def timestamp_str(t): return '%x' % t # test_rollback_to_stable11.py -# Test the rollback to stable is retrieving the proper hs update. +# Test the rollback to stable is retrieving the proper history store update. class test_rollback_to_stable11(test_rollback_to_stable_base): session_config = 'isolation=snapshot' diff --git a/src/third_party/wiredtiger/test/suite/test_util21.py b/src/third_party/wiredtiger/test/suite/test_util21.py new file mode 100644 index 00000000000..cdd117649db --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_util21.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2021 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wttest +from suite_subprocess import suite_subprocess +from helper import compare_files + +def timestamp_str(t): + return '%x' % t + +# test_util21.py +# Ensure that wt dump can dump obsolete data in the history store. +class test_util21(wttest.WiredTigerTestCase, suite_subprocess): + conn_config = 'cache_size=50MB' + session_config = 'isolation=snapshot' + + def add_data_with_timestamp(self, uri, value, ts): + # Apply a series of updates with commit timestamp. + cursor = self.session.open_cursor(uri) + for i in range(1, 5): + self.session.begin_transaction() + cursor[str(i)] = value + self.session.commit_transaction('commit_timestamp=' + timestamp_str(ts)) + cursor.close() + + def test_dump_obsolete_data(self): + uri = 'table:test_util21' + create_params = 'key_format=S,value_format=S' + self.session.create(uri, create_params) + + value1 = 'a' * 100 + value2 = 'b' * 100 + value3 = 'c' * 100 + value4 = 'd' * 100 + + self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(1)) + + self.add_data_with_timestamp(uri, value1, 2) + self.add_data_with_timestamp(uri, value2, 3) + self.add_data_with_timestamp(uri, value3, 5) + self.add_data_with_timestamp(uri, value4, 7) + # Perform checkpoint, to clean the dirty pages and place values on disk. + self.session.checkpoint() + + # Set stable timestamp, so we don't lose data when closing/opening connection when using wt dump. 
+ self.conn.set_timestamp('stable_timestamp=' + timestamp_str(10)) + + # Call dump on the values before the oldest timestamp is set + self.runWt(['dump', 'file:WiredTigerHS.wt'], outfilename="before_oldest") + + # Set oldest timestamp, and checkpoint, the obsolete data should not be removed as + # the pages are clean. + self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(6)) + self.session.checkpoint() + self.runWt(['dump', 'file:WiredTigerHS.wt'], outfilename="after_oldest") + + self.assertEqual(True, compare_files(self, "before_oldest", "after_oldest")) + +if __name__ == '__main__': + wttest.run() |