From ff99a2afe938bf7aec7e4bbfb0a922d7f70d6712 Mon Sep 17 00:00:00 2001 From: Luke Chen Date: Wed, 3 Jun 2020 17:32:56 +1000 Subject: Import wiredtiger: 3901b43d1e0f034c46999ccfdbdd0914791859e0 from branch mongodb-4.4 ref: cc2458c2c2..3901b43d1e for: 4.4.0-rc8 WT-5769 Search history store can potentially walk the whole history store tree WT-6328 Update test_compact02 to handle being halted by eviction pressure WT-6355 test/format can configure keys that are too short WT-6364 Fix double counting sizes of existing updates on the update chain for the history store --- src/third_party/wiredtiger/import.data | 2 +- src/third_party/wiredtiger/src/btree/bt_vrfy.c | 19 ++- src/third_party/wiredtiger/src/btree/row_modify.c | 13 +- src/third_party/wiredtiger/src/conn/conn_api.c | 11 +- src/third_party/wiredtiger/src/history/hs.c | 148 +++++++-------------- src/third_party/wiredtiger/src/include/session.h | 53 ++++---- src/third_party/wiredtiger/src/include/txn.i | 23 +--- .../wiredtiger/src/reconcile/rec_visibility.c | 11 +- src/third_party/wiredtiger/src/txn/txn.c | 13 +- src/third_party/wiredtiger/test/format/config.h | 7 +- .../wiredtiger/test/suite/test_compact02.py | 4 +- src/third_party/wiredtiger/test/suite/test_hs11.py | 30 ++--- src/third_party/wiredtiger/test/suite/test_hs14.py | 101 ++++++++++++++ 13 files changed, 245 insertions(+), 190 deletions(-) create mode 100644 src/third_party/wiredtiger/test/suite/test_hs14.py diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 77a5d85e3d6..1f51fdec549 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-4.4", - "commit": "cc2458c2c238b08db817161915d00d7e19c6ba29" + "commit": "3901b43d1e0f034c46999ccfdbdd0914791859e0" } diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c index 10dca448c28..b7ab4f87434 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c @@ -166,17 +166,25 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) WT_DECL_RET; WT_VSTUFF *vs, _vstuff; size_t root_addr_size; - uint32_t session_flags; uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE]; const char *name; - bool bm_start, is_owner, quit, skip_hs; + bool bm_start, quit; + +#if 0 + /* FIXME-WT-6263: Temporarily disable history store verification. */ + uint32_t session_flags; + bool is_owner, skip_hs; +#endif btree = S2BT(session); bm = btree->bm; ckptbase = NULL; - session_flags = 0; /* -Wuninitialized */ name = session->dhandle->name; bm_start = false; + +#if 0 + /* FIXME-WT-6263: Temporarily disable history store verification. */ + session_flags = 0; /* -Wuninitialized */ is_owner = false; /* -Wuninitialized */ /* @@ -185,6 +193,7 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) * the history store against itself. */ skip_hs = strcmp(name, WT_METAFILE_URI) == 0 || strcmp(name, WT_HS_URI) == 0; +#endif WT_CLEAR(_vstuff); vs = &_vstuff; @@ -265,9 +274,12 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) WT_WITH_PAGE_INDEX( session, ret = __verify_tree(session, &btree->root, &addr_unpack, vs)); +#if 0 /* * The checkpoints are in time-order, so the last one in the list is the most recent. If * this is the most recent checkpoint, verify the history store against it. + * + * FIXME-WT-6263: Temporarily disable history store verification. */ if (ret == 0 && (ckpt + 1)->name == NULL && !skip_hs) { /* Open a history store cursor. */ @@ -280,6 +292,7 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) * after that and unloading this checkpoint. */ } +#endif /* * We have an exclusive lock on the handle, but we're swapping root pages in-and-out of diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c index 6aa44046cb8..1ec05b29779 100644 --- a/src/third_party/wiredtiger/src/btree/row_modify.c +++ b/src/third_party/wiredtiger/src/btree/row_modify.c @@ -50,7 +50,7 @@ __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_ITEM *value, WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_SESSION_IMPL *session; - WT_UPDATE *old_upd, *upd, **upd_entry; + WT_UPDATE *last_upd, *old_upd, *upd, **upd_entry; size_t ins_size, upd_size; uint32_t ins_slot; u_int i, skipdepth; @@ -59,6 +59,7 @@ __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_ITEM *value, ins = NULL; page = cbt->ref->page; session = CUR2S(cbt); + last_upd = NULL; upd = upd_arg; logged = false; @@ -113,9 +114,13 @@ __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_ITEM *value, } else { upd_size = __wt_update_list_memsize(upd); + /* If there are existing updates, append them after the new updates. */ + for (last_upd = upd; last_upd->next != NULL; last_upd = last_upd->next) + ; + last_upd->next = *upd_entry; + /* - * If it's a full update list, we're trying to instantiate the row. Otherwise, it's just - * a single update that we'd like to append to the update list. + * We can either put multiple new updates or a single update on the update chain. * * Set the "old" entry to the second update in the list so that the serialization * function succeeds in swapping the first update into place. @@ -220,6 +225,8 @@ err: cbt->ins = NULL; if (upd_arg == NULL) __wt_free(session, upd); + if (last_upd != NULL) + last_upd->next = NULL; } return (ret); diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index 7ec08ed81c4..6c1b1c4a759 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -2291,11 +2291,16 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c WT_DECL_RET; const WT_NAME_FLAG *ft; WT_SESSION *wt_session; - WT_SESSION_IMPL *session, *verify_session; + WT_SESSION_IMPL *session; bool config_base_set, try_salvage, verify_meta; const char *enc_cfg[] = {NULL, NULL}, *merge_cfg; char version[64]; +#if 0 + /* FIXME-WT-6263: Temporarily disable history store verification. */ + WT_SESSION_IMPL *verify_session; +#endif + /* Leave lots of space for optional additional configuration. */ const char *cfg[] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}; @@ -2701,9 +2706,12 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c /* Start the worker threads and run recovery. */ WT_ERR(__wt_connection_workers(session, cfg)); +#if 0 /* * If the user wants to verify WiredTiger metadata, verify the history store now that the * metadata table may have been salvaged and eviction has been started and recovery run. + * + * FIXME-WT-6263: Temporarily disable history store verification. */ if (verify_meta) { WT_ERR(__wt_open_internal_session(conn, "verify hs", false, 0, &verify_session)); @@ -2712,6 +2720,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c WT_TRET(wt_session->close(wt_session, NULL)); WT_ERR(ret); } +#endif /* * The default session should not open data handles after this point: since it can be shared diff --git a/src/third_party/wiredtiger/src/history/hs.c b/src/third_party/wiredtiger/src/history/hs.c index 6d1c63c5fdf..398b871ea57 100644 --- a/src/third_party/wiredtiger/src/history/hs.c +++ b/src/third_party/wiredtiger/src/history/hs.c @@ -202,6 +202,9 @@ __wt_hs_cursor_open(WT_SESSION_IMPL *session) session, ret = __wt_open_cursor(session, WT_HS_URI, NULL, open_cursor_cfg, &cursor)); WT_RET(ret); + /* History store cursors should always ignore tombstones. */ + F_SET(cursor, WT_CURSTD_IGNORE_TOMBSTONE); + session->hs_cursor = cursor; F_SET(session, WT_SESSION_HS_CURSOR); @@ -215,7 +218,6 @@ __wt_hs_cursor_open(WT_SESSION_IMPL *session) int __wt_hs_cursor(WT_SESSION_IMPL *session, uint32_t *session_flags, bool *is_owner) { - /* * We don't want to get tapped for eviction after we start using the history store cursor; save * a copy of the current eviction state, we'll turn eviction off before we return. @@ -288,27 +290,12 @@ int __wt_hs_modify(WT_CURSOR_BTREE *hs_cbt, WT_UPDATE *hs_upd) { WT_DECL_RET; - WT_PAGE_MODIFY *mod; - WT_SESSION_IMPL *session; - WT_UPDATE *last_upd; - - session = CUR2S(hs_cbt); - - /* If there are existing updates, append them after the new updates. */ - if (hs_cbt->compare == 0) { - for (last_upd = hs_upd; last_upd->next != NULL; last_upd = last_upd->next) - ; - if (hs_cbt->ins != NULL) - last_upd->next = hs_cbt->ins->upd; - else if ((mod = hs_cbt->ref->page->modify) != NULL && mod->mod_row_update != NULL) - last_upd->next = mod->mod_row_update[hs_cbt->slot]; - } /* * We don't have exclusive access to the history store page so we need to pass "false" here to * ensure that we're locking when inserting new keys to an insert list. */ - WT_WITH_BTREE(session, CUR2BT(hs_cbt), + WT_WITH_BTREE(CUR2S(hs_cbt), CUR2BT(hs_cbt), ret = __wt_row_modify(hs_cbt, &hs_cbt->iface.key, NULL, hs_upd, WT_UPDATE_INVALID, false)); return (ret); } @@ -518,8 +505,6 @@ __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BT */ WT_ERR(__wt_cursor_key_order_init((WT_CURSOR_BTREE *)cursor)); #endif - F_SET(cursor, WT_CURSTD_IGNORE_TOMBSTONE); - /* We're pointing at the newly inserted update. Iterate once more to avoid deleting it. */ WT_ERR_NOTFOUND_OK(cursor->next(cursor), true); @@ -536,8 +521,6 @@ __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BT done: err: - F_CLR(cursor, WT_CURSTD_IGNORE_TOMBSTONE); - /* We did a row search, release the cursor so that the page doesn't continue being held. */ cursor->reset(cursor); @@ -609,7 +592,7 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) uint32_t i; uint8_t *p; int nentries; - bool squashed, track_prepare, updates_in_hs, updates_older_than_onpage; + bool squashed, track_prepare; uint8_t upd_count; btree = S2BT(session); @@ -664,7 +647,7 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) __wt_free_update_list(session, &upd); upd = list->onpage_upd; second_older_than_prepare = NULL; - track_prepare = updates_in_hs = updates_older_than_onpage = false; + track_prepare = false; upd_count = 0; /* @@ -724,15 +707,12 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) } } - if (F_ISSET(upd, WT_UPDATE_HS)) { - updates_in_hs = true; - /* - * If we've reached a full update and its in the history store we don't need to - * continue as anything beyond this point won't help with calculating deltas. - */ - if (upd->type == WT_UPDATE_STANDARD) - break; - } + /* + * If we've reached a full update and its in the history store we don't need to continue + * as anything beyond this point won't help with calculating deltas. + */ + if (upd->type == WT_UPDATE_STANDARD && F_ISSET(upd, WT_UPDATE_HS)) + break; } upd = NULL; @@ -742,16 +722,10 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) __wt_modify_vector_pop(&modifies, &upd); WT_ASSERT(session, upd->type == WT_UPDATE_STANDARD || upd->type == WT_UPDATE_TOMBSTONE); - /* Skip TOMBSTONE at the end of the update chain. */ if (upd->type == WT_UPDATE_TOMBSTONE) { if (modifies.size > 0) { - /* - * We don't need to delete the history store records if everything is still on the - * insert list and there are no updates moved to the history store by checkpoint or - * a failed eviction. - */ - if ((list->ins == NULL || updates_in_hs) && upd->start_ts == WT_TS_NONE) { + if (upd->start_ts == WT_TS_NONE) { /* We can only delete history store entries that have timestamps. */ WT_ERR(__wt_hs_delete_key_from_ts(session, btree->id, key, 1)); WT_STAT_CONN_INCR(session, cache_hs_key_truncate_mix_ts); @@ -777,7 +751,6 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) tmp = full_value, full_value = prev_full_value, prev_full_value = tmp, upd = prev_upd) { WT_ASSERT(session, upd->type == WT_UPDATE_STANDARD || upd->type == WT_UPDATE_MODIFY); - updates_older_than_onpage = true; __wt_modify_vector_pop(&modifies, &prev_upd); @@ -803,24 +776,13 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) stop_time_point.txnid = prev_upd->txnid; } - /* - * Delete the history store records if we detect a mixed mode update. We don't need to - * do that if everything is still on the insert list and there are no updates moved to - * the history store by checkpoint or a failed eviction. - * - * Note that if the update is restored from data store or history store, we may have - * cleared its timestamp, remove the history store contents anyway in this case. - */ - if ((list->ins == NULL || updates_in_hs) && prev_upd->start_ts == WT_TS_NONE && - (upd->start_ts != WT_TS_NONE || - F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DS | WT_UPDATE_RESTORED_FROM_HS))) { - /* We can only delete history store entries that have timestamps. */ - WT_ERR(__wt_hs_delete_key_from_ts(session, btree->id, key, 1)); - WT_STAT_CONN_INCR(session, cache_hs_key_truncate_mix_ts); - } - if (prev_upd->type == WT_UPDATE_TOMBSTONE) { WT_ASSERT(session, modifies.size > 0); + if (prev_upd->start_ts == WT_TS_NONE) { + /* We can only delete history store entries that have timestamps. */ + WT_ERR(__wt_hs_delete_key_from_ts(session, btree->id, key, 1)); + WT_STAT_CONN_INCR(session, cache_hs_key_truncate_mix_ts); + } __wt_modify_vector_pop(&modifies, &prev_upd); WT_ASSERT(session, prev_upd->type == WT_UPDATE_STANDARD); prev_full_value->data = prev_upd->data; @@ -874,24 +836,8 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) squashed = true; } - WT_ASSERT(session, - upd->txnid == list->onpage_upd->txnid && upd->start_ts == list->onpage_upd->start_ts); - if (modifies.size > 0) WT_STAT_CONN_INCR(session, cache_hs_write_squash); - - /* - * Delete the history store records if the onpage update's timestamp is WT_TS_NONE and we - * don't see any update older than it. We don't need to do that if everything is still on - * the insert list and there are no updates moved to the history store by checkpoint or a - * failed eviction. - */ - if (!updates_older_than_onpage && (list->ins == NULL || updates_in_hs) && - upd->start_ts == WT_TS_NONE) { - /* We can only delete history store entries that have timestamps. */ - WT_ERR(__wt_hs_delete_key_from_ts(session, btree->id, key, 1)); - WT_STAT_CONN_INCR(session, cache_hs_key_truncate_mix_ts); - } } WT_ERR(__wt_block_manager_named_size(session, WT_HS_FILE, &hs_size)); @@ -984,6 +930,7 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_forma WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *on_disk_buf) { WT_CURSOR *hs_cursor; + WT_CURSOR_BTREE *hs_cbt; WT_DECL_ITEM(hs_value); WT_DECL_ITEM(orig_hs_value_buf); WT_DECL_RET; @@ -1030,6 +977,7 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_forma /* Open a history store table cursor. */ WT_ERR(__wt_hs_cursor(session, &session_flags, &is_owner)); hs_cursor = session->hs_cursor; + hs_cbt = (WT_CURSOR_BTREE *)hs_cursor; /* * After positioning our cursor, we're stepping backwards to find the correct update. Since the @@ -1043,19 +991,37 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_forma ret = 0; goto done; } - WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter)); + for (;; ret = hs_cursor->prev(hs_cursor)) { + WT_ERR_NOTFOUND_OK(ret, true); + /* If we hit the end of the table, let's get out of here. */ + if (ret == WT_NOTFOUND) { + ret = 0; + goto done; + } + WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter)); - /* Stop before crossing over to the next btree */ - if (hs_btree_id != S2BT(session)->id) - goto done; + /* Stop before crossing over to the next btree */ + if (hs_btree_id != S2BT(session)->id) + goto done; - /* - * Keys are sorted in an order, skip the ones before the desired key, and bail out if we have - * crossed over the desired key and not found the record we are looking for. - */ - WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp)); - if (cmp != 0) - goto done; + /* + * Keys are sorted in an order, skip the ones before the desired key, and bail out if we + * have crossed over the desired key and not found the record we are looking for. + */ + WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp)); + if (cmp != 0) + goto done; + + /* + * If the stop time point of a record is visible to us, we won't be able to see anything for + * this entire key. Just jump straight to the end. + */ + if (__wt_txn_tw_stop_visible(session, &hs_cbt->upd_value->tw)) + goto done; + /* If the start time point is visible to us, let's return that record. */ + if (__wt_txn_tw_start_visible(session, &hs_cbt->upd_value->tw)) + break; + } WT_ERR(hs_cursor->get_value( hs_cursor, &hs_stop_durable_ts, &durable_timestamp, &upd_type_full, hs_value)); @@ -1086,7 +1052,6 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_forma * visibility checks when reading in order to construct the modify chain, so we can create * the value we expect. */ - F_SET(session, WT_SESSION_HS_IGNORE_VISIBILITY); while (upd_type == WT_UPDATE_MODIFY) { WT_ERR(__wt_upd_alloc(session, hs_value, upd_type, &mod_upd, NULL)); WT_ERR(__wt_modify_vector_push(&modifies, mod_upd)); @@ -1129,7 +1094,6 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_forma &upd_type_full, hs_value)); upd_type = (uint8_t)upd_type_full; } - F_CLR(session, WT_SESSION_HS_IGNORE_VISIBILITY); WT_ASSERT(session, upd_type == WT_UPDATE_STANDARD); while (modifies.size > 0) { __wt_modify_vector_pop(&modifies, &mod_upd); @@ -1153,8 +1117,6 @@ skip_buf: done: err: - F_CLR(session, WT_SESSION_HS_IGNORE_VISIBILITY); - if (orig_hs_value_buf != NULL) __wt_scr_free(session, &orig_hs_value_buf); else @@ -1270,20 +1232,10 @@ __wt_hs_delete_key_from_ts( WT_RET(__wt_hs_cursor(session, &session_flags, &is_owner)); - /* - * In order to delete a key range, we need to be able to inspect all history store records - * regardless of their stop time points and the visibility of their values. - */ - F_SET(session->hs_cursor, WT_CURSTD_IGNORE_TOMBSTONE); - F_SET(session, WT_SESSION_HS_IGNORE_VISIBILITY); - /* The tree structure can change while we try to insert the mod list, retry if that happens. */ while ((ret = __hs_delete_key_from_ts_int(session, btree_id, key, ts)) == WT_RESTART) WT_STAT_CONN_INCR(session, cache_hs_insert_restart); - F_CLR(session, WT_SESSION_HS_IGNORE_VISIBILITY); - F_CLR(session->hs_cursor, WT_CURSTD_IGNORE_TOMBSTONE); - WT_TRET(__wt_hs_cursor_close(session, session_flags, is_owner)); return (ret); } @@ -1375,7 +1327,6 @@ __verify_history_store_id(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint32 * visible tombstones in the data table to verify the corresponding entries in the history store * are too present in the data store. */ - F_SET(hs_cursor, WT_CURSTD_IGNORE_TOMBSTONE); F_SET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE); /* @@ -1426,7 +1377,6 @@ __verify_history_store_id(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint32 WT_ERR_NOTFOUND_OK(ret, true); err: F_CLR(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE); - F_CLR(hs_cursor, WT_CURSTD_IGNORE_TOMBSTONE); WT_ASSERT(session, hs_key.mem == NULL && hs_key.memsize == 0); __wt_scr_free(session, &prev_hs_key); return (ret); diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h index 17fb433ddbb..84d7c14c914 100644 --- a/src/third_party/wiredtiger/src/include/session.h +++ b/src/third_party/wiredtiger/src/include/session.h @@ -166,33 +166,32 @@ struct __wt_session_impl { #define WT_SESSION_CACHE_CURSORS 0x00000004u #define WT_SESSION_CAN_WAIT 0x00000008u #define WT_SESSION_HS_CURSOR 0x00000010u -#define WT_SESSION_HS_IGNORE_VISIBILITY 0x00000020u -#define WT_SESSION_IGNORE_CACHE_SIZE 0x00000040u -#define WT_SESSION_INSTANTIATE_PREPARE 0x00000080u -#define WT_SESSION_INTERNAL 0x00000100u -#define WT_SESSION_LOCKED_CHECKPOINT 0x00000200u -#define WT_SESSION_LOCKED_HANDLE_LIST_READ 0x00000400u -#define WT_SESSION_LOCKED_HANDLE_LIST_WRITE 0x00000800u -#define WT_SESSION_LOCKED_HOTBACKUP_READ 0x00001000u -#define WT_SESSION_LOCKED_HOTBACKUP_WRITE 0x00002000u -#define WT_SESSION_LOCKED_METADATA 0x00004000u -#define WT_SESSION_LOCKED_PASS 0x00008000u -#define WT_SESSION_LOCKED_SCHEMA 0x00010000u -#define WT_SESSION_LOCKED_SLOT 0x00020000u -#define WT_SESSION_LOCKED_TABLE_READ 0x00040000u -#define WT_SESSION_LOCKED_TABLE_WRITE 0x00080000u -#define WT_SESSION_LOCKED_TURTLE 0x00100000u -#define WT_SESSION_LOGGING_INMEM 0x00200000u -#define WT_SESSION_NO_DATA_HANDLES 0x00400000u -#define WT_SESSION_NO_LOGGING 0x00800000u -#define WT_SESSION_NO_RECONCILE 0x01000000u -#define WT_SESSION_NO_SCHEMA_LOCK 0x02000000u -#define WT_SESSION_QUIET_CORRUPT_FILE 0x04000000u -#define WT_SESSION_READ_WONT_NEED 0x08000000u -#define WT_SESSION_RESOLVING_TXN 0x10000000u -#define WT_SESSION_ROLLBACK_TO_STABLE 0x20000000u -#define WT_SESSION_SCHEMA_TXN 0x40000000u -#define WT_SESSION_SERVER_ASYNC 0x80000000u +#define WT_SESSION_IGNORE_CACHE_SIZE 0x00000020u +#define WT_SESSION_INSTANTIATE_PREPARE 0x00000040u +#define WT_SESSION_INTERNAL 0x00000080u +#define WT_SESSION_LOCKED_CHECKPOINT 0x00000100u +#define WT_SESSION_LOCKED_HANDLE_LIST_READ 0x00000200u +#define WT_SESSION_LOCKED_HANDLE_LIST_WRITE 0x00000400u +#define WT_SESSION_LOCKED_HOTBACKUP_READ 0x00000800u +#define WT_SESSION_LOCKED_HOTBACKUP_WRITE 0x00001000u +#define WT_SESSION_LOCKED_METADATA 0x00002000u +#define WT_SESSION_LOCKED_PASS 0x00004000u +#define WT_SESSION_LOCKED_SCHEMA 0x00008000u +#define WT_SESSION_LOCKED_SLOT 0x00010000u +#define WT_SESSION_LOCKED_TABLE_READ 0x00020000u +#define WT_SESSION_LOCKED_TABLE_WRITE 0x00040000u +#define WT_SESSION_LOCKED_TURTLE 0x00080000u +#define WT_SESSION_LOGGING_INMEM 0x00100000u +#define WT_SESSION_NO_DATA_HANDLES 0x00200000u +#define WT_SESSION_NO_LOGGING 0x00400000u +#define WT_SESSION_NO_RECONCILE 0x00800000u +#define WT_SESSION_NO_SCHEMA_LOCK 0x01000000u +#define WT_SESSION_QUIET_CORRUPT_FILE 0x02000000u +#define WT_SESSION_READ_WONT_NEED 0x04000000u +#define WT_SESSION_RESOLVING_TXN 0x08000000u +#define WT_SESSION_ROLLBACK_TO_STABLE 0x10000000u +#define WT_SESSION_SCHEMA_TXN 0x20000000u +#define WT_SESSION_SERVER_ASYNC 0x40000000u /* AUTOMATIC FLAG VALUE GENERATION STOP */ uint32_t flags; diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index 9dc39f4d1b8..d2c13bbb3b6 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -737,12 +737,10 @@ __wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd) if (prepare_state == WT_PREPARE_LOCKED) continue; - if (F_ISSET(session, WT_SESSION_HS_IGNORE_VISIBILITY) && upd->txnid != WT_TXN_ABORTED && - upd->type == WT_UPDATE_STANDARD) { - /* If we are resolving a modify then the btree must be the history store. */ - WT_ASSERT(session, WT_IS_HS(S2BT(session))); + if (WT_IS_HS(S2BT(session)) && upd->txnid != WT_TXN_ABORTED && + upd->type == WT_UPDATE_STANDARD) + /* Entries in the history store are always visible. */ return (WT_VISIBLE_TRUE); - } upd_visible = __wt_txn_visible(session, upd->txnid, upd->start_ts); @@ -860,9 +858,7 @@ __wt_txn_read_upd_list( * Ignore non-globally visible tombstones when we are doing history store scans in * rollback to stable or when we are told to. */ - if (type == WT_UPDATE_TOMBSTONE && - (F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE) || - (WT_IS_HS(S2BT(session)) && F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE))) && + if (type == WT_UPDATE_TOMBSTONE && F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE) && !__wt_txn_upd_visible_all(session, upd)) { cbt->upd_value->tw.durable_stop_ts = upd->durable_ts; cbt->upd_value->tw.stop_ts = upd->start_ts; @@ -953,8 +949,7 @@ __wt_txn_read(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint * are told to ignore non-globally visible tombstones. */ if (__wt_txn_tw_stop_visible(session, &tw) && - ((!F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE) && - (!WT_IS_HS(S2BT(session)) || !F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE))) || + (!F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE) || (__wt_txn_tw_stop_visible_all(session, &tw) && !WT_CURSOR_IS_DUMP(&cbt->iface)))) { cbt->upd_value->buf.data = NULL; cbt->upd_value->buf.size = 0; @@ -975,13 +970,7 @@ __wt_txn_read(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint } /* If the start time point is visible then we need to return the ondisk value. */ - if (F_ISSET(session, WT_SESSION_HS_IGNORE_VISIBILITY) || - __wt_txn_tw_start_visible(session, &tw)) { - /* If we are resolving a modify then the btree must be the history store. */ - WT_ASSERT( - session, (F_ISSET(session, WT_SESSION_HS_IGNORE_VISIBILITY) && WT_IS_HS(S2BT(session))) || - !F_ISSET(session, WT_SESSION_HS_IGNORE_VISIBILITY)); - + if (WT_IS_HS(S2BT(session)) || __wt_txn_tw_start_visible(session, &tw)) { if (cbt->upd_value->skip_buf) { cbt->upd_value->buf.data = NULL; cbt->upd_value->buf.size = 0; diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c index 54d281e06b9..b4a054a95b1 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c @@ -140,6 +140,9 @@ __rec_append_orig_value( * timestamped globally visible tombstone because even if its timestamp is smaller than * the entries in the history store, we can't change the history store entries. This is * not correct but we hope we can get away with it. + * + * FIXME-WT-6171: remove this once we get rid of out of order timestamps and mixed mode + * transactions. */ if (unpack->tw.durable_stop_ts != WT_TS_NONE && tombstone_globally_visible) return (0); @@ -486,6 +489,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v select_tw->durable_start_ts = select_tw->durable_stop_ts; select_tw->start_ts = select_tw->stop_ts; + select_tw->start_txn = select_tw->stop_txn; } /* @@ -544,12 +548,9 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v * part of the page, and they are physically removed by checkpoint writing this page, that is, * the checkpoint doesn't include the overflow blocks so they're removed and future readers of * this page won't be able to find them. - * - * There is no need to append the original value for in memory databases as the onpage value - * should be already on the update chain and there is no history store. */ - if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY) && upd_select->upd != NULL && vpack != NULL && - vpack->type != WT_CELL_DEL && (upd_saved || F_ISSET(vpack, WT_CELL_UNPACK_OVERFLOW))) + if (upd_select->upd != NULL && vpack != NULL && vpack->type != WT_CELL_DEL && + (upd_saved || F_ISSET(vpack, WT_CELL_UNPACK_OVERFLOW))) WT_ERR(__rec_append_orig_value(session, page, upd_select->upd, vpack)); __wt_time_window_clear_obsolete( diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index fd19865636f..7fcb1ef940f 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -920,14 +920,8 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit, /* * Scan the history store for the given btree and key with maximum start timestamp to let - * the search point to the last version of the key. We must ignore tombstone in the history - * store while retrieving the update from the history store to replace the update in the - * data store. We also need to ignore visibility of the updates as we have already released - * our snapshot in prepare. Otherwise, we can't see updates with non-globally visible - * transaction ids. + * the search point to the last version of the key. */ - F_SET(hs_cursor, WT_CURSTD_IGNORE_TOMBSTONE); - F_SET(session, WT_SESSION_HS_IGNORE_VISIBILITY); WT_ERR_NOTFOUND_OK( __wt_hs_cursor_position(session, hs_cursor, hs_btree_id, &op->u.op_row.key, WT_TS_MAX), true); @@ -1011,11 +1005,8 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit, WT_ERR(__txn_fixup_prepared_update(session, hs_cursor, fix_upd, commit)); err: - if (hs_cursor != NULL) { - F_CLR(session, WT_SESSION_HS_IGNORE_VISIBILITY); - F_CLR(hs_cursor, WT_CURSTD_IGNORE_TOMBSTONE); + if (hs_cursor != NULL) ret = __wt_hs_cursor_close(session, session_flags, is_owner); - } if (!upd_appended) __wt_free(session, fix_upd); return (ret); diff --git a/src/third_party/wiredtiger/test/format/config.h b/src/third_party/wiredtiger/test/format/config.h index 80d2ab99cd4..d0023be05d2 100644 --- a/src/third_party/wiredtiger/test/format/config.h +++ b/src/third_party/wiredtiger/test/format/config.h @@ -103,7 +103,12 @@ static CONFIG c[] = { {"btree.key_max", "maximum size of keys", 0x0, 20, 128, MEGABYTE(10), &g.c_key_max, NULL}, - {"btree.key_min", "minimum size of keys", 0x0, 10, 32, 256, &g.c_key_min, NULL}, + /* + * A minimum key size of 11 is necessary. Row-store keys have a leading 10-digit number and the + * 11 guarantees we never see a key that we can't convert to a numeric value without formatting + * it first because there's a trailing non-digit character in every key. + */ + {"btree.key_min", "minimum size of keys", 0x0, 11, 32, 256, &g.c_key_min, NULL}, {"btree.leaf_page_max", "maximum size of Btree leaf nodes", 0x0, 9, 17, 27, &g.c_leaf_page_max, NULL}, diff --git a/src/third_party/wiredtiger/test/suite/test_compact02.py b/src/third_party/wiredtiger/test/suite/test_compact02.py index c15fb5bc78b..e466aa81a95 100644 --- a/src/third_party/wiredtiger/test/suite/test_compact02.py +++ b/src/third_party/wiredtiger/test/suite/test_compact02.py @@ -149,11 +149,11 @@ class test_compact02(wttest.WiredTigerTestCase): # Compact can collide with eviction, if that happens we retry. Wait for # a long time, the check for EBUSY means we're not retrying on any real # errors. - for i in range(1, 60): + for i in range(1, 80): if not self.raisesBusy( lambda: self.session.compact(self.uri, None)): break - time.sleep(5) + time.sleep(6) # 6. Get stats on compacted table. sz = self.getSize() diff --git a/src/third_party/wiredtiger/test/suite/test_hs11.py b/src/third_party/wiredtiger/test/suite/test_hs11.py index f7d31c6796f..efc9d02401c 100644 --- a/src/third_party/wiredtiger/test/suite/test_hs11.py +++ b/src/third_party/wiredtiger/test/suite/test_hs11.py @@ -32,12 +32,13 @@ def timestamp_str(t): return '%x' % t # test_hs11.py -# Ensure that mixed mode updates clear the history store records. +# Ensure that when we delete a key due to a tombstone being globally visible, we delete its +# associated history store content. class test_hs11(wttest.WiredTigerTestCase): conn_config = 'cache_size=50MB' session_config = 'isolation=snapshot' - def run_test(self, update_type): + def test_key_deletion_clears_hs(self): uri = 'table:test_hs11' create_params = 'key_format=S,value_format=S' self.session.create(uri, create_params) @@ -57,16 +58,14 @@ class test_hs11(wttest.WiredTigerTestCase): # Reconcile and flush versions 1-3 to the history store. self.session.checkpoint() - # Apply a mixed mode update. + # Apply a non-timestamped tombstone. When the pages get evicted, the keys will get deleted + # since the tombstone is globally visible. for i in range(1, 10000): if i % 2 == 0: - if update_type == 'deletion': - cursor.set_key(str(i)) - cursor.remove() - else: - cursor[str(i)] = value2 + cursor.set_key(str(i)) + cursor.remove() - # Now apply an update at timestamp 10. + # Now apply an update at timestamp 10 to recreate each key. for i in range(1, 10000): self.session.begin_transaction() cursor[str(i)] = value2 @@ -77,17 +76,8 @@ class test_hs11(wttest.WiredTigerTestCase): self.session.begin_transaction('read_timestamp=' + timestamp_str(ts)) for i in range(1, 10000): if i % 2 == 0: - if update_type == 'deletion': - cursor.set_key(str(i)) - self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND) - else: - self.assertEqual(cursor[str(i)], value2) + cursor.set_key(str(i)) + self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND) else: self.assertEqual(cursor[str(i)], value1) self.session.rollback_transaction() - - def test_key_deletion_clears_hs(self): - self.run_test('deletion') - - def test_key_update_clears_hs(self): - self.run_test('update') diff --git a/src/third_party/wiredtiger/test/suite/test_hs14.py b/src/third_party/wiredtiger/test/suite/test_hs14.py new file mode 100644 index 00000000000..ebd5f471f2b --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_hs14.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2020 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import time, wiredtiger, wttest + +def timestamp_str(t): + return '%x' % t + +# test_hs14.py +# Ensure that point in time reads with few visible history store records don't +# damage performance. +class test_hs14(wttest.WiredTigerTestCase): + conn_config = 'cache_size=50MB' + session_config = 'isolation=snapshot' + + def test_hs14(self): + uri = 'table:test_hs14' + self.session.create(uri, 'key_format=S,value_format=S') + self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(1)) + cursor = self.session.open_cursor(uri) + + value1 = 'a' * 500 + value2 = 'b' * 500 + value3 = 'c' * 500 + value4 = 'd' * 500 + value5 = 'e' * 500 + + for i in range(1, 10000): + self.session.begin_transaction() + cursor[str(i)] = value1 + self.session.commit_transaction('commit_timestamp=' + timestamp_str(2)) + self.session.begin_transaction() + cursor[str(i)] = value2 + self.session.commit_transaction('commit_timestamp=' + timestamp_str(2)) + self.session.begin_transaction() + cursor[str(i)] = value3 + self.session.commit_transaction('commit_timestamp=' + timestamp_str(3)) + self.session.begin_transaction() + cursor[str(i)] = value4 + self.session.commit_transaction('commit_timestamp=' + timestamp_str(4)) + + start = time.time() + self.session.begin_transaction('read_timestamp=' + timestamp_str(3)) + for i in range(1, 10000): + self.assertEqual(cursor[str(i)], value3) + self.session.rollback_transaction() + end = time.time() + + # The time spent when all history store keys are visible to us. + visible_hs_latency = (end - start) + + for i in range(1, 10000): + self.session.begin_transaction() + cursor.set_key(str(i)) + cursor.remove() + self.session.commit_transaction('commit_timestamp=' + timestamp_str(5)) + self.session.begin_transaction() + cursor[str(i)] = value5 + self.session.commit_transaction('commit_timestamp=' + timestamp_str(10)) + + start = time.time() + self.session.begin_transaction('read_timestamp=' + timestamp_str(9)) + for i in range(1, 10000): + cursor.set_key(str(i)) + self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND) + self.session.rollback_transaction() + end = time.time() + + # The time spent when all history store keys are invisible to us. + invisible_hs_latency = (end - start) + + self.assertLess(invisible_hs_latency, (visible_hs_latency * 10), + "Reader took an order of magnitude longer for when all " + "history store records were invisible, visible={}, invisible={}".format( + visible_hs_latency, invisible_hs_latency + )) -- cgit v1.2.1