summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Luke Chen <luke.chen@mongodb.com> 2020-06-03 17:32:56 +1000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2020-06-03 07:46:19 +0000
commit: ff99a2afe938bf7aec7e4bbfb0a922d7f70d6712 (patch)
tree: 07d2afeb5da1be1e9d427d00c37a5160a205f8a9
parent: f9f44a12b3c712d60881b430510bd5b7d603ca1b (diff)
download: mongo-r4.4.0-rc8.tar.gz
Import wiredtiger: 3901b43d1e0f034c46999ccfdbdd0914791859e0 from branch mongodb-4.4 (tag: r4.4.0-rc8)
ref: cc2458c2c2..3901b43d1e
for: 4.4.0-rc8

WT-5769 Search history store can potentially walk the whole history store tree
WT-6328 Update test_compact02 to handle being halted by eviction pressure
WT-6355 test/format can configure keys that are too short
WT-6364 Fix double counting sizes of existing updates on the update chain for the history store
-rw-r--r-- src/third_party/wiredtiger/import.data 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_vrfy.c 19
-rw-r--r-- src/third_party/wiredtiger/src/btree/row_modify.c 13
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_api.c 11
-rw-r--r-- src/third_party/wiredtiger/src/history/hs.c 148
-rw-r--r-- src/third_party/wiredtiger/src/include/session.h 53
-rw-r--r-- src/third_party/wiredtiger/src/include/txn.i 23
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_visibility.c 11
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn.c 13
-rw-r--r-- src/third_party/wiredtiger/test/format/config.h 7
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_compact02.py 4
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_hs11.py 30
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_hs14.py 101
13 files changed, 245 insertions, 190 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 77a5d85e3d6..1f51fdec549 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-4.4",
- "commit": "cc2458c2c238b08db817161915d00d7e19c6ba29"
+ "commit": "3901b43d1e0f034c46999ccfdbdd0914791859e0"
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c
index 10dca448c28..b7ab4f87434 100644
--- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c
+++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c
@@ -166,17 +166,25 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
WT_DECL_RET;
WT_VSTUFF *vs, _vstuff;
size_t root_addr_size;
- uint32_t session_flags;
uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE];
const char *name;
- bool bm_start, is_owner, quit, skip_hs;
+ bool bm_start, quit;
+
+#if 0
+ /* FIXME-WT-6263: Temporarily disable history store verification. */
+ uint32_t session_flags;
+ bool is_owner, skip_hs;
+#endif
btree = S2BT(session);
bm = btree->bm;
ckptbase = NULL;
- session_flags = 0; /* -Wuninitialized */
name = session->dhandle->name;
bm_start = false;
+
+#if 0
+ /* FIXME-WT-6263: Temporarily disable history store verification. */
+ session_flags = 0; /* -Wuninitialized */
is_owner = false; /* -Wuninitialized */
/*
@@ -185,6 +193,7 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
* the history store against itself.
*/
skip_hs = strcmp(name, WT_METAFILE_URI) == 0 || strcmp(name, WT_HS_URI) == 0;
+#endif
WT_CLEAR(_vstuff);
vs = &_vstuff;
@@ -265,9 +274,12 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
WT_WITH_PAGE_INDEX(
session, ret = __verify_tree(session, &btree->root, &addr_unpack, vs));
+#if 0
/*
* The checkpoints are in time-order, so the last one in the list is the most recent. If
* this is the most recent checkpoint, verify the history store against it.
+ *
+ * FIXME-WT-6263: Temporarily disable history store verification.
*/
if (ret == 0 && (ckpt + 1)->name == NULL && !skip_hs) {
/* Open a history store cursor. */
@@ -280,6 +292,7 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
* after that and unloading this checkpoint.
*/
}
+#endif
/*
* We have an exclusive lock on the handle, but we're swapping root pages in-and-out of
diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c
index 6aa44046cb8..1ec05b29779 100644
--- a/src/third_party/wiredtiger/src/btree/row_modify.c
+++ b/src/third_party/wiredtiger/src/btree/row_modify.c
@@ -50,7 +50,7 @@ __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_ITEM *value,
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
WT_SESSION_IMPL *session;
- WT_UPDATE *old_upd, *upd, **upd_entry;
+ WT_UPDATE *last_upd, *old_upd, *upd, **upd_entry;
size_t ins_size, upd_size;
uint32_t ins_slot;
u_int i, skipdepth;
@@ -59,6 +59,7 @@ __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_ITEM *value,
ins = NULL;
page = cbt->ref->page;
session = CUR2S(cbt);
+ last_upd = NULL;
upd = upd_arg;
logged = false;
@@ -113,9 +114,13 @@ __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_ITEM *value,
} else {
upd_size = __wt_update_list_memsize(upd);
+ /* If there are existing updates, append them after the new updates. */
+ for (last_upd = upd; last_upd->next != NULL; last_upd = last_upd->next)
+ ;
+ last_upd->next = *upd_entry;
+
/*
- * If it's a full update list, we're trying to instantiate the row. Otherwise, it's just
- * a single update that we'd like to append to the update list.
+ * We can either put multiple new updates or a single update on the update chain.
*
* Set the "old" entry to the second update in the list so that the serialization
* function succeeds in swapping the first update into place.
@@ -220,6 +225,8 @@ err:
cbt->ins = NULL;
if (upd_arg == NULL)
__wt_free(session, upd);
+ if (last_upd != NULL)
+ last_upd->next = NULL;
}
return (ret);
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
index 7ec08ed81c4..6c1b1c4a759 100644
--- a/src/third_party/wiredtiger/src/conn/conn_api.c
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -2291,11 +2291,16 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c
WT_DECL_RET;
const WT_NAME_FLAG *ft;
WT_SESSION *wt_session;
- WT_SESSION_IMPL *session, *verify_session;
+ WT_SESSION_IMPL *session;
bool config_base_set, try_salvage, verify_meta;
const char *enc_cfg[] = {NULL, NULL}, *merge_cfg;
char version[64];
+#if 0
+ /* FIXME-WT-6263: Temporarily disable history store verification. */
+ WT_SESSION_IMPL *verify_session;
+#endif
+
/* Leave lots of space for optional additional configuration. */
const char *cfg[] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL};
@@ -2701,9 +2706,12 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c
/* Start the worker threads and run recovery. */
WT_ERR(__wt_connection_workers(session, cfg));
+#if 0
/*
* If the user wants to verify WiredTiger metadata, verify the history store now that the
* metadata table may have been salvaged and eviction has been started and recovery run.
+ *
+ * FIXME-WT-6263: Temporarily disable history store verification.
*/
if (verify_meta) {
WT_ERR(__wt_open_internal_session(conn, "verify hs", false, 0, &verify_session));
@@ -2712,6 +2720,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c
WT_TRET(wt_session->close(wt_session, NULL));
WT_ERR(ret);
}
+#endif
/*
* The default session should not open data handles after this point: since it can be shared
diff --git a/src/third_party/wiredtiger/src/history/hs.c b/src/third_party/wiredtiger/src/history/hs.c
index 6d1c63c5fdf..398b871ea57 100644
--- a/src/third_party/wiredtiger/src/history/hs.c
+++ b/src/third_party/wiredtiger/src/history/hs.c
@@ -202,6 +202,9 @@ __wt_hs_cursor_open(WT_SESSION_IMPL *session)
session, ret = __wt_open_cursor(session, WT_HS_URI, NULL, open_cursor_cfg, &cursor));
WT_RET(ret);
+ /* History store cursors should always ignore tombstones. */
+ F_SET(cursor, WT_CURSTD_IGNORE_TOMBSTONE);
+
session->hs_cursor = cursor;
F_SET(session, WT_SESSION_HS_CURSOR);
@@ -215,7 +218,6 @@ __wt_hs_cursor_open(WT_SESSION_IMPL *session)
int
__wt_hs_cursor(WT_SESSION_IMPL *session, uint32_t *session_flags, bool *is_owner)
{
-
/*
* We don't want to get tapped for eviction after we start using the history store cursor; save
* a copy of the current eviction state, we'll turn eviction off before we return.
@@ -288,27 +290,12 @@ int
__wt_hs_modify(WT_CURSOR_BTREE *hs_cbt, WT_UPDATE *hs_upd)
{
WT_DECL_RET;
- WT_PAGE_MODIFY *mod;
- WT_SESSION_IMPL *session;
- WT_UPDATE *last_upd;
-
- session = CUR2S(hs_cbt);
-
- /* If there are existing updates, append them after the new updates. */
- if (hs_cbt->compare == 0) {
- for (last_upd = hs_upd; last_upd->next != NULL; last_upd = last_upd->next)
- ;
- if (hs_cbt->ins != NULL)
- last_upd->next = hs_cbt->ins->upd;
- else if ((mod = hs_cbt->ref->page->modify) != NULL && mod->mod_row_update != NULL)
- last_upd->next = mod->mod_row_update[hs_cbt->slot];
- }
/*
* We don't have exclusive access to the history store page so we need to pass "false" here to
* ensure that we're locking when inserting new keys to an insert list.
*/
- WT_WITH_BTREE(session, CUR2BT(hs_cbt),
+ WT_WITH_BTREE(CUR2S(hs_cbt), CUR2BT(hs_cbt),
ret = __wt_row_modify(hs_cbt, &hs_cbt->iface.key, NULL, hs_upd, WT_UPDATE_INVALID, false));
return (ret);
}
@@ -518,8 +505,6 @@ __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BT
*/
WT_ERR(__wt_cursor_key_order_init((WT_CURSOR_BTREE *)cursor));
#endif
- F_SET(cursor, WT_CURSTD_IGNORE_TOMBSTONE);
-
/* We're pointing at the newly inserted update. Iterate once more to avoid deleting it. */
WT_ERR_NOTFOUND_OK(cursor->next(cursor), true);
@@ -536,8 +521,6 @@ __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BT
done:
err:
- F_CLR(cursor, WT_CURSTD_IGNORE_TOMBSTONE);
-
/* We did a row search, release the cursor so that the page doesn't continue being held. */
cursor->reset(cursor);
@@ -609,7 +592,7 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi)
uint32_t i;
uint8_t *p;
int nentries;
- bool squashed, track_prepare, updates_in_hs, updates_older_than_onpage;
+ bool squashed, track_prepare;
uint8_t upd_count;
btree = S2BT(session);
@@ -664,7 +647,7 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi)
__wt_free_update_list(session, &upd);
upd = list->onpage_upd;
second_older_than_prepare = NULL;
- track_prepare = updates_in_hs = updates_older_than_onpage = false;
+ track_prepare = false;
upd_count = 0;
/*
@@ -724,15 +707,12 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi)
}
}
- if (F_ISSET(upd, WT_UPDATE_HS)) {
- updates_in_hs = true;
- /*
- * If we've reached a full update and its in the history store we don't need to
- * continue as anything beyond this point won't help with calculating deltas.
- */
- if (upd->type == WT_UPDATE_STANDARD)
- break;
- }
+ /*
+ * If we've reached a full update and its in the history store we don't need to continue
+ * as anything beyond this point won't help with calculating deltas.
+ */
+ if (upd->type == WT_UPDATE_STANDARD && F_ISSET(upd, WT_UPDATE_HS))
+ break;
}
upd = NULL;
@@ -742,16 +722,10 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi)
__wt_modify_vector_pop(&modifies, &upd);
WT_ASSERT(session, upd->type == WT_UPDATE_STANDARD || upd->type == WT_UPDATE_TOMBSTONE);
-
/* Skip TOMBSTONE at the end of the update chain. */
if (upd->type == WT_UPDATE_TOMBSTONE) {
if (modifies.size > 0) {
- /*
- * We don't need to delete the history store records if everything is still on the
- * insert list and there are no updates moved to the history store by checkpoint or
- * a failed eviction.
- */
- if ((list->ins == NULL || updates_in_hs) && upd->start_ts == WT_TS_NONE) {
+ if (upd->start_ts == WT_TS_NONE) {
/* We can only delete history store entries that have timestamps. */
WT_ERR(__wt_hs_delete_key_from_ts(session, btree->id, key, 1));
WT_STAT_CONN_INCR(session, cache_hs_key_truncate_mix_ts);
@@ -777,7 +751,6 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi)
tmp = full_value, full_value = prev_full_value, prev_full_value = tmp,
upd = prev_upd) {
WT_ASSERT(session, upd->type == WT_UPDATE_STANDARD || upd->type == WT_UPDATE_MODIFY);
- updates_older_than_onpage = true;
__wt_modify_vector_pop(&modifies, &prev_upd);
@@ -803,24 +776,13 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi)
stop_time_point.txnid = prev_upd->txnid;
}
- /*
- * Delete the history store records if we detect a mixed mode update. We don't need to
- * do that if everything is still on the insert list and there are no updates moved to
- * the history store by checkpoint or a failed eviction.
- *
- * Note that if the update is restored from data store or history store, we may have
- * cleared its timestamp, remove the history store contents anyway in this case.
- */
- if ((list->ins == NULL || updates_in_hs) && prev_upd->start_ts == WT_TS_NONE &&
- (upd->start_ts != WT_TS_NONE ||
- F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DS | WT_UPDATE_RESTORED_FROM_HS))) {
- /* We can only delete history store entries that have timestamps. */
- WT_ERR(__wt_hs_delete_key_from_ts(session, btree->id, key, 1));
- WT_STAT_CONN_INCR(session, cache_hs_key_truncate_mix_ts);
- }
-
if (prev_upd->type == WT_UPDATE_TOMBSTONE) {
WT_ASSERT(session, modifies.size > 0);
+ if (prev_upd->start_ts == WT_TS_NONE) {
+ /* We can only delete history store entries that have timestamps. */
+ WT_ERR(__wt_hs_delete_key_from_ts(session, btree->id, key, 1));
+ WT_STAT_CONN_INCR(session, cache_hs_key_truncate_mix_ts);
+ }
__wt_modify_vector_pop(&modifies, &prev_upd);
WT_ASSERT(session, prev_upd->type == WT_UPDATE_STANDARD);
prev_full_value->data = prev_upd->data;
@@ -874,24 +836,8 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi)
squashed = true;
}
- WT_ASSERT(session,
- upd->txnid == list->onpage_upd->txnid && upd->start_ts == list->onpage_upd->start_ts);
-
if (modifies.size > 0)
WT_STAT_CONN_INCR(session, cache_hs_write_squash);
-
- /*
- * Delete the history store records if the onpage update's timestamp is WT_TS_NONE and we
- * don't see any update older than it. We don't need to do that if everything is still on
- * the insert list and there are no updates moved to the history store by checkpoint or a
- * failed eviction.
- */
- if (!updates_older_than_onpage && (list->ins == NULL || updates_in_hs) &&
- upd->start_ts == WT_TS_NONE) {
- /* We can only delete history store entries that have timestamps. */
- WT_ERR(__wt_hs_delete_key_from_ts(session, btree->id, key, 1));
- WT_STAT_CONN_INCR(session, cache_hs_key_truncate_mix_ts);
- }
}
WT_ERR(__wt_block_manager_named_size(session, WT_HS_FILE, &hs_size));
@@ -984,6 +930,7 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_forma
WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *on_disk_buf)
{
WT_CURSOR *hs_cursor;
+ WT_CURSOR_BTREE *hs_cbt;
WT_DECL_ITEM(hs_value);
WT_DECL_ITEM(orig_hs_value_buf);
WT_DECL_RET;
@@ -1030,6 +977,7 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_forma
/* Open a history store table cursor. */
WT_ERR(__wt_hs_cursor(session, &session_flags, &is_owner));
hs_cursor = session->hs_cursor;
+ hs_cbt = (WT_CURSOR_BTREE *)hs_cursor;
/*
* After positioning our cursor, we're stepping backwards to find the correct update. Since the
@@ -1043,19 +991,37 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_forma
ret = 0;
goto done;
}
- WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter));
+ for (;; ret = hs_cursor->prev(hs_cursor)) {
+ WT_ERR_NOTFOUND_OK(ret, true);
+ /* If we hit the end of the table, let's get out of here. */
+ if (ret == WT_NOTFOUND) {
+ ret = 0;
+ goto done;
+ }
+ WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter));
- /* Stop before crossing over to the next btree */
- if (hs_btree_id != S2BT(session)->id)
- goto done;
+ /* Stop before crossing over to the next btree */
+ if (hs_btree_id != S2BT(session)->id)
+ goto done;
- /*
- * Keys are sorted in an order, skip the ones before the desired key, and bail out if we have
- * crossed over the desired key and not found the record we are looking for.
- */
- WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp));
- if (cmp != 0)
- goto done;
+ /*
+ * Keys are sorted in an order, skip the ones before the desired key, and bail out if we
+ * have crossed over the desired key and not found the record we are looking for.
+ */
+ WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp));
+ if (cmp != 0)
+ goto done;
+
+ /*
+ * If the stop time point of a record is visible to us, we won't be able to see anything for
+ * this entire key. Just jump straight to the end.
+ */
+ if (__wt_txn_tw_stop_visible(session, &hs_cbt->upd_value->tw))
+ goto done;
+ /* If the start time point is visible to us, let's return that record. */
+ if (__wt_txn_tw_start_visible(session, &hs_cbt->upd_value->tw))
+ break;
+ }
WT_ERR(hs_cursor->get_value(
hs_cursor, &hs_stop_durable_ts, &durable_timestamp, &upd_type_full, hs_value));
@@ -1086,7 +1052,6 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_forma
* visibility checks when reading in order to construct the modify chain, so we can create
* the value we expect.
*/
- F_SET(session, WT_SESSION_HS_IGNORE_VISIBILITY);
while (upd_type == WT_UPDATE_MODIFY) {
WT_ERR(__wt_upd_alloc(session, hs_value, upd_type, &mod_upd, NULL));
WT_ERR(__wt_modify_vector_push(&modifies, mod_upd));
@@ -1129,7 +1094,6 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_forma
&upd_type_full, hs_value));
upd_type = (uint8_t)upd_type_full;
}
- F_CLR(session, WT_SESSION_HS_IGNORE_VISIBILITY);
WT_ASSERT(session, upd_type == WT_UPDATE_STANDARD);
while (modifies.size > 0) {
__wt_modify_vector_pop(&modifies, &mod_upd);
@@ -1153,8 +1117,6 @@ skip_buf:
done:
err:
- F_CLR(session, WT_SESSION_HS_IGNORE_VISIBILITY);
-
if (orig_hs_value_buf != NULL)
__wt_scr_free(session, &orig_hs_value_buf);
else
@@ -1270,20 +1232,10 @@ __wt_hs_delete_key_from_ts(
WT_RET(__wt_hs_cursor(session, &session_flags, &is_owner));
- /*
- * In order to delete a key range, we need to be able to inspect all history store records
- * regardless of their stop time points and the visibility of their values.
- */
- F_SET(session->hs_cursor, WT_CURSTD_IGNORE_TOMBSTONE);
- F_SET(session, WT_SESSION_HS_IGNORE_VISIBILITY);
-
/* The tree structure can change while we try to insert the mod list, retry if that happens. */
while ((ret = __hs_delete_key_from_ts_int(session, btree_id, key, ts)) == WT_RESTART)
WT_STAT_CONN_INCR(session, cache_hs_insert_restart);
- F_CLR(session, WT_SESSION_HS_IGNORE_VISIBILITY);
- F_CLR(session->hs_cursor, WT_CURSTD_IGNORE_TOMBSTONE);
-
WT_TRET(__wt_hs_cursor_close(session, session_flags, is_owner));
return (ret);
}
@@ -1375,7 +1327,6 @@ __verify_history_store_id(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint32
* visible tombstones in the data table to verify the corresponding entries in the history store
* are too present in the data store.
*/
- F_SET(hs_cursor, WT_CURSTD_IGNORE_TOMBSTONE);
F_SET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE);
/*
@@ -1426,7 +1377,6 @@ __verify_history_store_id(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint32
WT_ERR_NOTFOUND_OK(ret, true);
err:
F_CLR(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE);
- F_CLR(hs_cursor, WT_CURSTD_IGNORE_TOMBSTONE);
WT_ASSERT(session, hs_key.mem == NULL && hs_key.memsize == 0);
__wt_scr_free(session, &prev_hs_key);
return (ret);
diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h
index 17fb433ddbb..84d7c14c914 100644
--- a/src/third_party/wiredtiger/src/include/session.h
+++ b/src/third_party/wiredtiger/src/include/session.h
@@ -166,33 +166,32 @@ struct __wt_session_impl {
#define WT_SESSION_CACHE_CURSORS 0x00000004u
#define WT_SESSION_CAN_WAIT 0x00000008u
#define WT_SESSION_HS_CURSOR 0x00000010u
-#define WT_SESSION_HS_IGNORE_VISIBILITY 0x00000020u
-#define WT_SESSION_IGNORE_CACHE_SIZE 0x00000040u
-#define WT_SESSION_INSTANTIATE_PREPARE 0x00000080u
-#define WT_SESSION_INTERNAL 0x00000100u
-#define WT_SESSION_LOCKED_CHECKPOINT 0x00000200u
-#define WT_SESSION_LOCKED_HANDLE_LIST_READ 0x00000400u
-#define WT_SESSION_LOCKED_HANDLE_LIST_WRITE 0x00000800u
-#define WT_SESSION_LOCKED_HOTBACKUP_READ 0x00001000u
-#define WT_SESSION_LOCKED_HOTBACKUP_WRITE 0x00002000u
-#define WT_SESSION_LOCKED_METADATA 0x00004000u
-#define WT_SESSION_LOCKED_PASS 0x00008000u
-#define WT_SESSION_LOCKED_SCHEMA 0x00010000u
-#define WT_SESSION_LOCKED_SLOT 0x00020000u
-#define WT_SESSION_LOCKED_TABLE_READ 0x00040000u
-#define WT_SESSION_LOCKED_TABLE_WRITE 0x00080000u
-#define WT_SESSION_LOCKED_TURTLE 0x00100000u
-#define WT_SESSION_LOGGING_INMEM 0x00200000u
-#define WT_SESSION_NO_DATA_HANDLES 0x00400000u
-#define WT_SESSION_NO_LOGGING 0x00800000u
-#define WT_SESSION_NO_RECONCILE 0x01000000u
-#define WT_SESSION_NO_SCHEMA_LOCK 0x02000000u
-#define WT_SESSION_QUIET_CORRUPT_FILE 0x04000000u
-#define WT_SESSION_READ_WONT_NEED 0x08000000u
-#define WT_SESSION_RESOLVING_TXN 0x10000000u
-#define WT_SESSION_ROLLBACK_TO_STABLE 0x20000000u
-#define WT_SESSION_SCHEMA_TXN 0x40000000u
-#define WT_SESSION_SERVER_ASYNC 0x80000000u
+#define WT_SESSION_IGNORE_CACHE_SIZE 0x00000020u
+#define WT_SESSION_INSTANTIATE_PREPARE 0x00000040u
+#define WT_SESSION_INTERNAL 0x00000080u
+#define WT_SESSION_LOCKED_CHECKPOINT 0x00000100u
+#define WT_SESSION_LOCKED_HANDLE_LIST_READ 0x00000200u
+#define WT_SESSION_LOCKED_HANDLE_LIST_WRITE 0x00000400u
+#define WT_SESSION_LOCKED_HOTBACKUP_READ 0x00000800u
+#define WT_SESSION_LOCKED_HOTBACKUP_WRITE 0x00001000u
+#define WT_SESSION_LOCKED_METADATA 0x00002000u
+#define WT_SESSION_LOCKED_PASS 0x00004000u
+#define WT_SESSION_LOCKED_SCHEMA 0x00008000u
+#define WT_SESSION_LOCKED_SLOT 0x00010000u
+#define WT_SESSION_LOCKED_TABLE_READ 0x00020000u
+#define WT_SESSION_LOCKED_TABLE_WRITE 0x00040000u
+#define WT_SESSION_LOCKED_TURTLE 0x00080000u
+#define WT_SESSION_LOGGING_INMEM 0x00100000u
+#define WT_SESSION_NO_DATA_HANDLES 0x00200000u
+#define WT_SESSION_NO_LOGGING 0x00400000u
+#define WT_SESSION_NO_RECONCILE 0x00800000u
+#define WT_SESSION_NO_SCHEMA_LOCK 0x01000000u
+#define WT_SESSION_QUIET_CORRUPT_FILE 0x02000000u
+#define WT_SESSION_READ_WONT_NEED 0x04000000u
+#define WT_SESSION_RESOLVING_TXN 0x08000000u
+#define WT_SESSION_ROLLBACK_TO_STABLE 0x10000000u
+#define WT_SESSION_SCHEMA_TXN 0x20000000u
+#define WT_SESSION_SERVER_ASYNC 0x40000000u
/* AUTOMATIC FLAG VALUE GENERATION STOP */
uint32_t flags;
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
index 9dc39f4d1b8..d2c13bbb3b6 100644
--- a/src/third_party/wiredtiger/src/include/txn.i
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -737,12 +737,10 @@ __wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd)
if (prepare_state == WT_PREPARE_LOCKED)
continue;
- if (F_ISSET(session, WT_SESSION_HS_IGNORE_VISIBILITY) && upd->txnid != WT_TXN_ABORTED &&
- upd->type == WT_UPDATE_STANDARD) {
- /* If we are resolving a modify then the btree must be the history store. */
- WT_ASSERT(session, WT_IS_HS(S2BT(session)));
+ if (WT_IS_HS(S2BT(session)) && upd->txnid != WT_TXN_ABORTED &&
+ upd->type == WT_UPDATE_STANDARD)
+ /* Entries in the history store are always visible. */
return (WT_VISIBLE_TRUE);
- }
upd_visible = __wt_txn_visible(session, upd->txnid, upd->start_ts);
@@ -860,9 +858,7 @@ __wt_txn_read_upd_list(
* Ignore non-globally visible tombstones when we are doing history store scans in
* rollback to stable or when we are told to.
*/
- if (type == WT_UPDATE_TOMBSTONE &&
- (F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE) ||
- (WT_IS_HS(S2BT(session)) && F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE))) &&
+ if (type == WT_UPDATE_TOMBSTONE && F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE) &&
!__wt_txn_upd_visible_all(session, upd)) {
cbt->upd_value->tw.durable_stop_ts = upd->durable_ts;
cbt->upd_value->tw.stop_ts = upd->start_ts;
@@ -953,8 +949,7 @@ __wt_txn_read(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint
* are told to ignore non-globally visible tombstones.
*/
if (__wt_txn_tw_stop_visible(session, &tw) &&
- ((!F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE) &&
- (!WT_IS_HS(S2BT(session)) || !F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE))) ||
+ (!F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE) ||
(__wt_txn_tw_stop_visible_all(session, &tw) && !WT_CURSOR_IS_DUMP(&cbt->iface)))) {
cbt->upd_value->buf.data = NULL;
cbt->upd_value->buf.size = 0;
@@ -975,13 +970,7 @@ __wt_txn_read(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint
}
/* If the start time point is visible then we need to return the ondisk value. */
- if (F_ISSET(session, WT_SESSION_HS_IGNORE_VISIBILITY) ||
- __wt_txn_tw_start_visible(session, &tw)) {
- /* If we are resolving a modify then the btree must be the history store. */
- WT_ASSERT(
- session, (F_ISSET(session, WT_SESSION_HS_IGNORE_VISIBILITY) && WT_IS_HS(S2BT(session))) ||
- !F_ISSET(session, WT_SESSION_HS_IGNORE_VISIBILITY));
-
+ if (WT_IS_HS(S2BT(session)) || __wt_txn_tw_start_visible(session, &tw)) {
if (cbt->upd_value->skip_buf) {
cbt->upd_value->buf.data = NULL;
cbt->upd_value->buf.size = 0;
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
index 54d281e06b9..b4a054a95b1 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
@@ -140,6 +140,9 @@ __rec_append_orig_value(
* timestamped globally visible tombstone because even if its timestamp is smaller than
* the entries in the history store, we can't change the history store entries. This is
* not correct but we hope we can get away with it.
+ *
+ * FIXME-WT-6171: remove this once we get rid of out of order timestamps and mixed mode
+ * transactions.
*/
if (unpack->tw.durable_stop_ts != WT_TS_NONE && tombstone_globally_visible)
return (0);
@@ -486,6 +489,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
select_tw->durable_start_ts = select_tw->durable_stop_ts;
select_tw->start_ts = select_tw->stop_ts;
+ select_tw->start_txn = select_tw->stop_txn;
}
/*
@@ -544,12 +548,9 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
* part of the page, and they are physically removed by checkpoint writing this page, that is,
* the checkpoint doesn't include the overflow blocks so they're removed and future readers of
* this page won't be able to find them.
- *
- * There is no need to append the original value for in memory databases as the onpage value
- * should be already on the update chain and there is no history store.
*/
- if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY) && upd_select->upd != NULL && vpack != NULL &&
- vpack->type != WT_CELL_DEL && (upd_saved || F_ISSET(vpack, WT_CELL_UNPACK_OVERFLOW)))
+ if (upd_select->upd != NULL && vpack != NULL && vpack->type != WT_CELL_DEL &&
+ (upd_saved || F_ISSET(vpack, WT_CELL_UNPACK_OVERFLOW)))
WT_ERR(__rec_append_orig_value(session, page, upd_select->upd, vpack));
__wt_time_window_clear_obsolete(
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index fd19865636f..7fcb1ef940f 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -920,14 +920,8 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit,
/*
* Scan the history store for the given btree and key with maximum start timestamp to let
- * the search point to the last version of the key. We must ignore tombstone in the history
- * store while retrieving the update from the history store to replace the update in the
- * data store. We also need to ignore visibility of the updates as we have already released
- * our snapshot in prepare. Otherwise, we can't see updates with non-globally visible
- * transaction ids.
+ * the search point to the last version of the key.
*/
- F_SET(hs_cursor, WT_CURSTD_IGNORE_TOMBSTONE);
- F_SET(session, WT_SESSION_HS_IGNORE_VISIBILITY);
WT_ERR_NOTFOUND_OK(
__wt_hs_cursor_position(session, hs_cursor, hs_btree_id, &op->u.op_row.key, WT_TS_MAX),
true);
@@ -1011,11 +1005,8 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit,
WT_ERR(__txn_fixup_prepared_update(session, hs_cursor, fix_upd, commit));
err:
- if (hs_cursor != NULL) {
- F_CLR(session, WT_SESSION_HS_IGNORE_VISIBILITY);
- F_CLR(hs_cursor, WT_CURSTD_IGNORE_TOMBSTONE);
+ if (hs_cursor != NULL)
ret = __wt_hs_cursor_close(session, session_flags, is_owner);
- }
if (!upd_appended)
__wt_free(session, fix_upd);
return (ret);
diff --git a/src/third_party/wiredtiger/test/format/config.h b/src/third_party/wiredtiger/test/format/config.h
index 80d2ab99cd4..d0023be05d2 100644
--- a/src/third_party/wiredtiger/test/format/config.h
+++ b/src/third_party/wiredtiger/test/format/config.h
@@ -103,7 +103,12 @@ static CONFIG c[] = {
{"btree.key_max", "maximum size of keys", 0x0, 20, 128, MEGABYTE(10), &g.c_key_max, NULL},
- {"btree.key_min", "minimum size of keys", 0x0, 10, 32, 256, &g.c_key_min, NULL},
+ /*
+ * A minimum key size of 11 is necessary. Row-store keys have a leading 10-digit number and the
+ * 11 guarantees we never see a key that we can't convert to a numeric value without formatting
+ * it first because there's a trailing non-digit character in every key.
+ */
+ {"btree.key_min", "minimum size of keys", 0x0, 11, 32, 256, &g.c_key_min, NULL},
{"btree.leaf_page_max", "maximum size of Btree leaf nodes", 0x0, 9, 17, 27, &g.c_leaf_page_max,
NULL},
diff --git a/src/third_party/wiredtiger/test/suite/test_compact02.py b/src/third_party/wiredtiger/test/suite/test_compact02.py
index c15fb5bc78b..e466aa81a95 100644
--- a/src/third_party/wiredtiger/test/suite/test_compact02.py
+++ b/src/third_party/wiredtiger/test/suite/test_compact02.py
@@ -149,11 +149,11 @@ class test_compact02(wttest.WiredTigerTestCase):
# Compact can collide with eviction, if that happens we retry. Wait for
# a long time, the check for EBUSY means we're not retrying on any real
# errors.
- for i in range(1, 60):
+ for i in range(1, 80):
if not self.raisesBusy(
lambda: self.session.compact(self.uri, None)):
break
- time.sleep(5)
+ time.sleep(6)
# 6. Get stats on compacted table.
sz = self.getSize()
diff --git a/src/third_party/wiredtiger/test/suite/test_hs11.py b/src/third_party/wiredtiger/test/suite/test_hs11.py
index f7d31c6796f..efc9d02401c 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs11.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs11.py
@@ -32,12 +32,13 @@ def timestamp_str(t):
return '%x' % t
# test_hs11.py
-# Ensure that mixed mode updates clear the history store records.
+# Ensure that when we delete a key due to a tombstone being globally visible, we delete its
+# associated history store content.
class test_hs11(wttest.WiredTigerTestCase):
conn_config = 'cache_size=50MB'
session_config = 'isolation=snapshot'
- def run_test(self, update_type):
+ def test_key_deletion_clears_hs(self):
uri = 'table:test_hs11'
create_params = 'key_format=S,value_format=S'
self.session.create(uri, create_params)
@@ -57,16 +58,14 @@ class test_hs11(wttest.WiredTigerTestCase):
# Reconcile and flush versions 1-3 to the history store.
self.session.checkpoint()
- # Apply a mixed mode update.
+ # Apply a non-timestamped tombstone. When the pages get evicted, the keys will get deleted
+ # since the tombstone is globally visible.
for i in range(1, 10000):
if i % 2 == 0:
- if update_type == 'deletion':
- cursor.set_key(str(i))
- cursor.remove()
- else:
- cursor[str(i)] = value2
+ cursor.set_key(str(i))
+ cursor.remove()
- # Now apply an update at timestamp 10.
+ # Now apply an update at timestamp 10 to recreate each key.
for i in range(1, 10000):
self.session.begin_transaction()
cursor[str(i)] = value2
@@ -77,17 +76,8 @@ class test_hs11(wttest.WiredTigerTestCase):
self.session.begin_transaction('read_timestamp=' + timestamp_str(ts))
for i in range(1, 10000):
if i % 2 == 0:
- if update_type == 'deletion':
- cursor.set_key(str(i))
- self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND)
- else:
- self.assertEqual(cursor[str(i)], value2)
+ cursor.set_key(str(i))
+ self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND)
else:
self.assertEqual(cursor[str(i)], value1)
self.session.rollback_transaction()
-
- def test_key_deletion_clears_hs(self):
- self.run_test('deletion')
-
- def test_key_update_clears_hs(self):
- self.run_test('update')
diff --git a/src/third_party/wiredtiger/test/suite/test_hs14.py b/src/third_party/wiredtiger/test/suite/test_hs14.py
new file mode 100644
index 00000000000..ebd5f471f2b
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_hs14.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2020 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import time, wiredtiger, wttest
+
+def timestamp_str(t):
+ return '%x' % t
+
+# test_hs14.py
+# Ensure that point in time reads with few visible history store records don't
+# damage performance.
+class test_hs14(wttest.WiredTigerTestCase):
+ conn_config = 'cache_size=50MB'
+ session_config = 'isolation=snapshot'
+
+ def test_hs14(self):
+ uri = 'table:test_hs14'
+ self.session.create(uri, 'key_format=S,value_format=S')
+ self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(1))
+ cursor = self.session.open_cursor(uri)
+
+ value1 = 'a' * 500
+ value2 = 'b' * 500
+ value3 = 'c' * 500
+ value4 = 'd' * 500
+ value5 = 'e' * 500
+
+ for i in range(1, 10000):
+ self.session.begin_transaction()
+ cursor[str(i)] = value1
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(2))
+ self.session.begin_transaction()
+ cursor[str(i)] = value2
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(2))
+ self.session.begin_transaction()
+ cursor[str(i)] = value3
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(3))
+ self.session.begin_transaction()
+ cursor[str(i)] = value4
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(4))
+
+ start = time.time()
+ self.session.begin_transaction('read_timestamp=' + timestamp_str(3))
+ for i in range(1, 10000):
+ self.assertEqual(cursor[str(i)], value3)
+ self.session.rollback_transaction()
+ end = time.time()
+
+ # The time spent when all history store keys are visible to us.
+ visible_hs_latency = (end - start)
+
+ for i in range(1, 10000):
+ self.session.begin_transaction()
+ cursor.set_key(str(i))
+ cursor.remove()
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(5))
+ self.session.begin_transaction()
+ cursor[str(i)] = value5
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(10))
+
+ start = time.time()
+ self.session.begin_transaction('read_timestamp=' + timestamp_str(9))
+ for i in range(1, 10000):
+ cursor.set_key(str(i))
+ self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND)
+ self.session.rollback_transaction()
+ end = time.time()
+
+ # The time spent when all history store keys are invisible to us.
+ invisible_hs_latency = (end - start)
+
+ self.assertLess(invisible_hs_latency, (visible_hs_latency * 10),
+ "Reader took an order of magnitude longer for when all "
+ "history store records were invisible, visible={}, invisible={}".format(
+ visible_hs_latency, invisible_hs_latency
+ ))