summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuke Chen <luke.chen@mongodb.com>2020-11-20 17:30:42 +1100
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2020-11-20 07:24:19 +0000
commitd0dfd00a48e83bc0e7e17d938f2578970a329304 (patch)
tree07a9a582d7fbfec81e6e82d2ee4b49e6d50d9362
parentefb1467bec93d1de4aff70e817e10645f2ebfb3f (diff)
downloadmongo-d0dfd00a48e83bc0e7e17d938f2578970a329304.tar.gz
Import wiredtiger: b22e16b7643e0e07c784962899b3a45728536947 from branch mongodb-5.0
ref: d05021d0ee..b22e16b764 for: 4.9.0 WT-6563 Create a reproducer for invalid modification application WT-6672 Don't increase the writegen number until RTS cleans up the checkpoint WT-6859 Implement search_near method for the history store cursor
-rw-r--r--src/third_party/wiredtiger/dist/api_data.py9
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/src/config/config_def.c18
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_api.c9
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_dhandle.c31
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_hs.c331
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_std.c31
-rw-r--r--src/third_party/wiredtiger/src/include/cursor.h10
-rw-r--r--src/third_party/wiredtiger/src/include/extern.h1
-rw-r--r--src/third_party/wiredtiger/src/include/wiredtiger.in55
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_recover.c16
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c11
-rw-r--r--src/third_party/wiredtiger/test/suite/test_hs19.py156
13 files changed, 639 insertions, 41 deletions
diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py
index 14132297f13..c76938f7919 100644
--- a/src/third_party/wiredtiger/dist/api_data.py
+++ b/src/third_party/wiredtiger/dist/api_data.py
@@ -1264,6 +1264,15 @@ methods = {
cursor without taking a lock, returning EBUSY if the operation
conflicts with a running checkpoint''',
type='boolean', undoc=True),
+ Config('debug', '', r'''
+ configure debug specific behavior on a cursor. Generally only
+ used for internal testing purposes''',
+ type='category', subconfig=[
+ Config('release_evict', 'false', r'''
+ Configure the cursor to evict the page positioned on when the
+ reset API is used''',
+ type='boolean')
+ ]),
Config('dump', '', r'''
configure the cursor for dump format inputs and outputs: "hex"
selects a simple hexadecimal format, "json" selects a JSON format
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index f1d1c774bd4..8dceff68b67 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-5.0",
- "commit": "d05021d0ee6243aab343910a9ba7a8edb2e8a8c2"
+ "commit": "b22e16b7643e0e07c784962899b3a45728536947"
}
diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c
index 7ec7b27170c..55c59fee565 100644
--- a/src/third_party/wiredtiger/src/config/config_def.c
+++ b/src/third_party/wiredtiger/src/config/config_def.c
@@ -297,6 +297,9 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_log_flush[] = {
{"sync", "string", NULL, "choices=[\"background\",\"off\",\"on\"]", NULL, 0},
{NULL, NULL, NULL, NULL, NULL, 0}};
+static const WT_CONFIG_CHECK confchk_WT_SESSION_open_cursor_debug_subconfigs[] = {
+ {"release_evict", "boolean", NULL, NULL, NULL, 0}, {NULL, NULL, NULL, NULL, NULL, 0}};
+
static const WT_CONFIG_CHECK confchk_WT_SESSION_open_cursor_incremental_subconfigs[] = {
{"consolidate", "boolean", NULL, NULL, NULL, 0}, {"enabled", "boolean", NULL, NULL, NULL, 0},
{"file", "string", NULL, NULL, NULL, 0}, {"force_stop", "boolean", NULL, NULL, NULL, 0},
@@ -308,6 +311,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_open_cursor[] = {
{"append", "boolean", NULL, NULL, NULL, 0}, {"bulk", "string", NULL, NULL, NULL, 0},
{"checkpoint", "string", NULL, NULL, NULL, 0},
{"checkpoint_wait", "boolean", NULL, NULL, NULL, 0},
+ {"debug", "category", NULL, NULL, confchk_WT_SESSION_open_cursor_debug_subconfigs, 1},
{"dump", "string", NULL, "choices=[\"hex\",\"json\",\"pretty\",\"print\"]", NULL, 0},
{"incremental", "category", NULL, NULL, confchk_WT_SESSION_open_cursor_incremental_subconfigs, 7},
{"next_random", "boolean", NULL, NULL, NULL, 0},
@@ -924,13 +928,13 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator",
{"WT_SESSION.log_flush", "sync=on", confchk_WT_SESSION_log_flush, 1},
{"WT_SESSION.log_printf", "", NULL, 0},
{"WT_SESSION.open_cursor",
- "append=false,bulk=false,checkpoint=,checkpoint_wait=true,dump=,"
- "incremental=(consolidate=false,enabled=false,file=,"
- "force_stop=false,granularity=16MB,src_id=,this_id=),"
- "next_random=false,next_random_sample_size=0,overwrite=true,"
- "raw=false,read_once=false,readonly=false,skip_sort_check=false,"
- "statistics=,target=",
- confchk_WT_SESSION_open_cursor, 15},
+ "append=false,bulk=false,checkpoint=,checkpoint_wait=true,"
+ "debug=(release_evict=false),dump=,incremental=(consolidate=false"
+ ",enabled=false,file=,force_stop=false,granularity=16MB,src_id=,"
+ "this_id=),next_random=false,next_random_sample_size=0,"
+ "overwrite=true,raw=false,read_once=false,readonly=false,"
+ "skip_sort_check=false,statistics=,target=",
+ confchk_WT_SESSION_open_cursor, 16},
{"WT_SESSION.prepare_transaction", "prepare_timestamp=", confchk_WT_SESSION_prepare_transaction,
1},
{"WT_SESSION.query_timestamp", "get=read", confchk_WT_SESSION_query_timestamp, 1},
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
index 9b5e2394b61..f3da1531709 100644
--- a/src/third_party/wiredtiger/src/conn/conn_api.c
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -2740,7 +2740,14 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c
WT_ERR(wt_session->salvage(wt_session, WT_METAFILE_URI, NULL));
}
- /* Initialize the connection's base write generation. */
+ /*
+ * Initialize the connection's base write generation.
+ *
+ * We'll write over this value after performing rollback to stable; however, we need to set it
+ * here. The logic below will involve opening up the metadata file and if the connection-wide
+ * base write generation is uninitialized, we'll tag the btree with the wrong base write gen and
+ * incorrectly interpret transaction ids during rollback to stable.
+ */
WT_ERR(__wt_metadata_init_base_write_gen(session));
WT_ERR(__wt_metadata_cursor(session, NULL));
diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
index 16a9d7812b5..8727ed1b18b 100644
--- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c
+++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
@@ -848,6 +848,37 @@ restart:
}
/*
+ * __wt_dhandle_update_write_gens --
+ * Walk every data handle on the connection's handle queue and set the btree's write
+ * generation, base write generation and run write generation to the larger of the btree's
+ * current write generation and the connection's base write generation.
+ *
+ * NOTE(review): only the step to the next handle is wrapped in the handle list write lock
+ * macro; the generation update itself runs outside it -- confirm that is intended.
+ * NOTE(review): every handle is cast to a btree and asserted non-NULL, which presumes only
+ * btree handles are on the list when this is called -- confirm against the caller.
+ */
+void
+__wt_dhandle_update_write_gens(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+
+ conn = S2C(session);
+
+ for (dhandle = NULL;;) {
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session, WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q));
+ if (dhandle == NULL)
+ break;
+ btree = (WT_BTREE *)dhandle->handle;
+
+ WT_ASSERT(session, btree != NULL);
+
+ /*
+ * Initialize the btree's write generation numbers and runtime write generation after
+ * rollback to stable so that the transaction ids of the pages will be reset when loaded
+ * from disk to memory. The btree's own write generation is kept if it is already larger
+ * than the connection's base write generation.
+ */
+ btree->write_gen = btree->base_write_gen = btree->run_write_gen =
+ WT_MAX(btree->write_gen, conn->base_write_gen);
+ }
+}
+
+/*
* __wt_verbose_dump_handles --
* Dump information about all data handles.
*/
diff --git a/src/third_party/wiredtiger/src/cursor/cur_hs.c b/src/third_party/wiredtiger/src/cursor/cur_hs.c
index 500b9208b98..923b9941d0e 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_hs.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_hs.c
@@ -152,6 +152,7 @@ __curhs_close(WT_CURSOR *cursor)
WT_CURSOR *file_cursor;
WT_CURSOR_HS *hs_cursor;
WT_DECL_RET;
+ WT_ITEM *datastore_key;
WT_SESSION_IMPL *session;
hs_cursor = (WT_CURSOR_HS *)cursor;
@@ -161,6 +162,8 @@ __curhs_close(WT_CURSOR *cursor)
err:
if (file_cursor != NULL)
WT_TRET(file_cursor->close(file_cursor));
+ datastore_key = &hs_cursor->datastore_key;
+ __wt_scr_free(session, &datastore_key);
__wt_cursor_close(cursor);
API_END_RET(session, ret);
@@ -185,6 +188,10 @@ __curhs_reset(WT_CURSOR *cursor)
ret = file_cursor->reset(file_cursor);
F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
WT_TIME_WINDOW_INIT(&hs_cursor->time_window);
+ hs_cursor->btree_id = 0;
+ hs_cursor->datastore_key.data = NULL;
+ hs_cursor->datastore_key.size = 0;
+ hs_cursor->flags = 0;
err:
API_END_RET(session, ret);
@@ -199,15 +206,327 @@ __curhs_set_key(WT_CURSOR *cursor, ...)
{
WT_CURSOR *file_cursor;
WT_CURSOR_HS *hs_cursor;
+ WT_ITEM *datastore_key;
+ WT_SESSION_IMPL *session;
+ wt_timestamp_t start_ts;
+ uint64_t counter;
+ uint32_t arg_count;
va_list ap;
hs_cursor = (WT_CURSOR_HS *)cursor;
file_cursor = hs_cursor->file_cursor;
+ session = CUR2S(cursor);
+ start_ts = WT_TS_NONE;
+ counter = 0;
va_start(ap, cursor);
- file_cursor->set_key(file_cursor, va_arg(ap, uint32_t), va_arg(ap, WT_ITEM *),
- va_arg(ap, wt_timestamp_t), va_arg(ap, uint64_t));
+ arg_count = va_arg(ap, uint32_t);
+
+ WT_ASSERT(session, arg_count >= 1 && arg_count <= 4);
+
+ hs_cursor->btree_id = va_arg(ap, uint32_t);
+ F_SET(hs_cursor, WT_HS_CUR_BTREE_ID_SET);
+ if (arg_count > 1) {
+ datastore_key = va_arg(ap, WT_ITEM *);
+ WT_IGNORE_RET(__wt_buf_set(
+ session, &hs_cursor->datastore_key, datastore_key->data, datastore_key->size));
+ F_SET(hs_cursor, WT_HS_CUR_KEY_SET);
+ } else {
+ hs_cursor->datastore_key.data = NULL;
+ hs_cursor->datastore_key.size = 0;
+ F_CLR(hs_cursor, WT_HS_CUR_KEY_SET);
+ }
+
+ if (arg_count > 2) {
+ start_ts = va_arg(ap, wt_timestamp_t);
+ F_SET(hs_cursor, WT_HS_CUR_TS_SET);
+ } else
+ F_CLR(hs_cursor, WT_HS_CUR_TS_SET);
+
+ if (arg_count > 3) {
+ counter = va_arg(ap, uint64_t);
+ F_SET(hs_cursor, WT_HS_CUR_COUNTER_SET);
+ } else
+ F_CLR(hs_cursor, WT_HS_CUR_COUNTER_SET);
+
va_end(ap);
+
+ file_cursor->set_key(
+ file_cursor, hs_cursor->btree_id, &hs_cursor->datastore_key, start_ts, counter);
+}
+
+/*
+ * __curhs_prev_visible --
+ * Check the visibility of the history store record the underlying file cursor is currently
+ * positioned on. If it is not visible, walk backward with __wt_hs_cursor_prev until a visible
+ * record is found. Returns 0 with the file cursor left on that record. Returns WT_NOTFOUND
+ * if the walk reaches a record for a different btree id, a record for a different data store
+ * key (when a key is set), or, with a key set, a record whose stop time point is visible.
+ * Any other non-zero value is whatever the cursor or comparison calls returned.
+ */
+static int
+__curhs_prev_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR *std_cursor;
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_ITEM(datastore_key);
+ WT_DECL_RET;
+ wt_timestamp_t start_ts;
+ uint64_t counter;
+ uint32_t btree_id;
+ int cmp;
+
+ file_cursor = hs_cursor->file_cursor;
+ std_cursor = (WT_CURSOR *)hs_cursor;
+ cbt = (WT_CURSOR_BTREE *)file_cursor;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &datastore_key));
+
+ for (; ret == 0; ret = __wt_hs_cursor_prev(session, file_cursor)) {
+ WT_ERR(file_cursor->get_key(file_cursor, &btree_id, &datastore_key, &start_ts, &counter));
+
+ /* Stop as soon as the walk reaches a record that belongs to a different btree id. */
+ if (F_ISSET(hs_cursor, WT_HS_CUR_BTREE_ID_SET) && btree_id != hs_cursor->btree_id) {
+ ret = WT_NOTFOUND;
+ goto done;
+ }
+
+ /*
+ * When a data store key was supplied, only records for exactly that key are of interest:
+ * the first record whose data store key differs from it ends the walk with WT_NOTFOUND.
+ */
+ if (F_ISSET(hs_cursor, WT_HS_CUR_KEY_SET)) {
+ WT_ERR(__wt_compare(session, NULL, datastore_key, &hs_cursor->datastore_key, &cmp));
+ if (cmp != 0) {
+ ret = WT_NOTFOUND;
+ goto done;
+ }
+ }
+
+ /*
+ * If the stop time pair on the tombstone in the history store is already globally visible
+ * we can skip it.
+ */
+ if (__wt_txn_tw_stop_visible_all(session, &cbt->upd_value->tw)) {
+ WT_STAT_CONN_INCR(session, cursor_prev_hs_tombstone);
+ WT_STAT_DATA_INCR(session, cursor_prev_hs_tombstone);
+ continue;
+ }
+
+ /*
+ * Don't check the visibility of the record if we want to read any history store record that
+ * is not obsolete.
+ */
+ if (F_ISSET(std_cursor, WT_CURSTD_HS_READ_COMMITTED))
+ break;
+
+ if (__wt_txn_tw_stop_visible(session, &cbt->upd_value->tw)) {
+ /*
+ * If the stop time point of a record is visible to us, we won't be able to see anything
+ * for this entire key. Without a key set, keep walking backward instead.
+ */
+ if (F_ISSET(hs_cursor, WT_HS_CUR_KEY_SET)) {
+ ret = WT_NOTFOUND;
+ goto done;
+ } else
+ continue;
+ }
+
+ /* If the start time point is visible to us, let's return that record. */
+ if (__wt_txn_tw_start_visible(session, &cbt->upd_value->tw))
+ break;
+ }
+
+done:
+err:
+ __wt_scr_free(session, &datastore_key);
+ return (ret);
+}
+
+/*
+ * __curhs_next_visible --
+ * Check the visibility of the history store record the underlying file cursor is currently
+ * positioned on. If it is not visible, walk forward with __wt_hs_cursor_next until a visible
+ * record is found. Returns 0 with the file cursor left on that record. Returns WT_NOTFOUND
+ * if the walk reaches a record for a different btree id or (when a key is set) a different
+ * data store key. Any other non-zero value is whatever the cursor or comparison calls
+ * returned.
+ */
+static int
+__curhs_next_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR *std_cursor;
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_ITEM(datastore_key);
+ WT_DECL_RET;
+ wt_timestamp_t start_ts;
+ uint64_t counter;
+ uint32_t btree_id;
+ int cmp;
+
+ file_cursor = hs_cursor->file_cursor;
+ std_cursor = (WT_CURSOR *)hs_cursor;
+ cbt = (WT_CURSOR_BTREE *)file_cursor;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &datastore_key));
+
+ for (; ret == 0; ret = __wt_hs_cursor_next(session, file_cursor)) {
+ WT_ERR(file_cursor->get_key(file_cursor, &btree_id, &datastore_key, &start_ts, &counter));
+
+ /* Stop as soon as the walk reaches a record that belongs to a different btree id. */
+ if (F_ISSET(hs_cursor, WT_HS_CUR_BTREE_ID_SET) && btree_id != hs_cursor->btree_id) {
+ ret = WT_NOTFOUND;
+ goto done;
+ }
+
+ /*
+ * When a data store key was supplied, only records for exactly that key are of interest:
+ * the first record whose data store key differs from it ends the walk with WT_NOTFOUND.
+ */
+ if (F_ISSET(hs_cursor, WT_HS_CUR_KEY_SET)) {
+ WT_ERR(__wt_compare(session, NULL, datastore_key, &hs_cursor->datastore_key, &cmp));
+ if (cmp != 0) {
+ ret = WT_NOTFOUND;
+ goto done;
+ }
+ }
+
+ /*
+ * If the stop time pair on the tombstone in the history store is already globally visible
+ * we can skip it.
+ */
+ if (__wt_txn_tw_stop_visible_all(session, &cbt->upd_value->tw)) {
+ WT_STAT_CONN_INCR(session, cursor_next_hs_tombstone);
+ WT_STAT_DATA_INCR(session, cursor_next_hs_tombstone);
+ continue;
+ }
+
+ /*
+ * Don't check the visibility of the record if we want to read any history store record that
+ * is not obsolete.
+ */
+ if (F_ISSET(std_cursor, WT_CURSTD_HS_READ_COMMITTED))
+ break;
+
+ /*
+ * If the stop time point of a record is visible to us, check the next one.
+ */
+ if (__wt_txn_tw_stop_visible(session, &cbt->upd_value->tw))
+ continue;
+
+ /* If the start time point is visible to us, let's return that record. */
+ if (__wt_txn_tw_start_visible(session, &cbt->upd_value->tw))
+ break;
+ }
+
+done:
+err:
+ __wt_scr_free(session, &datastore_key);
+ return (ret);
+}
+
+/*
+ * __curhs_search_near --
+ *     WT_CURSOR->search_near method for the hs cursor type.
+ *
+ *     The search key is the key currently installed on the underlying file cursor; at least the
+ *     btree id must have been set on the hs cursor (asserted). When the counter field was not
+ *     set, the key is treated as the start of a range: the file cursor is moved to the first
+ *     record at or after the key and then forward to the first visible record. Otherwise the
+ *     file cursor is moved to the closest record at or before the key and then backward to the
+ *     first visible record.
+ *
+ *     On return *exactp describes how the record reached before the visibility walk compares
+ *     with the search key; it is not refreshed by the visibility walk. Returns WT_NOTFOUND when
+ *     no suitable record exists.
+ */
+static int
+__curhs_search_near(WT_CURSOR *cursor, int *exactp)
+{
+    WT_CURSOR *file_cursor;
+    WT_CURSOR_HS *hs_cursor;
+    WT_DECL_ITEM(srch_key);
+    WT_DECL_RET;
+    WT_SESSION_IMPL *session;
+    int cmp;
+    int exact;
+
+    hs_cursor = (WT_CURSOR_HS *)cursor;
+    file_cursor = hs_cursor->file_cursor;
+    *exactp = 0;
+    cmp = 0;
+
+    CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, search_near, CUR2BT(file_cursor));
+
+    WT_ERR(__wt_scr_alloc(session, 0, &srch_key));
+    /* At least we have the btree id set. */
+    WT_ASSERT(session, F_ISSET(hs_cursor, WT_HS_CUR_BTREE_ID_SET));
+    /* Keep a private copy of the search key: the file cursor's key changes as it moves. */
+    WT_ERR(__wt_buf_set(session, srch_key, file_cursor->key.data, file_cursor->key.size));
+    WT_ERR_NOTFOUND_OK(__wt_hs_cursor_search_near(session, file_cursor, &exact), true);
+
+    /* Empty history store is fine. */
+    if (ret == WT_NOTFOUND)
+        goto done;
+
+    /*
+     * There are some key fields missing so we are searching a range of keys. Place the cursor at
+     * the start of the range.
+     */
+    if (!F_ISSET(hs_cursor, WT_HS_CUR_COUNTER_SET)) {
+        /*
+         * If we raced with a history store insert, we may be two or more records away from our
+         * target. Keep iterating forwards until we are on or past our target key.
+         *
+         * We can't use the cursor positioning helper that we use for regular reads since that will
+         * place us at the end of a particular key/timestamp range whereas we want to be placed at
+         * the beginning.
+         */
+        if (exact < 0) {
+            while ((ret = __wt_hs_cursor_next(session, file_cursor)) == 0) {
+                WT_ERR(__wt_compare(session, NULL, &file_cursor->key, srch_key, &cmp));
+                if (cmp >= 0)
+                    break;
+            }
+            /* No entries greater than or equal to the key we searched for. */
+            WT_ERR_NOTFOUND_OK(ret, true);
+            if (ret == WT_NOTFOUND)
+                goto done;
+
+            *exactp = cmp;
+        } else
+            *exactp = 1;
+
+        WT_ERR(__curhs_next_visible(session, hs_cursor));
+    }
+    /* Search the closest match that is smaller or equal to the search key. */
+    else {
+        /*
+         * Because of the special visibility rules for the history store, a new key can appear in
+         * between our search and the set of updates that we're interested in. Keep trying until we
+         * find it.
+         *
+         * There may be no history store entries for the given btree id and record key if they have
+         * been removed by rollback to stable.
+         *
+         * Note that we need to compare the raw key off the cursor to determine where we are in the
+         * history store as opposed to comparing the embedded data store key since the ordering is
+         * not guaranteed to be the same.
+         */
+        if (exact > 0) {
+            /*
+             * It's possible that we may race with a history store insert for another key. So we may
+             * be more than one record away from the end of our target key/timestamp range. Keep
+             * iterating backwards until we land on our key.
+             *
+             * The method is invoked on the file cursor itself: handing it the wrapping hs cursor
+             * would make the file cursor's implementation operate on the wrong cursor object.
+             */
+            while ((ret = file_cursor->prev(file_cursor)) == 0) {
+                WT_STAT_CONN_INCR(session, cursor_skip_hs_cur_position);
+                WT_STAT_DATA_INCR(session, cursor_skip_hs_cur_position);
+
+                WT_ERR(__wt_compare(session, NULL, &file_cursor->key, srch_key, &cmp));
+                if (cmp <= 0)
+                    break;
+            }
+            /*
+             * No entries smaller than or equal to the key we searched for. Mirror the forward
+             * branch: propagate real errors and stop on not-found rather than walking from an
+             * unpositioned cursor.
+             */
+            WT_ERR_NOTFOUND_OK(ret, true);
+            if (ret == WT_NOTFOUND)
+                goto done;
+
+            *exactp = cmp;
+        } else
+            *exactp = -1;
+#ifdef HAVE_DIAGNOSTIC
+        if (ret == 0) {
+            WT_ERR(__wt_compare(session, NULL, &file_cursor->key, srch_key, &cmp));
+            WT_ASSERT(session, cmp <= 0);
+        }
+#endif
+
+        WT_ERR(__curhs_prev_visible(session, hs_cursor));
+    }
+
+done:
+err:
+    __wt_scr_free(session, &srch_key);
+    API_END_RET(session, ret);
}
/*
@@ -356,7 +675,7 @@ __wt_curhs_open(WT_SESSION_IMPL *session, WT_CURSOR *owner, WT_CURSOR **cursorp)
__wt_cursor_notsup, /* prev */
__curhs_reset, /* reset */
__wt_cursor_notsup, /* search */
- __wt_cursor_search_near_notsup, /* search-near */
+ __curhs_search_near, /* search-near */
__curhs_insert, /* insert */
__wt_cursor_modify_value_format_notsup, /* modify */
__wt_cursor_notsup, /* update */
@@ -369,6 +688,7 @@ __wt_curhs_open(WT_SESSION_IMPL *session, WT_CURSOR *owner, WT_CURSOR **cursorp)
WT_CURSOR *cursor;
WT_CURSOR_HS *hs_cursor;
WT_DECL_RET;
+ WT_ITEM *datastore_key;
WT_RET(__wt_calloc_one(session, &hs_cursor));
cursor = (WT_CURSOR *)hs_cursor;
@@ -381,6 +701,11 @@ __wt_curhs_open(WT_SESSION_IMPL *session, WT_CURSOR *owner, WT_CURSOR **cursorp)
WT_ERR(__hs_cursor_open_int(session, &hs_cursor->file_cursor));
WT_ERR(__wt_cursor_init(cursor, WT_HS_URI, owner, NULL, cursorp));
+ WT_TIME_WINDOW_INIT(&hs_cursor->time_window);
+ hs_cursor->btree_id = 0;
+ datastore_key = &hs_cursor->datastore_key;
+ WT_ERR(__wt_scr_alloc(session, 0, &datastore_key));
+ hs_cursor->flags = 0;
WT_TIME_WINDOW_INIT(&hs_cursor->time_window);
diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c
index 364a6dffbdb..92b8e5c5b3e 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_std.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_std.c
@@ -979,6 +979,33 @@ err:
}
/*
+ * __cursor_config_debug --
+ * Set configuration options for debug category. Looks up "debug.release_evict" in the
+ * supplied configuration and sets or clears WT_CURSTD_DEBUG_RESET_EVICT on the cursor to
+ * match. A not-found lookup is tolerated; any other lookup error is returned.
+ */
+static int
+__cursor_config_debug(WT_CURSOR *cursor, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ /*
+ * Debug options. Special handling for options that aren't found - since reconfigure passes in
+ * just the single configuration string, not the stack.
+ *
+ * NOTE(review): the lookup is given a default of 0; if the helper reports success with that
+ * default when the key is absent, a reconfigure that omits "debug" would clear the flag --
+ * confirm against __wt_config_gets_def.
+ */
+ if ((ret = __wt_config_gets_def(session, cfg, "debug.release_evict", 0, &cval)) == 0) {
+ if (cval.val)
+ F_SET(cursor, WT_CURSTD_DEBUG_RESET_EVICT);
+ else
+ F_CLR(cursor, WT_CURSTD_DEBUG_RESET_EVICT);
+ } else
+ WT_RET_NOTFOUND_OK(ret);
+ return (0);
+}
+
+/*
* __wt_cursor_reconfigure --
* Set runtime-configurable settings.
*/
@@ -988,6 +1015,7 @@ __wt_cursor_reconfigure(WT_CURSOR *cursor, const char *config)
WT_CONFIG_ITEM cval;
WT_DECL_RET;
WT_SESSION_IMPL *session;
+ const char *cfg[] = {config, NULL};
CURSOR_API_CALL(cursor, session, reconfigure, NULL);
@@ -1018,6 +1046,8 @@ __wt_cursor_reconfigure(WT_CURSOR *cursor, const char *config)
} else
WT_ERR_NOTFOUND_OK(ret, false);
+ WT_ERR(__cursor_config_debug(cursor, cfg));
+
err:
API_END_RET(session, ret);
}
@@ -1110,6 +1140,7 @@ __wt_cursor_init(
cursor->update = __wt_cursor_notsup;
F_CLR(cursor, WT_CURSTD_CACHEABLE);
}
+ WT_RET(__cursor_config_debug(cursor, cfg));
/*
* dump If an index cursor is opened with dump, then this function is called on the index files,
diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h
index 2edff95e833..9a737ed158e 100644
--- a/src/third_party/wiredtiger/src/include/cursor.h
+++ b/src/third_party/wiredtiger/src/include/cursor.h
@@ -287,6 +287,16 @@ struct __wt_cursor_hs {
WT_CURSOR *file_cursor; /* Queries of regular history store data */
WT_TIME_WINDOW time_window;
+ uint32_t btree_id;
+ WT_ITEM datastore_key;
+
+ /* AUTOMATIC FLAG VALUE GENERATION START */
+#define WT_HS_CUR_BTREE_ID_SET 0x1u
+#define WT_HS_CUR_COUNTER_SET 0x2u
+#define WT_HS_CUR_KEY_SET 0x4u
+#define WT_HS_CUR_TS_SET 0x8u
+ /* AUTOMATIC FLAG VALUE GENERATION STOP */
+ uint8_t flags;
};
struct __wt_cursor_index {
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index d902d72ff01..cf8bcdd353e 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -1652,6 +1652,7 @@ extern void __wt_curstat_cache_walk(WT_SESSION_IMPL *session);
extern void __wt_curstat_dsrc_final(WT_CURSOR_STAT *cst);
extern void __wt_curtable_set_key(WT_CURSOR *cursor, ...);
extern void __wt_curtable_set_value(WT_CURSOR *cursor, ...);
+extern void __wt_dhandle_update_write_gens(WT_SESSION_IMPL *session);
extern void __wt_encrypt_size(
WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t incoming_size, size_t *sizep);
extern void __wt_err_func(
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index f5568a8b7e1..f44b4e188c2 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -699,30 +699,31 @@ struct __wt_cursor {
const char *internal_uri;
/* AUTOMATIC FLAG VALUE GENERATION START */
-#define WT_CURSTD_APPEND 0x000001u
-#define WT_CURSTD_BULK 0x000002u
-#define WT_CURSTD_CACHEABLE 0x000004u
-#define WT_CURSTD_CACHED 0x000008u
-#define WT_CURSTD_DEAD 0x000010u
-#define WT_CURSTD_DEBUG_COPY_KEY 0x000020u
-#define WT_CURSTD_DEBUG_COPY_VALUE 0x000040u
-#define WT_CURSTD_DEBUG_RESET_EVICT 0x000080u
-#define WT_CURSTD_DUMP_HEX 0x000100u
-#define WT_CURSTD_DUMP_JSON 0x000200u
-#define WT_CURSTD_DUMP_PRETTY 0x000400u
-#define WT_CURSTD_DUMP_PRINT 0x000800u
-#define WT_CURSTD_IGNORE_TOMBSTONE 0x001000u
-#define WT_CURSTD_JOINED 0x002000u
-#define WT_CURSTD_KEY_EXT 0x004000u /* Key points out of tree. */
-#define WT_CURSTD_KEY_INT 0x008000u /* Key points into tree. */
-#define WT_CURSTD_META_INUSE 0x010000u
-#define WT_CURSTD_OPEN 0x020000u
-#define WT_CURSTD_OVERWRITE 0x040000u
-#define WT_CURSTD_RAW 0x080000u
-#define WT_CURSTD_RAW_SEARCH 0x100000u
-#define WT_CURSTD_UPDATE_LOCAL 0x200000u
-#define WT_CURSTD_VALUE_EXT 0x400000u /* Value points out of tree. */
-#define WT_CURSTD_VALUE_INT 0x800000u /* Value points into tree. */
+#define WT_CURSTD_APPEND 0x0000001u
+#define WT_CURSTD_BULK 0x0000002u
+#define WT_CURSTD_CACHEABLE 0x0000004u
+#define WT_CURSTD_CACHED 0x0000008u
+#define WT_CURSTD_DEAD 0x0000010u
+#define WT_CURSTD_DEBUG_COPY_KEY 0x0000020u
+#define WT_CURSTD_DEBUG_COPY_VALUE 0x0000040u
+#define WT_CURSTD_DEBUG_RESET_EVICT 0x0000080u
+#define WT_CURSTD_DUMP_HEX 0x0000100u
+#define WT_CURSTD_DUMP_JSON 0x0000200u
+#define WT_CURSTD_DUMP_PRETTY 0x0000400u
+#define WT_CURSTD_DUMP_PRINT 0x0000800u
+#define WT_CURSTD_HS_READ_COMMITTED 0x0001000u
+#define WT_CURSTD_IGNORE_TOMBSTONE 0x0002000u
+#define WT_CURSTD_JOINED 0x0004000u
+#define WT_CURSTD_KEY_EXT 0x0008000u /* Key points out of tree. */
+#define WT_CURSTD_KEY_INT 0x0010000u /* Key points into tree. */
+#define WT_CURSTD_META_INUSE 0x0020000u
+#define WT_CURSTD_OPEN 0x0040000u
+#define WT_CURSTD_OVERWRITE 0x0080000u
+#define WT_CURSTD_RAW 0x0100000u
+#define WT_CURSTD_RAW_SEARCH 0x0200000u
+#define WT_CURSTD_UPDATE_LOCAL 0x0400000u
+#define WT_CURSTD_VALUE_EXT 0x0800000u /* Value points out of tree. */
+#define WT_CURSTD_VALUE_INT 0x1000000u /* Value points into tree. */
/* AUTOMATIC FLAG VALUE GENERATION STOP */
#define WT_CURSTD_KEY_SET (WT_CURSTD_KEY_EXT | WT_CURSTD_KEY_INT)
#define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT)
@@ -867,6 +868,12 @@ struct __wt_session {
* @config{checkpoint, the name of a checkpoint to open (the reserved name
* "WiredTigerCheckpoint" opens the most recent internal checkpoint taken for the object).
* The cursor does not support data modification., a string; default empty.}
+ * @config{debug = (, configure debug specific behavior on a cursor. Generally only used
+ * for internal testing purposes., a set of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;release_evict, Configure the cursor to evict the page
+ * positioned on when the reset API is used., a boolean flag; default \c false.}
+ * @config{
+ * ),,}
* @config{dump, configure the cursor for dump format inputs and outputs: "hex" selects a
* simple hexadecimal format\, "json" selects a JSON format with each record formatted as
* fields named by column names if available\, "pretty" selects a human-readable format
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index 6e042fc9b33..c08c5d457b7 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -548,8 +548,7 @@ err:
/*
* __recovery_setup_file --
- * Set up the recovery slot for a file, track the largest file ID, and update the base write gen
- * based on the file's configuration.
+ * Set up the recovery slot for a file and track the largest file ID.
*/
static int
__recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config)
@@ -595,8 +594,7 @@ __recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config)
(WT_IS_MAX_LSN(&r->max_ckpt_lsn) || __wt_log_cmp(&lsn, &r->max_ckpt_lsn) > 0))
WT_ASSIGN_LSN(&r->max_ckpt_lsn, &lsn);
- /* Update the base write gen based on this file's configuration. */
- return (__wt_metadata_update_base_write_gen(r->session, config));
+ return (0);
}
/*
@@ -979,6 +977,16 @@ done:
*/
WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));
+ /* Initialize the connection's base write generation after rollback to stable. */
+ WT_ERR(__wt_metadata_init_base_write_gen(session));
+
+ /*
+ * Update the open dhandles' write generations and base write generations with the connection's
+ * base write generation. The write generations of the pages which are on disk will be
+ * initialized when they are loaded into the cache.
+ */
+ __wt_dhandle_update_write_gens(session);
+
/*
* If we're downgrading and have newer log files, force an archive, no matter what the archive
* setting is.
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index 90ec9389deb..ce904576651 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -333,7 +333,16 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
if (valid_update_found) {
WT_ERR(__wt_upd_alloc(session, &full_value, WT_UPDATE_STANDARD, &upd, NULL));
- upd->txnid = cbt->upd_value->tw.start_txn;
+ /*
+ * Set the transaction id of updates to WT_TXN_NONE when called from recovery, because
+ * the connection's write generation will be initialized after rollback to stable and the
+ * updates in the cache will be problematic. The transaction id of pages which are on
+ * disk will be automatically reset as part of unpacking the cell when loaded to cache.
+ */
+ if (F_ISSET(S2C(session), WT_CONN_RECOVERING))
+ upd->txnid = WT_TXN_NONE;
+ else
+ upd->txnid = cbt->upd_value->tw.start_txn;
upd->durable_ts = cbt->upd_value->tw.durable_start_ts;
upd->start_ts = cbt->upd_value->tw.start_ts;
__wt_verbose(session, WT_VERB_RTS,
diff --git a/src/third_party/wiredtiger/test/suite/test_hs19.py b/src/third_party/wiredtiger/test/suite/test_hs19.py
new file mode 100644
index 00000000000..2c87d6243f3
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_hs19.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2020 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import time, wiredtiger, wttest
+
+def timestamp_str(t):
+ return '%x' % t
+
+# test_hs19.py
+# Ensure eviction doesn't clear the history store again after checkpoint has done so because of the same update without timestamp.
+class test_hs19(wttest.WiredTigerTestCase):
+ conn_config = 'cache_size=5MB,eviction=(threads_max=1)'
+ session_config = 'isolation=snapshot'
+
+ def test_hs19(self):
+ uri = 'table:test_hs19'
+ junk_uri = 'table:junk'
+ self.session.create(uri, 'key_format=S,value_format=S')
+ session2 = self.conn.open_session()
+ session2.create(junk_uri, 'key_format=S,value_format=S')
+ cursor2 = session2.open_cursor(junk_uri)
+ cursor = self.session.open_cursor(uri)
+ self.conn.set_timestamp(
+ 'oldest_timestamp=' + timestamp_str(1) + ',stable_timestamp=' + timestamp_str(1))
+
+ value1 = 'a' * 500
+ value2 = 'b' * 500
+ value3 = 'c' * 50000
+
+ # Insert an update without timestamp.
+ self.session.begin_transaction()
+ cursor[str(0)] = value1
+ self.session.commit_transaction()
+
+ # Do 2 modifies, committed at timestamps 1 and 2.
+ self.session.begin_transaction()
+ cursor.set_key(str(0))
+ mods = [wiredtiger.Modify('B', 100, 1)]
+ self.assertEqual(cursor.modify(mods), 0)
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(1))
+
+ self.session.begin_transaction()
+ cursor.set_key(str(0))
+ mods = [wiredtiger.Modify('C', 101, 1)]
+ self.assertEqual(cursor.modify(mods), 0)
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(2))
+
+ # Start a transaction to pin back the reconciliation last running value.
+ session2.begin_transaction()
+ cursor2[str(1)] = value3
+
+ # Insert a modify ahead of our reconstructed modify, this one will be used unintentionally
+ # to reconstruct the final value, corrupting the resulting value.
+ # The 0 at the end of the modify call indicates how many bytes to replace, we keep
+ # it as 0 here to not overwrite any of the existing value.
+ self.session.begin_transaction()
+ cursor.set_key(str(0))
+ mods = [wiredtiger.Modify('AAAAAAAAAA', 102, 0)]
+ self.assertEqual(cursor.modify(mods), 0)
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(3))
+
+ # Insert a modify to get written as the on disk value by checkpoint.
+ self.session.begin_transaction()
+ cursor.set_key(str(0))
+ mods = [wiredtiger.Modify('D', 102, 1)]
+ self.assertEqual(cursor.modify(mods), 0)
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(4))
+
+ # Checkpoint such that all modifies get written out to the history store and the latest
+ # modify gets written to the on disk value.
+ self.session.checkpoint('use_timestamp=true')
+
+ # Add an additional modify so that when eviction sees this page it will rewrite it as it's
+ # dirty.
+ self.session.begin_transaction()
+ cursor.set_key(str(0))
+ mods = [wiredtiger.Modify('E', 103, 1)]
+ self.assertEqual(cursor.modify(mods), 0)
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(5))
+
+ # First reset the original cursor, so the page can be evicted.
+ cursor.reset()
+ evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)")
+ # Search for the key so we position our cursor on the page that we want to evict.
+ evict_cursor.set_key(str(0))
+ evict_cursor.search()
+ evict_cursor.reset()
+ evict_cursor.close()
+
+ # Construct and test the value as at timestamp 1
+ expected = list(value1)
+ expected[100] = 'B'
+ expected = str().join(expected)
+
+ # Retrieve the value at timestamp 1.
+ self.session.begin_transaction('read_timestamp=' + timestamp_str(1))
+ cursor.set_key(str(0))
+ cursor.search()
+
+ # Assert that it matches our expected value.
+ self.assertEqual(cursor.get_value(), expected)
+ self.session.rollback_transaction()
+
+ # Construct and test the value as at timestamp 2
+ expected = list(expected)
+ expected[101] = 'C'
+ expected = str().join(expected)
+
+ # Retrieve the value at timestamp 2.
+ self.session.begin_transaction('read_timestamp=' + timestamp_str(2))
+ cursor.set_key(str(0))
+ cursor.search()
+
+ # Assert that it matches our expected value.
+ self.assertEqual(cursor.get_value(), expected)
+ self.session.rollback_transaction()
+
+ # Construct and test the value as at timestamp 3
+ expected = list(expected)
+ for x in range(10):
+ expected[102 + x] = 'A'
+ expected.append('a')
+ expected = str().join(expected)
+
+ # Retrieve the value at timestamp 3.
+ self.session.begin_transaction('read_timestamp=' + timestamp_str(3))
+ cursor.set_key(str(0))
+ cursor.search()
+ # Assert that it matches our expected value.
+ self.assertEqual(cursor.get_value(), expected)
+ self.session.rollback_transaction()