diff options
author | Luke Chen <luke.chen@mongodb.com> | 2020-11-20 17:30:42 +1100 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-11-20 07:24:19 +0000 |
commit | d0dfd00a48e83bc0e7e17d938f2578970a329304 (patch) | |
tree | 07a9a582d7fbfec81e6e82d2ee4b49e6d50d9362 /src/third_party/wiredtiger/src | |
parent | efb1467bec93d1de4aff70e817e10645f2ebfb3f (diff) | |
download | mongo-d0dfd00a48e83bc0e7e17d938f2578970a329304.tar.gz |
Import wiredtiger: b22e16b7643e0e07c784962899b3a45728536947 from branch mongodb-5.0
ref: d05021d0ee..b22e16b764
for: 4.9.0
WT-6563 Create a reproducer for invalid modification application
WT-6672 Don't increase the writegen number until RTS cleans up the checkpoint
WT-6859 Implement search_near method for the history store cursor
Diffstat (limited to 'src/third_party/wiredtiger/src')
-rw-r--r-- | src/third_party/wiredtiger/src/config/config_def.c | 18 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/conn/conn_api.c | 9 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/conn/conn_dhandle.c | 31 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/cursor/cur_hs.c | 331 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/cursor/cur_std.c | 31 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/cursor.h | 10 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/extern.h | 1 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/wiredtiger.in | 55 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/txn/txn_recover.c | 16 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c | 11 |
10 files changed, 473 insertions, 40 deletions
diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index 7ec7b27170c..55c59fee565 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -297,6 +297,9 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_log_flush[] = { {"sync", "string", NULL, "choices=[\"background\",\"off\",\"on\"]", NULL, 0}, {NULL, NULL, NULL, NULL, NULL, 0}}; +static const WT_CONFIG_CHECK confchk_WT_SESSION_open_cursor_debug_subconfigs[] = { + {"release_evict", "boolean", NULL, NULL, NULL, 0}, {NULL, NULL, NULL, NULL, NULL, 0}}; + static const WT_CONFIG_CHECK confchk_WT_SESSION_open_cursor_incremental_subconfigs[] = { {"consolidate", "boolean", NULL, NULL, NULL, 0}, {"enabled", "boolean", NULL, NULL, NULL, 0}, {"file", "string", NULL, NULL, NULL, 0}, {"force_stop", "boolean", NULL, NULL, NULL, 0}, @@ -308,6 +311,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_open_cursor[] = { {"append", "boolean", NULL, NULL, NULL, 0}, {"bulk", "string", NULL, NULL, NULL, 0}, {"checkpoint", "string", NULL, NULL, NULL, 0}, {"checkpoint_wait", "boolean", NULL, NULL, NULL, 0}, + {"debug", "category", NULL, NULL, confchk_WT_SESSION_open_cursor_debug_subconfigs, 1}, {"dump", "string", NULL, "choices=[\"hex\",\"json\",\"pretty\",\"print\"]", NULL, 0}, {"incremental", "category", NULL, NULL, confchk_WT_SESSION_open_cursor_incremental_subconfigs, 7}, {"next_random", "boolean", NULL, NULL, NULL, 0}, @@ -924,13 +928,13 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator", {"WT_SESSION.log_flush", "sync=on", confchk_WT_SESSION_log_flush, 1}, {"WT_SESSION.log_printf", "", NULL, 0}, {"WT_SESSION.open_cursor", - "append=false,bulk=false,checkpoint=,checkpoint_wait=true,dump=," - "incremental=(consolidate=false,enabled=false,file=," - "force_stop=false,granularity=16MB,src_id=,this_id=)," - "next_random=false,next_random_sample_size=0,overwrite=true," - "raw=false,read_once=false,readonly=false,skip_sort_check=false," - "statistics=,target=", - confchk_WT_SESSION_open_cursor, 15}, + "append=false,bulk=false,checkpoint=,checkpoint_wait=true," + "debug=(release_evict=false),dump=,incremental=(consolidate=false" + ",enabled=false,file=,force_stop=false,granularity=16MB,src_id=," + "this_id=),next_random=false,next_random_sample_size=0," + "overwrite=true,raw=false,read_once=false,readonly=false," + "skip_sort_check=false,statistics=,target=", + confchk_WT_SESSION_open_cursor, 16}, {"WT_SESSION.prepare_transaction", "prepare_timestamp=", confchk_WT_SESSION_prepare_transaction, 1}, {"WT_SESSION.query_timestamp", "get=read", confchk_WT_SESSION_query_timestamp, 1}, diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index 9b5e2394b61..f3da1531709 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -2740,7 +2740,14 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c WT_ERR(wt_session->salvage(wt_session, WT_METAFILE_URI, NULL)); } - /* Initialize the connection's base write generation. */ + /* + * Initialize the connection's base write generation. + * + * We'll write over this value after performing rollback to stable however, we need to set it + * here. The logic below will involve opening up the metadata file and if the connection-wide + * base write generation is uninitialized, we'll tag the btree with the wrong base write gen and + * incorrectly interpret transaction ids during rollback to stable. + */ WT_ERR(__wt_metadata_init_base_write_gen(session)); WT_ERR(__wt_metadata_cursor(session, NULL)); diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c index 16a9d7812b5..8727ed1b18b 100644 --- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c +++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c @@ -848,6 +848,37 @@ restart: } /* + * __wt_dhandle_update_write_gens -- + * Update the open dhandles write generation and base write generation number. + */ +void +__wt_dhandle_update_write_gens(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_CONNECTION_IMPL *conn; + WT_DATA_HANDLE *dhandle; + + conn = S2C(session); + + for (dhandle = NULL;;) { + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q)); + if (dhandle == NULL) + break; + btree = (WT_BTREE *)dhandle->handle; + + WT_ASSERT(session, btree != NULL); + + /* + * Initialize the btrees write generation numbers and runtime write generations after + * rollback to stable so that the transaction ids of the pages will be reset when loaded + * from disk to memory. + */ + btree->write_gen = btree->base_write_gen = btree->run_write_gen = + WT_MAX(btree->write_gen, conn->base_write_gen); + } +} + +/* * __wt_verbose_dump_handles -- * Dump information about all data handles. */ diff --git a/src/third_party/wiredtiger/src/cursor/cur_hs.c b/src/third_party/wiredtiger/src/cursor/cur_hs.c index 500b9208b98..923b9941d0e 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_hs.c +++ b/src/third_party/wiredtiger/src/cursor/cur_hs.c @@ -152,6 +152,7 @@ __curhs_close(WT_CURSOR *cursor) WT_CURSOR *file_cursor; WT_CURSOR_HS *hs_cursor; WT_DECL_RET; + WT_ITEM *datastore_key; WT_SESSION_IMPL *session; hs_cursor = (WT_CURSOR_HS *)cursor; @@ -161,6 +162,8 @@ __curhs_close(WT_CURSOR *cursor) err: if (file_cursor != NULL) WT_TRET(file_cursor->close(file_cursor)); + datastore_key = &hs_cursor->datastore_key; + __wt_scr_free(session, &datastore_key); __wt_cursor_close(cursor); API_END_RET(session, ret); @@ -185,6 +188,10 @@ __curhs_reset(WT_CURSOR *cursor) ret = file_cursor->reset(file_cursor); F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); WT_TIME_WINDOW_INIT(&hs_cursor->time_window); + hs_cursor->btree_id = 0; + hs_cursor->datastore_key.data = NULL; + hs_cursor->datastore_key.size = 0; + hs_cursor->flags = 0; err: API_END_RET(session, ret); @@ -199,15 +206,327 @@ __curhs_set_key(WT_CURSOR *cursor, ...) { WT_CURSOR *file_cursor; WT_CURSOR_HS *hs_cursor; + WT_ITEM *datastore_key; + WT_SESSION_IMPL *session; + wt_timestamp_t start_ts; + uint64_t counter; + uint32_t arg_count; va_list ap; hs_cursor = (WT_CURSOR_HS *)cursor; file_cursor = hs_cursor->file_cursor; + session = CUR2S(cursor); + start_ts = WT_TS_NONE; + counter = 0; va_start(ap, cursor); - file_cursor->set_key(file_cursor, va_arg(ap, uint32_t), va_arg(ap, WT_ITEM *), - va_arg(ap, wt_timestamp_t), va_arg(ap, uint64_t)); + arg_count = va_arg(ap, uint32_t); + + WT_ASSERT(session, arg_count >= 1 && arg_count <= 4); + + hs_cursor->btree_id = va_arg(ap, uint32_t); + F_SET(hs_cursor, WT_HS_CUR_BTREE_ID_SET); + if (arg_count > 1) { + datastore_key = va_arg(ap, WT_ITEM *); + WT_IGNORE_RET(__wt_buf_set( + session, &hs_cursor->datastore_key, datastore_key->data, datastore_key->size)); + F_SET(hs_cursor, WT_HS_CUR_KEY_SET); + } else { + hs_cursor->datastore_key.data = NULL; + hs_cursor->datastore_key.size = 0; + F_CLR(hs_cursor, WT_HS_CUR_KEY_SET); + } + + if (arg_count > 2) { + start_ts = va_arg(ap, wt_timestamp_t); + F_SET(hs_cursor, WT_HS_CUR_TS_SET); + } else + F_CLR(hs_cursor, WT_HS_CUR_TS_SET); + + if (arg_count > 3) { + counter = va_arg(ap, uint64_t); + F_SET(hs_cursor, WT_HS_CUR_COUNTER_SET); + } else + F_CLR(hs_cursor, WT_HS_CUR_COUNTER_SET); + va_end(ap); + + file_cursor->set_key( + file_cursor, hs_cursor->btree_id, &hs_cursor->datastore_key, start_ts, counter); +} + +/* + * __curhs_prev_visible -- + * Check the visibility of the current history store record. If it is not visible, find the + * previous visible history store record. + */ +static int +__curhs_prev_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor) +{ + WT_CURSOR *file_cursor; + WT_CURSOR *std_cursor; + WT_CURSOR_BTREE *cbt; + WT_DECL_ITEM(datastore_key); + WT_DECL_RET; + wt_timestamp_t start_ts; + uint64_t counter; + uint32_t btree_id; + int cmp; + + file_cursor = hs_cursor->file_cursor; + std_cursor = (WT_CURSOR *)hs_cursor; + cbt = (WT_CURSOR_BTREE *)file_cursor; + + WT_ERR(__wt_scr_alloc(session, 0, &datastore_key)); + + for (; ret == 0; ret = __wt_hs_cursor_prev(session, file_cursor)) { + WT_ERR(file_cursor->get_key(file_cursor, &btree_id, &datastore_key, &start_ts, &counter)); + + /* Stop before crossing over to the next btree. */ + if (F_ISSET(hs_cursor, WT_HS_CUR_BTREE_ID_SET) && btree_id != hs_cursor->btree_id) { + ret = WT_NOTFOUND; + goto done; + } + + /* + * Keys are sorted in an order, skip the ones before the desired key, and bail out if we + * have crossed over the desired key and not found the record we are looking for. + */ + if (F_ISSET(hs_cursor, WT_HS_CUR_KEY_SET)) { + WT_ERR(__wt_compare(session, NULL, datastore_key, &hs_cursor->datastore_key, &cmp)); + if (cmp != 0) { + ret = WT_NOTFOUND; + goto done; + } + } + + /* + * If the stop time pair on the tombstone in the history store is already globally visible + * we can skip it. + */ + if (__wt_txn_tw_stop_visible_all(session, &cbt->upd_value->tw)) { + WT_STAT_CONN_INCR(session, cursor_prev_hs_tombstone); + WT_STAT_DATA_INCR(session, cursor_prev_hs_tombstone); + continue; + } + + /* + * Don't check the visibility of the record if we want to read any history store record that + * is not obsolete. + */ + if (F_ISSET(std_cursor, WT_CURSTD_HS_READ_COMMITTED)) + break; + + if (__wt_txn_tw_stop_visible(session, &cbt->upd_value->tw)) { + /* + * If the stop time point of a record is visible to us, we won't be able to see anything + * for this entire key. + */ + if (F_ISSET(hs_cursor, WT_HS_CUR_KEY_SET)) { + ret = WT_NOTFOUND; + goto done; + } else + continue; + } + + /* If the start time point is visible to us, let's return that record. */ + if (__wt_txn_tw_start_visible(session, &cbt->upd_value->tw)) + break; + } + +done: +err: + __wt_scr_free(session, &datastore_key); + return (ret); +} + +/* + * __curhs_next_visible -- + * Check the visibility of the current history store record. If it is not visible, find the next + * visible history store record. + */ +static int +__curhs_next_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor) +{ + WT_CURSOR *file_cursor; + WT_CURSOR *std_cursor; + WT_CURSOR_BTREE *cbt; + WT_DECL_ITEM(datastore_key); + WT_DECL_RET; + wt_timestamp_t start_ts; + uint64_t counter; + uint32_t btree_id; + int cmp; + + file_cursor = hs_cursor->file_cursor; + std_cursor = (WT_CURSOR *)hs_cursor; + cbt = (WT_CURSOR_BTREE *)file_cursor; + + WT_ERR(__wt_scr_alloc(session, 0, &datastore_key)); + + for (; ret == 0; ret = __wt_hs_cursor_next(session, file_cursor)) { + WT_ERR(file_cursor->get_key(file_cursor, &btree_id, &datastore_key, &start_ts, &counter)); + + /* Stop before crossing over to the next btree. */ + if (F_ISSET(hs_cursor, WT_HS_CUR_BTREE_ID_SET) && btree_id != hs_cursor->btree_id) { + ret = WT_NOTFOUND; + goto done; + } + + /* + * Keys are sorted in an order, skip the ones before the desired key, and bail out if we + * have crossed over the desired key and not found the record we are looking for. + */ + if (F_ISSET(hs_cursor, WT_HS_CUR_KEY_SET)) { + WT_ERR(__wt_compare(session, NULL, datastore_key, &hs_cursor->datastore_key, &cmp)); + if (cmp != 0) { + ret = WT_NOTFOUND; + goto done; + } + } + + /* + * If the stop time pair on the tombstone in the history store is already globally visible + * we can skip it. + */ + if (__wt_txn_tw_stop_visible_all(session, &cbt->upd_value->tw)) { + WT_STAT_CONN_INCR(session, cursor_next_hs_tombstone); + WT_STAT_DATA_INCR(session, cursor_next_hs_tombstone); + continue; + } + + /* + * Don't check the visibility of the record if we want to read any history store record that + * is not obsolete. + */ + if (F_ISSET(std_cursor, WT_CURSTD_HS_READ_COMMITTED)) + break; + + /* + * If the stop time point of a record is visible to us, check the next one. + */ + if (__wt_txn_tw_stop_visible(session, &cbt->upd_value->tw)) + continue; + + /* If the start time point is visible to us, let's return that record. */ + if (__wt_txn_tw_start_visible(session, &cbt->upd_value->tw)) + break; + } + +done: +err: + __wt_scr_free(session, &datastore_key); + return (ret); +} + +/* + * __curhs_search_near -- + * WT_CURSOR->search_near method for the hs cursor type. + */ +static int +__curhs_search_near(WT_CURSOR *cursor, int *exactp) +{ + WT_CURSOR *file_cursor; + WT_CURSOR_HS *hs_cursor; + WT_DECL_ITEM(srch_key); + WT_DECL_RET; + WT_SESSION_IMPL *session; + int cmp; + int exact; + + hs_cursor = (WT_CURSOR_HS *)cursor; + file_cursor = hs_cursor->file_cursor; + *exactp = 0; + cmp = 0; + + CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, search_near, CUR2BT(file_cursor)); + + WT_ERR(__wt_scr_alloc(session, 0, &srch_key)); + /* At least we have the btree id set. */ + WT_ASSERT(session, F_ISSET(hs_cursor, WT_HS_CUR_BTREE_ID_SET)); + WT_ERR(__wt_buf_set(session, srch_key, file_cursor->key.data, file_cursor->key.size)); + WT_ERR_NOTFOUND_OK(__wt_hs_cursor_search_near(session, file_cursor, &exact), true); + + /* Empty history store is fine. */ + if (ret == WT_NOTFOUND) + goto done; + + /* + * There are some key fields missing so we are searching a range of keys. Place the cursor at + * the start of the range. + */ + if (!F_ISSET(hs_cursor, WT_HS_CUR_COUNTER_SET)) { + /* + * If we raced with a history store insert, we may be two or more records away from our + * target. Keep iterating forwards until we are on or past our target key. + * + * We can't use the cursor positioning helper that we use for regular reads since that will + * place us at the end of a particular key/timestamp range whereas we want to be placed at + * the beginning. + */ + if (exact < 0) { + while ((ret = __wt_hs_cursor_next(session, file_cursor)) == 0) { + WT_ERR(__wt_compare(session, NULL, &file_cursor->key, srch_key, &cmp)); + if (cmp >= 0) + break; + } + /* No entries greater than or equal to the key we searched for. */ + WT_ERR_NOTFOUND_OK(ret, true); + if (ret == WT_NOTFOUND) + goto done; + + *exactp = cmp; + } else + *exactp = 1; + + WT_ERR(__curhs_next_visible(session, hs_cursor)); + } + /* Search the closest match that is smaller or equal to the search key. */ + else { + /* + * Because of the special visibility rules for the history store, a new key can appear in + * between our search and the set of updates that we're interested in. Keep trying until we + * find it. + * + * There may be no history store entries for the given btree id and record key if they have + * been removed by rollback to stable. + * + * Note that we need to compare the raw key off the cursor to determine where we are in the + * history store as opposed to comparing the embedded data store key since the ordering is + * not guaranteed to be the same. + */ + if (exact > 0) { + /* + * It's possible that we may race with a history store insert for another key. So we may + * be more than one record away the end of our target key/timestamp range. Keep + * iterating backwards until we land on our key. + */ + while ((ret = file_cursor->prev(cursor)) == 0) { + WT_STAT_CONN_INCR(session, cursor_skip_hs_cur_position); + WT_STAT_DATA_INCR(session, cursor_skip_hs_cur_position); + + WT_ERR(__wt_compare(session, NULL, &file_cursor->key, srch_key, &cmp)); + if (cmp <= 0) + break; + } + + *exactp = cmp; + } else + *exactp = -1; +#ifdef HAVE_DIAGNOSTIC + if (ret == 0) { + WT_ERR(__wt_compare(session, NULL, &file_cursor->key, srch_key, &cmp)); + WT_ASSERT(session, cmp <= 0); + } +#endif + + WT_ERR(__curhs_prev_visible(session, hs_cursor)); + } + +done: +err: + __wt_scr_free(session, &srch_key); + API_END_RET(session, ret); } /* @@ -356,7 +675,7 @@ __wt_curhs_open(WT_SESSION_IMPL *session, WT_CURSOR *owner, WT_CURSOR **cursorp) __wt_cursor_notsup, /* prev */ __curhs_reset, /* reset */ __wt_cursor_notsup, /* search */ - __wt_cursor_search_near_notsup, /* search-near */ + __curhs_search_near, /* search-near */ __curhs_insert, /* insert */ __wt_cursor_modify_value_format_notsup, /* modify */ __wt_cursor_notsup, /* update */ @@ -369,6 +688,7 @@ __wt_curhs_open(WT_SESSION_IMPL *session, WT_CURSOR *owner, WT_CURSOR **cursorp) WT_CURSOR *cursor; WT_CURSOR_HS *hs_cursor; WT_DECL_RET; + WT_ITEM *datastore_key; WT_RET(__wt_calloc_one(session, &hs_cursor)); cursor = (WT_CURSOR *)hs_cursor; @@ -381,6 +701,11 @@ __wt_curhs_open(WT_SESSION_IMPL *session, WT_CURSOR *owner, WT_CURSOR **cursorp) WT_ERR(__hs_cursor_open_int(session, &hs_cursor->file_cursor)); WT_ERR(__wt_cursor_init(cursor, WT_HS_URI, owner, NULL, cursorp)); + WT_TIME_WINDOW_INIT(&hs_cursor->time_window); + hs_cursor->btree_id = 0; + datastore_key = &hs_cursor->datastore_key; + WT_ERR(__wt_scr_alloc(session, 0, &datastore_key)); + hs_cursor->flags = 0; WT_TIME_WINDOW_INIT(&hs_cursor->time_window); diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c index 364a6dffbdb..92b8e5c5b3e 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_std.c +++ b/src/third_party/wiredtiger/src/cursor/cur_std.c @@ -979,6 +979,33 @@ err: } /* + * __cursor_config_debug -- + * Set configuration options for debug category. + */ +static int +__cursor_config_debug(WT_CURSOR *cursor, const char *cfg[]) +{ + WT_CONFIG_ITEM cval; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)cursor->session; + + /* + * Debug options. Special handling for options that aren't found - since reconfigure passes in + * just the single configuration string, not the stack. + */ + if ((ret = __wt_config_gets_def(session, cfg, "debug.release_evict", 0, &cval)) == 0) { + if (cval.val) + F_SET(cursor, WT_CURSTD_DEBUG_RESET_EVICT); + else + F_CLR(cursor, WT_CURSTD_DEBUG_RESET_EVICT); + } else + WT_RET_NOTFOUND_OK(ret); + return (0); +} + +/* * __wt_cursor_reconfigure -- * Set runtime-configurable settings. */ @@ -988,6 +1015,7 @@ __wt_cursor_reconfigure(WT_CURSOR *cursor, const char *config) WT_CONFIG_ITEM cval; WT_DECL_RET; WT_SESSION_IMPL *session; + const char *cfg[] = {config, NULL}; CURSOR_API_CALL(cursor, session, reconfigure, NULL); @@ -1018,6 +1046,8 @@ __wt_cursor_reconfigure(WT_CURSOR *cursor, const char *config) } else WT_ERR_NOTFOUND_OK(ret, false); + WT_ERR(__cursor_config_debug(cursor, cfg)); + err: API_END_RET(session, ret); } @@ -1110,6 +1140,7 @@ __wt_cursor_init( cursor->update = __wt_cursor_notsup; F_CLR(cursor, WT_CURSTD_CACHEABLE); } + WT_RET(__cursor_config_debug(cursor, cfg)); /* * dump If an index cursor is opened with dump, then this function is called on the index files, diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h index 2edff95e833..9a737ed158e 100644 --- a/src/third_party/wiredtiger/src/include/cursor.h +++ b/src/third_party/wiredtiger/src/include/cursor.h @@ -287,6 +287,16 @@ struct __wt_cursor_hs { WT_CURSOR *file_cursor; /* Queries of regular history store data */ WT_TIME_WINDOW time_window; + uint32_t btree_id; + WT_ITEM datastore_key; + + /* AUTOMATIC FLAG VALUE GENERATION START */ +#define WT_HS_CUR_BTREE_ID_SET 0x1u +#define WT_HS_CUR_COUNTER_SET 0x2u +#define WT_HS_CUR_KEY_SET 0x4u +#define WT_HS_CUR_TS_SET 0x8u + /* AUTOMATIC FLAG VALUE GENERATION STOP */ + uint8_t flags; }; struct __wt_cursor_index { diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index d902d72ff01..cf8bcdd353e 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -1652,6 +1652,7 @@ extern void __wt_curstat_cache_walk(WT_SESSION_IMPL *session); extern void __wt_curstat_dsrc_final(WT_CURSOR_STAT *cst); extern void __wt_curtable_set_key(WT_CURSOR *cursor, ...); extern void __wt_curtable_set_value(WT_CURSOR *cursor, ...); +extern void __wt_dhandle_update_write_gens(WT_SESSION_IMPL *session); extern void __wt_encrypt_size( WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t incoming_size, size_t *sizep); extern void __wt_err_func( diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index f5568a8b7e1..f44b4e188c2 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -699,30 +699,31 @@ struct __wt_cursor { const char *internal_uri; /* AUTOMATIC FLAG VALUE GENERATION START */ -#define WT_CURSTD_APPEND 0x000001u -#define WT_CURSTD_BULK 0x000002u -#define WT_CURSTD_CACHEABLE 0x000004u -#define WT_CURSTD_CACHED 0x000008u -#define WT_CURSTD_DEAD 0x000010u -#define WT_CURSTD_DEBUG_COPY_KEY 0x000020u -#define WT_CURSTD_DEBUG_COPY_VALUE 0x000040u -#define WT_CURSTD_DEBUG_RESET_EVICT 0x000080u -#define WT_CURSTD_DUMP_HEX 0x000100u -#define WT_CURSTD_DUMP_JSON 0x000200u -#define WT_CURSTD_DUMP_PRETTY 0x000400u -#define WT_CURSTD_DUMP_PRINT 0x000800u -#define WT_CURSTD_IGNORE_TOMBSTONE 0x001000u -#define WT_CURSTD_JOINED 0x002000u -#define WT_CURSTD_KEY_EXT 0x004000u /* Key points out of tree. */ -#define WT_CURSTD_KEY_INT 0x008000u /* Key points into tree. */ -#define WT_CURSTD_META_INUSE 0x010000u -#define WT_CURSTD_OPEN 0x020000u -#define WT_CURSTD_OVERWRITE 0x040000u -#define WT_CURSTD_RAW 0x080000u -#define WT_CURSTD_RAW_SEARCH 0x100000u -#define WT_CURSTD_UPDATE_LOCAL 0x200000u -#define WT_CURSTD_VALUE_EXT 0x400000u /* Value points out of tree. */ -#define WT_CURSTD_VALUE_INT 0x800000u /* Value points into tree. */ +#define WT_CURSTD_APPEND 0x0000001u +#define WT_CURSTD_BULK 0x0000002u +#define WT_CURSTD_CACHEABLE 0x0000004u +#define WT_CURSTD_CACHED 0x0000008u +#define WT_CURSTD_DEAD 0x0000010u +#define WT_CURSTD_DEBUG_COPY_KEY 0x0000020u +#define WT_CURSTD_DEBUG_COPY_VALUE 0x0000040u +#define WT_CURSTD_DEBUG_RESET_EVICT 0x0000080u +#define WT_CURSTD_DUMP_HEX 0x0000100u +#define WT_CURSTD_DUMP_JSON 0x0000200u +#define WT_CURSTD_DUMP_PRETTY 0x0000400u +#define WT_CURSTD_DUMP_PRINT 0x0000800u +#define WT_CURSTD_HS_READ_COMMITTED 0x0001000u +#define WT_CURSTD_IGNORE_TOMBSTONE 0x0002000u +#define WT_CURSTD_JOINED 0x0004000u +#define WT_CURSTD_KEY_EXT 0x0008000u /* Key points out of tree. */ +#define WT_CURSTD_KEY_INT 0x0010000u /* Key points into tree. */ +#define WT_CURSTD_META_INUSE 0x0020000u +#define WT_CURSTD_OPEN 0x0040000u +#define WT_CURSTD_OVERWRITE 0x0080000u +#define WT_CURSTD_RAW 0x0100000u +#define WT_CURSTD_RAW_SEARCH 0x0200000u +#define WT_CURSTD_UPDATE_LOCAL 0x0400000u +#define WT_CURSTD_VALUE_EXT 0x0800000u /* Value points out of tree. */ +#define WT_CURSTD_VALUE_INT 0x1000000u /* Value points into tree. */ /* AUTOMATIC FLAG VALUE GENERATION STOP */ #define WT_CURSTD_KEY_SET (WT_CURSTD_KEY_EXT | WT_CURSTD_KEY_INT) #define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT) @@ -867,6 +868,12 @@ struct __wt_session { * @config{checkpoint, the name of a checkpoint to open (the reserved name * "WiredTigerCheckpoint" opens the most recent internal checkpoint taken for the object). * The cursor does not support data modification., a string; default empty.} + * @config{debug = (, configure debug specific behavior on a cursor. Generally only used + * for internal testing purposes., a set of related configuration options defined below.} + * @config{ release_evict, Configure the cursor to evict the page + * positioned on when the reset API is used., a boolean flag; default \c false.} + * @config{ + * ),,} * @config{dump, configure the cursor for dump format inputs and outputs: "hex" selects a * simple hexadecimal format\, "json" selects a JSON format with each record formatted as * fields named by column names if available\, "pretty" selects a human-readable format diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c index 6e042fc9b33..c08c5d457b7 100644 --- a/src/third_party/wiredtiger/src/txn/txn_recover.c +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -548,8 +548,7 @@ err: /* * __recovery_setup_file -- - * Set up the recovery slot for a file, track the largest file ID, and update the base write gen - * based on the file's configuration. + * Set up the recovery slot for a file and track the largest file ID. */ static int __recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config) @@ -595,8 +594,7 @@ __recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config) (WT_IS_MAX_LSN(&r->max_ckpt_lsn) || __wt_log_cmp(&lsn, &r->max_ckpt_lsn) > 0)) WT_ASSIGN_LSN(&r->max_ckpt_lsn, &lsn); - /* Update the base write gen based on this file's configuration. */ - return (__wt_metadata_update_base_write_gen(r->session, config)); + return (0); } /* @@ -979,6 +977,16 @@ done: */ WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); + /* Initialize the connection's base write generation after rollback to stable. */ + WT_ERR(__wt_metadata_init_base_write_gen(session)); + + /* + * Update the open dhandles write generations and base write generation with the connection's + * base write generation. The write generations of the pages which are in disk will be + * initialized when loaded to cache. + */ + __wt_dhandle_update_write_gens(session); + /* * If we're downgrading and have newer log files, force an archive, no matter what the archive * setting is. diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index 90ec9389deb..ce904576651 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -333,7 +333,16 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW if (valid_update_found) { WT_ERR(__wt_upd_alloc(session, &full_value, WT_UPDATE_STANDARD, &upd, NULL)); - upd->txnid = cbt->upd_value->tw.start_txn; + /* + * Set the transaction id of updates to WT_TXN_NONE when called from recovery, because + * the connections write generation will be initialized after rollback to stable and the + * updates in the cache will be problematic. The transaction id of pages which are in + * disk will be automatically reset as part of unpacking cell when loaded to cache. + */ + if (F_ISSET(S2C(session), WT_CONN_RECOVERING)) + upd->txnid = WT_TXN_NONE; + else + upd->txnid = cbt->upd_value->tw.start_txn; upd->durable_ts = cbt->upd_value->tw.durable_start_ts; upd->start_ts = cbt->upd_value->tw.start_ts; __wt_verbose(session, WT_VERB_RTS, |