summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWill Korteland <will.korteland@mongodb.com>2022-09-15 06:07:51 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-09-15 06:44:16 +0000
commita5ecbf2c6416309673afdbe3ff2f4280f4d03d43 (patch)
treedae209b5e4c8d6df15f80dc40ef4ac2b79081ee4
parent3018614b7d37f5a2aa8eb4082a4accbb30d90bba (diff)
downloadmongo-a5ecbf2c6416309673afdbe3ff2f4280f4d03d43.tar.gz
Import wiredtiger: 507ac1f26247ec2b881572ef3f794bf8e842bf7d from branch mongodb-master
ref: f36a8e9d80..507ac1f262 for: 6.2.0-rc0 WT-9805 Save the updates need to be deleted from history store and delete them later (#8262)
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_cursor.c7
-rw-r--r--src/third_party/wiredtiger/src/history/hs_rec.c285
-rw-r--r--src/third_party/wiredtiger/src/include/extern.h2
-rw-r--r--src/third_party/wiredtiger/src/include/reconcile.h20
-rw-r--r--src/third_party/wiredtiger/src/include/wt_internal.h2
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_visibility.c94
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_write.c10
-rw-r--r--src/third_party/wiredtiger/test/format/failure_configs/CONFIG.WT-9805196
-rw-r--r--src/third_party/wiredtiger/test/suite/test_prepare25.py2
-rw-r--r--src/third_party/wiredtiger/test/suite/test_prepare26.py134
11 files changed, 597 insertions, 157 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 855eba155bd..9c4b194e1e7 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-master",
- "commit": "f36a8e9d80dde82ceb11805e6437e12a2d784ae6"
+ "commit": "507ac1f26247ec2b881572ef3f794bf8e842bf7d"
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index 0d61b0350f3..40d9337c1d6 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -612,6 +612,9 @@ __wt_btcur_reset(WT_CURSOR_BTREE *cbt)
WT_STAT_CONN_DATA_INCR(session, cursor_reset);
F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ /* Initialize the update value as we are not pointing to any value. */
+ cbt->upd_value->type = WT_UPDATE_INVALID;
+ WT_TIME_WINDOW_INIT(&cbt->upd_value->tw);
return (__cursor_reset(cbt));
}
@@ -2205,6 +2208,10 @@ __wt_btcur_open(WT_CURSOR_BTREE *cbt)
cbt->modify_update = &cbt->_modify_update;
cbt->upd_value = &cbt->_upd_value;
+ /* Initialize the value. */
+ cbt->upd_value->type = WT_UPDATE_INVALID;
+ WT_TIME_WINDOW_INIT(&cbt->upd_value->tw);
+
#ifdef HAVE_DIAGNOSTIC
cbt->lastkey = &cbt->_lastkey;
cbt->lastrecno = WT_RECNO_OOB;
diff --git a/src/third_party/wiredtiger/src/history/hs_rec.c b/src/third_party/wiredtiger/src/history/hs_rec.c
index 236f8856257..eb3697423be 100644
--- a/src/third_party/wiredtiger/src/history/hs_rec.c
+++ b/src/third_party/wiredtiger/src/history/hs_rec.c
@@ -61,65 +61,6 @@ __hs_verbose_cache_stats(WT_SESSION_IMPL *session, WT_BTREE *btree)
}
/*
- * __hs_delete_record --
- * Delete the update left in the history store
- */
-static int
-__hs_delete_record(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_ITEM *key,
- WT_UPDATE *delete_upd, WT_UPDATE *delete_tombstone)
-{
- WT_DECL_RET;
- bool hs_read_committed;
-#ifdef HAVE_DIAGNOSTIC
- WT_TIME_WINDOW *hs_tw;
-#endif
-
- hs_read_committed = F_ISSET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
- F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
-
- /* No need to delete from the history store if it is already obsolete. */
- if (delete_tombstone != NULL && __wt_txn_upd_visible_all(session, delete_tombstone)) {
- ret = 0;
- goto done;
- }
-
- hs_cursor->set_key(hs_cursor, 4, S2BT(session)->id, key, WT_TS_MAX, UINT64_MAX);
- WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, hs_cursor), true);
- /* It's possible the value in the history store becomes obsolete concurrently. */
- if (ret == WT_NOTFOUND) {
- WT_ASSERT(
- session, delete_tombstone != NULL && __wt_txn_upd_visible_all(session, delete_tombstone));
- ret = 0;
- goto done;
- }
-
-#ifdef HAVE_DIAGNOSTIC
- __wt_hs_upd_time_window(hs_cursor, &hs_tw);
- WT_ASSERT(session, hs_tw->start_txn == WT_TXN_NONE || hs_tw->start_txn == delete_upd->txnid);
- WT_ASSERT(session, hs_tw->start_ts == WT_TS_NONE || hs_tw->start_ts == delete_upd->start_ts);
- WT_ASSERT(session,
- hs_tw->durable_start_ts == WT_TS_NONE || hs_tw->durable_start_ts == delete_upd->durable_ts);
- if (delete_tombstone != NULL) {
- WT_ASSERT(session, hs_tw->stop_txn == delete_tombstone->txnid);
- WT_ASSERT(session, hs_tw->stop_ts == delete_tombstone->start_ts);
- WT_ASSERT(session, hs_tw->durable_stop_ts == delete_tombstone->durable_ts);
- } else
- WT_ASSERT(session, !WT_TIME_WINDOW_HAS_STOP(hs_tw));
-#endif
-
- WT_ERR(hs_cursor->remove(hs_cursor));
-done:
- if (delete_tombstone != NULL)
- F_CLR(delete_tombstone, WT_UPDATE_TO_DELETE_FROM_HS | WT_UPDATE_HS);
- F_CLR(delete_upd, WT_UPDATE_TO_DELETE_FROM_HS | WT_UPDATE_HS);
-
-err:
- if (!hs_read_committed)
- F_CLR(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
- return (ret);
-}
-
-/*
* __hs_insert_record --
* A helper function to insert the record into the history store including stop time point.
*/
@@ -345,9 +286,45 @@ __hs_next_upd_full_value(WT_SESSION_IMPL *session, WT_UPDATE_VECTOR *updates,
}
/*
+ * __hs_pack_key --
+ * Pack the history store key
+ *
+ * Fills in "key" with the source key of one reconciled entry, chosen by the type of the page
+ * being reconciled (r->page->type):
+ * - column-store pages: the record number of "ins" is packed into key->mem;
+ * - row-store leaf pages: the key comes from the on-page row "rip" when "ins" is NULL,
+ * otherwise "key" is pointed at the key stored in "ins";
+ * - any other page type is reported through __wt_illegal_value.
+ *
+ * NOTE(review): the column-store path reads "ins" unconditionally, so callers presumably always
+ * pass a non-NULL insert for column-store pages -- confirm against the callers.
+ */
+static inline int
+__hs_pack_key(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_RECONCILE *r, WT_INSERT *ins,
+ WT_ROW *rip, WT_ITEM *key)
+{
+ WT_DECL_RET;
+ uint8_t *p;
+
+ switch (r->page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ p = key->mem; /* Pack the record number at the start of the item's buffer. */
+ WT_RET(__wt_vpack_uint(&p, 0, WT_INSERT_RECNO(ins)));
+ key->size = WT_PTRDIFF(p, key->data); /* Presumably key->data == key->mem here; confirm. */
+ break;
+ case WT_PAGE_ROW_LEAF:
+ if (ins == NULL) {
+ WT_WITH_BTREE(
+ session, btree, ret = __wt_row_leaf_key(session, r->page, rip, key, false));
+ WT_RET(ret);
+ } else {
+ key->data = WT_INSERT_KEY(ins); /* Reference the insert's key directly. */
+ key->size = WT_INSERT_KEY_SIZE(ins);
+ }
+ break;
+ default:
+ WT_RET(__wt_illegal_value(session, r->page->type));
+ }
+
+ return (ret);
+}
+
+/*
* __wt_hs_insert_updates --
- * Copy one set of saved updates into the database's history store table. Whether the function
- * fails or succeeds, if there is a successful write to history, cache_write_hs is set to true.
+ * Copy one set of saved updates into the database's history store table if they haven't been
+ * moved there. Whether the function fails or succeeds, if there is a successful write to
+ * history, cache_write_hs is set to true.
*/
int
__wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_MULTI *multi)
@@ -367,14 +344,12 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_MULTI *mult
WT_MODIFY entries[MAX_REVERSE_MODIFY_NUM];
WT_UPDATE_VECTOR updates;
WT_SAVE_UPD *list;
- WT_UPDATE *delete_tombstone, *delete_upd, *newest_hs, *no_ts_upd, *oldest_upd, *prev_upd,
- *ref_upd, *tombstone, *upd;
+ WT_UPDATE *newest_hs, *no_ts_upd, *oldest_upd, *prev_upd, *ref_upd, *tombstone, *upd;
WT_TIME_WINDOW tw;
wt_off_t hs_size;
uint64_t insert_cnt, max_hs_size, modify_cnt;
uint64_t cache_hs_insert_full_update, cache_hs_insert_reverse_modify, cache_hs_write_squash;
uint32_t i;
- uint8_t *p;
int nentries;
bool enable_reverse_modify, error_on_ts_ordering, hs_inserted, squashed;
@@ -407,65 +382,6 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_MULTI *mult
if (list->onpage_upd == NULL)
continue;
- /* History store table key component: source key. */
- switch (r->page->type) {
- case WT_PAGE_COL_FIX:
- case WT_PAGE_COL_VAR:
- p = key->mem;
- WT_ERR(__wt_vpack_uint(&p, 0, WT_INSERT_RECNO(list->ins)));
- key->size = WT_PTRDIFF(p, key->data);
- break;
- case WT_PAGE_ROW_LEAF:
- if (list->ins == NULL) {
- WT_WITH_BTREE(
- session, btree, ret = __wt_row_leaf_key(session, r->page, list->rip, key, false));
- WT_ERR(ret);
- } else {
- key->data = WT_INSERT_KEY(list->ins);
- key->size = WT_INSERT_KEY_SIZE(list->ins);
- }
- break;
- default:
- WT_ERR(__wt_illegal_value(session, r->page->type));
- }
-
- no_ts_upd = newest_hs = NULL;
- ref_upd = list->onpage_upd;
- delete_tombstone = delete_upd = NULL;
-
- __wt_update_vector_clear(&updates);
-
- /*
- * Reverse deltas are only supported on 'S' and 'u' value formats.
- */
- enable_reverse_modify =
- (WT_STREQ(btree->value_format, "S") || WT_STREQ(btree->value_format, "u"));
-
- /*
- * Delete the update that is both on the update chain and the history store from the history
- * store. Otherwise, we will trigger out of order fix when the update is inserted to the
- * history store again.
- */
- for (upd = list->onpage_tombstone != NULL ? list->onpage_tombstone : list->onpage_upd;
- upd != NULL; upd = upd->next) {
- if (upd->txnid == WT_TXN_ABORTED)
- continue;
-
- if (F_ISSET(upd, WT_UPDATE_TO_DELETE_FROM_HS)) {
- WT_ASSERT_ALWAYS(session, F_ISSET(upd, WT_UPDATE_HS | WT_UPDATE_RESTORED_FROM_HS),
- "Attempting to remove an update from the history store in WiredTiger, but the "
- "update was missing.");
- if (upd->type == WT_UPDATE_TOMBSTONE)
- delete_tombstone = upd;
- else {
- delete_upd = upd;
- WT_ERR(
- __hs_delete_record(session, hs_cursor, key, delete_upd, delete_tombstone));
- break;
- }
- }
- }
-
/* Skip aborted updates. */
for (upd = list->onpage_upd->next; upd != NULL && upd->txnid == WT_TXN_ABORTED;
upd = upd->next)
@@ -479,6 +395,20 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_MULTI *mult
if (F_ISSET(upd, WT_UPDATE_HS))
continue;
+ /* History store table key component: source key. */
+ WT_ERR(__hs_pack_key(session, btree, r, list->ins, list->rip, key));
+
+ no_ts_upd = newest_hs = NULL;
+ ref_upd = list->onpage_upd;
+
+ __wt_update_vector_clear(&updates);
+
+ /*
+ * Reverse deltas are only supported on 'S' and 'u' value formats.
+ */
+ enable_reverse_modify =
+ (WT_STREQ(btree->value_format, "S") || WT_STREQ(btree->value_format, "u"));
+
/*
* The algorithm assumes the oldest update on the update chain in memory is either a full
* update or a tombstone.
@@ -831,7 +761,7 @@ __wt_hs_delete_key(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_t btre
{
WT_DECL_RET;
WT_ITEM hs_key;
- wt_timestamp_t hs_ts;
+ wt_timestamp_t hs_start_ts;
uint64_t hs_counter;
uint32_t hs_btree_id;
bool hs_read_all_flag;
@@ -850,7 +780,7 @@ __wt_hs_delete_key(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_t btre
ret = 0;
goto done;
} else {
- WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_ts, &hs_counter));
+ WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter));
++hs_counter;
}
@@ -880,8 +810,8 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui
WT_CURSOR_BTREE *hs_cbt;
WT_DECL_RET;
WT_ITEM hs_key, hs_value;
- WT_TIME_WINDOW hs_insert_tw, tw, *twp;
- wt_timestamp_t hs_ts;
+ WT_TIME_WINDOW hs_insert_tw, *twp;
+ wt_timestamp_t hs_durable_start_ts, hs_durable_stop_ts, hs_start_ts;
uint64_t cache_hs_order_lose_durable_timestamp, cache_hs_order_reinsert, cache_hs_order_remove;
uint64_t hs_counter, hs_upd_type;
uint32_t hs_btree_id;
@@ -958,7 +888,7 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui
continue;
/* We shouldn't have crossed the btree and user key search space. */
- WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_ts, &hs_counter));
+ WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter));
WT_ASSERT(session, hs_btree_id == btree_id);
#ifdef HAVE_DIAGNOSTIC
WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp));
@@ -970,7 +900,7 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui
* the cell. The cell's start timestamp can be cleared during reconciliation if it is
* globally visible.
*/
- if (hs_ts >= ts || twp->stop_ts >= ts)
+ if (hs_start_ts >= ts || twp->stop_ts >= ts)
break;
}
if (ret == WT_NOTFOUND)
@@ -1026,7 +956,7 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui
*/
for (; ret == 0; ret = hs_cursor->next(hs_cursor)) {
/* We shouldn't have crossed the btree and user key search space. */
- WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_ts, &hs_counter));
+ WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter));
WT_ASSERT(session, hs_btree_id == btree_id);
#ifdef HAVE_DIAGNOSTIC
WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp));
@@ -1044,7 +974,7 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui
* by ignoring them.
*/
__wt_hs_upd_time_window(hs_cursor, &twp);
- if (hs_ts < ts && twp->stop_ts < ts)
+ if (hs_start_ts < ts && twp->stop_ts < ts)
continue;
if (reinsert) {
@@ -1098,10 +1028,10 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui
/* Extract the underlying value for reinsertion. */
WT_ERR(hs_cursor->get_value(
- hs_cursor, &tw.durable_stop_ts, &tw.durable_start_ts, &hs_upd_type, &hs_value));
+ hs_cursor, &hs_durable_stop_ts, &hs_durable_start_ts, &hs_upd_type, &hs_value));
/* Reinsert the update with corrected timestamps. */
- if (no_ts_tombstone && hs_ts == ts)
+ if (no_ts_tombstone && hs_start_ts == ts)
*counter = hs_counter;
/* Insert the value back with different timestamps. */
@@ -1132,3 +1062,92 @@ err:
return (ret);
}
+
+/*
+ * __hs_delete_record --
+ * Delete an update from the history store if it is not obsolete
+ *
+ * Uses the reconciliation's history store cursor (r->hs_cursor), opening it first if it has not
+ * been opened yet. "key" is the source key of the record, "upd" is the update to delete and
+ * "tombstone" is its matching tombstone, or NULL if there is none.
+ *
+ * When the function finishes without error, WT_UPDATE_TO_DELETE_FROM_HS and WT_UPDATE_HS are
+ * cleared on "upd" (and on "tombstone" if present), whether or not a record was actually removed.
+ * The error path skips that clearing and leaves the update flags untouched.
+ */
+static int
+__hs_delete_record(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ITEM *key, WT_UPDATE *upd, WT_UPDATE *tombstone)
+{
+ WT_DECL_RET;
+ bool hs_read_committed;
+#ifdef HAVE_DIAGNOSTIC
+ WT_TIME_WINDOW *hs_tw;
+#endif
+
+ if (r->hs_cursor == NULL)
+ WT_RET(__wt_curhs_open(session, NULL, &r->hs_cursor));
+ /* Remember the caller's setting so it can be put back before returning. */
+ hs_read_committed = F_ISSET(r->hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
+ /* Ensure we can see all the content in the history store. */
+ F_SET(r->hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
+
+ /* No need to delete from the history store if it is already obsolete. */
+ if (tombstone != NULL && __wt_txn_upd_visible_all(session, tombstone))
+ goto done;
+
+ /* Search from the largest timestamp and counter for this btree id and key. */
+ r->hs_cursor->set_key(r->hs_cursor, 4, S2BT(session)->id, key, WT_TS_MAX, UINT64_MAX);
+ WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, r->hs_cursor), true);
+ /* It's possible the value in the history store becomes obsolete concurrently. */
+ if (ret == WT_NOTFOUND) {
+ WT_ASSERT(session, tombstone != NULL && __wt_txn_upd_visible_all(session, tombstone));
+ ret = 0;
+ } else {
+#ifdef HAVE_DIAGNOSTIC
+ /* Check that the record found carries the time window of the update being deleted. */
+ __wt_hs_upd_time_window(r->hs_cursor, &hs_tw);
+ WT_ASSERT(session, hs_tw->start_txn == WT_TXN_NONE || hs_tw->start_txn == upd->txnid);
+ WT_ASSERT(session, hs_tw->start_ts == WT_TS_NONE || hs_tw->start_ts == upd->start_ts);
+ WT_ASSERT(session,
+ hs_tw->durable_start_ts == WT_TS_NONE || hs_tw->durable_start_ts == upd->durable_ts);
+ if (tombstone != NULL) {
+ WT_ASSERT(session, hs_tw->stop_txn == tombstone->txnid);
+ WT_ASSERT(session, hs_tw->stop_ts == tombstone->start_ts);
+ WT_ASSERT(session, hs_tw->durable_stop_ts == tombstone->durable_ts);
+ } else
+ WT_ASSERT(session, !WT_TIME_WINDOW_HAS_STOP(hs_tw));
+#endif
+
+ WT_ERR(r->hs_cursor->remove(r->hs_cursor));
+ }
+done: /* Reached on every non-error path, including the "already obsolete" shortcut. */
+ if (tombstone != NULL)
+ F_CLR(tombstone, WT_UPDATE_TO_DELETE_FROM_HS | WT_UPDATE_HS);
+ F_CLR(upd, WT_UPDATE_TO_DELETE_FROM_HS | WT_UPDATE_HS);
+
+err:
+ /* Only clear the cursor flag if it was not already set on entry. */
+ if (!hs_read_committed)
+ F_CLR(r->hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
+ return (ret);
+}
+
+/*
+ * __wt_hs_delete_updates --
+ * Delete the updates from the history store
+ *
+ * Walks the first r->delete_hs_upd_next entries of the r->delete_hs_upd array; for each entry it
+ * builds the source key with __hs_pack_key and then calls __hs_delete_record. The walk stops at
+ * the first failure and that error is returned. Returns 0 immediately if the array was never
+ * allocated.
+ */
+int
+__wt_hs_delete_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_DELETE_HS_UPD *delete_hs_upd;
+ uint32_t i;
+
+ /* Nothing to delete from the history store. */
+ if (r->delete_hs_upd == NULL)
+ return (0);
+
+ btree = S2BT(session);
+
+ /* One scratch item is shared by every entry; it is sized for a packed 64-bit integer. */
+ WT_RET(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key));
+
+ /* The loop is bounded by the entry count, so an allocated but empty list does no work. */
+ for (delete_hs_upd = r->delete_hs_upd, i = 0; i < r->delete_hs_upd_next; ++delete_hs_upd, ++i) {
+ WT_ERR(__hs_pack_key(session, btree, r, delete_hs_upd->ins, delete_hs_upd->rip, key));
+ WT_ERR(__hs_delete_record(session, r, key, delete_hs_upd->upd, delete_hs_upd->tombstone));
+ }
+
+err:
+ /* The scratch item is released on both the success and the error path. */
+ __wt_scr_free(session, &key);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index bfa0461d783..8b88c33c7f8 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -795,6 +795,8 @@ extern int __wt_hs_config(WT_SESSION_IMPL *session, const char **cfg)
extern int __wt_hs_delete_key(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_t btree_id,
const WT_ITEM *key, bool reinsert, bool error_on_ts_ordering)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_hs_delete_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_hs_find_upd(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
const char *value_format, uint64_t recno, WT_UPDATE_VALUE *upd_value, WT_ITEM *base_value_buf)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h
index 64519f5de01..b644419c6de 100644
--- a/src/third_party/wiredtiger/src/include/reconcile.h
+++ b/src/third_party/wiredtiger/src/include/reconcile.h
@@ -65,6 +65,17 @@ struct __wt_rec_chunk {
};
/*
+ * WT_DELETE_HS_UPD --
+ * Update that needs to be deleted from the history store.
+ *
+ * The "ins" and "rip" members identify the key the update belongs to; they are the values
+ * later passed to the history store key-packing helper.
+ */
+struct __wt_delete_hs_upd {
+ WT_INSERT *ins; /* Insert list reference */
+ WT_ROW *rip; /* Original on-page reference */
+ WT_UPDATE *upd; /* Update whose history store copy is to be deleted */
+ WT_UPDATE *tombstone; /* Tombstone paired with the update, or NULL if there is none */
+};
+
+/*
* Reconciliation is the process of taking an in-memory page, walking each entry
* in the page, building a backing disk image in a temporary buffer representing
* that information, and writing that buffer to disk. What could be simpler?
@@ -227,6 +238,15 @@ struct __wt_reconcile {
size_t supd_allocated;
size_t supd_memsize; /* Size of saved update structures */
+ /*
+ * List of updates to be deleted from the history store. While reviewing updates for each page,
+ * we save the updates that need to be deleted from the history store here, and then delete them
+ * after we have built the disk image.
+ */
+ WT_DELETE_HS_UPD *delete_hs_upd; /* Updates to delete from history store */
+ uint32_t delete_hs_upd_next;
+ size_t delete_hs_upd_allocated;
+
/* List of pages we've written so far. */
WT_MULTI *multi;
uint32_t multi_next;
diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h
index 0c80140b1db..428693a90d7 100644
--- a/src/third_party/wiredtiger/src/include/wt_internal.h
+++ b/src/third_party/wiredtiger/src/include/wt_internal.h
@@ -187,6 +187,8 @@ struct __wt_data_handle;
typedef struct __wt_data_handle WT_DATA_HANDLE;
struct __wt_data_handle_cache;
typedef struct __wt_data_handle_cache WT_DATA_HANDLE_CACHE;
+struct __wt_delete_hs_upd;
+typedef struct __wt_delete_hs_upd WT_DELETE_HS_UPD;
struct __wt_dlh;
typedef struct __wt_dlh WT_DLH;
struct __wt_dsrc_stats;
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
index 3e19e383ca2..f34884e6a65 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
@@ -19,7 +19,7 @@ __rec_update_save(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_
WT_SAVE_UPD *supd;
WT_ASSERT_ALWAYS(session, onpage_upd != NULL || supd_restore,
- "If nothing is committed the update chain must be restored");
+ "If nothing is committed, the update chain must be restored");
WT_ASSERT_ALWAYS(session,
onpage_upd == NULL || onpage_upd->type == WT_UPDATE_STANDARD ||
onpage_upd->type == WT_UPDATE_MODIFY,
@@ -40,6 +40,27 @@ __rec_update_save(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_
}
/*
+ * __rec_delete_hs_upd_save --
+ * Save an update into a WT_DELETE_HS_UPD list to delete it from the history store later.
+ *
+ * Grows r->delete_hs_upd as needed to hold one more entry, records the key reference
+ * ("ins"/"rip"), the update and its tombstone ("tombstone" may be NULL) in the next free slot,
+ * and advances r->delete_hs_upd_next. Returns 0 unless growing the array fails.
+ */
+static inline int
+__rec_delete_hs_upd_save(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip,
+ WT_UPDATE *upd, WT_UPDATE *tombstone)
+{
+ WT_DELETE_HS_UPD *delete_hs_upd;
+
+ /* Make room for at least delete_hs_upd_next + 1 entries before writing the new slot. */
+ WT_RET(__wt_realloc_def(
+ session, &r->delete_hs_upd_allocated, r->delete_hs_upd_next + 1, &r->delete_hs_upd));
+ delete_hs_upd = &r->delete_hs_upd[r->delete_hs_upd_next];
+ delete_hs_upd->ins = ins;
+ delete_hs_upd->rip = rip;
+ delete_hs_upd->upd = upd;
+ delete_hs_upd->tombstone = tombstone;
+ ++r->delete_hs_upd_next;
+ return (0);
+}
+
+/*
* __rec_append_orig_value --
* Append the key's original value to its update list. It assumes that we have an onpage value,
* the onpage value is not a prepared update, and we don't overwrite transaction id to
@@ -183,6 +204,44 @@ err:
}
/*
+ * __rec_find_and_save_delete_hs_upd --
+ * Find and save the update that needs to be deleted from the history store later
+ *
+ * Walks the update chain starting at the selected tombstone if there is one, otherwise at the
+ * selected update, skipping aborted updates. A tombstone flagged WT_UPDATE_TO_DELETE_FROM_HS is
+ * remembered; the first flagged non-tombstone update is saved (together with any remembered
+ * tombstone) through __rec_delete_hs_upd_save and ends the walk. At most one entry is saved per
+ * call. Returns 0 unless saving the entry fails.
+ */
+static int
+__rec_find_and_save_delete_hs_upd(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins,
+ WT_ROW *rip, WT_UPDATE_SELECT *upd_select)
+{
+ WT_UPDATE *delete_tombstone, *delete_upd;
+
+ delete_tombstone = NULL;
+
+ for (delete_upd = upd_select->tombstone != NULL ? upd_select->tombstone : upd_select->upd;
+ delete_upd != NULL; delete_upd = delete_upd->next) {
+ if (delete_upd->txnid == WT_TXN_ABORTED)
+ continue;
+
+ if (F_ISSET(delete_upd, WT_UPDATE_TO_DELETE_FROM_HS)) {
+ WT_ASSERT_ALWAYS(session,
+ F_ISSET(delete_upd, WT_UPDATE_HS | WT_UPDATE_RESTORED_FROM_HS),
+ "Attempting to remove an update from the history store in WiredTiger, but the "
+ "update was missing.");
+ if (delete_upd->type == WT_UPDATE_TOMBSTONE)
+ delete_tombstone = delete_upd; /* Keep looking for the update it deletes. */
+ else {
+ WT_RET(
+ __rec_delete_hs_upd_save(session, r, ins, rip, delete_upd, delete_tombstone));
+ break;
+ }
+ }
+ }
+
+ /* delete_upd is only non-NULL here if the loop ended through the break above. */
+ WT_ASSERT_ALWAYS(session, delete_tombstone == NULL || delete_upd != NULL,
+ "If we delete a tombstone from the history store, we must also delete the update.");
+
+ return (0);
+}
+
+/*
* __rec_need_save_upd --
* Return if we need to save the update chain
*/
@@ -190,8 +249,6 @@ static inline bool
__rec_need_save_upd(
WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE_SELECT *upd_select, bool has_newer_updates)
{
- WT_UPDATE *upd;
-
if (upd_select->tw.prepare)
return (true);
@@ -202,15 +259,6 @@ __rec_need_save_upd(
if (upd_select->upd != NULL && upd_select->upd->type == WT_UPDATE_TOMBSTONE)
return (false);
- /* Save the update chain to delete the update from the history store later. */
- for (upd = upd_select->upd; upd != NULL; upd = upd->next) {
- if (upd->txnid == WT_TXN_ABORTED)
- continue;
-
- if (F_ISSET(upd, WT_UPDATE_TO_DELETE_FROM_HS))
- return (true);
- }
-
/*
* Don't save updates for any reconciliation that doesn't involve history store (in-memory
* database, metadata, and history store reconciliation itself), except when the selected stop
@@ -297,15 +345,10 @@ __rec_validate_upd_chain(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE *s
if (!F_ISSET(r, WT_REC_CHECKPOINT_RUNNING))
return (0);
- for (upd = select_upd; upd != NULL; upd = upd->next) {
- if (upd->txnid == WT_TXN_ABORTED)
- continue;
-
- /* Cannot delete the update from history store when checkpoint is running. */
- if (F_ISSET(upd, WT_UPDATE_TO_DELETE_FROM_HS)) {
- WT_STAT_CONN_DATA_INCR(session, cache_eviction_blocked_remove_hs_race_with_checkpoint);
- return (EBUSY);
- }
+ /* Cannot delete the update from history store when checkpoint is running. */
+ if (r->delete_hs_upd_next > 0) {
+ WT_STAT_CONN_DATA_INCR(session, cache_eviction_blocked_remove_hs_race_with_checkpoint);
+ return (EBUSY);
}
/*
@@ -795,6 +838,13 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, W
NULL :
upd_select->upd;
+ /*
+ * If we have done a prepared rollback, we may have restored a history store value to the update
+ * chain but the same value is left in the history store. Save it to delete it from the history
+ * store later.
+ */
+ WT_RET(__rec_find_and_save_delete_hs_upd(session, r, ins, rip, upd_select));
+
/* Check the update chain for conditions that could prevent its eviction. */
WT_RET(__rec_validate_upd_chain(session, r, onpage_upd, &upd_select->tw, vpack));
@@ -836,7 +886,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, W
/* Catch this case in diagnostic builds. */
WT_STAT_CONN_DATA_INCR(session, cache_eviction_blocked_no_ts_checkpoint_race_3);
WT_ASSERT(session, false);
- WT_RET(EBUSY);
+ return (EBUSY);
}
/*
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 7cb78c9c59f..25e75f12ef1 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -620,6 +620,9 @@ __rec_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COO
r->supd_next = 0;
r->supd_memsize = 0;
+ /* The list of updates to be deleted from the history store. */
+ r->delete_hs_upd_next = 0;
+
/* The list of pages we've written. */
r->multi = NULL;
r->multi_next = 0;
@@ -772,6 +775,7 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep)
__wt_buf_free(session, &r->chunk_B.image);
__wt_free(session, r->supd);
+ __wt_free(session, r->delete_hs_upd);
__wt_rec_dictionary_free(session, r);
@@ -2666,6 +2670,12 @@ __rec_hs_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
/* Flag as unused for non diagnostic builds. */
WT_UNUSED(btree);
+ /*
+ * Delete the updates left in the history store by prepared rollback first before moving updates
+ * to the history store.
+ */
+ WT_ERR(__wt_hs_delete_updates(session, r));
+
/* Check if there's work to do. */
for (multi = r->multi, i = 0; i < r->multi_next; ++multi, ++i)
if (multi->supd != NULL)
diff --git a/src/third_party/wiredtiger/test/format/failure_configs/CONFIG.WT-9805 b/src/third_party/wiredtiger/test/format/failure_configs/CONFIG.WT-9805
new file mode 100644
index 00000000000..34fd526585b
--- /dev/null
+++ b/src/third_party/wiredtiger/test/format/failure_configs/CONFIG.WT-9805
@@ -0,0 +1,196 @@
+############################################
+# RUN PARAMETERS: V3
+############################################
+assert.read_timestamp=0
+backup=0
+backup.incremental=off
+backup.incr_granularity=11691
+block_cache=0
+block_cache.cache_on_checkpoint=0
+block_cache.cache_on_writes=1
+block_cache.size=62
+btree.huffman_value=0
+cache=2178
+cache.evict_max=2
+cache.minimum=20
+checkpoint=on
+checkpoint.log_size=200
+checkpoint.wait=31
+disk.data_extend=0
+disk.direct_io=0
+disk.encryption=rotn-7
+disk.mmap=1
+disk.mmap_all=0
+format.abort=0
+format.independent_thread_rng=1
+format.major_timeout=0
+import=0
+logging=0
+logging.compression=none
+logging.file_max=64891
+logging.prealloc=0
+logging.remove=1
+ops.alter=0
+ops.compaction=0
+ops.hs_cursor=1
+ops.prepare=1
+ops.random_cursor=0
+ops.salvage=0
+ops.verify=1
+quiet=1
+runs.in_memory=0
+runs.ops=0
+runs.rows=1000000
+runs.tables=5
+runs.threads=32
+runs.timer=26
+runs.verify_failure_dump=0
+statistics=0
+statistics.server=0
+stress.aggressive_sweep=0
+stress.checkpoint=0
+stress.checkpoint_evict_page=0
+stress.checkpoint_reserved_txnid_delay=0
+stress.checkpoint_prepare=0
+stress.evict_reposition=0
+stress.failpoint_eviction_fail_after_reconciliation=1
+stress.failpoint_hs_delete_key_from_ts=0
+stress.hs_checkpoint_delay=0
+stress.hs_search=0
+stress.hs_sweep=0
+stress.split_1=0
+stress.split_2=0
+stress.split_3=0
+stress.split_4=0
+stress.split_5=0
+stress.split_6=0
+stress.split_7=0
+transaction.implicit=0
+transaction.timestamps=1
+wiredtiger.config=off
+wiredtiger.rwlock=1
+wiredtiger.leak_memory=0
+############################################
+# TABLE PARAMETERS: table 1
+############################################
+table1.btree.compression=snappy
+table1.btree.dictionary=0
+table1.btree.internal_key_truncation=1
+table1.btree.internal_page_max=9
+table1.btree.leaf_page_max=10
+table1.btree.memory_page_max=1
+table1.btree.repeat_data_pct=47
+table1.btree.split_pct=77
+table1.btree.value_max=3212
+table1.btree.value_min=0
+table1.disk.checksum=unencrypted
+table1.disk.firstfit=0
+table1.ops.pct.delete=55
+table1.ops.pct.insert=42
+table1.ops.pct.modify=3
+table1.ops.pct.read=0
+table1.ops.pct.write=0
+table1.ops.truncate=1
+table1.runs.mirror=0
+table1.runs.source=table
+table1.runs.type=variable-length column-store
+############################################
+# TABLE PARAMETERS: table 2
+############################################
+table2.btree.compression=zstd
+table2.btree.dictionary=0
+table2.btree.internal_key_truncation=1
+table2.btree.internal_page_max=13
+table2.btree.key_max=46
+table2.btree.key_min=25
+table2.btree.leaf_page_max=14
+table2.btree.memory_page_max=6
+table2.btree.prefix_len=0
+table2.btree.prefix_compression=1
+table2.btree.prefix_compression_min=8
+table2.btree.reverse=0
+table2.btree.split_pct=69
+table2.btree.value_max=3706
+table2.btree.value_min=13
+table2.disk.checksum=off
+table2.disk.firstfit=0
+table2.ops.pct.delete=72
+table2.ops.pct.insert=0
+table2.ops.pct.modify=5
+table2.ops.pct.read=1
+table2.ops.pct.write=22
+table2.ops.truncate=1
+table2.runs.mirror=0
+table2.runs.source=table
+table2.runs.type=row-store
+############################################
+# TABLE PARAMETERS: table 3
+############################################
+table3.btree.bitcnt=3
+table3.btree.compression=zlib
+table3.btree.internal_key_truncation=1
+table3.btree.internal_page_max=17
+table3.btree.leaf_page_max=11
+table3.btree.memory_page_max=10
+table3.btree.split_pct=81
+table3.disk.checksum=on
+table3.disk.firstfit=0
+table3.ops.pct.delete=9
+table3.ops.pct.insert=44
+table3.ops.pct.modify=14
+table3.ops.pct.read=32
+table3.ops.pct.write=1
+table3.ops.truncate=1
+table3.runs.mirror=0
+table3.runs.source=table
+table3.runs.type=fixed-length column-store
+############################################
+# TABLE PARAMETERS: table 4
+############################################
+table4.btree.compression=zlib
+table4.btree.dictionary=1
+table4.btree.internal_key_truncation=1
+table4.btree.internal_page_max=14
+table4.btree.key_max=30
+table4.btree.key_min=27
+table4.btree.leaf_page_max=14
+table4.btree.memory_page_max=9
+table4.btree.prefix_len=67
+table4.btree.prefix_compression=1
+table4.btree.prefix_compression_min=5
+table4.btree.reverse=0
+table4.btree.split_pct=88
+table4.btree.value_max=120
+table4.btree.value_min=20
+table4.disk.checksum=on
+table4.disk.firstfit=0
+table4.ops.pct.delete=4
+table4.ops.pct.insert=41
+table4.ops.pct.modify=0
+table4.ops.pct.read=2
+table4.ops.pct.write=53
+table4.ops.truncate=1
+table4.runs.mirror=0
+table4.runs.source=file
+table4.runs.type=row-store
+############################################
+# TABLE PARAMETERS: table 5
+############################################
+table5.btree.bitcnt=2
+table5.btree.compression=snappy
+table5.btree.internal_key_truncation=1
+table5.btree.internal_page_max=11
+table5.btree.leaf_page_max=14
+table5.btree.memory_page_max=7
+table5.btree.split_pct=97
+table5.disk.checksum=on
+table5.disk.firstfit=0
+table5.ops.pct.delete=19
+table5.ops.pct.insert=2
+table5.ops.pct.modify=71
+table5.ops.pct.read=7
+table5.ops.pct.write=1
+table5.ops.truncate=1
+table5.runs.mirror=0
+table5.runs.source=table
+table5.runs.type=fixed-length column-store
diff --git a/src/third_party/wiredtiger/test/suite/test_prepare25.py b/src/third_party/wiredtiger/test/suite/test_prepare25.py
index 95fce47181d..e512c3c7fef 100644
--- a/src/third_party/wiredtiger/test/suite/test_prepare25.py
+++ b/src/third_party/wiredtiger/test/suite/test_prepare25.py
@@ -31,7 +31,7 @@ from wtscenario import make_scenarios
# test_prepare25.py
# Test prepare rollback and then prepare commit with failed eviction.
-class test_prepare23(wttest.WiredTigerTestCase):
+class test_prepare25(wttest.WiredTigerTestCase):
conn_config = 'timing_stress_for_test=[failpoint_eviction_fail_after_reconciliation]'
format_values = [
diff --git a/src/third_party/wiredtiger/test/suite/test_prepare26.py b/src/third_party/wiredtiger/test/suite/test_prepare26.py
new file mode 100644
index 00000000000..52e08cedff6
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_prepare26.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+from wtscenario import make_scenarios
+
+# test_prepare26.py
+# Test prepare rollback and then delete the key.
+class test_prepare26(wttest.WiredTigerTestCase):
+    format_values = [
+        ('column', dict(key_format='r', value_format='S')),
+        ('column_fix', dict(key_format='r', value_format='8t')),
+        ('row_integer', dict(key_format='i', value_format='S')),
+    ]
+
+    scenarios = make_scenarios(format_values)
+
+    def test_prepare26(self):
+        """Roll back a prepared update, delete the key, and verify reads across evictions."""
+        uri = "table:test_prepare26"
+        self.session.create(uri, 'key_format=' + self.key_format + ',value_format=' + self.value_format)
+
+        if self.value_format == '8t':
+            value_a = 97
+            value_b = 98
+            value_c = 99
+        else:
+            value_a = "a"
+            value_b = "b"
+            value_c = "c"
+
+        # Pin oldest timestamp to 1
+        self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(1))
+
+        # Insert a value and commit it at timestamp 10
+        cursor = self.session.open_cursor(uri)
+        self.session.begin_transaction()
+        cursor[1] = value_a
+        self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(10))
+
+        # Do a prepared update at timestamp 20 and leave it unresolved
+        self.session.begin_transaction()
+        cursor[1] = value_c
+        self.session.prepare_transaction('prepare_timestamp=' + self.timestamp_str(20))
+
+        # Evict the page while the prepared update is still outstanding
+        session2 = self.conn.open_session()
+        evict_cursor = session2.open_cursor(uri, None, 'debug=(release_evict)')
+        session2.begin_transaction('ignore_prepare=true,read_timestamp=' + self.timestamp_str(10))
+        self.assertEqual(evict_cursor[1], value_a)
+        evict_cursor.reset()
+        evict_cursor.close()
+        session2.rollback_transaction()
+
+        # Rollback the prepared transaction
+        self.session.rollback_transaction()
+
+        # Delete the key at timestamp 30
+        self.session.begin_transaction()
+        cursor.set_key(1)
+        cursor.remove()
+        self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(30))
+
+        # Set oldest timestamp to 30
+        self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(30))
+
+        # Evict the page again; the key must read as deleted (0 for fixed-length '8t')
+        evict_cursor = session2.open_cursor(uri, None, 'debug=(release_evict)')
+        session2.begin_transaction()
+        if self.value_format == '8t':
+            self.assertEqual(evict_cursor[1], 0)
+        else:
+            evict_cursor.set_key(1)
+            self.assertEqual(evict_cursor.search(), wiredtiger.WT_NOTFOUND)
+        evict_cursor.reset()
+        evict_cursor.close()
+        session2.rollback_transaction()
+
+        # Do an update at timestamp 40
+        cursor = self.session.open_cursor(uri)
+        self.session.begin_transaction()
+        cursor[1] = value_b
+        self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(40))
+
+        # Do another update at timestamp 50
+        cursor = self.session.open_cursor(uri)
+        self.session.begin_transaction()
+        cursor[1] = value_c
+        self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(50))
+
+        # Evict the page a third time, reading the newest value at timestamp 50
+        evict_cursor = session2.open_cursor(uri, None, 'debug=(release_evict)')
+        session2.begin_transaction('read_timestamp=' + self.timestamp_str(50))
+        self.assertEqual(evict_cursor[1], value_c)
+        evict_cursor.reset()
+        evict_cursor.close()
+        session2.rollback_transaction()
+
+        # Verify we read nothing at the oldest timestamp (30)
+        self.session.begin_transaction('read_timestamp=' + self.timestamp_str(30))
+        if self.value_format == '8t':
+            self.assertEqual(cursor[1], 0)
+        else:
+            cursor.set_key(1)
+            self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND)
+        self.session.rollback_transaction()
+
+if __name__ == '__main__':
+    wttest.run()