summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuke Chen <luke.chen@mongodb.com>2022-12-05 16:26:09 +1100
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-12-05 05:54:13 +0000
commita98ea6e3be4a0c0aa93f691bbc5b4ae26f3dfafa (patch)
treedad96a1a24cf501d8a51be03b0873e5d306c3034
parent101cdcc819bd22d374383450fa216a49f8031795 (diff)
downloadmongo-a98ea6e3be4a0c0aa93f691bbc5b4ae26f3dfafa.tar.gz
Import wiredtiger: adba8ce2917cd4d131237b7ccf4fc89199e26480 from branch mongodb-5.0
ref: f1fa61904b..adba8ce291 for: 5.0.15 WT-9805 Save the updates need to be deleted from history store and delete them later
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_cursor.c7
-rw-r--r--src/third_party/wiredtiger/src/history/hs_rec.c289
-rw-r--r--src/third_party/wiredtiger/src/include/extern.h2
-rw-r--r--src/third_party/wiredtiger/src/include/reconcile.h23
-rw-r--r--src/third_party/wiredtiger/src/include/wt_internal.h2
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_visibility.c94
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_write.c10
-rw-r--r--src/third_party/wiredtiger/test/suite/test_prepare25.py2
-rw-r--r--src/third_party/wiredtiger/test/suite/test_prepare26.py134
10 files changed, 405 insertions, 160 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 8f4375f0d27..7f18ae7d4ff 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-5.0",
- "commit": "f1fa61904bdec4a931daa49452cf3b92a3bca37a"
+ "commit": "adba8ce2917cd4d131237b7ccf4fc89199e26480"
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index 013ccb16845..ecaf3bc74a7 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -454,6 +454,9 @@ __wt_btcur_reset(WT_CURSOR_BTREE *cbt)
WT_STAT_CONN_DATA_INCR(session, cursor_reset);
F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ /* Initialize the update value as we are not pointing to any value. */
+ cbt->upd_value->type = WT_UPDATE_INVALID;
+ WT_TIME_WINDOW_INIT(&cbt->upd_value->tw);
return (__cursor_reset(cbt));
}
@@ -1886,6 +1889,10 @@ __wt_btcur_open(WT_CURSOR_BTREE *cbt)
cbt->modify_update = &cbt->_modify_update;
cbt->upd_value = &cbt->_upd_value;
+ /* Initialize the value. */
+ cbt->upd_value->type = WT_UPDATE_INVALID;
+ WT_TIME_WINDOW_INIT(&cbt->upd_value->tw);
+
#ifdef HAVE_DIAGNOSTIC
cbt->lastkey = &cbt->_lastkey;
cbt->lastrecno = WT_RECNO_OOB;
diff --git a/src/third_party/wiredtiger/src/history/hs_rec.c b/src/third_party/wiredtiger/src/history/hs_rec.c
index 01d3d40a7c1..a1fbd7c3014 100644
--- a/src/third_party/wiredtiger/src/history/hs_rec.c
+++ b/src/third_party/wiredtiger/src/history/hs_rec.c
@@ -61,65 +61,6 @@ __hs_verbose_cache_stats(WT_SESSION_IMPL *session, WT_BTREE *btree)
}
/*
- * __hs_delete_record --
- * Delete the update left in the history store
- */
-static int
-__hs_delete_record(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_ITEM *key,
- WT_UPDATE *delete_upd, WT_UPDATE *delete_tombstone)
-{
- WT_DECL_RET;
- bool hs_read_committed;
-#ifdef HAVE_DIAGNOSTIC
- WT_TIME_WINDOW *hs_tw;
-#endif
-
- hs_read_committed = F_ISSET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
- F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
-
- /* No need to delete from the history store if it is already obsolete. */
- if (delete_tombstone != NULL && __wt_txn_upd_visible_all(session, delete_tombstone)) {
- ret = 0;
- goto done;
- }
-
- hs_cursor->set_key(hs_cursor, 4, S2BT(session)->id, key, WT_TS_MAX, UINT64_MAX);
- WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, hs_cursor), true);
- /* It's possible the value in the history store becomes obsolete concurrently. */
- if (ret == WT_NOTFOUND) {
- WT_ASSERT(
- session, delete_tombstone != NULL && __wt_txn_upd_visible_all(session, delete_tombstone));
- ret = 0;
- goto done;
- }
-
-#ifdef HAVE_DIAGNOSTIC
- __wt_hs_upd_time_window(hs_cursor, &hs_tw);
- WT_ASSERT(session, hs_tw->start_txn == WT_TXN_NONE || hs_tw->start_txn == delete_upd->txnid);
- WT_ASSERT(session, hs_tw->start_ts == WT_TS_NONE || hs_tw->start_ts == delete_upd->start_ts);
- WT_ASSERT(session,
- hs_tw->durable_start_ts == WT_TS_NONE || hs_tw->durable_start_ts == delete_upd->durable_ts);
- if (delete_tombstone != NULL) {
- WT_ASSERT(session, hs_tw->stop_txn == delete_tombstone->txnid);
- WT_ASSERT(session, hs_tw->stop_ts == delete_tombstone->start_ts);
- WT_ASSERT(session, hs_tw->durable_stop_ts == delete_tombstone->durable_ts);
- } else
- WT_ASSERT(session, !WT_TIME_WINDOW_HAS_STOP(hs_tw));
-#endif
-
- WT_ERR(hs_cursor->remove(hs_cursor));
-done:
- if (delete_tombstone != NULL)
- F_CLR(delete_tombstone, WT_UPDATE_TO_DELETE_FROM_HS | WT_UPDATE_HS);
- F_CLR(delete_upd, WT_UPDATE_TO_DELETE_FROM_HS | WT_UPDATE_HS);
-
-err:
- if (!hs_read_committed)
- F_CLR(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
- return (ret);
-}
-
-/*
* __hs_insert_record --
* A helper function to insert the record into the history store including stop time point.
*/
@@ -349,9 +290,45 @@ __hs_next_upd_full_value(WT_SESSION_IMPL *session, WT_UPDATE_VECTOR *updates,
}
/*
+ * __hs_pack_key --
+ * Pack the history store key
+ */
+static inline int
+__hs_pack_key(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_RECONCILE *r, WT_INSERT *ins,
+ WT_ROW *rip, WT_ITEM *key)
+{
+ WT_DECL_RET;
+ uint8_t *p;
+
+ switch (r->page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ p = key->mem;
+ WT_RET(__wt_vpack_uint(&p, 0, WT_INSERT_RECNO(ins)));
+ key->size = WT_PTRDIFF(p, key->data);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ if (ins == NULL) {
+ WT_WITH_BTREE(
+ session, btree, ret = __wt_row_leaf_key(session, r->page, rip, key, false));
+ WT_RET(ret);
+ } else {
+ key->data = WT_INSERT_KEY(ins);
+ key->size = WT_INSERT_KEY_SIZE(ins);
+ }
+ break;
+ default:
+ WT_RET(__wt_illegal_value(session, r->page->type));
+ }
+
+ return (ret);
+}
+
+/*
* __wt_hs_insert_updates --
- * Copy one set of saved updates into the database's history store table. Whether the function
- * fails or succeeds, if there is a successful write to history, cache_write_hs is set to true.
+ * Copy one set of saved updates into the database's history store table if they haven't been
+ * moved there. Whether the function fails or succeeds, if there is a successful write to
+ * history, cache_write_hs is set to true.
*/
int
__wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_MULTI *multi)
@@ -373,13 +350,11 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_MULTI *mult
WT_UPDATE_VECTOR out_of_order_ts_updates;
WT_SAVE_UPD *list;
WT_UPDATE *first_globally_visible_upd, *fix_ts_upd, *min_ts_upd, *out_of_order_ts_upd;
- WT_UPDATE *delete_tombstone, *delete_upd, *newest_hs, *non_aborted_upd, *oldest_upd, *prev_upd,
- *ref_upd, *tombstone, *upd;
+ WT_UPDATE *newest_hs, *non_aborted_upd, *oldest_upd, *prev_upd, *ref_upd, *tombstone, *upd;
WT_TIME_WINDOW tw;
wt_off_t hs_size;
uint64_t insert_cnt, max_hs_size, modify_cnt;
uint32_t i;
- uint8_t *p;
int nentries;
bool enable_reverse_modify, error_on_ooo_ts, hs_inserted, squashed;
@@ -421,68 +396,6 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_MULTI *mult
if (list->onpage_upd == NULL)
continue;
- /* History store table key component: source key. */
- switch (r->page->type) {
- case WT_PAGE_COL_FIX:
- case WT_PAGE_COL_VAR:
- p = key->mem;
- WT_ERR(__wt_vpack_uint(&p, 0, WT_INSERT_RECNO(list->ins)));
- key->size = WT_PTRDIFF(p, key->data);
- break;
- case WT_PAGE_ROW_LEAF:
- if (list->ins == NULL) {
- WT_WITH_BTREE(
- session, btree, ret = __wt_row_leaf_key(session, r->page, list->rip, key, false));
- WT_ERR(ret);
- } else {
- key->data = WT_INSERT_KEY(list->ins);
- key->size = WT_INSERT_KEY_SIZE(list->ins);
- }
- break;
- default:
- WT_ERR(__wt_illegal_value(session, r->page->type));
- }
-
- newest_hs = first_globally_visible_upd = min_ts_upd = out_of_order_ts_upd = NULL;
- ref_upd = list->onpage_upd;
- delete_tombstone = delete_upd = NULL;
-
- __wt_update_vector_clear(&out_of_order_ts_updates);
- __wt_update_vector_clear(&updates);
-
- /*
- * Reverse deltas are only supported on 'S' and 'u' value formats.
- */
- enable_reverse_modify =
- (WT_STREQ(btree->value_format, "S") || WT_STREQ(btree->value_format, "u"));
-
- /*
- * Delete the update that is both on the update chain and the history store from the history
- * store. Otherwise, we will trigger out of order fix when the update is inserted to the
- * history store again.
- */
- for (upd = list->onpage_tombstone != NULL ? list->onpage_tombstone : list->onpage_upd;
- upd != NULL; upd = upd->next) {
- if (upd->txnid == WT_TXN_ABORTED)
- continue;
-
- if (F_ISSET(upd, WT_UPDATE_TO_DELETE_FROM_HS)) {
- /*
- * If we want to remove an update from the history store in WiredTiger, it must be
- * in history store.
- */
- WT_ASSERT(session, F_ISSET(upd, WT_UPDATE_HS | WT_UPDATE_RESTORED_FROM_HS));
- if (upd->type == WT_UPDATE_TOMBSTONE)
- delete_tombstone = upd;
- else {
- delete_upd = upd;
- WT_ERR(
- __hs_delete_record(session, hs_cursor, key, delete_upd, delete_tombstone));
- break;
- }
- }
- }
-
/* Skip aborted updates. */
for (upd = list->onpage_upd->next; upd != NULL && upd->txnid == WT_TXN_ABORTED;
upd = upd->next)
@@ -496,6 +409,21 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_MULTI *mult
if (F_ISSET(upd, WT_UPDATE_HS))
continue;
+ /* History store table key component: source key. */
+ WT_ERR(__hs_pack_key(session, btree, r, list->ins, list->rip, key));
+
+ newest_hs = first_globally_visible_upd = min_ts_upd = out_of_order_ts_upd = NULL;
+ ref_upd = list->onpage_upd;
+
+ __wt_update_vector_clear(&updates);
+ __wt_update_vector_clear(&out_of_order_ts_updates);
+
+ /*
+ * Reverse deltas are only supported on 'S' and 'u' value formats.
+ */
+ enable_reverse_modify =
+ (WT_STREQ(btree->value_format, "S") || WT_STREQ(btree->value_format, "u"));
+
/*
* The algorithm assumes the oldest update on the update chain in memory is either a full
* update or a tombstone.
@@ -885,7 +813,7 @@ __wt_hs_delete_key_from_ts(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint3
{
WT_DECL_RET;
WT_ITEM hs_key;
- wt_timestamp_t hs_ts;
+ wt_timestamp_t hs_start_ts;
uint64_t hs_counter;
uint32_t hs_btree_id;
bool hs_read_all_flag;
@@ -910,7 +838,7 @@ __wt_hs_delete_key_from_ts(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint3
ret = 0;
goto done;
} else {
- WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_ts, &hs_counter));
+ WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter));
++hs_counter;
}
@@ -940,8 +868,8 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui
WT_CURSOR_BTREE *hs_cbt;
WT_DECL_RET;
WT_ITEM hs_key, hs_value;
- WT_TIME_WINDOW hs_insert_tw, tw, *twp;
- wt_timestamp_t hs_ts;
+ WT_TIME_WINDOW hs_insert_tw, *twp;
+ wt_timestamp_t hs_durable_start_ts, hs_durable_stop_ts, hs_start_ts;
uint64_t hs_counter, hs_upd_type;
uint32_t hs_btree_id;
#ifdef HAVE_DIAGNOSTIC
@@ -1016,7 +944,7 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui
continue;
/* We shouldn't have crossed the btree and user key search space. */
- WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_ts, &hs_counter));
+ WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter));
WT_ASSERT(session, hs_btree_id == btree_id);
#ifdef HAVE_DIAGNOSTIC
WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp));
@@ -1028,7 +956,7 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui
* the cell. The cell's start timestamp can be cleared during reconciliation if it is
* globally visible.
*/
- if (hs_ts >= ts || twp->stop_ts >= ts)
+ if (hs_start_ts >= ts || twp->stop_ts >= ts)
break;
}
if (ret == WT_NOTFOUND)
@@ -1095,7 +1023,7 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui
*/
for (; ret == 0; ret = hs_cursor->next(hs_cursor)) {
/* We shouldn't have crossed the btree and user key search space. */
- WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_ts, &hs_counter));
+ WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter));
WT_ASSERT(session, hs_btree_id == btree_id);
#ifdef HAVE_DIAGNOSTIC
WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp));
@@ -1113,7 +1041,7 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui
* ignoring them.
*/
__wt_hs_upd_time_window(hs_cursor, &twp);
- if (hs_ts < ts && twp->stop_ts < ts)
+ if (hs_start_ts < ts && twp->stop_ts < ts)
continue;
if (reinsert) {
@@ -1167,10 +1095,10 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui
/* Extract the underlying value for reinsertion. */
WT_ERR(hs_cursor->get_value(
- hs_cursor, &tw.durable_stop_ts, &tw.durable_start_ts, &hs_upd_type, &hs_value));
+ hs_cursor, &hs_durable_stop_ts, &hs_durable_start_ts, &hs_upd_type, &hs_value));
/* Reinsert the update with corrected timestamps. */
- if (no_ts_tombstone && hs_ts == ts)
+ if (no_ts_tombstone && hs_start_ts == ts)
*counter = hs_counter;
/* Insert the value back with different timestamps. */
@@ -1197,3 +1125,92 @@ err:
hs_insert_cursor->close(hs_insert_cursor);
return (ret);
}
+
+/*
+ * __hs_delete_record --
+ * Delete an update from the history store if it is not obsolete
+ */
+static int
+__hs_delete_record(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ITEM *key, WT_UPDATE *upd, WT_UPDATE *tombstone)
+{
+ WT_DECL_RET;
+ bool hs_read_committed;
+#ifdef HAVE_DIAGNOSTIC
+ WT_TIME_WINDOW *hs_tw;
+#endif
+
+ if (r->hs_cursor == NULL)
+ WT_RET(__wt_curhs_open(session, NULL, &r->hs_cursor));
+ hs_read_committed = F_ISSET(r->hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
+ /* Ensure we can see all the content in the history store. */
+ F_SET(r->hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
+
+ /* No need to delete from the history store if it is already obsolete. */
+ if (tombstone != NULL && __wt_txn_upd_visible_all(session, tombstone))
+ goto done;
+
+ r->hs_cursor->set_key(r->hs_cursor, 4, S2BT(session)->id, key, WT_TS_MAX, UINT64_MAX);
+ WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, r->hs_cursor), true);
+ /* It's possible the value in the history store becomes obsolete concurrently. */
+ if (ret == WT_NOTFOUND) {
+ WT_ASSERT(session, tombstone != NULL && __wt_txn_upd_visible_all(session, tombstone));
+ ret = 0;
+ } else {
+#ifdef HAVE_DIAGNOSTIC
+ __wt_hs_upd_time_window(r->hs_cursor, &hs_tw);
+ WT_ASSERT(session, hs_tw->start_txn == WT_TXN_NONE || hs_tw->start_txn == upd->txnid);
+ WT_ASSERT(session, hs_tw->start_ts == WT_TS_NONE || hs_tw->start_ts == upd->start_ts);
+ WT_ASSERT(session,
+ hs_tw->durable_start_ts == WT_TS_NONE || hs_tw->durable_start_ts == upd->durable_ts);
+ if (tombstone != NULL) {
+ WT_ASSERT(session, hs_tw->stop_txn == tombstone->txnid);
+ WT_ASSERT(session, hs_tw->stop_ts == tombstone->start_ts);
+ WT_ASSERT(session, hs_tw->durable_stop_ts == tombstone->durable_ts);
+ } else
+ WT_ASSERT(session, !WT_TIME_WINDOW_HAS_STOP(hs_tw));
+#endif
+
+ WT_ERR(r->hs_cursor->remove(r->hs_cursor));
+ }
+done:
+ if (tombstone != NULL)
+ F_CLR(tombstone, WT_UPDATE_TO_DELETE_FROM_HS | WT_UPDATE_HS);
+ F_CLR(upd, WT_UPDATE_TO_DELETE_FROM_HS | WT_UPDATE_HS);
+
+err:
+ if (!hs_read_committed)
+ F_CLR(r->hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
+ return (ret);
+}
+
+/*
+ * __wt_hs_delete_updates --
+ * Delete the updates from the history store
+ */
+int
+__wt_hs_delete_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_DELETE_HS_UPD *delete_hs_upd;
+ uint32_t i;
+
+ /* Nothing to delete from the history store. */
+ if (r->delete_hs_upd == NULL)
+ return (0);
+
+ btree = S2BT(session);
+
+ WT_RET(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key));
+
+ for (delete_hs_upd = r->delete_hs_upd, i = 0; i < r->delete_hs_upd_next; ++delete_hs_upd, ++i) {
+ WT_ERR(__hs_pack_key(session, btree, r, delete_hs_upd->ins, delete_hs_upd->rip, key));
+ WT_ERR(__hs_delete_record(session, r, key, delete_hs_upd->upd, delete_hs_upd->tombstone));
+ }
+
+err:
+ __wt_scr_free(session, &key);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 48532c375df..57f30316c26 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -787,6 +787,8 @@ extern int __wt_hs_config(WT_SESSION_IMPL *session, const char **cfg)
extern int __wt_hs_delete_key_from_ts(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor,
uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts, bool reinsert, bool ooo_tombstone,
bool error_on_ooo_ts) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_hs_delete_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_hs_find_upd(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
const char *value_format, uint64_t recno, WT_UPDATE_VALUE *upd_value, WT_ITEM *base_value_buf)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h
index 70664653e5d..9d648f62d6a 100644
--- a/src/third_party/wiredtiger/src/include/reconcile.h
+++ b/src/third_party/wiredtiger/src/include/reconcile.h
@@ -61,6 +61,17 @@ struct __wt_rec_chunk {
};
/*
+ * WT_DELETE_HS_UPD --
+ * Update that needs to be deleted from the history store.
+ */
+struct __wt_delete_hs_upd {
+ WT_INSERT *ins; /* Insert list reference */
+ WT_ROW *rip; /* Original on-page reference */
+ WT_UPDATE *upd;
+ WT_UPDATE *tombstone;
+};
+
+/*
* Reconciliation is the process of taking an in-memory page, walking each entry
* in the page, building a backing disk image in a temporary buffer representing
* that information, and writing that buffer to disk. What could be simpler?
@@ -217,6 +228,15 @@ struct __wt_reconcile {
size_t supd_allocated;
size_t supd_memsize; /* Size of saved update structures */
+ /*
+ * List of updates to be deleted from the history store. While reviewing updates for each page,
+ * we save the updates that needs to be deleted from history store here, and then delete them
+ * after we have built the disk image.
+ */
+ WT_DELETE_HS_UPD *delete_hs_upd; /* Updates to delete from history store */
+ uint32_t delete_hs_upd_next;
+ size_t delete_hs_upd_allocated;
+
/* List of pages we've written so far. */
WT_MULTI *multi;
uint32_t multi_next;
@@ -295,7 +315,8 @@ struct __wt_reconcile {
};
typedef struct {
- WT_UPDATE *upd; /* Update to write (or NULL) */
+ WT_UPDATE *upd; /* Update to write (or NULL) */
+ WT_UPDATE *tombstone; /* The tombstone to write (or NULL) */
WT_TIME_WINDOW tw;
diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h
index d1666d35a9c..22724bc2be3 100644
--- a/src/third_party/wiredtiger/src/include/wt_internal.h
+++ b/src/third_party/wiredtiger/src/include/wt_internal.h
@@ -181,6 +181,8 @@ struct __wt_data_handle;
typedef struct __wt_data_handle WT_DATA_HANDLE;
struct __wt_data_handle_cache;
typedef struct __wt_data_handle_cache WT_DATA_HANDLE_CACHE;
+struct __wt_delete_hs_upd;
+typedef struct __wt_delete_hs_upd WT_DELETE_HS_UPD;
struct __wt_dlh;
typedef struct __wt_dlh WT_DLH;
struct __wt_dsrc_stats;
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
index 855fa327439..5d476a00c75 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
@@ -52,6 +52,27 @@ __rec_update_save(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_
}
/*
+ * __rec_delete_hs_upd_save --
+ * Save an update into a WT_DELETE_HS_UPD list to delete it from the history store later.
+ */
+static inline int
+__rec_delete_hs_upd_save(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip,
+ WT_UPDATE *upd, WT_UPDATE *tombstone)
+{
+ WT_DELETE_HS_UPD *delete_hs_upd;
+
+ WT_RET(__wt_realloc_def(
+ session, &r->delete_hs_upd_allocated, r->delete_hs_upd_next + 1, &r->delete_hs_upd));
+ delete_hs_upd = &r->delete_hs_upd[r->delete_hs_upd_next];
+ delete_hs_upd->ins = ins;
+ delete_hs_upd->rip = rip;
+ delete_hs_upd->upd = upd;
+ delete_hs_upd->tombstone = tombstone;
+ ++r->delete_hs_upd_next;
+ return (0);
+}
+
+/*
* __rec_append_orig_value --
* Append the key's original value to its update list. It assumes that we have an onpage value,
* the onpage value is not a prepared update, and we don't overwrite transaction id to
@@ -194,6 +215,45 @@ err:
}
/*
+ * __rec_find_and_save_delete_hs_upd --
+ * Find and save the update that needs to be deleted from the history store later
+ */
+static int
+__rec_find_and_save_delete_hs_upd(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins,
+ WT_ROW *rip, WT_UPDATE_SELECT *upd_select)
+{
+ WT_UPDATE *delete_tombstone, *delete_upd;
+
+ delete_tombstone = NULL;
+
+ for (delete_upd = upd_select->tombstone != NULL ? upd_select->tombstone : upd_select->upd;
+ delete_upd != NULL; delete_upd = delete_upd->next) {
+ if (delete_upd->txnid == WT_TXN_ABORTED)
+ continue;
+
+ if (F_ISSET(delete_upd, WT_UPDATE_TO_DELETE_FROM_HS)) {
+ /*
+ * If we want to remove an update from the history store in WiredTiger, it must be in
+ * history store.
+ */
+ WT_ASSERT(session, F_ISSET(delete_upd, WT_UPDATE_HS | WT_UPDATE_RESTORED_FROM_HS));
+ if (delete_upd->type == WT_UPDATE_TOMBSTONE)
+ delete_tombstone = delete_upd;
+ else {
+ WT_RET(
+ __rec_delete_hs_upd_save(session, r, ins, rip, delete_upd, delete_tombstone));
+ break;
+ }
+ }
+ }
+
+ /* If we delete a tombstone from the history store, we must also delete the update. */
+ WT_ASSERT(session, delete_tombstone == NULL || delete_upd != NULL);
+
+ return (0);
+}
+
+/*
* __rec_need_save_upd --
* Return if we need to save the update chain
*/
@@ -201,8 +261,6 @@ static inline bool
__rec_need_save_upd(
WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE_SELECT *upd_select, bool has_newer_updates)
{
- WT_UPDATE *upd;
-
if (upd_select->tw.prepare)
return (true);
@@ -213,15 +271,6 @@ __rec_need_save_upd(
if (upd_select->upd != NULL && upd_select->upd->type == WT_UPDATE_TOMBSTONE)
return (false);
- /* Save the update chain to delete the update from the history store later. */
- for (upd = upd_select->upd; upd != NULL; upd = upd->next) {
- if (upd->txnid == WT_TXN_ABORTED)
- continue;
-
- if (F_ISSET(upd, WT_UPDATE_TO_DELETE_FROM_HS))
- return (true);
- }
-
/*
* Don't save updates for any reconciliation that doesn't involve history store (in-memory
* database, fixed length column store, metadata, and history store reconciliation itself),
@@ -308,15 +357,10 @@ __rec_validate_upd_chain(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE *s
if (!F_ISSET(r, WT_REC_CHECKPOINT_RUNNING))
return (0);
- for (upd = select_upd; upd != NULL; upd = upd->next) {
- if (upd->txnid == WT_TXN_ABORTED)
- continue;
-
- /* Cannot delete the update from history store when checkpoint is running. */
- if (F_ISSET(upd, WT_UPDATE_TO_DELETE_FROM_HS)) {
- WT_STAT_CONN_DATA_INCR(session, cache_eviction_blocked_remove_hs_race_with_checkpoint);
- return (EBUSY);
- }
+ /* Cannot delete the update from history store when checkpoint is running. */
+ if (r->delete_hs_upd_next > 0) {
+ WT_STAT_CONN_DATA_INCR(session, cache_eviction_blocked_remove_hs_race_with_checkpoint);
+ return (EBUSY);
}
/*
@@ -431,6 +475,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, W
* both must be initialized.
*/
upd_select->upd = NULL;
+ upd_select->tombstone = NULL;
upd_select->upd_saved = false;
upd_select->ooo_tombstone = false;
select_tw = &upd_select->tw;
@@ -634,7 +679,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, W
*/
if (upd->type == WT_UPDATE_TOMBSTONE) {
WT_TIME_WINDOW_SET_STOP(select_tw, upd);
- tombstone = upd;
+ tombstone = upd_select->tombstone = upd;
/* Find the update this tombstone applies to. */
if (!__wt_txn_upd_visible_all(session, upd)) {
@@ -733,6 +778,13 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, W
NULL :
upd_select->upd;
+ /*
+ * If we have done a prepared rollback, we may have restored a history store value to the update
+ * chain but the same value is left in the history store. Save it to delete it from the history
+ * store later.
+ */
+ WT_ERR(__rec_find_and_save_delete_hs_upd(session, r, ins, rip, upd_select));
+
/* Check the update chain for conditions that could prevent it's eviction. */
WT_ERR(__rec_validate_upd_chain(session, r, onpage_upd, select_tw, vpack));
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 74ec3714443..c1c25b192fd 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -623,6 +623,9 @@ __rec_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COO
r->supd_next = 0;
r->supd_memsize = 0;
+ /* The list of updates to be deleted from the history store. */
+ r->delete_hs_upd_next = 0;
+
/* The list of pages we've written. */
r->multi = NULL;
r->multi_next = 0;
@@ -775,6 +778,7 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep)
__wt_buf_free(session, &r->chunk_B.image);
__wt_free(session, r->supd);
+ __wt_free(session, r->delete_hs_upd);
__wt_rec_dictionary_free(session, r);
@@ -2385,6 +2389,12 @@ __rec_hs_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
/* Flag as unused for non diagnostic builds. */
WT_UNUSED(btree);
+ /*
+ * Delete the updates left in the history store by prepared rollback first before moving updates
+ * to the history store.
+ */
+ WT_ERR(__wt_hs_delete_updates(session, r));
+
/* Check if there's work to do. */
for (multi = r->multi, i = 0; i < r->multi_next; ++multi, ++i)
if (multi->supd != NULL)
diff --git a/src/third_party/wiredtiger/test/suite/test_prepare25.py b/src/third_party/wiredtiger/test/suite/test_prepare25.py
index 3059abb3b25..fd6100a54f7 100644
--- a/src/third_party/wiredtiger/test/suite/test_prepare25.py
+++ b/src/third_party/wiredtiger/test/suite/test_prepare25.py
@@ -31,7 +31,7 @@ from wtscenario import make_scenarios
# test_prepare25.py
# Test prepare rollback and then prepare commit with failed eviction.
-class test_prepare23(wttest.WiredTigerTestCase):
+class test_prepare25(wttest.WiredTigerTestCase):
conn_config = 'timing_stress_for_test=[failpoint_eviction_fail_after_reconciliation]'
format_values = [
diff --git a/src/third_party/wiredtiger/test/suite/test_prepare26.py b/src/third_party/wiredtiger/test/suite/test_prepare26.py
new file mode 100644
index 00000000000..3a9fa353a1b
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_prepare26.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+from wtscenario import make_scenarios
+
+# test_prepare26.py
+# Test prepare rollback and then delete the key.
+class test_prepare26(wttest.WiredTigerTestCase):
+ format_values = [
+ ('column', dict(key_format='r', value_format='S')),
+ #('column_fix', dict(key_format='r', value_format='8t')),
+ ('row_integer', dict(key_format='i', value_format='S')),
+ ]
+
+ scenarios = make_scenarios(format_values)
+
+ def test_prepare26(self):
+ uri = "table:test_prepare26"
+ self.session.create(uri, 'key_format=' + self.key_format + ',value_format=' + self.value_format)
+
+ if self.value_format == '8t':
+ value_a = 97
+ value_b = 98
+ value_c = 99
+ else:
+ value_a = "a"
+ value_b = "b"
+ value_c = "c"
+
+ # Pin oldest timestamp to 1
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(1))
+
+ # Insert a value
+ cursor = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ cursor[1] = value_a
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(10))
+
+ # Do a prepared update
+ self.session.begin_transaction()
+ cursor[1] = value_c
+ self.session.prepare_transaction('prepare_timestamp=' + self.timestamp_str(20))
+
+ # Evict the page
+ session2 = self.conn.open_session()
+ evict_cursor = session2.open_cursor(uri, None, 'debug=(release_evict)')
+ session2.begin_transaction('ignore_prepare=true,read_timestamp=' + self.timestamp_str(10))
+ self.assertEquals(evict_cursor[1], value_a)
+ evict_cursor.reset()
+ evict_cursor.close()
+ session2.rollback_transaction()
+
+ # Rollback the prepared transaction
+ self.session.rollback_transaction()
+
+ # Delete the key
+ self.session.begin_transaction()
+ cursor.set_key(1)
+ cursor.remove()
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(30))
+
+ # Set oldest timestamp to 30
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(30))
+
+ # Evict the page again
+ evict_cursor = session2.open_cursor(uri, None, 'debug=(release_evict)')
+ session2.begin_transaction()
+ evict_cursor.set_key(1)
+ if self.value_format == '8t':
+ self.assertEquals(evict_cursor[1], 0)
+ else:
+ evict_cursor.set_key(1)
+ self.assertEquals(evict_cursor.search(), wiredtiger.WT_NOTFOUND)
+ evict_cursor.reset()
+ evict_cursor.close()
+ session2.rollback_transaction()
+
+ # Do another update
+ cursor = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ cursor[1] = value_b
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(40))
+
+ # Do another update
+ cursor = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ cursor[1] = value_c
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(50))
+
+ # Evict the page again
+ evict_cursor = session2.open_cursor(uri, None, 'debug=(release_evict)')
+ session2.begin_transaction('read_timestamp=' + self.timestamp_str(50))
+ self.assertEquals(evict_cursor[1], value_c)
+ evict_cursor.reset()
+ evict_cursor.close()
+ session2.rollback_transaction()
+
+ # Verify we read nothing at the oldest
+ self.session.begin_transaction('read_timestamp=' + self.timestamp_str(30))
+ if self.value_format == '8t':
+ self.assertEquals(cursor[1], 0)
+ else:
+ cursor.set_key(1)
+ self.assertEquals(cursor.search(), wiredtiger.WT_NOTFOUND)
+ self.session.rollback_transaction()
+
+if __name__ == '__main__':
+ wttest.run()