summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuke Chen <luke.chen@mongodb.com>2022-12-05 15:46:33 +1100
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-12-05 05:22:18 +0000
commit123043c43ad35ac2862a54c675ee42b01b8db6e7 (patch)
tree9334ea4401eb02217b246e24006f8efa448b9e79
parent4eecc7eea204498b1fc179f00bab747b94d85675 (diff)
downloadmongo-123043c43ad35ac2862a54c675ee42b01b8db6e7.tar.gz
Import wiredtiger: d90f4fa81dec7af7d20f7ee52f3d1258aa2c20e6 from branch mongodb-6.0
ref: 29d40146fc..d90f4fa81d for: 6.0.4 WT-9705 Fix accidentally freeing update on the update chain in prepared commit
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/src/include/timestamp_inline.h16
-rw-r--r--src/third_party/wiredtiger/src/txn/txn.c327
-rw-r--r--src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c1
-rw-r--r--src/third_party/wiredtiger/test/suite/test_prepare23.py1
-rw-r--r--src/third_party/wiredtiger/test/suite/test_prepare24.py126
6 files changed, 339 insertions, 134 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index e04495e5632..f2eff9e7272 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-6.0",
- "commit": "29d40146fc78114efd83aa579c5d9c013f0a6016"
+ "commit": "d90f4fa81dec7af7d20f7ee52f3d1258aa2c20e6"
}
diff --git a/src/third_party/wiredtiger/src/include/timestamp_inline.h b/src/third_party/wiredtiger/src/include/timestamp_inline.h
index 8962970b3ea..6ccb7d3b5e3 100644
--- a/src/third_party/wiredtiger/src/include/timestamp_inline.h
+++ b/src/third_party/wiredtiger/src/include/timestamp_inline.h
@@ -70,6 +70,22 @@
(tw)->stop_txn = (upd)->txnid; \
} while (0)
+/* Copy the start values of a time window from another time window. */
+#define WT_TIME_WINDOW_COPY_START(dest, source) \
+ do { \
+ (dest)->durable_start_ts = (source)->durable_start_ts; \
+ (dest)->start_ts = (source)->start_ts; \
+ (dest)->start_txn = (source)->start_txn; \
+ } while (0)
+
+/* Copy the stop values of a time window from another time window. */
+#define WT_TIME_WINDOW_COPY_STOP(dest, source) \
+ do { \
+ (dest)->durable_stop_ts = (source)->durable_stop_ts; \
+ (dest)->stop_ts = (source)->stop_ts; \
+ (dest)->stop_txn = (source)->stop_txn; \
+ } while (0)
+
/*
* Initialize the fields in an aggregated time window to their defaults. The aggregated durable
* timestamp values represent the maximum durable timestamp over set of timestamps. These aggregated
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index 6de0205e967..2f3a3598dc9 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -712,14 +712,13 @@ __wt_txn_release(WT_SESSION_IMPL *session)
}
/*
- * __txn_locate_hs_record --
- * Locate the update older than the prepared update in the history store and append it to the
- * update chain if necessary.
+ * __txn_prepare_rollback_restore_hs_update --
+ * Restore the history store update to the update chain before roll back prepared update evicted
+ * to disk
*/
static int
-__txn_locate_hs_record(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_PAGE *page,
- WT_UPDATE *chain, bool commit, WT_UPDATE **fix_updp, bool *upd_appended,
- WT_UPDATE *first_committed_upd, bool first_committed_upd_in_hs)
+__txn_prepare_rollback_restore_hs_update(
+ WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_PAGE *page, WT_UPDATE *upd_chain)
{
WT_DECL_ITEM(hs_value);
WT_DECL_RET;
@@ -730,11 +729,9 @@ __txn_locate_hs_record(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_PAGE *
uint64_t type_full;
char ts_string[2][WT_TS_INT_STRING_SIZE];
- WT_ASSERT(session, chain != NULL);
+ WT_ASSERT(session, upd_chain != NULL);
hs_tw = NULL;
- *fix_updp = NULL;
- *upd_appended = false;
size = total_size = 0;
tombstone = upd = NULL;
@@ -746,30 +743,12 @@ __txn_locate_hs_record(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_PAGE *
/* The value older than the prepared update in the history store must be a full value. */
WT_ASSERT(session, (uint8_t)type_full == WT_UPDATE_STANDARD);
- /*
- * If the history update already has a stop time point and we are committing the prepared update
- * there is no work to do. This happens if a deleted key is reinserted by a prepared update.
- */
- if (hs_stop_durable_ts != WT_TS_MAX && commit)
- goto done;
-
- if (!first_committed_upd_in_hs) {
- __wt_hs_upd_time_window(hs_cursor, &hs_tw);
- WT_ERR(__wt_upd_alloc(session, hs_value, WT_UPDATE_STANDARD, &upd, &size));
- upd->txnid = hs_tw->start_txn;
- upd->durable_ts = hs_tw->durable_start_ts;
- upd->start_ts = hs_tw->start_ts;
- if (commit)
- *fix_updp = upd;
- } else if (commit)
- *fix_updp = first_committed_upd;
-
- /*
- * When the prepared update is getting committed or the history store update is still on the
- * update chain, no need to append it onto the update chain.
- */
- if (commit || first_committed_upd_in_hs)
- goto done;
+ /* Use time window in cell to initialize the update. */
+ __wt_hs_upd_time_window(hs_cursor, &hs_tw);
+ WT_ERR(__wt_upd_alloc(session, hs_value, WT_UPDATE_STANDARD, &upd, &size));
+ upd->txnid = hs_tw->start_txn;
+ upd->durable_ts = hs_tw->durable_start_ts;
+ upd->start_ts = hs_tw->start_ts;
/*
* Set the flag to indicate that this update has been restored from history store for the
@@ -807,17 +786,16 @@ __txn_locate_hs_record(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_PAGE *
}
/* Walk to the end of the chain and we can only have prepared updates on the update chain. */
- for (;; chain = chain->next) {
- WT_ASSERT(
- session, chain->txnid != WT_TXN_ABORTED && chain->prepare_state == WT_PREPARE_INPROGRESS);
+ for (;; upd_chain = upd_chain->next) {
+ WT_ASSERT(session,
+ upd_chain->txnid != WT_TXN_ABORTED && upd_chain->prepare_state == WT_PREPARE_INPROGRESS);
- if (chain->next == NULL)
+ if (upd_chain->next == NULL)
break;
}
/* Append the update to the end of the chain. */
- WT_PUBLISH(chain->next, upd);
- *upd_appended = true;
+ WT_PUBLISH(upd_chain->next, upd);
__wt_cache_page_inmem_incr(session, page, total_size);
@@ -826,7 +804,6 @@ err:
WT_ASSERT(session, tombstone == NULL || upd == tombstone);
__wt_free_update_list(session, &upd);
}
-done:
__wt_scr_free(session, &hs_value);
return (ret);
}
@@ -912,34 +889,40 @@ __txn_timestamp_usage_check(WT_SESSION_IMPL *session, WT_TXN_OP *op, WT_UPDATE *
}
/*
- * __txn_fixup_prepared_update --
- * Fix the history store record with the max stop time point if we commit the prepared update.
+ * __txn_fixup_hs_update --
+ * Fix the history store update with the max stop time point if we commit the prepared update.
*/
static int
-__txn_fixup_prepared_update(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_UPDATE *fix_upd)
+__txn_fixup_hs_update(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor)
{
+ WT_DECL_ITEM(hs_value);
WT_DECL_RET;
- WT_ITEM hs_value;
- WT_TIME_WINDOW tw;
+ WT_TIME_WINDOW *hs_tw, tw;
WT_TXN *txn;
- uint32_t txn_flags;
-#ifdef HAVE_DIAGNOSTIC
- uint64_t hs_upd_type;
wt_timestamp_t hs_durable_ts, hs_stop_durable_ts;
-#endif
+ uint64_t type_full;
+ bool txn_error, txn_prepare_ignore_api_check;
+ hs_tw = NULL;
txn = session->txn;
- WT_TIME_WINDOW_INIT(&tw);
- /* We should not fix a history store record already with a valid tombstone. */
- WT_ASSERT(session, fix_upd->type != WT_UPDATE_TOMBSTONE);
+ __wt_hs_upd_time_window(hs_cursor, &hs_tw);
+
+ /*
+ * If the history update already has a stop time point there is no work to do. This happens if a
+ * deleted key is reinserted by a prepared update.
+ */
+ if (WT_TIME_WINDOW_HAS_STOP(hs_tw))
+ return (0);
+
+ WT_RET(__wt_scr_alloc(session, 0, &hs_value));
/*
* Transaction error is cleared temporarily as cursor functions are not allowed after an error
* or a prepared transaction.
*/
- txn_flags = FLD_MASK(txn->flags, WT_TXN_ERROR);
- F_CLR(txn, txn_flags);
+ txn_error = F_ISSET(txn, WT_TXN_ERROR);
+ F_CLR(txn, WT_TXN_ERROR);
/*
* The API layer will immediately return an error if the WT_TXN_PREPARE flag is set before
@@ -948,34 +931,42 @@ __txn_fixup_prepared_update(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_U
* marked as a prepared transaction. The flag WT_TXN_PREPARE_IGNORE_API_CHECK is set so that
* cursor operations can proceed without having to clear the WT_TXN_PREPARE flag.
*/
+ txn_prepare_ignore_api_check = F_ISSET(txn, WT_TXN_PREPARE_IGNORE_API_CHECK);
F_SET(txn, WT_TXN_PREPARE_IGNORE_API_CHECK);
+ /* Get current value. */
+ WT_ERR(
+ hs_cursor->get_value(hs_cursor, &hs_stop_durable_ts, &hs_durable_ts, &type_full, hs_value));
+
+ /* The old stop timestamp must be max. */
+ WT_ASSERT(session, hs_stop_durable_ts == WT_TS_MAX);
+ /* The value older than the prepared update in the history store must be a full value. */
+ WT_ASSERT(session, (uint8_t)type_full == WT_UPDATE_STANDARD);
+
+ /*
+ * Set the stop time point to be the committing transaction's time point and copy the start time
+ * point from the current history store update.
+ */
tw.stop_ts = txn->commit_timestamp;
tw.durable_stop_ts = txn->durable_timestamp;
tw.stop_txn = txn->id;
- WT_TIME_WINDOW_SET_START(&tw, fix_upd);
+ WT_TIME_WINDOW_COPY_START(&tw, hs_tw);
-#ifdef HAVE_DIAGNOSTIC
- /* Retrieve the existing update value and stop timestamp. */
- WT_ERR(hs_cursor->get_value(
- hs_cursor, &hs_stop_durable_ts, &hs_durable_ts, &hs_upd_type, &hs_value));
- WT_ASSERT(session, hs_stop_durable_ts == WT_TS_MAX);
- WT_ASSERT(session, (uint8_t)hs_upd_type == WT_UPDATE_STANDARD);
-#endif
/*
* We need to update the stop durable timestamp stored in the history store value.
*
* Pack the value using cursor api.
*/
- hs_value.data = fix_upd->data;
- hs_value.size = fix_upd->size;
hs_cursor->set_value(hs_cursor, &tw, tw.durable_stop_ts, tw.durable_start_ts,
- (uint64_t)WT_UPDATE_STANDARD, &hs_value);
+ (uint64_t)WT_UPDATE_STANDARD, hs_value);
WT_ERR(hs_cursor->update(hs_cursor));
err:
- F_SET(txn, txn_flags);
- F_CLR(txn, WT_TXN_PREPARE_IGNORE_API_CHECK);
+ if (!txn_prepare_ignore_api_check)
+ F_CLR(txn, WT_TXN_PREPARE_IGNORE_API_CHECK);
+ if (txn_error)
+ F_SET(txn, WT_TXN_ERROR);
+ __wt_scr_free(session, &hs_value);
return (ret);
}
@@ -1153,18 +1144,22 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit,
WT_PAGE *page;
WT_TIME_WINDOW tw;
WT_TXN *txn;
- WT_UPDATE *first_committed_upd, *fix_upd, *upd, *upd_followed_tombstone;
+ WT_UPDATE *first_committed_upd, *upd, *upd_followed_tombstone;
#ifdef HAVE_DIAGNOSTIC
WT_UPDATE *head_upd;
#endif
- uint8_t *p, hs_recno_key_buf[WT_INTPACK64_MAXSIZE];
+ uint8_t *p, resolve_case, hs_recno_key_buf[WT_INTPACK64_MAXSIZE];
char ts_string[3][WT_TS_INT_STRING_SIZE];
- bool first_committed_upd_in_hs, prepare_on_disk, tw_found, upd_appended;
+ bool tw_found, has_hs_record;
hs_cursor = NULL;
txn = session->txn;
- fix_upd = NULL;
- upd_appended = false;
+ has_hs_record = false;
+#define RESOLVE_UPDATE_CHAIN 0
+#define RESOLVE_PREPARE_ON_DISK 1
+#define RESOLVE_PREPARE_EVICTION_FAILURE 2
+#define RESOLVE_IN_MEMORY 3
+ resolve_case = RESOLVE_UPDATE_CHAIN;
WT_RET(__txn_search_prepared_op(session, op, cursorp, &upd));
@@ -1217,49 +1212,98 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit,
page = cbt->ref->page;
/*
- * Locate the previous update from the history store and append it to the update chain if
- * required. We know there may be content in the history store if the prepared update is written
- * to the disk image or first committed update older than the prepared update is marked as
- * WT_UPDATE_HS. The second case is rare but can happen if the eviction that writes the prepared
- * update to the disk image fails after it has inserted the other updates of the key into the
- * history store.
+ * If the prepared update is a single tombstone, we don't need to do anything special and we can
+ * directly resolve it in memory.
+ *
+ * If the prepared update is not a tombstone or we have multiple prepared updates in the same
+ * transaction. There are four base cases:
*
- * We need to do this before we resolve the prepared updates because if we abort the prepared
- * updates first, the history search logic may race with other sessions modifying the same key
- * and checkpoint moving the new updates to the history store.
+ * 1) Prepared updates are on the update chain and hasn't been reconciled to write to data
+ * store.
+ * Simply resolve the prepared updates in memory.
*
- * Fix the history store entry for the updates other than tombstone type or the tombstone
- * followed by the update is also from the same prepared transaction by either restoring the
- * previous update from history store or removing the key.
+ * 2) Prepared updates are written to the data store.
+ * If there is no older updates written to the history store:
+ * commit: simply resolve the prepared updates in memory.
+ * rollback: delete the whole key.
+ *
+ * If there are older updates written to the history store:
+ * commit: fix the stop timestamp of the newest update in the history store if it has a
+ * max timestamp.
+ * rollback: restore the newest update in the history store to the data store and mark
+ * it to be deleted from the history store in the future reconciliation.
+ *
+ * 3) Prepared updates are successfully reconciled to a new disk image in eviction but the
+ * eviction fails and the updates are restored back to the old disk image.
+ * If there is no older updates written to the history store:
+ * commit: simply resolve the prepared updates in memory.
+ * rollback: delete the whole key.
+ *
+ * If there are older updates written to the history store:
+ * commit: fix the stop timestamp of the newest update in the history store if it has a
+ * max timestamp.
+ * rollback: mark the data update (or tombstone and data update) that is older
+ * than the prepared updates to be deleted from the history store in the
+ * future reconciliation.
+ *
+ * 4) We are running an in-memory database:
+ * commit: resolve the prepared updates in memory.
+ * rollback: if the prepared update is written to the disk image, delete the whole key.
*/
- prepare_on_disk = F_ISSET(upd, WT_UPDATE_PREPARE_RESTORED_FROM_DS) &&
- (upd->type != WT_UPDATE_TOMBSTONE ||
- (upd->next != NULL && upd->durable_ts == upd->next->durable_ts &&
- upd->txnid == upd->next->txnid && upd->start_ts == upd->next->start_ts));
- first_committed_upd_in_hs =
- first_committed_upd != NULL && F_ISSET(first_committed_upd, WT_UPDATE_HS);
/*
- * Marked the update older than the prepared update that is already in the history store to be
- * deleted from the history store.
+ * We also need to handle the on disk prepared updates if we have a prepared delete and a
+ * prepared update on the disk image.
*/
- if (first_committed_upd_in_hs && !commit) {
- if (first_committed_upd->type == WT_UPDATE_TOMBSTONE) {
- for (upd_followed_tombstone = first_committed_upd->next; upd_followed_tombstone != NULL;
- upd_followed_tombstone = upd_followed_tombstone->next)
- if (upd_followed_tombstone->txnid != WT_TXN_ABORTED)
- break;
- /* We may not find a full update following the tombstone if it is obsolete. */
- if (upd_followed_tombstone != NULL) {
- WT_ASSERT(session, F_ISSET(upd_followed_tombstone, WT_UPDATE_HS));
- F_SET(first_committed_upd, WT_UPDATE_TO_DELETE_FROM_HS);
- F_SET(upd_followed_tombstone, WT_UPDATE_TO_DELETE_FROM_HS);
- }
- } else
- F_SET(first_committed_upd, WT_UPDATE_TO_DELETE_FROM_HS);
- }
+ if (F_ISSET(upd, WT_UPDATE_PREPARE_RESTORED_FROM_DS) &&
+ (upd->type != WT_UPDATE_TOMBSTONE ||
+ (upd->next != NULL && upd->durable_ts == upd->next->durable_ts &&
+ upd->txnid == upd->next->txnid && upd->start_ts == upd->next->start_ts)))
+ resolve_case = RESOLVE_PREPARE_ON_DISK;
+ else if (first_committed_upd != NULL && F_ISSET(first_committed_upd, WT_UPDATE_HS))
+ resolve_case = RESOLVE_PREPARE_EVICTION_FAILURE;
+ else if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
+ resolve_case = RESOLVE_IN_MEMORY;
+ else
+ resolve_case = RESOLVE_UPDATE_CHAIN;
- if (prepare_on_disk || first_committed_upd_in_hs) {
+ switch (resolve_case) {
+ case RESOLVE_PREPARE_EVICTION_FAILURE:
+ /*
+ * If we see the first committed update has been moved to the history store, we must have
+ * done a successful reconciliation on the page but failed to evict it. Also reconciliation
+ * could not possibly empty the page because the prepared update is not globally visible.
+ * Therefore, reconciliation must have either split the page or done a page rewrite.
+ *
+ * In this case, we still need to resolve the prepared update as if we have successfully
+ * evicted the page because the value older than the prepared update has been written to the
+ * history store with the max timestamp.
+ */
+ WT_ASSERT(session,
+ page->modify->rec_result == WT_PM_REC_MULTIBLOCK ||
+ page->modify->rec_result == WT_PM_REC_REPLACE);
+ /*
+ * Marked the update older than the prepared update that is already in the history store to
+ * be deleted from the history store.
+ */
+ if (!commit) {
+ if (first_committed_upd->type == WT_UPDATE_TOMBSTONE) {
+ for (upd_followed_tombstone = first_committed_upd->next;
+ upd_followed_tombstone != NULL;
+ upd_followed_tombstone = upd_followed_tombstone->next)
+ if (upd_followed_tombstone->txnid != WT_TXN_ABORTED)
+ break;
+ /* We may not find a full update following the tombstone if it is obsolete. */
+ if (upd_followed_tombstone != NULL) {
+ WT_ASSERT(session, F_ISSET(upd_followed_tombstone, WT_UPDATE_HS));
+ F_SET(first_committed_upd, WT_UPDATE_TO_DELETE_FROM_HS);
+ F_SET(upd_followed_tombstone, WT_UPDATE_TO_DELETE_FROM_HS);
+ }
+ } else
+ F_SET(first_committed_upd, WT_UPDATE_TO_DELETE_FROM_HS);
+ }
+ /* Fall through. */
+ case RESOLVE_PREPARE_ON_DISK:
btree = S2BT(session);
/*
@@ -1277,23 +1321,42 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit,
hs_recno_key.size = WT_PTRDIFF(p, hs_recno_key_buf);
hs_cursor->set_key(hs_cursor, 4, btree->id, &hs_recno_key, WT_TS_MAX, UINT64_MAX);
}
+ /*
+ * Locate the previous update from the history store. We know there may be content in the
+ * history store if the prepared update is written to the disk image or first committed
+ * update older than the prepared update is marked as WT_UPDATE_HS. The second case is rare
+ * but can happen if the previous eviction that writes the prepared update to the disk image
+ * fails after reconciliation.
+ *
+ * We need to locate the history store update before we resolve the prepared updates because
+ * if we abort the prepared updates first, the history store search may race with other
+ * sessions modifying the same key and checkpoint moving the new updates to the history
+ * store.
+ */
WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, hs_cursor), true);
/* We should only get not found if the prepared update is on disk. */
- WT_ASSERT(session, ret != WT_NOTFOUND || prepare_on_disk);
- if (ret == WT_NOTFOUND && !commit) {
+ WT_ASSERT(session, ret != WT_NOTFOUND || resolve_case == RESOLVE_PREPARE_ON_DISK);
+ if (ret == 0) {
+ has_hs_record = true;
+ /*
+ * Restore the history store update to the update chain if we are rolling back the
+ * prepared update written to the disk image.
+ */
+ if (!commit && resolve_case == RESOLVE_PREPARE_ON_DISK)
+ WT_ERR(__txn_prepare_rollback_restore_hs_update(session, hs_cursor, page, upd));
+ } else {
+ ret = 0;
/*
* Allocate a tombstone and prepend it to the row so when we reconcile the update chain
* we don't copy the prepared cell, which is now associated with a rolled back prepare,
* and instead write nothing.
*/
- WT_ERR(__txn_append_tombstone(session, op, cbt));
- } else if (ret == 0)
- WT_ERR(__txn_locate_hs_record(session, hs_cursor, page, upd, commit, &fix_upd,
- &upd_appended, first_committed_upd, first_committed_upd_in_hs));
- else
- ret = 0;
- } else if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY) && !commit && first_committed_upd == NULL) {
+ if (!commit)
+ WT_ERR(__txn_append_tombstone(session, op, cbt));
+ }
+ break;
+ case RESOLVE_IN_MEMORY:
/*
* For in-memory configurations of WiredTiger if a prepared update is reconciled and then
* rolled back the on-page value will not be marked as aborted until the next eviction. In
@@ -1304,9 +1367,15 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit,
* rolling back a prepared reconciled update would result in only aborted updates on the
* update chain.
*/
- tw_found = __wt_read_cell_time_window(cbt, &tw);
- if (tw_found && tw.prepare == WT_PREPARE_INPROGRESS)
- WT_ERR(__txn_append_tombstone(session, op, cbt));
+ if (!commit && first_committed_upd == NULL) {
+ tw_found = __wt_read_cell_time_window(cbt, &tw);
+ if (tw_found && tw.prepare == WT_PREPARE_INPROGRESS)
+ WT_ERR(__txn_append_tombstone(session, op, cbt));
+ }
+ break;
+ default:
+ WT_ASSERT(session, resolve_case == RESOLVE_UPDATE_CHAIN);
+ break;
}
/*
@@ -1336,15 +1405,11 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit,
__wt_page_modify_set(session, page);
/*
- * Fix the history store contents if they exist, when there are no more updates in the update
- * list. Only in eviction, it is possible to write an unfinished history store update when the
- * prepared updates are written to the data store. When the page is read back into memory, there
- * will be only one uncommitted prepared update.
+ * Fix the history store record's stop time point if we are committing the prepared update and
+ * the previous update is written to the history store.
*/
- if (fix_upd != NULL) {
- WT_ASSERT(session, commit && fix_upd->type != WT_UPDATE_TOMBSTONE);
- WT_ERR(__txn_fixup_prepared_update(session, hs_cursor, fix_upd));
- }
+ if (commit && has_hs_record)
+ WT_ERR(__txn_fixup_hs_update(session, hs_cursor));
prepare_verify:
#ifdef HAVE_DIAGNOSTIC
@@ -1364,8 +1429,8 @@ prepare_verify:
* If we restored an update from the history store, it should be the last update on the
* chain.
*/
- if (upd_appended && head_upd->type == WT_UPDATE_STANDARD &&
- F_ISSET(head_upd, WT_UPDATE_RESTORED_FROM_HS))
+ if (!commit && resolve_case == RESOLVE_PREPARE_ON_DISK &&
+ head_upd->type == WT_UPDATE_STANDARD && F_ISSET(head_upd, WT_UPDATE_RESTORED_FROM_HS))
WT_ASSERT(session, head_upd->next == NULL);
}
#endif
@@ -1373,8 +1438,6 @@ prepare_verify:
err:
if (hs_cursor != NULL)
WT_TRET(hs_cursor->close(hs_cursor));
- if (!upd_appended)
- __wt_free(session, fix_upd);
return (ret);
}
diff --git a/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c b/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c
index 8a8d142c021..ccfa2764787 100644
--- a/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c
+++ b/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c
@@ -133,7 +133,6 @@ main(int argc, char *argv[])
break;
default:
return (usage());
- break;
}
break;
case 't':
diff --git a/src/third_party/wiredtiger/test/suite/test_prepare23.py b/src/third_party/wiredtiger/test/suite/test_prepare23.py
index 112fc747820..f7496a5d228 100644
--- a/src/third_party/wiredtiger/test/suite/test_prepare23.py
+++ b/src/third_party/wiredtiger/test/suite/test_prepare23.py
@@ -88,6 +88,7 @@ class test_prepare23(wttest.WiredTigerTestCase):
self.session.begin_transaction()
cursor[i] = value_c
self.session.prepare_transaction('prepare_timestamp=' + self.timestamp_str(ts + 40))
+ cursor.reset()
# Evict the page
session2.begin_transaction('ignore_prepare=true,read_timestamp=' + self.timestamp_str(ts + 20))
diff --git a/src/third_party/wiredtiger/test/suite/test_prepare24.py b/src/third_party/wiredtiger/test/suite/test_prepare24.py
new file mode 100644
index 00000000000..af367af87e5
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_prepare24.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+from wtscenario import make_scenarios
+
+# test_prepare24.py
+# Test prepare commit after eviction failure.
+class test_prepare24(wttest.WiredTigerTestCase):
+ conn_config = 'timing_stress_for_test=[failpoint_eviction_fail_after_reconciliation]'
+
+ format_values = [
+ ('column', dict(key_format='r', value_format='S')),
+ ('column_fix', dict(key_format='r', value_format='8t')),
+ ('row_integer', dict(key_format='i', value_format='S')),
+ ]
+
+ delete = [
+ ('delete', dict(delete=True)),
+ ('non-delete', dict(delete=False)),
+ ]
+
+ scenarios = make_scenarios(format_values, delete)
+
+ def test_prepare24(self):
+ uri = "table:test_prepare24"
+ self.session.create(uri, 'key_format=' + self.key_format + ',value_format=' + self.value_format)
+
+ if self.value_format == '8t':
+ value_a = 97
+ value_b = 98
+ else:
+ value_a = "a"
+ value_b = "b"
+
+ # Pin oldest timestamp to 1
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(1))
+
+ cursor = self.session.open_cursor(uri)
+ session2 = self.conn.open_session()
+ evict_cursor = session2.open_cursor(uri, None, 'debug=(release_evict)')
+ ts = 0
+ for i in range (1, 1001):
+ # Insert a value
+ self.session.begin_transaction()
+ cursor[i] = value_a
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(ts + 10))
+
+ if self.delete:
+ self.session.begin_transaction()
+ cursor.set_key(i)
+ cursor.remove()
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(ts + 20))
+
+ # Do a prepared update
+ self.session.begin_transaction()
+ cursor[i] = value_b
+ self.session.prepare_transaction('prepare_timestamp=' + self.timestamp_str(ts + 30))
+ cursor.reset()
+
+ # Evict the page
+ session2.begin_transaction('ignore_prepare=true,read_timestamp=' + self.timestamp_str(ts + 10))
+ self.assertEquals(evict_cursor[i], value_a)
+ evict_cursor.reset()
+ session2.rollback_transaction()
+
+ # Commit the prepared transaction
+ self.session.timestamp_transaction('commit_timestamp=' + self.timestamp_str(ts + 30))
+ self.session.timestamp_transaction('durable_timestamp=' + self.timestamp_str(ts + 40))
+ self.session.commit_transaction()
+
+ # Evict the page again
+ session2.begin_transaction('ignore_prepare=true,read_timestamp=' + self.timestamp_str(ts + 10))
+ self.assertEquals(evict_cursor[i], value_a)
+ evict_cursor.reset()
+ session2.rollback_transaction()
+
+ # Verify we can still read back value a
+ self.session.begin_transaction('read_timestamp=' + self.timestamp_str(ts + 10))
+ self.assertEquals(cursor[i], value_a)
+ self.session.rollback_transaction()
+
+ # Verify we can still read back the deletion
+ if self.delete:
+ self.session.begin_transaction('read_timestamp=' + self.timestamp_str(ts + 20))
+ if self.value_format == '8t':
+ self.assertEquals(cursor[i], 0)
+ else:
+ cursor.set_key(i)
+ self.assertEquals(cursor.search(), wiredtiger.WT_NOTFOUND)
+ self.session.rollback_transaction()
+
+ # Verify we can still read back the prepared update
+ self.session.begin_transaction('read_timestamp=' + self.timestamp_str(ts + 30))
+ self.assertEquals(cursor[i], value_b)
+ self.session.rollback_transaction()
+
+ ts += 40
+
+if __name__ == '__main__':
+ wttest.run()