summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Luke Chen <luke.chen@mongodb.com> 2021-06-17 15:47:15 +1000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-06-17 06:09:32 +0000
commit: 9c0b5cd1e5fa4c2a22fecd3ed9267d3341bc84e4 (patch)
tree: 8a459334b2a47b12e9df9008925f4368a43316b7
parent: 3bebe0b8f9c89098586fa2638fd1bb640e516486 (diff)
download: mongo-9c0b5cd1e5fa4c2a22fecd3ed9267d3341bc84e4.tar.gz
Import wiredtiger: b2dce5bb8f2bbecb704537eee9cff61d7dc106a0 from branch mongodb-5.0
ref: 4524d572ff..b2dce5bb8f for: 5.1.0 WT-7553 Loosen the restrictions around evicting fast-truncate pages to avoid cache-stuck failures
-rw-r--r-- src/third_party/wiredtiger/import.data | 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_delete.c | 162
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_discard.c | 7
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_split.c | 10
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_file.c | 15
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_page.c | 13
-rw-r--r-- src/third_party/wiredtiger/src/include/btmem.h | 18
-rw-r--r-- src/third_party/wiredtiger/src/include/btree_inline.h | 19
-rw-r--r-- src/third_party/wiredtiger/src/include/misc.h | 28
-rw-r--r-- src/third_party/wiredtiger/src/include/txn_inline.h | 92
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_child.c | 16
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn.c | 49
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c | 4
-rw-r--r-- src/third_party/wiredtiger/test/format/ops.c | 3
14 files changed, 233 insertions, 205 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 9356fe181ee..5a963156a53 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-5.0",
- "commit": "4524d572ff451edf69aaa70d211946a3b1ccb19f"
+ "commit": "b2dce5bb8f2bbecb704537eee9cff61d7dc106a0"
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c
index a4d82c3d904..acb89293002 100644
--- a/src/third_party/wiredtiger/src/btree/bt_delete.c
+++ b/src/third_party/wiredtiger/src/btree/bt_delete.c
@@ -20,16 +20,16 @@
* state to WT_REF_DELETED. Pages ineligible for this fast path include pages already in the cache,
* having overflow items, or requiring history store records. Ineligible pages are read and have
* their rows updated/deleted individually. The transaction for the delete operation is stored in
- * memory referenced by the WT_REF.page_del field.
+ * memory referenced by the WT_REF.ft_info.del field.
*
* Future cursor walks of the tree will skip the deleted page based on the transaction stored for
* the delete, but it gets more complicated if a read is done using a random key, or a cursor walk
* is done with a transaction where the delete is not visible. In those cases, we read the original
* contents of the page. The page-read code notices a deleted page is being read, and as part of the
- * read instantiates the contents of the page, creating a WT_UPDATE with a deleted operation, in the
- * same transaction as deleted the page. In other words, the read process makes it appear as if the
- * page was read and each individual row deleted, exactly as would have happened if the page had
- * been in the cache all along.
+ * read instantiates the contents of the page, creating a WT_UPDATE with a tombstone, in the same
+ * transaction as deleted the page. In other words, the read process makes it appear as if the page
+ * was read and each individual row deleted, exactly as would have happened if the page had been in
+ * the cache all along.
*
* There's an additional complication to support rollback of the page delete. When the page was
* marked deleted, a pointer to the WT_REF was saved in the deleting session's transaction list and
@@ -39,14 +39,14 @@
* saved/restored during reconciliation and appear on multiple pages, and the WT_REF stored in the
* deleting session's transaction list is no longer useful. For this reason, when the page is
* instantiated by a read, a list of the WT_UPDATE structures on the page is stored in the
- * WT_REF.page_del field, with the transaction ID, that way the session committing/unrolling the
- * delete can find all WT_UPDATE structures that require update.
+ * WT_REF.ft_info.update field, that way the session resolving the delete can find all WT_UPDATE
+ * structures that require update.
*
* One final note: pages can also be marked deleted if emptied and evicted. In that case, the WT_REF
- * state will be set to WT_REF_DELETED but there will not be any associated WT_REF.page_del field.
- * These pages are always skipped during cursor traversal (the page could not have been evicted if
- * there were updates that weren't globally visible), and if read is forced to instantiate such a
- * page, it simply creates an empty page from scratch.
+ * state will be set to WT_REF_DELETED but there will not be any associated WT_REF.ft_info.del
+ * field. These pages are always skipped during cursor traversal (the page could not have been
+ * evicted if there were updates that weren't globally visible), and if read is forced to
+ * instantiate such a page, it simply creates an empty page from scratch.
*/
/*
@@ -93,16 +93,12 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
return (0);
/*
- * If this WT_REF was previously part of a truncate operation, there may be existing page-delete
- * information. The structure is only read while the state is locked, free the previous version.
- *
- * Note: changes have been made, we must publish any state change from this point on.
+ * There should be no previous page-delete information: if the previous fast-truncate didn't
+ * instantiate the page, then we'd never get here to do another delete; if the previous
+ * fast-truncate did instantiate the page, then any fast-truncate information was removed at
+ * that point and/or when the fast-truncate transaction was resolved.
*/
- if (ref->page_del != NULL) {
- WT_ASSERT(session, ref->page_del->txnid == WT_TXN_ABORTED);
- __wt_free(session, ref->page_del->update_list);
- __wt_free(session, ref->page_del);
- }
+ WT_ASSERT(session, ref->ft_info.del == NULL);
/*
* We cannot truncate pages that have overflow key/value items as the overflow blocks have to be
@@ -128,8 +124,8 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
WT_ERR(__wt_page_parent_modify_set(session, ref, false));
/* Allocate and initialize the page-deleted structure. */
- WT_ERR(__wt_calloc_one(session, &ref->page_del));
- ref->page_del->previous_state = previous_state;
+ WT_ERR(__wt_calloc_one(session, &ref->ft_info.del));
+ ref->ft_info.del->previous_state = previous_state;
WT_ERR(__wt_txn_modify_page_delete(session, ref));
@@ -141,7 +137,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
return (0);
err:
- __wt_free(session, ref->page_del);
+ __wt_free(session, ref->ft_info.del);
/* Publish the page to its previous state, ensuring visibility. */
WT_REF_SET_STATE(ref, previous_state);
@@ -150,7 +146,7 @@ err:
/*
* __wt_delete_page_rollback --
- * Abort pages that were deleted without being instantiated.
+ * Abort fast-truncate operations.
*/
int
__wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
@@ -160,7 +156,7 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
uint8_t current_state;
bool locked;
- /* Lock the reference. We cannot access ref->page_del except when locked. */
+ /* Lock the reference. We cannot access ref->ft_info.del except when locked. */
for (locked = false, sleep_usecs = yield_count = 0;;) {
switch (current_state = ref->state) {
case WT_REF_LOCKED:
@@ -188,23 +184,25 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
}
/*
- * If the page is still "deleted", it's as we left it, all we have to do is reset the state.
- *
- * We can't use the normal read path to get a copy of the page because the session may have
- * closed the cursor, we no longer have the reference to the tree required for a hazard pointer.
- * We're safe because with unresolved transactions, the page isn't going anywhere.
- *
- * The page is in an in-memory state, which means it was instantiated at some point. Walk any
- * list of update structures and abort them.
+ * If the page is still "deleted", it's as we left it, simply reset the state. Otherwise, the
+ * page is in an in-memory state, which means it was instantiated at some point. Walk any list
+ * of update structures and abort them. We can't use the normal read path to get the pages with
+ * updates (the original page may have split, so there may be more than one page), because the
+ * session may have closed the cursor, we no longer have the reference to the tree required for
+ * a hazard pointer. We're safe since pages with unresolved transactions aren't going anywhere.
*/
if (current_state == WT_REF_DELETED)
- current_state = ref->page_del->previous_state;
- else if ((updp = ref->page_del->update_list) != NULL)
+ current_state = ref->ft_info.del->previous_state;
+ else if ((updp = ref->ft_info.update) != NULL)
for (; *updp != NULL; ++updp)
(*updp)->txnid = WT_TXN_ABORTED;
- /* Finally mark the truncate aborted */
- ref->page_del->txnid = WT_TXN_ABORTED;
+ /*
+ * We don't set the WT_PAGE_DELETED transaction ID to aborted or separately discard any WT_UPDATE
+ * list; instead, we free whichever structure is present, which has the same effect. A single
+ * call is enough because the two pointers share a union.
+ */
+ __wt_free(session, ref->ft_info.del);
WT_REF_SET_STATE(ref, current_state);
return (0);
@@ -240,16 +238,14 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
skip = !__wt_page_del_active(session, ref, visible_all);
/*
- * The page_del structure can be freed as soon as the delete is stable: it is only read when the
- * ref state is locked. It is worth checking every time we come through because once this is
- * freed, we no longer need synchronization to check the ref.
+ * The fast-truncate structure can be freed as soon as the delete is stable: it is only read
+ * when the ref state is locked. It is worth checking every time we come through because once
+ * this is freed, we no longer need synchronization to check the ref.
*/
- if (skip && ref->page_del != NULL &&
+ if (skip && ref->ft_info.del != NULL &&
(visible_all ||
- __wt_txn_visible_all(session, ref->page_del->txnid, ref->page_del->timestamp))) {
- __wt_free(session, ref->page_del->update_list);
- __wt_free(session, ref->page_del);
- }
+ __wt_txn_visible_all(session, ref->ft_info.del->txnid, ref->ft_info.del->timestamp)))
+ __wt_overwrite_and_free(session, ref->ft_info.del);
WT_REF_SET_STATE(ref, WT_REF_DELETED);
return (skip);
@@ -296,12 +292,14 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
WT_PAGE_DELETED *page_del;
WT_ROW *rip;
WT_TIME_WINDOW tw;
- WT_UPDATE **upd_array, *upd;
+ WT_UPDATE **upd_array, **update_list, *upd;
size_t size, total_size;
uint32_t count, i;
btree = S2BT(session);
page = ref->page;
+ page_del = NULL;
+ update_list = NULL;
WT_STAT_CONN_DATA_INCR(session, cache_read_deleted);
@@ -315,7 +313,15 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
if (!F_ISSET(btree, WT_BTREE_READONLY))
__wt_page_modify_set(session, page);
- if (ref->page_del != NULL && ref->page_del->prepare_state != WT_PREPARE_INIT)
+ /*
+ * Allocate the per-page update array if one doesn't already exist. (It might already exist
+ * because deletes are instantiated after the history store table updates.)
+ */
+ if (page->entries != 0 && page->modify->mod_row_update == NULL)
+ WT_PAGE_ALLOC_AND_SWAP(
+ session, page, page->modify->mod_row_update, upd_array, page->entries);
+
+ if (ref->ft_info.del != NULL && ref->ft_info.del->prepare_state != WT_PREPARE_INIT)
WT_STAT_CONN_DATA_INCR(session, cache_read_deleted_prepared);
/*
@@ -332,26 +338,16 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
* in the system forced us to keep the old version of the page around, then we crashed and
* recovered or we're running inside a checkpoint, and now we're being forced to read that page.
*
- * Expect a page-deleted structure if there's a running transaction that needs to be resolved,
- * otherwise, there may not be one (and, if the transaction has resolved, we can ignore the
- * page-deleted structure).
- */
- page_del = __wt_page_del_active(session, ref, true) ? ref->page_del : NULL;
-
- /*
- * Allocate the per-page update array if one doesn't already exist. (It might already exist
- * because deletes are instantiated after the history store table updates.)
- */
- if (page->entries != 0 && page->modify->mod_row_update == NULL)
- WT_PAGE_ALLOC_AND_SWAP(
- session, page, page->modify->mod_row_update, upd_array, page->entries);
-
- /*
- * Allocate the per-reference update array; in the case of instantiating a page deleted in a
- * running transaction, we need a list of the update structures for the eventual commit or
- * abort.
+ * If there's a page-deleted structure that's not yet globally visible, get a reference and
+ * migrate transaction ID and timestamp information to the updates (globally visible means the
+ * updates don't require that information).
+ *
+ * If the truncate operation is not yet resolved, link updates in the page-deleted structure so
+ * they can be found when the transaction is aborted or committed, even if they have moved to
+ * other pages.
*/
- if (page_del != NULL) {
+ page_del = __wt_page_del_active(session, ref, true) ? ref->ft_info.del : NULL;
+ if (page_del != NULL && page_del->committed == 0) {
count = 0;
if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL)
WT_SKIP_FOREACH (ins, insert)
@@ -362,12 +358,11 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
WT_SKIP_FOREACH (ins, insert)
++count;
}
- WT_RET(__wt_calloc_def(session, count + 1, &page_del->update_list));
- __wt_cache_page_inmem_incr(session, page, (count + 1) * sizeof(page_del->update_list));
+ WT_RET(__wt_calloc_def(session, count + 1, &update_list));
}
/* Walk the page entries, giving each one a tombstone. */
- size = total_size = 0;
+ total_size = size = 0;
count = 0;
upd_array = page->modify->mod_row_update;
if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL)
@@ -377,8 +372,8 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
upd->next = ins->upd;
ins->upd = upd;
- if (page_del != NULL)
- page_del->update_list[count++] = upd;
+ if (update_list != NULL)
+ update_list[count++] = upd;
}
WT_ROW_FOREACH (page, rip, i) {
/*
@@ -392,8 +387,8 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
upd->next = upd_array[WT_ROW_SLOT(page, rip)];
upd_array[WT_ROW_SLOT(page, rip)] = upd;
- if (page_del != NULL)
- page_del->update_list[count++] = upd;
+ if (update_list != NULL)
+ update_list[count++] = upd;
if ((insert = WT_ROW_INSERT(page, rip)) != NULL)
WT_SKIP_FOREACH (ins, insert) {
@@ -402,23 +397,24 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
upd->next = ins->upd;
ins->upd = upd;
- if (page_del != NULL)
- page_del->update_list[count++] = upd;
+ if (update_list != NULL)
+ update_list[count++] = upd;
}
}
}
-
__wt_cache_page_inmem_incr(session, page, total_size);
+ /*
+ * We no longer need the WT_PAGE_DELETED structure, all of its information should have been
+ * transferred to the list of WT_UPDATE structures (if any).
+ */
+ __wt_overwrite_and_free(session, ref->ft_info.del);
+ if (update_list != NULL)
+ ref->ft_info.update = update_list;
+
return (0);
err:
- /*
- * The page-delete update structure may have existed before we were called, and presumably might
- * be in use by a running transaction. The list of update structures cannot have been created
- * before we were called, and should not exist if we exit with an error.
- */
- if (page_del != NULL)
- __wt_free(session, page_del->update_list);
+ __wt_free(session, update_list);
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c
index 02bd970e0c6..f60b4968e7c 100644
--- a/src/third_party/wiredtiger/src/btree/bt_discard.c
+++ b/src/third_party/wiredtiger/src/btree/bt_discard.c
@@ -290,11 +290,8 @@ __wt_free_ref(WT_SESSION_IMPL *session, WT_REF *ref, int page_type, bool free_pa
/* Free any address allocation. */
__wt_ref_addr_free(session, ref);
- /* Free any page-deleted information. */
- if (ref->page_del != NULL) {
- __wt_free(session, ref->page_del->update_list);
- __wt_free(session, ref->page_del);
- }
+ /* Free any backing fast-truncate memory. */
+ __wt_free(session, ref->ft_info.del);
__wt_overwrite_and_free_len(session, ref, WT_REF_CLEAR_SIZE);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index b5b997054ef..47d3f120d3a 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -608,14 +608,8 @@ __split_parent_discard_ref(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *paren
}
}
- /*
- * The page-delete and history store memory weren't added to the parent's footprint, ignore it
- * here.
- */
- if (ref->page_del != NULL) {
- __wt_free(session, ref->page_del->update_list);
- __wt_free(session, ref->page_del);
- }
+ /* Free any backing fast-truncate memory. */
+ __wt_free(session, ref->ft_info.del);
/* Free the backing block and address. */
WT_TRET(__wt_ref_block_free(session, ref));
diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c
index 1e83f0a5643..f6091a4436e 100644
--- a/src/third_party/wiredtiger/src/evict/evict_file.c
+++ b/src/third_party/wiredtiger/src/evict/evict_file.c
@@ -85,24 +85,17 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
switch (syncop) {
case WT_SYNC_CLOSE:
- /*
- * Evict the page.
- *
- * Ensure the ref state is restored to the previous value if eviction fails.
- */
+ /* Evict the page. */
WT_ERR(__wt_evict(session, ref, ref->state, WT_EVICT_CALL_CLOSING));
break;
case WT_SYNC_DISCARD:
/*
- * Discard the page regardless of whether it is dirty.
- *
- * If the page has a page deleted structure, we are discarding the page that is cleaned
- * by a checkpoint.
+ * Discard the page whether it is dirty or not. The check if the page can be evicted is
+ * not exhaustive, but provides basic checking on the page's status.
*/
WT_ASSERT(session,
F_ISSET(dhandle, WT_DHANDLE_DEAD) || F_ISSET(S2C(session), WT_CONN_CLOSING) ||
- __wt_page_can_evict(session, ref, NULL) ||
- (ref->page_del != NULL && page->modify->page_state == WT_PAGE_CLEAN));
+ __wt_page_can_evict(session, ref, NULL));
__wt_ref_out(session, ref);
break;
case WT_SYNC_CHECKPOINT:
diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c
index 26b38dc5996..6d501b6ac28 100644
--- a/src/third_party/wiredtiger/src/evict/evict_page.c
+++ b/src/third_party/wiredtiger/src/evict/evict_page.c
@@ -184,15 +184,6 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint8_t previous_state, uint32
/* Figure out whether reconciliation was done on the page */
clean_page = __wt_page_evict_clean(page);
- /*
- * Discard all page-deleted information. If a truncate call deleted this page, there's memory
- * associated with it we no longer need, eviction will have built a new version of the page.
- */
- if (ref->page_del != NULL) {
- __wt_free(session, ref->page_del->update_list);
- __wt_free(session, ref->page_del);
- }
-
/* Update the reference and discard the page. */
if (__wt_ref_is_root(ref))
__wt_ref_out(session, ref);
@@ -530,9 +521,7 @@ __evict_review(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags, bool
WT_RET(ret);
}
- /*
- * It is always OK to evict pages from dead trees if they don't have children.
- */
+ /* It is always OK to evict pages from dead trees if they don't have children. */
if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
return (0);
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index 1db9da86349..e9fa12975ff 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -835,7 +835,7 @@ struct __wt_page_deleted {
uint8_t previous_state; /* Previous state */
- WT_UPDATE **update_list; /* List of updates for abort */
+ uint8_t committed; /* Committed */
};
/*
@@ -906,7 +906,21 @@ struct __wt_ref {
#undef ref_ikey
#define ref_ikey key.ikey
- WT_PAGE_DELETED *page_del; /* Deleted page information */
+ /*
+ * Fast-truncate information. When a WT_REF is included in a fast-truncate operation, ft_info.del
+ * is allocated and initialized. If the page must be instantiated before the truncate becomes
+ * globally visible, WT_UPDATE structures are created for the page entries, the transaction
+ * information from ft_info.del is migrated to those WT_UPDATE structures, and the ft_info.del
+ * field is freed and replaced by the ft_info.update array (needed for subsequent transaction
+ * commit/abort). Doing anything other than testing if ft_info.del/update is non-NULL (which
+ * eviction does) requires the WT_REF be locked. If the locked WT_REF's previous state was
+ * WT_REF_DELETED, ft_info.del is valid; if the WT_REF's previous state was an in-memory state,
+ * then ft_info.update is valid.
+ */
+ union {
+ WT_PAGE_DELETED *del; /* Page not instantiated, page-deleted structure */
+ WT_UPDATE **update; /* Page instantiated, update list for subsequent commit/abort */
+ } ft_info;
/*
* In DIAGNOSTIC mode we overwrite the WT_REF on free to force failures. Don't clear the history in
diff --git a/src/third_party/wiredtiger/src/include/btree_inline.h b/src/third_party/wiredtiger/src/include/btree_inline.h
index 9c0e0ce784e..f7c501f6018 100644
--- a/src/third_party/wiredtiger/src/include/btree_inline.h
+++ b/src/third_party/wiredtiger/src/include/btree_inline.h
@@ -1464,7 +1464,9 @@ __wt_page_del_active(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
WT_PAGE_DELETED *page_del;
uint8_t prepare_state;
- if ((page_del = ref->page_del) == NULL)
+ WT_ASSERT(session, ref->state == WT_REF_LOCKED);
+
+ if ((page_del = ref->ft_info.del) == NULL)
return (false);
if (page_del->txnid == WT_TXN_ABORTED)
return (false);
@@ -1651,15 +1653,20 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp)
page = ref->page;
mod = page->modify;
- /* A truncated page can't be evicted until the truncate completes. */
- if (__wt_page_del_active(session, ref, true))
- return (false);
-
- /* Otherwise, never modified pages can always be evicted. */
+ /* Never modified pages can always be evicted. */
if (mod == NULL)
return (true);
/*
+ * If a fast-truncate page is subsequently instantiated, it can become an eviction candidate. If
+ * the fast-truncate itself has not resolved when the page is instantiated, a list of updates is
+ * created, which will be discarded as part of transaction resolution. Don't attempt to evict a
+ * fast-truncate page until any update list has been removed.
+ */
+ if (ref->ft_info.update != NULL)
+ return (false);
+
+ /*
* We can't split or evict multiblock row-store pages where the parent's key for the page is an
* overflow item, because the split into the parent frees the backing blocks for any
* no-longer-used overflow keys, which will corrupt the checkpoint's block management.
diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h
index 3bde7b511b4..02395d8fa0b 100644
--- a/src/third_party/wiredtiger/src/include/misc.h
+++ b/src/third_party/wiredtiger/src/include/misc.h
@@ -118,11 +118,11 @@
addr))
/*
- * Our internal free function clears the underlying address atomically so there is a smaller chance
- * of racing threads seeing intermediate results while a structure is being free'd. (That would be a
- * bug, of course, but I'd rather not drop core, just the same.) That's a non-standard "free" API,
- * and the resulting bug is a mother to find -- make sure we get it right, don't make the caller
- * remember to put the & operator on the pointer.
+ * Our internal free function clears the underlying address so there is a smaller chance of racing
+ * threads seeing intermediate results while a structure is being free'd. (That would be a bug, of
+ * course, but I'd rather not drop core, just the same.) That's a non-standard "free" API, and the
+ * resulting bug is non-trivial to find -- make sure we get it right, don't make the caller remember
+ * to put the & operator on the pointer.
*/
#define __wt_free(session, p) \
do { \
@@ -134,15 +134,21 @@
/* Overwrite whether or not this is a diagnostic build. */
#define __wt_explicit_overwrite(p, size) memset(p, WT_DEBUG_BYTE, size)
#ifdef HAVE_DIAGNOSTIC
-#define __wt_overwrite_and_free(session, p) \
- do { \
- __wt_explicit_overwrite(p, sizeof(*(p))); \
- __wt_free(session, p); \
+#define __wt_overwrite_and_free(session, p) \
+ do { \
+ void *__p = &(p); \
+ if (*(void **)__p != NULL) { \
+ __wt_explicit_overwrite(p, sizeof(*(p))); \
+ __wt_free_int(session, __p); \
+ } \
} while (0)
#define __wt_overwrite_and_free_len(session, p, len) \
do { \
- __wt_explicit_overwrite(p, len); \
- __wt_free(session, p); \
+ void *__p = &(p); \
+ if (*(void **)__p != NULL) { \
+ __wt_explicit_overwrite(p, len); \
+ __wt_free_int(session, __p); \
+ } \
} while (0)
#else
#define __wt_overwrite_and_free(session, p) __wt_free(session, p)
diff --git a/src/third_party/wiredtiger/src/include/txn_inline.h b/src/third_party/wiredtiger/src/include/txn_inline.h
index 0deaf77a532..97fea21a14c 100644
--- a/src/third_party/wiredtiger/src/include/txn_inline.h
+++ b/src/third_party/wiredtiger/src/include/txn_inline.h
@@ -251,10 +251,7 @@ __wt_txn_op_apply_prepare_state(WT_SESSION_IMPL *session, WT_REF *ref, bool comm
txn = session->txn;
- /*
- * Lock the ref to ensure we don't race with eviction freeing the page deleted update list or
- * with a page instantiate.
- */
+ /* Lock the ref to ensure we don't race with page instantiation. */
WT_REF_LOCK(session, ref, &previous_state);
if (commit) {
@@ -264,20 +261,27 @@ __wt_txn_op_apply_prepare_state(WT_SESSION_IMPL *session, WT_REF *ref, bool comm
ts = txn->prepare_timestamp;
prepare_state = WT_PREPARE_INPROGRESS;
}
- for (updp = ref->page_del->update_list; updp != NULL && *updp != NULL; ++updp) {
- (*updp)->start_ts = ts;
- /*
- * Holding the ref locked means we have exclusive access, so if we are committing we don't
- * need to use the prepare locked transition state.
- */
- (*updp)->prepare_state = prepare_state;
+
+ /*
+ * Timestamps and prepare state are in the page deleted structure for truncates, or in the
+ * updates in the case of instantiated pages.
+ */
+ if (previous_state == WT_REF_DELETED) {
+ ref->ft_info.del->timestamp = ts;
if (commit)
- (*updp)->durable_ts = txn->durable_timestamp;
- }
- ref->page_del->timestamp = ts;
- if (commit)
- ref->page_del->durable_timestamp = txn->durable_timestamp;
- WT_PUBLISH(ref->page_del->prepare_state, prepare_state);
+ ref->ft_info.del->durable_timestamp = txn->durable_timestamp;
+ WT_PUBLISH(ref->ft_info.del->prepare_state, prepare_state);
+ } else if ((updp = ref->ft_info.update) != NULL)
+ for (; *updp != NULL; ++updp) {
+ (*updp)->start_ts = ts;
+ /*
+ * Holding the ref locked means we have exclusive access, so if we are committing we
+ * don't need to use the prepare locked transition state.
+ */
+ (*updp)->prepare_state = prepare_state;
+ if (commit)
+ (*updp)->durable_ts = txn->durable_timestamp;
+ }
WT_REF_UNLOCK(ref, previous_state);
}
@@ -295,16 +299,23 @@ __wt_txn_op_delete_commit_apply_timestamps(WT_SESSION_IMPL *session, WT_REF *ref
txn = session->txn;
- /*
- * Lock the ref to ensure we don't race with eviction freeing the page deleted update list or
- * with a page instantiate.
- */
+ /* Lock the ref to ensure we don't race with page instantiation. */
WT_REF_LOCK(session, ref, &previous_state);
- for (updp = ref->page_del->update_list; updp != NULL && *updp != NULL; ++updp) {
- (*updp)->start_ts = txn->commit_timestamp;
- (*updp)->durable_ts = txn->durable_timestamp;
- }
+ /*
+ * Timestamps are in the page deleted structure for truncates, or in the updates in the case of
+ * instantiated pages. Both commit and durable timestamps need to be updated.
+ */
+ if (previous_state == WT_REF_DELETED) {
+ if (ref->ft_info.del->timestamp == WT_TS_NONE) {
+ ref->ft_info.del->timestamp = txn->commit_timestamp;
+ ref->ft_info.del->durable_timestamp = txn->durable_timestamp;
+ }
+ } else if ((updp = ref->ft_info.update) != NULL)
+ for (; *updp != NULL; ++updp) {
+ (*updp)->start_ts = txn->commit_timestamp;
+ (*updp)->durable_ts = txn->durable_timestamp;
+ }
WT_REF_UNLOCK(ref, previous_state);
}
@@ -320,7 +331,6 @@ __wt_txn_op_set_timestamp(WT_SESSION_IMPL *session, WT_TXN_OP *op)
{
WT_TXN *txn;
WT_UPDATE *upd;
- wt_timestamp_t *timestamp;
txn = session->txn;
@@ -345,22 +355,19 @@ __wt_txn_op_set_timestamp(WT_SESSION_IMPL *session, WT_TXN_OP *op)
__txn_resolve_prepared_update(session, upd);
}
} else {
- /*
- * The timestamp is in the page deleted structure for truncates, or in the update for other
- * operations. Both commit and durable timestamps need to be updated.
- */
- timestamp = op->type == WT_TXN_OP_REF_DELETE ? &op->u.ref->page_del->timestamp :
- &op->u.op_upd->start_ts;
- if (*timestamp == WT_TS_NONE) {
- *timestamp = txn->commit_timestamp;
-
- timestamp = op->type == WT_TXN_OP_REF_DELETE ? &op->u.ref->page_del->durable_timestamp :
- &op->u.op_upd->durable_ts;
- *timestamp = txn->durable_timestamp;
- }
-
if (op->type == WT_TXN_OP_REF_DELETE)
__wt_txn_op_delete_commit_apply_timestamps(session, op->u.ref);
+ else {
+ /*
+ * The timestamp is in the update for operations other than truncate. Both commit and
+ * durable timestamps need to be updated.
+ */
+ upd = op->u.op_upd;
+ if (upd->start_ts == WT_TS_NONE) {
+ upd->start_ts = txn->commit_timestamp;
+ upd->durable_ts = txn->durable_timestamp;
+ }
+ }
}
}
@@ -421,9 +428,10 @@ __wt_txn_modify_page_delete(WT_SESSION_IMPL *session, WT_REF *ref)
WT_RET(__txn_next_op(session, &op));
op->type = WT_TXN_OP_REF_DELETE;
-
op->u.ref = ref;
- ref->page_del->txnid = txn->id;
+
+ /* This access to the WT_PAGE_DELETED structure is safe, caller has the WT_REF locked. */
+ ref->ft_info.del->txnid = txn->id;
__wt_txn_op_set_timestamp(session, op);
WT_ERR(__wt_txn_log_op(session, NULL));
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_child.c b/src/third_party/wiredtiger/src/reconcile/rec_child.c
index 56104639e53..6423eb3347d 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_child.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_child.c
@@ -17,7 +17,7 @@ __rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, WT_C
{
WT_PAGE_DELETED *page_del;
- page_del = ref->page_del;
+ page_del = ref->ft_info.del;
/*
* Internal pages with child leaf pages in the WT_REF_DELETED state are a special case during
@@ -61,18 +61,10 @@ __rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, WT_C
* function instantiates an entirely new page.)
*/
if (ref->addr != NULL && !__wt_page_del_active(session, ref, true)) {
- /*
- * Minor memory cleanup: if a truncate call deleted this page and we were ever forced to
- * instantiate the page in memory, we would have built a list of updates in the page
- * reference in order to be able to commit/rollback the truncate. We just passed a
- * visibility test, discard the update list.
- */
- if (page_del != NULL) {
- __wt_free(session, ref->page_del->update_list);
- __wt_free(session, ref->page_del);
- }
-
WT_RET(__wt_ref_block_free(session, ref));
+
+ /* Any fast-truncate information can be freed as soon as the delete is stable. */
+ __wt_overwrite_and_free(session, ref->ft_info.del);
}
/*
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index 42ddf278aec..6c613ffbe87 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -1434,15 +1434,16 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
WT_UPDATE *upd;
wt_timestamp_t candidate_durable_timestamp, prev_durable_timestamp;
uint32_t fileid;
- u_int i;
+ uint8_t previous_state;
+ u_int i, ft_resolution;
#ifdef HAVE_DIAGNOSTIC
u_int prepare_count;
#endif
bool locked, prepare, readonly, update_durable_ts;
- txn = session->txn;
conn = S2C(session);
cursor = NULL;
+ txn = session->txn;
txn_global = &conn->txn_global;
#ifdef HAVE_DIAGNOSTIC
prepare_count = 0;
@@ -1567,6 +1568,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
/* Note: we're going to commit: nothing can fail after this point. */
/* Process and free updates. */
+ ft_resolution = 0;
for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
fileid = op->btree->id;
switch (op->type) {
@@ -1610,8 +1612,12 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
}
break;
case WT_TXN_OP_REF_DELETE:
- __wt_txn_op_set_timestamp(session, op);
- break;
+ /*
+ * Fast-truncate operations are resolved in a second pass after failure is no longer
+ * possible.
+ */
+ ++ft_resolution;
+ continue;
case WT_TXN_OP_TRUNCATE_COL:
case WT_TXN_OP_TRUNCATE_ROW:
/* Other operations don't need timestamps. */
@@ -1623,11 +1629,6 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
if (cursor != NULL)
WT_CLEAR(cursor->key);
}
- txn->mod_count = 0;
-#ifdef HAVE_DIAGNOSTIC
- WT_ASSERT(session, txn->prepare_count == prepare_count);
- txn->prepare_count = 0;
-#endif
if (cursor != NULL) {
WT_ERR(cursor->close(cursor));
@@ -1635,6 +1636,36 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
}
/*
+ * Resolve any fast-truncate transactions and allow eviction to proceed on instantiated pages.
+ * This isn't done as part of the initial processing because until now the commit could still
+ * switch to an abort. The action allowing eviction to proceed is clearing the WT_UPDATE list
+ * (if any) associated with the commit. We're the only consumer of that list and we no longer
+ * need it, and eviction knows it means abort or commit has completed on instantiated pages.
+ */
+ for (i = 0, op = txn->mod; ft_resolution > 0 && i < txn->mod_count; i++, op++)
+ if (op->type == WT_TXN_OP_REF_DELETE) {
+ __wt_txn_op_set_timestamp(session, op);
+
+ WT_REF_LOCK(session, op->u.ref, &previous_state);
+ if (previous_state == WT_REF_DELETED)
+ op->u.ref->ft_info.del->committed = 1;
+ else
+ __wt_free(session, op->u.ref->ft_info.update);
+ WT_REF_UNLOCK(op->u.ref, previous_state);
+
+ __wt_txn_op_free(session, op);
+
+ --ft_resolution;
+ }
+ WT_ASSERT(session, ft_resolution == 0);
+
+ txn->mod_count = 0;
+#ifdef HAVE_DIAGNOSTIC
+ WT_ASSERT(session, txn->prepare_count == prepare_count);
+ txn->prepare_count = 0;
+#endif
+
+ /*
* If durable is set, we'll try to update the global durable timestamp with that value. If
* durable isn't set, durable is implied to be the same as commit so we'll use that instead.
*/
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index c7ebb3654f8..54f9c2e9333 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -1121,8 +1121,8 @@ __rollback_abort_fast_truncate(
* individual WT_UPDATE structures. When reviewing internal pages, ignore the second case, an
* instantiated page is handled when the leaf page is visited.
*/
- if (ref->state == WT_REF_DELETED && ref->page_del != NULL &&
- rollback_timestamp < ref->page_del->durable_timestamp) {
+ if (ref->state == WT_REF_DELETED && ref->ft_info.del != NULL &&
+ rollback_timestamp < ref->ft_info.del->durable_timestamp) {
__wt_verbose(
session, WT_VERB_RECOVERY_RTS(session), "%p: deleted page rolled back", (void *)ref);
WT_RET(__wt_delete_page_rollback(session, ref));
diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c
index 3fd5706efad..3a37b4fd8bf 100644
--- a/src/third_party/wiredtiger/test/format/ops.c
+++ b/src/third_party/wiredtiger/test/format/ops.c
@@ -391,7 +391,8 @@ operations(u_int ops_seconds, bool lastrun)
* The system should be quiescent at this point, call rollback to stable. Generally, we expect
* applications to do rollback-to-stable as part of the database open, but calling it outside of
* the open path is expected in the case of applications that are "restarting" but skipping the
- * close/re-open pair.
+ * close/re-open pair. Note we are not advancing the oldest timestamp, otherwise we wouldn't be
+ * able to replay operations from after rollback-to-stable completes.
*/
tinfo_rollback_to_stable(session);