summary refs log tree commit diff
diff options
context:
space:
mode:
author Luke Chen <luke.chen@mongodb.com> 2022-09-05 13:33:07 +1000
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-09-05 04:37:06 +0000
commit 24143f27c9ba0d3075b4980a57a5b3ee9ec37344 (patch)
tree aeb89a37eac5ebd0f28b5d2600a101e85000c89c
parent b06609850b12a66509613e5c2d5b86fc17ce9de1 (diff)
download mongo-24143f27c9ba0d3075b4980a57a5b3ee9ec37344.tar.gz
Import wiredtiger: d619c325f86cd59ee25d4bcf43b738afcf7bfdf9 from branch mongodb-6.1
ref: 478f555081..d619c325f8
for: 6.1.0-rc1
WT-9720 Clean up and reorganize the fast-truncate code (#8188)
-rw-r--r-- src/third_party/wiredtiger/dist/s_string.ok 2
-rw-r--r-- src/third_party/wiredtiger/import.data 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_cursor.c 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_delete.c 235
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_discard.c 4
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_page.c 8
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_read.c 91
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_split.c 3
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_page.c 57
-rw-r--r-- src/third_party/wiredtiger/src/include/btmem.h 191
-rw-r--r-- src/third_party/wiredtiger/src/include/btree_inline.h 117
-rw-r--r-- src/third_party/wiredtiger/src/include/cell.h 3
-rw-r--r-- src/third_party/wiredtiger/src/include/extern.h 10
-rw-r--r-- src/third_party/wiredtiger/src/include/txn_inline.h 65
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_child.c 86
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_col.c 5
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_write.c 40
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn.c 15
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c 29
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_checkpoint06.py 4
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_rollback_to_stable34.py 6
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_truncate16.py 196
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_truncate17.py 212
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_truncate18.py 207
24 files changed, 1234 insertions, 356 deletions
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
index 9b1ca5f20d2..47a8e0e878d 100644
--- a/src/third_party/wiredtiger/dist/s_string.ok
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -231,6 +231,7 @@ ITEMs
ITER
InitializeCriticalSectionAndSpinCount
Inline
+Instantiation
Intra
Ippokratis
Iu
@@ -445,6 +446,7 @@ TIMESTAMPS
TMP
TODO
TORTIOUS
+TRYLOCK
TSO
TW
TXN
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index e0cc1039833..f30d1fa76fc 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-6.1",
- "commit": "478f5550817985718478ac04f3295e88440f8c3e"
+ "commit": "d619c325f86cd59ee25d4bcf43b738afcf7bfdf9"
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index 993c23bdb34..c410df4a76a 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -364,7 +364,7 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint64_t recno, bool *vali
* us whether the insert was actually an append to allow skipping the on-disk check. Note
* that appends can't have history store content. This is true both for "real" appends at
* the end of the tree and also for appends that are filling in truncated gaps in the middle
- * of the tree -- the gap only appears when the truncation becomes globally visible and at
+ * of the tree -- the gap only appears after the truncation becomes globally visible and at
* that point by definition nothing older can be accessible.
*/
if (cbt->ins != NULL && !F_ISSET(cbt, WT_CBT_VAR_ONPAGE_MATCH))
diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c
index 89efcd8599c..092dd58eb64 100644
--- a/src/third_party/wiredtiger/src/btree/bt_delete.c
+++ b/src/third_party/wiredtiger/src/btree/bt_delete.c
@@ -17,37 +17,42 @@
* The way cursor truncate works is it explicitly reads the first and last pages of the truncate
* range, then walks the tree with a flag so the tree walk code skips reading eligible pages within
* the range and instead just marks them as deleted, by changing their WT_REF state to
- * WT_REF_DELETED. Pages ineligible for this fast path include pages already in the cache, having
- * overflow items, or belonging to FLCS trees. Ineligible pages are read and have their rows
- * updated/deleted individually. The transaction for the delete operation is stored in memory
- * referenced by the WT_REF.ft_info.del field.
+ * WT_REF_DELETED. Pages ineligible for this fast path ("fast-truncate" or "fast-delete") include
+ * pages already in the cache, having overflow items, containing prepared values, or belonging to
+ * FLCS trees. Ineligible pages are read and have their rows updated/deleted individually
+ * ("slow-truncate"). The transaction for the delete operation is stored in memory referenced by the
+ * WT_REF.page_del field.
*
* Future cursor walks of the tree will skip the deleted page based on the transaction stored for
* the delete, but it gets more complicated if a read is done using a random key, or a cursor walk
- * is done with a transaction where the delete is not visible. In those cases, we read the original
- * contents of the page. The page-read code notices a deleted page is being read, and as part of the
- * read instantiates the contents of the page, creating tombstone WT_UPDATE records, in the same
- * transaction that deleted the page. In other words, the read process makes it appear as if the
- * page was read and each individual row deleted, exactly as would have happened if the page had
- * been in the cache all along.
+ * is done with a transaction where the delete is not visible, or if an update is applied. In those
+ * cases, we read the original contents of the page. The page-read code notices a deleted page is
+ * being read, and as part of the read instantiates the contents of the page, creating tombstone
+ * WT_UPDATE records, in the same transaction that deleted the page. In other words, the read
+ * process makes it appear as if the page was read and each individual row deleted, exactly as
+ * would have happened if the page had been in the cache all along.
*
* There's an additional complication to support rollback of the page delete. When the page was
* marked deleted, a pointer to the WT_REF was saved in the deleting session's transaction list and
* the delete is unrolled by resetting the WT_REF_DELETED state back to WT_REF_DISK. However, if the
- * page has been instantiated by some reading thread, that's not enough, each individual row on the
+ * page has been instantiated by some reading thread, that's not enough; each individual row on the
* page must have the delete operation reset. If the page split, the WT_UPDATE lists might have been
* saved/restored during reconciliation and appear on multiple pages, and the WT_REF stored in the
* deleting session's transaction list is no longer useful. For this reason, when the page is
* instantiated by a read, a list of the WT_UPDATE structures on the page is stored in the
- * WT_REF.ft_info.update field, that way the session resolving the delete can find all WT_UPDATE
- * structures that require update.
+ * WT_PAGE_MODIFY.inst_updates field. That way the session resolving the delete can find all
+ * WT_UPDATE structures that require update.
*
* There are two other ways pages can be marked deleted: if they reconcile empty, or if they are
* found to be eligible for deletion and contain only obsolete items. (The latter is known as
- * "checkpoint cleanup" and happens in bt_sync.c.) In these cases, the WT_REF state will be set to
- * WT_REF_DELETED but there will not be any associated WT_REF.ft_info.del field since the page
- * contains no data. These pages are always skipped during cursor traversal, and if read is forced
- * to instantiate such a page, it creates an empty page from scratch.
+ * "checkpoint cleanup" and happens in bt_sync.c.) There are also two cases in which deleted pages
+ * are manufactured out of thin air: in VLCS, if a key-space gap exists between the start recno of
+ * an internal page and the start recno of its first child, a deleted page is created to cover this
+ * space; and, when new trees are created they are created with a single deleted leaf page. In these
+ * cases, the WT_REF state will be set to WT_REF_DELETED but there will not be any associated
+ * WT_REF.page_del field since the page contains no data. These pages are always skipped during
+ * cursor traversal, and if read is forced to instantiate such a page, it creates an empty page from
+ * scratch.
*
* This feature is not available for FLCS objects. While most of the machinery exists (it is mostly
* a property of column-store internal pages) there is a showstopper problem. For VLCS, truncate
@@ -83,13 +88,13 @@
* split operation is delicate and risky and it was better to preserve that page. This requires
* special-case code in four places: (a) in split, for VLCS trees, don't discard the first child ref
* in splits, even if it's deleted and the deletion is globally visible; (b) in VLCS trees, don't
- * attempt reverse splits originating from that page, as that would discard it; (c) when loading an
- * internal page, create an extra ref in this position if the first on-disk child starts at a later
- * recno from the internal page itself; and (d) in verify, accept that the page in this position
- * might be an empty deleted ref with no on-disk address. Note that the critical issue is not
- * _discarding_ this page after deleting it. It is fine for it to _be_ deleted, as long as the ref
- * always exists when the internal page is in memory. (It is not written to disk either; internal
- * page reconciliation skips it.)
+ * attempt reverse splits originating from that page, as that would discard it; (c) as noted above,
+ * when loading an internal page, create an extra ref in this position if the first on-disk child
+ * starts at a later recno from the internal page itself; and (d) in verify, accept that the page
+ * in this position might be an empty deleted ref with no on-disk address. Note that the critical
+ * issue is that one must not _discard_ this page after deleting it. It is fine for it to _be_
+ * deleted, as long as the ref always exists when the internal page is in memory. (It is not written
+ * to disk either; internal page reconciliation skips it.)
*/
/*
@@ -136,13 +141,14 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
return (0);
/*
- * There should be no previous page-delete information: if the previous fast-truncate didn't
- * instantiate the page, then we'd never get here to do another delete; if the previous fast-
- * truncate did instantiate the page, then (for a read-write tree; we can't get here in a
- * readonly tree) any fast-truncate information was removed at that point and/or when the
- * fast-truncate transaction was resolved.
+ * There should be no previous page-delete information: if the page was previously deleted and
+ * remains deleted, it'll be in WT_REF_DELETED state and we won't get here to do another delete.
+ * If the page was previously deleted and instantiated, we can only get here if it was written
+ * out again or we successfully just evicted it; in that case, the reconciliation will have
+ * cleared the final traces of the previous deletion and instantiation. Furthermore, any prior
+ * deletion must have committed or another attempt would have failed with an update conflict.
*/
- WT_ASSERT(session, ref->ft_info.del == NULL);
+ WT_ASSERT(session, ref->page_del == NULL);
/*
* We cannot truncate pages that have overflow key/value items as the overflow blocks have to be
@@ -151,6 +157,8 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
*
* Additionally, if the page has prepared updates or the aggregated start time point on the page
* is not visible to us then we cannot truncate the page.
+ *
+ * Note that we indicate this by succeeding without setting the skip flag, not via EBUSY.
*/
if (!__wt_ref_addr_copy(session, ref, &addr))
goto err;
@@ -171,8 +179,8 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
WT_ERR(__wt_page_parent_modify_set(session, ref, false));
/* Allocate and initialize the page-deleted structure. */
- WT_ERR(__wt_calloc_one(session, &ref->ft_info.del));
- ref->ft_info.del->previous_ref_state = previous_state;
+ WT_ERR(__wt_calloc_one(session, &ref->page_del));
+ ref->page_del->previous_ref_state = previous_state;
/* History store truncation is non-transactional. */
if (!WT_IS_HS(session->dhandle))
@@ -186,7 +194,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
return (0);
err:
- __wt_free(session, ref->ft_info.del);
+ __wt_free(session, ref->page_del);
/* Publish the page to its previous state, ensuring visibility. */
WT_REF_SET_STATE(ref, previous_state);
@@ -205,7 +213,7 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
uint8_t current_state;
bool locked;
- /* Lock the reference. We cannot access ref->ft_info.del except when locked. */
+ /* Lock the reference. We cannot access ref->page_del except when locked. */
for (locked = false, sleep_usecs = yield_count = 0;;) {
switch (current_state = ref->state) {
case WT_REF_LOCKED:
@@ -235,17 +243,23 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
/*
* There are two possible cases:
*
- * 1. The state is WT_REF_DELETED. In this case ft_info.del cannot be null, because the
+ * 1. The state is WT_REF_DELETED. In this case page_del cannot be null, because the
* operation cannot reach global visibility while its transaction remains uncommitted. The page
* itself is as we left it, so we can just reset the state.
*
- * 2. The state is WT_REF_MEM. We check ft_info.update for a list of updates to abort. Allow the
- * update list to be null to be conservative.
+ * 2. The state is WT_REF_MEM. We check mod->inst_updates for a list of updates to abort. Allow
+ * the update list to be null to be conservative.
*/
- if (current_state == WT_REF_DELETED)
- current_state = ref->ft_info.del->previous_ref_state;
- else {
- if ((updp = ref->ft_info.update) != NULL)
+ if (current_state == WT_REF_DELETED) {
+ current_state = ref->page_del->previous_ref_state;
+ /*
+ * Don't set the WT_PAGE_DELETED transaction ID to aborted; instead, just discard the
+ * structure. This avoids having to check for an aborted delete in other situations.
+ */
+ __wt_free(session, ref->page_del);
+ } else {
+ WT_ASSERT(session, ref->page != NULL && ref->page->modify != NULL);
+ if ((updp = ref->page->modify->inst_updates) != NULL) {
/*
* Walk any list of update structures and abort them. We can't use the normal read path
* to get the pages with updates (the original page may have split, so there may be more
@@ -255,26 +269,20 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
*/
for (; *updp != NULL; ++updp)
(*updp)->txnid = WT_TXN_ABORTED;
- WT_ASSERT(session, ref->page != NULL && ref->page->modify != NULL);
+ /* Now discard the updates. */
+ __wt_free(session, ref->page->modify->inst_updates);
+ }
/*
- * Drop any page_deleted information that has been moved to the modify structure. Note that
- * while this must have been an instantiated page, the information (and flag) is only kept
- * until the page is reconciled for the first time after instantiation, so it might not be
- * set now.
+ * Drop any page_deleted information remaining in the ref. Note that while this must have
+ * been an instantiated page, the information (and flag) is only kept until the page is
+ * reconciled for the first time after instantiation, so it might not be set now.
*/
if (ref->page->modify->instantiated) {
ref->page->modify->instantiated = false;
- __wt_free(session, ref->page->modify->page_del);
+ __wt_free(session, ref->page_del);
}
}
- /*
- * Don't set the WT_PAGE_DELETED transaction ID to aborted, discard any WT_UPDATE list or set
- * the committed flag; instead, discard the structures, it has the same effect. It's a single
- * call, they're a union of two pointers.
- */
- __wt_free(session, ref->ft_info.del);
-
WT_REF_SET_STATE(ref, current_state);
return (0);
}
@@ -293,8 +301,8 @@ __delete_redo_window_cleanup_internal(WT_SESSION_IMPL *session, WT_REF *ref)
WT_ASSERT(session, F_ISSET(ref, WT_REF_FLAG_INTERNAL));
if (ref->page != NULL) {
WT_INTL_FOREACH_BEGIN (session, ref->page, child) {
- if (child->state == WT_REF_DELETED && child->ft_info.del != NULL)
- __cell_redo_page_del_cleanup(session, ref->page->dsk, child->ft_info.del);
+ if (child->state == WT_REF_DELETED && child->page_del != NULL)
+ __cell_redo_page_del_cleanup(session, ref->page->dsk, child->page_del);
}
WT_INTL_FOREACH_END;
}
@@ -303,7 +311,9 @@ __delete_redo_window_cleanup_internal(WT_SESSION_IMPL *session, WT_REF *ref)
/*
* __delete_redo_window_cleanup_skip --
* Tree-walk skip function for __wt_delete_redo_window_cleanup. This skips all leaf pages; we'll
- * visit all in-memory internal pages via the flag settings on the tree-walk call.
+ * visit all in-memory internal pages via the flag settings on the tree-walk call. Note that we
+ * won't be called (even here) for deleted leaf pages themselves, because they're skipped by
+ * default.
*/
static int
__delete_redo_window_cleanup_skip(
@@ -352,18 +362,19 @@ __wt_delete_redo_window_cleanup(WT_SESSION_IMPL *session)
bool
__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
{
- bool skip;
+ bool discard, skip;
/*
- * Deleted pages come from two sources: either it's a truncate as described above, or the page
- * has been emptied by other operations and eviction deleted it.
+ * Deleted pages come from several possible sources (as described at the top of this file).
*
- * In both cases, the WT_REF state will be WT_REF_DELETED. In the case of a truncated page,
- * there will be a WT_PAGE_DELETED structure with the transaction ID of the transaction that
- * deleted the page, and the page is visible if that transaction ID is visible. In the case of
- * an empty page, there will be no WT_PAGE_DELETED structure and the delete is by definition
- * visible, eviction could not have deleted the page if there were changes on it that were not
- * globally visible.
+ * In all cases, the WT_REF state will be WT_REF_DELETED. If there is a WT_PAGE_DELETED
+ * structure describing a transaction, the deletion is visible (so the page is *not* visible) if
+ * the transaction is visible. If there is no WT_PAGE_DELETED structure, the deletion is
+ * globally visible. This happens either because the structure described a transaction that had
+ * become globally visible and was previously removed, or because the page was deleted by a
+ * non-transactional mechanism. (In the latter case, the deletion is inherently globally
+ * visible; pages only become empty if nothing in them remains visible to anyone, and newly
+ * minted empty pages cannot have anything in them to see.)
*
* We're here because we found a WT_REF state set to WT_REF_DELETED. It is possible the page is
* being read into memory right now, though, and the page could switch to an in-memory state at
@@ -372,21 +383,28 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
if (!WT_REF_CAS_STATE(session, ref, WT_REF_DELETED, WT_REF_LOCKED))
return (false);
- skip = !__wt_page_del_active(session, ref, visible_all);
-
/*
- * The fast-truncate structure can be freed as soon as the delete is stable: it is only read
- * when the ref state is locked. It is worth checking every time we come through because once
- * this is freed, we no longer need synchronization to check the ref.
+ * Check visibility.
*
- * Note that if the visible_all flag is set, skip already reflects the visible_all result so we
- * don't need to do it twice.
+ * Use the option to hide prepared transactions in all checks; we can't skip a page if the
+ * deletion is only prepared (we need to visit it to generate a prepare conflict), and we can't
+ * discard the page_del info either, as doing so leads to dropping the on-disk page and if the
+ * prepared transaction rolls back we'd then be in trouble.
*/
- if (skip && ref->ft_info.del != NULL &&
- (visible_all ||
- __wt_txn_visible_all(
- session, ref->ft_info.del->txnid, ref->ft_info.del->durable_timestamp)))
- __wt_overwrite_and_free(session, ref->ft_info.del);
+ if (visible_all)
+ skip = discard = __wt_page_del_visible_all(session, ref->page_del, true);
+ else {
+ skip = __wt_page_del_visible(session, ref->page_del, true);
+ discard = skip ? __wt_page_del_visible_all(session, ref->page_del, true) : false;
+ }
+
+ /*
+ * The fast-truncate structure can be freed as soon as the delete is globally visible: it is
+ * only read when the ref state is locked. It is worth checking every time we come through
+ * because once this is freed, we no longer need synchronization to check the ref.
+ */
+ if (discard && ref->page_del != NULL)
+ __wt_overwrite_and_free(session, ref->page_del);
WT_REF_SET_STATE(ref, WT_REF_DELETED);
return (skip);
@@ -406,7 +424,8 @@ __tombstone_update_alloc(
F_SET(upd, WT_UPDATE_RESTORED_FAST_TRUNCATE);
/*
- * Cleared memory matches the lowest possible transaction ID and timestamp, do nothing.
+ * Cleared memory matches the lowest possible transaction ID and timestamp; do nothing if the
+ * page_del pointer is null.
*/
if (page_del != NULL) {
upd->txnid = page_del->txnid;
@@ -430,7 +449,9 @@ __instantiate_tombstone(WT_SESSION_IMPL *session, WT_PAGE_DELETED *page_del,
/*
* If we find an existing stop time point we don't need to append a tombstone. Such rows would
* not have been visible to the original truncate operation and were, logically, skipped over
- * rather than re-deleted.
+ * rather than re-deleted. (If the row _was_ visible to the truncate in spite of having been
+ * subsequently removed, the stop time not being visible would have forced its page to be slow-
+ * truncated rather than fast-truncated.)
*/
if (WT_TIME_WINDOW_HAS_STOP(tw))
*updp = NULL;
@@ -587,30 +608,31 @@ err:
* Instantiate an entirely deleted row-store leaf page.
*/
int
-__wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_DELETED *page_del)
+__wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
{
WT_DECL_RET;
WT_PAGE *page;
+ WT_PAGE_DELETED *page_del;
WT_ROW *rip;
WT_UPDATE **update_list;
uint32_t count, i;
/*
* An operation is accessing a "deleted" page, and we're building an in-memory version of the
- * page (making it look like all entries in the page were individually updated by a remove
- * operation). We end up here if a transaction used a truncate call to delete the page without
+ * page, making it look like all entries in the page were individually updated by a remove
+ * operation. We end up here if a transaction used a truncate call to delete the page without
* reading it, and something else that can't yet see the truncation decided to read the page.
+ * (We also end up here if someone who _can_ see the truncation writes new data into the same
+ * namespace before the deleted pages are discarded.)
*
* This can happen after the truncate transaction resolves, but it can also happen before. In
* the latter case, we need to keep track of the updates we populate the page with, so they can
* be found when the transaction resolves. The page we're loading might split, in which case
* finding the updates any other way would become a problem.
- *
- * The page_del structure passed in is either ref->ft_info.del, or under certain circumstances
- * when that's unavailable, one extracted from the parent page's address cell.
*/
page = ref->page;
+ page_del = ref->page_del;
update_list = NULL;
/* Fast-truncate only happens to leaf pages, and FLCS isn't supported. */
@@ -626,20 +648,18 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_DELE
WT_STAT_CONN_DATA_INCR(session, cache_read_deleted_prepared);
/*
- * Give the page a modify structure and mark the page dirty if the tree isn't read-only. If the
- * tree can be written, the page must be marked dirty: otherwise it can be discarded, and that
- * will lose the truncate information if the parent page hasn't been reconciled since the
- * truncation happened.
+ * Give the page a modify structure. We need it to remember that the page has been instantiated.
+ * We do not need to mark the page dirty here. (It used to be necessary because evicting a clean
+ * instantiated page would lose the delete information; but that is no longer the case.) Note
+ * though that because VLCS instantiation goes through col_modify it will mark the page dirty
+ * regardless, except in read-only trees where attempts to mark things dirty are ignored. (Row-
+ * store instantiation adds the tombstones by hand and so does not need to mark the page dirty.)
*
- * If the tree cannot be written (checked in page-modify-set), we won't dirty the page. In this
- * case the truncate information must have been read from the parent page's on-disk cell, so we
- * can fetch it again if we discard the page and then reread it.
- *
- * Truncates can appear in read-only trees (whether a read-only open of the live database or via
- * a checkpoint cursor) if they were not yet globally visible when the tree was checkpointed.
+ * Note that partially visible truncates that may need instantiation can appear in read-only
+ * trees (whether a read-only open of the live database or via a checkpoint cursor) if they were
+ * not yet globally visible when the tree was checkpointed.
*/
WT_RET(__wt_page_modify_init(session, page));
- __wt_page_modify_set(session, page);
/*
* If the truncate operation is not yet resolved, count how many updates we're going to need and
@@ -681,23 +701,14 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_DELE
break;
}
+ page->modify->instantiated = true;
+ page->modify->inst_updates = update_list;
+
/*
- * Move the WT_PAGE_DELETED structure to page->modify; all of its information has been copied to
- * the list of WT_UPDATE structures (if any), but we may still need it for internal page
- * reconciliation.
- *
- * Note: when the page_del passed in isn't the one in the ref, there should be none in the ref.
- * This only happens in readonly trees (see bt_page.c) and is a consequence of it being possible
- * for a deleted page to be in WT_REF_DISK state if it's already been instantiated once and then
- * evicted. In this case we can set modify->page_del to NULL regardless of the truncation's
- * visibility (rather than copying the passed-in information); modify->page_del is only used by
- * parent-page reconciliation and readonly trees shouldn't ever reach that code.
+ * We will leave the WT_PAGE_DELETED structure in the ref; all of its information has been
+ * copied to the list of WT_UPDATE structures (if any), but we may still need it for internal
+ * page reconciliation until the instantiated page is itself successfully reconciled.
*/
- WT_ASSERT(session, page_del == ref->ft_info.del || ref->ft_info.del == NULL);
- page->modify->instantiated = true;
- page->modify->page_del = ref->ft_info.del;
- /* We don't need to null ft_info.del because assigning ft_info.update overwrites it. */
- ref->ft_info.update = update_list;
return (0);
diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c
index af5c1c523c9..6af568b3a76 100644
--- a/src/third_party/wiredtiger/src/btree/bt_discard.c
+++ b/src/third_party/wiredtiger/src/btree/bt_discard.c
@@ -215,7 +215,7 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
__wt_ovfl_discard_free(session, page);
__wt_free(session, page->modify->ovfl_track);
- __wt_free(session, page->modify->page_del);
+ __wt_free(session, page->modify->inst_updates);
__wt_spin_destroy(session, &page->modify->page_lock);
__wt_free(session, page->modify);
@@ -295,7 +295,7 @@ __wt_free_ref(WT_SESSION_IMPL *session, WT_REF *ref, int page_type, bool free_pa
__wt_ref_addr_free(session, ref);
/* Free any backing fast-truncate memory. */
- __wt_free(session, ref->ft_info.del);
+ __wt_free(session, ref->page_del);
__wt_overwrite_and_free_len(session, ref, WT_REF_CLEAR_SIZE);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c
index 5283ac827f7..87abfebf041 100644
--- a/src/third_party/wiredtiger/src/btree/bt_page.c
+++ b/src/third_party/wiredtiger/src/btree/bt_page.c
@@ -649,8 +649,8 @@ __inmem_col_int_init_ref(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *home, u
* fast-delete state for the page.
*/
if (page_del != NULL && F_ISSET(home->dsk, WT_PAGE_FT_UPDATE)) {
- WT_RET(__wt_calloc_one(session, &ref->ft_info.del));
- *ref->ft_info.del = *page_del;
+ WT_RET(__wt_calloc_one(session, &ref->page_del));
+ *ref->page_del = *page_del;
}
WT_REF_SET_STATE(ref, WT_REF_DELETED);
}
@@ -871,8 +871,8 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
* Recreate the fast-delete state for the page.
*/
if (F_ISSET(page->dsk, WT_PAGE_FT_UPDATE)) {
- WT_ERR(__wt_calloc_one(session, &ref->ft_info.del));
- *ref->ft_info.del = unpack.page_del;
+ WT_ERR(__wt_calloc_one(session, &ref->page_del));
+ *ref->page_del = unpack.page_del;
}
WT_REF_SET_STATE(ref, WT_REF_DELETED);
diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c
index 2f6f4e3eb88..6baf6f780df 100644
--- a/src/third_party/wiredtiger/src/btree/bt_read.c
+++ b/src/third_party/wiredtiger/src/btree/bt_read.c
@@ -95,7 +95,6 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
WT_DECL_RET;
WT_ITEM tmp;
WT_PAGE *notused;
- WT_PAGE_DELETED *del;
uint32_t page_flags;
uint8_t previous_state;
bool prepare;
@@ -118,24 +117,68 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
}
/*
- * Set the WT_REF_FLAG_READING flag for normal reads. Checkpoints can skip over clean pages
- * being read into cache, but need to wait for deletes to be resolved (in order for checkpoint
- * to write the correct version of the page).
+ * Set the WT_REF_FLAG_READING flag for normal reads; this causes reconciliation of the parent
+ * page to skip examining this page in detail and write out a reference to the on-disk version.
+ * Don't do this for deleted pages, as the reconciliation needs to examine the page delete
+ * information. That requires locking the ref, which requires waiting for the read to finish.
+ * (It is possible that always writing out a reference to the on-disk version of the page is
+ * sufficient in this case, but it's not entirely clear; we expect reads of deleted pages to be
+ * rare, so it's better to do the safe thing.)
*/
if (previous_state == WT_REF_DISK)
F_SET(ref, WT_REF_FLAG_READING);
/*
* Get the address: if there is no address, the page was deleted and a subsequent search or
- * insert is forcing re-creation of the name space.
+ * insert is forcing re-creation of the name space. There can't be page delete information,
+ * because that information is an amendment to an on-disk page; when a page is deleted any page
+ * delete information should expire and be removed before the original on-disk page is actually
+ * discarded.
*/
if (!__wt_ref_addr_copy(session, ref, &addr)) {
WT_ASSERT(session, previous_state == WT_REF_DELETED);
-
+ WT_ASSERT(session, ref->page_del == NULL);
WT_ERR(__wt_btree_new_leaf_page(session, ref));
goto skip_read;
}
+ /*
+ * If the page is deleted and the deletion is globally visible, don't bother reading and
+ * explicitly instantiating the existing page. Get a fresh page and pretend we got it by reading
+ * the on-disk page. Note that it's important to set the instantiated flag on the page so that
+ * reconciling the parent internal page knows it was previously deleted. Otherwise it's possible
+ * to write out a reference to the original page without the deletion, which will cause it to
+ * come back to life unexpectedly.
+ *
+ * Setting the instantiated flag requires a modify structure. We don't need to mark it dirty; if
+ * it gets discarded before something else modifies it, eviction will see the instantiated flag
+ * and set the ref state back to WT_REF_DELETED.
+ *
+ * Skip this optimization in cases that need the obsolete values. To minimize the number of
+ * special cases, use the same test as for skipping instantiation below.
+ */
+ if (previous_state == WT_REF_DELETED &&
+ !F_ISSET(S2BT(session), WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) {
+ /*
+ * If the deletion has not yet been found to be globally visible (page_del isn't NULL),
+ * check if it is now, in case we can in fact avoid reading the page. Hide prepared deletes
+ * from this check; if the deletion is prepared we still need to load the page, because the
+ * reader might be reading at a timestamp early enough to not conflict with the prepare.
+ * Update oldest before checking; we're about to read from disk so it's worth doing some
+ * work to avoid that.
+ */
+ WT_ERR(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));
+ if (ref->page_del != NULL && __wt_page_del_visible_all(session, ref->page_del, true))
+ __wt_overwrite_and_free(session, ref->page_del);
+
+ if (ref->page_del == NULL) {
+ WT_ERR(__wt_btree_new_leaf_page(session, ref));
+ WT_ERR(__wt_page_modify_init(session, ref->page));
+ ref->page->modify->instantiated = true;
+ goto skip_read;
+ }
+ }
+
/* There's an address, read the backing disk page and build an in-memory version of the page. */
WT_ERR(__wt_blkcache_read(session, &tmp, addr.addr, addr.size));
@@ -159,27 +202,27 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
/*
* In the case of a fast delete, move all of the page's records to a deleted state based on the
* fast-delete information. Skip for special commands that don't care about an in-memory state.
+ * (But do set up page->modify and set page->modify->instantiated so evicting the pages while
+ * these commands are working doesn't go off the rails.)
*
- * Note: there are three possible cases - the state was WT_REF_DELETED and ft_info.del was NULL;
- * the state was WT_REF_DELETED and ft_info.del was non-NULL; and the state was WT_REF_DISK and
- * the parent page cell was a WT_CELL_ADDR_DEL cell. The last is only valid in a readonly tree.
+ * There are two possible cases: the state was WT_REF_DELETED and page_del was or wasn't NULL.
+ * It used to also be possible for eviction to set the state to WT_REF_DISK while the parent
+ * page nonetheless had a WT_CELL_ADDR_DEL cell. This is not supposed to happen any more, so for
+ * now at least assert it doesn't.
*
- * ft_info.del gets cleared and set to NULL if the deletion is found to be globally visible;
- * this can happen in any of several places.
+ * page_del gets cleared and set to NULL if the deletion is found to be globally visible; this
+ * can happen in any of several places.
*/
- del = NULL;
- if (previous_state == WT_REF_DISK) {
- WT_ASSERT(session, ref->ft_info.del == NULL);
- if (addr.del_set) {
- WT_ASSERT(session, F_ISSET(S2BT(session), WT_BTREE_READONLY));
- del = &addr.del;
- }
- } else
- del = ref->ft_info.del;
-
- if ((previous_state == WT_REF_DELETED || del != NULL) &&
- !F_ISSET(S2BT(session), WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
- WT_ERR(__wt_delete_page_instantiate(session, ref, del));
+ WT_ASSERT(
+ session, previous_state != WT_REF_DISK || (ref->page_del == NULL && addr.del_set == false));
+
+ if (previous_state == WT_REF_DELETED) {
+ if (F_ISSET(S2BT(session), WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) {
+ WT_ERR(__wt_page_modify_init(session, ref->page));
+ ref->page->modify->instantiated = true;
+ } else
+ WT_ERR(__wt_delete_page_instantiate(session, ref));
+ }
skip_read:
F_CLR(ref, WT_REF_FLAG_READING);
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index cf719a791aa..a48210bd5bc 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -607,7 +607,7 @@ __split_parent_discard_ref(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *paren
}
/* Free any backing fast-truncate memory. */
- __wt_free(session, ref->ft_info.del);
+ __wt_free(session, ref->page_del);
/* Free the backing block and address. */
WT_TRET(__wt_ref_block_free(session, ref));
@@ -1780,7 +1780,6 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
*/
WT_ASSERT(session, __wt_leaf_page_can_split(session, page));
WT_ASSERT(session, __wt_page_is_modified(page));
- WT_ASSERT(session, ref->ft_info.del == NULL);
F_SET_ATOMIC_16(page, WT_PAGE_SPLIT_INSERT); /* Only split in-memory once. */
diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c
index 2a9c8454458..3917c6c5b25 100644
--- a/src/third_party/wiredtiger/src/evict/evict_page.c
+++ b/src/third_party/wiredtiger/src/evict/evict_page.c
@@ -323,18 +323,36 @@ static int
__evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
{
WT_DECL_RET;
+ bool instantiated;
/*
- * Discard the page and update the reference structure. A page with a disk address is an on-disk
- * page, and a page without a disk address is a re-instantiated deleted page (for example, by
- * searching), that was never subsequently written.
+ * We might discard an instantiated deleted page, because instantiated pages are not marked
+ * dirty by default. Check this before discarding the modify structure in __wt_ref_out.
+ */
+ if (ref->page->modify != NULL && ref->page->modify->instantiated)
+ instantiated = true;
+ else {
+ WT_ASSERT(session, ref->page_del == NULL);
+ instantiated = false;
+ }
+
+ /*
+ * Discard the page and update the reference structure. A leaf page without a disk address is a
+ * deleted page that either was created empty and never written out, or had its on-disk page
+ * discarded already after the deletion became globally visible. It is not immediately clear if
+ * it's possible to get an internal page without a disk address here, but if one appears it can
+ * be deleted. (Note that deleting an internal page implicitly turns it into a leaf.)
+ *
+ * A page with a disk address is now on disk, unless it was deleted and instantiated and then
+ * evicted unmodified, in which case it is still deleted. In the latter case set the state back
+ * to WT_REF_DELETED.
*/
__wt_ref_out(session, ref);
if (ref->addr == NULL) {
WT_WITH_PAGE_INDEX(session, ret = __evict_delete_ref(session, ref, flags));
WT_RET_BUSY_OK(ret);
} else
- WT_REF_SET_STATE(ref, WT_REF_DISK);
+ WT_REF_SET_STATE(ref, instantiated ? WT_REF_DELETED : WT_REF_DISK);
return (0);
}
@@ -471,8 +489,8 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent)
/*
* It is always OK to evict pages from checkpoint cursor trees if they don't have children, and
- * visibility checks for pages deleted in the checkpoint aren't needed (or correct when done in
- * eviction threads).
+ * visibility checks for pages found to be deleted in the checkpoint aren't needed (or correct
+ * when done in eviction threads).
*/
if (WT_READING_CHECKPOINT(session))
return (0);
@@ -501,6 +519,22 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent)
if (!__wt_atomic_casv8(&child->state, WT_REF_DELETED, WT_REF_LOCKED))
return (__wt_set_return(session, EBUSY));
/*
+ * Insert a read/read barrier so we're guaranteed the page_del state we read below comes
+ * after the locking operation on the ref state and therefore after the previous unlock
+ * of the ref. Otherwise we might read an inconsistent view of the page deletion info,
+ * and while many combinations are harmless and would just lead us to falsely refuse to
+ * evict, some (e.g. reading committed as true and a stale durable timestamp from before
+ * it was set by commit) are not.
+ *
+ * Note that while ordinarily a lock acquire should have an acquire (read/any) barrier
+ * after it, because we are only reading, the write part is irrelevant and a read/read
+ * barrier is sufficient.
+ *
+ * FIXME-WT-9780: this and the CAS should be rolled into a WT_REF_TRYLOCK macro.
+ */
+ WT_READ_BARRIER();
+
+ /*
* We can evict any truncation that's committed. However, restrictions in reconciliation
* mean that it needs to be visible to us when we get there. And unfortunately we are
* upstream of the point where eviction threads get snapshots. Plus, application threads
@@ -512,15 +546,20 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent)
* 3. If we do not but we're an eviction thread, go ahead. We will get a snapshot
* shortly and any committed operation will be visible in it.
* 4. Otherwise, check if the operation is globally visible.
+ *
+ * Even though we specifically can't evict prepared truncations, we don't need to deploy
+ * the special-case logic for prepared transactions in __wt_page_del_visible; prepared
+ * transactions aren't committed so they'll fail the first check.
*/
- if (!__wt_page_del_committed(child->ft_info.del))
+ if (!__wt_page_del_committed(child->page_del))
visible = false;
else if (F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT))
- visible = __wt_page_del_visible(session, child->ft_info.del, false);
+ visible = __wt_page_del_visible(session, child->page_del, false);
else if (F_ISSET(session, WT_SESSION_EVICTION))
visible = true;
else
- visible = __wt_page_del_visible(session, child->ft_info.del, true);
+ visible = __wt_page_del_visible_all(session, child->page_del, false);
+ /* FIXME-WT-9780: is there a reason this doesn't use WT_REF_UNLOCK? */
child->state = WT_REF_DELETED;
if (!visible)
return (__wt_set_return(session, EBUSY));
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index c67d8c3477d..3b3dbdb538f 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -445,9 +445,15 @@ struct __wt_page_modify {
/* Overflow record tracking for reconciliation. */
WT_OVFL_TRACK *ovfl_track;
- /* Cached page-delete information for newly instantiated deleted pages. */
- WT_PAGE_DELETED *page_del; /* Deletion information; NULL if globally visible. */
- bool instantiated; /* True if this is a newly instantiated page. */
+ /*
+ * Page-delete information for newly instantiated deleted pages. The instantiated flag remains
+ * set until the page is reconciled successfully; this indicates that the page_del information
+ * in the ref remains valid. The update list remains set (if set at all) until the transaction
+ * that deleted the page is resolved. These transitions are independent; that is, the first
+ * reconciliation can happen either before or after the delete transaction resolves.
+ */
+ bool instantiated; /* True if this is a newly instantiated page. */
+ WT_UPDATE **inst_updates; /* Update list for instantiated page with unresolved truncate. */
#define WT_PAGE_LOCK(s, p) __wt_spin_lock((s), &(p)->modify->page_lock)
#define WT_PAGE_TRYLOCK(s, p) __wt_spin_trylock((s), &(p)->modify->page_lock)
@@ -807,16 +813,17 @@ struct __wt_page {
*
* WT_REF_DELETED:
* The page is on disk, but has been deleted from the tree; we can delete
- * row-store leaf pages without reading them if they don't reference
- * overflow items.
+ * row-store and VLCS leaf pages without reading them if they don't
+ * reference overflow items.
*
* WT_REF_LOCKED:
* Locked for exclusive access. In eviction, this page or a parent has
* been selected for eviction; once hazard pointers are checked, the page
* will be evicted. When reading a page that was previously deleted, it
- * is locked until the page is in memory with records marked deleted. The
- * thread that set the page to WT_REF_LOCKED has exclusive access, no
- * other thread may use the WT_REF until the state is changed.
+ * is locked until the page is in memory and the deletion has been
+ * instantiated with tombstone updates. The thread that set the page to
+ * WT_REF_LOCKED has exclusive access; no other thread may use the WT_REF
+ * until the state is changed.
*
* WT_REF_MEM:
* Set by a reading thread once the page has been read from disk; the page
@@ -847,7 +854,9 @@ struct __wt_page {
/*
* WT_PAGE_DELETED --
- * Related information for truncated pages.
+ * Information about how deleted pages came to be deleted. This structure records the
+ * transaction that deleted the page, plus the state the ref was in when the deletion happened.
+ * This structure is akin to an update but applies to a whole page.
*/
struct __wt_page_deleted {
/*
@@ -863,8 +872,8 @@ struct __wt_page_deleted {
wt_timestamp_t durable_timestamp;
/*
- * The prepare state is used for transaction prepare to manage visibility and inheriting prepare
- * state to update_list.
+ * The prepare state is used for transaction prepare to manage visibility and propagating the
+ * prepare state to the updates generated at instantiation time.
*/
volatile uint8_t prepare_state;
@@ -952,83 +961,119 @@ struct __wt_ref {
#define ref_ikey key.ikey
/*
- * Fast-truncate information, written-to/read-from disk as necessary in the internal page's
- * deleted page proxy cell. When a WT_REF first becomes part of a fast-truncate operation, the
- * ft_info.del field is allocated and initialized.
+ * Page deletion information, written-to/read-from disk as necessary in the internal page's
+ * address cell. (Deleted-address cells are also referred to as "proxy cells".) When a WT_REF
+ * first becomes part of a fast-truncate operation, the page_del field is allocated and
+ * initialized; it is similar to an update and holds information about the transaction that
+ * performed the truncate. It can be discarded and set to NULL when that transaction reaches
+ * global visibility.
*
- * Fast-truncate pages might have to be instantiated if a thread for which the operation isn't
- * visible accesses the page. This can happen if the operation hasn't committed yet; it can also
- * happen if an older read transaction visits the page, and it can happen if the fast-truncate
- * operation is included in a checkpoint and then seen later, after a restart or via a
- * checkpoint cursor.
+ * Operations other than truncate that produce deleted pages (checkpoint cleanup, reconciliation
+ * as empty, etc.) leave the page_del field NULL as in these cases the deletion is already
+ * globally visible.
*
- * If the page must be instantiated for any reason: (1) WT_UPDATE structures are created for the
- * page entries, (2) the transaction information from ft_info.del is copied to those WT_UPDATE
- * structures (making them a match for the truncate operation), (3) the ft_info.del field is
- * discarded, and (4) the WT_REF state switches to WT_REF_MEM.
+ * Once the deletion is globally visible, the original on-disk page is no longer needed and can
+ * be discarded; this happens the next time the parent page is reconciled, either by eviction or
+ * by a checkpoint. The ref remains, however, and still occupies the same key space in the table
+ * that it always did.
*
- * If the fast-truncate operation has not yet committed, additionally the ft_info.update field
- * is created, which is an array of references to the WT_UPDATE structures, for subsequent
- * transaction commit/abort. (The page can split, so there needs to be some way to find all of
- * the update structures.)
+ * Deleted refs (and thus chunks of the tree namespace) are only discarded at two points: when
+ * the parent page is discarded after being evicted, or in the course of internal page splits
+ * and reverse splits. Until this happens, the "same" page can be brought back to life by
+ * writing to its portion of the key space.
*
- * Doing anything other than testing if ft_info.del or ft_info.update is non-NULL (which
- * eviction does) requires the WT_REF be locked.
+ * A deleted page needs to be "instantiated" (read in from disk and converted to an in-memory
+ * page where every item on the page has been individually deleted) if we need to position a
+ * cursor on the page, or if we need to visit it for other reasons. Logic exists to avoid that
+ * in various common cases (see: __wt_btcur_skip_page, __wt_delete_page_skip) but in many less
+ * common situations we proceed with instantiation anyway to avoid multiplying the number of
+ * special cases in the system.
*
- * Because ft_info is a union it is important to always access the correct field. It is also
- * vital to interpret the state correctly and consider all the possible cases.
+ * Common triggers for instantiation include: another thread reading from the page before a
+ * truncate commits; an older reader visiting a page after a truncate commits; a thread reading
+ * the page via a checkpoint cursor if the truncation wasn't yet globally visible at checkpoint
+ * time; a thread reading the page after shutdown and restart under similar circumstances; RTS
+ * needing to roll back a committed but unstable truncation (and possibly also updates that
+ * occurred before the truncation); and a thread writing to the truncated portion of the table
+ * space after the truncation but before the page is completely discarded.
*
- * The union access should be ft_info.del if the state is WT_REF_DELETED (states 1 and 2 below),
- * and should be ft_info.update if the state is WT_REF_MEM (states 5-6 below). Otherwise,
- * neither field is valid and the pointer should always be NULL.
+ * If the page must be instantiated for any reason: (1) for each entry on the page a WT_UPDATE
+ * is created; (2) the transaction information from page_del is copied to those WT_UPDATE
+ * structures (making them a match for the truncate operation), and (3) the WT_REF state
+ * switches to WT_REF_MEM.
*
- * These are the possible states:
+ * If the fast-truncate operation has not yet committed, an array of references to the WT_UPDATE
+ * structures is placed in modify->inst_updates. This is used to find the updates when the
+ * operation subsequently resolves. (The page can split, so there needs to be some way to find
+ * all of the update structures.)
*
- * 1. The WT_REF state is WT_REF_DELETED and ft_info.del is NULL. This means the page is deleted
- * and the deletion is globally visible. Any on-disk page has been or will be discarded.
+ * After instantiation, the page_del structure is kept until the instantiated page is next
+ * reconciled. This is because in some cases reconciliation of the parent internal page may need
+ * to write out a reference to the pre-instantiated on-disk page, at which point the page_del
+ * information is needed to build the correct reference.
*
- * 2. The WT_REF state is WT_REF_DELETED and ft_info.del is not NULL. The page is deleted, but
- * but the deletion may not yet be globally visible (or visible to any given reader either.) The
- * on-disk page remains in case we need it to satisfy reads. ft_info.del describes the delete
- * operation. If it is necessary to read the page on behalf of a thread that cannot see the
- * deletion, the page must be instantiated as described above.
+ * If the ref is in WT_REF_DELETED state, all actions besides checking whether page_del is NULL
+ * require that the WT_REF be locked. There are two reasons for this: first, the page might be
+ * instantiated at any time, and it is important to not see a partly-completed instantiation;
+ * and second, the page_del structure is discarded opportunistically if its transaction is found
+ * to be globally visible, so accessing it without locking the ref is unsafe.
*
- * 3. The WT_REF state is WT_REF_DISK, and the parent page's address cell is a deleted-address
- * cell. ft_info is not valid; ft_info.del should read as NULL. The page is on disk, and
- * deleted; the deletion may not yet be globally visible. Because the time aggregate stored in
- * the parent internal page includes the deletion time, tree walks will skip the page as
- * appropriate without needing the fast-delete information. This state can only happen in
- * readonly trees; it is a result of the page being read in and instantiated, but not marked
- * dirty, then discarded by eviction. (In principle eviction should set the state back to
- * WT_REF_DELETED in this case; however, this turns out to be awkward and we work around it
- * instead.) This state only arises in two places: when reading in the page, and in some cases
- * of skipping over the page; both cases already need to unpack the address cell, so we can use
- * it to retrieve the fast-delete information. Other than these considerations, this state is
- * indistinguishable from state 4.
+ * If the ref is in WT_REF_MEM state because it has been instantiated, the safety requirements
+ * are somewhat looser. Checking for an instantiated page by examining modify->instantiated does
+ * not require locking. Checking if modify->inst_updates is non-NULL (which means that the
+ * truncation isn't committed) also doesn't require locking. In general the page_del structure
+ * should not be used after instantiation; exceptions are (a) it is still updated by transaction
+ * prepare, commit, and rollback (so that it remains correct) and (b) it is used by internal
+ * page reconciliation if that occurs before the instantiated child is itself reconciled. (The
+ * latter can only happen if the child is evicted in a fairly narrow time window during a
+ * checkpoint.) This still requires locking the ref.
*
- * 4. The WT_REF state is WT_REF_DISK, and the parent page's address cell is not a
- * deleted-address cell. ft_info is not valid; ft_info.del should read as NULL. This is an
- * ordinary on-disk page.
+ * It is vital to consider all the possible cases when touching a deleted or instantiated page.
*
- * 5. The WT_REF state is WT_REF_MEM, and ft_info.update is NULL. This is an ordinary in-memory
- * page.
+ * There are two major groups of states:
+ *
+ * 1. The WT_REF state is WT_REF_DELETED. This means the page is deleted and not in memory.
+ * - If the page has no disk address, the ref is a placeholder in the key space and may in
+ * general be discarded at the next opportunity. (Some restrictions apply in VLCS.)
+ * - If the page has a disk address, page_del may be NULL. In this case, the deletion of the
+ * page is globally visible and the on-disk page can be discarded at the next opportunity.
+ * - If the page has a disk address and page_del is not NULL, page_del contains information
+ * about the transaction that deleted the page. It is necessary to lock the ref to read
+ * page_del; at that point (if the state hasn't changed while getting the lock)
+ * page_del->committed can be used to check if the transaction is committed or not.
*
- * 6. The WT_REF state is WT_REF_MEM, and ft_info.update is not NULL. This is a deleted page
- * that was instantiated when the delete transaction was not yet resolved. ft_info.update is the
- * list of updates created by the instantiation, which is used to commit or abort them as needed
- * and then cleared. It is not possible to get to this state if the truncate information was
- * read from disk; uncommitted (including prepared) truncates are not evicted or checkpointed.
+ * 2. The WT_REF state is WT_REF_MEM. The page is either an ordinary page or an instantiated
+ * deleted page.
+ * - If ref->page->modify is NULL, the page is ordinary.
+ * - If ref->page->modify->instantiated is false and ref->page->modify->inst_updates is NULL,
+ * the page is ordinary.
+ * - If ref->page->modify->instantiated is true, the page is instantiated and has not yet
+ * been reconciled. ref->page_del is either NULL (meaning the deletion is globally visible)
+ * or contains information about the transaction that deleted the page. This information is
+ * only meaningful either (a) in relation to the existing on-disk page rather than the in-
+ * memory page (this can be needed to reconcile the parent internal page) or (b) if the
+ * page is clean.
+ * - If ref->page->modify->inst_updates is not NULL, the page is instantiated and the
+ * transaction that deleted it has not resolved yet. The update list is used during commit
+ * or rollback to find the updates created during instantiation.
*
- * In both states 5 and 6, the page will have a modify structure to hold the instantiated
- * tombstones. If the tree is read-write, the page will be marked dirty. Until it is reconciled,
- * modify->instantiated will also be set to true, and modify->page_del will hold the page-delete
- * information used for the instantiation, if any. This is needed under some circumstances
- * for checkpointing internal pages.
+ * The last two points of group (2) are orthogonal; that is, after instantiation the
+ * instantiated flag and page_del structure (on the one hand) and the update list (on the other)
+ * are used and discarded independently. The former persists only until the page is first
+ * successfully reconciled; the latter persists until the transaction resolves. These events may
+ * occur in either order.
+ *
+ * As described above, in any state in group (1) an access to the page may require it be read
+ * into memory, at which point it moves into group (2). Instantiation always sets the
+ * instantiated flag to true; the updates list is only created if the transaction has not yet
+ * resolved at the point instantiation happens. (The ref is locked in both transaction
+ * resolution and instantiation to make sure these events happen in a well-defined order.)
+ *
+ * Because internal pages with uncommitted (including prepared) deletions are not written to
+ * disk, a page instantiated after its parent was read from disk will always have inst_updates
+ * set to NULL.
*/
- union {
- WT_PAGE_DELETED *del; /* Page not instantiated, page-deleted structure */
- WT_UPDATE **update; /* Page instantiated, update list for subsequent commit/abort */
- } ft_info;
+ WT_PAGE_DELETED *page_del; /* Page-delete information for a deleted page. */
#ifdef HAVE_REF_TRACK
/*
diff --git a/src/third_party/wiredtiger/src/include/btree_inline.h b/src/third_party/wiredtiger/src/include/btree_inline.h
index c5c70a5b622..3cf159da8c3 100644
--- a/src/third_party/wiredtiger/src/include/btree_inline.h
+++ b/src/third_party/wiredtiger/src/include/btree_inline.h
@@ -1582,55 +1582,72 @@ __wt_ref_block_free(WT_SESSION_IMPL *session, WT_REF *ref)
}
/*
- * __wt_page_del_visible --
- * Return if a truncate operation is visible to the caller.
+ * __wt_page_del_visible_all --
+ * Check if a truncate operation is visible to everyone and the data under it is obsolete.
*/
static inline bool
-__wt_page_del_visible(WT_SESSION_IMPL *session, WT_PAGE_DELETED *page_del, bool visible_all)
+__wt_page_del_visible_all(WT_SESSION_IMPL *session, WT_PAGE_DELETED *page_del, bool hide_prepared)
{
uint8_t prepare_state;
/*
- * In general usage, a NULL WT_PAGE_DELETED is a truncate operation whose details were discarded
- * when it became globally visible.
+ * Like other visible_all checks, use the durable timestamp to avoid complications: there is
+ * potentially a window where a prepared and committed transaction can be visible but not yet
+ * durable, and in that window the changes under it are not obsolete yet.
+ *
+ * The hide_prepared argument causes prepared but not committed transactions to be treated as
+ * invisible. (Apparently prepared and uncommitted transactions can be visible_all, but we need
+ * to not see them in some cases; for example, prepared deletions can't exist on disk because
+ * the on-disk format doesn't have space for the extra "I'm prepared" bit, so we avoid seeing
+ * them in reconciliation. Similarly, we can't skip over a page just because a transaction has
+ * deleted it and prepared; only committed transactions are suitable.)
+ *
+ * In all cases, the ref owning the page_deleted structure should be locked and its pre-lock
+ * state should be WT_REF_DELETED. This prevents the page from being instantiated while we look
+ * at it, and locks out other operations that might simultaneously discard the structure (either
+ * after checking visibility, or because its transaction aborted).
*/
+
+ /* If the page delete info is NULL, the deletion was previously found to be globally visible. */
if (page_del == NULL)
return (true);
/* We discard page_del on transaction abort, so should never see an aborted one. */
WT_ASSERT(session, page_del->txnid != WT_TXN_ABORTED);
- WT_ORDERED_READ(prepare_state, page_del->prepare_state);
- if (prepare_state == WT_PREPARE_INPROGRESS || prepare_state == WT_PREPARE_LOCKED)
- return (false);
+ if (hide_prepared) {
+ WT_ORDERED_READ(prepare_state, page_del->prepare_state);
+ if (prepare_state == WT_PREPARE_INPROGRESS || prepare_state == WT_PREPARE_LOCKED)
+ return (false);
+ }
- return (visible_all ?
- __wt_txn_visible_all(session, page_del->txnid, page_del->durable_timestamp) :
- __wt_txn_visible(session, page_del->txnid, page_del->timestamp));
+ return (__wt_txn_visible_all(session, page_del->txnid, page_del->durable_timestamp));
}
/*
- * __wt_page_del_active --
- * Return if a truncate operation is active.
+ * __wt_page_del_visible --
+ * Return if a truncate operation is visible to the caller. The same considerations apply as in
+ * the visible_all version.
*/
static inline bool
-__wt_page_del_active(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
+__wt_page_del_visible(WT_SESSION_IMPL *session, WT_PAGE_DELETED *page_del, bool hide_prepared)
{
- /*
- * Return if a truncate operation is active: "active" means approximately that the truncate is
- * still in progress, that is, that the underlying original page may still be required. This
- * function in practice is actually a visibility test (it returns whether the truncate is *not*
- * visible) and should be renamed and have its sense flipped to be more consistent with the rest
- * of the system.
- *
- * Our caller should have already locked the WT_REF and confirmed that the previous state was
- * WT_REF_DELETED. Consequently there are two possible cases: either ft_info.del is NULL (in
- * which case the deletion is globally visible and cannot be rolled back) or it is not, in which
- * case the information in ft_info.del gives us the visibility.
- */
- WT_ASSERT(session, ref->state == WT_REF_LOCKED);
+ uint8_t prepare_state;
- return (!__wt_page_del_visible(session, ref->ft_info.del, visible_all));
+ /* If the page delete info is NULL, the deletion was previously found to be globally visible. */
+ if (page_del == NULL)
+ return (true);
+
+ /* We discard page_del on transaction abort, so should never see an aborted one. */
+ WT_ASSERT(session, page_del->txnid != WT_TXN_ABORTED);
+
+ if (hide_prepared) {
+ WT_ORDERED_READ(prepare_state, page_del->prepare_state);
+ if (prepare_state == WT_PREPARE_INPROGRESS || prepare_state == WT_PREPARE_LOCKED)
+ return (false);
+ }
+
+ return (__wt_txn_visible(session, page_del->txnid, page_del->timestamp));
}
/*
@@ -1638,7 +1655,10 @@ __wt_page_del_active(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
* Return if a truncate operation is resolved. (Since truncations that abort are removed
* immediately, "resolved" and "committed" are equivalent here.) The caller should have already
* locked the ref and confirmed that the ref's previous state was WT_REF_DELETED. The page_del
- * argument should be the ref's ft_info.del member.
+ * argument should be the ref's page_del member. This function should only be used for pages in
+ * WT_REF_DELETED state. For deleted pages that have been instantiated in memory, the update
+ * list in the page modify structure should be checked instead, as the page_del structure might
+ * have been discarded already. (The update list is non-null if the transaction is unresolved.)
*/
static inline bool
__wt_page_del_committed(WT_PAGE_DELETED *page_del)
@@ -1834,13 +1854,18 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp)
/*
* Check the fast-truncate information. Pages with an uncommitted truncate cannot be evicted.
*
- * Because the page is in memory, we look at ft_info.update. If it's not NULL, that means the
+ * Because the page is in memory, we look at mod.inst_updates. If it's not NULL, that means the
* truncate operation isn't committed.
*
- * The list of updates in ft_info.update will be discarded when the transaction they belong to
+ * The list of updates in mod.inst_updates will be discarded when the transaction they belong to
* is resolved.
+ *
+ * Note that we are not using __wt_page_del_committed here because (a) examining the page_del
+ * structure requires locking the ref, and (b) once in memory the page_del structure only
+ * remains until the next reconciliation, and nothing prevents that from occurring before the
+ * transaction commits.
*/
- if (ref->ft_info.update != NULL)
+ if (mod->inst_updates != NULL)
return (false);
/*
@@ -2236,21 +2261,21 @@ __wt_btcur_skip_page(
WT_REF_LOCK(session, ref, &previous_state);
/*
- * Check the fast-truncate information, there are 4 cases:
+ * Check the fast-truncate information; there are 3 cases:
*
- * (1) The page is in the WT_REF_DELETED state and ft_info.del is NULL. The page is deleted.
- * (2) The page is in the WT_REF_DELETED state and ft_info.del is not NULL. The page is deleted
- * if the truncate operation is visible. Look at ft_info.del; we could use the info from the
+ * (1) The page is in the WT_REF_DELETED state and page_del is NULL. The page is deleted. This
+ * case is folded into the next because __wt_page_del_visible handles it.
+ * (2) The page is in the WT_REF_DELETED state and page_del is not NULL. The page is deleted
+ * if the truncate operation is visible. Look at page_del; we could use the info from the
* address cell below too, but that's slower.
- * (3) The page is in the WT_REF_DISK state. The page may be deleted; check the delete info from
- * the address cell.
- * (4) The page is in memory and has been instantiated. The delete info from the address cell
- * will serve for readonly/unmodified pages, and for modified pages we can't skip the page
- * anyway.
- */
- if (previous_state == WT_REF_DELETED &&
- (ref->ft_info.del == NULL ||
- __wt_txn_visible(session, ref->ft_info.del->txnid, ref->ft_info.del->timestamp))) {
+ * (3) The page is in memory and has been instantiated. The delete info from the address cell
+ * will serve for readonly/unmodified pages, and for modified pages we can't skip the page.
+ * (This case is checked further below.)
+ *
+ * In all cases, make use of the option to __wt_page_del_visible to hide prepared transactions,
+ * as we shouldn't skip pages where the deletion is prepared but not committed.
+ */
+ if (previous_state == WT_REF_DELETED && __wt_page_del_visible(session, ref->page_del, true)) {
*skipp = true;
goto unlock;
}
@@ -2264,7 +2289,7 @@ __wt_btcur_skip_page(
(previous_state == WT_REF_MEM && !__wt_page_is_modified(ref->page))) &&
__wt_ref_addr_copy(session, ref, &addr)) {
/* If there's delete information in the disk address, we can use it. */
- if (addr.del_set && __wt_txn_visible(session, addr.del.txnid, addr.del.timestamp)) {
+ if (addr.del_set && __wt_page_del_visible(session, &addr.del, true)) {
*skipp = true;
goto unlock;
}
diff --git a/src/third_party/wiredtiger/src/include/cell.h b/src/third_party/wiredtiger/src/include/cell.h
index 351c7ae6464..0b504d1ed59 100644
--- a/src/third_party/wiredtiger/src/include/cell.h
+++ b/src/third_party/wiredtiger/src/include/cell.h
@@ -24,6 +24,9 @@
* Deleted cells are place-holders for column-store files, where entries cannot
* be removed in order to preserve the record count.
*
+ * Note that deleted value cells (WT_CELL_DEL) are different from deleted-address
+ * cells (WT_CELL_ADDR_DEL).
+ *
* Here's the cell use by page type:
*
* WT_PAGE_ROW_INT (row-store internal page):
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 1ae1eb3042d..470e83000c2 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -661,8 +661,8 @@ extern int __wt_decrypt(WT_SESSION_IMPL *session, WT_ENCRYPTOR *encryptor, size_
WT_ITEM *out) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref,
- WT_PAGE_DELETED *page_del) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_delete_redo_window_cleanup(WT_SESSION_IMPL *session)
@@ -1986,12 +1986,12 @@ static inline bool __wt_op_timer_fired(WT_SESSION_IMPL *session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-static inline bool __wt_page_del_active(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
- WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_page_del_committed(WT_PAGE_DELETED *page_del)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_page_del_visible(WT_SESSION_IMPL *session, WT_PAGE_DELETED *page_del,
- bool visible_all) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+ bool hide_prepared) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+static inline bool __wt_page_del_visible_all(WT_SESSION_IMPL *session, WT_PAGE_DELETED *page_del,
+ bool hide_prepared) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_page_evict_clean(WT_PAGE *page)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_page_evict_retry(WT_SESSION_IMPL *session, WT_PAGE *page)
diff --git a/src/third_party/wiredtiger/src/include/txn_inline.h b/src/third_party/wiredtiger/src/include/txn_inline.h
index 519fb4b9dbf..7c94909e721 100644
--- a/src/third_party/wiredtiger/src/include/txn_inline.h
+++ b/src/third_party/wiredtiger/src/include/txn_inline.h
@@ -231,22 +231,25 @@ __wt_txn_op_delete_apply_prepare_state(WT_SESSION_IMPL *session, WT_REF *ref, bo
/*
* Timestamps and prepare state are in the page deleted structure for truncates, or in the
- * updates in the case of instantiated pages. In the case of instantiated pages we may also need
- * to update the page deleted structure saved in page->modify.
+ * updates list in the case of instantiated pages. We also need to update any page deleted
+ * structure in the ref.
*
- * Only two cases are possible. First: the state is WT_REF_DELETED. In this case ft_info.del
- * cannot be NULL yet because an uncommitted operation cannot have reached global visibility.
- * Otherwise: there is an uncommitted delete operation we're handling, so the page can't be in a
- * non-deleted state, and the tree can't be readonly. Therefore the page must have been
+ * Only two cases are possible. First: the state is WT_REF_DELETED. In this case page_del cannot
+ * be NULL yet because an uncommitted operation cannot have reached global visibility. (Or at
+ * least, global visibility in the sense we need to use it for truncations, in which prepared
+ * and uncommitted transactions are not visible.)
+ *
+ * Otherwise: there is an uncommitted delete operation we're handling, so the page must have
+ * been deleted at some point, and the tree can't be readonly. Therefore the page must have been
* instantiated, the state must be WT_REF_MEM, and there should be an update list in
- * ft_info.update. (But just in case, allow the update list to be null. Perhaps the page was
- * truncated when all items on it were already deleted, so no tombstones were created during
- * instantiation.)
+ * mod->inst_updates. (But just in case, allow the update list to be null.) There might be a
+ * non-null page_del structure to update, depending on whether the page has been reconciled
+ * since it was deleted and then instantiated.
*/
- if (previous_state == WT_REF_DELETED)
- page_del = ref->ft_info.del;
- else {
- if ((updp = ref->ft_info.update) != NULL)
+ if (previous_state != WT_REF_DELETED) {
+ WT_ASSERT(session, previous_state == WT_REF_MEM);
+ WT_ASSERT(session, ref->page != NULL && ref->page->modify != NULL);
+ if ((updp = ref->page->modify->inst_updates) != NULL)
for (; *updp != NULL; ++updp) {
(*updp)->start_ts = ts;
/*
@@ -257,9 +260,8 @@ __wt_txn_op_delete_apply_prepare_state(WT_SESSION_IMPL *session, WT_REF *ref, bo
if (commit)
(*updp)->durable_ts = txn->durable_timestamp;
}
- WT_ASSERT(session, ref->page != NULL && ref->page->modify != NULL);
- page_del = ref->page->modify->page_del;
}
+ page_del = ref->page_del;
if (page_del != NULL) {
page_del->timestamp = ts;
if (commit)
@@ -289,28 +291,31 @@ __wt_txn_op_delete_commit_apply_timestamps(WT_SESSION_IMPL *session, WT_REF *ref
/*
* Timestamps are in the page deleted structure for truncates, or in the updates in the case of
- * instantiated pages. Both commit and durable timestamps need to be updated.
+ * instantiated pages. We also need to update any page deleted structure in the ref. Both commit
+ * and durable timestamps need to be updated.
*
- * Only two cases are possible. First: the state is WT_REF_DELETED. In this case ft_info.del
- * cannot be NULL yet because an uncommitted operation cannot have reached global visibility.
- * Otherwise: there is an uncommitted delete operation we're handling, so the page can't be in a
- * non-deleted state, and the tree can't be readonly. Therefore the page must have been
+ * Only two cases are possible. First: the state is WT_REF_DELETED. In this case page_del cannot
+ * be NULL yet because an uncommitted operation cannot have reached global visibility. (Or at
+ * least, global visibility in the sense we need to use it for truncations, in which prepared
+ * and uncommitted transactions are not visible.)
+ *
+ * Otherwise: there is an uncommitted delete operation we're handling, so the page must have
+ * been deleted at some point, and the tree can't be readonly. Therefore the page must have been
* instantiated, the state must be WT_REF_MEM, and there should be an update list in
- * ft_info.update. (But just in case, allow the update list to be null. Perhaps the page was
- * truncated when all items on it were already deleted, so no tombstones were created during
- * instantiation.)
+ * mod->inst_updates. (But just in case, allow the update list to be null.) There might be a
+ * non-null page_del structure to update, depending on whether the page has been reconciled
+ * since it was deleted and then instantiated.
*/
- if (previous_state == WT_REF_DELETED)
- page_del = ref->ft_info.del;
- else {
- if ((updp = ref->ft_info.update) != NULL)
+ if (previous_state != WT_REF_DELETED) {
+ WT_ASSERT(session, previous_state == WT_REF_MEM);
+ WT_ASSERT(session, ref->page != NULL && ref->page->modify != NULL);
+ if ((updp = ref->page->modify->inst_updates) != NULL)
for (; *updp != NULL; ++updp) {
(*updp)->start_ts = txn->commit_timestamp;
(*updp)->durable_ts = txn->durable_timestamp;
}
- WT_ASSERT(session, ref->page != NULL && ref->page->modify != NULL);
- page_del = ref->page->modify->page_del;
}
+ page_del = ref->page_del;
if (page_del != NULL && page_del->timestamp == WT_TS_NONE) {
page_del->timestamp = txn->commit_timestamp;
page_del->durable_timestamp = txn->durable_timestamp;
@@ -437,7 +442,7 @@ __wt_txn_modify_page_delete(WT_SESSION_IMPL *session, WT_REF *ref)
* This access to the WT_PAGE_DELETED structure is safe; caller has the WT_REF locked, and in
* fact just allocated the structure to fill in.
*/
- ref->ft_info.del->txnid = txn->id;
+ ref->page_del->txnid = txn->id;
__wt_txn_op_set_timestamp(session, op);
if (__wt_log_op(session))
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_child.c b/src/third_party/wiredtiger/src/reconcile/rec_child.c
index 03b07757377..229413a6d48 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_child.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_child.c
@@ -13,10 +13,14 @@
* Handle pages with leaf pages in the WT_REF_DELETED state.
*/
static int
-__rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref,
- WT_PAGE_DELETED *page_del, WT_CHILD_MODIFY_STATE *cmsp)
+__rec_child_deleted(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, WT_CHILD_MODIFY_STATE *cmsp)
{
+ WT_PAGE_DELETED *page_del;
uint8_t prepare_state;
+ bool visible, visible_all;
+
+ page_del = ref->page_del;
cmsp->state = WT_CHILD_IGNORE;
@@ -28,6 +32,17 @@ __rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref,
return (__wt_ref_block_free(session, ref));
/*
+ * Check visibility. If the truncation is visible to us, we'll also want to know if it's visible
+ * to everyone. Use the special-case logic in __wt_page_del_visible to hide prepared truncations
+ * as we can't write them to disk.
+ */
+ if (F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT)) {
+ visible = __wt_page_del_visible(session, page_del, true);
+ visible_all = visible ? __wt_page_del_visible_all(session, page_del, true) : false;
+ } else
+ visible = visible_all = __wt_page_del_visible_all(session, page_del, true);
+
+ /*
* The truncate may not yet be visible to us. In that case, we proceed as with any change not
* visible during reconciliation by ignoring the change for the purposes of writing the internal
* page.
@@ -39,7 +54,7 @@ __rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref,
* been written to disk yet; if the page gets marked clean it might be discarded and then the
* truncation is lost.
*/
- if (!__wt_page_del_visible(session, page_del, !F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT))) {
+ if (!visible) {
if (F_ISSET(r, WT_REC_VISIBILITY_ERR))
WT_RET_PANIC(session, EINVAL, "reconciliation illegally skipped an update");
/*
@@ -89,20 +104,22 @@ __rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref,
}
/*
- * Deal with underlying disk blocks. If there are readers that might want to see the page's
- * state before it's deleted, or the fast-delete can be undone by RTS, we can't discard the
- * pages. Write a cell to the internal page with information describing the fast-delete.
+ * If there are readers that might want to see the page's state before it's deleted, or the
+ * fast-delete can be undone by RTS, we can't discard the pages. Write a cell to the internal
+ * page with information describing the fast-delete.
*
* We have the WT_REF locked, but that lock is released before returning to the function writing
* cells to the page. Copy out the current fast-truncate information for that function.
*/
- if (!__wt_page_del_visible(session, page_del, true)) {
+ if (!visible_all) {
cmsp->del = *page_del;
cmsp->state = WT_CHILD_PROXY;
return (0);
}
/*
+ * Deal with underlying disk blocks.
+ *
* Globally visible truncate, discard the leaf page to the block manager and no cell needs to be
* written. Done outside of the underlying tracking routines because this action is permanent
* and irrevocable. (Clearing the address means we've lost track of the disk address in a
@@ -112,13 +129,8 @@ __rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref,
*/
WT_RET(__wt_ref_block_free(session, ref));
- /*
- * Globally visible fast-truncate information is never used again, a NULL value is identical.
- * Fast-truncate information in the page-modify structure can be used more than once if this
- * reconciliation of the internal page were to fail.
- */
- if (page_del == ref->ft_info.del)
- __wt_overwrite_and_free(session, ref->ft_info.del);
+ /* Globally visible fast-truncate information is never used again, a NULL value is identical. */
+ __wt_overwrite_and_free(session, ref->page_del);
return (0);
}
@@ -157,7 +169,7 @@ __wt_rec_child_modify(
// 9417 IGNORE
WT_ASSERT(session, ref->addr != NULL);
/* DISK pages do not have fast-truncate info. */
- WT_ASSERT(session, ref->ft_info.del == NULL);
+ WT_ASSERT(session, ref->page_del == NULL);
goto done;
case WT_REF_DELETED:
@@ -170,7 +182,7 @@ __wt_rec_child_modify(
*/
if (!WT_REF_CAS_STATE(session, ref, WT_REF_DELETED, WT_REF_LOCKED))
break;
- ret = __rec_child_deleted(session, r, ref, ref->ft_info.del, cmsp);
+ ret = __rec_child_deleted(session, r, ref, cmsp);
WT_REF_SET_STATE(ref, WT_REF_DELETED);
goto done;
@@ -217,15 +229,20 @@ __wt_rec_child_modify(
* Set WT_READ_NO_WAIT because we're only interested in the WT_REF's final state. Pages
* in transition might change WT_REF state during our read, and then return WT_NOTFOUND
* to us. In that case, loop and look again.
+ *
+ * If we retried from below this point and already hold a hazard pointer on the page,
+ * don't read the page in (and acquire a hazard pointer) a second time.
*/
- ret = __wt_page_in(
- session, ref, WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT);
- if (ret == WT_NOTFOUND) {
- ret = 0;
- break;
+ if (cmsp->hazard == false) {
+ ret = __wt_page_in(session, ref,
+ WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT);
+ if (ret == WT_NOTFOUND) {
+ ret = 0;
+ break;
+ }
+ WT_RET(ret);
+ cmsp->hazard = true;
}
- WT_RET(ret);
- cmsp->hazard = true;
/*
* The child is potentially modified if the page's modify structure has been created. If
@@ -252,17 +269,32 @@ __wt_rec_child_modify(
* Depending on visibility, we may need to write the original page, or write a proxy
* (deleted-address) cell with the pre-instantiation page-delete information, or we may
* be able to ignore the page entirely. We keep the original fast-truncate information
- * in the modify structure after instantiation to make the visibility check possible.
+ * in the ref after instantiation to make the visibility check possible.
*
* The key is the page-modify.instantiated flag, removed during page reconciliation. If
* it's set, instantiation happened after checkpoint passed the leaf page and we treat
* this page like a WT_REF_DELETED page, evaluating it as it was before instantiation.
*
- * We do not need additional locking: with a hazard pointer the page can't be evicted,
- * and reconciliation is the only thing that can clear the page-modify info.
+ * We need to lock the ref for it to be safe to examine the page_del structure, in case
+ * the transaction in it is unresolved and tries to roll back (which discards the
+ * structure) while we're looking at it. It should be possible to skip the locking if
+ * the instantiation update list is NULL (that means the transaction is resolved) but
+ * for now let's do the conservatively safe thing.
*/
if (mod != NULL && mod->instantiated) {
- WT_RET(__rec_child_deleted(session, r, ref, mod->page_del, cmsp));
+ if (!WT_REF_CAS_STATE(session, ref, WT_REF_MEM, WT_REF_LOCKED))
+ /* Oops. Retry... */
+ break;
+
+ /* This is a very small race window, but check just in case. */
+ if (mod->instantiated == false) {
+ WT_REF_SET_STATE(ref, WT_REF_MEM);
+ /* Retry from the top; we may now have a rec_result. */
+ break;
+ }
+
+ WT_RET(__rec_child_deleted(session, r, ref, cmsp));
+ WT_REF_SET_STATE(ref, WT_REF_MEM);
goto done;
}
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_col.c b/src/third_party/wiredtiger/src/reconcile/rec_col.c
index bd5b3d12f62..34f6615bc05 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_col.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_col.c
@@ -1672,8 +1672,11 @@ next:
* - There were invisible updates, because then the page isn't really empty. Also, at least
* for now if we try to restore updates to an empty page col_modify will trip on its
* shoelaces.
+ * - We wrote no cells at all. This can happen if a page with no cells and no append list
+ * entries at all (not just one with no or only aborted updates) gets marked dirty somehow
+ * and reconciled; this is apparently possible in some circumstances.
*/
- if (!wrote_real_values && salvage == NULL && r->leave_dirty == false) {
+ if (!wrote_real_values && salvage == NULL && r->leave_dirty == false && r->entries > 0) {
WT_ASSERT(session, r->entries == 1);
r->entries = 0;
WT_STAT_CONN_DATA_INCR(session, rec_vlcs_emptied_pages);
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 1e9cfba872f..2c26d83adf7 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -2397,12 +2397,14 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
WT_REF *ref;
WT_TIME_AGGREGATE ta;
uint32_t i;
+ uint8_t previous_ref_state;
btree = S2BT(session);
bm = btree->bm;
mod = page->modify;
ref = r->ref;
WT_TIME_AGGREGATE_INIT(&ta);
+ previous_ref_state = 0;
/*
* If using the history store table eviction path and we found updates that weren't globally
@@ -2558,10 +2560,42 @@ split:
break;
}
- /* If the page has post-instantiation delete information, we don't need it any more. */
+ /*
+ * If the page has post-instantiation delete information, we don't need it any more. There are
+ * two other pieces of code this can interact with: transaction rollback and parent internal
+ * page reconciliation. The ref is locked while page_del is discarded (see below): eviction
+ * already holds the ref locked, and otherwise we lock it ourselves, so the discard cannot race
+ * with other code that examines page_del under the ref lock. Reconciliation of the parent is
+ * additionally locked out for the following reasons: first, if we are evicting the leaf here,
+ * eviction has the ref locked, and the parent will wait for it; and if we are checkpointing
+ * the leaf, we can't simultaneously be checkpointing the parent, and we can't be evicting the
+ * parent either because internal pages can't be evicted while they have in-memory children.
+ */
if (mod->instantiated) {
- mod->instantiated = false;
- __wt_free(session, mod->page_del);
+ /*
+ * Unfortunately, it seems we need to lock the ref at this point. Ultimately the page_del
+ * structure and the instantiated flag need to both be cleared simultaneously (otherwise
+ * instantiated == false and page_del not NULL violates the intended invariant and other
+ * code can assert) and there are several other places that can still be interacting with
+ * the page_del structure at this point (even though the page has been instantiated) and we
+ * need to wait for those to finish before discarding it.
+ *
+ * Note: if we're in eviction, the ref is already locked.
+ */
+ if (!F_ISSET(r, WT_REC_EVICT)) {
+ WT_REF_LOCK(session, ref, &previous_ref_state);
+ WT_ASSERT(session, previous_ref_state == WT_REF_MEM);
+ } else
+ WT_ASSERT(session, ref->state == WT_REF_LOCKED);
+
+ /* Check the instantiated flag again in case it got cleared while we waited. */
+ if (mod->instantiated) {
+ mod->instantiated = false;
+ __wt_free(session, ref->page_del);
+ }
+
+ if (!F_ISSET(r, WT_REC_EVICT))
+ WT_REF_UNLOCK(ref, previous_ref_state);
}
return (0);
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index e91b435beb2..cfdf7c522ff 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -1721,16 +1721,19 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
/*
* Only two cases are possible. First: the state is WT_REF_DELETED. In this case
- * ft_info.del cannot be NULL yet because an uncommitted operation cannot have reached
+ * page_del cannot be NULL yet because an uncommitted operation cannot have reached
* global visibility. Otherwise: there is an uncommitted delete operation we're
* handling, so the page can't be in a non-deleted state, and the tree can't be
* readonly. Therefore the page must have been instantiated, the state must be
- * WT_REF_MEM, and there should be an update list in ft_info.update.
+ * WT_REF_MEM, and there should be an update list in modify->inst_updates. There may
+ * also be a non-NULL page_del to update.
*/
- if (previous_state == WT_REF_DELETED)
- op->u.ref->ft_info.del->committed = true;
- else
- __wt_free(session, op->u.ref->ft_info.update);
+ if (previous_state != WT_REF_DELETED) {
+ WT_ASSERT(session, op->u.ref->page != NULL && op->u.ref->page->modify != NULL);
+ __wt_free(session, op->u.ref->page->modify->inst_updates);
+ }
+ if (op->u.ref->page_del != NULL)
+ op->u.ref->page_del->committed = true;
WT_REF_UNLOCK(op->u.ref, previous_state);
}
__wt_txn_op_free(session, op);
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index 9d8fe657a15..0a17f020b05 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -1131,8 +1131,11 @@ __rollback_page_needs_abort(
* is greater than or equal to recovered checkpoint snapshot min:
* 1. The reconciled replace page max durable timestamp.
* 2. The reconciled multi page max durable timestamp.
- * 3. The on page address max durable timestamp.
- * 4. The off page address max durable timestamp.
+ * 3. For just-instantiated deleted pages that have not otherwise been modified, the durable
+ * timestamp in the page delete information. This timestamp isn't reflected in the address's
+ * time aggregate.
+ * 4. The on page address max durable timestamp.
+ * 5. The off page address max durable timestamp.
*/
if (mod != NULL && mod->rec_result == WT_PM_REC_REPLACE) {
tag = "reconciled replace block";
@@ -1149,6 +1152,15 @@ __rollback_page_needs_abort(
prepared = true;
}
result = (durable_ts > rollback_timestamp) || prepared;
+ } else if (mod != NULL && mod->instantiated && !__wt_page_is_modified(ref->page) &&
+ ref->page_del != NULL) {
+ tag = "page_del info";
+ durable_ts = ref->page_del->durable_timestamp;
+ prepared = ref->page_del->prepare_state == WT_PREPARE_INPROGRESS ||
+ ref->page_del->prepare_state == WT_PREPARE_LOCKED;
+ newest_txn = ref->page_del->txnid;
+ result = (durable_ts > rollback_timestamp) || prepared ||
+ WT_CHECK_RECOVERY_FLAG_TXNID(session, newest_txn);
} else if (!__wt_off_page(ref->home, addr)) {
tag = "on page cell";
/* Check if the page is obsolete using the page disk address. */
@@ -1252,11 +1264,20 @@ __rollback_to_stable_page_skip(
*/
if (ref->state == WT_REF_DELETED &&
WT_REF_CAS_STATE(session, ref, WT_REF_DELETED, WT_REF_LOCKED)) {
- page_del = ref->ft_info.del;
+ page_del = ref->page_del;
if (page_del == NULL ||
(__rollback_txn_visible_id(session, page_del->txnid) &&
- page_del->durable_timestamp <= rollback_timestamp))
+ page_del->durable_timestamp <= rollback_timestamp)) {
+ /*
+ * We should never see a prepared truncate here; not at recovery time because prepared
+ * truncates can't be written to disk, and not during a runtime RTS either because it
+ * should not be possible to do that with an unresolved prepared transaction.
+ */
+ WT_ASSERT(session,
+ page_del == NULL || page_del->prepare_state == WT_PREPARE_INIT ||
+ page_del->prepare_state == WT_PREPARE_RESOLVED);
*skipp = true;
+ }
WT_REF_SET_STATE(ref, WT_REF_DELETED);
return (0);
}
diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint06.py b/src/third_party/wiredtiger/test/suite/test_checkpoint06.py
index 5689af7f9af..dcbae5386b7 100644
--- a/src/third_party/wiredtiger/test/suite/test_checkpoint06.py
+++ b/src/third_party/wiredtiger/test/suite/test_checkpoint06.py
@@ -78,8 +78,6 @@ class test_checkpoint06(wttest.WiredTigerTestCase):
self.session.begin_transaction()
start = self.session.open_cursor(self.uri)
start.set_key(5)
- end = self.session.open_cursor(self.uri)
- end.set_key(9995)
self.session.truncate(None, start, None, None)
if self.prepare:
self.session.prepare_transaction('prepare_timestamp=' + self.timestamp_str(3))
@@ -95,7 +93,7 @@ class test_checkpoint06(wttest.WiredTigerTestCase):
',stable_timestamp=' + self.timestamp_str(4))
cursor = self.session.open_cursor(self.uri_evict)
- # Insert some more data to trigger eviction
+ # Insert some more data into another table to trigger eviction
for i in range(1, nrows + 1):
self.session.begin_transaction()
cursor[i] = value
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable34.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable34.py
index e72b85f5b52..aae9047a7f8 100644
--- a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable34.py
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable34.py
@@ -32,9 +32,9 @@ from wiredtiger import stat, WT_NOTFOUND
from wtdataset import SimpleDataSet
from wtscenario import make_scenarios
-# test_rollback_to_stable33.py
+# test_rollback_to_stable34.py
# Test interaction between fast-delete and RTS.
-class test_rollback_to_stable33(test_rollback_to_stable_base):
+class test_rollback_to_stable34(test_rollback_to_stable_base):
session_config = 'isolation=snapshot'
conn_config = 'cache_size=50MB,statistics=(all),log=(enabled=false)'
@@ -104,7 +104,7 @@ class test_rollback_to_stable33(test_rollback_to_stable_base):
nrows = 10000
# Create a table without logging.
- uri = "table:rollback_to_stable33"
+ uri = "table:rollback_to_stable34"
ds = SimpleDataSet(
self, uri, 0, key_format=self.key_format, value_format=self.value_format,
config='log=(enabled=false)' + self.extraconfig)
diff --git a/src/third_party/wiredtiger/test/suite/test_truncate16.py b/src/third_party/wiredtiger/test/suite/test_truncate16.py
new file mode 100644
index 00000000000..96f94b79b1c
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_truncate16.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wttest
+from helper import simulate_crash_restart
+from wiredtiger import stat, WiredTigerError, wiredtiger_strerror, WT_NOTFOUND, WT_ROLLBACK
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+# test_truncate16.py
+#
+# Make sure that no shenanigans occur if we try to read from a page that's been
+# fast-truncated by a prepared transaction.
+
+class test_truncate16(wttest.WiredTigerTestCase):
+ conn_config = 'statistics=(all)'
+ session_config = 'isolation=snapshot'
+
+ # Hook to run using remove instead of truncate for reference. This should not alter the
+ # behavior... but may if things are broken. Disable the reference version by default as it's
+ # only useful when investigating behavior changes. This list is first in the make_scenarios
+ # call so the additional cases don't change the scenario numbering.
+ trunc_values = [
+ ('truncate', dict(trunc_with_remove=False)),
+ #('remove', dict(trunc_with_remove=True)),
+ ]
+ format_values = [
+ ('column', dict(key_format='r', value_format='S', extraconfig='')),
+ ('column_fix', dict(key_format='r', value_format='8t',
+ extraconfig=',allocation_size=512,leaf_page_max=512')),
+ ('integer_row', dict(key_format='i', value_format='S', extraconfig='')),
+ ]
+ checkpoint_values = [
+ ('no_checkpoint', dict(do_checkpoint=False)),
+ ('checkpoint', dict(do_checkpoint=True)),
+ ]
+ scenarios = make_scenarios(trunc_values, format_values, checkpoint_values)
+
+ def truncate(self, session, uri, make_key, keynum1, keynum2):
+ if self.trunc_with_remove:
+ cursor = session.open_cursor(uri)
+ err = 0
+ for k in range(keynum1, keynum2 + 1):
+ cursor.set_key(k)
+ try:
+ err = cursor.remove()
+ except WiredTigerError as e:
+ if wiredtiger_strerror(WT_ROLLBACK) in str(e):
+ err = WT_ROLLBACK
+ else:
+ raise e
+ if err != 0:
+ break
+ cursor.close()
+ else:
+ lo_cursor = session.open_cursor(uri)
+ hi_cursor = session.open_cursor(uri)
+ lo_cursor.set_key(make_key(keynum1))
+ hi_cursor.set_key(make_key(keynum2))
+ try:
+ err = session.truncate(None, lo_cursor, hi_cursor, None)
+ except WiredTigerError as e:
+ if wiredtiger_strerror(WT_ROLLBACK) in str(e):
+ err = WT_ROLLBACK
+ else:
+ raise e
+ lo_cursor.close()
+ hi_cursor.close()
+ return err
+
+ def test_truncate16(self):
+ nrows = 10000
+
+ # Create a table.
+ uri = "table:truncate16"
+ ds = SimpleDataSet(
+ self, uri, 0, key_format=self.key_format, value_format=self.value_format,
+ config=self.extraconfig)
+ ds.populate()
+
+ if self.value_format == '8t':
+ value_a = 97
+ value_b = 98
+ else:
+ value_a = "aaaaa" * 100
+ value_b = "bbbbb" * 100
+
+ # Pin oldest and stable timestamps to 1.
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(1) +
+ ',stable_timestamp=' + self.timestamp_str(1))
+
+ # Write some baseline data at time 10.
+ cursor = self.session.open_cursor(ds.uri)
+ self.session.begin_transaction()
+ for i in range(1, nrows + 1):
+ cursor[ds.key(i)] = value_a
+ if i % 487 == 0:
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(10))
+ self.session.begin_transaction()
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(10))
+ cursor.close()
+
+ # Mark it stable.
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(10))
+
+ # Reopen the connection so nothing is in memory and we can fast-truncate.
+ self.reopen_conn()
+
+ # Make a session to prepare in.
+ session2 = self.conn.open_session()
+
+ # Truncate the middle of the table.
+ #
+ # Prepare the truncate at time 20 and leave it hanging.
+ session2.begin_transaction()
+ err = self.truncate(session2, ds.uri, ds.key, nrows // 4 + 1, 3 * nrows // 4)
+ self.assertEqual(err, 0)
+ session2.prepare_transaction('prepare_timestamp=' + self.timestamp_str(20))
+
+ # Make sure we did at least one fast-delete. (Unless we specifically didn't want to,
+ # or running on FLCS where it isn't supported.)
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ fastdelete_pages = stat_cursor[stat.conn.rec_page_delete_fast][2]
+ if self.value_format == '8t' or self.trunc_with_remove:
+ self.assertEqual(fastdelete_pages, 0)
+ else:
+ self.assertGreater(fastdelete_pages, 0)
+ stat_cursor.close()
+
+ # Optionally checkpoint at this stage, just in case it breaks or trips on
+ # the prepared truncation.
+ if self.do_checkpoint:
+ self.session.checkpoint()
+
+ # Now read from the truncated region. This should give WT_PREPARE_CONFLICT.
+ cursor = self.session.open_cursor(ds.uri)
+ self.session.begin_transaction('read_timestamp=' + self.timestamp_str(30))
+ cursor.set_key(nrows // 2)
+ self.assertRaisesException(WiredTigerError,
+ lambda: cursor.search(),
+ exceptionString='/conflict with a prepared update/')
+
+ # It should have instantiated the page under the key we read, and nothing else.
+ # (But not if we weren't fast-deleting.)
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ read_deleted = stat_cursor[stat.conn.cache_read_deleted][2]
+ if self.value_format == '8t' or self.trunc_with_remove:
+ self.assertEqual(read_deleted, 0)
+ else:
+ self.assertEqual(read_deleted, 1)
+ stat_cursor.close()
+
+ # Now toss the prepared transaction, and just for kicks make sure we can read the whole
+ # table.
+ session2.rollback_transaction()
+
+ for i in range(1, nrows + 1):
+ cursor.next()
+ self.assertEqual(cursor.get_key(), ds.key(i))
+ self.assertEqual(cursor.get_value(), value_a)
+ self.session.rollback_transaction()
+
+ # Unlike RTS, transaction rollback should not instantiate pages, so the number of
+ # instantiated pages should remain 1.
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ read_deleted_still = stat_cursor[stat.conn.cache_read_deleted][2]
+ self.assertEqual(read_deleted_still, read_deleted)
+ stat_cursor.close()
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_truncate17.py b/src/third_party/wiredtiger/test/suite/test_truncate17.py
new file mode 100644
index 00000000000..a3b4f7d1df6
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_truncate17.py
@@ -0,0 +1,212 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wttest
+from helper import simulate_crash_restart
+from wiredtiger import stat, WiredTigerError, wiredtiger_strerror, WT_NOTFOUND, WT_ROLLBACK
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+# test_truncate17.py
+#
+# Make sure that no shenanigans occur if we try to stat a tree containing pages
+# that have been fast-truncated by a prepared transaction.
+
+class test_truncate17(wttest.WiredTigerTestCase):
+ conn_config = 'statistics=(all)'
+ session_config = 'isolation=snapshot'
+
+ # Hook to run using remove instead of truncate for reference. This should not alter the
+ # behavior... but may if things are broken. Disable the reference version by default as it's
+ # only useful when investigating behavior changes. This list is first in the make_scenarios
+ # call so the additional cases don't change the scenario numbering.
+ trunc_values = [
+ ('truncate', dict(trunc_with_remove=False)),
+ #('remove', dict(trunc_with_remove=True)),
+ ]
+ format_values = [
+ ('column', dict(key_format='r', value_format='S', extraconfig='')),
+ ('column_fix', dict(key_format='r', value_format='8t',
+ extraconfig=',allocation_size=512,leaf_page_max=512')),
+ ('integer_row', dict(key_format='i', value_format='S', extraconfig='')),
+ ]
+ checkpoint_values = [
+ ('no_checkpoint', dict(do_checkpoint=False)),
+ ('checkpoint', dict(do_checkpoint=True)),
+ ]
+ scenarios = make_scenarios(trunc_values, format_values, checkpoint_values)
+
+ def stat_tree(self, uri):
+ statscursor = self.session.open_cursor('statistics:' + uri, None, 'statistics=(all)')
+
+ entries = statscursor[stat.dsrc.btree_entries][2]
+ if self.value_format == '8t':
+ leaf_pages = statscursor[stat.dsrc.btree_column_fix][2]
+ internal_pages = statscursor[stat.dsrc.btree_column_internal][2]
+ elif self.key_format == 'r':
+ leaf_pages = statscursor[stat.dsrc.btree_column_variable][2]
+ internal_pages = statscursor[stat.dsrc.btree_column_internal][2]
+ else:
+ leaf_pages = statscursor[stat.dsrc.btree_row_leaf][2]
+ internal_pages = statscursor[stat.dsrc.btree_row_internal][2]
+
+ return (entries, (leaf_pages, internal_pages))
+
+ def truncate(self, session, uri, make_key, keynum1, keynum2):
+ if self.trunc_with_remove:
+ cursor = session.open_cursor(uri)
+ err = 0
+ for k in range(keynum1, keynum2 + 1):
+ cursor.set_key(k)
+ try:
+ err = cursor.remove()
+ except WiredTigerError as e:
+ if wiredtiger_strerror(WT_ROLLBACK) in str(e):
+ err = WT_ROLLBACK
+ else:
+ raise e
+ if err != 0:
+ break
+ cursor.close()
+ else:
+ lo_cursor = session.open_cursor(uri)
+ hi_cursor = session.open_cursor(uri)
+ lo_cursor.set_key(make_key(keynum1))
+ hi_cursor.set_key(make_key(keynum2))
+ try:
+ err = session.truncate(None, lo_cursor, hi_cursor, None)
+ except WiredTigerError as e:
+ if wiredtiger_strerror(WT_ROLLBACK) in str(e):
+ err = WT_ROLLBACK
+ else:
+ raise e
+ lo_cursor.close()
+ hi_cursor.close()
+ return err
+
+ def test_truncate17(self):
+ nrows = 10000
+
+ # Create a table.
+ uri = "table:truncate17"
+ ds = SimpleDataSet(
+ self, uri, 0, key_format=self.key_format, value_format=self.value_format,
+ config=self.extraconfig)
+ ds.populate()
+
+ if self.value_format == '8t':
+ value_a = 97
+ value_b = 98
+ else:
+ value_a = "aaaaa" * 100
+ value_b = "bbbbb" * 100
+
+ # Pin oldest and stable timestamps to 1.
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(1) +
+ ',stable_timestamp=' + self.timestamp_str(1))
+
+ # Write some baseline data at time 10.
+ cursor = self.session.open_cursor(ds.uri)
+ self.session.begin_transaction()
+ for i in range(1, nrows + 1):
+ cursor[ds.key(i)] = value_a
+ if i % 487 == 0:
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(10))
+ self.session.begin_transaction()
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(10))
+ cursor.close()
+
+ # Mark it stable.
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(10))
+
+ # Reopen the connection so as to stat the on-disk version of the tree.
+ self.reopen_conn()
+
+ # Stat the tree to get a baseline.
+ (base_entries, base_pages) = self.stat_tree(uri)
+ self.assertEqual(base_entries, nrows)
+
+ # Reopen the connection again so nothing is in memory and we can fast-truncate.
+ self.reopen_conn()
+
+ # Make a session to prepare in.
+ session2 = self.conn.open_session()
+
+ # Truncate the middle of the table.
+ #
+ # Prepare the truncate at time 20 and leave it hanging.
+ session2.begin_transaction()
+ err = self.truncate(session2, ds.uri, ds.key, nrows // 4 + 1, 3 * nrows // 4)
+ self.assertEqual(err, 0)
+ session2.prepare_transaction('prepare_timestamp=' + self.timestamp_str(20))
+
+ # Make sure we did at least one fast-delete. (Unless we specifically didn't want to,
+ # or running on FLCS where it isn't supported.)
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ fastdelete_pages = stat_cursor[stat.conn.rec_page_delete_fast][2]
+ if self.value_format == '8t' or self.trunc_with_remove:
+ self.assertEqual(fastdelete_pages, 0)
+ else:
+ self.assertGreater(fastdelete_pages, 0)
+ stat_cursor.close()
+
+ # Optionally checkpoint at this stage, just in case it breaks or trips on
+ # the prepared truncation.
+ if self.do_checkpoint:
+ self.session.checkpoint()
+
+ # Stat the tree again. Stats are not transactional, and are effectively
+ # read-uncommitted; we should see the results of the prepared truncate.
+ # However, the truncated pages aren't actually gone yet, so the page counts
+ # shouldn't change.
+ (entries, pages) = self.stat_tree(uri)
+ if self.value_format == '8t':
+ self.assertEqual(entries, nrows)
+ else:
+ self.assertEqual(entries, nrows // 2)
+ self.assertEqual(pages, base_pages)
+
+ # Statting the tree above should have instantiated all the deleted pages.
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ read_deleted = stat_cursor[stat.conn.cache_read_deleted][2]
+ self.assertEqual(read_deleted, fastdelete_pages)
+ stat_cursor.close()
+
+ # Now toss the prepared transaction.
+ session2.rollback_transaction()
+
+ # Unlike RTS, transaction rollback should not instantiate pages, plus there are
+ # no more deleted pages to instantiate, so the number of instantiated pages should
+ # remain unchanged.
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ read_deleted = stat_cursor[stat.conn.cache_read_deleted][2]
+ self.assertEqual(read_deleted, fastdelete_pages)
+ stat_cursor.close()
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_truncate18.py b/src/third_party/wiredtiger/test/suite/test_truncate18.py
new file mode 100644
index 00000000000..0e971ed389a
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_truncate18.py
@@ -0,0 +1,207 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wttest
+from helper import simulate_crash_restart
+from wiredtiger import stat, WiredTigerError, wiredtiger_strerror, WT_NOTFOUND, WT_ROLLBACK
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+# test_truncate18.py
+#
+# The optimization that replaces deleted pages full of obsolete values with physically
+# empty pages can cause problems, because for some purposes the empty page is not
+# equivalent.
+#
+# In particular, the key order checks in row-store verify depend on the keys being
+# physically present, and loading an empty page defeats that. This is more or less
+# harmless except in the case of the leftmost leaf page, whose keys are used to
+# initialize the check.
+#
+# It is not entirely trivial to reach the failure state, because the page under the start
+# point of a truncate is never fast-truncated and that in turn means the leftmost page of
+# the tree is never fast-truncated. Consequently, to get a deleted leftmost leaf we must
+# truncate a range at the beginning of the tree and then cause at least the first page of the
+# range to be discarded while keeping some of the rest of it.
+#
+# The only way I've thought of to do this is to truncate a range that spans more than one
+# internal page. Then the first internal page of the range can be reconciled (required to
+# discard the non-deleted leftmost page) without discarding the whole truncated range.
+#
+# Consequently we crank down internal_page_max to avoid needing an excessively large test.
+#
+# Then we set things up so that the truncation becomes globally visible and run verify.
+# That currently asserts. The fix for this is likely to disable the optimization when in
+# verify, so the only real purpose of this test is to prevent the behavior from regressing.
+# It is therefore not full of scenarios but specific to this one problem.
+
+class test_truncate18(wttest.WiredTigerTestCase):
+ conn_config = 'statistics=(all)'
+ session_config = 'isolation=snapshot'
+
+ # Hook to run using remove instead of truncate for reference. This should not alter the
+ # behavior... but may if things are broken. Disable the reference version by default as it's
+ # only useful when investigating behavior changes. This list is first in the make_scenarios
+ # call so the additional cases don't change the scenario numbering.
+ trunc_values = [
+ ('truncate', dict(trunc_with_remove=False)),
+ #('remove', dict(trunc_with_remove=True)),
+ ]
+ format_values = [
+ ('integer_row', dict(key_format='i', value_format='S', extraconfig='')),
+ ]
+ scenarios = make_scenarios(trunc_values, format_values)
+
+ # Truncate, from keynum1 to keynum2, inclusive.
+ def truncate(self, uri, make_key, keynum1, keynum2, read_ts, commit_ts):
+ self.session.begin_transaction('read_timestamp=' + self.timestamp_str(read_ts))
+ if self.trunc_with_remove:
+ cursor = self.session.open_cursor(uri)
+ err = 0
+ for k in range(keynum1, keynum2 + 1):
+ cursor.set_key(k)
+ try:
+ err = cursor.remove()
+ except WiredTigerError as e:
+ if wiredtiger_strerror(WT_ROLLBACK) in str(e):
+ err = WT_ROLLBACK
+ else:
+ raise e
+ if err != 0:
+ break
+ cursor.close()
+ else:
+ lo_cursor = self.session.open_cursor(uri)
+ hi_cursor = self.session.open_cursor(uri)
+ lo_cursor.set_key(make_key(keynum1))
+ hi_cursor.set_key(make_key(keynum2))
+ try:
+ err = self.session.truncate(None, lo_cursor, hi_cursor, None)
+ except WiredTigerError as e:
+ if wiredtiger_strerror(WT_ROLLBACK) in str(e):
+ err = WT_ROLLBACK
+ else:
+ raise e
+ lo_cursor.close()
+ hi_cursor.close()
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(commit_ts))
+ return err
+
+ def test_truncate18(self):
+ # With the small internal pages, 10000 rows is enough. 5000 rows is not.
+ nrows = 10000
+
+ # Create a table.
+ uri = "table:truncate18"
+ ds = SimpleDataSet(
+ self, uri, 0, key_format=self.key_format, value_format=self.value_format,
+ config='internal_page_max=4096' + self.extraconfig)
+ ds.populate()
+
+ if self.value_format == '8t':
+ value_a = 97
+ value_b = 98
+ else:
+ value_a = "aaaaa" * 100
+ value_b = "bbbbb" * 100
+
+ # Pin oldest and stable timestamps to 1.
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(1) +
+ ',stable_timestamp=' + self.timestamp_str(1))
+
+ # Write some baseline data at time 10.
+ cursor = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ for i in range(1, nrows + 1):
+ cursor[ds.key(i)] = value_a
+ if i % 487 == 0:
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(10))
+ self.session.begin_transaction()
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(10))
+ cursor.close()
+
+ # Mark it stable.
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(10))
+
+ # Reopen the connection so nothing is in memory and we can fast-truncate.
+ self.reopen_conn()
+
+ # Truncate most of the tree, beginning at the first key, at time 20.
+ err = self.truncate(ds.uri, ds.key, 1, 7 * nrows // 8, 15, 20)
+ self.assertEqual(err, 0)
+
+ # Make sure we did at least one fast-delete. (Unless we specifically didn't want to,
+ # or running on FLCS where it isn't supported.)
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ fastdelete_pages = stat_cursor[stat.conn.rec_page_delete_fast][2]
+ if self.value_format == '8t' or self.trunc_with_remove:
+ self.assertEqual(fastdelete_pages, 0)
+ else:
+ self.assertGreater(fastdelete_pages, 0)
+ stat_cursor.close()
+
+ # Mark all this stable.
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(30))
+
+ # Reopen the connection again so everything is purely on disk.
+ self.reopen_conn()
+
+ # Age out the baseline data, so the pages we truncated contain entirely obsolete data.
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(30))
+
+ # Since we didn't fast-truncate the first page (one can't) we need to get it
+ # discarded by forcing it to reconcile empty. This will also discard all the
+ # fast-truncated pages that are children of the first internal page. For the
+ # test to work we need to have more fast-truncated pages beyond that, but there
+ # is no good way to crosscheck if we do.
+ cursor = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ cursor[ds.key(1)] = value_b
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(35))
+ self.session.begin_transaction()
+ cursor.set_key(ds.key(1))
+ self.assertEqual(cursor.remove(), 0)
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(40))
+ cursor.close()
+
+ # Mark this change stable (and age out the scratch value we wrote) and checkpoint it.
+ # This will reconcile the first leaf page and the first internal page, and internal
+ # pages above that, but leave the second internal page alone since we did nothing to
+ # bring it into memory.
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(40))
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(40))
+ self.session.checkpoint()
+
+ # Reopen the connection yet again.
+ self.reopen_conn()
+
+ # Now verify the tree. In the problem scenario described above, this will assert.
+ self.session.verify(ds.uri, None)
+
+if __name__ == '__main__':
+ wttest.run()