diff options
author | Luke Chen <luke.chen@mongodb.com> | 2022-09-05 13:33:07 +1000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-09-05 04:37:06 +0000 |
commit | 24143f27c9ba0d3075b4980a57a5b3ee9ec37344 (patch) | |
tree | aeb89a37eac5ebd0f28b5d2600a101e85000c89c | |
parent | b06609850b12a66509613e5c2d5b86fc17ce9de1 (diff) | |
download | mongo-24143f27c9ba0d3075b4980a57a5b3ee9ec37344.tar.gz |
Import wiredtiger: d619c325f86cd59ee25d4bcf43b738afcf7bfdf9 from branch mongodb-6.1
ref: 478f555081..d619c325f8
for: 6.1.0-rc1
WT-9720 Clean up and reorganize the fast-truncate code (#8188)
24 files changed, 1234 insertions, 356 deletions
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index 9b1ca5f20d2..47a8e0e878d 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -231,6 +231,7 @@ ITEMs ITER InitializeCriticalSectionAndSpinCount Inline +Instantiation Intra Ippokratis Iu @@ -445,6 +446,7 @@ TIMESTAMPS TMP TODO TORTIOUS +TRYLOCK TSO TW TXN diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index e0cc1039833..f30d1fa76fc 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-6.1", - "commit": "478f5550817985718478ac04f3295e88440f8c3e" + "commit": "d619c325f86cd59ee25d4bcf43b738afcf7bfdf9" } diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 993c23bdb34..c410df4a76a 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -364,7 +364,7 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint64_t recno, bool *vali * us whether the insert was actually an append to allow skipping the on-disk check. Note * that appends can't have history store content. This is true both for "real" appends at * the end of the tree and also for appends that are filling in truncated gaps in the middle - * of the tree -- the gap only appears when the truncation becomes globally visible and at + * of the tree -- the gap only appears after the truncation becomes globally visible and at * that point by definition nothing older can be accessible. 
*/ if (cbt->ins != NULL && !F_ISSET(cbt, WT_CBT_VAR_ONPAGE_MATCH)) diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index 89efcd8599c..092dd58eb64 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -17,37 +17,42 @@ * The way cursor truncate works is it explicitly reads the first and last pages of the truncate * range, then walks the tree with a flag so the tree walk code skips reading eligible pages within * the range and instead just marks them as deleted, by changing their WT_REF state to - * WT_REF_DELETED. Pages ineligible for this fast path include pages already in the cache, having - * overflow items, or belonging to FLCS trees. Ineligible pages are read and have their rows - * updated/deleted individually. The transaction for the delete operation is stored in memory - * referenced by the WT_REF.ft_info.del field. + * WT_REF_DELETED. Pages ineligible for this fast path ("fast-truncate" or "fast-delete") include + * pages already in the cache, having overflow items, containing prepared values, or belonging to + * FLCS trees. Ineligible pages are read and have their rows updated/deleted individually + * ("slow-truncate"). The transaction for the delete operation is stored in memory referenced by the + * WT_REF.page_del field. * * Future cursor walks of the tree will skip the deleted page based on the transaction stored for * the delete, but it gets more complicated if a read is done using a random key, or a cursor walk - * is done with a transaction where the delete is not visible. In those cases, we read the original - * contents of the page. The page-read code notices a deleted page is being read, and as part of the - * read instantiates the contents of the page, creating tombstone WT_UPDATE records, in the same - * transaction that deleted the page. 
In other words, the read process makes it appear as if the - * page was read and each individual row deleted, exactly as would have happened if the page had - * been in the cache all along. + * is done with a transaction where the delete is not visible, or if an update is applied. In those + * cases, we read the original contents of the page. The page-read code notices a deleted page is + * being read, and as part of the read instantiates the contents of the page, creating tombstone + * WT_UPDATE records, in the same transaction that deleted the page. In other words, the read + * process makes it appear as if the page was read and each individual row deleted, exactly as + * would have happened if the page had been in the cache all along. * * There's an additional complication to support rollback of the page delete. When the page was * marked deleted, a pointer to the WT_REF was saved in the deleting session's transaction list and * the delete is unrolled by resetting the WT_REF_DELETED state back to WT_REF_DISK. However, if the - * page has been instantiated by some reading thread, that's not enough, each individual row on the + * page has been instantiated by some reading thread, that's not enough; each individual row on the * page must have the delete operation reset. If the page split, the WT_UPDATE lists might have been * saved/restored during reconciliation and appear on multiple pages, and the WT_REF stored in the * deleting session's transaction list is no longer useful. For this reason, when the page is * instantiated by a read, a list of the WT_UPDATE structures on the page is stored in the - * WT_REF.ft_info.update field, that way the session resolving the delete can find all WT_UPDATE - * structures that require update. + * WT_PAGE_MODIFY.inst_updates field. That way the session resolving the delete can find all + * WT_UPDATE structures that require update. 
* * There are two other ways pages can be marked deleted: if they reconcile empty, or if they are * found to be eligible for deletion and contain only obsolete items. (The latter is known as - * "checkpoint cleanup" and happens in bt_sync.c.) In these cases, the WT_REF state will be set to - * WT_REF_DELETED but there will not be any associated WT_REF.ft_info.del field since the page - * contains no data. These pages are always skipped during cursor traversal, and if read is forced - * to instantiate such a page, it creates an empty page from scratch. + * "checkpoint cleanup" and happens in bt_sync.c.) There are also two cases in which deleted pages + * are manufactured out of thin air: in VLCS, if a key-space gap exists between the start recno of + * an internal page and the start recno of its first child, a deleted page is created to cover this + * space; and, when new trees are created they are created with a single deleted leaf page. In these + * cases, the WT_REF state will be set to WT_REF_DELETED but there will not be any associated + * WT_REF.page_del field since the page contains no data. These pages are always skipped during + * cursor traversal, and if read is forced to instantiate such a page, it creates an empty page from + * scratch. * * This feature is not available for FLCS objects. While most of the machinery exists (it is mostly * a property of column-store internal pages) there is a showstopper problem. For VLCS, truncate @@ -83,13 +88,13 @@ * split operation is delicate and risky and it was better to preserve that page. 
This requires * special-case code in four places: (a) in split, for VLCS trees, don't discard the first child ref * in splits, even if it's deleted and the deletion is globally visible; (b) in VLCS trees, don't - * attempt reverse splits originating from that page, as that would discard it; (c) when loading an - * internal page, create an extra ref in this position if the first on-disk child starts at a later - * recno from the internal page itself; and (d) in verify, accept that the page in this position - * might be an empty deleted ref with no on-disk address. Note that the critical issue is not - * _discarding_ this page after deleting it. It is fine for it to _be_ deleted, as long as the ref - * always exists when the internal page is in memory. (It is not written to disk either; internal - * page reconciliation skips it.) + * attempt reverse splits originating from that page, as that would discard it; (c) as noted above, + * when loading an internal page, create an extra ref in this position if the first on-disk child + * starts at a later recno from the internal page itself; and (d) in verify, accept that the page + * in this position might be an empty deleted ref with no on-disk address. Note that the critical + * issue is that one must not _discard_ this page after deleting it. It is fine for it to _be_ + * deleted, as long as the ref always exists when the internal page is in memory. (It is not written + * to disk either; internal page reconciliation skips it.) 
*/ /* @@ -136,13 +141,14 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) return (0); /* - * There should be no previous page-delete information: if the previous fast-truncate didn't - * instantiate the page, then we'd never get here to do another delete; if the previous fast- - * truncate did instantiate the page, then (for a read-write tree; we can't get here in a - * readonly tree) any fast-truncate information was removed at that point and/or when the - * fast-truncate transaction was resolved. + * There should be no previous page-delete information: if the page was previously deleted and + * remains deleted, it'll be in WT_REF_DELETED state and we won't get here to do another delete. + * If the page was previously deleted and instantiated, we can only get here if it was written + * out again or we successfully just evicted it; in that case, the reconciliation will have + * cleared the final traces of the previous deletion and instantiation. Furthermore, any prior + * deletion must have committed or another attempt would have failed with an update conflict. */ - WT_ASSERT(session, ref->ft_info.del == NULL); + WT_ASSERT(session, ref->page_del == NULL); /* * We cannot truncate pages that have overflow key/value items as the overflow blocks have to be @@ -151,6 +157,8 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) * * Additionally, if the page has prepared updates or the aggregated start time point on the page * is not visible to us then we cannot truncate the page. + * + * Note that we indicate this by succeeding without setting the skip flag, not via EBUSY. */ if (!__wt_ref_addr_copy(session, ref, &addr)) goto err; @@ -171,8 +179,8 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) WT_ERR(__wt_page_parent_modify_set(session, ref, false)); /* Allocate and initialize the page-deleted structure. 
*/ - WT_ERR(__wt_calloc_one(session, &ref->ft_info.del)); - ref->ft_info.del->previous_ref_state = previous_state; + WT_ERR(__wt_calloc_one(session, &ref->page_del)); + ref->page_del->previous_ref_state = previous_state; /* History store truncation is non-transactional. */ if (!WT_IS_HS(session->dhandle)) @@ -186,7 +194,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) return (0); err: - __wt_free(session, ref->ft_info.del); + __wt_free(session, ref->page_del); /* Publish the page to its previous state, ensuring visibility. */ WT_REF_SET_STATE(ref, previous_state); @@ -205,7 +213,7 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) uint8_t current_state; bool locked; - /* Lock the reference. We cannot access ref->ft_info.del except when locked. */ + /* Lock the reference. We cannot access ref->page_del except when locked. */ for (locked = false, sleep_usecs = yield_count = 0;;) { switch (current_state = ref->state) { case WT_REF_LOCKED: @@ -235,17 +243,23 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) /* * There are two possible cases: * - * 1. The state is WT_REF_DELETED. In this case ft_info.del cannot be null, because the + * 1. The state is WT_REF_DELETED. In this case page_del cannot be null, because the * operation cannot reach global visibility while its transaction remains uncommitted. The page * itself is as we left it, so we can just reset the state. * - * 2. The state is WT_REF_MEM. We check ft_info.update for a list of updates to abort. Allow the - * update list to be null to be conservative. + * 2. The state is WT_REF_MEM. We check mod->inst_updates for a list of updates to abort. Allow + * the update list to be null to be conservative. 
*/ - if (current_state == WT_REF_DELETED) - current_state = ref->ft_info.del->previous_ref_state; - else { - if ((updp = ref->ft_info.update) != NULL) + if (current_state == WT_REF_DELETED) { + current_state = ref->page_del->previous_ref_state; + /* + * Don't set the WT_PAGE_DELETED transaction ID to aborted; instead, just discard the + * structure. This avoids having to check for an aborted delete in other situations. + */ + __wt_free(session, ref->page_del); + } else { + WT_ASSERT(session, ref->page != NULL && ref->page->modify != NULL); + if ((updp = ref->page->modify->inst_updates) != NULL) { /* * Walk any list of update structures and abort them. We can't use the normal read path * to get the pages with updates (the original page may have split, so there may be more @@ -255,26 +269,20 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) */ for (; *updp != NULL; ++updp) (*updp)->txnid = WT_TXN_ABORTED; - WT_ASSERT(session, ref->page != NULL && ref->page->modify != NULL); + /* Now discard the updates. */ + __wt_free(session, ref->page->modify->inst_updates); + } /* - * Drop any page_deleted information that has been moved to the modify structure. Note that - * while this must have been an instantiated page, the information (and flag) is only kept - * until the page is reconciled for the first time after instantiation, so it might not be - * set now. + * Drop any page_deleted information remaining in the ref. Note that while this must have + * been an instantiated page, the information (and flag) is only kept until the page is + * reconciled for the first time after instantiation, so it might not be set now. 
*/ if (ref->page->modify->instantiated) { ref->page->modify->instantiated = false; - __wt_free(session, ref->page->modify->page_del); + __wt_free(session, ref->page_del); } } - /* - * Don't set the WT_PAGE_DELETED transaction ID to aborted, discard any WT_UPDATE list or set - * the committed flag; instead, discard the structures, it has the same effect. It's a single - * call, they're a union of two pointers. - */ - __wt_free(session, ref->ft_info.del); - WT_REF_SET_STATE(ref, current_state); return (0); } @@ -293,8 +301,8 @@ __delete_redo_window_cleanup_internal(WT_SESSION_IMPL *session, WT_REF *ref) WT_ASSERT(session, F_ISSET(ref, WT_REF_FLAG_INTERNAL)); if (ref->page != NULL) { WT_INTL_FOREACH_BEGIN (session, ref->page, child) { - if (child->state == WT_REF_DELETED && child->ft_info.del != NULL) - __cell_redo_page_del_cleanup(session, ref->page->dsk, child->ft_info.del); + if (child->state == WT_REF_DELETED && child->page_del != NULL) + __cell_redo_page_del_cleanup(session, ref->page->dsk, child->page_del); } WT_INTL_FOREACH_END; } @@ -303,7 +311,9 @@ __delete_redo_window_cleanup_internal(WT_SESSION_IMPL *session, WT_REF *ref) /* * __delete_redo_window_cleanup_skip -- * Tree-walk skip function for __wt_delete_redo_window_cleanup. This skips all leaf pages; we'll - * visit all in-memory internal pages via the flag settings on the tree-walk call. + * visit all in-memory internal pages via the flag settings on the tree-walk call. Note that we + * won't be called (even here) for deleted leaf pages themselves, because they're skipped by + * default. */ static int __delete_redo_window_cleanup_skip( @@ -352,18 +362,19 @@ __wt_delete_redo_window_cleanup(WT_SESSION_IMPL *session) bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) { - bool skip; + bool discard, skip; /* - * Deleted pages come from two sources: either it's a truncate as described above, or the page - * has been emptied by other operations and eviction deleted it. 
+ * Deleted pages come from several possible sources (as described at the top of this file). * - * In both cases, the WT_REF state will be WT_REF_DELETED. In the case of a truncated page, - * there will be a WT_PAGE_DELETED structure with the transaction ID of the transaction that - * deleted the page, and the page is visible if that transaction ID is visible. In the case of - * an empty page, there will be no WT_PAGE_DELETED structure and the delete is by definition - * visible, eviction could not have deleted the page if there were changes on it that were not - * globally visible. + * In all cases, the WT_REF state will be WT_REF_DELETED. If there is a WT_PAGE_DELETED + * structure describing a transaction, the deletion is visible (so the page is *not* visible) if + * the transaction is visible. If there is no WT_PAGE_DELETED structure, the deletion is + * globally visible. This happens either because the structure described a transaction that had + * become globally visible and was previously removed, or because the page was deleted by a + * non-transactional mechanism. (In the latter case, the deletion is inherently globally + * visible; pages only become empty if nothing in them remains visible to anyone, and newly + * minted empty pages cannot have anything in them to see.) * * We're here because we found a WT_REF state set to WT_REF_DELETED. It is possible the page is * being read into memory right now, though, and the page could switch to an in-memory state at @@ -372,21 +383,28 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) if (!WT_REF_CAS_STATE(session, ref, WT_REF_DELETED, WT_REF_LOCKED)) return (false); - skip = !__wt_page_del_active(session, ref, visible_all); - /* - * The fast-truncate structure can be freed as soon as the delete is stable: it is only read - * when the ref state is locked. 
It is worth checking every time we come through because once - * this is freed, we no longer need synchronization to check the ref. + * Check visibility. * - * Note that if the visible_all flag is set, skip already reflects the visible_all result so we - * don't need to do it twice. + * Use the option to hide prepared transactions in all checks; we can't skip a page if the + * deletion is only prepared (we need to visit it to generate a prepare conflict), and we can't + * discard the page_del info either, as doing so leads to dropping the on-disk page and if the + * prepared transaction rolls back we'd then be in trouble. */ - if (skip && ref->ft_info.del != NULL && - (visible_all || - __wt_txn_visible_all( - session, ref->ft_info.del->txnid, ref->ft_info.del->durable_timestamp))) - __wt_overwrite_and_free(session, ref->ft_info.del); + if (visible_all) + skip = discard = __wt_page_del_visible_all(session, ref->page_del, true); + else { + skip = __wt_page_del_visible(session, ref->page_del, true); + discard = skip ? __wt_page_del_visible_all(session, ref->page_del, true) : false; + } + + /* + * The fast-truncate structure can be freed as soon as the delete is globally visible: it is + * only read when the ref state is locked. It is worth checking every time we come through + * because once this is freed, we no longer need synchronization to check the ref. + */ + if (discard && ref->page_del != NULL) + __wt_overwrite_and_free(session, ref->page_del); WT_REF_SET_STATE(ref, WT_REF_DELETED); return (skip); @@ -406,7 +424,8 @@ __tombstone_update_alloc( F_SET(upd, WT_UPDATE_RESTORED_FAST_TRUNCATE); /* - * Cleared memory matches the lowest possible transaction ID and timestamp, do nothing. + * Cleared memory matches the lowest possible transaction ID and timestamp; do nothing if the + * page_del pointer is null. 
*/ if (page_del != NULL) { upd->txnid = page_del->txnid; @@ -430,7 +449,9 @@ __instantiate_tombstone(WT_SESSION_IMPL *session, WT_PAGE_DELETED *page_del, /* * If we find an existing stop time point we don't need to append a tombstone. Such rows would * not have been visible to the original truncate operation and were, logically, skipped over - * rather than re-deleted. + * rather than re-deleted. (If the row _was_ visible to the truncate in spite of having been + * subsequently removed, the stop time not being visible would have forced its page to be slow- + * truncated rather than fast-truncated.) */ if (WT_TIME_WINDOW_HAS_STOP(tw)) *updp = NULL; @@ -587,30 +608,31 @@ err: * Instantiate an entirely deleted row-store leaf page. */ int -__wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_DELETED *page_del) +__wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) { WT_DECL_RET; WT_PAGE *page; + WT_PAGE_DELETED *page_del; WT_ROW *rip; WT_UPDATE **update_list; uint32_t count, i; /* * An operation is accessing a "deleted" page, and we're building an in-memory version of the - * page (making it look like all entries in the page were individually updated by a remove - * operation). We end up here if a transaction used a truncate call to delete the page without + * page, making it look like all entries in the page were individually updated by a remove + * operation. We end up here if a transaction used a truncate call to delete the page without * reading it, and something else that can't yet see the truncation decided to read the page. + * (We also end up here if someone who _can_ see the truncation writes new data into the same + * namespace before the deleted pages are discarded.) * * This can happen after the truncate transaction resolves, but it can also happen before. In * the latter case, we need to keep track of the updates we populate the page with, so they can * be found when the transaction resolves. 
The page we're loading might split, in which case * finding the updates any other way would become a problem. - * - * The page_del structure passed in is either ref->ft_info.del, or under certain circumstances - * when that's unavailable, one extracted from the parent page's address cell. */ page = ref->page; + page_del = ref->page_del; update_list = NULL; /* Fast-truncate only happens to leaf pages, and FLCS isn't supported. */ @@ -626,20 +648,18 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_DELE WT_STAT_CONN_DATA_INCR(session, cache_read_deleted_prepared); /* - * Give the page a modify structure and mark the page dirty if the tree isn't read-only. If the - * tree can be written, the page must be marked dirty: otherwise it can be discarded, and that - * will lose the truncate information if the parent page hasn't been reconciled since the - * truncation happened. + * Give the page a modify structure. We need it to remember that the page has been instantiated. + * We do not need to mark the page dirty here. (It used to be necessary because evicting a clean + * instantiated page would lose the delete information; but that is no longer the case.) Note + * though that because VLCS instantiation goes through col_modify it will mark the page dirty + * regardless, except in read-only trees where attempts to mark things dirty are ignored. (Row- + * store instantiation adds the tombstones by hand and so does not need to mark the page dirty.) * - * If the tree cannot be written (checked in page-modify-set), we won't dirty the page. In this - * case the truncate information must have been read from the parent page's on-disk cell, so we - * can fetch it again if we discard the page and then reread it. - * - * Truncates can appear in read-only trees (whether a read-only open of the live database or via - * a checkpoint cursor) if they were not yet globally visible when the tree was checkpointed. 
+ * Note that partially visible truncates that may need instantiation can appear in read-only + * trees (whether a read-only open of the live database or via a checkpoint cursor) if they were + * not yet globally visible when the tree was checkpointed. */ WT_RET(__wt_page_modify_init(session, page)); - __wt_page_modify_set(session, page); /* * If the truncate operation is not yet resolved, count how many updates we're going to need and @@ -681,23 +701,14 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_DELE break; } + page->modify->instantiated = true; + page->modify->inst_updates = update_list; + /* - * Move the WT_PAGE_DELETED structure to page->modify; all of its information has been copied to - * the list of WT_UPDATE structures (if any), but we may still need it for internal page - * reconciliation. - * - * Note: when the page_del passed in isn't the one in the ref, there should be none in the ref. - * This only happens in readonly trees (see bt_page.c) and is a consequence of it being possible - * for a deleted page to be in WT_REF_DISK state if it's already been instantiated once and then - * evicted. In this case we can set modify->page_del to NULL regardless of the truncation's - * visibility (rather than copying the passed-in information); modify->page_del is only used by - * parent-page reconciliation and readonly trees shouldn't ever reach that code. + * We will leave the WT_PAGE_DELETED structure in the ref; all of its information has been + * copied to the list of WT_UPDATE structures (if any), but we may still need it for internal + * page reconciliation until the instantiated page is itself successfully reconciled. */ - WT_ASSERT(session, page_del == ref->ft_info.del || ref->ft_info.del == NULL); - page->modify->instantiated = true; - page->modify->page_del = ref->ft_info.del; - /* We don't need to null ft_info.del because assigning ft_info.update overwrites it. 
*/ - ref->ft_info.update = update_list; return (0); diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c index af5c1c523c9..6af568b3a76 100644 --- a/src/third_party/wiredtiger/src/btree/bt_discard.c +++ b/src/third_party/wiredtiger/src/btree/bt_discard.c @@ -215,7 +215,7 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) __wt_ovfl_discard_free(session, page); __wt_free(session, page->modify->ovfl_track); - __wt_free(session, page->modify->page_del); + __wt_free(session, page->modify->inst_updates); __wt_spin_destroy(session, &page->modify->page_lock); __wt_free(session, page->modify); @@ -295,7 +295,7 @@ __wt_free_ref(WT_SESSION_IMPL *session, WT_REF *ref, int page_type, bool free_pa __wt_ref_addr_free(session, ref); /* Free any backing fast-truncate memory. */ - __wt_free(session, ref->ft_info.del); + __wt_free(session, ref->page_del); __wt_overwrite_and_free_len(session, ref, WT_REF_CLEAR_SIZE); } diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index 5283ac827f7..87abfebf041 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -649,8 +649,8 @@ __inmem_col_int_init_ref(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *home, u * fast-delete state for the page. */ if (page_del != NULL && F_ISSET(home->dsk, WT_PAGE_FT_UPDATE)) { - WT_RET(__wt_calloc_one(session, &ref->ft_info.del)); - *ref->ft_info.del = *page_del; + WT_RET(__wt_calloc_one(session, &ref->page_del)); + *ref->page_del = *page_del; } WT_REF_SET_STATE(ref, WT_REF_DELETED); } @@ -871,8 +871,8 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) * Recreate the fast-delete state for the page. 
*/ if (F_ISSET(page->dsk, WT_PAGE_FT_UPDATE)) { - WT_ERR(__wt_calloc_one(session, &ref->ft_info.del)); - *ref->ft_info.del = unpack.page_del; + WT_ERR(__wt_calloc_one(session, &ref->page_del)); + *ref->page_del = unpack.page_del; } WT_REF_SET_STATE(ref, WT_REF_DELETED); diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 2f6f4e3eb88..6baf6f780df 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -95,7 +95,6 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) WT_DECL_RET; WT_ITEM tmp; WT_PAGE *notused; - WT_PAGE_DELETED *del; uint32_t page_flags; uint8_t previous_state; bool prepare; @@ -118,24 +117,68 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) } /* - * Set the WT_REF_FLAG_READING flag for normal reads. Checkpoints can skip over clean pages - * being read into cache, but need to wait for deletes to be resolved (in order for checkpoint - * to write the correct version of the page). + * Set the WT_REF_FLAG_READING flag for normal reads; this causes reconciliation of the parent + * page to skip examining this page in detail and write out a reference to the on-disk version. + * Don't do this for deleted pages, as the reconciliation needs to examine the page delete + * information. That requires locking the ref, which requires waiting for the read to finish. + * (It is possible that always writing out a reference to the on-disk version of the page is + * sufficient in this case, but it's not entirely clear; we expect reads of deleted pages to be + * rare, so it's better to do the safe thing.) */ if (previous_state == WT_REF_DISK) F_SET(ref, WT_REF_FLAG_READING); /* * Get the address: if there is no address, the page was deleted and a subsequent search or - * insert is forcing re-creation of the name space. + * insert is forcing re-creation of the name space. 
There can't be page delete information, + * because that information is an amendment to an on-disk page; when a page is deleted any page + * delete information should expire and be removed before the original on-disk page is actually + * discarded. */ if (!__wt_ref_addr_copy(session, ref, &addr)) { WT_ASSERT(session, previous_state == WT_REF_DELETED); - + WT_ASSERT(session, ref->page_del == NULL); WT_ERR(__wt_btree_new_leaf_page(session, ref)); goto skip_read; } + /* + * If the page is deleted and the deletion is globally visible, don't bother reading and + * explicitly instantiating the existing page. Get a fresh page and pretend we got it by reading + * the on-disk page. Note that it's important to set the instantiated flag on the page so that + * reconciling the parent internal page knows it was previously deleted. Otherwise it's possible + * to write out a reference to the original page without the deletion, which will cause it to + * come back to life unexpectedly. + * + * Setting the instantiated flag requires a modify structure. We don't need to mark it dirty; if + * it gets discarded before something else modifies it, eviction will see the instantiated flag + * and set the ref state back to WT_REF_DELETED. + * + * Skip this optimization in cases that need the obsolete values. To minimize the number of + * special cases, use the same test as for skipping instantiation below. + */ + if (previous_state == WT_REF_DELETED && + !F_ISSET(S2BT(session), WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) { + /* + * If the deletion has not yet been found to be globally visible (page_del isn't NULL), + * check if it is now, in case we can in fact avoid reading the page. Hide prepared deletes + * from this check; if the deletion is prepared we still need to load the page, because the + * reader might be reading at a timestamp early enough to not conflict with the prepare. 
+ * Update oldest before checking; we're about to read from disk so it's worth doing some + * work to avoid that. + */ + WT_ERR(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); + if (ref->page_del != NULL && __wt_page_del_visible_all(session, ref->page_del, true)) + __wt_overwrite_and_free(session, ref->page_del); + + if (ref->page_del == NULL) { + WT_ERR(__wt_btree_new_leaf_page(session, ref)); + WT_ERR(__wt_page_modify_init(session, ref->page)); + ref->page->modify->instantiated = true; + goto skip_read; + } + } + /* There's an address, read the backing disk page and build an in-memory version of the page. */ WT_ERR(__wt_blkcache_read(session, &tmp, addr.addr, addr.size)); @@ -159,27 +202,27 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) /* * In the case of a fast delete, move all of the page's records to a deleted state based on the * fast-delete information. Skip for special commands that don't care about an in-memory state. + * (But do set up page->modify and set page->modify->instantiated so evicting the pages while + * these commands are working doesn't go off the rails.) * - * Note: there are three possible cases - the state was WT_REF_DELETED and ft_info.del was NULL; - * the state was WT_REF_DELETED and ft_info.del was non-NULL; and the state was WT_REF_DISK and - * the parent page cell was a WT_CELL_ADDR_DEL cell. The last is only valid in a readonly tree. + * There are two possible cases: the state was WT_REF_DELETED and page_del was or wasn't NULL. + * It used to also be possible for eviction to set the state to WT_REF_DISK while the parent + * page nonetheless had a WT_CELL_ADDR_DEL cell. This is not supposed to happen any more, so for + * now at least assert it doesn't. * - * ft_info.del gets cleared and set to NULL if the deletion is found to be globally visible; - * this can happen in any of several places. 
+ * page_del gets cleared and set to NULL if the deletion is found to be globally visible; this + * can happen in any of several places. */ - del = NULL; - if (previous_state == WT_REF_DISK) { - WT_ASSERT(session, ref->ft_info.del == NULL); - if (addr.del_set) { - WT_ASSERT(session, F_ISSET(S2BT(session), WT_BTREE_READONLY)); - del = &addr.del; - } - } else - del = ref->ft_info.del; - - if ((previous_state == WT_REF_DELETED || del != NULL) && - !F_ISSET(S2BT(session), WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) - WT_ERR(__wt_delete_page_instantiate(session, ref, del)); + WT_ASSERT( + session, previous_state != WT_REF_DISK || (ref->page_del == NULL && addr.del_set == false)); + + if (previous_state == WT_REF_DELETED) { + if (F_ISSET(S2BT(session), WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) { + WT_ERR(__wt_page_modify_init(session, ref->page)); + ref->page->modify->instantiated = true; + } else + WT_ERR(__wt_delete_page_instantiate(session, ref)); + } skip_read: F_CLR(ref, WT_REF_FLAG_READING); diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index cf719a791aa..a48210bd5bc 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -607,7 +607,7 @@ __split_parent_discard_ref(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *paren } /* Free any backing fast-truncate memory. */ - __wt_free(session, ref->ft_info.del); + __wt_free(session, ref->page_del); /* Free the backing block and address. */ WT_TRET(__wt_ref_block_free(session, ref)); @@ -1780,7 +1780,6 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) */ WT_ASSERT(session, __wt_leaf_page_can_split(session, page)); WT_ASSERT(session, __wt_page_is_modified(page)); - WT_ASSERT(session, ref->ft_info.del == NULL); F_SET_ATOMIC_16(page, WT_PAGE_SPLIT_INSERT); /* Only split in-memory once. 
*/ diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index 2a9c8454458..3917c6c5b25 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -323,18 +323,36 @@ static int __evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) { WT_DECL_RET; + bool instantiated; /* - * Discard the page and update the reference structure. A page with a disk address is an on-disk - * page, and a page without a disk address is a re-instantiated deleted page (for example, by - * searching), that was never subsequently written. + * We might discard an instantiated deleted page, because instantiated pages are not marked + * dirty by default. Check this before discarding the modify structure in __wt_ref_out. + */ + if (ref->page->modify != NULL && ref->page->modify->instantiated) + instantiated = true; + else { + WT_ASSERT(session, ref->page_del == NULL); + instantiated = false; + } + + /* + * Discard the page and update the reference structure. A leaf page without a disk address is a + * deleted page that either was created empty and never written out, or had its on-disk page + * discarded already after the deletion became globally visible. It is not immediately clear if + * it's possible to get an internal page without a disk address here, but if one appears it can + * be deleted. (Note that deleting an internal page implicitly turns it into a leaf.) + * + * A page with a disk address is now on disk, unless it was deleted and instantiated and then + * evicted unmodified, in which case it is still deleted. In the latter case set the state back + * to WT_REF_DELETED. */ __wt_ref_out(session, ref); if (ref->addr == NULL) { WT_WITH_PAGE_INDEX(session, ret = __evict_delete_ref(session, ref, flags)); WT_RET_BUSY_OK(ret); } else - WT_REF_SET_STATE(ref, WT_REF_DISK); + WT_REF_SET_STATE(ref, instantiated ? 
WT_REF_DELETED : WT_REF_DISK); return (0); } @@ -471,8 +489,8 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent) /* * It is always OK to evict pages from checkpoint cursor trees if they don't have children, and - * visibility checks for pages deleted in the checkpoint aren't needed (or correct when done in - * eviction threads). + * visibility checks for pages found to be deleted in the checkpoint aren't needed (or correct + * when done in eviction threads). */ if (WT_READING_CHECKPOINT(session)) return (0); @@ -501,6 +519,22 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent) if (!__wt_atomic_casv8(&child->state, WT_REF_DELETED, WT_REF_LOCKED)) return (__wt_set_return(session, EBUSY)); /* + * Insert a read/read barrier so we're guaranteed the page_del state we read below comes + * after the locking operation on the ref state and therefore after the previous unlock + * of the ref. Otherwise we might read an inconsistent view of the page deletion info, + * and while many combinations are harmless and would just lead us to falsely refuse to + * evict, some (e.g. reading committed as true and a stale durable timestamp from before + * it was set by commit) are not. + * + * Note that while ordinarily a lock acquire should have an acquire (read/any) barrier + * after it, because we are only reading the write part is irrelevant and a read/read + * barrier is sufficient. + * + * FIXME-WT-9780: this and the CAS should be rolled into a WT_REF_TRYLOCK macro. + */ + WT_READ_BARRIER(); + + /* * We can evict any truncation that's committed. However, restrictions in reconciliation * mean that it needs to be visible to us when we get there. And unfortunately we are * upstream of the point where eviction threads get snapshots. Plus, application threads @@ -512,15 +546,20 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent) * 3. If we do not but we're an eviction thread, go ahead. 
We will get a snapshot * shortly and any committed operation will be visible in it. * 4. Otherwise, check if the operation is globally visible. + * + * Even though we specifically can't evict prepared truncations, we don't need to deploy + * the special-case logic for prepared transactions in __wt_page_del_visible; prepared + * transactions aren't committed so they'll fail the first check. */ - if (!__wt_page_del_committed(child->ft_info.del)) + if (!__wt_page_del_committed(child->page_del)) visible = false; else if (F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT)) - visible = __wt_page_del_visible(session, child->ft_info.del, false); + visible = __wt_page_del_visible(session, child->page_del, false); else if (F_ISSET(session, WT_SESSION_EVICTION)) visible = true; else - visible = __wt_page_del_visible(session, child->ft_info.del, true); + visible = __wt_page_del_visible_all(session, child->page_del, false); + /* FIXME-WT-9780: is there a reason this doesn't use WT_REF_UNLOCK? */ child->state = WT_REF_DELETED; if (!visible) return (__wt_set_return(session, EBUSY)); diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index c67d8c3477d..3b3dbdb538f 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -445,9 +445,15 @@ struct __wt_page_modify { /* Overflow record tracking for reconciliation. */ WT_OVFL_TRACK *ovfl_track; - /* Cached page-delete information for newly instantiated deleted pages. */ - WT_PAGE_DELETED *page_del; /* Deletion information; NULL if globally visible. */ - bool instantiated; /* True if this is a newly instantiated page. */ + /* + * Page-delete information for newly instantiated deleted pages. The instantiated flag remains + * set until the page is reconciled successfully; this indicates that the page_del information + * in the ref remains valid. 
The update list remains set (if set at all) until the transaction + * that deleted the page is resolved. These transitions are independent; that is, the first + * reconciliation can happen either before or after the delete transaction resolves. + */ + bool instantiated; /* True if this is a newly instantiated page. */ + WT_UPDATE **inst_updates; /* Update list for instantiated page with unresolved truncate. */ #define WT_PAGE_LOCK(s, p) __wt_spin_lock((s), &(p)->modify->page_lock) #define WT_PAGE_TRYLOCK(s, p) __wt_spin_trylock((s), &(p)->modify->page_lock) @@ -807,16 +813,17 @@ struct __wt_page { * * WT_REF_DELETED: * The page is on disk, but has been deleted from the tree; we can delete - * row-store leaf pages without reading them if they don't reference - * overflow items. + * row-store and VLCS leaf pages without reading them if they don't + * reference overflow items. * * WT_REF_LOCKED: * Locked for exclusive access. In eviction, this page or a parent has * been selected for eviction; once hazard pointers are checked, the page * will be evicted. When reading a page that was previously deleted, it - * is locked until the page is in memory with records marked deleted. The - * thread that set the page to WT_REF_LOCKED has exclusive access, no - * other thread may use the WT_REF until the state is changed. + * is locked until the page is in memory and the deletion has been + * instantiated with tombstone updates. The thread that set the page to + * WT_REF_LOCKED has exclusive access; no other thread may use the WT_REF + * until the state is changed. * * WT_REF_MEM: * Set by a reading thread once the page has been read from disk; the page @@ -847,7 +854,9 @@ struct __wt_page { /* * WT_PAGE_DELETED -- - * Related information for truncated pages. + * Information about how they got deleted for deleted pages. This structure records the + * transaction that deleted the page, plus the state the ref was in when the deletion happened. 
+ * This structure is akin to an update but applies to a whole page. */ struct __wt_page_deleted { /* @@ -863,8 +872,8 @@ struct __wt_page_deleted { wt_timestamp_t durable_timestamp; /* - * The prepare state is used for transaction prepare to manage visibility and inheriting prepare - * state to update_list. + * The prepare state is used for transaction prepare to manage visibility and propagating the + * prepare state to the updates generated at instantiation time. */ volatile uint8_t prepare_state; @@ -952,83 +961,119 @@ struct __wt_ref { #define ref_ikey key.ikey /* - * Fast-truncate information, written-to/read-from disk as necessary in the internal page's - * deleted page proxy cell. When a WT_REF first becomes part of a fast-truncate operation, the - * ft_info.del field is allocated and initialized. + * Page deletion information, written-to/read-from disk as necessary in the internal page's + * address cell. (Deleted-address cells are also referred to as "proxy cells".) When a WT_REF + * first becomes part of a fast-truncate operation, the page_del field is allocated and + * initialized; it is similar to an update and holds information about the transaction that + * performed the truncate. It can be discarded and set to NULL when that transaction reaches + * global visibility. * - * Fast-truncate pages might have to be instantiated if a thread for which the operation isn't - * visible accesses the page. This can happen if the operation hasn't committed yet; it can also - * happen if an older read transaction visits the page, and it can happen if the fast-truncate - * operation is included in a checkpoint and then seen later, after a restart or via a - * checkpoint cursor. + * Operations other than truncate that produce deleted pages (checkpoint cleanup, reconciliation + * as empty, etc.) leave the page_del field NULL as in these cases the deletion is already + * globally visible. 
* - * If the page must be instantiated for any reason: (1) WT_UPDATE structures are created for the - * page entries, (2) the transaction information from ft_info.del is copied to those WT_UPDATE - * structures (making them a match for the truncate operation), (3) the ft_info.del field is - * discarded, and (4) the WT_REF state switches to WT_REF_MEM. + * Once the deletion is globally visible, the original on-disk page is no longer needed and can + * be discarded; this happens the next time the parent page is reconciled, either by eviction or + * by a checkpoint. The ref remains, however, and still occupies the same key space in the table + * that it always did. * - * If the fast-truncate operation has not yet committed, additionally the ft_info.update field - * is created, which is an array of references to the WT_UPDATE structures, for subsequent - * transaction commit/abort. (The page can split, so there needs to be some way to find all of - * the update structures.) + * Deleted refs (and thus chunks of the tree namespace) are only discarded at two points: when + * the parent page is discarded after being evicted, or in the course of internal page splits + * and reverse splits. Until this happens, the "same" page can be brought back to life by + * writing to its portion of the key space. * - * Doing anything other than testing if ft_info.del or ft_info.update is non-NULL (which - * eviction does) requires the WT_REF be locked. + * A deleted page needs to be "instantiated" (read in from disk and converted to an in-memory + * page where every item on the page has been individually deleted) if we need to position a + * cursor on the page, or if we need to visit it for other reasons. Logic exists to avoid that + * in various common cases (see: __wt_btcur_skip_page, __wt_delete_page_skip) but in many less + * common situations we proceed with instantiation anyway to avoid multiplying the number of + * special cases in the system. 
* - * Because ft_info is a union it is important to always access the correct field. It is also - * vital to interpret the state correctly and consider all the possible cases. + * Common triggers for instantiation include: another thread reading from the page before a + * truncate commits; an older reader visiting a page after a truncate commits; a thread reading + * the page via a checkpoint cursor if the truncation wasn't yet globally visible at checkpoint + * time; a thread reading the page after shutdown and restart under similar circumstances; RTS + * needing to roll back a committed but unstable truncation (and possibly also updates that + * occurred before the truncation); and a thread writing to the truncated portion of the table + * space after the truncation but before the page is completely discarded. * - * The union access should be ft_info.del if the state is WT_REF_DELETED (states 1 and 2 below), - * and should be ft_info.update if the state is WT_REF_MEM (states 5-6 below). Otherwise, - * neither field is valid and the pointer should always be NULL. + * If the page must be instantiated for any reason: (1) for each entry on the page a WT_UPDATE + * is created; (2) the transaction information from page_del is copied to those WT_UPDATE + * structures (making them a match for the truncate operation), and (3) the WT_REF state + * switches to WT_REF_MEM. * - * These are the possible states: + * If the fast-truncate operation has not yet committed, an array of references to the WT_UPDATE + * structures is placed in modify->inst_updates. This is used to find the updates when the + * operation subsequently resolves. (The page can split, so there needs to be some way to find + * all of the update structures.) * - * 1. The WT_REF state is WT_REF_DELETED and ft_info.del is NULL. This means the page is deleted - * and the deletion is globally visible. Any on-disk page has been or will be discarded. 
+ * After instantiation, the page_del structure is kept until the instantiated page is next + * reconciled. This is because in some cases reconciliation of the parent internal page may need + * to write out a reference to the pre-instantiated on-disk page, at which point the page_del + * information is needed to build the correct reference. * - * 2. The WT_REF state is WT_REF_DELETED and ft_info.del is not NULL. The page is deleted, but - * but the deletion may not yet be globally visible (or visible to any given reader either.) The - * on-disk page remains in case we need it to satisfy reads. ft_info.del describes the delete - * operation. If it is necessary to read the page on behalf of a thread that cannot see the - * deletion, the page must be instantiated as described above. + * If the ref is in WT_REF_DELETED state, all actions besides checking whether page_del is NULL + * require that the WT_REF be locked. There are two reasons for this: first, the page might be + * instantiated at any time, and it is important to not see a partly-completed instantiation; + * and second, the page_del structure is discarded opportunistically if its transaction is found + * to be globally visible, so accessing it without locking the ref is unsafe. * - * 3. The WT_REF state is WT_REF_DISK, and the parent page's address cell is a deleted-address - * cell. ft_info is not valid; ft_info.del should read as NULL. The page is on disk, and - * deleted; the deletion may not yet be globally visible. Because the time aggregate stored in - * the parent internal page includes the deletion time, tree walks will skip the page as - * appropriate without needing the fast-delete information. This state can only happen in - * readonly trees; it is a result of the page being read in and instantiated, but not marked - * dirty, then discarded by eviction. 
(In principle eviction should set the state back to - * WT_REF_DELETED in this case; however, this turns out to be awkward and we work around it - * instead.) This state only arises in two places: when reading in the page, and in some cases - * of skipping over the page; both cases already need to unpack the address cell, so we can use - * it to retrieve the fast-delete information. Other than these considerations, this state is - * indistinguishable from state 4. + * If the ref is in WT_REF_MEM state because it has been instantiated, the safety requirements + * are somewhat looser. Checking for an instantiated page by examining modify->instantiated does + * not require locking. Checking if modify->inst_updates is non-NULL (which means that the + * truncation isn't committed) also doesn't require locking. In general the page_del structure + * should not be used after instantiation; exceptions are (a) it is still updated by transaction + * prepare, commit, and rollback (so that it remains correct) and (b) it is used by internal + * page reconciliation if that occurs before the instantiated child is itself reconciled. (The + * latter can only happen if the child is evicted in a fairly narrow time window during a + * checkpoint.) This still requires locking the ref. * - * 4. The WT_REF state is WT_REF_DISK, and the parent page's address cell is not a - * deleted-address cell. ft_info is not valid; ft_info.del should read as NULL. This is an - * ordinary on-disk page. + * It is vital to consider all the possible cases when touching a deleted or instantiated page. * - * 5. The WT_REF state is WT_REF_MEM, and ft_info.update is NULL. This is an ordinary in-memory - * page. + * There are two major groups of states: + * + * 1. The WT_REF state is WT_REF_DELETED. This means the page is deleted and not in memory. + * - If the page has no disk address, the ref is a placeholder in the key space and may in + * general be discarded at the next opportunity. 
(Some restrictions apply in VLCS.) + * - If the page has a disk address, page_del may be NULL. In this case, the deletion of the + * page is globally visible and the on-disk page can be discarded at the next opportunity. + * - If the page has a disk address and page_del is not NULL, page_del contains information + * about the transaction that deleted the page. It is necessary to lock the ref to read + * page_del; at that point (if the state hasn't changed while getting the lock) + * page_del->committed can be used to check if the transaction is committed or not. * - * 6. The WT_REF state is WT_REF_MEM, and ft_info.update is not NULL. This is a deleted page - * that was instantiated when the delete transaction was not yet resolved. ft_info.update is the - * list of updates created by the instantiation, which is used to commit or abort them as needed - * and then cleared. It is not possible to get to this state if the truncate information was - * read from disk; uncommitted (including prepared) truncates are not evicted or checkpointed. + * 2. The WT_REF state is WT_REF_MEM. The page is either an ordinary page or an instantiated + * deleted page. + * - If ref->page->modify is NULL, the page is ordinary. + * - If ref->page->modify->instantiated is false and ref->page->modify->inst_updates is NULL, + * the page is ordinary. + * - If ref->page->modify->instantiated is true, the page is instantiated and has not yet + * been reconciled. ref->page_del is either NULL (meaning the deletion is globally visible) + * or contains information about the transaction that deleted the page. This information is + * only meaningful either (a) in relation to the existing on-disk page rather than the in- + * memory page (this can be needed to reconcile the parent internal page) or (b) if the + * page is clean. + * - If ref->page->modify->inst_updates is not NULL, the page is instantiated and the + * transaction that deleted it has not resolved yet. 
The update list is used during commit + * or rollback to find the updates created during instantiation. * - * In both states 5 and 6, the page will have a modify structure to hold the instantiated - * tombstones. If the tree is read-write, the page will be marked dirty. Until it is reconciled, - * modify->instantiated will also be set to true, and modify->page_del will hold the page-delete - * information used for the instantiation, if any. This is needed under some circumstances - * for checkpointing internal pages. + * The last two points of group (2) are orthogonal; that is, after instantiation the + * instantiated flag and page_del structure (on the one hand) and the update list (on the other) + * are used and discarded independently. The former persists only until the page is first + * successfully reconciled; the latter persists until the transaction resolves. These events may + * occur in either order. + * + * As described above, in any state in group (1) an access to the page may require it be read + * into memory, at which point it moves into group (2). Instantiation always sets the + * instantiated flag to true; the updates list is only created if the transaction has not yet + * resolved at the point instantiation happens. (The ref is locked in both transaction + * resolution and instantiation to make sure these events happen in a well-defined order.) + * + * Because internal pages with uncommitted (including prepared) deletions are not written to + * disk, a page instantiated after its parent was read from disk will always have inst_updates + * set to NULL. */ - union { - WT_PAGE_DELETED *del; /* Page not instantiated, page-deleted structure */ - WT_UPDATE **update; /* Page instantiated, update list for subsequent commit/abort */ - } ft_info; + WT_PAGE_DELETED *page_del; /* Page-delete information for a deleted page. 
*/ #ifdef HAVE_REF_TRACK /* diff --git a/src/third_party/wiredtiger/src/include/btree_inline.h b/src/third_party/wiredtiger/src/include/btree_inline.h index c5c70a5b622..3cf159da8c3 100644 --- a/src/third_party/wiredtiger/src/include/btree_inline.h +++ b/src/third_party/wiredtiger/src/include/btree_inline.h @@ -1582,55 +1582,72 @@ __wt_ref_block_free(WT_SESSION_IMPL *session, WT_REF *ref) } /* - * __wt_page_del_visible -- - * Return if a truncate operation is visible to the caller. + * __wt_page_del_visible_all -- + * Check if a truncate operation is visible to everyone and the data under it is obsolete. */ static inline bool -__wt_page_del_visible(WT_SESSION_IMPL *session, WT_PAGE_DELETED *page_del, bool visible_all) +__wt_page_del_visible_all(WT_SESSION_IMPL *session, WT_PAGE_DELETED *page_del, bool hide_prepared) { uint8_t prepare_state; /* - * In general usage, a NULL WT_PAGE_DELETED is a truncate operation whose details were discarded - * when it became globally visible. + * Like other visible_all checks, use the durable timestamp to avoid complications: there is + * potentially a window where a prepared and committed transaction can be visible but not yet + * durable, and in that window the changes under it are not obsolete yet. + * + * The hide_prepared argument causes prepared but not committed transactions to be treated as + * invisible. (Apparently prepared and uncommitted transactions can be visible_all, but we need + * to not see them in some cases; for example, prepared deletions can't exist on disk because + * the on-disk format doesn't have space for the extra "I'm prepared" bit, so we avoid seeing + * them in reconciliation. Similarly, we can't skip over a page just because a transaction has + * deleted it and prepared; only committed transactions are suitable.) + * + * In all cases, the ref owning the page_deleted structure should be locked and its pre-lock + * state should be WT_REF_DELETED. 
This prevents the page from being instantiated while we look + * at it, and locks out other operations that might simultaneously discard the structure (either + * after checking visibility, or because its transaction aborted). */ + + /* If the page delete info is NULL, the deletion was previously found to be globally visible. */ if (page_del == NULL) return (true); /* We discard page_del on transaction abort, so should never see an aborted one. */ WT_ASSERT(session, page_del->txnid != WT_TXN_ABORTED); - WT_ORDERED_READ(prepare_state, page_del->prepare_state); - if (prepare_state == WT_PREPARE_INPROGRESS || prepare_state == WT_PREPARE_LOCKED) - return (false); + if (hide_prepared) { + WT_ORDERED_READ(prepare_state, page_del->prepare_state); + if (prepare_state == WT_PREPARE_INPROGRESS || prepare_state == WT_PREPARE_LOCKED) + return (false); + } - return (visible_all ? - __wt_txn_visible_all(session, page_del->txnid, page_del->durable_timestamp) : - __wt_txn_visible(session, page_del->txnid, page_del->timestamp)); + return (__wt_txn_visible_all(session, page_del->txnid, page_del->durable_timestamp)); } /* - * __wt_page_del_active -- - * Return if a truncate operation is active. + * __wt_page_del_visible -- + * Return if a truncate operation is visible to the caller. The same considerations apply as in + * the visible_all version. */ static inline bool -__wt_page_del_active(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) +__wt_page_del_visible(WT_SESSION_IMPL *session, WT_PAGE_DELETED *page_del, bool hide_prepared) { - /* - * Return if a truncate operation is active: "active" means approximately that the truncate is - * still in progress, that is, that the underlying original page may still be required. This - * function in practice is actually a visibility test (it returns whether the truncate is *not* - * visible) and should be renamed and have its sense flipped to be more consistent with the rest - * of the system. 
- * - * Our caller should have already locked the WT_REF and confirmed that the previous state was - * WT_REF_DELETED. Consequently there are two possible cases: either ft_info.del is NULL (in - * which case the deletion is globally visible and cannot be rolled back) or it is not, in which - * case the information in ft_info.del gives us the visibility. - */ - WT_ASSERT(session, ref->state == WT_REF_LOCKED); + uint8_t prepare_state; - return (!__wt_page_del_visible(session, ref->ft_info.del, visible_all)); + /* If the page delete info is NULL, the deletion was previously found to be globally visible. */ + if (page_del == NULL) + return (true); + + /* We discard page_del on transaction abort, so should never see an aborted one. */ + WT_ASSERT(session, page_del->txnid != WT_TXN_ABORTED); + + if (hide_prepared) { + WT_ORDERED_READ(prepare_state, page_del->prepare_state); + if (prepare_state == WT_PREPARE_INPROGRESS || prepare_state == WT_PREPARE_LOCKED) + return (false); + } + + return (__wt_txn_visible(session, page_del->txnid, page_del->timestamp)); } /* @@ -1638,7 +1655,10 @@ __wt_page_del_active(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) * Return if a truncate operation is resolved. (Since truncations that abort are removed * immediately, "resolved" and "committed" are equivalent here.) The caller should have already * locked the ref and confirmed that the ref's previous state was WT_REF_DELETED. The page_del - * argument should be the ref's ft_info.del member. + * argument should be the ref's page_del member. This function should only be used for pages in + * WT_REF_DELETED state. For deleted pages that have been instantiated in memory, the update + * list in the page modify structure should be checked instead, as the page_del structure might + * have been discarded already. (The update list is non-null if the transaction is unresolved.) 
*/ static inline bool __wt_page_del_committed(WT_PAGE_DELETED *page_del) @@ -1834,13 +1854,18 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp) /* * Check the fast-truncate information. Pages with an uncommitted truncate cannot be evicted. * - * Because the page is in memory, we look at ft_info.update. If it's not NULL, that means the + * Because the page is in memory, we look at mod.inst_updates. If it's not NULL, that means the * truncate operation isn't committed. * - * The list of updates in ft_info.update will be discarded when the transaction they belong to + * The list of updates in mod.inst_updates will be discarded when the transaction they belong to * is resolved. + * + * Note that we are not using __wt_page_del_committed here because (a) examining the page_del + * structure requires locking the ref, and (b) once in memory the page_del structure only + * remains until the next reconciliation, and nothing prevents that from occurring before the + * transaction commits. */ - if (ref->ft_info.update != NULL) + if (mod->inst_updates != NULL) return (false); /* @@ -2236,21 +2261,21 @@ __wt_btcur_skip_page( WT_REF_LOCK(session, ref, &previous_state); /* - * Check the fast-truncate information, there are 4 cases: + * Check the fast-truncate information; there are 3 cases: * - * (1) The page is in the WT_REF_DELETED state and ft_info.del is NULL. The page is deleted. - * (2) The page is in the WT_REF_DELETED state and ft_info.del is not NULL. The page is deleted - * if the truncate operation is visible. Look at ft_info.del; we could use the info from the + * (1) The page is in the WT_REF_DELETED state and page_del is NULL. The page is deleted. This + * case is folded into the next because __wt_page_del_visible handles it. + * (2) The page is in the WT_REF_DELETED state and page_del is not NULL. The page is deleted + * if the truncate operation is visible. 
Look at page_del; we could use the info from the * address cell below too, but that's slower. - * (3) The page is in the WT_REF_DISK state. The page may be deleted; check the delete info from - * the address cell. - * (4) The page is in memory and has been instantiated. The delete info from the address cell - * will serve for readonly/unmodified pages, and for modified pages we can't skip the page - * anyway. - */ - if (previous_state == WT_REF_DELETED && - (ref->ft_info.del == NULL || - __wt_txn_visible(session, ref->ft_info.del->txnid, ref->ft_info.del->timestamp))) { + * (3) The page is in memory and has been instantiated. The delete info from the address cell + * will serve for readonly/unmodified pages, and for modified pages we can't skip the page. + * (This case is checked further below.) + * + * In all cases, make use of the option to __wt_page_del_visible to hide prepared transactions, + * as we shouldn't skip pages where the deletion is prepared but not committed. + */ + if (previous_state == WT_REF_DELETED && __wt_page_del_visible(session, ref->page_del, true)) { *skipp = true; goto unlock; } @@ -2264,7 +2289,7 @@ __wt_btcur_skip_page( (previous_state == WT_REF_MEM && !__wt_page_is_modified(ref->page))) && __wt_ref_addr_copy(session, ref, &addr)) { /* If there's delete information in the disk address, we can use it. */ - if (addr.del_set && __wt_txn_visible(session, addr.del.txnid, addr.del.timestamp)) { + if (addr.del_set && __wt_page_del_visible(session, &addr.del, true)) { *skipp = true; goto unlock; } diff --git a/src/third_party/wiredtiger/src/include/cell.h b/src/third_party/wiredtiger/src/include/cell.h index 351c7ae6464..0b504d1ed59 100644 --- a/src/third_party/wiredtiger/src/include/cell.h +++ b/src/third_party/wiredtiger/src/include/cell.h @@ -24,6 +24,9 @@ * Deleted cells are place-holders for column-store files, where entries cannot * be removed in order to preserve the record count. 
* + * Note that deleted value cells (WT_CELL_DEL) are different from deleted-address + * cells (WT_CELL_ADDR_DEL). + * * Here's the cell use by page type: * * WT_PAGE_ROW_INT (row-store internal page): diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 1ae1eb3042d..470e83000c2 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -661,8 +661,8 @@ extern int __wt_decrypt(WT_SESSION_IMPL *session, WT_ENCRYPTOR *encryptor, size_ WT_ITEM *out) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, - WT_PAGE_DELETED *page_del) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_delete_redo_window_cleanup(WT_SESSION_IMPL *session) @@ -1986,12 +1986,12 @@ static inline bool __wt_op_timer_fired(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline bool __wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -static inline bool __wt_page_del_active(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline bool __wt_page_del_committed(WT_PAGE_DELETED *page_del) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline bool __wt_page_del_visible(WT_SESSION_IMPL *session, WT_PAGE_DELETED *page_del, - bool visible_all) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); + bool hide_prepared) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +static inline bool __wt_page_del_visible_all(WT_SESSION_IMPL *session, WT_PAGE_DELETED *page_del, + bool hide_prepared) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline bool __wt_page_evict_clean(WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline bool __wt_page_evict_retry(WT_SESSION_IMPL *session, WT_PAGE *page) diff --git a/src/third_party/wiredtiger/src/include/txn_inline.h b/src/third_party/wiredtiger/src/include/txn_inline.h index 519fb4b9dbf..7c94909e721 100644 --- a/src/third_party/wiredtiger/src/include/txn_inline.h +++ b/src/third_party/wiredtiger/src/include/txn_inline.h @@ -231,22 +231,25 @@ __wt_txn_op_delete_apply_prepare_state(WT_SESSION_IMPL *session, WT_REF *ref, bo /* * Timestamps and prepare state are in the page deleted structure for truncates, or in the - * updates in the case of instantiated pages. In the case of instantiated pages we may also need - * to update the page deleted structure saved in page->modify. + * updates list in the case of instantiated pages. We also need to update any page deleted + * structure in the ref. * - * Only two cases are possible. First: the state is WT_REF_DELETED. In this case ft_info.del - * cannot be NULL yet because an uncommitted operation cannot have reached global visibility. - * Otherwise: there is an uncommitted delete operation we're handling, so the page can't be in a - * non-deleted state, and the tree can't be readonly. Therefore the page must have been + * Only two cases are possible. First: the state is WT_REF_DELETED. In this case page_del cannot + * be NULL yet because an uncommitted operation cannot have reached global visibility. (Or at + * least, global visibility in the sense we need to use it for truncations, in which prepared + * and uncommitted transactions are not visible.) 
+ * + * Otherwise: there is an uncommitted delete operation we're handling, so the page must have + * been deleted at some point, and the tree can't be readonly. Therefore the page must have been * instantiated, the state must be WT_REF_MEM, and there should be an update list in - * ft_info.update. (But just in case, allow the update list to be null. Perhaps the page was - * truncated when all items on it were already deleted, so no tombstones were created during - * instantiation.) + * mod->inst_updates. (But just in case, allow the update list to be null.) There might be a + * non-null page_del structure to update, depending on whether the page has been reconciled + * since it was deleted and then instantiated. */ - if (previous_state == WT_REF_DELETED) - page_del = ref->ft_info.del; - else { - if ((updp = ref->ft_info.update) != NULL) + if (previous_state != WT_REF_DELETED) { + WT_ASSERT(session, previous_state == WT_REF_MEM); + WT_ASSERT(session, ref->page != NULL && ref->page->modify != NULL); + if ((updp = ref->page->modify->inst_updates) != NULL) for (; *updp != NULL; ++updp) { (*updp)->start_ts = ts; /* @@ -257,9 +260,8 @@ __wt_txn_op_delete_apply_prepare_state(WT_SESSION_IMPL *session, WT_REF *ref, bo if (commit) (*updp)->durable_ts = txn->durable_timestamp; } - WT_ASSERT(session, ref->page != NULL && ref->page->modify != NULL); - page_del = ref->page->modify->page_del; } + page_del = ref->page_del; if (page_del != NULL) { page_del->timestamp = ts; if (commit) @@ -289,28 +291,31 @@ __wt_txn_op_delete_commit_apply_timestamps(WT_SESSION_IMPL *session, WT_REF *ref /* * Timestamps are in the page deleted structure for truncates, or in the updates in the case of - * instantiated pages. Both commit and durable timestamps need to be updated. + * instantiated pages. We also need to update any page deleted structure in the ref. Both commit + * and durable timestamps need to be updated. * - * Only two cases are possible. First: the state is WT_REF_DELETED. 
In this case ft_info.del - * cannot be NULL yet because an uncommitted operation cannot have reached global visibility. - * Otherwise: there is an uncommitted delete operation we're handling, so the page can't be in a - * non-deleted state, and the tree can't be readonly. Therefore the page must have been + * Only two cases are possible. First: the state is WT_REF_DELETED. In this case page_del cannot + * be NULL yet because an uncommitted operation cannot have reached global visibility. (Or at + * least, global visibility in the sense we need to use it for truncations, in which prepared + * and uncommitted transactions are not visible.) + * + * Otherwise: there is an uncommitted delete operation we're handling, so the page must have + * been deleted at some point, and the tree can't be readonly. Therefore the page must have been * instantiated, the state must be WT_REF_MEM, and there should be an update list in - * ft_info.update. (But just in case, allow the update list to be null. Perhaps the page was - * truncated when all items on it were already deleted, so no tombstones were created during - * instantiation.) + * mod->inst_updates. (But just in case, allow the update list to be null.) There might be a + * non-null page_del structure to update, depending on whether the page has been reconciled + * since it was deleted and then instantiated. 
*/ - if (previous_state == WT_REF_DELETED) - page_del = ref->ft_info.del; - else { - if ((updp = ref->ft_info.update) != NULL) + if (previous_state != WT_REF_DELETED) { + WT_ASSERT(session, previous_state == WT_REF_MEM); + WT_ASSERT(session, ref->page != NULL && ref->page->modify != NULL); + if ((updp = ref->page->modify->inst_updates) != NULL) for (; *updp != NULL; ++updp) { (*updp)->start_ts = txn->commit_timestamp; (*updp)->durable_ts = txn->durable_timestamp; } - WT_ASSERT(session, ref->page != NULL && ref->page->modify != NULL); - page_del = ref->page->modify->page_del; } + page_del = ref->page_del; if (page_del != NULL && page_del->timestamp == WT_TS_NONE) { page_del->timestamp = txn->commit_timestamp; page_del->durable_timestamp = txn->durable_timestamp; @@ -437,7 +442,7 @@ __wt_txn_modify_page_delete(WT_SESSION_IMPL *session, WT_REF *ref) * This access to the WT_PAGE_DELETED structure is safe; caller has the WT_REF locked, and in * fact just allocated the structure to fill in. */ - ref->ft_info.del->txnid = txn->id; + ref->page_del->txnid = txn->id; __wt_txn_op_set_timestamp(session, op); if (__wt_log_op(session)) diff --git a/src/third_party/wiredtiger/src/reconcile/rec_child.c b/src/third_party/wiredtiger/src/reconcile/rec_child.c index 03b07757377..229413a6d48 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_child.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_child.c @@ -13,10 +13,14 @@ * Handle pages with leaf pages in the WT_REF_DELETED state. 
*/ static int -__rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, - WT_PAGE_DELETED *page_del, WT_CHILD_MODIFY_STATE *cmsp) +__rec_child_deleted( + WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, WT_CHILD_MODIFY_STATE *cmsp) { + WT_PAGE_DELETED *page_del; uint8_t prepare_state; + bool visible, visible_all; + + page_del = ref->page_del; cmsp->state = WT_CHILD_IGNORE; @@ -28,6 +32,17 @@ __rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, return (__wt_ref_block_free(session, ref)); /* + * Check visibility. If the truncation is visible to us, we'll also want to know if it's visible + * to everyone. Use the special-case logic in __wt_page_del_visible to hide prepared truncations + * as we can't write them to disk. + */ + if (F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT)) { + visible = __wt_page_del_visible(session, page_del, true); + visible_all = visible ? __wt_page_del_visible_all(session, page_del, true) : false; + } else + visible = visible_all = __wt_page_del_visible_all(session, page_del, true); + + /* * The truncate may not yet be visible to us. In that case, we proceed as with any change not * visible during reconciliation by ignoring the change for the purposes of writing the internal * page. @@ -39,7 +54,7 @@ __rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, * been written to disk yet; if the page gets marked clean it might be discarded and then the * truncation is lost. */ - if (!__wt_page_del_visible(session, page_del, !F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT))) { + if (!visible) { if (F_ISSET(r, WT_REC_VISIBILITY_ERR)) WT_RET_PANIC(session, EINVAL, "reconciliation illegally skipped an update"); /* @@ -89,20 +104,22 @@ __rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, } /* - * Deal with underlying disk blocks. 
If there are readers that might want to see the page's - * state before it's deleted, or the fast-delete can be undone by RTS, we can't discard the - * pages. Write a cell to the internal page with information describing the fast-delete. + * If there are readers that might want to see the page's state before it's deleted, or the + * fast-delete can be undone by RTS, we can't discard the pages. Write a cell to the internal + * page with information describing the fast-delete. * * We have the WT_REF locked, but that lock is released before returning to the function writing * cells to the page. Copy out the current fast-truncate information for that function. */ - if (!__wt_page_del_visible(session, page_del, true)) { + if (!visible_all) { cmsp->del = *page_del; cmsp->state = WT_CHILD_PROXY; return (0); } /* + * Deal with underlying disk blocks. + * * Globally visible truncate, discard the leaf page to the block manager and no cell needs to be * written. Done outside of the underlying tracking routines because this action is permanent * and irrevocable. (Clearing the address means we've lost track of the disk address in a @@ -112,13 +129,8 @@ __rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, */ WT_RET(__wt_ref_block_free(session, ref)); - /* - * Globally visible fast-truncate information is never used again, a NULL value is identical. - * Fast-truncate information in the page-modify structure can be used more than once if this - * reconciliation of the internal page were to fail. - */ - if (page_del == ref->ft_info.del) - __wt_overwrite_and_free(session, ref->ft_info.del); + /* Globally visible fast-truncate information is never used again, a NULL value is identical. */ + __wt_overwrite_and_free(session, ref->page_del); return (0); } @@ -157,7 +169,7 @@ __wt_rec_child_modify( // 9417 IGNORE WT_ASSERT(session, ref->addr != NULL); /* DISK pages do not have fast-truncate info. 
*/ - WT_ASSERT(session, ref->ft_info.del == NULL); + WT_ASSERT(session, ref->page_del == NULL); goto done; case WT_REF_DELETED: @@ -170,7 +182,7 @@ __wt_rec_child_modify( */ if (!WT_REF_CAS_STATE(session, ref, WT_REF_DELETED, WT_REF_LOCKED)) break; - ret = __rec_child_deleted(session, r, ref, ref->ft_info.del, cmsp); + ret = __rec_child_deleted(session, r, ref, cmsp); WT_REF_SET_STATE(ref, WT_REF_DELETED); goto done; @@ -217,15 +229,20 @@ __wt_rec_child_modify( * Set WT_READ_NO_WAIT because we're only interested in the WT_REF's final state. Pages * in transition might change WT_REF state during our read, and then return WT_NOTFOUND * to us. In that case, loop and look again. + * + * If we retried from below this point and already have a hazard pointer, don't do it + * again. */ - ret = __wt_page_in( - session, ref, WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT); - if (ret == WT_NOTFOUND) { - ret = 0; - break; + if (cmsp->hazard == false) { + ret = __wt_page_in(session, ref, + WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT); + if (ret == WT_NOTFOUND) { + ret = 0; + break; + } + WT_RET(ret); + cmsp->hazard = true; } - WT_RET(ret); - cmsp->hazard = true; /* * The child is potentially modified if the page's modify structure has been created. If @@ -252,17 +269,32 @@ __wt_rec_child_modify( * Depending on visibility, we may need to write the original page, or write a proxy * (deleted-address) cell with the pre-instantiation page-delete information, or we may * be able to ignore the page entirely. We keep the original fast-truncate information - * in the modify structure after instantiation to make the visibility check possible. + * in the ref after instantiation to make the visibility check possible. * * The key is the page-modify.instantiated flag, removed during page reconciliation. 
If * it's set, instantiation happened after checkpoint passed the leaf page and we treat * this page like a WT_REF_DELETED page, evaluating it as it was before instantiation. * - * We do not need additional locking: with a hazard pointer the page can't be evicted, - * and reconciliation is the only thing that can clear the page-modify info. + * We need to lock the ref for it to be safe to examine the page_del structure, in case + * the transaction in it is unresolved and tries to roll back (which discards the + * structure) while we're looking at it. It should be possible to skip the locking if + * the instantiation update list is NULL (that means the transaction is resolved) but + * for now let's do the conservatively safe thing. */ if (mod != NULL && mod->instantiated) { - WT_RET(__rec_child_deleted(session, r, ref, mod->page_del, cmsp)); + if (!WT_REF_CAS_STATE(session, ref, WT_REF_MEM, WT_REF_LOCKED)) + /* Oops. Retry... */ + break; + + /* This is a very small race window, but check just in case. */ + if (mod->instantiated == false) { + WT_REF_SET_STATE(ref, WT_REF_MEM); + /* Retry from the top; we may now have a rec_result. */ + break; + } + + WT_RET(__rec_child_deleted(session, r, ref, cmsp)); + WT_REF_SET_STATE(ref, WT_REF_MEM); goto done; } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_col.c b/src/third_party/wiredtiger/src/reconcile/rec_col.c index bd5b3d12f62..34f6615bc05 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_col.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_col.c @@ -1672,8 +1672,11 @@ next: * - There were invisible updates, because then the page isn't really empty. Also, at least * for now if we try to restore updates to an empty page col_modify will trip on its * shoelaces. + * - We wrote no cells at all. 
This can happen if a page with no cells and no append list + * entries at all (not just one with no or only aborted updates) gets marked dirty somehow + * and reconciled; this is apparently possible in some circumstances. */ - if (!wrote_real_values && salvage == NULL && r->leave_dirty == false) { + if (!wrote_real_values && salvage == NULL && r->leave_dirty == false && r->entries > 0) { WT_ASSERT(session, r->entries == 1); r->entries = 0; WT_STAT_CONN_DATA_INCR(session, rec_vlcs_emptied_pages); diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 1e9cfba872f..2c26d83adf7 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -2397,12 +2397,14 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_REF *ref; WT_TIME_AGGREGATE ta; uint32_t i; + uint8_t previous_ref_state; btree = S2BT(session); bm = btree->bm; mod = page->modify; ref = r->ref; WT_TIME_AGGREGATE_INIT(&ta); + previous_ref_state = 0; /* * If using the history store table eviction path and we found updates that weren't globally @@ -2558,10 +2560,42 @@ split: break; } - /* If the page has post-instantiation delete information, we don't need it any more. */ + /* + * If the page has post-instantiation delete information, we don't need it any more. Note: this + * is the only place in the system that potentially touches ref->page_del without locking the + * ref. There are two other pieces of code it can interact with: transaction rollback and parent + * internal page reconciliation. We use __wt_free_page_del here and in transaction rollback to + * make the deletion atomic. 
Reconciliation of the parent is locked out for the following + * reasons: first, if we are evicting the leaf here, eviction has the ref locked, and the parent + * will wait for it; and if we are checkpointing the leaf, we can't simultaneously be + * checkpointing the parent, and we can't be evicting the parent either because internal pages + * can't be evicted while they have in-memory children. + */ if (mod->instantiated) { - mod->instantiated = false; - __wt_free(session, mod->page_del); + /* + * Unfortunately, it seems we need to lock the ref at this point. Ultimately the page_del + * structure and the instantiated flag need to both be cleared simultaneously (otherwise + * instantiated == false and page_del not NULL violates the intended invariant and other + * code can assert) and there are several other places that can still be interacting with + * the page_del structure at this point (even though the page has been instantiated) and we + * need to wait for those to finish before discarding it. + * + * Note: if we're in eviction, the ref is already locked. + */ + if (!F_ISSET(r, WT_REC_EVICT)) { + WT_REF_LOCK(session, ref, &previous_ref_state); + WT_ASSERT(session, previous_ref_state == WT_REF_MEM); + } else + WT_ASSERT(session, ref->state == WT_REF_LOCKED); + + /* Check the instantiated flag again in case it got cleared while we waited. */ + if (mod->instantiated) { + mod->instantiated = false; + __wt_free(session, ref->page_del); + } + + if (!F_ISSET(r, WT_REC_EVICT)) + WT_REF_UNLOCK(ref, previous_ref_state); } return (0); diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index e91b435beb2..cfdf7c522ff 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -1721,16 +1721,19 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) /* * Only two cases are possible. First: the state is WT_REF_DELETED. 
In this case - * ft_info.del cannot be NULL yet because an uncommitted operation cannot have reached + * page_del cannot be NULL yet because an uncommitted operation cannot have reached * global visibility. Otherwise: there is an uncommitted delete operation we're * handling, so the page can't be in a non-deleted state, and the tree can't be * readonly. Therefore the page must have been instantiated, the state must be - * WT_REF_MEM, and there should be an update list in ft_info.update. + * WT_REF_MEM, and there should be an update list in modify->inst_updates. There may + * also be a non-NULL page_del to update. */ - if (previous_state == WT_REF_DELETED) - op->u.ref->ft_info.del->committed = true; - else - __wt_free(session, op->u.ref->ft_info.update); + if (previous_state != WT_REF_DELETED) { + WT_ASSERT(session, op->u.ref->page != NULL && op->u.ref->page->modify != NULL); + __wt_free(session, op->u.ref->page->modify->inst_updates); + } + if (op->u.ref->page_del != NULL) + op->u.ref->page_del->committed = true; WT_REF_UNLOCK(op->u.ref, previous_state); } __wt_txn_op_free(session, op); diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index 9d8fe657a15..0a17f020b05 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -1131,8 +1131,11 @@ __rollback_page_needs_abort( * is greater than or equal to recovered checkpoint snapshot min: * 1. The reconciled replace page max durable timestamp. * 2. The reconciled multi page max durable timestamp. - * 3. The on page address max durable timestamp. - * 4. The off page address max durable timestamp. + * 3. For just-instantiated deleted pages that have not otherwise been modified, the durable + * timestamp in the page delete information. This timestamp isn't reflected in the address's + * time aggregate. + * 4. The on page address max durable timestamp. 
+ * 5. The off page address max durable timestamp. */ if (mod != NULL && mod->rec_result == WT_PM_REC_REPLACE) { tag = "reconciled replace block"; @@ -1149,6 +1152,15 @@ __rollback_page_needs_abort( prepared = true; } result = (durable_ts > rollback_timestamp) || prepared; + } else if (mod != NULL && mod->instantiated && !__wt_page_is_modified(ref->page) && + ref->page_del != NULL) { + tag = "page_del info"; + durable_ts = ref->page_del->durable_timestamp; + prepared = ref->page_del->prepare_state == WT_PREPARE_INPROGRESS || + ref->page_del->prepare_state == WT_PREPARE_LOCKED; + newest_txn = ref->page_del->txnid; + result = (durable_ts > rollback_timestamp) || prepared || + WT_CHECK_RECOVERY_FLAG_TXNID(session, newest_txn); } else if (!__wt_off_page(ref->home, addr)) { tag = "on page cell"; /* Check if the page is obsolete using the page disk address. */ @@ -1252,11 +1264,20 @@ __rollback_to_stable_page_skip( */ if (ref->state == WT_REF_DELETED && WT_REF_CAS_STATE(session, ref, WT_REF_DELETED, WT_REF_LOCKED)) { - page_del = ref->ft_info.del; + page_del = ref->page_del; if (page_del == NULL || (__rollback_txn_visible_id(session, page_del->txnid) && - page_del->durable_timestamp <= rollback_timestamp)) + page_del->durable_timestamp <= rollback_timestamp)) { + /* + * We should never see a prepared truncate here; not at recovery time because prepared + * truncates can't be written to disk, and not during a runtime RTS either because it + * should not be possible to do that with an unresolved prepared transaction. 
+ */ + WT_ASSERT(session, + page_del == NULL || page_del->prepare_state == WT_PREPARE_INIT || + page_del->prepare_state == WT_PREPARE_RESOLVED); *skipp = true; + } WT_REF_SET_STATE(ref, WT_REF_DELETED); return (0); } diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint06.py b/src/third_party/wiredtiger/test/suite/test_checkpoint06.py index 5689af7f9af..dcbae5386b7 100644 --- a/src/third_party/wiredtiger/test/suite/test_checkpoint06.py +++ b/src/third_party/wiredtiger/test/suite/test_checkpoint06.py @@ -78,8 +78,6 @@ class test_checkpoint06(wttest.WiredTigerTestCase): self.session.begin_transaction() start = self.session.open_cursor(self.uri) start.set_key(5) - end = self.session.open_cursor(self.uri) - end.set_key(9995) self.session.truncate(None, start, None, None) if self.prepare: self.session.prepare_transaction('prepare_timestamp=' + self.timestamp_str(3)) @@ -95,7 +93,7 @@ class test_checkpoint06(wttest.WiredTigerTestCase): ',stable_timestamp=' + self.timestamp_str(4)) cursor = self.session.open_cursor(self.uri_evict) - # Insert some more data to trigger eviction + # Insert some more data into another table to trigger eviction for i in range(1, nrows + 1): self.session.begin_transaction() cursor[i] = value diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable34.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable34.py index e72b85f5b52..aae9047a7f8 100644 --- a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable34.py +++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable34.py @@ -32,9 +32,9 @@ from wiredtiger import stat, WT_NOTFOUND from wtdataset import SimpleDataSet from wtscenario import make_scenarios -# test_rollback_to_stable33.py +# test_rollback_to_stable34.py # Test interaction between fast-delete and RTS. 
-class test_rollback_to_stable33(test_rollback_to_stable_base): +class test_rollback_to_stable34(test_rollback_to_stable_base): session_config = 'isolation=snapshot' conn_config = 'cache_size=50MB,statistics=(all),log=(enabled=false)' @@ -104,7 +104,7 @@ class test_rollback_to_stable33(test_rollback_to_stable_base): nrows = 10000 # Create a table without logging. - uri = "table:rollback_to_stable33" + uri = "table:rollback_to_stable34" ds = SimpleDataSet( self, uri, 0, key_format=self.key_format, value_format=self.value_format, config='log=(enabled=false)' + self.extraconfig) diff --git a/src/third_party/wiredtiger/test/suite/test_truncate16.py b/src/third_party/wiredtiger/test/suite/test_truncate16.py new file mode 100644 index 00000000000..96f94b79b1c --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_truncate16.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python +# +# Public Domain 2014-present MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wttest +from helper import simulate_crash_restart +from wiredtiger import stat, WiredTigerError, wiredtiger_strerror, WT_NOTFOUND, WT_ROLLBACK +from wtdataset import SimpleDataSet +from wtscenario import make_scenarios + +# test_truncate16.py +# +# Make sure that no shenanigans occur if we try to read from a page that's been +# fast-truncated by a prepared transaction. + +class test_truncate16(wttest.WiredTigerTestCase): + conn_config = 'statistics=(all)' + session_config = 'isolation=snapshot' + + # Hook to run using remove instead of truncate for reference. This should not alter the + # behavior... but may if things are broken. Disable the reference version by default as it's + # only useful when investigating behavior changes. This list is first in the make_scenarios + # call so the additional cases don't change the scenario numbering. 
+ trunc_values = [ + ('truncate', dict(trunc_with_remove=False)), + #('remove', dict(trunc_with_remove=True)), + ] + format_values = [ + ('column', dict(key_format='r', value_format='S', extraconfig='')), + ('column_fix', dict(key_format='r', value_format='8t', + extraconfig=',allocation_size=512,leaf_page_max=512')), + ('integer_row', dict(key_format='i', value_format='S', extraconfig='')), + ] + checkpoint_values = [ + ('no_checkpoint', dict(do_checkpoint=False)), + ('checkpoint', dict(do_checkpoint=True)), + ] + scenarios = make_scenarios(trunc_values, format_values, checkpoint_values) + + def truncate(self, session, uri, make_key, keynum1, keynum2): + if self.trunc_with_remove: + cursor = session.open_cursor(uri) + err = 0 + for k in range(keynum1, keynum2 + 1): + cursor.set_key(k) + try: + err = cursor.remove() + except WiredTigerError as e: + if wiredtiger_strerror(WT_ROLLBACK) in str(e): + err = WT_ROLLBACK + else: + raise e + if err != 0: + break + cursor.close() + else: + lo_cursor = session.open_cursor(uri) + hi_cursor = session.open_cursor(uri) + lo_cursor.set_key(make_key(keynum1)) + hi_cursor.set_key(make_key(keynum2)) + try: + err = session.truncate(None, lo_cursor, hi_cursor, None) + except WiredTigerError as e: + if wiredtiger_strerror(WT_ROLLBACK) in str(e): + err = WT_ROLLBACK + else: + raise e + lo_cursor.close() + hi_cursor.close() + return err + + def test_truncate16(self): + nrows = 10000 + + # Create a table. + uri = "table:truncate16" + ds = SimpleDataSet( + self, uri, 0, key_format=self.key_format, value_format=self.value_format, + config=self.extraconfig) + ds.populate() + + if self.value_format == '8t': + value_a = 97 + value_b = 98 + else: + value_a = "aaaaa" * 100 + value_b = "bbbbb" * 100 + + # Pin oldest and stable timestamps to 1. + self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(1) + + ',stable_timestamp=' + self.timestamp_str(1)) + + # Write some baseline data at time 10. 
+ cursor = self.session.open_cursor(ds.uri) + self.session.begin_transaction() + for i in range(1, nrows + 1): + cursor[ds.key(i)] = value_a + if i % 487 == 0: + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(10)) + self.session.begin_transaction() + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(10)) + cursor.close() + + # Mark it stable. + self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(10)) + + # Reopen the connection so nothing is in memory and we can fast-truncate. + self.reopen_conn() + + # Make a session to prepare in. + session2 = self.conn.open_session() + + # Truncate the middle of the table. + # + # Prepare the truncate at time 20 and leave it hanging. + session2.begin_transaction() + err = self.truncate(session2, ds.uri, ds.key, nrows // 4 + 1, 3 * nrows // 4) + self.assertEqual(err, 0) + session2.prepare_transaction('prepare_timestamp=' + self.timestamp_str(20)) + + # Make sure we did at least one fast-delete. (Unless we specifically didn't want to, + # or running on FLCS where it isn't supported.) + stat_cursor = self.session.open_cursor('statistics:', None, None) + fastdelete_pages = stat_cursor[stat.conn.rec_page_delete_fast][2] + if self.value_format == '8t' or self.trunc_with_remove: + self.assertEqual(fastdelete_pages, 0) + else: + self.assertGreater(fastdelete_pages, 0) + stat_cursor.close() + + # Optionally checkpoint at this stage, just in case it breaks or trips on + # the prepared truncation. + if self.do_checkpoint: + self.session.checkpoint() + + # Now read from the truncated region. This should give WT_PREPARE_CONFLICT. 
+ cursor = self.session.open_cursor(ds.uri) + self.session.begin_transaction('read_timestamp=' + self.timestamp_str(30)) + cursor.set_key(nrows // 2) + self.assertRaisesException(WiredTigerError, + lambda: cursor.search(), + exceptionString='/conflict with a prepared update/') + + # It should have instantiated the page under the key we read, and nothing else. + # (But not if we weren't fast-deleting.) + stat_cursor = self.session.open_cursor('statistics:', None, None) + read_deleted = stat_cursor[stat.conn.cache_read_deleted][2] + if self.value_format == '8t' or self.trunc_with_remove: + self.assertEqual(read_deleted, 0) + else: + self.assertEqual(read_deleted, 1) + stat_cursor.close() + + # Now toss the prepared transaction, and just for kicks make sure we can read the whole + # table. + session2.rollback_transaction() + + for i in range(1, nrows + 1): + cursor.next() + self.assertEqual(cursor.get_key(), ds.key(i)) + self.assertEqual(cursor.get_value(), value_a) + self.session.rollback_transaction() + + # Unlike RTS, transaction rollback should not instantiate pages, so the number of + # instantiated pages should remain 1. + stat_cursor = self.session.open_cursor('statistics:', None, None) + read_deleted_still = stat_cursor[stat.conn.cache_read_deleted][2] + self.assertEqual(read_deleted_still, read_deleted) + stat_cursor.close() + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_truncate17.py b/src/third_party/wiredtiger/test/suite/test_truncate17.py new file mode 100644 index 00000000000..a3b4f7d1df6 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_truncate17.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python +# +# Public Domain 2014-present MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. 
+# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wttest +from helper import simulate_crash_restart +from wiredtiger import stat, WiredTigerError, wiredtiger_strerror, WT_NOTFOUND, WT_ROLLBACK +from wtdataset import SimpleDataSet +from wtscenario import make_scenarios + +# test_truncate17.py +# +# Make sure that no shenanigans occur if we try to read from a page that's been +# fast-truncated by a prepared transaction. + +class test_truncate17(wttest.WiredTigerTestCase): + conn_config = 'statistics=(all)' + session_config = 'isolation=snapshot' + + # Hook to run using remove instead of truncate for reference. This should not alter the + # behavior... but may if things are broken. Disable the reference version by default as it's + # only useful when investigating behavior changes. This list is first in the make_scenarios + # call so the additional cases don't change the scenario numbering. 
+ trunc_values = [ + ('truncate', dict(trunc_with_remove=False)), + #('remove', dict(trunc_with_remove=True)), + ] + format_values = [ + ('column', dict(key_format='r', value_format='S', extraconfig='')), + ('column_fix', dict(key_format='r', value_format='8t', + extraconfig=',allocation_size=512,leaf_page_max=512')), + ('integer_row', dict(key_format='i', value_format='S', extraconfig='')), + ] + checkpoint_values = [ + ('no_checkpoint', dict(do_checkpoint=False)), + ('checkpoint', dict(do_checkpoint=True)), + ] + scenarios = make_scenarios(trunc_values, format_values, checkpoint_values) + + def stat_tree(self, uri): + statscursor = self.session.open_cursor('statistics:' + uri, None, 'statistics=(all)') + + entries = statscursor[stat.dsrc.btree_entries][2] + if self.value_format == '8t': + leaf_pages = statscursor[stat.dsrc.btree_column_fix][2] + internal_pages = statscursor[stat.dsrc.btree_column_internal][2] + elif self.key_format == 'r': + leaf_pages = statscursor[stat.dsrc.btree_column_variable][2] + internal_pages = statscursor[stat.dsrc.btree_column_internal][2] + else: + leaf_pages = statscursor[stat.dsrc.btree_row_leaf][2] + internal_pages = statscursor[stat.dsrc.btree_row_internal][2] + + return (entries, (leaf_pages, internal_pages)) + + def truncate(self, session, uri, make_key, keynum1, keynum2): + if self.trunc_with_remove: + cursor = session.open_cursor(uri) + err = 0 + for k in range(keynum1, keynum2 + 1): + cursor.set_key(k) + try: + err = cursor.remove() + except WiredTigerError as e: + if wiredtiger_strerror(WT_ROLLBACK) in str(e): + err = WT_ROLLBACK + else: + raise e + if err != 0: + break + cursor.close() + else: + lo_cursor = session.open_cursor(uri) + hi_cursor = session.open_cursor(uri) + lo_cursor.set_key(make_key(keynum1)) + hi_cursor.set_key(make_key(keynum2)) + try: + err = session.truncate(None, lo_cursor, hi_cursor, None) + except WiredTigerError as e: + if wiredtiger_strerror(WT_ROLLBACK) in str(e): + err = WT_ROLLBACK + else: + 
raise e + lo_cursor.close() + hi_cursor.close() + return err + + def test_truncate17(self): + nrows = 10000 + + # Create a table. + uri = "table:truncate17" + ds = SimpleDataSet( + self, uri, 0, key_format=self.key_format, value_format=self.value_format, + config=self.extraconfig) + ds.populate() + + if self.value_format == '8t': + value_a = 97 + value_b = 98 + else: + value_a = "aaaaa" * 100 + value_b = "bbbbb" * 100 + + # Pin oldest and stable timestamps to 1. + self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(1) + + ',stable_timestamp=' + self.timestamp_str(1)) + + # Write some baseline data at time 10. + cursor = self.session.open_cursor(ds.uri) + self.session.begin_transaction() + for i in range(1, nrows + 1): + cursor[ds.key(i)] = value_a + if i % 487 == 0: + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(10)) + self.session.begin_transaction() + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(10)) + cursor.close() + + # Mark it stable. + self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(10)) + + # Reopen the connection so as to stat the on-disk version of the tree. + self.reopen_conn() + + # Stat the tree to get a baseline. + (base_entries, base_pages) = self.stat_tree(uri) + self.assertEqual(base_entries, nrows) + + # Reopen the connection again so nothing is in memory and we can fast-truncate. + self.reopen_conn() + + # Make a session to prepare in. + session2 = self.conn.open_session() + + # Truncate the middle of the table. + # + # Prepare the truncate at time 20 and leave it hanging. + session2.begin_transaction() + err = self.truncate(session2, ds.uri, ds.key, nrows // 4 + 1, 3 * nrows // 4) + self.assertEqual(err, 0) + session2.prepare_transaction('prepare_timestamp=' + self.timestamp_str(20)) + + # Make sure we did at least one fast-delete. (Unless we specifically didn't want to, + # or running on FLCS where it isn't supported.) 
+ stat_cursor = self.session.open_cursor('statistics:', None, None) + fastdelete_pages = stat_cursor[stat.conn.rec_page_delete_fast][2] + if self.value_format == '8t' or self.trunc_with_remove: + self.assertEqual(fastdelete_pages, 0) + else: + self.assertGreater(fastdelete_pages, 0) + stat_cursor.close() + + # Optionally checkpoint at this stage, just in case it breaks or trips on + # the prepared truncation. + if self.do_checkpoint: + self.session.checkpoint() + + # Stat the tree again. Stats are not transactional, and are effectively + # read-uncommitted; we should see the results of the prepared truncate. + # However, the truncated pages aren't actually gone yet, so the page counts + # shouldn't change. + (entries, pages) = self.stat_tree(uri) + if self.value_format == '8t': + self.assertEqual(entries, nrows) + else: + self.assertEqual(entries, nrows // 2) + self.assertEqual(pages, base_pages) + + # This should instantiate all the deleted pages. + stat_cursor = self.session.open_cursor('statistics:', None, None) + read_deleted = stat_cursor[stat.conn.cache_read_deleted][2] + self.assertEqual(read_deleted, fastdelete_pages) + stat_cursor.close() + + # Now toss the prepared transaction. + session2.rollback_transaction() + + # Unlike RTS, transaction rollback should not instantiate pages, plus there are + # no more deleted pages to instantiate, so the number of instantiated pages should + # remain unchanged. 
+ stat_cursor = self.session.open_cursor('statistics:', None, None) + read_deleted = stat_cursor[stat.conn.cache_read_deleted][2] + self.assertEqual(read_deleted, fastdelete_pages) + stat_cursor.close() + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_truncate18.py b/src/third_party/wiredtiger/test/suite/test_truncate18.py new file mode 100644 index 00000000000..0e971ed389a --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_truncate18.py @@ -0,0 +1,207 @@ +#!/usr/bin/env python +# +# Public Domain 2014-present MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+ +import wttest +from helper import simulate_crash_restart +from wiredtiger import stat, WiredTigerError, wiredtiger_strerror, WT_NOTFOUND, WT_ROLLBACK +from wtdataset import SimpleDataSet +from wtscenario import make_scenarios + +# test_truncate18.py +# +# The optimization that replaces deleted pages full of obsolete values with physically +# empty pages can cause problems, because for some purposes the empty page is not +# equivalent. +# +# In particular, the key order checks in row-store verify depend on the keys being +# physically present, and loading an empty page defeats that. This is more or less +# harmless except in the case of the leftmost leaf page, whose keys are used to +# initialize the check. +# +# It is not entirely trivial to reach the failure state, because the page under the start +# point of a truncate is never fast-truncated and that in turn means the leftmost page of +# the tree is never fast-truncated. Consequently, to get a deleted leftmost leaf we must +# truncate a range at the beginning of the tree and then cause at least the first page of the +# range to be discarded while keeping some of the rest of it. +# +# The only way I've thought of to do this is to truncate a range that spans more than one +# internal page. Then the first internal page of the range can be reconciled (required to +# discard the non-deleted leftmost page) without discarding the whole truncated range. +# +# Consequently we crank down internal_page_max to avoid needing an excessively large test. +# +# Then we set things up so that the truncation becomes globally visible and run verify. +# That currently asserts. The fix for this is likely to disable the optimization when in +# verify, so the only real purpose of this test is to prevent the behavior from regressing. +# It is therefore not full of scenarios but specific to this one problem. 

class test_truncate18(wttest.WiredTigerTestCase):
    conn_config = 'statistics=(all)'
    session_config = 'isolation=snapshot'

    # Hook to run using remove instead of truncate for reference. This should not alter the
    # behavior... but may if things are broken. Disable the reference version by default as it's
    # only useful when investigating behavior changes. This list is first in the make_scenarios
    # call so the additional cases don't change the scenario numbering.
    trunc_values = [
        ('truncate', dict(trunc_with_remove=False)),
        #('remove', dict(trunc_with_remove=True)),
    ]
    format_values = [
        ('integer_row', dict(key_format='i', value_format='S', extraconfig='')),
    ]
    scenarios = make_scenarios(trunc_values, format_values)

    # Truncate, from keynum1 to keynum2, inclusive.
    #
    # Runs in its own transaction on self.session, reading at read_ts and committing at
    # commit_ts. make_key converts a key number to a key of the table's key format.
    # Returns 0 on success, WT_ROLLBACK if the operation reported a rollback (in which
    # case the transaction is rolled back rather than committed), or whatever nonzero
    # value cursor.remove returned; any other WiredTigerError is re-raised.
    def truncate(self, uri, make_key, keynum1, keynum2, read_ts, commit_ts):
        self.session.begin_transaction('read_timestamp=' + self.timestamp_str(read_ts))
        err = 0
        if self.trunc_with_remove:
            cursor = self.session.open_cursor(uri)
            try:
                for k in range(keynum1, keynum2 + 1):
                    # Build the key the same way the truncate path does.
                    cursor.set_key(make_key(k))
                    try:
                        err = cursor.remove()
                    except WiredTigerError as e:
                        if wiredtiger_strerror(WT_ROLLBACK) not in str(e):
                            raise
                        err = WT_ROLLBACK
                    if err != 0:
                        break
            finally:
                cursor.close()
        else:
            lo_cursor = self.session.open_cursor(uri)
            hi_cursor = self.session.open_cursor(uri)
            try:
                lo_cursor.set_key(make_key(keynum1))
                hi_cursor.set_key(make_key(keynum2))
                try:
                    err = self.session.truncate(None, lo_cursor, hi_cursor, None)
                except WiredTigerError as e:
                    if wiredtiger_strerror(WT_ROLLBACK) not in str(e):
                        raise
                    err = WT_ROLLBACK
            finally:
                lo_cursor.close()
                hi_cursor.close()

        if err == WT_ROLLBACK:
            # A transaction that failed with WT_ROLLBACK must be rolled back; trying to
            # commit it would raise instead of letting us report the error to the caller.
            self.session.rollback_transaction()
        else:
            self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(commit_ts))
        return err

    def test_truncate18(self):
        # With the small internal pages, 10000 rows is enough. 5000 rows is not.
        nrows = 10000

        # Create a table.
        uri = "table:truncate18"
        ds = SimpleDataSet(
            self, uri, 0, key_format=self.key_format, value_format=self.value_format,
            config='internal_page_max=4096' + self.extraconfig)
        ds.populate()

        if self.value_format == '8t':
            value_a = 97
            value_b = 98
        else:
            value_a = "aaaaa" * 100
            value_b = "bbbbb" * 100

        # Pin oldest and stable timestamps to 1.
        self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(1) +
            ',stable_timestamp=' + self.timestamp_str(1))

        # Write some baseline data at time 10.
        cursor = self.session.open_cursor(uri)
        self.session.begin_transaction()
        for i in range(1, nrows + 1):
            cursor[ds.key(i)] = value_a
            if i % 487 == 0:
                self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(10))
                self.session.begin_transaction()
        self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(10))
        cursor.close()

        # Mark it stable.
        self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(10))

        # Reopen the connection so nothing is in memory and we can fast-truncate.
        self.reopen_conn()

        # Truncate most of the tree, beginning at the first key, at time 20.
        err = self.truncate(ds.uri, ds.key, 1, 7 * nrows // 8, 15, 20)
        self.assertEqual(err, 0)

        # Make sure we did at least one fast-delete. (Unless we specifically didn't want to,
        # or running on FLCS where it isn't supported.)
        stat_cursor = self.session.open_cursor('statistics:', None, None)
        fastdelete_pages = stat_cursor[stat.conn.rec_page_delete_fast][2]
        if self.value_format == '8t' or self.trunc_with_remove:
            self.assertEqual(fastdelete_pages, 0)
        else:
            self.assertGreater(fastdelete_pages, 0)
        stat_cursor.close()

        # Mark all this stable.
        self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(30))

        # Reopen the connection again so everything is purely on disk.
        self.reopen_conn()

        # Age out the baseline data, so the pages we truncated contain entirely obsolete data.
        self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(30))

        # Since we didn't fast-truncate the first page (one can't) we need to get it
        # discarded by forcing it to reconcile empty. This will also discard all the
        # fast-truncated pages that are children of the first internal page. For the
        # test to work we need to have more fast-truncated pages beyond that, but there
        # is no good way to crosscheck if we do.
        cursor = self.session.open_cursor(uri)
        self.session.begin_transaction()
        cursor[ds.key(1)] = value_b
        self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(35))
        self.session.begin_transaction()
        cursor.set_key(ds.key(1))
        self.assertEqual(cursor.remove(), 0)
        self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(40))
        cursor.close()

        # Mark this change stable (and age out the scratch value we wrote) and checkpoint it.
        # This will reconcile the first leaf page and the first internal page, and internal
        # pages above that, but leave the second internal page alone since we did nothing to
        # bring it into memory.
        self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(40))
        self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(40))
        self.session.checkpoint()

        # Reopen the connection yet again.
        self.reopen_conn()

        # Now verify the tree. In the problem scenario described above, this will assert.
        self.session.verify(ds.uri, None)

if __name__ == '__main__':
    wttest.run()