diff options
author | Luke Chen <luke.chen@mongodb.com> | 2021-06-17 15:47:15 +1000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-06-17 06:09:32 +0000 |
commit | 9c0b5cd1e5fa4c2a22fecd3ed9267d3341bc84e4 (patch) | |
tree | 8a459334b2a47b12e9df9008925f4368a43316b7 | |
parent | 3bebe0b8f9c89098586fa2638fd1bb640e516486 (diff) | |
download | mongo-9c0b5cd1e5fa4c2a22fecd3ed9267d3341bc84e4.tar.gz |
Import wiredtiger: b2dce5bb8f2bbecb704537eee9cff61d7dc106a0 from branch mongodb-5.0
ref: 4524d572ff..b2dce5bb8f
for: 5.1.0
WT-7553 Loosen the restrictions around evicting fast-truncate pages to avoid cache-stuck failures
-rw-r--r-- | src/third_party/wiredtiger/import.data | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_delete.c | 162 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_discard.c | 7 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_split.c | 10 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/evict/evict_file.c | 15 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/evict/evict_page.c | 13 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/btmem.h | 18 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/btree_inline.h | 19 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/misc.h | 28 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/txn_inline.h | 92 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/reconcile/rec_child.c | 16 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/txn/txn.c | 49 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c | 4 | ||||
-rw-r--r-- | src/third_party/wiredtiger/test/format/ops.c | 3 |
14 files changed, 233 insertions, 205 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 9356fe181ee..5a963156a53 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-5.0", - "commit": "4524d572ff451edf69aaa70d211946a3b1ccb19f" + "commit": "b2dce5bb8f2bbecb704537eee9cff61d7dc106a0" } diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index a4d82c3d904..acb89293002 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -20,16 +20,16 @@ * state to WT_REF_DELETED. Pages ineligible for this fast path include pages already in the cache, * having overflow items, or requiring history store records. Ineligible pages are read and have * their rows updated/deleted individually. The transaction for the delete operation is stored in - * memory referenced by the WT_REF.page_del field. + * memory referenced by the WT_REF.ft_info.del field. * * Future cursor walks of the tree will skip the deleted page based on the transaction stored for * the delete, but it gets more complicated if a read is done using a random key, or a cursor walk * is done with a transaction where the delete is not visible. In those cases, we read the original * contents of the page. The page-read code notices a deleted page is being read, and as part of the - * read instantiates the contents of the page, creating a WT_UPDATE with a deleted operation, in the - * same transaction as deleted the page. In other words, the read process makes it appear as if the - * page was read and each individual row deleted, exactly as would have happened if the page had - * been in the cache all along. + * read instantiates the contents of the page, creating a WT_UPDATE with a tombstone, in the same + * transaction as deleted the page. 
In other words, the read process makes it appear as if the page + * was read and each individual row deleted, exactly as would have happened if the page had been in + * the cache all along. * * There's an additional complication to support rollback of the page delete. When the page was * marked deleted, a pointer to the WT_REF was saved in the deleting session's transaction list and @@ -39,14 +39,14 @@ * saved/restored during reconciliation and appear on multiple pages, and the WT_REF stored in the * deleting session's transaction list is no longer useful. For this reason, when the page is * instantiated by a read, a list of the WT_UPDATE structures on the page is stored in the - * WT_REF.page_del field, with the transaction ID, that way the session committing/unrolling the - * delete can find all WT_UPDATE structures that require update. + * WT_REF.ft_info.update field, that way the session resolving the delete can find all WT_UPDATE + * structures that require update. * * One final note: pages can also be marked deleted if emptied and evicted. In that case, the WT_REF - * state will be set to WT_REF_DELETED but there will not be any associated WT_REF.page_del field. - * These pages are always skipped during cursor traversal (the page could not have been evicted if - * there were updates that weren't globally visible), and if read is forced to instantiate such a - * page, it simply creates an empty page from scratch. + * state will be set to WT_REF_DELETED but there will not be any associated WT_REF.ft_info.del + * field. These pages are always skipped during cursor traversal (the page could not have been + * evicted if there were updates that weren't globally visible), and if read is forced to + * instantiate such a page, it simply creates an empty page from scratch. 
*/ /* @@ -93,16 +93,12 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) return (0); /* - * If this WT_REF was previously part of a truncate operation, there may be existing page-delete - * information. The structure is only read while the state is locked, free the previous version. - * - * Note: changes have been made, we must publish any state change from this point on. + * There should be no previous page-delete information: if the previous fast-truncate didn't + * instantiate the page, then we'd never get here to do another delete; if the previous fast- + * truncate did instantiate the page, then any fast-truncate information was removed at that + * point and/or when the fast-truncate transaction was resolved. */ - if (ref->page_del != NULL) { - WT_ASSERT(session, ref->page_del->txnid == WT_TXN_ABORTED); - __wt_free(session, ref->page_del->update_list); - __wt_free(session, ref->page_del); - } + WT_ASSERT(session, ref->ft_info.del == NULL); /* * We cannot truncate pages that have overflow key/value items as the overflow blocks have to be @@ -128,8 +124,8 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) WT_ERR(__wt_page_parent_modify_set(session, ref, false)); /* Allocate and initialize the page-deleted structure. */ - WT_ERR(__wt_calloc_one(session, &ref->page_del)); - ref->page_del->previous_state = previous_state; + WT_ERR(__wt_calloc_one(session, &ref->ft_info.del)); + ref->ft_info.del->previous_state = previous_state; WT_ERR(__wt_txn_modify_page_delete(session, ref)); @@ -141,7 +137,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) return (0); err: - __wt_free(session, ref->page_del); + __wt_free(session, ref->ft_info.del); /* Publish the page to its previous state, ensuring visibility. */ WT_REF_SET_STATE(ref, previous_state); @@ -150,7 +146,7 @@ err: /* * __wt_delete_page_rollback -- - * Abort pages that were deleted without being instantiated. + * Abort fast-truncate operations. 
 */ int __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) @@ -160,7 +156,7 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) uint8_t current_state; bool locked; - /* Lock the reference. We cannot access ref->page_del except when locked. */ + /* Lock the reference. We cannot access ref->ft_info.del except when locked. */ for (locked = false, sleep_usecs = yield_count = 0;;) { switch (current_state = ref->state) { case WT_REF_LOCKED: @@ -188,23 +184,25 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) } /* - * If the page is still "deleted", it's as we left it, all we have to do is reset the state. - * - * We can't use the normal read path to get a copy of the page because the session may have - * closed the cursor, we no longer have the reference to the tree required for a hazard pointer. - * We're safe because with unresolved transactions, the page isn't going anywhere. - * - * The page is in an in-memory state, which means it was instantiated at some point. Walk any - * list of update structures and abort them. + * If the page is still "deleted", it's as we left it, simply reset the state. Otherwise, the + * page is in an in-memory state, which means it was instantiated at some point. Walk any list + * of update structures and abort them. We can't use the normal read path to get the pages with + * updates (the original page may have split, so there may be more than one page), because the + * session may have closed the cursor, we no longer have the reference to the tree required for + * a hazard pointer. We're safe since pages with unresolved transactions aren't going anywhere. 
*/ if (current_state == WT_REF_DELETED) - current_state = ref->page_del->previous_state; - else if ((updp = ref->page_del->update_list) != NULL) + current_state = ref->ft_info.del->previous_state; + else if ((updp = ref->ft_info.update) != NULL) for (; *updp != NULL; ++updp) (*updp)->txnid = WT_TXN_ABORTED; - /* Finally mark the truncate aborted */ - ref->page_del->txnid = WT_TXN_ABORTED; + /* + * We didn't set the WT_PAGE_DELETED transaction ID to aborted or discard any WT_UPDATE list, + * instead, we discard both structures entirely, it has the same effect. It's a single call, + * they're a union of two pointers. + */ + __wt_free(session, ref->ft_info.del); WT_REF_SET_STATE(ref, current_state); return (0); @@ -240,16 +238,14 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) skip = !__wt_page_del_active(session, ref, visible_all); /* - * The page_del structure can be freed as soon as the delete is stable: it is only read when the - * ref state is locked. It is worth checking every time we come through because once this is - * freed, we no longer need synchronization to check the ref. + * The fast-truncate structure can be freed as soon as the delete is stable: it is only read + * when the ref state is locked. It is worth checking every time we come through because once + * this is freed, we no longer need synchronization to check the ref. 
*/ - if (skip && ref->page_del != NULL && + if (skip && ref->ft_info.del != NULL && (visible_all || - __wt_txn_visible_all(session, ref->page_del->txnid, ref->page_del->timestamp))) { - __wt_free(session, ref->page_del->update_list); - __wt_free(session, ref->page_del); - } + __wt_txn_visible_all(session, ref->ft_info.del->txnid, ref->ft_info.del->timestamp))) + __wt_overwrite_and_free(session, ref->ft_info.del); WT_REF_SET_STATE(ref, WT_REF_DELETED); return (skip); @@ -296,12 +292,14 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) WT_PAGE_DELETED *page_del; WT_ROW *rip; WT_TIME_WINDOW tw; - WT_UPDATE **upd_array, *upd; + WT_UPDATE **upd_array, **update_list, *upd; size_t size, total_size; uint32_t count, i; btree = S2BT(session); page = ref->page; + page_del = NULL; + update_list = NULL; WT_STAT_CONN_DATA_INCR(session, cache_read_deleted); @@ -315,7 +313,15 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) if (!F_ISSET(btree, WT_BTREE_READONLY)) __wt_page_modify_set(session, page); - if (ref->page_del != NULL && ref->page_del->prepare_state != WT_PREPARE_INIT) + /* + * Allocate the per-page update array if one doesn't already exist. (It might already exist + * because deletes are instantiated after the history store table updates.) + */ + if (page->entries != 0 && page->modify->mod_row_update == NULL) + WT_PAGE_ALLOC_AND_SWAP( + session, page, page->modify->mod_row_update, upd_array, page->entries); + + if (ref->ft_info.del != NULL && ref->ft_info.del->prepare_state != WT_PREPARE_INIT) WT_STAT_CONN_DATA_INCR(session, cache_read_deleted_prepared); /* @@ -332,26 +338,16 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) * in the system forced us to keep the old version of the page around, then we crashed and * recovered or we're running inside a checkpoint, and now we're being forced to read that page. 
* - * Expect a page-deleted structure if there's a running transaction that needs to be resolved, - * otherwise, there may not be one (and, if the transaction has resolved, we can ignore the - * page-deleted structure). - */ - page_del = __wt_page_del_active(session, ref, true) ? ref->page_del : NULL; - - /* - * Allocate the per-page update array if one doesn't already exist. (It might already exist - * because deletes are instantiated after the history store table updates.) - */ - if (page->entries != 0 && page->modify->mod_row_update == NULL) - WT_PAGE_ALLOC_AND_SWAP( - session, page, page->modify->mod_row_update, upd_array, page->entries); - - /* - * Allocate the per-reference update array; in the case of instantiating a page deleted in a - * running transaction, we need a list of the update structures for the eventual commit or - * abort. + * If there's a page-deleted structure that's not yet globally visible, get a reference and + * migrate transaction ID and timestamp information to the updates (globally visible means the + * updates don't require that information). + * + * If the truncate operation is not yet resolved, link updates in the page-deleted structure so + * they can be found when the transaction is aborted or committed, even if they have moved to + * other pages. */ - if (page_del != NULL) { + page_del = __wt_page_del_active(session, ref, true) ? ref->ft_info.del : NULL; + if (page_del != NULL && page_del->committed == 0) { count = 0; if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL) WT_SKIP_FOREACH (ins, insert) @@ -362,12 +358,11 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) WT_SKIP_FOREACH (ins, insert) ++count; } - WT_RET(__wt_calloc_def(session, count + 1, &page_del->update_list)); - __wt_cache_page_inmem_incr(session, page, (count + 1) * sizeof(page_del->update_list)); + WT_RET(__wt_calloc_def(session, count + 1, &update_list)); } /* Walk the page entries, giving each one a tombstone. 
*/ - size = total_size = 0; + total_size = size = 0; count = 0; upd_array = page->modify->mod_row_update; if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL) @@ -377,8 +372,8 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) upd->next = ins->upd; ins->upd = upd; - if (page_del != NULL) - page_del->update_list[count++] = upd; + if (update_list != NULL) + update_list[count++] = upd; } WT_ROW_FOREACH (page, rip, i) { /* @@ -392,8 +387,8 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) upd->next = upd_array[WT_ROW_SLOT(page, rip)]; upd_array[WT_ROW_SLOT(page, rip)] = upd; - if (page_del != NULL) - page_del->update_list[count++] = upd; + if (update_list != NULL) + update_list[count++] = upd; if ((insert = WT_ROW_INSERT(page, rip)) != NULL) WT_SKIP_FOREACH (ins, insert) { @@ -402,23 +397,24 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) upd->next = ins->upd; ins->upd = upd; - if (page_del != NULL) - page_del->update_list[count++] = upd; + if (update_list != NULL) + update_list[count++] = upd; } } } - __wt_cache_page_inmem_incr(session, page, total_size); + /* + * We no longer need the WT_PAGE_DELETED structure, all of its information should have been + * transferred to the list of WT_UPDATE structures (if any). + */ + __wt_overwrite_and_free(session, ref->ft_info.del); + if (update_list != NULL) + ref->ft_info.update = update_list; + return (0); err: - /* - * The page-delete update structure may have existed before we were called, and presumably might - * be in use by a running transaction. The list of update structures cannot have been created - * before we were called, and should not exist if we exit with an error. 
- */ - if (page_del != NULL) - __wt_free(session, page_del->update_list); + __wt_free(session, update_list); return (ret); } diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c index 02bd970e0c6..f60b4968e7c 100644 --- a/src/third_party/wiredtiger/src/btree/bt_discard.c +++ b/src/third_party/wiredtiger/src/btree/bt_discard.c @@ -290,11 +290,8 @@ __wt_free_ref(WT_SESSION_IMPL *session, WT_REF *ref, int page_type, bool free_pa /* Free any address allocation. */ __wt_ref_addr_free(session, ref); - /* Free any page-deleted information. */ - if (ref->page_del != NULL) { - __wt_free(session, ref->page_del->update_list); - __wt_free(session, ref->page_del); - } + /* Free any backing fast-truncate memory. */ + __wt_free(session, ref->ft_info.del); __wt_overwrite_and_free_len(session, ref, WT_REF_CLEAR_SIZE); } diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index b5b997054ef..47d3f120d3a 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -608,14 +608,8 @@ __split_parent_discard_ref(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *paren } } - /* - * The page-delete and history store memory weren't added to the parent's footprint, ignore it - * here. - */ - if (ref->page_del != NULL) { - __wt_free(session, ref->page_del->update_list); - __wt_free(session, ref->page_del); - } + /* Free any backing fast-truncate memory. */ + __wt_free(session, ref->ft_info.del); /* Free the backing block and address. 
*/ WT_TRET(__wt_ref_block_free(session, ref)); diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c index 1e83f0a5643..f6091a4436e 100644 --- a/src/third_party/wiredtiger/src/evict/evict_file.c +++ b/src/third_party/wiredtiger/src/evict/evict_file.c @@ -85,24 +85,17 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) switch (syncop) { case WT_SYNC_CLOSE: - /* - * Evict the page. - * - * Ensure the ref state is restored to the previous value if eviction fails. - */ + /* Evict the page. */ WT_ERR(__wt_evict(session, ref, ref->state, WT_EVICT_CALL_CLOSING)); break; case WT_SYNC_DISCARD: /* - * Discard the page regardless of whether it is dirty. - * - * If the page has a page deleted structure, we are discarding the page that is cleaned - * by a checkpoint. + * Discard the page whether it is dirty or not. The check if the page can be evicted is + * not exhaustive, but provides basic checking on the page's status. */ WT_ASSERT(session, F_ISSET(dhandle, WT_DHANDLE_DEAD) || F_ISSET(S2C(session), WT_CONN_CLOSING) || - __wt_page_can_evict(session, ref, NULL) || - (ref->page_del != NULL && page->modify->page_state == WT_PAGE_CLEAN)); + __wt_page_can_evict(session, ref, NULL)); __wt_ref_out(session, ref); break; case WT_SYNC_CHECKPOINT: diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index 26b38dc5996..6d501b6ac28 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -184,15 +184,6 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint8_t previous_state, uint32 /* Figure out whether reconciliation was done on the page */ clean_page = __wt_page_evict_clean(page); - /* - * Discard all page-deleted information. If a truncate call deleted this page, there's memory - * associated with it we no longer need, eviction will have built a new version of the page. 
- */ - if (ref->page_del != NULL) { - __wt_free(session, ref->page_del->update_list); - __wt_free(session, ref->page_del); - } - /* Update the reference and discard the page. */ if (__wt_ref_is_root(ref)) __wt_ref_out(session, ref); @@ -530,9 +521,7 @@ __evict_review(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags, bool WT_RET(ret); } - /* - * It is always OK to evict pages from dead trees if they don't have children. - */ + /* It is always OK to evict pages from dead trees if they don't have children. */ if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) return (0); diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 1db9da86349..e9fa12975ff 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -835,7 +835,7 @@ struct __wt_page_deleted { uint8_t previous_state; /* Previous state */ - WT_UPDATE **update_list; /* List of updates for abort */ + uint8_t committed; /* Committed */ }; /* @@ -906,7 +906,21 @@ struct __wt_ref { #undef ref_ikey #define ref_ikey key.ikey - WT_PAGE_DELETED *page_del; /* Deleted page information */ + /* + * Fast-truncate information. When a WT_REF is included in a fast-truncate operation, WT_REF.del + * is allocated and initialized. If the page must be instantiated before the truncate becomes + * globally visible, WT_UPDATE structures are created for the page entries, the transaction + * information from WT_REF.del is migrated to those WT_UPDATE structures, and the WT_REF.del + * field is freed and replaced by the WT_REF.update array (needed for subsequent transaction + * commit/abort). Doing anything other than testing if WT_REF.del/update is non-NULL (which + * eviction does), requires the WT_REF be locked. If the locked WT_REF's previous state was + * WT_REF_DELETED, WT_REF.del is valid, if the WT_REF's previous state was an in-memory state, + * then WT_REF.update is valid. 
+ */ + union { + WT_PAGE_DELETED *del; /* Page not instantiated, page-deleted structure */ + WT_UPDATE **update; /* Page instantiated, update list for subsequent commit/abort */ + } ft_info; /* * In DIAGNOSTIC mode we overwrite the WT_REF on free to force failures. Don't clear the history in diff --git a/src/third_party/wiredtiger/src/include/btree_inline.h b/src/third_party/wiredtiger/src/include/btree_inline.h index 9c0e0ce784e..f7c501f6018 100644 --- a/src/third_party/wiredtiger/src/include/btree_inline.h +++ b/src/third_party/wiredtiger/src/include/btree_inline.h @@ -1464,7 +1464,9 @@ __wt_page_del_active(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) WT_PAGE_DELETED *page_del; uint8_t prepare_state; - if ((page_del = ref->page_del) == NULL) + WT_ASSERT(session, ref->state == WT_REF_LOCKED); + + if ((page_del = ref->ft_info.del) == NULL) return (false); if (page_del->txnid == WT_TXN_ABORTED) return (false); @@ -1651,15 +1653,20 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp) page = ref->page; mod = page->modify; - /* A truncated page can't be evicted until the truncate completes. */ - if (__wt_page_del_active(session, ref, true)) - return (false); - - /* Otherwise, never modified pages can always be evicted. */ + /* Never modified pages can always be evicted. */ if (mod == NULL) return (true); /* + * If a fast-truncate page is subsequently instantiated, it can become an eviction candidate. If + * the fast-truncate itself has not resolved when the page is instantiated, a list of updates is + * created, which will be discarded as part of transaction resolution. Don't attempt to evict a + * fast-truncate page until any update list has been removed. 
+ */ + if (ref->ft_info.update != NULL) + return (false); + + /* * We can't split or evict multiblock row-store pages where the parent's key for the page is an * overflow item, because the split into the parent frees the backing blocks for any * no-longer-used overflow keys, which will corrupt the checkpoint's block management. diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h index 3bde7b511b4..02395d8fa0b 100644 --- a/src/third_party/wiredtiger/src/include/misc.h +++ b/src/third_party/wiredtiger/src/include/misc.h @@ -118,11 +118,11 @@ addr)) /* - * Our internal free function clears the underlying address atomically so there is a smaller chance - * of racing threads seeing intermediate results while a structure is being free'd. (That would be a - * bug, of course, but I'd rather not drop core, just the same.) That's a non-standard "free" API, - * and the resulting bug is a mother to find -- make sure we get it right, don't make the caller - * remember to put the & operator on the pointer. + * Our internal free function clears the underlying address so there is a smaller chance of racing + * threads seeing intermediate results while a structure is being free'd. (That would be a bug, of + * course, but I'd rather not drop core, just the same.) That's a non-standard "free" API, and the + * resulting bug is non-trivial to find -- make sure we get it right, don't make the caller remember + * to put the & operator on the pointer. */ #define __wt_free(session, p) \ do { \ @@ -134,15 +134,21 @@ /* Overwrite whether or not this is a diagnostic build. 
*/ #define __wt_explicit_overwrite(p, size) memset(p, WT_DEBUG_BYTE, size) #ifdef HAVE_DIAGNOSTIC -#define __wt_overwrite_and_free(session, p) \ - do { \ - __wt_explicit_overwrite(p, sizeof(*(p))); \ - __wt_free(session, p); \ +#define __wt_overwrite_and_free(session, p) \ + do { \ + void *__p = &(p); \ + if (*(void **)__p != NULL) { \ + __wt_explicit_overwrite(p, sizeof(*(p))); \ + __wt_free_int(session, __p); \ + } \ } while (0) #define __wt_overwrite_and_free_len(session, p, len) \ do { \ - __wt_explicit_overwrite(p, len); \ - __wt_free(session, p); \ + void *__p = &(p); \ + if (*(void **)__p != NULL) { \ + __wt_explicit_overwrite(p, len); \ + __wt_free_int(session, __p); \ + } \ } while (0) #else #define __wt_overwrite_and_free(session, p) __wt_free(session, p) diff --git a/src/third_party/wiredtiger/src/include/txn_inline.h b/src/third_party/wiredtiger/src/include/txn_inline.h index 0deaf77a532..97fea21a14c 100644 --- a/src/third_party/wiredtiger/src/include/txn_inline.h +++ b/src/third_party/wiredtiger/src/include/txn_inline.h @@ -251,10 +251,7 @@ __wt_txn_op_apply_prepare_state(WT_SESSION_IMPL *session, WT_REF *ref, bool comm txn = session->txn; - /* - * Lock the ref to ensure we don't race with eviction freeing the page deleted update list or - * with a page instantiate. - */ + /* Lock the ref to ensure we don't race with page instantiation. */ WT_REF_LOCK(session, ref, &previous_state); if (commit) { @@ -264,20 +261,27 @@ __wt_txn_op_apply_prepare_state(WT_SESSION_IMPL *session, WT_REF *ref, bool comm ts = txn->prepare_timestamp; prepare_state = WT_PREPARE_INPROGRESS; } - for (updp = ref->page_del->update_list; updp != NULL && *updp != NULL; ++updp) { - (*updp)->start_ts = ts; - /* - * Holding the ref locked means we have exclusive access, so if we are committing we don't - * need to use the prepare locked transition state. 
- */ - (*updp)->prepare_state = prepare_state; + + /* + * Timestamps and prepare state are in the page deleted structure for truncates, or in the + * updates in the case of instantiated pages. + */ + if (previous_state == WT_REF_DELETED) { + ref->ft_info.del->timestamp = ts; if (commit) - (*updp)->durable_ts = txn->durable_timestamp; - } - ref->page_del->timestamp = ts; - if (commit) - ref->page_del->durable_timestamp = txn->durable_timestamp; - WT_PUBLISH(ref->page_del->prepare_state, prepare_state); + ref->ft_info.del->durable_timestamp = txn->durable_timestamp; + WT_PUBLISH(ref->ft_info.del->prepare_state, prepare_state); + } else if ((updp = ref->ft_info.update) != NULL) + for (; *updp != NULL; ++updp) { + (*updp)->start_ts = ts; + /* + * Holding the ref locked means we have exclusive access, so if we are committing we + * don't need to use the prepare locked transition state. + */ + (*updp)->prepare_state = prepare_state; + if (commit) + (*updp)->durable_ts = txn->durable_timestamp; + } WT_REF_UNLOCK(ref, previous_state); } @@ -295,16 +299,23 @@ __wt_txn_op_delete_commit_apply_timestamps(WT_SESSION_IMPL *session, WT_REF *ref txn = session->txn; - /* - * Lock the ref to ensure we don't race with eviction freeing the page deleted update list or - * with a page instantiate. - */ + /* Lock the ref to ensure we don't race with page instantiation. */ WT_REF_LOCK(session, ref, &previous_state); - for (updp = ref->page_del->update_list; updp != NULL && *updp != NULL; ++updp) { - (*updp)->start_ts = txn->commit_timestamp; - (*updp)->durable_ts = txn->durable_timestamp; - } + /* + * Timestamps are in the page deleted structure for truncates, or in the updates in the case of + * instantiated pages. Both commit and durable timestamps need to be updated. 
+ */ + if (previous_state == WT_REF_DELETED) { + if (ref->ft_info.del->timestamp == WT_TS_NONE) { + ref->ft_info.del->timestamp = txn->commit_timestamp; + ref->ft_info.del->durable_timestamp = txn->durable_timestamp; + } + } else if ((updp = ref->ft_info.update) != NULL) + for (; *updp != NULL; ++updp) { + (*updp)->start_ts = txn->commit_timestamp; + (*updp)->durable_ts = txn->durable_timestamp; + } WT_REF_UNLOCK(ref, previous_state); } @@ -320,7 +331,6 @@ __wt_txn_op_set_timestamp(WT_SESSION_IMPL *session, WT_TXN_OP *op) { WT_TXN *txn; WT_UPDATE *upd; - wt_timestamp_t *timestamp; txn = session->txn; @@ -345,22 +355,19 @@ __wt_txn_op_set_timestamp(WT_SESSION_IMPL *session, WT_TXN_OP *op) __txn_resolve_prepared_update(session, upd); } } else { - /* - * The timestamp is in the page deleted structure for truncates, or in the update for other - * operations. Both commit and durable timestamps need to be updated. - */ - timestamp = op->type == WT_TXN_OP_REF_DELETE ? &op->u.ref->page_del->timestamp : - &op->u.op_upd->start_ts; - if (*timestamp == WT_TS_NONE) { - *timestamp = txn->commit_timestamp; - - timestamp = op->type == WT_TXN_OP_REF_DELETE ? &op->u.ref->page_del->durable_timestamp : - &op->u.op_upd->durable_ts; - *timestamp = txn->durable_timestamp; - } - if (op->type == WT_TXN_OP_REF_DELETE) __wt_txn_op_delete_commit_apply_timestamps(session, op->u.ref); + else { + /* + * The timestamp is in the update for operations other than truncate. Both commit and + * durable timestamps need to be updated. + */ + upd = op->u.op_upd; + if (upd->start_ts == WT_TS_NONE) { + upd->start_ts = txn->commit_timestamp; + upd->durable_ts = txn->durable_timestamp; + } + } } } @@ -421,9 +428,10 @@ __wt_txn_modify_page_delete(WT_SESSION_IMPL *session, WT_REF *ref) WT_RET(__txn_next_op(session, &op)); op->type = WT_TXN_OP_REF_DELETE; - op->u.ref = ref; - ref->page_del->txnid = txn->id; + + /* This access to the WT_PAGE_DELETED structure is safe, caller has the WT_REF locked. 
*/ + ref->ft_info.del->txnid = txn->id; __wt_txn_op_set_timestamp(session, op); WT_ERR(__wt_txn_log_op(session, NULL)); diff --git a/src/third_party/wiredtiger/src/reconcile/rec_child.c b/src/third_party/wiredtiger/src/reconcile/rec_child.c index 56104639e53..6423eb3347d 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_child.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_child.c @@ -17,7 +17,7 @@ __rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, WT_C { WT_PAGE_DELETED *page_del; - page_del = ref->page_del; + page_del = ref->ft_info.del; /* * Internal pages with child leaf pages in the WT_REF_DELETED state are a special case during @@ -61,18 +61,10 @@ __rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, WT_C * function instantiates an entirely new page.) */ if (ref->addr != NULL && !__wt_page_del_active(session, ref, true)) { - /* - * Minor memory cleanup: if a truncate call deleted this page and we were ever forced to - * instantiate the page in memory, we would have built a list of updates in the page - * reference in order to be able to commit/rollback the truncate. We just passed a - * visibility test, discard the update list. - */ - if (page_del != NULL) { - __wt_free(session, ref->page_del->update_list); - __wt_free(session, ref->page_del); - } - WT_RET(__wt_ref_block_free(session, ref)); + + /* Any fast-truncate information can be freed as soon as the delete is stable. 
*/ + __wt_overwrite_and_free(session, ref->ft_info.del); } /* diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 42ddf278aec..6c613ffbe87 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -1434,15 +1434,16 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) WT_UPDATE *upd; wt_timestamp_t candidate_durable_timestamp, prev_durable_timestamp; uint32_t fileid; - u_int i; + uint8_t previous_state; + u_int i, ft_resolution; #ifdef HAVE_DIAGNOSTIC u_int prepare_count; #endif bool locked, prepare, readonly, update_durable_ts; - txn = session->txn; conn = S2C(session); cursor = NULL; + txn = session->txn; txn_global = &conn->txn_global; #ifdef HAVE_DIAGNOSTIC prepare_count = 0; @@ -1567,6 +1568,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) /* Note: we're going to commit: nothing can fail after this point. */ /* Process and free updates. */ + ft_resolution = 0; for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) { fileid = op->btree->id; switch (op->type) { @@ -1610,8 +1612,12 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) } break; case WT_TXN_OP_REF_DELETE: - __wt_txn_op_set_timestamp(session, op); - break; + /* + * Fast-truncate operations are resolved in a second pass after failure is no longer + * possible. + */ + ++ft_resolution; + continue; case WT_TXN_OP_TRUNCATE_COL: case WT_TXN_OP_TRUNCATE_ROW: /* Other operations don't need timestamps. 
*/ @@ -1623,11 +1629,6 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) if (cursor != NULL) WT_CLEAR(cursor->key); } - txn->mod_count = 0; -#ifdef HAVE_DIAGNOSTIC - WT_ASSERT(session, txn->prepare_count == prepare_count); - txn->prepare_count = 0; -#endif if (cursor != NULL) { WT_ERR(cursor->close(cursor)); @@ -1635,6 +1636,36 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) } /* + * Resolve any fast-truncate transactions and allow eviction to proceed on instantiated pages. + * This isn't done as part of the initial processing because until now the commit could still + * switch to an abort. The action allowing eviction to proceed is clearing the WT_UPDATE list, + * (if any), associated with the commit. We're the only consumer of that list and we no longer + * need it, and eviction knows it means abort or commit has completed on instantiated pages. + */ + for (i = 0, op = txn->mod; ft_resolution > 0 && i < txn->mod_count; i++, op++) + if (op->type == WT_TXN_OP_REF_DELETE) { + __wt_txn_op_set_timestamp(session, op); + + WT_REF_LOCK(session, op->u.ref, &previous_state); + if (previous_state == WT_REF_DELETED) + op->u.ref->ft_info.del->committed = 1; + else + __wt_free(session, op->u.ref->ft_info.update); + WT_REF_UNLOCK(op->u.ref, previous_state); + + __wt_txn_op_free(session, op); + + --ft_resolution; + } + WT_ASSERT(session, ft_resolution == 0); + + txn->mod_count = 0; +#ifdef HAVE_DIAGNOSTIC + WT_ASSERT(session, txn->prepare_count == prepare_count); + txn->prepare_count = 0; +#endif + + /* * If durable is set, we'll try to update the global durable timestamp with that value. If * durable isn't set, durable is implied to be the same as commit so we'll use that instead. 
*/ diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index c7ebb3654f8..54f9c2e9333 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -1121,8 +1121,8 @@ __rollback_abort_fast_truncate( * individual WT_UPDATE structures. When reviewing internal pages, ignore the second case, an * instantiated page is handled when the leaf page is visited. */ - if (ref->state == WT_REF_DELETED && ref->page_del != NULL && - rollback_timestamp < ref->page_del->durable_timestamp) { + if (ref->state == WT_REF_DELETED && ref->ft_info.del != NULL && + rollback_timestamp < ref->ft_info.del->durable_timestamp) { __wt_verbose( session, WT_VERB_RECOVERY_RTS(session), "%p: deleted page rolled back", (void *)ref); WT_RET(__wt_delete_page_rollback(session, ref)); diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c index 3fd5706efad..3a37b4fd8bf 100644 --- a/src/third_party/wiredtiger/test/format/ops.c +++ b/src/third_party/wiredtiger/test/format/ops.c @@ -391,7 +391,8 @@ operations(u_int ops_seconds, bool lastrun) * The system should be quiescent at this point, call rollback to stable. Generally, we expect * applications to do rollback-to-stable as part of the database open, but calling it outside of * the open path is expected in the case of applications that are "restarting" but skipping the - * close/re-open pair. + * close/re-open pair. Note we are not advancing the oldest timestamp, otherwise we wouldn't be + * able to replay operations from after rollback-to-stable completes. */ tinfo_rollback_to_stable(session); |