From 56655b06ac46825c5937ccca5947dc84ccbca69c Mon Sep 17 00:00:00 2001 From: Luke Chen Date: Mon, 2 Mar 2020 14:15:18 +1100 Subject: Import wiredtiger: 7e595e4a3a9c30c9db0eb33f7da72c97526a2b99 from branch mongodb-4.4 ref: 0a0a6354a8..7e595e4a3a for: 4.3.4 Revert below tickets: WT-5518 Split-parent code can race with other threads when checking the WT_REF.state WT-5552 Checkpoint reconciliation and page splits free the WT_REF.addr field without locking WT-5658 Fix heap-use-after-free in parent split code --- src/third_party/wiredtiger/import.data | 2 +- src/third_party/wiredtiger/src/btree/bt_discard.c | 17 -- src/third_party/wiredtiger/src/btree/bt_split.c | 239 ++++++++++----------- src/third_party/wiredtiger/src/include/btree.i | 17 ++ src/third_party/wiredtiger/src/include/extern.h | 2 +- .../wiredtiger/src/reconcile/rec_write.c | 22 +- 6 files changed, 135 insertions(+), 164 deletions(-) diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 25637cb1e31..283621c29c6 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-4.4", - "commit": "0a0a6354a8d7eea17c803877f9921ed6cedc31c5" + "commit": "7e595e4a3a9c30c9db0eb33f7da72c97526a2b99" } diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c index fcac0745390..e3f1413c7d4 100644 --- a/src/third_party/wiredtiger/src/btree/bt_discard.c +++ b/src/third_party/wiredtiger/src/btree/bt_discard.c @@ -214,23 +214,6 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) __wt_free(session, page->modify); } -/* - * __wt_ref_addr_free -- - * Free the address in a reference, if necessary. - */ -void -__wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref) -{ - if (ref->addr == NULL) - return; - - if (ref->home == NULL || __wt_off_page(ref->home, ref->addr)) { - __wt_free(session, ((WT_ADDR *)ref->addr)->addr); - __wt_free(session, ref->addr); - } - ref->addr = NULL; -} - /* * __wt_free_ref -- * Discard the contents of a WT_REF structure (optionally including the pages it references). diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index a047fa63c3f..3cc0d4018f0 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -589,60 +589,6 @@ err: return (ret); } -/* - * __split_parent_discard_ref -- - * Worker routine to discard WT_REFs for the split-parent function. - */ -static int -__split_parent_discard_ref(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *parent, size_t *decrp, - uint64_t split_gen, bool exclusive) -{ - WT_DECL_RET; - WT_IKEY *ikey; - size_t size; - - /* - * Row-store trees where the old version of the page is being discarded: the previous parent - * page's key for this child page may have been an on-page overflow key. In that case, if the - * key hasn't been deleted, delete it now, including its backing blocks. We are exchanging the - * WT_REF that referenced it for the split page WT_REFs and their keys, and there's no longer - * any reference to it. Done after completing the split (if we failed, we'd leak the underlying - * blocks, but the parent page would be unaffected). - */ - if (parent->type == WT_PAGE_ROW_INT) { - WT_TRET(__split_ovfl_key_cleanup(session, parent, ref)); - ikey = __wt_ref_key_instantiated(ref); - if (ikey != NULL) { - size = sizeof(WT_IKEY) + ikey->size; - WT_TRET(__split_safe_free(session, split_gen, exclusive, ikey, size)); - *decrp += size; - } - } - - /* - * The page-delete and history store memory weren't added to the parent's footprint, ignore it - * here. - */ - if (ref->page_del != NULL) { - __wt_free(session, ref->page_del->update_list); - __wt_free(session, ref->page_del); - } - - /* Free the backing block and address. */ - WT_TRET(__wt_ref_block_free(session, ref)); - - /* - * Set the WT_REF state. It may be possible to immediately free the WT_REF, so this is our last - * chance. - */ - WT_REF_SET_STATE(ref, WT_REF_SPLIT); - - WT_TRET(__split_safe_free(session, split_gen, exclusive, ref, sizeof(WT_REF))); - *decrp += sizeof(WT_REF); - - return (ret); -} - /* * __split_parent -- * Resolve a multi-page split, inserting new information into the parent. @@ -654,6 +600,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t WT_BTREE *btree; WT_DECL_ITEM(scr); WT_DECL_RET; + WT_IKEY *ikey; WT_PAGE *parent; WT_PAGE_INDEX *alloc_index, *pindex; WT_REF **alloc_refp, *next_ref; @@ -670,7 +617,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t alloc_index = pindex = NULL; parent_decr = 0; - deleted_refs = NULL; empty_parent = false; complete = WT_ERR_RETURN; @@ -687,35 +633,34 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t /* * Remove any refs to deleted pages while we are splitting, we have the internal page locked - * down and are copying the refs into a new page-index array anyway. + * down, and are copying the refs into a new array anyway. Switch them to the special split + * state, so that any reading thread will restart. * * We can't do this if there is a sync running in the tree in another session: removing the refs * frees the blocks for the deleted pages, which can corrupt the free list calculated by the * sync. */ - deleted_entries = 0; - if (!WT_BTREE_SYNCING(btree) || WT_SESSION_BTREE_SYNC(session)) - for (i = 0; i < parent_entries; ++i) { - next_ref = pindex->index[i]; - WT_ASSERT(session, next_ref->state != WT_REF_SPLIT); - - /* Protect against including the replaced WT_REF in the list of deleted items. */ - if (next_ref != ref && next_ref->state == WT_REF_DELETED && - __wt_delete_page_skip(session, next_ref, true) && - WT_REF_CAS_STATE(session, next_ref, WT_REF_DELETED, WT_REF_LOCKED)) { - if (scr == NULL) - WT_ERR(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr)); - WT_ERR(__wt_buf_grow(session, scr, (deleted_entries + 1) * sizeof(uint32_t))); - deleted_refs = scr->mem; - deleted_refs[deleted_entries++] = i; - } + WT_ERR(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr)); + for (deleted_entries = 0, i = 0; i < parent_entries; ++i) { + next_ref = pindex->index[i]; + WT_ASSERT(session, next_ref->state != WT_REF_SPLIT); + if ((discard && next_ref == ref) || + ((!WT_BTREE_SYNCING(btree) || WT_SESSION_BTREE_SYNC(session)) && + next_ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, next_ref, true) && + WT_REF_CAS_STATE(session, next_ref, WT_REF_DELETED, WT_REF_SPLIT))) { + WT_ERR(__wt_buf_grow(session, scr, (deleted_entries + 1) * sizeof(uint32_t))); + deleted_refs = scr->mem; + deleted_refs[deleted_entries++] = i; } + } /* - * The final entry count is the original count, where one entry will be replaced by some number - * of new entries, and some number will be deleted. + * The final entry count consists of the original count, plus any new pages, less any WT_REFs + * we're removing (deleted entries plus the entry we're replacing). */ - result_entries = (parent_entries + (new_entries - 1)) - deleted_entries; + result_entries = (parent_entries + new_entries) - deleted_entries; + if (!discard) + --result_entries; /* * If there are no remaining entries on the parent, give up, we can't leave an empty internal @@ -743,29 +688,20 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t alloc_index->entries = result_entries; for (alloc_refp = alloc_index->index, hint = i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; - if (next_ref == ref) { + if (next_ref == ref) for (j = 0; j < new_entries; ++j) { ref_new[j]->home = parent; ref_new[j]->pindex_hint = hint++; *alloc_refp++ = ref_new[j]; } - continue; - } - - /* Skip refs we have marked for deletion. */ - if (deleted_entries != 0) { - for (j = 0; j < deleted_entries; ++j) - if (deleted_refs[j] == i) - break; - if (j < deleted_entries) - continue; + else if (next_ref->state != WT_REF_SPLIT) { + /* Skip refs we have marked for deletion. */ + next_ref->pindex_hint = hint++; + *alloc_refp++ = next_ref; } - - next_ref->pindex_hint = hint++; - *alloc_refp++ = next_ref; } - /* Check we filled in the expected number of entries. */ + /* Check that we filled in all the entries. */ WT_ASSERT(session, alloc_refp - alloc_index->index == (ptrdiff_t)result_entries); /* Start making real changes to the tree, errors are fatal. */ @@ -794,6 +730,26 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t split_gen = __wt_gen_next(session, WT_GEN_SPLIT); parent->pg_intl_split_gen = split_gen; + /* + * If discarding the page's original WT_REF field, reset it to split. Threads cursoring through + * the tree were blocked because that WT_REF state was set to locked. Changing the locked state + * to split unblocks those threads and causes them to re-calculate their position based on the + * just-updated parent page's index. + */ + if (discard) { + /* + * Set the discarded WT_REF state to split, ensuring we don't race with any discard of the + * WT_REF deleted fields. + */ + WT_REF_SET_STATE(ref, WT_REF_SPLIT); + + /* + * Push out the change: not required for correctness, but stops threads spinning on + * incorrect page references. + */ + WT_FULL_BARRIER(); + } + #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, __split_verify_intl_key_order(session, parent)); #endif @@ -802,24 +758,72 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t complete = WT_ERR_IGNORE; /* - * The new page index is in place. Threads cursoring in the tree are blocked because the WT_REF - * being discarded (if any), and deleted WT_REFs (if any) are in a locked state. Changing the - * locked state to split unblocks those threads and causes them to re-calculate their position - * based on the just-updated parent page's index. The split state doesn't lock the WT_REF.addr - * information which is read by cursor threads in some tree-walk cases: free the WT_REF we were - * splitting and any deleted WT_REFs we found, modulo the usual safe free semantics, then reset - * the WT_REF state. + * !!! + * Swapping in the new page index released the page for eviction, we can + * no longer look inside the page. + */ + if (ref->page == NULL) + __wt_verbose(session, WT_VERB_SPLIT, + "%p: reverse split into parent %p, %" PRIu32 " -> %" PRIu32 " (-%" PRIu32 ")", + (void *)ref->page, (void *)parent, parent_entries, result_entries, + parent_entries - result_entries); + else + __wt_verbose(session, WT_VERB_SPLIT, + "%p: split into parent %p, %" PRIu32 " -> %" PRIu32 " (+%" PRIu32 ")", (void *)ref->page, + (void *)parent, parent_entries, result_entries, result_entries - parent_entries); + + /* + * The new page index is in place, free the WT_REF we were splitting and any deleted WT_REFs we + * found, modulo the usual safe free semantics. */ - if (discard) { - WT_ASSERT(session, exclusive || ref->state == WT_REF_LOCKED); - WT_TRET( - __split_parent_discard_ref(session, ref, parent, &parent_decr, split_gen, exclusive)); - } - for (i = 0; i < deleted_entries; ++i) { + for (i = 0, deleted_refs = scr->mem; i < deleted_entries; ++i) { next_ref = pindex->index[deleted_refs[i]]; - WT_ASSERT(session, next_ref->state == WT_REF_LOCKED); - WT_TRET(__split_parent_discard_ref( - session, next_ref, parent, &parent_decr, split_gen, exclusive)); +#ifdef HAVE_DIAGNOSTIC + { + uint32_t ref_state; + WT_ORDERED_READ(ref_state, next_ref->state); + WT_ASSERT(session, ref_state == WT_REF_LOCKED || ref_state == WT_REF_SPLIT); + } +#endif + + /* + * We set the WT_REF to split, discard it, freeing any resources it holds. + * + * Row-store trees where the old version of the page is being discarded: the previous parent + * page's key for this child page may have been an on-page overflow key. In that case, if + * the key hasn't been deleted, delete it now, including its backing blocks. We are + * exchanging the WT_REF that referenced it for the split page WT_REFs and their keys, and + * there's no longer any reference to it. Done after completing the split (if we failed, + * we'd leak the underlying blocks, but the parent page would be unaffected). + */ + if (parent->type == WT_PAGE_ROW_INT) { + WT_TRET(__split_ovfl_key_cleanup(session, parent, next_ref)); + ikey = __wt_ref_key_instantiated(next_ref); + if (ikey != NULL) { + size = sizeof(WT_IKEY) + ikey->size; + WT_TRET(__split_safe_free(session, split_gen, exclusive, ikey, size)); + parent_decr += size; + } + } + + /* Check that we are not discarding active history. */ + WT_ASSERT(session, !__wt_page_las_active(session, next_ref)); + + /* + * The page-delete and lookaside memory weren't added to the parent's footprint, ignore it + * here. + */ + if (next_ref->page_del != NULL) { + __wt_free(session, next_ref->page_del->update_list); + __wt_free(session, next_ref->page_del); + } + __wt_free(session, next_ref->page_las); + + /* Free the backing block and address. */ + WT_TRET(__wt_ref_block_free(session, next_ref)); + + WT_TRET(__split_safe_free(session, split_gen, exclusive, next_ref, sizeof(WT_REF))); + parent_decr += sizeof(WT_REF); } /* @@ -827,12 +831,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t * The original WT_REF has now been freed, we can no longer look at it. */ - /* - * Don't cache the change: not required for correctness, but stops threads spinning on incorrect - * page references. - */ - WT_FULL_BARRIER(); - /* * We can't free the previous page index, there may be threads using it. Add it to the session * discard list, to be freed when it's safe. @@ -845,14 +843,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t __wt_cache_page_inmem_incr(session, parent, parent_incr); __wt_cache_page_inmem_decr(session, parent, parent_decr); - /* - * We've discarded the WT_REFs and swapping in a new page index released the page for eviction; - * we can no longer look inside the WT_REF or the page, be careful logging the results. - */ - __wt_verbose(session, WT_VERB_SPLIT, - "%p: split into parent, %" PRIu32 "->%" PRIu32 ", %" PRIu32 " deleted", (void *)ref, - parent_entries, result_entries, deleted_entries); - err: __wt_scr_free(session, &scr); /* @@ -861,11 +851,10 @@ err: */ switch (complete) { case WT_ERR_RETURN: - /* Unlock WT_REFs locked because they were in a deleted state. */ - for (i = 0; i < deleted_entries; ++i) { - next_ref = pindex->index[deleted_refs[i]]; - WT_ASSERT(session, next_ref->state == WT_REF_LOCKED); - WT_REF_SET_STATE(next_ref, WT_REF_DELETED); + for (i = 0; i < parent_entries; ++i) { + next_ref = pindex->index[i]; + if (next_ref->state == WT_REF_SPLIT) + WT_REF_SET_STATE(next_ref, WT_REF_DELETED); } __wt_free_ref_index(session, NULL, alloc_index, false); diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 86c8159b2a7..f39a87d53c9 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -631,6 +631,23 @@ __wt_off_page(WT_PAGE *page, const void *p) p >= (void *)((uint8_t *)page->dsk + page->dsk->mem_size)); } +/* + * __wt_ref_addr_free -- + * Free the address in a reference, if necessary. + */ +static inline void +__wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref) +{ + if (ref->addr == NULL) + return; + + if (ref->home == NULL || __wt_off_page(ref->home, ref->addr)) { + __wt_free(session, ((WT_ADDR *)ref->addr)->addr); + __wt_free(session, ref->addr); + } + ref->addr = NULL; +} + /* * __wt_ref_key -- * Return a reference to a row-store internal page key as cheaply as possible. diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index dbaba203140..4ad93180572 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -1695,7 +1695,6 @@ extern void __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l); extern void __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l); extern void __wt_rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r); extern void __wt_rec_dictionary_reset(WT_RECONCILE *r); -extern void __wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_root_ref_init( WT_SESSION_IMPL *session, WT_REF *root_ref, WT_PAGE *root, bool is_recno); @@ -2137,6 +2136,7 @@ static inline void __wt_rec_cell_build_addr( static inline void __wt_rec_image_copy(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *kv); static inline void __wt_rec_incr( WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size); +static inline void __wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref); static inline void __wt_ref_info( WT_SESSION_IMPL *session, WT_REF *ref, const uint8_t **addrp, size_t *sizep, bool *is_leafp); static inline void __wt_ref_info_lock( diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index cfadd17e4f5..a24cd11116b 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -2171,11 +2171,10 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) { WT_BM *bm; WT_BTREE *btree; - WT_DECL_RET; WT_MULTI *multi; WT_PAGE_MODIFY *mod; WT_REF *ref; - uint32_t i, previous_state; + uint32_t i; btree = S2BT(session); bm = btree->bm; @@ -2199,24 +2198,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) */ if (__wt_ref_is_root(ref)) break; - - /* - * We're about to discard the WT_REF.addr field, and that can race with the cursor walk code - * examining tree WT_REFs for leaf pages. Lock the WT_REF unless we know we have exclusive - * access. - */ - previous_state = WT_REF_LOCKED; /* -Wuninitialized */ - if (!F_ISSET(r, WT_REC_EVICT)) - for (;; __wt_yield()) { - previous_state = ref->state; - if (previous_state != WT_REF_LOCKED && - WT_REF_CAS_STATE(session, ref, previous_state, WT_REF_LOCKED)) - break; - } - ret = __wt_ref_block_free(session, ref); - if (!F_ISSET(r, WT_REC_EVICT)) - WT_REF_SET_STATE(ref, previous_state); - WT_RET(ret); + WT_RET(__wt_ref_block_free(session, ref)); break; case WT_PM_REC_EMPTY: /* Page deleted */ break; -- cgit v1.2.1