diff options
author | Luke Chen <luke.chen@mongodb.com> | 2020-02-27 05:49:43 +0000 |
---|---|---|
committer | evergreen <evergreen@mongodb.com> | 2020-02-27 05:49:43 +0000 |
commit | a5ec32526ad4528088b78bb30fbda08393863a69 (patch) | |
tree | 6dc7577b6e6ff35be5cb15a4c2facd15acf2dc77 /src | |
parent | 1ba6e4193cfc27277e4ed65b033ee03a81b0c3e1 (diff) | |
download | mongo-a5ec32526ad4528088b78bb30fbda08393863a69.tar.gz |
Import wiredtiger: 0a0a6354a8d7eea17c803877f9921ed6cedc31c5 from branch mongodb-4.4
ref: 58baf804dd..0a0a6354a8
for: 4.3.4
WT-5518 Split-parent code can race with other threads when checking the WT_REF.state
WT-5552 Checkpoint reconciliation and page splits free the WT_REF.addr field without locking
WT-5658 Fix heap-use-after-free in parent split code
Diffstat (limited to 'src')
-rw-r--r-- | src/third_party/wiredtiger/import.data | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_discard.c | 17 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_split.c | 239 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/btree.i | 17 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/extern.h | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/reconcile/rec_write.c | 22 |
6 files changed, 164 insertions, 135 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 9b1647d3c2f..25637cb1e31 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-4.4", - "commit": "58baf804dd6e5a72c4e122cfb696e2d06a9fc888" + "commit": "0a0a6354a8d7eea17c803877f9921ed6cedc31c5" } diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c index e3f1413c7d4..fcac0745390 100644 --- a/src/third_party/wiredtiger/src/btree/bt_discard.c +++ b/src/third_party/wiredtiger/src/btree/bt_discard.c @@ -215,6 +215,23 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) } /* + * __wt_ref_addr_free -- + * Free the address in a reference, if necessary. + */ +void +__wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref) +{ + if (ref->addr == NULL) + return; + + if (ref->home == NULL || __wt_off_page(ref->home, ref->addr)) { + __wt_free(session, ((WT_ADDR *)ref->addr)->addr); + __wt_free(session, ref->addr); + } + ref->addr = NULL; +} + +/* * __wt_free_ref -- * Discard the contents of a WT_REF structure (optionally including the pages it references). */ diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 3cc0d4018f0..a047fa63c3f 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -590,6 +590,60 @@ err: } /* + * __split_parent_discard_ref -- + * Worker routine to discard WT_REFs for the split-parent function. + */ +static int +__split_parent_discard_ref(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *parent, size_t *decrp, + uint64_t split_gen, bool exclusive) +{ + WT_DECL_RET; + WT_IKEY *ikey; + size_t size; + + /* + * Row-store trees where the old version of the page is being discarded: the previous parent + * page's key for this child page may have been an on-page overflow key. In that case, if the + * key hasn't been deleted, delete it now, including its backing blocks. We are exchanging the + * WT_REF that referenced it for the split page WT_REFs and their keys, and there's no longer + * any reference to it. Done after completing the split (if we failed, we'd leak the underlying + * blocks, but the parent page would be unaffected). + */ + if (parent->type == WT_PAGE_ROW_INT) { + WT_TRET(__split_ovfl_key_cleanup(session, parent, ref)); + ikey = __wt_ref_key_instantiated(ref); + if (ikey != NULL) { + size = sizeof(WT_IKEY) + ikey->size; + WT_TRET(__split_safe_free(session, split_gen, exclusive, ikey, size)); + *decrp += size; + } + } + + /* + * The page-delete and history store memory weren't added to the parent's footprint, ignore it + * here. + */ + if (ref->page_del != NULL) { + __wt_free(session, ref->page_del->update_list); + __wt_free(session, ref->page_del); + } + + /* Free the backing block and address. */ + WT_TRET(__wt_ref_block_free(session, ref)); + + /* + * Set the WT_REF state. It may be possible to immediately free the WT_REF, so this is our last + * chance. + */ + WT_REF_SET_STATE(ref, WT_REF_SPLIT); + + WT_TRET(__split_safe_free(session, split_gen, exclusive, ref, sizeof(WT_REF))); + *decrp += sizeof(WT_REF); + + return (ret); +} + +/* * __split_parent -- * Resolve a multi-page split, inserting new information into the parent. */ @@ -600,7 +654,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t WT_BTREE *btree; WT_DECL_ITEM(scr); WT_DECL_RET; - WT_IKEY *ikey; WT_PAGE *parent; WT_PAGE_INDEX *alloc_index, *pindex; WT_REF **alloc_refp, *next_ref; @@ -617,6 +670,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t alloc_index = pindex = NULL; parent_decr = 0; + deleted_refs = NULL; empty_parent = false; complete = WT_ERR_RETURN; @@ -633,34 +687,35 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t /* * Remove any refs to deleted pages while we are splitting, we have the internal page locked - * down, and are copying the refs into a new array anyway. Switch them to the special split - * state, so that any reading thread will restart. + * down and are copying the refs into a new page-index array anyway. * * We can't do this if there is a sync running in the tree in another session: removing the refs * frees the blocks for the deleted pages, which can corrupt the free list calculated by the * sync. */ - WT_ERR(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr)); - for (deleted_entries = 0, i = 0; i < parent_entries; ++i) { - next_ref = pindex->index[i]; - WT_ASSERT(session, next_ref->state != WT_REF_SPLIT); - if ((discard && next_ref == ref) || - ((!WT_BTREE_SYNCING(btree) || WT_SESSION_BTREE_SYNC(session)) && - next_ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, next_ref, true) && - WT_REF_CAS_STATE(session, next_ref, WT_REF_DELETED, WT_REF_SPLIT))) { - WT_ERR(__wt_buf_grow(session, scr, (deleted_entries + 1) * sizeof(uint32_t))); - deleted_refs = scr->mem; - deleted_refs[deleted_entries++] = i; + deleted_entries = 0; + if (!WT_BTREE_SYNCING(btree) || WT_SESSION_BTREE_SYNC(session)) + for (i = 0; i < parent_entries; ++i) { + next_ref = pindex->index[i]; + WT_ASSERT(session, next_ref->state != WT_REF_SPLIT); + + /* Protect against including the replaced WT_REF in the list of deleted items. */ + if (next_ref != ref && next_ref->state == WT_REF_DELETED && + __wt_delete_page_skip(session, next_ref, true) && + WT_REF_CAS_STATE(session, next_ref, WT_REF_DELETED, WT_REF_LOCKED)) { + if (scr == NULL) + WT_ERR(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr)); + WT_ERR(__wt_buf_grow(session, scr, (deleted_entries + 1) * sizeof(uint32_t))); + deleted_refs = scr->mem; + deleted_refs[deleted_entries++] = i; + } } - } /* - * The final entry count consists of the original count, plus any new pages, less any WT_REFs - * we're removing (deleted entries plus the entry we're replacing). + * The final entry count is the original count, where one entry will be replaced by some number + * of new entries, and some number will be deleted. */ - result_entries = (parent_entries + new_entries) - deleted_entries; - if (!discard) - --result_entries; + result_entries = (parent_entries + (new_entries - 1)) - deleted_entries; /* * If there are no remaining entries on the parent, give up, we can't leave an empty internal @@ -688,20 +743,29 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t alloc_index->entries = result_entries; for (alloc_refp = alloc_index->index, hint = i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; - if (next_ref == ref) + if (next_ref == ref) { for (j = 0; j < new_entries; ++j) { ref_new[j]->home = parent; ref_new[j]->pindex_hint = hint++; *alloc_refp++ = ref_new[j]; } - else if (next_ref->state != WT_REF_SPLIT) { - /* Skip refs we have marked for deletion. */ - next_ref->pindex_hint = hint++; - *alloc_refp++ = next_ref; + continue; + } + + /* Skip refs we have marked for deletion. */ + if (deleted_entries != 0) { + for (j = 0; j < deleted_entries; ++j) + if (deleted_refs[j] == i) + break; + if (j < deleted_entries) + continue; } + + next_ref->pindex_hint = hint++; + *alloc_refp++ = next_ref; } - /* Check that we filled in all the entries. */ + /* Check we filled in the expected number of entries. */ WT_ASSERT(session, alloc_refp - alloc_index->index == (ptrdiff_t)result_entries); /* Start making real changes to the tree, errors are fatal. */ @@ -730,26 +794,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t split_gen = __wt_gen_next(session, WT_GEN_SPLIT); parent->pg_intl_split_gen = split_gen; - /* - * If discarding the page's original WT_REF field, reset it to split. Threads cursoring through - * the tree were blocked because that WT_REF state was set to locked. Changing the locked state - * to split unblocks those threads and causes them to re-calculate their position based on the - * just-updated parent page's index. - */ - if (discard) { - /* - * Set the discarded WT_REF state to split, ensuring we don't race with any discard of the - * WT_REF deleted fields. - */ - WT_REF_SET_STATE(ref, WT_REF_SPLIT); - - /* - * Push out the change: not required for correctness, but stops threads spinning on - * incorrect page references. - */ - WT_FULL_BARRIER(); - } - #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, __split_verify_intl_key_order(session, parent)); #endif @@ -758,72 +802,24 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t complete = WT_ERR_IGNORE; /* - * !!! - * Swapping in the new page index released the page for eviction, we can - * no longer look inside the page. - */ - if (ref->page == NULL) - __wt_verbose(session, WT_VERB_SPLIT, - "%p: reverse split into parent %p, %" PRIu32 " -> %" PRIu32 " (-%" PRIu32 ")", - (void *)ref->page, (void *)parent, parent_entries, result_entries, - parent_entries - result_entries); - else - __wt_verbose(session, WT_VERB_SPLIT, - "%p: split into parent %p, %" PRIu32 " -> %" PRIu32 " (+%" PRIu32 ")", (void *)ref->page, - (void *)parent, parent_entries, result_entries, result_entries - parent_entries); - - /* - * The new page index is in place, free the WT_REF we were splitting and any deleted WT_REFs we - * found, modulo the usual safe free semantics. + * The new page index is in place. Threads cursoring in the tree are blocked because the WT_REF + * being discarded (if any), and deleted WT_REFs (if any) are in a locked state. Changing the + * locked state to split unblocks those threads and causes them to re-calculate their position + * based on the just-updated parent page's index. The split state doesn't lock the WT_REF.addr + * information which is read by cursor threads in some tree-walk cases: free the WT_REF we were + * splitting and any deleted WT_REFs we found, modulo the usual safe free semantics, then reset + * the WT_REF state. */ - for (i = 0, deleted_refs = scr->mem; i < deleted_entries; ++i) { + if (discard) { + WT_ASSERT(session, exclusive || ref->state == WT_REF_LOCKED); + WT_TRET( + __split_parent_discard_ref(session, ref, parent, &parent_decr, split_gen, exclusive)); + } + for (i = 0; i < deleted_entries; ++i) { next_ref = pindex->index[deleted_refs[i]]; -#ifdef HAVE_DIAGNOSTIC - { - uint32_t ref_state; - WT_ORDERED_READ(ref_state, next_ref->state); - WT_ASSERT(session, ref_state == WT_REF_LOCKED || ref_state == WT_REF_SPLIT); - } -#endif - - /* - * We set the WT_REF to split, discard it, freeing any resources it holds. - * - * Row-store trees where the old version of the page is being discarded: the previous parent - * page's key for this child page may have been an on-page overflow key. In that case, if - * the key hasn't been deleted, delete it now, including its backing blocks. We are - * exchanging the WT_REF that referenced it for the split page WT_REFs and their keys, and - * there's no longer any reference to it. Done after completing the split (if we failed, - * we'd leak the underlying blocks, but the parent page would be unaffected). - */ - if (parent->type == WT_PAGE_ROW_INT) { - WT_TRET(__split_ovfl_key_cleanup(session, parent, next_ref)); - ikey = __wt_ref_key_instantiated(next_ref); - if (ikey != NULL) { - size = sizeof(WT_IKEY) + ikey->size; - WT_TRET(__split_safe_free(session, split_gen, exclusive, ikey, size)); - parent_decr += size; - } - } - - /* Check that we are not discarding active history. */ - WT_ASSERT(session, !__wt_page_las_active(session, next_ref)); - - /* - * The page-delete and lookaside memory weren't added to the parent's footprint, ignore it - * here. - */ - if (next_ref->page_del != NULL) { - __wt_free(session, next_ref->page_del->update_list); - __wt_free(session, next_ref->page_del); - } - __wt_free(session, next_ref->page_las); - - /* Free the backing block and address. */ - WT_TRET(__wt_ref_block_free(session, next_ref)); - - WT_TRET(__split_safe_free(session, split_gen, exclusive, next_ref, sizeof(WT_REF))); - parent_decr += sizeof(WT_REF); + WT_ASSERT(session, next_ref->state == WT_REF_LOCKED); + WT_TRET(__split_parent_discard_ref( + session, next_ref, parent, &parent_decr, split_gen, exclusive)); } /* @@ -832,6 +828,12 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t */ /* + * Don't cache the change: not required for correctness, but stops threads spinning on incorrect + * page references. + */ + WT_FULL_BARRIER(); + + /* * We can't free the previous page index, there may be threads using it. Add it to the session * discard list, to be freed when it's safe. */ @@ -843,6 +845,14 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t __wt_cache_page_inmem_incr(session, parent, parent_incr); __wt_cache_page_inmem_decr(session, parent, parent_decr); + /* + * We've discarded the WT_REFs and swapping in a new page index released the page for eviction; + * we can no longer look inside the WT_REF or the page, be careful logging the results. + */ + __wt_verbose(session, WT_VERB_SPLIT, + "%p: split into parent, %" PRIu32 "->%" PRIu32 ", %" PRIu32 " deleted", (void *)ref, + parent_entries, result_entries, deleted_entries); + err: __wt_scr_free(session, &scr); /* @@ -851,10 +861,11 @@ err: */ switch (complete) { case WT_ERR_RETURN: - for (i = 0; i < parent_entries; ++i) { - next_ref = pindex->index[i]; - if (next_ref->state == WT_REF_SPLIT) - WT_REF_SET_STATE(next_ref, WT_REF_DELETED); + /* Unlock WT_REFs locked because they were in a deleted state. */ + for (i = 0; i < deleted_entries; ++i) { + next_ref = pindex->index[deleted_refs[i]]; + WT_ASSERT(session, next_ref->state == WT_REF_LOCKED); + WT_REF_SET_STATE(next_ref, WT_REF_DELETED); } __wt_free_ref_index(session, NULL, alloc_index, false); diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index f39a87d53c9..86c8159b2a7 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -632,23 +632,6 @@ __wt_off_page(WT_PAGE *page, const void *p) } /* - * __wt_ref_addr_free -- - * Free the address in a reference, if necessary. - */ -static inline void -__wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref) -{ - if (ref->addr == NULL) - return; - - if (ref->home == NULL || __wt_off_page(ref->home, ref->addr)) { - __wt_free(session, ((WT_ADDR *)ref->addr)->addr); - __wt_free(session, ref->addr); - } - ref->addr = NULL; -} - -/* * __wt_ref_key -- * Return a reference to a row-store internal page key as cheaply as possible. */ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 4ad93180572..dbaba203140 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -1695,6 +1695,7 @@ extern void __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l); extern void __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l); extern void __wt_rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r); extern void __wt_rec_dictionary_reset(WT_RECONCILE *r); +extern void __wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_root_ref_init( WT_SESSION_IMPL *session, WT_REF *root_ref, WT_PAGE *root, bool is_recno); @@ -2136,7 +2137,6 @@ static inline void __wt_rec_cell_build_addr( static inline void __wt_rec_image_copy(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *kv); static inline void __wt_rec_incr( WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size); -static inline void __wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref); static inline void __wt_ref_info( WT_SESSION_IMPL *session, WT_REF *ref, const uint8_t **addrp, size_t *sizep, bool *is_leafp); static inline void __wt_ref_info_lock( diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index a24cd11116b..cfadd17e4f5 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -2171,10 +2171,11 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) { WT_BM *bm; WT_BTREE *btree; + WT_DECL_RET; WT_MULTI *multi; WT_PAGE_MODIFY *mod; WT_REF *ref; - uint32_t i; + uint32_t i, previous_state; btree = S2BT(session); bm = btree->bm; @@ -2198,7 +2199,24 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) */ if (__wt_ref_is_root(ref)) break; - WT_RET(__wt_ref_block_free(session, ref)); + + /* + * We're about to discard the WT_REF.addr field, and that can race with the cursor walk code + * examining tree WT_REFs for leaf pages. Lock the WT_REF unless we know we have exclusive + * access. + */ + previous_state = WT_REF_LOCKED; /* -Wuninitialized */ + if (!F_ISSET(r, WT_REC_EVICT)) + for (;; __wt_yield()) { + previous_state = ref->state; + if (previous_state != WT_REF_LOCKED && + WT_REF_CAS_STATE(session, ref, previous_state, WT_REF_LOCKED)) + break; + } + ret = __wt_ref_block_free(session, ref); + if (!F_ISSET(r, WT_REC_EVICT)) + WT_REF_SET_STATE(ref, previous_state); + WT_RET(ret); break; case WT_PM_REC_EMPTY: /* Page deleted */ break; |