summaryrefslogtreecommitdiff
path: root/src/third_party
diff options
context:
space:
mode:
authorLuke Chen <luke.chen@mongodb.com>2020-02-27 05:49:43 +0000
committerevergreen <evergreen@mongodb.com>2020-02-27 05:49:43 +0000
commita5ec32526ad4528088b78bb30fbda08393863a69 (patch)
tree6dc7577b6e6ff35be5cb15a4c2facd15acf2dc77 /src/third_party
parent1ba6e4193cfc27277e4ed65b033ee03a81b0c3e1 (diff)
downloadmongo-a5ec32526ad4528088b78bb30fbda08393863a69.tar.gz
Import wiredtiger: 0a0a6354a8d7eea17c803877f9921ed6cedc31c5 from branch mongodb-4.4
ref: 58baf804dd..0a0a6354a8 for: 4.3.4 WT-5518 Split-parent code can race with other threads when checking the WT_REF.state WT-5552 Checkpoint reconciliation and page splits free the WT_REF.addr field without locking WT-5658 Fix heap-use-after-free in parent split code
Diffstat (limited to 'src/third_party')
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_discard.c17
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_split.c239
-rw-r--r--src/third_party/wiredtiger/src/include/btree.i17
-rw-r--r--src/third_party/wiredtiger/src/include/extern.h2
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_write.c22
6 files changed, 164 insertions, 135 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 9b1647d3c2f..25637cb1e31 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-4.4",
- "commit": "58baf804dd6e5a72c4e122cfb696e2d06a9fc888"
+ "commit": "0a0a6354a8d7eea17c803877f9921ed6cedc31c5"
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c
index e3f1413c7d4..fcac0745390 100644
--- a/src/third_party/wiredtiger/src/btree/bt_discard.c
+++ b/src/third_party/wiredtiger/src/btree/bt_discard.c
@@ -215,6 +215,23 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
}
/*
+ * __wt_ref_addr_free --
+ * Free the address in a reference, if necessary.
+ */
+void
+__wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ if (ref->addr == NULL)
+ return;
+
+ if (ref->home == NULL || __wt_off_page(ref->home, ref->addr)) {
+ __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ }
+ ref->addr = NULL;
+}
+
+/*
* __wt_free_ref --
* Discard the contents of a WT_REF structure (optionally including the pages it references).
*/
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 3cc0d4018f0..a047fa63c3f 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -590,6 +590,60 @@ err:
}
/*
+ * __split_parent_discard_ref --
+ * Worker routine to discard WT_REFs for the split-parent function.
+ */
+static int
+__split_parent_discard_ref(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *parent, size_t *decrp,
+ uint64_t split_gen, bool exclusive)
+{
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ size_t size;
+
+ /*
+ * Row-store trees where the old version of the page is being discarded: the previous parent
+ * page's key for this child page may have been an on-page overflow key. In that case, if the
+ * key hasn't been deleted, delete it now, including its backing blocks. We are exchanging the
+ * WT_REF that referenced it for the split page WT_REFs and their keys, and there's no longer
+ * any reference to it. Done after completing the split (if we failed, we'd leak the underlying
+ * blocks, but the parent page would be unaffected).
+ */
+ if (parent->type == WT_PAGE_ROW_INT) {
+ WT_TRET(__split_ovfl_key_cleanup(session, parent, ref));
+ ikey = __wt_ref_key_instantiated(ref);
+ if (ikey != NULL) {
+ size = sizeof(WT_IKEY) + ikey->size;
+ WT_TRET(__split_safe_free(session, split_gen, exclusive, ikey, size));
+ *decrp += size;
+ }
+ }
+
+ /*
+ * The page-delete and history store memory weren't added to the parent's footprint, ignore it
+ * here.
+ */
+ if (ref->page_del != NULL) {
+ __wt_free(session, ref->page_del->update_list);
+ __wt_free(session, ref->page_del);
+ }
+
+ /* Free the backing block and address. */
+ WT_TRET(__wt_ref_block_free(session, ref));
+
+ /*
+ * Set the WT_REF state. It may be possible to immediately free the WT_REF, so this is our last
+ * chance.
+ */
+ WT_REF_SET_STATE(ref, WT_REF_SPLIT);
+
+ WT_TRET(__split_safe_free(session, split_gen, exclusive, ref, sizeof(WT_REF)));
+ *decrp += sizeof(WT_REF);
+
+ return (ret);
+}
+
+/*
* __split_parent --
* Resolve a multi-page split, inserting new information into the parent.
*/
@@ -600,7 +654,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t
WT_BTREE *btree;
WT_DECL_ITEM(scr);
WT_DECL_RET;
- WT_IKEY *ikey;
WT_PAGE *parent;
WT_PAGE_INDEX *alloc_index, *pindex;
WT_REF **alloc_refp, *next_ref;
@@ -617,6 +670,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t
alloc_index = pindex = NULL;
parent_decr = 0;
+ deleted_refs = NULL;
empty_parent = false;
complete = WT_ERR_RETURN;
@@ -633,34 +687,35 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t
/*
* Remove any refs to deleted pages while we are splitting, we have the internal page locked
- * down, and are copying the refs into a new array anyway. Switch them to the special split
- * state, so that any reading thread will restart.
+ * down and are copying the refs into a new page-index array anyway.
*
* We can't do this if there is a sync running in the tree in another session: removing the refs
* frees the blocks for the deleted pages, which can corrupt the free list calculated by the
* sync.
*/
- WT_ERR(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr));
- for (deleted_entries = 0, i = 0; i < parent_entries; ++i) {
- next_ref = pindex->index[i];
- WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
- if ((discard && next_ref == ref) ||
- ((!WT_BTREE_SYNCING(btree) || WT_SESSION_BTREE_SYNC(session)) &&
- next_ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, next_ref, true) &&
- WT_REF_CAS_STATE(session, next_ref, WT_REF_DELETED, WT_REF_SPLIT))) {
- WT_ERR(__wt_buf_grow(session, scr, (deleted_entries + 1) * sizeof(uint32_t)));
- deleted_refs = scr->mem;
- deleted_refs[deleted_entries++] = i;
+ deleted_entries = 0;
+ if (!WT_BTREE_SYNCING(btree) || WT_SESSION_BTREE_SYNC(session))
+ for (i = 0; i < parent_entries; ++i) {
+ next_ref = pindex->index[i];
+ WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
+
+ /* Protect against including the replaced WT_REF in the list of deleted items. */
+ if (next_ref != ref && next_ref->state == WT_REF_DELETED &&
+ __wt_delete_page_skip(session, next_ref, true) &&
+ WT_REF_CAS_STATE(session, next_ref, WT_REF_DELETED, WT_REF_LOCKED)) {
+ if (scr == NULL)
+ WT_ERR(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr));
+ WT_ERR(__wt_buf_grow(session, scr, (deleted_entries + 1) * sizeof(uint32_t)));
+ deleted_refs = scr->mem;
+ deleted_refs[deleted_entries++] = i;
+ }
}
- }
/*
- * The final entry count consists of the original count, plus any new pages, less any WT_REFs
- * we're removing (deleted entries plus the entry we're replacing).
+ * The final entry count is the original count, where one entry will be replaced by some number
+ * of new entries, and some number will be deleted.
*/
- result_entries = (parent_entries + new_entries) - deleted_entries;
- if (!discard)
- --result_entries;
+ result_entries = (parent_entries + (new_entries - 1)) - deleted_entries;
/*
* If there are no remaining entries on the parent, give up, we can't leave an empty internal
@@ -688,20 +743,29 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t
alloc_index->entries = result_entries;
for (alloc_refp = alloc_index->index, hint = i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
- if (next_ref == ref)
+ if (next_ref == ref) {
for (j = 0; j < new_entries; ++j) {
ref_new[j]->home = parent;
ref_new[j]->pindex_hint = hint++;
*alloc_refp++ = ref_new[j];
}
- else if (next_ref->state != WT_REF_SPLIT) {
- /* Skip refs we have marked for deletion. */
- next_ref->pindex_hint = hint++;
- *alloc_refp++ = next_ref;
+ continue;
+ }
+
+ /* Skip refs we have marked for deletion. */
+ if (deleted_entries != 0) {
+ for (j = 0; j < deleted_entries; ++j)
+ if (deleted_refs[j] == i)
+ break;
+ if (j < deleted_entries)
+ continue;
}
+
+ next_ref->pindex_hint = hint++;
+ *alloc_refp++ = next_ref;
}
- /* Check that we filled in all the entries. */
+ /* Check we filled in the expected number of entries. */
WT_ASSERT(session, alloc_refp - alloc_index->index == (ptrdiff_t)result_entries);
/* Start making real changes to the tree, errors are fatal. */
@@ -730,26 +794,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t
split_gen = __wt_gen_next(session, WT_GEN_SPLIT);
parent->pg_intl_split_gen = split_gen;
- /*
- * If discarding the page's original WT_REF field, reset it to split. Threads cursoring through
- * the tree were blocked because that WT_REF state was set to locked. Changing the locked state
- * to split unblocks those threads and causes them to re-calculate their position based on the
- * just-updated parent page's index.
- */
- if (discard) {
- /*
- * Set the discarded WT_REF state to split, ensuring we don't race with any discard of the
- * WT_REF deleted fields.
- */
- WT_REF_SET_STATE(ref, WT_REF_SPLIT);
-
- /*
- * Push out the change: not required for correctness, but stops threads spinning on
- * incorrect page references.
- */
- WT_FULL_BARRIER();
- }
-
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session, __split_verify_intl_key_order(session, parent));
#endif
@@ -758,72 +802,24 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t
complete = WT_ERR_IGNORE;
/*
- * !!!
- * Swapping in the new page index released the page for eviction, we can
- * no longer look inside the page.
- */
- if (ref->page == NULL)
- __wt_verbose(session, WT_VERB_SPLIT,
- "%p: reverse split into parent %p, %" PRIu32 " -> %" PRIu32 " (-%" PRIu32 ")",
- (void *)ref->page, (void *)parent, parent_entries, result_entries,
- parent_entries - result_entries);
- else
- __wt_verbose(session, WT_VERB_SPLIT,
- "%p: split into parent %p, %" PRIu32 " -> %" PRIu32 " (+%" PRIu32 ")", (void *)ref->page,
- (void *)parent, parent_entries, result_entries, result_entries - parent_entries);
-
- /*
- * The new page index is in place, free the WT_REF we were splitting and any deleted WT_REFs we
- * found, modulo the usual safe free semantics.
+ * The new page index is in place. Threads cursoring in the tree are blocked because the WT_REF
+ * being discarded (if any), and deleted WT_REFs (if any) are in a locked state. Changing the
+ * locked state to split unblocks those threads and causes them to re-calculate their position
+ * based on the just-updated parent page's index. The split state doesn't lock the WT_REF.addr
+ * information which is read by cursor threads in some tree-walk cases: free the WT_REF we were
+ * splitting and any deleted WT_REFs we found, modulo the usual safe free semantics, then reset
+ * the WT_REF state.
*/
- for (i = 0, deleted_refs = scr->mem; i < deleted_entries; ++i) {
+ if (discard) {
+ WT_ASSERT(session, exclusive || ref->state == WT_REF_LOCKED);
+ WT_TRET(
+ __split_parent_discard_ref(session, ref, parent, &parent_decr, split_gen, exclusive));
+ }
+ for (i = 0; i < deleted_entries; ++i) {
next_ref = pindex->index[deleted_refs[i]];
-#ifdef HAVE_DIAGNOSTIC
- {
- uint32_t ref_state;
- WT_ORDERED_READ(ref_state, next_ref->state);
- WT_ASSERT(session, ref_state == WT_REF_LOCKED || ref_state == WT_REF_SPLIT);
- }
-#endif
-
- /*
- * We set the WT_REF to split, discard it, freeing any resources it holds.
- *
- * Row-store trees where the old version of the page is being discarded: the previous parent
- * page's key for this child page may have been an on-page overflow key. In that case, if
- * the key hasn't been deleted, delete it now, including its backing blocks. We are
- * exchanging the WT_REF that referenced it for the split page WT_REFs and their keys, and
- * there's no longer any reference to it. Done after completing the split (if we failed,
- * we'd leak the underlying blocks, but the parent page would be unaffected).
- */
- if (parent->type == WT_PAGE_ROW_INT) {
- WT_TRET(__split_ovfl_key_cleanup(session, parent, next_ref));
- ikey = __wt_ref_key_instantiated(next_ref);
- if (ikey != NULL) {
- size = sizeof(WT_IKEY) + ikey->size;
- WT_TRET(__split_safe_free(session, split_gen, exclusive, ikey, size));
- parent_decr += size;
- }
- }
-
- /* Check that we are not discarding active history. */
- WT_ASSERT(session, !__wt_page_las_active(session, next_ref));
-
- /*
- * The page-delete and lookaside memory weren't added to the parent's footprint, ignore it
- * here.
- */
- if (next_ref->page_del != NULL) {
- __wt_free(session, next_ref->page_del->update_list);
- __wt_free(session, next_ref->page_del);
- }
- __wt_free(session, next_ref->page_las);
-
- /* Free the backing block and address. */
- WT_TRET(__wt_ref_block_free(session, next_ref));
-
- WT_TRET(__split_safe_free(session, split_gen, exclusive, next_ref, sizeof(WT_REF)));
- parent_decr += sizeof(WT_REF);
+ WT_ASSERT(session, next_ref->state == WT_REF_LOCKED);
+ WT_TRET(__split_parent_discard_ref(
+ session, next_ref, parent, &parent_decr, split_gen, exclusive));
}
/*
@@ -832,6 +828,12 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t
*/
/*
+ * Don't cache the change: not required for correctness, but stops threads spinning on incorrect
+ * page references.
+ */
+ WT_FULL_BARRIER();
+
+ /*
* We can't free the previous page index, there may be threads using it. Add it to the session
* discard list, to be freed when it's safe.
*/
@@ -843,6 +845,14 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t
__wt_cache_page_inmem_incr(session, parent, parent_incr);
__wt_cache_page_inmem_decr(session, parent, parent_decr);
+ /*
+ * We've discarded the WT_REFs and swapping in a new page index released the page for eviction;
+ * we can no longer look inside the WT_REF or the page, be careful logging the results.
+ */
+ __wt_verbose(session, WT_VERB_SPLIT,
+ "%p: split into parent, %" PRIu32 "->%" PRIu32 ", %" PRIu32 " deleted", (void *)ref,
+ parent_entries, result_entries, deleted_entries);
+
err:
__wt_scr_free(session, &scr);
/*
@@ -851,10 +861,11 @@ err:
*/
switch (complete) {
case WT_ERR_RETURN:
- for (i = 0; i < parent_entries; ++i) {
- next_ref = pindex->index[i];
- if (next_ref->state == WT_REF_SPLIT)
- WT_REF_SET_STATE(next_ref, WT_REF_DELETED);
+ /* Unlock WT_REFs locked because they were in a deleted state. */
+ for (i = 0; i < deleted_entries; ++i) {
+ next_ref = pindex->index[deleted_refs[i]];
+ WT_ASSERT(session, next_ref->state == WT_REF_LOCKED);
+ WT_REF_SET_STATE(next_ref, WT_REF_DELETED);
}
__wt_free_ref_index(session, NULL, alloc_index, false);
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
index f39a87d53c9..86c8159b2a7 100644
--- a/src/third_party/wiredtiger/src/include/btree.i
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -632,23 +632,6 @@ __wt_off_page(WT_PAGE *page, const void *p)
}
/*
- * __wt_ref_addr_free --
- * Free the address in a reference, if necessary.
- */
-static inline void
-__wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref)
-{
- if (ref->addr == NULL)
- return;
-
- if (ref->home == NULL || __wt_off_page(ref->home, ref->addr)) {
- __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
- __wt_free(session, ref->addr);
- }
- ref->addr = NULL;
-}
-
-/*
* __wt_ref_key --
* Return a reference to a row-store internal page key as cheaply as possible.
*/
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 4ad93180572..dbaba203140 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -1695,6 +1695,7 @@ extern void __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l);
extern void __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l);
extern void __wt_rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r);
extern void __wt_rec_dictionary_reset(WT_RECONCILE *r);
+extern void __wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref);
extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref);
extern void __wt_root_ref_init(
WT_SESSION_IMPL *session, WT_REF *root_ref, WT_PAGE *root, bool is_recno);
@@ -2136,7 +2137,6 @@ static inline void __wt_rec_cell_build_addr(
static inline void __wt_rec_image_copy(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *kv);
static inline void __wt_rec_incr(
WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size);
-static inline void __wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref);
static inline void __wt_ref_info(
WT_SESSION_IMPL *session, WT_REF *ref, const uint8_t **addrp, size_t *sizep, bool *is_leafp);
static inline void __wt_ref_info_lock(
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index a24cd11116b..cfadd17e4f5 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -2171,10 +2171,11 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
{
WT_BM *bm;
WT_BTREE *btree;
+ WT_DECL_RET;
WT_MULTI *multi;
WT_PAGE_MODIFY *mod;
WT_REF *ref;
- uint32_t i;
+ uint32_t i, previous_state;
btree = S2BT(session);
bm = btree->bm;
@@ -2198,7 +2199,24 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
*/
if (__wt_ref_is_root(ref))
break;
- WT_RET(__wt_ref_block_free(session, ref));
+
+ /*
+ * We're about to discard the WT_REF.addr field, and that can race with the cursor walk code
+ * examining tree WT_REFs for leaf pages. Lock the WT_REF unless we know we have exclusive
+ * access.
+ */
+ previous_state = WT_REF_LOCKED; /* -Wuninitialized */
+ if (!F_ISSET(r, WT_REC_EVICT))
+ for (;; __wt_yield()) {
+ previous_state = ref->state;
+ if (previous_state != WT_REF_LOCKED &&
+ WT_REF_CAS_STATE(session, ref, previous_state, WT_REF_LOCKED))
+ break;
+ }
+ ret = __wt_ref_block_free(session, ref);
+ if (!F_ISSET(r, WT_REC_EVICT))
+ WT_REF_SET_STATE(ref, previous_state);
+ WT_RET(ret);
break;
case WT_PM_REC_EMPTY: /* Page deleted */
break;