summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_discard.c17
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_split.c239
-rw-r--r--src/third_party/wiredtiger/src/include/btree.i17
-rw-r--r--src/third_party/wiredtiger/src/include/extern.h2
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_write.c22
6 files changed, 135 insertions, 164 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 25637cb1e31..283621c29c6 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-4.4",
- "commit": "0a0a6354a8d7eea17c803877f9921ed6cedc31c5"
+ "commit": "7e595e4a3a9c30c9db0eb33f7da72c97526a2b99"
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c
index fcac0745390..e3f1413c7d4 100644
--- a/src/third_party/wiredtiger/src/btree/bt_discard.c
+++ b/src/third_party/wiredtiger/src/btree/bt_discard.c
@@ -215,23 +215,6 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
}
/*
- * __wt_ref_addr_free --
- * Free the address in a reference, if necessary.
- */
-void
-__wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref)
-{
- if (ref->addr == NULL)
- return;
-
- if (ref->home == NULL || __wt_off_page(ref->home, ref->addr)) {
- __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
- __wt_free(session, ref->addr);
- }
- ref->addr = NULL;
-}
-
-/*
* __wt_free_ref --
* Discard the contents of a WT_REF structure (optionally including the pages it references).
*/
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index a047fa63c3f..3cc0d4018f0 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -590,60 +590,6 @@ err:
}
/*
- * __split_parent_discard_ref --
- * Worker routine to discard WT_REFs for the split-parent function.
- */
-static int
-__split_parent_discard_ref(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *parent, size_t *decrp,
- uint64_t split_gen, bool exclusive)
-{
- WT_DECL_RET;
- WT_IKEY *ikey;
- size_t size;
-
- /*
- * Row-store trees where the old version of the page is being discarded: the previous parent
- * page's key for this child page may have been an on-page overflow key. In that case, if the
- * key hasn't been deleted, delete it now, including its backing blocks. We are exchanging the
- * WT_REF that referenced it for the split page WT_REFs and their keys, and there's no longer
- * any reference to it. Done after completing the split (if we failed, we'd leak the underlying
- * blocks, but the parent page would be unaffected).
- */
- if (parent->type == WT_PAGE_ROW_INT) {
- WT_TRET(__split_ovfl_key_cleanup(session, parent, ref));
- ikey = __wt_ref_key_instantiated(ref);
- if (ikey != NULL) {
- size = sizeof(WT_IKEY) + ikey->size;
- WT_TRET(__split_safe_free(session, split_gen, exclusive, ikey, size));
- *decrp += size;
- }
- }
-
- /*
- * The page-delete and history store memory weren't added to the parent's footprint, ignore it
- * here.
- */
- if (ref->page_del != NULL) {
- __wt_free(session, ref->page_del->update_list);
- __wt_free(session, ref->page_del);
- }
-
- /* Free the backing block and address. */
- WT_TRET(__wt_ref_block_free(session, ref));
-
- /*
- * Set the WT_REF state. It may be possible to immediately free the WT_REF, so this is our last
- * chance.
- */
- WT_REF_SET_STATE(ref, WT_REF_SPLIT);
-
- WT_TRET(__split_safe_free(session, split_gen, exclusive, ref, sizeof(WT_REF)));
- *decrp += sizeof(WT_REF);
-
- return (ret);
-}
-
-/*
* __split_parent --
* Resolve a multi-page split, inserting new information into the parent.
*/
@@ -654,6 +600,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t
WT_BTREE *btree;
WT_DECL_ITEM(scr);
WT_DECL_RET;
+ WT_IKEY *ikey;
WT_PAGE *parent;
WT_PAGE_INDEX *alloc_index, *pindex;
WT_REF **alloc_refp, *next_ref;
@@ -670,7 +617,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t
alloc_index = pindex = NULL;
parent_decr = 0;
- deleted_refs = NULL;
empty_parent = false;
complete = WT_ERR_RETURN;
@@ -687,35 +633,34 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t
/*
* Remove any refs to deleted pages while we are splitting, we have the internal page locked
- * down and are copying the refs into a new page-index array anyway.
+ * down, and are copying the refs into a new array anyway. Switch them to the special split
+ * state, so that any reading thread will restart.
*
* We can't do this if there is a sync running in the tree in another session: removing the refs
* frees the blocks for the deleted pages, which can corrupt the free list calculated by the
* sync.
*/
- deleted_entries = 0;
- if (!WT_BTREE_SYNCING(btree) || WT_SESSION_BTREE_SYNC(session))
- for (i = 0; i < parent_entries; ++i) {
- next_ref = pindex->index[i];
- WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
-
- /* Protect against including the replaced WT_REF in the list of deleted items. */
- if (next_ref != ref && next_ref->state == WT_REF_DELETED &&
- __wt_delete_page_skip(session, next_ref, true) &&
- WT_REF_CAS_STATE(session, next_ref, WT_REF_DELETED, WT_REF_LOCKED)) {
- if (scr == NULL)
- WT_ERR(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr));
- WT_ERR(__wt_buf_grow(session, scr, (deleted_entries + 1) * sizeof(uint32_t)));
- deleted_refs = scr->mem;
- deleted_refs[deleted_entries++] = i;
- }
+ WT_ERR(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr));
+ for (deleted_entries = 0, i = 0; i < parent_entries; ++i) {
+ next_ref = pindex->index[i];
+ WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
+ if ((discard && next_ref == ref) ||
+ ((!WT_BTREE_SYNCING(btree) || WT_SESSION_BTREE_SYNC(session)) &&
+ next_ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, next_ref, true) &&
+ WT_REF_CAS_STATE(session, next_ref, WT_REF_DELETED, WT_REF_SPLIT))) {
+ WT_ERR(__wt_buf_grow(session, scr, (deleted_entries + 1) * sizeof(uint32_t)));
+ deleted_refs = scr->mem;
+ deleted_refs[deleted_entries++] = i;
}
+ }
/*
- * The final entry count is the original count, where one entry will be replaced by some number
- * of new entries, and some number will be deleted.
+ * The final entry count consists of the original count, plus any new pages, less any WT_REFs
+ * we're removing (deleted entries plus the entry we're replacing).
*/
- result_entries = (parent_entries + (new_entries - 1)) - deleted_entries;
+ result_entries = (parent_entries + new_entries) - deleted_entries;
+ if (!discard)
+ --result_entries;
/*
* If there are no remaining entries on the parent, give up, we can't leave an empty internal
@@ -743,29 +688,20 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t
alloc_index->entries = result_entries;
for (alloc_refp = alloc_index->index, hint = i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
- if (next_ref == ref) {
+ if (next_ref == ref)
for (j = 0; j < new_entries; ++j) {
ref_new[j]->home = parent;
ref_new[j]->pindex_hint = hint++;
*alloc_refp++ = ref_new[j];
}
- continue;
- }
-
- /* Skip refs we have marked for deletion. */
- if (deleted_entries != 0) {
- for (j = 0; j < deleted_entries; ++j)
- if (deleted_refs[j] == i)
- break;
- if (j < deleted_entries)
- continue;
+ else if (next_ref->state != WT_REF_SPLIT) {
+ /* Skip refs we have marked for deletion. */
+ next_ref->pindex_hint = hint++;
+ *alloc_refp++ = next_ref;
}
-
- next_ref->pindex_hint = hint++;
- *alloc_refp++ = next_ref;
}
- /* Check we filled in the expected number of entries. */
+ /* Check that we filled in all the entries. */
WT_ASSERT(session, alloc_refp - alloc_index->index == (ptrdiff_t)result_entries);
/* Start making real changes to the tree, errors are fatal. */
@@ -794,6 +730,26 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t
split_gen = __wt_gen_next(session, WT_GEN_SPLIT);
parent->pg_intl_split_gen = split_gen;
+ /*
+ * If discarding the page's original WT_REF field, reset it to split. Threads cursoring through
+ * the tree were blocked because that WT_REF state was set to locked. Changing the locked state
+ * to split unblocks those threads and causes them to re-calculate their position based on the
+ * just-updated parent page's index.
+ */
+ if (discard) {
+ /*
+ * Set the discarded WT_REF state to split, ensuring we don't race with any discard of the
+ * WT_REF deleted fields.
+ */
+ WT_REF_SET_STATE(ref, WT_REF_SPLIT);
+
+ /*
+ * Push out the change: not required for correctness, but stops threads spinning on
+ * incorrect page references.
+ */
+ WT_FULL_BARRIER();
+ }
+
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session, __split_verify_intl_key_order(session, parent));
#endif
@@ -802,24 +758,72 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t
complete = WT_ERR_IGNORE;
/*
- * The new page index is in place. Threads cursoring in the tree are blocked because the WT_REF
- * being discarded (if any), and deleted WT_REFs (if any) are in a locked state. Changing the
- * locked state to split unblocks those threads and causes them to re-calculate their position
- * based on the just-updated parent page's index. The split state doesn't lock the WT_REF.addr
- * information which is read by cursor threads in some tree-walk cases: free the WT_REF we were
- * splitting and any deleted WT_REFs we found, modulo the usual safe free semantics, then reset
- * the WT_REF state.
+ * !!!
+ * Swapping in the new page index released the page for eviction, we can
+ * no longer look inside the page.
+ */
+ if (ref->page == NULL)
+ __wt_verbose(session, WT_VERB_SPLIT,
+ "%p: reverse split into parent %p, %" PRIu32 " -> %" PRIu32 " (-%" PRIu32 ")",
+ (void *)ref->page, (void *)parent, parent_entries, result_entries,
+ parent_entries - result_entries);
+ else
+ __wt_verbose(session, WT_VERB_SPLIT,
+ "%p: split into parent %p, %" PRIu32 " -> %" PRIu32 " (+%" PRIu32 ")", (void *)ref->page,
+ (void *)parent, parent_entries, result_entries, result_entries - parent_entries);
+
+ /*
+ * The new page index is in place, free the WT_REF we were splitting and any deleted WT_REFs we
+ * found, modulo the usual safe free semantics.
*/
- if (discard) {
- WT_ASSERT(session, exclusive || ref->state == WT_REF_LOCKED);
- WT_TRET(
- __split_parent_discard_ref(session, ref, parent, &parent_decr, split_gen, exclusive));
- }
- for (i = 0; i < deleted_entries; ++i) {
+ for (i = 0, deleted_refs = scr->mem; i < deleted_entries; ++i) {
next_ref = pindex->index[deleted_refs[i]];
- WT_ASSERT(session, next_ref->state == WT_REF_LOCKED);
- WT_TRET(__split_parent_discard_ref(
- session, next_ref, parent, &parent_decr, split_gen, exclusive));
+#ifdef HAVE_DIAGNOSTIC
+ {
+ uint32_t ref_state;
+ WT_ORDERED_READ(ref_state, next_ref->state);
+ WT_ASSERT(session, ref_state == WT_REF_LOCKED || ref_state == WT_REF_SPLIT);
+ }
+#endif
+
+ /*
+ * We set the WT_REF to split, discard it, freeing any resources it holds.
+ *
+ * Row-store trees where the old version of the page is being discarded: the previous parent
+ * page's key for this child page may have been an on-page overflow key. In that case, if
+ * the key hasn't been deleted, delete it now, including its backing blocks. We are
+ * exchanging the WT_REF that referenced it for the split page WT_REFs and their keys, and
+ * there's no longer any reference to it. Done after completing the split (if we failed,
+ * we'd leak the underlying blocks, but the parent page would be unaffected).
+ */
+ if (parent->type == WT_PAGE_ROW_INT) {
+ WT_TRET(__split_ovfl_key_cleanup(session, parent, next_ref));
+ ikey = __wt_ref_key_instantiated(next_ref);
+ if (ikey != NULL) {
+ size = sizeof(WT_IKEY) + ikey->size;
+ WT_TRET(__split_safe_free(session, split_gen, exclusive, ikey, size));
+ parent_decr += size;
+ }
+ }
+
+ /* Check that we are not discarding active history. */
+ WT_ASSERT(session, !__wt_page_las_active(session, next_ref));
+
+ /*
+ * The page-delete and lookaside memory weren't added to the parent's footprint, ignore it
+ * here.
+ */
+ if (next_ref->page_del != NULL) {
+ __wt_free(session, next_ref->page_del->update_list);
+ __wt_free(session, next_ref->page_del);
+ }
+ __wt_free(session, next_ref->page_las);
+
+ /* Free the backing block and address. */
+ WT_TRET(__wt_ref_block_free(session, next_ref));
+
+ WT_TRET(__split_safe_free(session, split_gen, exclusive, next_ref, sizeof(WT_REF)));
+ parent_decr += sizeof(WT_REF);
}
/*
@@ -828,12 +832,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t
*/
/*
- * Don't cache the change: not required for correctness, but stops threads spinning on incorrect
- * page references.
- */
- WT_FULL_BARRIER();
-
- /*
* We can't free the previous page index, there may be threads using it. Add it to the session
* discard list, to be freed when it's safe.
*/
@@ -845,14 +843,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t
__wt_cache_page_inmem_incr(session, parent, parent_incr);
__wt_cache_page_inmem_decr(session, parent, parent_decr);
- /*
- * We've discarded the WT_REFs and swapping in a new page index released the page for eviction;
- * we can no longer look inside the WT_REF or the page, be careful logging the results.
- */
- __wt_verbose(session, WT_VERB_SPLIT,
- "%p: split into parent, %" PRIu32 "->%" PRIu32 ", %" PRIu32 " deleted", (void *)ref,
- parent_entries, result_entries, deleted_entries);
-
err:
__wt_scr_free(session, &scr);
/*
@@ -861,11 +851,10 @@ err:
*/
switch (complete) {
case WT_ERR_RETURN:
- /* Unlock WT_REFs locked because they were in a deleted state. */
- for (i = 0; i < deleted_entries; ++i) {
- next_ref = pindex->index[deleted_refs[i]];
- WT_ASSERT(session, next_ref->state == WT_REF_LOCKED);
- WT_REF_SET_STATE(next_ref, WT_REF_DELETED);
+ for (i = 0; i < parent_entries; ++i) {
+ next_ref = pindex->index[i];
+ if (next_ref->state == WT_REF_SPLIT)
+ WT_REF_SET_STATE(next_ref, WT_REF_DELETED);
}
__wt_free_ref_index(session, NULL, alloc_index, false);
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
index 86c8159b2a7..f39a87d53c9 100644
--- a/src/third_party/wiredtiger/src/include/btree.i
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -632,6 +632,23 @@ __wt_off_page(WT_PAGE *page, const void *p)
}
/*
+ * __wt_ref_addr_free --
+ * Free the address in a reference, if necessary.
+ */
+static inline void
+__wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ if (ref->addr == NULL)
+ return;
+
+ if (ref->home == NULL || __wt_off_page(ref->home, ref->addr)) {
+ __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ }
+ ref->addr = NULL;
+}
+
+/*
* __wt_ref_key --
* Return a reference to a row-store internal page key as cheaply as possible.
*/
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index dbaba203140..4ad93180572 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -1695,7 +1695,6 @@ extern void __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l);
extern void __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l);
extern void __wt_rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r);
extern void __wt_rec_dictionary_reset(WT_RECONCILE *r);
-extern void __wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref);
extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref);
extern void __wt_root_ref_init(
WT_SESSION_IMPL *session, WT_REF *root_ref, WT_PAGE *root, bool is_recno);
@@ -2137,6 +2136,7 @@ static inline void __wt_rec_cell_build_addr(
static inline void __wt_rec_image_copy(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *kv);
static inline void __wt_rec_incr(
WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size);
+static inline void __wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref);
static inline void __wt_ref_info(
WT_SESSION_IMPL *session, WT_REF *ref, const uint8_t **addrp, size_t *sizep, bool *is_leafp);
static inline void __wt_ref_info_lock(
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index cfadd17e4f5..a24cd11116b 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -2171,11 +2171,10 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
{
WT_BM *bm;
WT_BTREE *btree;
- WT_DECL_RET;
WT_MULTI *multi;
WT_PAGE_MODIFY *mod;
WT_REF *ref;
- uint32_t i, previous_state;
+ uint32_t i;
btree = S2BT(session);
bm = btree->bm;
@@ -2199,24 +2198,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
*/
if (__wt_ref_is_root(ref))
break;
-
- /*
- * We're about to discard the WT_REF.addr field, and that can race with the cursor walk code
- * examining tree WT_REFs for leaf pages. Lock the WT_REF unless we know we have exclusive
- * access.
- */
- previous_state = WT_REF_LOCKED; /* -Wuninitialized */
- if (!F_ISSET(r, WT_REC_EVICT))
- for (;; __wt_yield()) {
- previous_state = ref->state;
- if (previous_state != WT_REF_LOCKED &&
- WT_REF_CAS_STATE(session, ref, previous_state, WT_REF_LOCKED))
- break;
- }
- ret = __wt_ref_block_free(session, ref);
- if (!F_ISSET(r, WT_REC_EVICT))
- WT_REF_SET_STATE(ref, previous_state);
- WT_RET(ret);
+ WT_RET(__wt_ref_block_free(session, ref));
break;
case WT_PM_REC_EMPTY: /* Page deleted */
break;