summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Ramon Fernandez <ramon@mongodb.com> 2015-11-30 18:36:43 -0500
committer Ramon Fernandez <ramon@mongodb.com> 2015-11-30 18:37:32 -0500
commit 9add8acc69a119949a156b815003ecc15db75e0d (patch)
tree 4c2a614dc0bcd5642aa5556e6301caa3b8c2bc93
parent 005ef273da33bf832e1e12886d8225c9498d349d (diff)
download mongo-9add8acc69a119949a156b815003ecc15db75e0d.tar.gz
Import wiredtiger-wiredtiger-mongodb-3.0.7-9-gdeb2d81.tar.gz from wiredtiger branch mongodb-3.0
ref: cb64236..deb2d81
deb2d81 SERVER-21027 Reverse split if there are many deleted pages (3.0)
66a111e WT-2195 Fix a hang after giving up on a reverse split.
7b1398a SERVER-21027 Don't leave empty internal pages in the tree
c819d2f SERVER-21027 Fix reverse splits to keep the original child ref locked
00dfebc SERVER-21027 Reverse split if there are many deleted pages
-rw-r--r-- src/third_party/wiredtiger/dist/flags.py 1
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_delete.c 8
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_page.c 6
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_split.c 52
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_walk.c 14
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_file.c 6
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_page.c 69
-rw-r--r-- src/third_party/wiredtiger/src/include/btmem.h 1
-rw-r--r-- src/third_party/wiredtiger/src/include/btree.i 12
-rw-r--r-- src/third_party/wiredtiger/src/include/extern.h 5
-rw-r--r-- src/third_party/wiredtiger/src/include/flags.h 15
-rw-r--r-- src/third_party/wiredtiger/src/include/txn.i 7
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_write.c 1
13 files changed, 156 insertions, 41 deletions
diff --git a/src/third_party/wiredtiger/dist/flags.py b/src/third_party/wiredtiger/dist/flags.py
index 394154fcb75..bdd4bf45b6f 100644
--- a/src/third_party/wiredtiger/dist/flags.py
+++ b/src/third_party/wiredtiger/dist/flags.py
@@ -36,6 +36,7 @@ flags = {
'page_read' : [
'READ_CACHE',
'READ_COMPACT',
+ 'READ_NO_EMPTY',
'READ_NO_EVICT',
'READ_NO_GEN',
'READ_NO_WAIT',
diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c
index 1da2923489c..7313e31267f 100644
--- a/src/third_party/wiredtiger/src/btree/bt_delete.c
+++ b/src/third_party/wiredtiger/src/btree/bt_delete.c
@@ -214,10 +214,11 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
/*
* __wt_delete_page_skip --
- * If iterating a cursor, skip deleted pages that are visible to us.
+ * If iterating a cursor, skip deleted pages that are either visible to
+ * us or globally visible.
*/
bool
-__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
+__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
{
bool skip;
@@ -245,7 +246,8 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
return (false);
- skip = (ref->page_del == NULL ||
+ skip = ref->page_del == NULL || (visible_all ?
+ __wt_txn_visible_all(session, ref->page_del->txnid) :
__wt_txn_visible(session, ref->page_del->txnid));
WT_PUBLISH(ref->state, WT_REF_DELETED);
diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c
index 41cc8f9398c..ad8f0293108 100644
--- a/src/third_party/wiredtiger/src/btree/bt_page.c
+++ b/src/third_party/wiredtiger/src/btree/bt_page.c
@@ -76,8 +76,12 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
for (force_attempts = 0, oldgen = false, wait_cnt = 0;;) {
switch (ref->state) {
- case WT_REF_DISK:
case WT_REF_DELETED:
+ if (LF_ISSET(WT_READ_NO_EMPTY) &&
+ __wt_delete_page_skip(session, ref, false))
+ return (WT_NOTFOUND);
+ /* FALLTHROUGH */
+ case WT_REF_DISK:
if (LF_ISSET(WT_READ_CACHE))
return (WT_NOTFOUND);
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 739db727fb5..6f31ff89aa7 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -943,11 +943,11 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
* reading thread will restart. Include the ref we are splitting in
* the count to be deleted.
*/
- for (i = 0, deleted_entries = 1; i < parent_entries; ++i) {
+ for (deleted_entries = 1, i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
if (next_ref->state == WT_REF_DELETED &&
- __wt_delete_page_skip(session, next_ref) &&
+ __wt_delete_page_skip(session, next_ref, true) &&
__wt_atomic_casv32(
&next_ref->state, WT_REF_DELETED, WT_REF_SPLIT))
deleted_entries++;
@@ -960,6 +960,16 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
result_entries = (parent_entries + new_entries) - deleted_entries;
/*
+ * If the entire (sub)tree is empty, give up: we can't leave an empty
+ * internal page. Mark it to be evicted soon and clean up any
+ * references that have changed state.
+ */
+ if (result_entries == 0) {
+ __wt_page_evict_soon(parent);
+ goto err;
+ }
+
+ /*
* Allocate and initialize a new page index array for the parent, then
* copy references from the original index array, plus references from
* the newly created split array, into place.
@@ -1003,6 +1013,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
*alloc_refp++ = next_ref;
}
+ /* Check that we filled in all the entries. */
+ WT_ASSERT(session, alloc_refp - alloc_index->index == result_entries);
+
/*
* Update the parent page's index: this update makes the split visible
* to threads descending the tree.
@@ -1038,9 +1051,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
"%s split into parent %" PRIu32 " -> %" PRIu32
- " (%" PRIu32 ")",
- __wt_page_type_string(ref->page->type), parent_entries,
- result_entries, result_entries - parent_entries));
+ " (%" PRIu32 ")", ref->page == NULL ?
+ "reverse" : __wt_page_type_string(ref->page->type),
+ parent_entries, result_entries, result_entries - parent_entries));
/*
* The new page index is in place, free the WT_REF we were splitting
@@ -1132,14 +1145,21 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
__split_should_deepen(session, parent_ref))
ret = __split_deepen(session, parent);
-err: if (!complete)
+err: if (!complete) {
for (i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
if (next_ref->state == WT_REF_SPLIT)
next_ref->state = WT_REF_DELETED;
}
- __wt_free_ref_index(session, NULL, alloc_index, false);
+ /* If we gave up on a reverse split, unlock the child. */
+ if (ref_new == NULL) {
+ WT_ASSERT(session, ref->state == WT_REF_LOCKED);
+ ref->state = WT_REF_DELETED;
+ }
+
+ __wt_free_ref_index(session, NULL, alloc_index, false);
+ }
/*
* A note on error handling: if we completed the split, return success,
@@ -1440,6 +1460,24 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
}
/*
+ * __wt_split_reverse --
+ * We have a locked ref that is empty and we want to rewrite the index in
+ * its parent.
+ */
+int
+__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_DECL_RET;
+ WT_PAGE *parent;
+ bool hazard;
+
+ WT_RET(__split_parent_lock(session, ref, &parent, &hazard));
+ ret = __split_parent(session, ref, NULL, 0, 0, 0);
+ WT_TRET(__split_parent_unlock(session, parent, hazard));
+ return (ret);
+}
+
+/*
* __wt_split_rewrite --
* Rewrite an in-memory page with a new version.
*/
diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c
index 6e1d182ed0b..8e0f4036b79 100644
--- a/src/third_party/wiredtiger/src/btree/bt_walk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_walk.c
@@ -94,6 +94,9 @@ __wt_tree_walk(WT_SESSION_IMPL *session,
*/
WT_ENTER_PAGE_INDEX(session);
+ /* Walk should never instantiate deleted pages. */
+ LF_SET(WT_READ_NO_EMPTY);
+
/*
* !!!
* Fast-truncate currently only works on row-store trees.
@@ -174,9 +177,10 @@ ascend: /*
/*
* If we got all the way through an internal page and
- * all of the child pages were deleted, evict it.
+ * all of the child pages were deleted, mark it for
+ * eviction.
*/
- if (empty_internal) {
+ if (empty_internal && pindex->entries > 1) {
__wt_page_evict_soon(ref->page);
empty_internal = false;
}
@@ -257,7 +261,7 @@ ascend: /*
* to delete it again.
*/
if (ref->state == WT_REF_DELETED &&
- __wt_delete_page_skip(session, ref))
+ __wt_delete_page_skip(session, ref, false))
break;
/*
* If deleting a range, try to delete the page
@@ -294,7 +298,7 @@ ascend: /*
* Try to skip deleted pages visible to us.
*/
if (ref->state == WT_REF_DELETED &&
- __wt_delete_page_skip(session, ref))
+ __wt_delete_page_skip(session, ref, false))
break;
}
@@ -302,7 +306,7 @@ ascend: /*
/*
* Not-found is an expected return when only walking
- * in-cache pages.
+ * in-cache pages, or if we see a deleted page.
*/
if (ret == WT_NOTFOUND) {
ret = 0;
diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c
index c5e04806062..ed0ffb5b262 100644
--- a/src/third_party/wiredtiger/src/evict/evict_file.c
+++ b/src/third_party/wiredtiger/src/evict/evict_file.c
@@ -81,7 +81,8 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
case WT_SYNC_DISCARD:
WT_ASSERT(session,
__wt_page_can_evict(session, page, 0, NULL));
- __wt_evict_page_clean_update(session, ref);
+ WT_ERR(
+ __wt_evict_page_clean_update(session, ref, true));
break;
case WT_SYNC_DISCARD_FORCE:
/*
@@ -97,8 +98,9 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
}
F_SET(session, WT_SESSION_DISCARD_FORCE);
- __wt_evict_page_clean_update(session, ref);
+ ret = __wt_evict_page_clean_update(session, ref, true);
F_CLR(session, WT_SESSION_DISCARD_FORCE);
+ WT_ERR(ret);
break;
WT_ILLEGAL_VALUE_ERR(session);
}
diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c
index f4693511e11..9de66922931 100644
--- a/src/third_party/wiredtiger/src/evict/evict_page.c
+++ b/src/third_party/wiredtiger/src/evict/evict_page.c
@@ -110,7 +110,8 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
if (__wt_ref_is_root(ref))
__wt_ref_out(session, ref);
else
- __wt_evict_page_clean_update(session, ref);
+ WT_ERR(__wt_evict_page_clean_update(
+ session, ref, closing));
WT_STAT_FAST_CONN_INCR(session, cache_eviction_clean);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_clean);
@@ -141,22 +142,75 @@ done: if ((inmem_split || (forced_eviction && ret == EBUSY)) &&
return (ret);
}
+/*
+ * __evict_delete_ref --
+ * Mark a page reference deleted and check if the parent can reverse
+ * split.
+ */
+static int
+__evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
+{
+ WT_DECL_RET;
+ WT_PAGE *parent;
+ WT_PAGE_INDEX *pindex;
+ uint32_t ndeleted;
+
+ if (__wt_ref_is_root(ref))
+ return (0);
+
+ /*
+ * Avoid doing reverse splits when closing the file, it is
+ * wasted work and some structure may already have been freed.
+ */
+ if (!closing) {
+ parent = ref->home;
+ WT_INTL_INDEX_GET(session, parent, pindex);
+ ndeleted = __wt_atomic_addv32(&pindex->deleted_entries, 1);
+
+ /*
+ * If more than 10% of the parent references are deleted, try a
+ * reverse split. Don't bother if there is a single deleted
+ * reference: the internal page is empty and we have to wait
+ * for eviction to notice.
+ *
+ * This will consume the deleted ref (and eventually free it).
+ * If the reverse split can't get the access it needs because
+ * something is busy, be sure that the page still ends up
+ * marked deleted.
+ */
+ if (ndeleted > pindex->entries / 10 && pindex->entries > 1 &&
+ (ret = __wt_split_reverse(session, ref)) != EBUSY)
+ return (ret);
+ }
+
+ WT_PUBLISH(ref->state, WT_REF_DELETED);
+ return (0);
+}
/*
* __wt_evict_page_clean_update --
* Update a clean page's reference on eviction.
*/
-void
-__wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref)
+int
+__wt_evict_page_clean_update(
+ WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
{
+ WT_DECL_RET;
+
/*
* Discard the page and update the reference structure; if the page has
* an address, it's a disk page; if it has no address, it's a deleted
* page re-instantiated (for example, by searching) and never written.
*/
__wt_ref_out(session, ref);
- WT_PUBLISH(ref->state,
- ref->addr == NULL ? WT_REF_DELETED : WT_REF_DISK);
+ if (ref->addr == NULL) {
+ WT_WITH_PAGE_INDEX(session,
+ ret = __evict_delete_ref(session, ref, closing));
+ WT_RET_BUSY_OK(ret);
+ } else
+ WT_PUBLISH(ref->state, WT_REF_DISK);
+
+ return (0);
}
/*
@@ -167,6 +221,7 @@ static int
__evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
{
WT_ADDR *addr;
+ WT_DECL_RET;
WT_PAGE *parent;
WT_PAGE_MODIFY *mod;
@@ -194,7 +249,9 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
*/
__wt_ref_out(session, ref);
ref->addr = NULL;
- WT_PUBLISH(ref->state, WT_REF_DELETED);
+ WT_WITH_PAGE_INDEX(session,
+ ret = __evict_delete_ref(session, ref, closing));
+ WT_RET_BUSY_OK(ret);
break;
case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
/*
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index c5d29bc8106..fb497f64963 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -408,6 +408,7 @@ struct __wt_page {
struct __wt_page_index {
uint32_t entries;
+ uint32_t deleted_entries;
WT_REF **index;
} * volatile __index; /* Collated children */
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
index edddcdd6fe4..1c416c99e13 100644
--- a/src/third_party/wiredtiger/src/include/btree.i
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -1003,7 +1003,7 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
if (count > WT_MIN_SPLIT_COUNT &&
size > (size_t)btree->maxleafpage)
return (true);
- }
+ }
return (false);
}
@@ -1208,13 +1208,9 @@ __wt_page_swap_func(WT_SESSION_IMPL *session, WT_REF *held,
#endif
);
- /* An expected failure: WT_NOTFOUND when doing a cache-only read. */
- if (LF_ISSET(WT_READ_CACHE) && ret == WT_NOTFOUND)
- return (WT_NOTFOUND);
-
- /* An expected failure: WT_RESTART */
- if (ret == WT_RESTART)
- return (WT_RESTART);
+ /* Expected failures: page not found or restart. */
+ if (ret == WT_NOTFOUND || ret == WT_RESTART)
+ return (ret);
/* Discard the original held page. */
acquired = ret == 0;
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 4b341a6adaa..845102ca428 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -114,7 +114,7 @@ extern int __wt_debug_tree(WT_SESSION_IMPL *session, WT_PAGE *page, const char *
extern int __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp);
extern void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref);
-extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref);
+extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all);
extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref);
extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref);
extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep);
@@ -153,6 +153,7 @@ extern void __wt_split_stash_discard(WT_SESSION_IMPL *session);
extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session);
extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp);
extern int __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref);
+extern int __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing);
extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst);
@@ -315,7 +316,7 @@ extern int __wt_evict_lru_page(WT_SESSION_IMPL *session, bool is_server);
extern int __wt_cache_wait(WT_SESSION_IMPL *session, int full);
extern void __wt_cache_dump(WT_SESSION_IMPL *session);
extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing);
-extern void __wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref);
+extern int __wt_evict_page_clean_update( WT_SESSION_IMPL *session, WT_REF *ref, bool closing);
extern int __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn);
extern int __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn);
extern int __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, bool *recp);
diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h
index 71fc54f9eac..aad44c22184 100644
--- a/src/third_party/wiredtiger/src/include/flags.h
+++ b/src/third_party/wiredtiger/src/include/flags.h
@@ -32,13 +32,14 @@
#define WT_LOG_FSYNC 0x00000004
#define WT_READ_CACHE 0x00000001
#define WT_READ_COMPACT 0x00000002
-#define WT_READ_NO_EVICT 0x00000004
-#define WT_READ_NO_GEN 0x00000008
-#define WT_READ_NO_WAIT 0x00000010
-#define WT_READ_PREV 0x00000020
-#define WT_READ_SKIP_INTL 0x00000040
-#define WT_READ_TRUNCATE 0x00000080
-#define WT_READ_WONT_NEED 0x00000100
+#define WT_READ_NO_EMPTY 0x00000004
+#define WT_READ_NO_EVICT 0x00000008
+#define WT_READ_NO_GEN 0x00000010
+#define WT_READ_NO_WAIT 0x00000020
+#define WT_READ_PREV 0x00000040
+#define WT_READ_SKIP_INTL 0x00000080
+#define WT_READ_TRUNCATE 0x00000100
+#define WT_READ_WONT_NEED 0x00000200
#define WT_SESSION_CAN_WAIT 0x00000001
#define WT_SESSION_CLEAR_EVICT_WALK 0x00000002
#define WT_SESSION_DISCARD_FORCE 0x00000004
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
index 3152ff6bdd5..73d7f1f0518 100644
--- a/src/third_party/wiredtiger/src/include/txn.i
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -187,6 +187,13 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
session->dhandle == session->meta_dhandle)
return (true);
+ /*
+ * If we don't have a transactional snapshot, only make stable updates
+ * visible.
+ */
+ if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))
+ return (__wt_txn_visible_all(session, id));
+
/* Transactions see their own changes. */
if (id == txn->id)
return (true);
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 988b7e0a84f..73b7f4968e9 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -653,6 +653,7 @@ __rec_write_init(WT_SESSION_IMPL *session,
r->flags = flags;
/* Track if the page can be marked clean. */
+ r->max_txn = WT_TXN_NONE;
r->leave_dirty = false;
/* Raw compression. */