summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Alex Gorrod <alexander.gorrod@mongodb.com> 2015-10-29 16:47:19 +1100
committer: Alex Gorrod <alexg@wiredtiger.com> 2015-11-24 05:05:46 +0000
commit: c819d2f9d34d8d701e986da4ea628c08239f8626 (patch)
tree: dd1eb79778690a42be6d05483794f4030d1a5345
parent: 00dfebc9b099a80c0ce8bbe69ef97168eda23bfd (diff)
download: mongo-c819d2f9d34d8d701e986da4ea628c08239f8626.tar.gz
Merge pull request #2271 from wiredtiger/reverse-split-fix
SERVER-21027 Fix reverse splits to keep the original child ref locked (cherry picked from commit f4d20a3)
-rw-r--r-- src/btree/bt_split.c 25
-rw-r--r-- src/evict/evict_file.c 15
-rw-r--r-- src/evict/evict_page.c 52
-rw-r--r-- src/include/extern.h 2
4 files changed, 61 insertions, 33 deletions
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 07c266a07f9..82a4dac226f 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -943,8 +943,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
* reading thread will restart. Include the ref we are splitting in
* the count to be deleted.
*/
- deleted_entries = ref_new != NULL ? 1 : 0;
- for (i = 0; i < parent_entries; ++i) {
+ for (deleted_entries = 1, i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
if (next_ref->state == WT_REF_DELETED &&
@@ -966,7 +965,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
*/
if (result_entries == 0) {
next_ref = pindex->index[0];
- WT_ASSERT(session, next_ref->state == WT_REF_SPLIT);
+ WT_ASSERT(session, next_ref->state == WT_REF_SPLIT ||
+ (next_ref == ref && ref->state == WT_REF_LOCKED));
next_ref->state = WT_REF_DELETED;
--deleted_entries;
result_entries = 1;
@@ -1051,9 +1051,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
"%s split into parent %" PRIu32 " -> %" PRIu32
- " (%" PRIu32 ")",
- __wt_page_type_string(ref->page->type), parent_entries,
- result_entries, result_entries - parent_entries));
+ " (%" PRIu32 ")", ref->page == NULL ?
+ "reverse" : __wt_page_type_string(ref->page->type),
+ parent_entries, result_entries, result_entries - parent_entries));
/*
* The new page index is in place, free the WT_REF we were splitting
@@ -1454,23 +1454,18 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
/*
* __wt_split_reverse --
- * Lock, then reverse split an internal page (remove deleted refs).
+ * We have a locked ref that is empty and we want to rewrite the index in
+ * its parent.
*/
int
__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
{
WT_DECL_RET;
WT_PAGE *parent;
- WT_REF dummy_child;
bool hazard;
- WT_CLEAR(dummy_child);
- dummy_child.home = dummy_child.page = ref->page;
- dummy_child.state = WT_REF_MEM;
-
- WT_RET(__split_parent_lock(session, &dummy_child, &parent, &hazard));
- WT_ASSERT(session, parent == ref->page);
- ret = __split_parent(session, &dummy_child, NULL, 0, 0, 0);
+ WT_RET(__split_parent_lock(session, ref, &parent, &hazard));
+ ret = __split_parent(session, ref, NULL, 0, 0, 0);
WT_TRET(__split_parent_unlock(session, parent, hazard));
return (ret);
}
diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c
index bcc5b86ecc2..4cf3840ba8e 100644
--- a/src/evict/evict_file.c
+++ b/src/evict/evict_file.c
@@ -18,8 +18,11 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
WT_DECL_RET;
WT_PAGE *page;
WT_REF *next_ref, *ref;
+ WT_TXN *txn;
bool evict_reset;
+ txn = &session->txn;
+
/*
* We need exclusive access to the file -- disable ordinary eviction
* and drain any blocks already queued.
@@ -29,6 +32,9 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
/* Make sure the oldest transaction ID is up-to-date. */
__wt_txn_update_oldest(session, true);
+ if (txn->isolation == WT_ISO_READ_COMMITTED)
+ __wt_txn_get_snapshot(session);
+
/* Walk the tree, discarding pages. */
next_ref = NULL;
WT_ERR(__wt_tree_walk(session, &next_ref, NULL,
@@ -59,6 +65,10 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page))
WT_ERR(__wt_reconcile(session, ref, NULL, WT_EVICTING));
+ /* Update our snapshot for each new page. */
+ if (txn->isolation == WT_ISO_READ_COMMITTED)
+ __wt_txn_get_snapshot(session);
+
/*
* We can't evict the page just returned to us (it marks our
* place in the tree), so move the walk to one page ahead of
@@ -81,7 +91,8 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
case WT_SYNC_DISCARD:
WT_ASSERT(session,
__wt_page_can_evict(session, page, 0, NULL));
- WT_ERR(__wt_evict_page_clean_update(session, ref));
+ WT_ERR(
+ __wt_evict_page_clean_update(session, ref, true));
break;
case WT_SYNC_DISCARD_FORCE:
/*
@@ -97,7 +108,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
}
F_SET(session, WT_SESSION_DISCARD_FORCE);
- ret = __wt_evict_page_clean_update(session, ref);
+ ret = __wt_evict_page_clean_update(session, ref, true);
F_CLR(session, WT_SESSION_DISCARD_FORCE);
WT_ERR(ret);
break;
diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c
index 641122b119f..9de66922931 100644
--- a/src/evict/evict_page.c
+++ b/src/evict/evict_page.c
@@ -110,7 +110,8 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
if (__wt_ref_is_root(ref))
__wt_ref_out(session, ref);
else
- WT_ERR(__wt_evict_page_clean_update(session, ref));
+ WT_ERR(__wt_evict_page_clean_update(
+ session, ref, closing));
WT_STAT_FAST_CONN_INCR(session, cache_eviction_clean);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_clean);
@@ -142,25 +143,47 @@ done: if ((inmem_split || (forced_eviction && ret == EBUSY)) &&
return (ret);
}
/*
- * __evict_reverse_split_check --
- * Check if an internal page needs a reverse split.
+ * __evict_delete_ref --
+ * Mark a page reference deleted and check if the parent can reverse
+ * split.
*/
static int
-__evict_reverse_split_check(WT_SESSION_IMPL *session, WT_REF *ref)
+__evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
{
+ WT_DECL_RET;
WT_PAGE *parent;
WT_PAGE_INDEX *pindex;
- uint32_t deleted_entries;
+ uint32_t ndeleted;
if (__wt_ref_is_root(ref))
return (0);
- parent = ref->home;
- WT_INTL_INDEX_GET(session, parent, pindex);
- deleted_entries = __wt_atomic_addv32(&pindex->deleted_entries, 1);
- if (deleted_entries > pindex->entries / 10)
- WT_RET(__wt_split_reverse(session, parent->pg_intl_parent_ref));
+ /*
+ * Avoid doing reverse splits when closing the file, it is
+ * wasted work and some structure may already have been freed.
+ */
+ if (!closing) {
+ parent = ref->home;
+ WT_INTL_INDEX_GET(session, parent, pindex);
+ ndeleted = __wt_atomic_addv32(&pindex->deleted_entries, 1);
+
+ /*
+ * If more than 10% of the parent references are deleted, try a
+ * reverse split. Don't bother if there is a single deleted
+ * reference: the internal page is empty and we have to wait
+ * for eviction to notice.
+ *
+ * This will consume the deleted ref (and eventually free it).
+ * If the reverse split can't get the access it needs because
+ * something is busy, be sure that the page still ends up
+ * marked deleted.
+ */
+ if (ndeleted > pindex->entries / 10 && pindex->entries > 1 &&
+ (ret = __wt_split_reverse(session, ref)) != EBUSY)
+ return (ret);
+ }
+ WT_PUBLISH(ref->state, WT_REF_DELETED);
return (0);
}
@@ -169,7 +192,8 @@ __evict_reverse_split_check(WT_SESSION_IMPL *session, WT_REF *ref)
* Update a clean page's reference on eviction.
*/
int
-__wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref)
+__wt_evict_page_clean_update(
+ WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
{
WT_DECL_RET;
@@ -180,9 +204,8 @@ __wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref)
*/
__wt_ref_out(session, ref);
if (ref->addr == NULL) {
- WT_PUBLISH(ref->state, WT_REF_DELETED);
WT_WITH_PAGE_INDEX(session,
- ret = __evict_reverse_split_check(session, ref));
+ ret = __evict_delete_ref(session, ref, closing));
WT_RET_BUSY_OK(ret);
} else
WT_PUBLISH(ref->state, WT_REF_DISK);
@@ -226,9 +249,8 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
*/
__wt_ref_out(session, ref);
ref->addr = NULL;
- WT_PUBLISH(ref->state, WT_REF_DELETED);
WT_WITH_PAGE_INDEX(session,
- ret = __evict_reverse_split_check(session, ref));
+ ret = __evict_delete_ref(session, ref, closing));
WT_RET_BUSY_OK(ret);
break;
case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
diff --git a/src/include/extern.h b/src/include/extern.h
index 44e5658bf28..845102ca428 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -316,7 +316,7 @@ extern int __wt_evict_lru_page(WT_SESSION_IMPL *session, bool is_server);
extern int __wt_cache_wait(WT_SESSION_IMPL *session, int full);
extern void __wt_cache_dump(WT_SESSION_IMPL *session);
extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing);
-extern int __wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref);
+extern int __wt_evict_page_clean_update( WT_SESSION_IMPL *session, WT_REF *ref, bool closing);
extern int __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn);
extern int __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn);
extern int __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, bool *recp);