summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Cahill <michael.cahill@wiredtiger.com>2012-03-15 17:53:08 +1100
committerMichael Cahill <michael.cahill@wiredtiger.com>2012-03-15 17:53:08 +1100
commit96a08a34b3ded2fbb58f2670ba17212631bd6e8b (patch)
tree3e5600b4c6b8f178a4f510ebabd0c89f87538031
parent5a16cf8bc9aaa50a67a564138f359ac9356d7f57 (diff)
downloadmongo-96a08a34b3ded2fbb58f2670ba17212631bd6e8b.tar.gz
Fixes for bugs found by running test/format multithreaded.
There were two basic problems: (1) When iterating backwards through a skiplist, we could race with an insert. In that case, we need to start again from the beginning to update the previous stack properly. (2) If eviction fails for a page, we have to assume that eviction has unlocked the reference: otherwise another thread could have already scheduled it for eviction again and put it into state WT_REF_EVICTING.
-rw-r--r--src/btree/bt_curprev.c18
-rw-r--r--src/btree/bt_evict.c29
-rw-r--r--src/btree/bt_read.c2
3 files changed, 24 insertions, 25 deletions
diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c
index 0481f54e1bc..39d0aa8aed3 100644
--- a/src/btree/bt_curprev.c
+++ b/src/btree/bt_curprev.c
@@ -16,7 +16,7 @@
* search item's next array).
*
* Helper macros to go from a stack pointer at level i, pointing into a next
- * array, to insert node containing that next array.
+ * array, back to the insert node containing that next array.
*/
#undef PREV_ITEM
#define PREV_ITEM(ins_head, insp, i) \
@@ -41,11 +41,12 @@ __cursor_skip_prev(WT_CURSOR_BTREE *cbt)
session = (WT_SESSION_IMPL *)cbt->iface.session;
+restart:
/*
* If the search stack does not point at the current item, fill it in
* with a search.
*/
- if ((current = cbt->ins) != PREV_INS(cbt, 0)) {
+ while ((current = cbt->ins) != PREV_INS(cbt, 0)) {
if (cbt->btree->type == BTREE_ROW) {
key.data = WT_INSERT_KEY(current);
key.size = WT_INSERT_KEY_SIZE(current);
@@ -54,10 +55,6 @@ __cursor_skip_prev(WT_CURSOR_BTREE *cbt)
} else
cbt->ins = __col_insert_search(cbt->ins_head,
cbt->ins_stack, WT_INSERT_RECNO(current));
-
- /* Check that we found the expected item. */
- WT_ASSERT(session, cbt->ins == current);
- WT_ASSERT(session, PREV_INS(cbt, 0) == current);
}
/*
@@ -93,7 +90,14 @@ __cursor_skip_prev(WT_CURSOR_BTREE *cbt)
/* Walk any remaining levels until just before the current node. */
while (i >= 0) {
- WT_ASSERT(session, ins != NULL);
+ /*
+ * If we get to the end of a list without finding the current
+ * item, we must have raced with an insert. Restart the search.
+ */
+ if (ins == NULL) {
+ cbt->ins_stack[0] = NULL;
+ goto restart;
+ }
if (ins->next[i] != current) /* Stay at this level */
ins = ins->next[i];
else { /* Drop down a level */
diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c
index 1879bbb1a98..fe2bd27da2f 100644
--- a/src/btree/bt_evict.c
+++ b/src/btree/bt_evict.c
@@ -198,6 +198,7 @@ __wt_evict_page_request(WT_SESSION_IMPL *session, WT_PAGE *page)
* thread will see this later.
*/
WT_VERBOSE(session, evictserver, "eviction server request table full");
+ page->ref->state = WT_REF_MEM;
return (WT_RESTART);
}
@@ -369,16 +370,13 @@ __evict_request_walk(WT_SESSION_IMPL *session)
__wt_yield();
/*
- * If eviction fails, free up the page and hope it
+ * If eviction fails, it will free up the page: hope it
* works next time. Application threads may be holding
* a reference while trying to get another (e.g., if
* they have two cursors open), so blocking
* indefinitely leads to deadlock.
*/
- if ((ret = __wt_rec_evict(session, er->page, 0)) != 0) {
- WT_ASSERT(session, ref->page == er->page);
- ref->state = WT_REF_MEM;
- }
+ ret = __wt_rec_evict(session, er->page, 0);
} else {
/*
* If we're about to do a walk of the file tree (and
@@ -736,6 +734,8 @@ __wt_evict_lru_page(WT_SESSION_IMPL *session)
if (page == NULL)
return (WT_NOTFOUND);
+ WT_ASSERT(session, page->ref->state == WT_REF_EVICTING);
+
/* Reference the correct WT_BTREE handle. */
saved_btree = session->btree;
WT_SET_BTREE_IN_SESSION(session, btree);
@@ -744,19 +744,14 @@ __wt_evict_lru_page(WT_SESSION_IMPL *session)
* We don't care why eviction failed (maybe the page was dirty and we're
* out of disk space, or the page had an in-memory subtree already being
* evicted). Regardless, don't pick the same page every time.
+ *
+ * We used to bump the page's read_gen only if eviction failed, but
+ * that isn't safe: at that point, eviction has already unlocked the
+ * page and some other thread may have evicted it by the time we look
+ * at it.
*/
- if (__wt_rec_evict(session, page, 0) != 0) {
- page->read_gen = __wt_cache_read_gen(session);
-
- /*
- * If the evicting state of the page was not cleared, clear it
- * now to make the page available again.
- */
- if (page->ref->state == WT_REF_EVICTING) {
- WT_ASSERT(session, page->ref->page == page);
- page->ref->state = WT_REF_MEM;
- }
- }
+ page->read_gen = __wt_cache_read_gen(session);
+ (void)__wt_rec_evict(session, page, 0);
WT_ATOMIC_ADD(btree->lru_count, -1);
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index 4d9017ca0cd..a0057e733d4 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -47,7 +47,7 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *ref)
WT_ASSERT(session, page != NULL);
ref->page = page;
- ref->state = WT_REF_MEM;
+ WT_PUBLISH(ref->state, WT_REF_MEM);
return (0);
err: ref->state = WT_REF_DISK;