diff options
author | Michael Cahill <michael.cahill@wiredtiger.com> | 2012-03-15 17:53:08 +1100 |
---|---|---|
committer | Michael Cahill <michael.cahill@wiredtiger.com> | 2012-03-15 17:53:08 +1100 |
commit | 96a08a34b3ded2fbb58f2670ba17212631bd6e8b (patch) | |
tree | 3e5600b4c6b8f178a4f510ebabd0c89f87538031 | |
parent | 5a16cf8bc9aaa50a67a564138f359ac9356d7f57 (diff) | |
download | mongo-96a08a34b3ded2fbb58f2670ba17212631bd6e8b.tar.gz |
Fixes for bugs found by running test/format multithreaded.
There were two basic problems:
(1) When iterating backwards through a skiplist, we could race with an insert.
In that case, we need to start again from the beginning to update the previous
stack properly.
(2) If eviction fails for a page, we have to assume that eviction has unlocked
the reference: otherwise another thread could have already scheduled it for
eviction again and put it into state WT_REF_EVICTING.
-rw-r--r-- | src/btree/bt_curprev.c | 18 | ||||
-rw-r--r-- | src/btree/bt_evict.c | 29 | ||||
-rw-r--r-- | src/btree/bt_read.c | 2 |
3 files changed, 24 insertions, 25 deletions
diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c index 0481f54e1bc..39d0aa8aed3 100644 --- a/src/btree/bt_curprev.c +++ b/src/btree/bt_curprev.c @@ -16,7 +16,7 @@ * search item's next array). * * Helper macros to go from a stack pointer at level i, pointing into a next - * array, to insert node containing that next array. + * array, back to the insert node containing that next array. */ #undef PREV_ITEM #define PREV_ITEM(ins_head, insp, i) \ @@ -41,11 +41,12 @@ __cursor_skip_prev(WT_CURSOR_BTREE *cbt) session = (WT_SESSION_IMPL *)cbt->iface.session; +restart: /* * If the search stack does not point at the current item, fill it in * with a search. */ - if ((current = cbt->ins) != PREV_INS(cbt, 0)) { + while ((current = cbt->ins) != PREV_INS(cbt, 0)) { if (cbt->btree->type == BTREE_ROW) { key.data = WT_INSERT_KEY(current); key.size = WT_INSERT_KEY_SIZE(current); @@ -54,10 +55,6 @@ __cursor_skip_prev(WT_CURSOR_BTREE *cbt) } else cbt->ins = __col_insert_search(cbt->ins_head, cbt->ins_stack, WT_INSERT_RECNO(current)); - - /* Check that we found the expected item. */ - WT_ASSERT(session, cbt->ins == current); - WT_ASSERT(session, PREV_INS(cbt, 0) == current); } /* @@ -93,7 +90,14 @@ __cursor_skip_prev(WT_CURSOR_BTREE *cbt) /* Walk any remaining levels until just before the current node. */ while (i >= 0) { - WT_ASSERT(session, ins != NULL); + /* + * If we get to the end of a list without finding the current + * item, we must have raced with an insert. Restart the search. + */ + if (ins == NULL) { + cbt->ins_stack[0] = NULL; + goto restart; + } if (ins->next[i] != current) /* Stay at this level */ ins = ins->next[i]; else { /* Drop down a level */ diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c index 1879bbb1a98..fe2bd27da2f 100644 --- a/src/btree/bt_evict.c +++ b/src/btree/bt_evict.c @@ -198,6 +198,7 @@ __wt_evict_page_request(WT_SESSION_IMPL *session, WT_PAGE *page) * thread will see this later. */ WT_VERBOSE(session, evictserver, "eviction server request table full"); + page->ref->state = WT_REF_MEM; return (WT_RESTART); } @@ -369,16 +370,13 @@ __evict_request_walk(WT_SESSION_IMPL *session) __wt_yield(); /* - * If eviction fails, free up the page and hope it + * If eviction fails, it will free up the page: hope it * works next time. Application threads may be holding * a reference while trying to get another (e.g., if * they have two cursors open), so blocking * indefinitely leads to deadlock. */ - if ((ret = __wt_rec_evict(session, er->page, 0)) != 0) { - WT_ASSERT(session, ref->page == er->page); - ref->state = WT_REF_MEM; - } + ret = __wt_rec_evict(session, er->page, 0); } else { /* * If we're about to do a walk of the file tree (and @@ -736,6 +734,8 @@ __wt_evict_lru_page(WT_SESSION_IMPL *session) if (page == NULL) return (WT_NOTFOUND); + WT_ASSERT(session, page->ref->state == WT_REF_EVICTING); + /* Reference the correct WT_BTREE handle. */ saved_btree = session->btree; WT_SET_BTREE_IN_SESSION(session, btree); @@ -744,19 +744,14 @@ __wt_evict_lru_page(WT_SESSION_IMPL *session) * We don't care why eviction failed (maybe the page was dirty and we're * out of disk space, or the page had an in-memory subtree already being * evicted). Regardless, don't pick the same page every time. + * + * We used to bump the page's read_gen only if eviction failed, but + * that isn't safe: at that point, eviction has already unlocked the + * page and some other thread may have evicted it by the time we look + * at it. */ - if (__wt_rec_evict(session, page, 0) != 0) { - page->read_gen = __wt_cache_read_gen(session); - - /* - * If the evicting state of the page was not cleared, clear it - * now to make the page available again. - */ - if (page->ref->state == WT_REF_EVICTING) { - WT_ASSERT(session, page->ref->page == page); - page->ref->state = WT_REF_MEM; - } - } + page->read_gen = __wt_cache_read_gen(session); + (void)__wt_rec_evict(session, page, 0); WT_ATOMIC_ADD(btree->lru_count, -1); diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index 4d9017ca0cd..a0057e733d4 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -47,7 +47,7 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *ref) WT_ASSERT(session, page != NULL); ref->page = page; - ref->state = WT_REF_MEM; + WT_PUBLISH(ref->state, WT_REF_MEM); return (0); err: ref->state = WT_REF_DISK; |