diff options
author | Michael Cahill <michael.cahill@wiredtiger.com> | 2012-10-10 18:06:36 +1100 |
---|---|---|
committer | Michael Cahill <michael.cahill@wiredtiger.com> | 2012-10-10 18:06:36 +1100 |
commit | 633f04e2ff30450f1023663fe22c6306b928cf71 (patch) | |
tree | 1905e0db4fae7f5c399d93f0eeaf587222dfa17e | |
parent | 3b8639828b0be37ae590afee9025d0d6533e06b0 (diff) | |
parent | 324b94333905f222df56a2412dcae9ca0820d731 (diff) | |
download | mongo-633f04e2ff30450f1023663fe22c6306b928cf71.tar.gz |
Merge branch 'develop' into bloom-bulk-fastpath
-rw-r--r-- | src/btree/bt_evict.c | 194 | ||||
-rw-r--r-- | src/btree/bt_page.c | 41 | ||||
-rw-r--r-- | src/btree/row_modify.c | 18 | ||||
-rw-r--r-- | src/include/cache.h | 1 | ||||
-rw-r--r-- | src/include/cache.i | 56 | ||||
-rw-r--r-- | src/include/cursor.i | 11 | ||||
-rw-r--r-- | src/include/extern.h | 1 | ||||
-rw-r--r-- | src/lsm/lsm_cursor.c | 4 | ||||
-rw-r--r-- | src/lsm/lsm_merge.c | 4 | ||||
-rw-r--r-- | src/support/hazard.c | 17 |
10 files changed, 57 insertions, 290 deletions
diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c index 064e1b096f6..2362c5687b3 100644 --- a/src/btree/bt_evict.c +++ b/src/btree/bt_evict.c @@ -12,7 +12,6 @@ static int __evict_file_request(WT_SESSION_IMPL *, int); static int __evict_file_request_walk(WT_SESSION_IMPL *); static int __evict_lru(WT_SESSION_IMPL *); static int __evict_lru_cmp(const void *, const void *); -static int __evict_page_request_walk(WT_SESSION_IMPL *); static int __evict_walk(WT_SESSION_IMPL *); static int __evict_walk_file(WT_SESSION_IMPL *, u_int *); static int __evict_worker(WT_SESSION_IMPL *); @@ -104,36 +103,6 @@ __wt_evict_list_clr_page(WT_SESSION_IMPL *session, WT_PAGE *page) } /* - * __evict_req_set -- - * Set an entry in the forced page eviction request list. - */ -static inline void -__evict_req_set(WT_EVICT_ENTRY *r, WT_BTREE *btree, WT_PAGE *page) -{ - r->btree = btree; - /* - * Publish: there must be a barrier to ensure the structure fields are - * set before the eviction thread can see the request. - */ - WT_PUBLISH(r->page, page); -} - -/* - * __evict_req_clr -- - * Clear an entry in the forced page eviction request list. - */ -static inline void -__evict_req_clr(WT_EVICT_ENTRY *r) -{ - r->btree = NULL; - r->page = NULL; - /* - * No publication necessary, all we care about is the page value and - * whenever it's cleared is fine. - */ -} - -/* * __wt_evict_server_wake -- * Wake the eviction server thread. */ @@ -186,74 +155,6 @@ __wt_sync_file_serial_func(WT_SESSION_IMPL *session, void *args) } /* - * __wt_evict_page_request -- - * Schedule a page for forced eviction due to a high volume of inserts or - * updates. - */ -void -__wt_evict_page_request(WT_SESSION_IMPL *session, WT_PAGE *page) -{ - WT_CACHE *cache; - WT_EVICT_ENTRY *er, *er_end; - int set; - - cache = S2C(session)->cache; - - /* Do a cheap test before acquiring the lock. */ - if (page->ref->state != WT_REF_MEM) - return; - - __wt_spin_lock(session, &cache->evict_lock); - - /* - * Application threads request forced eviction of pages when they - * become too big. The application thread must hold a hazard reference - * when this function is called, which protects it from being freed. - * - * However, it is possible (but unlikely) that the page is already part - * way through the process of being evicted: a thread may have selected - * it from the LRU list but not yet checked its hazard references. - * - * To avoid that race, we try to atomically switch the page state to - * WT_REF_EVICT_FORCE. Since only one thread can do that successfully, - * this prevents a page from being evicted twice. Threads looking for - * a page to evict on the ordinary LRU eviction queue will ignore this - * page and it will be evicted by the main eviction thread. - * - * If the state is not WT_REF_MEM, some other thread is already - * evicting this page, which is fine, and in that case we don't want to - * put it on the request queue because the memory may be freed by the - * time the eviction thread sees it. - */ - if (!WT_ATOMIC_CAS(page->ref->state, WT_REF_MEM, WT_REF_EVICT_FORCE)) { - __wt_spin_unlock(session, &cache->evict_lock); - return; - } - - set = 0; - - /* Find an empty slot and enter the eviction request. */ - WT_EVICT_REQ_FOREACH(er, er_end, cache) - if (er->page == NULL) { - __evict_req_set(er, session->btree, page); - set = 1; - break; - } - - if (!set) { - /* - * The request table is full, that's okay for page requests: - * another thread will see this later. - */ - WT_VERBOSE_VOID(session, evictserver, - "page eviction request table is full"); - page->ref->state = WT_REF_MEM; - } - - __wt_spin_unlock(session, &cache->evict_lock); -} - -/* * __wt_cache_evict_server -- * Thread to evict pages from the cache. */ @@ -334,13 +235,6 @@ __evict_worker(WT_SESSION_IMPL *session) */ __wt_spin_lock(session, &cache->evict_lock); - /* - * Walk the eviction-request queue. It is important to do this - * before closing files, in case a page schedule for eviction - * is freed by closing a file. - */ - WT_RET(__evict_page_request_walk(session)); - /* If there is a file sync request, satisfy it. */ while (cache->sync_complete != cache->sync_request) WT_RET(__evict_file_request_walk(session)); @@ -441,7 +335,8 @@ __evict_page(WT_SESSION_IMPL *session, WT_PAGE *page) WT_ASSERT(session, txn->snapshot == NULL || txn->snapshot != saved_txn.snapshot); __wt_txn_destroy(session); - } + } else + __wt_txn_release_snapshot(session); *txn = saved_txn; return (ret); @@ -606,79 +501,6 @@ err: if (next_page != NULL) } /* - * __evict_page_request_walk -- - * Walk the forced page eviction request queue. - */ -static int -__evict_page_request_walk(WT_SESSION_IMPL *session) -{ - WT_CACHE *cache; - WT_EVICT_ENTRY *er, *er_end; - WT_PAGE *page; - WT_REF *ref; - - cache = S2C(session)->cache; - - /* - * Walk the forced page eviction request queue: if we find a request, - * perform it and clear the request slot. - */ - WT_EVICT_REQ_FOREACH(er, er_end, cache) { - if ((page = er->page) == NULL) - continue; - - /* Reference the correct WT_BTREE handle. */ - WT_SET_BTREE_IN_SESSION(session, er->btree); - - WT_VERBOSE_RET(session, evictserver, - "forcing eviction of page %p", page); - - /* - * The eviction candidate list might reference pages we are - * about to discard; clear it. - */ - __evict_list_clr_all(session, 0); - - /* - * The eviction candidate might be part of the current tree's - * walk; clear it. - */ - __evict_clear_tree_walk(session, NULL); - - /* - * Wait for LRU eviction activity to drain. It is much easier - * to reason about sync or forced eviction if we know there are - * no other threads evicting in the tree. - */ - while (session->btree->lru_count > 0) { - __wt_spin_unlock(session, &cache->evict_lock); - __wt_yield(); - __wt_spin_lock(session, &cache->evict_lock); - } - - ref = page->ref; - WT_ASSERT(session, ref->page == page); - WT_ASSERT(session, ref->state == WT_REF_EVICT_FORCE); - ref->state = WT_REF_LOCKED; - - /* - * If eviction fails, it will free up the page: hope it works - * next time. Application threads may be holding a reference - * while trying to get another (e.g., if they have two cursors - * open), so blocking indefinitely leads to deadlock. - */ - (void)__evict_page(session, page); - - /* Clear the reference to the btree handle. */ - WT_CLEAR_BTREE_IN_SESSION(session); - - /* Clear the request slot. */ - __evict_req_clr(er); - } - return (0); -} - -/* * __evict_lru -- * Evict pages from the cache based on their read generation. */ @@ -719,7 +541,7 @@ __evict_walk(WT_SESSION_IMPL *session) WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; - u_int elem, i; + u_int elem, file_count, i; conn = S2C(session); cache = S2C(session)->cache; @@ -729,8 +551,7 @@ __evict_walk(WT_SESSION_IMPL *session) * get some pages from each underlying file. In practice, a realloc * is rarely needed, so it is worth avoiding the LRU lock. */ - elem = WT_EVICT_WALK_BASE + - (conn->open_btree_count * WT_EVICT_WALK_PER_TABLE); + elem = WT_EVICT_WALK_BASE + 2 * WT_EVICT_GROUP; if (elem > cache->evict_entries) { __wt_spin_lock(session, &cache->evict_lock); /* Save the offset of the eviction point. */ @@ -749,7 +570,11 @@ __evict_walk(WT_SESSION_IMPL *session) * servicing eviction requests. */ i = WT_EVICT_WALK_BASE; + file_count = 0; TAILQ_FOREACH(btree, &conn->btqh, q) { + if (file_count++ < cache->evict_file_next) + continue; + /* * Skip files that aren't open or don't have a root page. * @@ -770,9 +595,10 @@ __evict_walk(WT_SESSION_IMPL *session) ret = __evict_walk_file(session, &i); WT_CLEAR_BTREE_IN_SESSION(session); - if (ret != 0) + if (ret != 0 || i == cache->evict_entries) break; } + cache->evict_file_next = (btree == NULL) ? 0 : file_count; if (0) { err: __wt_spin_unlock(session, &cache->evict_lock); diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index ad0fc237191..ecdf9d47a16 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -28,36 +28,18 @@ __wt_page_in_func( { WT_DECL_RET; WT_PAGE *page; - int busy, read_lockout, wake; - - /* - * Only wake the eviction server the first time through here (if the - * cache is too full), or after we fail to evict a page. Otherwise, we - * are just wasting effort and making a busy mutex busier. - */ - wake = 1; + int busy; for (;;) { switch (ref->state) { case WT_REF_DISK: case WT_REF_DELETED: - /* The page isn't in memory, attempt to read it. */ - - /* Check if there is space in the cache. */ - __wt_eviction_check(session, &read_lockout, wake); - wake = 0; - /* - * If the cache is full, give up, but only if we are - * not holding the schema lock. The schema lock can - * block checkpoints, and thus eviction, so it is not - * safe to wait for eviction if we are holding it. + * The page isn't in memory, attempt to read it. + * + * First make sure there is space in the cache. */ - if (read_lockout && - !F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) && - !F_ISSET(session->btree, WT_BTREE_NO_CACHE)) - break; - + WT_RET(__wt_cache_full_check(session)); WT_RET(__wt_cache_read(session, parent, ref)); continue; case WT_REF_EVICT_FORCE: @@ -92,12 +74,12 @@ __wt_page_in_func( * Ensure the page doesn't have ancient updates on it. * If it did, reading the page could ignore committed * updates. This should be extremely unlikely in real - * applications, force eviction of the page to avoid + * applications, wait for eviction of the page to avoid * the issue. */ if (page->modify != NULL && __wt_txn_ancient(session, page->modify->first_id)) { - __wt_evict_page_request(session, page); + page->read_gen = 0; __wt_hazard_clear(session, page); __wt_evict_server_wake(session); break; @@ -114,13 +96,8 @@ __wt_page_in_func( WT_ILLEGAL_VALUE(session); } - /* Find a page to evict -- if the page is busy, keep trying. */ - if ((ret = __wt_evict_lru_page(session, 1)) == EBUSY) - __wt_yield(); - else if (ret == WT_NOTFOUND) - wake = 1; - else - WT_RET(ret); + /* We failed to get the page -- yield before retrying. */ + __wt_yield(); } } diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c index 40ba30c6c5d..7d42cf4727e 100644 --- a/src/btree/row_modify.c +++ b/src/btree/row_modify.c @@ -268,9 +268,7 @@ __wt_insert_serial_func(WT_SESSION_IMPL *session, void *args) int __wt_update_check(WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *next) { - WT_DECL_RET; WT_TXN *txn; - int lockout, wake = 1; /* Discard obsolete WT_UPDATE structures. */ if (next != NULL) @@ -280,22 +278,6 @@ __wt_update_check(WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *next) WT_RET(__wt_txn_update_check(session, next)); /* - * Pause if the cache is full. - * This matches the logic in __wt_page_in_func. - */ - for (;;) { - __wt_eviction_check(session, &lockout, wake); - wake = 0; - if (!lockout || - F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) - break; - if ((ret = __wt_evict_lru_page(session, 1)) == EBUSY) - __wt_yield(); - else - WT_RET_NOTFOUND_OK(ret); - } - - /* * Record the transaction ID for the first update to a page. * We don't care if this races: there is a buffer built into the * check for ancient updates. diff --git a/src/include/cache.h b/src/include/cache.h index 2fc7e0fedd3..73dd5ab1c31 100644 --- a/src/include/cache.h +++ b/src/include/cache.h @@ -52,6 +52,7 @@ struct __wt_cache { WT_EVICT_ENTRY *evict_current; /* LRU current page to be evicted */ size_t evict_allocated; /* LRU list bytes allocated */ uint32_t evict_entries; /* LRU list eviction slots */ + u_int evict_file_next; /* LRU: next file to search */ /* * Forced-page eviction request information. diff --git a/src/include/cache.i b/src/include/cache.i index 75050eadde7..c06ea61925c 100644 --- a/src/include/cache.i +++ b/src/include/cache.i @@ -36,44 +36,32 @@ __wt_eviction_check(WT_SESSION_IMPL *session, int *read_lockoutp, int wake) } /* - * __wt_eviction_page_check -- - * Return if a page should be forcibly evicted. + * __wt_cache_full_check -- + * Wait for there to be space in the cache before a read or update. */ static inline int -__wt_eviction_page_check(WT_SESSION_IMPL *session, WT_PAGE *page) +__wt_cache_full_check(WT_SESSION_IMPL *session) { - WT_CONNECTION_IMPL *conn; - WT_PAGE_MODIFY *mod; - - conn = S2C(session); - mod = page->modify; - - /* - * Root pages and clean pages are never forcibly evicted. - * Nor are pages from files that are purely cache resident. - */ - if (WT_PAGE_IS_ROOT(page) || - !__wt_page_is_modified(page) || - F_ISSET(session->btree, WT_BTREE_NO_EVICTION)) - return (0); + WT_BTREE *btree; + WT_DECL_RET; + int lockout, wake; /* - * Check the page's memory footprint - evict pages that take up more - * than their fair share of the cache. We define a fair share as - * approximately half the cache size per open writable btree handle. + * Only wake the eviction server the first time through here (if the + * cache is too full). Otherwise, we are just wasting effort and + * making a busy condition variable busier. */ - if ((int64_t)page->memory_footprint > - conn->cache_size / (2 * (conn->open_btree_count + 1))) - return (1); - - /* - * If the page's write-generation has wrapped and caught up with the - * page's disk generation (wildly unlikely as it requires 4B updates - * between page reconciliations, but is technically possible), forcibly - * evict the page. - */ - if (mod != NULL && mod->write_gen + 1 == mod->disk_gen) - return (1); - - return (0); + for (wake = 1;; wake = 0) { + __wt_eviction_check(session, &lockout, wake); + if (!lockout || + F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) + return (0); + if ((btree = session->btree) != NULL && F_ISSET(btree, + WT_BTREE_BULK | WT_BTREE_NO_CACHE | WT_BTREE_NO_EVICTION)) + return (0); + if ((ret = __wt_evict_lru_page(session, 1)) == EBUSY) + __wt_yield(); + else + WT_RET_NOTFOUND_OK(ret); + } } diff --git a/src/include/cursor.i b/src/include/cursor.i index f81b955fe7f..81480d88e7a 100644 --- a/src/include/cursor.i +++ b/src/include/cursor.i @@ -72,8 +72,17 @@ __cursor_leave(WT_CURSOR_BTREE *cbt) if (F_ISSET(cbt, WT_CBT_ACTIVE)) { WT_ASSERT(session, session->ncursors > 0); - if (--session->ncursors == 0) + if (--session->ncursors == 0) { __wt_txn_read_last(session); + + /* + * We no longer have any active cursors, check if our + * operation overflowed the cache. We don't care if we + * fail to evict pages: our operation is done + * regardless. + */ + (void)__wt_cache_full_check(session); + } F_CLR(cbt, WT_CBT_ACTIVE); } } diff --git a/src/include/extern.h b/src/include/extern.h index 04d4defef22..a8ef1594153 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -292,7 +292,6 @@ extern void __wt_page_out(WT_SESSION_IMPL *session, extern void __wt_evict_list_clr_page(WT_SESSION_IMPL *session, WT_PAGE *page); extern void __wt_evict_server_wake(WT_SESSION_IMPL *session); extern int __wt_sync_file_serial_func(WT_SESSION_IMPL *session, void *args); -extern void __wt_evict_page_request(WT_SESSION_IMPL *session, WT_PAGE *page); extern void *__wt_cache_evict_server(void *arg); extern int __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app); extern int __wt_btree_create(WT_SESSION_IMPL *session, const char *filename); diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index a885632c501..a05e3f36bbf 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -114,9 +114,9 @@ __clsm_open_cursors(WT_CURSOR_LSM *clsm, int start_chunk) WT_LSM_TREE *lsm_tree; WT_SESSION_IMPL *session; const char *ckpt_cfg[] = API_CONF_DEFAULTS(session, open_cursor, - "checkpoint=WiredTigerCheckpoint"); + "checkpoint=WiredTigerCheckpoint,raw"); const char *merge_cfg[] = API_CONF_DEFAULTS(session, open_cursor, - "checkpoint=WiredTigerCheckpoint,no_cache"); + "checkpoint=WiredTigerCheckpoint,no_cache,raw"); int i, nchunks; session = (WT_SESSION_IMPL *)clsm->iface.session; diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c index fe5da13206d..b5ae6d4a39c 100644 --- a/src/lsm/lsm_merge.c +++ b/src/lsm/lsm_merge.c @@ -150,7 +150,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_ASSERT(session, nchunks <= max_chunks); if (nchunks <= 1) - return (0); + return (WT_NOTFOUND); /* Allocate an ID for the merge. */ dest_id = WT_ATOMIC_ADD(lsm_tree->last, 1); @@ -159,7 +159,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) "Merging chunks %d-%d into %d (%" PRIu64 " records)\n", start_chunk, end_chunk, dest_id, record_count); - if (lsm_tree->bloom != 0) { + if (lsm_tree->bloom != 0 && start_chunk > 0 && record_count > 0) { WT_RET(__wt_scr_alloc(session, 0, &bbuf)); WT_ERR(__wt_lsm_tree_bloom_name( session, lsm_tree, dest_id, bbuf)); diff --git a/src/support/hazard.c b/src/support/hazard.c index 670175ed124..df362278d25 100644 --- a/src/support/hazard.c +++ b/src/support/hazard.c @@ -139,16 +139,6 @@ __wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page) hp < session->hazard + session->hazard_size; ++hp) if (hp->page == page) { /* - * Check to see if the page has grown too big and force - * eviction. We have to request eviction while holding - * a hazard reference (else the page might disappear out - * from under us), but we can't wake the eviction server - * until we've released our hazard reference because our - * hazard reference blocks the page eviction. A little - * dance: check the page, schedule the forced eviction, - * clear/publish the hazard reference, wake the eviction - * server. - * * We don't publish the hazard reference clear in the * general case. It's not required for correctness; * it gives the page server thread faster access to the @@ -156,12 +146,7 @@ __wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page) * generation number was just set, so it's unlikely the * page will be selected for eviction. */ - if (__wt_eviction_page_check(session, page)) { - __wt_evict_page_request(session, page); - WT_PUBLISH(hp->page, NULL); - __wt_evict_server_wake(session); - } else - hp->page = NULL; + hp->page = NULL; /* * If this was the last hazard reference in the session, |