diff options
author | Michael Cahill <mjc@wiredtiger.com> | 2012-03-20 04:04:33 -0700 |
---|---|---|
committer | Michael Cahill <mjc@wiredtiger.com> | 2012-03-20 04:04:33 -0700 |
commit | a1c07620e239673dbf6959ec8f68e9ef7cbb526b (patch) | |
tree | cc89ff982615c696cdc7c6083840809f00b0705e | |
parent | 0d09743171e69f95d75ba4829085dc737b477d98 (diff) | |
parent | 6ab5469761fc48024a91ce4390bccc15a0bba448 (diff) | |
download | mongo-a1c07620e239673dbf6959ec8f68e9ef7cbb526b.tar.gz |
Merge pull request #185 from wiredtiger/eviction-tuning
Eviction tuning changes, particularly for read-only, out-of-cache workloads.
-rw-r--r-- | src/btree/bt_debug.c | 3 |
-rw-r--r-- | src/btree/bt_evict.c | 124 |
-rw-r--r-- | src/btree/bt_page.c | 30 |
-rw-r--r-- | src/btree/bt_walk.c | 28 |
-rw-r--r-- | src/btree/rec_evict.c | 13 |
-rw-r--r-- | src/include/btmem.h | 15 |
-rw-r--r-- | src/include/extern.h | 4 |
-rw-r--r-- | src/support/hazard.c | 5 |
8 files changed, 131 insertions, 91 deletions
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index 69089a0e209..430c6b6b504 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -794,6 +794,9 @@ __debug_ref(WT_DBG *ds, WT_REF *ref, WT_PAGE *page) case WT_REF_EVICTING: __dmsg(ds, "evicting %p", ref->page); break; + case WT_REF_EVICT_WALK: + __dmsg(ds, "evict-walk %p", ref->page); + break; case WT_REF_LOCKED: __dmsg(ds, "locked %p", ref->page); break; diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c index 36aea04e9aa..2fa84475d5b 100644 --- a/src/btree/bt_evict.c +++ b/src/btree/bt_evict.c @@ -22,8 +22,8 @@ static int __evict_worker(WT_SESSION_IMPL *); * Tuning constants: I hesitate to call this tuning, but we want to review some * number of pages from each file's in-memory tree for each page we evict. */ -#define WT_EVICT_GROUP 20 /* Evict N pages at a time */ -#define WT_EVICT_WALK_PER_TABLE 25 /* Pages to visit per file */ +#define WT_EVICT_GROUP 30 /* Consider N pages as LRU candidates */ +#define WT_EVICT_WALK_PER_TABLE 35 /* Pages to visit per file */ #define WT_EVICT_WALK_BASE 50 /* Pages tracked across file visits */ /* @@ -212,7 +212,7 @@ __wt_cache_evict_server(void *arg) WT_CONNECTION_IMPL *conn; WT_SESSION_IMPL *session; WT_CACHE *cache; - int ret; + int read_lockout, ret; conn = arg; cache = conn->cache; @@ -227,14 +227,16 @@ __wt_cache_evict_server(void *arg) while (F_ISSET(conn, WT_SERVER_RUN)) { /* - * Use the same logic as application threads to decide - * whether there is work to do. If so, evict_cond will - * be signalled and the wait below won't block. + * Use the same logic as application threads to decide whether + * there is work to do. 
*/ - __wt_eviction_check(session, NULL, 1); + __wt_eviction_check(session, &read_lockout, 0); + + if (!read_lockout) { + WT_VERBOSE(session, evictserver, "sleeping"); + __wt_cond_wait(session, cache->evict_cond); + } - WT_VERBOSE(session, evictserver, "sleeping"); - __wt_cond_wait(session, cache->evict_cond); if (!F_ISSET(conn, WT_SERVER_RUN)) break; WT_VERBOSE(session, evictserver, "waking"); @@ -384,7 +386,6 @@ __evict_request_walk(WT_SESSION_IMPL *session) * won't be useful; Discard any page we're holding and * we can restart our walk as needed. */ - session->btree->evict_page = NULL; ret = __evict_file(session, er); } @@ -422,6 +423,13 @@ __evict_file(WT_SESSION_IMPL *session, WT_EVICT_REQ *er) "file request: %s", (F_ISSET(er, WT_EVICT_REQ_CLOSE) ? "close" : "sync")); + /* Clear the current eviction point. */ + if ((page = session->btree->evict_page) != NULL && + !WT_PAGE_IS_ROOT(page)) + (void)WT_ATOMIC_CAS(page->ref->state, + WT_REF_EVICT_WALK, WT_REF_MEM); + session->btree->evict_page = NULL; + /* If this is a close, wait for LRU eviction activity to drain. */ while (F_ISSET(er, WT_EVICT_REQ_CLOSE) && er->btree->lru_count > 0) __wt_yield(); @@ -476,25 +484,23 @@ static int __evict_lru(WT_SESSION_IMPL *session) { WT_CACHE *cache; - int ret; cache = S2C(session)->cache; - __wt_spin_lock(session, &cache->lru_lock); - /* Get some more pages to consider for eviction. */ - WT_ERR(__evict_walk(session)); + WT_RET(__evict_walk(session)); - /* Remove duplicates from the list. */ + /* Sort and remove duplicates from the list, restart. */ + __wt_spin_lock(session, &cache->lru_lock); __evict_dup_remove(session); -err: __wt_spin_unlock(session, &cache->lru_lock); + cache->evict_current = cache->evict; + __wt_spin_unlock(session, &cache->lru_lock); /* Reconcile and discard some pages. 
*/ - if (ret == 0) - __evict_pages(session); + __evict_pages(session); - return (ret); + return (0); } /* @@ -512,24 +518,33 @@ __evict_walk(WT_SESSION_IMPL *session) conn = S2C(session); cache = S2C(session)->cache; + ret = 0; /* - * Resize the array in which we're tracking pages, as necessary, then - * get some pages from each underlying file. We hold a spinlock for - * the entire time -- it's slow, but (1) how often do new files get - * added or removed to/from the system, and (2) it's all in-memory - * stuff, so it's not that slow. + * We hold a spinlock for the entire walk -- it's slow, but (1) how + * often do new files get added or removed to/from the system, and (2) + * it's all in-memory stuff, so it's not that slow. */ - ret = 0; __wt_spin_lock(session, &conn->spinlock); + /* + * Resize the array in which we're tracking pages, as necessary, then + * get some pages from each underlying file. In practice, a realloc + * is rarely needed, so it is worth avoiding the LRU lock. + */ elem = WT_EVICT_WALK_BASE + (conn->btqcnt * WT_EVICT_WALK_PER_TABLE); if (elem > cache->evict_entries) { + __wt_spin_lock(session, &cache->lru_lock); + /* Save the offset of the eviction point. */ + if (cache->evict_current != NULL) + i = (u_int)(cache->evict_current - cache->evict); WT_ERR(__wt_realloc(session, &cache->evict_allocated, elem * sizeof(WT_EVICT_LIST), &cache->evict)); cache->evict_entries = elem; + if (cache->evict_current != NULL) + cache->evict_current = cache->evict + i; + __wt_spin_unlock(session, &cache->lru_lock); } - cache->evict_current = cache->evict; i = WT_EVICT_WALK_BASE; TAILQ_FOREACH(btree, &conn->btqh, q) { @@ -564,13 +579,6 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp) cache = S2C(session)->cache; /* - * Wait for application threads doing eviction in this file to drain: - * we're examining pages without holding hazard references. - */ - while (btree->lru_count > 0) - __wt_yield(); - - /* * Get the next WT_EVICT_WALK_PER_TABLE entries. 
* * We can't evict the page just returned to us, it marks our place in @@ -585,18 +593,14 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp) } /* - * Root and pinned pages can't be evicted, nor can locked - * pages: we would skip them later, and they just fill up the - * eviction list for no benefit. - * - * Skip pages that must be merged into their parents. Don't - * skip pages marked WT_PAGE_REC_EMPTY or SPLIT: updates after - * their last reconciliation may have changed their state and - * only the eviction code can check whether they should really - * be skipped. + * Root pages can't be evicted, nor can skip pages that must be + * merged into their parents. Don't skip pages marked + * WT_PAGE_REC_EMPTY or SPLIT: updates after their last + * reconciliation may have changed their state and only the + * eviction code can check whether they should really be + * skipped. */ if (WT_PAGE_IS_ROOT(page) || - page->ref->state != WT_REF_MEM || F_ISSET(page, WT_PAGE_REC_SPLIT_MERGE)) continue; @@ -667,22 +671,38 @@ __evict_dup_remove(WT_SESSION_IMPL *session) */ static void __evict_get_page( - WT_SESSION_IMPL *session, WT_BTREE **btreep, WT_PAGE **pagep) + WT_SESSION_IMPL *session, int is_app, WT_BTREE **btreep, WT_PAGE **pagep) { WT_CACHE *cache; WT_EVICT_LIST *evict; WT_REF *ref; + int candidates, read_lockout; cache = S2C(session)->cache; *btreep = NULL; *pagep = NULL; - if (__wt_spin_trylock(session, &cache->lru_lock) != 0) - return; + candidates = (is_app ? WT_EVICT_GROUP : WT_EVICT_GROUP / 2); + + /* + * Avoid the LRU lock if no pages are available. If there are pages + * available, spin until we get the lock. If this function returns + * without getting a page to evict, application threads assume there + * are no more pages available and will attempt to wake the eviction + * server. 
+ */ + for (;;) { + if (cache->evict_current == NULL || + cache->evict_current >= cache->evict + candidates) + return; + if (__wt_spin_trylock(session, &cache->lru_lock) == 0) + break; + __wt_yield(); + } /* Get the next page queued for eviction. */ while ((evict = cache->evict_current) != NULL && - evict >= cache->evict && evict < cache->evict + WT_EVICT_GROUP && + evict >= cache->evict && evict < cache->evict + candidates && evict->page != NULL) { WT_ASSERT(session, evict->btree != NULL); @@ -717,6 +737,8 @@ __evict_get_page( break; } + if (is_app && *pagep == NULL) + cache->evict_current = NULL; __wt_spin_unlock(session, &cache->lru_lock); } @@ -725,12 +747,12 @@ __evict_get_page( * Called by both eviction and application threads to evict a page. */ int -__wt_evict_lru_page(WT_SESSION_IMPL *session) +__wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app) { WT_BTREE *btree, *saved_btree; WT_PAGE *page; - __evict_get_page(session, &btree, &page); + __evict_get_page(session, is_app, &btree, &page); if (page == NULL) return (WT_NOTFOUND); @@ -770,8 +792,8 @@ __evict_pages(WT_SESSION_IMPL *session) { u_int i; - for (i = 0; i < WT_EVICT_GROUP; i++) - if (__wt_evict_lru_page(session) != 0) + for (i = 0; i < WT_EVICT_GROUP / 2; i++) + if (__wt_evict_lru_page(session, 0) != 0) break; } diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index 583b43e0137..35b97f2df32 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -26,13 +26,14 @@ __wt_page_in_func( #endif ) { - int first, read_lockout; + int wake, read_lockout; /* - * Only wake the eviction server once: after that, we're just wasting - * effort and making a busy mutex busier. + * Only wake the eviction server the first time through here (if the + * cache is too full), or after we fail to evict a page. Otherwise, we + * are just wasting effort and making a busy mutex busier. 
*/ - first = 1; + wake = 1; for (;;) { switch (ref->state) { @@ -41,10 +42,11 @@ __wt_page_in_func( * The page isn't in memory, attempt to set the * state to WT_REF_READING. If successful, read it. */ - __wt_eviction_check(session, &read_lockout, first); - first = 0; - if (read_lockout || !WT_ATOMIC_CAS( - ref->state, WT_REF_DISK, WT_REF_READING)) + __wt_eviction_check(session, &read_lockout, wake); + wake = 0; + + if (read_lockout || !WT_ATOMIC_CAS(ref->state, + WT_REF_DISK, WT_REF_READING)) break; WT_RET(__wt_cache_read(session, parent, ref)); @@ -57,6 +59,7 @@ __wt_page_in_func( * wait for that to be resolved. */ break; + case WT_REF_EVICT_WALK: case WT_REF_MEM: /* * The page is in memory: get a hazard reference, update @@ -79,13 +82,12 @@ __wt_page_in_func( } /* - * Find a page to evict -- if that succeeds, - * try again immediately. If it fails, we - * don't care why, but give up our slice before - * retrying. + * Find a page to evict -- if that fails, we don't care why, + * but we may need to wake the eviction server again if the + * cache is still full. */ - if (__wt_evict_lru_page(session) != 0) - __wt_yield(); + if (__wt_evict_lru_page(session, 1) != 0) + wake = 1; } } diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c index bc00473ed09..38aa8eb3eb8 100644 --- a/src/btree/bt_walk.c +++ b/src/btree/bt_walk.c @@ -12,7 +12,7 @@ * Move to the next/previous page in the tree. 
*/ int -__wt_tree_np(WT_SESSION_IMPL *session, WT_PAGE **pagep, int cacheonly, int next) +__wt_tree_np(WT_SESSION_IMPL *session, WT_PAGE **pagep, int eviction, int next) { WT_BTREE *btree; WT_PAGE *page, *t; @@ -21,6 +21,7 @@ __wt_tree_np(WT_SESSION_IMPL *session, WT_PAGE **pagep, int cacheonly, int next) int ret; btree = session->btree; + ret = 0; /* * Take a copy of any returned page; we have a hazard reference on the @@ -60,9 +61,17 @@ __wt_tree_np(WT_SESSION_IMPL *session, WT_PAGE **pagep, int cacheonly, int next) * to evict our parent, that fails because the parent has a child page * that can't be discarded. */ - ret = (WT_PAGE_IS_ROOT(t) || cacheonly) ? - 0 : __wt_page_in(session, t, t->ref); - if (!cacheonly) { + if (eviction) { + if (!WT_PAGE_IS_ROOT(t)) { + while (!WT_ATOMIC_CAS(t->ref->state, + WT_REF_MEM, WT_REF_EVICT_WALK)) + __wt_yield(); + } + WT_ASSERT(session, page->ref->state == WT_REF_EVICT_WALK); + page->ref->state = WT_REF_MEM; + } else { + if (!WT_PAGE_IS_ROOT(t)) + ret = __wt_page_in(session, t, t->ref); __wt_page_release(session, page); WT_RET(ret); } @@ -94,9 +103,15 @@ descend: for (;;) { } /* We may only care about in-memory pages. */ - if (cacheonly) { - if (ref->state != WT_REF_MEM) + if (eviction) { + if (!WT_ATOMIC_CAS(ref->state, + WT_REF_MEM, WT_REF_EVICT_WALK)) break; + if (!WT_PAGE_IS_ROOT(page)) { + WT_ASSERT(session, page->ref->state == + WT_REF_EVICT_WALK); + page->ref->state = WT_REF_MEM; + } } else { /* * Swap hazard references at each level (but @@ -109,7 +124,6 @@ descend: for (;;) { } page = ref->page; - WT_ASSERT(session, ref->state == WT_REF_MEM); WT_ASSERT(session, page != NULL); slot = next ? 
0 : page->entries - 1; } diff --git a/src/btree/rec_evict.c b/src/btree/rec_evict.c index 9da60eda485..fa86b0a943f 100644 --- a/src/btree/rec_evict.c +++ b/src/btree/rec_evict.c @@ -334,16 +334,8 @@ __rec_discard_page(WT_SESSION_IMPL *session, WT_PAGE *page) __wt_page_out(session, mod->u.split, 0); } - /* - * If we are evicting the file's current eviction point, clear it so - * the walk will be restarted. - * - * !!! - * This check would arguably be cleaner in bt_evict.c, but that level - * isn't aware of all of the pages within a subtree that are evicted. - */ - if (session->btree->evict_page == page) - session->btree->evict_page = NULL; + /* We should never evict the file's current eviction point. */ + WT_ASSERT(session, session->btree->evict_page != page); /* Discard the page itself. */ __wt_page_out(session, page, 0); @@ -389,6 +381,7 @@ __rec_review(WT_SESSION_IMPL *session, WT_RET(__rec_review( session, ref, ref->page, flags, 0)); break; + case WT_REF_EVICT_WALK: /* Walk point */ case WT_REF_EVICTING: /* Being evaluated */ case WT_REF_LOCKED: /* Being evicted */ case WT_REF_READING: /* Being read */ diff --git a/src/include/btmem.h b/src/include/btmem.h index 8e3c1986992..e8ec2c7e9f7 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -343,20 +343,24 @@ struct __wt_ref { * * WT_REF_EVICTING: * Set by eviction when a page is about to be locked; prevents a - * page from being evicted multiple times concurrently. + * page from being evicted multiple times concurrently. + * + * WT_REF_EVICT_WALK: + * The next page to be walked for LRU eviction. This page is + * available for reads but not eviction. * * WT_REF_LOCKED: * Set by eviction; an eviction thread has selected this page or - * a parent for eviction; once hazard references are checked, the page - * will be evicted. + * a parent for eviction; once hazard references are checked, the + * page will be evicted. 
* * WT_REF_MEM: * Set by a reading thread once the page has been read from disk; - * the page is in the cache and the page reference is OK. + * the page is in the cache and the page reference is OK. * * WT_REF_READING: * Set by a reading thread before reading a page from disk; other - * readers of the page wait until the read completes. + * readers of the page wait until the read completes. * * The life cycle of a typical page goes like this: pages are read into * memory from disk and their state set to WT_REF_MEM. When the page is @@ -381,6 +385,7 @@ struct __wt_ref { volatile enum { WT_REF_DISK=0, /* Page is on disk */ WT_REF_EVICTING, /* Page being evaluated for eviction */ + WT_REF_EVICT_WALK, /* Next page for LRU eviction */ WT_REF_LOCKED, /* Page being evicted */ WT_REF_MEM, /* Page is in cache and valid */ WT_REF_READING /* Page being read */ diff --git a/src/include/extern.h b/src/include/extern.h index e262dbb5a66..979f578a0a4 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -196,7 +196,7 @@ extern void __wt_evict_server_wake(WT_SESSION_IMPL *session); extern void __wt_evict_file_serial_func(WT_SESSION_IMPL *session); extern int __wt_evict_page_request(WT_SESSION_IMPL *session, WT_PAGE *page); extern void *__wt_cache_evict_server(void *arg); -extern int __wt_evict_lru_page(WT_SESSION_IMPL *session); +extern int __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app); extern int __wt_btree_create(WT_SESSION_IMPL *session, const char *filename); extern int __wt_btree_truncate(WT_SESSION_IMPL *session, const char *filename); extern int __wt_btree_open(WT_SESSION_IMPL *session, @@ -252,7 +252,7 @@ extern int __wt_verify_dsk(WT_SESSION_IMPL *session, uint32_t size); extern int __wt_tree_np(WT_SESSION_IMPL *session, WT_PAGE **pagep, - int cacheonly, + int eviction, int next); extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, diff --git a/src/support/hazard.c b/src/support/hazard.c index 9219c17e0a7..e22d25be3c2 100644 
--- a/src/support/hazard.c +++ b/src/support/hazard.c @@ -56,9 +56,10 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref /* * Check to see if the page state is still valid (where valid - * means a state of WT_REF_MEM). + * means a state of WT_REF_MEM or WT_REF_EVICT_WALK). */ - if (ref->state == WT_REF_MEM) { + if (ref->state == WT_REF_MEM || + ref->state == WT_REF_EVICT_WALK) { WT_VERBOSE(session, hazard, "session %p hazard %p: set", session, ref->page); return (0); |