summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Michael Cahill <mjc@wiredtiger.com>, 2012-03-20 04:04:33 -0700
committer: Michael Cahill <mjc@wiredtiger.com>, 2012-03-20 04:04:33 -0700
commit: a1c07620e239673dbf6959ec8f68e9ef7cbb526b (patch)
tree: cc89ff982615c696cdc7c6083840809f00b0705e
parent: 0d09743171e69f95d75ba4829085dc737b477d98 (diff)
parent: 6ab5469761fc48024a91ce4390bccc15a0bba448 (diff)
download: mongo-a1c07620e239673dbf6959ec8f68e9ef7cbb526b.tar.gz
Merge pull request #185 from wiredtiger/eviction-tuning
Eviction tuning changes, particularly for read-only, out-of-cache workloads.
-rw-r--r-- src/btree/bt_debug.c 3
-rw-r--r-- src/btree/bt_evict.c 124
-rw-r--r-- src/btree/bt_page.c 30
-rw-r--r-- src/btree/bt_walk.c 28
-rw-r--r-- src/btree/rec_evict.c 13
-rw-r--r-- src/include/btmem.h 15
-rw-r--r-- src/include/extern.h 4
-rw-r--r-- src/support/hazard.c 5
8 files changed, 131 insertions, 91 deletions
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index 69089a0e209..430c6b6b504 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -794,6 +794,9 @@ __debug_ref(WT_DBG *ds, WT_REF *ref, WT_PAGE *page)
case WT_REF_EVICTING:
__dmsg(ds, "evicting %p", ref->page);
break;
+ case WT_REF_EVICT_WALK:
+ __dmsg(ds, "evict-walk %p", ref->page);
+ break;
case WT_REF_LOCKED:
__dmsg(ds, "locked %p", ref->page);
break;
diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c
index 36aea04e9aa..2fa84475d5b 100644
--- a/src/btree/bt_evict.c
+++ b/src/btree/bt_evict.c
@@ -22,8 +22,8 @@ static int __evict_worker(WT_SESSION_IMPL *);
* Tuning constants: I hesitate to call this tuning, but we want to review some
* number of pages from each file's in-memory tree for each page we evict.
*/
-#define WT_EVICT_GROUP 20 /* Evict N pages at a time */
-#define WT_EVICT_WALK_PER_TABLE 25 /* Pages to visit per file */
+#define WT_EVICT_GROUP 30 /* Consider N pages as LRU candidates */
+#define WT_EVICT_WALK_PER_TABLE 35 /* Pages to visit per file */
#define WT_EVICT_WALK_BASE 50 /* Pages tracked across file visits */
/*
@@ -212,7 +212,7 @@ __wt_cache_evict_server(void *arg)
WT_CONNECTION_IMPL *conn;
WT_SESSION_IMPL *session;
WT_CACHE *cache;
- int ret;
+ int read_lockout, ret;
conn = arg;
cache = conn->cache;
@@ -227,14 +227,16 @@ __wt_cache_evict_server(void *arg)
while (F_ISSET(conn, WT_SERVER_RUN)) {
/*
- * Use the same logic as application threads to decide
- * whether there is work to do. If so, evict_cond will
- * be signalled and the wait below won't block.
+ * Use the same logic as application threads to decide whether
+ * there is work to do.
*/
- __wt_eviction_check(session, NULL, 1);
+ __wt_eviction_check(session, &read_lockout, 0);
+
+ if (!read_lockout) {
+ WT_VERBOSE(session, evictserver, "sleeping");
+ __wt_cond_wait(session, cache->evict_cond);
+ }
- WT_VERBOSE(session, evictserver, "sleeping");
- __wt_cond_wait(session, cache->evict_cond);
if (!F_ISSET(conn, WT_SERVER_RUN))
break;
WT_VERBOSE(session, evictserver, "waking");
@@ -384,7 +386,6 @@ __evict_request_walk(WT_SESSION_IMPL *session)
* won't be useful; Discard any page we're holding and
* we can restart our walk as needed.
*/
- session->btree->evict_page = NULL;
ret = __evict_file(session, er);
}
@@ -422,6 +423,13 @@ __evict_file(WT_SESSION_IMPL *session, WT_EVICT_REQ *er)
"file request: %s",
(F_ISSET(er, WT_EVICT_REQ_CLOSE) ? "close" : "sync"));
+ /* Clear the current eviction point. */
+ if ((page = session->btree->evict_page) != NULL &&
+ !WT_PAGE_IS_ROOT(page))
+ (void)WT_ATOMIC_CAS(page->ref->state,
+ WT_REF_EVICT_WALK, WT_REF_MEM);
+ session->btree->evict_page = NULL;
+
/* If this is a close, wait for LRU eviction activity to drain. */
while (F_ISSET(er, WT_EVICT_REQ_CLOSE) && er->btree->lru_count > 0)
__wt_yield();
@@ -476,25 +484,23 @@ static int
__evict_lru(WT_SESSION_IMPL *session)
{
WT_CACHE *cache;
- int ret;
cache = S2C(session)->cache;
- __wt_spin_lock(session, &cache->lru_lock);
-
/* Get some more pages to consider for eviction. */
- WT_ERR(__evict_walk(session));
+ WT_RET(__evict_walk(session));
- /* Remove duplicates from the list. */
+ /* Sort and remove duplicates from the list, restart. */
+ __wt_spin_lock(session, &cache->lru_lock);
__evict_dup_remove(session);
-err: __wt_spin_unlock(session, &cache->lru_lock);
+ cache->evict_current = cache->evict;
+ __wt_spin_unlock(session, &cache->lru_lock);
/* Reconcile and discard some pages. */
- if (ret == 0)
- __evict_pages(session);
+ __evict_pages(session);
- return (ret);
+ return (0);
}
/*
@@ -512,24 +518,33 @@ __evict_walk(WT_SESSION_IMPL *session)
conn = S2C(session);
cache = S2C(session)->cache;
+ ret = 0;
/*
- * Resize the array in which we're tracking pages, as necessary, then
- * get some pages from each underlying file. We hold a spinlock for
- * the entire time -- it's slow, but (1) how often do new files get
- * added or removed to/from the system, and (2) it's all in-memory
- * stuff, so it's not that slow.
+ * We hold a spinlock for the entire walk -- it's slow, but (1) how
+ * often do new files get added or removed to/from the system, and (2)
+ * it's all in-memory stuff, so it's not that slow.
*/
- ret = 0;
__wt_spin_lock(session, &conn->spinlock);
+ /*
+ * Resize the array in which we're tracking pages, as necessary, then
+ * get some pages from each underlying file. In practice, a realloc
+ * is rarely needed, so it is worth avoiding the LRU lock.
+ */
elem = WT_EVICT_WALK_BASE + (conn->btqcnt * WT_EVICT_WALK_PER_TABLE);
if (elem > cache->evict_entries) {
+ __wt_spin_lock(session, &cache->lru_lock);
+ /* Save the offset of the eviction point. */
+ if (cache->evict_current != NULL)
+ i = (u_int)(cache->evict_current - cache->evict);
WT_ERR(__wt_realloc(session, &cache->evict_allocated,
elem * sizeof(WT_EVICT_LIST), &cache->evict));
cache->evict_entries = elem;
+ if (cache->evict_current != NULL)
+ cache->evict_current = cache->evict + i;
+ __wt_spin_unlock(session, &cache->lru_lock);
}
- cache->evict_current = cache->evict;
i = WT_EVICT_WALK_BASE;
TAILQ_FOREACH(btree, &conn->btqh, q) {
@@ -564,13 +579,6 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
cache = S2C(session)->cache;
/*
- * Wait for application threads doing eviction in this file to drain:
- * we're examining pages without holding hazard references.
- */
- while (btree->lru_count > 0)
- __wt_yield();
-
- /*
* Get the next WT_EVICT_WALK_PER_TABLE entries.
*
* We can't evict the page just returned to us, it marks our place in
@@ -585,18 +593,14 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
}
/*
- * Root and pinned pages can't be evicted, nor can locked
- * pages: we would skip them later, and they just fill up the
- * eviction list for no benefit.
- *
- * Skip pages that must be merged into their parents. Don't
- * skip pages marked WT_PAGE_REC_EMPTY or SPLIT: updates after
- * their last reconciliation may have changed their state and
- * only the eviction code can check whether they should really
- * be skipped.
+ * Root pages can't be evicted; also skip pages that must be
+ * merged into their parents. Don't skip pages marked
+ * WT_PAGE_REC_EMPTY or SPLIT: updates after their last
+ * reconciliation may have changed their state and only the
+ * eviction code can check whether they should really be
+ * skipped.
*/
if (WT_PAGE_IS_ROOT(page) ||
- page->ref->state != WT_REF_MEM ||
F_ISSET(page, WT_PAGE_REC_SPLIT_MERGE))
continue;
@@ -667,22 +671,38 @@ __evict_dup_remove(WT_SESSION_IMPL *session)
*/
static void
__evict_get_page(
- WT_SESSION_IMPL *session, WT_BTREE **btreep, WT_PAGE **pagep)
+ WT_SESSION_IMPL *session, int is_app, WT_BTREE **btreep, WT_PAGE **pagep)
{
WT_CACHE *cache;
WT_EVICT_LIST *evict;
WT_REF *ref;
+ int candidates, read_lockout;
cache = S2C(session)->cache;
*btreep = NULL;
*pagep = NULL;
- if (__wt_spin_trylock(session, &cache->lru_lock) != 0)
- return;
+ candidates = (is_app ? WT_EVICT_GROUP : WT_EVICT_GROUP / 2);
+
+ /*
+ * Avoid the LRU lock if no pages are available. If there are pages
+ * available, spin until we get the lock. If this function returns
+ * without getting a page to evict, application threads assume there
+ * are no more pages available and will attempt to wake the eviction
+ * server.
+ */
+ for (;;) {
+ if (cache->evict_current == NULL ||
+ cache->evict_current >= cache->evict + candidates)
+ return;
+ if (__wt_spin_trylock(session, &cache->lru_lock) == 0)
+ break;
+ __wt_yield();
+ }
/* Get the next page queued for eviction. */
while ((evict = cache->evict_current) != NULL &&
- evict >= cache->evict && evict < cache->evict + WT_EVICT_GROUP &&
+ evict >= cache->evict && evict < cache->evict + candidates &&
evict->page != NULL) {
WT_ASSERT(session, evict->btree != NULL);
@@ -717,6 +737,8 @@ __evict_get_page(
break;
}
+ if (is_app && *pagep == NULL)
+ cache->evict_current = NULL;
__wt_spin_unlock(session, &cache->lru_lock);
}
@@ -725,12 +747,12 @@ __evict_get_page(
* Called by both eviction and application threads to evict a page.
*/
int
-__wt_evict_lru_page(WT_SESSION_IMPL *session)
+__wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app)
{
WT_BTREE *btree, *saved_btree;
WT_PAGE *page;
- __evict_get_page(session, &btree, &page);
+ __evict_get_page(session, is_app, &btree, &page);
if (page == NULL)
return (WT_NOTFOUND);
@@ -770,8 +792,8 @@ __evict_pages(WT_SESSION_IMPL *session)
{
u_int i;
- for (i = 0; i < WT_EVICT_GROUP; i++)
- if (__wt_evict_lru_page(session) != 0)
+ for (i = 0; i < WT_EVICT_GROUP / 2; i++)
+ if (__wt_evict_lru_page(session, 0) != 0)
break;
}
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index 583b43e0137..35b97f2df32 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -26,13 +26,14 @@ __wt_page_in_func(
#endif
)
{
- int first, read_lockout;
+ int wake, read_lockout;
/*
- * Only wake the eviction server once: after that, we're just wasting
- * effort and making a busy mutex busier.
+ * Only wake the eviction server the first time through here (if the
+ * cache is too full), or after we fail to evict a page. Otherwise, we
+ * are just wasting effort and making a busy mutex busier.
*/
- first = 1;
+ wake = 1;
for (;;) {
switch (ref->state) {
@@ -41,10 +42,11 @@ __wt_page_in_func(
* The page isn't in memory, attempt to set the
* state to WT_REF_READING. If successful, read it.
*/
- __wt_eviction_check(session, &read_lockout, first);
- first = 0;
- if (read_lockout || !WT_ATOMIC_CAS(
- ref->state, WT_REF_DISK, WT_REF_READING))
+ __wt_eviction_check(session, &read_lockout, wake);
+ wake = 0;
+
+ if (read_lockout || !WT_ATOMIC_CAS(ref->state,
+ WT_REF_DISK, WT_REF_READING))
break;
WT_RET(__wt_cache_read(session, parent, ref));
@@ -57,6 +59,7 @@ __wt_page_in_func(
* wait for that to be resolved.
*/
break;
+ case WT_REF_EVICT_WALK:
case WT_REF_MEM:
/*
* The page is in memory: get a hazard reference, update
@@ -79,13 +82,12 @@ __wt_page_in_func(
}
/*
- * Find a page to evict -- if that succeeds,
- * try again immediately. If it fails, we
- * don't care why, but give up our slice before
- * retrying.
+ * Find a page to evict -- if that fails, we don't care why,
+ * but we may need to wake the eviction server again if the
+ * cache is still full.
*/
- if (__wt_evict_lru_page(session) != 0)
- __wt_yield();
+ if (__wt_evict_lru_page(session, 1) != 0)
+ wake = 1;
}
}
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
index bc00473ed09..38aa8eb3eb8 100644
--- a/src/btree/bt_walk.c
+++ b/src/btree/bt_walk.c
@@ -12,7 +12,7 @@
* Move to the next/previous page in the tree.
*/
int
-__wt_tree_np(WT_SESSION_IMPL *session, WT_PAGE **pagep, int cacheonly, int next)
+__wt_tree_np(WT_SESSION_IMPL *session, WT_PAGE **pagep, int eviction, int next)
{
WT_BTREE *btree;
WT_PAGE *page, *t;
@@ -21,6 +21,7 @@ __wt_tree_np(WT_SESSION_IMPL *session, WT_PAGE **pagep, int cacheonly, int next)
int ret;
btree = session->btree;
+ ret = 0;
/*
* Take a copy of any returned page; we have a hazard reference on the
@@ -60,9 +61,17 @@ __wt_tree_np(WT_SESSION_IMPL *session, WT_PAGE **pagep, int cacheonly, int next)
* to evict our parent, that fails because the parent has a child page
* that can't be discarded.
*/
- ret = (WT_PAGE_IS_ROOT(t) || cacheonly) ?
- 0 : __wt_page_in(session, t, t->ref);
- if (!cacheonly) {
+ if (eviction) {
+ if (!WT_PAGE_IS_ROOT(t)) {
+ while (!WT_ATOMIC_CAS(t->ref->state,
+ WT_REF_MEM, WT_REF_EVICT_WALK))
+ __wt_yield();
+ }
+ WT_ASSERT(session, page->ref->state == WT_REF_EVICT_WALK);
+ page->ref->state = WT_REF_MEM;
+ } else {
+ if (!WT_PAGE_IS_ROOT(t))
+ ret = __wt_page_in(session, t, t->ref);
__wt_page_release(session, page);
WT_RET(ret);
}
@@ -94,9 +103,15 @@ descend: for (;;) {
}
/* We may only care about in-memory pages. */
- if (cacheonly) {
- if (ref->state != WT_REF_MEM)
+ if (eviction) {
+ if (!WT_ATOMIC_CAS(ref->state,
+ WT_REF_MEM, WT_REF_EVICT_WALK))
break;
+ if (!WT_PAGE_IS_ROOT(page)) {
+ WT_ASSERT(session, page->ref->state ==
+ WT_REF_EVICT_WALK);
+ page->ref->state = WT_REF_MEM;
+ }
} else {
/*
* Swap hazard references at each level (but
@@ -109,7 +124,6 @@ descend: for (;;) {
}
page = ref->page;
- WT_ASSERT(session, ref->state == WT_REF_MEM);
WT_ASSERT(session, page != NULL);
slot = next ? 0 : page->entries - 1;
}
diff --git a/src/btree/rec_evict.c b/src/btree/rec_evict.c
index 9da60eda485..fa86b0a943f 100644
--- a/src/btree/rec_evict.c
+++ b/src/btree/rec_evict.c
@@ -334,16 +334,8 @@ __rec_discard_page(WT_SESSION_IMPL *session, WT_PAGE *page)
__wt_page_out(session, mod->u.split, 0);
}
- /*
- * If we are evicting the file's current eviction point, clear it so
- * the walk will be restarted.
- *
- * !!!
- * This check would arguably be cleaner in bt_evict.c, but that level
- * isn't aware of all of the pages within a subtree that are evicted.
- */
- if (session->btree->evict_page == page)
- session->btree->evict_page = NULL;
+ /* We should never evict the file's current eviction point. */
+ WT_ASSERT(session, session->btree->evict_page != page);
/* Discard the page itself. */
__wt_page_out(session, page, 0);
@@ -389,6 +381,7 @@ __rec_review(WT_SESSION_IMPL *session,
WT_RET(__rec_review(
session, ref, ref->page, flags, 0));
break;
+ case WT_REF_EVICT_WALK: /* Walk point */
case WT_REF_EVICTING: /* Being evaluated */
case WT_REF_LOCKED: /* Being evicted */
case WT_REF_READING: /* Being read */
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 8e3c1986992..e8ec2c7e9f7 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -343,20 +343,24 @@ struct __wt_ref {
*
* WT_REF_EVICTING:
* Set by eviction when a page is about to be locked; prevents a
- * page from being evicted multiple times concurrently.
+ * page from being evicted multiple times concurrently.
+ *
+ * WT_REF_EVICT_WALK:
+ * The next page to be walked for LRU eviction. This page is
+ * available for reads but not eviction.
*
* WT_REF_LOCKED:
* Set by eviction; an eviction thread has selected this page or
- * a parent for eviction; once hazard references are checked, the page
- * will be evicted.
+ * a parent for eviction; once hazard references are checked, the
+ * page will be evicted.
*
* WT_REF_MEM:
* Set by a reading thread once the page has been read from disk;
- * the page is in the cache and the page reference is OK.
+ * the page is in the cache and the page reference is OK.
*
* WT_REF_READING:
* Set by a reading thread before reading a page from disk; other
- * readers of the page wait until the read completes.
+ * readers of the page wait until the read completes.
*
* The life cycle of a typical page goes like this: pages are read into
* memory from disk and their state set to WT_REF_MEM. When the page is
@@ -381,6 +385,7 @@ struct __wt_ref {
volatile enum {
WT_REF_DISK=0, /* Page is on disk */
WT_REF_EVICTING, /* Page being evaluated for eviction */
+ WT_REF_EVICT_WALK, /* Next page for LRU eviction */
WT_REF_LOCKED, /* Page being evicted */
WT_REF_MEM, /* Page is in cache and valid */
WT_REF_READING /* Page being read */
diff --git a/src/include/extern.h b/src/include/extern.h
index e262dbb5a66..979f578a0a4 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -196,7 +196,7 @@ extern void __wt_evict_server_wake(WT_SESSION_IMPL *session);
extern void __wt_evict_file_serial_func(WT_SESSION_IMPL *session);
extern int __wt_evict_page_request(WT_SESSION_IMPL *session, WT_PAGE *page);
extern void *__wt_cache_evict_server(void *arg);
-extern int __wt_evict_lru_page(WT_SESSION_IMPL *session);
+extern int __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app);
extern int __wt_btree_create(WT_SESSION_IMPL *session, const char *filename);
extern int __wt_btree_truncate(WT_SESSION_IMPL *session, const char *filename);
extern int __wt_btree_open(WT_SESSION_IMPL *session,
@@ -252,7 +252,7 @@ extern int __wt_verify_dsk(WT_SESSION_IMPL *session,
uint32_t size);
extern int __wt_tree_np(WT_SESSION_IMPL *session,
WT_PAGE **pagep,
- int cacheonly,
+ int eviction,
int next);
extern int __wt_col_modify(WT_SESSION_IMPL *session,
WT_CURSOR_BTREE *cbt,
diff --git a/src/support/hazard.c b/src/support/hazard.c
index 9219c17e0a7..e22d25be3c2 100644
--- a/src/support/hazard.c
+++ b/src/support/hazard.c
@@ -56,9 +56,10 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref
/*
* Check to see if the page state is still valid (where valid
- * means a state of WT_REF_MEM).
+ * means a state of WT_REF_MEM or WT_REF_EVICT_WALK).
*/
- if (ref->state == WT_REF_MEM) {
+ if (ref->state == WT_REF_MEM ||
+ ref->state == WT_REF_EVICT_WALK) {
WT_VERBOSE(session, hazard,
"session %p hazard %p: set", session, ref->page);
return (0);