diff options
author | Keith Bostic <keith.bostic@mongodb.com> | 2016-12-15 00:01:12 -0500 |
---|---|---|
committer | Michael Cahill <michael.cahill@mongodb.com> | 2016-12-15 16:01:12 +1100 |
commit | 18fb0f8da42f5e6045e296b81d86370bf60376f5 (patch) | |
tree | ecc32e074229a3dbde2a8b32f79c1a86a60d02d8 | |
parent | a026912444c9714011acfec0ca40722e3b55e2a1 (diff) | |
download | mongo-18fb0f8da42f5e6045e296b81d86370bf60376f5.tar.gz |
SERVER-26545 Remove fixed-size limitation on WiredTiger hazard pointers (#3187)
* Default to an initial 250 hazard slots and grow from there.
* Make hazard_max undocumented, add an internal limit of 1000 eviction walks.
* If we grow the hazard pointer array, schedule the original to be freed when the database is closed.
* Update test_bug011 back to stress eviction with the hard-coded limit of 1000 active trees. Only run during "long" tests.
-rw-r--r-- | dist/api_data.py | 2 | ||||
-rw-r--r-- | src/btree/bt_discard.c | 2 | ||||
-rw-r--r-- | src/config/config_api.c | 31 | ||||
-rw-r--r-- | src/conn/conn_api.c | 3 | ||||
-rw-r--r-- | src/docs/upgrading.dox | 10 | ||||
-rw-r--r-- | src/evict/evict_lru.c | 14 | ||||
-rw-r--r-- | src/evict/evict_page.c | 2 | ||||
-rw-r--r-- | src/include/btree.i | 47 | ||||
-rw-r--r-- | src/include/cache.h | 2 | ||||
-rw-r--r-- | src/include/connection.h | 6 | ||||
-rw-r--r-- | src/include/extern.h | 2 | ||||
-rw-r--r-- | src/include/session.h | 10 | ||||
-rw-r--r-- | src/include/wiredtiger.in | 2 | ||||
-rw-r--r-- | src/session/session_api.c | 24 | ||||
-rw-r--r-- | src/support/hazard.c | 281 | ||||
-rw-r--r-- | test/suite/test_bug011.py | 15 | ||||
-rw-r--r-- | test/suite/test_config03.py | 10 | ||||
-rw-r--r-- | test/suite/test_config04.py | 4 | ||||
-rw-r--r-- | test/suite/test_hazard.py | 58 |
19 files changed, 330 insertions, 195 deletions
diff --git a/dist/api_data.py b/dist/api_data.py index 2b7ef4a94e1..acbbf0f2a68 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -727,7 +727,7 @@ wiredtiger_open_common =\ Config('hazard_max', '1000', r''' maximum number of simultaneous hazard pointers per session handle''', - min='15'), + min=15, undoc=True), Config('mmap', 'true', r''' Use memory mapping to access files when possible''', type='boolean'), diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c index 7858d2cb14e..c2733d6567b 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -44,7 +44,7 @@ __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref) * Wait for up to a second for hazard pointers to be cleared. */ for (hp = NULL, i = 0; i < 100; i++) { - if ((hp = __wt_page_hazard_check(session, ref)) == NULL) + if ((hp = __wt_hazard_check(session, ref)) == NULL) break; __wt_sleep(0, 10000); } diff --git a/src/config/config_api.c b/src/config/config_api.c index 76a51903588..05c5c1287a7 100644 --- a/src/config/config_api.c +++ b/src/config/config_api.c @@ -158,24 +158,25 @@ wiredtiger_config_validate(WT_SESSION *wt_session, } /* - * __conn_foc_add -- + * __wt_conn_foc_add -- * Add a new entry into the connection's free-on-close list. */ -static int -__conn_foc_add(WT_SESSION_IMPL *session, const void *p) +void +__wt_conn_foc_add(WT_SESSION_IMPL *session, const void *p) { WT_CONNECTION_IMPL *conn; conn = S2C(session); /* - * Our caller is expected to be holding any locks we need. + * Callers of this function are expected to be holding the connection's + * api_lock. + * + * All callers of this function currently ignore errors. */ - WT_RET(__wt_realloc_def( - session, &conn->foc_size, conn->foc_cnt + 1, &conn->foc)); - - conn->foc[conn->foc_cnt++] = (void *)p; - return (0); + if (__wt_realloc_def( + session, &conn->foc_size, conn->foc_cnt + 1, &conn->foc) == 0) + conn->foc[conn->foc_cnt++] = (void *)p; } /* @@ -328,12 +329,12 @@ __wt_configure_method(WT_SESSION_IMPL *session, * order to avoid freeing chunks of memory twice. Again, this isn't a * commonly used API and it shouldn't ever happen, just leak it. */ - (void)__conn_foc_add(session, entry->base); - (void)__conn_foc_add(session, entry); - (void)__conn_foc_add(session, checks); - (void)__conn_foc_add(session, newcheck->type); - (void)__conn_foc_add(session, newcheck->checks); - (void)__conn_foc_add(session, newcheck_name); + __wt_conn_foc_add(session, entry->base); + __wt_conn_foc_add(session, entry); + __wt_conn_foc_add(session, checks); + __wt_conn_foc_add(session, newcheck->type); + __wt_conn_foc_add(session, newcheck->checks); + __wt_conn_foc_add(session, newcheck_name); /* * Instead of using locks to protect configuration information, assume diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index 1bc4a501ce2..474b8bbad8a 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -2309,9 +2309,6 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, } WT_ERR(__wt_verbose_config(session, cfg)); - WT_ERR(__wt_config_gets(session, cfg, "hazard_max", &cval)); - conn->hazard_max = (uint32_t)cval.val; - WT_ERR(__wt_config_gets(session, cfg, "session_max", &cval)); conn->session_size = (uint32_t)cval.val + WT_EXTRA_INTERNAL_SESSIONS; diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox index 78d09a56ea9..0b0826f2646 100644 --- a/src/docs/upgrading.dox +++ b/src/docs/upgrading.dox @@ -1,5 +1,15 @@ /*! @page upgrading Upgrading WiredTiger applications +@section version_291 Upgrading to Version 2.9.1 + +<dl> +<dt>Changes to hazard pointer configuration</dt> +<dd> +The \c hazard_max parameter to ::wiredtiger_open is now ignored. Memory is +allocated for hazard pointers as required by each session. +</dd> +</dl> + @section version_290 Upgrading to Version 2.9.0 <dl> <dt>Changes to cursor behavior after WT_CURSOR::insert</dt> diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 5b857566299..6fa728916de 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -1148,9 +1148,17 @@ retry: while (slot < max_entries) { !__wt_cache_aggressive(session)) continue; - /* Skip files if we have used all available hazard pointers. */ - if (btree->evict_ref == NULL && session->nhazard >= - conn->hazard_max - WT_MIN(conn->hazard_max / 2, 10)) + /* + * Skip files if we have too many active walks. + * + * This used to be limited by the configured maximum number of + * hazard pointers per session. Even though that ceiling has + * been removed, we need to test eviction with huge numbers of + * active trees before allowing larger numbers of hazard + * pointers in the walk session. + */ + if (btree->evict_ref == NULL && + session->nhazard > WT_EVICT_MAX_TREES) continue; /* diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c index 56a0fcfc790..5b17a78a4dd 100644 --- a/src/evict/evict_page.c +++ b/src/evict/evict_page.c @@ -37,7 +37,7 @@ __evict_exclusive(WT_SESSION_IMPL *session, WT_REF *ref) * Check for a hazard pointer indicating another thread is using the * page, meaning the page cannot be evicted. */ - if (__wt_page_hazard_check(session, ref) == NULL) + if (__wt_hazard_check(session, ref) == NULL) return (0); WT_STAT_DATA_INCR(session, cache_eviction_hazard); diff --git a/src/include/btree.i b/src/include/btree.i index e591209f39a..4f69c258621 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -1437,53 +1437,6 @@ __wt_page_swap_func( } /* - * __wt_page_hazard_check -- - * Return if there's a hazard pointer to the page in the system. - */ -static inline WT_HAZARD * -__wt_page_hazard_check(WT_SESSION_IMPL *session, WT_REF *ref) -{ - WT_CONNECTION_IMPL *conn; - WT_HAZARD *hp; - WT_SESSION_IMPL *s; - uint32_t i, j, hazard_size, max, session_cnt; - - conn = S2C(session); - - /* - * No lock is required because the session array is fixed size, but it - * may contain inactive entries. We must review any active session - * that might contain a hazard pointer, so insert a barrier before - * reading the active session count. That way, no matter what sessions - * come or go, we'll check the slots for all of the sessions that could - * have been active when we started our check. - */ - WT_STAT_CONN_INCR(session, cache_hazard_checks); - WT_ORDERED_READ(session_cnt, conn->session_cnt); - for (s = conn->sessions, i = 0, j = 0, max = 0; - i < session_cnt; ++s, ++i) { - if (!s->active) - continue; - WT_ORDERED_READ(hazard_size, s->hazard_size); - if (s->hazard_size > max) { - max = s->hazard_size; - WT_STAT_CONN_SET(session, - cache_hazard_max, max); - } - for (hp = s->hazard; hp < s->hazard + hazard_size; ++hp) { - ++j; - if (hp->ref == ref) { - WT_STAT_CONN_INCRV(session, - cache_hazard_walks, j); - return (hp); - } - } - } - WT_STAT_CONN_INCRV(session, cache_hazard_walks, j); - return (NULL); -} - -/* * __wt_skip_choose_depth -- * Randomly choose a depth for a skiplist insert. */ diff --git a/src/include/cache.h b/src/include/cache.h index 6ea13ff63ac..70f6169200d 100644 --- a/src/include/cache.h +++ b/src/include/cache.h @@ -16,6 +16,8 @@ #define WT_EVICT_WALK_BASE 300 /* Pages tracked across file visits */ #define WT_EVICT_WALK_INCR 100 /* Pages added each walk */ +#define WT_EVICT_MAX_TREES 1000 /* Maximum walk points */ + /* * WT_EVICT_ENTRY -- * Encapsulation of an eviction candidate. diff --git a/src/include/connection.h b/src/include/connection.h index 7d3d07a6abc..60ce5f55234 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -255,12 +255,6 @@ struct __wt_connection_impl { size_t session_scratch_max; /* Max scratch memory per session */ - /* - * WiredTiger allocates space for a fixed number of hazard pointers - * in each thread of control. - */ - uint32_t hazard_max; /* Hazard array size */ - WT_CACHE *cache; /* Page cache */ volatile uint64_t cache_size; /* Cache size (either statically configured or the current size diff --git a/src/include/extern.h b/src/include/extern.h index 7dce310dabf..c7506e55976 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -218,6 +218,7 @@ extern int __wt_config_getones_none(WT_SESSION_IMPL *session, const char *config extern int __wt_config_gets_def(WT_SESSION_IMPL *session, const char **cfg, const char *key, int def, WT_CONFIG_ITEM *value) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_config_subgetraw(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cfg, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_config_subgets(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cfg, const char *key, WT_CONFIG_ITEM *value) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_conn_foc_add(WT_SESSION_IMPL *session, const void *p) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_conn_foc_discard(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_configure_method(WT_SESSION_IMPL *session, const char *method, const char *uri, const char *config, const char *type, const char *check) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_config_check(WT_SESSION_IMPL *session, const WT_CONFIG_ENTRY *entry, const char *config, size_t config_len) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -654,6 +655,7 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp ) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_hazard_clear(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_hazard_close(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern WT_HAZARD *__wt_hazard_check(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern u_int __wt_hazard_count(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_fill_hex(const uint8_t *src, size_t src_max, uint8_t *dest, size_t dest_max, size_t *lenp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_raw_to_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/include/session.h b/src/include/session.h index 76cb463602c..7e855a3db25 100644 --- a/src/include/session.h +++ b/src/include/session.h @@ -200,9 +200,13 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl { #define WT_SESSION_FIRST_USE(s) \ ((s)->hazard == NULL) - /* The number of hazard pointers grows dynamically. */ -#define WT_HAZARD_INCR 1 - uint32_t hazard_size; /* Allocated slots in hazard array. */ + /* + * The hazard pointer array grows as necessary, initialize with 250 + * slots. + */ +#define WT_SESSION_INITIAL_HAZARD_SLOTS 250 + uint32_t hazard_size; /* Hazard pointer array slots */ + uint32_t hazard_inuse; /* Hazard pointer array slots in-use */ uint32_t nhazard; /* Count of active hazard pointers */ WT_HAZARD *hazard; /* Hazard pointer array */ }; diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 2365135e08d..4c72df0f073 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -2355,8 +2355,6 @@ struct __wt_connection { * in seconds at which to check for files that are inactive and close them., an * integer between 1 and 100000; default \c 10.} * @config{ ),,} - * @config{hazard_max, maximum number of simultaneous hazard pointers per - * session handle., an integer greater than or equal to 15; default \c 1000.} * @config{in_memory, keep data in-memory only. See @ref in_memory for more * information., a boolean flag; default \c false.} * @config{log = (, enable logging. Enabling logging uses three sessions from diff --git a/src/session/session_api.c b/src/session/session_api.c index ed6339aa66e..88a3c32ee11 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -129,9 +129,11 @@ __session_clear(WT_SESSION_IMPL *session) * For these reasons, be careful when clearing the session structure. */ memset(session, 0, WT_SESSION_CLEAR_SIZE(session)); - session->hazard_size = 0; - session->nhazard = 0; + WT_INIT_LSN(&session->bg_sync_lsn); + + session->hazard_inuse = 0; + session->nhazard = 0; } /* @@ -1819,17 +1821,13 @@ __open_session(WT_CONNECTION_IMPL *conn, * session close because access to it isn't serialized. Allocate the * first time we open this session. */ - if (WT_SESSION_FIRST_USE(session_ret)) - WT_ERR(__wt_calloc_def( - session, conn->hazard_max, &session_ret->hazard)); - - /* - * Set an initial size for the hazard array. It will be grown as - * required up to hazard_max. The hazard_size is reset on close, since - * __wt_hazard_close ensures the array is cleared - so it is safe to - * reset the starting size on each open. - */ - session_ret->hazard_size = 0; + if (WT_SESSION_FIRST_USE(session_ret)) { + WT_ERR(__wt_calloc_def(session, + WT_SESSION_INITIAL_HAZARD_SLOTS, &session_ret->hazard)); + session_ret->hazard_size = WT_SESSION_INITIAL_HAZARD_SLOTS; + session_ret->hazard_inuse = 0; + session_ret->nhazard = 0; + } /* Cache the offset of this session's statistics bucket. */ session_ret->stat_bucket = WT_STATS_SLOT_ID(session); diff --git a/src/support/hazard.c b/src/support/hazard.c index b0fa7e129bb..7e88ad183fe 100644 --- a/src/support/hazard.c +++ b/src/support/hazard.c @@ -13,6 +13,48 @@ static void __hazard_dump(WT_SESSION_IMPL *); #endif /* + * hazard_grow -- + * Grow a hazard pointer array. + */ +static int +hazard_grow(WT_SESSION_IMPL *session) +{ + WT_HAZARD *nhazard; + size_t size; + void *ohazard; + + /* + * Allocate a new, larger hazard pointer array and copy the contents of + * the original into place. + */ + size = session->hazard_size; + WT_RET(__wt_calloc_def(session, size * 2, &nhazard)); + memcpy(nhazard, session->hazard, size * sizeof(WT_HAZARD)); + + /* + * Swap the new hazard pointer array into place after initialization + * is complete (initialization must complete before eviction can see + * the new hazard pointer array), then schedule the original to be + * freed. + */ + ohazard = session->hazard; + WT_PUBLISH(session->hazard, nhazard); + + __wt_spin_lock(session, &S2C(session)->api_lock); + __wt_conn_foc_add(session, ohazard); + __wt_spin_unlock(session, &S2C(session)->api_lock); + + /* + * Increase the size of the session's pointer array after swapping it + * into place (the session's reference must be updated before eviction + * can see the new size). + */ + WT_PUBLISH(session->hazard_size, (uint32_t)(size * 2)); + + return (0); +} + +/* * __wt_hazard_set -- * Set a hazard pointer. */ @@ -23,17 +65,12 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp #endif ) { - WT_BTREE *btree; - WT_CONNECTION_IMPL *conn; WT_HAZARD *hp; - int restarts = 0; - btree = S2BT(session); - conn = S2C(session); *busyp = false; /* If a file can never be evicted, hazard pointers aren't required. */ - if (F_ISSET(btree, WT_BTREE_IN_MEMORY)) + if (F_ISSET(S2BT(session), WT_BTREE_IN_MEMORY)) return (0); /* @@ -46,6 +83,45 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp return (0); } + /* If we have filled the current hazard pointer array, grow it. */ + if (session->nhazard >= session->hazard_size) { + WT_ASSERT(session, + session->nhazard == session->hazard_size && + session->hazard_inuse == session->hazard_size); + WT_RET(hazard_grow(session)); + } + + /* + * If there are no available hazard pointer slots, make another one + * visible. + */ + if (session->nhazard >= session->hazard_inuse) { + WT_ASSERT(session, + session->nhazard == session->hazard_inuse && + session->hazard_inuse < session->hazard_size); + hp = &session->hazard[session->hazard_inuse++]; + } else { + WT_ASSERT(session, + session->nhazard < session->hazard_inuse && + session->hazard_inuse <= session->hazard_size); + + /* + * There must be an empty slot in the array, find it. Skip most + * of the active slots by starting after the active count slot; + * there may be a free slot before there, but checking is + * expensive. If we reach the end of the array, continue the + * search from the beginning of the array. + */ + for (hp = session->hazard + session->nhazard;; ++hp) { + if (hp >= session->hazard + session->hazard_inuse) + hp = session->hazard; + if (hp->ref == NULL) + break; + } + } + + WT_ASSERT(session, hp->ref == NULL); + /* * Do the dance: * @@ -59,82 +135,43 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp * pointer before it discards the page (the eviction server sets the * state to WT_REF_LOCKED, then flushes memory and checks the hazard * pointers). - * - * For sessions with many active hazard pointers, skip most of the - * active slots: there may be a free slot in there, but checking is - * expensive. Most hazard pointers are released quickly: optimize - * for that case. */ - for (hp = session->hazard + session->nhazard;; ++hp) { - /* - * If we get to the end of the array, either: - * 1. If we know there are free slots somewhere, and this is - * the first time through, continue the search from the - * start. Don't actually continue the loop because that - * will skip the first slot. - * 2. If we have searched all the way through and we have - * allocated the maximum number of slots, give up. - * 3. Allocate another increment of slots, up to the maximum. - * The slot we are on should now be available. - */ - if (hp >= session->hazard + session->hazard_size) { - if (session->nhazard < session->hazard_size && - restarts++ == 0) - hp = session->hazard; - else if (session->hazard_size >= conn->hazard_max) - break; - else - WT_PUBLISH(session->hazard_size, WT_MIN( - session->hazard_size + WT_HAZARD_INCR, - conn->hazard_max)); - } - - if (hp->ref != NULL) - continue; - - hp->ref = ref; + hp->ref = ref; #ifdef HAVE_DIAGNOSTIC - hp->file = file; - hp->line = line; + hp->file = file; + hp->line = line; #endif - /* Publish the hazard pointer before reading page's state. */ - WT_FULL_BARRIER(); + /* Publish the hazard pointer before reading page's state. */ + WT_FULL_BARRIER(); - /* - * Check if the page state is still valid, where valid means a - * state of WT_REF_MEM. - */ - if (ref->state == WT_REF_MEM) { - ++session->nhazard; - - /* - * Callers require a barrier here so operations holding - * the hazard pointer see consistent data. - */ - WT_READ_BARRIER(); - return (0); - } + /* + * Check if the page state is still valid, where valid means a + * state of WT_REF_MEM. + */ + if (ref->state == WT_REF_MEM) { + ++session->nhazard; /* - * The page isn't available, it's being considered for eviction - * (or being evicted, for all we know). If the eviction server - * sees our hazard pointer before evicting the page, it will - * return the page to use, no harm done, if it doesn't, it will - * go ahead and complete the eviction. - * - * We don't bother publishing this update: the worst case is we - * prevent some random page from being evicted. + * Callers require a barrier here so operations holding + * the hazard pointer see consistent data. */ - hp->ref = NULL; - *busyp = true; + WT_READ_BARRIER(); return (0); } -#ifdef HAVE_DIAGNOSTIC - __hazard_dump(session); -#endif - WT_RET_MSG(session, ENOMEM, - "session %p: hazard pointer table full", (void *)session); + /* + * The page isn't available, it's being considered for eviction + * (or being evicted, for all we know). If the eviction server + * sees our hazard pointer before evicting the page, it will + * return the page to use, no harm done, if it doesn't, it will + * go ahead and complete the eviction. + * + * We don't bother publishing this update: the worst case is we + * prevent some random page from being evicted. + */ + hp->ref = NULL; + *busyp = true; + return (0); } /* @@ -144,20 +181,17 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp int __wt_hazard_clear(WT_SESSION_IMPL *session, WT_REF *ref) { - WT_BTREE *btree; WT_HAZARD *hp; - btree = S2BT(session); - /* If a file can never be evicted, hazard pointers aren't required. */ - if (F_ISSET(btree, WT_BTREE_IN_MEMORY)) + if (F_ISSET(S2BT(session), WT_BTREE_IN_MEMORY)) return (0); /* * Clear the caller's hazard pointer. * The common pattern is LIFO, so do a reverse search. */ - for (hp = session->hazard + session->hazard_size - 1; + for (hp = session->hazard + session->hazard_inuse - 1; hp >= session->hazard; --hp) if (hp->ref == ref) { @@ -174,9 +208,13 @@ __wt_hazard_clear(WT_SESSION_IMPL *session, WT_REF *ref) /* * If this was the last hazard pointer in the session, * reset the size so that checks can skip this session. + * + * A write-barrier() is necessary before the change to + * the in-use value, the number of active references + * can never be less than the number of in-use slots. */ if (--session->nhazard == 0) - WT_PUBLISH(session->hazard_size, 0); + WT_PUBLISH(session->hazard_inuse, 0); return (0); } @@ -205,7 +243,7 @@ __wt_hazard_close(WT_SESSION_IMPL *session) * diagnostic. */ for (found = false, hp = session->hazard; - hp < session->hazard + session->hazard_size; ++hp) + hp < session->hazard + session->hazard_inuse; ++hp) if (hp->ref != NULL) { found = true; break; @@ -233,7 +271,7 @@ __wt_hazard_close(WT_SESSION_IMPL *session) * can't think of a reason it would be). */ for (hp = session->hazard; - hp < session->hazard + session->hazard_size; ++hp) + hp < session->hazard + session->hazard_inuse; ++hp) if (hp->ref != NULL) { hp->ref = NULL; --session->nhazard; @@ -247,6 +285,80 @@ __wt_hazard_close(WT_SESSION_IMPL *session) } /* + * hazard_get_reference -- + * Return a consistent reference to a hazard pointer array. + */ +static inline void +hazard_get_reference( + WT_SESSION_IMPL *session, WT_HAZARD **hazardp, uint32_t *hazard_inusep) +{ + /* + * Hazard pointer arrays can be swapped out from under us if they grow. + * First, read the current in-use value. The read must precede the read + * of the hazard pointer itself (so the in-use value is pessimistic + * should the hazard array grow), and additionally ensure we only read + * the in-use value once. Then, read the hazard pointer, also ensuring + * we only read it once. + * + * Use a barrier instead of marking the fields volatile because we don't + * want to slow down the rest of the hazard pointer functions that don't + * need special treatment. + */ + WT_ORDERED_READ(*hazard_inusep, session->hazard_inuse); + WT_ORDERED_READ(*hazardp, session->hazard); +} + +/* + * __wt_hazard_check -- + * Return if there's a hazard pointer to the page in the system. + */ +WT_HAZARD * +__wt_hazard_check(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_CONNECTION_IMPL *conn; + WT_HAZARD *hp; + WT_SESSION_IMPL *s; + uint32_t i, j, hazard_inuse, max, session_cnt, walk_cnt; + + conn = S2C(session); + + WT_STAT_CONN_INCR(session, cache_hazard_checks); + + /* + * No lock is required because the session array is fixed size, but it + * may contain inactive entries. We must review any active session + * that might contain a hazard pointer, so insert a read barrier after + * reading the active session count. That way, no matter what sessions + * come or go, we'll check the slots for all of the sessions that could + * have been active when we started our check. + */ + WT_ORDERED_READ(session_cnt, conn->session_cnt); + for (s = conn->sessions, + i = j = max = walk_cnt = 0; i < session_cnt; ++s, ++i) { + if (!s->active) + continue; + + hazard_get_reference(s, &hp, &hazard_inuse); + + if (hazard_inuse > max) { + max = hazard_inuse; + WT_STAT_CONN_SET(session, cache_hazard_max, max); + } + + for (j = 0; j < hazard_inuse; ++hp, ++j) { + ++walk_cnt; + if (hp->ref == ref) { + WT_STAT_CONN_INCRV(session, + cache_hazard_walks, walk_cnt); + return (hp); + } + } + } + WT_STAT_CONN_INCRV(session, cache_hazard_walks, walk_cnt); + return (NULL); +} + +/* * __wt_hazard_count -- * Count how many hazard pointers this session has on the given page. */ @@ -254,11 +366,12 @@ u_int __wt_hazard_count(WT_SESSION_IMPL *session, WT_REF *ref) { WT_HAZARD *hp; + uint32_t i, hazard_inuse; u_int count; - for (count = 0, hp = session->hazard + session->hazard_size - 1; - hp >= session->hazard; - --hp) + hazard_get_reference(session, &hp, &hazard_inuse); + + for (count = 0, i = 0; i < hazard_inuse; ++hp, ++i) if (hp->ref == ref) ++count; @@ -276,7 +389,7 @@ __hazard_dump(WT_SESSION_IMPL *session) WT_HAZARD *hp; for (hp = session->hazard; - hp < session->hazard + session->hazard_size; ++hp) + hp < session->hazard + session->hazard_inuse; ++hp) if (hp->ref != NULL) __wt_errx(session, "session %p: hazard pointer %p: %s, line %d", diff --git a/test/suite/test_bug011.py b/test/suite/test_bug011.py index 29bb08ec2e5..969aaeb5b39 100644 --- a/test/suite/test_bug011.py +++ b/test/suite/test_bug011.py @@ -30,25 +30,28 @@ import random, wiredtiger, wttest from wtdataset import SimpleDataSet # test_bug011.py -# Eviction working on more files than there are hazard pointers. +# Eviction working on more trees than the eviction server can walk +# simultaneously. There is a builtin limit of 1000 trees, we open double +# that, which makes this a long-running test. class test_bug011(wttest.WiredTigerTestCase): """ Test having eviction working on more files than the number of allocated hazard pointers. """ table_name = 'test_bug011' - ntables = 50 + ntables = 2000 nrows = 10000 nops = 10000 # Add connection configuration for this test. def conn_config(self, dir): - return 'cache_size=10MB,eviction_dirty_target=99,eviction_dirty_trigger=99,hazard_max=' + str(self.ntables / 2) + return 'cache_size=1GB' + @wttest.longtest("Eviction copes with lots of files") def test_eviction(self): cursors = [] datasets = [] for i in range(0, self.ntables): - this_uri = 'table:%s-%03d' % (self.table_name, i) + this_uri = 'table:%s-%05d' % (self.table_name, i) ds = SimpleDataSet(self, this_uri, self.nrows, config='allocation_size=1KB,leaf_page_max=1KB') ds.populate() @@ -57,9 +60,9 @@ class test_bug011(wttest.WiredTigerTestCase): # Switch over to on-disk trees with multiple leaf pages self.reopen_conn() - # Make sure we have a cursor for the table so it stays in cache. + # Make sure we have a cursor for every table so it stays in cache. for i in range(0, self.ntables): - this_uri = 'table:%s-%03d' % (self.table_name, i) + this_uri = 'table:%s-%05d' % (self.table_name, i) cursors.append(self.session.open_cursor(this_uri, None)) # Make use of the cache. diff --git a/test/suite/test_config03.py b/test/suite/test_config03.py index 88ca6ae3f39..6699f7d2650 100644 --- a/test/suite/test_config03.py +++ b/test/suite/test_config03.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!usr/bin/env python # # Public Domain 2014-2016 MongoDB, Inc. # Public Domain 2008-2014 WiredTiger, Inc. @@ -48,8 +48,6 @@ class test_config03(test_base03.test_base03): eviction_trigger_scenarios = wtscenario.quick_scenarios( 's_eviction_trigger', [50, 90, 95, 99], None) - hazard_max_scenarios = wtscenario.quick_scenarios('s_hazard_max', - [15, 50, 500], [0.4, 0.8, 0.8]) multiprocess_scenarios = wtscenario.quick_scenarios('s_multiprocess', [True,False], [1.0,1.0]) session_max_scenarios = wtscenario.quick_scenarios('s_session_max', @@ -66,13 +64,13 @@ class test_config03(test_base03.test_base03): verbose_scenarios = wtscenario.quick_scenarios('s_verbose', [None], None) config_vars = [ 'cache_size', 'create', 'error_prefix', 'eviction_target', - 'eviction_trigger', 'hazard_max', 'multiprocess', - 'session_max', 'verbose' ] + 'eviction_trigger', 'multiprocess', 'session_max', + 'verbose' ] scenarios = wtscenario.make_scenarios( cache_size_scenarios, create_scenarios, error_prefix_scenarios, eviction_target_scenarios, eviction_trigger_scenarios, - hazard_max_scenarios, multiprocess_scenarios, session_max_scenarios, + multiprocess_scenarios, session_max_scenarios, transactional_scenarios, verbose_scenarios, prune=1000) #wttest.WiredTigerTestCase.printVerbose(2, 'test_config03: running ' + \ diff --git a/test/suite/test_config04.py b/test/suite/test_config04.py index 204aa7e27d5..db8a5f4a16a 100644 --- a/test/suite/test_config04.py +++ b/test/suite/test_config04.py @@ -154,10 +154,6 @@ class test_config04(wttest.WiredTigerTestCase): 'eviction_trigger=86'), "/eviction target must be lower than the eviction trigger/") - def test_hazard_max(self): - # Note: There isn't any direct way to know that this was set. - self.common_test('hazard_max=50') - def test_invalid_config(self): msg = '/Unbalanced brackets/' self.assertRaisesWithMessage(wiredtiger.WiredTigerError, diff --git a/test/suite/test_hazard.py b/test/suite/test_hazard.py new file mode 100644 index 00000000000..f2891fce526 --- /dev/null +++ b/test/suite/test_hazard.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2016 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# test_hazard.py +# Hazard pointer tests. + +import wiredtiger, wttest +from wtdataset import SimpleDataSet + +# Regression tests. +class test_hazard(wttest.WiredTigerTestCase): + + # Allocate a large number of hazard pointers in a session, forcing the + # hazard pointer array to repeatedly grow. + def test_hazard(self): + uri = "table:hazard" + ds = SimpleDataSet(self, uri, 1000) + ds.populate() + + # Open 10,000 cursors and pin a page to set a hazard pointer. + cursors = [] + for i in range(0, 10000): + c = self.session.open_cursor(uri, None) + c.set_key(ds.key(10)) + c.search() + cursors.append(c) + + # Close the cursors, clearing the hazard pointer. + for c in cursors: + c.close() + +if __name__ == '__main__': + wttest.run() |