From 2d781c8cfeb2a1db8bd93e03ba35b302436e4ff3 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Fri, 1 Sep 2017 01:13:49 -0400 Subject: WT-3533 eviction handle walk can race with handle re-open (#3620) * Change btree open to no longer clear WT_BTREE eviction fields, allowing WT_BTREE.evict_disabled to persist across an open. * Change the connection-level open code to disable eviction when opening (or re-opening) an underlying WT_BTREE handle. * Eviction is turned off by btree handle open in some cases, and then turned on when an key/value is inserted. Now that we're not clearing the WT_BTREE eviction fields when re-opening a file, we have to clean up that reference if we close without ever inserting an item. * When flushing pages from the cache as part of discarding a WT_BTREE handle, release our exclusive eviction access to the handle once done. This requires eviction check for a WT_BTREE root page that's not set, because the tree is gone after we release exclusive eviction access. This reverts parts of WT-3193 (3747a26), but we can no longer increment the eviction disabled counter knowing it will be zero'd out when the handle is re-opened. The alternative is to keep another counter here, and I think checking the root page is less fragile/ugly than that would be. Add a new flag WT_BTREE.evict_disabled_open that's set if we disable eviction during open, we have to clean up the eviction disabled counters for special handles during close (for example verify). Don't use WT_BTREE.original as a flag to clean up eviction disabled counters during close, it gets set too early in the process, long before we disable eviction. Use the new WT_BTREE.evict_disabled_open flag in all cases. (cherry picked from commit fe0851d5c2a99c5084314e7b33df4c5d9735c674) --- dist/s_string.ok | 1 + src/btree/bt_cursor.c | 4 +++- src/btree/bt_handle.c | 35 ++++++++++++++++++++++++----------- src/conn/conn_dhandle.c | 7 ++++++- src/evict/evict_lru.c | 30 ++++++++++++++++++++---------- src/include/btree.h | 25 ++++++++++++++++++------- src/session/session_dhandle.c | 4 +++- 7 files changed, 75 insertions(+), 31 deletions(-) diff --git a/dist/s_string.ok b/dist/s_string.ok index f3852d00ac8..99abc3e9ad1 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -1060,6 +1060,7 @@ rebalancing recno recnos reconfig +reconfigures reconfiguring recsize rectype diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index f0aa632551b..95d817850ef 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -156,8 +156,10 @@ __cursor_disable_bulk(WT_SESSION_IMPL *session, WT_BTREE *btree) * into a tree. Eviction is disabled when an empty tree is opened, and * it must only be enabled once. */ - if (__wt_atomic_cas8(&btree->original, 1, 0)) + if (__wt_atomic_cas8(&btree->original, 1, 0)) { + btree->evict_disabled_open = false; __wt_evict_file_exclusive_off(session); + } } /* diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index a0da7df0998..1e6405272f7 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -66,7 +66,6 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) WT_DATA_HANDLE *dhandle; WT_DECL_RET; size_t root_addr_size; - uint32_t mask; uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE]; const char *filename; bool creation, forced_salvage, readonly; @@ -75,15 +74,14 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) dhandle = session->dhandle; /* - * This may be a re-open of an underlying object and we have to clean - * up. We can't clear the operation flags, however, they're set by the - * connection handle software that called us. + * This may be a re-open, clean up the btree structure. + * Clear the fields that don't persist across a re-open. + * Clear all flags other than the operation flags (which are set by the + * connection handle software that called us). */ WT_RET(__btree_clear(session)); - - mask = F_MASK(btree, WT_BTREE_SPECIAL_FLAGS); - memset(btree, 0, sizeof(*btree)); - btree->flags = mask; + memset(btree, 0, WT_BTREE_CLEAR_SIZE); + F_CLR(btree, ~WT_BTREE_SPECIAL_FLAGS); /* Set the data handle first, our called functions reasonably use it. */ btree->dhandle = dhandle; @@ -185,13 +183,19 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) * * Files that can still be bulk-loaded cannot be evicted. * Permanently cache-resident files can never be evicted. - * Special operations don't enable eviction. (The underlying commands - * may turn on eviction, but it's their decision.) + * Special operations don't enable eviction. The underlying commands may + * turn on eviction (for example, verify turns on eviction while working + * a file to keep from consuming the cache), but it's their decision. If + * an underlying command reconfigures eviction, it must either clear the + * evict-disabled-open flag or restore the eviction configuration when + * finished so that handle close behaves correctly. */ if (btree->original || F_ISSET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_REBALANCE | - WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) + WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) { WT_ERR(__wt_evict_file_exclusive_on(session)); + btree->evict_disabled_open = true; + } if (0) { err: WT_TRET(__wt_btree_close(session)); @@ -228,6 +232,15 @@ __wt_btree_close(WT_SESSION_IMPL *session) return (0); F_SET(btree, WT_BTREE_CLOSED); + /* + * If we turned eviction off and never turned it back on, do that now, + * otherwise the counter will be off. + */ + if (btree->evict_disabled_open) { + btree->evict_disabled_open = false; + __wt_evict_file_exclusive_off(session); + } + /* Discard any underlying block manager resources. */ if ((bm = btree->bm) != NULL) { btree->bm = NULL; diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index 243f4ac3047..2560ca47268 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -317,6 +317,9 @@ __wt_conn_btree_open( WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_CLOSING_NO_MORE_OPENS)); + /* Turn off eviction. */ + WT_RET(__wt_evict_file_exclusive_on(session)); + /* * If the handle is already open, it has to be closed so it can be * reopened with a new configuration. @@ -330,7 +333,7 @@ __wt_conn_btree_open( * in the tree that can block the close. */ if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) - WT_RET(__wt_conn_btree_sync_and_close(session, false, false)); + WT_ERR(__wt_conn_btree_sync_and_close(session, false, false)); /* Discard any previous configuration, set up the new configuration. */ __conn_btree_config_clear(session); @@ -374,6 +377,8 @@ __wt_conn_btree_open( err: F_CLR(btree, WT_BTREE_SPECIAL_FLAGS); } + __wt_evict_file_exclusive_off(session); + return (ret); } diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index fdf68841b85..b9b1a7783fe 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -1452,26 +1452,23 @@ retry: while (slot < max_entries) { /* * Re-check the "no eviction" flag, used to enforce exclusive - * access when a handle is being closed. If not set, remember - * the file to visit first, next loop. + * access when a handle is being closed. * * Only try to acquire the lock and simply continue if we fail; * the lock is held while the thread turning off eviction clears * the tree's current eviction point, and part of the process is * waiting on this thread to acknowledge that action. + * + * If a handle is being discarded, it will still be marked open, + * but won't have a root page. */ if (btree->evict_disabled == 0 && !__wt_spin_trylock(session, &cache->evict_walk_lock)) { - if (btree->evict_disabled == 0) { + if (btree->evict_disabled == 0 && + btree->root.page != NULL) { /* - * Assert the handle has a root page: eviction - * should have been locked out if the tree is - * being discarded or the root page is changing. - * As this has not always been the case, assert - * to debug that change. + * Remember the file to visit first, next loop. */ - WT_ASSERT(session, btree->root.page != NULL); - cache->evict_file_next = dhandle; WT_WITH_DHANDLE(session, dhandle, ret = __evict_walk_file( @@ -1860,6 +1857,10 @@ fast: /* If the page can't be evicted, give up. */ WT_STAT_CONN_INCRV( session, cache_eviction_pages_queued, (u_int)(evict - start)); + __wt_verbose(session, WT_VERB_EVICTSERVER, + "%s walk: seen %" PRIu64 ", queued %" PRIu64, + session->dhandle->name, pages_seen, pages_queued); + /* * If we couldn't find the number of pages we were looking for, skip * the tree next time. @@ -2442,14 +2443,23 @@ __wt_verbose_dump_cache(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; + u_int pct; uint64_t total_bytes, total_dirty_bytes; conn = S2C(session); total_bytes = total_dirty_bytes = 0; + pct = 0; /* [-Werror=uninitialized] */ WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); WT_RET(__wt_msg(session, "cache dump")); + WT_RET(__wt_msg(session, + "cache full: %s", __wt_cache_full(session) ? "yes" : "no")); + WT_RET(__wt_msg(session, "cache clean check: %s (%u%%)", + __wt_eviction_clean_needed(session, &pct) ? "yes" : "no", pct)); + WT_RET(__wt_msg(session, "cache dirty check: %s (%u%%)", + __wt_eviction_dirty_needed(session, &pct) ? "yes" : "no", pct)); + for (dhandle = NULL;;) { WT_WITH_HANDLE_LIST_READ_LOCK(session, WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q)); diff --git a/src/include/btree.h b/src/include/btree.h index 28fe1b94b23..8712a404b13 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -142,12 +142,30 @@ struct __wt_btree { uint64_t bytes_dirty_intl; /* Bytes in dirty internal pages. */ uint64_t bytes_dirty_leaf; /* Bytes in dirty leaf pages. */ + /* + * We flush pages from the tree (in order to make checkpoint faster), + * without a high-level lock. To avoid multiple threads flushing at + * the same time, lock the tree. + */ + WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */ + + /* + * All of the following fields live at the end of the structure so it's + * easier to clear everything but the fields that persist. + */ +#define WT_BTREE_CLEAR_SIZE (offsetof(WT_BTREE, evict_ref)) + + /* + * Eviction information is maintained in the btree handle, but owned by + * eviction, not the btree code. + */ WT_REF *evict_ref; /* Eviction thread's location */ uint64_t evict_priority; /* Relative priority of cached pages */ u_int evict_walk_period; /* Skip this many LRU walks */ u_int evict_walk_saved; /* Saved walk skips for checkpoints */ u_int evict_walk_skips; /* Number of walks skipped */ int evict_disabled; /* Eviction disabled count */ + bool evict_disabled_open;/* Eviction disabled on open */ volatile uint32_t evict_busy; /* Count of threads in eviction */ int evict_start_type; /* Start position for eviction walk (see WT_EVICT_WALK_START). */ @@ -155,13 +173,6 @@ struct __wt_btree { WT_CKPT_OFF, WT_CKPT_PREPARE, WT_CKPT_RUNNING } checkpointing; /* Checkpoint in progress */ - /* - * We flush pages from the tree (in order to make checkpoint faster), - * without a high-level lock. To avoid multiple threads flushing at - * the same time, lock the tree. - */ - WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */ - /* Flags values up to 0xff are reserved for WT_DHANDLE_* */ #define WT_BTREE_ALLOW_SPLITS 0x000100 /* Allow splits, even with no evict */ #define WT_BTREE_BULK 0x000200 /* Bulk-load handle */ diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c index 866dab32a78..707e07ac11f 100644 --- a/src/session/session_dhandle.c +++ b/src/session/session_dhandle.c @@ -598,7 +598,9 @@ __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint) * the underlying file are visible to the in-memory pages. */ WT_ERR(__wt_evict_file_exclusive_on(session)); - WT_ERR(__wt_cache_op(session, WT_SYNC_DISCARD)); + ret = __wt_cache_op(session, WT_SYNC_DISCARD); + __wt_evict_file_exclusive_off(session); + WT_ERR(ret); /* * We lock checkpoint handles that we are overwriting, so the handle -- cgit v1.2.1