summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKeith Bostic <keith.bostic@mongodb.com>2017-03-16 07:49:08 -0400
committerGitHub <noreply@github.com>2017-03-16 07:49:08 -0400
commit6203106c56504f194bab7093b28c45ae7beb9cac (patch)
treefd07d0d4860dda86625d9a5df699d31a23cd1a84
parent51d22616094e0a0d34997d26aec925adf949fbdf (diff)
downloadmongo-6203106c56504f194bab7093b28c45ae7beb9cac.tar.gz
WT-3204 eviction changes cost LSM performance (#3325)
* WT-3204 eviction changes cost LSM performance Modify LSM's primary chunk switching to match the new btree eviction semantics on object creation. We now create objects with eviction turned off, LSM should no longer have to turn eviction off when configuring the primary chunk. LSM previously set WT_BTREE.bulk_load_ok to false to ensure an insert into the tree wouldn't turn eviction on. That problem remains, but there's a race in the implementation if multiple threads are inserting at the same time (where a thread modifies WT_BTREE.bulk_load_ok and goes to sleep before configuring eviction, and another thread does an insert and turns off eviction), and there's a further race between threads doing F_ISSET/F_SET tests. Change the WT_BTREE_LSM_PRIMARY flag into a WT_BTREE.lsm_primary variable so there's no F_ISSET/F_SET race. Remove the test/set of bulk_load_ok; instead, test the lsm_primary value in the btree code before turning eviction off. When checkpointing an LSM chunk, move the code that turns off the chunk's primary flag inside the single-threaded part of the function to ensure we don't race with other threads doing checkpoints. That makes the code to fix up the accounting single-threaded and safe. Simplify the LSM checkpoint code to call __wt_checkpoint directly, and use the same handle for turning off the chunk's primary flag as we use for the checkpoint. * Force a primary switch in LSM after an exclusive-handle operation has come through. Otherwise it's possible to attempt to use a file as the primary chunk without disabling eviction. * spelling * WT_BTREE.bulk_load_ok isn't a boolean, don't use true/false comparisons. * Only check for an empty tree the first time an LSM chunk is opened. The goal here is to make sure that LSM primary chunks start empty. Otherwise, we can't load into a skiplist in memory as required by LSM.
If an operation such as verify closes a btree in order to check the on-disk state, the next time it is opened we have to check whether it is empty. It is safe to do this check without locking: what matters is that we always do the `lsm_primary` check before any update operation that would turn off `btree->bulk_load_ok`. * Rename WT_BTREE.bulk_load_ok to be WT_BTREE.original, it's used by LSM. * Fix a comment.
-rw-r--r--src/btree/bt_cursor.c24
-rw-r--r--src/btree/bt_handle.c4
-rw-r--r--src/include/btree.h9
-rw-r--r--src/include/btree.i60
-rw-r--r--src/lsm/lsm_cursor.c31
-rw-r--r--src/lsm/lsm_work_unit.c109
-rw-r--r--src/reconcile/rec_write.c3
-rw-r--r--src/txn/txn_ckpt.c2
8 files changed, 130 insertions, 112 deletions
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
index 3ae6e022906..d6dc0991d3f 100644
--- a/src/btree/bt_cursor.c
+++ b/src/btree/bt_cursor.c
@@ -126,15 +126,23 @@ static inline void
__cursor_disable_bulk(WT_SESSION_IMPL *session, WT_BTREE *btree)
{
/*
- * Once a tree is no longer empty, eviction should pay attention to it,
- * and it's no longer possible to bulk-load into it.
- *
- * We use a compare-and-swap here to avoid races among the first
- * inserts into a tree. Eviction is disabled when an empty tree is
- * opened, it must only be enabled once.
+ * Once a tree (other than the LSM primary) is no longer empty, eviction
+ * should pay attention to it, and it's no longer possible to bulk-load
+ * into it.
+ */
+ if (!btree->original)
+ return;
+ if (btree->lsm_primary) {
+ btree->original = 0; /* Make the next test faster. */
+ return;
+ }
+
+ /*
+ * We use a compare-and-swap here to avoid races among the first inserts
+ * into a tree. Eviction is disabled when an empty tree is opened, and
+ * it must only be enabled once.
*/
- if (btree->bulk_load_ok &&
- __wt_atomic_cas8(&btree->bulk_load_ok, 1, 0))
+ if (__wt_atomic_cas8(&btree->original, 1, 0))
__wt_evict_file_exclusive_off(session);
}
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index ff199eb1e0e..f2bffee06da 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -188,7 +188,7 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
* Special operations don't enable eviction. (The underlying commands
* may turn on eviction, but it's their decision.)
*/
- if (btree->bulk_load_ok ||
+ if (btree->original ||
F_ISSET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_REBALANCE |
WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
WT_ERR(__wt_evict_file_exclusive_on(session));
@@ -562,7 +562,7 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, bool creation)
* tree.
*/
if (creation)
- btree->bulk_load_ok = 1;
+ btree->original = 1;
/*
* A note about empty trees: the initial tree is a single root page.
diff --git a/src/include/btree.h b/src/include/btree.h
index 857dc6694c5..15a68474fdf 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -118,11 +118,13 @@ struct __wt_btree {
uint64_t last_recno; /* Column-store last record number */
- WT_REF root; /* Root page reference */
- bool modified; /* If the tree ever modified */
- uint8_t bulk_load_ok; /* Bulk-load is a possibility
+ WT_REF root; /* Root page reference */
+ bool modified; /* If the tree ever modified */
+ uint8_t original; /* Newly created: bulk-load possible
(want a bool but needs atomic cas) */
+ bool lsm_primary; /* Handle is/was the LSM primary */
+
WT_BM *bm; /* Block manager reference */
u_int block_header; /* WT_PAGE_HEADER_BYTE_SIZE */
@@ -160,7 +162,6 @@ struct __wt_btree {
#define WT_BTREE_IGNORE_CACHE 0x000400 /* Cache-resident object */
#define WT_BTREE_IN_MEMORY 0x000800 /* Cache-resident object */
#define WT_BTREE_LOOKASIDE 0x001000 /* Look-aside table */
-#define WT_BTREE_LSM_PRIMARY 0x002000 /* Handle is current LSM primary */
#define WT_BTREE_NO_CHECKPOINT 0x004000 /* Disable checkpoints */
#define WT_BTREE_NO_LOGGING 0x008000 /* Disable logging */
#define WT_BTREE_NO_RECONCILE 0x010000 /* Allow splits, even with no evict */
diff --git a/src/include/btree.i b/src/include/btree.i
index cec6f67e9bd..c0c5c7c5a8d 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -149,7 +149,7 @@ __wt_cache_page_inmem_incr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
if (WT_PAGE_IS_INTERNAL(page)) {
(void)__wt_atomic_add64(&btree->bytes_dirty_intl, size);
(void)__wt_atomic_add64(&cache->bytes_dirty_intl, size);
- } else if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) {
+ } else if (!btree->lsm_primary) {
(void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size);
(void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size);
}
@@ -285,7 +285,7 @@ __wt_cache_page_byte_dirty_decr(
decr, "WT_BTREE.bytes_dirty_intl");
__wt_cache_decr_check_uint64(session, &cache->bytes_dirty_intl,
decr, "WT_CACHE.bytes_dirty_intl");
- } else if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) {
+ } else if (!btree->lsm_primary) {
__wt_cache_decr_check_uint64(session, &btree->bytes_dirty_leaf,
decr, "WT_BTREE.bytes_dirty_leaf");
__wt_cache_decr_check_uint64(session, &cache->bytes_dirty_leaf,
@@ -345,7 +345,7 @@ __wt_cache_dirty_incr(WT_SESSION_IMPL *session, WT_PAGE *page)
(void)__wt_atomic_add64(&cache->bytes_dirty_intl, size);
(void)__wt_atomic_add64(&cache->pages_dirty_intl, 1);
} else {
- if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) {
+ if (!btree->lsm_primary) {
(void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size);
(void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size);
}
@@ -444,7 +444,7 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
__wt_cache_decr_zero_uint64(session,
&cache->bytes_dirty_intl,
modify->bytes_dirty, "WT_CACHE.bytes_dirty_intl");
- } else if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) {
+ } else if (!btree->lsm_primary) {
__wt_cache_decr_zero_uint64(session,
&btree->bytes_dirty_leaf,
modify->bytes_dirty, "WT_BTREE.bytes_dirty_leaf");
@@ -1546,58 +1546,6 @@ __wt_btree_lsm_over_size(WT_SESSION_IMPL *session, uint64_t maxsize)
}
/*
- * __wt_btree_lsm_switch_primary --
- * Switch a btree handle to/from the current primary chunk of an LSM tree.
- */
-static inline int
-__wt_btree_lsm_switch_primary(WT_SESSION_IMPL *session, bool on)
-{
- WT_BTREE *btree;
- WT_CACHE *cache;
- WT_PAGE *child, *root;
- WT_PAGE_INDEX *pindex;
- WT_REF *first;
- size_t size;
-
- btree = S2BT(session);
- cache = S2C(session)->cache;
- root = btree->root.page;
-
- if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) {
- F_SET(btree, WT_BTREE_LSM_PRIMARY);
- WT_RET(__wt_evict_file_exclusive_on(session));
- }
- if (!on && F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) {
- pindex = WT_INTL_INDEX_GET_SAFE(root);
- if (btree->evict_disabled == 0 || pindex->entries != 1)
- return (0);
- first = pindex->index[0];
-
- /*
- * We're reaching down into the page without a hazard pointer,
- * but that's OK because we know that no-eviction is set so the
- * page can't disappear.
- *
- * While this tree was the primary, its dirty bytes were not
- * included in the cache accounting. Fix that now before we
- * open it up for eviction.
- */
- child = first->page;
- if (first->state == WT_REF_MEM &&
- child->type == WT_PAGE_ROW_LEAF &&
- __wt_page_is_modified(child)) {
- size = child->modify->bytes_dirty;
- (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size);
- (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size);
- }
-
- F_CLR(btree, WT_BTREE_LSM_PRIMARY);
- __wt_evict_file_exclusive_off(session);
- }
- return (0);
-}
-
-/*
* __wt_split_descent_race --
* Return if we raced with an internal page split when descending the tree.
*/
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
index 77fa96ebdfd..bd1daaa6915 100644
--- a/src/lsm/lsm_cursor.c
+++ b/src/lsm/lsm_cursor.c
@@ -688,20 +688,29 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) {
if (chunk != NULL &&
!F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
chunk->switch_txn == WT_TXN_NONE) {
- clsm->primary_chunk = chunk;
primary = clsm->chunks[clsm->nchunks - 1]->cursor;
+ btree = ((WT_CURSOR_BTREE *)primary)->btree;
+
/*
- * Disable eviction for the in-memory chunk. Also clear the
- * bulk load flag here, otherwise eviction will be enabled by
- * the first update.
+ * If the primary is not yet set as the primary, do that now.
+ * Note that eviction was configured off when the underlying
+ * object was created, which is what we want, leave it alone.
+ *
+ * We don't have to worry about races here: every thread that
+ * modifies the tree will have to come through here, at worse
+ * we set the flag repeatedly. We don't use a WT_BTREE handle
+ * flag, however, we could race doing the read-modify-write of
+ * the flags field.
+ *
+ * If something caused the chunk to be closed and reopened
+ * since it was created, we can no longer use it as a primary
+ * chunk and we need to force a switch. We detect the tree was
+ * created when it was opened by checking the "original" flag.
*/
- btree = ((WT_CURSOR_BTREE *)(primary))->btree;
- if (btree->bulk_load_ok) {
- btree->bulk_load_ok = false;
- WT_WITH_BTREE(session, btree,
- ret = __wt_btree_lsm_switch_primary(session, true));
- WT_ERR(ret);
- }
+ if (!btree->lsm_primary && btree->original)
+ btree->lsm_primary = true;
+ if (btree->lsm_primary)
+ clsm->primary_chunk = chunk;
}
clsm->dsk_gen = lsm_tree->dsk_gen;
diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c
index c9c350c5ac9..0b0801a8cca 100644
--- a/src/lsm/lsm_work_unit.c
+++ b/src/lsm/lsm_work_unit.c
@@ -256,6 +256,51 @@ err:
}
/*
+ * __lsm_switch_primary_off --
+ * Switch when a btree handle is no longer the current primary chunk of
+ * an LSM tree.
+ */
+static void
+__lsm_switch_primary_off(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_PAGE *child, *root;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *first;
+ size_t size;
+
+ btree = S2BT(session);
+ cache = S2C(session)->cache;
+ root = btree->root.page;
+ pindex = WT_INTL_INDEX_GET_SAFE(root);
+
+ /* Diagnostic: assert we've never split. */
+ WT_ASSERT(session, pindex->entries == 1);
+
+ /*
+ * We're reaching down into the page without a hazard pointer,
+ * but that's OK because we know that no-eviction is set so the
+ * page can't disappear.
+ *
+ * While this tree was the primary, its dirty bytes were not
+ * included in the cache accounting. Fix that now before we
+ * open it up for eviction.
+ */
+ first = pindex->index[0];
+ child = first->page;
+ if (first->state == WT_REF_MEM &&
+ child->type == WT_PAGE_ROW_LEAF && __wt_page_is_modified(child)) {
+ size = child->modify->bytes_dirty;
+ (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size);
+ (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size);
+ }
+
+ /* Configure eviction. */
+ __wt_evict_file_exclusive_off(session);
+}
+
+/*
* __wt_lsm_checkpoint_chunk --
* Flush a single LSM chunk to disk.
*/
@@ -263,11 +308,12 @@ int
__wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
{
+ WT_BTREE *btree;
WT_DECL_RET;
WT_TXN_ISOLATION saved_isolation;
- bool flush_set;
+ bool flush_set, release_btree;
- flush_set = false;
+ flush_set = release_btree = false;
/*
* If the chunk is already checkpointed, make sure it is also evicted.
@@ -318,20 +364,18 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
* We can wait here for checkpoints and fsyncs to complete, which can
* take a long time.
*/
- if ((ret = __wt_session_get_btree(
- session, chunk->uri, NULL, NULL, 0)) == 0) {
- /*
- * Set read-uncommitted: we have already checked that all of the
- * updates in this chunk are globally visible, use the cheapest
- * possible check in reconciliation.
- */
- saved_isolation = session->txn.isolation;
- session->txn.isolation = WT_ISO_READ_UNCOMMITTED;
- ret = __wt_cache_op(session, WT_SYNC_WRITE_LEAVES);
- session->txn.isolation = saved_isolation;
- WT_TRET(__wt_session_release_btree(session));
- }
- WT_ERR(ret);
+ WT_ERR(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0));
+ release_btree = true;
+
+ /*
+ * Set read-uncommitted: we have already checked that all of the updates
+ * in this chunk are globally visible, use the cheapest possible check
+ * in reconciliation.
+ */
+ saved_isolation = session->txn.isolation;
+ session->txn.isolation = WT_ISO_READ_UNCOMMITTED;
+ WT_ERR(__wt_cache_op(session, WT_SYNC_WRITE_LEAVES));
+ session->txn.isolation = saved_isolation;
__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s",
chunk->uri);
@@ -348,12 +392,28 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
WT_ERR(__wt_meta_track_on(session));
WT_WITH_CHECKPOINT_LOCK(session,
WT_WITH_SCHEMA_LOCK(session,
- ret = __wt_schema_worker(
- session, chunk->uri, __wt_checkpoint, NULL, NULL, 0)));
+ ret = __wt_checkpoint(session, NULL)));
WT_TRET(__wt_meta_track_off(session, false, ret != 0));
if (ret != 0)
WT_ERR_MSG(session, ret, "LSM checkpoint");
+ /*
+ * If the chunk is the lsm primary, clear the no-eviction flag so it can
+ * be evicted and eventually closed. Only do once, and only do after the
+ * checkpoint has succeeded: otherwise, accessing the leaf page during
+ * the checkpoint can trigger forced eviction.
+ *
+ * We don't have to worry about races here, we're single-threaded.
+ */
+ btree = S2BT(session);
+ if (btree->lsm_primary) {
+ __lsm_switch_primary_off(session);
+ btree->lsm_primary = false;
+ }
+
+ release_btree = false;
+ WT_ERR(__wt_session_release_btree(session));
+
/* Now the file is written, get the chunk size. */
WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
@@ -376,17 +436,6 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
WT_PUBLISH(chunk->flushing, 0);
flush_set = false;
- /*
- * Clear the no-eviction flag so the primary can be evicted and
- * eventually closed. Only do this once the checkpoint has succeeded:
- * otherwise, accessing the leaf page during the checkpoint can trigger
- * forced eviction.
- */
- WT_ERR(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0));
- WT_TRET(__wt_btree_lsm_switch_primary(session, false));
- WT_TRET(__wt_session_release_btree(session));
- WT_ERR(ret);
-
/* Make sure we aren't pinning a transaction ID. */
__wt_txn_release_snapshot(session);
@@ -403,6 +452,8 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
err: if (flush_set)
WT_PUBLISH(chunk->flushing, 0);
+ if (release_btree)
+ WT_TRET(__wt_session_release_btree(session));
return (ret);
}
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index a667a288187..88d4397fcb5 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -3583,11 +3583,12 @@ __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
uint64_t recno;
btree = S2BT(session);
+
/*
* Bulk-load is only permitted on newly created files, not any empty
* file -- see the checkpoint code for a discussion.
*/
- if (!btree->bulk_load_ok)
+ if (!btree->original)
WT_RET_MSG(session, EINVAL,
"bulk-load is only possible for newly created trees");
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index ec150f39fc5..80cdf1cd39b 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -1420,7 +1420,7 @@ __checkpoint_tree(
* delete a physical checkpoint, and that will end in tears.
*/
if (is_checkpoint)
- if (btree->bulk_load_ok) {
+ if (btree->original) {
fake_ckpt = true;
goto fake;
}