summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Keith Bostic <keith.bostic@mongodb.com> 2017-04-13 00:59:39 -0400
committer: Michael Cahill <michael.cahill@mongodb.com> 2017-04-13 14:59:39 +1000
commit: 787c625446989be6745e069d7b427f2370d2ddda (patch)
tree: 28a494fba54fda654b10eff0598ad44a3618f9f9
parent: 45759b71aacc73b71cd8741fc5c46f34a5332f7e (diff)
download: mongo-787c625446989be6745e069d7b427f2370d2ddda.tar.gz
WT-3261 add a checkpoint epoch to avoid draining the eviction queue (#3370)
-rw-r--r-- src/btree/bt_read.c | 7
-rw-r--r-- src/btree/bt_sync.c | 22
-rw-r--r-- src/evict/evict_page.c | 12
-rw-r--r-- src/include/btree.h | 27
-rw-r--r-- src/include/btree.i | 20
-rw-r--r-- src/include/extern.h | 2
-rw-r--r-- src/include/session.h | 9
-rw-r--r-- src/support/generation.c | 20
8 files changed, 54 insertions, 65 deletions
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index fb69afb166c..72a69e8591c 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -585,15 +585,10 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
* if the page qualifies for forced eviction and update
* the page's generation number. If eviction isn't being
* done on this file, we're done.
- * In-memory split of large pages is allowed while
- * no_eviction is set on btree, whereas reconciliation
- * is not allowed.
*/
if (LF_ISSET(WT_READ_NO_EVICT) ||
F_ISSET(session, WT_SESSION_NO_EVICTION) ||
- btree->lsm_primary ||
- (btree->evict_disabled > 0 &&
- !F_ISSET(btree, WT_BTREE_ALLOW_SPLITS)))
+ btree->evict_disabled > 0 || btree->lsm_primary)
goto skip_evict;
/*
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index 81e9d1757bb..112f0725f94 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -180,21 +180,8 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
* any problematic eviction or page splits to complete.
*/
WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE);
-
- /*
- * Sync for checkpoint allows splits to happen while the queue
- * is being drained, but not reconciliation. We need to do this,
- * since draining the queue can take long enough for hot pages
- * to grow significantly larger than the configured maximum
- * size.
- */
- F_SET(btree, WT_BTREE_ALLOW_SPLITS);
- ret = __wt_evict_file_exclusive_on(session);
- F_CLR(btree, WT_BTREE_ALLOW_SPLITS);
- WT_ERR(ret);
- __wt_evict_file_exclusive_off(session);
-
- WT_PUBLISH(btree->checkpointing, WT_CKPT_RUNNING);
+ (void)__wt_gen_next_drain(session, WT_GEN_EVICT);
+ btree->checkpointing = WT_CKPT_RUNNING;
/* Write all dirty in-cache pages. */
flags |= WT_READ_NO_EVICT;
@@ -268,9 +255,8 @@ err: /* On error, clear any left-over tree walk. */
saved_pinned_id == WT_TXN_NONE)
__wt_txn_release_snapshot(session);
- /* Clear the checkpoint flag and push the change. */
- if (btree->checkpointing != WT_CKPT_OFF)
- WT_PUBLISH(btree->checkpointing, WT_CKPT_OFF);
+ /* Clear the checkpoint flag. */
+ btree->checkpointing = WT_CKPT_OFF;
__wt_spin_unlock(session, &btree->flush_lock);
diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c
index 9498e2fb313..edcd108e7e4 100644
--- a/src/evict/evict_page.c
+++ b/src/evict/evict_page.c
@@ -113,6 +113,9 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
/* Checkpoints should never do eviction. */
WT_ASSERT(session, !WT_SESSION_IS_CHECKPOINT(session));
+ /* Enter the eviction generation. */
+ __wt_session_gen_enter(session, WT_GEN_EVICT);
+
page = ref->page;
tree_dead = F_ISSET(session->dhandle, WT_DHANDLE_DEAD);
@@ -133,7 +136,7 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
* we want: there is nothing more to do.
*/
if (LF_ISSET(WT_EVICT_INMEM_SPLIT))
- return (0);
+ goto done;
/* Count evictions of internal pages during normal operation. */
if (!closing && WT_PAGE_IS_INTERNAL(page)) {
@@ -182,6 +185,9 @@ err: if (!closing)
WT_STAT_DATA_INCR(session, cache_eviction_fail);
}
+done: /* Leave the eviction generation. */
+ __wt_session_gen_leave(session, WT_GEN_EVICT);
+
return (ret);
}
@@ -479,10 +485,6 @@ __evict_review(
*/
if (LF_ISSET(WT_EVICT_INMEM_SPLIT))
return (__wt_split_insert(session, ref));
-
- /* If splits are the only permitted operation, we're done. */
- if (F_ISSET(S2BT(session), WT_BTREE_ALLOW_SPLITS))
- return (EBUSY);
}
/* If the page is clean, we're done and we can evict. */
diff --git a/src/include/btree.h b/src/include/btree.h
index 19db27d84a2..8ce77b5ecd3 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -151,7 +151,7 @@ struct __wt_btree {
volatile uint32_t evict_busy; /* Count of threads in eviction */
int evict_start_type; /* Start position for eviction walk
(see WT_EVICT_WALK_START). */
- enum {
+ volatile enum {
WT_CKPT_OFF, WT_CKPT_PREPARE, WT_CKPT_RUNNING
} checkpointing; /* Checkpoint in progress */
@@ -163,19 +163,18 @@ struct __wt_btree {
WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */
/* Flags values up to 0xff are reserved for WT_DHANDLE_* */
-#define WT_BTREE_ALLOW_SPLITS 0x000100 /* Allow splits, even with no evict */
-#define WT_BTREE_BULK 0x000200 /* Bulk-load handle */
-#define WT_BTREE_CLOSED 0x000400 /* Handle closed */
-#define WT_BTREE_IGNORE_CACHE 0x000800 /* Cache-resident object */
-#define WT_BTREE_IN_MEMORY 0x001000 /* Cache-resident object */
-#define WT_BTREE_LOOKASIDE 0x002000 /* Look-aside table */
-#define WT_BTREE_NO_CHECKPOINT 0x004000 /* Disable checkpoints */
-#define WT_BTREE_NO_LOGGING 0x008000 /* Disable logging */
-#define WT_BTREE_REBALANCE 0x020000 /* Handle is for rebalance */
-#define WT_BTREE_SALVAGE 0x040000 /* Handle is for salvage */
-#define WT_BTREE_SKIP_CKPT 0x080000 /* Handle skipped checkpoint */
-#define WT_BTREE_UPGRADE 0x100000 /* Handle is for upgrade */
-#define WT_BTREE_VERIFY 0x200000 /* Handle is for verify */
+#define WT_BTREE_BULK 0x000100 /* Bulk-load handle */
+#define WT_BTREE_CLOSED 0x000200 /* Handle closed */
+#define WT_BTREE_IGNORE_CACHE 0x000400 /* Cache-resident object */
+#define WT_BTREE_IN_MEMORY 0x000800 /* Cache-resident object */
+#define WT_BTREE_LOOKASIDE 0x001000 /* Look-aside table */
+#define WT_BTREE_NO_CHECKPOINT 0x002000 /* Disable checkpoints */
+#define WT_BTREE_NO_LOGGING 0x004000 /* Disable logging */
+#define WT_BTREE_REBALANCE 0x008000 /* Handle is for rebalance */
+#define WT_BTREE_SALVAGE 0x010000 /* Handle is for salvage */
+#define WT_BTREE_SKIP_CKPT 0x020000 /* Handle skipped checkpoint */
+#define WT_BTREE_UPGRADE 0x040000 /* Handle is for upgrade */
+#define WT_BTREE_VERIFY 0x080000 /* Handle is for verify */
uint32_t flags;
};
diff --git a/src/include/btree.i b/src/include/btree.i
index 474b40bf805..d4db65b2033 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -1286,6 +1286,16 @@ __wt_page_can_evict(
return (true);
/*
+ * We can't split or evict multiblock row-store pages where the parent's
+ * key for the page is an overflow item, because the split into the
+ * parent frees the backing blocks for any no-longer-used overflow keys,
+ * which will corrupt the checkpoint's block management.
+ */
+ if (btree->checkpointing != WT_CKPT_OFF &&
+ F_ISSET_ATOMIC(ref->home, WT_PAGE_OVERFLOW_KEYS))
+ return (false);
+
+ /*
* Check for in-memory splits before other eviction tests. If the page
* should split in-memory, return success immediately and skip more
* detailed eviction tests. We don't need further tests since the page
@@ -1312,16 +1322,6 @@ __wt_page_can_evict(
}
/*
- * We can't evict clean, multiblock row-store pages where the parent's
- * key for the page is an overflow item, because the split into the
- * parent frees the backing blocks for any no-longer-used overflow keys,
- * which will corrupt the checkpoint's block management.
- */
- if (btree->checkpointing != WT_CKPT_OFF &&
- F_ISSET_ATOMIC(ref->home, WT_PAGE_OVERFLOW_KEYS))
- return (false);
-
- /*
* If a split created new internal pages, those newly created internal
* pages cannot be evicted until all threads are known to have exited
* the original parent page's index, because evicting an internal page
diff --git a/src/include/extern.h b/src/include/extern.h
index ff835bacc56..4f38b7ac433 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -649,6 +649,8 @@ extern int __wt_unexpected_object_type( WT_SESSION_IMPL *session, const char *ur
extern void __wt_gen_init(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern uint64_t __wt_gen(WT_SESSION_IMPL *session, int which) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern uint64_t __wt_gen_next(WT_SESSION_IMPL *session, int which) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern uint64_t __wt_gen_next_drain(WT_SESSION_IMPL *session, int which) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern void __wt_gen_drain(WT_SESSION_IMPL *session, int which, uint64_t generation) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern uint64_t __wt_gen_oldest(WT_SESSION_IMPL *session, int which) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern uint64_t __wt_session_gen(WT_SESSION_IMPL *session, int which) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_session_gen_enter(WT_SESSION_IMPL *session, int which) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
diff --git a/src/include/session.h b/src/include/session.h
index adef5e39068..de2c1463684 100644
--- a/src/include/session.h
+++ b/src/include/session.h
@@ -169,10 +169,11 @@ struct __wt_session_impl {
/* Generations manager */
#define WT_GEN_CHECKPOINT 0 /* Checkpoint generation */
-#define WT_GEN_HAZARD 1 /* Hazard pointer */
-#define WT_GEN_SCHEMA 2 /* Schema version */
-#define WT_GEN_SPLIT 3 /* Page splits */
-#define WT_GENERATIONS 4 /* Total generation manager entries */
+#define WT_GEN_EVICT 1 /* Eviction generation */
+#define WT_GEN_HAZARD 2 /* Hazard pointer */
+#define WT_GEN_SCHEMA 3 /* Schema version */
+#define WT_GEN_SPLIT 4 /* Page splits */
+#define WT_GENERATIONS 5 /* Total generation manager entries */
volatile uint64_t generations[WT_GENERATIONS];
/*
diff --git a/src/support/generation.c b/src/support/generation.c
index ed615d4c7cd..6e16d7e57fe 100644
--- a/src/support/generation.c
+++ b/src/support/generation.c
@@ -57,14 +57,12 @@ __wt_gen_next(WT_SESSION_IMPL *session, int which)
return (__wt_atomic_addv64(&S2C(session)->generations[which], 1));
}
-#if 0
/*
* __wt_gen_next_drain --
* Switch the resource to its next generation, then wait for it to drain.
*/
uint64_t
- TABBED IN to avoid dist/ functions:
- __wt_gen_next_drain(WT_SESSION_IMPL *session, int which)
+__wt_gen_next_drain(WT_SESSION_IMPL *session, int which)
{
uint64_t v;
@@ -80,8 +78,7 @@ uint64_t
* Wait for the resource to drain.
*/
void
- TABBED IN to avoid dist/ functions:
- __wt_gen_drain(WT_SESSION_IMPL *session, int which, uint64_t generation)
+__wt_gen_drain(WT_SESSION_IMPL *session, int which, uint64_t generation)
{
WT_CONNECTION_IMPL *conn;
WT_SESSION_IMPL *s;
@@ -109,7 +106,14 @@ void
/* Ensure we only read the value once. */
WT_ORDERED_READ(v, s->generations[which]);
- if (v == 0 || generation <= v)
+ /*
+ * The generation argument is newer than the limit. Wait
+ * for threads in generations older than the argument
+ * generation, threads in argument generations are OK.
+ *
+ * The thread's generation may be 0 (that is, not set).
+ */
+ if (v == 0 || v >= generation)
break;
/*
@@ -124,7 +128,6 @@ void
}
}
}
-#endif
/*
* __wt_gen_oldest --
@@ -156,6 +159,7 @@ __wt_gen_oldest(WT_SESSION_IMPL *session, int which)
/* Ensure we only read the value once. */
WT_ORDERED_READ(v, s->generations[which]);
+
if (v != 0 && v < oldest)
oldest = v;
}
@@ -328,7 +332,7 @@ __wt_stash_discard_all(WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session)
/*
* This function is called during WT_CONNECTION.close to discard any
- * memory that remains. For that reason, we take two WT_SESSION_IMPL
+ * memory that remains. For that reason, we take two WT_SESSION_IMPL
* arguments: session_safe is still linked to the WT_CONNECTION and
* can be safely used for calls to other WiredTiger functions, while
* session is the WT_SESSION_IMPL we're cleaning up.