summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Michael Cahill <michael.cahill@wiredtiger.com> 2015-01-14 04:54:08 +1100
committer: Michael Cahill <michael.cahill@wiredtiger.com> 2015-01-14 04:54:08 +1100
commit: 16b057c89ce15ce6d37224ab04a826c0eac14896 (patch)
tree: f7d2274b31b46f80b55037f6beda73589b3f24cf
parent: 53cfcbcc48c857cfbfb08a043c1476a02bccb459 (diff)
parent: 3090b47b76f6876e18fabe41b52926099f5243d8 (diff)
download: mongo-16b057c89ce15ce6d37224ab04a826c0eac14896.tar.gz
Merge branch 'develop' into slow-deepen-split
Conflicts: src/btree/bt_split.c
-rw-r--r-- dist/stat_data.py | 2
-rw-r--r-- src/btree/bt_delete.c | 9
-rw-r--r-- src/btree/bt_page.c | 15
-rw-r--r-- src/btree/bt_split.c | 32
-rw-r--r-- src/btree/bt_walk.c | 9
-rw-r--r-- src/evict/evict_lru.c | 13
-rw-r--r-- src/include/btree.i | 97
-rw-r--r-- src/include/cache.i | 59
-rw-r--r-- src/include/cursor.i | 12
-rw-r--r-- src/include/stat.h | 1
-rw-r--r-- src/include/txn.i | 10
-rw-r--r-- src/include/wiredtiger.in | 202
-rw-r--r-- src/include/wt_internal.h | 2
-rw-r--r-- src/lsm/lsm_cursor.c | 2
-rw-r--r-- src/session/session_api.c | 7
-rw-r--r-- src/support/stat.c | 3
-rw-r--r-- src/txn/txn.c | 9
-rw-r--r-- src/txn/txn_log.c | 10
18 files changed, 295 insertions, 199 deletions
diff --git a/dist/stat_data.py b/dist/stat_data.py
index ae442bcc463..69e8d2ed21e 100644
--- a/dist/stat_data.py
+++ b/dist/stat_data.py
@@ -162,6 +162,8 @@ connection_stats = [
'pages selected for eviction unable to be evicted'),
CacheStat('cache_eviction_force',
'pages evicted because they exceeded the in-memory maximum'),
+ CacheStat('cache_eviction_force_delete',
+ 'pages evicted because they had chains of deleted items'),
CacheStat('cache_eviction_force_fail',
'failed eviction of pages that exceeded the in-memory maximum'),
CacheStat('cache_eviction_hazard', 'hazard pointer blocked page eviction'),
diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c
index c97ea176c97..570b7f80742 100644
--- a/src/btree/bt_delete.c
+++ b/src/btree/bt_delete.c
@@ -207,6 +207,9 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
{
int skip;
+ if (ref->state != WT_REF_DELETED)
+ return (0);
+
/*
* Deleted pages come from two sources: either it's a fast-delete as
* described above, or the page has been emptied by other operations
@@ -225,11 +228,13 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
* the page could switch to an in-memory state at any time. Lock down
* the structure, just to be safe.
*/
+ if (ref->page_del == NULL)
+ return (1);
+
if (!WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED))
return (0);
- skip = ref->page_del == NULL ||
- __wt_txn_visible(session, ref->page_del->txnid) ? 1 : 0;
+ skip = __wt_txn_visible(session, ref->page_del->txnid) ? 1 : 0;
WT_PUBLISH(ref->state, WT_REF_DELETED);
return (skip);
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index 181ffdb3736..561e1c19218 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -37,8 +37,11 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page)
page->type != WT_PAGE_ROW_LEAF)
return (0);
- /* Eviction may be turned off, although that's rare. */
- if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ /*
+ * Eviction may be turned off (although that's rare), or we may be in
+ * the middle of a checkpoint.
+ */
+ if (F_ISSET(btree, WT_BTREE_NO_EVICTION) || btree->checkpointing)
return (0);
/*
@@ -128,7 +131,13 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
force_attempts < 10 &&
__evict_force_check(session, page)) {
++force_attempts;
- WT_RET(__wt_page_release(session, ref, flags));
+ if ((ret = __wt_page_release_busy(
+ session, ref, flags)) == EBUSY) {
+ /* If forced eviction fails, stall. */
+ ret = 0;
+ wait_cnt += 1000;
+ } else
+ WT_RET(ret);
WT_STAT_FAST_CONN_INCR(
session, page_forcible_evict_blocked);
break;
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 2bae34b620b..9a3186a0015 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -812,9 +812,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
WT_DECL_RET;
WT_PAGE *parent;
WT_PAGE_INDEX *alloc_index, *pindex;
- WT_REF **alloc_refp, *parent_ref;
+ WT_REF **alloc_refp, *next_ref, *parent_ref;
size_t size;
- uint32_t children, i, j, parent_entries, result_entries;
+ uint32_t children, i, j;
+ uint32_t deleted_entries, parent_entries, result_entries;
int complete, hazard, locked;
parent = NULL; /* -Wconditional-uninitialized */
@@ -861,7 +862,22 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
pindex = WT_INTL_INDEX_COPY(parent);
parent_entries = pindex->entries;
- result_entries = (parent_entries - 1) + new_entries;
+
+ /*
+ * Remove any refs to deleted pages while we are splitting: we have the
+ * internal page locked down and are copying the refs into a new array
+ * anyway.
+ */
+ for (i = 0, deleted_entries = 0; i < parent_entries; ++i)
+ if (pindex->index[i]->state == WT_REF_DELETED)
+ deleted_entries++;
+
+ /*
+ * The final entry count consists of: The original count, plus any
+ * new pages, less any refs we are removing because they only
+ * contained deleted items, less 1 for the page being replaced.
+ */
+ result_entries = (parent_entries + new_entries) - (deleted_entries + 1);
/*
* Allocate and initialize a new page index array for the parent, then
@@ -873,8 +889,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
WT_MEMSIZE_ADD(parent_incr, size);
alloc_index->index = (WT_REF **)(alloc_index + 1);
alloc_index->entries = result_entries;
- for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i)
- if (pindex->index[i] == ref)
+ for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) {
+ next_ref = pindex->index[i];
+ if (next_ref == ref)
for (j = 0; j < new_entries; ++j) {
ref_new[j]->home = parent;
*alloc_refp++ = ref_new[j];
@@ -886,8 +903,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
*/
ref_new[j] = NULL;
}
- else
- *alloc_refp++ = pindex->index[i];
+ else if (next_ref->state != WT_REF_DELETED)
+ *alloc_refp++ = next_ref;
+ }
/*
* Update the parent page's index: this update makes the split visible
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
index c74a7177401..a2b2a6bb7c8 100644
--- a/src/btree/bt_walk.c
+++ b/src/btree/bt_walk.c
@@ -208,6 +208,12 @@ restart: /*
break;
} else if (LF_ISSET(WT_READ_TRUNCATE)) {
/*
+ * Avoid pulling a deleted page back in to try
+ * to delete it again.
+ */
+ if (__wt_delete_page_skip(session, ref))
+ break;
+ /*
* If deleting a range, try to delete the page
* without instantiating it.
*/
@@ -242,8 +248,7 @@ restart: /*
* If iterating a cursor, try to skip deleted
* pages that are visible to us.
*/
- if (ref->state == WT_REF_DELETED &&
- __wt_delete_page_skip(session, ref))
+ if (__wt_delete_page_skip(session, ref))
break;
}
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index 60a5f82f233..a4ae0aaf55b 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -437,7 +437,7 @@ __evict_pass(WT_SESSION_IMPL *session)
WT_EVICT_WORKER *worker;
int loop;
uint32_t flags;
- uint64_t bytes_inuse, pages_evicted;
+ uint64_t bytes_inuse, dirty_target_size, pages_evicted, target_size;
conn = S2C(session);
cache = conn->cache;
@@ -465,9 +465,16 @@ __evict_pass(WT_SESSION_IMPL *session)
if (loop > 10)
LF_SET(WT_EVICT_PASS_AGGRESSIVE);
- /* Start a worker if we have capacity and the cache is full. */
+ /*
+ * Start a worker if we have capacity and we haven't reached
+ * the eviction targets.
+ */
bytes_inuse = __wt_cache_bytes_inuse(cache);
- if (bytes_inuse > conn->cache_size &&
+ target_size = (conn->cache_size * cache->eviction_target) / 100;
+ dirty_target_size =
+ (conn->cache_size * cache->eviction_dirty_target) / 100;
+ if ((bytes_inuse > target_size ||
+ cache->bytes_dirty > dirty_target_size) &&
conn->evict_workers < conn->evict_workers_max) {
WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
"Starting evict worker: %"PRIu32"\n",
diff --git a/src/include/btree.i b/src/include/btree.i
index a333e4af565..6955b672926 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -165,65 +165,6 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
}
/*
- * __wt_cache_read_gen --
- * Get the current read generation number.
- */
-static inline uint64_t
-__wt_cache_read_gen(WT_SESSION_IMPL *session)
-{
- return (S2C(session)->cache->read_gen);
-}
-
-/*
- * __wt_cache_read_gen_incr --
- * Increment the current read generation number.
- */
-static inline void
-__wt_cache_read_gen_incr(WT_SESSION_IMPL *session)
-{
- ++S2C(session)->cache->read_gen;
-}
-
-/*
- * __wt_cache_read_gen_set --
- * Get the read generation to store in a page.
- */
-static inline uint64_t
-__wt_cache_read_gen_set(WT_SESSION_IMPL *session)
-{
- /*
- * We return read-generations from the future (where "the future" is
- * measured by increments of the global read generation). The reason
- * is because when acquiring a new hazard pointer for a page, we can
- * check its read generation, and if the read generation isn't less
- * than the current global generation, we don't bother updating the
- * page. In other words, the goal is to avoid some number of updates
- * immediately after each update we have to make.
- */
- return (__wt_cache_read_gen(session) + WT_READGEN_STEP);
-}
-
-/*
- * __wt_cache_pages_inuse --
- * Return the number of pages in use.
- */
-static inline uint64_t
-__wt_cache_pages_inuse(WT_CACHE *cache)
-{
- return (cache->pages_inmem - cache->pages_evict);
-}
-
-/*
- * __wt_cache_bytes_inuse --
- * Return the number of bytes in use.
- */
-static inline uint64_t
-__wt_cache_bytes_inuse(WT_CACHE *cache)
-{
- return (cache->bytes_inmem - cache->bytes_evict);
-}
-
-/*
* __wt_page_evict_soon --
* Set a page to be evicted as soon as possible.
*/
@@ -917,16 +858,16 @@ __wt_ref_info(WT_SESSION_IMPL *session,
}
/*
- * __wt_page_release --
- * Release a reference to a page.
+ * __wt_page_release_busy --
+ * Release a reference to a page, fail if busy during forced eviction.
*/
static inline int
-__wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
+__wt_page_release_busy(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
{
WT_BTREE *btree;
WT_DECL_RET;
WT_PAGE *page;
- int locked;
+ int locked, too_big;
btree = S2BT(session);
@@ -938,6 +879,8 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
return (0);
page = ref->page;
+ too_big = (page->memory_footprint < btree->maxmempage) ? 0 : 1;
+
/*
* Attempt to evict pages with the special "oldest" read generation.
*
@@ -970,12 +913,19 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
return (ret);
(void)WT_ATOMIC_ADD4(btree->evict_busy, 1);
- if ((ret = __wt_evict_page(session, ref)) == 0)
- WT_STAT_FAST_CONN_INCR(session, cache_eviction_force);
- else {
+ if ((ret = __wt_evict_page(session, ref)) == 0) {
+ if (too_big)
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_force);
+ else
+ /*
+ * If the page isn't too big, we are evicting it
+ * because it had a chain of deleted entries that make
+ * traversal expensive.
+ */
+ WT_STAT_FAST_CONN_INCR(
+ session, cache_eviction_force_delete);
+ } else {
WT_STAT_FAST_CONN_INCR(session, cache_eviction_force_fail);
- if (ret == EBUSY)
- ret = 0;
}
(void)WT_ATOMIC_SUB4(btree->evict_busy, 1);
@@ -983,6 +933,17 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
}
/*
+ * __wt_page_release --
+ * Release a reference to a page.
+ */
+static inline int
+__wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
+{
+ WT_RET_BUSY_OK(__wt_page_release_busy(session, ref, flags));
+ return (0);
+}
+
+/*
* __wt_page_swap_func --
* Swap one page's hazard pointer for another one when hazard pointer
* coupling up/down the tree.
diff --git a/src/include/cache.i b/src/include/cache.i
index b997781272a..ee969255241 100644
--- a/src/include/cache.i
+++ b/src/include/cache.i
@@ -7,6 +7,65 @@
*/
/*
+ * __wt_cache_read_gen --
+ * Get the current read generation number.
+ */
+static inline uint64_t
+__wt_cache_read_gen(WT_SESSION_IMPL *session)
+{
+ return (S2C(session)->cache->read_gen);
+}
+
+/*
+ * __wt_cache_read_gen_incr --
+ * Increment the current read generation number.
+ */
+static inline void
+__wt_cache_read_gen_incr(WT_SESSION_IMPL *session)
+{
+ ++S2C(session)->cache->read_gen;
+}
+
+/*
+ * __wt_cache_read_gen_set --
+ * Get the read generation to store in a page.
+ */
+static inline uint64_t
+__wt_cache_read_gen_set(WT_SESSION_IMPL *session)
+{
+ /*
+ * We return read-generations from the future (where "the future" is
+ * measured by increments of the global read generation). The reason
+ * is because when acquiring a new hazard pointer for a page, we can
+ * check its read generation, and if the read generation isn't less
+ * than the current global generation, we don't bother updating the
+ * page. In other words, the goal is to avoid some number of updates
+ * immediately after each update we have to make.
+ */
+ return (__wt_cache_read_gen(session) + WT_READGEN_STEP);
+}
+
+/*
+ * __wt_cache_pages_inuse --
+ * Return the number of pages in use.
+ */
+static inline uint64_t
+__wt_cache_pages_inuse(WT_CACHE *cache)
+{
+ return (cache->pages_inmem - cache->pages_evict);
+}
+
+/*
+ * __wt_cache_bytes_inuse --
+ * Return the number of bytes in use.
+ */
+static inline uint64_t
+__wt_cache_bytes_inuse(WT_CACHE *cache)
+{
+ return (cache->bytes_inmem - cache->bytes_evict);
+}
+
+/*
* __wt_eviction_check --
* Wake the eviction server if necessary.
*/
diff --git a/src/include/cursor.i b/src/include/cursor.i
index ae6aafdd638..8fa9790e096 100644
--- a/src/include/cursor.i
+++ b/src/include/cursor.i
@@ -176,11 +176,23 @@ static inline int
__cursor_func_init(WT_CURSOR_BTREE *cbt, int reenter)
{
WT_SESSION_IMPL *session;
+ WT_TXN *txn;
session = (WT_SESSION_IMPL *)cbt->iface.session;
+ txn = &session->txn;
if (reenter)
WT_RET(__curfile_leave(cbt));
+
+ /*
+ * If a transaction is running in this thread but has not yet been given
+ * an ID or a snapshot, check whether the cache is full now. If we have
+ * to block for eviction, this is the best time to do it.
+ */
+ if (F_ISSET(txn, TXN_RUNNING) &&
+ !F_ISSET(txn, TXN_HAS_ID) && !F_ISSET(txn, TXN_HAS_SNAPSHOT))
+ WT_RET(__wt_cache_full_check(session));
+
if (!F_ISSET(cbt, WT_CBT_ACTIVE))
WT_RET(__curfile_enter(cbt));
__wt_txn_cursor_op(session);
diff --git a/src/include/stat.h b/src/include/stat.h
index cbd22c7b9d0..6efb9970065 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -164,6 +164,7 @@ struct __wt_connection_stats {
WT_STATS cache_eviction_dirty;
WT_STATS cache_eviction_fail;
WT_STATS cache_eviction_force;
+ WT_STATS cache_eviction_force_delete;
WT_STATS cache_eviction_force_fail;
WT_STATS cache_eviction_hazard;
WT_STATS cache_eviction_internal;
diff --git a/src/include/txn.i b/src/include/txn.i
index 745a8f75a99..656181790ed 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -227,6 +227,16 @@ __wt_txn_id_check(WT_SESSION_IMPL *session)
txn = &session->txn;
WT_ASSERT(session, F_ISSET(txn, TXN_RUNNING));
+
+ /*
+ * If a transaction is running in this thread but has not yet been given
+ * an ID or a snapshot, check whether the cache is full now. If we have
+ * to block for eviction, this is the best time to do it.
+ */
+ if (F_ISSET(txn, TXN_RUNNING) &&
+ !F_ISSET(txn, TXN_HAS_ID) && !F_ISSET(txn, TXN_HAS_SNAPSHOT))
+ WT_RET(__wt_cache_full_check(session));
+
if (!F_ISSET(txn, TXN_HAS_ID)) {
conn = S2C(session);
txn_global = &conn->txn_global;
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index c731c107651..91eb41af4f3 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -3165,206 +3165,208 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1030
/*! cache: pages evicted because they exceeded the in-memory maximum */
#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1031
+/*! cache: pages evicted because they had chains of deleted items */
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1032
/*! cache: failed eviction of pages that exceeded the in-memory maximum */
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1032
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1033
/*! cache: hazard pointer blocked page eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1033
+#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1034
/*! cache: internal pages evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1034
+#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1035
/*! cache: maximum page size at eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1035
+#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1036
/*! cache: eviction server candidate queue empty when topping up */
-#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1036
+#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1037
/*! cache: eviction server candidate queue not empty when topping up */
-#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1037
+#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1038
/*! cache: eviction server evicting pages */
-#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1038
+#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1039
/*! cache: eviction server populating queue, but not evicting pages */
-#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1039
+#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1040
/*! cache: eviction server unable to reach eviction goal */
-#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1040
+#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1041
/*! cache: pages split during eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_SPLIT 1041
+#define WT_STAT_CONN_CACHE_EVICTION_SPLIT 1042
/*! cache: pages walked for eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_WALK 1042
+#define WT_STAT_CONN_CACHE_EVICTION_WALK 1043
/*! cache: in-memory page splits */
-#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1043
+#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1044
/*! cache: tracked dirty pages in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1044
+#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1045
/*! cache: pages currently held in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_INUSE 1045
+#define WT_STAT_CONN_CACHE_PAGES_INUSE 1046
/*! cache: pages read into cache */
-#define WT_STAT_CONN_CACHE_READ 1046
+#define WT_STAT_CONN_CACHE_READ 1047
/*! cache: pages written from cache */
-#define WT_STAT_CONN_CACHE_WRITE 1047
+#define WT_STAT_CONN_CACHE_WRITE 1048
/*! connection: pthread mutex condition wait calls */
-#define WT_STAT_CONN_COND_WAIT 1048
+#define WT_STAT_CONN_COND_WAIT 1049
/*! cursor: cursor create calls */
-#define WT_STAT_CONN_CURSOR_CREATE 1049
+#define WT_STAT_CONN_CURSOR_CREATE 1050
/*! cursor: cursor insert calls */
-#define WT_STAT_CONN_CURSOR_INSERT 1050
+#define WT_STAT_CONN_CURSOR_INSERT 1051
/*! cursor: cursor next calls */
-#define WT_STAT_CONN_CURSOR_NEXT 1051
+#define WT_STAT_CONN_CURSOR_NEXT 1052
/*! cursor: cursor prev calls */
-#define WT_STAT_CONN_CURSOR_PREV 1052
+#define WT_STAT_CONN_CURSOR_PREV 1053
/*! cursor: cursor remove calls */
-#define WT_STAT_CONN_CURSOR_REMOVE 1053
+#define WT_STAT_CONN_CURSOR_REMOVE 1054
/*! cursor: cursor reset calls */
-#define WT_STAT_CONN_CURSOR_RESET 1054
+#define WT_STAT_CONN_CURSOR_RESET 1055
/*! cursor: cursor search calls */
-#define WT_STAT_CONN_CURSOR_SEARCH 1055
+#define WT_STAT_CONN_CURSOR_SEARCH 1056
/*! cursor: cursor search near calls */
-#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1056
+#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1057
/*! cursor: cursor update calls */
-#define WT_STAT_CONN_CURSOR_UPDATE 1057
+#define WT_STAT_CONN_CURSOR_UPDATE 1058
/*! data-handle: connection dhandles swept */
-#define WT_STAT_CONN_DH_CONN_HANDLES 1058
+#define WT_STAT_CONN_DH_CONN_HANDLES 1059
/*! data-handle: connection candidate referenced */
-#define WT_STAT_CONN_DH_CONN_REF 1059
+#define WT_STAT_CONN_DH_CONN_REF 1060
/*! data-handle: connection sweeps */
-#define WT_STAT_CONN_DH_CONN_SWEEPS 1060
+#define WT_STAT_CONN_DH_CONN_SWEEPS 1061
/*! data-handle: connection time-of-death sets */
-#define WT_STAT_CONN_DH_CONN_TOD 1061
+#define WT_STAT_CONN_DH_CONN_TOD 1062
/*! data-handle: session dhandles swept */
-#define WT_STAT_CONN_DH_SESSION_HANDLES 1062
+#define WT_STAT_CONN_DH_SESSION_HANDLES 1063
/*! data-handle: session sweep attempts */
-#define WT_STAT_CONN_DH_SESSION_SWEEPS 1063
+#define WT_STAT_CONN_DH_SESSION_SWEEPS 1064
/*! connection: files currently open */
-#define WT_STAT_CONN_FILE_OPEN 1064
+#define WT_STAT_CONN_FILE_OPEN 1065
/*! log: log buffer size increases */
-#define WT_STAT_CONN_LOG_BUFFER_GROW 1065
+#define WT_STAT_CONN_LOG_BUFFER_GROW 1066
/*! log: total log buffer size */
-#define WT_STAT_CONN_LOG_BUFFER_SIZE 1066
+#define WT_STAT_CONN_LOG_BUFFER_SIZE 1067
/*! log: log bytes of payload data */
-#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1067
+#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1068
/*! log: log bytes written */
-#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1068
+#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1069
/*! log: yields waiting for previous log file close */
-#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1069
+#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1070
/*! log: total size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_LEN 1070
+#define WT_STAT_CONN_LOG_COMPRESS_LEN 1071
/*! log: total in-memory size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_MEM 1071
+#define WT_STAT_CONN_LOG_COMPRESS_MEM 1072
/*! log: log records too small to compress */
-#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1072
+#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1073
/*! log: log records not compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1073
+#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1074
/*! log: log records compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1074
+#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1075
/*! log: maximum log file size */
-#define WT_STAT_CONN_LOG_MAX_FILESIZE 1075
+#define WT_STAT_CONN_LOG_MAX_FILESIZE 1076
/*! log: pre-allocated log files prepared */
-#define WT_STAT_CONN_LOG_PREALLOC_FILES 1076
+#define WT_STAT_CONN_LOG_PREALLOC_FILES 1077
/*! log: number of pre-allocated log files to create */
-#define WT_STAT_CONN_LOG_PREALLOC_MAX 1077
+#define WT_STAT_CONN_LOG_PREALLOC_MAX 1078
/*! log: pre-allocated log files used */
-#define WT_STAT_CONN_LOG_PREALLOC_USED 1078
+#define WT_STAT_CONN_LOG_PREALLOC_USED 1079
/*! log: log read operations */
-#define WT_STAT_CONN_LOG_READS 1079
+#define WT_STAT_CONN_LOG_READS 1080
/*! log: records processed by log scan */
-#define WT_STAT_CONN_LOG_SCAN_RECORDS 1080
+#define WT_STAT_CONN_LOG_SCAN_RECORDS 1081
/*! log: log scan records requiring two reads */
-#define WT_STAT_CONN_LOG_SCAN_REREADS 1081
+#define WT_STAT_CONN_LOG_SCAN_REREADS 1082
/*! log: log scan operations */
-#define WT_STAT_CONN_LOG_SCANS 1082
+#define WT_STAT_CONN_LOG_SCANS 1083
/*! log: consolidated slot closures */
-#define WT_STAT_CONN_LOG_SLOT_CLOSES 1083
+#define WT_STAT_CONN_LOG_SLOT_CLOSES 1084
/*! log: logging bytes consolidated */
-#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1084
+#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1085
/*! log: consolidated slot joins */
-#define WT_STAT_CONN_LOG_SLOT_JOINS 1085
+#define WT_STAT_CONN_LOG_SLOT_JOINS 1086
/*! log: consolidated slot join races */
-#define WT_STAT_CONN_LOG_SLOT_RACES 1086
+#define WT_STAT_CONN_LOG_SLOT_RACES 1087
/*! log: slots selected for switching that were unavailable */
-#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1087
+#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1088
/*! log: record size exceeded maximum */
-#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1088
+#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1089
/*! log: failed to find a slot large enough for record */
-#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1089
+#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1090
/*! log: consolidated slot join transitions */
-#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1090
+#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1091
/*! log: log sync operations */
-#define WT_STAT_CONN_LOG_SYNC 1091
+#define WT_STAT_CONN_LOG_SYNC 1092
/*! log: log write operations */
-#define WT_STAT_CONN_LOG_WRITES 1092
+#define WT_STAT_CONN_LOG_WRITES 1093
/*! LSM: sleep for LSM checkpoint throttle */
-#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1093
+#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1094
/*! LSM: sleep for LSM merge throttle */
-#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1094
+#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1095
/*! LSM: rows merged in an LSM tree */
-#define WT_STAT_CONN_LSM_ROWS_MERGED 1095
+#define WT_STAT_CONN_LSM_ROWS_MERGED 1096
/*! LSM: application work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1096
+#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1097
/*! LSM: merge work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1097
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1098
/*! LSM: tree queue hit maximum */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1098
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1099
/*! LSM: switch work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1099
+#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1100
/*! LSM: tree maintenance operations scheduled */
-#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1100
+#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1101
/*! LSM: tree maintenance operations discarded */
-#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1101
+#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1102
/*! LSM: tree maintenance operations executed */
-#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1102
+#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1103
/*! connection: memory allocations */
-#define WT_STAT_CONN_MEMORY_ALLOCATION 1103
+#define WT_STAT_CONN_MEMORY_ALLOCATION 1104
/*! connection: memory frees */
-#define WT_STAT_CONN_MEMORY_FREE 1104
+#define WT_STAT_CONN_MEMORY_FREE 1105
/*! connection: memory re-allocations */
-#define WT_STAT_CONN_MEMORY_GROW 1105
+#define WT_STAT_CONN_MEMORY_GROW 1106
/*! thread-yield: page acquire busy blocked */
-#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1106
+#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1107
/*! thread-yield: page acquire eviction blocked */
-#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1107
+#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1108
/*! thread-yield: page acquire locked blocked */
-#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1108
+#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1109
/*! thread-yield: page acquire read blocked */
-#define WT_STAT_CONN_PAGE_READ_BLOCKED 1109
+#define WT_STAT_CONN_PAGE_READ_BLOCKED 1110
/*! thread-yield: page acquire time sleeping (usecs) */
-#define WT_STAT_CONN_PAGE_SLEEP 1110
+#define WT_STAT_CONN_PAGE_SLEEP 1111
/*! connection: total read I/Os */
-#define WT_STAT_CONN_READ_IO 1111
+#define WT_STAT_CONN_READ_IO 1112
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_CONN_REC_PAGES 1112
+#define WT_STAT_CONN_REC_PAGES 1113
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_CONN_REC_PAGES_EVICTION 1113
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1114
/*! reconciliation: split bytes currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1114
+#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1115
/*! reconciliation: split objects currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1115
+#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1116
/*! connection: pthread mutex shared lock read-lock calls */
-#define WT_STAT_CONN_RWLOCK_READ 1116
+#define WT_STAT_CONN_RWLOCK_READ 1117
/*! connection: pthread mutex shared lock write-lock calls */
-#define WT_STAT_CONN_RWLOCK_WRITE 1117
+#define WT_STAT_CONN_RWLOCK_WRITE 1118
/*! session: open cursor count */
-#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1118
+#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1119
/*! session: open session count */
-#define WT_STAT_CONN_SESSION_OPEN 1119
+#define WT_STAT_CONN_SESSION_OPEN 1120
/*! transaction: transaction begins */
-#define WT_STAT_CONN_TXN_BEGIN 1120
+#define WT_STAT_CONN_TXN_BEGIN 1121
/*! transaction: transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1121
+#define WT_STAT_CONN_TXN_CHECKPOINT 1122
/*! transaction: transaction checkpoint currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1122
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1123
/*! transaction: transaction checkpoint max time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1123
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1124
/*! transaction: transaction checkpoint min time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1124
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1125
/*! transaction: transaction checkpoint most recent time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1125
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1126
/*! transaction: transaction checkpoint total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1126
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1127
/*! transaction: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1127
+#define WT_STAT_CONN_TXN_COMMIT 1128
/*! transaction: transaction failures due to cache overflow */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1128
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1129
/*! transaction: transaction range of IDs currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_RANGE 1129
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1130
/*! transaction: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1130
+#define WT_STAT_CONN_TXN_ROLLBACK 1131
/*! connection: total write I/Os */
-#define WT_STAT_CONN_WRITE_IO 1131
+#define WT_STAT_CONN_WRITE_IO 1132
/*!
* @}
diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h
index 138b64a6e27..1b3a9b62626 100644
--- a/src/include/wt_internal.h
+++ b/src/include/wt_internal.h
@@ -322,13 +322,13 @@ struct __wt_update;
#include "misc.i"
#include "intpack.i" /* required by cell.i, packing.i */
#include "packing.i"
+#include "cache.i" /* required by txn.i */
#include "cell.i" /* required by btree.i */
#include "mutex.i" /* required by btree.i */
#include "txn.i" /* required by btree.i */
#include "btree.i" /* required by cursor.i */
-#include "cache.i" /* required by cursor.i */
#include "cursor.i"
#include "bitstring.i"
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
index 52bd3e9373d..0d44b16d85c 100644
--- a/src/lsm/lsm_cursor.c
+++ b/src/lsm/lsm_cursor.c
@@ -171,8 +171,6 @@ __clsm_enter(WT_CURSOR_LSM *clsm, int reset, int update)
lsm_tree->nchunks != 0)
goto open;
- WT_RET(__wt_cache_full_check(session));
-
if (clsm->dsk_gen != lsm_tree->dsk_gen &&
lsm_tree->nchunks != 0)
goto open;
diff --git a/src/session/session_api.c b/src/session/session_api.c
index 3ab5e0acab1..8ee143133ae 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -736,13 +736,6 @@ __session_begin_transaction(WT_SESSION *wt_session, const char *config)
if (F_ISSET(&session->txn, TXN_RUNNING))
WT_ERR_MSG(session, EINVAL, "Transaction already running");
- /*
- * There is no transaction active in this thread; check if the cache is
- * full, if we have to block for eviction, this is the best time to do
- * it.
- */
- WT_ERR(__wt_cache_full_check(session));
-
ret = __wt_txn_begin(session, cfg);
err: API_END_RET(session, ret);
diff --git a/src/support/stat.c b/src/support/stat.c
index f4ae082add3..223d62d0559 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -376,6 +376,8 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats)
"cache: pages currently held in the cache";
stats->cache_eviction_force.desc =
"cache: pages evicted because they exceeded the in-memory maximum";
+ stats->cache_eviction_force_delete.desc =
+ "cache: pages evicted because they had chains of deleted items";
stats->cache_eviction_app.desc =
"cache: pages evicted by application threads";
stats->cache_read.desc = "cache: pages read into cache";
@@ -554,6 +556,7 @@ __wt_stat_refresh_connection_stats(void *stats_arg)
stats->cache_eviction_dirty.v = 0;
stats->cache_eviction_deepen.v = 0;
stats->cache_eviction_force.v = 0;
+ stats->cache_eviction_force_delete.v = 0;
stats->cache_eviction_app.v = 0;
stats->cache_read.v = 0;
stats->cache_eviction_fail.v = 0;
diff --git a/src/txn/txn.c b/src/txn/txn.c
index fd80efd5ebd..5b8f11a88a5 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -361,8 +361,15 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
/* If we are logging, write a commit log record. */
if (ret == 0 && txn->mod_count > 0 &&
FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED) &&
- !F_ISSET(session, WT_SESSION_NO_LOGGING))
+ !F_ISSET(session, WT_SESSION_NO_LOGGING)) {
+ /*
+ * We are about to block on I/O writing the log.
+ * Release our snapshot in case it is keeping data pinned.
+ * This is particularly important for checkpoints.
+ */
+ __wt_txn_release_snapshot(session);
ret = __wt_txn_log_commit(session, cfg);
+ }
/*
* If anything went wrong, roll back.
diff --git a/src/txn/txn_log.c b/src/txn/txn_log.c
index f66bd7e09c8..f706efa8a70 100644
--- a/src/txn/txn_log.c
+++ b/src/txn/txn_log.c
@@ -270,6 +270,7 @@ __wt_txn_checkpoint_log(
{
WT_DECL_ITEM(logrec);
WT_DECL_RET;
+ WT_ITEM *ckpt_snapshot, empty;
WT_LSN *ckpt_lsn;
WT_TXN *txn;
uint8_t *end, *p;
@@ -319,19 +320,22 @@ __wt_txn_checkpoint_log(
*/
if (!txn->full_ckpt) {
txn->ckpt_nsnapshot = 0;
+ WT_CLEAR(empty);
+ ckpt_snapshot = &empty;
*ckpt_lsn = S2C(session)->log->alloc_lsn;
- }
+ } else
+ ckpt_snapshot = txn->ckpt_snapshot;
/* Write the checkpoint log record. */
WT_ERR(__wt_struct_size(session, &recsize, fmt,
rectype, ckpt_lsn->file, ckpt_lsn->offset,
- txn->ckpt_nsnapshot, &txn->ckpt_snapshot));
+ txn->ckpt_nsnapshot, ckpt_snapshot));
WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));
WT_ERR(__wt_struct_pack(session,
(uint8_t *)logrec->data + logrec->size, recsize, fmt,
rectype, ckpt_lsn->file, ckpt_lsn->offset,
- txn->ckpt_nsnapshot, &txn->ckpt_snapshot));
+ txn->ckpt_nsnapshot, ckpt_snapshot));
logrec->size += (uint32_t)recsize;
WT_ERR(__wt_log_write(session, logrec, lsnp, 0));