summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorKeith Bostic <keith@wiredtiger.com>2015-01-16 15:23:00 -0500
committerKeith Bostic <keith@wiredtiger.com>2015-01-16 15:23:00 -0500
commit51a92facb691706bee4b6c573e8bda070a62351d (patch)
tree42bb9c6f5e16d4661cdf1c143e88339ecd94dff4 /src
parent24ca383872e0512a3ae54efd9f4f2de29eac0d23 (diff)
parent38b6b25fb7e825b234a17ad1fb9269c5f48cb129 (diff)
downloadmongo-51a92facb691706bee4b6c573e8bda070a62351d.tar.gz
Merge branch 'develop' into cursor-reconfigure
Diffstat (limited to 'src')
-rw-r--r--src/btree/bt_debug.c21
-rw-r--r--src/btree/bt_delete.c10
-rw-r--r--src/btree/bt_page.c15
-rw-r--r--src/btree/bt_split.c305
-rw-r--r--src/btree/bt_walk.c9
-rw-r--r--src/conn/conn_log.c105
-rw-r--r--src/conn/conn_open.c3
-rw-r--r--src/docs/images/wtstats.pngbin0 -> 128334 bytes
-rw-r--r--src/docs/performance.dox3
-rw-r--r--src/docs/spell.ok2
-rw-r--r--src/docs/statistics.dox4
-rw-r--r--src/docs/wtstats.dox47
-rw-r--r--src/evict/evict_lru.c13
-rw-r--r--src/include/btmem.h5
-rw-r--r--src/include/btree.i97
-rw-r--r--src/include/cache.i59
-rw-r--r--src/include/connection.h12
-rw-r--r--src/include/cursor.i12
-rw-r--r--src/include/flags.h19
-rw-r--r--src/include/stat.h1
-rw-r--r--src/include/txn.i10
-rw-r--r--src/include/wiredtiger.in202
-rw-r--r--src/include/wt_internal.h2
-rw-r--r--src/log/log.c37
-rw-r--r--src/lsm/lsm_cursor.c2
-rw-r--r--src/session/session_api.c7
-rw-r--r--src/support/stat.c3
-rw-r--r--src/txn/txn.c9
-rw-r--r--src/txn/txn_log.c14
29 files changed, 673 insertions, 355 deletions
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index 4de94277364..af9f6a669f2 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -408,11 +408,13 @@ __debug_tree_shape_info(WT_PAGE *page)
v = page->memory_footprint;
if (v >= WT_GIGABYTE)
- snprintf(buf, sizeof(buf), "(%" PRIu64 "G)", v / WT_GIGABYTE);
+ snprintf(buf, sizeof(buf),
+ "(%p %" PRIu64 "G)", page, v / WT_GIGABYTE);
else if (v >= WT_MEGABYTE)
- snprintf(buf, sizeof(buf), "(%" PRIu64 "M)", v / WT_MEGABYTE);
+ snprintf(buf, sizeof(buf),
+ "(%p %" PRIu64 "M)", page, v / WT_MEGABYTE);
else
- snprintf(buf, sizeof(buf), "(%" PRIu64 ")", v);
+ snprintf(buf, sizeof(buf), "(%p %" PRIu64 ")", page, v);
return (buf);
}
@@ -429,16 +431,16 @@ __debug_tree_shape_worker(WT_DBG *ds, WT_PAGE *page, int level)
session = ds->session;
if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT) {
- __dmsg(ds, "%*s" "I" "%s\n",
- level, " ", __debug_tree_shape_info(page));
+ __dmsg(ds, "%*s" "I" "%d %s\n",
+ level * 3, " ", level, __debug_tree_shape_info(page));
WT_INTL_FOREACH_BEGIN(session, page, ref) {
if (ref->state == WT_REF_MEM)
__debug_tree_shape_worker(
- ds, ref->page, level + 3);
+ ds, ref->page, level + 1);
} WT_INTL_FOREACH_END;
} else
- __dmsg(ds, "%*s" "L" "%s\n",
- level, " ", __debug_tree_shape_info(page));
+ __dmsg(ds, "%*s" "L" " %s\n",
+ level * 3, " ", __debug_tree_shape_info(page));
}
/*
@@ -458,8 +460,7 @@ __wt_debug_tree_shape(
if (page == NULL)
page = S2BT(session)->root.page;
- WT_WITH_PAGE_INDEX(session,
- __debug_tree_shape_worker(ds, page, 0));
+ WT_WITH_PAGE_INDEX(session, __debug_tree_shape_worker(ds, page, 1));
__dmsg_wrapup(ds);
return (0);
diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c
index c97ea176c97..622dfb1b294 100644
--- a/src/btree/bt_delete.c
+++ b/src/btree/bt_delete.c
@@ -207,6 +207,9 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
{
int skip;
+ if (ref->state != WT_REF_DELETED)
+ return (0);
+
/*
* Deleted pages come from two sources: either it's a fast-delete as
* described above, or the page has been emptied by other operations
@@ -225,11 +228,14 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
* the page could switch to an in-memory state at any time. Lock down
* the structure, just to be safe.
*/
+ if (ref->page_del == NULL)
+ return (1);
+
if (!WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED))
return (0);
- skip = ref->page_del == NULL ||
- __wt_txn_visible(session, ref->page_del->txnid) ? 1 : 0;
+ skip = (ref->page_del == NULL ||
+ __wt_txn_visible(session, ref->page_del->txnid));
WT_PUBLISH(ref->state, WT_REF_DELETED);
return (skip);
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index 181ffdb3736..561e1c19218 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -37,8 +37,11 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page)
page->type != WT_PAGE_ROW_LEAF)
return (0);
- /* Eviction may be turned off, although that's rare. */
- if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ /*
+ * Eviction may be turned off (although that's rare), or we may be in
+ * the middle of a checkpoint.
+ */
+ if (F_ISSET(btree, WT_BTREE_NO_EVICTION) || btree->checkpointing)
return (0);
/*
@@ -128,7 +131,13 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
force_attempts < 10 &&
__evict_force_check(session, page)) {
++force_attempts;
- WT_RET(__wt_page_release(session, ref, flags));
+ if ((ret = __wt_page_release_busy(
+ session, ref, flags)) == EBUSY) {
+ /* If forced eviction fails, stall. */
+ ret = 0;
+ wait_cnt += 1000;
+ } else
+ WT_RET(ret);
WT_STAT_FAST_CONN_INCR(
session, page_forcible_evict_blocked);
break;
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 911a38e4be6..69dbfb42354 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -9,15 +9,6 @@
#include "wt_internal.h"
/*
- * Tuning; global variables to allow the binary to be patched, we don't yet have
- * any real understanding of what might be useful to surface to applications.
- */
-static u_int __split_deepen_max_internal_image = 100;
-static u_int __split_deepen_min_child = 10;
-static u_int __split_deepen_per_child = 100;
-static u_int __split_deepen_split_child = 100;
-
-/*
* Track allocation increments, matching the cache calculations, which add an
* estimate of allocation overhead to every object.
*/
@@ -177,45 +168,57 @@ __split_safe_free(WT_SESSION_IMPL *session, int exclusive, void *p, size_t s)
}
/*
+ * Tuning; global variables to allow the binary to be patched, we don't yet have
+ * any real understanding of what might be useful to surface to applications.
+ */
+static u_int __split_deepen_min_child = 10000;
+static u_int __split_deepen_per_child = 100;
+
+/*
* __split_should_deepen --
* Return if we should deepen the tree.
*/
static int
-__split_should_deepen(WT_SESSION_IMPL *session, WT_PAGE *page)
+__split_should_deepen(
+ WT_SESSION_IMPL *session, WT_REF *ref, uint32_t *childrenp)
{
WT_PAGE_INDEX *pindex;
+ WT_PAGE *page;
- /*
- * Splits are based on either the number of child pages that will be
- * created by the split (splitting an internal page that will be slow
- * to search), or by the memory footprint of the parent page (avoiding
- * an internal page that will eat up all of the cache and put eviction
- * pressure on the system).
- */
+ *childrenp = 0;
+
+ page = ref->page;
pindex = WT_INTL_INDEX_COPY(page);
/*
* Deepen the tree if the page's memory footprint is larger than the
- * maximum size for a page in memory. We need an absolute minimum
- * number of entries in order to split the page: if there is a single
- * huge key, splitting won't help.
+ * maximum size for a page in memory (presumably putting eviction
+ * pressure on the cache).
*/
- if (page->memory_footprint > S2BT(session)->maxmempage &&
- pindex->entries >= __split_deepen_min_child)
- return (1);
+ if (page->memory_footprint < S2BT(session)->maxmempage)
+ return (0);
/*
- * Deepen the tree if the page's memory footprint is at least N
- * times the maximum internal page size chunk in the backing file and
- * the split will result in at least N children in the newly created
- * intermediate layer.
+ * Ensure the page has enough entries to make it worth splitting and
+ * we get a significant payback (in the case of a set of large keys,
+ * splitting won't help).
*/
- if (page->memory_footprint >
- __split_deepen_max_internal_image * S2BT(session)->maxintlpage &&
- pindex->entries >=
- (__split_deepen_per_child * __split_deepen_split_child))
+ if (pindex->entries > __split_deepen_min_child) {
+ *childrenp = pindex->entries / __split_deepen_per_child;
return (1);
+ }
+ /*
+ * The root is a special-case: if it's putting cache pressure on the
+ * system, split it even if there are only a few entries, we can't
+ * push it out of memory. Sanity check: if the root page is too big
+ * with less than 100 keys, there are huge keys and/or a too-small
+ * cache, there's not much to do.
+ */
+ if (__wt_ref_is_root(ref) && pindex->entries > 100) {
+ *childrenp = pindex->entries / 10;
+ return (1);
+ }
return (0);
}
@@ -254,12 +257,13 @@ __split_ovfl_key_cleanup(WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref)
}
/*
- * __split_ref_instantiate --
- * Instantiate key/address pairs in memory in service of a split.
+ * __split_ref_deepen_move --
+ * Move a WT_REF from a parent to a child in service of a split to deepen
+ * the tree, including updating the accounting information.
*/
static int
-__split_ref_instantiate(WT_SESSION_IMPL *session,
- WT_PAGE *page, WT_REF *ref, size_t *parent_decrp, size_t *child_incrp)
+__split_ref_deepen_move(WT_SESSION_IMPL *session,
+ WT_PAGE *parent, WT_REF *ref, size_t *parent_decrp, size_t *child_incrp)
{
WT_ADDR *addr;
WT_CELL_UNPACK unpack;
@@ -276,8 +280,6 @@ __split_ref_instantiate(WT_SESSION_IMPL *session,
* of child pages, and so we can no longer reference the block image
* that remains with the page being split.
*
- * Track how much memory the parent is losing and the child gaining.
- *
* No locking is required to update the WT_REF structure because we're
* the only thread splitting the parent page, and there's no way for
* readers to race with our updates of single pointers. The changes
@@ -286,13 +288,13 @@ __split_ref_instantiate(WT_SESSION_IMPL *session,
*
* Row-store keys, first.
*/
- if (page->type == WT_PAGE_ROW_INT) {
+ if (parent->type == WT_PAGE_ROW_INT) {
if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) {
- __wt_ref_key(page, ref, &key, &size);
+ __wt_ref_key(parent, ref, &key, &size);
WT_RET(__wt_row_ikey(session, 0, key, size, &ikey));
ref->key.ikey = ikey;
} else {
- WT_RET(__split_ovfl_key_cleanup(session, page, ref));
+ WT_RET(__split_ovfl_key_cleanup(session, parent, ref));
WT_MEMSIZE_ADD(*parent_decrp,
sizeof(WT_IKEY) + ikey->size);
}
@@ -304,12 +306,8 @@ __split_ref_instantiate(WT_SESSION_IMPL *session,
* address has been instantiated, there's no work to do. Otherwise,
* get the address from the on-page cell.
*/
- if ((addr = ref->addr) == NULL)
- return (0);
- if (__wt_off_page(page, addr))
- WT_MEMSIZE_TRANSFER(*parent_decrp, *child_incrp,
- sizeof(WT_ADDR) + addr->size);
- else {
+ addr = ref->addr;
+ if (addr != NULL && !__wt_off_page(parent, addr)) {
__wt_cell_unpack((WT_CELL *)ref->addr, &unpack);
WT_RET(__wt_calloc_one(session, &addr));
if ((ret = __wt_strndup(
@@ -321,8 +319,11 @@ __split_ref_instantiate(WT_SESSION_IMPL *session,
addr->type =
unpack.raw == WT_CELL_ADDR_INT ? WT_ADDR_INT : WT_ADDR_LEAF;
ref->addr = addr;
- WT_MEMSIZE_ADD(*child_incrp, sizeof(WT_ADDR) + addr->size);
}
+
+ /* And finally, the WT_REF itself. */
+ WT_MEMSIZE_TRANSFER(*parent_decrp, *child_incrp, sizeof(WT_REF));
+
return (0);
}
@@ -383,7 +384,7 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
* Split an internal page in-memory, deepening the tree.
*/
static int
-__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
+__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children)
{
WT_DECL_RET;
WT_PAGE *child;
@@ -391,7 +392,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
WT_REF **alloc_refp;
WT_REF *child_ref, **child_refp, *parent_ref, **parent_refp, *ref;
size_t child_incr, parent_decr, parent_incr, size;
- uint32_t children, chunk, i, j, remain, slots;
+ uint32_t chunk, i, j, remain, slots;
int panic;
void *p;
@@ -401,13 +402,6 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
pindex = WT_INTL_INDEX_COPY(parent);
- /*
- * Create N children, unless we are dealing with a large page without
- * many entries, in which case split into the minimum number of pages.
- */
- children = WT_MAX(pindex->entries / __split_deepen_per_child,
- __split_deepen_min_child);
-
WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen);
WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
"%p: %" PRIu32 " elements, splitting into %" PRIu32 " children",
@@ -506,12 +500,9 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
child_incr = 0;
child_pindex = WT_INTL_INDEX_COPY(child);
for (child_refp = child_pindex->index, j = 0; j < slots; ++j) {
- WT_ERR(__split_ref_instantiate(session,
+ WT_ERR(__split_ref_deepen_move(session,
parent, *parent_refp, &parent_decr, &child_incr));
*child_refp++ = *parent_refp++;
-
- WT_MEMSIZE_TRANSFER(
- parent_decr, child_incr, sizeof(WT_REF));
}
__wt_cache_page_inmem_incr(session, child, child_incr);
}
@@ -604,9 +595,10 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
* be using the new index.
*/
size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
- WT_MEMSIZE_ADD(parent_decr, size);
WT_ERR(__split_safe_free(session, 0, pindex, size));
+ WT_MEMSIZE_ADD(parent_decr, size);
+#if 0
/*
* Adjust the parent's memory footprint. This may look odd, but we
* have already taken the allocation overhead into account, and an
@@ -615,6 +607,19 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
*/
__wt_cache_page_inmem_incr(session, parent, parent_incr);
__wt_cache_page_inmem_decr(session, parent, parent_decr);
+#else
+ /*
+ * XXX
+ * The code to track page sizes is fundamentally flawed in the face of
+ * splits: for example, we don't add in an overhead allocation constant
+ * when allocating WT_REF structures as pages are created, but the
+ * calculations during split assume that correction. For now, ignore
+ * our carefully calculated values and force the internal page size to
+ * 5% of its current value.
+ */
+ size = parent->memory_footprint - (parent->memory_footprint / 20);
+ __wt_cache_page_inmem_decr(session, parent, size);
+#endif
if (0) {
err: __wt_free_ref_index(session, parent, alloc_index, 1);
@@ -770,13 +775,11 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
* the confusion.
*/
WT_RET(__wt_calloc_one(session, &addr));
- WT_MEMSIZE_ADD(incr, sizeof(WT_ADDR));
ref->addr = addr;
addr->size = multi->addr.size;
addr->type = multi->addr.type;
WT_RET(__wt_strndup(session,
multi->addr.addr, addr->size, &addr->addr));
- WT_MEMSIZE_ADD(incr, addr->size);
} else
WT_RET(__split_multi_inmem(session, page, ref, multi));
@@ -814,17 +817,20 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
int exclusive, int ref_discard)
{
WT_DECL_RET;
+ WT_IKEY *ikey;
WT_PAGE *parent;
WT_PAGE_INDEX *alloc_index, *pindex;
- WT_REF **alloc_refp, *parent_ref;
+ WT_REF **alloc_refp, *next_ref, *parent_ref;
size_t size;
- uint32_t i, j, parent_entries, result_entries;
+ uint32_t children, i, j;
+ uint32_t deleted_entries, parent_entries, result_entries;
int complete, hazard, locked;
parent = NULL; /* -Wconditional-uninitialized */
- alloc_index = NULL;
+ alloc_index = pindex = NULL;
parent_ref = NULL;
complete = hazard = locked = 0;
+ parent_entries = 0;
/*
* Get a page-level lock on the parent to single-thread splits into the
@@ -865,7 +871,29 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
pindex = WT_INTL_INDEX_COPY(parent);
parent_entries = pindex->entries;
- result_entries = (parent_entries - 1) + new_entries;
+
+ /*
+ * Remove any refs to deleted pages while we are splitting, we have
+ * the internal page locked down, and are copying the refs into a new
+ * array anyway. Switch them to the special split state, so that any
+ * reading thread will restart.
+ */
+ for (i = 0, deleted_entries = 0; i < parent_entries; ++i) {
+ next_ref = pindex->index[i];
+ WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
+ if (next_ref->state == WT_REF_DELETED &&
+ next_ref->page_del == NULL &&
+ WT_ATOMIC_CAS4(next_ref->state,
+ WT_REF_DELETED, WT_REF_SPLIT))
+ deleted_entries++;
+ }
+
+ /*
+ * The final entry count consists of: The original count, plus any
+ * new pages, less any refs we are removing because they only
+ * contained deleted items, less 1 for the page being replaced.
+ */
+ result_entries = (parent_entries + new_entries) - (deleted_entries + 1);
/*
* Allocate and initialize a new page index array for the parent, then
@@ -877,8 +905,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
WT_MEMSIZE_ADD(parent_incr, size);
alloc_index->index = (WT_REF **)(alloc_index + 1);
alloc_index->entries = result_entries;
- for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i)
- if (pindex->index[i] == ref)
+ for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) {
+ next_ref = pindex->index[i];
+ if (next_ref == ref)
for (j = 0; j < new_entries; ++j) {
ref_new[j]->home = parent;
*alloc_refp++ = ref_new[j];
@@ -890,8 +919,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
*/
ref_new[j] = NULL;
}
- else
- *alloc_refp++ = pindex->index[i];
+ else if (next_ref->state != WT_REF_SPLIT)
+ /* Skip refs we have marked for deletion. */
+ *alloc_refp++ = next_ref;
+ }
/*
* Update the parent page's index: this update makes the split visible
@@ -926,6 +957,36 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
complete = 1;
/*
+ * Now that the new page is in place it's OK to free any deleted
+ * refs we encountered modulo the regular safe free semantics.
+ */
+ for (i = 0; i < parent_entries; ++i) {
+ next_ref = pindex->index[i];
+ /* If we set the ref to split to mark it for delete */
+ if (next_ref != ref && next_ref->state == WT_REF_SPLIT) {
+ /*
+ * We're discarding a deleted reference.
+ * Free any resources it holds.
+ */
+ if (parent->type == WT_PAGE_ROW_INT) {
+ WT_TRET(__split_ovfl_key_cleanup(
+ session, parent, next_ref));
+ ikey = __wt_ref_key_instantiated(next_ref);
+ if (ikey != NULL) {
+ size = sizeof(WT_IKEY) + ikey->size;
+ WT_TRET(__split_safe_free(
+ session, 0, ikey, size));
+ WT_MEMSIZE_ADD(parent_decr, size);
+ }
+ }
+
+ WT_TRET(__split_safe_free(
+ session, 0, next_ref, sizeof(WT_REF)));
+ WT_MEMSIZE_ADD(parent_decr, sizeof(WT_REF));
+ }
+ }
+
+ /*
* We can't free the previous page index, there may be threads using it.
* Add it to the session discard list, to be freed when it's safe.
*/
@@ -978,11 +1039,30 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
* Do the check here because we've just grown the parent page and
* are holding it locked.
*/
- if (ret == 0 && !exclusive && __split_should_deepen(session, parent))
+ if (ret == 0 && !exclusive &&
+ !F_ISSET_ATOMIC(parent, WT_PAGE_REFUSE_DEEPEN) &&
+ __split_should_deepen(session, parent_ref, &children)) {
+ /*
+ * XXX
+ * Temporary hack to avoid a bug where the root page is split
+ * even when it's no longer doing any good.
+ */
+ uint64_t __a, __b;
+ __a = parent->memory_footprint;
WT_WITH_PAGE_INDEX(session,
- ret = __split_deepen(session, parent));
+ ret = __split_deepen(session, parent, children));
+ __b = parent->memory_footprint;
+ if (__b * 2 >= __a)
+ F_SET_ATOMIC(parent, WT_PAGE_REFUSE_DEEPEN);
+ }
-err: if (locked)
+err: if (!complete)
+ for (i = 0; i < parent_entries; ++i) {
+ next_ref = pindex->index[i];
+ if (next_ref->state == WT_REF_SPLIT)
+ next_ref->state = WT_REF_DELETED;
+ }
+ if (locked)
F_CLR_ATOMIC(parent, WT_PAGE_SPLITTING);
if (hazard)
@@ -1018,15 +1098,16 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp)
WT_PAGE *page, *right;
WT_REF *child, *split_ref[2] = { NULL, NULL };
WT_UPDATE *upd;
- size_t page_decr, parent_incr, right_incr, size;
+ size_t page_decr, parent_decr, parent_incr, right_incr;
int i;
*splitp = 0;
btree = S2BT(session);
page = ref->page;
+ ikey = NULL;
right = NULL;
- page_decr = parent_incr = right_incr = 0;
+ page_decr = parent_decr = parent_incr = right_incr = 0;
/*
* Check for pages with append-only workloads. A common application
@@ -1127,9 +1208,19 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp)
WT_ERR(__wt_row_ikey(session, 0,
WT_INSERT_KEY(moved_ins), WT_INSERT_KEY_SIZE(moved_ins),
&child->key.ikey));
+
+ /*
+ * We're swapping WT_REFs in the parent, adjust the accounting, and
+ * row store pages may have instantiated keys.
+ */
WT_MEMSIZE_ADD(parent_incr, sizeof(WT_REF));
- WT_MEMSIZE_ADD(parent_incr, sizeof(WT_IKEY));
- WT_MEMSIZE_ADD(parent_incr, WT_INSERT_KEY_SIZE(moved_ins));
+ WT_MEMSIZE_ADD(
+ parent_incr, sizeof(WT_IKEY) + WT_INSERT_KEY_SIZE(moved_ins));
+ WT_MEMSIZE_ADD(parent_decr, sizeof(WT_REF));
+ if (page->type == WT_PAGE_ROW_LEAF || page->type == WT_PAGE_ROW_INT)
+ if ((ikey = __wt_ref_key_instantiated(ref)) != NULL)
+ WT_MEMSIZE_ADD(
+ parent_decr, sizeof(WT_IKEY) + ikey->size);
/* The new page is dirty by definition. */
WT_ERR(__wt_page_modify_init(session, right));
@@ -1151,14 +1242,11 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp)
*/
for (i = 0; i < WT_SKIP_MAXDEPTH && ins_head->tail[i] == moved_ins; ++i)
;
- size = ((size_t)i - 1) * sizeof(WT_INSERT *);
- size += sizeof(WT_INSERT) + WT_INSERT_KEY_SIZE(moved_ins);
+ WT_MEMSIZE_TRANSFER(page_decr, right_incr, sizeof(WT_INSERT) +
+ (size_t)i * sizeof(WT_INSERT *) + WT_INSERT_KEY_SIZE(moved_ins));
for (upd = moved_ins->upd; upd != NULL; upd = upd->next)
- size += sizeof(WT_UPDATE) + upd->size;
- WT_MEMSIZE_ADD(right_incr, size);
- WT_MEMSIZE_ADD(page_decr, size);
- __wt_cache_page_inmem_decr(session, page, page_decr);
- __wt_cache_page_inmem_incr(session, right, right_incr);
+ WT_MEMSIZE_TRANSFER(
+ page_decr, right_incr, sizeof(WT_UPDATE) + upd->size);
/*
* Allocation operations completed, move the last insert list item from
@@ -1245,10 +1333,23 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp)
#endif
/*
- * Split into the parent.
+ * Save the transaction ID when the split happened. Application
+ * threads will not try to forcibly evict the page again until
+ * all concurrent transactions commit.
+ */
+ page->modify->inmem_split_txn = __wt_txn_new_id(session);
+
+ /* Update the page accounting. */
+ __wt_cache_page_inmem_decr(session, page, page_decr);
+ __wt_cache_page_inmem_incr(session, right, right_incr);
+
+ /*
+ * Split into the parent. After this, the original page is no
+ * longer locked, so we cannot safely look at it.
*/
+ page = NULL;
if ((ret = __split_parent(
- session, ref, split_ref, 2, 0, parent_incr, 0, 0)) != 0) {
+ session, ref, split_ref, 2, parent_decr, parent_incr, 0, 0)) != 0) {
/*
* Move the insert list element back to the original page list.
* For simplicity, the previous skip list pointers originally
@@ -1271,13 +1372,6 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp)
WT_ERR(ret);
}
- /*
- * Save the transaction ID when the split happened. Application
- * threads will not try to forcibly evict the page again until
- * all concurrent transactions commit.
- */
- page->modify->inmem_split_txn = __wt_txn_new_id(session);
-
/* Let our caller know that we split. */
*splitp = 1;
@@ -1289,13 +1383,8 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp)
* structure and instantiated key, there may be threads using them.
* Add them to the session discard list, to be freed once we know it's
* safe.
- *
- * After the split, we're going to discard the WT_REF, account for the
- * change in memory footprint. Row store pages have keys that may be
- * instantiated, check for that.
*/
- if ((page->type == WT_PAGE_ROW_LEAF || page->type == WT_PAGE_ROW_INT) &&
- (ikey = __wt_ref_key_instantiated(ref)) != NULL)
+ if (ikey != NULL)
WT_TRET(__split_safe_free(
session, 0, ikey, sizeof(WT_IKEY) + ikey->size));
WT_TRET(__split_safe_free(session, 0, ref, sizeof(WT_REF)));
@@ -1380,7 +1469,7 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
WT_REF **ref_new;
- size_t ikey_size, parent_decr, parent_incr;
+ size_t parent_decr, parent_incr;
uint32_t i, new_entries;
page = ref->page;
@@ -1388,7 +1477,7 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
new_entries = mod->mod_multi_entries;
ikey = NULL;
- ikey_size = parent_decr = parent_incr = 0;
+ parent_decr = parent_incr = 0;
/*
* Convert the split page's multiblock reconciliation information into
@@ -1404,12 +1493,11 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
* change in memory footprint. Row store pages have keys that may be
* instantiated, check for that.
*/
- if ((page->type == WT_PAGE_ROW_LEAF || page->type == WT_PAGE_ROW_INT) &&
- (ikey = __wt_ref_key_instantiated(ref)) != NULL) {
- ikey_size = sizeof(WT_IKEY) + ikey->size;
- WT_MEMSIZE_ADD(parent_decr, ikey_size);
- }
WT_MEMSIZE_ADD(parent_decr, sizeof(WT_REF));
+ if (page->type == WT_PAGE_ROW_LEAF || page->type == WT_PAGE_ROW_INT)
+ if ((ikey = __wt_ref_key_instantiated(ref)) != NULL)
+ WT_MEMSIZE_ADD(
+ parent_decr, sizeof(WT_IKEY) + ikey->size);
/* Split into the parent. */
WT_ERR(__split_parent(session,
@@ -1436,7 +1524,8 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
* safe.
*/
if (ikey != NULL)
- WT_TRET(__split_safe_free(session, exclusive, ikey, ikey_size));
+ WT_TRET(__split_safe_free(
+ session, exclusive, ikey, sizeof(WT_IKEY) + ikey->size));
WT_TRET(__split_safe_free(session, exclusive, ref, sizeof(WT_REF)));
/*
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
index c74a7177401..a2b2a6bb7c8 100644
--- a/src/btree/bt_walk.c
+++ b/src/btree/bt_walk.c
@@ -208,6 +208,12 @@ restart: /*
break;
} else if (LF_ISSET(WT_READ_TRUNCATE)) {
/*
+ * Avoid pulling a deleted page back in to try
+ * to delete it again.
+ */
+ if (__wt_delete_page_skip(session, ref))
+ break;
+ /*
* If deleting a range, try to delete the page
* without instantiating it.
*/
@@ -242,8 +248,7 @@ restart: /*
* If iterating a cursor, try to skip deleted
* pages that are visible to us.
*/
- if (ref->state == WT_REF_DELETED &&
- __wt_delete_page_skip(session, ref))
+ if (__wt_delete_page_skip(session, ref))
break;
}
diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c
index 2799a58f327..796b7d5147b 100644
--- a/src/conn/conn_log.c
+++ b/src/conn/conn_log.c
@@ -126,11 +126,13 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file)
/*
* If we're coming from a backup cursor we want the smaller of
* the last full log file copied in backup or the checkpoint LSN.
+ * Otherwise we want the minimum of the last log file written to
+ * disk and the checkpoint LSN.
*/
if (backup_file != 0)
min_lognum = WT_MIN(log->ckpt_lsn.file, backup_file);
else
- min_lognum = log->ckpt_lsn.file;
+ min_lognum = WT_MIN(log->ckpt_lsn.file, log->sync_lsn.file);
WT_RET(__wt_verbose(session, WT_VERB_LOG,
"log_archive: archive to log number %" PRIu32, min_lognum));
@@ -276,6 +278,70 @@ err:
}
/*
+ * __log_close_server --
+ * The log close server thread.
+ */
+static void *
+__log_close_server(void *arg)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *close_fh;
+ WT_LOG *log;
+ WT_LSN close_end_lsn, close_lsn;
+ WT_SESSION_IMPL *session;
+ int locked;
+
+ session = arg;
+ conn = S2C(session);
+ log = conn->log;
+ locked = 0;
+ while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
+ /*
+ * If there is a log file to close, fsync and close it.
+ */
+ if ((close_fh = log->log_close_fh) != NULL) {
+ /*
+ * We've copied the file handle, clear out the one in
+ * log structure to allow it to be set again.
+ */
+ log->log_close_fh = NULL;
+ /*
+ * Set the close_end_lsn to the LSN immediately after
+ * ours. That is, the beginning of the next log file.
+ * We need to know the LSN file number of our own close
+ * in case earlier calls are still in progress and the
+ * next one to move the sync_lsn into the next file for
+ * later syncs.
+ */
+ WT_ERR(__wt_log_extract_lognum(session, close_fh->name,
+ &close_lsn.file));
+ close_lsn.offset = 0;
+ close_end_lsn = close_lsn;
+ close_end_lsn.file++;
+ WT_ERR(__wt_fsync(session, close_fh));
+ __wt_spin_lock(session, &log->log_sync_lock);
+ locked = 1;
+ WT_ERR(__wt_close(session, close_fh));
+ log->sync_lsn = close_end_lsn;
+ WT_ERR(__wt_cond_signal(session, log->log_sync_cond));
+ locked = 0;
+ __wt_spin_unlock(session, &log->log_sync_lock);
+ } else
+ /* Wait until the next event. */
+ WT_ERR(__wt_cond_wait(session,
+ conn->log_close_cond, 10000));
+ }
+
+ if (0) {
+err: __wt_err(session, ret, "log close server error");
+ }
+ if (locked)
+ __wt_spin_unlock(session, &log->log_sync_lock);
+ return (NULL);
+}
+
+/*
* __log_server --
* The log server thread.
*/
@@ -292,7 +358,7 @@ __log_server(void *arg)
conn = S2C(session);
log = conn->log;
locked = 0;
- while (F_ISSET(conn, WT_CONN_SERVER_RUN)) {
+ while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
/*
* Perform log pre-allocation.
*/
@@ -320,7 +386,7 @@ __log_server(void *arg)
}
if (0) {
-err: __wt_err(session, ret, "log archive server error");
+err: __wt_err(session, ret, "log server error");
}
if (locked)
(void)__wt_writeunlock(session, log->log_archive_lock);
@@ -384,7 +450,7 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[])
/*
* __wt_logmgr_open --
- * Start the log subsystem and archive server thread.
+ * Start the log service threads.
*/
int
__wt_logmgr_open(WT_SESSION_IMPL *session)
@@ -394,14 +460,33 @@ __wt_logmgr_open(WT_SESSION_IMPL *session)
conn = S2C(session);
/* If no log thread services are configured, we're done. */
- if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) ||
- !FLD_ISSET(conn->log_flags,
+ if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
+ return (0);
+
+ /*
+ * Start the log close thread. It is not configurable.
+ * If logging is enabled, this thread runs.
+ */
+ WT_RET(__wt_open_internal_session(
+ conn, "log-close-server", 0, 0, &conn->log_close_session));
+ WT_RET(__wt_cond_alloc(conn->log_close_session,
+ "log close server", 0, &conn->log_close_cond));
+
+ /*
+ * Start the thread.
+ */
+ WT_RET(__wt_thread_create(conn->log_close_session,
+ &conn->log_close_tid, __log_close_server, conn->log_close_session));
+ conn->log_close_tid_set = 1;
+
+ /* If no log thread services are configured, we're done. */
+ if (!FLD_ISSET(conn->log_flags,
(WT_CONN_LOG_ARCHIVE | WT_CONN_LOG_PREALLOC)))
return (0);
/*
* If a log server thread exists, the user may have reconfigured
- * archiving ore pre-allocation. Signal the thread. Otherwise the
+ * archiving or pre-allocation. Signal the thread. Otherwise the
* user wants archiving and/or allocation and we need to start up
* the thread.
*/
@@ -455,6 +540,12 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
conn->log_tid_set = 0;
}
WT_TRET(__wt_cond_destroy(session, &conn->log_cond));
+ if (conn->log_close_tid_set) {
+ WT_TRET(__wt_cond_signal(session, conn->log_close_cond));
+ WT_TRET(__wt_thread_join(session, conn->log_close_tid));
+ conn->log_close_tid_set = 0;
+ }
+ WT_TRET(__wt_cond_destroy(session, &conn->log_close_cond));
WT_TRET(__wt_log_close(session));
diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c
index b425376d6ae..ab873cc36a9 100644
--- a/src/conn/conn_open.c
+++ b/src/conn/conn_open.c
@@ -25,7 +25,7 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[])
* Tell internal server threads to run: this must be set before opening
* any sessions.
*/
- F_SET(conn, WT_CONN_SERVER_RUN);
+ F_SET(conn, WT_CONN_SERVER_RUN | WT_CONN_LOG_SERVER_RUN);
/* WT_SESSION_IMPL array. */
WT_RET(__wt_calloc(session,
@@ -130,6 +130,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
WT_TRET(__wt_txn_checkpoint_log(
session, 1, WT_TXN_LOG_CKPT_STOP, NULL));
+ F_CLR(conn, WT_CONN_LOG_SERVER_RUN);
WT_TRET(__wt_logmgr_destroy(session));
/* Free memory for collators, compressors, data sources. */
diff --git a/src/docs/images/wtstats.png b/src/docs/images/wtstats.png
new file mode 100644
index 00000000000..f65a2871b6f
--- /dev/null
+++ b/src/docs/images/wtstats.png
Binary files differ
diff --git a/src/docs/performance.dox b/src/docs/performance.dox
index 5b9d6c40e7b..2284e1e1d4f 100644
--- a/src/docs/performance.dox
+++ b/src/docs/performance.dox
@@ -24,4 +24,7 @@ investigate performance and tune their WiredTiger applications.
<h2>Simulating workloads</h2>
- @subpage wtperf
+<h2>Visualizing performance</h2>
+- @subpage wtstats
+
*/
diff --git a/src/docs/spell.ok b/src/docs/spell.ok
index 2fd7e5f0ad2..56d1aa1170f 100644
--- a/src/docs/spell.ok
+++ b/src/docs/spell.ok
@@ -76,6 +76,7 @@ WiredTigerException
WiredTigerLog
WiredTigerPanicException
WiredTigerRollbackException
+WiredTigerStat
WiredTigerTestCase
Za
aR
@@ -442,6 +443,7 @@ writelock
writelocks
wrlock
wtperf
+wtstats
xa
yieldcpu
zlib
diff --git a/src/docs/statistics.dox b/src/docs/statistics.dox
index 067cf342111..7fdc4125254 100644
--- a/src/docs/statistics.dox
+++ b/src/docs/statistics.dox
@@ -153,4 +153,8 @@ A Python script that parses the default logging output and uses the
Portable Network Graphics (PNG) format graphs is included in the
WiredTiger distribution in the file \c tools/statlog.py.
+@m_if{c}
+To interactively examine statistics results, see @ref wtstats.
+@m_endif
+
*/
diff --git a/src/docs/wtstats.dox b/src/docs/wtstats.dox
new file mode 100644
index 00000000000..1a792849124
--- /dev/null
+++ b/src/docs/wtstats.dox
@@ -0,0 +1,47 @@
+/*! @page wtstats Visualizing performance with wtstats
+
+The WiredTiger distribution includes the \b wtstats tool that can be used to
+examine information generated using statistics logging (see @ref
+statistics_log).
+
+After running an application with statistics logging configured, the
+statistics output files will be in the database home directory. By default,
+these are named \c WiredTigerStat.* . In the database home directory, run
+this command, replacing \c \<wiredtiger\> with the path to the
+WiredTiger installation directory:
+\code{.sh}
+python <wiredtiger>/tools/wtstats.py WiredTigerStat.*
+\endcode
+
+Another way to process all the stats files in a directory is:
+
+\code{.sh}
+python <wiredtiger>/tools/wtstats.py <directory>
+\endcode
+In either case, a \c wtstats.html file will be generated in the \e current
+directory that you can open in your browser to examine statistics.
+
+Additional options are available, use <tt>wtstats.py --help</tt>
+to display them.
+
+Here is a sample of what is displayed using \c wtstats.html:
+
+\image html wtstats.png "wtstats.html"
+
+Some things to note about the interface:
+
+- The left sidebar has statistics groups that can each be expanded
+to show individual statistics. Clicking on a circle toggles whether an
+individual statistic or statistics group is displayed or not.
+
+- The search box at the upper left can be used to search for statistics
+matching a string.
+
+- Hovering over values in the graph will show what the value is, and what
+statistic is being shown.
+
+- The graph can be panned using two fingered scroll or mouse wheel.
+
+- Scaling of the entire graph can be changed using the buttons at the right top.
+
+*/
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index 60a5f82f233..a4ae0aaf55b 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -437,7 +437,7 @@ __evict_pass(WT_SESSION_IMPL *session)
WT_EVICT_WORKER *worker;
int loop;
uint32_t flags;
- uint64_t bytes_inuse, pages_evicted;
+ uint64_t bytes_inuse, dirty_target_size, pages_evicted, target_size;
conn = S2C(session);
cache = conn->cache;
@@ -465,9 +465,16 @@ __evict_pass(WT_SESSION_IMPL *session)
if (loop > 10)
LF_SET(WT_EVICT_PASS_AGGRESSIVE);
- /* Start a worker if we have capacity and the cache is full. */
+ /*
+ * Start a worker if we have capacity and we haven't reached
+ * the eviction targets.
+ */
bytes_inuse = __wt_cache_bytes_inuse(cache);
- if (bytes_inuse > conn->cache_size &&
+ target_size = (conn->cache_size * cache->eviction_target) / 100;
+ dirty_target_size =
+ (conn->cache_size * cache->eviction_dirty_target) / 100;
+ if ((bytes_inuse > target_size ||
+ cache->bytes_dirty > dirty_target_size) &&
conn->evict_workers < conn->evict_workers_max) {
WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
"Starting evict worker: %"PRIu32"\n",
diff --git a/src/include/btmem.h b/src/include/btmem.h
index e1fc72677c5..dd10e522412 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -550,9 +550,10 @@ struct __wt_page {
#define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */
#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */
#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
-#define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */
-#define WT_PAGE_SPLITTING 0x20 /* An internal page is growing */
+#define WT_PAGE_REFUSE_DEEPEN 0x10 /* Don't deepen the tree at this page */
+#define WT_PAGE_SCANNING 0x20 /* Obsolete updates are being scanned */
#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */
+#define WT_PAGE_SPLITTING 0x80 /* An internal page is growing */
uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
};
diff --git a/src/include/btree.i b/src/include/btree.i
index a333e4af565..d30ee46486a 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -165,65 +165,6 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
}
/*
- * __wt_cache_read_gen --
- * Get the current read generation number.
- */
-static inline uint64_t
-__wt_cache_read_gen(WT_SESSION_IMPL *session)
-{
- return (S2C(session)->cache->read_gen);
-}
-
-/*
- * __wt_cache_read_gen_incr --
- * Increment the current read generation number.
- */
-static inline void
-__wt_cache_read_gen_incr(WT_SESSION_IMPL *session)
-{
- ++S2C(session)->cache->read_gen;
-}
-
-/*
- * __wt_cache_read_gen_set --
- * Get the read generation to store in a page.
- */
-static inline uint64_t
-__wt_cache_read_gen_set(WT_SESSION_IMPL *session)
-{
- /*
- * We return read-generations from the future (where "the future" is
- * measured by increments of the global read generation). The reason
- * is because when acquiring a new hazard pointer for a page, we can
- * check its read generation, and if the read generation isn't less
- * than the current global generation, we don't bother updating the
- * page. In other words, the goal is to avoid some number of updates
- * immediately after each update we have to make.
- */
- return (__wt_cache_read_gen(session) + WT_READGEN_STEP);
-}
-
-/*
- * __wt_cache_pages_inuse --
- * Return the number of pages in use.
- */
-static inline uint64_t
-__wt_cache_pages_inuse(WT_CACHE *cache)
-{
- return (cache->pages_inmem - cache->pages_evict);
-}
-
-/*
- * __wt_cache_bytes_inuse --
- * Return the number of bytes in use.
- */
-static inline uint64_t
-__wt_cache_bytes_inuse(WT_CACHE *cache)
-{
- return (cache->bytes_inmem - cache->bytes_evict);
-}
-
-/*
* __wt_page_evict_soon --
* Set a page to be evicted as soon as possible.
*/
@@ -917,16 +858,16 @@ __wt_ref_info(WT_SESSION_IMPL *session,
}
/*
- * __wt_page_release --
- * Release a reference to a page.
+ * __wt_page_release_busy --
+ * Release a reference to a page, fail if busy during forced eviction.
*/
static inline int
-__wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
+__wt_page_release_busy(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
{
WT_BTREE *btree;
WT_DECL_RET;
WT_PAGE *page;
- int locked;
+ int locked, too_big;
btree = S2BT(session);
@@ -938,6 +879,8 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
return (0);
page = ref->page;
+ too_big = (page->memory_footprint < btree->maxmempage) ? 0 : 1;
+
/*
* Attempt to evict pages with the special "oldest" read generation.
*
@@ -970,12 +913,19 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
return (ret);
(void)WT_ATOMIC_ADD4(btree->evict_busy, 1);
- if ((ret = __wt_evict_page(session, ref)) == 0)
- WT_STAT_FAST_CONN_INCR(session, cache_eviction_force);
- else {
+ if ((ret = __wt_evict_page(session, ref)) == 0) {
+ if (too_big)
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_force);
+ else
+ /*
+ * If the page isn't too big, we are evicting it because
+ * it had a chain of deleted entries that make traversal
+ * expensive.
+ */
+ WT_STAT_FAST_CONN_INCR(
+ session, cache_eviction_force_delete);
+ } else {
WT_STAT_FAST_CONN_INCR(session, cache_eviction_force_fail);
- if (ret == EBUSY)
- ret = 0;
}
(void)WT_ATOMIC_SUB4(btree->evict_busy, 1);
@@ -983,6 +933,17 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
}
/*
+ * __wt_page_release --
+ * Release a reference to a page.
+ */
+static inline int
+__wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
+{
+ WT_RET_BUSY_OK(__wt_page_release_busy(session, ref, flags));
+ return (0);
+}
+
+/*
* __wt_page_swap_func --
* Swap one page's hazard pointer for another one when hazard pointer
* coupling up/down the tree.
diff --git a/src/include/cache.i b/src/include/cache.i
index b997781272a..ee969255241 100644
--- a/src/include/cache.i
+++ b/src/include/cache.i
@@ -7,6 +7,65 @@
*/
/*
+ * __wt_cache_read_gen --
+ * Get the current read generation number.
+ */
+static inline uint64_t
+__wt_cache_read_gen(WT_SESSION_IMPL *session)
+{
+ return (S2C(session)->cache->read_gen);
+}
+
+/*
+ * __wt_cache_read_gen_incr --
+ * Increment the current read generation number.
+ */
+static inline void
+__wt_cache_read_gen_incr(WT_SESSION_IMPL *session)
+{
+ ++S2C(session)->cache->read_gen;
+}
+
+/*
+ * __wt_cache_read_gen_set --
+ * Get the read generation to store in a page.
+ */
+static inline uint64_t
+__wt_cache_read_gen_set(WT_SESSION_IMPL *session)
+{
+ /*
+ * We return read-generations from the future (where "the future" is
+ * measured by increments of the global read generation). The reason
+ * is because when acquiring a new hazard pointer for a page, we can
+ * check its read generation, and if the read generation isn't less
+ * than the current global generation, we don't bother updating the
+ * page. In other words, the goal is to avoid some number of updates
+ * immediately after each update we have to make.
+ */
+ return (__wt_cache_read_gen(session) + WT_READGEN_STEP);
+}
+
+/*
+ * __wt_cache_pages_inuse --
+ * Return the number of pages in use.
+ */
+static inline uint64_t
+__wt_cache_pages_inuse(WT_CACHE *cache)
+{
+ return (cache->pages_inmem - cache->pages_evict);
+}
+
+/*
+ * __wt_cache_bytes_inuse --
+ * Return the number of bytes in use.
+ */
+static inline uint64_t
+__wt_cache_bytes_inuse(WT_CACHE *cache)
+{
+ return (cache->bytes_inmem - cache->bytes_evict);
+}
+
+/*
* __wt_eviction_check --
* Wake the eviction server if necessary.
*/
diff --git a/src/include/connection.h b/src/include/connection.h
index c8a3ae6e291..c5723882489 100644
--- a/src/include/connection.h
+++ b/src/include/connection.h
@@ -279,10 +279,14 @@ struct __wt_connection_impl {
#define WT_CONN_LOG_EXISTED 0x04 /* Log files found */
#define WT_CONN_LOG_PREALLOC 0x08 /* Pre-allocation is enabled */
uint32_t log_flags; /* Global logging configuration */
- WT_CONDVAR *log_cond; /* Log archive wait mutex */
- WT_SESSION_IMPL *log_session; /* Log archive session */
- wt_thread_t log_tid; /* Log archive thread */
- int log_tid_set; /* Log archive thread set */
+ WT_CONDVAR *log_cond; /* Log server wait mutex */
+ WT_SESSION_IMPL *log_session; /* Log server session */
+ wt_thread_t log_tid; /* Log server thread */
+ int log_tid_set; /* Log server thread set */
+ WT_CONDVAR *log_close_cond;/* Log close thread wait mutex */
+ WT_SESSION_IMPL *log_close_session;/* Log close thread session */
+ wt_thread_t log_close_tid; /* Log close thread */
+ int log_close_tid_set;/* Log close thread set */
WT_LOG *log; /* Logging structure */
WT_COMPRESSOR *log_compressor;/* Logging compressor */
wt_off_t log_file_max; /* Log file max size */
diff --git a/src/include/cursor.i b/src/include/cursor.i
index ae6aafdd638..8fa9790e096 100644
--- a/src/include/cursor.i
+++ b/src/include/cursor.i
@@ -176,11 +176,23 @@ static inline int
__cursor_func_init(WT_CURSOR_BTREE *cbt, int reenter)
{
WT_SESSION_IMPL *session;
+ WT_TXN *txn;
session = (WT_SESSION_IMPL *)cbt->iface.session;
+ txn = &session->txn;
if (reenter)
WT_RET(__curfile_leave(cbt));
+
+
+ /*
+ * If a transaction is running in this thread but has not yet
+ * allocated an ID or taken a snapshot, check if the cache is full
+ * now: if we have to block for eviction, this is the best time.
+ */
+ if (F_ISSET(txn, TXN_RUNNING) &&
+ !F_ISSET(txn, TXN_HAS_ID) && !F_ISSET(txn, TXN_HAS_SNAPSHOT))
+ WT_RET(__wt_cache_full_check(session));
+
if (!F_ISSET(cbt, WT_CBT_ACTIVE))
WT_RET(__curfile_enter(cbt));
__wt_txn_cursor_op(session);
diff --git a/src/include/flags.h b/src/include/flags.h
index c7e74885a35..9664fce3f9f 100644
--- a/src/include/flags.h
+++ b/src/include/flags.h
@@ -6,15 +6,16 @@
#define WT_CONN_CKPT_SYNC 0x00000002
#define WT_CONN_EVICTION_RUN 0x00000004
#define WT_CONN_LEAK_MEMORY 0x00000008
-#define WT_CONN_LSM_MERGE 0x00000010
-#define WT_CONN_PANIC 0x00000020
-#define WT_CONN_SERVER_ASYNC 0x00000040
-#define WT_CONN_SERVER_CHECKPOINT 0x00000080
-#define WT_CONN_SERVER_LSM 0x00000100
-#define WT_CONN_SERVER_RUN 0x00000200
-#define WT_CONN_SERVER_STATISTICS 0x00000400
-#define WT_CONN_SERVER_SWEEP 0x00000800
-#define WT_CONN_WAS_BACKUP 0x00001000
+#define WT_CONN_LOG_SERVER_RUN 0x00000010
+#define WT_CONN_LSM_MERGE 0x00000020
+#define WT_CONN_PANIC 0x00000040
+#define WT_CONN_SERVER_ASYNC 0x00000080
+#define WT_CONN_SERVER_CHECKPOINT 0x00000100
+#define WT_CONN_SERVER_LSM 0x00000200
+#define WT_CONN_SERVER_RUN 0x00000400
+#define WT_CONN_SERVER_STATISTICS 0x00000800
+#define WT_CONN_SERVER_SWEEP 0x00001000
+#define WT_CONN_WAS_BACKUP 0x00002000
#define WT_EVICTING 0x00000001
#define WT_FILE_TYPE_CHECKPOINT 0x00000001
#define WT_FILE_TYPE_DATA 0x00000002
diff --git a/src/include/stat.h b/src/include/stat.h
index cbd22c7b9d0..6efb9970065 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -164,6 +164,7 @@ struct __wt_connection_stats {
WT_STATS cache_eviction_dirty;
WT_STATS cache_eviction_fail;
WT_STATS cache_eviction_force;
+ WT_STATS cache_eviction_force_delete;
WT_STATS cache_eviction_force_fail;
WT_STATS cache_eviction_hazard;
WT_STATS cache_eviction_internal;
diff --git a/src/include/txn.i b/src/include/txn.i
index 745a8f75a99..656181790ed 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -227,6 +227,16 @@ __wt_txn_id_check(WT_SESSION_IMPL *session)
txn = &session->txn;
WT_ASSERT(session, F_ISSET(txn, TXN_RUNNING));
+
+ /*
+ * If the running transaction has not yet allocated an ID or taken
+ * a snapshot, check if the cache is full now: if we have to block
+ * for eviction, this is the best time to do it.
+ */
+ if (F_ISSET(txn, TXN_RUNNING) &&
+ !F_ISSET(txn, TXN_HAS_ID) && !F_ISSET(txn, TXN_HAS_SNAPSHOT))
+ WT_RET(__wt_cache_full_check(session));
+
if (!F_ISSET(txn, TXN_HAS_ID)) {
conn = S2C(session);
txn_global = &conn->txn_global;
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index 80b917e37cb..5f6818ebba5 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -3185,206 +3185,208 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1030
/*! cache: pages evicted because they exceeded the in-memory maximum */
#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1031
+/*! cache: pages evicted because they had chains of deleted items */
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1032
/*! cache: failed eviction of pages that exceeded the in-memory maximum */
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1032
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1033
/*! cache: hazard pointer blocked page eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1033
+#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1034
/*! cache: internal pages evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1034
+#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1035
/*! cache: maximum page size at eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1035
+#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1036
/*! cache: eviction server candidate queue empty when topping up */
-#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1036
+#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1037
/*! cache: eviction server candidate queue not empty when topping up */
-#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1037
+#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1038
/*! cache: eviction server evicting pages */
-#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1038
+#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1039
/*! cache: eviction server populating queue, but not evicting pages */
-#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1039
+#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1040
/*! cache: eviction server unable to reach eviction goal */
-#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1040
+#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1041
/*! cache: pages split during eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_SPLIT 1041
+#define WT_STAT_CONN_CACHE_EVICTION_SPLIT 1042
/*! cache: pages walked for eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_WALK 1042
+#define WT_STAT_CONN_CACHE_EVICTION_WALK 1043
/*! cache: in-memory page splits */
-#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1043
+#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1044
/*! cache: tracked dirty pages in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1044
+#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1045
/*! cache: pages currently held in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_INUSE 1045
+#define WT_STAT_CONN_CACHE_PAGES_INUSE 1046
/*! cache: pages read into cache */
-#define WT_STAT_CONN_CACHE_READ 1046
+#define WT_STAT_CONN_CACHE_READ 1047
/*! cache: pages written from cache */
-#define WT_STAT_CONN_CACHE_WRITE 1047
+#define WT_STAT_CONN_CACHE_WRITE 1048
/*! connection: pthread mutex condition wait calls */
-#define WT_STAT_CONN_COND_WAIT 1048
+#define WT_STAT_CONN_COND_WAIT 1049
/*! cursor: cursor create calls */
-#define WT_STAT_CONN_CURSOR_CREATE 1049
+#define WT_STAT_CONN_CURSOR_CREATE 1050
/*! cursor: cursor insert calls */
-#define WT_STAT_CONN_CURSOR_INSERT 1050
+#define WT_STAT_CONN_CURSOR_INSERT 1051
/*! cursor: cursor next calls */
-#define WT_STAT_CONN_CURSOR_NEXT 1051
+#define WT_STAT_CONN_CURSOR_NEXT 1052
/*! cursor: cursor prev calls */
-#define WT_STAT_CONN_CURSOR_PREV 1052
+#define WT_STAT_CONN_CURSOR_PREV 1053
/*! cursor: cursor remove calls */
-#define WT_STAT_CONN_CURSOR_REMOVE 1053
+#define WT_STAT_CONN_CURSOR_REMOVE 1054
/*! cursor: cursor reset calls */
-#define WT_STAT_CONN_CURSOR_RESET 1054
+#define WT_STAT_CONN_CURSOR_RESET 1055
/*! cursor: cursor search calls */
-#define WT_STAT_CONN_CURSOR_SEARCH 1055
+#define WT_STAT_CONN_CURSOR_SEARCH 1056
/*! cursor: cursor search near calls */
-#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1056
+#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1057
/*! cursor: cursor update calls */
-#define WT_STAT_CONN_CURSOR_UPDATE 1057
+#define WT_STAT_CONN_CURSOR_UPDATE 1058
/*! data-handle: connection dhandles swept */
-#define WT_STAT_CONN_DH_CONN_HANDLES 1058
+#define WT_STAT_CONN_DH_CONN_HANDLES 1059
/*! data-handle: connection candidate referenced */
-#define WT_STAT_CONN_DH_CONN_REF 1059
+#define WT_STAT_CONN_DH_CONN_REF 1060
/*! data-handle: connection sweeps */
-#define WT_STAT_CONN_DH_CONN_SWEEPS 1060
+#define WT_STAT_CONN_DH_CONN_SWEEPS 1061
/*! data-handle: connection time-of-death sets */
-#define WT_STAT_CONN_DH_CONN_TOD 1061
+#define WT_STAT_CONN_DH_CONN_TOD 1062
/*! data-handle: session dhandles swept */
-#define WT_STAT_CONN_DH_SESSION_HANDLES 1062
+#define WT_STAT_CONN_DH_SESSION_HANDLES 1063
/*! data-handle: session sweep attempts */
-#define WT_STAT_CONN_DH_SESSION_SWEEPS 1063
+#define WT_STAT_CONN_DH_SESSION_SWEEPS 1064
/*! connection: files currently open */
-#define WT_STAT_CONN_FILE_OPEN 1064
+#define WT_STAT_CONN_FILE_OPEN 1065
/*! log: log buffer size increases */
-#define WT_STAT_CONN_LOG_BUFFER_GROW 1065
+#define WT_STAT_CONN_LOG_BUFFER_GROW 1066
/*! log: total log buffer size */
-#define WT_STAT_CONN_LOG_BUFFER_SIZE 1066
+#define WT_STAT_CONN_LOG_BUFFER_SIZE 1067
/*! log: log bytes of payload data */
-#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1067
+#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1068
/*! log: log bytes written */
-#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1068
+#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1069
/*! log: yields waiting for previous log file close */
-#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1069
+#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1070
/*! log: total size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_LEN 1070
+#define WT_STAT_CONN_LOG_COMPRESS_LEN 1071
/*! log: total in-memory size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_MEM 1071
+#define WT_STAT_CONN_LOG_COMPRESS_MEM 1072
/*! log: log records too small to compress */
-#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1072
+#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1073
/*! log: log records not compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1073
+#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1074
/*! log: log records compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1074
+#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1075
/*! log: maximum log file size */
-#define WT_STAT_CONN_LOG_MAX_FILESIZE 1075
+#define WT_STAT_CONN_LOG_MAX_FILESIZE 1076
/*! log: pre-allocated log files prepared */
-#define WT_STAT_CONN_LOG_PREALLOC_FILES 1076
+#define WT_STAT_CONN_LOG_PREALLOC_FILES 1077
/*! log: number of pre-allocated log files to create */
-#define WT_STAT_CONN_LOG_PREALLOC_MAX 1077
+#define WT_STAT_CONN_LOG_PREALLOC_MAX 1078
/*! log: pre-allocated log files used */
-#define WT_STAT_CONN_LOG_PREALLOC_USED 1078
+#define WT_STAT_CONN_LOG_PREALLOC_USED 1079
/*! log: log read operations */
-#define WT_STAT_CONN_LOG_READS 1079
+#define WT_STAT_CONN_LOG_READS 1080
/*! log: records processed by log scan */
-#define WT_STAT_CONN_LOG_SCAN_RECORDS 1080
+#define WT_STAT_CONN_LOG_SCAN_RECORDS 1081
/*! log: log scan records requiring two reads */
-#define WT_STAT_CONN_LOG_SCAN_REREADS 1081
+#define WT_STAT_CONN_LOG_SCAN_REREADS 1082
/*! log: log scan operations */
-#define WT_STAT_CONN_LOG_SCANS 1082
+#define WT_STAT_CONN_LOG_SCANS 1083
/*! log: consolidated slot closures */
-#define WT_STAT_CONN_LOG_SLOT_CLOSES 1083
+#define WT_STAT_CONN_LOG_SLOT_CLOSES 1084
/*! log: logging bytes consolidated */
-#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1084
+#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1085
/*! log: consolidated slot joins */
-#define WT_STAT_CONN_LOG_SLOT_JOINS 1085
+#define WT_STAT_CONN_LOG_SLOT_JOINS 1086
/*! log: consolidated slot join races */
-#define WT_STAT_CONN_LOG_SLOT_RACES 1086
+#define WT_STAT_CONN_LOG_SLOT_RACES 1087
/*! log: slots selected for switching that were unavailable */
-#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1087
+#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1088
/*! log: record size exceeded maximum */
-#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1088
+#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1089
/*! log: failed to find a slot large enough for record */
-#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1089
+#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1090
/*! log: consolidated slot join transitions */
-#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1090
+#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1091
/*! log: log sync operations */
-#define WT_STAT_CONN_LOG_SYNC 1091
+#define WT_STAT_CONN_LOG_SYNC 1092
/*! log: log write operations */
-#define WT_STAT_CONN_LOG_WRITES 1092
+#define WT_STAT_CONN_LOG_WRITES 1093
/*! LSM: sleep for LSM checkpoint throttle */
-#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1093
+#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1094
/*! LSM: sleep for LSM merge throttle */
-#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1094
+#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1095
/*! LSM: rows merged in an LSM tree */
-#define WT_STAT_CONN_LSM_ROWS_MERGED 1095
+#define WT_STAT_CONN_LSM_ROWS_MERGED 1096
/*! LSM: application work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1096
+#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1097
/*! LSM: merge work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1097
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1098
/*! LSM: tree queue hit maximum */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1098
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1099
/*! LSM: switch work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1099
+#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1100
/*! LSM: tree maintenance operations scheduled */
-#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1100
+#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1101
/*! LSM: tree maintenance operations discarded */
-#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1101
+#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1102
/*! LSM: tree maintenance operations executed */
-#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1102
+#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1103
/*! connection: memory allocations */
-#define WT_STAT_CONN_MEMORY_ALLOCATION 1103
+#define WT_STAT_CONN_MEMORY_ALLOCATION 1104
/*! connection: memory frees */
-#define WT_STAT_CONN_MEMORY_FREE 1104
+#define WT_STAT_CONN_MEMORY_FREE 1105
/*! connection: memory re-allocations */
-#define WT_STAT_CONN_MEMORY_GROW 1105
+#define WT_STAT_CONN_MEMORY_GROW 1106
/*! thread-yield: page acquire busy blocked */
-#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1106
+#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1107
/*! thread-yield: page acquire eviction blocked */
-#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1107
+#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1108
/*! thread-yield: page acquire locked blocked */
-#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1108
+#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1109
/*! thread-yield: page acquire read blocked */
-#define WT_STAT_CONN_PAGE_READ_BLOCKED 1109
+#define WT_STAT_CONN_PAGE_READ_BLOCKED 1110
/*! thread-yield: page acquire time sleeping (usecs) */
-#define WT_STAT_CONN_PAGE_SLEEP 1110
+#define WT_STAT_CONN_PAGE_SLEEP 1111
/*! connection: total read I/Os */
-#define WT_STAT_CONN_READ_IO 1111
+#define WT_STAT_CONN_READ_IO 1112
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_CONN_REC_PAGES 1112
+#define WT_STAT_CONN_REC_PAGES 1113
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_CONN_REC_PAGES_EVICTION 1113
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1114
/*! reconciliation: split bytes currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1114
+#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1115
/*! reconciliation: split objects currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1115
+#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1116
/*! connection: pthread mutex shared lock read-lock calls */
-#define WT_STAT_CONN_RWLOCK_READ 1116
+#define WT_STAT_CONN_RWLOCK_READ 1117
/*! connection: pthread mutex shared lock write-lock calls */
-#define WT_STAT_CONN_RWLOCK_WRITE 1117
+#define WT_STAT_CONN_RWLOCK_WRITE 1118
/*! session: open cursor count */
-#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1118
+#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1119
/*! session: open session count */
-#define WT_STAT_CONN_SESSION_OPEN 1119
+#define WT_STAT_CONN_SESSION_OPEN 1120
/*! transaction: transaction begins */
-#define WT_STAT_CONN_TXN_BEGIN 1120
+#define WT_STAT_CONN_TXN_BEGIN 1121
/*! transaction: transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1121
+#define WT_STAT_CONN_TXN_CHECKPOINT 1122
/*! transaction: transaction checkpoint currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1122
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1123
/*! transaction: transaction checkpoint max time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1123
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1124
/*! transaction: transaction checkpoint min time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1124
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1125
/*! transaction: transaction checkpoint most recent time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1125
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1126
/*! transaction: transaction checkpoint total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1126
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1127
/*! transaction: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1127
+#define WT_STAT_CONN_TXN_COMMIT 1128
/*! transaction: transaction failures due to cache overflow */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1128
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1129
/*! transaction: transaction range of IDs currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_RANGE 1129
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1130
/*! transaction: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1130
+#define WT_STAT_CONN_TXN_ROLLBACK 1131
/*! connection: total write I/Os */
-#define WT_STAT_CONN_WRITE_IO 1131
+#define WT_STAT_CONN_WRITE_IO 1132
/*!
* @}
diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h
index 138b64a6e27..1b3a9b62626 100644
--- a/src/include/wt_internal.h
+++ b/src/include/wt_internal.h
@@ -322,13 +322,13 @@ struct __wt_update;
#include "misc.i"
#include "intpack.i" /* required by cell.i, packing.i */
#include "packing.i"
+#include "cache.i" /* required by txn.i */
#include "cell.i" /* required by btree.i */
#include "mutex.i" /* required by btree.i */
#include "txn.i" /* required by btree.i */
#include "btree.i" /* required by cursor.i */
-#include "cache.i" /* required by cursor.i */
#include "cursor.i"
#include "bitstring.i"
diff --git a/src/log/log.c b/src/log/log.c
index 944e748a6a8..c48fc7536b2 100644
--- a/src/log/log.c
+++ b/src/log/log.c
@@ -240,6 +240,7 @@ __log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot)
if (log->log_close_fh != NULL)
F_SET(slot, SLOT_CLOSEFH);
}
+
/*
* Checkpoints can be configured based on amount of log written.
* Add in this log record to the sum and if needed, signal the
@@ -857,9 +858,8 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
- WT_FH *close_fh;
WT_LOG *log;
- WT_LSN sync_lsn;
+ WT_LSN close_end_lsn, close_lsn, sync_lsn;
size_t write_size;
int locked;
WT_DECL_SPINLOCK_ID(id); /* Must appear last */
@@ -872,12 +872,8 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
* If we're going to have to close our log file, make a local copy
* of the file handle structure.
*/
- close_fh = NULL;
- if (F_ISSET(slot, SLOT_CLOSEFH)) {
- close_fh = log->log_close_fh;
- log->log_close_fh = NULL;
- F_CLR(slot, SLOT_CLOSEFH);
- }
+ WT_INIT_LSN(&close_lsn);
+ WT_INIT_LSN(&close_end_lsn);
/* Write the buffered records */
if (F_ISSET(slot, SLOT_BUFFERED)) {
@@ -895,13 +891,22 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
__wt_yield();
log->write_lsn = slot->slot_end_lsn;
+ if (F_ISSET(slot, SLOT_CLOSEFH))
+ WT_ERR(__wt_cond_signal(session, conn->log_close_cond));
+
/*
* Try to consolidate calls to fsync to wait less. Acquire a spin lock
* so that threads finishing writing to the log will wait while the
* current fsync completes and advance log->sync_lsn.
*/
while (F_ISSET(slot, SLOT_SYNC | SLOT_SYNC_DIR)) {
- if (__wt_spin_trylock(session, &log->log_sync_lock, &id) != 0) {
+ /*
+ * We have to wait until earlier log files have finished their
+ * sync operations. The most recent one will set the LSN to the
+ * beginning of our file.
+ */
+ if (log->sync_lsn.file < slot->slot_end_lsn.file ||
+ __wt_spin_trylock(session, &log->log_sync_lock, &id) != 0) {
WT_ERR(__wt_cond_wait(
session, log->log_sync_cond, 10000));
continue;
@@ -909,10 +914,10 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
locked = 1;
/*
- * Record the current end of log after we grabbed the lock.
+ * Record the current end of our update after the lock.
* That is how far our calls can guarantee.
*/
- sync_lsn = log->write_lsn;
+ sync_lsn = slot->slot_end_lsn;
/*
* Check if we have to sync the parent directory. Some
* combinations of sync flags may result in the log file
@@ -956,16 +961,6 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
WT_ERR(__wt_buf_grow(session,
&slot->slot_buf, slot->slot_buf.memsize * 2));
}
- /*
- * If we have a file to close, close it now. First fsync so
- * that a later sync will be assured all earlier transactions
- * in earlier log files are also on disk.
- */
- if (close_fh) {
- WT_ERR(__wt_fsync(session, close_fh));
- WT_ERR(__wt_close(session, close_fh));
- }
-
err: if (locked)
__wt_spin_unlock(session, &log->log_sync_lock);
if (ret != 0 && slot->slot_error == 0)
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
index 031a4e88467..3f14e035a9b 100644
--- a/src/lsm/lsm_cursor.c
+++ b/src/lsm/lsm_cursor.c
@@ -171,8 +171,6 @@ __clsm_enter(WT_CURSOR_LSM *clsm, int reset, int update)
lsm_tree->nchunks != 0)
goto open;
- WT_RET(__wt_cache_full_check(session));
-
if (clsm->dsk_gen != lsm_tree->dsk_gen &&
lsm_tree->nchunks != 0)
goto open;
diff --git a/src/session/session_api.c b/src/session/session_api.c
index 3ab5e0acab1..8ee143133ae 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -736,13 +736,6 @@ __session_begin_transaction(WT_SESSION *wt_session, const char *config)
if (F_ISSET(&session->txn, TXN_RUNNING))
WT_ERR_MSG(session, EINVAL, "Transaction already running");
- /*
- * There is no transaction active in this thread; check if the cache is
- * full, if we have to block for eviction, this is the best time to do
- * it.
- */
- WT_ERR(__wt_cache_full_check(session));
-
ret = __wt_txn_begin(session, cfg);
err: API_END_RET(session, ret);
diff --git a/src/support/stat.c b/src/support/stat.c
index f4ae082add3..223d62d0559 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -376,6 +376,8 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats)
"cache: pages currently held in the cache";
stats->cache_eviction_force.desc =
"cache: pages evicted because they exceeded the in-memory maximum";
+ stats->cache_eviction_force_delete.desc =
+ "cache: pages evicted because they had chains of deleted items";
stats->cache_eviction_app.desc =
"cache: pages evicted by application threads";
stats->cache_read.desc = "cache: pages read into cache";
@@ -554,6 +556,7 @@ __wt_stat_refresh_connection_stats(void *stats_arg)
stats->cache_eviction_dirty.v = 0;
stats->cache_eviction_deepen.v = 0;
stats->cache_eviction_force.v = 0;
+ stats->cache_eviction_force_delete.v = 0;
stats->cache_eviction_app.v = 0;
stats->cache_read.v = 0;
stats->cache_eviction_fail.v = 0;
diff --git a/src/txn/txn.c b/src/txn/txn.c
index fd80efd5ebd..5b8f11a88a5 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -361,8 +361,15 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
/* If we are logging, write a commit log record. */
if (ret == 0 && txn->mod_count > 0 &&
FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED) &&
- !F_ISSET(session, WT_SESSION_NO_LOGGING))
+ !F_ISSET(session, WT_SESSION_NO_LOGGING)) {
+ /*
+ * We are about to block on I/O writing the log.
+ * Release our snapshot in case it is keeping data pinned.
+ * This is particularly important for checkpoints.
+ */
+ __wt_txn_release_snapshot(session);
ret = __wt_txn_log_commit(session, cfg);
+ }
/*
* If anything went wrong, roll back.
diff --git a/src/txn/txn_log.c b/src/txn/txn_log.c
index f66bd7e09c8..789be2ceef4 100644
--- a/src/txn/txn_log.c
+++ b/src/txn/txn_log.c
@@ -270,6 +270,7 @@ __wt_txn_checkpoint_log(
{
WT_DECL_ITEM(logrec);
WT_DECL_RET;
+ WT_ITEM *ckpt_snapshot, empty;
WT_LSN *ckpt_lsn;
WT_TXN *txn;
uint8_t *end, *p;
@@ -319,21 +320,26 @@ __wt_txn_checkpoint_log(
*/
if (!txn->full_ckpt) {
txn->ckpt_nsnapshot = 0;
+ WT_CLEAR(empty);
+ ckpt_snapshot = &empty;
*ckpt_lsn = S2C(session)->log->alloc_lsn;
- }
+ } else
+ ckpt_snapshot = txn->ckpt_snapshot;
/* Write the checkpoint log record. */
WT_ERR(__wt_struct_size(session, &recsize, fmt,
rectype, ckpt_lsn->file, ckpt_lsn->offset,
- txn->ckpt_nsnapshot, &txn->ckpt_snapshot));
+ txn->ckpt_nsnapshot, ckpt_snapshot));
WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));
WT_ERR(__wt_struct_pack(session,
(uint8_t *)logrec->data + logrec->size, recsize, fmt,
rectype, ckpt_lsn->file, ckpt_lsn->offset,
- txn->ckpt_nsnapshot, &txn->ckpt_snapshot));
+ txn->ckpt_nsnapshot, ckpt_snapshot));
logrec->size += (uint32_t)recsize;
- WT_ERR(__wt_log_write(session, logrec, lsnp, 0));
+ WT_ERR(__wt_log_write(session, logrec, lsnp,
+ F_ISSET(S2C(session), WT_CONN_CKPT_SYNC) ?
+ WT_LOG_FSYNC : 0));
/*
* If this full checkpoint completed successfully and there is