diff options
author | Keith Bostic <keith@wiredtiger.com> | 2015-01-16 15:23:00 -0500 |
---|---|---|
committer | Keith Bostic <keith@wiredtiger.com> | 2015-01-16 15:23:00 -0500 |
commit | 51a92facb691706bee4b6c573e8bda070a62351d (patch) | |
tree | 42bb9c6f5e16d4661cdf1c143e88339ecd94dff4 /src | |
parent | 24ca383872e0512a3ae54efd9f4f2de29eac0d23 (diff) | |
parent | 38b6b25fb7e825b234a17ad1fb9269c5f48cb129 (diff) | |
download | mongo-51a92facb691706bee4b6c573e8bda070a62351d.tar.gz |
Merge branch 'develop' into cursor-reconfigure
Diffstat (limited to 'src')
-rw-r--r-- | src/btree/bt_debug.c | 21 | ||||
-rw-r--r-- | src/btree/bt_delete.c | 10 | ||||
-rw-r--r-- | src/btree/bt_page.c | 15 | ||||
-rw-r--r-- | src/btree/bt_split.c | 305 | ||||
-rw-r--r-- | src/btree/bt_walk.c | 9 | ||||
-rw-r--r-- | src/conn/conn_log.c | 105 | ||||
-rw-r--r-- | src/conn/conn_open.c | 3 | ||||
-rw-r--r-- | src/docs/images/wtstats.png | bin | 0 -> 128334 bytes | |||
-rw-r--r-- | src/docs/performance.dox | 3 | ||||
-rw-r--r-- | src/docs/spell.ok | 2 | ||||
-rw-r--r-- | src/docs/statistics.dox | 4 | ||||
-rw-r--r-- | src/docs/wtstats.dox | 47 | ||||
-rw-r--r-- | src/evict/evict_lru.c | 13 | ||||
-rw-r--r-- | src/include/btmem.h | 5 | ||||
-rw-r--r-- | src/include/btree.i | 97 | ||||
-rw-r--r-- | src/include/cache.i | 59 | ||||
-rw-r--r-- | src/include/connection.h | 12 | ||||
-rw-r--r-- | src/include/cursor.i | 12 | ||||
-rw-r--r-- | src/include/flags.h | 19 | ||||
-rw-r--r-- | src/include/stat.h | 1 | ||||
-rw-r--r-- | src/include/txn.i | 10 | ||||
-rw-r--r-- | src/include/wiredtiger.in | 202 | ||||
-rw-r--r-- | src/include/wt_internal.h | 2 | ||||
-rw-r--r-- | src/log/log.c | 37 | ||||
-rw-r--r-- | src/lsm/lsm_cursor.c | 2 | ||||
-rw-r--r-- | src/session/session_api.c | 7 | ||||
-rw-r--r-- | src/support/stat.c | 3 | ||||
-rw-r--r-- | src/txn/txn.c | 9 | ||||
-rw-r--r-- | src/txn/txn_log.c | 14 |
29 files changed, 673 insertions, 355 deletions
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index 4de94277364..af9f6a669f2 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -408,11 +408,13 @@ __debug_tree_shape_info(WT_PAGE *page) v = page->memory_footprint; if (v >= WT_GIGABYTE) - snprintf(buf, sizeof(buf), "(%" PRIu64 "G)", v / WT_GIGABYTE); + snprintf(buf, sizeof(buf), + "(%p %" PRIu64 "G)", page, v / WT_GIGABYTE); else if (v >= WT_MEGABYTE) - snprintf(buf, sizeof(buf), "(%" PRIu64 "M)", v / WT_MEGABYTE); + snprintf(buf, sizeof(buf), + "(%p %" PRIu64 "M)", page, v / WT_MEGABYTE); else - snprintf(buf, sizeof(buf), "(%" PRIu64 ")", v); + snprintf(buf, sizeof(buf), "(%p %" PRIu64 ")", page, v); return (buf); } @@ -429,16 +431,16 @@ __debug_tree_shape_worker(WT_DBG *ds, WT_PAGE *page, int level) session = ds->session; if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT) { - __dmsg(ds, "%*s" "I" "%s\n", - level, " ", __debug_tree_shape_info(page)); + __dmsg(ds, "%*s" "I" "%d %s\n", + level * 3, " ", level, __debug_tree_shape_info(page)); WT_INTL_FOREACH_BEGIN(session, page, ref) { if (ref->state == WT_REF_MEM) __debug_tree_shape_worker( - ds, ref->page, level + 3); + ds, ref->page, level + 1); } WT_INTL_FOREACH_END; } else - __dmsg(ds, "%*s" "L" "%s\n", - level, " ", __debug_tree_shape_info(page)); + __dmsg(ds, "%*s" "L" " %s\n", + level * 3, " ", __debug_tree_shape_info(page)); } /* @@ -458,8 +460,7 @@ __wt_debug_tree_shape( if (page == NULL) page = S2BT(session)->root.page; - WT_WITH_PAGE_INDEX(session, - __debug_tree_shape_worker(ds, page, 0)); + WT_WITH_PAGE_INDEX(session, __debug_tree_shape_worker(ds, page, 1)); __dmsg_wrapup(ds); return (0); diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c index c97ea176c97..622dfb1b294 100644 --- a/src/btree/bt_delete.c +++ b/src/btree/bt_delete.c @@ -207,6 +207,9 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) { int skip; + if (ref->state != WT_REF_DELETED) + return (0); + /* * Deleted pages come from two 
sources: either it's a fast-delete as * described above, or the page has been emptied by other operations @@ -225,11 +228,14 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) * the page could switch to an in-memory state at any time. Lock down * the structure, just to be safe. */ + if (ref->page_del == NULL) + return (1); + if (!WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED)) return (0); - skip = ref->page_del == NULL || - __wt_txn_visible(session, ref->page_del->txnid) ? 1 : 0; + skip = (ref->page_del == NULL || + __wt_txn_visible(session, ref->page_del->txnid)); WT_PUBLISH(ref->state, WT_REF_DELETED); return (skip); diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index 181ffdb3736..561e1c19218 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -37,8 +37,11 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page) page->type != WT_PAGE_ROW_LEAF) return (0); - /* Eviction may be turned off, although that's rare. */ - if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) + /* + * Eviction may be turned off (although that's rare), or we may be in + * the middle of a checkpoint. + */ + if (F_ISSET(btree, WT_BTREE_NO_EVICTION) || btree->checkpointing) return (0); /* @@ -128,7 +131,13 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags force_attempts < 10 && __evict_force_check(session, page)) { ++force_attempts; - WT_RET(__wt_page_release(session, ref, flags)); + if ((ret = __wt_page_release_busy( + session, ref, flags)) == EBUSY) { + /* If forced eviction fails, stall. 
*/ + ret = 0; + wait_cnt += 1000; + } else + WT_RET(ret); WT_STAT_FAST_CONN_INCR( session, page_forcible_evict_blocked); break; diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 911a38e4be6..69dbfb42354 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -9,15 +9,6 @@ #include "wt_internal.h" /* - * Tuning; global variables to allow the binary to be patched, we don't yet have - * any real understanding of what might be useful to surface to applications. - */ -static u_int __split_deepen_max_internal_image = 100; -static u_int __split_deepen_min_child = 10; -static u_int __split_deepen_per_child = 100; -static u_int __split_deepen_split_child = 100; - -/* * Track allocation increments, matching the cache calculations, which add an * estimate of allocation overhead to every object. */ @@ -177,45 +168,57 @@ __split_safe_free(WT_SESSION_IMPL *session, int exclusive, void *p, size_t s) } /* + * Tuning; global variables to allow the binary to be patched, we don't yet have + * any real understanding of what might be useful to surface to applications. + */ +static u_int __split_deepen_min_child = 10000; +static u_int __split_deepen_per_child = 100; + +/* * __split_should_deepen -- * Return if we should deepen the tree. */ static int -__split_should_deepen(WT_SESSION_IMPL *session, WT_PAGE *page) +__split_should_deepen( + WT_SESSION_IMPL *session, WT_REF *ref, uint32_t *childrenp) { WT_PAGE_INDEX *pindex; + WT_PAGE *page; - /* - * Splits are based on either the number of child pages that will be - * created by the split (splitting an internal page that will be slow - * to search), or by the memory footprint of the parent page (avoiding - * an internal page that will eat up all of the cache and put eviction - * pressure on the system). - */ + *childrenp = 0; + + page = ref->page; pindex = WT_INTL_INDEX_COPY(page); /* * Deepen the tree if the page's memory footprint is larger than the - * maximum size for a page in memory. 
We need an absolute minimum - * number of entries in order to split the page: if there is a single - * huge key, splitting won't help. + * maximum size for a page in memory (presumably putting eviction + * pressure on the cache). */ - if (page->memory_footprint > S2BT(session)->maxmempage && - pindex->entries >= __split_deepen_min_child) - return (1); + if (page->memory_footprint < S2BT(session)->maxmempage) + return (0); /* - * Deepen the tree if the page's memory footprint is at least N - * times the maximum internal page size chunk in the backing file and - * the split will result in at least N children in the newly created - * intermediate layer. + * Ensure the page has enough entries to make it worth splitting and + * we get a significant payback (in the case of a set of large keys, + * splitting won't help). */ - if (page->memory_footprint > - __split_deepen_max_internal_image * S2BT(session)->maxintlpage && - pindex->entries >= - (__split_deepen_per_child * __split_deepen_split_child)) + if (pindex->entries > __split_deepen_min_child) { + *childrenp = pindex->entries / __split_deepen_per_child; return (1); + } + /* + * The root is a special-case: if it's putting cache pressure on the + * system, split it even if there are only a few entries, we can't + * push it out of memory. Sanity check: if the root page is too big + * with less than 100 keys, there are huge keys and/or a too-small + * cache, there's not much to do. + */ + if (__wt_ref_is_root(ref) && pindex->entries > 100) { + *childrenp = pindex->entries / 10; + return (1); + } return (0); } @@ -254,12 +257,13 @@ __split_ovfl_key_cleanup(WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref) } /* - * __split_ref_instantiate -- - * Instantiate key/address pairs in memory in service of a split. + * __split_ref_deepen_move -- + * Move a WT_REF from a parent to a child in service of a split to deepen + * the tree, including updating the accounting information. 
*/ static int -__split_ref_instantiate(WT_SESSION_IMPL *session, - WT_PAGE *page, WT_REF *ref, size_t *parent_decrp, size_t *child_incrp) +__split_ref_deepen_move(WT_SESSION_IMPL *session, + WT_PAGE *parent, WT_REF *ref, size_t *parent_decrp, size_t *child_incrp) { WT_ADDR *addr; WT_CELL_UNPACK unpack; @@ -276,8 +280,6 @@ __split_ref_instantiate(WT_SESSION_IMPL *session, * of child pages, and so we can no longer reference the block image * that remains with the page being split. * - * Track how much memory the parent is losing and the child gaining. - * * No locking is required to update the WT_REF structure because we're * the only thread splitting the parent page, and there's no way for * readers to race with our updates of single pointers. The changes @@ -286,13 +288,13 @@ __split_ref_instantiate(WT_SESSION_IMPL *session, * * Row-store keys, first. */ - if (page->type == WT_PAGE_ROW_INT) { + if (parent->type == WT_PAGE_ROW_INT) { if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) { - __wt_ref_key(page, ref, &key, &size); + __wt_ref_key(parent, ref, &key, &size); WT_RET(__wt_row_ikey(session, 0, key, size, &ikey)); ref->key.ikey = ikey; } else { - WT_RET(__split_ovfl_key_cleanup(session, page, ref)); + WT_RET(__split_ovfl_key_cleanup(session, parent, ref)); WT_MEMSIZE_ADD(*parent_decrp, sizeof(WT_IKEY) + ikey->size); } @@ -304,12 +306,8 @@ __split_ref_instantiate(WT_SESSION_IMPL *session, * address has been instantiated, there's no work to do. Otherwise, * get the address from the on-page cell. 
*/ - if ((addr = ref->addr) == NULL) - return (0); - if (__wt_off_page(page, addr)) - WT_MEMSIZE_TRANSFER(*parent_decrp, *child_incrp, - sizeof(WT_ADDR) + addr->size); - else { + addr = ref->addr; + if (addr != NULL && !__wt_off_page(parent, addr)) { __wt_cell_unpack((WT_CELL *)ref->addr, &unpack); WT_RET(__wt_calloc_one(session, &addr)); if ((ret = __wt_strndup( @@ -321,8 +319,11 @@ __split_ref_instantiate(WT_SESSION_IMPL *session, addr->type = unpack.raw == WT_CELL_ADDR_INT ? WT_ADDR_INT : WT_ADDR_LEAF; ref->addr = addr; - WT_MEMSIZE_ADD(*child_incrp, sizeof(WT_ADDR) + addr->size); } + + /* And finally, the WT_REF itself. */ + WT_MEMSIZE_TRANSFER(*parent_decrp, *child_incrp, sizeof(WT_REF)); + return (0); } @@ -383,7 +384,7 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) * Split an internal page in-memory, deepening the tree. */ static int -__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) +__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) { WT_DECL_RET; WT_PAGE *child; @@ -391,7 +392,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) WT_REF **alloc_refp; WT_REF *child_ref, **child_refp, *parent_ref, **parent_refp, *ref; size_t child_incr, parent_decr, parent_incr, size; - uint32_t children, chunk, i, j, remain, slots; + uint32_t chunk, i, j, remain, slots; int panic; void *p; @@ -401,13 +402,6 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) pindex = WT_INTL_INDEX_COPY(parent); - /* - * Create N children, unless we are dealing with a large page without - * many entries, in which case split into the minimum number of pages. 
- */ - children = WT_MAX(pindex->entries / __split_deepen_per_child, - __split_deepen_min_child); - WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen); WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, "%p: %" PRIu32 " elements, splitting into %" PRIu32 " children", @@ -506,12 +500,9 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) child_incr = 0; child_pindex = WT_INTL_INDEX_COPY(child); for (child_refp = child_pindex->index, j = 0; j < slots; ++j) { - WT_ERR(__split_ref_instantiate(session, + WT_ERR(__split_ref_deepen_move(session, parent, *parent_refp, &parent_decr, &child_incr)); *child_refp++ = *parent_refp++; - - WT_MEMSIZE_TRANSFER( - parent_decr, child_incr, sizeof(WT_REF)); } __wt_cache_page_inmem_incr(session, child, child_incr); } @@ -604,9 +595,10 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) * be using the new index. */ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); - WT_MEMSIZE_ADD(parent_decr, size); WT_ERR(__split_safe_free(session, 0, pindex, size)); + WT_MEMSIZE_ADD(parent_decr, size); +#if 0 /* * Adjust the parent's memory footprint. This may look odd, but we * have already taken the allocation overhead into account, and an @@ -615,6 +607,19 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) */ __wt_cache_page_inmem_incr(session, parent, parent_incr); __wt_cache_page_inmem_decr(session, parent, parent_decr); +#else + /* + * XXX + * The code to track page sizes is fundamentally flawed in the face of + * splits: for example, we don't add in an overhead allocation constant + * when allocating WT_REF structures as pages are created, but the + * calculations during split assume that correction. For now, ignore + * our carefully calculated values and force the internal page size to + * 5% of its current value. 
+ */ + size = parent->memory_footprint - (parent->memory_footprint / 20); + __wt_cache_page_inmem_decr(session, parent, size); +#endif if (0) { err: __wt_free_ref_index(session, parent, alloc_index, 1); @@ -770,13 +775,11 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, * the confusion. */ WT_RET(__wt_calloc_one(session, &addr)); - WT_MEMSIZE_ADD(incr, sizeof(WT_ADDR)); ref->addr = addr; addr->size = multi->addr.size; addr->type = multi->addr.type; WT_RET(__wt_strndup(session, multi->addr.addr, addr->size, &addr->addr)); - WT_MEMSIZE_ADD(incr, addr->size); } else WT_RET(__split_multi_inmem(session, page, ref, multi)); @@ -814,17 +817,20 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, int exclusive, int ref_discard) { WT_DECL_RET; + WT_IKEY *ikey; WT_PAGE *parent; WT_PAGE_INDEX *alloc_index, *pindex; - WT_REF **alloc_refp, *parent_ref; + WT_REF **alloc_refp, *next_ref, *parent_ref; size_t size; - uint32_t i, j, parent_entries, result_entries; + uint32_t children, i, j; + uint32_t deleted_entries, parent_entries, result_entries; int complete, hazard, locked; parent = NULL; /* -Wconditional-uninitialized */ - alloc_index = NULL; + alloc_index = pindex = NULL; parent_ref = NULL; complete = hazard = locked = 0; + parent_entries = 0; /* * Get a page-level lock on the parent to single-thread splits into the @@ -865,7 +871,29 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, pindex = WT_INTL_INDEX_COPY(parent); parent_entries = pindex->entries; - result_entries = (parent_entries - 1) + new_entries; + + /* + * Remove any refs to deleted pages while we are splitting, we have + * the internal page locked down, and are copying the refs into a new + * array anyway. Switch them to the special split state, so that any + * reading thread will restart. 
+ */ + for (i = 0, deleted_entries = 0; i < parent_entries; ++i) { + next_ref = pindex->index[i]; + WT_ASSERT(session, next_ref->state != WT_REF_SPLIT); + if (next_ref->state == WT_REF_DELETED && + next_ref->page_del == NULL && + WT_ATOMIC_CAS4(next_ref->state, + WT_REF_DELETED, WT_REF_SPLIT)) + deleted_entries++; + } + + /* + * The final entry count consists of: The original count, plus any + * new pages, less any refs we are removing because they only + * contained deleted items, less 1 for the page being replaced. + */ + result_entries = (parent_entries + new_entries) - (deleted_entries + 1); /* * Allocate and initialize a new page index array for the parent, then @@ -877,8 +905,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_MEMSIZE_ADD(parent_incr, size); alloc_index->index = (WT_REF **)(alloc_index + 1); alloc_index->entries = result_entries; - for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) - if (pindex->index[i] == ref) + for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) { + next_ref = pindex->index[i]; + if (next_ref == ref) for (j = 0; j < new_entries; ++j) { ref_new[j]->home = parent; *alloc_refp++ = ref_new[j]; @@ -890,8 +919,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, */ ref_new[j] = NULL; } - else - *alloc_refp++ = pindex->index[i]; + else if (next_ref->state != WT_REF_SPLIT) + /* Skip refs we have marked for deletion. */ + *alloc_refp++ = next_ref; + } /* * Update the parent page's index: this update makes the split visible @@ -926,6 +957,36 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, complete = 1; /* + * Now that the new page is in place it's OK to free any deleted + * refs we encountered modulo the regular safe free semantics. 
+ */ + for (i = 0; i < parent_entries; ++i) { + next_ref = pindex->index[i]; + /* If we set the ref to split to mark it for delete */ + if (next_ref != ref && next_ref->state == WT_REF_SPLIT) { + /* + * We're discarding a deleted reference. + * Free any resources it holds. + */ + if (parent->type == WT_PAGE_ROW_INT) { + WT_TRET(__split_ovfl_key_cleanup( + session, parent, next_ref)); + ikey = __wt_ref_key_instantiated(next_ref); + if (ikey != NULL) { + size = sizeof(WT_IKEY) + ikey->size; + WT_TRET(__split_safe_free( + session, 0, ikey, size)); + WT_MEMSIZE_ADD(parent_decr, size); + } + } + + WT_TRET(__split_safe_free( + session, 0, next_ref, sizeof(WT_REF))); + WT_MEMSIZE_ADD(parent_decr, sizeof(WT_REF)); + } + } + + /* * We can't free the previous page index, there may be threads using it. * Add it to the session discard list, to be freed when it's safe. */ @@ -978,11 +1039,30 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * Do the check here because we've just grown the parent page and * are holding it locked. */ - if (ret == 0 && !exclusive && __split_should_deepen(session, parent)) + if (ret == 0 && !exclusive && + !F_ISSET_ATOMIC(parent, WT_PAGE_REFUSE_DEEPEN) && + __split_should_deepen(session, parent_ref, &children)) { + /* + * XXX + * Temporary hack to avoid a bug where the root page is split + * even when it's no longer doing any good. 
+ */ + uint64_t __a, __b; + __a = parent->memory_footprint; WT_WITH_PAGE_INDEX(session, - ret = __split_deepen(session, parent)); + ret = __split_deepen(session, parent, children)); + __b = parent->memory_footprint; + if (__b * 2 >= __a) + F_SET_ATOMIC(parent, WT_PAGE_REFUSE_DEEPEN); + } -err: if (locked) +err: if (!complete) + for (i = 0; i < parent_entries; ++i) { + next_ref = pindex->index[i]; + if (next_ref->state == WT_REF_SPLIT) + next_ref->state = WT_REF_DELETED; + } + if (locked) F_CLR_ATOMIC(parent, WT_PAGE_SPLITTING); if (hazard) @@ -1018,15 +1098,16 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) WT_PAGE *page, *right; WT_REF *child, *split_ref[2] = { NULL, NULL }; WT_UPDATE *upd; - size_t page_decr, parent_incr, right_incr, size; + size_t page_decr, parent_decr, parent_incr, right_incr; int i; *splitp = 0; btree = S2BT(session); page = ref->page; + ikey = NULL; right = NULL; - page_decr = parent_incr = right_incr = 0; + page_decr = parent_decr = parent_incr = right_incr = 0; /* * Check for pages with append-only workloads. A common application @@ -1127,9 +1208,19 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) WT_ERR(__wt_row_ikey(session, 0, WT_INSERT_KEY(moved_ins), WT_INSERT_KEY_SIZE(moved_ins), &child->key.ikey)); + + /* + * We're swapping WT_REFs in the parent, adjust the accounting, and + * row store pages may have instantiated keys. + */ WT_MEMSIZE_ADD(parent_incr, sizeof(WT_REF)); - WT_MEMSIZE_ADD(parent_incr, sizeof(WT_IKEY)); - WT_MEMSIZE_ADD(parent_incr, WT_INSERT_KEY_SIZE(moved_ins)); + WT_MEMSIZE_ADD( + parent_incr, sizeof(WT_IKEY) + WT_INSERT_KEY_SIZE(moved_ins)); + WT_MEMSIZE_ADD(parent_decr, sizeof(WT_REF)); + if (page->type == WT_PAGE_ROW_LEAF || page->type == WT_PAGE_ROW_INT) + if ((ikey = __wt_ref_key_instantiated(ref)) != NULL) + WT_MEMSIZE_ADD( + parent_decr, sizeof(WT_IKEY) + ikey->size); /* The new page is dirty by definition. 
*/ WT_ERR(__wt_page_modify_init(session, right)); @@ -1151,14 +1242,11 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) */ for (i = 0; i < WT_SKIP_MAXDEPTH && ins_head->tail[i] == moved_ins; ++i) ; - size = ((size_t)i - 1) * sizeof(WT_INSERT *); - size += sizeof(WT_INSERT) + WT_INSERT_KEY_SIZE(moved_ins); + WT_MEMSIZE_TRANSFER(page_decr, right_incr, sizeof(WT_INSERT) + + (size_t)i * sizeof(WT_INSERT *) + WT_INSERT_KEY_SIZE(moved_ins)); for (upd = moved_ins->upd; upd != NULL; upd = upd->next) - size += sizeof(WT_UPDATE) + upd->size; - WT_MEMSIZE_ADD(right_incr, size); - WT_MEMSIZE_ADD(page_decr, size); - __wt_cache_page_inmem_decr(session, page, page_decr); - __wt_cache_page_inmem_incr(session, right, right_incr); + WT_MEMSIZE_TRANSFER( + page_decr, right_incr, sizeof(WT_UPDATE) + upd->size); /* * Allocation operations completed, move the last insert list item from @@ -1245,10 +1333,23 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) #endif /* - * Split into the parent. + * Save the transaction ID when the split happened. Application + * threads will not try to forcibly evict the page again until + * all concurrent transactions commit. + */ + page->modify->inmem_split_txn = __wt_txn_new_id(session); + + /* Update the page accounting. */ + __wt_cache_page_inmem_decr(session, page, page_decr); + __wt_cache_page_inmem_incr(session, right, right_incr); + + /* + * Split into the parent. After this, the original page is no + * longer locked, so we cannot safely look at it. */ + page = NULL; if ((ret = __split_parent( - session, ref, split_ref, 2, 0, parent_incr, 0, 0)) != 0) { + session, ref, split_ref, 2, parent_decr, parent_incr, 0, 0)) != 0) { /* * Move the insert list element back to the original page list. 
* For simplicity, the previous skip list pointers originally @@ -1271,13 +1372,6 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) WT_ERR(ret); } - /* - * Save the transaction ID when the split happened. Application - * threads will not try to forcibly evict the page again until - * all concurrent transactions commit. - */ - page->modify->inmem_split_txn = __wt_txn_new_id(session); - /* Let our caller know that we split. */ *splitp = 1; @@ -1289,13 +1383,8 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) * structure and instantiated key, there may be threads using them. * Add them to the session discard list, to be freed once we know it's * safe. - * - * After the split, we're going to discard the WT_REF, account for the - * change in memory footprint. Row store pages have keys that may be - * instantiated, check for that. */ - if ((page->type == WT_PAGE_ROW_LEAF || page->type == WT_PAGE_ROW_INT) && - (ikey = __wt_ref_key_instantiated(ref)) != NULL) + if (ikey != NULL) WT_TRET(__split_safe_free( session, 0, ikey, sizeof(WT_IKEY) + ikey->size)); WT_TRET(__split_safe_free(session, 0, ref, sizeof(WT_REF))); @@ -1380,7 +1469,7 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_REF **ref_new; - size_t ikey_size, parent_decr, parent_incr; + size_t parent_decr, parent_incr; uint32_t i, new_entries; page = ref->page; @@ -1388,7 +1477,7 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) new_entries = mod->mod_multi_entries; ikey = NULL; - ikey_size = parent_decr = parent_incr = 0; + parent_decr = parent_incr = 0; /* * Convert the split page's multiblock reconciliation information into @@ -1404,12 +1493,11 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) * change in memory footprint. Row store pages have keys that may be * instantiated, check for that. 
*/ - if ((page->type == WT_PAGE_ROW_LEAF || page->type == WT_PAGE_ROW_INT) && - (ikey = __wt_ref_key_instantiated(ref)) != NULL) { - ikey_size = sizeof(WT_IKEY) + ikey->size; - WT_MEMSIZE_ADD(parent_decr, ikey_size); - } WT_MEMSIZE_ADD(parent_decr, sizeof(WT_REF)); + if (page->type == WT_PAGE_ROW_LEAF || page->type == WT_PAGE_ROW_INT) + if ((ikey = __wt_ref_key_instantiated(ref)) != NULL) + WT_MEMSIZE_ADD( + parent_decr, sizeof(WT_IKEY) + ikey->size); /* Split into the parent. */ WT_ERR(__split_parent(session, @@ -1436,7 +1524,8 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) * safe. */ if (ikey != NULL) - WT_TRET(__split_safe_free(session, exclusive, ikey, ikey_size)); + WT_TRET(__split_safe_free( + session, exclusive, ikey, sizeof(WT_IKEY) + ikey->size)); WT_TRET(__split_safe_free(session, exclusive, ref, sizeof(WT_REF))); /* diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c index c74a7177401..a2b2a6bb7c8 100644 --- a/src/btree/bt_walk.c +++ b/src/btree/bt_walk.c @@ -208,6 +208,12 @@ restart: /* break; } else if (LF_ISSET(WT_READ_TRUNCATE)) { /* + * Avoid pulling a deleted page back in to try + * to delete it again. + */ + if (__wt_delete_page_skip(session, ref)) + break; + /* * If deleting a range, try to delete the page * without instantiating it. */ @@ -242,8 +248,7 @@ restart: /* * If iterating a cursor, try to skip deleted * pages that are visible to us. */ - if (ref->state == WT_REF_DELETED && - __wt_delete_page_skip(session, ref)) + if (__wt_delete_page_skip(session, ref)) break; } diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index 2799a58f327..796b7d5147b 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -126,11 +126,13 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file) /* * If we're coming from a backup cursor we want the smaller of * the last full log file copied in backup or the checkpoint LSN. 
+ * Otherwise we want the minimum of the last log file written to + * disk and the checkpoint LSN. */ if (backup_file != 0) min_lognum = WT_MIN(log->ckpt_lsn.file, backup_file); else - min_lognum = log->ckpt_lsn.file; + min_lognum = WT_MIN(log->ckpt_lsn.file, log->sync_lsn.file); WT_RET(__wt_verbose(session, WT_VERB_LOG, "log_archive: archive to log number %" PRIu32, min_lognum)); @@ -276,6 +278,70 @@ err: } /* + * __log_close_server -- + * The log close server thread. + */ +static void * +__log_close_server(void *arg) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_FH *close_fh; + WT_LOG *log; + WT_LSN close_end_lsn, close_lsn; + WT_SESSION_IMPL *session; + int locked; + + session = arg; + conn = S2C(session); + log = conn->log; + locked = 0; + while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { + /* + * If there is a log file to close, fsync and close it. + */ + if ((close_fh = log->log_close_fh) != NULL) { + /* + * We've copied the file handle, clear out the one in + * log structure to allow it to be set again. + */ + log->log_close_fh = NULL; + /* + * Set the close_end_lsn to the LSN immediately after + * ours. That is, the beginning of the next log file. + * We need to know the LSN file number of our own close + * in case earlier calls are still in progress and the + * next one to move the sync_lsn into the next file for + * later syncs. + */ + WT_ERR(__wt_log_extract_lognum(session, close_fh->name, + &close_lsn.file)); + close_lsn.offset = 0; + close_end_lsn = close_lsn; + close_end_lsn.file++; + WT_ERR(__wt_fsync(session, close_fh)); + __wt_spin_lock(session, &log->log_sync_lock); + locked = 1; + WT_ERR(__wt_close(session, close_fh)); + log->sync_lsn = close_end_lsn; + WT_ERR(__wt_cond_signal(session, log->log_sync_cond)); + locked = 0; + __wt_spin_unlock(session, &log->log_sync_lock); + } else + /* Wait until the next event. 
*/ + WT_ERR(__wt_cond_wait(session, + conn->log_close_cond, 10000)); + } + + if (0) { +err: __wt_err(session, ret, "log close server error"); + } + if (locked) + __wt_spin_unlock(session, &log->log_sync_lock); + return (NULL); +} + +/* * __log_server -- * The log server thread. */ @@ -292,7 +358,7 @@ __log_server(void *arg) conn = S2C(session); log = conn->log; locked = 0; - while (F_ISSET(conn, WT_CONN_SERVER_RUN)) { + while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { /* * Perform log pre-allocation. */ @@ -320,7 +386,7 @@ __log_server(void *arg) } if (0) { -err: __wt_err(session, ret, "log archive server error"); +err: __wt_err(session, ret, "log server error"); } if (locked) (void)__wt_writeunlock(session, log->log_archive_lock); @@ -384,7 +450,7 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]) /* * __wt_logmgr_open -- - * Start the log subsystem and archive server thread. + * Start the log service threads. */ int __wt_logmgr_open(WT_SESSION_IMPL *session) @@ -394,14 +460,33 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) conn = S2C(session); /* If no log thread services are configured, we're done. */ - if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) || - !FLD_ISSET(conn->log_flags, + if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) + return (0); + + /* + * Start the log close thread. It is not configurable. + * If logging is enabled, this thread runs. + */ + WT_RET(__wt_open_internal_session( + conn, "log-close-server", 0, 0, &conn->log_close_session)); + WT_RET(__wt_cond_alloc(conn->log_close_session, + "log close server", 0, &conn->log_close_cond)); + + /* + * Start the thread. + */ + WT_RET(__wt_thread_create(conn->log_close_session, + &conn->log_close_tid, __log_close_server, conn->log_close_session)); + conn->log_close_tid_set = 1; + + /* If no log thread services are configured, we're done. 
*/ + if (!FLD_ISSET(conn->log_flags, (WT_CONN_LOG_ARCHIVE | WT_CONN_LOG_PREALLOC))) return (0); /* * If a log server thread exists, the user may have reconfigured - * archiving ore pre-allocation. Signal the thread. Otherwise the + * archiving or pre-allocation. Signal the thread. Otherwise the * user wants archiving and/or allocation and we need to start up * the thread. */ @@ -455,6 +540,12 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) conn->log_tid_set = 0; } WT_TRET(__wt_cond_destroy(session, &conn->log_cond)); + if (conn->log_close_tid_set) { + WT_TRET(__wt_cond_signal(session, conn->log_close_cond)); + WT_TRET(__wt_thread_join(session, conn->log_close_tid)); + conn->log_close_tid_set = 0; + } + WT_TRET(__wt_cond_destroy(session, &conn->log_close_cond)); WT_TRET(__wt_log_close(session)); diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c index b425376d6ae..ab873cc36a9 100644 --- a/src/conn/conn_open.c +++ b/src/conn/conn_open.c @@ -25,7 +25,7 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]) * Tell internal server threads to run: this must be set before opening * any sessions. */ - F_SET(conn, WT_CONN_SERVER_RUN); + F_SET(conn, WT_CONN_SERVER_RUN | WT_CONN_LOG_SERVER_RUN); /* WT_SESSION_IMPL array. */ WT_RET(__wt_calloc(session, @@ -130,6 +130,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) WT_TRET(__wt_txn_checkpoint_log( session, 1, WT_TXN_LOG_CKPT_STOP, NULL)); + F_CLR(conn, WT_CONN_LOG_SERVER_RUN); WT_TRET(__wt_logmgr_destroy(session)); /* Free memory for collators, compressors, data sources. 
*/ diff --git a/src/docs/images/wtstats.png b/src/docs/images/wtstats.png Binary files differ new file mode 100644 index 00000000000..f65a2871b6f --- /dev/null +++ b/src/docs/images/wtstats.png diff --git a/src/docs/performance.dox b/src/docs/performance.dox index 5b9d6c40e7b..2284e1e1d4f 100644 --- a/src/docs/performance.dox +++ b/src/docs/performance.dox @@ -24,4 +24,7 @@ investigate performance and tune their WiredTiger applications. <h2>Simulating workloads</h2> - @subpage wtperf +<h2>Visualizing performance</h2> +- @subpage wtstats + */ diff --git a/src/docs/spell.ok b/src/docs/spell.ok index 2fd7e5f0ad2..56d1aa1170f 100644 --- a/src/docs/spell.ok +++ b/src/docs/spell.ok @@ -76,6 +76,7 @@ WiredTigerException WiredTigerLog WiredTigerPanicException WiredTigerRollbackException +WiredTigerStat WiredTigerTestCase Za aR @@ -442,6 +443,7 @@ writelock writelocks wrlock wtperf +wtstats xa yieldcpu zlib diff --git a/src/docs/statistics.dox b/src/docs/statistics.dox index 067cf342111..7fdc4125254 100644 --- a/src/docs/statistics.dox +++ b/src/docs/statistics.dox @@ -153,4 +153,8 @@ A Python script that parses the default logging output and uses the Portable Network Graphics (PNG) format graphs is included in the WiredTiger distribution in the file \c tools/statlog.py. +@m_if{c} +To interactively examine statistics results, see @ref wtstats. +@m_endif + */ diff --git a/src/docs/wtstats.dox b/src/docs/wtstats.dox new file mode 100644 index 00000000000..1a792849124 --- /dev/null +++ b/src/docs/wtstats.dox @@ -0,0 +1,47 @@ +/*! @page wtstats Visualizing performance with wtstats + +The WiredTiger distribution includes the \b wtstats tool that can be used to +examine information generated using statistics logging (see @ref +statistics_log). + +After running an application with statistics logging configured, the +statistics output files will be in the database home directory. By default, +these are named \c WiredTigerStat.* .
In the database home directory, run +this command, replacing \c \<wiredtiger\> with the path to the +WiredTiger installation directory: +\code{.sh} +python <wiredtiger>/tools/wtstats.py WiredTigerStat.* +\endcode + +Another way to process all the stats files in a directory is: + +\code{.sh} +python <wiredtiger>/tools/wtstats.py <directory> +\endcode +In either case, a \c wtstats.html file will be generated in the \e current +directory that you can open in your browser to examine statistics. + +Additional options are available, use <tt>wtstats.py --help</tt> +to display them. + +Here is a sample of what is displayed using \c wtstats.html: + +\image html wtstats.png "wtstats.html" + +Some things to note about the interface: + +- The left sidebar has statistics groups that can each be expanded +to show individual statistics. Clicking on a circle toggles whether an +individual statistic or statistics group is displayed or not. + +- The search box at the upper left can be used to search for statistics +matching a string. + +- Hovering over values in the graph will show what the value is, and what +statistic is being shown. + +- The graph can be panned using two fingered scroll or mouse wheel. + +- Scaling of the entire graph can be changed using the buttons at the right top. + +*/ diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 60a5f82f233..a4ae0aaf55b 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -437,7 +437,7 @@ __evict_pass(WT_SESSION_IMPL *session) WT_EVICT_WORKER *worker; int loop; uint32_t flags; - uint64_t bytes_inuse, pages_evicted; + uint64_t bytes_inuse, dirty_target_size, pages_evicted, target_size; conn = S2C(session); cache = conn->cache; @@ -465,9 +465,16 @@ __evict_pass(WT_SESSION_IMPL *session) if (loop > 10) LF_SET(WT_EVICT_PASS_AGGRESSIVE); - /* Start a worker if we have capacity and the cache is full. */ + /* + * Start a worker if we have capacity and we haven't reached + * the eviction targets. 
+ */ bytes_inuse = __wt_cache_bytes_inuse(cache); - if (bytes_inuse > conn->cache_size && + target_size = (conn->cache_size * cache->eviction_target) / 100; + dirty_target_size = + (conn->cache_size * cache->eviction_dirty_target) / 100; + if ((bytes_inuse > target_size || + cache->bytes_dirty > dirty_target_size) && conn->evict_workers < conn->evict_workers_max) { WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER, "Starting evict worker: %"PRIu32"\n", diff --git a/src/include/btmem.h b/src/include/btmem.h index e1fc72677c5..dd10e522412 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -550,9 +550,10 @@ struct __wt_page { #define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */ #define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */ #define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ -#define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */ -#define WT_PAGE_SPLITTING 0x20 /* An internal page is growing */ +#define WT_PAGE_REFUSE_DEEPEN 0x10 /* Don't deepen the tree at this page */ +#define WT_PAGE_SCANNING 0x20 /* Obsolete updates are being scanned */ #define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */ +#define WT_PAGE_SPLITTING 0x80 /* An internal page is growing */ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ }; diff --git a/src/include/btree.i b/src/include/btree.i index a333e4af565..d30ee46486a 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -165,65 +165,6 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page) } /* - * __wt_cache_read_gen -- - * Get the current read generation number. - */ -static inline uint64_t -__wt_cache_read_gen(WT_SESSION_IMPL *session) -{ - return (S2C(session)->cache->read_gen); -} - -/* - * __wt_cache_read_gen_incr -- - * Increment the current read generation number. 
- */ -static inline void -__wt_cache_read_gen_incr(WT_SESSION_IMPL *session) -{ - ++S2C(session)->cache->read_gen; -} - -/* - * __wt_cache_read_gen_set -- - * Get the read generation to store in a page. - */ -static inline uint64_t -__wt_cache_read_gen_set(WT_SESSION_IMPL *session) -{ - /* - * We return read-generations from the future (where "the future" is - * measured by increments of the global read generation). The reason - * is because when acquiring a new hazard pointer for a page, we can - * check its read generation, and if the read generation isn't less - * than the current global generation, we don't bother updating the - * page. In other words, the goal is to avoid some number of updates - * immediately after each update we have to make. - */ - return (__wt_cache_read_gen(session) + WT_READGEN_STEP); -} - -/* - * __wt_cache_pages_inuse -- - * Return the number of pages in use. - */ -static inline uint64_t -__wt_cache_pages_inuse(WT_CACHE *cache) -{ - return (cache->pages_inmem - cache->pages_evict); -} - -/* - * __wt_cache_bytes_inuse -- - * Return the number of bytes in use. - */ -static inline uint64_t -__wt_cache_bytes_inuse(WT_CACHE *cache) -{ - return (cache->bytes_inmem - cache->bytes_evict); -} - -/* * __wt_page_evict_soon -- * Set a page to be evicted as soon as possible. */ @@ -917,16 +858,16 @@ __wt_ref_info(WT_SESSION_IMPL *session, } /* - * __wt_page_release -- - * Release a reference to a page. + * __wt_page_release_busy -- + * Release a reference to a page, fail if busy during forced eviction. 
*/ static inline int -__wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) +__wt_page_release_busy(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; - int locked; + int locked, too_big; btree = S2BT(session); @@ -938,6 +879,8 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) return (0); page = ref->page; + too_big = (page->memory_footprint < btree->maxmempage) ? 0 : 1; + /* * Attempt to evict pages with the special "oldest" read generation. * @@ -970,12 +913,19 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) return (ret); (void)WT_ATOMIC_ADD4(btree->evict_busy, 1); - if ((ret = __wt_evict_page(session, ref)) == 0) - WT_STAT_FAST_CONN_INCR(session, cache_eviction_force); - else { + if ((ret = __wt_evict_page(session, ref)) == 0) { + if (too_big) + WT_STAT_FAST_CONN_INCR(session, cache_eviction_force); + else + /* + * If the page isn't too big, we are evicting it because + * it had a chain of deleted entries that make traversal + * expensive. + */ + WT_STAT_FAST_CONN_INCR( + session, cache_eviction_force_delete); + } else { WT_STAT_FAST_CONN_INCR(session, cache_eviction_force_fail); - if (ret == EBUSY) - ret = 0; } (void)WT_ATOMIC_SUB4(btree->evict_busy, 1); @@ -983,6 +933,17 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) } /* + * __wt_page_release -- + * Release a reference to a page. + */ +static inline int +__wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) +{ + WT_RET_BUSY_OK(__wt_page_release_busy(session, ref, flags)); + return (0); +} + +/* * __wt_page_swap_func -- * Swap one page's hazard pointer for another one when hazard pointer * coupling up/down the tree. 
diff --git a/src/include/cache.i b/src/include/cache.i index b997781272a..ee969255241 100644 --- a/src/include/cache.i +++ b/src/include/cache.i @@ -7,6 +7,65 @@ */ /* + * __wt_cache_read_gen -- + * Get the current read generation number. + */ +static inline uint64_t +__wt_cache_read_gen(WT_SESSION_IMPL *session) +{ + return (S2C(session)->cache->read_gen); +} + +/* + * __wt_cache_read_gen_incr -- + * Increment the current read generation number. + */ +static inline void +__wt_cache_read_gen_incr(WT_SESSION_IMPL *session) +{ + ++S2C(session)->cache->read_gen; +} + +/* + * __wt_cache_read_gen_set -- + * Get the read generation to store in a page. + */ +static inline uint64_t +__wt_cache_read_gen_set(WT_SESSION_IMPL *session) +{ + /* + * We return read-generations from the future (where "the future" is + * measured by increments of the global read generation). The reason + * is because when acquiring a new hazard pointer for a page, we can + * check its read generation, and if the read generation isn't less + * than the current global generation, we don't bother updating the + * page. In other words, the goal is to avoid some number of updates + * immediately after each update we have to make. + */ + return (__wt_cache_read_gen(session) + WT_READGEN_STEP); +} + +/* + * __wt_cache_pages_inuse -- + * Return the number of pages in use. + */ +static inline uint64_t +__wt_cache_pages_inuse(WT_CACHE *cache) +{ + return (cache->pages_inmem - cache->pages_evict); +} + +/* + * __wt_cache_bytes_inuse -- + * Return the number of bytes in use. + */ +static inline uint64_t +__wt_cache_bytes_inuse(WT_CACHE *cache) +{ + return (cache->bytes_inmem - cache->bytes_evict); +} + +/* * __wt_eviction_check -- * Wake the eviction server if necessary. 
*/ diff --git a/src/include/connection.h b/src/include/connection.h index c8a3ae6e291..c5723882489 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -279,10 +279,14 @@ struct __wt_connection_impl { #define WT_CONN_LOG_EXISTED 0x04 /* Log files found */ #define WT_CONN_LOG_PREALLOC 0x08 /* Pre-allocation is enabled */ uint32_t log_flags; /* Global logging configuration */ - WT_CONDVAR *log_cond; /* Log archive wait mutex */ - WT_SESSION_IMPL *log_session; /* Log archive session */ - wt_thread_t log_tid; /* Log archive thread */ - int log_tid_set; /* Log archive thread set */ + WT_CONDVAR *log_cond; /* Log server wait mutex */ + WT_SESSION_IMPL *log_session; /* Log server session */ + wt_thread_t log_tid; /* Log server thread */ + int log_tid_set; /* Log server thread set */ + WT_CONDVAR *log_close_cond;/* Log close thread wait mutex */ + WT_SESSION_IMPL *log_close_session;/* Log close thread session */ + wt_thread_t log_close_tid; /* Log close thread thread */ + int log_close_tid_set;/* Log close thread set */ WT_LOG *log; /* Logging structure */ WT_COMPRESSOR *log_compressor;/* Logging compressor */ wt_off_t log_file_max; /* Log file max size */ diff --git a/src/include/cursor.i b/src/include/cursor.i index ae6aafdd638..8fa9790e096 100644 --- a/src/include/cursor.i +++ b/src/include/cursor.i @@ -176,11 +176,23 @@ static inline int __cursor_func_init(WT_CURSOR_BTREE *cbt, int reenter) { WT_SESSION_IMPL *session; + WT_TXN *txn; session = (WT_SESSION_IMPL *)cbt->iface.session; + txn = &session->txn; if (reenter) WT_RET(__curfile_leave(cbt)); + + /* + * If there is no transaction active in this thread and we haven't + * checked if the cache is full, do it now. If we have to block for + * eviction, this is the best time to do it. 
+ */ + if (F_ISSET(txn, TXN_RUNNING) && + !F_ISSET(txn, TXN_HAS_ID) && !F_ISSET(txn, TXN_HAS_SNAPSHOT)) + WT_RET(__wt_cache_full_check(session)); + if (!F_ISSET(cbt, WT_CBT_ACTIVE)) WT_RET(__curfile_enter(cbt)); __wt_txn_cursor_op(session); diff --git a/src/include/flags.h b/src/include/flags.h index c7e74885a35..9664fce3f9f 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -6,15 +6,16 @@ #define WT_CONN_CKPT_SYNC 0x00000002 #define WT_CONN_EVICTION_RUN 0x00000004 #define WT_CONN_LEAK_MEMORY 0x00000008 -#define WT_CONN_LSM_MERGE 0x00000010 -#define WT_CONN_PANIC 0x00000020 -#define WT_CONN_SERVER_ASYNC 0x00000040 -#define WT_CONN_SERVER_CHECKPOINT 0x00000080 -#define WT_CONN_SERVER_LSM 0x00000100 -#define WT_CONN_SERVER_RUN 0x00000200 -#define WT_CONN_SERVER_STATISTICS 0x00000400 -#define WT_CONN_SERVER_SWEEP 0x00000800 -#define WT_CONN_WAS_BACKUP 0x00001000 +#define WT_CONN_LOG_SERVER_RUN 0x00000010 +#define WT_CONN_LSM_MERGE 0x00000020 +#define WT_CONN_PANIC 0x00000040 +#define WT_CONN_SERVER_ASYNC 0x00000080 +#define WT_CONN_SERVER_CHECKPOINT 0x00000100 +#define WT_CONN_SERVER_LSM 0x00000200 +#define WT_CONN_SERVER_RUN 0x00000400 +#define WT_CONN_SERVER_STATISTICS 0x00000800 +#define WT_CONN_SERVER_SWEEP 0x00001000 +#define WT_CONN_WAS_BACKUP 0x00002000 #define WT_EVICTING 0x00000001 #define WT_FILE_TYPE_CHECKPOINT 0x00000001 #define WT_FILE_TYPE_DATA 0x00000002 diff --git a/src/include/stat.h b/src/include/stat.h index cbd22c7b9d0..6efb9970065 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -164,6 +164,7 @@ struct __wt_connection_stats { WT_STATS cache_eviction_dirty; WT_STATS cache_eviction_fail; WT_STATS cache_eviction_force; + WT_STATS cache_eviction_force_delete; WT_STATS cache_eviction_force_fail; WT_STATS cache_eviction_hazard; WT_STATS cache_eviction_internal; diff --git a/src/include/txn.i b/src/include/txn.i index 745a8f75a99..656181790ed 100644 --- a/src/include/txn.i +++ b/src/include/txn.i @@ -227,6 +227,16 @@ 
__wt_txn_id_check(WT_SESSION_IMPL *session) txn = &session->txn; WT_ASSERT(session, F_ISSET(txn, TXN_RUNNING)); + + /* + * If there is no transaction active in this thread and we haven't + * checked if the cache is full, do it now. If we have to block for + * eviction, this is the best time to do it. + */ + if (F_ISSET(txn, TXN_RUNNING) && + !F_ISSET(txn, TXN_HAS_ID) && !F_ISSET(txn, TXN_HAS_SNAPSHOT)) + WT_RET(__wt_cache_full_check(session)); + if (!F_ISSET(txn, TXN_HAS_ID)) { conn = S2C(session); txn_global = &conn->txn_global; diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 80b917e37cb..5f6818ebba5 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -3185,206 +3185,208 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_CACHE_EVICTION_FAIL 1030 /*! cache: pages evicted because they exceeded the in-memory maximum */ #define WT_STAT_CONN_CACHE_EVICTION_FORCE 1031 +/*! cache: pages evicted because they had chains of deleted items */ +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1032 /*! cache: failed eviction of pages that exceeded the in-memory maximum */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1032 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1033 /*! cache: hazard pointer blocked page eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1033 +#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1034 /*! cache: internal pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1034 +#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1035 /*! cache: maximum page size at eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1035 +#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1036 /*! cache: eviction server candidate queue empty when topping up */ -#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1036 +#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1037 /*! 
cache: eviction server candidate queue not empty when topping up */ -#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1037 +#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1038 /*! cache: eviction server evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1038 +#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1039 /*! cache: eviction server populating queue, but not evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1039 +#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1040 /*! cache: eviction server unable to reach eviction goal */ -#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1040 +#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1041 /*! cache: pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT 1041 +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT 1042 /*! cache: pages walked for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK 1042 +#define WT_STAT_CONN_CACHE_EVICTION_WALK 1043 /*! cache: in-memory page splits */ -#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1043 +#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1044 /*! cache: tracked dirty pages in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1044 +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1045 /*! cache: pages currently held in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_INUSE 1045 +#define WT_STAT_CONN_CACHE_PAGES_INUSE 1046 /*! cache: pages read into cache */ -#define WT_STAT_CONN_CACHE_READ 1046 +#define WT_STAT_CONN_CACHE_READ 1047 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1047 +#define WT_STAT_CONN_CACHE_WRITE 1048 /*! connection: pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 1048 +#define WT_STAT_CONN_COND_WAIT 1049 /*! cursor: cursor create calls */ -#define WT_STAT_CONN_CURSOR_CREATE 1049 +#define WT_STAT_CONN_CURSOR_CREATE 1050 /*! cursor: cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT 1050 +#define WT_STAT_CONN_CURSOR_INSERT 1051 /*! 
cursor: cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT 1051 +#define WT_STAT_CONN_CURSOR_NEXT 1052 /*! cursor: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1052 +#define WT_STAT_CONN_CURSOR_PREV 1053 /*! cursor: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1053 +#define WT_STAT_CONN_CURSOR_REMOVE 1054 /*! cursor: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1054 +#define WT_STAT_CONN_CURSOR_RESET 1055 /*! cursor: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1055 +#define WT_STAT_CONN_CURSOR_SEARCH 1056 /*! cursor: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1056 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1057 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1057 +#define WT_STAT_CONN_CURSOR_UPDATE 1058 /*! data-handle: connection dhandles swept */ -#define WT_STAT_CONN_DH_CONN_HANDLES 1058 +#define WT_STAT_CONN_DH_CONN_HANDLES 1059 /*! data-handle: connection candidate referenced */ -#define WT_STAT_CONN_DH_CONN_REF 1059 +#define WT_STAT_CONN_DH_CONN_REF 1060 /*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_CONN_SWEEPS 1060 +#define WT_STAT_CONN_DH_CONN_SWEEPS 1061 /*! data-handle: connection time-of-death sets */ -#define WT_STAT_CONN_DH_CONN_TOD 1061 +#define WT_STAT_CONN_DH_CONN_TOD 1062 /*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1062 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1063 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1063 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1064 /*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1064 +#define WT_STAT_CONN_FILE_OPEN 1065 /*! log: log buffer size increases */ -#define WT_STAT_CONN_LOG_BUFFER_GROW 1065 +#define WT_STAT_CONN_LOG_BUFFER_GROW 1066 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1066 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1067 /*! 
log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1067 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1068 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1068 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1069 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1069 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1070 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1070 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1071 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1071 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1072 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1072 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1073 /*! log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1073 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1074 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1074 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1075 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1075 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1076 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1076 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1077 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1077 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1078 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1078 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1079 /*! log: log read operations */ -#define WT_STAT_CONN_LOG_READS 1079 +#define WT_STAT_CONN_LOG_READS 1080 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1080 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1081 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1081 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1082 /*! 
log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1082 +#define WT_STAT_CONN_LOG_SCANS 1083 /*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1083 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1084 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1084 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1085 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1085 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1086 /*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1086 +#define WT_STAT_CONN_LOG_SLOT_RACES 1087 /*! log: slots selected for switching that were unavailable */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1087 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1088 /*! log: record size exceeded maximum */ -#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1088 +#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1089 /*! log: failed to find a slot large enough for record */ -#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1089 +#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1090 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1090 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1091 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1091 +#define WT_STAT_CONN_LOG_SYNC 1092 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1092 +#define WT_STAT_CONN_LOG_WRITES 1093 /*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1093 +#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1094 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1094 +#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1095 /*! LSM: rows merged in an LSM tree */ -#define WT_STAT_CONN_LSM_ROWS_MERGED 1095 +#define WT_STAT_CONN_LSM_ROWS_MERGED 1096 /*! LSM: application work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1096 +#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1097 /*! 
LSM: merge work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1097 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1098 /*! LSM: tree queue hit maximum */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1098 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1099 /*! LSM: switch work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1099 +#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1100 /*! LSM: tree maintenance operations scheduled */ -#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1100 +#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1101 /*! LSM: tree maintenance operations discarded */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1101 +#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1102 /*! LSM: tree maintenance operations executed */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1102 +#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1103 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1103 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1104 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1104 +#define WT_STAT_CONN_MEMORY_FREE 1105 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1105 +#define WT_STAT_CONN_MEMORY_GROW 1106 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1106 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1107 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1107 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1108 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1108 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1109 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1109 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1110 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1110 +#define WT_STAT_CONN_PAGE_SLEEP 1111 /*! 
connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1111 +#define WT_STAT_CONN_READ_IO 1112 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1112 +#define WT_STAT_CONN_REC_PAGES 1113 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1113 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1114 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1114 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1115 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1115 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1116 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1116 +#define WT_STAT_CONN_RWLOCK_READ 1117 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1117 +#define WT_STAT_CONN_RWLOCK_WRITE 1118 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1118 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1119 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1119 +#define WT_STAT_CONN_SESSION_OPEN 1120 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1120 +#define WT_STAT_CONN_TXN_BEGIN 1121 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1121 +#define WT_STAT_CONN_TXN_CHECKPOINT 1122 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1122 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1123 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1123 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1124 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1124 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1125 /*! 
transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1125 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1126 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1126 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1127 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1127 +#define WT_STAT_CONN_TXN_COMMIT 1128 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1128 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1129 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1129 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1130 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1130 +#define WT_STAT_CONN_TXN_ROLLBACK 1131 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1131 +#define WT_STAT_CONN_WRITE_IO 1132 /*! * @} diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h index 138b64a6e27..1b3a9b62626 100644 --- a/src/include/wt_internal.h +++ b/src/include/wt_internal.h @@ -322,13 +322,13 @@ struct __wt_update; #include "misc.i" #include "intpack.i" /* required by cell.i, packing.i */ #include "packing.i" +#include "cache.i" /* required by txn.i */ #include "cell.i" /* required by btree.i */ #include "mutex.i" /* required by btree.i */ #include "txn.i" /* required by btree.i */ #include "btree.i" /* required by cursor.i */ -#include "cache.i" /* required by cursor.i */ #include "cursor.i" #include "bitstring.i" diff --git a/src/log/log.c b/src/log/log.c index 944e748a6a8..c48fc7536b2 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -240,6 +240,7 @@ __log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot) if (log->log_close_fh != NULL) F_SET(slot, SLOT_CLOSEFH); } + /* * Checkpoints can be configured based on amount of log written. 
* Add in this log record to the sum and if needed, signal the @@ -857,9 +858,8 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; - WT_FH *close_fh; WT_LOG *log; - WT_LSN sync_lsn; + WT_LSN close_end_lsn, close_lsn, sync_lsn; size_t write_size; int locked; WT_DECL_SPINLOCK_ID(id); /* Must appear last */ @@ -872,12 +872,8 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) * If we're going to have to close our log file, make a local copy * of the file handle structure. */ - close_fh = NULL; - if (F_ISSET(slot, SLOT_CLOSEFH)) { - close_fh = log->log_close_fh; - log->log_close_fh = NULL; - F_CLR(slot, SLOT_CLOSEFH); - } + WT_INIT_LSN(&close_lsn); + WT_INIT_LSN(&close_end_lsn); /* Write the buffered records */ if (F_ISSET(slot, SLOT_BUFFERED)) { @@ -895,13 +891,22 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) __wt_yield(); log->write_lsn = slot->slot_end_lsn; + if (F_ISSET(slot, SLOT_CLOSEFH)) + WT_ERR(__wt_cond_signal(session, conn->log_close_cond)); + /* * Try to consolidate calls to fsync to wait less. Acquire a spin lock * so that threads finishing writing to the log will wait while the * current fsync completes and advance log->sync_lsn. */ while (F_ISSET(slot, SLOT_SYNC | SLOT_SYNC_DIR)) { - if (__wt_spin_trylock(session, &log->log_sync_lock, &id) != 0) { + /* + * We have to wait until earlier log files have finished their + * sync operations. The most recent one will set the LSN to the + * beginning of our file. + */ + if (log->sync_lsn.file < slot->slot_end_lsn.file || + __wt_spin_trylock(session, &log->log_sync_lock, &id) != 0) { WT_ERR(__wt_cond_wait( session, log->log_sync_cond, 10000)); continue; @@ -909,10 +914,10 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) locked = 1; /* - * Record the current end of log after we grabbed the lock. + * Record the current end of our update after the lock. * That is how far our calls can guarantee. 
*/ - sync_lsn = log->write_lsn; + sync_lsn = slot->slot_end_lsn; /* * Check if we have to sync the parent directory. Some * combinations of sync flags may result in the log file @@ -956,16 +961,6 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) WT_ERR(__wt_buf_grow(session, &slot->slot_buf, slot->slot_buf.memsize * 2)); } - /* - * If we have a file to close, close it now. First fsync so - * that a later sync will be assured all earlier transactions - * in earlier log files are also on disk. - */ - if (close_fh) { - WT_ERR(__wt_fsync(session, close_fh)); - WT_ERR(__wt_close(session, close_fh)); - } - err: if (locked) __wt_spin_unlock(session, &log->log_sync_lock); if (ret != 0 && slot->slot_error == 0) diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index 031a4e88467..3f14e035a9b 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -171,8 +171,6 @@ __clsm_enter(WT_CURSOR_LSM *clsm, int reset, int update) lsm_tree->nchunks != 0) goto open; - WT_RET(__wt_cache_full_check(session)); - if (clsm->dsk_gen != lsm_tree->dsk_gen && lsm_tree->nchunks != 0) goto open; diff --git a/src/session/session_api.c b/src/session/session_api.c index 3ab5e0acab1..8ee143133ae 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -736,13 +736,6 @@ __session_begin_transaction(WT_SESSION *wt_session, const char *config) if (F_ISSET(&session->txn, TXN_RUNNING)) WT_ERR_MSG(session, EINVAL, "Transaction already running"); - /* - * There is no transaction active in this thread; check if the cache is - * full, if we have to block for eviction, this is the best time to do - * it. 
- */ - WT_ERR(__wt_cache_full_check(session)); - ret = __wt_txn_begin(session, cfg); err: API_END_RET(session, ret); diff --git a/src/support/stat.c b/src/support/stat.c index f4ae082add3..223d62d0559 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -376,6 +376,8 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats) "cache: pages currently held in the cache"; stats->cache_eviction_force.desc = "cache: pages evicted because they exceeded the in-memory maximum"; + stats->cache_eviction_force_delete.desc = + "cache: pages evicted because they had chains of deleted items"; stats->cache_eviction_app.desc = "cache: pages evicted by application threads"; stats->cache_read.desc = "cache: pages read into cache"; @@ -554,6 +556,7 @@ __wt_stat_refresh_connection_stats(void *stats_arg) stats->cache_eviction_dirty.v = 0; stats->cache_eviction_deepen.v = 0; stats->cache_eviction_force.v = 0; + stats->cache_eviction_force_delete.v = 0; stats->cache_eviction_app.v = 0; stats->cache_read.v = 0; stats->cache_eviction_fail.v = 0; diff --git a/src/txn/txn.c b/src/txn/txn.c index fd80efd5ebd..5b8f11a88a5 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -361,8 +361,15 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) /* If we are logging, write a commit log record. */ if (ret == 0 && txn->mod_count > 0 && FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED) && - !F_ISSET(session, WT_SESSION_NO_LOGGING)) + !F_ISSET(session, WT_SESSION_NO_LOGGING)) { + /* + * We are about to block on I/O writing the log. + * Release our snapshot in case it is keeping data pinned. + * This is particularly important for checkpoints. + */ + __wt_txn_release_snapshot(session); ret = __wt_txn_log_commit(session, cfg); + } /* * If anything went wrong, roll back. 
diff --git a/src/txn/txn_log.c b/src/txn/txn_log.c index f66bd7e09c8..789be2ceef4 100644 --- a/src/txn/txn_log.c +++ b/src/txn/txn_log.c @@ -270,6 +270,7 @@ __wt_txn_checkpoint_log( { WT_DECL_ITEM(logrec); WT_DECL_RET; + WT_ITEM *ckpt_snapshot, empty; WT_LSN *ckpt_lsn; WT_TXN *txn; uint8_t *end, *p; @@ -319,21 +320,26 @@ __wt_txn_checkpoint_log( */ if (!txn->full_ckpt) { txn->ckpt_nsnapshot = 0; + WT_CLEAR(empty); + ckpt_snapshot = &empty; *ckpt_lsn = S2C(session)->log->alloc_lsn; - } + } else + ckpt_snapshot = txn->ckpt_snapshot; /* Write the checkpoint log record. */ WT_ERR(__wt_struct_size(session, &recsize, fmt, rectype, ckpt_lsn->file, ckpt_lsn->offset, - txn->ckpt_nsnapshot, &txn->ckpt_snapshot)); + txn->ckpt_nsnapshot, ckpt_snapshot)); WT_ERR(__wt_logrec_alloc(session, recsize, &logrec)); WT_ERR(__wt_struct_pack(session, (uint8_t *)logrec->data + logrec->size, recsize, fmt, rectype, ckpt_lsn->file, ckpt_lsn->offset, - txn->ckpt_nsnapshot, &txn->ckpt_snapshot)); + txn->ckpt_nsnapshot, ckpt_snapshot)); logrec->size += (uint32_t)recsize; - WT_ERR(__wt_log_write(session, logrec, lsnp, 0)); + WT_ERR(__wt_log_write(session, logrec, lsnp, + F_ISSET(S2C(session), WT_CONN_CKPT_SYNC) ? + WT_LOG_FSYNC : 0)); /* * If this full checkpoint completed successfully and there is |