author    Alex Gorrod <alexg@wiredtiger.com>  2015-12-01 22:53:21 +0000
committer Alex Gorrod <alexg@wiredtiger.com>  2015-12-01 22:53:21 +0000
commit    f1a93162f2a17d76d3930b43b3ae97be7b7cca7d
tree      ad618e6c096055f2aaa02008ed070ee643ede639
parent    f2fa6b9283169ac2b2737de6d929fa6dac544d8b
parent    e731ef8ab8b8f9d1c65380c83fc3e7b318f3fbe8
download  mongo-f1a93162f2a17d76d3930b43b3ae97be7b7cca7d.tar.gz
Merge branch 'develop' into wtperf_truncate_multiplier
 src/btree/bt_delete.c     | 15
 src/btree/bt_discard.c    |  9
 src/btree/bt_handle.c     |  7
 src/btree/bt_read.c       | 12
 src/btree/bt_slvg.c       |  8
 src/btree/bt_split.c      | 65
 src/cursor/cur_file.c     |  5
 src/evict/evict_file.c    |  2
 src/evict/evict_lru.c     | 58
 src/evict/evict_page.c    | 14
 src/include/btmem.h       | 22
 src/include/btree.h       |  3
 src/include/btree.i       | 73
 src/include/cursor.h      |  9
 src/include/cursor.i      | 24
 src/log/log_slot.c        |  2
 src/reconcile/rec_write.c | 24
 src/session/session_api.c |  3
 test/format/config.c      |  2
 19 files changed, 162 insertions(+), 195 deletions(-)
diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c
index dc9352ec981..9dd72108e4b 100644
--- a/src/btree/bt_delete.c
+++ b/src/btree/bt_delete.c
@@ -99,25 +99,18 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
/*
* We cannot fast-delete pages that have overflow key/value items as
* the overflow blocks have to be discarded. The way we figure that
- * out is to check the on-page cell type for the page, cells for leaf
- * pages that have no overflow items are special.
- *
- * In some cases, the reference address may not reference an on-page
- * cell (for example, some combination of page splits), in which case
- * we can't check the original cell value and we fail.
+ * out is to check the page's cell type, cells for leaf pages without
+ * overflow items are special.
*
* To look at an on-page cell, we need to look at the parent page, and
* that's dangerous, our parent page could change without warning if
* the parent page were to split, deepening the tree. It's safe: the
* page's reference will always point to some valid page, and if we find
* any problems we simply fail the fast-delete optimization.
- *
- * !!!
- * I doubt it's worth the effort, but we could copy the cell's type into
- * the reference structure, and then we wouldn't need an on-page cell.
*/
parent = ref->home;
- if (__wt_off_page(parent, ref->addr) ||
+ if (__wt_off_page(parent, ref->addr) ?
+ ((WT_ADDR *)ref->addr)->type != WT_ADDR_LEAF_NO :
__wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO)
goto err;
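
[Note] The change above turns the old single test into a two-way check: if the reference address has been moved off-page it is a WT_ADDR structure with its own type field, otherwise it is still an on-page cell. A minimal standalone sketch of that shape, using simplified stand-in types rather than the real WiredTiger structures:

    #include <stdbool.h>

    enum addr_type { ADDR_INT, ADDR_LEAF, ADDR_LEAF_NO };

    struct addr { enum addr_type type; };      /* off-page address */

    struct ref {
            void *addr;                        /* struct addr * or an on-page cell */
            bool off_page;                     /* stand-in for __wt_off_page() */
            enum addr_type cell_type;          /* stand-in for __wt_cell_type_raw() */
    };

    /* Fast-delete only applies to leaf pages with no overflow items. */
    static bool
    can_fast_delete(const struct ref *ref)
    {
            return (ref->off_page ?
                ((struct addr *)ref->addr)->type == ADDR_LEAF_NO :
                ref->cell_type == ADDR_LEAF_NO);
    }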
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
index 13bd943f803..54d9761c487 100644
--- a/src/btree/bt_discard.c
+++ b/src/btree/bt_discard.c
@@ -50,8 +50,7 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
page = *pagep;
*pagep = NULL;
- if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD) &&
- __wt_page_is_modified(page))
+ if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
__wt_page_modify_clear(session, page);
/*
@@ -270,11 +269,7 @@ __wt_free_ref(
* Free any address allocation; if there's no linked WT_REF page, it
* must be allocated.
*/
- if (ref->addr != NULL &&
- (ref->home == NULL || __wt_off_page(ref->home, ref->addr))) {
- __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
- __wt_free(session, ref->addr);
- }
+ __wt_ref_free_addr(session, ref);
/* Free any page-deleted information. */
if (ref->page_del != NULL) {
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index dbdf94fc1b6..294cc399d65 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -697,6 +697,13 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
}
/*
+ * Try in-memory splits once we hit 80% of the maximum in-memory page
+ * size. This gives multi-threaded append workloads a better chance of
+ * not stalling.
+ */
+ btree->splitmempage = 8 * btree->maxmempage / 10;
+
+ /*
* Get the split percentage (reconciliation splits pages into smaller
* than the maximum page size chunks so we don't split every time a
* new entry is added). Determine how large newly split pages will be.
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index 18fd87e78ff..77215474359 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -307,10 +307,6 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref)
btree = S2BT(session);
page = ref->page;
- /* Pages are usually small enough, check that first. */
- if (page->memory_footprint < btree->maxmempage)
- return (0);
-
/* Leaf pages only. */
if (WT_PAGE_IS_INTERNAL(page))
return (0);
@@ -322,6 +318,12 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref)
if (page->modify == NULL)
return (0);
+ /* Pages are usually small enough, check that first. */
+ if (page->memory_footprint < btree->splitmempage)
+ return (0);
+ else if (page->memory_footprint < btree->maxmempage)
+ return (__wt_leaf_page_can_split(session, page));
+
/* Trigger eviction on the next page release. */
__wt_page_evict_soon(page);
@@ -329,7 +331,7 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref)
__wt_txn_update_oldest(session, false);
/* If eviction cannot succeed, don't try. */
- return (__wt_page_can_evict(session, ref, true, NULL));
+ return (__wt_page_can_evict(session, ref, NULL));
}
/*
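
[Note] Together with the new btree->splitmempage field (80% of maxmempage, set in bt_handle.c above), this hunk reorders the force-eviction check so the size thresholds are consulted only for modified leaf pages, and pages between the split trigger and the hard limit are offered for in-memory splitting rather than eviction. A rough sketch of the resulting threshold logic, with plain structs standing in for the btree and page:

    #include <stdbool.h>
    #include <stdint.h>

    /* Hypothetical stand-ins for the btree and page fields used above. */
    struct cfg { uint64_t maxmempage, splitmempage; };
    struct pg  { uint64_t footprint; bool is_internal, is_modified, can_split; };

    static void
    config_page_sizes(struct cfg *cfg, uint64_t maxmempage)
    {
            cfg->maxmempage = maxmempage;
            /* Try in-memory splits once a page reaches 80% of the maximum. */
            cfg->splitmempage = 8 * maxmempage / 10;
    }

    /* Return true if releasing this page should force eviction work. */
    static bool
    force_check(const struct cfg *cfg, const struct pg *pg)
    {
            if (pg->is_internal || !pg->is_modified)
                    return (false);
            if (pg->footprint < cfg->splitmempage)
                    return (false);
            if (pg->footprint < cfg->maxmempage)
                    return (pg->can_split);    /* split in memory, don't evict yet */
            return (true);                     /* over the hard limit: evict soon */
    }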
diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c
index 80e467b5707..e4a860bb421 100644
--- a/src/btree/bt_slvg.c
+++ b/src/btree/bt_slvg.c
@@ -1290,9 +1290,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref)
* would have been lost.) Clear the reference addr so eviction doesn't
* free the underlying blocks.
*/
- __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
- __wt_free(session, ref->addr);
- ref->addr = NULL;
+ __wt_ref_free_addr(session, ref);
/* Write the new version of the leaf page to disk. */
WT_ERR(__slvg_modify_init(session, page));
@@ -2013,9 +2011,7 @@ __slvg_row_build_leaf(
* would have been lost.) Clear the reference addr so eviction doesn't
* free the underlying blocks.
*/
- __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
- __wt_free(session, ref->addr);
- ref->addr = NULL;
+ __wt_ref_free_addr(session, ref);
/* Write the new version of the leaf page to disk. */
WT_ERR(__slvg_modify_init(session, page));
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 35c3bfea711..e2031553aed 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -340,8 +340,18 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
return (ret);
}
addr->size = (uint8_t)unpack.size;
- addr->type =
- unpack.raw == WT_CELL_ADDR_INT ? WT_ADDR_INT : WT_ADDR_LEAF;
+ switch (unpack.raw) {
+ case WT_CELL_ADDR_INT:
+ addr->type = WT_ADDR_INT;
+ break;
+ case WT_CELL_ADDR_LEAF:
+ addr->type = WT_ADDR_LEAF;
+ break;
+ case WT_CELL_ADDR_LEAF_NO:
+ addr->type = WT_ADDR_LEAF_NO;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
ref->addr = addr;
}
@@ -399,18 +409,9 @@ __split_ref_move_final(
WT_DECL_RET;
WT_PAGE *child;
WT_REF *ref, *child_ref;
- uint64_t txn_new_id;
uint32_t i;
/*
- * When creating new internal pages as part of a split, we set a field
- * in those pages modify structure to prevent them from being evicted
- * until all threads are known to have exited the index of the page that
- * previously "owned" the WT_REF. Set that field to a safe value.
- */
- txn_new_id = __wt_txn_id_alloc(session, false);
-
- /*
* The WT_REF structures moved to newly allocated child pages reference
* the wrong parent page and we have to fix that up. The problem is
* revealed when a thread of control searches for the child page's
@@ -461,8 +462,6 @@ __split_ref_move_final(
if (child_ref->home != child) {
child_ref->home = child;
child_ref->pindex_hint = 0;
-
- child->modify->mod_split_txn = txn_new_id;
}
} WT_INTL_FOREACH_END;
WT_LEAVE_PAGE_INDEX(session);
@@ -896,6 +895,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
*/
WT_ASSERT(session, next_ref->page_del == NULL);
+ __wt_ref_free_addr(session, next_ref);
WT_TRET(__split_safe_free(
session, split_gen, exclusive, next_ref, sizeof(WT_REF)));
parent_decr += sizeof(WT_REF);
@@ -1183,8 +1183,8 @@ err: /*
* Lock an internal page.
*/
static int
-__split_internal_lock(
- WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE **parentp, bool *hazardp)
+__split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock,
+ WT_PAGE **parentp, bool *hazardp)
{
WT_DECL_RET;
WT_PAGE *parent;
@@ -1202,7 +1202,7 @@ __split_internal_lock(
* loop until the exclusive lock is resolved). If we want to split
* the parent, give up to avoid that deadlock.
*/
- if (S2BT(session)->checkpointing != WT_CKPT_OFF)
+ if (!trylock && S2BT(session)->checkpointing != WT_CKPT_OFF)
return (EBUSY);
/*
@@ -1227,7 +1227,10 @@ __split_internal_lock(
if (F_ISSET_ATOMIC(parent, WT_PAGE_SPLIT_BLOCK))
return (EBUSY);
- WT_RET(__wt_fair_lock(session, &parent->page_lock));
+ if (trylock)
+ WT_RET(__wt_fair_trylock(session, &parent->page_lock));
+ else
+ WT_RET(__wt_fair_lock(session, &parent->page_lock));
if (parent == ref->home)
break;
WT_RET(__wt_fair_unlock(session, &parent->page_lock));
@@ -1371,7 +1374,7 @@ __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard)
* locks, lock-coupling up the tree.
*/
WT_ERR(__split_internal_lock(
- session, ref, &parent, &parent_hazard));
+ session, ref, true, &parent, &parent_hazard));
ret = __split_internal(session, parent, page);
WT_TRET(__split_internal_unlock(session, page, page_hazard));
@@ -1635,7 +1638,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
*
* Note this page has already been through an in-memory split.
*/
- WT_ASSERT(session, __wt_page_can_split(session, page));
+ WT_ASSERT(session, __wt_leaf_page_can_split(session, page));
WT_ASSERT(session, __wt_page_is_modified(page));
F_SET_ATOMIC(page, WT_PAGE_SPLIT_INSERT);
@@ -1669,6 +1672,12 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
child->addr = ref->addr;
/*
+ * The address has moved to the replacement WT_REF. Make sure it isn't
+ * freed when the original ref is discarded.
+ */
+ ref->addr = NULL;
+
+ /*
* Copy the first key from the original page into first ref in the new
* parent. Pages created in memory always have a "smallest" insert
* list, so look there first. If we don't find one, get the first key
@@ -1818,13 +1827,6 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
#endif
/*
- * Save the transaction ID when the split happened. Application
- * threads will not try to forcibly evict the page again until
- * all concurrent transactions commit.
- */
- page->modify->inmem_split_txn = __wt_txn_id_alloc(session, false);
-
- /*
* Update the page accounting.
*
* XXX
@@ -1864,6 +1866,11 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
return (0);
err: if (split_ref[0] != NULL) {
+ /*
+ * The address was moved to the replacement WT_REF, restore it.
+ */
+ ref->addr = split_ref[0]->addr;
+
__wt_free(session, split_ref[0]->key.ikey);
__wt_free(session, split_ref[0]);
}
@@ -1891,7 +1898,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
WT_RET(__wt_verbose(
session, WT_VERB_SPLIT, "%p: split-insert", ref->page));
- WT_RET(__split_internal_lock(session, ref, &parent, &hazard));
+ WT_RET(__split_internal_lock(session, ref, true, &parent, &hazard));
if ((ret = __split_insert(session, ref)) != 0) {
WT_TRET(__split_internal_unlock(session, parent, hazard));
return (ret);
@@ -1983,7 +1990,7 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
WT_RET(__wt_verbose(
session, WT_VERB_SPLIT, "%p: split-multi", ref->page));
- WT_RET(__split_internal_lock(session, ref, &parent, &hazard));
+ WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard));
if ((ret = __split_multi(session, ref, closing)) != 0 || closing) {
WT_TRET(__split_internal_unlock(session, parent, hazard));
return (ret);
@@ -2012,7 +2019,7 @@ __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
WT_RET(__wt_verbose(
session, WT_VERB_SPLIT, "%p: reverse-split", ref->page));
- WT_RET(__split_internal_lock(session, ref, &parent, &hazard));
+ WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard));
ret = __split_parent(session, ref, NULL, 0, 0, false, true);
WT_TRET(__split_internal_unlock(session, parent, hazard));
return (ret);
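
[Note] __split_internal_lock now takes a trylock argument: callers that can simply retry later (split-insert and the parent climb) try the parent's page lock and back off on contention, while multi and reverse splits still block. A tiny illustration of the same lock/trylock split using a pthread mutex; the names here are illustrative, not the WiredTiger fair-lock API:

    #include <pthread.h>
    #include <stdbool.h>

    /*
     * Callers that can safely retry later pass trylock=true and get a non-zero
     * error back instead of blocking behind another split.
     */
    static int
    lock_parent(pthread_mutex_t *page_lock, bool trylock)
    {
            if (trylock)
                    return (pthread_mutex_trylock(page_lock));
            return (pthread_mutex_lock(page_lock));
    }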
diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c
index 7c18b59fded..63f77248ca8 100644
--- a/src/cursor/cur_file.c
+++ b/src/cursor/cur_file.c
@@ -379,7 +379,7 @@ __curfile_close(WT_CURSOR *cursor)
* updated correctly.
*/
if (session->dhandle != NULL) {
- /* Increment the data-source's in-use counter. */
+ /* Decrement the data-source's in-use counter. */
__wt_cursor_dhandle_decr_use(session);
WT_TRET(__wt_session_release_btree(session));
}
@@ -439,6 +439,9 @@ __wt_curfile_create(WT_SESSION_IMPL *session,
cursor->value_format = btree->value_format;
cbt->btree = btree;
+ if (session->dhandle->checkpoint != NULL)
+ F_SET(cbt, WT_CBT_NO_TXN);
+
if (bulk) {
F_SET(cursor, WT_CURSTD_BULK);
diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c
index 9937390d19a..2b2117ad9fd 100644
--- a/src/evict/evict_file.c
+++ b/src/evict/evict_file.c
@@ -84,7 +84,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
*/
WT_ASSERT(session,
F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
- __wt_page_can_evict(session, ref, false, NULL));
+ __wt_page_can_evict(session, ref, NULL));
__wt_evict_page_clean_update(session, ref, true);
break;
WT_ILLEGAL_VALUE_ERR(session);
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index e77edf3d500..f2784890ab7 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -159,7 +159,8 @@ __evict_server(void *arg)
WT_DECL_RET;
WT_SESSION_IMPL *session;
#ifdef HAVE_DIAGNOSTIC
- struct timespec now, stuck_ts = { 0, 0 };
+ struct timespec now, stuck_ts;
+ uint64_t pages_evicted = 0;
#endif
u_int spins;
@@ -204,10 +205,11 @@ __evict_server(void *arg)
/* Next time we wake up, reverse the sweep direction. */
cache->flags ^= WT_CACHE_WALK_REVERSE;
#ifdef HAVE_DIAGNOSTIC
- stuck_ts.tv_sec = 0;
- } else if (stuck_ts.tv_sec == 0)
+ pages_evicted = 0;
+ } else if (pages_evicted != cache->pages_evict) {
WT_ERR(__wt_epoch(session, &stuck_ts));
- else {
+ pages_evicted = cache->pages_evict;
+ } else {
/* After being stuck for 5 minutes, give up. */
WT_ERR(__wt_epoch(session, &now));
if (WT_TIMEDIFF_SEC(now, stuck_ts) > 300) {
@@ -481,6 +483,13 @@ __evict_update_work(WT_SESSION_IMPL *session)
goto done;
}
+ /*
+ * If the cache has been stuck and is now under control, clear the
+ * stuck flag.
+ */
+ if (bytes_inuse < bytes_max)
+ F_CLR(cache, WT_CACHE_STUCK);
+
dirty_inuse = __wt_cache_dirty_inuse(cache);
if (dirty_inuse > (cache->eviction_dirty_target * bytes_max) / 100) {
FLD_SET(cache->state, WT_EVICT_PASS_DIRTY);
@@ -498,6 +507,7 @@ __evict_update_work(WT_SESSION_IMPL *session)
F_CLR(cache, WT_CACHE_WOULD_BLOCK);
goto done;
}
+
return (false);
done: if (F_ISSET(cache, WT_CACHE_STUCK))
@@ -1169,7 +1179,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
uint64_t pages_walked;
uint32_t walk_flags;
int internal_pages, restarts;
- bool enough, modified;
+ bool enough, modified, would_split;
conn = S2C(session);
btree = S2BT(session);
@@ -1254,10 +1264,16 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
page->read_gen = __wt_cache_read_gen_new(session);
fast: /* If the page can't be evicted, give up. */
- if (!__wt_page_can_evict(session, ref, true, NULL))
+ if (!__wt_page_can_evict(session, ref, &would_split))
continue;
/*
+ * Note: take care with ordering: if we detected that
+ * the page is modified above, we expect mod != NULL.
+ */
+ mod = page->modify;
+
+ /*
* Additional tests if eviction is likely to succeed.
*
* If eviction is stuck or we are helping with forced eviction,
@@ -1270,31 +1286,12 @@ fast: /* If the page can't be evicted, give up. */
if (!FLD_ISSET(cache->state,
WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) {
/*
- * Note: take care with ordering: if we detected that
- * the page is modified above, we expect mod != NULL.
- */
- mod = page->modify;
-
- /*
* If the page is clean but has modifications that
* appear too new to evict, skip it.
*/
if (!modified && mod != NULL &&
!__wt_txn_visible_all(session, mod->rec_max_txn))
continue;
-
- /*
- * If the oldest transaction hasn't changed since the
- * last time this page was written, it's unlikely we
- * can make progress. Similarly, if the most recent
- * update on the page is not yet globally visible,
- * eviction will fail. These heuristics attempt to
- * avoid repeated attempts to evict the same page.
- */
- if (modified &&
- (mod->disk_snap_min == conn->txn_global.oldest_id ||
- !__wt_txn_visible_all(session, mod->update_txn)))
- continue;
}
WT_ASSERT(session, evict->ref == NULL);
@@ -1419,7 +1416,6 @@ static int
__evict_page(WT_SESSION_IMPL *session, bool is_server)
{
WT_BTREE *btree;
- WT_CACHE *cache;
WT_DECL_RET;
WT_PAGE *page;
WT_REF *ref;
@@ -1458,12 +1454,6 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server)
(void)__wt_atomic_subv32(&btree->evict_busy, 1);
- WT_RET(ret);
-
- cache = S2C(session)->cache;
- if (F_ISSET(cache, WT_CACHE_STUCK))
- F_CLR(cache, WT_CACHE_STUCK);
-
return (ret);
}
@@ -1607,8 +1597,8 @@ __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile)
next_walk = NULL;
session->dhandle = dhandle;
- while (__wt_tree_walk(session,
- &next_walk, NULL, WT_READ_CACHE | WT_READ_NO_WAIT) == 0 &&
+ while (__wt_tree_walk(session, &next_walk, NULL,
+ WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 &&
next_walk != NULL) {
page = next_walk->page;
size = page->memory_footprint;
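
[Note] The diagnostic stuck-cache check above now keys off progress rather than a bare timestamp: the timer restarts whenever cache->pages_evict moves, and the server only gives up after 300 seconds with no eviction progress. A small sketch of that idea with made-up names:

    #include <stdbool.h>
    #include <stdint.h>
    #include <time.h>

    struct stuck_check {
            uint64_t last_evicted;     /* eviction count when progress was last seen */
            time_t since;              /* wall-clock time of that progress */
    };

    /* Return true only after 300 seconds with a full cache and no progress. */
    static bool
    cache_is_stuck(struct stuck_check *s, uint64_t pages_evict, bool cache_full)
    {
            time_t now = time(NULL);

            if (!cache_full || pages_evict != s->last_evicted) {
                    s->last_evicted = pages_evict;
                    s->since = now;
                    return (false);
            }
            return (difftime(now, s->since) > 300);
    }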
diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c
index f0d97c7f2c8..9281e7cdb2d 100644
--- a/src/evict/evict_page.c
+++ b/src/evict/evict_page.c
@@ -241,19 +241,14 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
{
WT_ADDR *addr;
WT_DECL_RET;
- WT_PAGE *parent;
WT_PAGE_MODIFY *mod;
- parent = ref->home;
mod = ref->page->modify;
switch (mod->rec_result) {
case WT_PM_REC_EMPTY: /* Page is empty */
/* Discard the parent's address. */
- if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) {
- __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
- __wt_free(session, ref->addr);
- }
+ __wt_ref_free_addr(session, ref);
/*
* Update the parent to reference a deleted page. The fact that
@@ -308,10 +303,7 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
return (EBUSY);
/* Discard the parent's address. */
- if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) {
- __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
- __wt_free(session, ref->addr);
- }
+ __wt_ref_free_addr(session, ref);
/*
* Update the parent to reference the replacement page.
@@ -434,7 +426,7 @@ __evict_review(
if (modified)
__wt_txn_update_oldest(session, true);
- if (!__wt_page_can_evict(session, ref, false, inmem_splitp))
+ if (!__wt_page_can_evict(session, ref, inmem_splitp))
return (EBUSY);
/*
diff --git a/src/include/btmem.h b/src/include/btmem.h
index ae29dc68003..6ee74c61a38 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -198,20 +198,9 @@ struct __wt_ovfl_txnc {
* When a page is modified, there's additional information to maintain.
*/
struct __wt_page_modify {
- /*
- * Track the highest transaction ID at which the page was written to
- * disk. This can be used to avoid trying to write the page multiple
- * times if a snapshot is keeping old versions pinned (e.g., in a
- * checkpoint).
- */
- uint64_t disk_snap_min;
-
/* The first unwritten transaction ID (approximate). */
uint64_t first_dirty_txn;
- /* In-memory split transaction ID. */
- uint64_t inmem_split_txn;
-
/* Avoid checking for obsolete updates during checkpoints. */
uint64_t obsolete_check_txn;
@@ -221,10 +210,8 @@ struct __wt_page_modify {
/* The largest update transaction ID (approximate). */
uint64_t update_txn;
-#ifdef HAVE_DIAGNOSTIC
/* Check that transaction time moves forward. */
uint64_t last_oldest_id;
-#endif
/* Dirty bytes added to the cache. */
size_t bytes_dirty;
@@ -313,17 +300,8 @@ struct __wt_page_modify {
* so they can be discarded when no longer needed.
*/
WT_PAGE *root_split; /* Linked list of root split pages */
-
- /*
- * When we deepen the tree, newly created internal pages cannot
- * be evicted until all threads have exited the original page
- * index structure. We set a transaction value during the split
- * that's checked during eviction.
- */
- uint64_t split_txn; /* Split eviction transaction value */
} intl;
#define mod_root_split u2.intl.root_split
-#define mod_split_txn u2.intl.split_txn
struct {
/*
* Appended items to column-stores: there is only a single one
diff --git a/src/include/btree.h b/src/include/btree.h
index ccdcccbaa0e..a1d8e395cfc 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -88,7 +88,8 @@ struct __wt_btree {
uint32_t maxleafpage; /* Leaf page max size */
uint32_t maxleafkey; /* Leaf page max key size */
uint32_t maxleafvalue; /* Leaf page max value size */
- uint64_t maxmempage; /* In memory page max size */
+ uint64_t maxmempage; /* In-memory page max size */
+ uint64_t splitmempage; /* In-memory split trigger size */
void *huffman_key; /* Key huffman encoding */
void *huffman_value; /* Value huffman encoding */
diff --git a/src/include/btree.i b/src/include/btree.i
index 25b555be64e..02633c3206a 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -349,13 +349,6 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
__wt_cache_dirty_incr(session, page);
/*
- * The page can never end up with changes older than the oldest
- * running transaction.
- */
- if (F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT))
- page->modify->disk_snap_min = session->txn.snap_min;
-
- /*
* We won the race to dirty the page, but another thread could
* have committed in the meantime, and the last_running field
* been updated past it. That is all very unlikely, but not
@@ -473,6 +466,22 @@ __wt_off_page(WT_PAGE *page, const void *p)
}
/*
+ * __wt_ref_free_addr --
+ * Free the address in a reference, if necessary.
+ */
+static inline void
+__wt_ref_free_addr(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ if (ref->addr != NULL) {
+ if (ref->home == NULL || __wt_off_page(ref->home, ref->addr)) {
+ __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ } else
+ ref->addr = NULL;
+ }
+}
+
+/*
* __wt_ref_key --
* Return a reference to a row-store internal page key as cheaply as
* possible.
@@ -970,11 +979,11 @@ __wt_ref_info(WT_SESSION_IMPL *session,
}
/*
- * __wt_page_can_split --
+ * __wt_leaf_page_can_split --
* Check whether a page can be split in memory.
*/
static inline bool
-__wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
+__wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
{
WT_BTREE *btree;
WT_INSERT_HEAD *ins_head;
@@ -1005,7 +1014,7 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
* reconciliation will be wrong, so we can't evict immediately).
*/
if (page->type != WT_PAGE_ROW_LEAF ||
- page->memory_footprint < btree->maxmempage ||
+ page->memory_footprint < btree->splitmempage ||
!__wt_page_is_modified(page))
return (false);
@@ -1048,13 +1057,12 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
* Check whether a page can be evicted.
*/
static inline bool
-__wt_page_can_evict(WT_SESSION_IMPL *session,
- WT_REF *ref, bool check_splits, bool *inmem_splitp)
+__wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp)
{
WT_BTREE *btree;
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
- WT_TXN_GLOBAL *txn_global;
+ bool modified;
if (inmem_splitp != NULL)
*inmem_splitp = false;
@@ -1073,20 +1081,21 @@ __wt_page_can_evict(WT_SESSION_IMPL *session,
* detailed eviction tests. We don't need further tests since the page
* won't be written or discarded from the cache.
*/
- if (__wt_page_can_split(session, page)) {
+ if (__wt_leaf_page_can_split(session, page)) {
if (inmem_splitp != NULL)
*inmem_splitp = true;
return (true);
}
+ modified = __wt_page_is_modified(page);
+
/*
* If the file is being checkpointed, we can't evict dirty pages:
* if we write a page and free the previous version of the page, that
* previous version might be referenced by an internal page already
* been written in the checkpoint, leaving the checkpoint inconsistent.
*/
- if (btree->checkpointing != WT_CKPT_OFF &&
- __wt_page_is_modified(page)) {
+ if (btree->checkpointing != WT_CKPT_OFF && modified) {
WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_checkpoint);
return (false);
@@ -1107,28 +1116,24 @@ __wt_page_can_evict(WT_SESSION_IMPL *session,
* pages cannot be evicted until all threads are known to have exited
* the original parent page's index, because evicting an internal page
* discards its WT_REF array, and a thread traversing the original
- * parent page index might see a freed WT_REF. During the split we set
- * a transaction value, we can evict the created page as soon as that
- * transaction value is globally visible.
+ * parent page index might see a freed WT_REF.
*/
- if (check_splits && WT_PAGE_IS_INTERNAL(page) &&
- (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK) ||
- !__wt_txn_visible_all(session, mod->mod_split_txn)))
+ if (WT_PAGE_IS_INTERNAL(page) &&
+ F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK))
return (false);
/*
- * If the page was recently split in-memory, don't evict it immediately:
- * we want to give application threads that are appending a chance to
- * move to the new leaf page created by the split.
- *
- * Note the check here is similar to __wt_txn_visible_all, but ignores
- * the checkpoint's transaction.
+ * If the oldest transaction hasn't changed since the last time
+ * this page was written, it's unlikely we can make progress.
+ * Similarly, if the most recent update on the page is not yet
+ * globally visible, eviction will fail. These heuristics
+ * attempt to avoid repeated attempts to evict the same page.
*/
- if (check_splits) {
- txn_global = &S2C(session)->txn_global;
- if (WT_TXNID_LE(txn_global->oldest_id, mod->inmem_split_txn))
- return (false);
- }
+ if (modified &&
+ !F_ISSET(S2C(session)->cache, WT_CACHE_STUCK) &&
+ (mod->last_oldest_id == __wt_txn_oldest_id(session) ||
+ !__wt_txn_visible_all(session, mod->update_txn)))
+ return (false);
return (true);
}
@@ -1223,7 +1228,7 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
LF_ISSET(WT_READ_NO_EVICT) ||
F_ISSET(session, WT_SESSION_NO_EVICTION) ||
F_ISSET(btree, WT_BTREE_NO_EVICTION) ||
- !__wt_page_can_evict(session, ref, true, NULL))
+ !__wt_page_can_evict(session, ref, NULL))
return (__wt_hazard_clear(session, page));
WT_RET_BUSY_OK(__wt_page_release_evict(session, ref));
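
[Note] Two things stand out in btree.i: the new __wt_ref_free_addr helper centralizes the "free the address only if it lives off-page, then clear it" pattern used throughout this change, and the progress heuristic that used to live in the eviction walk has moved into __wt_page_can_evict, where it is skipped when the cache is stuck. A compact sketch of that heuristic with illustrative types:

    #include <stdbool.h>
    #include <stdint.h>

    struct mod_info {
            uint64_t last_oldest_id;       /* oldest ID at the last reconciliation */
            bool update_visible_all;       /* is the newest update globally visible? */
    };

    /*
     * Skip a dirty page when the oldest transaction hasn't moved since it was
     * last written, or its newest update is still invisible to some reader;
     * if eviction is already stuck it has to try anyway.
     */
    static bool
    worth_evicting(const struct mod_info *mod, uint64_t oldest_id,
        bool modified, bool cache_stuck)
    {
            if (!modified || cache_stuck)
                    return (true);
            if (mod->last_oldest_id == oldest_id || !mod->update_visible_all)
                    return (false);
            return (true);
    }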
diff --git a/src/include/cursor.h b/src/include/cursor.h
index 0a3842efd45..54787d2227b 100644
--- a/src/include/cursor.h
+++ b/src/include/cursor.h
@@ -197,7 +197,14 @@ struct __wt_cursor_btree {
#define WT_CBT_ITERATE_NEXT 0x04 /* Next iteration configuration */
#define WT_CBT_ITERATE_PREV 0x08 /* Prev iteration configuration */
#define WT_CBT_MAX_RECORD 0x10 /* Col-store: past end-of-table */
-#define WT_CBT_SEARCH_SMALLEST 0x20 /* Row-store: small-key insert list */
+#define WT_CBT_NO_TXN 0x20 /* Non-transactional cursor
+ (e.g. on a checkpoint) */
+#define WT_CBT_SEARCH_SMALLEST 0x40 /* Row-store: small-key insert list */
+
+#define WT_CBT_POSITION_MASK /* Flags associated with position */ \
+ (WT_CBT_ITERATE_APPEND | WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV | \
+ WT_CBT_MAX_RECORD | WT_CBT_SEARCH_SMALLEST)
+
uint8_t flags;
};
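
[Note] WT_CBT_POSITION_MASK groups the flags that describe a cursor position so they can be cleared together without disturbing state flags such as WT_CBT_ACTIVE or the new WT_CBT_NO_TXN (the old code in cursor.i cleared everything except WT_CBT_ACTIVE, which would have wiped WT_CBT_NO_TXN too). A self-contained example of the masking, with illustrative flag values:

    #include <stdint.h>
    #include <stdio.h>

    #define ACTIVE          0x01u
    #define ITERATE_NEXT    0x04u
    #define ITERATE_PREV    0x08u
    #define NO_TXN          0x20u
    #define SEARCH_SMALLEST 0x40u
    #define POSITION_MASK   (ITERATE_NEXT | ITERATE_PREV | SEARCH_SMALLEST)

    int
    main(void)
    {
            uint8_t flags = ACTIVE | ITERATE_NEXT | NO_TXN;

            /* Clear only the position flags; ACTIVE and NO_TXN survive. */
            flags &= (uint8_t)~POSITION_MASK;
            printf("0x%02x\n", flags);         /* prints 0x21 */
            return (0);
    }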
diff --git a/src/include/cursor.i b/src/include/cursor.i
index 9dd280534b4..2e382591313 100644
--- a/src/include/cursor.i
+++ b/src/include/cursor.i
@@ -41,11 +41,7 @@ __cursor_pos_clear(WT_CURSOR_BTREE *cbt)
cbt->cip_saved = NULL;
cbt->rip_saved = NULL;
- /*
- * Don't clear the active flag, it's owned by the cursor enter/leave
- * functions.
- */
- F_CLR(cbt, ~WT_CBT_ACTIVE);
+ F_CLR(cbt, WT_CBT_POSITION_MASK);
}
/*
@@ -93,7 +89,8 @@ __curfile_enter(WT_CURSOR_BTREE *cbt)
session = (WT_SESSION_IMPL *)cbt->iface.session;
- WT_RET(__cursor_enter(session));
+ if (!F_ISSET(cbt, WT_CBT_NO_TXN))
+ WT_RET(__cursor_enter(session));
F_SET(cbt, WT_CBT_ACTIVE);
return (0);
}
@@ -112,7 +109,8 @@ __curfile_leave(WT_CURSOR_BTREE *cbt)
/* If the cursor was active, deactivate it. */
if (F_ISSET(cbt, WT_CBT_ACTIVE)) {
- __cursor_leave(session);
+ if (!F_ISSET(cbt, WT_CBT_NO_TXN))
+ __cursor_leave(session);
F_CLR(cbt, WT_CBT_ACTIVE);
}
@@ -204,7 +202,7 @@ err: return (ret);
/*
* __wt_cursor_dhandle_incr_use --
- * Increment the in-use counter in cursor's data source.
+ * Increment the in-use counter in the cursor's data source.
*/
static inline void
__wt_cursor_dhandle_incr_use(WT_SESSION_IMPL *session)
@@ -221,7 +219,7 @@ __wt_cursor_dhandle_incr_use(WT_SESSION_IMPL *session)
/*
* __wt_cursor_dhandle_decr_use --
- * Decrement the in-use counter in cursor's data source.
+ * Decrement the in-use counter in the cursor's data source.
*/
static inline void
__wt_cursor_dhandle_decr_use(WT_SESSION_IMPL *session)
@@ -262,7 +260,13 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, bool reenter)
if (!F_ISSET(cbt, WT_CBT_ACTIVE))
WT_RET(__curfile_enter(cbt));
- __wt_txn_cursor_op(session);
+
+ /*
+ * If this is an ordinary transactional cursor, make sure we are set up
+ * to read.
+ */
+ if (!F_ISSET(cbt, WT_CBT_NO_TXN))
+ __wt_txn_cursor_op(session);
return (0);
}
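
[Note] The WT_CBT_NO_TXN flag, set in cur_file.c above when a cursor is opened on a checkpoint, makes these cursors skip the transactional enter/leave and read-setup calls. A minimal sketch of that gating with hypothetical names:

    #include <stdbool.h>

    struct cursor { bool no_txn, active; };

    static void
    txn_cursor_op(void)
    {
            /* Snapshot/read setup for transactional cursors would go here. */
    }

    static void
    cursor_func_init(struct cursor *c)
    {
            c->active = true;
            /* Checkpoint cursors are read-only and never join a transaction. */
            if (!c->no_txn)
                    txn_cursor_op();
    }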
diff --git a/src/log/log_slot.c b/src/log/log_slot.c
index 255551f99a4..8155397d823 100644
--- a/src/log/log_slot.c
+++ b/src/log/log_slot.c
@@ -293,7 +293,7 @@ __wt_log_slot_init(WT_SESSION_IMPL *session)
&log->slot_pool[i].slot_buf, log->slot_buf_size));
F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS);
}
- WT_STAT_FAST_CONN_INCRV(session,
+ WT_STAT_FAST_CONN_SET(session,
log_buffer_size, log->slot_buf_size * WT_SLOT_POOL);
/*
* Set up the available slot from the pool the first time.
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index dbe5ce8a781..0e1e7498568 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -351,6 +351,7 @@ __wt_reconcile(WT_SESSION_IMPL *session,
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
WT_RECONCILE *r;
+ uint64_t oldest_id;
page = ref->page;
mod = page->modify;
@@ -361,21 +362,14 @@ __wt_reconcile(WT_SESSION_IMPL *session,
/* We shouldn't get called with a clean page, that's an error. */
WT_ASSERT(session, __wt_page_is_modified(page));
-#ifdef HAVE_DIAGNOSTIC
- {
/*
* Check that transaction time always moves forward for a given page.
* If this check fails, reconciliation can free something that a future
* reconciliation will need.
*/
- uint64_t oldest_id = __wt_txn_oldest_id(session);
+ oldest_id = __wt_txn_oldest_id(session);
WT_ASSERT(session, WT_TXNID_LE(mod->last_oldest_id, oldest_id));
mod->last_oldest_id = oldest_id;
- }
-#endif
-
- /* Record the most recent transaction ID we will *not* write. */
- mod->disk_snap_min = session->txn.snap_min;
/* Initialize the reconciliation structure for each new run. */
WT_RET(__rec_write_init(
@@ -1401,12 +1395,7 @@ __rec_child_deleted(WT_SESSION_IMPL *session,
__wt_txn_visible_all(session, page_del->txnid))) {
WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
WT_RET(__rec_block_free(session, addr, addr_size));
-
- if (__wt_off_page(ref->home, ref->addr)) {
- __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
- __wt_free(session, ref->addr);
- }
- ref->addr = NULL;
+ __wt_ref_free_addr(session, ref);
}
/*
@@ -5440,12 +5429,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
WT_RET(__wt_ref_info(
session, ref, &addr, &addr_size, NULL));
WT_RET(__rec_block_free(session, addr, addr_size));
- if (__wt_off_page(ref->home, ref->addr)) {
- __wt_free(
- session, ((WT_ADDR *)ref->addr)->addr);
- __wt_free(session, ref->addr);
- }
- ref->addr = NULL;
+ __wt_ref_free_addr(session, ref);
}
break;
case WT_PM_REC_EMPTY: /* Page deleted */
diff --git a/src/session/session_api.c b/src/session/session_api.c
index 12f7ce2ec3f..053f69ee7f8 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -206,6 +206,9 @@ __session_close(WT_SESSION *wt_session, const char *config)
__wt_spin_unlock(session, &conn->api_lock);
+ /* We no longer have a session, don't try to update it. */
+ session = NULL;
+
err: API_END_RET_NOTFOUND_MAP(session, ret);
}
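
[Note] Clearing the local session pointer before falling through to the err: label keeps the API_END_RET_NOTFOUND_MAP macro from updating a session that has just been closed. A loose sketch of the pattern with made-up names:

    #include <stddef.h>
    #include <stdlib.h>

    struct session { int api_depth; };

    /* Shared exit path: only touches the session if one is still live. */
    static int
    api_end(struct session *session, int ret)
    {
            if (session != NULL)
                    session->api_depth--;
            return (ret);
    }

    static int
    session_close(struct session *session)
    {
            int ret = 0;

            free(session);
            /* The session is gone; keep the exit path from updating it. */
            session = NULL;

            return (api_end(session, ret));
    }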
diff --git a/test/format/config.c b/test/format/config.c
index b9d0e437765..7aa4575efbd 100644
--- a/test/format/config.c
+++ b/test/format/config.c
@@ -394,7 +394,7 @@ config_lrt(void)
* stores.
*/
if (g.type == FIX) {
- if (config_is_perm("long_running_txn"))
+ if (g.c_long_running_txn && config_is_perm("long_running_txn"))
die(EINVAL,
"long_running_txn not supported with fixed-length "
"column store");