diff options
author | Alex Gorrod <alexg@wiredtiger.com> | 2015-12-01 22:53:21 +0000 |
---|---|---|
committer | Alex Gorrod <alexg@wiredtiger.com> | 2015-12-01 22:53:21 +0000 |
commit | f1a93162f2a17d76d3930b43b3ae97be7b7cca7d (patch) | |
tree | ad618e6c096055f2aaa02008ed070ee643ede639 | |
parent | f2fa6b9283169ac2b2737de6d929fa6dac544d8b (diff) | |
parent | e731ef8ab8b8f9d1c65380c83fc3e7b318f3fbe8 (diff) | |
download | mongo-f1a93162f2a17d76d3930b43b3ae97be7b7cca7d.tar.gz |
Merge branch 'develop' into wtperf_truncate_multiplier
-rw-r--r-- | src/btree/bt_delete.c | 15 | ||||
-rw-r--r-- | src/btree/bt_discard.c | 9 | ||||
-rw-r--r-- | src/btree/bt_handle.c | 7 | ||||
-rw-r--r-- | src/btree/bt_read.c | 12 | ||||
-rw-r--r-- | src/btree/bt_slvg.c | 8 | ||||
-rw-r--r-- | src/btree/bt_split.c | 65 | ||||
-rw-r--r-- | src/cursor/cur_file.c | 5 | ||||
-rw-r--r-- | src/evict/evict_file.c | 2 | ||||
-rw-r--r-- | src/evict/evict_lru.c | 58 | ||||
-rw-r--r-- | src/evict/evict_page.c | 14 | ||||
-rw-r--r-- | src/include/btmem.h | 22 | ||||
-rw-r--r-- | src/include/btree.h | 3 | ||||
-rw-r--r-- | src/include/btree.i | 73 | ||||
-rw-r--r-- | src/include/cursor.h | 9 | ||||
-rw-r--r-- | src/include/cursor.i | 24 | ||||
-rw-r--r-- | src/log/log_slot.c | 2 | ||||
-rw-r--r-- | src/reconcile/rec_write.c | 24 | ||||
-rw-r--r-- | src/session/session_api.c | 3 | ||||
-rw-r--r-- | test/format/config.c | 2 |
19 files changed, 162 insertions, 195 deletions
diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c index dc9352ec981..9dd72108e4b 100644 --- a/src/btree/bt_delete.c +++ b/src/btree/bt_delete.c @@ -99,25 +99,18 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) /* * We cannot fast-delete pages that have overflow key/value items as * the overflow blocks have to be discarded. The way we figure that - * out is to check the on-page cell type for the page, cells for leaf - * pages that have no overflow items are special. - * - * In some cases, the reference address may not reference an on-page - * cell (for example, some combination of page splits), in which case - * we can't check the original cell value and we fail. + * out is to check the page's cell type, cells for leaf pages without + * overflow items are special. * * To look at an on-page cell, we need to look at the parent page, and * that's dangerous, our parent page could change without warning if * the parent page were to split, deepening the tree. It's safe: the * page's reference will always point to some valid page, and if we find * any problems we simply fail the fast-delete optimization. - * - * !!! - * I doubt it's worth the effort, but we could copy the cell's type into - * the reference structure, and then we wouldn't need an on-page cell. */ parent = ref->home; - if (__wt_off_page(parent, ref->addr) || + if (__wt_off_page(parent, ref->addr) ? + ((WT_ADDR *)ref->addr)->type != WT_ADDR_LEAF_NO : __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO) goto err; diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c index 13bd943f803..54d9761c487 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -50,8 +50,7 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) page = *pagep; *pagep = NULL; - if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD) && - __wt_page_is_modified(page)) + if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) __wt_page_modify_clear(session, page); /* @@ -270,11 +269,7 @@ __wt_free_ref( * Free any address allocation; if there's no linked WT_REF page, it * must be allocated. */ - if (ref->addr != NULL && - (ref->home == NULL || __wt_off_page(ref->home, ref->addr))) { - __wt_free(session, ((WT_ADDR *)ref->addr)->addr); - __wt_free(session, ref->addr); - } + __wt_ref_free_addr(session, ref); /* Free any page-deleted information. */ if (ref->page_del != NULL) { diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index dbdf94fc1b6..294cc399d65 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -697,6 +697,13 @@ __btree_page_sizes(WT_SESSION_IMPL *session) } /* + * Try in-memory splits once we hit 80% of the maximum in-memory page + * size. This gives multi-threaded append workloads a better chance of + * not stalling. + */ + btree->splitmempage = 8 * btree->maxmempage / 10; + + /* * Get the split percentage (reconciliation splits pages into smaller * than the maximum page size chunks so we don't split every time a * new entry is added). Determine how large newly split pages will be. diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index 18fd87e78ff..77215474359 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -307,10 +307,6 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref) btree = S2BT(session); page = ref->page; - /* Pages are usually small enough, check that first. */ - if (page->memory_footprint < btree->maxmempage) - return (0); - /* Leaf pages only. */ if (WT_PAGE_IS_INTERNAL(page)) return (0); @@ -322,6 +318,12 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref) if (page->modify == NULL) return (0); + /* Pages are usually small enough, check that first. */ + if (page->memory_footprint < btree->splitmempage) + return (0); + else if (page->memory_footprint < btree->maxmempage) + return (__wt_leaf_page_can_split(session, page)); + /* Trigger eviction on the next page release. */ __wt_page_evict_soon(page); @@ -329,7 +331,7 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref) __wt_txn_update_oldest(session, false); /* If eviction cannot succeed, don't try. */ - return (__wt_page_can_evict(session, ref, true, NULL)); + return (__wt_page_can_evict(session, ref, NULL)); } /* diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index 80e467b5707..e4a860bb421 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -1290,9 +1290,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) * would have been lost.) Clear the reference addr so eviction doesn't * free the underlying blocks. */ - __wt_free(session, ((WT_ADDR *)ref->addr)->addr); - __wt_free(session, ref->addr); - ref->addr = NULL; + __wt_ref_free_addr(session, ref); /* Write the new version of the leaf page to disk. */ WT_ERR(__slvg_modify_init(session, page)); @@ -2013,9 +2011,7 @@ __slvg_row_build_leaf( * would have been lost.) Clear the reference addr so eviction doesn't * free the underlying blocks. */ - __wt_free(session, ((WT_ADDR *)ref->addr)->addr); - __wt_free(session, ref->addr); - ref->addr = NULL; + __wt_ref_free_addr(session, ref); /* Write the new version of the leaf page to disk. */ WT_ERR(__slvg_modify_init(session, page)); diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 35c3bfea711..e2031553aed 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -340,8 +340,18 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, return (ret); } addr->size = (uint8_t)unpack.size; - addr->type = - unpack.raw == WT_CELL_ADDR_INT ? WT_ADDR_INT : WT_ADDR_LEAF; + switch (unpack.raw) { + case WT_CELL_ADDR_INT: + addr->type = WT_ADDR_INT; + break; + case WT_CELL_ADDR_LEAF: + addr->type = WT_ADDR_LEAF; + break; + case WT_CELL_ADDR_LEAF_NO: + addr->type = WT_ADDR_LEAF_NO; + break; + WT_ILLEGAL_VALUE(session); + } ref->addr = addr; } @@ -399,18 +409,9 @@ __split_ref_move_final( WT_DECL_RET; WT_PAGE *child; WT_REF *ref, *child_ref; - uint64_t txn_new_id; uint32_t i; /* - * When creating new internal pages as part of a split, we set a field - * in those pages modify structure to prevent them from being evicted - * until all threads are known to have exited the index of the page that - * previously "owned" the WT_REF. Set that field to a safe value. - */ - txn_new_id = __wt_txn_id_alloc(session, false); - - /* * The WT_REF structures moved to newly allocated child pages reference * the wrong parent page and we have to fix that up. The problem is * revealed when a thread of control searches for the child page's @@ -461,8 +462,6 @@ __split_ref_move_final( if (child_ref->home != child) { child_ref->home = child; child_ref->pindex_hint = 0; - - child->modify->mod_split_txn = txn_new_id; } } WT_INTL_FOREACH_END; WT_LEAVE_PAGE_INDEX(session); @@ -896,6 +895,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, */ WT_ASSERT(session, next_ref->page_del == NULL); + __wt_ref_free_addr(session, next_ref); WT_TRET(__split_safe_free( session, split_gen, exclusive, next_ref, sizeof(WT_REF))); parent_decr += sizeof(WT_REF); @@ -1183,8 +1183,8 @@ err: /* * Lock an internal page. */ static int -__split_internal_lock( - WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE **parentp, bool *hazardp) +__split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock, + WT_PAGE **parentp, bool *hazardp) { WT_DECL_RET; WT_PAGE *parent; @@ -1202,7 +1202,7 @@ __split_internal_lock( * loop until the exclusive lock is resolved). If we want to split * the parent, give up to avoid that deadlock. */ - if (S2BT(session)->checkpointing != WT_CKPT_OFF) + if (!trylock && S2BT(session)->checkpointing != WT_CKPT_OFF) return (EBUSY); /* @@ -1227,7 +1227,10 @@ __split_internal_lock( if (F_ISSET_ATOMIC(parent, WT_PAGE_SPLIT_BLOCK)) return (EBUSY); - WT_RET(__wt_fair_lock(session, &parent->page_lock)); + if (trylock) + WT_RET(__wt_fair_trylock(session, &parent->page_lock)); + else + WT_RET(__wt_fair_lock(session, &parent->page_lock)); if (parent == ref->home) break; WT_RET(__wt_fair_unlock(session, &parent->page_lock)); @@ -1371,7 +1374,7 @@ __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard) * locks, lock-coupling up the tree. */ WT_ERR(__split_internal_lock( - session, ref, &parent, &parent_hazard)); + session, ref, true, &parent, &parent_hazard)); ret = __split_internal(session, parent, page); WT_TRET(__split_internal_unlock(session, page, page_hazard)); @@ -1635,7 +1638,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * * Note this page has already been through an in-memory split. */ - WT_ASSERT(session, __wt_page_can_split(session, page)); + WT_ASSERT(session, __wt_leaf_page_can_split(session, page)); WT_ASSERT(session, __wt_page_is_modified(page)); F_SET_ATOMIC(page, WT_PAGE_SPLIT_INSERT); @@ -1669,6 +1672,12 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) child->addr = ref->addr; /* + * The address has moved to the replacement WT_REF. Make sure it isn't + * freed when the original ref is discarded. + */ + ref->addr = NULL; + + /* * Copy the first key from the original page into first ref in the new * parent. Pages created in memory always have a "smallest" insert * list, so look there first. If we don't find one, get the first key @@ -1818,13 +1827,6 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) #endif /* - * Save the transaction ID when the split happened. Application - * threads will not try to forcibly evict the page again until - * all concurrent transactions commit. - */ - page->modify->inmem_split_txn = __wt_txn_id_alloc(session, false); - - /* * Update the page accounting. * * XXX @@ -1864,6 +1866,11 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) return (0); err: if (split_ref[0] != NULL) { + /* + * The address was moved to the replacement WT_REF, restore it. + */ + ref->addr = split_ref[0]->addr; + __wt_free(session, split_ref[0]->key.ikey); __wt_free(session, split_ref[0]); } @@ -1891,7 +1898,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) WT_RET(__wt_verbose( session, WT_VERB_SPLIT, "%p: split-insert", ref->page)); - WT_RET(__split_internal_lock(session, ref, &parent, &hazard)); + WT_RET(__split_internal_lock(session, ref, true, &parent, &hazard)); if ((ret = __split_insert(session, ref)) != 0) { WT_TRET(__split_internal_unlock(session, parent, hazard)); return (ret); @@ -1983,7 +1990,7 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing) WT_RET(__wt_verbose( session, WT_VERB_SPLIT, "%p: split-multi", ref->page)); - WT_RET(__split_internal_lock(session, ref, &parent, &hazard)); + WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard)); if ((ret = __split_multi(session, ref, closing)) != 0 || closing) { WT_TRET(__split_internal_unlock(session, parent, hazard)); return (ret); @@ -2012,7 +2019,7 @@ __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref) WT_RET(__wt_verbose( session, WT_VERB_SPLIT, "%p: reverse-split", ref->page)); - WT_RET(__split_internal_lock(session, ref, &parent, &hazard)); + WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard)); ret = __split_parent(session, ref, NULL, 0, 0, false, true); WT_TRET(__split_internal_unlock(session, parent, hazard)); return (ret); diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c index 7c18b59fded..63f77248ca8 100644 --- a/src/cursor/cur_file.c +++ b/src/cursor/cur_file.c @@ -379,7 +379,7 @@ __curfile_close(WT_CURSOR *cursor) * updated correctly. */ if (session->dhandle != NULL) { - /* Increment the data-source's in-use counter. */ + /* Decrement the data-source's in-use counter. */ __wt_cursor_dhandle_decr_use(session); WT_TRET(__wt_session_release_btree(session)); } @@ -439,6 +439,9 @@ __wt_curfile_create(WT_SESSION_IMPL *session, cursor->value_format = btree->value_format; cbt->btree = btree; + if (session->dhandle->checkpoint != NULL) + F_SET(cbt, WT_CBT_NO_TXN); + if (bulk) { F_SET(cursor, WT_CURSTD_BULK); diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c index 9937390d19a..2b2117ad9fd 100644 --- a/src/evict/evict_file.c +++ b/src/evict/evict_file.c @@ -84,7 +84,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) */ WT_ASSERT(session, F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || - __wt_page_can_evict(session, ref, false, NULL)); + __wt_page_can_evict(session, ref, NULL)); __wt_evict_page_clean_update(session, ref, true); break; WT_ILLEGAL_VALUE_ERR(session); diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index e77edf3d500..f2784890ab7 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -159,7 +159,8 @@ __evict_server(void *arg) WT_DECL_RET; WT_SESSION_IMPL *session; #ifdef HAVE_DIAGNOSTIC - struct timespec now, stuck_ts = { 0, 0 }; + struct timespec now, stuck_ts; + uint64_t pages_evicted = 0; #endif u_int spins; @@ -204,10 +205,11 @@ __evict_server(void *arg) /* Next time we wake up, reverse the sweep direction. */ cache->flags ^= WT_CACHE_WALK_REVERSE; #ifdef HAVE_DIAGNOSTIC - stuck_ts.tv_sec = 0; - } else if (stuck_ts.tv_sec == 0) + pages_evicted = 0; + } else if (pages_evicted != cache->pages_evict) { WT_ERR(__wt_epoch(session, &stuck_ts)); - else { + pages_evicted = cache->pages_evict; + } else { /* After being stuck for 5 minutes, give up. */ WT_ERR(__wt_epoch(session, &now)); if (WT_TIMEDIFF_SEC(now, stuck_ts) > 300) { @@ -481,6 +483,13 @@ __evict_update_work(WT_SESSION_IMPL *session) goto done; } + /* + * If the cache has been stuck and is now under control, clear the + * stuck flag. + */ + if (bytes_inuse < bytes_max) + F_CLR(cache, WT_CACHE_STUCK); + dirty_inuse = __wt_cache_dirty_inuse(cache); if (dirty_inuse > (cache->eviction_dirty_target * bytes_max) / 100) { FLD_SET(cache->state, WT_EVICT_PASS_DIRTY); @@ -498,6 +507,7 @@ __evict_update_work(WT_SESSION_IMPL *session) F_CLR(cache, WT_CACHE_WOULD_BLOCK); goto done; } + return (false); done: if (F_ISSET(cache, WT_CACHE_STUCK)) @@ -1169,7 +1179,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp) uint64_t pages_walked; uint32_t walk_flags; int internal_pages, restarts; - bool enough, modified; + bool enough, modified, would_split; conn = S2C(session); btree = S2BT(session); @@ -1254,10 +1264,16 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp) page->read_gen = __wt_cache_read_gen_new(session); fast: /* If the page can't be evicted, give up. */ - if (!__wt_page_can_evict(session, ref, true, NULL)) + if (!__wt_page_can_evict(session, ref, &would_split)) continue; /* + * Note: take care with ordering: if we detected that + * the page is modified above, we expect mod != NULL. + */ + mod = page->modify; + + /* * Additional tests if eviction is likely to succeed. * * If eviction is stuck or we are helping with forced eviction, @@ -1270,31 +1286,12 @@ fast: /* If the page can't be evicted, give up. */ if (!FLD_ISSET(cache->state, WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) { /* - * Note: take care with ordering: if we detected that - * the page is modified above, we expect mod != NULL. - */ - mod = page->modify; - - /* * If the page is clean but has modifications that * appear too new to evict, skip it. */ if (!modified && mod != NULL && !__wt_txn_visible_all(session, mod->rec_max_txn)) continue; - - /* - * If the oldest transaction hasn't changed since the - * last time this page was written, it's unlikely we - * can make progress. Similarly, if the most recent - * update on the page is not yet globally visible, - * eviction will fail. These heuristics attempt to - * avoid repeated attempts to evict the same page. - */ - if (modified && - (mod->disk_snap_min == conn->txn_global.oldest_id || - !__wt_txn_visible_all(session, mod->update_txn))) - continue; } WT_ASSERT(session, evict->ref == NULL); @@ -1419,7 +1416,6 @@ static int __evict_page(WT_SESSION_IMPL *session, bool is_server) { WT_BTREE *btree; - WT_CACHE *cache; WT_DECL_RET; WT_PAGE *page; WT_REF *ref; @@ -1458,12 +1454,6 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server) (void)__wt_atomic_subv32(&btree->evict_busy, 1); - WT_RET(ret); - - cache = S2C(session)->cache; - if (F_ISSET(cache, WT_CACHE_STUCK)) - F_CLR(cache, WT_CACHE_STUCK); - return (ret); } @@ -1607,8 +1597,8 @@ __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) next_walk = NULL; session->dhandle = dhandle; - while (__wt_tree_walk(session, - &next_walk, NULL, WT_READ_CACHE | WT_READ_NO_WAIT) == 0 && + while (__wt_tree_walk(session, &next_walk, NULL, + WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 && next_walk != NULL) { page = next_walk->page; size = page->memory_footprint; diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c index f0d97c7f2c8..9281e7cdb2d 100644 --- a/src/evict/evict_page.c +++ b/src/evict/evict_page.c @@ -241,19 +241,14 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) { WT_ADDR *addr; WT_DECL_RET; - WT_PAGE *parent; WT_PAGE_MODIFY *mod; - parent = ref->home; mod = ref->page->modify; switch (mod->rec_result) { case WT_PM_REC_EMPTY: /* Page is empty */ /* Discard the parent's address. */ - if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) { - __wt_free(session, ((WT_ADDR *)ref->addr)->addr); - __wt_free(session, ref->addr); - } + __wt_ref_free_addr(session, ref); /* * Update the parent to reference a deleted page. The fact that @@ -308,10 +303,7 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) return (EBUSY); /* Discard the parent's address. */ - if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) { - __wt_free(session, ((WT_ADDR *)ref->addr)->addr); - __wt_free(session, ref->addr); - } + __wt_ref_free_addr(session, ref); /* * Update the parent to reference the replacement page. @@ -434,7 +426,7 @@ __evict_review( if (modified) __wt_txn_update_oldest(session, true); - if (!__wt_page_can_evict(session, ref, false, inmem_splitp)) + if (!__wt_page_can_evict(session, ref, inmem_splitp)) return (EBUSY); /* diff --git a/src/include/btmem.h b/src/include/btmem.h index ae29dc68003..6ee74c61a38 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -198,20 +198,9 @@ struct __wt_ovfl_txnc { * When a page is modified, there's additional information to maintain. */ struct __wt_page_modify { - /* - * Track the highest transaction ID at which the page was written to - * disk. This can be used to avoid trying to write the page multiple - * times if a snapshot is keeping old versions pinned (e.g., in a - * checkpoint). - */ - uint64_t disk_snap_min; - /* The first unwritten transaction ID (approximate). */ uint64_t first_dirty_txn; - /* In-memory split transaction ID. */ - uint64_t inmem_split_txn; - /* Avoid checking for obsolete updates during checkpoints. */ uint64_t obsolete_check_txn; @@ -221,10 +210,8 @@ struct __wt_page_modify { /* The largest update transaction ID (approximate). */ uint64_t update_txn; -#ifdef HAVE_DIAGNOSTIC /* Check that transaction time moves forward. */ uint64_t last_oldest_id; -#endif /* Dirty bytes added to the cache. */ size_t bytes_dirty; @@ -313,17 +300,8 @@ struct __wt_page_modify { * so they can be discarded when no longer needed. */ WT_PAGE *root_split; /* Linked list of root split pages */ - - /* - * When we deepen the tree, newly created internal pages cannot - * be evicted until all threads have exited the original page - * index structure. We set a transaction value during the split - * that's checked during eviction. - */ - uint64_t split_txn; /* Split eviction transaction value */ } intl; #define mod_root_split u2.intl.root_split -#define mod_split_txn u2.intl.split_txn struct { /* * Appended items to column-stores: there is only a single one diff --git a/src/include/btree.h b/src/include/btree.h index ccdcccbaa0e..a1d8e395cfc 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -88,7 +88,8 @@ struct __wt_btree { uint32_t maxleafpage; /* Leaf page max size */ uint32_t maxleafkey; /* Leaf page max key size */ uint32_t maxleafvalue; /* Leaf page max value size */ - uint64_t maxmempage; /* In memory page max size */ + uint64_t maxmempage; /* In-memory page max size */ + uint64_t splitmempage; /* In-memory split trigger size */ void *huffman_key; /* Key huffman encoding */ void *huffman_value; /* Value huffman encoding */ diff --git a/src/include/btree.i b/src/include/btree.i index 25b555be64e..02633c3206a 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -349,13 +349,6 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) __wt_cache_dirty_incr(session, page); /* - * The page can never end up with changes older than the oldest - * running transaction. - */ - if (F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT)) - page->modify->disk_snap_min = session->txn.snap_min; - - /* * We won the race to dirty the page, but another thread could * have committed in the meantime, and the last_running field * been updated past it. That is all very unlikely, but not @@ -473,6 +466,22 @@ __wt_off_page(WT_PAGE *page, const void *p) } /* + * __wt_ref_free_addr -- + * Free the address in a reference, if necessary. + */ +static inline void +__wt_ref_free_addr(WT_SESSION_IMPL *session, WT_REF *ref) +{ + if (ref->addr != NULL) { + if (ref->home == NULL || __wt_off_page(ref->home, ref->addr)) { + __wt_free(session, ((WT_ADDR *)ref->addr)->addr); + __wt_free(session, ref->addr); + } else + ref->addr = NULL; + } +} + +/* * __wt_ref_key -- * Return a reference to a row-store internal page key as cheaply as * possible. @@ -970,11 +979,11 @@ __wt_ref_info(WT_SESSION_IMPL *session, } /* - * __wt_page_can_split -- + * __wt_leaf_page_can_split -- * Check whether a page can be split in memory. */ static inline bool -__wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) +__wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; WT_INSERT_HEAD *ins_head; @@ -1005,7 +1014,7 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) * reconciliation will be wrong, so we can't evict immediately). */ if (page->type != WT_PAGE_ROW_LEAF || - page->memory_footprint < btree->maxmempage || + page->memory_footprint < btree->splitmempage || !__wt_page_is_modified(page)) return (false); @@ -1048,13 +1057,12 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) * Check whether a page can be evicted. */ static inline bool -__wt_page_can_evict(WT_SESSION_IMPL *session, - WT_REF *ref, bool check_splits, bool *inmem_splitp) +__wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp) { WT_BTREE *btree; WT_PAGE *page; WT_PAGE_MODIFY *mod; - WT_TXN_GLOBAL *txn_global; + bool modified; if (inmem_splitp != NULL) *inmem_splitp = false; @@ -1073,20 +1081,21 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, * detailed eviction tests. We don't need further tests since the page * won't be written or discarded from the cache. */ - if (__wt_page_can_split(session, page)) { + if (__wt_leaf_page_can_split(session, page)) { if (inmem_splitp != NULL) *inmem_splitp = true; return (true); } + modified = __wt_page_is_modified(page); + /* * If the file is being checkpointed, we can't evict dirty pages: * if we write a page and free the previous version of the page, that * previous version might be referenced by an internal page already * been written in the checkpoint, leaving the checkpoint inconsistent. */ - if (btree->checkpointing != WT_CKPT_OFF && - __wt_page_is_modified(page)) { + if (btree->checkpointing != WT_CKPT_OFF && modified) { WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint); WT_STAT_FAST_DATA_INCR(session, cache_eviction_checkpoint); return (false); @@ -1107,28 +1116,24 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, * pages cannot be evicted until all threads are known to have exited * the original parent page's index, because evicting an internal page * discards its WT_REF array, and a thread traversing the original - * parent page index might see a freed WT_REF. During the split we set - * a transaction value, we can evict the created page as soon as that - * transaction value is globally visible. + * parent page index might see a freed WT_REF. */ - if (check_splits && WT_PAGE_IS_INTERNAL(page) && - (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK) || - !__wt_txn_visible_all(session, mod->mod_split_txn))) + if (WT_PAGE_IS_INTERNAL(page) && + F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK)) return (false); /* - * If the page was recently split in-memory, don't evict it immediately: - * we want to give application threads that are appending a chance to - * move to the new leaf page created by the split. - * - * Note the check here is similar to __wt_txn_visible_all, but ignores - * the checkpoint's transaction. + * If the oldest transaction hasn't changed since the last time + * this page was written, it's unlikely we can make progress. + * Similarly, if the most recent update on the page is not yet + * globally visible, eviction will fail. These heuristics + * attempt to avoid repeated attempts to evict the same page. */ - if (check_splits) { - txn_global = &S2C(session)->txn_global; - if (WT_TXNID_LE(txn_global->oldest_id, mod->inmem_split_txn)) - return (false); - } + if (modified && + !F_ISSET(S2C(session)->cache, WT_CACHE_STUCK) && + (mod->last_oldest_id == __wt_txn_oldest_id(session) || + !__wt_txn_visible_all(session, mod->update_txn))) + return (false); return (true); } @@ -1223,7 +1228,7 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) LF_ISSET(WT_READ_NO_EVICT) || F_ISSET(session, WT_SESSION_NO_EVICTION) || F_ISSET(btree, WT_BTREE_NO_EVICTION) || - !__wt_page_can_evict(session, ref, true, NULL)) + !__wt_page_can_evict(session, ref, NULL)) return (__wt_hazard_clear(session, page)); WT_RET_BUSY_OK(__wt_page_release_evict(session, ref)); diff --git a/src/include/cursor.h b/src/include/cursor.h index 0a3842efd45..54787d2227b 100644 --- a/src/include/cursor.h +++ b/src/include/cursor.h @@ -197,7 +197,14 @@ struct __wt_cursor_btree { #define WT_CBT_ITERATE_NEXT 0x04 /* Next iteration configuration */ #define WT_CBT_ITERATE_PREV 0x08 /* Prev iteration configuration */ #define WT_CBT_MAX_RECORD 0x10 /* Col-store: past end-of-table */ -#define WT_CBT_SEARCH_SMALLEST 0x20 /* Row-store: small-key insert list */ +#define WT_CBT_NO_TXN 0x20 /* Non-transactional cursor + (e.g. on a checkpoint) */ +#define WT_CBT_SEARCH_SMALLEST 0x40 /* Row-store: small-key insert list */ + +#define WT_CBT_POSITION_MASK /* Flags associated with position */ \ + (WT_CBT_ITERATE_APPEND | WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV | \ + WT_CBT_MAX_RECORD | WT_CBT_SEARCH_SMALLEST) + uint8_t flags; }; diff --git a/src/include/cursor.i b/src/include/cursor.i index 9dd280534b4..2e382591313 100644 --- a/src/include/cursor.i +++ b/src/include/cursor.i @@ -41,11 +41,7 @@ __cursor_pos_clear(WT_CURSOR_BTREE *cbt) cbt->cip_saved = NULL; cbt->rip_saved = NULL; - /* - * Don't clear the active flag, it's owned by the cursor enter/leave - * functions. - */ - F_CLR(cbt, ~WT_CBT_ACTIVE); + F_CLR(cbt, WT_CBT_POSITION_MASK); } /* @@ -93,7 +89,8 @@ __curfile_enter(WT_CURSOR_BTREE *cbt) session = (WT_SESSION_IMPL *)cbt->iface.session; - WT_RET(__cursor_enter(session)); + if (!F_ISSET(cbt, WT_CBT_NO_TXN)) + WT_RET(__cursor_enter(session)); F_SET(cbt, WT_CBT_ACTIVE); return (0); } @@ -112,7 +109,8 @@ __curfile_leave(WT_CURSOR_BTREE *cbt) /* If the cursor was active, deactivate it. */ if (F_ISSET(cbt, WT_CBT_ACTIVE)) { - __cursor_leave(session); + if (!F_ISSET(cbt, WT_CBT_NO_TXN)) + __cursor_leave(session); F_CLR(cbt, WT_CBT_ACTIVE); } @@ -204,7 +202,7 @@ err: return (ret); /* * __wt_cursor_dhandle_incr_use -- - * Increment the in-use counter in cursor's data source. + * Increment the in-use counter in the cursor's data source. */ static inline void __wt_cursor_dhandle_incr_use(WT_SESSION_IMPL *session) @@ -221,7 +219,7 @@ __wt_cursor_dhandle_incr_use(WT_SESSION_IMPL *session) /* * __wt_cursor_dhandle_decr_use -- - * Decrement the in-use counter in cursor's data source. + * Decrement the in-use counter in the cursor's data source. */ static inline void __wt_cursor_dhandle_decr_use(WT_SESSION_IMPL *session) @@ -262,7 +260,13 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, bool reenter) if (!F_ISSET(cbt, WT_CBT_ACTIVE)) WT_RET(__curfile_enter(cbt)); - __wt_txn_cursor_op(session); + + /* + * If this is an ordinary transactional cursor, make sure we are set up + * to read. + */ + if (!F_ISSET(cbt, WT_CBT_NO_TXN)) + __wt_txn_cursor_op(session); return (0); } diff --git a/src/log/log_slot.c b/src/log/log_slot.c index 255551f99a4..8155397d823 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -293,7 +293,7 @@ __wt_log_slot_init(WT_SESSION_IMPL *session) &log->slot_pool[i].slot_buf, log->slot_buf_size)); F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS); } - WT_STAT_FAST_CONN_INCRV(session, + WT_STAT_FAST_CONN_SET(session, log_buffer_size, log->slot_buf_size * WT_SLOT_POOL); /* * Set up the available slot from the pool the first time. diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index dbe5ce8a781..0e1e7498568 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -351,6 +351,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_RECONCILE *r; + uint64_t oldest_id; page = ref->page; mod = page->modify; @@ -361,21 +362,14 @@ __wt_reconcile(WT_SESSION_IMPL *session, /* We shouldn't get called with a clean page, that's an error. */ WT_ASSERT(session, __wt_page_is_modified(page)); -#ifdef HAVE_DIAGNOSTIC - { /* * Check that transaction time always moves forward for a given page. * If this check fails, reconciliation can free something that a future * reconciliation will need. */ - uint64_t oldest_id = __wt_txn_oldest_id(session); + oldest_id = __wt_txn_oldest_id(session); WT_ASSERT(session, WT_TXNID_LE(mod->last_oldest_id, oldest_id)); mod->last_oldest_id = oldest_id; - } -#endif - - /* Record the most recent transaction ID we will *not* write. */ - mod->disk_snap_min = session->txn.snap_min; /* Initialize the reconciliation structure for each new run. */ WT_RET(__rec_write_init( @@ -1401,12 +1395,7 @@ __rec_child_deleted(WT_SESSION_IMPL *session, __wt_txn_visible_all(session, page_del->txnid))) { WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL)); WT_RET(__rec_block_free(session, addr, addr_size)); - - if (__wt_off_page(ref->home, ref->addr)) { - __wt_free(session, ((WT_ADDR *)ref->addr)->addr); - __wt_free(session, ref->addr); - } - ref->addr = NULL; + __wt_ref_free_addr(session, ref); } /* @@ -5440,12 +5429,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_RET(__wt_ref_info( session, ref, &addr, &addr_size, NULL)); WT_RET(__rec_block_free(session, addr, addr_size)); - if (__wt_off_page(ref->home, ref->addr)) { - __wt_free( - session, ((WT_ADDR *)ref->addr)->addr); - __wt_free(session, ref->addr); - } - ref->addr = NULL; + __wt_ref_free_addr(session, ref); } break; case WT_PM_REC_EMPTY: /* Page deleted */ diff --git a/src/session/session_api.c b/src/session/session_api.c index 12f7ce2ec3f..053f69ee7f8 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -206,6 +206,9 @@ __session_close(WT_SESSION *wt_session, const char *config) __wt_spin_unlock(session, &conn->api_lock); + /* We no longer have a session, don't try to update it. */ + session = NULL; + err: API_END_RET_NOTFOUND_MAP(session, ret); } diff --git a/test/format/config.c b/test/format/config.c index b9d0e437765..7aa4575efbd 100644 --- a/test/format/config.c +++ b/test/format/config.c @@ -394,7 +394,7 @@ config_lrt(void) * stores. */ if (g.type == FIX) { - if (config_is_perm("long_running_txn")) + if (g.c_long_running_txn && config_is_perm("long_running_txn")) die(EINVAL, "long_running_txn not supported with fixed-length " "column store"); |