diff options
author | Ramon Fernandez <ramon@mongodb.com> | 2017-10-12 07:15:30 -0400 |
---|---|---|
committer | Ramon Fernandez <ramon@mongodb.com> | 2017-10-12 07:15:30 -0400 |
commit | 96dfbfbb8cfc35640034b189ba58fef43751319e (patch) | |
tree | 0cee99a3dda7d150795449d3bb80f13d1db9c7fa /src | |
parent | 8205b768ca58b68b7aaf3b3b1c7a544f8d044d5a (diff) | |
download | mongo-96dfbfbb8cfc35640034b189ba58fef43751319e.tar.gz |
Import wiredtiger: 4b5ade6072d548fdebe3b376f94e0d672eea5359 from branch mongodb-3.6 (r3.6.0-rc0)
ref: 0cd3d5bbd8..4b5ade6072
for: 3.5.14
WT-3644 Port to FreeBSD release 11.1
WT-3645 Build Failed: Lookaside file occupies 10MB of 11MB cache size
WT-3646 Only use lookaside when operations are blocked waiting for cache
WT-3649 Disable lookaside eviction during close
Diffstat (limited to 'src')
21 files changed, 266 insertions, 127 deletions
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/strict.m4 b/src/third_party/wiredtiger/build_posix/aclocal/strict.m4 index f9509df26bf..92e77815a18 100644 --- a/src/third_party/wiredtiger/build_posix/aclocal/strict.m4 +++ b/src/third_party/wiredtiger/build_posix/aclocal/strict.m4 @@ -95,6 +95,10 @@ AC_DEFUN([AM_CLANG_WARNINGS], [ # w="$w -Wno-error=cast-qual" w="$w -Wno-cast-qual" + # Turn off clang thread-safety-analysis, it doesn't like some of the + # code patterns in WiredTiger. + w="$w -Wno-thread-safety-analysis" + # On Centos 7.3.1611, system header files aren't compatible with # -Wdisabled-macro-expansion. w="$w -Wno-disabled-macro-expansion" diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 6c4f2ee7138..f5e704d7625 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "0cd3d5bbd8a5c8779f1129c6754b4463403e788f", + "commit": "4b5ade6072d548fdebe3b376f94e0d672eea5359", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-3.6" diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index ab8a8d7916b..9d4e860f8fd 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -453,7 +453,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags WT_PAGE *page; uint64_t sleep_cnt, wait_cnt; int force_attempts; - bool busy, cache_work, did_read, evict_soon, stalled; + bool busy, cache_work, did_read, stalled, wont_need; btree = S2BT(session); @@ -466,7 +466,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags WT_STAT_DATA_INCR(session, cache_pages_requested); } - for (did_read = evict_soon = stalled = false, + for (did_read = wont_need = stalled = false, force_attempts = 0, sleep_cnt = wait_cnt = 0;;) { switch (ref->state) { case WT_REF_DELETED: @@ -477,9 
+477,8 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags case WT_REF_DISK: case WT_REF_LOOKASIDE: if (LF_ISSET(WT_READ_CACHE)) { - if (ref->state != WT_REF_LOOKASIDE) - return (WT_NOTFOUND); - if (!LF_ISSET(WT_READ_LOOKASIDE)) + if (ref->state != WT_REF_LOOKASIDE || + !LF_ISSET(WT_READ_LOOKASIDE)) return (WT_NOTFOUND); #ifdef HAVE_TIMESTAMPS /* @@ -520,7 +519,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags * here because we don't want to evict the page before * we "acquire" it. */ - evict_soon = LF_ISSET(WT_READ_WONT_NEED) || + wont_need = LF_ISSET(WT_READ_WONT_NEED) || F_ISSET(session, WT_SESSION_NO_CACHE); continue; case WT_REF_READING: @@ -610,8 +609,8 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags /* * If we read the page and are configured to not trash * the cache, and no other thread has already used the - * page, set the oldest read generation so the page is - * forcibly evicted as soon as possible. + * page, set the read generation so the page is evicted + * soon. * * Otherwise, if we read the page, or, if configured to * update the page's read generation and the page isn't @@ -620,8 +619,8 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags */ page = ref->page; if (page->read_gen == WT_READGEN_NOTSET) { - if (evict_soon) - __wt_page_evict_soon(session, ref); + if (wont_need) + page->read_gen = WT_READGEN_WONT_NEED; else __wt_cache_read_gen_new(session, page); } else if (!LF_ISSET(WT_READ_NO_GEN)) diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 884ee9b5c8b..71007e76dfd 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -213,7 +213,8 @@ __split_ovfl_key_cleanup(WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref) * sure we're catching all paths and to avoid regressions. 
*/ WT_ASSERT(session, - S2BT(session)->checkpointing != WT_CKPT_RUNNING); + S2BT(session)->checkpointing != WT_CKPT_RUNNING || + WT_SESSION_IS_CHECKPOINT(session)); WT_RET(__wt_ovfl_discard(session, cell)); } @@ -1179,7 +1180,7 @@ __split_internal_lock( * loop until the exclusive lock is resolved). If we want to split * the parent, give up to avoid that deadlock. */ - if (!trylock && S2BT(session)->checkpointing != WT_CKPT_OFF) + if (!trylock && !__wt_btree_can_evict_dirty(session)) return (EBUSY); /* @@ -1293,13 +1294,10 @@ __split_internal_should_split(WT_SESSION_IMPL *session, WT_REF *ref) static int __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page) { - WT_BTREE *btree; WT_DECL_RET; WT_PAGE *parent; WT_REF *ref; - btree = S2BT(session); - /* * Disallow internal splits during the final pass of a checkpoint. Most * splits are already disallowed during checkpoints, but an important @@ -1310,7 +1308,7 @@ __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page) * split chunk, but we'll write it upon finding it in a different part * of the tree. */ - if (btree->checkpointing != WT_CKPT_OFF) { + if (!__wt_btree_can_evict_dirty(session)) { __split_internal_unlock(session, page); return (0); } @@ -1421,7 +1419,7 @@ __split_multi_inmem( * leave the new page with the read generation unset. Eviction will * set the read generation next time it visits this page. */ - if (orig->read_gen != WT_READGEN_OLDEST) + if (!WT_READGEN_EVICT_SOON(orig->read_gen)) page->read_gen = orig->read_gen; /* If there are no updates to apply to the page, we're done. */ @@ -1638,7 +1636,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, /* Verify any disk image we have. */ WT_ASSERT(session, multi->disk_image == NULL || __wt_verify_dsk_image(session, - "[page instantiate]", multi->disk_image, 0, false) == 0); + "[page instantiate]", multi->disk_image, 0, true) == 0); /* * If there's an address, the page was written, set it. 
diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index 02ff0a1a4be..15d83169ea2 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -309,8 +309,8 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) } /* - * If the page needs forced eviction, try to do that - * now. + * If the page was pulled into cache by our read, try + * to evict it now. * * For eviction to have a chance, we first need to move * the walk point to the next page checkpoint will @@ -322,7 +322,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * remember so we don't retry it. */ if (!WT_PAGE_IS_INTERNAL(page) && - page->read_gen == WT_READGEN_OLDEST && + page->read_gen == WT_READGEN_WONT_NEED && !evict_failed) { if ((ret = __sync_evict_page( session, &walk, flags)) == 0) { diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c index d9a5dbc2096..ccf16674a68 100644 --- a/src/third_party/wiredtiger/src/cache/cache_las.c +++ b/src/third_party/wiredtiger/src/cache/cache_las.c @@ -156,6 +156,10 @@ __wt_las_cursor_open(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) */ btree = ((WT_CURSOR_BTREE *)(*cursorp))->btree; + /* Track the lookaside file ID. 
*/ + if (S2C(session)->las_fileid == 0) + S2C(session)->las_fileid = btree->id; + /* * Set special flags for the lookaside table: the lookaside flag (used, * for example, to avoid writing records during reconciliation), also diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index 55251491129..4fcd1b8ede1 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -1084,6 +1084,13 @@ err: /* WT_TRET(wt_session->close(wt_session, config)); } + /* + * Disable lookaside eviction: it doesn't help us shut down and can + * lead to pages being marked dirty, causing spurious assertions to + * fire. + */ + F_SET(conn, WT_CONN_EVICTION_NO_LOOKASIDE); + /* Shut down transactions (wait for in-flight operations to complete. */ WT_TRET(__wt_txn_global_shutdown(session)); diff --git a/src/third_party/wiredtiger/src/conn/conn_cache.c b/src/third_party/wiredtiger/src/conn/conn_cache.c index 625350cf3e6..da5b6bfd55f 100644 --- a/src/third_party/wiredtiger/src/conn/conn_cache.c +++ b/src/third_party/wiredtiger/src/conn/conn_cache.c @@ -177,7 +177,7 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]) * The lowest possible page read-generation has a special meaning, it * marks a page for forcible eviction; don't let it happen by accident. */ - cache->read_gen = WT_READGEN_START_VALUE; + cache->read_gen = cache->read_gen_oldest = WT_READGEN_START_VALUE; /* * The target size must be lower than the trigger size or we will never diff --git a/src/third_party/wiredtiger/src/conn/conn_ckpt.c b/src/third_party/wiredtiger/src/conn/conn_ckpt.c index d968d4e4b2b..a47524af2d7 100644 --- a/src/third_party/wiredtiger/src/conn/conn_ckpt.c +++ b/src/third_party/wiredtiger/src/conn/conn_ckpt.c @@ -161,11 +161,8 @@ __ckpt_server_start(WT_CONNECTION_IMPL *conn) * * Checkpoint does enough I/O it may be called upon to perform slow * operations for the block manager. 
- * - * The checkpoint thread reads the lookaside table for outdated records, - * it gets its own cursor for that purpose. */ - session_flags = WT_SESSION_CAN_WAIT | WT_SESSION_LOOKASIDE_CURSOR; + session_flags = WT_SESSION_CAN_WAIT; WT_RET(__wt_open_internal_session(conn, "checkpoint-server", true, session_flags, &conn->ckpt_session)); session = conn->ckpt_session; diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 8dd48738735..2bc359df4ae 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -71,7 +71,7 @@ __evict_entry_priority(WT_SESSION_IMPL *session, WT_REF *ref) page = ref->page; /* Any page set to the oldest generation should be discarded. */ - if (page->read_gen == WT_READGEN_OLDEST) + if (WT_READGEN_EVICT_SOON(page->read_gen)) return (WT_READGEN_OLDEST); /* Any page from a dead tree is a great choice. */ @@ -1271,10 +1271,10 @@ __evict_lru_walk(WT_SESSION_IMPL *session) * system. The queue is sorted, find the first "normal" * generation. */ - read_gen_oldest = WT_READGEN_OLDEST; + read_gen_oldest = WT_READGEN_START_VALUE; for (candidates = 0; candidates < entries; ++candidates) { read_gen_oldest = queue->evict_queue[candidates].score; - if (read_gen_oldest != WT_READGEN_OLDEST) + if (!WT_READGEN_EVICT_SOON(read_gen_oldest)) break; } @@ -1286,7 +1286,7 @@ __evict_lru_walk(WT_SESSION_IMPL *session) * 50% of the entries were at the oldest read generation, take * all of them. */ - if (read_gen_oldest == WT_READGEN_OLDEST) + if (WT_READGEN_EVICT_SOON(read_gen_oldest)) queue->evict_candidates = entries; else if (candidates > entries / 2) queue->evict_candidates = candidates; @@ -1872,9 +1872,16 @@ __evict_walk_file(WT_SESSION_IMPL *session, continue; } - /* Pages that are empty or from dead trees are fast-tracked. */ + /* + * Pages that are empty or from dead trees are fast-tracked. 
+ * + * Also evict lookaside table pages without further filtering: + * the cache is under pressure by definition and we want to + * free space. + */ if (__wt_page_is_empty(page) || - F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) + F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || + F_ISSET(btree, WT_BTREE_LOOKASIDE)) goto fast; /* @@ -1937,8 +1944,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, * can be evicted as soon as they are committed. */ mod = page->modify; - if (modified && !F_ISSET(btree, WT_BTREE_LOOKASIDE) && - txn_global->current != txn_global->oldest_id && + if (modified && txn_global->current != txn_global->oldest_id && (mod->last_eviction_id == __wt_txn_oldest_id(session) || !__wt_txn_visible_all(session, mod->update_txn, NULL))) continue; @@ -1995,7 +2001,7 @@ fast: /* If the page can't be evicted, give up. */ */ if (ref != NULL) { if (__wt_ref_is_root(ref) || evict == start || give_up || - ref->page->read_gen == WT_READGEN_OLDEST || + WT_READGEN_EVICT_SOON(ref->page->read_gen) || ref->page->memory_footprint >= btree->splitmempage) { if (restarts == 0) WT_STAT_CONN_INCR( @@ -2003,7 +2009,7 @@ fast: /* If the page can't be evicted, give up. */ WT_RET(__wt_page_release(cache->walk_session, ref, WT_READ_NO_EVICT)); ref = NULL; - } else if (ref->page->read_gen == WT_READGEN_OLDEST) + } else if (WT_READGEN_EVICT_SOON(ref->page->read_gen)) WT_RET_NOTFOUND_OK(__wt_tree_walk_count( session, &ref, &refs_walked, walk_flags)); btree->evict_ref = ref; diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index 7536e3593e8..4b7c71c19ee 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -344,6 +344,9 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) /* * Update the parent to reference the replacement page. 
* + * A page evicted with lookaside entries may not have an + * address, if no updates were visible to reconciliation. + * * Publish: a barrier to ensure the structure fields are set * before the state change makes the page available to readers. */ @@ -574,10 +577,17 @@ __evict_review( } /* - * Check if reconciliation suggests trying the - * lookaside table. + * If the cache is nearly stuck, check if + * reconciliation suggests trying the lookaside table + * unless lookaside eviction is disabled globally. + * + * We don't wait until the cache is completely stuck: + * for workloads where lookaside eviction is necessary + * to make progress, we don't want a single successful + * page eviction to make the cache "unstuck" so we have + * to wait again before evicting the next page. */ - if (__wt_cache_aggressive(session) && + if (__wt_cache_nearly_stuck(session) && !F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE)) lookaside_retryp = &lookaside_retry; } @@ -626,7 +636,6 @@ __evict_review( WT_ASSERT(session, __wt_page_is_modified(page) || LF_ISSET(WT_REC_LOOKASIDE) || - F_ISSET(S2BT(session), WT_BTREE_LOOKASIDE) || __wt_txn_visible_all(session, page->modify->rec_max_txn, WT_TIMESTAMP_NULL(&page->modify->rec_max_timestamp))); diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 486ab7562a1..8ba6a240ace 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -617,6 +617,9 @@ struct __wt_page { */ #define WT_READGEN_NOTSET 0 #define WT_READGEN_OLDEST 1 +#define WT_READGEN_WONT_NEED 2 +#define WT_READGEN_EVICT_SOON(readgen) \ + ((readgen) != WT_READGEN_NOTSET && (readgen) < WT_READGEN_START_VALUE) #define WT_READGEN_START_VALUE 100 #define WT_READGEN_STEP 100 uint64_t read_gen; diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 3b196dca673..35c7d5d5a1a 100644 --- 
a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -1165,6 +1165,24 @@ __wt_ref_block_free(WT_SESSION_IMPL *session, WT_REF *ref) } /* + * __wt_btree_can_evict_dirty -- + * Check whether eviction of dirty pages or splits are permitted in the + * current tree. + * + * We cannot evict dirty pages or split while a checkpoint is in progress, + * unless the checkpoint thread is doing the work. + */ +static inline bool +__wt_btree_can_evict_dirty(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + + btree = S2BT(session); + return (btree->checkpointing == WT_CKPT_OFF || + WT_SESSION_IS_CHECKPOINT(session)); +} + +/* * __wt_leaf_page_can_split -- * Check whether a page can be split in memory. */ @@ -1272,12 +1290,10 @@ static inline bool __wt_page_can_evict( WT_SESSION_IMPL *session, WT_REF *ref, uint32_t *evict_flagsp) { - WT_BTREE *btree; WT_PAGE *page; WT_PAGE_MODIFY *mod; bool modified; - btree = S2BT(session); page = ref->page; mod = page->modify; @@ -1291,7 +1307,7 @@ __wt_page_can_evict( * parent frees the backing blocks for any no-longer-used overflow keys, * which will corrupt the checkpoint's block management. */ - if (btree->checkpointing != WT_CKPT_OFF && + if (!__wt_btree_can_evict_dirty(session) && F_ISSET_ATOMIC(ref->home, WT_PAGE_OVERFLOW_KEYS)) return (false); @@ -1315,8 +1331,7 @@ __wt_page_can_evict( * previous version might be referenced by an internal page already * written in the checkpoint, leaving the checkpoint inconsistent. */ - if (modified && btree->checkpointing != WT_CKPT_OFF && - !WT_SESSION_IS_CHECKPOINT(session)) { + if (modified && !__wt_btree_can_evict_dirty(session)) { WT_STAT_CONN_INCR(session, cache_eviction_checkpoint); WT_STAT_DATA_INCR(session, cache_eviction_checkpoint); return (false); @@ -1385,7 +1400,7 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) * tree, then perform a general check if eviction will be possible. 
*/ page = ref->page; - if (page->read_gen != WT_READGEN_OLDEST || + if (!WT_READGEN_EVICT_SOON(page->read_gen) || LF_ISSET(WT_READ_NO_EVICT) || F_ISSET(session, WT_SESSION_NO_EVICTION) || btree->evict_disabled > 0 || diff --git a/src/third_party/wiredtiger/src/include/btree_cmp.i b/src/third_party/wiredtiger/src/include/btree_cmp.i index db7af8daaed..9efbf8f618f 100644 --- a/src/third_party/wiredtiger/src/include/btree_cmp.i +++ b/src/third_party/wiredtiger/src/include/btree_cmp.i @@ -7,7 +7,7 @@ */ #ifdef HAVE_X86INTRIN_H -#if !defined(_MSC_VER) +#if !defined(_MSC_VER) && !defined(_lint) #include <x86intrin.h> #endif /* 16B alignment */ diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i index d8ecb9a91ad..d51e58e471b 100644 --- a/src/third_party/wiredtiger/src/include/cache.i +++ b/src/third_party/wiredtiger/src/include/cache.i @@ -79,6 +79,22 @@ __wt_cache_read_gen_new(WT_SESSION_IMPL *session, WT_PAGE *page) } /* + * __wt_cache_nearly_stuck -- + * Indicate if the cache is nearly stuck. + */ +static inline bool +__wt_cache_nearly_stuck(WT_SESSION_IMPL *session) +{ + WT_CACHE *cache; + + cache = S2C(session)->cache; + return (cache->evict_aggressive_score >= + (WT_EVICT_SCORE_MAX - WT_EVICT_SCORE_BUMP) && + F_ISSET(cache, + WT_CACHE_EVICT_CLEAN_HARD | WT_CACHE_EVICT_DIRTY_HARD)); +} + +/* * __wt_cache_stuck -- * Indicate if the cache is stuck (i.e., not making progress). 
*/ diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index 1d7b6142685..0b9e82ee1ef 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -365,6 +365,7 @@ struct __wt_connection_impl { */ WT_SPINLOCK las_lock; /* Lookaside table spinlock */ WT_SESSION_IMPL *las_session; /* Lookaside table session */ + uint32_t las_fileid; /* Lookaside table file ID */ /* * The "lookaside_activity" verbose messages are throttled to once per diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index af43a56f877..a62489cb661 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -1205,7 +1205,6 @@ static int __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, void *ripcip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp) { - WT_BTREE *btree; WT_PAGE *page; WT_UPDATE *first_ts_upd, *first_txn_upd, *first_upd, *upd; wt_timestamp_t *timestampp; @@ -1214,7 +1213,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, *updp = NULL; - btree = S2BT(session); page = r->page; first_ts_upd = first_txn_upd = NULL; max_txn = WT_TXN_NONE; @@ -1262,15 +1260,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * uncommitted updates). Lookaside eviction can save any * committed update. Regular eviction checks that the maximum * transaction ID and timestamp seen are stable. - * - * Use the first committed entry we find in the lookaside - * table. */ - if (F_ISSET(btree, WT_BTREE_LOOKASIDE) && !uncommitted) { - *updp = upd; - break; - } - if (F_ISSET(r, WT_REC_VISIBLE_ALL) ? !__wt_txn_upd_visible_all(session, upd) : !__wt_txn_upd_visible(session, upd)) { @@ -1326,9 +1316,10 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, /* * The checkpoint transaction is special. 
Make sure we never write - * (metadata) updates from a checkpoint in a concurrent session. + * metadata updates from a checkpoint in a concurrent session. */ - WT_ASSERT(session, *updp == NULL || (*updp)->txnid == WT_TXN_NONE || + WT_ASSERT(session, !WT_IS_METADATA(session->dhandle) || + *updp == NULL || (*updp)->txnid == WT_TXN_NONE || (*updp)->txnid != S2C(session)->txn_global.checkpoint_state.id || WT_SESSION_IS_CHECKPOINT(session)); @@ -1352,13 +1343,10 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UNUSED(first_ts_upd); timestampp = NULL; #endif - if (F_ISSET(btree, WT_BTREE_LOOKASIDE)) - all_visible = !uncommitted; - else - all_visible = *updp == first_txn_upd && - (F_ISSET(r, WT_REC_VISIBLE_ALL) ? - __wt_txn_visible_all(session, max_txn, timestampp) : - __wt_txn_visible(session, max_txn, timestampp)); + all_visible = *updp == first_txn_upd && + (F_ISSET(r, WT_REC_VISIBLE_ALL) ? + __wt_txn_visible_all(session, max_txn, timestampp) : + __wt_txn_visible(session, max_txn, timestampp)); if (all_visible) goto check_original_value; @@ -1391,8 +1379,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * path is the WT_REC_UPDATE_RESTORE flag, the lookaside table path is * the WT_REC_LOOKASIDE flag. */ - if (!F_ISSET(r, WT_REC_LOOKASIDE | WT_REC_UPDATE_RESTORE) && - !F_ISSET(btree, WT_BTREE_LOOKASIDE)) + if (!F_ISSET(r, WT_REC_LOOKASIDE | WT_REC_UPDATE_RESTORE)) return (EBUSY); if (uncommitted && !F_ISSET(r, WT_REC_UPDATE_RESTORE)) return (EBUSY); @@ -1405,14 +1392,14 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, #ifdef HAVE_TIMESTAMPS /* Track the oldest saved timestamp for lookaside. 
*/ - if (F_ISSET(r, WT_REC_LOOKASIDE)) { + if (F_ISSET(r, WT_REC_LOOKASIDE)) for (upd = first_upd; upd->next != NULL; upd = upd->next) - ; - if (__wt_timestamp_cmp( - &r->min_saved_timestamp, &upd->timestamp) > 0) - __wt_timestamp_set( - &r->min_saved_timestamp, &upd->timestamp); - } + if (upd->txnid != WT_TXN_ABORTED && + upd->txnid != WT_TXN_NONE && + __wt_timestamp_cmp( + &upd->timestamp, &r->min_saved_timestamp) < 0) + __wt_timestamp_set( + &r->min_saved_timestamp, &upd->timestamp); #endif check_original_value: @@ -1659,6 +1646,17 @@ __rec_child_modify(WT_SESSION_IMPL *session, if (F_ISSET(r, WT_REC_EVICT)) return (EBUSY); + /* + * A page evicted with lookaside entries may not have + * an address, if no updates were visible to + * reconciliation. Any child pages in that state + * should be ignored. + */ + if (ref->addr == NULL) { + *statep = WT_CHILD_IGNORE; + WT_CHILD_RELEASE(session, *hazardp, ref); + } + goto done; case WT_REF_MEM: @@ -1996,6 +1994,29 @@ __rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r) return (page_size * 2); } +#define WT_REC_MAX_SAVED_UPDATES 100 + +/* + * __rec_need_split -- + * Check whether adding some bytes to the page requires a split. + * + * This takes into account the disk image growing across a boundary, and + * also triggers a split for row store leaf pages when a threshold number + * of saved updates is reached. This allows pages to split for update / + * restore and lookaside eviction when there is no visible data that + * causes the disk image to grow. + */ +static bool +__rec_need_split(WT_RECONCILE *r, size_t len) +{ + if (r->page->type == WT_PAGE_ROW_LEAF && + r->supd_next >= WT_REC_MAX_SAVED_UPDATES) + return (true); + + return (r->raw_compression ? + len > r->space_avail : WT_CHECK_CROSSING_BND(r, len)); +} + /* * __rec_split_page_size_from_pct -- * Given a split percentage, calculate split page size in bytes. 
@@ -2456,7 +2477,7 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) btree = S2BT(session); /* Fixed length col store can call with next_len 0 */ - WT_ASSERT(session, next_len == 0 || r->space_avail < next_len); + WT_ASSERT(session, next_len == 0 || __rec_need_split(r, next_len)); /* * We should never split during salvage, and we're about to drop core @@ -2474,7 +2495,7 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) * Additionally, grow the buffer to contain the current item if we * haven't already consumed a reasonable portion of a split chunk. */ - if (inuse < r->split_size / 2) + if (inuse < r->split_size / 2 && !__rec_need_split(r, 0)) goto done; /* All page boundaries reset the dictionary. */ @@ -2557,7 +2578,7 @@ __rec_split_crossing_bnd( WT_BTREE *btree; size_t min_offset; - WT_ASSERT(session, WT_CHECK_CROSSING_BND(r, next_len)); + WT_ASSERT(session, __rec_need_split(r, next_len)); /* * If crossing the minimum split size boundary, store the boundary @@ -2566,7 +2587,7 @@ __rec_split_crossing_bnd( * large enough, just split at this point. */ if (WT_CROSSING_MIN_BND(r, next_len) && - !WT_CROSSING_SPLIT_BND(r, next_len)) { + !WT_CROSSING_SPLIT_BND(r, next_len) && !__rec_need_split(r, 0)) { btree = S2BT(session); WT_ASSERT(session, r->cur_ptr->min_offset == 0); @@ -2640,7 +2661,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, /* * We can get here if the first key/value pair won't fit. */ - if (r->entries == 0) + if (r->entries == 0 && !__rec_need_split(r, 0)) goto split_grow; /* @@ -4110,13 +4131,13 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) WT_CHILD_RELEASE_ERR(session, hazard, ref); /* Boundary: split or write the page. 
*/ - if (r->raw_compression) { - if (val->len > r->space_avail) + if (__rec_need_split(r, val->len)) { + if (r->raw_compression) WT_ERR(__rec_split_raw(session, r, val->len)); - } else - if (WT_CHECK_CROSSING_BND(r, val->len)) + else WT_ERR(__rec_split_crossing_bnd( session, r, val->len)); + } /* Copy the value onto the page. */ __rec_copy_incr(session, r, val); @@ -4158,13 +4179,13 @@ __rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) addr->addr, addr->size, __rec_vtype(addr), r->recno); /* Boundary: split or write the page. */ - if (r->raw_compression) { - if (val->len > r->space_avail) + if (__rec_need_split(r, val->len)) { + if (r->raw_compression) WT_RET(__rec_split_raw(session, r, val->len)); - } else - if (WT_CHECK_CROSSING_BND(r, val->len)) + else WT_RET(__rec_split_crossing_bnd( session, r, val->len)); + } /* Copy the value onto the page. */ __rec_copy_incr(session, r, val); @@ -4431,12 +4452,12 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, session, r, value->data, value->size, rle)); /* Boundary: split or write the page. */ - if (r->raw_compression) { - if (val->len > r->space_avail) + if (__rec_need_split(r, val->len)) { + if (r->raw_compression) WT_RET(__rec_split_raw(session, r, val->len)); - } else - if (WT_CHECK_CROSSING_BND(r, val->len)) + else WT_RET(__rec_split_crossing_bnd(session, r, val->len)); + } /* Copy the value onto the page. */ if (!deleted && !overflow_type && btree->dictionary) @@ -5132,12 +5153,11 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) r->cell_zero = false; /* Boundary: split or write the page. 
*/ - if (r->raw_compression) { - if (key->len + val->len > r->space_avail) + if (__rec_need_split(r, key->len + val->len)) { + if (r->raw_compression) WT_ERR(__rec_split_raw( session, r, key->len + val->len)); - } else - if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) { + else { /* * In one path above, we copied address blocks * from the page rather than building the actual @@ -5153,6 +5173,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_ERR(__rec_split_crossing_bnd( session, r, key->len + val->len)); } + } /* Copy the key and value onto the page. */ __rec_copy_incr(session, r, key); @@ -5202,14 +5223,14 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) addr->addr, addr->size, __rec_vtype(addr), WT_RECNO_OOB); /* Boundary: split or write the page. */ - if (r->raw_compression) { - if (key->len + val->len > r->space_avail) + if (__rec_need_split(r, key->len + val->len)) { + if (r->raw_compression) WT_RET(__rec_split_raw( session, r, key->len + val->len)); - } else - if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) + else WT_RET(__rec_split_crossing_bnd( session, r, key->len + val->len)); + } /* Copy the key and value onto the page. */ __rec_copy_incr(session, r, key); @@ -5549,12 +5570,11 @@ build: } /* Boundary: split or write the page. */ - if (r->raw_compression) { - if (key->len + val->len > r->space_avail) + if (__rec_need_split(r, key->len + val->len)) { + if (r->raw_compression) WT_ERR(__rec_split_raw( session, r, key->len + val->len)); - } else - if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) { + else { /* * If we copied address blocks from the page * rather than building the actual key, we have @@ -5585,6 +5605,7 @@ build: WT_ERR(__rec_split_crossing_bnd( session, r, key->len + val->len)); } + } /* Copy the key/value pair onto the page. 
*/ __rec_copy_incr(session, r, key); @@ -5693,12 +5714,11 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key)); /* Boundary: split or write the page. */ - if (r->raw_compression) { - if (key->len + val->len > r->space_avail) + if (__rec_need_split(r, key->len + val->len)) { + if (r->raw_compression) WT_RET(__rec_split_raw( session, r, key->len + val->len)); - } else - if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) { + else { /* * Turn off prefix compression until a full key * written to the new page, and (unless already @@ -5717,6 +5737,7 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) WT_RET(__rec_split_crossing_bnd( session, r, key->len + val->len)); } + } /* Copy the key/value pair onto the page. */ __rec_copy_incr(session, r, key); diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index c5c514c008b..3215a372d36 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -701,6 +701,16 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) break; } + /* + * Writes to the lookaside file can be evicted as soon + * as they commit. + */ + if (conn->las_fileid != 0 && + op->fileid == conn->las_fileid) { + op->u.upd->txnid = WT_TXN_NONE; + break; + } + #ifdef HAVE_TIMESTAMPS if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && op->type != WT_TXN_OP_BASIC_TS) { diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index 7d2bb62cdd1..2137d5b16ef 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -1018,7 +1018,7 @@ int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[], bool waiting) { WT_DECL_RET; - uint32_t mask; + uint32_t orig_flags; /* * Reset open cursors. 
Do this explicitly, even though it will happen @@ -1040,11 +1040,23 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[], bool waiting) * * Application checkpoints wait until the checkpoint lock is available, * compaction checkpoints don't. - */ -#define WT_TXN_SESSION_MASK \ + * + * Checkpoints should always use a separate session for lookaside + * updates, otherwise those updates are pinned until the checkpoint + * commits. Also, there are unfortunate interactions between the + * special rules for lookaside eviction and the special handling of the + * checkpoint transaction. + */ +#undef WT_CHECKPOINT_SESSION_FLAGS +#define WT_CHECKPOINT_SESSION_FLAGS \ (WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION) - mask = F_MASK(session, WT_TXN_SESSION_MASK); - F_SET(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION); +#undef WT_CHECKPOINT_SESSION_FLAGS_OFF +#define WT_CHECKPOINT_SESSION_FLAGS_OFF \ + (WT_SESSION_LOOKASIDE_CURSOR) + orig_flags = F_MASK(session, + WT_CHECKPOINT_SESSION_FLAGS | WT_CHECKPOINT_SESSION_FLAGS_OFF); + F_SET(session, WT_CHECKPOINT_SESSION_FLAGS); + F_CLR(session, WT_CHECKPOINT_SESSION_FLAGS_OFF); /* * Only one checkpoint can be active at a time, and checkpoints must run @@ -1060,8 +1072,8 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[], bool waiting) WT_WITH_CHECKPOINT_LOCK_NOWAIT(session, ret, ret = __txn_checkpoint_wrapper(session, cfg)); - F_CLR(session, WT_TXN_SESSION_MASK); - F_SET(session, mask); + F_CLR(session, WT_CHECKPOINT_SESSION_FLAGS); + F_SET(session, orig_flags); return (ret); } diff --git a/src/third_party/wiredtiger/test/suite/test_colgap.py b/src/third_party/wiredtiger/test/suite/test_colgap.py index 73feaf0dd8e..1ea55e11d78 100644 --- a/src/third_party/wiredtiger/test/suite/test_colgap.py +++ b/src/third_party/wiredtiger/test/suite/test_colgap.py @@ -176,7 +176,7 @@ class test_colmax(wttest.WiredTigerTestCase): # Confirm searching past the end of the table works. 
if not self.bulk: - cursor.set_key(recno) + cursor.set_key(simple_key(cursor, recno)) self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND) # Insert the big record. @@ -191,18 +191,18 @@ class test_colmax(wttest.WiredTigerTestCase): cursor = self.session.open_cursor(uri, None, None) # Search for the large record. - cursor.set_key(recno) + cursor.set_key(simple_key(cursor, recno)) self.assertEqual(cursor.search(), 0) self.assertEqual(cursor.get_value(), simple_value(cursor, recno)) # Update it. cursor[simple_key(cursor, recno)] = simple_value(cursor, 37) - cursor.set_key(recno) + cursor.set_key(simple_key(cursor, recno)) self.assertEqual(cursor.search(), 0) self.assertEqual(cursor.get_value(), simple_value(cursor, 37)) # Remove it. - cursor.set_key(recno) + cursor.set_key(simple_key(cursor, recno)) self.assertEqual(cursor.remove(), 0) cursor.set_key(simple_key(cursor, recno)) self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND) diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp07.py b/src/third_party/wiredtiger/test/suite/test_timestamp07.py index 09547dba3a7..ffbe1e314d1 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp07.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp07.py @@ -82,6 +82,34 @@ class test_timestamp07(wttest.WiredTigerTestCase, suite_subprocess): if txn_config: session.commit_transaction() + # Check reads of all tables at a timestamp + def check_reads(self, session, txn_config, check_value, valcnt, valcnt2, valcnt3): + if txn_config: + session.begin_transaction(txn_config) + c = session.open_cursor(self.uri + self.tablename, None) + c2 = session.open_cursor(self.uri + self.tablename2, None) + c3 = session.open_cursor(self.uri + self.tablename3, None) + count = 0 + for k, v in c: + if check_value in str(v): + count += 1 + c.close() + count2 = 0 + for k, v in c2: + if check_value in str(v): + count2 += 1 + c2.close() + count3 = 0 + for k, v in c3: + if check_value in str(v): + count3 += 1 + 
c3.close() + if txn_config: + session.commit_transaction() + self.assertEqual(count, valcnt) + self.assertEqual(count2, valcnt2) + self.assertEqual(count3, valcnt3) + # # Take a backup of the database and verify that the value we want to # check exists in the tables the expected number of times. @@ -135,6 +163,15 @@ class test_timestamp07(wttest.WiredTigerTestCase, suite_subprocess): self.session.checkpoint(ckptcfg) self.backup_check(check_value, valcnt, valcnt2, valcnt3) + def check_stable(self, check_value, valcnt, valcnt2, valcnt3): + self.ckpt_backup(check_value, valcnt, valcnt2, valcnt3) + + # When reading as-of a timestamp, tables 1 and 3 should match (both + # use timestamps and we're not running recovery, so logging behavior + # should be irrelevant). + self.check_reads(self.session, 'read_timestamp=' + self.stablets, + check_value, valcnt, valcnt2, valcnt) + def test_timestamp07(self): if not wiredtiger.timestamp_build(): self.skipTest('requires a timestamp build') @@ -177,9 +214,9 @@ class test_timestamp07(wttest.WiredTigerTestCase, suite_subprocess): # Bump the oldest timestamp, we're not going back... self.assertTimestampsEqual(self.conn.query_timestamp(), timestamp_str(self.nkeys)) - self.oldts = timestamp_str(self.nkeys) + self.oldts = self.stablets = timestamp_str(self.nkeys) self.conn.set_timestamp('oldest_timestamp=' + self.oldts) - self.conn.set_timestamp('stable_timestamp=' + self.oldts) + self.conn.set_timestamp('stable_timestamp=' + self.stablets) # print "Oldest " + self.oldts # Update them and retry. @@ -204,14 +241,14 @@ class test_timestamp07(wttest.WiredTigerTestCase, suite_subprocess): # Take a checkpoint using the given configuration. Then verify # whether value2 appears in a copy of that data or not. 
- self.ckpt_backup(self.value2, 0, self.nkeys, self.nkeys if self.using_log else 0) + self.check_stable(self.value2, 0, self.nkeys, self.nkeys if self.using_log else 0) # Update the stable timestamp to the latest, but not the oldest # timestamp and make sure we can see the data. Once the stable # timestamp is moved we should see all keys with value2. - self.conn.set_timestamp('stable_timestamp=' + \ - timestamp_str(self.nkeys*2)) - self.ckpt_backup(self.value2, self.nkeys, self.nkeys, self.nkeys) + self.stablets = timestamp_str(self.nkeys*2) + self.conn.set_timestamp('stable_timestamp=' + self.stablets) + self.check_stable(self.value2, self.nkeys, self.nkeys, self.nkeys) # If we're not using the log we're done. if not self.using_log: @@ -244,7 +281,7 @@ class test_timestamp07(wttest.WiredTigerTestCase, suite_subprocess): # of that data or not. Both tables that are logged should see # all the data regardless of timestamps. The table that is not # logged should not see any of it. - self.backup_check(self.value3, 0, self.nkeys, self.nkeys) + self.check_stable(self.value3, 0, self.nkeys, self.nkeys) if __name__ == '__main__': wttest.run() |