| | | |
|---|---|---|
| author | Michael Cahill <michael.cahill@mongodb.com> | 2017-11-10 14:25:02 +1100 |
| committer | Alex Gorrod <alexander.gorrod@mongodb.com> | 2017-11-10 14:25:02 +1100 |
| commit | d56f8dc481f3250b531273d0cd376f57df324914 (patch) | |
| tree | cdf913c7523c32f4a2441e05062d2150097b4d8c /src | |
| parent | 7a702f125909a58035625e664affa625c1f88049 (diff) | |
| download | mongo-d56f8dc481f3250b531273d0cd376f57df324914.tar.gz | |
WT-3715 Lookaside eviction tuning. (#3777)
Multiple changes aimed at improving performance and reducing stalls
when applications keep more history than fits in cache:

* Support multiple lookaside sessions and cursors simultaneously
  (initially 5); a standalone sketch of the session pool appears below.
* Stop counting lookaside pages as part of the dirty content in the
  cache.
* Add statistics that report the range of pinned timestamps.
* Try to further hand-optimize WT_SESSION::transaction_timestamp, since
  MongoDB calls it under a mutex.
* Dropping a tree with lookaside entries now causes the entries to be
  discarded in the background by the sweep thread, rather than doing a
  full pass of the lookaside table for every drop; a second sketch of
  the drop bookkeeping follows the diffstat.
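The first item is implemented in `src/cache/cache_las.c` in the diff below: `__wt_las_cursor()` now scans a fixed array of `WT_LAS_NUM_SESSIONS` shared sessions for a free slot under `las_lock`, and stalls briefly if all are busy. The following is a minimal standalone model of that pooling, assuming generic pthread primitives instead of WiredTiger's `WT_SESSION_IMPL`/`WT_SPINLOCK` types; the helper names (`las_slot_acquire`, `las_slot_release`) are invented for illustration and the 1ms stall mirrors the `__wt_sleep(0, 1000)` call in the diff.

```c
/* Build: cc -pthread las_pool_model.c */
#include <pthread.h>
#include <stdbool.h>
#include <time.h>

/* Fixed pool size; matches WT_LAS_NUM_SESSIONS in the diff below. */
#define LAS_NUM_SESSIONS	5

static pthread_mutex_t las_lock = PTHREAD_MUTEX_INITIALIZER;
static bool las_session_inuse[LAS_NUM_SESSIONS];	/* all free at start */

/*
 * las_slot_acquire --
 *	Take the first free lookaside slot, stalling briefly when all slots
 * are busy (the diff notes a condition variable would be better here).
 */
static int
las_slot_acquire(void)
{
	struct timespec ts = { 0, 1000000 };	/* 1ms stall */
	int i;

	for (;;) {
		pthread_mutex_lock(&las_lock);
		for (i = 0; i < LAS_NUM_SESSIONS; i++)
			if (!las_session_inuse[i]) {
				las_session_inuse[i] = true;
				pthread_mutex_unlock(&las_lock);
				return (i);
			}
		pthread_mutex_unlock(&las_lock);
		(void)nanosleep(&ts, NULL);
	}
}

/*
 * las_slot_release --
 *	Mark a previously acquired slot free again.
 */
static void
las_slot_release(int i)
{
	pthread_mutex_lock(&las_lock);
	las_session_inuse[i] = false;
	pthread_mutex_unlock(&las_lock);
}

int
main(void)
{
	int slot;

	slot = las_slot_acquire();
	/* A real caller would use the cursor of las_session[slot] here. */
	las_slot_release(slot);
	return (0);
}
```

As in the real change, the pool lets eviction threads write lookaside blocks concurrently instead of serializing on a single shared cursor.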
Diffstat (limited to 'src')
34 files changed, 783 insertions, 242 deletions
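Before the diff itself, here is a second standalone sketch modeling the dropped-tree bookkeeping described in the commit message. It approximates `__wt_las_save_dropped()`, `__las_sweep_init()` and the bitmap test in `__wt_las_sweep()` from `src/cache/cache_las.c`, with invented helper names, no locking, and no error handling; it is illustrative only, not the WiredTiger code.

```c
/* Build: cc las_sweep_model.c */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint32_t *las_dropped;		/* list of dropped btree IDs */
static size_t las_dropped_next, las_dropped_alloc;
static uint8_t *las_sweep_dropmap;	/* bitmap over [dropmin, dropmax] */
static uint32_t las_sweep_dropmin, las_sweep_dropmax;

/*
 * save_dropped --
 *	Record a dropped btree ID for a later sweep (error checks omitted).
 */
static void
save_dropped(uint32_t btree_id)
{
	if (las_dropped_next == las_dropped_alloc) {
		las_dropped_alloc = las_dropped_alloc == 0 ?
		    8 : 2 * las_dropped_alloc;
		las_dropped = realloc(las_dropped,
		    las_dropped_alloc * sizeof(uint32_t));
	}
	las_dropped[las_dropped_next++] = btree_id;
}

/*
 * sweep_init --
 *	Convert the drop list into a bitmap spanning the min/max IDs, then
 * clear the list: one sweep pass consumes the whole set.
 */
static void
sweep_init(void)
{
	size_t i;

	if (las_dropped_next == 0)	/* real code returns WT_NOTFOUND */
		return;

	las_sweep_dropmin = UINT32_MAX;
	las_sweep_dropmax = 0;
	for (i = 0; i < las_dropped_next; i++) {
		if (las_dropped[i] < las_sweep_dropmin)
			las_sweep_dropmin = las_dropped[i];
		if (las_dropped[i] > las_sweep_dropmax)
			las_sweep_dropmax = las_dropped[i];
	}

	free(las_sweep_dropmap);
	las_sweep_dropmap = calloc(
	    (las_sweep_dropmax - las_sweep_dropmin) / 8 + 1, 1);
	for (i = 0; i < las_dropped_next; i++)
		las_sweep_dropmap[(las_dropped[i] - las_sweep_dropmin) / 8] |=
		    (uint8_t)(1 << ((las_dropped[i] - las_sweep_dropmin) % 8));

	las_dropped_next = 0;
}

/*
 * sweep_should_remove --
 *	Return whether a lookaside record's btree ID belongs to a dropped
 * tree: one bit test per record instead of a table pass per drop.
 */
static int
sweep_should_remove(uint32_t btree_id)
{
	if (las_sweep_dropmap == NULL ||
	    btree_id < las_sweep_dropmin || btree_id > las_sweep_dropmax)
		return (0);
	return ((las_sweep_dropmap[(btree_id - las_sweep_dropmin) / 8] &
	    (1 << ((btree_id - las_sweep_dropmin) % 8))) != 0);
}

int
main(void)
{
	save_dropped(7);
	save_dropped(12);
	sweep_init();
	/* Prints "7: 1, 9: 0, 12: 1". */
	printf("7: %d, 9: %d, 12: %d\n", sweep_should_remove(7),
	    sweep_should_remove(9), sweep_should_remove(12));
	return (0);
}
```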
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index e6e8cce02e2..fc4afc7f9b1 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -116,7 +116,7 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t btree_id) cursor, btree_id, ref->page_las->las_pageid); for (; ret == 0; ret = cursor->next(cursor)) { WT_ERR(cursor->get_key(cursor, - &las_id, &las_pageid, &las_counter, &las_key)); + &las_pageid, &las_id, &las_counter, &las_key)); /* * Confirm the search using the unique prefix; if not a match, @@ -314,6 +314,11 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) /* * Attempt to set the state to WT_REF_READING for normal reads, or * WT_REF_LOCKED, for deleted pages or pages with lookaside entries. + * The difference is that checkpoints can skip over clean pages that + * are being read into cache, but need to wait for deletes or lookaside + * updates to be resolved (in order for checkpoint to write the correct + * version of the page). + * * If successful, we've won the race, read the page. */ switch (previous_state = ref->state) { @@ -368,8 +373,7 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) */ page_flags = WT_DATA_IN_ITEM(&tmp) ? WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED; - if (LF_ISSET(WT_READ_NO_EVICT) || - F_ISSET(session, WT_SESSION_NO_EVICTION)) + if (LF_ISSET(WT_READ_IGNORE_CACHE_SIZE)) FLD_SET(page_flags, WT_PAGE_READ_NO_EVICT); WT_ERR(__wt_page_inmem(session, ref, tmp.data, page_flags, &page)); tmp.mem = NULL; @@ -518,6 +522,9 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags btree = S2BT(session); + if (F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE)) + LF_SET(WT_READ_IGNORE_CACHE_SIZE); + /* * Ignore reads of pages already known to be in cache, otherwise the * eviction server can dominate these statistics. @@ -554,7 +561,7 @@ read: /* * allowed to do eviction work, check for space in the * cache. */ - if (!LF_ISSET(WT_READ_NO_EVICT)) + if (!LF_ISSET(WT_READ_IGNORE_CACHE_SIZE)) WT_RET(__wt_cache_eviction_check( session, 1, NULL)); WT_RET(__page_read(session, ref, flags)); @@ -574,7 +581,7 @@ read: /* * we "acquire" it. */ wont_need = LF_ISSET(WT_READ_WONT_NEED) || - F_ISSET(session, WT_SESSION_NO_CACHE); + F_ISSET(session, WT_SESSION_READ_WONT_NEED); continue; case WT_REF_READING: if (LF_ISSET(WT_READ_CACHE)) @@ -623,17 +630,22 @@ read: /* } /* - * If eviction is configured for this file, check to see - * if the page qualifies for forced eviction and update - * the page's generation number. If eviction isn't being - * done on this file, we're done. + * Check if the page requires forced eviction. */ - if (did_read || LF_ISSET(WT_READ_NO_EVICT) || - F_ISSET(session, WT_SESSION_NO_EVICTION) || + if (did_read || LF_ISSET(WT_READ_NO_SPLIT) || btree->evict_disabled > 0 || btree->lsm_primary) goto skip_evict; /* + * If reconciliation is disabled (e.g., when inserting + * into the lookaside table), skip forced eviction if + * the page can't split. + */ + if (F_ISSET(session, WT_SESSION_NO_RECONCILE) && + !__wt_leaf_page_can_split(session, ref->page)) + goto skip_evict; + + /* * Forcibly evict pages that are too big. */ if (force_attempts < 10 && @@ -684,9 +696,19 @@ skip_evict: /* * Check if we need an autocommit transaction. * Starting a transaction can trigger eviction, so skip * it if eviction isn't permitted. 
+ * + * The logic here is a little weird: some code paths do + * a blanket ban on checking the cache size in + * sessions, but still require a transaction (e.g., + * when updating metadata or lookaside). If + * WT_READ_IGNORE_CACHE_SIZE was passed in explicitly, + * we're done. If we set WT_READ_IGNORE_CACHE_SIZE + * because it was set in the session then make sure we + * start a transaction. */ - return (LF_ISSET(WT_READ_NO_EVICT) ? 0 : - __wt_txn_autocommit_check(session)); + return (LF_ISSET(WT_READ_IGNORE_CACHE_SIZE) && + !F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE) ? + 0 : __wt_txn_autocommit_check(session)); WT_ILLEGAL_VALUE(session); } @@ -707,7 +729,7 @@ skip_evict: /* * check if the cache needs help. If we do work for the cache, * substitute that for a sleep. */ - if (!LF_ISSET(WT_READ_NO_EVICT)) { + if (!LF_ISSET(WT_READ_IGNORE_CACHE_SIZE)) { WT_RET( __wt_cache_eviction_check(session, 1, &cache_work)); if (cache_work) @@ -728,16 +750,16 @@ __btree_verbose_lookaside_read( WT_SESSION_IMPL *session, uint32_t las_id, uint64_t las_pageid) { #ifdef HAVE_VERBOSE - WT_CONNECTION_IMPL *conn; + WT_CACHE *cache; uint64_t ckpt_gen_current, ckpt_gen_last; if (!WT_VERBOSE_ISSET(session, WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY)) return; - conn = S2C(session); + cache = S2C(session)->cache; ckpt_gen_current = __wt_gen(session, WT_GEN_CHECKPOINT); - ckpt_gen_last = conn->las_verb_gen_read; + ckpt_gen_last = cache->las_verb_gen_read; /* * This message is throttled to one per checkpoint. To do this we @@ -751,7 +773,7 @@ __btree_verbose_lookaside_read( * for which this message was printed. If the atomic swap fails * we have raced and the winning thread will print the message. */ - if (__wt_atomic_casv64(&conn->las_verb_gen_read, + if (__wt_atomic_casv64(&cache->las_verb_gen_read, ckpt_gen_last, ckpt_gen_current)) { __wt_verbose(session, WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY, diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 91b53dcba96..021788919d0 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -141,6 +141,9 @@ __split_verify_root(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_DECL_RET; WT_REF *ref; + uint32_t read_flags; + + read_flags = WT_READ_CACHE | WT_READ_NO_EVICT; /* The split is complete and live, verify all of the pages involved. */ __split_verify_intl_key_order(session, page); @@ -156,14 +159,14 @@ __split_verify_root(WT_SESSION_IMPL *session, WT_PAGE *page) * Ignore pages not in-memory (deleted, on-disk, being read), * there's no in-memory structure to check. 
*/ - if ((ret = __wt_page_in(session, - ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND) + if ((ret = + __wt_page_in(session, ref, read_flags)) == WT_NOTFOUND) continue; WT_ERR(ret); __split_verify_intl_key_order(session, ref->page); - WT_ERR(__wt_page_release(session, ref, WT_READ_NO_EVICT)); + WT_ERR(__wt_page_release(session, ref, read_flags)); } WT_INTL_FOREACH_END; return (0); @@ -1648,6 +1651,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_RET(__wt_calloc_one(session, &ref->page_las)); *ref->page_las = multi->page_las; + WT_ASSERT(session, ref->page_las->las_max_txn != WT_TXN_NONE); ref->state = WT_REF_LOOKASIDE; } diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index d15852af935..2338d5be8ed 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -58,6 +58,7 @@ __sync_checkpoint_can_skip(WT_SESSION_IMPL *session, WT_PAGE *page) i = 0; i < mod->mod_multi_entries; ++multi, ++i) if (multi->addr.addr == NULL) return (false); + return (true); } diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c index f883acef4d5..deed37517bb 100644 --- a/src/cache/cache_las.c +++ b/src/cache/cache_las.c @@ -9,18 +9,44 @@ #include "wt_internal.h" /* + * When an operation is accessing the lookaside table, it should ignore the + * cache size (since the cache is already full), any pages it reads should be + * evicted before application data, and the operation can't reenter + * reconciliation. + */ +#define WT_LAS_SESSION_FLAGS \ + (WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_READ_WONT_NEED | \ + WT_SESSION_NO_RECONCILE) + +/* + * __wt_las_nonempty -- + * Return when there are entries in the lookaside table. + */ +bool +__wt_las_nonempty(WT_SESSION_IMPL *session) +{ + WT_CACHE *cache; + + cache = S2C(session)->cache; + + return (cache->las_entry_count > 0); +} + +/* * __wt_las_stats_update -- * Update the lookaside table statistics for return to the application. */ void __wt_las_stats_update(WT_SESSION_IMPL *session) { + WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_CONNECTION_STATS **cstats; WT_DSRC_STATS **dstats; int64_t v; conn = S2C(session); + cache = conn->cache; /* * Lookaside table statistics are copied from the underlying lookaside @@ -36,7 +62,7 @@ __wt_las_stats_update(WT_SESSION_IMPL *session) */ cstats = conn->stats; dstats = ((WT_CURSOR_BTREE *) - conn->las_session->las_cursor)->btree->dhandle->stats; + cache->las_session[0]->las_cursor)->btree->dhandle->stats; v = WT_STAT_READ(dstats, cursor_insert); WT_STAT_SET(session, cstats, cache_lookaside_insert, v); @@ -62,13 +88,15 @@ __wt_las_stats_update(WT_SESSION_IMPL *session) int __wt_las_create(WT_SESSION_IMPL *session) { + WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; - uint32_t session_flags; + int i; const char *drop_cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_drop), "force=true", NULL }; conn = S2C(session); + cache = conn->cache; /* Read-only and in-memory configurations don't need the LAS table. */ if (F_ISSET(conn, WT_CONN_IN_MEMORY | WT_CONN_READONLY)) @@ -86,16 +114,17 @@ __wt_las_create(WT_SESSION_IMPL *session) WT_RET(ret); /* Re-create the table. */ - WT_RET(__wt_session_create(session, WT_LAS_URI, WT_LAS_FORMAT)); + WT_RET(__wt_session_create(session, WT_LAS_URI, WT_LAS_CONFIG)); /* * Open a shared internal session and cursor used for the lookaside - * table. This session should never be tapped for eviction. + * table. This session should never perform reconciliation. 
*/ - session_flags = WT_SESSION_NO_EVICTION; - WT_RET(__wt_open_internal_session( - conn, "lookaside table", true, session_flags, &conn->las_session)); - WT_RET(__wt_las_cursor_open(conn->las_session)); + for (i = 0; i < WT_LAS_NUM_SESSIONS; i++) { + WT_RET(__wt_open_internal_session(conn, "lookaside table", + true, WT_LAS_SESSION_FLAGS, &cache->las_session[i])); + WT_RET(__wt_las_cursor_open(cache->las_session[i])); + } /* The statistics server is already running, make sure we don't race. */ WT_WRITE_BARRIER(); @@ -111,20 +140,31 @@ __wt_las_create(WT_SESSION_IMPL *session) int __wt_las_destroy(WT_SESSION_IMPL *session) { + WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION *wt_session; + int i; conn = S2C(session); + cache = conn->cache; F_CLR(conn, WT_CONN_LOOKASIDE_OPEN); - if (conn->las_session == NULL) + if (cache == NULL) return (0); - wt_session = &conn->las_session->iface; - ret = wt_session->close(wt_session, NULL); + for (i = 0; i < WT_LAS_NUM_SESSIONS; i++) { + if (cache->las_session[i] == NULL) + continue; - conn->las_session = NULL; + wt_session = &cache->las_session[i]->iface; + WT_TRET(wt_session->close(wt_session, NULL)); + cache->las_session[i] = NULL; + } + + __wt_buf_free(session, &cache->las_sweep_key); + __wt_free(session, cache->las_dropped); + __wt_free(session, cache->las_sweep_dropmap); return (ret); } @@ -154,8 +194,8 @@ __wt_las_cursor_open(WT_SESSION_IMPL *session) btree = ((WT_CURSOR_BTREE *)cursor)->btree; /* Track the lookaside file ID. */ - if (S2C(session)->las_fileid == 0) - S2C(session)->las_fileid = btree->id; + if (S2C(session)->cache->las_fileid == 0) + S2C(session)->cache->las_fileid = btree->id; /* * Set special flags for the lookaside table: the lookaside flag (used, @@ -187,7 +227,8 @@ void __wt_las_cursor( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags) { - WT_CONNECTION_IMPL *conn; + WT_CACHE *cache; + int i; *cursorp = NULL; @@ -200,10 +241,9 @@ __wt_las_cursor( * problems and there's no reason to believe lookaside pages will be * useful more than once. */ - *session_flags = - F_MASK(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); + *session_flags = F_MASK(session, WT_LAS_SESSION_FLAGS); - conn = S2C(session); + cache = S2C(session)->cache; /* * Some threads have their own lookaside table cursors, else lock the @@ -212,12 +252,30 @@ __wt_las_cursor( if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) *cursorp = session->las_cursor; else { - __wt_spin_lock(session, &conn->las_lock); - *cursorp = conn->las_session->las_cursor; + for (;;) { + __wt_spin_lock(session, &cache->las_lock); + for (i = 0; i < WT_LAS_NUM_SESSIONS; i++) { + if (!cache->las_session_inuse[i]) { + *cursorp = + cache->las_session[i]->las_cursor; + cache->las_session_inuse[i] = true; + break; + } + } + __wt_spin_unlock(session, &cache->las_lock); + if (*cursorp != NULL) + break; + /* + * If all the lookaside sessions are busy, stall. + * + * XXX better as a condition variable. + */ + __wt_sleep(0, 1000); + } } - /* Turn caching and eviction off. */ - F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); + /* Configure session to access the lookaside table. 
*/ + F_SET(session, WT_LAS_SESSION_FLAGS); } /* @@ -226,13 +284,14 @@ __wt_las_cursor( */ int __wt_las_cursor_close( - WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags) + WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags) { - WT_CONNECTION_IMPL *conn; + WT_CACHE *cache; WT_CURSOR *cursor; WT_DECL_RET; + int i; - conn = S2C(session); + cache = S2C(session)->cache; if ((cursor = *cursorp) == NULL) return (0); @@ -245,15 +304,23 @@ __wt_las_cursor_close( * We turned off caching and eviction while the lookaside cursor was in * use, restore the session's flags. */ - F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); + F_CLR(session, WT_LAS_SESSION_FLAGS); F_SET(session, session_flags); /* * Some threads have their own lookaside table cursors, else unlock the * shared lookaside cursor. */ - if (!F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) - __wt_spin_unlock(session, &conn->las_lock); + if (!F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) { + __wt_spin_lock(session, &cache->las_lock); + for (i = 0; i < WT_LAS_NUM_SESSIONS; i++) + if (cursor->session == &cache->las_session[i]->iface) { + cache->las_session_inuse[i] = false; + break; + } + __wt_spin_unlock(session, &cache->las_lock); + WT_ASSERT(session, i != WT_LAS_NUM_SESSIONS); + } return (ret); } @@ -267,6 +334,7 @@ static int __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi) { #ifdef HAVE_VERBOSE + WT_CACHE *cache; WT_CONNECTION_IMPL *conn; #ifdef HAVE_TIMESTAMPS char hex_timestamp[2 * WT_TIMESTAMP_SIZE + 1]; @@ -283,8 +351,9 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi) return (0); conn = S2C(session); + cache = conn->cache; ckpt_gen_current = __wt_gen(session, WT_GEN_CHECKPOINT); - ckpt_gen_last = conn->las_verb_gen_write; + ckpt_gen_last = cache->las_verb_gen_write; /* * Print a message if verbose lookaside, or once per checkpoint if @@ -293,7 +362,7 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi) */ if (WT_VERBOSE_ISSET(session, WT_VERB_LOOKASIDE) || (ckpt_gen_current > ckpt_gen_last && - __wt_atomic_casv64(&conn->las_verb_gen_write, + __wt_atomic_casv64(&cache->las_verb_gen_write, ckpt_gen_last, ckpt_gen_current))) { (void)__wt_eviction_clean_needed(session, &pct_full); (void)__wt_eviction_dirty_needed(session, &pct_dirty); @@ -323,7 +392,7 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi) /* Never skip updating the tracked generation */ if (WT_VERBOSE_ISSET(session, WT_VERB_LOOKASIDE)) - conn->las_verb_gen_write = ckpt_gen_current; + cache->las_verb_gen_write = ckpt_gen_current; #else WT_UNUSED(session); WT_UNUSED(multi); @@ -336,12 +405,14 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi) * Copy one set of saved updates into the database's lookaside buffer. 
*/ int -__wt_las_insert_block(WT_SESSION_IMPL *session, - WT_PAGE *page, WT_CURSOR *cursor, WT_MULTI *multi, WT_ITEM *key) +__wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, + WT_PAGE *page, WT_MULTI *multi, WT_ITEM *key) { + WT_BTREE *btree; WT_DECL_RET; WT_ITEM las_timestamp, las_value; WT_SAVE_UPD *list; + WT_SESSION_IMPL *las_session; WT_UPDATE *upd; uint64_t insert_cnt, las_counter, las_pageid; uint32_t btree_id, i, slot; @@ -351,15 +422,23 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CLEAR(las_value); insert_cnt = 0; - btree_id = S2BT(session)->id; + btree = S2BT(session); + btree_id = btree->id; las_pageid = multi->page_las.las_pageid = - __wt_atomic_add64(&S2BT(session)->las_pageid, 1); + __wt_atomic_add64(&S2C(session)->cache->las_pageid, 1); + + if (!btree->lookaside_entries) + btree->lookaside_entries = true; + + /* Wrap all the updates in a transaction. */ + las_session = (WT_SESSION_IMPL *)cursor->session; + WT_RET(__wt_txn_begin(las_session, NULL)); /* * Make sure there are no leftover entries (e.g., from a handle * reopen). */ - WT_RET(__wt_las_remove_block(session, cursor, btree_id, las_pageid)); + WT_ERR(__wt_las_remove_block(session, cursor, btree_id, las_pageid)); /* Enter each update in the boundary's list into the lookaside store. */ for (las_counter = 0, i = 0, @@ -369,20 +448,20 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: p = key->mem; - WT_RET( + WT_ERR( __wt_vpack_uint(&p, 0, WT_INSERT_RECNO(list->ins))); key->size = WT_PTRDIFF(p, key->data); break; case WT_PAGE_ROW_LEAF: if (list->ins == NULL) - WT_RET(__wt_row_leaf_key( + WT_ERR(__wt_row_leaf_key( session, page, list->ripcip, key, false)); else { key->data = WT_INSERT_KEY(list->ins); key->size = WT_INSERT_KEY_SIZE(list->ins); } break; - WT_ILLEGAL_VALUE(session); + WT_ILLEGAL_VALUE_ERR(session); } /* @@ -430,7 +509,7 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, } cursor->set_key(cursor, - btree_id, las_pageid, ++las_counter, key); + las_pageid, btree_id, ++las_counter, key); #ifdef HAVE_TIMESTAMPS las_timestamp.data = &upd->timestamp; @@ -439,7 +518,7 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, cursor->set_value(cursor, upd->txnid, &las_timestamp, upd->type, &las_value); - WT_RET(cursor->insert(cursor)); + WT_ERR(cursor->insert(cursor)); ++insert_cnt; } while ((upd = upd->next) != NULL); } @@ -447,10 +526,17 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, if (insert_cnt > 0) { WT_STAT_CONN_INCRV( session, cache_lookaside_entries, insert_cnt); + __wt_atomic_add64( + &S2C(session)->cache->las_entry_count, insert_cnt); WT_ERR(__las_insert_block_verbose(session, multi)); } -err: __wt_free(session, multi->supd); +err: /* Resolve the transaction. */ + if (ret == 0) + ret = __wt_txn_commit(las_session, NULL); + else + WT_TRET(__wt_txn_rollback(las_session, NULL)); + __wt_free(session, multi->supd); multi->supd_entries = 0; return (ret); } @@ -471,6 +557,15 @@ __wt_las_cursor_position(WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) int exact; /* + * When scanning for all pages, start at the beginning of the lookaside + * table. + */ + if (pageid == 0) { + WT_RET(cursor->reset(cursor)); + return (cursor->next(cursor)); + } + + /* * Because of the special visibility rules for lookaside, a new block * can appear in between our search and the block of interest. Keep * trying until we find it. 
@@ -478,7 +573,7 @@ __wt_las_cursor_position(WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) for (;;) { WT_CLEAR(las_key); cursor->set_key(cursor, - btree_id, pageid, (uint64_t)0, &las_key); + pageid, btree_id, (uint64_t)0, &las_key); WT_RET(cursor->search_near(cursor, &exact)); if (exact < 0) { WT_RET(cursor->next(cursor)); @@ -494,9 +589,9 @@ __wt_las_cursor_position(WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) * WT_CONNECTION::rollback_to_stable. */ WT_RET(cursor->get_key(cursor, - &las_id, &las_pageid, &las_counter, &las_key)); - if (las_id < btree_id || (las_id == btree_id && - pageid != 0 && las_pageid < pageid)) + &las_pageid, &las_id, &las_counter, &las_key)); + if (las_pageid < pageid || (las_pageid == pageid && + las_id < btree_id)) continue; } @@ -508,7 +603,7 @@ __wt_las_cursor_position(WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) /* * __wt_las_remove_block -- - * Remove all records matching a key prefix from the lookaside store. + * Remove all records for a given page from the lookaside store. */ int __wt_las_remove_block(WT_SESSION_IMPL *session, @@ -516,18 +611,29 @@ __wt_las_remove_block(WT_SESSION_IMPL *session, { WT_DECL_RET; WT_ITEM las_key; + WT_SESSION_IMPL *las_session; uint64_t las_counter, las_pageid, remove_cnt; uint32_t las_id, session_flags; - bool local_cursor; + bool local_cursor, local_txn; remove_cnt = 0; session_flags = 0; /* [-Wconditional-uninitialized] */ - local_cursor = false; + local_cursor = local_txn = false; if (cursor == NULL) { __wt_las_cursor(session, &cursor, &session_flags); local_cursor = true; } + las_session = (WT_SESSION_IMPL *)cursor->session; + + /* + * Wrap all of the removes in a transaction, unless this remove is part + * of a larger operation. + */ + if (local_cursor) { + WT_ERR(__wt_txn_begin(las_session, NULL)); + local_txn = true; + } /* * Search for the block's unique prefix and step through all matching @@ -536,16 +642,13 @@ __wt_las_remove_block(WT_SESSION_IMPL *session, ret = __wt_las_cursor_position(cursor, btree_id, pageid); for (; ret == 0; ret = cursor->next(cursor)) { WT_ERR(cursor->get_key(cursor, - &las_id, &las_pageid, &las_counter, &las_key)); + &las_pageid, &las_id, &las_counter, &las_key)); /* * Confirm the search using the unique prefix; if not a match, - * we're done searching for records for this page. Note that - * page ID zero is special: it is a wild card indicating that - * all pages in the tree should be removed. + * we're done searching for records for this page. */ - if (las_id != btree_id || - (pageid != 0 && las_pageid != pageid)) + if (las_pageid != pageid || las_id != btree_id) break; WT_ERR(cursor->remove(cursor)); @@ -553,9 +656,218 @@ __wt_las_remove_block(WT_SESSION_IMPL *session, } WT_ERR_NOTFOUND_OK(ret); -err: if (local_cursor) - WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); +err: if (local_txn) { + if (ret == 0) + ret = __wt_txn_commit(las_session, NULL); + else + WT_TRET(__wt_txn_rollback(las_session, NULL)); + } + if (local_cursor) + WT_TRET(__wt_las_cursor_close( + session, &cursor, session_flags)); WT_STAT_CONN_DECRV(session, cache_lookaside_entries, remove_cnt); + __wt_cache_decr_check_uint64(session, + &S2C(session)->cache->las_entry_count, remove_cnt, + "lookaside entry count"); + return (ret); +} + +/* + * __wt_las_save_dropped -- + * Save a dropped btree ID to be swept from the lookaside table. 
+ */ +int +__wt_las_save_dropped(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_CACHE *cache; + WT_DECL_RET; + + btree = S2BT(session); + cache = S2C(session)->cache; + + __wt_spin_lock(session, &cache->las_sweep_lock); + WT_ERR(__wt_realloc_def(session, &cache->las_dropped_alloc, + cache->las_dropped_next + 1, &cache->las_dropped)); + cache->las_dropped[cache->las_dropped_next++] = btree->id; +err: __wt_spin_unlock(session, &cache->las_sweep_lock); + return (ret); +} + +/* + * __las_sweep_init -- + * Prepare to start a lookaside sweep. + */ +static int +__las_sweep_init(WT_SESSION_IMPL *session) +{ + WT_CACHE *cache; + WT_DECL_RET; + u_int i; + + cache = S2C(session)->cache; + + __wt_spin_lock(session, &cache->las_sweep_lock); + /* If no files have been dropped, there's nothing to do. */ + if (cache->las_dropped_next == 0) + WT_ERR(WT_NOTFOUND); + + /* Scan the btree IDs to find min/max. */ + cache->las_sweep_dropmin = UINT32_MAX; + cache->las_sweep_dropmax = 0; + for (i = 0; i < cache->las_dropped_next; i++) { + cache->las_sweep_dropmin = WT_MIN( + cache->las_sweep_dropmin, + cache->las_dropped[i]); + cache->las_sweep_dropmax = WT_MAX( + cache->las_sweep_dropmax, + cache->las_dropped[i]); + } + + /* Initialize the bitmap. */ + __wt_free(session, cache->las_sweep_dropmap); + WT_ERR(__bit_alloc(session, + 1 + cache->las_sweep_dropmax - cache->las_sweep_dropmin, + &cache->las_sweep_dropmap)); + for (i = 0; i < cache->las_dropped_next; i++) + __bit_set(cache->las_sweep_dropmap, + cache->las_dropped[i] - cache->las_sweep_dropmin); + + /* Clear the list of btree IDs. */ + cache->las_dropped_next = 0; + +err: __wt_spin_unlock(session, &cache->las_sweep_lock); + return (ret); +} + +/* + * __wt_las_sweep -- + * Sweep the lookaside table. + */ +int +__wt_las_sweep(WT_SESSION_IMPL *session) +{ + WT_CACHE *cache; + WT_CURSOR *cursor; + WT_DECL_RET; + WT_ITEM *key, las_key; + uint64_t cnt, las_counter, las_pageid, remove_cnt; + uint32_t las_id, session_flags; + int notused; + + cache = S2C(session)->cache; + cursor = NULL; + key = &cache->las_sweep_key; + remove_cnt = 0; + session_flags = 0; /* [-Werror=maybe-uninitialized] */ + + __wt_las_cursor(session, &cursor, &session_flags); + + /* We should have our own session. */ + WT_ASSERT(session, cursor->session == &session->iface); + + /* + * When continuing a sweep, position the cursor using the key from the + * last call (we don't care if we're before or after the key, either + * side is fine). + * + * Otherwise, we're starting a new sweep, gather the list of trees to + * sweep. + */ + if (key->size != 0) { + __wt_cursor_set_raw_key(cursor, key); + ret = cursor->search_near(cursor, ¬used); + + /* + * Don't search for the same key twice; if we don't set a new + * key below, it's because we've reached the end of the table + * and we want the next pass to start at the beginning of the + * table. Searching for the same key could leave us stuck at + * the end of the table, repeatedly checking the same rows. + */ + key->size = 0; + } else + ret = __las_sweep_init(session); + + if (ret != 0) + goto srch_notfound; + + /* + * The sweep server wakes up every 10 seconds (by default), it's a slow + * moving thread. Try to review the entire lookaside table once every 5 + * minutes, or every 30 calls. 
+ * + * The reason is because the lookaside table exists because we're seeing + * cache/eviction pressure (it allows us to trade performance and disk + * space for cache space), and it's likely lookaside blocks are being + * evicted, and reading them back in doesn't help things. A trickier, + * but possibly better, alternative might be to review all lookaside + * blocks in the cache in order to get rid of them, and slowly review + * lookaside blocks that have already been evicted. + */ + cnt = (uint64_t)WT_MAX(100, cache->las_entry_count / 30); + + /* Walk the file. */ + for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) { + /* + * Give up if the cache is stuck: we are ignoring the cache + * size while scanning the lookaside table, so we're making + * things worse. + */ + if (__wt_cache_stuck(session)) + cnt = 1; + + /* + * If the loop terminates after completing a work unit, we will + * continue the table sweep next time. Get a local copy of the + * sweep key, we're going to reset the cursor; do so before + * calling cursor.remove, cursor.remove can discard our hazard + * pointer and the page could be evicted from underneath us. + */ + if (cnt == 1) { + WT_ERR(__wt_cursor_get_raw_key(cursor, key)); + if (!WT_DATA_IN_ITEM(key)) + WT_ERR(__wt_buf_set( + session, key, key->data, key->size)); + } + + WT_ERR(cursor->get_key(cursor, + &las_pageid, &las_id, &las_counter, &las_key)); + + /* + * If the entry belongs to a dropped tree, discard it. + * + * Cursor opened overwrite=true: won't return WT_NOTFOUND + * should another thread remove the record before we do (not + * expected for dropped trees), and the cursor remains + * positioned in that case. + * + * TODO it would also be good to remove entries in lookaside + * from live files that have aged out. If we track for each + * entry whether it was the on-page value chosen by + * reconciliation, we can safely remove entries from that point + * on (for the given key) that are visible to all readers. + */ + if (__bit_test(cache->las_sweep_dropmap, + las_id - cache->las_sweep_dropmin)) { + WT_ERR(cursor->remove(cursor)); + ++remove_cnt; + } + } + +srch_notfound: + WT_ERR_NOTFOUND_OK(ret); + + if (0) { +err: __wt_buf_free(session, key); + } + + WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); + + __wt_cache_decr_check_uint64(session, + &S2C(session)->cache->las_entry_count, remove_cnt, + "lookaside entry count"); + return (ret); } diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c index 007aa8757da..76106b3592f 100644 --- a/src/conn/conn_cache.c +++ b/src/conn/conn_cache.c @@ -198,6 +198,10 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET_MSG(NULL, ret, "Failed to create session for eviction walks"); + WT_RET(__wt_spin_init(session, &cache->las_lock, "lookaside table")); + WT_RET(__wt_spin_init( + session, &cache->las_sweep_lock, "lookaside sweep")); + /* Allocate the LRU eviction queue. 
*/ cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR; for (i = 0; i < WT_EVICT_QUEUE_MAX; ++i) { @@ -334,6 +338,8 @@ __wt_cache_destroy(WT_SESSION_IMPL *session) __wt_spin_destroy(session, &cache->evict_pass_lock); __wt_spin_destroy(session, &cache->evict_queue_lock); __wt_spin_destroy(session, &cache->evict_walk_lock); + __wt_spin_destroy(session, &cache->las_lock); + __wt_spin_destroy(session, &cache->las_sweep_lock); wt_session = &cache->walk_session->iface; if (wt_session != NULL) WT_TRET(wt_session->close(wt_session, NULL)); diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c index 2f3f9488b58..42ae866b329 100644 --- a/src/conn/conn_handle.c +++ b/src/conn/conn_handle.c @@ -55,7 +55,6 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_SPIN_INIT_TRACKED(session, &conn->checkpoint_lock, checkpoint); WT_RET(__wt_spin_init(session, &conn->encryptor_lock, "encryptor")); WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list")); - WT_RET(__wt_spin_init(session, &conn->las_lock, "lookaside table")); WT_SPIN_INIT_TRACKED(session, &conn->metadata_lock, metadata); WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure")); WT_SPIN_INIT_TRACKED(session, &conn->schema_lock, schema); @@ -125,7 +124,6 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_destroy(session, &conn->encryptor_lock); __wt_spin_destroy(session, &conn->fh_lock); __wt_rwlock_destroy(session, &conn->hot_backup_lock); - __wt_spin_destroy(session, &conn->las_lock); __wt_spin_destroy(session, &conn->metadata_lock); __wt_spin_destroy(session, &conn->reconfig_lock); __wt_spin_destroy(session, &conn->schema_lock); diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c index 9b64c7a0f77..06e441a3037 100644 --- a/src/conn/conn_sweep.c +++ b/src/conn/conn_sweep.c @@ -278,10 +278,12 @@ __sweep_server(void *arg) WT_DECL_RET; WT_SESSION_IMPL *session; time_t now; + uint64_t last_las_sweep_id, oldest_id; u_int dead_handles; session = arg; conn = S2C(session); + last_las_sweep_id = WT_TXN_NONE; /* * Sweep for dead and excess handles. @@ -300,6 +302,26 @@ __sweep_server(void *arg) WT_STAT_CONN_INCR(session, dh_sweeps); /* + * Sweep the lookaside table. If the lookaside table hasn't yet + * been written, there's no work to do. + * + * Don't sweep the lookaside table if the cache is stuck full. + * The sweep uses the cache and can exacerbate the problem. + * If we try to sweep when the cache is full or we aren't + * making progress in eviction, sweeping can wind up constantly + * bringing in and evicting pages from the lookaside table, + * which will stop the cache from moving into the stuck state. + */ + if (__wt_las_nonempty(session) && + !__wt_cache_stuck(session)) { + oldest_id = __wt_txn_oldest_id(session); + if (WT_TXNID_LT(last_las_sweep_id, oldest_id)) { + WT_ERR(__wt_las_sweep(session)); + last_las_sweep_id = oldest_id; + } + } + + /* * Mark handles with a time of death, and report whether any * handles are marked dead. If sweep_idle_time is 0, handles * never become idle. @@ -379,15 +401,21 @@ __wt_sweep_create(WT_SESSION_IMPL *session) /* * Handle sweep does enough I/O it may be called upon to perform slow - * operations for the block manager. - * - * Don't tap the sweep thread for eviction. + * operations for the block manager. Sweep should not block due to the + * cache being full. 
*/ - session_flags = WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION; + session_flags = WT_SESSION_CAN_WAIT | WT_SESSION_IGNORE_CACHE_SIZE; WT_RET(__wt_open_internal_session( conn, "sweep-server", true, session_flags, &conn->sweep_session)); session = conn->sweep_session; + /* + * Sweep should have it's own lookaside cursor to avoid blocking reads + * and eviction when processing drops. + */ + if (F_ISSET(conn, WT_CONN_LOOKASIDE_OPEN)) + WT_RET(__wt_las_cursor_open(session)); + WT_RET(__wt_cond_alloc( session, "handle sweep server", &conn->sweep_cond)); diff --git a/src/cursor/cur_join.c b/src/cursor/cur_join.c index e1fbb63178f..bcd3943122d 100644 --- a/src/cursor/cur_join.c +++ b/src/cursor/cur_join.c @@ -532,7 +532,8 @@ typedef struct { * Handle a key produced by a custom extractor. */ static int -__curjoin_extract_insert(WT_CURSOR *cursor) { +__curjoin_extract_insert(WT_CURSOR *cursor) +{ WT_CURJOIN_EXTRACTOR *cextract; WT_DECL_RET; WT_ITEM ikey; diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c index 48db980efff..429f75208f2 100644 --- a/src/cursor/cur_table.c +++ b/src/cursor/cur_table.c @@ -33,7 +33,8 @@ typedef struct { * Handle a key produced by a custom extractor. */ static int -__curextract_insert(WT_CURSOR *cursor) { +__curextract_insert(WT_CURSOR *cursor) +{ WT_CURSOR_EXTRACTOR *cextract; WT_ITEM *key, ikey, pkey; WT_SESSION_IMPL *session; @@ -135,12 +136,13 @@ __wt_apply_single_idx(WT_SESSION_IMPL *session, WT_INDEX *idx, * Apply an operation to all indices of a table. */ static int -__apply_idx(WT_CURSOR_TABLE *ctable, size_t func_off, bool skip_immutable) { +__apply_idx(WT_CURSOR_TABLE *ctable, size_t func_off, bool skip_immutable) +{ WT_CURSOR **cp; WT_INDEX *idx; WT_SESSION_IMPL *session; - int (*f)(WT_CURSOR *); u_int i; + int (*f)(WT_CURSOR *); cp = ctable->idx_cursors; session = (WT_SESSION_IMPL *)ctable->iface.session; diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c index 147b615c0ab..13e2823d234 100644 --- a/src/evict/evict_file.c +++ b/src/evict/evict_file.c @@ -54,10 +54,11 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) */ if (F_ISSET(dhandle, WT_DHANDLE_DEAD) && F_ISSET(S2C(session), WT_CONN_LOOKASIDE_OPEN) && - !F_ISSET(btree, WT_BTREE_LOOKASIDE)) { - WT_ASSERT(session, !WT_IS_METADATA(dhandle)); + btree->lookaside_entries) { + WT_ASSERT(session, !WT_IS_METADATA(dhandle) && + !F_ISSET(btree, WT_BTREE_LOOKASIDE)); - WT_RET(__wt_las_remove_block(session, NULL, btree->id, 0)); + WT_RET(__wt_las_save_dropped(session)); } else FLD_SET(walk_flags, WT_READ_LOOKASIDE); diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 02851492039..3af5338d73f 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -75,7 +75,8 @@ __evict_entry_priority(WT_SESSION_IMPL *session, WT_REF *ref) return (WT_READGEN_OLDEST); /* Any page from a dead tree is a great choice. */ - if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD)) + if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD) || + F_ISSET(btree, WT_BTREE_LOOKASIDE)) return (WT_READGEN_OLDEST); /* Any empty page (leaf or internal), is a good choice. */ @@ -606,6 +607,21 @@ __evict_update_work(WT_SESSION_IMPL *session) F_SET(cache, WT_CACHE_EVICT_SCRUB); /* + * Try lookaside evict when: + * (1) the cache is stuck; OR + * (2) the lookaside score goes over 80; and + * (3) the cache is more than half way from the dirty target to the + * dirty trigger. 
+ */ + if (!F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE) && + (__wt_cache_stuck(session) || + (__wt_cache_lookaside_score(cache) > 80 && + dirty_inuse > (uint64_t) + ((cache->eviction_dirty_target + cache->eviction_dirty_trigger) * + bytes_max) / 200))) + F_SET(cache, WT_CACHE_EVICT_LOOKASIDE); + + /* * With an in-memory cache, we only do dirty eviction in order to scrub * pages. */ @@ -1632,6 +1648,28 @@ __evict_walk_file(WT_SESSION_IMPL *session, QUEUE_FILLS_PER_PASS; /* + * If the tree is dead or we're near the end of the queue, fill the + * remaining slots. + */ + if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) + target_pages = remaining_slots; + + /* + * Lookaside pages don't count toward the cache's dirty limit. + * + * Preferentially evict lookaside pages unless applications are stalled + * on the dirty limit. Once application threads are stalled by the + * dirty limit, don't take any lookaside pages unless we're also up + * against the total cache size limit. + */ + if (F_ISSET(btree, WT_BTREE_LOOKASIDE)) { + if (!F_ISSET(cache, WT_CACHE_EVICT_DIRTY_HARD)) + target_pages = remaining_slots; + else if (!F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD)) + target_pages = 0; + } + + /* * Walk trees with a small fraction of the cache in case there are so * many trees that none of them use enough of the cache to be allocated * slots. Only skip a tree if it has no bytes of interest. @@ -1652,12 +1690,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, if (target_pages < MIN_PAGES_PER_TREE) target_pages = MIN_PAGES_PER_TREE; - /* - * If the tree is dead or we're near the end of the queue, fill the - * remaining slots. - */ - if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || - target_pages > remaining_slots) + if (target_pages > remaining_slots) target_pages = remaining_slots; /* @@ -1993,8 +2026,8 @@ fast: /* If the page can't be evicted, give up. */ if (restarts == 0) WT_STAT_CONN_INCR( session, cache_eviction_walks_abandoned); - WT_RET(__wt_page_release(cache->walk_session, - ref, WT_READ_NO_EVICT)); + WT_RET(__wt_page_release( + cache->walk_session, ref, walk_flags)); ref = NULL; } else if (WT_READGEN_EVICT_SOON(ref->page->read_gen)) WT_RET_NOTFOUND_OK(__wt_tree_walk_count( @@ -2315,8 +2348,9 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) /* See if eviction is still needed. */ if (!__wt_eviction_needed(session, busy, &pct_full) || - (pct_full < 100 && cache->eviction_progress > - initial_progress + max_progress)) + ((pct_full < 100 || cache->eviction_scrub_limit > 0.0) && + (cache->eviction_progress > + initial_progress + max_progress))) break; /* diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c index 103c93a075b..65009dc3449 100644 --- a/src/evict/evict_page.c +++ b/src/evict/evict_page.c @@ -522,6 +522,13 @@ __evict_review( return (0); /* + * If reconciliation is disabled for this thread (e.g., during an + * eviction that writes to lookaside), give up. + */ + if (F_ISSET(session, WT_SESSION_NO_RECONCILE)) + return (EBUSY); + + /* * If the page is dirty, reconcile it to decide if we can evict it. * * If we have an exclusive lock (we're discarding the tree), assert @@ -575,9 +582,7 @@ __evict_review( * that can't be evicted, check if reconciliation * suggests trying the lookaside table. 
*/ - if (!F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE) && - (__wt_cache_lookaside_score(cache) > 50 || - __wt_cache_stuck(session))) + if (F_ISSET(cache, WT_CACHE_EVICT_LOOKASIDE)) lookaside_retryp = &lookaside_retry; } } diff --git a/src/include/btmem.h b/src/include/btmem.h index c3646a2ae59..abb7cc19972 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -167,11 +167,12 @@ struct __wt_ovfl_reuse { * are written into a lookaside table, and restored as necessary if the page is * read. * - * The key is a unique marker for the page (a file ID plus a page ID), a - * counter (used to ensure the update records remain in the original order), - * and the record's key (byte-string for row-store, record number for - * column-store). The value is the WT_UPDATE structure's transaction ID, - * timestamp, update type and value. + * The key is a unique marker for the page (a page ID plus a file ID, ordered + * this way so that overall the lookaside table is append-mostly), a counter + * (used to ensure the update records remain in the original order), and the + * record's key (byte-string for row-store, record number for column-store). + * The value is the WT_UPDATE structure's transaction ID, timestamp, update + * type and value. * * As the key for the lookaside table is different for row- and column-store, we * store both key types in a WT_ITEM, building/parsing them in the code, because @@ -181,8 +182,8 @@ struct __wt_ovfl_reuse { * makes the lookaside table's value more likely to overflow the page size when * the row-store key is relatively large. */ -#define WT_LAS_FORMAT \ - "key_format=" WT_UNCHECKED_STRING(IQQu) \ +#define WT_LAS_CONFIG \ + "key_format=" WT_UNCHECKED_STRING(QIQu) \ ",value_format=" WT_UNCHECKED_STRING(QuBu) /* diff --git a/src/include/btree.h b/src/include/btree.h index 7dc9b4a11a7..8a3273d1b6b 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -134,13 +134,13 @@ struct __wt_btree { u_int rec_multiblock_max; /* Maximum blocks written for a page */ uint64_t last_recno; /* Column-store last record number */ - uint64_t las_pageid; /* Lookaside table page ID counter */ WT_REF root; /* Root page reference */ bool modified; /* If the tree ever modified */ uint8_t original; /* Newly created: bulk-load possible (want a bool but needs atomic cas) */ + bool lookaside_entries; /* Has entries in the lookaside table */ bool lsm_primary; /* Handle is/was the LSM primary */ WT_BM *bm; /* Block manager reference */ diff --git a/src/include/btree.i b/src/include/btree.i index f2948bfc90f..edc0973ee6f 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -149,7 +149,8 @@ __wt_cache_page_inmem_incr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size) if (WT_PAGE_IS_INTERNAL(page)) { (void)__wt_atomic_add64(&btree->bytes_dirty_intl, size); (void)__wt_atomic_add64(&cache->bytes_dirty_intl, size); - } else if (!btree->lsm_primary) { + } else if (!btree->lsm_primary && + !F_ISSET(btree, WT_BTREE_LOOKASIDE)) { (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size); (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size); } @@ -189,7 +190,7 @@ __wt_cache_decr_check_size( */ static inline void __wt_cache_decr_check_uint64( - WT_SESSION_IMPL *session, uint64_t *vp, size_t v, const char *fld) + WT_SESSION_IMPL *session, uint64_t *vp, uint64_t v, const char *fld) { if (__wt_atomic_sub64(vp, v) < WT_EXABYTE) return; @@ -200,7 +201,7 @@ __wt_cache_decr_check_uint64( */ *vp = 0; __wt_errx(session, - "%s went negative with decrement of %" WT_SIZET_FMT, fld, v); + "%s went 
negative with decrement of %" PRIu64, fld, v); #ifdef HAVE_DIAGNOSTIC __wt_abort(session); @@ -261,7 +262,7 @@ __wt_cache_page_byte_dirty_decr( decr, "WT_BTREE.bytes_dirty_intl"); __wt_cache_decr_check_uint64(session, &cache->bytes_dirty_intl, decr, "WT_CACHE.bytes_dirty_intl"); - } else if (!btree->lsm_primary) { + } else if (!btree->lsm_primary && !F_ISSET(btree, WT_BTREE_LOOKASIDE)) { __wt_cache_decr_check_uint64(session, &btree->bytes_dirty_leaf, decr, "WT_BTREE.bytes_dirty_leaf"); __wt_cache_decr_check_uint64(session, &cache->bytes_dirty_leaf, @@ -321,7 +322,8 @@ __wt_cache_dirty_incr(WT_SESSION_IMPL *session, WT_PAGE *page) (void)__wt_atomic_add64(&cache->bytes_dirty_intl, size); (void)__wt_atomic_add64(&cache->pages_dirty_intl, 1); } else { - if (!btree->lsm_primary) { + if (!btree->lsm_primary && + !F_ISSET(btree, WT_BTREE_LOOKASIDE)) { (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size); (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size); } @@ -420,7 +422,8 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page, bool rewrite) __wt_cache_decr_check_uint64(session, &cache->bytes_dirty_intl, modify->bytes_dirty, "WT_CACHE.bytes_dirty_intl"); - } else if (!btree->lsm_primary) { + } else if (!btree->lsm_primary && + !F_ISSET(btree, WT_BTREE_LOOKASIDE)) { __wt_cache_decr_check_uint64(session, &btree->bytes_dirty_leaf, modify->bytes_dirty, "WT_BTREE.bytes_dirty_leaf"); @@ -1359,6 +1362,7 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) { WT_BTREE *btree; WT_PAGE *page; + bool inmem_split; btree = S2BT(session); @@ -1387,10 +1391,10 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) */ page = ref->page; if (!WT_READGEN_EVICT_SOON(page->read_gen) || - LF_ISSET(WT_READ_NO_EVICT) || - F_ISSET(session, WT_SESSION_NO_EVICTION) || + LF_ISSET(WT_READ_NO_SPLIT) || btree->evict_disabled > 0 || - !__wt_page_can_evict(session, ref, NULL)) + !__wt_page_can_evict(session, ref, &inmem_split) || + (F_ISSET(session, WT_SESSION_NO_RECONCILE) && !inmem_split)) return (__wt_hazard_clear(session, ref)); WT_RET_BUSY_OK(__wt_page_release_evict(session, ref)); @@ -1622,6 +1626,6 @@ __wt_ref_state_yield_sleep(uint64_t *yield_count, uint64_t *sleep_count) return; } - (*sleep_count) = WT_MIN((*sleep_count) + WT_THOUSAND, 10 * WT_THOUSAND); + (*sleep_count) = WT_MIN((*sleep_count) + 100, WT_THOUSAND); __wt_sleep(0, (*sleep_count)); } diff --git a/src/include/cache.h b/src/include/cache.h index 0a42853b95b..f9ce4316e29 100644 --- a/src/include/cache.h +++ b/src/include/cache.h @@ -7,6 +7,12 @@ */ /* + * Helper: in order to read without any calls to eviction, we have to ignore + * the cache size and disable splits. + */ +#define WT_READ_NO_EVICT (WT_READ_IGNORE_CACHE_SIZE | WT_READ_NO_SPLIT) + +/* * Tuning constants: I hesitate to call this tuning, but we want to review some * number of pages from each file's in-memory tree for each page we evict. */ @@ -176,6 +182,38 @@ struct __wt_cache { int32_t evict_lookaside_score; /* + * Shared lookaside lock, session and cursor, used by threads accessing + * the lookaside table (other than eviction server and worker threads + * and the sweep thread, all of which have their own lookaside cursors). 
+ */ +#define WT_LAS_NUM_SESSIONS 5 + WT_SPINLOCK las_lock; + WT_SESSION_IMPL *las_session[WT_LAS_NUM_SESSIONS]; + bool las_session_inuse[WT_LAS_NUM_SESSIONS]; + + uint32_t las_fileid; /* Lookaside table file ID */ + uint64_t las_entry_count; /* Count of entries in lookaside */ + uint64_t las_pageid; /* Lookaside table page ID counter */ + + WT_SPINLOCK las_sweep_lock; + WT_ITEM las_sweep_key; /* Track sweep position. */ + uint32_t las_sweep_dropmin; /* Minimum btree ID in current set. */ + uint8_t *las_sweep_dropmap; /* Bitmap of dropped btree IDs. */ + uint32_t las_sweep_dropmax; /* Maximum btree ID in current set. */ + + uint32_t *las_dropped; /* List of dropped btree IDs. */ + size_t las_dropped_next; /* Next index into drop list. */ + size_t las_dropped_alloc; /* Allocated size of drop list. */ + + /* + * The "lookaside_activity" verbose messages are throttled to once per + * checkpoint. To accomplish this we track the checkpoint generation + * for the most recent read and write verbose messages. + */ + uint64_t las_verb_gen_read; + uint64_t las_verb_gen_write; + + /* * Cache pool information. */ uint64_t cp_pass_pressure; /* Calculated pressure from this pass */ @@ -200,8 +238,9 @@ struct __wt_cache { #define WT_CACHE_EVICT_CLEAN_HARD 0x002 /* Clean % blocking app threads */ #define WT_CACHE_EVICT_DIRTY 0x004 /* Evict dirty pages */ #define WT_CACHE_EVICT_DIRTY_HARD 0x008 /* Dirty % blocking app threads */ -#define WT_CACHE_EVICT_SCRUB 0x010 /* Scrub dirty pages */ -#define WT_CACHE_EVICT_URGENT 0x020 /* Pages are in the urgent queue */ +#define WT_CACHE_EVICT_LOOKASIDE 0x010 /* Try lookaside eviction */ +#define WT_CACHE_EVICT_SCRUB 0x020 /* Scrub dirty pages */ +#define WT_CACHE_EVICT_URGENT 0x040 /* Pages are in the urgent queue */ #define WT_CACHE_EVICT_ALL (WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_DIRTY) uint32_t flags; }; diff --git a/src/include/cache.i b/src/include/cache.i index e160dbf4d64..c7d802f8a5f 100644 --- a/src/include/cache.i +++ b/src/include/cache.i @@ -241,12 +241,12 @@ __wt_session_can_wait(WT_SESSION_IMPL *session) return (false); /* - * LSM sets the no-eviction flag when holding the LSM tree lock, in that - * case, or when holding the schema lock, we don't want to highjack the - * thread for eviction. + * LSM sets the "ignore cache size" flag when holding the LSM tree + * lock, in that case, or when holding the schema lock, we don't want + * this thread to block for eviction. */ - return (!F_ISSET( - session, WT_SESSION_NO_EVICTION | WT_SESSION_LOCKED_SCHEMA)); + return (!F_ISSET(session, + WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_LOCKED_SCHEMA)); } /* @@ -395,12 +395,12 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool *didworkp) txn_global->current != txn_global->oldest_id); /* - * LSM sets the no-cache-check flag when holding the LSM tree lock, in - * that case, or when holding the handle list, schema or table locks - * (which can block checkpoints and eviction), don't block the thread - * for eviction. + * LSM sets the "ignore cache size" flag when holding the LSM tree + * lock, in that case, or when holding the handle list, schema or table + * locks (which can block checkpoints and eviction), don't block the + * thread for eviction. 
*/ - if (F_ISSET(session, WT_SESSION_NO_EVICTION | + if (F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_LOCKED_HANDLE_LIST | WT_SESSION_LOCKED_SCHEMA | WT_SESSION_LOCKED_TABLE)) return (0); diff --git a/src/include/connection.h b/src/include/connection.h index c1d1921bdcc..9288618c87e 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -358,23 +358,6 @@ struct __wt_connection_impl { uint64_t sweep_interval; /* Handle sweep interval */ uint64_t sweep_handles_min;/* Handle sweep minimum open */ - /* - * Shared lookaside lock, session and cursor, used by threads accessing - * the lookaside table (other than eviction server and worker threads - * and the sweep thread, all of which have their own lookaside cursors). - */ - WT_SPINLOCK las_lock; /* Lookaside table spinlock */ - WT_SESSION_IMPL *las_session; /* Lookaside table session */ - uint32_t las_fileid; /* Lookaside table file ID */ - - /* - * The "lookaside_activity" verbose messages are throttled to once per - * checkpoint. To accomplish this we track the checkpoint generation - * for the most recent read and write verbose messages. - */ - uint64_t las_verb_gen_read; - uint64_t las_verb_gen_write; - /* Set of btree IDs not being rolled back */ uint8_t *stable_rollback_bitstring; uint32_t stable_rollback_maxfile; diff --git a/src/include/extern.h b/src/include/extern.h index bbe66abf753..17afb48bda6 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -200,15 +200,18 @@ extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd); extern int __wt_search_insert(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert, bool restore) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern bool __wt_las_nonempty(WT_SESSION_IMPL *session); extern void __wt_las_stats_update(WT_SESSION_IMPL *session); extern int __wt_las_create(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_cursor_open(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_las_cursor( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags); -extern int __wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_las_insert_block(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CURSOR *cursor, WT_MULTI *multi, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_PAGE *page, WT_MULTI *multi, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_cursor_position(WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int 
__wt_las_save_dropped(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_las_sweep(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern uint32_t __wt_checksum_sw(const void *chunk, size_t len) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_checksum_init(void); extern void __wt_config_initn( WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str, size_t len); diff --git a/src/include/flags.h b/src/include/flags.h index 8ec3916435a..b191e8fe01d 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -32,17 +32,18 @@ #define WT_LOG_FSYNC 0x00000008 #define WT_LOG_SYNC_ENABLED 0x00000010 #define WT_READ_CACHE 0x00000001 -#define WT_READ_LOOKASIDE 0x00000002 -#define WT_READ_NOTFOUND_OK 0x00000004 -#define WT_READ_NO_EMPTY 0x00000008 -#define WT_READ_NO_EVICT 0x00000010 +#define WT_READ_IGNORE_CACHE_SIZE 0x00000002 +#define WT_READ_LOOKASIDE 0x00000004 +#define WT_READ_NOTFOUND_OK 0x00000008 +#define WT_READ_NO_EMPTY 0x00000010 #define WT_READ_NO_GEN 0x00000020 -#define WT_READ_NO_WAIT 0x00000040 -#define WT_READ_PREV 0x00000080 -#define WT_READ_RESTART_OK 0x00000100 -#define WT_READ_SKIP_INTL 0x00000200 -#define WT_READ_TRUNCATE 0x00000400 -#define WT_READ_WONT_NEED 0x00000800 +#define WT_READ_NO_SPLIT 0x00000040 +#define WT_READ_NO_WAIT 0x00000080 +#define WT_READ_PREV 0x00000100 +#define WT_READ_RESTART_OK 0x00000200 +#define WT_READ_SKIP_INTL 0x00000400 +#define WT_READ_TRUNCATE 0x00000800 +#define WT_READ_WONT_NEED 0x00001000 #define WT_REC_CHECKPOINT 0x00000001 #define WT_REC_EVICT 0x00000002 #define WT_REC_IN_MEMORY 0x00000004 @@ -52,26 +53,27 @@ #define WT_REC_VISIBILITY_ERR 0x00000040 #define WT_REC_VISIBLE_ALL 0x00000080 #define WT_SESSION_CAN_WAIT 0x00000001 -#define WT_SESSION_INTERNAL 0x00000002 -#define WT_SESSION_LOCKED_CHECKPOINT 0x00000004 -#define WT_SESSION_LOCKED_HANDLE_LIST_READ 0x00000008 -#define WT_SESSION_LOCKED_HANDLE_LIST_WRITE 0x00000010 -#define WT_SESSION_LOCKED_METADATA 0x00000020 -#define WT_SESSION_LOCKED_PASS 0x00000040 -#define WT_SESSION_LOCKED_SCHEMA 0x00000080 -#define WT_SESSION_LOCKED_SLOT 0x00000100 -#define WT_SESSION_LOCKED_TABLE_READ 0x00000200 -#define WT_SESSION_LOCKED_TABLE_WRITE 0x00000400 -#define WT_SESSION_LOCKED_TURTLE 0x00000800 -#define WT_SESSION_LOGGING_INMEM 0x00001000 -#define WT_SESSION_LOOKASIDE_CURSOR 0x00002000 -#define WT_SESSION_NO_CACHE 0x00004000 +#define WT_SESSION_IGNORE_CACHE_SIZE 0x00000002 +#define WT_SESSION_INTERNAL 0x00000004 +#define WT_SESSION_LOCKED_CHECKPOINT 0x00000008 +#define WT_SESSION_LOCKED_HANDLE_LIST_READ 0x00000010 +#define WT_SESSION_LOCKED_HANDLE_LIST_WRITE 0x00000020 +#define WT_SESSION_LOCKED_METADATA 0x00000040 +#define WT_SESSION_LOCKED_PASS 0x00000080 +#define WT_SESSION_LOCKED_SCHEMA 0x00000100 +#define WT_SESSION_LOCKED_SLOT 0x00000200 +#define WT_SESSION_LOCKED_TABLE_READ 0x00000400 +#define WT_SESSION_LOCKED_TABLE_WRITE 0x00000800 +#define WT_SESSION_LOCKED_TURTLE 0x00001000 +#define WT_SESSION_LOGGING_INMEM 0x00002000 +#define WT_SESSION_LOOKASIDE_CURSOR 0x00004000 #define WT_SESSION_NO_DATA_HANDLES 0x00008000 -#define WT_SESSION_NO_EVICTION 0x00010000 -#define WT_SESSION_NO_LOGGING 0x00020000 +#define WT_SESSION_NO_LOGGING 0x00010000 +#define WT_SESSION_NO_RECONCILE 0x00020000 #define WT_SESSION_NO_SCHEMA_LOCK 0x00040000 #define WT_SESSION_QUIET_CORRUPT_FILE 0x00080000 -#define WT_SESSION_SERVER_ASYNC 0x00100000 +#define WT_SESSION_READ_WONT_NEED 0x00100000 +#define WT_SESSION_SERVER_ASYNC 
0x00200000 #define WT_STAT_CLEAR 0x00000001 #define WT_STAT_JSON 0x00000002 #define WT_STAT_ON_CLOSE 0x00000004 diff --git a/src/include/stat.h b/src/include/stat.h index 12a7d532496..2477079a2a8 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -536,6 +536,8 @@ struct __wt_connection_stats { int64_t txn_pinned_range; int64_t txn_pinned_checkpoint_range; int64_t txn_pinned_snapshot_range; + int64_t txn_pinned_timestamp; + int64_t txn_pinned_timestamp_oldest; int64_t txn_sync; int64_t txn_commit_queue_head; int64_t txn_commit_queue_inserts; diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 1d7d36e332d..5d3b0c52cbd 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -5285,26 +5285,33 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); * snapshots */ #define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1278 +/*! transaction: transaction range of timestamps currently pinned */ +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1279 +/*! + * transaction: transaction range of timestamps pinned by the oldest + * timestamp + */ +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1280 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1279 +#define WT_STAT_CONN_TXN_SYNC 1281 /*! transaction: transactions commit timestamp queue inserts to head */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_HEAD 1280 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_HEAD 1282 /*! transaction: transactions commit timestamp queue inserts total */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1281 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1283 /*! transaction: transactions commit timestamp queue length */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1282 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1284 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1283 +#define WT_STAT_CONN_TXN_COMMIT 1285 /*! transaction: transactions read timestamp queue inserts to head */ -#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1284 +#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1286 /*! transaction: transactions read timestamp queue inserts total */ -#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1285 +#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1287 /*! transaction: transactions read timestamp queue length */ -#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1286 +#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1288 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1287 +#define WT_STAT_CONN_TXN_ROLLBACK 1289 /*! transaction: update conflicts */ -#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1288 +#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1290 /*! * @} diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c index d159005ee11..7a20686fb97 100644 --- a/src/lsm/lsm_merge.c +++ b/src/lsm/lsm_merge.c @@ -446,7 +446,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) } /* Discard pages we read as soon as we're done with them. */ - F_SET(session, WT_SESSION_NO_CACHE); + F_SET(session, WT_SESSION_READ_WONT_NEED); cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor); cfg[1] = "bulk,raw,skip_sort_check"; @@ -498,14 +498,14 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) WT_TRET(dest->close(dest)); src = dest = NULL; - F_CLR(session, WT_SESSION_NO_CACHE); + F_CLR(session, WT_SESSION_READ_WONT_NEED); /* * We're doing advisory reads to fault the new trees into cache. * Don't block if the cache is full: our next unit of work may be to * discard some trees to free space. 
diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c
index d159005ee11..7a20686fb97 100644
--- a/src/lsm/lsm_merge.c
+++ b/src/lsm/lsm_merge.c
@@ -446,7 +446,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
}
/* Discard pages we read as soon as we're done with them. */
- F_SET(session, WT_SESSION_NO_CACHE);
+ F_SET(session, WT_SESSION_READ_WONT_NEED);
cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
cfg[1] = "bulk,raw,skip_sort_check";
@@ -498,14 +498,14 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
WT_TRET(dest->close(dest));
src = dest = NULL;
- F_CLR(session, WT_SESSION_NO_CACHE);
+ F_CLR(session, WT_SESSION_READ_WONT_NEED);
/*
* We're doing advisory reads to fault the new trees into cache.
* Don't block if the cache is full: our next unit of work may be to
* discard some trees to free space.
*/
- F_SET(session, WT_SESSION_NO_EVICTION);
+ F_SET(session, WT_SESSION_IGNORE_CACHE_SIZE);
if (create_bloom) {
if (ret == 0)
@@ -626,6 +626,7 @@ err: if (locked)
"Merge failed with %s",
__wt_strerror(session, ret, NULL, 0));
}
- F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
+ F_CLR(session,
+ WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_READ_WONT_NEED);
return (ret);
}
diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c
index 6195726ec67..6927fe909f8 100644
--- a/src/lsm/lsm_tree.c
+++ b/src/lsm/lsm_tree.c
@@ -1068,7 +1068,8 @@ __wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
* Diagnostic: avoid deadlocks with the schema lock: if we need it for
* an operation, we should already have it.
*/
- F_SET(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK);
+ F_SET(session,
+ WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
}
/*
@@ -1078,7 +1079,8 @@ __wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
void
__wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
- F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK);
+ F_CLR(session,
+ WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
__wt_readunlock(session, &lsm_tree->rwlock);
}
@@ -1096,7 +1098,8 @@ __wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
* Diagnostic: avoid deadlocks with the schema lock: if we need it for
* an operation, we should already have it.
*/
- F_SET(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK);
+ F_SET(session,
+ WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
}
/*
@@ -1106,7 +1109,8 @@ __wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
void
__wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
- F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK);
+ F_CLR(session,
+ WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
__wt_writeunlock(session, &lsm_tree->rwlock);
}
diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c
index f6aea02e20d..76827f7888c 100644
--- a/src/lsm/lsm_work_unit.c
+++ b/src/lsm/lsm_work_unit.c
@@ -503,7 +503,8 @@ __lsm_bloom_create(WT_SESSION_IMPL *session,
* ourselves to get stuck creating bloom filters, the entire tree
* can stall since there may be no worker threads available to flush.
*/
- F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
+ F_SET(session,
+ WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_READ_WONT_NEED);
for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
WT_ERR(src->get_key(src, &key));
__wt_bloom_insert(bloom, &key);
@@ -514,7 +515,7 @@ __lsm_bloom_create(WT_SESSION_IMPL *session,
WT_TRET(__wt_bloom_finalize(bloom));
WT_ERR(ret);
- F_CLR(session, WT_SESSION_NO_CACHE);
+ F_CLR(session, WT_SESSION_READ_WONT_NEED);
/* Load the new Bloom filter into cache. */
WT_CLEAR(key);
@@ -537,7 +538,8 @@ __lsm_bloom_create(WT_SESSION_IMPL *session,
err: if (bloom != NULL)
WT_TRET(__wt_bloom_close(bloom));
- F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
+ F_CLR(session,
+ WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_READ_WONT_NEED);
return (ret);
}
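The LSM changes above all follow the same save/restore discipline for the two session hints: set them around a bulk scan, clear them on every exit path. A sketch of that pattern, assuming WiredTiger's internal build environment (wt_internal.h); the helper below is illustrative only, not code from this patch:

#include "wt_internal.h"

/*
 * ex_scan_with_hints --
 *    Sketch only: set the cache hints for the duration of a scan and clear
 * them on both the success and error paths, so a failure can't leave the
 * session permanently exempt from cache limits.
 */
static int
ex_scan_with_hints(WT_SESSION_IMPL *session)
{
    int ret;

    F_SET(session,
        WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_READ_WONT_NEED);

    ret = 0;    /* ... the scan or bulk load would run here ... */

    /* Clear the hints whether the scan succeeded or failed. */
    F_CLR(session,
        WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_READ_WONT_NEED);
    return (ret);
}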
diff --git a/src/os_posix/os_map.c b/src/os_posix/os_map.c
index 3d06461a9ba..5e625a49bac 100644
--- a/src/os_posix/os_map.c
+++ b/src/os_posix/os_map.c
@@ -88,7 +88,7 @@ __wt_posix_map_preload(WT_FILE_HANDLE *fh,
length += WT_PTRDIFF(map, blk);
/* XXX proxy for "am I doing a scan?" -- manual read-ahead */
- if (F_ISSET(session, WT_SESSION_NO_CACHE)) {
+ if (F_ISSET(session, WT_SESSION_READ_WONT_NEED)) {
/* Read in 2MB blocks every 1MB of data. */
if (((uintptr_t)((uint8_t *)blk + length) &
(uintptr_t)((1<<20) - 1)) < (uintptr_t)blk)
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index f3d469a95c9..b509c49cbbc 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -1461,6 +1461,8 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
if (uncommitted && !F_ISSET(r, WT_REC_UPDATE_RESTORE))
return (EBUSY);
+ WT_ASSERT(session, r->max_txn != WT_TXN_NONE);
+
/*
* The order of the updates on the list matters, we can't move only the
* unresolved updates, move the entire update list.
@@ -6062,7 +6064,7 @@ __rec_las_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
for (multi = r->multi, i = 0; i < r->multi_next; ++multi, ++i)
if (multi->supd != NULL)
WT_ERR(__wt_las_insert_block(
- session, r->page, cursor, multi, key));
+ session, cursor, r->page, multi, key));
err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
diff --git a/src/session/session_api.c b/src/session/session_api.c
index fa33b55c936..d81735234a0 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -259,9 +259,9 @@ __session_reconfigure(WT_SESSION *wt_session, const char *config)
ret = __wt_config_getones(session, config, "ignore_cache_size", &cval);
if (ret == 0) {
if (cval.val)
- F_SET(session, WT_SESSION_NO_EVICTION);
+ F_SET(session, WT_SESSION_IGNORE_CACHE_SIZE);
else
- F_CLR(session, WT_SESSION_NO_EVICTION);
+ F_CLR(session, WT_SESSION_IGNORE_CACHE_SIZE);
}
WT_ERR_NOTFOUND_OK(ret);
@@ -1489,7 +1489,12 @@ __session_timestamp_transaction(WT_SESSION *wt_session, const char *config)
WT_SESSION_IMPL *session;
session = (WT_SESSION_IMPL *)wt_session;
+#ifdef HAVE_DIAGNOSTIC
SESSION_API_CALL(session, timestamp_transaction, config, cfg);
+#else
+ SESSION_API_CALL(session, timestamp_transaction, NULL, cfg);
+ cfg[1] = config;
+#endif
WT_TRET(__wt_txn_set_timestamp(session, cfg));
err: API_END_RET(session, ret);
}
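The ignore_cache_size option parsed above in __session_reconfigure is the public knob behind WT_SESSION_IGNORE_CACHE_SIZE. A minimal usage sketch against the public API, with error handling elided and "session" assumed to be an open WT_SESSION:

#include <wiredtiger.h>

static void
ex_toggle_ignore_cache_size(WT_SESSION *session)
{
    /* Let this session keep working even when the cache is over its limit... */
    (void)session->reconfigure(session, "ignore_cache_size=true");

    /* ...and restore the default behaviour afterwards. */
    (void)session->reconfigure(session, "ignore_cache_size=false");
}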
diff --git a/src/session/session_compact.c b/src/session/session_compact.c
index 6ccf3161229..aa2f1bc3bd8 100644
--- a/src/session/session_compact.c
+++ b/src/session/session_compact.c
@@ -349,23 +349,21 @@ __wt_session_compact(
WT_DECL_RET;
WT_SESSION_IMPL *session;
u_int i;
- bool no_eviction_set;
+ bool ignore_cache_size_set;
- no_eviction_set = false;
+ ignore_cache_size_set = false;
session = (WT_SESSION_IMPL *)wt_session;
SESSION_API_CALL(session, compact, config, cfg);
/*
- * Don't highjack the compaction thread for eviction; it's holding locks
- * blocking checkpoints and once an application is tapped for eviction,
- * it can spend a long time doing nothing else. (And, if we're tapping
- * application threads for eviction, compaction should quit, it's not
- * making anything better.)
+ * The compaction thread should not block when the cache is full: it is
+ * holding locks blocking checkpoints and once the cache is full, it can
+ * spend a long time doing eviction.
*/
- if (!F_ISSET(session, WT_SESSION_NO_EVICTION)) {
- no_eviction_set = true;
- F_SET(session, WT_SESSION_NO_EVICTION);
+ if (!F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE)) {
+ ignore_cache_size_set = true;
+ F_SET(session, WT_SESSION_IGNORE_CACHE_SIZE);
}
/* In-memory ignores compaction operations. */
@@ -437,8 +435,8 @@ err: session->compact = NULL;
*/
WT_TRET(__wt_session_release_resources(session));
- if (no_eviction_set)
- F_CLR(session, WT_SESSION_NO_EVICTION);
+ if (ignore_cache_size_set)
+ F_CLR(session, WT_SESSION_IGNORE_CACHE_SIZE);
if (ret != 0)
WT_STAT_CONN_INCR(session, session_table_compact_fail);
diff --git a/src/support/stat.c b/src/support/stat.c
index 924afaa21d6..b4533841ec6 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -1004,6 +1004,8 @@ static const char * const __stats_connection_desc[] = {
"transaction: transaction range of IDs currently pinned",
"transaction: transaction range of IDs currently pinned by a checkpoint",
"transaction: transaction range of IDs currently pinned by named snapshots",
+ "transaction: transaction range of timestamps currently pinned",
+ "transaction: transaction range of timestamps pinned by the oldest timestamp",
"transaction: transaction sync calls",
"transaction: transactions commit timestamp queue inserts to head",
"transaction: transactions commit timestamp queue inserts total",
@@ -1335,6 +1337,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
/* not clearing txn_pinned_range */
/* not clearing txn_pinned_checkpoint_range */
/* not clearing txn_pinned_snapshot_range */
+ /* not clearing txn_pinned_timestamp */
+ /* not clearing txn_pinned_timestamp_oldest */
stats->txn_sync = 0;
stats->txn_commit_queue_head = 0;
stats->txn_commit_queue_inserts = 0;
@@ -1769,6 +1773,9 @@ __wt_stat_connection_aggregate(
WT_STAT_READ(from, txn_pinned_checkpoint_range);
to->txn_pinned_snapshot_range +=
WT_STAT_READ(from, txn_pinned_snapshot_range);
+ to->txn_pinned_timestamp += WT_STAT_READ(from, txn_pinned_timestamp);
+ to->txn_pinned_timestamp_oldest +=
+ WT_STAT_READ(from, txn_pinned_timestamp_oldest);
to->txn_sync += WT_STAT_READ(from, txn_sync);
to->txn_commit_queue_head +=
WT_STAT_READ(from, txn_commit_queue_head);
diff --git a/src/txn/txn.c b/src/txn/txn.c
index 3d45ff8a88c..8b4a7fc7936 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -612,7 +612,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
WT_TXN_GLOBAL *txn_global;
WT_TXN_OP *op;
u_int i;
- bool did_update, locked;
+ bool locked;
#ifdef HAVE_TIMESTAMPS
wt_timestamp_t prev_commit_timestamp, ts;
bool update_timestamp;
@@ -621,11 +621,11 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
txn = &session->txn;
conn = S2C(session);
txn_global = &conn->txn_global;
- did_update = txn->mod_count != 0;
locked = false;
WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
- WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || !did_update);
+ WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) ||
+ txn->mod_count == 0);
/*
* Look for a commit timestamp.
@@ -716,7 +716,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
}
/* If we are logging, write a commit log record. */
- if (did_update &&
+ if (txn->logrec != NULL &&
FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
!F_ISSET(session, WT_SESSION_NO_LOGGING)) {
/*
@@ -757,8 +757,8 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
* Writes to the lookaside file can be evicted as soon
* as they commit.
*/
- if (conn->las_fileid != 0 &&
- op->fileid == conn->las_fileid) {
+ if (conn->cache->las_fileid != 0 &&
+ op->fileid == conn->cache->las_fileid) {
op->u.upd->txnid = WT_TXN_NONE;
break;
}
@@ -823,6 +823,20 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
* write lock and re-check.
*/
if (update_timestamp) {
+#if WT_TIMESTAMP_SIZE == 8
+ while (__wt_timestamp_cmp(
+ &txn->commit_timestamp, &prev_commit_timestamp) > 0) {
+ if (__wt_atomic_cas64(
+ &txn_global->commit_timestamp.val,
+ prev_commit_timestamp.val,
+ txn->commit_timestamp.val)) {
+ txn_global->has_commit_timestamp = true;
+ break;
+ }
+ __wt_timestamp_set(
+ &prev_commit_timestamp, &txn_global->commit_timestamp);
+ }
+#else
__wt_writelock(session, &txn_global->rwlock);
if (__wt_timestamp_cmp(&txn->commit_timestamp,
&txn_global->commit_timestamp) > 0) {
@@ -831,6 +845,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
txn_global->has_commit_timestamp = true;
}
__wt_writeunlock(session, &txn_global->rwlock);
+#endif
}
#endif
@@ -881,8 +896,9 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
case WT_TXN_OP_BASIC_TS:
case WT_TXN_OP_INMEM:
WT_ASSERT(session, op->u.upd->txnid == txn->id);
- WT_ASSERT(session, S2C(session)->las_fileid == 0 ||
- op->fileid != S2C(session)->las_fileid);
+ WT_ASSERT(session,
+ S2C(session)->cache->las_fileid == 0 ||
+ op->fileid != S2C(session)->cache->las_fileid);
op->u.upd->txnid = WT_TXN_ABORTED;
break;
case WT_TXN_OP_REF:
@@ -962,6 +978,15 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session)
WT_STAT_SET(session, stats, txn_pinned_range,
txn_global->current - txn_global->oldest_id);
+#if WT_TIMESTAMP_SIZE == 8
+ WT_STAT_SET(session, stats, txn_pinned_timestamp,
+ txn_global->commit_timestamp.val -
+ txn_global->pinned_timestamp.val);
+ WT_STAT_SET(session, stats, txn_pinned_timestamp_oldest,
+ txn_global->commit_timestamp.val -
+ txn_global->oldest_timestamp.val);
+#endif
+
WT_STAT_SET(session, stats, txn_pinned_snapshot_range,
snapshot_pinned == WT_TXN_NONE ?
0 : txn_global->current - snapshot_pinned);
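For 8-byte timestamps, the commit path above replaces the global write lock with a compare-and-swap loop that only ever moves the shared commit timestamp forward. A standalone sketch of the same idea using C11 atomics; the names are illustrative, not WiredTiger's:

#include <stdatomic.h>
#include <stdint.h>

static void
ex_advance_global_commit_ts(_Atomic uint64_t *global_ts, uint64_t my_ts)
{
    uint64_t seen = atomic_load(global_ts);

    /* Keep trying while my_ts is still ahead of the published value. */
    while (my_ts > seen &&
        !atomic_compare_exchange_weak(global_ts, &seen, my_ts))
        ;   /* a failed CAS refreshed "seen"; re-test and retry */
}

If another committer publishes a larger timestamp first, the loop exits without writing, which is exactly the "only move forwards" behaviour the locked fallback path also enforces.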
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index eb32ef2d06a..c82187daf85 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -122,7 +122,7 @@ __checkpoint_update_generation(WT_SESSION_IMPL *session)
*/
static int
__checkpoint_apply_all(WT_SESSION_IMPL *session, const char *cfg[],
- int (*op)(WT_SESSION_IMPL *, const char *[]), bool *fullp)
+ int (*op)(WT_SESSION_IMPL *, const char *[]), bool *fullp)
{
WT_CONFIG targetconf;
WT_CONFIG_ITEM cval, k, v;
@@ -205,7 +205,7 @@ err: __wt_scr_free(session, &tmp);
*/
static int
__checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[],
- int (*op)(WT_SESSION_IMPL *, const char *[]))
+ int (*op)(WT_SESSION_IMPL *, const char *[]))
{
WT_DECL_RET;
u_int i;
@@ -440,6 +440,13 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session)
if (current_dirty <= (double)cache->eviction_checkpoint_target)
break;
+ /*
+ * Don't scrub when the lookaside table is in use: scrubbing is
+ * counter-productive in that case.
+ */
+ if (F_ISSET(cache, WT_CACHE_EVICT_LOOKASIDE))
+ break;
+
__wt_sleep(0, stepdown_us / 10);
__wt_epoch(session, &stop);
current_us = WT_TIMEDIFF_US(stop, last);
@@ -1080,7 +1087,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[], bool waiting)
*/
#undef WT_CHECKPOINT_SESSION_FLAGS
#define WT_CHECKPOINT_SESSION_FLAGS \
- (WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION)
+ (WT_SESSION_CAN_WAIT | WT_SESSION_IGNORE_CACHE_SIZE)
#undef WT_CHECKPOINT_SESSION_FLAGS_OFF
#define WT_CHECKPOINT_SESSION_FLAGS_OFF \
(WT_SESSION_LOOKASIDE_CURSOR)
diff --git a/src/txn/txn_rollback_to_stable.c b/src/txn/txn_rollback_to_stable.c
index 929aba30155..c68d00d7503 100644
--- a/src/txn/txn_rollback_to_stable.c
+++ b/src/txn/txn_rollback_to_stable.c
@@ -46,12 +46,12 @@ __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session)
__wt_las_cursor(session, &cursor, &session_flags);
/* Discard pages we read as soon as we're done with them. */
- F_SET(session, WT_SESSION_NO_CACHE);
+ F_SET(session, WT_SESSION_READ_WONT_NEED);
/* Walk the file. */
for (; (ret = cursor->next(cursor)) == 0; ) {
WT_ERR(cursor->get_key(cursor,
- &las_id, &las_pageid, &las_counter, &las_key));
+ &las_pageid, &las_id, &las_counter, &las_key));
/* Check the file ID so we can skip durable tables */
if (las_id >= conn->stable_rollback_maxfile)
@@ -79,7 +79,7 @@ __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session)
err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
WT_STAT_CONN_SET(session, cache_lookaside_entries, las_total);
- F_CLR(session, WT_SESSION_NO_CACHE);
+ F_CLR(session, WT_SESSION_READ_WONT_NEED);
return (ret);
}
diff --git a/src/txn/txn_timestamp.c b/src/txn/txn_timestamp.c
index 98887627bfc..5a39a6d84dc 100644
--- a/src/txn/txn_timestamp.c
+++ b/src/txn/txn_timestamp.c
@@ -210,6 +210,10 @@ __txn_global_query_timestamp(
__wt_timestamp_set(&ts, &txn_global->commit_timestamp));
WT_ASSERT(session, !__wt_timestamp_iszero(&ts));
+ /* Skip the lock if there are no running transactions. */
+ if (TAILQ_EMPTY(&txn_global->commit_timestamph))
+ goto done;
+
/* Compare with the oldest running transaction. */
__wt_readlock(session, &txn_global->commit_timestamp_rwlock);
txn = TAILQ_FIRST(&txn_global->commit_timestamph);
@@ -254,7 +258,7 @@ __txn_global_query_timestamp(
WT_RET_MSG(session, EINVAL,
"unknown timestamp query %.*s", (int)cval.len, cval.str);
- __wt_timestamp_set(tsp, &ts);
+done: __wt_timestamp_set(tsp, &ts);
return (0);
}
#endif
@@ -292,7 +296,8 @@ __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session)
{
WT_DECL_RET;
WT_TXN_GLOBAL *txn_global;
- wt_timestamp_t active_timestamp, oldest_timestamp, pinned_timestamp;
+ wt_timestamp_t active_timestamp, last_pinned_timestamp;
+ wt_timestamp_t oldest_timestamp, pinned_timestamp;
const char *query_cfg[] = { WT_CONFIG_BASE(session,
WT_CONNECTION_query_timestamp), "get=pinned", NULL };
@@ -316,6 +321,16 @@ __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session)
} else
__wt_timestamp_set(&pinned_timestamp, &active_timestamp);
+ if (txn_global->has_pinned_timestamp) {
+ WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
+ __wt_timestamp_set(
+ &last_pinned_timestamp, &txn_global->pinned_timestamp));
+
+ if (__wt_timestamp_cmp(
+ &pinned_timestamp, &last_pinned_timestamp) <= 0)
+ return (0);
+ }
+
__wt_writelock(session, &txn_global->rwlock);
if (!txn_global->has_pinned_timestamp || __wt_timestamp_cmp(
&txn_global->pinned_timestamp, &pinned_timestamp) < 0) {
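__wt_txn_update_pinned_timestamp now peeks at the current value under the read lock and returns early when the candidate would not move the pinned timestamp forward, taking the write lock (and re-checking under it) only when an update is actually needed; the set_timestamp path below follows the same shape. A standalone sketch of that pattern with a POSIX rwlock, using illustrative names rather than WiredTiger code:

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

struct ex_global {
    pthread_rwlock_t lock;
    bool has_pinned;
    uint64_t pinned;
};

static void
ex_update_pinned(struct ex_global *g, uint64_t candidate)
{
    uint64_t last;

    if (g->has_pinned) {
        pthread_rwlock_rdlock(&g->lock);
        last = g->pinned;
        pthread_rwlock_unlock(&g->lock);
        if (candidate <= last)
            return;     /* nothing to do: skip the write lock entirely */
    }

    pthread_rwlock_wrlock(&g->lock);
    /* Re-check: another thread may have advanced it since we peeked. */
    if (!g->has_pinned || g->pinned < candidate) {
        g->pinned = candidate;
        g->has_pinned = true;
    }
    pthread_rwlock_unlock(&g->lock);
}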
@@ -364,6 +379,7 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
{
WT_TXN_GLOBAL *txn_global;
wt_timestamp_t commit_ts, oldest_ts, stable_ts;
+ wt_timestamp_t last_oldest_ts, last_stable_ts;
txn_global = &S2C(session)->txn_global;
/*
@@ -376,7 +392,11 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
session, "oldest", &oldest_ts, &oldest_cval));
WT_RET(__wt_txn_parse_timestamp(
session, "stable", &stable_ts, &stable_cval));
- __wt_writelock(session, &txn_global->rwlock);
+
+ __wt_readlock(session, &txn_global->rwlock);
+
+ __wt_timestamp_set(&last_oldest_ts, &txn_global->oldest_timestamp);
+ __wt_timestamp_set(&last_stable_ts, &txn_global->stable_timestamp);
/*
* First do error checking on the timestamp values. The
@@ -388,9 +408,9 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
if (!has_commit && txn_global->has_commit_timestamp)
__wt_timestamp_set(&commit_ts, &txn_global->commit_timestamp);
if (!has_oldest && txn_global->has_oldest_timestamp)
- __wt_timestamp_set(&oldest_ts, &txn_global->oldest_timestamp);
- if (!has_stable && txn_global->has_oldest_timestamp)
- __wt_timestamp_set(&stable_ts, &txn_global->stable_timestamp);
+ __wt_timestamp_set(&oldest_ts, &last_oldest_ts);
+ if (!has_stable && txn_global->has_stable_timestamp)
+ __wt_timestamp_set(&stable_ts, &last_stable_ts);
/*
* If a commit timestamp was supplied, check that it is no older than
@@ -398,7 +418,7 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
*/
if (has_commit && (has_oldest || txn_global->has_oldest_timestamp) &&
__wt_timestamp_cmp(&oldest_ts, &commit_ts) > 0) {
- __wt_writeunlock(session, &txn_global->rwlock);
+ __wt_readunlock(session, &txn_global->rwlock);
WT_RET_MSG(session, EINVAL,
"set_timestamp: oldest timestamp must not be later than "
"commit timestamp");
@@ -406,7 +426,7 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
if (has_commit && (has_stable || txn_global->has_stable_timestamp) &&
__wt_timestamp_cmp(&stable_ts, &commit_ts) > 0) {
- __wt_writeunlock(session, &txn_global->rwlock);
+ __wt_readunlock(session, &txn_global->rwlock);
WT_RET_MSG(session, EINVAL,
"set_timestamp: stable timestamp must not be later than "
"commit timestamp");
@@ -420,12 +440,27 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
(has_oldest || txn_global->has_oldest_timestamp) &&
(has_stable || txn_global->has_stable_timestamp) &&
__wt_timestamp_cmp(&oldest_ts, &stable_ts) > 0) {
- __wt_writeunlock(session, &txn_global->rwlock);
+ __wt_readunlock(session, &txn_global->rwlock);
WT_RET_MSG(session, EINVAL,
"set_timestamp: oldest timestamp must not be later than "
"stable timestamp");
}
+ __wt_readunlock(session, &txn_global->rwlock);
+
+ /* Check if we are actually updating anything. */
+ if (has_oldest && txn_global->has_oldest_timestamp &&
+ __wt_timestamp_cmp(&oldest_ts, &last_oldest_ts) <= 0)
+ has_oldest = false;
+
+ if (has_stable && txn_global->has_stable_timestamp &&
+ __wt_timestamp_cmp(&stable_ts, &last_stable_ts) <= 0)
+ has_stable = false;
+
+ if (!has_commit && !has_oldest && !has_stable)
+ return (0);
+
+ __wt_writelock(session, &txn_global->rwlock);
/*
* This method can be called from multiple threads, check that we are
* moving the global timestamps forwards.
@@ -543,7 +578,7 @@ __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
/*
* Look for a commit timestamp.
*/
- ret = __wt_config_gets(session, cfg, "commit_timestamp", &cval);
+ ret = __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval);
if (ret == 0 && cval.len != 0) {
#ifdef HAVE_TIMESTAMPS
WT_TXN *txn = &session->txn;
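From the application's point of view, the "are we actually updating anything" check above means that repeating a set_timestamp call with the same (or older) oldest/stable values returns early without taking the global write lock. A usage sketch against the public WT_CONNECTION API, with error handling elided, "conn" assumed to be an open connection, and the hex timestamp values chosen purely for illustration:

#include <stdio.h>
#include <wiredtiger.h>

static void
ex_advance_and_query(WT_CONNECTION *conn)
{
    char hex_ts[64];

    /* Global timestamps are passed as hexadecimal strings. */
    (void)conn->set_timestamp(conn,
        "oldest_timestamp=1e,stable_timestamp=1f");

    /* Setting values that don't move anything forward is now a no-op. */
    (void)conn->set_timestamp(conn,
        "oldest_timestamp=1e,stable_timestamp=1f");

    /* The pinned range reported by the new statistics starts here. */
    (void)conn->query_timestamp(conn, hex_ts, "get=pinned");
    printf("pinned timestamp: %s\n", hex_ts);
}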