summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
author Michael Cahill <michael.cahill@mongodb.com> 2017-11-10 14:25:02 +1100
committer Alex Gorrod <alexander.gorrod@mongodb.com> 2017-11-10 14:25:02 +1100
commit d56f8dc481f3250b531273d0cd376f57df324914 (patch)
tree cdf913c7523c32f4a2441e05062d2150097b4d8c /src
parent 7a702f125909a58035625e664affa625c1f88049 (diff)
download mongo-d56f8dc481f3250b531273d0cd376f57df324914.tar.gz
WT-3715 Lookaside eviction tuning. (#3777)
Multiple changes aimed at improving performance and decreasing stalls when applications keep more history than fits in cache. Support multiple lookaside sessions / cursors simultaneously (initially 5). Don't count lookaside pages as part of the dirty content in cache. Add statistics that indicate the range of pinned timestamps. Try to further hand-optimize WT_SESSION::transaction_timestamp, since it is called under a mutex by MongoDB. Dropping a tree with lookaside entries now causes the entries to be discarded in the background by the sweep thread, rather than doing a full pass of the lookaside table for every drop.
Diffstat (limited to 'src')
-rw-r--r-- src/btree/bt_read.c 58
-rw-r--r-- src/btree/bt_split.c 10
-rw-r--r-- src/btree/bt_sync.c 1
-rw-r--r-- src/cache/cache_las.c 426
-rw-r--r-- src/conn/conn_cache.c 6
-rw-r--r-- src/conn/conn_handle.c 2
-rw-r--r-- src/conn/conn_sweep.c 36
-rw-r--r-- src/cursor/cur_join.c 3
-rw-r--r-- src/cursor/cur_table.c 8
-rw-r--r-- src/evict/evict_file.c 7
-rw-r--r-- src/evict/evict_lru.c 56
-rw-r--r-- src/evict/evict_page.c 11
-rw-r--r-- src/include/btmem.h 15
-rw-r--r-- src/include/btree.h 2
-rw-r--r-- src/include/btree.i 24
-rw-r--r-- src/include/cache.h 43
-rw-r--r-- src/include/cache.i 20
-rw-r--r-- src/include/connection.h 17
-rw-r--r-- src/include/extern.h 7
-rw-r--r-- src/include/flags.h 56
-rw-r--r-- src/include/stat.h 2
-rw-r--r-- src/include/wiredtiger.in 27
-rw-r--r-- src/lsm/lsm_merge.c 9
-rw-r--r-- src/lsm/lsm_tree.c 12
-rw-r--r-- src/lsm/lsm_work_unit.c 8
-rw-r--r-- src/os_posix/os_map.c 2
-rw-r--r-- src/reconcile/rec_write.c 4
-rw-r--r-- src/session/session_api.c 9
-rw-r--r-- src/session/session_compact.c 22
-rw-r--r-- src/support/stat.c 7
-rw-r--r-- src/txn/txn.c 41
-rw-r--r-- src/txn/txn_ckpt.c 13
-rw-r--r-- src/txn/txn_rollback_to_stable.c 6
-rw-r--r-- src/txn/txn_timestamp.c 55
34 files changed, 783 insertions, 242 deletions
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index e6e8cce02e2..fc4afc7f9b1 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -116,7 +116,7 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t btree_id)
cursor, btree_id, ref->page_las->las_pageid);
for (; ret == 0; ret = cursor->next(cursor)) {
WT_ERR(cursor->get_key(cursor,
- &las_id, &las_pageid, &las_counter, &las_key));
+ &las_pageid, &las_id, &las_counter, &las_key));
/*
* Confirm the search using the unique prefix; if not a match,
@@ -314,6 +314,11 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
/*
* Attempt to set the state to WT_REF_READING for normal reads, or
* WT_REF_LOCKED, for deleted pages or pages with lookaside entries.
+ * The difference is that checkpoints can skip over clean pages that
+ * are being read into cache, but need to wait for deletes or lookaside
+ * updates to be resolved (in order for checkpoint to write the correct
+ * version of the page).
+ *
* If successful, we've won the race, read the page.
*/
switch (previous_state = ref->state) {
@@ -368,8 +373,7 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
*/
page_flags =
WT_DATA_IN_ITEM(&tmp) ? WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED;
- if (LF_ISSET(WT_READ_NO_EVICT) ||
- F_ISSET(session, WT_SESSION_NO_EVICTION))
+ if (LF_ISSET(WT_READ_IGNORE_CACHE_SIZE))
FLD_SET(page_flags, WT_PAGE_READ_NO_EVICT);
WT_ERR(__wt_page_inmem(session, ref, tmp.data, page_flags, &page));
tmp.mem = NULL;
@@ -518,6 +522,9 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
btree = S2BT(session);
+ if (F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE))
+ LF_SET(WT_READ_IGNORE_CACHE_SIZE);
+
/*
* Ignore reads of pages already known to be in cache, otherwise the
* eviction server can dominate these statistics.
@@ -554,7 +561,7 @@ read: /*
* allowed to do eviction work, check for space in the
* cache.
*/
- if (!LF_ISSET(WT_READ_NO_EVICT))
+ if (!LF_ISSET(WT_READ_IGNORE_CACHE_SIZE))
WT_RET(__wt_cache_eviction_check(
session, 1, NULL));
WT_RET(__page_read(session, ref, flags));
@@ -574,7 +581,7 @@ read: /*
* we "acquire" it.
*/
wont_need = LF_ISSET(WT_READ_WONT_NEED) ||
- F_ISSET(session, WT_SESSION_NO_CACHE);
+ F_ISSET(session, WT_SESSION_READ_WONT_NEED);
continue;
case WT_REF_READING:
if (LF_ISSET(WT_READ_CACHE))
@@ -623,17 +630,22 @@ read: /*
}
/*
- * If eviction is configured for this file, check to see
- * if the page qualifies for forced eviction and update
- * the page's generation number. If eviction isn't being
- * done on this file, we're done.
+ * Check if the page requires forced eviction.
*/
- if (did_read || LF_ISSET(WT_READ_NO_EVICT) ||
- F_ISSET(session, WT_SESSION_NO_EVICTION) ||
+ if (did_read || LF_ISSET(WT_READ_NO_SPLIT) ||
btree->evict_disabled > 0 || btree->lsm_primary)
goto skip_evict;
/*
+ * If reconciliation is disabled (e.g., when inserting
+ * into the lookaside table), skip forced eviction if
+ * the page can't split.
+ */
+ if (F_ISSET(session, WT_SESSION_NO_RECONCILE) &&
+ !__wt_leaf_page_can_split(session, ref->page))
+ goto skip_evict;
+
+ /*
* Forcibly evict pages that are too big.
*/
if (force_attempts < 10 &&
@@ -684,9 +696,19 @@ skip_evict: /*
* Check if we need an autocommit transaction.
* Starting a transaction can trigger eviction, so skip
* it if eviction isn't permitted.
+ *
+ * The logic here is a little weird: some code paths do
+ * a blanket ban on checking the cache size in
+ * sessions, but still require a transaction (e.g.,
+ * when updating metadata or lookaside). If
+ * WT_READ_IGNORE_CACHE_SIZE was passed in explicitly,
+ * we're done. If we set WT_READ_IGNORE_CACHE_SIZE
+ * because it was set in the session then make sure we
+ * start a transaction.
*/
- return (LF_ISSET(WT_READ_NO_EVICT) ? 0 :
- __wt_txn_autocommit_check(session));
+ return (LF_ISSET(WT_READ_IGNORE_CACHE_SIZE) &&
+ !F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE) ?
+ 0 : __wt_txn_autocommit_check(session));
WT_ILLEGAL_VALUE(session);
}
@@ -707,7 +729,7 @@ skip_evict: /*
* check if the cache needs help. If we do work for the cache,
* substitute that for a sleep.
*/
- if (!LF_ISSET(WT_READ_NO_EVICT)) {
+ if (!LF_ISSET(WT_READ_IGNORE_CACHE_SIZE)) {
WT_RET(
__wt_cache_eviction_check(session, 1, &cache_work));
if (cache_work)
@@ -728,16 +750,16 @@ __btree_verbose_lookaside_read(
WT_SESSION_IMPL *session, uint32_t las_id, uint64_t las_pageid)
{
#ifdef HAVE_VERBOSE
- WT_CONNECTION_IMPL *conn;
+ WT_CACHE *cache;
uint64_t ckpt_gen_current, ckpt_gen_last;
if (!WT_VERBOSE_ISSET(session,
WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY))
return;
- conn = S2C(session);
+ cache = S2C(session)->cache;
ckpt_gen_current = __wt_gen(session, WT_GEN_CHECKPOINT);
- ckpt_gen_last = conn->las_verb_gen_read;
+ ckpt_gen_last = cache->las_verb_gen_read;
/*
* This message is throttled to one per checkpoint. To do this we
@@ -751,7 +773,7 @@ __btree_verbose_lookaside_read(
* for which this message was printed. If the atomic swap fails
* we have raced and the winning thread will print the message.
*/
- if (__wt_atomic_casv64(&conn->las_verb_gen_read,
+ if (__wt_atomic_casv64(&cache->las_verb_gen_read,
ckpt_gen_last, ckpt_gen_current)) {
__wt_verbose(session,
WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY,
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 91b53dcba96..021788919d0 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -141,6 +141,9 @@ __split_verify_root(WT_SESSION_IMPL *session, WT_PAGE *page)
{
WT_DECL_RET;
WT_REF *ref;
+ uint32_t read_flags;
+
+ read_flags = WT_READ_CACHE | WT_READ_NO_EVICT;
/* The split is complete and live, verify all of the pages involved. */
__split_verify_intl_key_order(session, page);
@@ -156,14 +159,14 @@ __split_verify_root(WT_SESSION_IMPL *session, WT_PAGE *page)
* Ignore pages not in-memory (deleted, on-disk, being read),
* there's no in-memory structure to check.
*/
- if ((ret = __wt_page_in(session,
- ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND)
+ if ((ret =
+ __wt_page_in(session, ref, read_flags)) == WT_NOTFOUND)
continue;
WT_ERR(ret);
__split_verify_intl_key_order(session, ref->page);
- WT_ERR(__wt_page_release(session, ref, WT_READ_NO_EVICT));
+ WT_ERR(__wt_page_release(session, ref, read_flags));
} WT_INTL_FOREACH_END;
return (0);
@@ -1648,6 +1651,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
WT_RET(__wt_calloc_one(session, &ref->page_las));
*ref->page_las = multi->page_las;
+ WT_ASSERT(session, ref->page_las->las_max_txn != WT_TXN_NONE);
ref->state = WT_REF_LOOKASIDE;
}
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index d15852af935..2338d5be8ed 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -58,6 +58,7 @@ __sync_checkpoint_can_skip(WT_SESSION_IMPL *session, WT_PAGE *page)
i = 0; i < mod->mod_multi_entries; ++multi, ++i)
if (multi->addr.addr == NULL)
return (false);
+
return (true);
}
diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c
index f883acef4d5..deed37517bb 100644
--- a/src/cache/cache_las.c
+++ b/src/cache/cache_las.c
@@ -9,18 +9,44 @@
#include "wt_internal.h"
/*
+ * When an operation is accessing the lookaside table, it should ignore the
+ * cache size (since the cache is already full), any pages it reads should be
+ * evicted before application data, and the operation can't reenter
+ * reconciliation.
+ */
+#define WT_LAS_SESSION_FLAGS \
+ (WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_READ_WONT_NEED | \
+ WT_SESSION_NO_RECONCILE)
+
+/*
+ * __wt_las_nonempty --
+ * Return when there are entries in the lookaside table.
+ */
+bool
+__wt_las_nonempty(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+
+ cache = S2C(session)->cache;
+
+ return (cache->las_entry_count > 0);
+}
+
+/*
* __wt_las_stats_update --
* Update the lookaside table statistics for return to the application.
*/
void
__wt_las_stats_update(WT_SESSION_IMPL *session)
{
+ WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_CONNECTION_STATS **cstats;
WT_DSRC_STATS **dstats;
int64_t v;
conn = S2C(session);
+ cache = conn->cache;
/*
* Lookaside table statistics are copied from the underlying lookaside
@@ -36,7 +62,7 @@ __wt_las_stats_update(WT_SESSION_IMPL *session)
*/
cstats = conn->stats;
dstats = ((WT_CURSOR_BTREE *)
- conn->las_session->las_cursor)->btree->dhandle->stats;
+ cache->las_session[0]->las_cursor)->btree->dhandle->stats;
v = WT_STAT_READ(dstats, cursor_insert);
WT_STAT_SET(session, cstats, cache_lookaside_insert, v);
@@ -62,13 +88,15 @@ __wt_las_stats_update(WT_SESSION_IMPL *session)
int
__wt_las_create(WT_SESSION_IMPL *session)
{
+ WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
- uint32_t session_flags;
+ int i;
const char *drop_cfg[] = {
WT_CONFIG_BASE(session, WT_SESSION_drop), "force=true", NULL };
conn = S2C(session);
+ cache = conn->cache;
/* Read-only and in-memory configurations don't need the LAS table. */
if (F_ISSET(conn, WT_CONN_IN_MEMORY | WT_CONN_READONLY))
@@ -86,16 +114,17 @@ __wt_las_create(WT_SESSION_IMPL *session)
WT_RET(ret);
/* Re-create the table. */
- WT_RET(__wt_session_create(session, WT_LAS_URI, WT_LAS_FORMAT));
+ WT_RET(__wt_session_create(session, WT_LAS_URI, WT_LAS_CONFIG));
/*
* Open a shared internal session and cursor used for the lookaside
- * table. This session should never be tapped for eviction.
+ * table. This session should never perform reconciliation.
*/
- session_flags = WT_SESSION_NO_EVICTION;
- WT_RET(__wt_open_internal_session(
- conn, "lookaside table", true, session_flags, &conn->las_session));
- WT_RET(__wt_las_cursor_open(conn->las_session));
+ for (i = 0; i < WT_LAS_NUM_SESSIONS; i++) {
+ WT_RET(__wt_open_internal_session(conn, "lookaside table",
+ true, WT_LAS_SESSION_FLAGS, &cache->las_session[i]));
+ WT_RET(__wt_las_cursor_open(cache->las_session[i]));
+ }
/* The statistics server is already running, make sure we don't race. */
WT_WRITE_BARRIER();
@@ -111,20 +140,31 @@ __wt_las_create(WT_SESSION_IMPL *session)
int
__wt_las_destroy(WT_SESSION_IMPL *session)
{
+ WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_SESSION *wt_session;
+ int i;
conn = S2C(session);
+ cache = conn->cache;
F_CLR(conn, WT_CONN_LOOKASIDE_OPEN);
- if (conn->las_session == NULL)
+ if (cache == NULL)
return (0);
- wt_session = &conn->las_session->iface;
- ret = wt_session->close(wt_session, NULL);
+ for (i = 0; i < WT_LAS_NUM_SESSIONS; i++) {
+ if (cache->las_session[i] == NULL)
+ continue;
- conn->las_session = NULL;
+ wt_session = &cache->las_session[i]->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ cache->las_session[i] = NULL;
+ }
+
+ __wt_buf_free(session, &cache->las_sweep_key);
+ __wt_free(session, cache->las_dropped);
+ __wt_free(session, cache->las_sweep_dropmap);
return (ret);
}
@@ -154,8 +194,8 @@ __wt_las_cursor_open(WT_SESSION_IMPL *session)
btree = ((WT_CURSOR_BTREE *)cursor)->btree;
/* Track the lookaside file ID. */
- if (S2C(session)->las_fileid == 0)
- S2C(session)->las_fileid = btree->id;
+ if (S2C(session)->cache->las_fileid == 0)
+ S2C(session)->cache->las_fileid = btree->id;
/*
* Set special flags for the lookaside table: the lookaside flag (used,
@@ -187,7 +227,8 @@ void
__wt_las_cursor(
WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags)
{
- WT_CONNECTION_IMPL *conn;
+ WT_CACHE *cache;
+ int i;
*cursorp = NULL;
@@ -200,10 +241,9 @@ __wt_las_cursor(
* problems and there's no reason to believe lookaside pages will be
* useful more than once.
*/
- *session_flags =
- F_MASK(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
+ *session_flags = F_MASK(session, WT_LAS_SESSION_FLAGS);
- conn = S2C(session);
+ cache = S2C(session)->cache;
/*
* Some threads have their own lookaside table cursors, else lock the
@@ -212,12 +252,30 @@ __wt_las_cursor(
if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR))
*cursorp = session->las_cursor;
else {
- __wt_spin_lock(session, &conn->las_lock);
- *cursorp = conn->las_session->las_cursor;
+ for (;;) {
+ __wt_spin_lock(session, &cache->las_lock);
+ for (i = 0; i < WT_LAS_NUM_SESSIONS; i++) {
+ if (!cache->las_session_inuse[i]) {
+ *cursorp =
+ cache->las_session[i]->las_cursor;
+ cache->las_session_inuse[i] = true;
+ break;
+ }
+ }
+ __wt_spin_unlock(session, &cache->las_lock);
+ if (*cursorp != NULL)
+ break;
+ /*
+ * If all the lookaside sessions are busy, stall.
+ *
+ * XXX better as a condition variable.
+ */
+ __wt_sleep(0, 1000);
+ }
}
- /* Turn caching and eviction off. */
- F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
+ /* Configure session to access the lookaside table. */
+ F_SET(session, WT_LAS_SESSION_FLAGS);
}
/*
@@ -226,13 +284,14 @@ __wt_las_cursor(
*/
int
__wt_las_cursor_close(
- WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags)
+ WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags)
{
- WT_CONNECTION_IMPL *conn;
+ WT_CACHE *cache;
WT_CURSOR *cursor;
WT_DECL_RET;
+ int i;
- conn = S2C(session);
+ cache = S2C(session)->cache;
if ((cursor = *cursorp) == NULL)
return (0);
@@ -245,15 +304,23 @@ __wt_las_cursor_close(
* We turned off caching and eviction while the lookaside cursor was in
* use, restore the session's flags.
*/
- F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
+ F_CLR(session, WT_LAS_SESSION_FLAGS);
F_SET(session, session_flags);
/*
* Some threads have their own lookaside table cursors, else unlock the
* shared lookaside cursor.
*/
- if (!F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR))
- __wt_spin_unlock(session, &conn->las_lock);
+ if (!F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) {
+ __wt_spin_lock(session, &cache->las_lock);
+ for (i = 0; i < WT_LAS_NUM_SESSIONS; i++)
+ if (cursor->session == &cache->las_session[i]->iface) {
+ cache->las_session_inuse[i] = false;
+ break;
+ }
+ __wt_spin_unlock(session, &cache->las_lock);
+ WT_ASSERT(session, i != WT_LAS_NUM_SESSIONS);
+ }
return (ret);
}
@@ -267,6 +334,7 @@ static int
__las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi)
{
#ifdef HAVE_VERBOSE
+ WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
#ifdef HAVE_TIMESTAMPS
char hex_timestamp[2 * WT_TIMESTAMP_SIZE + 1];
@@ -283,8 +351,9 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi)
return (0);
conn = S2C(session);
+ cache = conn->cache;
ckpt_gen_current = __wt_gen(session, WT_GEN_CHECKPOINT);
- ckpt_gen_last = conn->las_verb_gen_write;
+ ckpt_gen_last = cache->las_verb_gen_write;
/*
* Print a message if verbose lookaside, or once per checkpoint if
@@ -293,7 +362,7 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi)
*/
if (WT_VERBOSE_ISSET(session, WT_VERB_LOOKASIDE) ||
(ckpt_gen_current > ckpt_gen_last &&
- __wt_atomic_casv64(&conn->las_verb_gen_write,
+ __wt_atomic_casv64(&cache->las_verb_gen_write,
ckpt_gen_last, ckpt_gen_current))) {
(void)__wt_eviction_clean_needed(session, &pct_full);
(void)__wt_eviction_dirty_needed(session, &pct_dirty);
@@ -323,7 +392,7 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi)
/* Never skip updating the tracked generation */
if (WT_VERBOSE_ISSET(session, WT_VERB_LOOKASIDE))
- conn->las_verb_gen_write = ckpt_gen_current;
+ cache->las_verb_gen_write = ckpt_gen_current;
#else
WT_UNUSED(session);
WT_UNUSED(multi);
@@ -336,12 +405,14 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi)
* Copy one set of saved updates into the database's lookaside buffer.
*/
int
-__wt_las_insert_block(WT_SESSION_IMPL *session,
- WT_PAGE *page, WT_CURSOR *cursor, WT_MULTI *multi, WT_ITEM *key)
+__wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
+ WT_PAGE *page, WT_MULTI *multi, WT_ITEM *key)
{
+ WT_BTREE *btree;
WT_DECL_RET;
WT_ITEM las_timestamp, las_value;
WT_SAVE_UPD *list;
+ WT_SESSION_IMPL *las_session;
WT_UPDATE *upd;
uint64_t insert_cnt, las_counter, las_pageid;
uint32_t btree_id, i, slot;
@@ -351,15 +422,23 @@ __wt_las_insert_block(WT_SESSION_IMPL *session,
WT_CLEAR(las_value);
insert_cnt = 0;
- btree_id = S2BT(session)->id;
+ btree = S2BT(session);
+ btree_id = btree->id;
las_pageid = multi->page_las.las_pageid =
- __wt_atomic_add64(&S2BT(session)->las_pageid, 1);
+ __wt_atomic_add64(&S2C(session)->cache->las_pageid, 1);
+
+ if (!btree->lookaside_entries)
+ btree->lookaside_entries = true;
+
+ /* Wrap all the updates in a transaction. */
+ las_session = (WT_SESSION_IMPL *)cursor->session;
+ WT_RET(__wt_txn_begin(las_session, NULL));
/*
* Make sure there are no leftover entries (e.g., from a handle
* reopen).
*/
- WT_RET(__wt_las_remove_block(session, cursor, btree_id, las_pageid));
+ WT_ERR(__wt_las_remove_block(session, cursor, btree_id, las_pageid));
/* Enter each update in the boundary's list into the lookaside store. */
for (las_counter = 0, i = 0,
@@ -369,20 +448,20 @@ __wt_las_insert_block(WT_SESSION_IMPL *session,
case WT_PAGE_COL_FIX:
case WT_PAGE_COL_VAR:
p = key->mem;
- WT_RET(
+ WT_ERR(
__wt_vpack_uint(&p, 0, WT_INSERT_RECNO(list->ins)));
key->size = WT_PTRDIFF(p, key->data);
break;
case WT_PAGE_ROW_LEAF:
if (list->ins == NULL)
- WT_RET(__wt_row_leaf_key(
+ WT_ERR(__wt_row_leaf_key(
session, page, list->ripcip, key, false));
else {
key->data = WT_INSERT_KEY(list->ins);
key->size = WT_INSERT_KEY_SIZE(list->ins);
}
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE_ERR(session);
}
/*
@@ -430,7 +509,7 @@ __wt_las_insert_block(WT_SESSION_IMPL *session,
}
cursor->set_key(cursor,
- btree_id, las_pageid, ++las_counter, key);
+ las_pageid, btree_id, ++las_counter, key);
#ifdef HAVE_TIMESTAMPS
las_timestamp.data = &upd->timestamp;
@@ -439,7 +518,7 @@ __wt_las_insert_block(WT_SESSION_IMPL *session,
cursor->set_value(cursor,
upd->txnid, &las_timestamp, upd->type, &las_value);
- WT_RET(cursor->insert(cursor));
+ WT_ERR(cursor->insert(cursor));
++insert_cnt;
} while ((upd = upd->next) != NULL);
}
@@ -447,10 +526,17 @@ __wt_las_insert_block(WT_SESSION_IMPL *session,
if (insert_cnt > 0) {
WT_STAT_CONN_INCRV(
session, cache_lookaside_entries, insert_cnt);
+ __wt_atomic_add64(
+ &S2C(session)->cache->las_entry_count, insert_cnt);
WT_ERR(__las_insert_block_verbose(session, multi));
}
-err: __wt_free(session, multi->supd);
+err: /* Resolve the transaction. */
+ if (ret == 0)
+ ret = __wt_txn_commit(las_session, NULL);
+ else
+ WT_TRET(__wt_txn_rollback(las_session, NULL));
+ __wt_free(session, multi->supd);
multi->supd_entries = 0;
return (ret);
}
@@ -471,6 +557,15 @@ __wt_las_cursor_position(WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid)
int exact;
/*
+ * When scanning for all pages, start at the beginning of the lookaside
+ * table.
+ */
+ if (pageid == 0) {
+ WT_RET(cursor->reset(cursor));
+ return (cursor->next(cursor));
+ }
+
+ /*
* Because of the special visibility rules for lookaside, a new block
* can appear in between our search and the block of interest. Keep
* trying until we find it.
@@ -478,7 +573,7 @@ __wt_las_cursor_position(WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid)
for (;;) {
WT_CLEAR(las_key);
cursor->set_key(cursor,
- btree_id, pageid, (uint64_t)0, &las_key);
+ pageid, btree_id, (uint64_t)0, &las_key);
WT_RET(cursor->search_near(cursor, &exact));
if (exact < 0) {
WT_RET(cursor->next(cursor));
@@ -494,9 +589,9 @@ __wt_las_cursor_position(WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid)
* WT_CONNECTION::rollback_to_stable.
*/
WT_RET(cursor->get_key(cursor,
- &las_id, &las_pageid, &las_counter, &las_key));
- if (las_id < btree_id || (las_id == btree_id &&
- pageid != 0 && las_pageid < pageid))
+ &las_pageid, &las_id, &las_counter, &las_key));
+ if (las_pageid < pageid || (las_pageid == pageid &&
+ las_id < btree_id))
continue;
}
@@ -508,7 +603,7 @@ __wt_las_cursor_position(WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid)
/*
* __wt_las_remove_block --
- * Remove all records matching a key prefix from the lookaside store.
+ * Remove all records for a given page from the lookaside store.
*/
int
__wt_las_remove_block(WT_SESSION_IMPL *session,
@@ -516,18 +611,29 @@ __wt_las_remove_block(WT_SESSION_IMPL *session,
{
WT_DECL_RET;
WT_ITEM las_key;
+ WT_SESSION_IMPL *las_session;
uint64_t las_counter, las_pageid, remove_cnt;
uint32_t las_id, session_flags;
- bool local_cursor;
+ bool local_cursor, local_txn;
remove_cnt = 0;
session_flags = 0; /* [-Wconditional-uninitialized] */
- local_cursor = false;
+ local_cursor = local_txn = false;
if (cursor == NULL) {
__wt_las_cursor(session, &cursor, &session_flags);
local_cursor = true;
}
+ las_session = (WT_SESSION_IMPL *)cursor->session;
+
+ /*
+ * Wrap all of the removes in a transaction, unless this remove is part
+ * of a larger operation.
+ */
+ if (local_cursor) {
+ WT_ERR(__wt_txn_begin(las_session, NULL));
+ local_txn = true;
+ }
/*
* Search for the block's unique prefix and step through all matching
@@ -536,16 +642,13 @@ __wt_las_remove_block(WT_SESSION_IMPL *session,
ret = __wt_las_cursor_position(cursor, btree_id, pageid);
for (; ret == 0; ret = cursor->next(cursor)) {
WT_ERR(cursor->get_key(cursor,
- &las_id, &las_pageid, &las_counter, &las_key));
+ &las_pageid, &las_id, &las_counter, &las_key));
/*
* Confirm the search using the unique prefix; if not a match,
- * we're done searching for records for this page. Note that
- * page ID zero is special: it is a wild card indicating that
- * all pages in the tree should be removed.
+ * we're done searching for records for this page.
*/
- if (las_id != btree_id ||
- (pageid != 0 && las_pageid != pageid))
+ if (las_pageid != pageid || las_id != btree_id)
break;
WT_ERR(cursor->remove(cursor));
@@ -553,9 +656,218 @@ __wt_las_remove_block(WT_SESSION_IMPL *session,
}
WT_ERR_NOTFOUND_OK(ret);
-err: if (local_cursor)
- WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
+err: if (local_txn) {
+ if (ret == 0)
+ ret = __wt_txn_commit(las_session, NULL);
+ else
+ WT_TRET(__wt_txn_rollback(las_session, NULL));
+ }
+ if (local_cursor)
+ WT_TRET(__wt_las_cursor_close(
+ session, &cursor, session_flags));
WT_STAT_CONN_DECRV(session, cache_lookaside_entries, remove_cnt);
+ __wt_cache_decr_check_uint64(session,
+ &S2C(session)->cache->las_entry_count, remove_cnt,
+ "lookaside entry count");
+ return (ret);
+}
+
+/*
+ * __wt_las_save_dropped --
+ * Save a dropped btree ID to be swept from the lookaside table.
+ */
+int
+__wt_las_save_dropped(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_DECL_RET;
+
+ btree = S2BT(session);
+ cache = S2C(session)->cache;
+
+ __wt_spin_lock(session, &cache->las_sweep_lock);
+ WT_ERR(__wt_realloc_def(session, &cache->las_dropped_alloc,
+ cache->las_dropped_next + 1, &cache->las_dropped));
+ cache->las_dropped[cache->las_dropped_next++] = btree->id;
+err: __wt_spin_unlock(session, &cache->las_sweep_lock);
+ return (ret);
+}
+
+/*
+ * __las_sweep_init --
+ * Prepare to start a lookaside sweep.
+ */
+static int
+__las_sweep_init(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+ WT_DECL_RET;
+ u_int i;
+
+ cache = S2C(session)->cache;
+
+ __wt_spin_lock(session, &cache->las_sweep_lock);
+ /* If no files have been dropped, there's nothing to do. */
+ if (cache->las_dropped_next == 0)
+ WT_ERR(WT_NOTFOUND);
+
+ /* Scan the btree IDs to find min/max. */
+ cache->las_sweep_dropmin = UINT32_MAX;
+ cache->las_sweep_dropmax = 0;
+ for (i = 0; i < cache->las_dropped_next; i++) {
+ cache->las_sweep_dropmin = WT_MIN(
+ cache->las_sweep_dropmin,
+ cache->las_dropped[i]);
+ cache->las_sweep_dropmax = WT_MAX(
+ cache->las_sweep_dropmax,
+ cache->las_dropped[i]);
+ }
+
+ /* Initialize the bitmap. */
+ __wt_free(session, cache->las_sweep_dropmap);
+ WT_ERR(__bit_alloc(session,
+ 1 + cache->las_sweep_dropmax - cache->las_sweep_dropmin,
+ &cache->las_sweep_dropmap));
+ for (i = 0; i < cache->las_dropped_next; i++)
+ __bit_set(cache->las_sweep_dropmap,
+ cache->las_dropped[i] - cache->las_sweep_dropmin);
+
+ /* Clear the list of btree IDs. */
+ cache->las_dropped_next = 0;
+
+err: __wt_spin_unlock(session, &cache->las_sweep_lock);
+ return (ret);
+}
+
+/*
+ * __wt_las_sweep --
+ * Sweep the lookaside table.
+ */
+int
+__wt_las_sweep(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_ITEM *key, las_key;
+ uint64_t cnt, las_counter, las_pageid, remove_cnt;
+ uint32_t las_id, session_flags;
+ int notused;
+
+ cache = S2C(session)->cache;
+ cursor = NULL;
+ key = &cache->las_sweep_key;
+ remove_cnt = 0;
+ session_flags = 0; /* [-Werror=maybe-uninitialized] */
+
+ __wt_las_cursor(session, &cursor, &session_flags);
+
+ /* We should have our own session. */
+ WT_ASSERT(session, cursor->session == &session->iface);
+
+ /*
+ * When continuing a sweep, position the cursor using the key from the
+ * last call (we don't care if we're before or after the key, either
+ * side is fine).
+ *
+ * Otherwise, we're starting a new sweep, gather the list of trees to
+ * sweep.
+ */
+ if (key->size != 0) {
+ __wt_cursor_set_raw_key(cursor, key);
+ ret = cursor->search_near(cursor, &notused);
+
+ /*
+ * Don't search for the same key twice; if we don't set a new
+ * key below, it's because we've reached the end of the table
+ * and we want the next pass to start at the beginning of the
+ * table. Searching for the same key could leave us stuck at
+ * the end of the table, repeatedly checking the same rows.
+ */
+ key->size = 0;
+ } else
+ ret = __las_sweep_init(session);
+
+ if (ret != 0)
+ goto srch_notfound;
+
+ /*
+ * The sweep server wakes up every 10 seconds (by default), it's a slow
+ * moving thread. Try to review the entire lookaside table once every 5
+ * minutes, or every 30 calls.
+ *
+ * The reason is because the lookaside table exists because we're seeing
+ * cache/eviction pressure (it allows us to trade performance and disk
+ * space for cache space), and it's likely lookaside blocks are being
+ * evicted, and reading them back in doesn't help things. A trickier,
+ * but possibly better, alternative might be to review all lookaside
+ * blocks in the cache in order to get rid of them, and slowly review
+ * lookaside blocks that have already been evicted.
+ */
+ cnt = (uint64_t)WT_MAX(100, cache->las_entry_count / 30);
+
+ /* Walk the file. */
+ for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) {
+ /*
+ * Give up if the cache is stuck: we are ignoring the cache
+ * size while scanning the lookaside table, so we're making
+ * things worse.
+ */
+ if (__wt_cache_stuck(session))
+ cnt = 1;
+
+ /*
+ * If the loop terminates after completing a work unit, we will
+ * continue the table sweep next time. Get a local copy of the
+ * sweep key, we're going to reset the cursor; do so before
+ * calling cursor.remove, cursor.remove can discard our hazard
+ * pointer and the page could be evicted from underneath us.
+ */
+ if (cnt == 1) {
+ WT_ERR(__wt_cursor_get_raw_key(cursor, key));
+ if (!WT_DATA_IN_ITEM(key))
+ WT_ERR(__wt_buf_set(
+ session, key, key->data, key->size));
+ }
+
+ WT_ERR(cursor->get_key(cursor,
+ &las_pageid, &las_id, &las_counter, &las_key));
+
+ /*
+ * If the entry belongs to a dropped tree, discard it.
+ *
+ * Cursor opened overwrite=true: won't return WT_NOTFOUND
+ * should another thread remove the record before we do (not
+ * expected for dropped trees), and the cursor remains
+ * positioned in that case.
+ *
+ * TODO it would also be good to remove entries in lookaside
+ * from live files that have aged out. If we track for each
+ * entry whether it was the on-page value chosen by
+ * reconciliation, we can safely remove entries from that point
+ * on (for the given key) that are visible to all readers.
+ */
+ if (__bit_test(cache->las_sweep_dropmap,
+ las_id - cache->las_sweep_dropmin)) {
+ WT_ERR(cursor->remove(cursor));
+ ++remove_cnt;
+ }
+ }
+
+srch_notfound:
+ WT_ERR_NOTFOUND_OK(ret);
+
+ if (0) {
+err: __wt_buf_free(session, key);
+ }
+
+ WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
+
+ __wt_cache_decr_check_uint64(session,
+ &S2C(session)->cache->las_entry_count, remove_cnt,
+ "lookaside entry count");
+
return (ret);
}
diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c
index 007aa8757da..76106b3592f 100644
--- a/src/conn/conn_cache.c
+++ b/src/conn/conn_cache.c
@@ -198,6 +198,10 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET_MSG(NULL, ret,
"Failed to create session for eviction walks");
+ WT_RET(__wt_spin_init(session, &cache->las_lock, "lookaside table"));
+ WT_RET(__wt_spin_init(
+ session, &cache->las_sweep_lock, "lookaside sweep"));
+
/* Allocate the LRU eviction queue. */
cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
for (i = 0; i < WT_EVICT_QUEUE_MAX; ++i) {
@@ -334,6 +338,8 @@ __wt_cache_destroy(WT_SESSION_IMPL *session)
__wt_spin_destroy(session, &cache->evict_pass_lock);
__wt_spin_destroy(session, &cache->evict_queue_lock);
__wt_spin_destroy(session, &cache->evict_walk_lock);
+ __wt_spin_destroy(session, &cache->las_lock);
+ __wt_spin_destroy(session, &cache->las_sweep_lock);
wt_session = &cache->walk_session->iface;
if (wt_session != NULL)
WT_TRET(wt_session->close(wt_session, NULL));
diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c
index 2f3f9488b58..42ae866b329 100644
--- a/src/conn/conn_handle.c
+++ b/src/conn/conn_handle.c
@@ -55,7 +55,6 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
WT_SPIN_INIT_TRACKED(session, &conn->checkpoint_lock, checkpoint);
WT_RET(__wt_spin_init(session, &conn->encryptor_lock, "encryptor"));
WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list"));
- WT_RET(__wt_spin_init(session, &conn->las_lock, "lookaside table"));
WT_SPIN_INIT_TRACKED(session, &conn->metadata_lock, metadata);
WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure"));
WT_SPIN_INIT_TRACKED(session, &conn->schema_lock, schema);
@@ -125,7 +124,6 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn)
__wt_spin_destroy(session, &conn->encryptor_lock);
__wt_spin_destroy(session, &conn->fh_lock);
__wt_rwlock_destroy(session, &conn->hot_backup_lock);
- __wt_spin_destroy(session, &conn->las_lock);
__wt_spin_destroy(session, &conn->metadata_lock);
__wt_spin_destroy(session, &conn->reconfig_lock);
__wt_spin_destroy(session, &conn->schema_lock);
diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c
index 9b64c7a0f77..06e441a3037 100644
--- a/src/conn/conn_sweep.c
+++ b/src/conn/conn_sweep.c
@@ -278,10 +278,12 @@ __sweep_server(void *arg)
WT_DECL_RET;
WT_SESSION_IMPL *session;
time_t now;
+ uint64_t last_las_sweep_id, oldest_id;
u_int dead_handles;
session = arg;
conn = S2C(session);
+ last_las_sweep_id = WT_TXN_NONE;
/*
* Sweep for dead and excess handles.
@@ -300,6 +302,26 @@ __sweep_server(void *arg)
WT_STAT_CONN_INCR(session, dh_sweeps);
/*
+ * Sweep the lookaside table. If the lookaside table hasn't yet
+ * been written, there's no work to do.
+ *
+ * Don't sweep the lookaside table if the cache is stuck full.
+ * The sweep uses the cache and can exacerbate the problem.
+ * If we try to sweep when the cache is full or we aren't
+ * making progress in eviction, sweeping can wind up constantly
+ * bringing in and evicting pages from the lookaside table,
+ * which will stop the cache from moving into the stuck state.
+ */
+ if (__wt_las_nonempty(session) &&
+ !__wt_cache_stuck(session)) {
+ oldest_id = __wt_txn_oldest_id(session);
+ if (WT_TXNID_LT(last_las_sweep_id, oldest_id)) {
+ WT_ERR(__wt_las_sweep(session));
+ last_las_sweep_id = oldest_id;
+ }
+ }
+
+ /*
* Mark handles with a time of death, and report whether any
* handles are marked dead. If sweep_idle_time is 0, handles
* never become idle.
@@ -379,15 +401,21 @@ __wt_sweep_create(WT_SESSION_IMPL *session)
/*
* Handle sweep does enough I/O it may be called upon to perform slow
- * operations for the block manager.
- *
- * Don't tap the sweep thread for eviction.
+ * operations for the block manager. Sweep should not block due to the
+ * cache being full.
*/
- session_flags = WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION;
+ session_flags = WT_SESSION_CAN_WAIT | WT_SESSION_IGNORE_CACHE_SIZE;
WT_RET(__wt_open_internal_session(
conn, "sweep-server", true, session_flags, &conn->sweep_session));
session = conn->sweep_session;
+ /*
+ * Sweep should have its own lookaside cursor to avoid blocking reads
+ * and eviction when processing drops.
+ */
+ if (F_ISSET(conn, WT_CONN_LOOKASIDE_OPEN))
+ WT_RET(__wt_las_cursor_open(session));
+
WT_RET(__wt_cond_alloc(
session, "handle sweep server", &conn->sweep_cond));
diff --git a/src/cursor/cur_join.c b/src/cursor/cur_join.c
index e1fbb63178f..bcd3943122d 100644
--- a/src/cursor/cur_join.c
+++ b/src/cursor/cur_join.c
@@ -532,7 +532,8 @@ typedef struct {
* Handle a key produced by a custom extractor.
*/
static int
-__curjoin_extract_insert(WT_CURSOR *cursor) {
+__curjoin_extract_insert(WT_CURSOR *cursor)
+{
WT_CURJOIN_EXTRACTOR *cextract;
WT_DECL_RET;
WT_ITEM ikey;
diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c
index 48db980efff..429f75208f2 100644
--- a/src/cursor/cur_table.c
+++ b/src/cursor/cur_table.c
@@ -33,7 +33,8 @@ typedef struct {
* Handle a key produced by a custom extractor.
*/
static int
-__curextract_insert(WT_CURSOR *cursor) {
+__curextract_insert(WT_CURSOR *cursor)
+{
WT_CURSOR_EXTRACTOR *cextract;
WT_ITEM *key, ikey, pkey;
WT_SESSION_IMPL *session;
@@ -135,12 +136,13 @@ __wt_apply_single_idx(WT_SESSION_IMPL *session, WT_INDEX *idx,
* Apply an operation to all indices of a table.
*/
static int
-__apply_idx(WT_CURSOR_TABLE *ctable, size_t func_off, bool skip_immutable) {
+__apply_idx(WT_CURSOR_TABLE *ctable, size_t func_off, bool skip_immutable)
+{
WT_CURSOR **cp;
WT_INDEX *idx;
WT_SESSION_IMPL *session;
- int (*f)(WT_CURSOR *);
u_int i;
+ int (*f)(WT_CURSOR *);
cp = ctable->idx_cursors;
session = (WT_SESSION_IMPL *)ctable->iface.session;
diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c
index 147b615c0ab..13e2823d234 100644
--- a/src/evict/evict_file.c
+++ b/src/evict/evict_file.c
@@ -54,10 +54,11 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
*/
if (F_ISSET(dhandle, WT_DHANDLE_DEAD) &&
F_ISSET(S2C(session), WT_CONN_LOOKASIDE_OPEN) &&
- !F_ISSET(btree, WT_BTREE_LOOKASIDE)) {
- WT_ASSERT(session, !WT_IS_METADATA(dhandle));
+ btree->lookaside_entries) {
+ WT_ASSERT(session, !WT_IS_METADATA(dhandle) &&
+ !F_ISSET(btree, WT_BTREE_LOOKASIDE));
- WT_RET(__wt_las_remove_block(session, NULL, btree->id, 0));
+ WT_RET(__wt_las_save_dropped(session));
} else
FLD_SET(walk_flags, WT_READ_LOOKASIDE);
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index 02851492039..3af5338d73f 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -75,7 +75,8 @@ __evict_entry_priority(WT_SESSION_IMPL *session, WT_REF *ref)
return (WT_READGEN_OLDEST);
/* Any page from a dead tree is a great choice. */
- if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD))
+ if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD) ||
+ F_ISSET(btree, WT_BTREE_LOOKASIDE))
return (WT_READGEN_OLDEST);
/* Any empty page (leaf or internal), is a good choice. */
@@ -606,6 +607,21 @@ __evict_update_work(WT_SESSION_IMPL *session)
F_SET(cache, WT_CACHE_EVICT_SCRUB);
/*
+ * Try lookaside evict when:
+ * (1) the cache is stuck; OR
+ * (2) the lookaside score goes over 80; AND
+ * (3) the cache is more than half way from the dirty target to the
+ * dirty trigger (that is: (1) alone, or (2) and (3) together).
+ */
+ if (!F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE) &&
+ (__wt_cache_stuck(session) ||
+ (__wt_cache_lookaside_score(cache) > 80 &&
+ dirty_inuse > (uint64_t)
+ ((cache->eviction_dirty_target + cache->eviction_dirty_trigger) *
+ bytes_max) / 200)))
+ F_SET(cache, WT_CACHE_EVICT_LOOKASIDE);
+
+ /*
* With an in-memory cache, we only do dirty eviction in order to scrub
* pages.
*/
@@ -1632,6 +1648,28 @@ __evict_walk_file(WT_SESSION_IMPL *session,
QUEUE_FILLS_PER_PASS;
/*
+ * If the tree is dead, fill the remaining slots (the general cap at the
+ * remaining slots is applied separately, further below).
+ */
+ if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
+ target_pages = remaining_slots;
+
+ /*
+ * Lookaside pages don't count toward the cache's dirty limit.
+ *
+ * Preferentially evict lookaside pages unless applications are stalled
+ * on the dirty limit. Once application threads are stalled by the
+ * dirty limit, don't take any lookaside pages unless we're also up
+ * against the total cache size limit.
+ */
+ if (F_ISSET(btree, WT_BTREE_LOOKASIDE)) {
+ if (!F_ISSET(cache, WT_CACHE_EVICT_DIRTY_HARD))
+ target_pages = remaining_slots;
+ else if (!F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD))
+ target_pages = 0;
+ }
+
+ /*
* Walk trees with a small fraction of the cache in case there are so
* many trees that none of them use enough of the cache to be allocated
* slots. Only skip a tree if it has no bytes of interest.
@@ -1652,12 +1690,7 @@ __evict_walk_file(WT_SESSION_IMPL *session,
if (target_pages < MIN_PAGES_PER_TREE)
target_pages = MIN_PAGES_PER_TREE;
- /*
- * If the tree is dead or we're near the end of the queue, fill the
- * remaining slots.
- */
- if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
- target_pages > remaining_slots)
+ if (target_pages > remaining_slots)
target_pages = remaining_slots;
/*
@@ -1993,8 +2026,8 @@ fast: /* If the page can't be evicted, give up. */
if (restarts == 0)
WT_STAT_CONN_INCR(
session, cache_eviction_walks_abandoned);
- WT_RET(__wt_page_release(cache->walk_session,
- ref, WT_READ_NO_EVICT));
+ WT_RET(__wt_page_release(
+ cache->walk_session, ref, walk_flags));
ref = NULL;
} else if (WT_READGEN_EVICT_SOON(ref->page->read_gen))
WT_RET_NOTFOUND_OK(__wt_tree_walk_count(
@@ -2315,8 +2348,9 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full)
/* See if eviction is still needed. */
if (!__wt_eviction_needed(session, busy, &pct_full) ||
- (pct_full < 100 && cache->eviction_progress >
- initial_progress + max_progress))
+ ((pct_full < 100 || cache->eviction_scrub_limit > 0.0) &&
+ (cache->eviction_progress >
+ initial_progress + max_progress)))
break;
/*
diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c
index 103c93a075b..65009dc3449 100644
--- a/src/evict/evict_page.c
+++ b/src/evict/evict_page.c
@@ -522,6 +522,13 @@ __evict_review(
return (0);
/*
+ * If reconciliation is disabled for this thread (e.g., during an
+ * eviction that writes to lookaside), give up.
+ */
+ if (F_ISSET(session, WT_SESSION_NO_RECONCILE))
+ return (EBUSY);
+
+ /*
* If the page is dirty, reconcile it to decide if we can evict it.
*
* If we have an exclusive lock (we're discarding the tree), assert
@@ -575,9 +582,7 @@ __evict_review(
* that can't be evicted, check if reconciliation
* suggests trying the lookaside table.
*/
- if (!F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE) &&
- (__wt_cache_lookaside_score(cache) > 50 ||
- __wt_cache_stuck(session)))
+ if (F_ISSET(cache, WT_CACHE_EVICT_LOOKASIDE))
lookaside_retryp = &lookaside_retry;
}
}
diff --git a/src/include/btmem.h b/src/include/btmem.h
index c3646a2ae59..abb7cc19972 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -167,11 +167,12 @@ struct __wt_ovfl_reuse {
* are written into a lookaside table, and restored as necessary if the page is
* read.
*
- * The key is a unique marker for the page (a file ID plus a page ID), a
- * counter (used to ensure the update records remain in the original order),
- * and the record's key (byte-string for row-store, record number for
- * column-store). The value is the WT_UPDATE structure's transaction ID,
- * timestamp, update type and value.
+ * The key is a unique marker for the page (a page ID plus a file ID, ordered
+ * this way so that overall the lookaside table is append-mostly), a counter
+ * (used to ensure the update records remain in the original order), and the
+ * record's key (byte-string for row-store, record number for column-store).
+ * The value is the WT_UPDATE structure's transaction ID, timestamp, update
+ * type and value.
*
* As the key for the lookaside table is different for row- and column-store, we
* store both key types in a WT_ITEM, building/parsing them in the code, because
@@ -181,8 +182,8 @@ struct __wt_ovfl_reuse {
* makes the lookaside table's value more likely to overflow the page size when
* the row-store key is relatively large.
*/
-#define WT_LAS_FORMAT \
- "key_format=" WT_UNCHECKED_STRING(IQQu) \
+#define WT_LAS_CONFIG \
+ "key_format=" WT_UNCHECKED_STRING(QIQu) \
",value_format=" WT_UNCHECKED_STRING(QuBu)
/*
diff --git a/src/include/btree.h b/src/include/btree.h
index 7dc9b4a11a7..8a3273d1b6b 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -134,13 +134,13 @@ struct __wt_btree {
u_int rec_multiblock_max; /* Maximum blocks written for a page */
uint64_t last_recno; /* Column-store last record number */
- uint64_t las_pageid; /* Lookaside table page ID counter */
WT_REF root; /* Root page reference */
bool modified; /* If the tree ever modified */
uint8_t original; /* Newly created: bulk-load possible
(want a bool but needs atomic cas) */
+ bool lookaside_entries; /* Has entries in the lookaside table */
bool lsm_primary; /* Handle is/was the LSM primary */
WT_BM *bm; /* Block manager reference */
diff --git a/src/include/btree.i b/src/include/btree.i
index f2948bfc90f..edc0973ee6f 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -149,7 +149,8 @@ __wt_cache_page_inmem_incr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
if (WT_PAGE_IS_INTERNAL(page)) {
(void)__wt_atomic_add64(&btree->bytes_dirty_intl, size);
(void)__wt_atomic_add64(&cache->bytes_dirty_intl, size);
- } else if (!btree->lsm_primary) {
+ } else if (!btree->lsm_primary &&
+ !F_ISSET(btree, WT_BTREE_LOOKASIDE)) {
(void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size);
(void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size);
}
@@ -189,7 +190,7 @@ __wt_cache_decr_check_size(
*/
static inline void
__wt_cache_decr_check_uint64(
- WT_SESSION_IMPL *session, uint64_t *vp, size_t v, const char *fld)
+ WT_SESSION_IMPL *session, uint64_t *vp, uint64_t v, const char *fld)
{
if (__wt_atomic_sub64(vp, v) < WT_EXABYTE)
return;
@@ -200,7 +201,7 @@ __wt_cache_decr_check_uint64(
*/
*vp = 0;
__wt_errx(session,
- "%s went negative with decrement of %" WT_SIZET_FMT, fld, v);
+ "%s went negative with decrement of %" PRIu64, fld, v);
#ifdef HAVE_DIAGNOSTIC
__wt_abort(session);
@@ -261,7 +262,7 @@ __wt_cache_page_byte_dirty_decr(
decr, "WT_BTREE.bytes_dirty_intl");
__wt_cache_decr_check_uint64(session, &cache->bytes_dirty_intl,
decr, "WT_CACHE.bytes_dirty_intl");
- } else if (!btree->lsm_primary) {
+ } else if (!btree->lsm_primary && !F_ISSET(btree, WT_BTREE_LOOKASIDE)) {
__wt_cache_decr_check_uint64(session, &btree->bytes_dirty_leaf,
decr, "WT_BTREE.bytes_dirty_leaf");
__wt_cache_decr_check_uint64(session, &cache->bytes_dirty_leaf,
@@ -321,7 +322,8 @@ __wt_cache_dirty_incr(WT_SESSION_IMPL *session, WT_PAGE *page)
(void)__wt_atomic_add64(&cache->bytes_dirty_intl, size);
(void)__wt_atomic_add64(&cache->pages_dirty_intl, 1);
} else {
- if (!btree->lsm_primary) {
+ if (!btree->lsm_primary &&
+ !F_ISSET(btree, WT_BTREE_LOOKASIDE)) {
(void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size);
(void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size);
}
@@ -420,7 +422,8 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page, bool rewrite)
__wt_cache_decr_check_uint64(session,
&cache->bytes_dirty_intl,
modify->bytes_dirty, "WT_CACHE.bytes_dirty_intl");
- } else if (!btree->lsm_primary) {
+ } else if (!btree->lsm_primary &&
+ !F_ISSET(btree, WT_BTREE_LOOKASIDE)) {
__wt_cache_decr_check_uint64(session,
&btree->bytes_dirty_leaf,
modify->bytes_dirty, "WT_BTREE.bytes_dirty_leaf");
@@ -1359,6 +1362,7 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
{
WT_BTREE *btree;
WT_PAGE *page;
+ bool inmem_split;
btree = S2BT(session);
@@ -1387,10 +1391,10 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
*/
page = ref->page;
if (!WT_READGEN_EVICT_SOON(page->read_gen) ||
- LF_ISSET(WT_READ_NO_EVICT) ||
- F_ISSET(session, WT_SESSION_NO_EVICTION) ||
+ LF_ISSET(WT_READ_NO_SPLIT) ||
btree->evict_disabled > 0 ||
- !__wt_page_can_evict(session, ref, NULL))
+ !__wt_page_can_evict(session, ref, &inmem_split) ||
+ (F_ISSET(session, WT_SESSION_NO_RECONCILE) && !inmem_split))
return (__wt_hazard_clear(session, ref));
WT_RET_BUSY_OK(__wt_page_release_evict(session, ref));
@@ -1622,6 +1626,6 @@ __wt_ref_state_yield_sleep(uint64_t *yield_count, uint64_t *sleep_count)
return;
}
- (*sleep_count) = WT_MIN((*sleep_count) + WT_THOUSAND, 10 * WT_THOUSAND);
+ (*sleep_count) = WT_MIN((*sleep_count) + 100, WT_THOUSAND);
__wt_sleep(0, (*sleep_count));
}
diff --git a/src/include/cache.h b/src/include/cache.h
index 0a42853b95b..f9ce4316e29 100644
--- a/src/include/cache.h
+++ b/src/include/cache.h
@@ -7,6 +7,12 @@
*/
/*
+ * Helper: in order to read without any calls to eviction, we have to ignore
+ * the cache size and disable splits.
+ */
+#define WT_READ_NO_EVICT (WT_READ_IGNORE_CACHE_SIZE | WT_READ_NO_SPLIT)
+
+/*
* Tuning constants: I hesitate to call this tuning, but we want to review some
* number of pages from each file's in-memory tree for each page we evict.
*/
@@ -176,6 +182,38 @@ struct __wt_cache {
int32_t evict_lookaside_score;
/*
+ * Shared lookaside lock, sessions and cursors, used by threads accessing
+ * the lookaside table (other than eviction server and worker threads
+ * and the sweep thread, all of which have their own lookaside cursors).
+ */
+#define WT_LAS_NUM_SESSIONS 5
+ WT_SPINLOCK las_lock;
+ WT_SESSION_IMPL *las_session[WT_LAS_NUM_SESSIONS];
+ bool las_session_inuse[WT_LAS_NUM_SESSIONS];
+
+ uint32_t las_fileid; /* Lookaside table file ID */
+ uint64_t las_entry_count; /* Count of entries in lookaside */
+ uint64_t las_pageid; /* Lookaside table page ID counter */
+
+ WT_SPINLOCK las_sweep_lock;
+ WT_ITEM las_sweep_key; /* Track sweep position. */
+ uint32_t las_sweep_dropmin; /* Minimum btree ID in current set. */
+ uint8_t *las_sweep_dropmap; /* Bitmap of dropped btree IDs. */
+ uint32_t las_sweep_dropmax; /* Maximum btree ID in current set. */
+
+ uint32_t *las_dropped; /* List of dropped btree IDs. */
+ size_t las_dropped_next; /* Next index into drop list. */
+ size_t las_dropped_alloc; /* Allocated size of drop list. */
+
+ /*
+ * The "lookaside_activity" verbose messages are throttled to once per
+ * checkpoint. To accomplish this we track the checkpoint generation
+ * for the most recent read and write verbose messages.
+ */
+ uint64_t las_verb_gen_read;
+ uint64_t las_verb_gen_write;
+
+ /*
* Cache pool information.
*/
uint64_t cp_pass_pressure; /* Calculated pressure from this pass */
@@ -200,8 +238,9 @@ struct __wt_cache {
#define WT_CACHE_EVICT_CLEAN_HARD 0x002 /* Clean % blocking app threads */
#define WT_CACHE_EVICT_DIRTY 0x004 /* Evict dirty pages */
#define WT_CACHE_EVICT_DIRTY_HARD 0x008 /* Dirty % blocking app threads */
-#define WT_CACHE_EVICT_SCRUB 0x010 /* Scrub dirty pages */
-#define WT_CACHE_EVICT_URGENT 0x020 /* Pages are in the urgent queue */
+#define WT_CACHE_EVICT_LOOKASIDE 0x010 /* Try lookaside eviction */
+#define WT_CACHE_EVICT_SCRUB 0x020 /* Scrub dirty pages */
+#define WT_CACHE_EVICT_URGENT 0x040 /* Pages are in the urgent queue */
#define WT_CACHE_EVICT_ALL (WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_DIRTY)
uint32_t flags;
};
diff --git a/src/include/cache.i b/src/include/cache.i
index e160dbf4d64..c7d802f8a5f 100644
--- a/src/include/cache.i
+++ b/src/include/cache.i
@@ -241,12 +241,12 @@ __wt_session_can_wait(WT_SESSION_IMPL *session)
return (false);
/*
- * LSM sets the no-eviction flag when holding the LSM tree lock, in that
- * case, or when holding the schema lock, we don't want to highjack the
- * thread for eviction.
+ * LSM sets the "ignore cache size" flag when holding the LSM tree
+ * lock, in that case, or when holding the schema lock, we don't want
+ * this thread to block for eviction.
*/
- return (!F_ISSET(
- session, WT_SESSION_NO_EVICTION | WT_SESSION_LOCKED_SCHEMA));
+ return (!F_ISSET(session,
+ WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_LOCKED_SCHEMA));
}
/*
@@ -395,12 +395,12 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool *didworkp)
txn_global->current != txn_global->oldest_id);
/*
- * LSM sets the no-cache-check flag when holding the LSM tree lock, in
- * that case, or when holding the handle list, schema or table locks
- * (which can block checkpoints and eviction), don't block the thread
- * for eviction.
+ * LSM sets the "ignore cache size" flag when holding the LSM tree
+ * lock, in that case, or when holding the handle list, schema or table
+ * locks (which can block checkpoints and eviction), don't block the
+ * thread for eviction.
*/
- if (F_ISSET(session, WT_SESSION_NO_EVICTION |
+ if (F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE |
WT_SESSION_LOCKED_HANDLE_LIST | WT_SESSION_LOCKED_SCHEMA |
WT_SESSION_LOCKED_TABLE))
return (0);
diff --git a/src/include/connection.h b/src/include/connection.h
index c1d1921bdcc..9288618c87e 100644
--- a/src/include/connection.h
+++ b/src/include/connection.h
@@ -358,23 +358,6 @@ struct __wt_connection_impl {
uint64_t sweep_interval; /* Handle sweep interval */
uint64_t sweep_handles_min;/* Handle sweep minimum open */
- /*
- * Shared lookaside lock, session and cursor, used by threads accessing
- * the lookaside table (other than eviction server and worker threads
- * and the sweep thread, all of which have their own lookaside cursors).
- */
- WT_SPINLOCK las_lock; /* Lookaside table spinlock */
- WT_SESSION_IMPL *las_session; /* Lookaside table session */
- uint32_t las_fileid; /* Lookaside table file ID */
-
- /*
- * The "lookaside_activity" verbose messages are throttled to once per
- * checkpoint. To accomplish this we track the checkpoint generation
- * for the most recent read and write verbose messages.
- */
- uint64_t las_verb_gen_read;
- uint64_t las_verb_gen_write;
-
/* Set of btree IDs not being rolled back */
uint8_t *stable_rollback_bitstring;
uint32_t stable_rollback_maxfile;
diff --git a/src/include/extern.h b/src/include/extern.h
index bbe66abf753..17afb48bda6 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -200,15 +200,18 @@ extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE
extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd);
extern int __wt_search_insert(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert, bool restore) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern bool __wt_las_nonempty(WT_SESSION_IMPL *session);
extern void __wt_las_stats_update(WT_SESSION_IMPL *session);
extern int __wt_las_create(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_las_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_las_cursor_open(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_las_cursor( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags);
-extern int __wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_las_insert_block(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CURSOR *cursor, WT_MULTI *multi, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_PAGE *page, WT_MULTI *multi, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_las_cursor_position(WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_las_save_dropped(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_las_sweep(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern uint32_t __wt_checksum_sw(const void *chunk, size_t len) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern void __wt_checksum_init(void);
extern void __wt_config_initn( WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str, size_t len);
diff --git a/src/include/flags.h b/src/include/flags.h
index 8ec3916435a..b191e8fe01d 100644
--- a/src/include/flags.h
+++ b/src/include/flags.h
@@ -32,17 +32,18 @@
#define WT_LOG_FSYNC 0x00000008
#define WT_LOG_SYNC_ENABLED 0x00000010
#define WT_READ_CACHE 0x00000001
-#define WT_READ_LOOKASIDE 0x00000002
-#define WT_READ_NOTFOUND_OK 0x00000004
-#define WT_READ_NO_EMPTY 0x00000008
-#define WT_READ_NO_EVICT 0x00000010
+#define WT_READ_IGNORE_CACHE_SIZE 0x00000002
+#define WT_READ_LOOKASIDE 0x00000004
+#define WT_READ_NOTFOUND_OK 0x00000008
+#define WT_READ_NO_EMPTY 0x00000010
#define WT_READ_NO_GEN 0x00000020
-#define WT_READ_NO_WAIT 0x00000040
-#define WT_READ_PREV 0x00000080
-#define WT_READ_RESTART_OK 0x00000100
-#define WT_READ_SKIP_INTL 0x00000200
-#define WT_READ_TRUNCATE 0x00000400
-#define WT_READ_WONT_NEED 0x00000800
+#define WT_READ_NO_SPLIT 0x00000040
+#define WT_READ_NO_WAIT 0x00000080
+#define WT_READ_PREV 0x00000100
+#define WT_READ_RESTART_OK 0x00000200
+#define WT_READ_SKIP_INTL 0x00000400
+#define WT_READ_TRUNCATE 0x00000800
+#define WT_READ_WONT_NEED 0x00001000
#define WT_REC_CHECKPOINT 0x00000001
#define WT_REC_EVICT 0x00000002
#define WT_REC_IN_MEMORY 0x00000004
@@ -52,26 +53,27 @@
#define WT_REC_VISIBILITY_ERR 0x00000040
#define WT_REC_VISIBLE_ALL 0x00000080
#define WT_SESSION_CAN_WAIT 0x00000001
-#define WT_SESSION_INTERNAL 0x00000002
-#define WT_SESSION_LOCKED_CHECKPOINT 0x00000004
-#define WT_SESSION_LOCKED_HANDLE_LIST_READ 0x00000008
-#define WT_SESSION_LOCKED_HANDLE_LIST_WRITE 0x00000010
-#define WT_SESSION_LOCKED_METADATA 0x00000020
-#define WT_SESSION_LOCKED_PASS 0x00000040
-#define WT_SESSION_LOCKED_SCHEMA 0x00000080
-#define WT_SESSION_LOCKED_SLOT 0x00000100
-#define WT_SESSION_LOCKED_TABLE_READ 0x00000200
-#define WT_SESSION_LOCKED_TABLE_WRITE 0x00000400
-#define WT_SESSION_LOCKED_TURTLE 0x00000800
-#define WT_SESSION_LOGGING_INMEM 0x00001000
-#define WT_SESSION_LOOKASIDE_CURSOR 0x00002000
-#define WT_SESSION_NO_CACHE 0x00004000
+#define WT_SESSION_IGNORE_CACHE_SIZE 0x00000002
+#define WT_SESSION_INTERNAL 0x00000004
+#define WT_SESSION_LOCKED_CHECKPOINT 0x00000008
+#define WT_SESSION_LOCKED_HANDLE_LIST_READ 0x00000010
+#define WT_SESSION_LOCKED_HANDLE_LIST_WRITE 0x00000020
+#define WT_SESSION_LOCKED_METADATA 0x00000040
+#define WT_SESSION_LOCKED_PASS 0x00000080
+#define WT_SESSION_LOCKED_SCHEMA 0x00000100
+#define WT_SESSION_LOCKED_SLOT 0x00000200
+#define WT_SESSION_LOCKED_TABLE_READ 0x00000400
+#define WT_SESSION_LOCKED_TABLE_WRITE 0x00000800
+#define WT_SESSION_LOCKED_TURTLE 0x00001000
+#define WT_SESSION_LOGGING_INMEM 0x00002000
+#define WT_SESSION_LOOKASIDE_CURSOR 0x00004000
#define WT_SESSION_NO_DATA_HANDLES 0x00008000
-#define WT_SESSION_NO_EVICTION 0x00010000
-#define WT_SESSION_NO_LOGGING 0x00020000
+#define WT_SESSION_NO_LOGGING 0x00010000
+#define WT_SESSION_NO_RECONCILE 0x00020000
#define WT_SESSION_NO_SCHEMA_LOCK 0x00040000
#define WT_SESSION_QUIET_CORRUPT_FILE 0x00080000
-#define WT_SESSION_SERVER_ASYNC 0x00100000
+#define WT_SESSION_READ_WONT_NEED 0x00100000
+#define WT_SESSION_SERVER_ASYNC 0x00200000
#define WT_STAT_CLEAR 0x00000001
#define WT_STAT_JSON 0x00000002
#define WT_STAT_ON_CLOSE 0x00000004
diff --git a/src/include/stat.h b/src/include/stat.h
index 12a7d532496..2477079a2a8 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -536,6 +536,8 @@ struct __wt_connection_stats {
int64_t txn_pinned_range;
int64_t txn_pinned_checkpoint_range;
int64_t txn_pinned_snapshot_range;
+ int64_t txn_pinned_timestamp;
+ int64_t txn_pinned_timestamp_oldest;
int64_t txn_sync;
int64_t txn_commit_queue_head;
int64_t txn_commit_queue_inserts;
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index 1d7d36e332d..5d3b0c52cbd 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -5285,26 +5285,33 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
* snapshots
*/
#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1278
+/*! transaction: transaction range of timestamps currently pinned */
+#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1279
+/*!
+ * transaction: transaction range of timestamps pinned by the oldest
+ * timestamp
+ */
+#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1280
/*! transaction: transaction sync calls */
-#define WT_STAT_CONN_TXN_SYNC 1279
+#define WT_STAT_CONN_TXN_SYNC 1281
/*! transaction: transactions commit timestamp queue inserts to head */
-#define WT_STAT_CONN_TXN_COMMIT_QUEUE_HEAD 1280
+#define WT_STAT_CONN_TXN_COMMIT_QUEUE_HEAD 1282
/*! transaction: transactions commit timestamp queue inserts total */
-#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1281
+#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1283
/*! transaction: transactions commit timestamp queue length */
-#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1282
+#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1284
/*! transaction: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1283
+#define WT_STAT_CONN_TXN_COMMIT 1285
/*! transaction: transactions read timestamp queue inserts to head */
-#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1284
+#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1286
/*! transaction: transactions read timestamp queue inserts total */
-#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1285
+#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1287
/*! transaction: transactions read timestamp queue length */
-#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1286
+#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1288
/*! transaction: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1287
+#define WT_STAT_CONN_TXN_ROLLBACK 1289
/*! transaction: update conflicts */
-#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1288
+#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1290
/*!
* @}
diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c
index d159005ee11..7a20686fb97 100644
--- a/src/lsm/lsm_merge.c
+++ b/src/lsm/lsm_merge.c
@@ -446,7 +446,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
}
/* Discard pages we read as soon as we're done with them. */
- F_SET(session, WT_SESSION_NO_CACHE);
+ F_SET(session, WT_SESSION_READ_WONT_NEED);
cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
cfg[1] = "bulk,raw,skip_sort_check";
@@ -498,14 +498,14 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
WT_TRET(dest->close(dest));
src = dest = NULL;
- F_CLR(session, WT_SESSION_NO_CACHE);
+ F_CLR(session, WT_SESSION_READ_WONT_NEED);
/*
* We're doing advisory reads to fault the new trees into cache.
* Don't block if the cache is full: our next unit of work may be to
* discard some trees to free space.
*/
- F_SET(session, WT_SESSION_NO_EVICTION);
+ F_SET(session, WT_SESSION_IGNORE_CACHE_SIZE);
if (create_bloom) {
if (ret == 0)
@@ -626,6 +626,7 @@ err: if (locked)
"Merge failed with %s",
__wt_strerror(session, ret, NULL, 0));
}
- F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
+ F_CLR(session,
+ WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_READ_WONT_NEED);
return (ret);
}
diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c
index 6195726ec67..6927fe909f8 100644
--- a/src/lsm/lsm_tree.c
+++ b/src/lsm/lsm_tree.c
@@ -1068,7 +1068,8 @@ __wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
* Diagnostic: avoid deadlocks with the schema lock: if we need it for
* an operation, we should already have it.
*/
- F_SET(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK);
+ F_SET(session,
+ WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
}
/*
@@ -1078,7 +1079,8 @@ __wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
void
__wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
- F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK);
+ F_CLR(session,
+ WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
__wt_readunlock(session, &lsm_tree->rwlock);
}
@@ -1096,7 +1098,8 @@ __wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
* Diagnostic: avoid deadlocks with the schema lock: if we need it for
* an operation, we should already have it.
*/
- F_SET(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK);
+ F_SET(session,
+ WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
}
/*
@@ -1106,7 +1109,8 @@ __wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
void
__wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
- F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK);
+ F_CLR(session,
+ WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
__wt_writeunlock(session, &lsm_tree->rwlock);
}
diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c
index f6aea02e20d..76827f7888c 100644
--- a/src/lsm/lsm_work_unit.c
+++ b/src/lsm/lsm_work_unit.c
@@ -503,7 +503,8 @@ __lsm_bloom_create(WT_SESSION_IMPL *session,
* ourselves to get stuck creating bloom filters, the entire tree
* can stall since there may be no worker threads available to flush.
*/
- F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
+ F_SET(session,
+ WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_READ_WONT_NEED);
for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
WT_ERR(src->get_key(src, &key));
__wt_bloom_insert(bloom, &key);
@@ -514,7 +515,7 @@ __lsm_bloom_create(WT_SESSION_IMPL *session,
WT_TRET(__wt_bloom_finalize(bloom));
WT_ERR(ret);
- F_CLR(session, WT_SESSION_NO_CACHE);
+ F_CLR(session, WT_SESSION_READ_WONT_NEED);
/* Load the new Bloom filter into cache. */
WT_CLEAR(key);
@@ -537,7 +538,8 @@ __lsm_bloom_create(WT_SESSION_IMPL *session,
err: if (bloom != NULL)
WT_TRET(__wt_bloom_close(bloom));
- F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
+ F_CLR(session,
+ WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_READ_WONT_NEED);
return (ret);
}
diff --git a/src/os_posix/os_map.c b/src/os_posix/os_map.c
index 3d06461a9ba..5e625a49bac 100644
--- a/src/os_posix/os_map.c
+++ b/src/os_posix/os_map.c
@@ -88,7 +88,7 @@ __wt_posix_map_preload(WT_FILE_HANDLE *fh,
length += WT_PTRDIFF(map, blk);
/* XXX proxy for "am I doing a scan?" -- manual read-ahead */
- if (F_ISSET(session, WT_SESSION_NO_CACHE)) {
+ if (F_ISSET(session, WT_SESSION_READ_WONT_NEED)) {
/* Read in 2MB blocks every 1MB of data. */
if (((uintptr_t)((uint8_t *)blk + length) &
(uintptr_t)((1<<20) - 1)) < (uintptr_t)blk)
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index f3d469a95c9..b509c49cbbc 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -1461,6 +1461,8 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
if (uncommitted && !F_ISSET(r, WT_REC_UPDATE_RESTORE))
return (EBUSY);
+ WT_ASSERT(session, r->max_txn != WT_TXN_NONE);
+
/*
* The order of the updates on the list matters, we can't move only the
* unresolved updates, move the entire update list.
@@ -6062,7 +6064,7 @@ __rec_las_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
for (multi = r->multi, i = 0; i < r->multi_next; ++multi, ++i)
if (multi->supd != NULL)
WT_ERR(__wt_las_insert_block(
- session, r->page, cursor, multi, key));
+ session, cursor, r->page, multi, key));
err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
diff --git a/src/session/session_api.c b/src/session/session_api.c
index fa33b55c936..d81735234a0 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -259,9 +259,9 @@ __session_reconfigure(WT_SESSION *wt_session, const char *config)
ret = __wt_config_getones(session, config, "ignore_cache_size", &cval);
if (ret == 0) {
if (cval.val)
- F_SET(session, WT_SESSION_NO_EVICTION);
+ F_SET(session, WT_SESSION_IGNORE_CACHE_SIZE);
else
- F_CLR(session, WT_SESSION_NO_EVICTION);
+ F_CLR(session, WT_SESSION_IGNORE_CACHE_SIZE);
}
WT_ERR_NOTFOUND_OK(ret);
@@ -1489,7 +1489,12 @@ __session_timestamp_transaction(WT_SESSION *wt_session, const char *config)
WT_SESSION_IMPL *session;
session = (WT_SESSION_IMPL *)wt_session;
+#ifdef HAVE_DIAGNOSTIC
SESSION_API_CALL(session, timestamp_transaction, config, cfg);
+#else
+ SESSION_API_CALL(session, timestamp_transaction, NULL, cfg);
+ cfg[1] = config;
+#endif
WT_TRET(__wt_txn_set_timestamp(session, cfg));
err: API_END_RET(session, ret);
}
diff --git a/src/session/session_compact.c b/src/session/session_compact.c
index 6ccf3161229..aa2f1bc3bd8 100644
--- a/src/session/session_compact.c
+++ b/src/session/session_compact.c
@@ -349,23 +349,21 @@ __wt_session_compact(
WT_DECL_RET;
WT_SESSION_IMPL *session;
u_int i;
- bool no_eviction_set;
+ bool ignore_cache_size_set;
- no_eviction_set = false;
+ ignore_cache_size_set = false;
session = (WT_SESSION_IMPL *)wt_session;
SESSION_API_CALL(session, compact, config, cfg);
/*
- * Don't highjack the compaction thread for eviction; it's holding locks
- * blocking checkpoints and once an application is tapped for eviction,
- * it can spend a long time doing nothing else. (And, if we're tapping
- * application threads for eviction, compaction should quit, it's not
- * making anything better.)
+ * The compaction thread should not block when the cache is full: it is
+ * holding locks blocking checkpoints and once the cache is full, it can
+ * spend a long time doing eviction.
*/
- if (!F_ISSET(session, WT_SESSION_NO_EVICTION)) {
- no_eviction_set = true;
- F_SET(session, WT_SESSION_NO_EVICTION);
+ if (!F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE)) {
+ ignore_cache_size_set = true;
+ F_SET(session, WT_SESSION_IGNORE_CACHE_SIZE);
}
/* In-memory ignores compaction operations. */
@@ -437,8 +435,8 @@ err: session->compact = NULL;
*/
WT_TRET(__wt_session_release_resources(session));
- if (no_eviction_set)
- F_CLR(session, WT_SESSION_NO_EVICTION);
+ if (ignore_cache_size_set)
+ F_CLR(session, WT_SESSION_IGNORE_CACHE_SIZE);
if (ret != 0)
WT_STAT_CONN_INCR(session, session_table_compact_fail);
diff --git a/src/support/stat.c b/src/support/stat.c
index 924afaa21d6..b4533841ec6 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -1004,6 +1004,8 @@ static const char * const __stats_connection_desc[] = {
"transaction: transaction range of IDs currently pinned",
"transaction: transaction range of IDs currently pinned by a checkpoint",
"transaction: transaction range of IDs currently pinned by named snapshots",
+ "transaction: transaction range of timestamps currently pinned",
+ "transaction: transaction range of timestamps pinned by the oldest timestamp",
"transaction: transaction sync calls",
"transaction: transactions commit timestamp queue inserts to head",
"transaction: transactions commit timestamp queue inserts total",
@@ -1335,6 +1337,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
/* not clearing txn_pinned_range */
/* not clearing txn_pinned_checkpoint_range */
/* not clearing txn_pinned_snapshot_range */
+ /* not clearing txn_pinned_timestamp */
+ /* not clearing txn_pinned_timestamp_oldest */
stats->txn_sync = 0;
stats->txn_commit_queue_head = 0;
stats->txn_commit_queue_inserts = 0;
@@ -1769,6 +1773,9 @@ __wt_stat_connection_aggregate(
WT_STAT_READ(from, txn_pinned_checkpoint_range);
to->txn_pinned_snapshot_range +=
WT_STAT_READ(from, txn_pinned_snapshot_range);
+ to->txn_pinned_timestamp += WT_STAT_READ(from, txn_pinned_timestamp);
+ to->txn_pinned_timestamp_oldest +=
+ WT_STAT_READ(from, txn_pinned_timestamp_oldest);
to->txn_sync += WT_STAT_READ(from, txn_sync);
to->txn_commit_queue_head +=
WT_STAT_READ(from, txn_commit_queue_head);
diff --git a/src/txn/txn.c b/src/txn/txn.c
index 3d45ff8a88c..8b4a7fc7936 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -612,7 +612,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
WT_TXN_GLOBAL *txn_global;
WT_TXN_OP *op;
u_int i;
- bool did_update, locked;
+ bool locked;
#ifdef HAVE_TIMESTAMPS
wt_timestamp_t prev_commit_timestamp, ts;
bool update_timestamp;
@@ -621,11 +621,11 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
txn = &session->txn;
conn = S2C(session);
txn_global = &conn->txn_global;
- did_update = txn->mod_count != 0;
locked = false;
WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
- WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || !did_update);
+ WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) ||
+ txn->mod_count == 0);
/*
* Look for a commit timestamp.
@@ -716,7 +716,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
}
/* If we are logging, write a commit log record. */
- if (did_update &&
+ if (txn->logrec != NULL &&
FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
!F_ISSET(session, WT_SESSION_NO_LOGGING)) {
/*
@@ -757,8 +757,8 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
* Writes to the lookaside file can be evicted as soon
* as they commit.
*/
- if (conn->las_fileid != 0 &&
- op->fileid == conn->las_fileid) {
+ if (conn->cache->las_fileid != 0 &&
+ op->fileid == conn->cache->las_fileid) {
op->u.upd->txnid = WT_TXN_NONE;
break;
}
@@ -823,6 +823,20 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
* write lock and re-check.
*/
if (update_timestamp) {
+#if WT_TIMESTAMP_SIZE == 8
+ while (__wt_timestamp_cmp(
+ &txn->commit_timestamp, &prev_commit_timestamp) > 0) {
+ if (__wt_atomic_cas64(
+ &txn_global->commit_timestamp.val,
+ prev_commit_timestamp.val,
+ txn->commit_timestamp.val)) {
+ txn_global->has_commit_timestamp = true;
+ break;
+ }
+ __wt_timestamp_set(
+ &prev_commit_timestamp, &txn_global->commit_timestamp);
+ }
+#else
__wt_writelock(session, &txn_global->rwlock);
if (__wt_timestamp_cmp(&txn->commit_timestamp,
&txn_global->commit_timestamp) > 0) {
@@ -831,6 +845,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
txn_global->has_commit_timestamp = true;
}
__wt_writeunlock(session, &txn_global->rwlock);
+#endif
}
#endif
@@ -881,8 +896,9 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
case WT_TXN_OP_BASIC_TS:
case WT_TXN_OP_INMEM:
WT_ASSERT(session, op->u.upd->txnid == txn->id);
- WT_ASSERT(session, S2C(session)->las_fileid == 0 ||
- op->fileid != S2C(session)->las_fileid);
+ WT_ASSERT(session,
+ S2C(session)->cache->las_fileid == 0 ||
+ op->fileid != S2C(session)->cache->las_fileid);
op->u.upd->txnid = WT_TXN_ABORTED;
break;
case WT_TXN_OP_REF:
@@ -962,6 +978,15 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session)
WT_STAT_SET(session, stats, txn_pinned_range,
txn_global->current - txn_global->oldest_id);
+#if WT_TIMESTAMP_SIZE == 8
+ WT_STAT_SET(session, stats, txn_pinned_timestamp,
+ txn_global->commit_timestamp.val -
+ txn_global->pinned_timestamp.val);
+ WT_STAT_SET(session, stats, txn_pinned_timestamp_oldest,
+ txn_global->commit_timestamp.val -
+ txn_global->oldest_timestamp.val);
+#endif
+
WT_STAT_SET(session, stats, txn_pinned_snapshot_range,
snapshot_pinned == WT_TXN_NONE ?
0 : txn_global->current - snapshot_pinned);
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index eb32ef2d06a..c82187daf85 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -122,7 +122,7 @@ __checkpoint_update_generation(WT_SESSION_IMPL *session)
*/
static int
__checkpoint_apply_all(WT_SESSION_IMPL *session, const char *cfg[],
- int (*op)(WT_SESSION_IMPL *, const char *[]), bool *fullp)
+ int (*op)(WT_SESSION_IMPL *, const char *[]), bool *fullp)
{
WT_CONFIG targetconf;
WT_CONFIG_ITEM cval, k, v;
@@ -205,7 +205,7 @@ err: __wt_scr_free(session, &tmp);
*/
static int
__checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[],
- int (*op)(WT_SESSION_IMPL *, const char *[]))
+ int (*op)(WT_SESSION_IMPL *, const char *[]))
{
WT_DECL_RET;
u_int i;
@@ -440,6 +440,13 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session)
if (current_dirty <= (double)cache->eviction_checkpoint_target)
break;
+ /*
+ * Don't scrub when the lookaside table is in use: scrubbing is
+ * counter-productive in that case.
+ */
+ if (F_ISSET(cache, WT_CACHE_EVICT_LOOKASIDE))
+ break;
+
__wt_sleep(0, stepdown_us / 10);
__wt_epoch(session, &stop);
current_us = WT_TIMEDIFF_US(stop, last);
@@ -1080,7 +1087,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[], bool waiting)
*/
#undef WT_CHECKPOINT_SESSION_FLAGS
#define WT_CHECKPOINT_SESSION_FLAGS \
- (WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION)
+ (WT_SESSION_CAN_WAIT | WT_SESSION_IGNORE_CACHE_SIZE)
#undef WT_CHECKPOINT_SESSION_FLAGS_OFF
#define WT_CHECKPOINT_SESSION_FLAGS_OFF \
(WT_SESSION_LOOKASIDE_CURSOR)
diff --git a/src/txn/txn_rollback_to_stable.c b/src/txn/txn_rollback_to_stable.c
index 929aba30155..c68d00d7503 100644
--- a/src/txn/txn_rollback_to_stable.c
+++ b/src/txn/txn_rollback_to_stable.c
@@ -46,12 +46,12 @@ __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session)
__wt_las_cursor(session, &cursor, &session_flags);
/* Discard pages we read as soon as we're done with them. */
- F_SET(session, WT_SESSION_NO_CACHE);
+ F_SET(session, WT_SESSION_READ_WONT_NEED);
/* Walk the file. */
for (; (ret = cursor->next(cursor)) == 0; ) {
WT_ERR(cursor->get_key(cursor,
- &las_id, &las_pageid, &las_counter, &las_key));
+ &las_pageid, &las_id, &las_counter, &las_key));
/* Check the file ID so we can skip durable tables */
if (las_id >= conn->stable_rollback_maxfile)
@@ -79,7 +79,7 @@ __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session)
err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
WT_STAT_CONN_SET(session, cache_lookaside_entries, las_total);
- F_CLR(session, WT_SESSION_NO_CACHE);
+ F_CLR(session, WT_SESSION_READ_WONT_NEED);
return (ret);
}
diff --git a/src/txn/txn_timestamp.c b/src/txn/txn_timestamp.c
index 98887627bfc..5a39a6d84dc 100644
--- a/src/txn/txn_timestamp.c
+++ b/src/txn/txn_timestamp.c
@@ -210,6 +210,10 @@ __txn_global_query_timestamp(
__wt_timestamp_set(&ts, &txn_global->commit_timestamp));
WT_ASSERT(session, !__wt_timestamp_iszero(&ts));
+ /* Skip the lock if there are no running transactions. */
+ if (TAILQ_EMPTY(&txn_global->commit_timestamph))
+ goto done;
+
/* Compare with the oldest running transaction. */
__wt_readlock(session, &txn_global->commit_timestamp_rwlock);
txn = TAILQ_FIRST(&txn_global->commit_timestamph);
@@ -254,7 +258,7 @@ __txn_global_query_timestamp(
WT_RET_MSG(session, EINVAL,
"unknown timestamp query %.*s", (int)cval.len, cval.str);
- __wt_timestamp_set(tsp, &ts);
+done: __wt_timestamp_set(tsp, &ts);
return (0);
}
#endif
@@ -292,7 +296,8 @@ __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session)
{
WT_DECL_RET;
WT_TXN_GLOBAL *txn_global;
- wt_timestamp_t active_timestamp, oldest_timestamp, pinned_timestamp;
+ wt_timestamp_t active_timestamp, last_pinned_timestamp;
+ wt_timestamp_t oldest_timestamp, pinned_timestamp;
const char *query_cfg[] = { WT_CONFIG_BASE(session,
WT_CONNECTION_query_timestamp), "get=pinned", NULL };
@@ -316,6 +321,16 @@ __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session)
} else
__wt_timestamp_set(&pinned_timestamp, &active_timestamp);
+ if (txn_global->has_pinned_timestamp) {
+ WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
+ __wt_timestamp_set(
+ &last_pinned_timestamp, &txn_global->pinned_timestamp));
+
+ if (__wt_timestamp_cmp(
+ &pinned_timestamp, &last_pinned_timestamp) <= 0)
+ return (0);
+ }
+
__wt_writelock(session, &txn_global->rwlock);
if (!txn_global->has_pinned_timestamp || __wt_timestamp_cmp(
&txn_global->pinned_timestamp, &pinned_timestamp) < 0) {
@@ -364,6 +379,7 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
{
WT_TXN_GLOBAL *txn_global;
wt_timestamp_t commit_ts, oldest_ts, stable_ts;
+ wt_timestamp_t last_oldest_ts, last_stable_ts;
txn_global = &S2C(session)->txn_global;
/*
@@ -376,7 +392,11 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
session, "oldest", &oldest_ts, &oldest_cval));
WT_RET(__wt_txn_parse_timestamp(
session, "stable", &stable_ts, &stable_cval));
- __wt_writelock(session, &txn_global->rwlock);
+
+ __wt_readlock(session, &txn_global->rwlock);
+
+ __wt_timestamp_set(&last_oldest_ts, &txn_global->oldest_timestamp);
+ __wt_timestamp_set(&last_stable_ts, &txn_global->stable_timestamp);
/*
* First do error checking on the timestamp values. The
@@ -388,9 +408,9 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
if (!has_commit && txn_global->has_commit_timestamp)
__wt_timestamp_set(&commit_ts, &txn_global->commit_timestamp);
if (!has_oldest && txn_global->has_oldest_timestamp)
- __wt_timestamp_set(&oldest_ts, &txn_global->oldest_timestamp);
- if (!has_stable && txn_global->has_oldest_timestamp)
- __wt_timestamp_set(&stable_ts, &txn_global->stable_timestamp);
+ __wt_timestamp_set(&oldest_ts, &last_oldest_ts);
+ if (!has_stable && txn_global->has_stable_timestamp)
+ __wt_timestamp_set(&stable_ts, &last_stable_ts);
/*
* If a commit timestamp was supplied, check that it is no older than
@@ -398,7 +418,7 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
*/
if (has_commit && (has_oldest || txn_global->has_oldest_timestamp) &&
__wt_timestamp_cmp(&oldest_ts, &commit_ts) > 0) {
- __wt_writeunlock(session, &txn_global->rwlock);
+ __wt_readunlock(session, &txn_global->rwlock);
WT_RET_MSG(session, EINVAL,
"set_timestamp: oldest timestamp must not be later than "
"commit timestamp");
@@ -406,7 +426,7 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
if (has_commit && (has_stable || txn_global->has_stable_timestamp) &&
__wt_timestamp_cmp(&stable_ts, &commit_ts) > 0) {
- __wt_writeunlock(session, &txn_global->rwlock);
+ __wt_readunlock(session, &txn_global->rwlock);
WT_RET_MSG(session, EINVAL,
"set_timestamp: stable timestamp must not be later than "
"commit timestamp");
@@ -420,12 +440,27 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
(has_oldest || txn_global->has_oldest_timestamp) &&
(has_stable || txn_global->has_stable_timestamp) &&
__wt_timestamp_cmp(&oldest_ts, &stable_ts) > 0) {
- __wt_writeunlock(session, &txn_global->rwlock);
+ __wt_readunlock(session, &txn_global->rwlock);
WT_RET_MSG(session, EINVAL,
"set_timestamp: oldest timestamp must not be later than "
"stable timestamp");
}
+ __wt_readunlock(session, &txn_global->rwlock);
+
+ /* Check if we are actually updating anything. */
+ if (has_oldest && txn_global->has_oldest_timestamp &&
+ __wt_timestamp_cmp(&oldest_ts, &last_oldest_ts) <= 0)
+ has_oldest = false;
+
+ if (has_stable && txn_global->has_stable_timestamp &&
+ __wt_timestamp_cmp(&stable_ts, &last_stable_ts) <= 0)
+ has_stable = false;
+
+ if (!has_commit && !has_oldest && !has_stable)
+ return (0);
+
+ __wt_writelock(session, &txn_global->rwlock);
/*
* This method can be called from multiple threads, check that we are
* moving the global timestamps forwards.
@@ -543,7 +578,7 @@ __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
/*
* Look for a commit timestamp.
*/
- ret = __wt_config_gets(session, cfg, "commit_timestamp", &cval);
+ ret = __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval);
if (ret == 0 && cval.len != 0) {
#ifdef HAVE_TIMESTAMPS
WT_TXN *txn = &session->txn;