summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/cache/cache_las.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/third_party/wiredtiger/src/cache/cache_las.c')
-rw-r--r--src/third_party/wiredtiger/src/cache/cache_las.c305
1 files changed, 150 insertions, 155 deletions
diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c
index 64fe93806e9..cf28027d8b7 100644
--- a/src/third_party/wiredtiger/src/cache/cache_las.c
+++ b/src/third_party/wiredtiger/src/cache/cache_las.c
@@ -19,24 +19,6 @@
WT_SESSION_NO_RECONCILE)
/*
- * __las_timing_stress --
- * Optionally add delay to simulate the race conditions in lookaside
- * sweep for debug purposes.
- */
-static void
-__las_timing_stress(WT_SESSION_IMPL *session)
-{
- WT_CONNECTION_IMPL *conn;
-
- conn = S2C(session);
-
- /* Only sleep when lookaside sweep race flag is set. */
- if (FLD_ISSET(conn->timing_stress_flags,
- WT_TIMING_STRESS_LOOKASIDE_SWEEP))
- __wt_sleep(0, TIMING_STRESS_TEST_SLEEP);
-}
-
-/*
* __las_set_isolation --
* Switch to read-uncommitted.
*/
@@ -328,7 +310,16 @@ __wt_las_cursor(
*
* XXX better as a condition variable.
*/
- __wt_sleep(0, 1000);
+ __wt_sleep(0, WT_THOUSAND);
+ if (F_ISSET(session, WT_SESSION_INTERNAL))
+ WT_STAT_CONN_INCRV(session,
+ cache_lookaside_cursor_wait_internal,
+ WT_THOUSAND);
+ else
+ WT_STAT_CONN_INCRV(session,
+ cache_lookaside_cursor_wait_application,
+ WT_THOUSAND);
+
}
}
@@ -412,9 +403,6 @@ __wt_las_page_skip_locked(WT_SESSION_IMPL *session, WT_REF *ref)
* We also need to instantiate a lookaside page if this is an update
* operation in progress.
*/
- if (ref->page_las->invalid)
- return (false);
-
if (F_ISSET(txn, WT_TXN_UPDATE))
return (false);
@@ -422,43 +410,34 @@ __wt_las_page_skip_locked(WT_SESSION_IMPL *session, WT_REF *ref)
return (false);
/*
- * If page image has the newest version of data and includes data newer
- * than the reader's snapshot then we should read the history.
- */
- if (ref->page_las->las_skew_newest &&
- WT_TXNID_LE(txn->snap_min, ref->page_las->las_max_txn))
- return (false);
-
- /*
- * If page image has the oldest version of data and some of the history
- * overlaps with the reader's snapshot then we should read the history.
+ * If some of the page's history overlaps with the reader's snapshot
+ * then we have to read it. This is only relevant if we chose versions
+ * that were unstable when the page was written.
*/
- if (!ref->page_las->las_skew_newest &&
- WT_TXNID_LE(ref->page_las->las_min_txn, txn->snap_max))
+ if (ref->page_las->skew_newest &&
+ WT_TXNID_LE(txn->snap_min, ref->page_las->unstable_txn))
return (false);
- if (!F_ISSET(txn, WT_TXN_HAS_TS_READ) && ref->page_las->las_skew_newest)
- return (true);
+ if (!F_ISSET(txn, WT_TXN_HAS_TS_READ))
+ return (ref->page_las->skew_newest);
#ifdef HAVE_TIMESTAMPS
/*
* Skip lookaside pages if reading as of a timestamp, we evicted new
* versions of data and all the updates are in the past.
*/
- if (F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) &&
- ref->page_las->las_skew_newest &&
+ if (ref->page_las->skew_newest &&
__wt_timestamp_cmp(
- &ref->page_las->onpage_timestamp, &session->txn.read_timestamp) < 0)
+ &txn->read_timestamp, &ref->page_las->unstable_timestamp) > 0)
return (true);
/*
* Skip lookaside pages if reading as of a timestamp, we evicted old
- * versions of data and all the updates are in the future.
+ * versions of data and all the unstable updates are in the future.
*/
- if (F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) &&
- !ref->page_las->las_skew_newest &&
+ if (!ref->page_las->skew_newest &&
__wt_timestamp_cmp(
- &ref->page_las->min_timestamp, &session->txn.read_timestamp) > 0)
+ &txn->read_timestamp, &ref->page_las->unstable_timestamp) < 0)
return (true);
#endif
@@ -497,18 +476,23 @@ __wt_las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
* Remove all records for a given page from the lookaside store.
*/
static int
-__las_remove_block(WT_SESSION_IMPL *session,
+__las_remove_block(
WT_CURSOR *cursor, uint64_t pageid, bool lock_wait, uint64_t *remove_cntp)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_ITEM las_key;
+ WT_SESSION_IMPL *session;
+ WT_TXN_ISOLATION saved_isolation;
uint64_t las_counter, las_pageid;
uint32_t las_id;
+ bool local_txn;
*remove_cntp = 0;
+ session = (WT_SESSION_IMPL *)cursor->session;
conn = S2C(session);
+ local_txn = false;
/* Prevent the sweep thread from removing the block. */
if (lock_wait)
@@ -517,6 +501,10 @@ __las_remove_block(WT_SESSION_IMPL *session,
WT_RET(__wt_try_writelock(
session, &conn->cache->las_sweepwalk_lock));
+ __las_set_isolation(session, &saved_isolation);
+ WT_ERR(__wt_txn_begin(session, NULL));
+ local_txn = true;
+
/*
* Search for the block's unique btree ID and page ID prefix and step
* through all matching records, removing them.
@@ -535,7 +523,15 @@ __las_remove_block(WT_SESSION_IMPL *session,
}
WT_ERR_NOTFOUND_OK(ret);
-err: __wt_writeunlock(session, &conn->cache->las_sweepwalk_lock);
+err: if (local_txn) {
+ if (ret == 0)
+ ret = __wt_txn_commit(session, NULL);
+ else
+ WT_TRET(__wt_txn_rollback(session, NULL));
+ }
+
+ __las_restore_isolation(session, saved_isolation);
+ __wt_writeunlock(session, &conn->cache->las_sweepwalk_lock);
return (ret);
}
@@ -545,7 +541,8 @@ err: __wt_writeunlock(session, &conn->cache->las_sweepwalk_lock);
* cache state when performing a lookaside table write.
*/
static int
-__las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi)
+__las_insert_block_verbose(
+ WT_SESSION_IMPL *session, WT_BTREE *btree, WT_MULTI *multi)
{
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
@@ -557,7 +554,7 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi)
#endif
const char *ts;
- btree_id = S2BT(session)->id;
+ btree_id = btree->id;
if (!WT_VERBOSE_ISSET(session,
WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY))
@@ -581,8 +578,8 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi)
(void)__wt_eviction_dirty_needed(session, &pct_dirty);
#ifdef HAVE_TIMESTAMPS
- WT_RET(__wt_timestamp_to_hex_string(
- session, hex_timestamp, &multi->page_las.min_timestamp));
+ WT_RET(__wt_timestamp_to_hex_string(session, hex_timestamp,
+ &multi->page_las.unstable_timestamp));
ts = hex_timestamp;
#else
ts = "disabled";
@@ -591,14 +588,14 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi)
WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY,
"Page reconciliation triggered lookaside write "
"file ID %" PRIu32 ", page ID %" PRIu64 ". "
- "Max txn ID %" PRIu64 ", min timestamp %s, skewed %s. "
+ "Max txn ID %" PRIu64 ", unstable timestamp %s, %s. "
"Entries now in lookaside file: %" PRId64 ", "
"cache dirty: %2.3f%% , "
"cache use: %2.3f%%",
btree_id, multi->page_las.las_pageid,
- multi->page_las.las_max_txn,
+ multi->page_las.max_txn,
ts,
- multi->page_las.las_skew_newest ? "newest" : "oldest",
+ multi->page_las.skew_newest ? "newest" : "not newest",
WT_STAT_READ(conn->stats, cache_lookaside_entries),
pct_dirty, pct_full);
}
@@ -614,15 +611,15 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi)
* Copy one set of saved updates into the database's lookaside table.
*/
int
-__wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
- WT_PAGE *page, WT_MULTI *multi, WT_ITEM *key)
+__wt_las_insert_block(WT_CURSOR *cursor,
+ WT_BTREE *btree, WT_PAGE *page, WT_MULTI *multi, WT_ITEM *key)
{
- WT_BTREE *btree;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
+ WT_DECL_TIMESTAMP(prev_timestamp)
WT_ITEM las_timestamp, las_value;
WT_SAVE_UPD *list;
- WT_SESSION_IMPL *las_session;
+ WT_SESSION_IMPL *session;
WT_TXN_ISOLATION saved_isolation;
WT_UPDATE *upd;
uint64_t insert_cnt;
@@ -631,7 +628,7 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
uint8_t *p;
bool local_txn;
- btree = S2BT(session);
+ session = (WT_SESSION_IMPL *)cursor->session;
conn = S2C(session);
WT_CLEAR(las_timestamp);
WT_CLEAR(las_value);
@@ -639,17 +636,13 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
btree_id = btree->id;
local_txn = false;
+ __wt_timestamp_set_zero(&prev_timestamp);
+
las_pageid = __wt_atomic_add64(&conn->cache->las_pageid, 1);
if (!btree->lookaside_entries)
btree->lookaside_entries = true;
- /* Wrap all the updates in a transaction. */
- las_session = (WT_SESSION_IMPL *)cursor->session;
- __las_set_isolation(las_session, &saved_isolation);
- WT_ERR(__wt_txn_begin(las_session, NULL));
- local_txn = true;
-
#ifdef HAVE_DIAGNOSTIC
{
uint64_t remove_cnt;
@@ -657,12 +650,17 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
* There should never be any entries with the page ID we are about to
* use.
*/
- WT_ERR_BUSY_OK(__las_remove_block(
- session, cursor, las_pageid, false, &remove_cnt));
+ WT_RET_BUSY_OK(
+ __las_remove_block(cursor, las_pageid, false, &remove_cnt));
WT_ASSERT(session, remove_cnt == 0);
}
#endif
+ /* Wrap all the updates in a transaction. */
+ __las_set_isolation(session, &saved_isolation);
+ WT_ERR(__wt_txn_begin(session, NULL));
+ local_txn = true;
+
/* Enter each update in the boundary's list into the lookaside store. */
for (las_counter = 0, i = 0,
list = multi->supd; i < multi->supd_entries; ++i, ++list) {
@@ -676,15 +674,17 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
key->size = WT_PTRDIFF(p, key->data);
break;
case WT_PAGE_ROW_LEAF:
- if (list->ins == NULL)
- WT_ERR(__wt_row_leaf_key(
+ if (list->ins == NULL) {
+ WT_WITH_BTREE(session, btree,
+ ret = __wt_row_leaf_key(
session, page, list->ripcip, key, false));
- else {
+ WT_ERR(ret);
+ } else {
key->data = WT_INSERT_KEY(list->ins);
key->size = WT_INSERT_KEY_SIZE(list->ins);
}
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, page->type);
}
/*
@@ -726,7 +726,7 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
case WT_UPDATE_TOMBSTONE:
las_value.size = 0;
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, upd->type);
}
cursor->set_key(cursor,
@@ -742,18 +742,17 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
* table. (We check the length because row-store doesn't
* write zero-length data items.)
*/
- if (multi->page_las.las_skew_newest &&
- upd == list->onpage_upd &&
+ if (upd == list->onpage_upd &&
upd->size > 0 &&
(upd->type == WT_UPDATE_STANDARD ||
upd->type == WT_UPDATE_MODIFY)) {
las_value.size = 0;
- cursor->set_value(cursor,
- upd->txnid, &las_timestamp,
+ cursor->set_value(cursor, upd->txnid,
+ &las_timestamp, upd->prepare_state,
WT_UPDATE_BIRTHMARK, &las_value);
} else
- cursor->set_value(cursor,
- upd->txnid, &las_timestamp,
+ cursor->set_value(cursor, upd->txnid,
+ &las_timestamp, upd->prepare_state,
upd->type, &las_value);
/*
@@ -770,9 +769,9 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
err: /* Resolve the transaction. */
if (local_txn) {
if (ret == 0)
- WT_TRET(__wt_txn_commit(las_session, NULL));
+ ret = __wt_txn_commit(session, NULL);
else
- WT_TRET(__wt_txn_rollback(las_session, NULL));
+ WT_TRET(__wt_txn_rollback(session, NULL));
/* Adjust the entry count. */
if (ret == 0)
@@ -780,11 +779,11 @@ err: /* Resolve the transaction. */
&conn->cache->las_insert_count, insert_cnt);
}
- __las_restore_isolation(las_session, saved_isolation);
+ __las_restore_isolation(session, saved_isolation);
if (ret == 0 && insert_cnt > 0) {
multi->page_las.las_pageid = las_pageid;
- ret = __las_insert_block_verbose(session, multi);
+ ret = __las_insert_block_verbose(session, btree, multi);
}
return (ret);
@@ -860,8 +859,6 @@ __wt_las_remove_block(
WT_CONNECTION_IMPL *conn;
WT_CURSOR *cursor;
WT_DECL_RET;
- WT_SESSION_IMPL *las_session;
- WT_TXN_ISOLATION saved_isolation;
uint64_t remove_cnt;
uint32_t session_flags;
@@ -875,24 +872,12 @@ __wt_las_remove_block(
*/
__wt_las_cursor(session, &cursor, &session_flags);
- las_session = (WT_SESSION_IMPL *)cursor->session;
- __las_set_isolation(las_session, &saved_isolation);
-
- WT_ERR(__wt_txn_begin(las_session, NULL));
-
- ret = __las_remove_block(
- las_session, cursor, pageid, lock_wait, &remove_cnt);
- if (ret == 0)
- ret = __wt_txn_commit(las_session, NULL);
- else
- WT_TRET(__wt_txn_rollback(las_session, NULL));
- if (ret == 0)
+ if ((ret = __las_remove_block(
+ cursor, pageid, lock_wait, &remove_cnt)) == 0)
(void)__wt_atomic_add64(
&conn->cache->las_remove_count, remove_cnt);
-err: __las_restore_isolation(las_session, saved_isolation);
WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
-
return (ret);
}
@@ -1021,21 +1006,27 @@ __wt_las_sweep(WT_SESSION_IMPL *session)
uint64_t cnt, remove_cnt, las_pageid, saved_pageid, visit_cnt;
uint64_t las_counter, las_txnid;
uint32_t las_id, session_flags;
- uint8_t upd_type;
+ uint8_t prepare_state, upd_type;
int notused;
- bool local_txn, locked;
+ bool local_txn, locked, removing_key_block;
cache = S2C(session)->cache;
cursor = NULL;
sweep_key = &cache->las_sweep_key;
remove_cnt = 0;
session_flags = 0; /* [-Werror=maybe-uninitialized] */
- local_txn = locked = false;
+ local_txn = locked = removing_key_block = false;
WT_RET(__wt_scr_alloc(session, 0, &saved_key));
saved_pageid = 0;
/*
+ * Prevent other threads removing entries from underneath the sweep.
+ */
+ __wt_writelock(session, &cache->las_sweepwalk_lock);
+ locked = true;
+
+ /*
* Allocate a cursor and wrap all the updates in a transaction.
* We should have our own lookaside cursor.
*/
@@ -1045,14 +1036,8 @@ __wt_las_sweep(WT_SESSION_IMPL *session)
WT_ERR(__wt_txn_begin(session, NULL));
local_txn = true;
- /*
- * Prevent other threads removing entries from underneath the sweep.
- */
- __wt_writelock(session, &cache->las_sweepwalk_lock);
- locked = true;
-
/* Encourage a race */
- __las_timing_stress(session);
+ __wt_timing_stress(session, WT_TIMING_STRESS_LOOKASIDE_SWEEP);
/*
* When continuing a sweep, position the cursor using the key from the
@@ -1087,21 +1072,17 @@ __wt_las_sweep(WT_SESSION_IMPL *session)
WT_ERR(cursor->get_key(cursor,
&las_pageid, &las_id, &las_counter, &las_key));
- /*
- * If we have switched to a different page, clear the saved key.
- * Otherwise, sweep could incorrectly remove records after
- * seeing a birthmark for a key in one block if the same key is
- * at the beginning of the next block. See WT-3982 for details.
- */
- if (las_pageid != saved_pageid) {
- saved_key->size = 0;
- saved_pageid = las_pageid;
- }
+ __wt_verbose(session,
+ WT_VERB_LOOKASIDE_ACTIVITY,
+ "Sweep reviewing lookaside entry with lookaside "
+ "page ID %" PRIu64 " btree ID %" PRIu32
+ " saved key size: %" WT_SIZET_FMT,
+ las_pageid, las_id, saved_key->size);
/*
- * Stop if the cache is stuck: we are ignoring the cache size
- * while scanning the lookaside table, so we're making things
- * worse.
+ * Signal to stop if the cache is stuck: we are ignoring the
+ * cache size while scanning the lookaside table, so we're
+ * making things worse.
*/
if (__wt_cache_stuck(session))
cnt = 0;
@@ -1112,9 +1093,8 @@ __wt_las_sweep(WT_SESSION_IMPL *session)
* and there is a reader waiting and we're on a key boundary.
*/
++visit_cnt;
- if ((cnt == 0 ||
- (visit_cnt > WT_LAS_SWEEP_ENTRIES && cache->las_reader)) &&
- saved_key->size == 0)
+ if (!removing_key_block && (cnt == 0 ||
+ (visit_cnt > WT_LAS_SWEEP_ENTRIES && cache->las_reader)))
break;
if (cnt > 0)
--cnt;
@@ -1134,15 +1114,20 @@ __wt_las_sweep(WT_SESSION_IMPL *session)
WT_ERR(cursor->remove(cursor));
++remove_cnt;
saved_key->size = 0;
+ /*
+ * Allow sweep to break while removing entries from a
+ * dead file.
+ */
+ removing_key_block = false;
continue;
}
/*
- * Remove entries from the lookaside that have aged out and are
- * now no longer needed.
+ * Remove all entries for a key once they have aged out and are
+ * no longer needed.
*/
- WT_ERR(cursor->get_value(cursor,
- &las_txnid, &las_timestamp, &upd_type, &las_value));
+ WT_ERR(cursor->get_value(cursor, &las_txnid,
+ &las_timestamp, &prepare_state, &upd_type, &las_value));
#ifdef HAVE_TIMESTAMPS
WT_ASSERT(session, las_timestamp.size == WT_TIMESTAMP_SIZE);
memcpy(&timestamp, las_timestamp.data, las_timestamp.size);
@@ -1152,42 +1137,51 @@ __wt_las_sweep(WT_SESSION_IMPL *session)
#endif
/*
- * If this entry isn't globally visible we cannot remove it.
- * If it is visible then perform additional checks to see
- * whether it has aged out of a live file.
- */
- if (!__wt_txn_visible_all(session, las_txnid, val_ts)) {
- saved_key->size = 0;
- continue;
- }
-
- /*
- * Save our key for comparing with older entries if we
- * don't have one or it is different.
+ * Check to see if the page or key has changed this iteration,
+ * and if they have, setup context for safely removing obsolete
+ * updates.
+ *
+ * It's important to check for page boundaries explicitly
+ * because it is possible for the same key to be at the start
+ * of the next block. See WT-3982 for details.
*/
- if (saved_key->size != las_key.size ||
+ if (las_pageid != saved_pageid ||
+ saved_key->size != las_key.size ||
memcmp(saved_key->data, las_key.data, las_key.size) != 0) {
- /* If we have processed enough entries, give up. */
+ /* If we've examined enough entries, give up. */
if (cnt == 0)
break;
- /* We can only start removing from a full value. */
- if (upd_type == WT_UPDATE_MODIFY) {
- saved_key->size = 0;
- continue;
- }
-
- WT_ERR(__wt_buf_set(session, saved_key,
- las_key.data, las_key.size));
+ saved_pageid = las_pageid;
+ WT_ERR(__wt_buf_set(
+ session, saved_key, las_key.data, las_key.size));
/*
- * If the first stable record contains data, we have to
- * keep it.
+ * There are several conditions that need to be met
+ * before we choose to remove a key block:
+ * * The entries were written with skew newest.
+ * Indicated by the first entry being a birthmark.
+ * * The first entry is globally visible.
+ * * The entry wasn't from a prepared transaction.
*/
- if (upd_type != WT_UPDATE_BIRTHMARK)
- continue;
+ if (upd_type == WT_UPDATE_BIRTHMARK &&
+ __wt_txn_visible_all(session, las_txnid, val_ts) &&
+ prepare_state != WT_PREPARE_INPROGRESS)
+ removing_key_block = true;
+ else
+ removing_key_block = false;
}
+ if (!removing_key_block)
+ continue;
+
+ __wt_verbose(session,
+ WT_VERB_LOOKASIDE_ACTIVITY,
+ "Sweep removing lookaside entry with "
+ "page ID: %" PRIu64 " btree ID: %" PRIu32
+ " saved key size: %" WT_SIZET_FMT ", record type: %" PRIu8
+ " transaction ID: %" PRIu64,
+ las_pageid, las_id, saved_key->size, upd_type, las_txnid);
WT_ERR(cursor->remove(cursor));
++remove_cnt;
}
@@ -1221,11 +1215,12 @@ err: __wt_buf_free(session, sweep_key);
(void)__wt_atomic_add64(
&cache->las_remove_count, remove_cnt);
}
- if (locked)
- __wt_writeunlock(session, &cache->las_sweepwalk_lock);
- WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
__las_restore_isolation(session, saved_isolation);
+ WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
+
+ if (locked)
+ __wt_writeunlock(session, &cache->las_sweepwalk_lock);
__wt_scr_free(session, &saved_key);