diff options
Diffstat (limited to 'src/third_party/wiredtiger/src/cache/cache_las.c')
-rw-r--r-- | src/third_party/wiredtiger/src/cache/cache_las.c | 305 |
1 files changed, 150 insertions, 155 deletions
diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c index 64fe93806e9..cf28027d8b7 100644 --- a/src/third_party/wiredtiger/src/cache/cache_las.c +++ b/src/third_party/wiredtiger/src/cache/cache_las.c @@ -19,24 +19,6 @@ WT_SESSION_NO_RECONCILE) /* - * __las_timing_stress -- - * Optionally add delay to simulate the race conditions in lookaside - * sweep for debug purposes. - */ -static void -__las_timing_stress(WT_SESSION_IMPL *session) -{ - WT_CONNECTION_IMPL *conn; - - conn = S2C(session); - - /* Only sleep when lookaside sweep race flag is set. */ - if (FLD_ISSET(conn->timing_stress_flags, - WT_TIMING_STRESS_LOOKASIDE_SWEEP)) - __wt_sleep(0, TIMING_STRESS_TEST_SLEEP); -} - -/* * __las_set_isolation -- * Switch to read-uncommitted. */ @@ -328,7 +310,16 @@ __wt_las_cursor( * * XXX better as a condition variable. */ - __wt_sleep(0, 1000); + __wt_sleep(0, WT_THOUSAND); + if (F_ISSET(session, WT_SESSION_INTERNAL)) + WT_STAT_CONN_INCRV(session, + cache_lookaside_cursor_wait_internal, + WT_THOUSAND); + else + WT_STAT_CONN_INCRV(session, + cache_lookaside_cursor_wait_application, + WT_THOUSAND); + } } @@ -412,9 +403,6 @@ __wt_las_page_skip_locked(WT_SESSION_IMPL *session, WT_REF *ref) * We also need to instantiate a lookaside page if this is an update * operation in progress. */ - if (ref->page_las->invalid) - return (false); - if (F_ISSET(txn, WT_TXN_UPDATE)) return (false); @@ -422,43 +410,34 @@ __wt_las_page_skip_locked(WT_SESSION_IMPL *session, WT_REF *ref) return (false); /* - * If page image has the newest version of data and includes data newer - * than the reader's snapshot then we should read the history. - */ - if (ref->page_las->las_skew_newest && - WT_TXNID_LE(txn->snap_min, ref->page_las->las_max_txn)) - return (false); - - /* - * If page image has the oldest version of data and some of the history - * overlaps with the reader's snapshot then we should read the history. 
+ * If some of the page's history overlaps with the reader's snapshot + * then we have to read it. This is only relevant if we chose versions + * that were unstable when the page was written. */ - if (!ref->page_las->las_skew_newest && - WT_TXNID_LE(ref->page_las->las_min_txn, txn->snap_max)) + if (ref->page_las->skew_newest && + WT_TXNID_LE(txn->snap_min, ref->page_las->unstable_txn)) return (false); - if (!F_ISSET(txn, WT_TXN_HAS_TS_READ) && ref->page_las->las_skew_newest) - return (true); + if (!F_ISSET(txn, WT_TXN_HAS_TS_READ)) + return (ref->page_las->skew_newest); #ifdef HAVE_TIMESTAMPS /* * Skip lookaside pages if reading as of a timestamp, we evicted new * versions of data and all the updates are in the past. */ - if (F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) && - ref->page_las->las_skew_newest && + if (ref->page_las->skew_newest && __wt_timestamp_cmp( - &ref->page_las->onpage_timestamp, &session->txn.read_timestamp) < 0) + &txn->read_timestamp, &ref->page_las->unstable_timestamp) > 0) return (true); /* * Skip lookaside pages if reading as of a timestamp, we evicted old - * versions of data and all the updates are in the future. + * versions of data and all the unstable updates are in the future. */ - if (F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) && - !ref->page_las->las_skew_newest && + if (!ref->page_las->skew_newest && __wt_timestamp_cmp( - &ref->page_las->min_timestamp, &session->txn.read_timestamp) > 0) + &txn->read_timestamp, &ref->page_las->unstable_timestamp) < 0) return (true); #endif @@ -497,18 +476,23 @@ __wt_las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) * Remove all records for a given page from the lookaside store. 
*/ static int -__las_remove_block(WT_SESSION_IMPL *session, +__las_remove_block( WT_CURSOR *cursor, uint64_t pageid, bool lock_wait, uint64_t *remove_cntp) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_ITEM las_key; + WT_SESSION_IMPL *session; + WT_TXN_ISOLATION saved_isolation; uint64_t las_counter, las_pageid; uint32_t las_id; + bool local_txn; *remove_cntp = 0; + session = (WT_SESSION_IMPL *)cursor->session; conn = S2C(session); + local_txn = false; /* Prevent the sweep thread from removing the block. */ if (lock_wait) @@ -517,6 +501,10 @@ __las_remove_block(WT_SESSION_IMPL *session, WT_RET(__wt_try_writelock( session, &conn->cache->las_sweepwalk_lock)); + __las_set_isolation(session, &saved_isolation); + WT_ERR(__wt_txn_begin(session, NULL)); + local_txn = true; + /* * Search for the block's unique btree ID and page ID prefix and step * through all matching records, removing them. @@ -535,7 +523,15 @@ __las_remove_block(WT_SESSION_IMPL *session, } WT_ERR_NOTFOUND_OK(ret); -err: __wt_writeunlock(session, &conn->cache->las_sweepwalk_lock); +err: if (local_txn) { + if (ret == 0) + ret = __wt_txn_commit(session, NULL); + else + WT_TRET(__wt_txn_rollback(session, NULL)); + } + + __las_restore_isolation(session, saved_isolation); + __wt_writeunlock(session, &conn->cache->las_sweepwalk_lock); return (ret); } @@ -545,7 +541,8 @@ err: __wt_writeunlock(session, &conn->cache->las_sweepwalk_lock); * cache state when performing a lookaside table write. 
*/ static int -__las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi) +__las_insert_block_verbose( + WT_SESSION_IMPL *session, WT_BTREE *btree, WT_MULTI *multi) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; @@ -557,7 +554,7 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi) #endif const char *ts; - btree_id = S2BT(session)->id; + btree_id = btree->id; if (!WT_VERBOSE_ISSET(session, WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY)) @@ -581,8 +578,8 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi) (void)__wt_eviction_dirty_needed(session, &pct_dirty); #ifdef HAVE_TIMESTAMPS - WT_RET(__wt_timestamp_to_hex_string( - session, hex_timestamp, &multi->page_las.min_timestamp)); + WT_RET(__wt_timestamp_to_hex_string(session, hex_timestamp, + &multi->page_las.unstable_timestamp)); ts = hex_timestamp; #else ts = "disabled"; @@ -591,14 +588,14 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi) WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY, "Page reconciliation triggered lookaside write " "file ID %" PRIu32 ", page ID %" PRIu64 ". " - "Max txn ID %" PRIu64 ", min timestamp %s, skewed %s. " + "Max txn ID %" PRIu64 ", unstable timestamp %s, %s. " "Entries now in lookaside file: %" PRId64 ", " "cache dirty: %2.3f%% , " "cache use: %2.3f%%", btree_id, multi->page_las.las_pageid, - multi->page_las.las_max_txn, + multi->page_las.max_txn, ts, - multi->page_las.las_skew_newest ? "newest" : "oldest", + multi->page_las.skew_newest ? "newest" : "not newest", WT_STAT_READ(conn->stats, cache_lookaside_entries), pct_dirty, pct_full); } @@ -614,15 +611,15 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi) * Copy one set of saved updates into the database's lookaside table. 
*/ int -__wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, - WT_PAGE *page, WT_MULTI *multi, WT_ITEM *key) +__wt_las_insert_block(WT_CURSOR *cursor, + WT_BTREE *btree, WT_PAGE *page, WT_MULTI *multi, WT_ITEM *key) { - WT_BTREE *btree; WT_CONNECTION_IMPL *conn; WT_DECL_RET; + WT_DECL_TIMESTAMP(prev_timestamp) WT_ITEM las_timestamp, las_value; WT_SAVE_UPD *list; - WT_SESSION_IMPL *las_session; + WT_SESSION_IMPL *session; WT_TXN_ISOLATION saved_isolation; WT_UPDATE *upd; uint64_t insert_cnt; @@ -631,7 +628,7 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint8_t *p; bool local_txn; - btree = S2BT(session); + session = (WT_SESSION_IMPL *)cursor->session; conn = S2C(session); WT_CLEAR(las_timestamp); WT_CLEAR(las_value); @@ -639,17 +636,13 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, btree_id = btree->id; local_txn = false; + __wt_timestamp_set_zero(&prev_timestamp); + las_pageid = __wt_atomic_add64(&conn->cache->las_pageid, 1); if (!btree->lookaside_entries) btree->lookaside_entries = true; - /* Wrap all the updates in a transaction. */ - las_session = (WT_SESSION_IMPL *)cursor->session; - __las_set_isolation(las_session, &saved_isolation); - WT_ERR(__wt_txn_begin(las_session, NULL)); - local_txn = true; - #ifdef HAVE_DIAGNOSTIC { uint64_t remove_cnt; @@ -657,12 +650,17 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, * There should never be any entries with the page ID we are about to * use. */ - WT_ERR_BUSY_OK(__las_remove_block( - session, cursor, las_pageid, false, &remove_cnt)); + WT_RET_BUSY_OK( + __las_remove_block(cursor, las_pageid, false, &remove_cnt)); WT_ASSERT(session, remove_cnt == 0); } #endif + /* Wrap all the updates in a transaction. */ + __las_set_isolation(session, &saved_isolation); + WT_ERR(__wt_txn_begin(session, NULL)); + local_txn = true; + /* Enter each update in the boundary's list into the lookaside store. 
*/ for (las_counter = 0, i = 0, list = multi->supd; i < multi->supd_entries; ++i, ++list) { @@ -676,15 +674,17 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, key->size = WT_PTRDIFF(p, key->data); break; case WT_PAGE_ROW_LEAF: - if (list->ins == NULL) - WT_ERR(__wt_row_leaf_key( + if (list->ins == NULL) { + WT_WITH_BTREE(session, btree, + ret = __wt_row_leaf_key( session, page, list->ripcip, key, false)); - else { + WT_ERR(ret); + } else { key->data = WT_INSERT_KEY(list->ins); key->size = WT_INSERT_KEY_SIZE(list->ins); } break; - WT_ILLEGAL_VALUE_ERR(session); + WT_ILLEGAL_VALUE_ERR(session, page->type); } /* @@ -726,7 +726,7 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, case WT_UPDATE_TOMBSTONE: las_value.size = 0; break; - WT_ILLEGAL_VALUE_ERR(session); + WT_ILLEGAL_VALUE_ERR(session, upd->type); } cursor->set_key(cursor, @@ -742,18 +742,17 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, * table. (We check the length because row-store doesn't * write zero-length data items.) */ - if (multi->page_las.las_skew_newest && - upd == list->onpage_upd && + if (upd == list->onpage_upd && upd->size > 0 && (upd->type == WT_UPDATE_STANDARD || upd->type == WT_UPDATE_MODIFY)) { las_value.size = 0; - cursor->set_value(cursor, - upd->txnid, &las_timestamp, + cursor->set_value(cursor, upd->txnid, + &las_timestamp, upd->prepare_state, WT_UPDATE_BIRTHMARK, &las_value); } else - cursor->set_value(cursor, - upd->txnid, &las_timestamp, + cursor->set_value(cursor, upd->txnid, + &las_timestamp, upd->prepare_state, upd->type, &las_value); /* @@ -770,9 +769,9 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, err: /* Resolve the transaction. */ if (local_txn) { if (ret == 0) - WT_TRET(__wt_txn_commit(las_session, NULL)); + ret = __wt_txn_commit(session, NULL); else - WT_TRET(__wt_txn_rollback(las_session, NULL)); + WT_TRET(__wt_txn_rollback(session, NULL)); /* Adjust the entry count. 
*/ if (ret == 0) @@ -780,11 +779,11 @@ err: /* Resolve the transaction. */ &conn->cache->las_insert_count, insert_cnt); } - __las_restore_isolation(las_session, saved_isolation); + __las_restore_isolation(session, saved_isolation); if (ret == 0 && insert_cnt > 0) { multi->page_las.las_pageid = las_pageid; - ret = __las_insert_block_verbose(session, multi); + ret = __las_insert_block_verbose(session, btree, multi); } return (ret); @@ -860,8 +859,6 @@ __wt_las_remove_block( WT_CONNECTION_IMPL *conn; WT_CURSOR *cursor; WT_DECL_RET; - WT_SESSION_IMPL *las_session; - WT_TXN_ISOLATION saved_isolation; uint64_t remove_cnt; uint32_t session_flags; @@ -875,24 +872,12 @@ __wt_las_remove_block( */ __wt_las_cursor(session, &cursor, &session_flags); - las_session = (WT_SESSION_IMPL *)cursor->session; - __las_set_isolation(las_session, &saved_isolation); - - WT_ERR(__wt_txn_begin(las_session, NULL)); - - ret = __las_remove_block( - las_session, cursor, pageid, lock_wait, &remove_cnt); - if (ret == 0) - ret = __wt_txn_commit(las_session, NULL); - else - WT_TRET(__wt_txn_rollback(las_session, NULL)); - if (ret == 0) + if ((ret = __las_remove_block( + cursor, pageid, lock_wait, &remove_cnt)) == 0) (void)__wt_atomic_add64( &conn->cache->las_remove_count, remove_cnt); -err: __las_restore_isolation(las_session, saved_isolation); WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); - return (ret); } @@ -1021,21 +1006,27 @@ __wt_las_sweep(WT_SESSION_IMPL *session) uint64_t cnt, remove_cnt, las_pageid, saved_pageid, visit_cnt; uint64_t las_counter, las_txnid; uint32_t las_id, session_flags; - uint8_t upd_type; + uint8_t prepare_state, upd_type; int notused; - bool local_txn, locked; + bool local_txn, locked, removing_key_block; cache = S2C(session)->cache; cursor = NULL; sweep_key = &cache->las_sweep_key; remove_cnt = 0; session_flags = 0; /* [-Werror=maybe-uninitialized] */ - local_txn = locked = false; + local_txn = locked = removing_key_block = false; 
WT_RET(__wt_scr_alloc(session, 0, &saved_key)); saved_pageid = 0; /* + * Prevent other threads removing entries from underneath the sweep. + */ + __wt_writelock(session, &cache->las_sweepwalk_lock); + locked = true; + + /* * Allocate a cursor and wrap all the updates in a transaction. * We should have our own lookaside cursor. */ @@ -1045,14 +1036,8 @@ __wt_las_sweep(WT_SESSION_IMPL *session) WT_ERR(__wt_txn_begin(session, NULL)); local_txn = true; - /* - * Prevent other threads removing entries from underneath the sweep. - */ - __wt_writelock(session, &cache->las_sweepwalk_lock); - locked = true; - /* Encourage a race */ - __las_timing_stress(session); + __wt_timing_stress(session, WT_TIMING_STRESS_LOOKASIDE_SWEEP); /* * When continuing a sweep, position the cursor using the key from the @@ -1087,21 +1072,17 @@ __wt_las_sweep(WT_SESSION_IMPL *session) WT_ERR(cursor->get_key(cursor, &las_pageid, &las_id, &las_counter, &las_key)); - /* - * If we have switched to a different page, clear the saved key. - * Otherwise, sweep could incorrectly remove records after - * seeing a birthmark for a key in one block if the same key is - * at the beginning of the next block. See WT-3982 for details. - */ - if (las_pageid != saved_pageid) { - saved_key->size = 0; - saved_pageid = las_pageid; - } + __wt_verbose(session, + WT_VERB_LOOKASIDE_ACTIVITY, + "Sweep reviewing lookaside entry with lookaside " + "page ID %" PRIu64 " btree ID %" PRIu32 + " saved key size: %" WT_SIZET_FMT, + las_pageid, las_id, saved_key->size); /* - * Stop if the cache is stuck: we are ignoring the cache size - * while scanning the lookaside table, so we're making things - * worse. + * Signal to stop if the cache is stuck: we are ignoring the + * cache size while scanning the lookaside table, so we're + * making things worse. */ if (__wt_cache_stuck(session)) cnt = 0; @@ -1112,9 +1093,8 @@ __wt_las_sweep(WT_SESSION_IMPL *session) * and there is a reader waiting and we're on a key boundary. 
*/ ++visit_cnt; - if ((cnt == 0 || - (visit_cnt > WT_LAS_SWEEP_ENTRIES && cache->las_reader)) && - saved_key->size == 0) + if (!removing_key_block && (cnt == 0 || + (visit_cnt > WT_LAS_SWEEP_ENTRIES && cache->las_reader))) break; if (cnt > 0) --cnt; @@ -1134,15 +1114,20 @@ __wt_las_sweep(WT_SESSION_IMPL *session) WT_ERR(cursor->remove(cursor)); ++remove_cnt; saved_key->size = 0; + /* + * Allow sweep to break while removing entries from a + * dead file. + */ + removing_key_block = false; continue; } /* - * Remove entries from the lookaside that have aged out and are - * now no longer needed. + * Remove all entries for a key once they have aged out and are + * no longer needed. */ - WT_ERR(cursor->get_value(cursor, - &las_txnid, &las_timestamp, &upd_type, &las_value)); + WT_ERR(cursor->get_value(cursor, &las_txnid, + &las_timestamp, &prepare_state, &upd_type, &las_value)); #ifdef HAVE_TIMESTAMPS WT_ASSERT(session, las_timestamp.size == WT_TIMESTAMP_SIZE); memcpy(&timestamp, las_timestamp.data, las_timestamp.size); @@ -1152,42 +1137,51 @@ __wt_las_sweep(WT_SESSION_IMPL *session) #endif /* - * If this entry isn't globally visible we cannot remove it. - * If it is visible then perform additional checks to see - * whether it has aged out of a live file. - */ - if (!__wt_txn_visible_all(session, las_txnid, val_ts)) { - saved_key->size = 0; - continue; - } - - /* - * Save our key for comparing with older entries if we - * don't have one or it is different. 
*/ - if (saved_key->size != las_key.size || + if (las_pageid != saved_pageid || + saved_key->size != las_key.size || memcmp(saved_key->data, las_key.data, las_key.size) != 0) { - /* If we have processed enough entries, give up. */ + /* If we've examined enough entries, give up. */ if (cnt == 0) break; - /* We can only start removing from a full value. */ - if (upd_type == WT_UPDATE_MODIFY) { - saved_key->size = 0; - continue; - } - - WT_ERR(__wt_buf_set(session, saved_key, - las_key.data, las_key.size)); + saved_pageid = las_pageid; + WT_ERR(__wt_buf_set( + session, saved_key, las_key.data, las_key.size)); /* - * If the first stable record contains data, we have to - * keep it. + * There are several conditions that need to be met + * before we choose to remove a key block: + * * The entries were written with skew newest. + * Indicated by the first entry being a birthmark. + * * The first entry is globally visible. + * * The entry wasn't from a prepared transaction. */ - if (upd_type != WT_UPDATE_BIRTHMARK) - continue; + if (upd_type == WT_UPDATE_BIRTHMARK && + __wt_txn_visible_all(session, las_txnid, val_ts) && + prepare_state != WT_PREPARE_INPROGRESS) + removing_key_block = true; + else + removing_key_block = false; } + if (!removing_key_block) + continue; + + __wt_verbose(session, + WT_VERB_LOOKASIDE_ACTIVITY, + "Sweep removing lookaside entry with " + "page ID: %" PRIu64 " btree ID: %" PRIu32 + " saved key size: %" WT_SIZET_FMT ", record type: %" PRIu8 + " transaction ID: %" PRIu64, + las_pageid, las_id, saved_key->size, upd_type, las_txnid); WT_ERR(cursor->remove(cursor)); ++remove_cnt; } @@ -1221,11 +1215,12 @@ err: __wt_buf_free(session, sweep_key); (void)__wt_atomic_add64( &cache->las_remove_count, remove_cnt); } - if (locked) - __wt_writeunlock(session, &cache->las_sweepwalk_lock); - WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); __las_restore_isolation(session, saved_isolation); + WT_TRET(__wt_las_cursor_close(session, &cursor, 
session_flags)); + + if (locked) + __wt_writeunlock(session, &cache->las_sweepwalk_lock); __wt_scr_free(session, &saved_key); |