summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Luke Chen <luke.chen@mongodb.com>, 2019-05-22 13:41:22 +1000
committer: Luke Chen <luke.chen@mongodb.com>, 2019-05-22 13:41:22 +1000
commit: ddb5e3eb82bd8f22a384835923d7f20bfa479d24 (patch)
tree: b1113b99d8938729957520faf1d95b9cbf2b4f02
parent: 7afd28d78040c3d171cc9711d419e1294404b267 (diff)
download: mongo-ddb5e3eb82bd8f22a384835923d7f20bfa479d24.tar.gz
Import wiredtiger: 4051e4941c894655cdb7d3dec97a7e32e7defbe6 from branch mongodb-4.0 (tag: r4.0.10-rc0)
ref: 1d72f206c3..4051e4941c
for: 4.0.10
WT-4352 Resolve birthmarks during eviction in more cases
WT-4750 Sweep can remove active lookaside records when files are closed and re-opened
WT-4759 Save a copy when an old overflow value is discarded
WT-4768 Inconsistent data with lookaside eviction followed by sweep
WT-4769 Don't discard active history for empty pages
WT-4794 Mark lookaside history resolved in all paths
WT-4796 Enhance diagnostics that track ref state transitions
-rw-r--r-- src/third_party/wiredtiger/import.data | 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_compact.c | 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_delete.c | 19
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_handle.c | 28
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_read.c | 14
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_slvg.c | 4
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_split.c | 7
-rw-r--r-- src/third_party/wiredtiger/src/cache/cache_las.c | 55
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_file.c | 22
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_lru.c | 4
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_page.c | 2
-rw-r--r-- src/third_party/wiredtiger/src/include/btmem.h | 14
-rw-r--r-- src/third_party/wiredtiger/src/include/btree.i | 2
-rw-r--r-- src/third_party/wiredtiger/src/include/cache.h | 1
-rw-r--r-- src/third_party/wiredtiger/src/include/extern.h | 3
-rw-r--r-- src/third_party/wiredtiger/src/include/txn.i | 34
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_write.c | 33
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c | 43
18 files changed, 206 insertions, 83 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index e6bd6967675..43dc53c86e3 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -1,5 +1,5 @@
{
- "commit": "1d72f206c3078d002666cc6ea042ec9fa1fbe13c",
+ "commit": "4051e4941c894655cdb7d3dec97a7e32e7defbe6",
"github": "wiredtiger/wiredtiger.git",
"vendor": "wiredtiger",
"branch": "mongodb-4.0"
diff --git a/src/third_party/wiredtiger/src/btree/bt_compact.c b/src/third_party/wiredtiger/src/btree/bt_compact.c
index d861276a843..ec9f8b21ec5 100644
--- a/src/third_party/wiredtiger/src/btree/bt_compact.c
+++ b/src/third_party/wiredtiger/src/btree/bt_compact.c
@@ -238,7 +238,7 @@ __wt_compact_page_skip(
* reference an on-page cell, and page eviction can free that memory.
* Lock the WT_REF so we can look at its address.
*/
- if (!__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
+ if (!WT_REF_CAS_STATE(session, ref, WT_REF_DISK, WT_REF_LOCKED))
return (0);
/*
diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c
index 61f1fa0948f..b0fd6a58edf 100644
--- a/src/third_party/wiredtiger/src/btree/bt_delete.c
+++ b/src/third_party/wiredtiger/src/btree/bt_delete.c
@@ -74,7 +74,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
/* If we have a clean page in memory, attempt to evict it. */
previous_state = ref->state;
if ((previous_state == WT_REF_MEM || previous_state == WT_REF_LIMBO) &&
- __wt_atomic_casv32(&ref->state, previous_state, WT_REF_LOCKED)) {
+ WT_REF_CAS_STATE(session, ref, previous_state, WT_REF_LOCKED)) {
if (__wt_page_is_modified(ref->page)) {
WT_REF_SET_STATE(ref, previous_state);
return (0);
@@ -99,7 +99,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
default:
return (0);
}
- if (!__wt_atomic_casv32(&ref->state, previous_state, WT_REF_LOCKED))
+ if (!WT_REF_CAS_STATE(session, ref, previous_state, WT_REF_LOCKED))
return (0);
/*
@@ -190,7 +190,7 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
* If the page is still "deleted", it's as we left it,
* reset the state.
*/
- if (__wt_atomic_casv32(&ref->state,
+ if (WT_REF_CAS_STATE(session, ref,
WT_REF_DELETED, ref->page_del->previous_state))
goto done;
break;
@@ -201,8 +201,8 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
break;
case WT_REF_MEM:
case WT_REF_SPLIT:
- if (__wt_atomic_casv32(
- &ref->state, current_state, WT_REF_LOCKED))
+ if (WT_REF_CAS_STATE(
+ session, ref, current_state, WT_REF_LOCKED))
locked = true;
break;
case WT_REF_DISK:
@@ -279,13 +279,14 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
* the page could switch to an in-memory state at any time. Lock down
* the structure, just to be safe.
*/
- if (ref->page_del == NULL)
+ if (ref->page_del == NULL && ref->page_las == NULL)
return (true);
- if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
+ if (!WT_REF_CAS_STATE(session, ref, WT_REF_DELETED, WT_REF_LOCKED))
return (false);
- skip = !__wt_page_del_active(session, ref, visible_all);
+ skip = !__wt_page_del_active(session, ref, visible_all) &&
+ !__wt_page_las_active(session, ref);
/*
* The page_del structure can be freed as soon as the delete is stable:
@@ -300,7 +301,7 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
__wt_free(session, ref->page_del);
}
- WT_PUBLISH(ref->state, WT_REF_DELETED);
+ WT_REF_SET_STATE(ref, WT_REF_DELETED);
return (skip);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c
index 62b50b34acf..91d5862eae6 100644
--- a/src/third_party/wiredtiger/src/btree/bt_handle.c
+++ b/src/third_party/wiredtiger/src/btree/bt_handle.c
@@ -115,6 +115,12 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
/* Initialize and configure the WT_BTREE structure. */
WT_ERR(__btree_conf(session, &ckpt));
+ /*
+ * We could be a re-open of a table that was put in the lookaside
+ * dropped list. Remove our id from that list.
+ */
+ __wt_las_remove_dropped(session);
+
/* Connect to the underlying block manager. */
filename = dhandle->name;
if (!WT_PREFIX_SKIP(filename, "file:"))
@@ -236,6 +242,16 @@ __wt_btree_close(WT_SESSION_IMPL *session)
F_SET(btree, WT_BTREE_CLOSED);
/*
+ * If closing a tree let sweep drop lookaside entries for it.
+ */
+ if (F_ISSET(S2C(session), WT_CONN_LOOKASIDE_OPEN) &&
+ btree->lookaside_entries) {
+ WT_ASSERT(session, !WT_IS_METADATA(btree->dhandle) &&
+ !F_ISSET(btree, WT_BTREE_LOOKASIDE));
+ WT_TRET(__wt_las_save_dropped(session));
+ }
+
+ /*
* If we turned eviction off and never turned it back on, do that now,
* otherwise the counter will be off.
*/
@@ -533,12 +549,14 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
* Initialize a tree root reference, and link in the root page.
*/
void
-__wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, bool is_recno)
+__wt_root_ref_init(WT_SESSION_IMPL *session,
+ WT_REF *root_ref, WT_PAGE *root, bool is_recno)
{
+ WT_UNUSED(session); /* Used in a macro for diagnostic builds */
memset(root_ref, 0, sizeof(*root_ref));
root_ref->page = root;
- root_ref->state = WT_REF_MEM;
+ WT_REF_SET_STATE(root_ref, WT_REF_MEM);
root_ref->ref_recno = is_recno ? 1 : WT_RECNO_OOB;
@@ -613,7 +631,8 @@ __wt_btree_tree_open(
dsk.mem = NULL;
/* Finish initializing the root, root reference links. */
- __wt_root_ref_init(&btree->root, page, btree->type != BTREE_ROW);
+ __wt_root_ref_init(session,
+ &btree->root, page, btree->type != BTREE_ROW);
err: __wt_buf_free(session, &dsk);
__wt_scr_free(session, &tmp);
@@ -697,7 +716,8 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, bool creation)
}
/* Finish initializing the root, root reference links. */
- __wt_root_ref_init(&btree->root, root, btree->type != BTREE_ROW);
+ __wt_root_ref_init(session,
+ &btree->root, root, btree->type != BTREE_ROW);
return (0);
diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c
index 5d6055b969d..8dd918e8011 100644
--- a/src/third_party/wiredtiger/src/btree/bt_read.c
+++ b/src/third_party/wiredtiger/src/btree/bt_read.c
@@ -290,6 +290,13 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
}
}
+ /*
+ * Now the lookaside history has been read into cache there is no
+ * further need to maintain a reference to it.
+ */
+ ref->page_las->eviction_to_lookaside = false;
+ ref->page_las->resolved = true;
+
err: if (locked)
__wt_readunlock(session, &cache->las_sweepwalk_lock);
WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
@@ -413,7 +420,6 @@ __page_read_lookaside(WT_SESSION_IMPL *session, WT_REF *ref,
}
WT_RET(__las_page_instantiate(session, ref));
- ref->page_las->eviction_to_lookaside = false;
return (0);
}
@@ -463,7 +469,7 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
default:
return (0);
}
- if (!__wt_atomic_casv32(&ref->state, previous_state, new_state))
+ if (!WT_REF_CAS_STATE(session, ref, previous_state, new_state))
return (0);
final_state = WT_REF_MEM;
@@ -538,10 +544,8 @@ skip_read:
* information), first update based on the lookaside table and
* then apply the delete.
*/
- if (ref->page_las != NULL) {
+ if (ref->page_las != NULL)
WT_ERR(__las_page_instantiate(session, ref));
- ref->page_las->eviction_to_lookaside = false;
- }
/* Move all records to a deleted state. */
WT_ERR(__wt_delete_page_instantiate(session, ref));
diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c
index 99a0bf6d323..f8f2552dc0a 100644
--- a/src/third_party/wiredtiger/src/btree/bt_slvg.c
+++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c
@@ -1208,7 +1208,7 @@ __slvg_col_build_internal(
++ref;
}
- __wt_root_ref_init(&ss->root_ref, page, true);
+ __wt_root_ref_init(session, &ss->root_ref, page, true);
if (0) {
err: __wt_free(session, addr);
@@ -1870,7 +1870,7 @@ __slvg_row_build_internal(
++ref;
}
- __wt_root_ref_init(&ss->root_ref, page, false);
+ __wt_root_ref_init(session, &ss->root_ref, page, false);
if (0) {
err: __wt_free(session, addr);
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 9b2d15d072f..113b95e6ff9 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -678,8 +678,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
WT_SESSION_BTREE_SYNC(session)) &&
next_ref->state == WT_REF_DELETED &&
__wt_delete_page_skip(session, next_ref, true) &&
- __wt_atomic_casv32(
- &next_ref->state, WT_REF_DELETED, WT_REF_SPLIT))) {
+ WT_REF_CAS_STATE(
+ session, next_ref, WT_REF_DELETED, WT_REF_SPLIT))) {
WT_ERR(__wt_buf_grow(session, scr,
(deleted_entries + 1) * sizeof(uint32_t)));
deleted_refs = scr->mem;
@@ -855,6 +855,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
}
}
+ /* Check that we are not discarding active history. */
+ WT_ASSERT(session, !__wt_page_las_active(session, next_ref));
+
/*
* The page-delete and lookaside memory weren't added to the
* parent's footprint, ignore it here.
diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c
index 9d3719e0a82..77614e9c9e4 100644
--- a/src/third_party/wiredtiger/src/cache/cache_las.c
+++ b/src/third_party/wiredtiger/src/cache/cache_las.c
@@ -455,7 +455,7 @@ __wt_las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
previous_state != WT_REF_LOOKASIDE)
return (false);
- if (!__wt_atomic_casv32(&ref->state, previous_state, WT_REF_LOCKED))
+ if (!WT_REF_CAS_STATE(session, ref, previous_state, WT_REF_LOCKED))
return (false);
skip = __wt_las_page_skip_locked(session, ref);
@@ -863,6 +863,33 @@ __wt_las_remove_block(WT_SESSION_IMPL *session, uint64_t pageid)
}
/*
+ * __wt_las_remove_dropped --
+ * Remove the session's btree ID from the lookaside dropped list, if present.
+ */
+void
+__wt_las_remove_dropped(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ u_int i, j;
+
+ btree = S2BT(session);
+ cache = S2C(session)->cache;
+
+ __wt_spin_lock(session, &cache->las_sweep_lock); /* Held for the scan and the compaction. */
+ for (i = 0; i < cache->las_dropped_next &&
+ cache->las_dropped[i] != btree->id; i++)
+ ; /* Empty body: the loop only searches for the ID. */
+
+ if (i < cache->las_dropped_next) { /* Found at index i. */
+ cache->las_dropped_next--; /* The list shrinks by one entry. */
+ for (j = i; j < cache->las_dropped_next; j++) /* Shift later entries down over the gap. */
+ cache->las_dropped[j] = cache->las_dropped[j + 1];
+ }
+ __wt_spin_unlock(session, &cache->las_sweep_lock);
+}
+
+/*
* __wt_las_save_dropped --
* Save a dropped btree ID to be swept from the lookaside table.
*/
@@ -939,6 +966,19 @@ __las_sweep_init(WT_SESSION_IMPL *session)
goto err;
}
+ /*
+ * Record the current page ID: sweep will stop after this point.
+ *
+ * Since the btree IDs we're scanning are closed, any eviction must
+ * have already completed, so we won't miss anything with this
+ * approach.
+ *
+ * Also, if a tree is reopened and there is lookaside activity before
+ * this sweep completes, it will have a higher page ID and should not
+ * be removed.
+ */
+ cache->las_sweep_max_pageid = cache->las_pageid;
+
/* Scan the btree IDs to find min/max. */
cache->las_sweep_dropmin = UINT32_MAX;
cache->las_sweep_dropmax = 0;
@@ -1035,7 +1075,7 @@ __wt_las_sweep(WT_SESSION_IMPL *session)
* table. Searching for the same key could leave us stuck at
* the end of the table, repeatedly checking the same rows.
*/
- sweep_key->size = 0;
+ __wt_buf_free(session, sweep_key);
} else
ret = __las_sweep_init(session);
if (ret != 0)
@@ -1065,6 +1105,17 @@ __wt_las_sweep(WT_SESSION_IMPL *session)
cnt = 0;
/*
+ * Don't go past the end of lookaside from when sweep started.
+ * If a file is reopened, its ID may be reused past this point
+ * so the bitmap we're using is not valid.
+ */
+ if (las_pageid > cache->las_sweep_max_pageid) {
+ __wt_buf_free(session, sweep_key);
+ ret = WT_NOTFOUND;
+ break;
+ }
+
+ /*
* We only want to break between key blocks. Stop if we've
* processed enough entries either all we wanted or enough
* and there is a reader waiting and we're on a key boundary.
diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c
index a632bb6e068..b9747d1b681 100644
--- a/src/third_party/wiredtiger/src/evict/evict_file.c
+++ b/src/third_party/wiredtiger/src/evict/evict_file.c
@@ -16,7 +16,6 @@ int
__wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
{
WT_BTREE *btree;
- WT_CONNECTION_IMPL *conn;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
WT_PAGE *page;
@@ -25,7 +24,6 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
dhandle = session->dhandle;
btree = dhandle->handle;
- conn = S2C(session);
/*
* We need exclusive access to the file, we're about to discard the root
@@ -41,24 +39,6 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
if (btree->root.page == NULL)
return (0);
- /*
- * If discarding a dead tree, remove any lookaside entries. This deals
- * with the case where a tree is dropped with "force=true". It happens
- * that we also force-drop the lookaside table itself: it can never
- * participate in lookaside eviction, and we can't open a cursor on it
- * as we are discarding it.
- *
- * We use the special page ID zero so that all lookaside entries for
- * the tree are removed.
- */
- if (F_ISSET(dhandle, WT_DHANDLE_DEAD) &&
- F_ISSET(conn, WT_CONN_LOOKASIDE_OPEN) && btree->lookaside_entries) {
- WT_ASSERT(session, !WT_IS_METADATA(dhandle) &&
- !F_ISSET(btree, WT_BTREE_LOOKASIDE));
-
- WT_RET(__wt_las_save_dropped(session));
- }
-
/* Make sure the oldest transaction ID is up-to-date. */
WT_RET(__wt_txn_update_oldest(
session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));
@@ -123,7 +103,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
*/
WT_ASSERT(session,
F_ISSET(dhandle, WT_DHANDLE_DEAD) ||
- F_ISSET(conn, WT_CONN_CLOSING) ||
+ F_ISSET(S2C(session), WT_CONN_CLOSING) ||
__wt_page_can_evict(session, ref, NULL));
__wt_ref_out(session, ref);
break;
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index 1dc54d66382..3001f3d23da 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -2225,8 +2225,8 @@ __evict_get_ref(WT_SESSION_IMPL *session,
*/
if (((previous_state = evict->ref->state) != WT_REF_MEM &&
previous_state != WT_REF_LIMBO) ||
- !__wt_atomic_casv32(
- &evict->ref->state, previous_state, WT_REF_LOCKED)) {
+ !WT_REF_CAS_STATE(
+ session, evict->ref, previous_state, WT_REF_LOCKED)) {
__evict_list_clear(session, evict);
continue;
}
diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c
index 34e138a7a48..e75f0ef1bed 100644
--- a/src/third_party/wiredtiger/src/evict/evict_page.c
+++ b/src/third_party/wiredtiger/src/evict/evict_page.c
@@ -73,7 +73,7 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref)
*/
previous_state = ref->state;
if ((previous_state == WT_REF_MEM || previous_state == WT_REF_LIMBO) &&
- __wt_atomic_casv32(&ref->state, previous_state, WT_REF_LOCKED))
+ WT_REF_CAS_STATE(session, ref, previous_state, WT_REF_LOCKED))
locked = true;
if ((ret = __wt_hazard_clear(session, ref)) != 0 || !locked) {
if (locked)
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index b3af1464568..46f507ebedf 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -240,6 +240,7 @@ struct __wt_page_lookaside {
wt_timestamp_t unstable_timestamp;/* First timestamp not on page */
bool eviction_to_lookaside; /* Revert to lookaside on eviction */
bool has_prepares; /* One or more updates are prepared */
+ bool resolved; /* History has been read into cache */
bool skew_newest; /* Page image has newest versions */
};
@@ -879,6 +880,10 @@ struct __wt_ref {
WT_PAGE_DELETED *page_del; /* Deleted page information */
WT_PAGE_LOOKASIDE *page_las; /* Lookaside information */
+/* A macro wrapper allowing us to remember the caller's code location */
+#define WT_REF_CAS_STATE(session, ref, old_state, new_state) \
+ __wt_ref_cas_state_int((session), (ref), (old_state), (new_state),\
+ __FILE__, __LINE__)
#ifdef HAVE_DIAGNOSTIC
/* Capture history of ref state changes. */
struct __wt_ref_hist {
@@ -889,14 +894,17 @@ struct __wt_ref {
uint32_t state;
} hist[3];
uint64_t histoff;
-#define WT_REF_SET_STATE(ref, s) do { \
+#define WT_REF_SAVE_STATE(ref, s, f, l) do { \
(ref)->hist[(ref)->histoff].session = session; \
(ref)->hist[(ref)->histoff].name = session->name; \
- (ref)->hist[(ref)->histoff].file = __FILE__; \
- (ref)->hist[(ref)->histoff].line = __LINE__; \
+ (ref)->hist[(ref)->histoff].file = (f); \
+ (ref)->hist[(ref)->histoff].line = (l); \
(ref)->hist[(ref)->histoff].state = s; \
(ref)->histoff = \
((ref)->histoff + 1) % WT_ELEMENTS((ref)->hist); \
+} while (0)
+#define WT_REF_SET_STATE(ref, s) do { \
+ WT_REF_SAVE_STATE(ref, s, __FILE__, __LINE__); \
WT_PUBLISH((ref)->state, s); \
} while (0)
#else
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
index 9bd3316fe85..5e0f0521ded 100644
--- a/src/third_party/wiredtiger/src/include/btree.i
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -1190,6 +1190,8 @@ __wt_page_las_active(WT_SESSION_IMPL *session, WT_REF *ref)
if ((page_las = ref->page_las) == NULL)
return (false);
+ if (page_las->resolved)
+ return (false);
if (!page_las->skew_newest || page_las->has_prepares)
return (true);
if (__wt_txn_visible_all(session, page_las->max_txn,
diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h
index c4a276ca4d3..7966d9802b3 100644
--- a/src/third_party/wiredtiger/src/include/cache.h
+++ b/src/third_party/wiredtiger/src/include/cache.h
@@ -211,6 +211,7 @@ struct __wt_cache {
uint32_t las_sweep_dropmin; /* Minimum btree ID in current set. */
uint8_t *las_sweep_dropmap; /* Bitmap of dropped btree IDs. */
uint32_t las_sweep_dropmax; /* Maximum btree ID in current set. */
+ uint64_t las_sweep_max_pageid; /* Maximum page ID for sweep. */
uint32_t *las_dropped; /* List of dropped btree IDs. */
size_t las_dropped_next; /* Next index into drop list. */
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index b12febce98d..aa313fa2caf 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -140,7 +140,7 @@ extern void __wt_free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd);
extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_btree_close(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_btree_discard(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, bool is_recno);
+extern void __wt_root_ref_init(WT_SESSION_IMPL *session, WT_REF *root_ref, WT_PAGE *root, bool is_recno);
extern int __wt_btree_tree_open(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern bool __wt_btree_immediately_durable(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -214,6 +214,7 @@ extern bool __wt_las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUN
extern int __wt_las_insert_block(WT_CURSOR *cursor, WT_BTREE *btree, WT_PAGE *page, WT_MULTI *multi, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_las_cursor_position(WT_CURSOR *cursor, uint64_t pageid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_las_remove_block(WT_SESSION_IMPL *session, uint64_t pageid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern void __wt_las_remove_dropped(WT_SESSION_IMPL *session);
extern int __wt_las_save_dropped(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_las_sweep(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern uint32_t __wt_checksum_sw(const void *chunk, size_t len);
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
index 21d4c195c99..1cd615fa3bd 100644
--- a/src/third_party/wiredtiger/src/include/txn.i
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -14,6 +14,35 @@ typedef enum {
WT_VISIBLE_PREPARE=1, /* Prepared update */
WT_VISIBLE_TRUE=2 /* A visible update */
} WT_VISIBLE_TYPE;
+/*
+ * __wt_ref_cas_state_int --
+ * Try to compare-and-swap ref->state from old_state to new_state and return
+ * whether it succeeded; diagnostic builds also log successes in ref history.
+ */
+static inline bool
+__wt_ref_cas_state_int(WT_SESSION_IMPL *session, WT_REF *ref,
+ uint32_t old_state, uint32_t new_state, const char *file, int line)
+{
+ bool cas_result;
+
+ /* Only WT_REF_SAVE_STATE, in diagnostic builds, uses these parameters. */
+ WT_UNUSED(session);
+ WT_UNUSED(file);
+ WT_UNUSED(line);
+
+ cas_result = __wt_atomic_casv32(&ref->state, old_state, new_state);
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * This history update is not atomic with the CAS above, so it can race:
+ * the state may be updated again after the CAS but before the history
+ * has been written.
+ */
+ if (cas_result)
+ WT_REF_SAVE_STATE(ref, new_state, file, line);
+#endif
+ return (cas_result);
+}
/*
* __wt_txn_timestamp_flags --
@@ -366,9 +395,8 @@ __wt_txn_op_apply_prepare_state(
for (;; __wt_yield()) {
previous_state = ref->state;
WT_ASSERT(session, previous_state != WT_REF_READING);
- if (previous_state != WT_REF_LOCKED &&
- __wt_atomic_casv32(
- &ref->state, previous_state, WT_REF_LOCKED))
+ if (previous_state != WT_REF_LOCKED && WT_REF_CAS_STATE(
+ session, ref, previous_state, WT_REF_LOCKED))
break;
}
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index d617342c6df..87ce7ca1cc3 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -791,7 +791,8 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
/*
* Fake up a reference structure, and write the next root page.
*/
- __wt_root_ref_init(&fake_ref, next, page->type == WT_PAGE_COL_INT);
+ __wt_root_ref_init(session,
+ &fake_ref, next, page->type == WT_PAGE_COL_INT);
return (__wt_reconcile(session, &fake_ref, NULL, flags, NULL));
err: __wt_page_out(session, &next);
@@ -1193,7 +1194,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
wt_timestamp_t timestamp;
size_t upd_memsize;
uint64_t max_txn, txnid;
- bool all_visible, prepared, skipped_birthmark, uncommitted;
+ bool all_visible, prepared, skipped_birthmark, uncommitted, upd_saved;
if (upd_savedp != NULL)
*upd_savedp = false;
@@ -1203,7 +1204,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
first_ts_upd = first_txn_upd = NULL;
upd_memsize = 0;
max_txn = WT_TXN_NONE;
- prepared = skipped_birthmark = uncommitted = false;
+ prepared = skipped_birthmark = uncommitted = upd_saved = false;
/*
* If called with a WT_INSERT item, use its WT_UPDATE list (which must
@@ -1421,6 +1422,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
* unresolved updates, move the entire update list.
*/
WT_RET(__rec_update_save(session, r, ins, ripcip, *updp, upd_memsize));
+ upd_saved = true;
if (upd_savedp != NULL)
*upd_savedp = true;
@@ -1458,18 +1460,15 @@ check_original_value:
/*
* Returning an update means the original on-page value might be lost,
- * and that's a problem if there's a reader that needs it. There are
- * several cases:
- * - any update from a modify operation (because the modify has to be
- * applied to a stable update, not the new on-page update),
- * - any lookaside table eviction (because the backing disk image is
- * rewritten),
- * - or any reconciliation of a backing overflow record that will be
- * physically removed once it's no longer needed.
- */
- if (*updp != NULL && (!WT_UPDATE_DATA_VALUE(*updp) ||
- F_ISSET(r, WT_REC_LOOKASIDE) || (vpack != NULL &&
- vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM)))
+ * and that's a problem if there's a reader that needs it. This call
+ * makes a copy of the on-page value and if there is a birthmark in the
+ * update list, replaces it. We do that any time there are saved
+ * updates and during reconciliation of a backing overflow record that
+ * will be physically removed once it's no longer needed.
+ */
+ if (*updp != NULL && (upd_saved ||
+ (vpack != NULL && vpack->ovfl &&
+ vpack->raw != WT_CELL_VALUE_OVFL_RM)))
WT_RET(
__rec_append_orig_value(session, page, first_upd, vpack));
@@ -1657,8 +1656,8 @@ __rec_child_modify(WT_SESSION_IMPL *session,
* to see if the delete is visible to us. Lock down the
* structure.
*/
- if (!__wt_atomic_casv32(
- &ref->state, WT_REF_DELETED, WT_REF_LOCKED))
+ if (!WT_REF_CAS_STATE(
+ session, ref, WT_REF_DELETED, WT_REF_LOCKED))
break;
ret = __rec_child_deleted(session, r, ref, statep);
WT_REF_SET_STATE(ref, WT_REF_DELETED);
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index 424ebf68445..d2401970064 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -243,18 +243,43 @@ __txn_abort_newer_updates(
* dirty. Otherwise, the history we need could be swept from the
* lookaside table before the page is read because the lookaside sweep
* code has no way to tell that the page image is invalid.
+ *
+ * So, if there is lookaside history for a page, first check whether the
+ * history needs to be rolled back and, if so, make sure that history is
+ * loaded into cache. That is the case when skew_newest is true (so the
+ * disk image potentially contained unstable updates) and the history is
+ * more recent than the rollback timestamp.
+ *
+ * Also, we have separately discarded any lookaside history more recent
+ * than the rollback timestamp. For page_las structures in cache,
+ * reset any future timestamps back to the rollback timestamp. This
+ * allows those structures to be discarded once the rollback timestamp
+ * is stable (crucially for tests, they can be discarded if the
+ * connection is closed right after a rollback_to_stable call).
*/
local_read = false;
read_flags = WT_READ_WONT_NEED;
- if (ref->page_las != NULL && ref->page_las->skew_newest &&
- rollback_timestamp < ref->page_las->unstable_timestamp) {
- /* Make sure get back a page with history, not limbo page */
- WT_ASSERT(session,
- !F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT));
- WT_RET(__wt_page_in(session, ref, read_flags));
- WT_ASSERT(session, ref->state != WT_REF_LIMBO &&
- ref->page != NULL && __wt_page_is_modified(ref->page));
- local_read = true;
+ if (ref->page_las != NULL) {
+ if (ref->page_las->skew_newest && rollback_timestamp <
+ ref->page_las->unstable_timestamp) {
+ /*
+ * Make sure we get back a page with history, not a
+ * limbo page.
+ */
+ WT_ASSERT(session,
+ !F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT));
+ WT_RET(__wt_page_in(session, ref, read_flags));
+ WT_ASSERT(session, ref->state != WT_REF_LIMBO &&
+ ref->page != NULL &&
+ __wt_page_is_modified(ref->page));
+ local_read = true;
+ }
+ if (ref->page_las->max_timestamp > rollback_timestamp)
+ ref->page_las->max_timestamp = rollback_timestamp;
+ if (ref->page_las->unstable_timestamp > rollback_timestamp)
+ ref->page_las->unstable_timestamp = rollback_timestamp;
+ if (ref->page_las->unstable_timestamp > rollback_timestamp)
+ ref->page_las->unstable_timestamp = rollback_timestamp;
}
/* Review deleted page saved to the ref */