diff options
18 files changed, 206 insertions, 83 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index e6bd6967675..43dc53c86e3 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "1d72f206c3078d002666cc6ea042ec9fa1fbe13c", + "commit": "4051e4941c894655cdb7d3dec97a7e32e7defbe6", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-4.0" diff --git a/src/third_party/wiredtiger/src/btree/bt_compact.c b/src/third_party/wiredtiger/src/btree/bt_compact.c index d861276a843..ec9f8b21ec5 100644 --- a/src/third_party/wiredtiger/src/btree/bt_compact.c +++ b/src/third_party/wiredtiger/src/btree/bt_compact.c @@ -238,7 +238,7 @@ __wt_compact_page_skip( * reference an on-page cell, and page eviction can free that memory. * Lock the WT_REF so we can look at its address. */ - if (!__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED)) + if (!WT_REF_CAS_STATE(session, ref, WT_REF_DISK, WT_REF_LOCKED)) return (0); /* diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index 61f1fa0948f..b0fd6a58edf 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -74,7 +74,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) /* If we have a clean page in memory, attempt to evict it. */ previous_state = ref->state; if ((previous_state == WT_REF_MEM || previous_state == WT_REF_LIMBO) && - __wt_atomic_casv32(&ref->state, previous_state, WT_REF_LOCKED)) { + WT_REF_CAS_STATE(session, ref, previous_state, WT_REF_LOCKED)) { if (__wt_page_is_modified(ref->page)) { WT_REF_SET_STATE(ref, previous_state); return (0); @@ -99,7 +99,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) default: return (0); } - if (!__wt_atomic_casv32(&ref->state, previous_state, WT_REF_LOCKED)) + if (!WT_REF_CAS_STATE(session, ref, previous_state, WT_REF_LOCKED)) return (0); /* @@ -190,7 +190,7 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) * If the page is still "deleted", it's as we left it, * reset the state. */ - if (__wt_atomic_casv32(&ref->state, + if (WT_REF_CAS_STATE(session, ref, WT_REF_DELETED, ref->page_del->previous_state)) goto done; break; @@ -201,8 +201,8 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) break; case WT_REF_MEM: case WT_REF_SPLIT: - if (__wt_atomic_casv32( - &ref->state, current_state, WT_REF_LOCKED)) + if (WT_REF_CAS_STATE( + session, ref, current_state, WT_REF_LOCKED)) locked = true; break; case WT_REF_DISK: @@ -279,13 +279,14 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) * the page could switch to an in-memory state at any time. Lock down * the structure, just to be safe. */ - if (ref->page_del == NULL) + if (ref->page_del == NULL && ref->page_las == NULL) return (true); - if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED)) + if (!WT_REF_CAS_STATE(session, ref, WT_REF_DELETED, WT_REF_LOCKED)) return (false); - skip = !__wt_page_del_active(session, ref, visible_all); + skip = !__wt_page_del_active(session, ref, visible_all) && + !__wt_page_las_active(session, ref); /* * The page_del structure can be freed as soon as the delete is stable: @@ -300,7 +301,7 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) __wt_free(session, ref->page_del); } - WT_PUBLISH(ref->state, WT_REF_DELETED); + WT_REF_SET_STATE(ref, WT_REF_DELETED); return (skip); } diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c index 62b50b34acf..91d5862eae6 100644 --- a/src/third_party/wiredtiger/src/btree/bt_handle.c +++ b/src/third_party/wiredtiger/src/btree/bt_handle.c @@ -115,6 +115,12 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) /* Initialize and configure the WT_BTREE structure. */ WT_ERR(__btree_conf(session, &ckpt)); + /* + * We could be a re-open of a table that was put in the lookaside + * dropped list. Remove our id from that list. + */ + __wt_las_remove_dropped(session); + /* Connect to the underlying block manager. */ filename = dhandle->name; if (!WT_PREFIX_SKIP(filename, "file:")) @@ -236,6 +242,16 @@ __wt_btree_close(WT_SESSION_IMPL *session) F_SET(btree, WT_BTREE_CLOSED); /* + * If closing a tree let sweep drop lookaside entries for it. + */ + if (F_ISSET(S2C(session), WT_CONN_LOOKASIDE_OPEN) && + btree->lookaside_entries) { + WT_ASSERT(session, !WT_IS_METADATA(btree->dhandle) && + !F_ISSET(btree, WT_BTREE_LOOKASIDE)); + WT_TRET(__wt_las_save_dropped(session)); + } + + /* * If we turned eviction off and never turned it back on, do that now, * otherwise the counter will be off. */ @@ -533,12 +549,14 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) * Initialize a tree root reference, and link in the root page. */ void -__wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, bool is_recno) +__wt_root_ref_init(WT_SESSION_IMPL *session, + WT_REF *root_ref, WT_PAGE *root, bool is_recno) { + WT_UNUSED(session); /* Used in a macro for diagnostic builds */ memset(root_ref, 0, sizeof(*root_ref)); root_ref->page = root; - root_ref->state = WT_REF_MEM; + WT_REF_SET_STATE(root_ref, WT_REF_MEM); root_ref->ref_recno = is_recno ? 1 : WT_RECNO_OOB; @@ -613,7 +631,8 @@ __wt_btree_tree_open( dsk.mem = NULL; /* Finish initializing the root, root reference links. */ - __wt_root_ref_init(&btree->root, page, btree->type != BTREE_ROW); + __wt_root_ref_init(session, + &btree->root, page, btree->type != BTREE_ROW); err: __wt_buf_free(session, &dsk); __wt_scr_free(session, &tmp); @@ -697,7 +716,8 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, bool creation) } /* Finish initializing the root, root reference links. */ - __wt_root_ref_init(&btree->root, root, btree->type != BTREE_ROW); + __wt_root_ref_init(session, + &btree->root, root, btree->type != BTREE_ROW); return (0); diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 5d6055b969d..8dd918e8011 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -290,6 +290,13 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) } } + /* + * Now the lookaside history has been read into cache there is no + * further need to maintain a reference to it. + */ + ref->page_las->eviction_to_lookaside = false; + ref->page_las->resolved = true; + err: if (locked) __wt_readunlock(session, &cache->las_sweepwalk_lock); WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); @@ -413,7 +420,6 @@ __page_read_lookaside(WT_SESSION_IMPL *session, WT_REF *ref, } WT_RET(__las_page_instantiate(session, ref)); - ref->page_las->eviction_to_lookaside = false; return (0); } @@ -463,7 +469,7 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) default: return (0); } - if (!__wt_atomic_casv32(&ref->state, previous_state, new_state)) + if (!WT_REF_CAS_STATE(session, ref, previous_state, new_state)) return (0); final_state = WT_REF_MEM; @@ -538,10 +544,8 @@ skip_read: * information), first update based on the lookaside table and * then apply the delete. */ - if (ref->page_las != NULL) { + if (ref->page_las != NULL) WT_ERR(__las_page_instantiate(session, ref)); - ref->page_las->eviction_to_lookaside = false; - } /* Move all records to a deleted state. */ WT_ERR(__wt_delete_page_instantiate(session, ref)); diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c index 99a0bf6d323..f8f2552dc0a 100644 --- a/src/third_party/wiredtiger/src/btree/bt_slvg.c +++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c @@ -1208,7 +1208,7 @@ __slvg_col_build_internal( ++ref; } - __wt_root_ref_init(&ss->root_ref, page, true); + __wt_root_ref_init(session, &ss->root_ref, page, true); if (0) { err: __wt_free(session, addr); @@ -1870,7 +1870,7 @@ __slvg_row_build_internal( ++ref; } - __wt_root_ref_init(&ss->root_ref, page, false); + __wt_root_ref_init(session, &ss->root_ref, page, false); if (0) { err: __wt_free(session, addr); diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 9b2d15d072f..113b95e6ff9 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -678,8 +678,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_SESSION_BTREE_SYNC(session)) && next_ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, next_ref, true) && - __wt_atomic_casv32( - &next_ref->state, WT_REF_DELETED, WT_REF_SPLIT))) { + WT_REF_CAS_STATE( + session, next_ref, WT_REF_DELETED, WT_REF_SPLIT))) { WT_ERR(__wt_buf_grow(session, scr, (deleted_entries + 1) * sizeof(uint32_t))); deleted_refs = scr->mem; @@ -855,6 +855,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, } } + /* Check that we are not discarding active history. */ + WT_ASSERT(session, !__wt_page_las_active(session, next_ref)); + /* * The page-delete and lookaside memory weren't added to the * parent's footprint, ignore it here. diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c index 9d3719e0a82..77614e9c9e4 100644 --- a/src/third_party/wiredtiger/src/cache/cache_las.c +++ b/src/third_party/wiredtiger/src/cache/cache_las.c @@ -455,7 +455,7 @@ __wt_las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) previous_state != WT_REF_LOOKASIDE) return (false); - if (!__wt_atomic_casv32(&ref->state, previous_state, WT_REF_LOCKED)) + if (!WT_REF_CAS_STATE(session, ref, previous_state, WT_REF_LOCKED)) return (false); skip = __wt_las_page_skip_locked(session, ref); @@ -863,6 +863,33 @@ __wt_las_remove_block(WT_SESSION_IMPL *session, uint64_t pageid) } /* + * __wt_las_remove_dropped -- + * Remove an opened btree ID if it is in the dropped table. + */ +void +__wt_las_remove_dropped(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_CACHE *cache; + u_int i, j; + + btree = S2BT(session); + cache = S2C(session)->cache; + + __wt_spin_lock(session, &cache->las_sweep_lock); + for (i = 0; i < cache->las_dropped_next && + cache->las_dropped[i] != btree->id; i++) + ; + + if (i < cache->las_dropped_next) { + cache->las_dropped_next--; + for (j = i; j < cache->las_dropped_next; j++) + cache->las_dropped[j] = cache->las_dropped[j + 1]; + } + __wt_spin_unlock(session, &cache->las_sweep_lock); +} + +/* * __wt_las_save_dropped -- * Save a dropped btree ID to be swept from the lookaside table. */ @@ -939,6 +966,19 @@ __las_sweep_init(WT_SESSION_IMPL *session) goto err; } + /* + * Record the current page ID: sweep will stop after this point. + * + * Since the btree IDs we're scanning are closed, any eviction must + * have already completed, so we won't miss anything with this + * approach. + * + * Also, if a tree is reopened and there is lookaside activity before + * this sweep completes, it will have a higher page ID and should not + * be removed. + */ + cache->las_sweep_max_pageid = cache->las_pageid; + /* Scan the btree IDs to find min/max. */ cache->las_sweep_dropmin = UINT32_MAX; cache->las_sweep_dropmax = 0; @@ -1035,7 +1075,7 @@ __wt_las_sweep(WT_SESSION_IMPL *session) * table. Searching for the same key could leave us stuck at * the end of the table, repeatedly checking the same rows. */ - sweep_key->size = 0; + __wt_buf_free(session, sweep_key); } else ret = __las_sweep_init(session); if (ret != 0) @@ -1065,6 +1105,17 @@ __wt_las_sweep(WT_SESSION_IMPL *session) cnt = 0; /* + * Don't go past the end of lookaside from when sweep started. + * If a file is reopened, its ID may be reused past this point + * so the bitmap we're using is not valid. + */ + if (las_pageid > cache->las_sweep_max_pageid) { + __wt_buf_free(session, sweep_key); + ret = WT_NOTFOUND; + break; + } + + /* * We only want to break between key blocks. Stop if we've * processed enough entries either all we wanted or enough * and there is a reader waiting and we're on a key boundary. diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c index a632bb6e068..b9747d1b681 100644 --- a/src/third_party/wiredtiger/src/evict/evict_file.c +++ b/src/third_party/wiredtiger/src/evict/evict_file.c @@ -16,7 +16,6 @@ int __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) { WT_BTREE *btree; - WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; WT_PAGE *page; @@ -25,7 +24,6 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) dhandle = session->dhandle; btree = dhandle->handle; - conn = S2C(session); /* * We need exclusive access to the file, we're about to discard the root @@ -41,24 +39,6 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) if (btree->root.page == NULL) return (0); - /* - * If discarding a dead tree, remove any lookaside entries. This deals - * with the case where a tree is dropped with "force=true". It happens - * that we also force-drop the lookaside table itself: it can never - * participate in lookaside eviction, and we can't open a cursor on it - * as we are discarding it. - * - * We use the special page ID zero so that all lookaside entries for - * the tree are removed. - */ - if (F_ISSET(dhandle, WT_DHANDLE_DEAD) && - F_ISSET(conn, WT_CONN_LOOKASIDE_OPEN) && btree->lookaside_entries) { - WT_ASSERT(session, !WT_IS_METADATA(dhandle) && - !F_ISSET(btree, WT_BTREE_LOOKASIDE)); - - WT_RET(__wt_las_save_dropped(session)); - } - /* Make sure the oldest transaction ID is up-to-date. */ WT_RET(__wt_txn_update_oldest( session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); @@ -123,7 +103,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) */ WT_ASSERT(session, F_ISSET(dhandle, WT_DHANDLE_DEAD) || - F_ISSET(conn, WT_CONN_CLOSING) || + F_ISSET(S2C(session), WT_CONN_CLOSING) || __wt_page_can_evict(session, ref, NULL)); __wt_ref_out(session, ref); break; diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 1dc54d66382..3001f3d23da 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -2225,8 +2225,8 @@ __evict_get_ref(WT_SESSION_IMPL *session, */ if (((previous_state = evict->ref->state) != WT_REF_MEM && previous_state != WT_REF_LIMBO) || - !__wt_atomic_casv32( - &evict->ref->state, previous_state, WT_REF_LOCKED)) { + !WT_REF_CAS_STATE( + session, evict->ref, previous_state, WT_REF_LOCKED)) { __evict_list_clear(session, evict); continue; } diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index 34e138a7a48..e75f0ef1bed 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -73,7 +73,7 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) */ previous_state = ref->state; if ((previous_state == WT_REF_MEM || previous_state == WT_REF_LIMBO) && - __wt_atomic_casv32(&ref->state, previous_state, WT_REF_LOCKED)) + WT_REF_CAS_STATE(session, ref, previous_state, WT_REF_LOCKED)) locked = true; if ((ret = __wt_hazard_clear(session, ref)) != 0 || !locked) { if (locked) diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index b3af1464568..46f507ebedf 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -240,6 +240,7 @@ struct __wt_page_lookaside { wt_timestamp_t unstable_timestamp;/* First timestamp not on page */ bool eviction_to_lookaside; /* Revert to lookaside on eviction */ bool has_prepares; /* One or more updates are prepared */ + bool resolved; /* History has been read into cache */ bool skew_newest; /* Page image has newest versions */ }; @@ -879,6 +880,10 @@ struct __wt_ref { WT_PAGE_DELETED *page_del; /* Deleted page information */ WT_PAGE_LOOKASIDE *page_las; /* Lookaside information */ +/* A macro wrapper allowing us to remember the callers code location */ +#define WT_REF_CAS_STATE(session, ref, old_state, new_state) \ + __wt_ref_cas_state_int((session), (ref), (old_state), (new_state),\ + __FILE__, __LINE__) #ifdef HAVE_DIAGNOSTIC /* Capture history of ref state changes. */ struct __wt_ref_hist { @@ -889,14 +894,17 @@ struct __wt_ref { uint32_t state; } hist[3]; uint64_t histoff; -#define WT_REF_SET_STATE(ref, s) do { \ +#define WT_REF_SAVE_STATE(ref, s, f, l) do { \ (ref)->hist[(ref)->histoff].session = session; \ (ref)->hist[(ref)->histoff].name = session->name; \ - (ref)->hist[(ref)->histoff].file = __FILE__; \ - (ref)->hist[(ref)->histoff].line = __LINE__; \ + (ref)->hist[(ref)->histoff].file = (f); \ + (ref)->hist[(ref)->histoff].line = (l); \ (ref)->hist[(ref)->histoff].state = s; \ (ref)->histoff = \ ((ref)->histoff + 1) % WT_ELEMENTS((ref)->hist); \ +} while (0) +#define WT_REF_SET_STATE(ref, s) do { \ + WT_REF_SAVE_STATE(ref, s, __FILE__, __LINE__); \ WT_PUBLISH((ref)->state, s); \ } while (0) #else diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 9bd3316fe85..5e0f0521ded 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -1190,6 +1190,8 @@ __wt_page_las_active(WT_SESSION_IMPL *session, WT_REF *ref) if ((page_las = ref->page_las) == NULL) return (false); + if (page_las->resolved) + return (false); if (!page_las->skew_newest || page_las->has_prepares) return (true); if (__wt_txn_visible_all(session, page_las->max_txn, diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h index c4a276ca4d3..7966d9802b3 100644 --- a/src/third_party/wiredtiger/src/include/cache.h +++ b/src/third_party/wiredtiger/src/include/cache.h @@ -211,6 +211,7 @@ struct __wt_cache { uint32_t las_sweep_dropmin; /* Minimum btree ID in current set. */ uint8_t *las_sweep_dropmap; /* Bitmap of dropped btree IDs. */ uint32_t las_sweep_dropmax; /* Maximum btree ID in current set. */ + uint64_t las_sweep_max_pageid; /* Maximum page ID for sweep. */ uint32_t *las_dropped; /* List of dropped btree IDs. */ size_t las_dropped_next; /* Next index into drop list. */ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index b12febce98d..aa313fa2caf 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -140,7 +140,7 @@ extern void __wt_free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd); extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btree_close(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btree_discard(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, bool is_recno); +extern void __wt_root_ref_init(WT_SESSION_IMPL *session, WT_REF *root_ref, WT_PAGE *root, bool is_recno); extern int __wt_btree_tree_open(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern bool __wt_btree_immediately_durable(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -214,6 +214,7 @@ extern bool __wt_las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUN extern int __wt_las_insert_block(WT_CURSOR *cursor, WT_BTREE *btree, WT_PAGE *page, WT_MULTI *multi, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_cursor_position(WT_CURSOR *cursor, uint64_t pageid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_remove_block(WT_SESSION_IMPL *session, uint64_t pageid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern void __wt_las_remove_dropped(WT_SESSION_IMPL *session); extern int __wt_las_save_dropped(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_sweep(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern uint32_t __wt_checksum_sw(const void *chunk, size_t len); diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index 21d4c195c99..1cd615fa3bd 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -14,6 +14,35 @@ typedef enum { WT_VISIBLE_PREPARE=1, /* Prepared update */ WT_VISIBLE_TRUE=2 /* A visible update */ } WT_VISIBLE_TYPE; +/* + * __wt_ref_cas_state_int -- + * Try to do a compare and swap, if successful update the ref history in + * diagnostic mode. + */ +static inline bool +__wt_ref_cas_state_int(WT_SESSION_IMPL *session, WT_REF *ref, + uint32_t old_state, uint32_t new_state, const char *file, int line) +{ + bool cas_result; + + /* Parameters that are used in a macro for diagnostic builds */ + WT_UNUSED(session); + WT_UNUSED(file); + WT_UNUSED(line); + + cas_result = __wt_atomic_casv32(&ref->state, old_state, new_state); + +#ifdef HAVE_DIAGNOSTIC + /* + * The history update here has potential to race; if the state gets + * updated again after the CAS above but before the history has been + * updated. + */ + if (cas_result) + WT_REF_SAVE_STATE(ref, new_state, file, line); +#endif + return (cas_result); +} /* * __wt_txn_timestamp_flags -- @@ -366,9 +395,8 @@ __wt_txn_op_apply_prepare_state( for (;; __wt_yield()) { previous_state = ref->state; WT_ASSERT(session, previous_state != WT_REF_READING); - if (previous_state != WT_REF_LOCKED && - __wt_atomic_casv32( - &ref->state, previous_state, WT_REF_LOCKED)) + if (previous_state != WT_REF_LOCKED && WT_REF_CAS_STATE( + session, ref, previous_state, WT_REF_LOCKED)) break; } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index d617342c6df..87ce7ca1cc3 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -791,7 +791,8 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) /* * Fake up a reference structure, and write the next root page. */ - __wt_root_ref_init(&fake_ref, next, page->type == WT_PAGE_COL_INT); + __wt_root_ref_init(session, + &fake_ref, next, page->type == WT_PAGE_COL_INT); return (__wt_reconcile(session, &fake_ref, NULL, flags, NULL)); err: __wt_page_out(session, &next); @@ -1193,7 +1194,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, wt_timestamp_t timestamp; size_t upd_memsize; uint64_t max_txn, txnid; - bool all_visible, prepared, skipped_birthmark, uncommitted; + bool all_visible, prepared, skipped_birthmark, uncommitted, upd_saved; if (upd_savedp != NULL) *upd_savedp = false; @@ -1203,7 +1204,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, first_ts_upd = first_txn_upd = NULL; upd_memsize = 0; max_txn = WT_TXN_NONE; - prepared = skipped_birthmark = uncommitted = false; + prepared = skipped_birthmark = uncommitted = upd_saved = false; /* * If called with a WT_INSERT item, use its WT_UPDATE list (which must @@ -1421,6 +1422,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * unresolved updates, move the entire update list. */ WT_RET(__rec_update_save(session, r, ins, ripcip, *updp, upd_memsize)); + upd_saved = true; if (upd_savedp != NULL) *upd_savedp = true; @@ -1458,18 +1460,15 @@ check_original_value: /* * Returning an update means the original on-page value might be lost, - * and that's a problem if there's a reader that needs it. There are - * several cases: - * - any update from a modify operation (because the modify has to be - * applied to a stable update, not the new on-page update), - * - any lookaside table eviction (because the backing disk image is - * rewritten), - * - or any reconciliation of a backing overflow record that will be - * physically removed once it's no longer needed. - */ - if (*updp != NULL && (!WT_UPDATE_DATA_VALUE(*updp) || - F_ISSET(r, WT_REC_LOOKASIDE) || (vpack != NULL && - vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM))) + * and that's a problem if there's a reader that needs it. This call + * makes a copy of the on-page value and if there is a birthmark in the + * update list, replaces it. We do that any time there are saved + * updates and during reconciliation of a backing overflow record that + * will be physically removed once it's no longer needed. + */ + if (*updp != NULL && (upd_saved || + (vpack != NULL && vpack->ovfl && + vpack->raw != WT_CELL_VALUE_OVFL_RM))) WT_RET( __rec_append_orig_value(session, page, first_upd, vpack)); @@ -1657,8 +1656,8 @@ __rec_child_modify(WT_SESSION_IMPL *session, * to see if the delete is visible to us. Lock down the * structure. */ - if (!__wt_atomic_casv32( - &ref->state, WT_REF_DELETED, WT_REF_LOCKED)) + if (!WT_REF_CAS_STATE( + session, ref, WT_REF_DELETED, WT_REF_LOCKED)) break; ret = __rec_child_deleted(session, r, ref, statep); WT_REF_SET_STATE(ref, WT_REF_DELETED); diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index 424ebf68445..d2401970064 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -243,18 +243,43 @@ __txn_abort_newer_updates( * dirty. Otherwise, the history we need could be swept from the * lookaside table before the page is read because the lookaside sweep * code has no way to tell that the page image is invalid. + * + * So, if there is lookaside history for a page, first check if the + * history needs to be rolled back make sure that history is loaded + * into cache. That is, if skew_newest is true, so the disk image + * potentially contained unstable updates, and the history is more + * recent than the rollback timestamp. + * + * Also, we have separately discarded any lookaside history more recent + * than the rollback timestamp. For page_las structures in cache, + * reset any future timestamps back to the rollback timestamp. This + * allows those structures to be discarded once the rollback timestamp + * is stable (crucially for tests, they can be discarded if the + * connection is closed right after a rollback_to_stable call). */ local_read = false; read_flags = WT_READ_WONT_NEED; - if (ref->page_las != NULL && ref->page_las->skew_newest && - rollback_timestamp < ref->page_las->unstable_timestamp) { - /* Make sure get back a page with history, not limbo page */ - WT_ASSERT(session, - !F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT)); - WT_RET(__wt_page_in(session, ref, read_flags)); - WT_ASSERT(session, ref->state != WT_REF_LIMBO && - ref->page != NULL && __wt_page_is_modified(ref->page)); - local_read = true; + if (ref->page_las != NULL) { + if (ref->page_las->skew_newest && rollback_timestamp < + ref->page_las->unstable_timestamp) { + /* + * Make sure we get back a page with history, not a + * limbo page. + */ + WT_ASSERT(session, + !F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT)); + WT_RET(__wt_page_in(session, ref, read_flags)); + WT_ASSERT(session, ref->state != WT_REF_LIMBO && + ref->page != NULL && + __wt_page_is_modified(ref->page)); + local_read = true; + } + if (ref->page_las->max_timestamp > rollback_timestamp) + ref->page_las->max_timestamp = rollback_timestamp; + if (ref->page_las->unstable_timestamp > rollback_timestamp) + ref->page_las->unstable_timestamp = rollback_timestamp; + if (ref->page_las->unstable_timestamp > rollback_timestamp) + ref->page_las->unstable_timestamp = rollback_timestamp; } /* Review deleted page saved to the ref */ |