summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: David Hows <howsdav@gmail.com> 2016-08-08 12:06:08 +1000
committer: David Hows <howsdav@gmail.com> 2016-08-08 12:06:32 +1000
commit: 0a0526b04f14a1ad13e1d2942e1e7aabfc715a36 (patch)
tree: 7697d97703f9213b1cb387ecc0f5822f9b0618be
parent: 61106b8075fda058a5e8aae0141f8377b56bff85 (diff)
download: mongo-0a0526b04f14a1ad13e1d2942e1e7aabfc715a36.tar.gz
Import wiredtiger-wiredtiger-mongodb-3.0.9-19-ga5c67bd.tar.gz from wiredtiger branch mongodb-3.0
ref: 9cfe4e1..a5c67bd
for: 3.0.13

WT-2139 LSM with read-uncommitted isolation, read after free
WT-2313 sweep-server: conn_dhandle.c, 610: dhandle != conn->cache->evict_file_next
WT-2434 Race between force-drop and sweep
WT-2708 split child-update race with reconciliation/eviction
WT-2725 WiredTiger hitting assert trying to free update list in MongoDB 3.0
WT-2733 Backport fixes for races between eviction and dead handle cleanup
WT-2802 Transaction commit causes heap-use-after free
WT-2804 Don't read values in a tree without a snapshot
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_split.c | 28
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_dhandle.c | 57
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_sweep.c | 16
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_lru.c | 57
-rw-r--r-- src/third_party/wiredtiger/src/include/api.h | 2
-rw-r--r-- src/third_party/wiredtiger/src/include/btree.i | 14
-rw-r--r-- src/third_party/wiredtiger/src/include/txn.i | 11
-rw-r--r-- src/third_party/wiredtiger/src/meta/meta_track.c | 2
-rw-r--r-- src/third_party/wiredtiger/src/session/session_api.c | 9
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn.c | 18
10 files changed, 118 insertions, 96 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 7f3620bb361..28fcb60e95d 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -251,7 +251,7 @@ static int
__split_ref_deepen_move(WT_SESSION_IMPL *session,
WT_PAGE *parent, WT_REF *ref, size_t *parent_decrp, size_t *child_incrp)
{
- WT_ADDR *addr;
+ WT_ADDR *addr, *ref_addr;
WT_CELL_UNPACK unpack;
WT_DECL_RET;
WT_IKEY *ikey;
@@ -287,13 +287,18 @@ __split_ref_deepen_move(WT_SESSION_IMPL *session,
}
/*
- * If there's no address (the page has never been written), or the
- * address has been instantiated, there's no work to do. Otherwise,
- * get the address from the on-page cell.
+ * If there's no address at all (the page has never been written), or
+ * the address has already been instantiated, there's no work to do.
+ * Otherwise, the address still references a split page on-page cell,
+ * instantiate it. We can race with reconciliation and/or eviction of
+ * the child pages, be cautious: read the address and verify it, and
+ * only update it if the value is unchanged from the original. In the
+ * case of a race, the address must no longer reference the split page,
+ * we're done.
*/
- addr = ref->addr;
- if (addr != NULL && !__wt_off_page(parent, addr)) {
- __wt_cell_unpack((WT_CELL *)ref->addr, &unpack);
+ WT_ORDERED_READ(ref_addr, ref->addr);
+ if (ref_addr != NULL && !__wt_off_page(parent, ref_addr)) {
+ __wt_cell_unpack((WT_CELL *)ref_addr, &unpack);
WT_RET(__wt_calloc_one(session, &addr));
if ((ret = __wt_strndup(
session, unpack.data, unpack.size, &addr->addr)) != 0) {
@@ -304,6 +309,10 @@ __split_ref_deepen_move(WT_SESSION_IMPL *session,
addr->type =
unpack.raw == WT_CELL_ADDR_INT ? WT_ADDR_INT : WT_ADDR_LEAF;
ref->addr = addr;
+ if (!__wt_atomic_cas_ptr(&ref->addr, ref_addr, addr)) {
+ __wt_free(session, addr->addr);
+ __wt_free(session, addr);
+ }
}
/* And finally, the WT_REF itself. */
@@ -502,7 +511,8 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
* array, a thread might see a freed WT_REF. Set the eviction
* transaction requirement for the newly created internal pages.
*/
- child->modify->mod_split_txn = __wt_txn_id_alloc(session, false);
+ child->modify->mod_split_txn =
+ __wt_txn_id_alloc(session, false);
/*
* The newly allocated child's page index references the same
@@ -856,7 +866,7 @@ __split_parent_lock(
}
/*
* If a checkpoint is running and we fail to lock the parent
- * page, give up immmediately to avoid deadlock.
+ * page, give up immediately to avoid deadlock.
*/
if (S2BT(session)->checkpointing)
return (EBUSY);
diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
index fed0012b211..1bb8e37249f 100644
--- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c
+++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
@@ -247,30 +247,6 @@ err: WT_TRET(__wt_rwlock_destroy(session, &dhandle->rwlock));
}
/*
- * __conn_dhandle_mark_dead --
- * Mark a data handle dead.
- */
-static int
-__conn_dhandle_mark_dead(WT_SESSION_IMPL *session)
-{
- bool evict_reset;
-
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
-
- /*
- * Handle forced discard (e.g., when dropping a file).
- *
- * We need exclusive access to the file -- disable ordinary
- * eviction and drain any blocks already queued.
- */
- WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset));
- F_SET(session->dhandle, WT_DHANDLE_DEAD);
- if (evict_reset)
- __wt_evict_file_exclusive_off(session);
- return (0);
-}
-
-/*
* __wt_conn_btree_sync_and_close --
* Sync and close the underlying btree handle.
*/
@@ -280,10 +256,11 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force)
WT_BTREE *btree;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
- bool no_schema_lock;
+ bool evict_reset, marked_dead, no_schema_lock;
dhandle = session->dhandle;
btree = S2BT(session);
+ evict_reset = marked_dead = false;
if (!F_ISSET(dhandle, WT_DHANDLE_OPEN))
return (0);
@@ -309,6 +286,13 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force)
__wt_spin_lock(session, &dhandle->close_lock);
/*
+ * Ensure we aren't racing with the eviction server; inside the close
+ * lock so threads won't race setting/clearing the tree's "no eviction"
+ * flag.
+ */
+ WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
+
+ /*
* The close can fail if an update cannot be written, return the EBUSY
* error to our caller for eventual retry.
*
@@ -319,20 +303,31 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force)
* invalid if the mapping is closed.
*/
if (!F_ISSET(btree,
- WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
- WT_ERR(force && (btree->bm == NULL || btree->bm->map == NULL) ?
- __conn_dhandle_mark_dead(session) :
- __wt_checkpoint_close(session, final));
+ WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) {
+ if (force && (btree->bm == NULL || btree->bm->map == NULL)) {
+ F_SET(session->dhandle, WT_DHANDLE_DEAD);
+ marked_dead = true;
+ }
+ if (!marked_dead || final)
+ WT_ERR(__wt_checkpoint_close(session, final));
+ }
WT_TRET(__wt_btree_close(session));
- if (!force || final) {
+
+ /*
+ * If we marked a handle dead it will be closed by sweep, via
+ * another call to sync and close.
+ */
+ if (!marked_dead) {
F_CLR(dhandle, WT_DHANDLE_OPEN);
if (dhandle->checkpoint == NULL)
--S2C(session)->open_btree_count;
}
F_CLR(btree, WT_BTREE_SPECIAL_FLAGS);
-err: __wt_spin_unlock(session, &dhandle->close_lock);
+err: if (evict_reset)
+ __wt_evict_file_exclusive_off(session);
+ __wt_spin_unlock(session, &dhandle->close_lock);
if (no_schema_lock)
F_CLR(session, WT_SESSION_NO_SCHEMA_LOCK);
diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c
index 8854f5e0592..a89b2f5ca6f 100644
--- a/src/third_party/wiredtiger/src/conn/conn_sweep.c
+++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c
@@ -64,11 +64,9 @@ __sweep_expire_one(WT_SESSION_IMPL *session)
WT_BTREE *btree;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
- bool evict_reset;
btree = S2BT(session);
dhandle = session->dhandle;
- evict_reset = false;
/*
* Acquire an exclusive lock on the handle and mark it dead.
@@ -92,19 +90,13 @@ __sweep_expire_one(WT_SESSION_IMPL *session)
!__wt_txn_visible_all(session, btree->rec_max_txn))
goto err;
- /* Ensure that we aren't racing with the eviction server */
- WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
-
/*
- * Mark the handle as dead and close the underlying file
- * handle. Closing the handle decrements the open file count,
- * meaning the close loop won't overrun the configured minimum.
+ * Mark the handle dead and close the underlying file handle.
+ * Closing the handle decrements the open file count, meaning the close
+ * loop won't overrun the configured minimum.
*/
ret = __wt_conn_btree_sync_and_close(session, false, true);
- if (evict_reset)
- __wt_evict_file_exclusive_off(session);
-
err: WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
return (ret);
@@ -171,7 +163,7 @@ __sweep_discard_trees(WT_SESSION_IMPL *session, u_int *dead_handlesp)
!F_ISSET(dhandle, WT_DHANDLE_DEAD))
continue;
- /* If the handle is marked "dead", flush it from cache. */
+ /* If the handle is marked dead, flush it from cache. */
WT_WITH_DHANDLE(session, dhandle, ret =
__wt_conn_btree_sync_and_close(session, false, false));
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index 47917289503..1314f715907 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -729,31 +729,35 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp)
{
WT_BTREE *btree;
WT_CACHE *cache;
+ WT_DECL_RET;
WT_EVICT_ENTRY *evict;
u_int i, elem;
btree = S2BT(session);
cache = S2C(session)->cache;
- /*
- * If the file isn't evictable, there's no work to do.
- */
- if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) {
- *evict_resetp = false;
+ *evict_resetp = false;
+ /* If the file was never evictable, there's no work to do. */
+ if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
return (0);
- }
- *evict_resetp = true;
/*
* Hold the walk lock to set the "no eviction" flag: no new pages from
* the file will be queued for eviction after this point.
*/
__wt_spin_lock(session, &cache->evict_walk_lock);
- F_SET(btree, WT_BTREE_NO_EVICTION);
+ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) {
+ F_SET(btree, WT_BTREE_NO_EVICTION);
+ *evict_resetp = true;
+ }
__wt_spin_unlock(session, &cache->evict_walk_lock);
+ /* If some other operation has disabled eviction, we're done. */
+ if (!*evict_resetp)
+ return (0);
+
/* Clear any existing LRU eviction walk for the file. */
- WT_RET(__evict_request_walk_clear(session));
+ WT_ERR(__evict_request_walk_clear(session));
/* Hold the evict lock to remove any queued pages from this file. */
__wt_spin_lock(session, &cache->evict_lock);
@@ -776,6 +780,10 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp)
__wt_yield();
return (0);
+
+err: F_CLR(btree, WT_BTREE_NO_EVICTION);
+ *evict_resetp = false;
+ return (ret);
}
/*
@@ -789,7 +797,14 @@ __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session)
btree = S2BT(session);
- WT_ASSERT(session, btree->evict_ref == NULL);
+ /*
+ * We have seen subtle bugs with multiple threads racing to turn
+ * eviction on/off. Make races more likely in diagnostic builds.
+ */
+ WT_DIAGNOSTIC_YIELD;
+
+ WT_ASSERT(session, btree->evict_ref == NULL &&
+ F_ISSET(btree, WT_BTREE_NO_EVICTION));
F_CLR(btree, WT_BTREE_NO_EVICTION);
}
@@ -952,6 +967,7 @@ __evict_walk(WT_SESSION_IMPL *session, uint32_t flags)
conn = S2C(session);
cache = S2C(session)->cache;
+ btree = NULL;
dhandle = NULL;
dhandle_locked = incr = false;
retries = 0;
@@ -1011,6 +1027,7 @@ retry: while (slot < max_entries && ret == 0) {
(void)__wt_atomic_subi32(
&dhandle->session_inuse, 1);
incr = false;
+ cache->evict_file_next = NULL;
}
dhandle = TAILQ_NEXT(dhandle, q);
}
@@ -1065,6 +1082,9 @@ retry: while (slot < max_entries && ret == 0) {
* exclusive access when a handle is being closed.
*/
if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) {
+ /* Remember the file to visit first, next loop. */
+ cache->evict_file_next = dhandle;
+
WT_WITH_DHANDLE(session, dhandle,
ret = __evict_walk_file(session, &slot, flags));
WT_ASSERT(session, session->split_gen == 0);
@@ -1084,9 +1104,6 @@ retry: while (slot < max_entries && ret == 0) {
}
if (incr) {
- /* Remember the file we should visit first, next loop. */
- cache->evict_file_next = dhandle;
-
WT_ASSERT(session, dhandle->session_inuse > 0);
(void)__wt_atomic_subi32(&dhandle->session_inuse, 1);
incr = false;
@@ -1248,19 +1265,6 @@ fast: /* If the page can't be evicted, give up. */
continue;
/*
- * If the page is clean but has modifications that appear too
- * new to evict, skip it.
- *
- * Note: take care with ordering: if we detected that the page
- * is modified above, we expect mod != NULL.
- */
- mod = page->modify;
- if (!modified && mod != NULL && !LF_ISSET(
- WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK) &&
- !__wt_txn_visible_all(session, mod->rec_max_txn))
- continue;
-
- /*
* If the oldest transaction hasn't changed since the last time
* this page was written, it's unlikely that we can make
* progress. Similarly, if the most recent update on the page
@@ -1273,6 +1277,7 @@ fast: /* If the page can't be evicted, give up. */
* running last time we wrote the page has since rolled back,
* or we can help get the checkpoint completed sooner.
*/
+ mod = page->modify;
if (modified && !LF_ISSET(
WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK) &&
(mod->disk_snap_min == S2C(session)->txn_global.oldest_id ||
diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h
index 4514f5a42fa..86e560ced78 100644
--- a/src/third_party/wiredtiger/src/include/api.h
+++ b/src/third_party/wiredtiger/src/include/api.h
@@ -66,6 +66,8 @@
else if (ret == 0 && !F_ISSET(&(s)->txn, WT_TXN_ERROR)) \
ret = __wt_txn_commit((s), NULL); \
else { \
+ if (retry) \
+ WT_TRET(__wt_session_copy_values(s)); \
WT_TRET(__wt_txn_rollback((s), NULL)); \
if ((ret == 0 || ret == WT_ROLLBACK) && \
(retry)) { \
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
index 6dff3156d09..c58abd356cf 100644
--- a/src/third_party/wiredtiger/src/include/btree.i
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -1073,6 +1073,7 @@ __wt_page_can_evict(WT_SESSION_IMPL *session,
WT_BTREE *btree;
WT_PAGE_MODIFY *mod;
WT_TXN_GLOBAL *txn_global;
+ bool modified;
if (inmem_splitp != NULL)
*inmem_splitp = false;
@@ -1109,6 +1110,8 @@ __wt_page_can_evict(WT_SESSION_IMPL *session,
!__wt_txn_visible_all(session, mod->mod_split_txn))
return (false);
+ modified = __wt_page_is_modified(page);
+
/*
* If the file is being checkpointed, we can't evict dirty pages:
* if we write a page and free the previous version of the page, that
@@ -1116,8 +1119,7 @@ __wt_page_can_evict(WT_SESSION_IMPL *session,
* been written in the checkpoint, leaving the checkpoint inconsistent.
*/
if (btree->checkpointing &&
- (__wt_page_is_modified(page) ||
- mod->rec_result == WT_PM_REC_MULTIBLOCK)) {
+ (modified || mod->rec_result == WT_PM_REC_MULTIBLOCK)) {
WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_checkpoint);
return (false);
@@ -1133,6 +1135,14 @@ __wt_page_can_evict(WT_SESSION_IMPL *session,
return (false);
}
+ /*
+ * If the page is clean, check if it has an update that is too new to
+ * evict.
+ */
+ if (!modified && mod != NULL &&
+ !__wt_txn_visible_all(session, mod->rec_max_txn))
+ return (false);
+
return (true);
}
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
index d7d958e801e..4cf6653d988 100644
--- a/src/third_party/wiredtiger/src/include/txn.i
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -457,13 +457,10 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session)
* further forward, so that once a read-uncommitted cursor is
* positioned on a value, it can't be freed.
*/
- if (txn->isolation == WT_ISO_READ_UNCOMMITTED &&
- !F_ISSET(txn, WT_TXN_HAS_ID) &&
- WT_TXNID_LT(txn_state->snap_min, txn_global->last_running))
- txn_state->snap_min = txn_global->last_running;
-
- if (txn->isolation != WT_ISO_READ_UNCOMMITTED &&
- !F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))
+ if (txn->isolation == WT_ISO_READ_UNCOMMITTED) {
+ if (txn_state->snap_min == WT_TXN_NONE)
+ txn_state->snap_min = txn_global->last_running;
+ } else if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))
__wt_txn_get_snapshot(session);
}
diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c
index b8d7aa930ea..6c8296a9bdd 100644
--- a/src/third_party/wiredtiger/src/meta/meta_track.c
+++ b/src/third_party/wiredtiger/src/meta/meta_track.c
@@ -487,7 +487,7 @@ __wt_meta_track_handle_lock(WT_SESSION_IMPL *session, bool created)
/*
* __wt_meta_track_init --
- * Intialize metadata tracking.
+ * Initialize metadata tracking.
*/
int
__wt_meta_track_init(WT_SESSION_IMPL *session)
diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c
index f71ff8ea71c..dffd832672e 100644
--- a/src/third_party/wiredtiger/src/session/session_api.c
+++ b/src/third_party/wiredtiger/src/session/session_api.c
@@ -43,6 +43,15 @@ __wt_session_copy_values(WT_SESSION_IMPL *session)
TAILQ_FOREACH(cursor, &session->cursors, q)
if (F_ISSET(cursor, WT_CURSTD_VALUE_INT)) {
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * We have to do this with a transaction ID pinned
+ * unless the cursor is reading from a checkpoint.
+ */
+ WT_TXN_STATE *txn_state = WT_SESSION_TXN_STATE(session);
+ WT_ASSERT(session, txn_state->snap_min != WT_TXN_NONE);
+#endif
+
F_CLR(cursor, WT_CURSTD_VALUE_INT);
WT_RET(__wt_buf_set(session, &cursor->value,
cursor->value.data, cursor->value.size));
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index f9af9589172..de67ef4bdd4 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -408,6 +408,16 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
WT_TRET(txn->notify->notify(txn->notify,
(WT_SESSION *)session, txn->id, 1));
+ /*
+ * We are about to release the snapshot: copy values into any
+ * positioned cursors so they don't point to updates that could be
+ * freed once we don't have a snapshot.
+ */
+ if (session->ncursors > 0) {
+ WT_DIAGNOSTIC_YIELD;
+ WT_RET(__wt_session_copy_values(session));
+ }
+
/* If we are logging, write a commit log record. */
if (ret == 0 && txn->mod_count > 0 &&
FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED) &&
@@ -437,14 +447,6 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
__wt_txn_op_free(session, op);
txn->mod_count = 0;
- /*
- * We are about to release the snapshot: copy values into any
- * positioned cursors so they don't point to updates that could be
- * freed once we don't have a transaction ID pinned.
- */
- if (session->ncursors > 0)
- WT_RET(__wt_session_copy_values(session));
-
__wt_txn_release(session);
return (0);
}