diff options
author | David Hows <howsdav@gmail.com> | 2016-08-08 12:06:08 +1000 |
---|---|---|
committer | David Hows <howsdav@gmail.com> | 2016-08-08 12:06:32 +1000 |
commit | 0a0526b04f14a1ad13e1d2942e1e7aabfc715a36 (patch) | |
tree | 7697d97703f9213b1cb387ecc0f5822f9b0618be | |
parent | 61106b8075fda058a5e8aae0141f8377b56bff85 (diff) | |
download | mongo-0a0526b04f14a1ad13e1d2942e1e7aabfc715a36.tar.gz |
Import wiredtiger-wiredtiger-mongodb-3.0.9-19-ga5c67bd.tar.gz from wiredtiger branch mongodb-3.0
ref: 9cfe4e1..a5c67bd
for: 3.0.13
WT-2139 LSM with read-uncommitted isolation, read after free
WT-2313 sweep-server: conn_dhandle.c, 610: dhandle != conn->cache->evict_file_next
WT-2434 Race between force-drop and sweep
WT-2708 split child-update race with reconciliation/eviction
WT-2725 WiredTiger hitting assert trying to free update list in MongoDB 3.0
WT-2733 Backport fixes for races between eviction and dead handle cleanup
WT-2802 Transaction commit causes heap-use-after-free
WT-2804 Don't read values in a tree without a snapshot
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_split.c | 28 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/conn/conn_dhandle.c | 57 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/conn/conn_sweep.c | 16 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/evict/evict_lru.c | 57 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/api.h | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/btree.i | 14 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/txn.i | 11 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/meta/meta_track.c | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/session/session_api.c | 9 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/txn/txn.c | 18 |
10 files changed, 118 insertions, 96 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 7f3620bb361..28fcb60e95d 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -251,7 +251,7 @@ static int __split_ref_deepen_move(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *ref, size_t *parent_decrp, size_t *child_incrp) { - WT_ADDR *addr; + WT_ADDR *addr, *ref_addr; WT_CELL_UNPACK unpack; WT_DECL_RET; WT_IKEY *ikey; @@ -287,13 +287,18 @@ __split_ref_deepen_move(WT_SESSION_IMPL *session, } /* - * If there's no address (the page has never been written), or the - * address has been instantiated, there's no work to do. Otherwise, - * get the address from the on-page cell. + * If there's no address at all (the page has never been written), or + * the address has already been instantiated, there's no work to do. + * Otherwise, the address still references a split page on-page cell, + * instantiate it. We can race with reconciliation and/or eviction of + * the child pages, be cautious: read the address and verify it, and + * only update it if the value is unchanged from the original. In the + * case of a race, the address must no longer reference the split page, + * we're done. */ - addr = ref->addr; - if (addr != NULL && !__wt_off_page(parent, addr)) { - __wt_cell_unpack((WT_CELL *)ref->addr, &unpack); + WT_ORDERED_READ(ref_addr, ref->addr); + if (ref_addr != NULL && !__wt_off_page(parent, ref_addr)) { + __wt_cell_unpack((WT_CELL *)ref_addr, &unpack); WT_RET(__wt_calloc_one(session, &addr)); if ((ret = __wt_strndup( session, unpack.data, unpack.size, &addr->addr)) != 0) { @@ -304,6 +309,10 @@ __split_ref_deepen_move(WT_SESSION_IMPL *session, addr->type = unpack.raw == WT_CELL_ADDR_INT ? 
WT_ADDR_INT : WT_ADDR_LEAF; ref->addr = addr; + if (!__wt_atomic_cas_ptr(&ref->addr, ref_addr, addr)) { + __wt_free(session, addr->addr); + __wt_free(session, addr); + } } /* And finally, the WT_REF itself. */ @@ -502,7 +511,8 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) * array, a thread might see a freed WT_REF. Set the eviction * transaction requirement for the newly created internal pages. */ - child->modify->mod_split_txn = __wt_txn_id_alloc(session, false); + child->modify->mod_split_txn = + __wt_txn_id_alloc(session, false); /* * The newly allocated child's page index references the same @@ -856,7 +866,7 @@ __split_parent_lock( } /* * If a checkpoint is running and we fail to lock the parent - * page, give up immmediately to avoid deadlock. + * page, give up immediately to avoid deadlock. */ if (S2BT(session)->checkpointing) return (EBUSY); diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c index fed0012b211..1bb8e37249f 100644 --- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c +++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c @@ -247,30 +247,6 @@ err: WT_TRET(__wt_rwlock_destroy(session, &dhandle->rwlock)); } /* - * __conn_dhandle_mark_dead -- - * Mark a data handle dead. - */ -static int -__conn_dhandle_mark_dead(WT_SESSION_IMPL *session) -{ - bool evict_reset; - - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); - - /* - * Handle forced discard (e.g., when dropping a file). - * - * We need exclusive access to the file -- disable ordinary - * eviction and drain any blocks already queued. - */ - WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset)); - F_SET(session->dhandle, WT_DHANDLE_DEAD); - if (evict_reset) - __wt_evict_file_exclusive_off(session); - return (0); -} - -/* * __wt_conn_btree_sync_and_close -- * Sync and close the underlying btree handle. 
*/ @@ -280,10 +256,11 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) WT_BTREE *btree; WT_DATA_HANDLE *dhandle; WT_DECL_RET; - bool no_schema_lock; + bool evict_reset, marked_dead, no_schema_lock; dhandle = session->dhandle; btree = S2BT(session); + evict_reset = marked_dead = false; if (!F_ISSET(dhandle, WT_DHANDLE_OPEN)) return (0); @@ -309,6 +286,13 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) __wt_spin_lock(session, &dhandle->close_lock); /* + * Ensure we aren't racing with the eviction server; inside the close + * lock so threads won't race setting/clearing the tree's "no eviction" + * flag. + */ + WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset)); + + /* * The close can fail if an update cannot be written, return the EBUSY * error to our caller for eventual retry. * @@ -319,20 +303,31 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) * invalid if the mapping is closed. */ if (!F_ISSET(btree, - WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) - WT_ERR(force && (btree->bm == NULL || btree->bm->map == NULL) ? - __conn_dhandle_mark_dead(session) : - __wt_checkpoint_close(session, final)); + WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) { + if (force && (btree->bm == NULL || btree->bm->map == NULL)) { + F_SET(session->dhandle, WT_DHANDLE_DEAD); + marked_dead = true; + } + if (!marked_dead || final) + WT_ERR(__wt_checkpoint_close(session, final)); + } WT_TRET(__wt_btree_close(session)); - if (!force || final) { + + /* + * If we marked a handle dead it will be closed by sweep, via + * another call to sync and close. 
+ */ + if (!marked_dead) { F_CLR(dhandle, WT_DHANDLE_OPEN); if (dhandle->checkpoint == NULL) --S2C(session)->open_btree_count; } F_CLR(btree, WT_BTREE_SPECIAL_FLAGS); -err: __wt_spin_unlock(session, &dhandle->close_lock); +err: if (evict_reset) + __wt_evict_file_exclusive_off(session); + __wt_spin_unlock(session, &dhandle->close_lock); if (no_schema_lock) F_CLR(session, WT_SESSION_NO_SCHEMA_LOCK); diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c index 8854f5e0592..a89b2f5ca6f 100644 --- a/src/third_party/wiredtiger/src/conn/conn_sweep.c +++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c @@ -64,11 +64,9 @@ __sweep_expire_one(WT_SESSION_IMPL *session) WT_BTREE *btree; WT_DATA_HANDLE *dhandle; WT_DECL_RET; - bool evict_reset; btree = S2BT(session); dhandle = session->dhandle; - evict_reset = false; /* * Acquire an exclusive lock on the handle and mark it dead. @@ -92,19 +90,13 @@ __sweep_expire_one(WT_SESSION_IMPL *session) !__wt_txn_visible_all(session, btree->rec_max_txn)) goto err; - /* Ensure that we aren't racing with the eviction server */ - WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset)); - /* - * Mark the handle as dead and close the underlying file - * handle. Closing the handle decrements the open file count, - * meaning the close loop won't overrun the configured minimum. + * Mark the handle dead and close the underlying file handle. + * Closing the handle decrements the open file count, meaning the close + * loop won't overrun the configured minimum. */ ret = __wt_conn_btree_sync_and_close(session, false, true); - if (evict_reset) - __wt_evict_file_exclusive_off(session); - err: WT_TRET(__wt_writeunlock(session, dhandle->rwlock)); return (ret); @@ -171,7 +163,7 @@ __sweep_discard_trees(WT_SESSION_IMPL *session, u_int *dead_handlesp) !F_ISSET(dhandle, WT_DHANDLE_DEAD)) continue; - /* If the handle is marked "dead", flush it from cache. 
*/ + /* If the handle is marked dead, flush it from cache. */ WT_WITH_DHANDLE(session, dhandle, ret = __wt_conn_btree_sync_and_close(session, false, false)); diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 47917289503..1314f715907 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -729,31 +729,35 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp) { WT_BTREE *btree; WT_CACHE *cache; + WT_DECL_RET; WT_EVICT_ENTRY *evict; u_int i, elem; btree = S2BT(session); cache = S2C(session)->cache; - /* - * If the file isn't evictable, there's no work to do. - */ - if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) { - *evict_resetp = false; + *evict_resetp = false; + /* If the file was never evictable, there's no work to do. */ + if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) return (0); - } - *evict_resetp = true; /* * Hold the walk lock to set the "no eviction" flag: no new pages from * the file will be queued for eviction after this point. */ __wt_spin_lock(session, &cache->evict_walk_lock); - F_SET(btree, WT_BTREE_NO_EVICTION); + if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) { + F_SET(btree, WT_BTREE_NO_EVICTION); + *evict_resetp = true; + } __wt_spin_unlock(session, &cache->evict_walk_lock); + /* If some other operation has disabled eviction, we're done. */ + if (!*evict_resetp) + return (0); + /* Clear any existing LRU eviction walk for the file. */ - WT_RET(__evict_request_walk_clear(session)); + WT_ERR(__evict_request_walk_clear(session)); /* Hold the evict lock to remove any queued pages from this file. 
*/ __wt_spin_lock(session, &cache->evict_lock); @@ -776,6 +780,10 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp) __wt_yield(); return (0); + +err: F_CLR(btree, WT_BTREE_NO_EVICTION); + *evict_resetp = false; + return (ret); } /* @@ -789,7 +797,14 @@ __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session) btree = S2BT(session); - WT_ASSERT(session, btree->evict_ref == NULL); + /* + * We have seen subtle bugs with multiple threads racing to turn + * eviction on/off. Make races more likely in diagnostic builds. + */ + WT_DIAGNOSTIC_YIELD; + + WT_ASSERT(session, btree->evict_ref == NULL && + F_ISSET(btree, WT_BTREE_NO_EVICTION)); F_CLR(btree, WT_BTREE_NO_EVICTION); } @@ -952,6 +967,7 @@ __evict_walk(WT_SESSION_IMPL *session, uint32_t flags) conn = S2C(session); cache = S2C(session)->cache; + btree = NULL; dhandle = NULL; dhandle_locked = incr = false; retries = 0; @@ -1011,6 +1027,7 @@ retry: while (slot < max_entries && ret == 0) { (void)__wt_atomic_subi32( &dhandle->session_inuse, 1); incr = false; + cache->evict_file_next = NULL; } dhandle = TAILQ_NEXT(dhandle, q); } @@ -1065,6 +1082,9 @@ retry: while (slot < max_entries && ret == 0) { * exclusive access when a handle is being closed. */ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) { + /* Remember the file to visit first, next loop. */ + cache->evict_file_next = dhandle; + WT_WITH_DHANDLE(session, dhandle, ret = __evict_walk_file(session, &slot, flags)); WT_ASSERT(session, session->split_gen == 0); @@ -1084,9 +1104,6 @@ retry: while (slot < max_entries && ret == 0) { } if (incr) { - /* Remember the file we should visit first, next loop. */ - cache->evict_file_next = dhandle; - WT_ASSERT(session, dhandle->session_inuse > 0); (void)__wt_atomic_subi32(&dhandle->session_inuse, 1); incr = false; @@ -1248,19 +1265,6 @@ fast: /* If the page can't be evicted, give up. */ continue; /* - * If the page is clean but has modifications that appear too - * new to evict, skip it. 
- * - * Note: take care with ordering: if we detected that the page - * is modified above, we expect mod != NULL. - */ - mod = page->modify; - if (!modified && mod != NULL && !LF_ISSET( - WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK) && - !__wt_txn_visible_all(session, mod->rec_max_txn)) - continue; - - /* * If the oldest transaction hasn't changed since the last time * this page was written, it's unlikely that we can make * progress. Similarly, if the most recent update on the page @@ -1273,6 +1277,7 @@ fast: /* If the page can't be evicted, give up. */ * running last time we wrote the page has since rolled back, * or we can help get the checkpoint completed sooner. */ + mod = page->modify; if (modified && !LF_ISSET( WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK) && (mod->disk_snap_min == S2C(session)->txn_global.oldest_id || diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h index 4514f5a42fa..86e560ced78 100644 --- a/src/third_party/wiredtiger/src/include/api.h +++ b/src/third_party/wiredtiger/src/include/api.h @@ -66,6 +66,8 @@ else if (ret == 0 && !F_ISSET(&(s)->txn, WT_TXN_ERROR)) \ ret = __wt_txn_commit((s), NULL); \ else { \ + if (retry) \ + WT_TRET(__wt_session_copy_values(s)); \ WT_TRET(__wt_txn_rollback((s), NULL)); \ if ((ret == 0 || ret == WT_ROLLBACK) && \ (retry)) { \ diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 6dff3156d09..c58abd356cf 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -1073,6 +1073,7 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_BTREE *btree; WT_PAGE_MODIFY *mod; WT_TXN_GLOBAL *txn_global; + bool modified; if (inmem_splitp != NULL) *inmem_splitp = false; @@ -1109,6 +1110,8 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, !__wt_txn_visible_all(session, mod->mod_split_txn)) return (false); + modified = 
__wt_page_is_modified(page); + /* * If the file is being checkpointed, we can't evict dirty pages: * if we write a page and free the previous version of the page, that @@ -1116,8 +1119,7 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, * been written in the checkpoint, leaving the checkpoint inconsistent. */ if (btree->checkpointing && - (__wt_page_is_modified(page) || - mod->rec_result == WT_PM_REC_MULTIBLOCK)) { + (modified || mod->rec_result == WT_PM_REC_MULTIBLOCK)) { WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint); WT_STAT_FAST_DATA_INCR(session, cache_eviction_checkpoint); return (false); @@ -1133,6 +1135,14 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, return (false); } + /* + * If the page is clean, check if it has an update that is too new to + * evict. + */ + if (!modified && mod != NULL && + !__wt_txn_visible_all(session, mod->rec_max_txn)) + return (false); + return (true); } diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index d7d958e801e..4cf6653d988 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -457,13 +457,10 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session) * further forward, so that once a read-uncommitted cursor is * positioned on a value, it can't be freed. 
*/ - if (txn->isolation == WT_ISO_READ_UNCOMMITTED && - !F_ISSET(txn, WT_TXN_HAS_ID) && - WT_TXNID_LT(txn_state->snap_min, txn_global->last_running)) - txn_state->snap_min = txn_global->last_running; - - if (txn->isolation != WT_ISO_READ_UNCOMMITTED && - !F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) + if (txn->isolation == WT_ISO_READ_UNCOMMITTED) { + if (txn_state->snap_min == WT_TXN_NONE) + txn_state->snap_min = txn_global->last_running; + } else if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) __wt_txn_get_snapshot(session); } diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c index b8d7aa930ea..6c8296a9bdd 100644 --- a/src/third_party/wiredtiger/src/meta/meta_track.c +++ b/src/third_party/wiredtiger/src/meta/meta_track.c @@ -487,7 +487,7 @@ __wt_meta_track_handle_lock(WT_SESSION_IMPL *session, bool created) /* * __wt_meta_track_init -- - * Intialize metadata tracking. + * Initialize metadata tracking. */ int __wt_meta_track_init(WT_SESSION_IMPL *session) diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index f71ff8ea71c..dffd832672e 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -43,6 +43,15 @@ __wt_session_copy_values(WT_SESSION_IMPL *session) TAILQ_FOREACH(cursor, &session->cursors, q) if (F_ISSET(cursor, WT_CURSTD_VALUE_INT)) { +#ifdef HAVE_DIAGNOSTIC + /* + * We have to do this with a transaction ID pinned + * unless the cursor is reading from a checkpoint. 
+ */ + WT_TXN_STATE *txn_state = WT_SESSION_TXN_STATE(session); + WT_ASSERT(session, txn_state->snap_min != WT_TXN_NONE); +#endif + F_CLR(cursor, WT_CURSTD_VALUE_INT); WT_RET(__wt_buf_set(session, &cursor->value, cursor->value.data, cursor->value.size)); diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index f9af9589172..de67ef4bdd4 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -408,6 +408,16 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) WT_TRET(txn->notify->notify(txn->notify, (WT_SESSION *)session, txn->id, 1)); + /* + * We are about to release the snapshot: copy values into any + * positioned cursors so they don't point to updates that could be + * freed once we don't have a snapshot. + */ + if (session->ncursors > 0) { + WT_DIAGNOSTIC_YIELD; + WT_RET(__wt_session_copy_values(session)); + } + /* If we are logging, write a commit log record. */ if (ret == 0 && txn->mod_count > 0 && FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED) && @@ -437,14 +447,6 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) __wt_txn_op_free(session, op); txn->mod_count = 0; - /* - * We are about to release the snapshot: copy values into any - * positioned cursors so they don't point to updates that could be - * freed once we don't have a transaction ID pinned. - */ - if (session->ncursors > 0) - WT_RET(__wt_session_copy_values(session)); - __wt_txn_release(session); return (0); } |