summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src
diff options
context:
space:
mode:
authorAlexander Gorrod <alexander.gorrod@mongodb.com>2015-05-08 05:34:43 +0000
committerAlexander Gorrod <alexander.gorrod@mongodb.com>2015-05-08 05:34:43 +0000
commitcddb4b8f5a193e32d1400963cd177fe47c8570df (patch)
treedade044e688956af69505fc796244f229d40241c /src/third_party/wiredtiger/src
parent3a8bcdfb23ab85b57e5da308fb4d3c607ce77a49 (diff)
downloadmongo-cddb4b8f5a193e32d1400963cd177fe47c8570df.tar.gz
Import wiredtiger-wiredtiger-2.5.3-486-g4f9aa1c.tar.gz from wiredtiger branch mongodb-3.2
Diffstat (limited to 'src/third_party/wiredtiger/src')
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_compact.c2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_cursor.c7
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_debug.c11
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_discard.c3
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_handle.c7
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_page.c11
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_slvg.c6
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_split.c88
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_sync.c5
-rw-r--r--src/third_party/wiredtiger/src/btree/row_modify.c14
-rw-r--r--src/third_party/wiredtiger/src/config/config_api.c3
-rw-r--r--src/third_party/wiredtiger/src/config/config_concat.c72
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_api.c3
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_cache.c15
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_cache_pool.c4
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_dhandle.c381
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_log.c4
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_open.c2
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_stat.c2
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_sweep.c4
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_backup.c2
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_file.c12
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_log.c2
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_file.c6
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_lru.c9
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_page.c94
-rw-r--r--src/third_party/wiredtiger/src/include/btmem.h21
-rw-r--r--src/third_party/wiredtiger/src/include/btree.i106
-rw-r--r--src/third_party/wiredtiger/src/include/cache.i4
-rw-r--r--src/third_party/wiredtiger/src/include/connection.h7
-rw-r--r--src/third_party/wiredtiger/src/include/dhandle.h5
-rw-r--r--src/third_party/wiredtiger/src/include/extern.h25
-rw-r--r--src/third_party/wiredtiger/src/include/flags.h28
-rw-r--r--src/third_party/wiredtiger/src/include/hardware.h2
-rw-r--r--src/third_party/wiredtiger/src/include/log.h2
-rw-r--r--src/third_party/wiredtiger/src/include/lsm.h17
-rw-r--r--src/third_party/wiredtiger/src/include/misc.i11
-rw-r--r--src/third_party/wiredtiger/src/include/schema.h66
-rw-r--r--src/third_party/wiredtiger/src/include/serial.i9
-rw-r--r--src/third_party/wiredtiger/src/include/txn.h3
-rw-r--r--src/third_party/wiredtiger/src/include/txn.i29
-rw-r--r--src/third_party/wiredtiger/src/include/wiredtiger.in7
-rw-r--r--src/third_party/wiredtiger/src/log/log.c10
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_cursor.c106
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c116
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_manager.c6
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_stat.c2
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_tree.c35
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_work_unit.c2
-rw-r--r--src/third_party/wiredtiger/src/meta/meta_table.c7
-rw-r--r--src/third_party/wiredtiger/src/meta/meta_track.c6
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_alloc.c17
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_write.c40
-rw-r--r--src/third_party/wiredtiger/src/schema/schema_drop.c4
-rw-r--r--src/third_party/wiredtiger/src/schema/schema_open.c4
-rw-r--r--src/third_party/wiredtiger/src/schema/schema_rename.c2
-rw-r--r--src/third_party/wiredtiger/src/schema/schema_truncate.c2
-rw-r--r--src/third_party/wiredtiger/src/schema/schema_worker.c10
-rw-r--r--src/third_party/wiredtiger/src/session/session_api.c80
-rw-r--r--src/third_party/wiredtiger/src/session/session_dhandle.c369
-rw-r--r--src/third_party/wiredtiger/src/support/mutex.c255
-rw-r--r--src/third_party/wiredtiger/src/txn/txn.c205
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_ckpt.c18
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_recover.c4
64 files changed, 1170 insertions, 1241 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_compact.c b/src/third_party/wiredtiger/src/btree/bt_compact.c
index 1528d65b8c8..4709ac3260e 100644
--- a/src/third_party/wiredtiger/src/btree/bt_compact.c
+++ b/src/third_party/wiredtiger/src/btree/bt_compact.c
@@ -110,7 +110,7 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
*
* We're holding the schema lock which serializes with checkpoints.
*/
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA));
/*
* Get the tree handle's flush lock which blocks threads writing leaf
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index 7c894effacd..0aed5940533 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -841,8 +841,11 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
WT_ERR(ret);
if (__cursor_valid(cbt, &upd))
WT_ERR(__wt_kv_return(session, cbt, upd));
- else
- WT_ERR(__wt_btcur_search_near(cbt, NULL));
+ else {
+ if ((ret = __wt_btcur_next(cbt, 0)) == WT_NOTFOUND)
+ ret = __wt_btcur_prev(cbt, 0);
+ WT_ERR(ret);
+ }
err: if (ret != 0)
WT_TRET(__cursor_reset(cbt));
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
index 9cc7cd2a824..b1fa5ce6178 100644
--- a/src/third_party/wiredtiger/src/btree/bt_debug.c
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -640,10 +640,14 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page)
__dmsg(ds, ", disk-mapped");
if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
__dmsg(ds, ", evict-lru");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_REFUSE_DEEPEN))
+ __dmsg(ds, ", refuse-deepen");
if (F_ISSET_ATOMIC(page, WT_PAGE_SCANNING))
__dmsg(ds, ", scanning");
- if (F_ISSET_ATOMIC(page, WT_PAGE_SPLITTING))
- __dmsg(ds, ", splitting");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT))
+ __dmsg(ds, ", split-insert");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_LOCKED))
+ __dmsg(ds, ", split-locked");
if (mod != NULL)
switch (F_ISSET(mod, WT_PM_REC_MASK)) {
@@ -656,6 +660,9 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page)
case WT_PM_REC_REPLACE:
__dmsg(ds, ", replaced");
break;
+ case WT_PM_REC_REWRITE:
+ __dmsg(ds, ", rewrite");
+ break;
case 0:
break;
WT_ILLEGAL_VALUE(session);
diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c
index f43e936eeda..a05c6217338 100644
--- a/src/third_party/wiredtiger/src/btree/bt_discard.c
+++ b/src/third_party/wiredtiger/src/btree/bt_discard.c
@@ -56,7 +56,7 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
*/
WT_ASSERT(session, !__wt_page_is_modified(page));
WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
- WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_SPLITTING));
+ WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_LOCKED));
#ifdef HAVE_DIAGNOSTIC
{
@@ -150,6 +150,7 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
switch (F_ISSET(mod, WT_PM_REC_MASK)) {
case WT_PM_REC_MULTIBLOCK:
+ case WT_PM_REC_REWRITE:
/* Free list of replacement blocks. */
for (multi = mod->mod_multi,
i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c
index e249f997d87..4303ba4cd48 100644
--- a/src/third_party/wiredtiger/src/btree/bt_handle.c
+++ b/src/third_party/wiredtiger/src/btree/bt_handle.c
@@ -133,16 +133,13 @@ __wt_btree_close(WT_SESSION_IMPL *session)
{
WT_BM *bm;
WT_BTREE *btree;
- WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
- dhandle = session->dhandle;
btree = S2BT(session);
if ((bm = btree->bm) != NULL) {
/* Unload the checkpoint, unless it's a special command. */
- if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
- !F_ISSET(btree,
+ if (!F_ISSET(btree,
WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
WT_TRET(bm->checkpoint_unload(bm, session));
@@ -173,6 +170,8 @@ __wt_btree_close(WT_SESSION_IMPL *session)
btree->bulk_load_ok = 0;
+ F_CLR(btree, WT_BTREE_SPECIAL_FLAGS);
+
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c
index 8086806b3a4..17d9442e1a4 100644
--- a/src/third_party/wiredtiger/src/btree/bt_page.c
+++ b/src/third_party/wiredtiger/src/btree/bt_page.c
@@ -49,8 +49,12 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
/* Trigger eviction on the next page release. */
__wt_page_evict_soon(page);
+ /* Bump the oldest ID, we're about to do some visibility checks. */
+ __wt_txn_update_oldest(session, 0);
+
/* If eviction cannot succeed, don't try. */
- return (__wt_page_can_evict(session, page, 1));
+ return (
+ __wt_page_can_evict(session, page, WT_EVICT_CHECK_SPLITS, NULL));
}
/*
@@ -181,8 +185,11 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
skip_evict:
/*
* Check if we need an autocommit transaction.
+ * Starting a transaction can trigger eviction, so skip
+ * it if eviction isn't permitted.
*/
- return (__wt_txn_autocommit_check(session));
+ return (LF_ISSET(WT_READ_NO_EVICT) ? 0 :
+ __wt_txn_autocommit_check(session));
WT_ILLEGAL_VALUE(session);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c
index 896ab23f1c2..1bfd03f58cb 100644
--- a/src/third_party/wiredtiger/src/btree/bt_slvg.c
+++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c
@@ -327,7 +327,7 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[])
*/
if (ss->root_ref.page != NULL) {
btree->ckpt = ckptbase;
- ret = __wt_evict(session, &ss->root_ref, 1);
+ ret = __wt_evict(session, &ss->root_ref, WT_EVICT_EXCLUSIVE);
ss->root_ref.page = NULL;
btree->ckpt = NULL;
}
@@ -1313,7 +1313,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref)
ret = __wt_page_release(session, ref, 0);
if (ret == 0)
- ret = __wt_evict(session, ref, 1);
+ ret = __wt_evict(session, ref, WT_EVICT_EXCLUSIVE);
if (0) {
err: WT_TRET(__wt_page_release(session, ref, 0));
@@ -2022,7 +2022,7 @@ __slvg_row_build_leaf(
*/
ret = __wt_page_release(session, ref, 0);
if (ret == 0)
- ret = __wt_evict(session, ref, 1);
+ ret = __wt_evict(session, ref, WT_EVICT_EXCLUSIVE);
if (0) {
err: WT_TRET(__wt_page_release(session, ref, 0));
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 59ad7abb221..d4c8cf1b92d 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -670,7 +670,6 @@ __split_multi_inmem(
* when discarding the original page, and our caller will discard the
* allocated page on error, when discarding the allocated WT_REF.
*/
-
WT_RET(__wt_page_inmem(session, ref,
multi->skip_dsk, ((WT_PAGE_HEADER *)multi->skip_dsk)->mem_size,
WT_PAGE_DISK_ALLOC, &page));
@@ -815,7 +814,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
*/
static int
__split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
- WT_REF **ref_new, uint32_t new_entries, size_t parent_incr, int exclusive)
+ WT_REF **ref_new, uint32_t new_entries, size_t parent_incr, uint32_t flags)
{
WT_DECL_RET;
WT_IKEY *ikey;
@@ -849,13 +848,21 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
*/
for (;;) {
parent = ref->home;
- F_CAS_ATOMIC(parent, WT_PAGE_SPLITTING, ret);
+ F_CAS_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED, ret);
if (ret == 0) {
if (parent == ref->home)
break;
- F_CLR_ATOMIC(parent, WT_PAGE_SPLITTING);
+ F_CLR_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED);
continue;
}
+ /*
+ * If we're attempting an in-memory split and we can't lock the
+ * parent, give up. This avoids an infinite loop where we are
+ * trying to split a page while its parent is being
+ * checkpointed.
+ */
+ if (LF_ISSET(WT_EVICT_INMEM_SPLIT))
+ return (EBUSY);
__wt_yield();
}
@@ -865,6 +872,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
* update the parent's index, it will no longer refer to the child, and
* could conceivably be evicted. Get a hazard pointer on the parent
* now, so that we can safely access it after updating the index.
+ *
+ * Take care that getting the page doesn't trigger eviction, or we
+ * could block trying to split a different child of our parent and
+ * deadlock.
*/
if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) {
WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT));
@@ -1031,7 +1042,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
* Add it to the session discard list, to be freed when it's safe.
*/
size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
- WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size));
+ WT_TRET(__split_safe_free(session,
+ split_gen, LF_ISSET(WT_EVICT_EXCLUSIVE), pindex, size));
parent_decr += size;
/*
@@ -1056,7 +1068,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
* Do the check here because we've just grown the parent page and
* are holding it locked.
*/
- if (ret == 0 && !exclusive &&
+ if (ret == 0 && !LF_ISSET(WT_EVICT_EXCLUSIVE) &&
!F_ISSET_ATOMIC(parent, WT_PAGE_REFUSE_DEEPEN) &&
__split_should_deepen(session, parent_ref, &children)) {
/*
@@ -1078,7 +1090,7 @@ err: if (!complete)
if (next_ref->state == WT_REF_SPLIT)
next_ref->state = WT_REF_DELETED;
}
- F_CLR_ATOMIC(parent, WT_PAGE_SPLITTING);
+ F_CLR_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED);
if (hazard)
WT_TRET(__wt_hazard_clear(session, parent));
@@ -1102,9 +1114,8 @@ err: if (!complete)
* list into a separate page.
*/
int
-__wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp)
+__wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
{
- WT_BTREE *btree;
WT_DECL_RET;
WT_DECL_ITEM(key);
WT_INSERT *ins, **insp, *moved_ins, *prev_ins;
@@ -1114,60 +1125,20 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp)
size_t page_decr, parent_incr, right_incr;
int i;
- *splitp = 0;
-
- btree = S2BT(session);
page = ref->page;
right = NULL;
page_decr = parent_incr = right_incr = 0;
- /*
- * Check for pages with append-only workloads. A common application
- * pattern is to have multiple threads frantically appending to the
- * tree. We want to reconcile and evict this page, but we'd like to
- * do it without making the appending threads wait. If we're not
- * discarding the tree, check and see if it's worth doing a split to
- * let the threads continue before doing eviction.
- *
- * Ignore anything other than large, dirty row-store leaf pages.
- *
- * XXX KEITH
- * Need a better test for append-only workloads.
- */
- if (page->type != WT_PAGE_ROW_LEAF ||
- page->memory_footprint < btree->maxmempage ||
- !__wt_page_is_modified(page))
- return (0);
+ WT_ASSERT(session, __wt_page_can_split(session, page));
- /*
- * There is no point splitting if the list is small, no deep items is
- * our heuristic for that. (A 1/4 probability of adding a new skiplist
- * level means there will be a new 6th level for roughly each 4KB of
- * entries in the list. If we have at least two 6th level entries, the
- * list is at least large enough to work with.)
- *
- * The following code requires at least two items on the insert list,
- * this test serves the additional purpose of confirming that.
- */
-#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(6, WT_SKIP_MAXDEPTH - 1)
+ /* Find the last item on the page. */
ins_head = page->pg_row_entries == 0 ?
WT_ROW_INSERT_SMALLEST(page) :
WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
- if (ins_head == NULL ||
- ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH] == NULL ||
- ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH] ==
- ins_head->tail[WT_MIN_SPLIT_SKIPLIST_DEPTH])
- return (0);
-
- /* Find the last item in the insert list. */
moved_ins = WT_SKIP_LAST(ins_head);
- /*
- * Only split a page once, otherwise workloads that update in the middle
- * of the page could continually split without benefit.
- */
- if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT))
- return (0);
+ /* Mark that this page has already been through an in-memory split. */
+ WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT));
F_SET_ATOMIC(page, WT_PAGE_SPLIT_INSERT);
/*
@@ -1360,8 +1331,8 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp)
* longer locked, so we cannot safely look at it.
*/
page = NULL;
- if ((ret = __split_parent(
- session, ref, split_ref, 2, parent_incr, 0)) != 0) {
+ if ((ret = __split_parent(session,
+ ref, split_ref, 2, parent_incr, WT_EVICT_INMEM_SPLIT)) != 0) {
/*
* Move the insert list element back to the original page list.
* For simplicity, the previous skip list pointers originally
@@ -1384,9 +1355,6 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp)
WT_ERR(ret);
}
- /* Let our caller know that we split. */
- *splitp = 1;
-
WT_STAT_FAST_CONN_INCR(session, cache_inmem_split);
WT_STAT_FAST_DATA_INCR(session, cache_inmem_split);
@@ -1480,8 +1448,8 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
page, &mod->mod_multi[i], &ref_new[i], &parent_incr));
/* Split into the parent. */
- WT_ERR(__split_parent(
- session, ref, ref_new, new_entries, parent_incr, exclusive));
+ WT_ERR(__split_parent( session, ref, ref_new,
+ new_entries, parent_incr, exclusive ? WT_EVICT_EXCLUSIVE : 0));
WT_STAT_FAST_CONN_INCR(session, cache_eviction_split);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_split);
diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c
index dae2dd8d480..cc52f63f1f5 100644
--- a/src/third_party/wiredtiger/src/btree/bt_sync.c
+++ b/src/third_party/wiredtiger/src/btree/bt_sync.c
@@ -71,7 +71,7 @@ __sync_file(WT_SESSION_IMPL *session, int syncop)
__wt_txn_visible_all(
session, page->modify->update_txn)) {
if (txn->isolation == TXN_ISO_READ_COMMITTED)
- __wt_txn_refresh(session, 1);
+ __wt_txn_get_snapshot(session);
leaf_bytes += page->memory_footprint;
++leaf_pages;
WT_ERR(__wt_reconcile(session, walk, NULL, 0));
@@ -150,7 +150,8 @@ __sync_file(WT_SESSION_IMPL *session, int syncop)
*/
if (!WT_PAGE_IS_INTERNAL(page) &&
F_ISSET(txn, TXN_HAS_SNAPSHOT) &&
- TXNID_LT(txn->snap_max, mod->first_dirty_txn)) {
+ TXNID_LT(txn->snap_max, mod->first_dirty_txn) &&
+ !F_ISSET(mod, WT_PM_REC_REWRITE)) {
__wt_page_modify_set(session, page);
continue;
}
diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c
index 0e351682e9e..d56b44bbd95 100644
--- a/src/third_party/wiredtiger/src/btree/row_modify.c
+++ b/src/third_party/wiredtiger/src/btree/row_modify.c
@@ -287,9 +287,11 @@ __wt_update_alloc(
* Check for obsolete updates.
*/
WT_UPDATE *
-__wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+__wt_update_obsolete_check(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd)
{
WT_UPDATE *first, *next;
+ u_int count;
/*
* This function identifies obsolete updates, and truncates them from
@@ -299,7 +301,7 @@ __wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd)
*
* Walk the list of updates, looking for obsolete updates at the end.
*/
- for (first = NULL; upd != NULL; upd = upd->next)
+ for (first = NULL, count = 0; upd != NULL; upd = upd->next, count++)
if (__wt_txn_visible_all(session, upd->txnid)) {
if (first == NULL)
first = upd;
@@ -317,6 +319,14 @@ __wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd)
WT_ATOMIC_CAS8(first->next, next, NULL))
return (next);
+ /*
+ * If the list is long, don't retry checks on this page until the
+ * transaction state has moved forwards.
+ */
+ if (count > 20)
+ page->modify->obsolete_check_txn =
+ S2C(session)->txn_global.last_running;
+
return (NULL);
}
diff --git a/src/third_party/wiredtiger/src/config/config_api.c b/src/third_party/wiredtiger/src/config/config_api.c
index deff33a10bd..4e8f0d08400 100644
--- a/src/third_party/wiredtiger/src/config/config_api.c
+++ b/src/third_party/wiredtiger/src/config/config_api.c
@@ -322,8 +322,7 @@ __wt_configure_method(WT_SESSION_IMPL *session,
newcheck = &checks[cnt];
newcheck->name = newcheck_name;
WT_ERR(__wt_strdup(session, type, &newcheck->type));
- if (check != NULL)
- WT_ERR(__wt_strdup(session, check, &newcheck->checks));
+ WT_ERR(__wt_strdup(session, check, &newcheck->checks));
entry->checks = checks;
entry->checks_entries = 0;
diff --git a/src/third_party/wiredtiger/src/config/config_concat.c b/src/third_party/wiredtiger/src/config/config_concat.c
deleted file mode 100644
index e872722a272..00000000000
--- a/src/third_party/wiredtiger/src/config/config_concat.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
- * Copyright (c) 2008-2014 WiredTiger, Inc.
- * All rights reserved.
- *
- * See the file LICENSE for redistribution information.
- */
-
-#include "wt_internal.h"
-
-/*
- * __wt_config_concat --
- * Given a NULL-terminated list of configuration strings, concatenate them
- * into newly allocated memory. Nothing special is assumed about any of
- * the config strings, they are simply combined in order.
- *
- * This code deals with the case where some of the config strings are
- * wrapped in brackets but others aren't: the resulting string does not
- * have brackets.
- */
-int
-__wt_config_concat(
- WT_SESSION_IMPL *session, const char **cfg, char **config_ret)
-{
- WT_CONFIG cparser;
- WT_CONFIG_ITEM k, v;
- WT_DECL_ITEM(tmp);
- WT_DECL_RET;
- const char **cp;
-
- WT_RET(__wt_scr_alloc(session, 0, &tmp));
-
- for (cp = cfg; *cp != NULL; ++cp) {
- WT_ERR(__wt_config_init(session, &cparser, *cp));
- while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) {
- if (k.type != WT_CONFIG_ITEM_STRING &&
- k.type != WT_CONFIG_ITEM_ID)
- WT_ERR_MSG(session, EINVAL,
- "Invalid configuration key found: '%s'\n",
- k.str);
- /* Include the quotes around string keys/values. */
- if (k.type == WT_CONFIG_ITEM_STRING) {
- --k.str;
- k.len += 2;
- }
- if (v.type == WT_CONFIG_ITEM_STRING) {
- --v.str;
- v.len += 2;
- }
- WT_ERR(__wt_buf_catfmt(session, tmp, "%.*s%s%.*s,",
- (int)k.len, k.str,
- (v.len > 0) ? "=" : "",
- (int)v.len, v.str));
- }
- if (ret != WT_NOTFOUND)
- goto err;
- }
-
- /*
- * If the caller passes us no valid configuration strings, we get here
- * with no bytes to copy -- that's OK, the underlying string copy can
- * handle empty strings.
- *
- * Strip any trailing comma.
- */
- if (tmp->size != 0)
- --tmp->size;
- ret = __wt_strndup(session, tmp->data, tmp->size, config_ret);
-
-err: __wt_scr_free(session, &tmp);
- return (ret);
-}
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
index 3214926bcf1..56f15a89f30 100644
--- a/src/third_party/wiredtiger/src/conn/conn_api.c
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -919,8 +919,7 @@ __conn_open_session(WT_CONNECTION *wt_conn,
CONNECTION_API_CALL(conn, session, open_session, config, cfg);
WT_UNUSED(cfg);
- WT_ERR(__wt_open_session(conn, event_handler, config, &session_ret));
-
+ WT_ERR(__wt_open_session(conn, event_handler, config, 1, &session_ret));
*wt_sessionp = &session_ret->iface;
err: API_END_RET_NOTFOUND_MAP(session, ret);
diff --git a/src/third_party/wiredtiger/src/conn/conn_cache.c b/src/third_party/wiredtiger/src/conn/conn_cache.c
index 8de8cd3f8bc..1edd9dac7fb 100644
--- a/src/third_party/wiredtiger/src/conn/conn_cache.c
+++ b/src/third_party/wiredtiger/src/conn/conn_cache.c
@@ -176,13 +176,22 @@ __wt_cache_stats_update(WT_SESSION_IMPL *session)
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_CONNECTION_STATS *stats;
+ uint64_t inuse, leaf, used;
conn = S2C(session);
cache = conn->cache;
stats = &conn->stats;
+ inuse = __wt_cache_bytes_inuse(cache);
+ /*
+ * There are races updating the different cache tracking values so
+ * be paranoid calculating the leaf byte usage.
+ */
+ used = cache->bytes_overflow + cache->bytes_internal;
+ leaf = inuse > used ? inuse - used : 0;
+
WT_STAT_SET(stats, cache_bytes_max, conn->cache_size);
- WT_STAT_SET(stats, cache_bytes_inuse, __wt_cache_bytes_inuse(cache));
+ WT_STAT_SET(stats, cache_bytes_inuse, inuse);
WT_STAT_SET(stats, cache_overhead, cache->overhead_pct);
WT_STAT_SET(stats, cache_pages_inuse, __wt_cache_pages_inuse(cache));
@@ -191,11 +200,9 @@ __wt_cache_stats_update(WT_SESSION_IMPL *session)
cache_eviction_maximum_page_size, cache->evict_max_page_size);
WT_STAT_SET(stats, cache_pages_dirty, cache->pages_dirty);
- /* Figure out internal, leaf and overflow stats */
WT_STAT_SET(stats, cache_bytes_internal, cache->bytes_internal);
- WT_STAT_SET(stats, cache_bytes_leaf,
- conn->cache_size - (cache->bytes_internal + cache->bytes_overflow));
WT_STAT_SET(stats, cache_bytes_overflow, cache->bytes_overflow);
+ WT_STAT_SET(stats, cache_bytes_leaf, leaf);
}
/*
diff --git a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
index 488864ce351..de7e9e3486f 100644
--- a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
+++ b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
@@ -96,8 +96,8 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
__wt_process.cache_pool = cp;
WT_ERR(__wt_verbose(session,
WT_VERB_SHARED_CACHE, "Created cache pool %s", cp->name));
- } else if (!updating && !WT_STRING_MATCH(
- __wt_process.cache_pool->name, pool_name, strlen(pool_name)))
+ } else if (!updating &&
+ strcmp(__wt_process.cache_pool->name, pool_name) != 0)
/* Only a single cache pool is supported. */
WT_ERR_MSG(session, WT_ERROR,
"Attempting to join a cache pool that does not exist: %s",
diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
index 60e7c41f76d..07d4cce40f5 100644
--- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c
+++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
@@ -9,119 +9,57 @@
#include "wt_internal.h"
/*
- * __conn_dhandle_open_lock --
- * Spin on the current data handle until either (a) it is open, read
- * locked; or (b) it is closed, write locked. If exclusive access is
- * requested and cannot be granted immediately because the handle is
- * in use, fail with EBUSY.
- *
- * Here is a brief summary of how different operations synchronize using
- * either the schema lock, handle locks or handle flags:
- *
- * open -- holds the schema lock, one thread gets the handle exclusive,
- * reverts to a shared handle lock and drops the schema lock
- * once the handle is open;
- * bulk load -- sets bulk and exclusive;
- * salvage, truncate, update, verify -- hold the schema lock, set a
- * "special" flag;
- * sweep -- gets a write lock on the handle, doesn't set exclusive
- *
- * The schema lock prevents a lot of potential conflicts: we should never
- * see handles being salvaged or verified because those operation hold the
- * schema lock. However, it is possible to see a handle that is being
- * bulk loaded, or that the sweep server is closing.
- *
- * The principle here is that application operations can cause other
- * application operations to fail (so attempting to open a cursor on a
- * file while it is being bulk-loaded will fail), but internal or
- * database-wide operations should not prevent application-initiated
- * operations. For example, attempting to verify a file should not fail
- * because the sweep server happens to be in the process of closing that
- * file.
+ * __conn_dhandle_destroy --
+ * Destroy a data handle.
+ */
+static int
+__conn_dhandle_destroy(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle)
+{
+ WT_DECL_RET;
+
+ ret = __wt_rwlock_destroy(session, &dhandle->rwlock);
+ __wt_free(session, dhandle->name);
+ __wt_free(session, dhandle->checkpoint);
+ __wt_free(session, dhandle->handle);
+ __wt_spin_destroy(session, &dhandle->close_lock);
+ __wt_overwrite_and_free(session, dhandle);
+
+ return (ret);
+}
+
+/*
+ * __conn_dhandle_alloc --
+ * Allocate a new data handle and return it linked into the connection's
+ * list.
*/
static int
-__conn_dhandle_open_lock(
- WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, uint32_t flags)
+__conn_dhandle_alloc(WT_SESSION_IMPL *session,
+ const char *uri, const char *checkpoint, WT_DATA_HANDLE **dhandlep)
{
WT_BTREE *btree;
+ WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
- int is_open, lock_busy, want_exclusive;
- btree = dhandle->handle;
- lock_busy = 0;
- want_exclusive = LF_ISSET(WT_DHANDLE_EXCLUSIVE) ? 1 : 0;
+ WT_RET(__wt_calloc_one(session, &dhandle));
- /*
- * Check that the handle is open. We've already incremented
- * the reference count, so once the handle is open it won't be
- * closed by another thread.
- *
- * If we can see the WT_DHANDLE_OPEN flag set while holding a
- * lock on the handle, then it's really open and we can start
- * using it. Alternatively, if we can get an exclusive lock
- * and WT_DHANDLE_OPEN is still not set, we need to do the open.
- */
- for (;;) {
- /*
- * If the handle is already open for a special operation,
- * give up.
- */
- if (F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS))
- return (EBUSY);
+ WT_ERR(__wt_rwlock_alloc(session, &dhandle->rwlock, "data handle"));
+ dhandle->name_hash = __wt_hash_city64(uri, strlen(uri));
+ WT_ERR(__wt_strdup(session, uri, &dhandle->name));
+ WT_ERR(__wt_strdup(session, checkpoint, &dhandle->checkpoint));
- /*
- * If the handle is open, get a read lock and recheck.
- *
- * Wait for a read lock if we want exclusive access and failed
- * to get it: the sweep server may be closing this handle, and
- * we need to wait for it to release its lock. If we want
- * exclusive access and find the handle open once we get the
- * read lock, give up: some other thread has it locked for real.
- */
- if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
- (!want_exclusive || lock_busy)) {
- WT_RET(__wt_readlock(session, dhandle->rwlock));
- is_open = F_ISSET(dhandle, WT_DHANDLE_OPEN) ? 1 : 0;
- if (is_open && !want_exclusive) {
- WT_ASSERT(session,
- !F_ISSET(dhandle, WT_DHANDLE_DEAD));
- return (0);
- }
- WT_RET(__wt_readunlock(session, dhandle->rwlock));
- } else
- is_open = 0;
+ /* TODO: abstract this out for other data handle types */
+ WT_ERR(__wt_calloc_one(session, &btree));
+ dhandle->handle = btree;
+ btree->dhandle = dhandle;
- /*
- * It isn't open or we want it exclusive: try to get an
- * exclusive lock. There is some subtlety here: if we race
- * with another thread that successfully opens the file, we
- * don't want to block waiting to get exclusive access.
- */
- if ((ret = __wt_try_writelock(session, dhandle->rwlock)) == 0) {
- /*
- * If it was opened while we waited, drop the write
- * lock and get a read lock instead.
- */
- if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
- !want_exclusive) {
- lock_busy = 0;
- WT_RET(
- __wt_writeunlock(session, dhandle->rwlock));
- continue;
- }
+ WT_ERR(__wt_spin_init(
+ session, &dhandle->close_lock, "data handle close"));
- /* We have an exclusive lock, we're done. */
- F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
- WT_ASSERT(session, !F_ISSET(dhandle, WT_DHANDLE_DEAD));
- return (0);
- } else if (ret != EBUSY || (is_open && want_exclusive))
- return (ret);
- else
- lock_busy = 1;
+ *dhandlep = dhandle;
+ return (0);
- /* Give other threads a chance to make progress. */
- __wt_yield();
- }
+err: WT_TRET(__conn_dhandle_destroy(session, dhandle));
+ return (ret);
}
/*
@@ -129,8 +67,8 @@ __conn_dhandle_open_lock(
* Find a previously opened data handle.
*/
int
-__wt_conn_dhandle_find(WT_SESSION_IMPL *session,
- const char *name, const char *ckpt, uint32_t flags)
+__wt_conn_dhandle_find(
+ WT_SESSION_IMPL *session, const char *uri, const char *checkpoint)
{
WT_CONNECTION_IMPL *conn;
WT_DATA_HANDLE *dhandle;
@@ -138,20 +76,16 @@ __wt_conn_dhandle_find(WT_SESSION_IMPL *session,
conn = S2C(session);
- /*
- * We must be holding the handle list lock at a higher level, and not
- * have a reference.
- */
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED) &&
- !LF_ISSET(WT_DHANDLE_HAVE_REF));
+ /* We must be holding the handle list lock at a higher level. */
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
- bucket = __wt_hash_city64(name, strlen(name)) % WT_HASH_ARRAY_SIZE;
- if (ckpt == NULL) {
+ bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE;
+ if (checkpoint == NULL) {
SLIST_FOREACH(dhandle, &conn->dhhash[bucket], hashl) {
if (F_ISSET(dhandle, WT_DHANDLE_DEAD))
continue;
if (dhandle->checkpoint == NULL &&
- strcmp(name, dhandle->name) == 0) {
+ strcmp(uri, dhandle->name) == 0) {
session->dhandle = dhandle;
return (0);
}
@@ -161,90 +95,25 @@ __wt_conn_dhandle_find(WT_SESSION_IMPL *session,
if (F_ISSET(dhandle, WT_DHANDLE_DEAD))
continue;
if (dhandle->checkpoint != NULL &&
- strcmp(name, dhandle->name) == 0 &&
- strcmp(ckpt, dhandle->checkpoint) == 0) {
+ strcmp(uri, dhandle->name) == 0 &&
+ strcmp(checkpoint, dhandle->checkpoint) == 0) {
session->dhandle = dhandle;
return (0);
}
}
- return (WT_NOTFOUND);
-}
-
-/*
- * __conn_dhandle_get --
- * Allocate a new data handle, lock it exclusively, and return it linked
- * into the connection's list.
- */
-static int
-__conn_dhandle_get(WT_SESSION_IMPL *session,
- const char *name, const char *ckpt, uint32_t flags)
-{
- WT_BTREE *btree;
- WT_CONNECTION_IMPL *conn;
- WT_DATA_HANDLE *dhandle;
- WT_DECL_RET;
- uint32_t bucket;
-
- conn = S2C(session);
-
- /*
- * We have the handle lock, check whether we can find the handle we
- * are looking for. If we do, and we can lock it in the state we
- * want, this session will take ownership and we are done.
- */
- ret = __wt_conn_dhandle_find(session, name, ckpt, flags);
- if (ret == 0) {
- dhandle = session->dhandle;
- WT_RET(__conn_dhandle_open_lock(session, dhandle, flags));
- return (0);
- }
- WT_RET_NOTFOUND_OK(ret);
-
- /*
- * If no handle was found, allocate the data handle and a btree handle,
- * then initialize the data handle. Exclusively lock the data handle
- * before inserting it in the list.
- */
- WT_RET(__wt_calloc_one(session, &dhandle));
-
- WT_ERR(__wt_rwlock_alloc(session, &dhandle->rwlock, "data handle"));
-
- dhandle->name_hash = __wt_hash_city64(name, strlen(name));
- WT_ERR(__wt_strdup(session, name, &dhandle->name));
- if (ckpt != NULL)
- WT_ERR(__wt_strdup(session, ckpt, &dhandle->checkpoint));
-
- WT_ERR(__wt_calloc_one(session, &btree));
- dhandle->handle = btree;
- btree->dhandle = dhandle;
-
- WT_ERR(__wt_spin_init(
- session, &dhandle->close_lock, "data handle close"));
-
- F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
- WT_ERR(__wt_writelock(session, dhandle->rwlock));
+ WT_RET(__conn_dhandle_alloc(session, uri, checkpoint, &dhandle));
/*
* Prepend the handle to the connection list, assuming we're likely to
* need new files again soon, until they are cached by all sessions.
* Find the right hash bucket to insert into as well.
*/
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED));
bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE;
WT_CONN_DHANDLE_INSERT(conn, dhandle, bucket);
session->dhandle = dhandle;
return (0);
-
-err: WT_TRET(__wt_rwlock_destroy(session, &dhandle->rwlock));
- __wt_free(session, dhandle->name);
- __wt_free(session, dhandle->checkpoint);
- __wt_free(session, dhandle->handle); /* btree free */
- __wt_spin_destroy(session, &dhandle->close_lock);
- __wt_overwrite_and_free(session, dhandle);
-
- return (ret);
}
/*
@@ -256,8 +125,6 @@ __conn_dhandle_mark_dead(WT_SESSION_IMPL *session)
{
int evict_reset;
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED));
-
/*
* Handle forced discard (e.g., when dropping a file).
*
@@ -281,10 +148,11 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, int final, int force)
WT_BTREE *btree;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
- int no_schema_lock;
+ int marked_dead, no_schema_lock;
- dhandle = session->dhandle;
btree = S2BT(session);
+ dhandle = session->dhandle;
+ marked_dead = 0;
if (!F_ISSET(dhandle, WT_DHANDLE_OPEN))
return (0);
@@ -297,7 +165,7 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, int final, int force)
* a handle lock (specifically, checkpoint).
*/
no_schema_lock = 0;
- if (!F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) {
+ if (!F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)) {
no_schema_lock = 1;
F_SET(session, WT_SESSION_NO_SCHEMA_LOCK);
}
@@ -320,18 +188,27 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, int final, int force)
* invalid if the mapping is closed.
*/
if (!F_ISSET(btree,
- WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
- WT_ERR(force && (btree->bm == NULL || btree->bm->map == NULL) ?
- __conn_dhandle_mark_dead(session) :
- __wt_checkpoint_close(session, final));
+ WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) {
+ if (force && (btree->bm == NULL || btree->bm->map == NULL)) {
+ WT_ERR(__conn_dhandle_mark_dead(session));
+ marked_dead = 1;
+ } else
+ WT_ERR(__wt_checkpoint_close(session, final));
+ }
WT_TRET(__wt_btree_close(session));
- if (!force || final) {
+ /*
+ * If we marked a handle as dead it will be closed by sweep, via
+ * another call to sync and close.
+ */
+ if (!marked_dead) {
F_CLR(dhandle, WT_DHANDLE_OPEN);
if (dhandle->checkpoint == NULL)
--S2C(session)->open_btree_count;
}
- F_CLR(btree, WT_BTREE_SPECIAL_FLAGS);
+ WT_ASSERT(session,
+ F_ISSET(dhandle, WT_DHANDLE_DEAD) ||
+ !F_ISSET(dhandle, WT_DHANDLE_OPEN));
err: __wt_spin_unlock(session, &dhandle->close_lock);
@@ -408,11 +285,12 @@ err: __wt_free(session, metaconf);
}
/*
- * __conn_btree_open --
+ * __wt_conn_btree_open --
* Open the current btree handle.
*/
-static int
-__conn_btree_open(WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags)
+int
+__wt_conn_btree_open(
+ WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags)
{
WT_BTREE *btree;
WT_DATA_HANDLE *dhandle;
@@ -421,24 +299,23 @@ __conn_btree_open(WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags)
dhandle = session->dhandle;
btree = S2BT(session);
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) &&
+ WT_ASSERT(session,
F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) &&
!LF_ISSET(WT_DHANDLE_LOCK_ONLY));
WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_CLOSING));
/*
- * If the handle is already open, it has to be closed so it can be
- * reopened with a new configuration. We don't need to check again:
- * this function isn't called if the handle is already open in the
- * required mode.
+ * If the handle is already open, it has to be closed so it can
+ * be reopened with a new configuration.
*
- * This call can return EBUSY if there's an update in the object that's
- * not yet globally visible. That's not a problem because it can only
- * happen when we're switching from a normal handle to a "special" one,
- * so we're returning EBUSY to an attempt to verify or do other special
- * operations. The reverse won't happen because when the handle from a
- * verify or other special operation is closed, there won't be updates
+ * This call can return EBUSY if there's an update in the
+ * object that's not yet globally visible. That's not a
+ * problem because it can only happen when we're switching from
+ * a normal handle to a "special" one, so we're returning EBUSY
+ * to an attempt to verify or do other special operations. The
+ * reverse won't happen because when the handle from a verify
+ * or other special operation is closed, there won't be updates
* in the tree that can block the close.
*/
if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
@@ -451,72 +328,24 @@ __conn_btree_open(WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags)
/* Set any special flags on the handle. */
F_SET(btree, LF_ISSET(WT_BTREE_SPECIAL_FLAGS));
- do {
- WT_ERR(__wt_btree_open(session, cfg));
- F_SET(dhandle, WT_DHANDLE_OPEN);
- /*
- * Checkpoint handles are read only, so eviction calculations
- * based on the number of btrees are better to ignore them.
- */
- if (dhandle->checkpoint == NULL)
- ++S2C(session)->open_btree_count;
-
- /* Drop back to a readlock if that is all that was needed. */
- if (!LF_ISSET(WT_DHANDLE_EXCLUSIVE)) {
- F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
- WT_ERR(__wt_writeunlock(session, dhandle->rwlock));
- WT_ERR(
- __conn_dhandle_open_lock(session, dhandle, flags));
- }
- } while (!F_ISSET(dhandle, WT_DHANDLE_OPEN));
+ WT_ERR(__wt_btree_open(session, cfg));
+ F_SET(dhandle, WT_DHANDLE_OPEN);
+
+ /*
+ * Checkpoint handles are read only, so eviction calculations
+ * based on the number of btrees are better to ignore them.
+ */
+ if (dhandle->checkpoint == NULL)
+ ++S2C(session)->open_btree_count;
if (0) {
err: F_CLR(btree, WT_BTREE_SPECIAL_FLAGS);
- /* If the open failed, close the handle. */
- if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
- WT_TRET(__wt_conn_btree_sync_and_close(session, 0, 0));
}
return (ret);
}
/*
- * __wt_conn_btree_get --
- * Get an open btree file handle, otherwise open a new one.
- */
-int
-__wt_conn_btree_get(WT_SESSION_IMPL *session,
- const char *name, const char *ckpt, const char *cfg[], uint32_t flags)
-{
- WT_DATA_HANDLE *dhandle;
- WT_DECL_RET;
-
- if (LF_ISSET(WT_DHANDLE_HAVE_REF))
- WT_RET(
- __conn_dhandle_open_lock(session, session->dhandle, flags));
- else {
- WT_WITH_DHANDLE_LOCK(session,
- ret = __conn_dhandle_get(session, name, ckpt, flags));
- WT_RET(ret);
- }
- dhandle = session->dhandle;
-
- if (!LF_ISSET(WT_DHANDLE_LOCK_ONLY) &&
- (!F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
- LF_ISSET(WT_BTREE_SPECIAL_FLAGS)))
- if ((ret = __conn_btree_open(session, cfg, flags)) != 0) {
- F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
- WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
- }
-
- WT_ASSERT(session, ret != 0 ||
- LF_ISSET(WT_DHANDLE_EXCLUSIVE) ==
- F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE));
-
- return (ret);
-}
-
-/*
* __conn_btree_apply_internal --
* Apply a function to the open btree handles.
*/
@@ -561,7 +390,7 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session,
conn = S2C(session);
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED));
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
/*
* If we're given a URI, then we walk only the hash list for that
@@ -651,7 +480,7 @@ __wt_conn_btree_apply_single(WT_SESSION_IMPL *session,
conn = S2C(session);
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED));
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
hash = __wt_hash_city64(uri, strlen(uri));
bucket = hash % WT_HASH_ARRAY_SIZE;
@@ -689,7 +518,7 @@ __wt_conn_btree_apply_single(WT_SESSION_IMPL *session,
*/
int
__wt_conn_dhandle_close_all(
- WT_SESSION_IMPL *session, const char *name, int force)
+ WT_SESSION_IMPL *session, const char *uri, int force)
{
WT_CONNECTION_IMPL *conn;
WT_DATA_HANDLE *dhandle;
@@ -698,12 +527,12 @@ __wt_conn_dhandle_close_all(
conn = S2C(session);
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED));
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
WT_ASSERT(session, session->dhandle == NULL);
- bucket = __wt_hash_city64(name, strlen(name)) % WT_HASH_ARRAY_SIZE;
+ bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE;
SLIST_FOREACH(dhandle, &conn->dhhash[bucket], hashl) {
- if (strcmp(dhandle->name, name) != 0 ||
+ if (strcmp(dhandle->name, uri) != 0 ||
F_ISSET(dhandle, WT_DHANDLE_DEAD))
continue;
@@ -759,7 +588,7 @@ __conn_dhandle_remove(WT_SESSION_IMPL *session, int final)
dhandle = session->dhandle;
bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE;
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED));
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
/* Check if the handle was reacquired by a session while we waited. */
if (!final &&
@@ -799,25 +628,19 @@ __wt_conn_dhandle_discard_single(WT_SESSION_IMPL *session, int final, int force)
* Kludge: interrupt the eviction server in case it is holding the
* handle list lock.
*/
- if (!F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED))
+ if (!F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST))
F_SET(S2C(session)->cache, WT_CACHE_CLEAR_WALKS);
/* Try to remove the handle, protected by the data handle lock. */
- WT_WITH_DHANDLE_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session,
WT_TRET(__conn_dhandle_remove(session, final)));
/*
* After successfully removing the handle, clean it up.
*/
if (ret == 0 || final) {
- WT_TRET(__wt_rwlock_destroy(session, &dhandle->rwlock));
- __wt_free(session, dhandle->name);
- __wt_free(session, dhandle->checkpoint);
__conn_btree_config_clear(session);
- __wt_free(session, dhandle->handle);
- __wt_spin_destroy(session, &dhandle->close_lock);
- __wt_overwrite_and_free(session, dhandle);
-
+ WT_TRET(__conn_dhandle_destroy(session, dhandle));
session->dhandle = NULL;
}
diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c
index f2d50e09561..75fdd7a9aa1 100644
--- a/src/third_party/wiredtiger/src/conn/conn_log.c
+++ b/src/third_party/wiredtiger/src/conn/conn_log.c
@@ -408,7 +408,7 @@ __log_wrlsn_server(void *arg)
* as soon as one is not in order.
*/
for (i = 0; i < written_i; i++) {
- if (LOG_CMP(&log->write_lsn,
+ if (WT_LOG_CMP(&log->write_lsn,
&written[i].lsn) != 0)
break;
/*
@@ -416,7 +416,7 @@ __log_wrlsn_server(void *arg)
* Advance the LSN and process the slot.
*/
slot = &log->slot_pool[written[i].slot_index];
- WT_ASSERT(session, LOG_CMP(&written[i].lsn,
+ WT_ASSERT(session, WT_LOG_CMP(&written[i].lsn,
&slot->slot_release_lsn) == 0);
log->write_start_lsn = slot->slot_start_lsn;
log->write_lsn = slot->slot_end_lsn;
diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c
index bf363e81215..ff3ad7a67f7 100644
--- a/src/third_party/wiredtiger/src/conn/conn_open.c
+++ b/src/third_party/wiredtiger/src/conn/conn_open.c
@@ -92,7 +92,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
* transaction ID will catch up with the current ID.
*/
for (;;) {
- __wt_txn_update_oldest(session);
+ __wt_txn_update_oldest(session, 1);
if (txn_global->oldest_id == txn_global->current)
break;
__wt_yield();
diff --git a/src/third_party/wiredtiger/src/conn/conn_stat.c b/src/third_party/wiredtiger/src/conn/conn_stat.c
index 647e4b02abb..8acbd84ccba 100644
--- a/src/third_party/wiredtiger/src/conn/conn_stat.c
+++ b/src/third_party/wiredtiger/src/conn/conn_stat.c
@@ -323,7 +323,7 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp)
* any that match the list of object sources.
*/
if (conn->stat_sources != NULL) {
- WT_WITH_DHANDLE_LOCK(session, ret =
+ WT_WITH_HANDLE_LIST_LOCK(session, ret =
__wt_conn_btree_apply(
session, 0, NULL, __statlog_apply, NULL));
WT_RET(ret);
diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c
index fc29e0b2e15..3a07f2afe17 100644
--- a/src/third_party/wiredtiger/src/conn/conn_sweep.c
+++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c
@@ -249,14 +249,14 @@ __sweep_server(void *arg)
/* Close handles if we have reached the configured limit */
if (conn->open_file_count >= conn->sweep_handles_min) {
- WT_WITH_DHANDLE_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session,
ret = __sweep_expire(session));
WT_ERR(ret);
}
WT_ERR(__sweep_flush(session));
- WT_WITH_DHANDLE_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session,
ret = __sweep_remove_handles(session));
WT_ERR(ret);
}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c
index a201477abe3..e366a3673b8 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_backup.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c
@@ -376,7 +376,7 @@ __backup_all(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb)
WT_ERR_NOTFOUND_OK(ret);
/* Build a list of the file objects that need to be copied. */
- WT_WITH_DHANDLE_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session,
ret = __wt_meta_btree_apply(
session, __backup_list_all_append, NULL));
diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c
index 44a00d4d192..92d4d583300 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_file.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_file.c
@@ -513,12 +513,12 @@ __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri,
* open failing with EBUSY due to a database-wide checkpoint.
*/
if (bulk)
- __wt_spin_lock(
- session, &S2C(session)->checkpoint_lock);
- ret = __wt_session_get_btree_ckpt(session, uri, cfg, flags);
- if (bulk)
- __wt_spin_unlock(
- session, &S2C(session)->checkpoint_lock);
+ WT_WITH_CHECKPOINT_LOCK(session, ret =
+ __wt_session_get_btree_ckpt(
+ session, uri, cfg, flags));
+ else
+ ret = __wt_session_get_btree_ckpt(
+ session, uri, cfg, flags);
WT_RET(ret);
} else
WT_RET(__wt_bad_object_type(session, uri));
diff --git a/src/third_party/wiredtiger/src/cursor/cur_log.c b/src/third_party/wiredtiger/src/cursor/cur_log.c
index b7f11576425..4b72a472cb7 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_log.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_log.c
@@ -74,7 +74,7 @@ __curlog_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
acl = (WT_CURSOR_LOG *)a;
bcl = (WT_CURSOR_LOG *)b;
WT_ASSERT(session, cmpp != NULL);
- *cmpp = LOG_CMP(acl->cur_lsn, bcl->cur_lsn);
+ *cmpp = WT_LOG_CMP(acl->cur_lsn, bcl->cur_lsn);
/*
* If both are on the same LSN, compare step counter.
*/
diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c
index 864c116a380..21f6a1f016a 100644
--- a/src/third_party/wiredtiger/src/evict/evict_file.c
+++ b/src/third_party/wiredtiger/src/evict/evict_file.c
@@ -27,7 +27,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset));
/* Make sure the oldest transaction ID is up-to-date. */
- __wt_txn_update_oldest(session);
+ __wt_txn_update_oldest(session, 1);
/* Walk the tree, discarding pages. */
next_ref = NULL;
@@ -76,11 +76,11 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
/*
* Evict the page.
*/
- WT_ERR(__wt_evict(session, ref, 1));
+ WT_ERR(__wt_evict(session, ref, WT_EVICT_EXCLUSIVE));
break;
case WT_SYNC_DISCARD:
WT_ASSERT(session,
- __wt_page_can_evict(session, page, 0));
+ __wt_page_can_evict(session, page, 0, NULL));
__wt_evict_page_clean_update(session, ref);
break;
case WT_SYNC_DISCARD_FORCE:
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index a22531277dd..3ad7e8a2723 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -681,7 +681,7 @@ __wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref)
* before evicting, using a special "eviction" isolation level, where
* only globally visible updates can be evicted.
*/
- __wt_txn_update_oldest(session);
+ __wt_txn_update_oldest(session, 1);
txn = &session->txn;
saved_iso = txn->isolation;
txn->isolation = TXN_ISO_EVICTION;
@@ -941,7 +941,7 @@ __evict_walk(WT_SESSION_IMPL *session, uint32_t flags)
* after a long-running transaction (such as a checkpoint) completes,
* we may never start evicting again.
*/
- __wt_txn_update_oldest(session);
+ __wt_txn_update_oldest(session, 1);
if (cache->evict_current == NULL)
WT_STAT_FAST_CONN_INCR(session, cache_eviction_queue_empty);
@@ -1232,7 +1232,8 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
}
fast: /* If the page can't be evicted, give up. */
- if (!__wt_page_can_evict(session, page, 1))
+ if (!__wt_page_can_evict(
+ session, page, WT_EVICT_CHECK_SPLITS, NULL))
continue;
/*
@@ -1522,7 +1523,7 @@ __wt_cache_wait(WT_SESSION_IMPL *session, int full)
* are not busy.
*/
if (busy) {
- __wt_txn_update_oldest(session);
+ __wt_txn_update_oldest(session, 0);
if (txn_state->id == txn_global->oldest_id ||
txn_state->snap_min == txn_global->oldest_id)
return (0);
diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c
index e276f72fe3f..e54ed0ff8e7 100644
--- a/src/third_party/wiredtiger/src/evict/evict_page.c
+++ b/src/third_party/wiredtiger/src/evict/evict_page.c
@@ -9,7 +9,7 @@
#include "wt_internal.h"
static int __evict_page_dirty_update(WT_SESSION_IMPL *, WT_REF *, int);
-static int __evict_review(WT_SESSION_IMPL *, WT_REF *, int, int *);
+static int __evict_review(WT_SESSION_IMPL *, WT_REF *, int *, uint32_t);
/*
* __evict_exclusive_clear --
@@ -49,7 +49,7 @@ __evict_exclusive(WT_SESSION_IMPL *session, WT_REF *ref)
* Evict a page.
*/
int
-__wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
+__wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
@@ -73,7 +73,7 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
* to make this check for clean pages, too: while unlikely eviction
* would choose an internal page with children, it's not disallowed.
*/
- WT_ERR(__evict_review(session, ref, exclusive, &inmem_split));
+ WT_ERR(__evict_review(session, ref, &inmem_split, flags));
/*
* If there was an in-memory split, the tree has been left in the state
@@ -89,7 +89,7 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
mod = page->modify;
/* Count evictions of internal pages during normal operation. */
- if (!exclusive && WT_PAGE_IS_INTERNAL(page)) {
+ if (!LF_ISSET(WT_EVICT_EXCLUSIVE) && WT_PAGE_IS_INTERNAL(page)) {
WT_STAT_FAST_CONN_INCR(session, cache_eviction_internal);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_internal);
}
@@ -115,22 +115,22 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
if (__wt_ref_is_root(ref))
__wt_ref_out(session, ref);
else
- WT_ERR(
- __evict_page_dirty_update(session, ref, exclusive));
+ WT_ERR(__evict_page_dirty_update(
+ session, ref, LF_ISSET(WT_EVICT_EXCLUSIVE)));
WT_STAT_FAST_CONN_INCR(session, cache_eviction_dirty);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_dirty);
}
if (0) {
-err: if (!exclusive)
+err: if (!LF_ISSET(WT_EVICT_EXCLUSIVE))
__evict_exclusive_clear(session, ref);
WT_STAT_FAST_CONN_INCR(session, cache_eviction_fail);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_fail);
}
-done: if ((inmem_split || (forced_eviction && ret == EBUSY)) &&
+done: if (((inmem_split && ret == 0) || (forced_eviction && ret == EBUSY)) &&
!F_ISSET(conn->cache, WT_CACHE_WOULD_BLOCK)) {
F_SET(conn->cache, WT_CACHE_WOULD_BLOCK);
WT_TRET(__wt_evict_server_wake(session));
@@ -195,25 +195,10 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
break;
case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
/*
- * There are two cases in this code.
- *
- * First, an in-memory page that got too large, we forcibly
- * evicted it, and there wasn't anything to write. (Imagine two
- * threads updating a small set keys on a leaf page. The page is
- * too large so we try to evict it, but after reconciliation
- * there's only a small amount of data (so it's a single page we
- * can't split), and because there are two threads, there's some
- * data we can't write (so we can't evict it). In that case, we
- * take advantage of the fact we have exclusive access to the
- * page and rewrite it in memory.)
- *
- * Second, a real split where we reconciled a page and it turned
- * into a lot of pages.
+ * A real split where we reconciled a page and it turned into a
+ * lot of pages.
*/
- if (mod->mod_multi_entries == 1)
- WT_RET(__wt_split_rewrite(session, ref));
- else
- WT_RET(__wt_split_multi(session, ref, exclusive));
+ WT_RET(__wt_split_multi(session, ref, exclusive));
break;
case WT_PM_REC_REPLACE: /* 1-for-1 page swap */
if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) {
@@ -236,6 +221,20 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
ref->addr = addr;
WT_PUBLISH(ref->state, WT_REF_DISK);
break;
+ case WT_PM_REC_REWRITE:
+ /*
+ * An in-memory page that got too large, we forcibly evicted
+ * it, and there wasn't anything to write. (Imagine two threads
+ * updating a small set keys on a leaf page. The page is too
+ * large so we try to evict it, but after reconciliation
+ * there's only a small amount of data (so it's a single page
+ * we can't split), and because there are two threads, there's
+ * some data we can't write (so we can't evict it). In that
+ * case, we take advantage of the fact we have exclusive access
+ * to the page and rewrite it in memory.)
+ */
+ WT_RET(__wt_split_rewrite(session, ref));
+ break;
WT_ILLEGAL_VALUE(session);
}
@@ -271,18 +270,20 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent)
*/
static int
__evict_review(
- WT_SESSION_IMPL *session, WT_REF *ref, int exclusive, int *inmem_splitp)
+ WT_SESSION_IMPL *session, WT_REF *ref, int *inmem_splitp, uint32_t flags)
{
WT_DECL_RET;
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
- uint32_t flags;
+ uint32_t reconcile_flags;
+
+ reconcile_flags = WT_EVICTING;
/*
* Get exclusive access to the page if our caller doesn't have the tree
* locked down.
*/
- if (!exclusive) {
+ if (!LF_ISSET(WT_EVICT_EXCLUSIVE)) {
WT_RET(__evict_exclusive(session, ref));
/*
@@ -312,19 +313,19 @@ __evict_review(
}
/* Check if the page can be evicted. */
- if (!exclusive && !__wt_page_can_evict(session, page, 0))
- return (EBUSY);
+ if (!LF_ISSET(WT_EVICT_EXCLUSIVE)) {
+ if (!__wt_page_can_evict(session, page, flags, inmem_splitp))
+ return (EBUSY);
- /*
- * Check for an append-only workload needing an in-memory split; we
- * can't do this earlier because in-memory splits require exclusive
- * access. If an in-memory split completes, the page stays in memory
- * and the tree is left in the desired state: avoid the usual cleanup.
- */
- if (!exclusive) {
- WT_RET(__wt_split_insert(session, ref, inmem_splitp));
+ /*
+ * Check for an append-only workload needing an in-memory
+ * split; we can't do this earlier because in-memory splits
+ * require exclusive access. If an in-memory split completes,
+ * the page stays in memory and the tree is left in the desired
+ * state: avoid the usual cleanup.
+ */
if (*inmem_splitp)
- return (0);
+ return (__wt_split_insert(session, ref));
}
/*
@@ -346,24 +347,23 @@ __evict_review(
* Don't set the update-restore flag for internal pages, they don't have
* updates that can be saved and restored.
*/
- flags = WT_EVICTING;
if (__wt_page_is_modified(page)) {
- if (exclusive)
- LF_SET(WT_SKIP_UPDATE_ERR);
+ if (LF_ISSET(WT_EVICT_EXCLUSIVE))
+ FLD_SET(reconcile_flags, WT_SKIP_UPDATE_ERR);
else if (!WT_PAGE_IS_INTERNAL(page) &&
page->read_gen == WT_READGEN_OLDEST)
- LF_SET(WT_SKIP_UPDATE_RESTORE);
- WT_RET(__wt_reconcile(session, ref, NULL, flags));
+ FLD_SET(reconcile_flags, WT_SKIP_UPDATE_RESTORE);
+ WT_RET(__wt_reconcile(session, ref, NULL, reconcile_flags));
WT_ASSERT(session,
!__wt_page_is_modified(page) ||
- LF_ISSET(WT_SKIP_UPDATE_RESTORE));
+ FLD_ISSET(reconcile_flags, WT_SKIP_UPDATE_RESTORE));
}
/*
* If the page was ever modified, make sure all of the updates
* on the page are old enough they can be discarded from cache.
*/
- if (!exclusive && mod != NULL &&
+ if (!LF_ISSET(WT_EVICT_EXCLUSIVE) && mod != NULL &&
!__wt_txn_visible_all(session, mod->rec_max_txn) &&
!LF_ISSET(WT_SKIP_UPDATE_RESTORE))
return (EBUSY);
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index e9b6b5a1d6e..cde01e4e1ac 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -179,18 +179,21 @@ struct __wt_page_modify {
*/
uint64_t disk_snap_min;
- /* The largest transaction ID seen on the page by reconciliation. */
- uint64_t rec_max_txn;
-
/* The first unwritten transaction ID (approximate). */
uint64_t first_dirty_txn;
- /* The largest update transaction ID (approximate). */
- uint64_t update_txn;
-
/* In-memory split transaction ID. */
uint64_t inmem_split_txn;
+ /* Avoid checking for obsolete updates during checkpoints. */
+ uint64_t obsolete_check_txn;
+
+ /* The largest transaction ID seen on the page by reconciliation. */
+ uint64_t rec_max_txn;
+
+ /* The largest update transaction ID (approximate). */
+ uint64_t update_txn;
+
/* Dirty bytes added to the cache. */
size_t bytes_dirty;
@@ -353,8 +356,10 @@ struct __wt_page_modify {
#define WT_PM_REC_EMPTY 0x01 /* Reconciliation: no replacement */
#define WT_PM_REC_MULTIBLOCK 0x02 /* Reconciliation: multiple blocks */
#define WT_PM_REC_REPLACE 0x04 /* Reconciliation: single block */
+#define WT_PM_REC_REWRITE 0x08 /* Reconciliation: rewrite in place */
#define WT_PM_REC_MASK \
- (WT_PM_REC_EMPTY | WT_PM_REC_MULTIBLOCK | WT_PM_REC_REPLACE)
+ (WT_PM_REC_EMPTY | WT_PM_REC_MULTIBLOCK | \
+ WT_PM_REC_REPLACE | WT_PM_REC_REWRITE)
uint8_t flags; /* Page flags */
};
@@ -535,7 +540,7 @@ struct __wt_page {
#define WT_PAGE_REFUSE_DEEPEN 0x10 /* Don't deepen the tree at this page */
#define WT_PAGE_SCANNING 0x20 /* Obsolete updates are being scanned */
#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */
-#define WT_PAGE_SPLITTING 0x80 /* An internal page is growing */
+#define WT_PAGE_SPLIT_LOCKED 0x80 /* An internal page is growing */
uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
/*
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
index 9038dab2b34..5a2253f6078 100644
--- a/src/third_party/wiredtiger/src/include/btree.i
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -226,6 +226,11 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
/* Update the bytes in-memory to reflect the eviction. */
WT_CACHE_DECR(session, cache->bytes_inmem, page->memory_footprint);
+ /* Update the bytes_internal value to reflect the eviction */
+ if (WT_PAGE_IS_INTERNAL(page))
+ WT_CACHE_DECR(session,
+ cache->bytes_internal, page->memory_footprint);
+
/* Update the cache's dirty-byte count. */
if (modify != NULL && modify->bytes_dirty != 0) {
if (cache->bytes_dirty < modify->bytes_dirty) {
@@ -949,17 +954,86 @@ __wt_ref_info(WT_SESSION_IMPL *session,
}
/*
+ * __wt_page_can_split --
+ * Check whether a page can be split in memory.
+ */
+static inline int
+__wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_INSERT_HEAD *ins_head;
+
+ btree = S2BT(session);
+
+ /*
+ * Only split a page once, otherwise workloads that update in the middle
+ * of the page could continually split without benefit.
+ */
+ if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT))
+ return (0);
+
+ /*
+ * Check for pages with append-only workloads. A common application
+ * pattern is to have multiple threads frantically appending to the
+ * tree. We want to reconcile and evict this page, but we'd like to
+ * do it without making the appending threads wait. If we're not
+ * discarding the tree, check and see if it's worth doing a split to
+ * let the threads continue before doing eviction.
+ *
+ * Ignore anything other than large, dirty row-store leaf pages.
+ *
+ * XXX KEITH
+ * Need a better test for append-only workloads.
+ */
+ if (page->type != WT_PAGE_ROW_LEAF ||
+ page->memory_footprint < btree->maxmempage ||
+ !__wt_page_is_modified(page))
+ return (0);
+
+ /* Don't split a page that is pending a multi-block split. */
+ if (F_ISSET(page->modify, WT_PM_REC_MULTIBLOCK))
+ return (0);
+
+ /*
+ * There is no point splitting if the list is small, no deep items is
+ * our heuristic for that. (A 1/4 probability of adding a new skiplist
+ * level means there will be a new 6th level for roughly each 4KB of
+ * entries in the list. If we have at least two 6th level entries, the
+ * list is at least large enough to work with.)
+ *
+ * The following code requires at least two items on the insert list,
+ * this test serves the additional purpose of confirming that.
+ */
+#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(6, WT_SKIP_MAXDEPTH - 1)
+ ins_head = page->pg_row_entries == 0 ?
+ WT_ROW_INSERT_SMALLEST(page) :
+ WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
+ if (ins_head == NULL ||
+ ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH] == NULL ||
+ ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH] ==
+ ins_head->tail[WT_MIN_SPLIT_SKIPLIST_DEPTH])
+ return (0);
+
+ return (1);
+}
+
+/*
* __wt_page_can_evict --
* Check whether a page can be evicted.
*/
static inline int
-__wt_page_can_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int check_splits)
+__wt_page_can_evict(
+ WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags, int *inmem_splitp)
{
WT_BTREE *btree;
WT_PAGE_MODIFY *mod;
+ WT_TXN_GLOBAL *txn_global;
btree = S2BT(session);
mod = page->modify;
+ txn_global = &S2C(session)->txn_global;
+ if (inmem_splitp != NULL)
+ *inmem_splitp = 0;
/* Pages that have never been modified can always be evicted. */
if (mod == NULL)
@@ -974,11 +1048,24 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int check_splits)
* a transaction value, once that's globally visible, we know we can
* evict the created page.
*/
- if (check_splits && WT_PAGE_IS_INTERNAL(page) &&
+ if (LF_ISSET(WT_EVICT_CHECK_SPLITS) && WT_PAGE_IS_INTERNAL(page) &&
!__wt_txn_visible_all(session, mod->mod_split_txn))
return (0);
/*
+ * Allow for the splitting of pages when a checkpoint is underway only
+ * if the allow_splits flag has been passed, we know we are performing
+ * a checkpoint, the page is larger than the stated maximum and there
+ * has not already been a split on this page as the WT_PM_REC_MULTIBLOCK
+ * flag is unset.
+ */
+ if (__wt_page_can_split(session, page)) {
+ if (inmem_splitp != NULL)
+ *inmem_splitp = 1;
+ return (1);
+ }
+
+ /*
* If the file is being checkpointed, we can't evict dirty pages:
* if we write a page and free the previous version of the page, that
* previous version might be referenced by an internal page already
@@ -1017,10 +1104,12 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int check_splits)
/*
* If the page was recently split in-memory, don't force it out: we
- * hope an eviction thread will find it first.
+ * hope an eviction thread will find it first. The check here is
+ * similar to __wt_txn_visible_all, but ignores the checkpoints
+ * transaction.
*/
- if (check_splits &&
- !__wt_txn_visible_all(session, mod->inmem_split_txn))
+ if (LF_ISSET(WT_EVICT_CHECK_SPLITS) &&
+ TXNID_LE(txn_global->oldest_id, mod->inmem_split_txn))
return (0);
return (1);
@@ -1040,7 +1129,6 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref)
btree = S2BT(session);
page = ref->page;
- too_big = (page->memory_footprint > btree->maxmempage) ? 1 : 0;
/*
* Take some care with order of operations: if we release the hazard
@@ -1055,6 +1143,8 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref)
}
(void)WT_ATOMIC_ADD4(btree->evict_busy, 1);
+
+ too_big = (page->memory_footprint > btree->maxmempage) ? 1 : 0;
if ((ret = __wt_evict_page(session, ref)) == 0) {
if (too_big)
WT_STAT_FAST_CONN_INCR(session, cache_eviction_force);
@@ -1115,8 +1205,8 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
page = ref->page;
if (F_ISSET(btree, WT_BTREE_NO_EVICTION) ||
LF_ISSET(WT_READ_NO_EVICT) ||
- page->read_gen != WT_READGEN_OLDEST ||
- !__wt_page_can_evict(session, page, 1))
+ page->read_gen != WT_READGEN_OLDEST || !__wt_page_can_evict(
+ session, page, WT_EVICT_CHECK_SPLITS, NULL))
return (__wt_hazard_clear(session, page));
WT_RET_BUSY_OK(__wt_page_release_evict(session, ref));
diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i
index f952f1bf698..0c976800b38 100644
--- a/src/third_party/wiredtiger/src/include/cache.i
+++ b/src/third_party/wiredtiger/src/include/cache.i
@@ -148,7 +148,7 @@ __wt_session_can_wait(WT_SESSION_IMPL *session)
* highjack the thread for eviction.
*/
if (F_ISSET(session,
- WT_SESSION_NO_CACHE_CHECK | WT_SESSION_SCHEMA_LOCKED))
+ WT_SESSION_NO_CACHE_CHECK | WT_SESSION_LOCKED_SCHEMA))
return (0);
return (1);
@@ -170,7 +170,7 @@ __wt_cache_full_check(WT_SESSION_IMPL *session)
* block eviction), we don't want to highjack the thread for eviction.
*/
if (F_ISSET(session, WT_SESSION_NO_CACHE_CHECK |
- WT_SESSION_SCHEMA_LOCKED | WT_SESSION_HANDLE_LIST_LOCKED))
+ WT_SESSION_LOCKED_HANDLE_LIST | WT_SESSION_LOCKED_SCHEMA))
return (0);
/*
diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h
index 0121a1625c5..7a19a35c83c 100644
--- a/src/third_party/wiredtiger/src/include/connection.h
+++ b/src/third_party/wiredtiger/src/include/connection.h
@@ -124,6 +124,12 @@ struct __wt_named_extractor {
} while (0)
/*
+ * Default hash table size; use a prime number of buckets rather than assuming
+ * a good hash (Reference Sedgewick, Algorithms in C, "Hash Functions").
+ */
+#define WT_HASH_ARRAY_SIZE 509
+
+/*
* WT_CONNECTION_IMPL --
* Implementation of WT_CONNECTION
*/
@@ -184,7 +190,6 @@ struct __wt_connection_impl {
* URI.
*/
/* Locked: data handle hash array */
-#define WT_HASH_ARRAY_SIZE 512
SLIST_HEAD(__wt_dhhash, __wt_data_handle) dhhash[WT_HASH_ARRAY_SIZE];
/* Locked: data handle list */
SLIST_HEAD(__wt_dhandle_lh, __wt_data_handle) dhlh;
diff --git a/src/third_party/wiredtiger/src/include/dhandle.h b/src/third_party/wiredtiger/src/include/dhandle.h
index 034db30a0a2..22a0a2c1dd4 100644
--- a/src/third_party/wiredtiger/src/include/dhandle.h
+++ b/src/third_party/wiredtiger/src/include/dhandle.h
@@ -69,8 +69,7 @@ struct __wt_data_handle {
#define WT_DHANDLE_DISCARD 0x02 /* Discard on release */
#define WT_DHANDLE_DISCARD_FORCE 0x04 /* Force discard on release */
#define WT_DHANDLE_EXCLUSIVE 0x08 /* Need exclusive access */
-#define WT_DHANDLE_HAVE_REF 0x10 /* Already have ref */
-#define WT_DHANDLE_LOCK_ONLY 0x20 /* Handle only used as a lock */
-#define WT_DHANDLE_OPEN 0x40 /* Handle is open */
+#define WT_DHANDLE_LOCK_ONLY 0x10 /* Handle only used as a lock */
+#define WT_DHANDLE_OPEN 0x20 /* Handle is open */
uint32_t flags;
};
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 48bf792bcf5..a4810720d55 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -152,7 +152,7 @@ extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const ch
extern void __wt_split_stash_discard(WT_SESSION_IMPL *session);
extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session);
extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp);
-extern int __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp);
+extern int __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive);
extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst);
@@ -174,7 +174,7 @@ extern int __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page);
extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd, int is_remove);
extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session, WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep);
extern int __wt_update_alloc( WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep);
-extern WT_UPDATE *__wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd);
+extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd);
extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd);
extern int __wt_search_insert( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key);
extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, int insert);
@@ -226,13 +226,13 @@ extern WT_THREAD_RET __wt_cache_pool_server(void *arg);
extern int __wt_checkpoint_server_create(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_checkpoint_server_destroy(WT_SESSION_IMPL *session);
extern int __wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize);
-extern int __wt_conn_dhandle_find(WT_SESSION_IMPL *session, const char *name, const char *ckpt, uint32_t flags);
+extern int __wt_conn_dhandle_find( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint);
extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, int final, int force);
-extern int __wt_conn_btree_get(WT_SESSION_IMPL *session, const char *name, const char *ckpt, const char *cfg[], uint32_t flags);
+extern int __wt_conn_btree_open( WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags);
extern int __wt_conn_btree_apply(WT_SESSION_IMPL *session, int apply_checkpoints, const char *uri, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]);
extern int __wt_conn_btree_apply_single_ckpt(WT_SESSION_IMPL *session, const char *uri, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]);
extern int __wt_conn_btree_apply_single(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]);
-extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *name, int force);
+extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *uri, int force);
extern int __wt_conn_dhandle_discard_single(WT_SESSION_IMPL *session, int final, int force);
extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session);
extern int __wt_connection_init(WT_CONNECTION_IMPL *conn);
@@ -314,7 +314,7 @@ extern void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session);
extern int __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_server);
extern int __wt_cache_wait(WT_SESSION_IMPL *session, int full);
extern void __wt_cache_dump(WT_SESSION_IMPL *session);
-extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive);
+extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags);
extern void __wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn);
extern int __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, int *rec);
@@ -363,8 +363,12 @@ extern int __wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
extern int64_t __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size);
extern int __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
extern int __wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize);
+extern int __wt_clsm_request_switch(WT_CURSOR_LSM *clsm);
+extern int __wt_clsm_await_switch(WT_CURSOR_LSM *clsm);
extern int __wt_clsm_init_merge( WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks);
+extern int __wt_clsm_close(WT_CURSOR *cursor);
extern int __wt_clsm_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_clsm_open_bulk(WT_CURSOR_LSM *clsm, const char *cfg[]);
extern int __wt_lsm_manager_config(WT_SESSION_IMPL *session, const char **cfg);
extern int __wt_lsm_manager_reconfig(WT_SESSION_IMPL *session, const char **cfg);
extern int __wt_lsm_manager_start(WT_SESSION_IMPL *session);
@@ -444,7 +448,6 @@ extern int __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, voi
extern int __wt_realloc(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp);
extern int __wt_realloc_aligned(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp);
extern int __wt_strndup(WT_SESSION_IMPL *session, const void *str, size_t len, void *retp);
-extern int __wt_strdup(WT_SESSION_IMPL *session, const char *str, void *retp);
extern void __wt_free_int(WT_SESSION_IMPL *session, const void *p_arg);
extern int __wt_dirlist(WT_SESSION_IMPL *session, const char *dir, const char *prefix, uint32_t flags, char ***dirlist, u_int *countp);
extern int __wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp);
@@ -574,10 +577,10 @@ extern int __wt_session_copy_values(WT_SESSION_IMPL *session);
extern int __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
extern int __wt_session_create_strip(WT_SESSION *wt_session, const char *v1, const char *v2, char **value_ret);
extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, int uses_dhandles, int open_metadata, WT_SESSION_IMPL **sessionp);
-extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, WT_SESSION_IMPL **sessionp);
+extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, int open_metadata, WT_SESSION_IMPL **sessionp);
extern int __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, int *skip);
extern int __wt_session_compact( WT_SESSION *wt_session, const char *uri, const char *config);
-extern int __wt_session_lock_dhandle(WT_SESSION_IMPL *session, uint32_t flags, int *deadp);
+extern int __wt_session_lock_dhandle( WT_SESSION_IMPL *session, uint32_t flags, int *is_deadp);
extern int __wt_session_release_btree(WT_SESSION_IMPL *session);
extern int __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], uint32_t flags);
extern void __wt_session_close_cache(WT_SESSION_IMPL *session);
@@ -655,8 +658,8 @@ extern void __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats);
extern void __wt_stat_refresh_connection_stats(void *stats_arg);
extern int WT_CDECL __wt_txnid_cmp(const void *v1, const void *v2);
extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session);
-extern void __wt_txn_update_oldest(WT_SESSION_IMPL *session);
-extern void __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot);
+extern void __wt_txn_get_snapshot(WT_SESSION_IMPL *session);
+extern void __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force);
extern int __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]);
extern void __wt_txn_release(WT_SESSION_IMPL *session);
extern int __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]);
diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h
index 99c77c94f49..95aa6f9809d 100644
--- a/src/third_party/wiredtiger/src/include/flags.h
+++ b/src/third_party/wiredtiger/src/include/flags.h
@@ -18,6 +18,9 @@
#define WT_CONN_SERVER_SWEEP 0x00002000
#define WT_CONN_WAS_BACKUP 0x00004000
#define WT_EVICTING 0x00000001
+#define WT_EVICT_CHECK_SPLITS 0x00000001
+#define WT_EVICT_EXCLUSIVE 0x00000002
+#define WT_EVICT_INMEM_SPLIT 0x00000004
#define WT_FILE_TYPE_CHECKPOINT 0x00000001
#define WT_FILE_TYPE_DATA 0x00000002
#define WT_FILE_TYPE_DIRECTORY 0x00000004
@@ -42,18 +45,19 @@
#define WT_SESSION_CAN_WAIT 0x00000001
#define WT_SESSION_CLEAR_EVICT_WALK 0x00000002
#define WT_SESSION_DISCARD_FORCE 0x00000004
-#define WT_SESSION_HANDLE_LIST_LOCKED 0x00000008
-#define WT_SESSION_INTERNAL 0x00000010
-#define WT_SESSION_LOGGING_INMEM 0x00000020
-#define WT_SESSION_NO_CACHE 0x00000040
-#define WT_SESSION_NO_CACHE_CHECK 0x00000080
-#define WT_SESSION_NO_DATA_HANDLES 0x00000100
-#define WT_SESSION_NO_LOGGING 0x00000200
-#define WT_SESSION_NO_SCHEMA_LOCK 0x00000400
-#define WT_SESSION_SALVAGE_CORRUPT_OK 0x00000800
-#define WT_SESSION_SCHEMA_LOCKED 0x00001000
-#define WT_SESSION_SERVER_ASYNC 0x00002000
-#define WT_SESSION_TABLE_LOCKED 0x00004000
+#define WT_SESSION_INTERNAL 0x00000008
+#define WT_SESSION_LOCKED_CHECKPOINT 0x00000010
+#define WT_SESSION_LOCKED_HANDLE_LIST 0x00000020
+#define WT_SESSION_LOCKED_SCHEMA 0x00000040
+#define WT_SESSION_LOCKED_TABLE 0x00000080
+#define WT_SESSION_LOGGING_INMEM 0x00000100
+#define WT_SESSION_NO_CACHE 0x00000200
+#define WT_SESSION_NO_CACHE_CHECK 0x00000400
+#define WT_SESSION_NO_DATA_HANDLES 0x00000800
+#define WT_SESSION_NO_LOGGING 0x00001000
+#define WT_SESSION_NO_SCHEMA_LOCK 0x00002000
+#define WT_SESSION_SALVAGE_CORRUPT_OK 0x00004000
+#define WT_SESSION_SERVER_ASYNC 0x00008000
#define WT_SKIP_UPDATE_ERR 0x00000002
#define WT_SKIP_UPDATE_RESTORE 0x00000004
#define WT_SYNC_CHECKPOINT 0x00000001
diff --git a/src/third_party/wiredtiger/src/include/hardware.h b/src/third_party/wiredtiger/src/include/hardware.h
index 1c3fb287e86..e3c098826d0 100644
--- a/src/third_party/wiredtiger/src/include/hardware.h
+++ b/src/third_party/wiredtiger/src/include/hardware.h
@@ -38,7 +38,7 @@
} while (0)
#define F_CAS_ATOMIC(p, mask, ret) do { \
- uint8_t __orig; \
+ uint8_t __orig; \
ret = 0; \
do { \
__orig = (p)->flags_atomic; \
diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h
index ebe3a00b19f..f4f7361b53f 100644
--- a/src/third_party/wiredtiger/src/include/log.h
+++ b/src/third_party/wiredtiger/src/include/log.h
@@ -51,7 +51,7 @@
* Compare 2 LSNs, return -1 if lsn0 < lsn1, 0 if lsn0 == lsn1
* and 1 if lsn0 > lsn1.
*/
-#define LOG_CMP(lsn1, lsn2) \
+#define WT_LOG_CMP(lsn1, lsn2) \
((lsn1)->file != (lsn2)->file ? \
((lsn1)->file < (lsn2)->file ? -1 : 1) : \
((lsn1)->offset != (lsn2)->offset ? \
diff --git a/src/third_party/wiredtiger/src/include/lsm.h b/src/third_party/wiredtiger/src/include/lsm.h
index aa1d797e3b5..dc6a0d7e027 100644
--- a/src/third_party/wiredtiger/src/include/lsm.h
+++ b/src/third_party/wiredtiger/src/include/lsm.h
@@ -57,15 +57,16 @@ struct __wt_cursor_lsm {
u_int update_count; /* Updates performed. */
-#define WT_CLSM_ACTIVE 0x01 /* Incremented the session count */
-#define WT_CLSM_ITERATE_NEXT 0x02 /* Forward iteration */
-#define WT_CLSM_ITERATE_PREV 0x04 /* Backward iteration */
-#define WT_CLSM_MERGE 0x08 /* Merge cursor, don't update */
-#define WT_CLSM_MINOR_MERGE 0x10 /* Minor merge, include tombstones */
-#define WT_CLSM_MULTIPLE 0x20 /* Multiple cursors have values for the
+#define WT_CLSM_ACTIVE 0x001 /* Incremented the session count */
+#define WT_CLSM_BULK 0x002 /* Open for snapshot isolation */
+#define WT_CLSM_ITERATE_NEXT 0x004 /* Forward iteration */
+#define WT_CLSM_ITERATE_PREV 0x008 /* Backward iteration */
+#define WT_CLSM_MERGE 0x010 /* Merge cursor, don't update */
+#define WT_CLSM_MINOR_MERGE 0x020 /* Minor merge, include tombstones */
+#define WT_CLSM_MULTIPLE 0x040 /* Multiple cursors have values for the
current key */
-#define WT_CLSM_OPEN_READ 0x40 /* Open for reads */
-#define WT_CLSM_OPEN_SNAPSHOT 0x80 /* Open for snapshot isolation */
+#define WT_CLSM_OPEN_READ 0x080 /* Open for reads */
+#define WT_CLSM_OPEN_SNAPSHOT 0x100 /* Open for snapshot isolation */
uint32_t flags;
};
diff --git a/src/third_party/wiredtiger/src/include/misc.i b/src/third_party/wiredtiger/src/include/misc.i
index 53f08b3cbeb..98facff02b9 100644
--- a/src/third_party/wiredtiger/src/include/misc.i
+++ b/src/third_party/wiredtiger/src/include/misc.i
@@ -7,6 +7,17 @@
*/
/*
+ * __wt_strdup --
+ * ANSI strdup function.
+ */
+static inline int
+__wt_strdup(WT_SESSION_IMPL *session, const char *str, void *retp)
+{
+ return (__wt_strndup(
+ session, str, (str == NULL) ? 0 : strlen(str), retp));
+}
+
+/*
* __wt_verbose --
* Verbose message.
*/
diff --git a/src/third_party/wiredtiger/src/include/schema.h b/src/third_party/wiredtiger/src/include/schema.h
index 5d524534b39..8f4884281cd 100644
--- a/src/third_party/wiredtiger/src/include/schema.h
+++ b/src/third_party/wiredtiger/src/include/schema.h
@@ -95,12 +95,20 @@ struct __wt_table {
} while (0)
/*
- * WT_WITH_DHANDLE_LOCK --
+ * WT_WITH_CHECKPOINT_LOCK --
+ * Acquire the checkpoint lock, perform an operation, drop the lock.
+ */
+#define WT_WITH_CHECKPOINT_LOCK(session, op) \
+ WT_WITH_LOCK(session, \
+ &S2C(session)->checkpoint_lock, WT_SESSION_LOCKED_CHECKPOINT, op)
+
+/*
+ * WT_WITH_HANDLE_LIST_LOCK --
* Acquire the data handle list lock, perform an operation, drop the lock.
*/
-#define WT_WITH_DHANDLE_LOCK(session, op) \
+#define WT_WITH_HANDLE_LIST_LOCK(session, op) \
WT_WITH_LOCK(session, \
- &S2C(session)->dhandle_lock, WT_SESSION_HANDLE_LIST_LOCKED, op)
+ &S2C(session)->dhandle_lock, WT_SESSION_LOCKED_HANDLE_LIST, op)
/*
* WT_WITH_SCHEMA_LOCK --
* Acquire the schema lock, perform an operation, drop the lock.
@@ -109,61 +117,61 @@ struct __wt_table {
*/
#define WT_WITH_SCHEMA_LOCK(session, op) do { \
WT_ASSERT(session, \
- F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) || \
- !F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED | \
- WT_SESSION_NO_SCHEMA_LOCK | WT_SESSION_TABLE_LOCKED)); \
+ F_ISSET(session, WT_SESSION_LOCKED_SCHEMA) || \
+ !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST | \
+ WT_SESSION_NO_SCHEMA_LOCK | WT_SESSION_LOCKED_TABLE)); \
+ WT_WITH_LOCK(session, \
+ &S2C(session)->schema_lock, WT_SESSION_LOCKED_SCHEMA, op); \
+} while (0)
+
+/*
+ * WT_WITH_TABLE_LOCK --
+ * Acquire the table lock, perform an operation, drop the lock.
+ */
+#define WT_WITH_TABLE_LOCK(session, op) do { \
+ WT_ASSERT(session, \
+ F_ISSET(session, WT_SESSION_LOCKED_TABLE) || \
+ !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); \
WT_WITH_LOCK(session, \
- &S2C(session)->schema_lock, WT_SESSION_SCHEMA_LOCKED, op); \
+ &S2C(session)->table_lock, WT_SESSION_LOCKED_TABLE, op); \
} while (0)
/*
* WT_WITHOUT_LOCKS --
- * Drop the schema lock and/or the handle list lock, perform an operation,
+ * Drop the handle, table and/or schema locks, perform an operation,
* re-acquire the lock(s).
*/
#define WT_WITHOUT_LOCKS(session, op) do { \
WT_CONNECTION_IMPL *__conn = S2C(session); \
int __handle_locked = \
- F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED);\
+ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST);\
int __table_locked = \
- F_ISSET(session, WT_SESSION_TABLE_LOCKED); \
+ F_ISSET(session, WT_SESSION_LOCKED_TABLE); \
int __schema_locked = \
- F_ISSET(session, WT_SESSION_SCHEMA_LOCKED); \
+ F_ISSET(session, WT_SESSION_LOCKED_SCHEMA); \
if (__handle_locked) { \
- F_CLR(session, WT_SESSION_HANDLE_LIST_LOCKED); \
+ F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST); \
__wt_spin_unlock(session, &__conn->dhandle_lock);\
} \
if (__table_locked) { \
- F_CLR(session, WT_SESSION_TABLE_LOCKED); \
+ F_CLR(session, WT_SESSION_LOCKED_TABLE); \
__wt_spin_unlock(session, &__conn->table_lock);\
} \
if (__schema_locked) { \
- F_CLR(session, WT_SESSION_SCHEMA_LOCKED); \
+ F_CLR(session, WT_SESSION_LOCKED_SCHEMA); \
__wt_spin_unlock(session, &__conn->schema_lock);\
} \
op; \
if (__schema_locked) { \
__wt_spin_lock(session, &__conn->schema_lock); \
- F_SET(session, WT_SESSION_SCHEMA_LOCKED); \
+ F_SET(session, WT_SESSION_LOCKED_SCHEMA); \
} \
if (__table_locked) { \
__wt_spin_lock(session, &__conn->table_lock); \
- F_SET(session, WT_SESSION_TABLE_LOCKED); \
+ F_SET(session, WT_SESSION_LOCKED_TABLE); \
} \
if (__handle_locked) { \
__wt_spin_lock(session, &__conn->dhandle_lock); \
- F_SET(session, WT_SESSION_HANDLE_LIST_LOCKED); \
+ F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST); \
} \
} while (0)
-
-/*
- * WT_WITH_TABLE_LOCK --
- * Acquire the table lock, perform an operation, drop the lock.
- */
-#define WT_WITH_TABLE_LOCK(session, op) do { \
- WT_ASSERT(session, \
- F_ISSET(session, WT_SESSION_TABLE_LOCKED) || \
- !F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED)); \
- WT_WITH_LOCK(session, \
- &S2C(session)->table_lock, WT_SESSION_TABLE_LOCKED, op); \
-} while (0)
diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i
index e3581ae1c39..9e6b0f7916c 100644
--- a/src/third_party/wiredtiger/src/include/serial.i
+++ b/src/third_party/wiredtiger/src/include/serial.i
@@ -255,15 +255,18 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
* obsolete check at a time, and to protect updates from disappearing
* under reconciliation.
*/
- if (upd->next != NULL) {
+ if (upd->next != NULL &&
+ __wt_txn_visible_all(session, page->modify->obsolete_check_txn)) {
F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret);
/* If we can't lock it, don't scan, that's okay. */
if (ret != 0)
return (0);
- obsolete = __wt_update_obsolete_check(session, upd->next);
+ obsolete = __wt_update_obsolete_check(session, page, upd->next);
F_CLR_ATOMIC(page, WT_PAGE_SCANNING);
- if (obsolete != NULL)
+ if (obsolete != NULL) {
+ page->modify->obsolete_check_txn = WT_TXN_NONE;
__wt_update_obsolete_free(session, page, obsolete);
+ }
}
return (0);
diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h
index 927ab09d5f9..62f565c0535 100644
--- a/src/third_party/wiredtiger/src/include/txn.h
+++ b/src/third_party/wiredtiger/src/include/txn.h
@@ -42,9 +42,6 @@ struct __wt_txn_global {
*/
volatile uint64_t oldest_id;
- /* The oldest session found in the last scan. */
- uint32_t oldest_session;
-
/* Count of scanning threads, or -1 for exclusive access. */
volatile int32_t scan_count;
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
index 4141d829f1d..b1cfba4257d 100644
--- a/src/third_party/wiredtiger/src/include/txn.i
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -139,20 +139,20 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
txn = &session->txn;
- /*
- * Eviction only sees globally visible updates, or if there is a
- * checkpoint transaction running, use its transaction.
- */
- if (txn->isolation == TXN_ISO_EVICTION)
- return (__wt_txn_visible_all(session, id));
+ /* Changes with no associated transaction are always visible. */
+ if (id == WT_TXN_NONE)
+ return (1);
/* Nobody sees the results of aborted transactions. */
if (id == WT_TXN_ABORTED)
return (0);
- /* Changes with no associated transaction are always visible. */
- if (id == WT_TXN_NONE)
- return (1);
+ /*
+ * Eviction only sees globally visible updates, or if there is a
+ * checkpoint transaction running, use its transaction.
+ */
+ if (txn->isolation == TXN_ISO_EVICTION)
+ return (__wt_txn_visible_all(session, id));
/*
* Read-uncommitted transactions see all other changes.
@@ -222,7 +222,14 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
if (txn->isolation == TXN_ISO_SNAPSHOT) {
if (session->ncursors > 0)
WT_RET(__wt_session_copy_values(session));
- __wt_txn_refresh(session, 1);
+
+ /*
+ * We're about to allocate a snapshot: if we need to block for
+ * eviction, it's better to do it beforehand.
+ */
+ WT_RET(__wt_cache_full_check(session));
+
+ __wt_txn_get_snapshot(session);
}
F_SET(txn, TXN_RUNNING);
@@ -429,7 +436,7 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session)
if (txn->isolation != TXN_ISO_READ_UNCOMMITTED &&
!F_ISSET(txn, TXN_HAS_SNAPSHOT))
- __wt_txn_refresh(session, 1);
+ __wt_txn_get_snapshot(session);
}
/*
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index 05e92d313f2..6037cdeee96 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -996,9 +996,10 @@ struct __wt_session {
* builtin support for \c "bzip2"\, \c "snappy"\, \c "lz4" or \c "zlib"
* compression\, these names are also available. See @ref compression
* for more information., a string; default \c none.}
- * @config{cache_resident, do not ever evict the object's pages; see
- * @ref tuning_cache_resident for more information., a boolean flag;
- * default \c false.}
+ * @config{cache_resident, do not ever evict the object's pages from
+ * cache. Not compatible with LSM tables; see @ref
+ * tuning_cache_resident for more information., a boolean flag; default
+ * \c false.}
* @config{checksum, configure block checksums; permitted values are
* <code>on</code> (checksum all blocks)\, <code>off</code> (checksum no
* blocks) and <code>uncompresssed</code> (checksum only blocks which
diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c
index 27be3dfb07c..6d64cd00c2a 100644
--- a/src/third_party/wiredtiger/src/log/log.c
+++ b/src/third_party/wiredtiger/src/log/log.c
@@ -973,7 +973,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep)
* be holes in the log file.
*/
WT_STAT_FAST_CONN_INCR(session, log_release_write_lsn);
- while (LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0) {
+ while (WT_LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0) {
if (++yield_count < 1000)
__wt_yield();
else
@@ -1036,7 +1036,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep)
* Sync the log file if needed.
*/
if (F_ISSET(slot, SLOT_SYNC) &&
- LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
+ WT_LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
WT_ERR(__wt_verbose(session, WT_VERB_LOG,
"log_release: sync log %s", log->log_fh->name));
WT_STAT_FAST_CONN_INCR(session, log_sync);
@@ -1485,7 +1485,7 @@ advance:
/* Truncate if we're in recovery. */
if (LF_ISSET(WT_LOGSCAN_RECOVER) &&
- LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0)
+ WT_LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0)
WT_ERR(__log_truncate(session,
&rd_lsn, WT_LOG_FILENAME, 0));
@@ -1758,13 +1758,13 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
WT_ERR(__wt_log_slot_free(session, myslot.slot));
} else if (LF_ISSET(WT_LOG_FSYNC)) {
/* Wait for our writes to reach disk */
- while (LOG_CMP(&log->sync_lsn, &lsn) <= 0 &&
+ while (WT_LOG_CMP(&log->sync_lsn, &lsn) <= 0 &&
myslot.slot->slot_error == 0)
(void)__wt_cond_wait(
session, log->log_sync_cond, 10000);
} else if (LF_ISSET(WT_LOG_FLUSH)) {
/* Wait for our writes to reach the OS */
- while (LOG_CMP(&log->write_lsn, &lsn) <= 0 &&
+ while (WT_LOG_CMP(&log->write_lsn, &lsn) <= 0 &&
myslot.slot->slot_error == 0)
(void)__wt_cond_wait(
session, log->log_write_cond, 10000);
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
index 7c9ac35d489..20d776bcfb9 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
@@ -20,11 +20,11 @@ static int __clsm_open_cursors(WT_CURSOR_LSM *, int, u_int, uint32_t);
static int __clsm_reset_cursors(WT_CURSOR_LSM *, WT_CURSOR *);
/*
- * __clsm_request_switch --
+ * __wt_clsm_request_switch --
* Request an LSM tree switch for a cursor operation.
*/
-static inline int
-__clsm_request_switch(WT_CURSOR_LSM *clsm)
+int
+__wt_clsm_request_switch(WT_CURSOR_LSM *clsm)
{
WT_DECL_RET;
WT_LSM_TREE *lsm_tree;
@@ -44,9 +44,9 @@ __clsm_request_switch(WT_CURSOR_LSM *clsm)
if (lsm_tree->nchunks == 0 ||
(clsm->dsk_gen == lsm_tree->dsk_gen &&
!F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))) {
+ F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
ret = __wt_lsm_manager_push_entry(
session, WT_LSM_WORK_SWITCH, 0, lsm_tree);
- F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
}
WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
}
@@ -55,6 +55,41 @@ __clsm_request_switch(WT_CURSOR_LSM *clsm)
}
/*
+ * __wt_clsm_await_switch --
+ * Wait for a switch to have completed in the LSM tree
+ */
+int
+__wt_clsm_await_switch(WT_CURSOR_LSM *clsm)
+{
+ WT_LSM_TREE *lsm_tree;
+ WT_SESSION_IMPL *session;
+ int waited;
+
+ lsm_tree = clsm->lsm_tree;
+ session = (WT_SESSION_IMPL *)clsm->iface.session;
+
+ /*
+ * If there is no primary chunk, or a chunk has overflowed the hard
+ * limit, which either means a worker thread has fallen behind or there
+ * has just been a user-level checkpoint, wait until the tree changes.
+ *
+ * We used to switch chunks in the application thread here, but that is
+ * problematic because there is a transaction in progress and it could
+ * roll back, leaving the metadata inconsistent.
+ */
+ for (waited = 0;
+ lsm_tree->nchunks == 0 ||
+ clsm->dsk_gen == lsm_tree->dsk_gen;
+ ++waited) {
+ if (waited % 1000 == 0)
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_SWITCH, 0, lsm_tree));
+ __wt_sleep(0, 10);
+ }
+ return (0);
+}
+
+/*
* __clsm_enter_update --
* Make sure an LSM cursor is ready to perform an update.
*/
@@ -65,7 +100,7 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm)
WT_LSM_CHUNK *primary_chunk;
WT_LSM_TREE *lsm_tree;
WT_SESSION_IMPL *session;
- int hard_limit, have_primary, ovfl, waited;
+ int hard_limit, have_primary, ovfl;
lsm_tree = clsm->lsm_tree;
ovfl = 0;
@@ -109,30 +144,13 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm)
}
/* Request a switch. */
- WT_RET(__clsm_request_switch(clsm));
+ WT_RET(__wt_clsm_request_switch(clsm));
/* If we only overflowed the soft limit, we're done. */
if (have_primary && !hard_limit)
return (0);
- /*
- * If there is no primary chunk, or it has overflowed the hard limit,
- * which either means a worker thread has fallen behind or there has
- * just been a user-level checkpoint, wait until the tree changes.
- *
- * We used to switch chunks in the application thread if we got to
- * here, but that is problematic because there is a transaction in
- * progress and it could roll back, leaving the metadata inconsistent.
- */
- for (waited = 0;
- lsm_tree->nchunks == 0 ||
- clsm->dsk_gen == lsm_tree->dsk_gen;
- ++waited) {
- if (waited % 1000 == 0)
- WT_RET(__wt_lsm_manager_push_entry(
- session, WT_LSM_WORK_SWITCH, 0, lsm_tree));
- __wt_sleep(0, 10);
- }
+ WT_RET(__wt_clsm_await_switch(clsm));
return (0);
}
@@ -1424,11 +1442,11 @@ err: __clsm_leave(clsm);
}
/*
- * __clsm_close --
+ * __wt_clsm_close --
* WT_CURSOR->close method for the LSM cursor type.
*/
-static int
-__clsm_close(WT_CURSOR *cursor)
+int
+__wt_clsm_close(WT_CURSOR *cursor)
{
WT_CURSOR_LSM *clsm;
WT_DECL_RET;
@@ -1482,14 +1500,17 @@ __wt_clsm_open(WT_SESSION_IMPL *session,
__clsm_update, /* update */
__clsm_remove, /* remove */
__wt_cursor_reconfigure, /* reconfigure */
- __clsm_close); /* close */
+ __wt_clsm_close); /* close */
WT_CURSOR *cursor;
WT_CURSOR_LSM *clsm;
WT_DECL_RET;
WT_LSM_TREE *lsm_tree;
+ int bulk;
+ bulk = 0;
clsm = NULL;
cursor = NULL;
+ lsm_tree = NULL;
if (!WT_PREFIX_MATCH(uri, "lsm:"))
return (EINVAL);
@@ -1499,9 +1520,21 @@ __wt_clsm_open(WT_SESSION_IMPL *session,
WT_RET_MSG(session, EINVAL,
"LSM does not support opening by checkpoint");
+ WT_RET(__wt_config_gets_def(session, cfg, "bulk", 0, &cval));
+ if (cval.val != 0)
+ bulk = 1;
+
/* Get the LSM tree. */
- WT_WITH_DHANDLE_LOCK(session,
- ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree));
+ WT_WITH_HANDLE_LIST_LOCK(session,
+ ret = __wt_lsm_tree_get(session, uri, bulk, &lsm_tree));
+ /*
+ * Check whether the exclusive open for a bulk load succeeded, and
+ * if it did ensure that it's safe to bulk load into the tree.
+ */
+ if (bulk && (ret == EBUSY || (ret == 0 && lsm_tree->nchunks > 1)))
+ WT_ERR_MSG(session, EINVAL,
+ "bulk-load is only supported on newly created LSM trees");
+ /* Flag any errors from the tree get. */
WT_RET(ret);
WT_ERR(__wt_calloc_one(session, &clsm));
@@ -1524,9 +1557,20 @@ __wt_clsm_open(WT_SESSION_IMPL *session,
WT_STATIC_ASSERT(offsetof(WT_CURSOR_LSM, iface) == 0);
WT_ERR(__wt_cursor_init(cursor, cursor->uri, owner, cfg, cursorp));
+ if (bulk)
+ WT_ERR(__wt_clsm_open_bulk(clsm, cfg));
+
if (0) {
err: if (clsm != NULL)
- WT_TRET(__clsm_close(cursor));
+ WT_TRET(__wt_clsm_close(cursor));
+ else if (lsm_tree != NULL)
+ __wt_lsm_tree_release(session, lsm_tree);
+
+ /*
+	 * The bulk cursor is opened after *cursorp has already been set;
+	 * clear the caller's cursor pointer on this error path.
+ */
+ *cursorp = NULL;
}
return (ret);
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c
new file mode 100644
index 00000000000..6b51a070e47
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c
@@ -0,0 +1,116 @@
+/*-
+ * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __clsm_close_bulk --
+ * WT_CURSOR->close method for LSM bulk cursors.
+ */
+static int
+__clsm_close_bulk(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_LSM_TREE *lsm_tree;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+ lsm_tree = clsm->lsm_tree;
+ F_SET(lsm_tree->chunk[0], WT_LSM_CHUNK_ONDISK);
+
+ WT_RET(__wt_clsm_close(cursor));
+ return (0);
+}
+/*
+ * __clsm_insert_bulk --
+ * WT_CURSOR->insert method for LSM bulk cursors.
+ */
+static int
+__clsm_insert_bulk(WT_CURSOR *cursor)
+{
+ WT_CURSOR *bulk_cursor;
+ WT_CURSOR_LSM *clsm;
+ WT_LSM_TREE *lsm_tree;
+ WT_SESSION_IMPL *session;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+ lsm_tree = clsm->lsm_tree;
+ session = (WT_SESSION_IMPL *)clsm->iface.session;
+
+ WT_ASSERT(session, lsm_tree->nchunks == 1 && clsm->nchunks == 1);
+ ++lsm_tree->chunk[0]->count;
+ bulk_cursor = *clsm->cursors;
+ bulk_cursor->set_key(bulk_cursor, &cursor->key);
+ bulk_cursor->set_value(bulk_cursor, &cursor->value);
+ WT_RET(bulk_cursor->insert(bulk_cursor));
+
+ return (0);
+}
+
+/*
+ * __wt_clsm_open_bulk --
+ * WT_SESSION->open_cursor method for LSM bulk cursors.
+ */
+int
+__wt_clsm_open_bulk(WT_CURSOR_LSM *clsm, const char *cfg[])
+{
+ WT_CURSOR *cursor, *bulk_cursor;
+ WT_LSM_TREE *lsm_tree;
+ WT_SESSION_IMPL *session;
+
+ bulk_cursor = NULL;
+ cursor = &clsm->iface;
+ lsm_tree = clsm->lsm_tree;
+ session = (WT_SESSION_IMPL *)clsm->iface.session;
+
+ F_SET(clsm, WT_CLSM_BULK);
+
+ /* Bulk cursors are limited to insert and close. */
+ __wt_cursor_set_notsup(cursor);
+ cursor->insert = __clsm_insert_bulk;
+ cursor->close = __clsm_close_bulk;
+
+	/* Set up the first chunk in the tree. */
+ WT_RET(__wt_clsm_request_switch(clsm));
+ WT_RET(__wt_clsm_await_switch(clsm));
+
+ /*
+ * Grab and release the LSM tree lock to ensure that the first chunk
+ * has been fully created before proceeding. We have the LSM tree
+ * open exclusive, so that saves us from needing the lock generally.
+ */
+ WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
+ WT_RET(__wt_lsm_tree_readunlock(session, lsm_tree));
+
+	/*
+	 * Allocate the per-chunk arrays for a single chunk. The bulk cursor
+	 * is not a regular LSM chunk cursor, but it uses the standard
+	 * storage locations; allocating the (unused) bloom filter slot
+	 * keeps cleanup simple. Cleaned up by cursor close on error.
+	 */
+ WT_RET(__wt_calloc_one(session, &clsm->blooms));
+ clsm->bloom_alloc = 1;
+ WT_RET(__wt_calloc_one(session, &clsm->cursors));
+ clsm->cursor_alloc = 1;
+ clsm->nchunks = 1;
+
+ /*
+ * Open a bulk cursor on the first chunk in the tree - take a read
+ * lock on the LSM tree while we are opening the chunk, to ensure
+ * that the first chunk has been fully created before we succeed.
+ * Pass through the application config to ensure the tree is open
+ * for bulk access.
+ */
+ WT_RET(__wt_open_cursor(session,
+ lsm_tree->chunk[0]->uri, &clsm->iface, cfg, &bulk_cursor));
+ clsm->cursors[0] = bulk_cursor;
+ /* LSM cursors are always raw */
+ F_SET(bulk_cursor, WT_CURSTD_RAW);
+
+ return (0);
+}
+
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c
index 3d9fc27d1d2..0533e628601 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_manager.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c
@@ -423,7 +423,7 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session)
if (TAILQ_EMPTY(&conn->lsmqh))
continue;
__wt_spin_lock(session, &conn->dhandle_lock);
- F_SET(session, WT_SESSION_HANDLE_LIST_LOCKED);
+ F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST);
dhandle_locked = 1;
TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) {
if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
@@ -483,13 +483,13 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session)
}
}
__wt_spin_unlock(session, &conn->dhandle_lock);
- F_CLR(session, WT_SESSION_HANDLE_LIST_LOCKED);
+ F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST);
dhandle_locked = 0;
}
err: if (dhandle_locked) {
__wt_spin_unlock(session, &conn->dhandle_lock);
- F_CLR(session, WT_SESSION_HANDLE_LIST_LOCKED);
+ F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST);
}
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_stat.c b/src/third_party/wiredtiger/src/lsm/lsm_stat.c
index 5398982aef4..bc694000900 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_stat.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_stat.c
@@ -32,7 +32,7 @@ __curstat_lsm_init(
"checkpoint=" WT_CHECKPOINT, NULL, NULL };
locked = 0;
- WT_WITH_DHANDLE_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session,
ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree));
WT_RET(ret);
WT_ERR(__wt_scr_alloc(session, 0, &uribuf));
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
index cce49984f43..439837e96be 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
@@ -29,7 +29,7 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int final)
/* We may be destroying an lsm_tree before it was added. */
if (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN)) {
WT_ASSERT(session, final ||
- F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED));
+ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
TAILQ_REMOVE(&S2C(session)->lsmqh, lsm_tree, q);
}
@@ -248,7 +248,7 @@ int
__wt_lsm_tree_setup_chunk(
WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
{
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA));
WT_RET(__wt_epoch(session, &chunk->create_ts));
WT_RET(__wt_lsm_tree_chunk_name(
@@ -307,7 +307,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session,
char *tmpconfig;
/* If the tree is open, it already exists. */
- WT_WITH_DHANDLE_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session,
ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree));
if (ret == 0) {
__wt_lsm_tree_release(session, lsm_tree);
@@ -348,6 +348,11 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session,
WT_ERR(__wt_strndup(
session, cval.str, cval.len, &lsm_tree->collator_name));
+ WT_ERR(__wt_config_gets(session, cfg, "cache_resident", &cval));
+ if (cval.val != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "The cache_resident flag is not compatible with LSM");
+
WT_ERR(__wt_config_gets(session, cfg, "lsm.auto_throttle", &cval));
if (cval.val)
F_SET(lsm_tree, WT_LSM_TREE_THROTTLE);
@@ -429,7 +434,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session,
* tracking macros handle cleaning up on failure.
*/
if (ret == 0)
- WT_WITH_DHANDLE_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session,
ret = __lsm_tree_open(session, uri, &lsm_tree));
if (ret == 0)
__wt_lsm_tree_release(session, lsm_tree);
@@ -454,7 +459,7 @@ __lsm_tree_find(WT_SESSION_IMPL *session,
{
WT_LSM_TREE *lsm_tree;
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED));
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
/* See if the tree is already open. */
TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q)
@@ -548,7 +553,7 @@ __lsm_tree_open(WT_SESSION_IMPL *session, const char *uri, WT_LSM_TREE **treep)
conn = S2C(session);
lsm_tree = NULL;
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED));
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
/* Start the LSM manager thread if it isn't running. */
if (WT_ATOMIC_CAS4(conn->lsm_manager.lsm_workers, 0, 1))
@@ -608,7 +613,7 @@ __wt_lsm_tree_get(WT_SESSION_IMPL *session,
{
WT_DECL_RET;
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED));
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
ret = __lsm_tree_find(session, uri, exclusive, treep);
if (ret == WT_NOTFOUND)
@@ -934,7 +939,7 @@ __wt_lsm_tree_drop(
locked = 0;
/* Get the LSM tree. */
- WT_WITH_DHANDLE_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session,
ret = __wt_lsm_tree_get(session, name, 1, &lsm_tree));
WT_RET(ret);
@@ -970,7 +975,7 @@ __wt_lsm_tree_drop(
err: if (locked)
WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
- WT_WITH_DHANDLE_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session,
WT_TRET(__lsm_tree_discard(session, lsm_tree, 0)));
return (ret);
}
@@ -994,7 +999,7 @@ __wt_lsm_tree_rename(WT_SESSION_IMPL *session,
locked = 0;
/* Get the LSM tree. */
- WT_WITH_DHANDLE_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session,
ret = __wt_lsm_tree_get(session, olduri, 1, &lsm_tree));
WT_RET(ret);
@@ -1044,7 +1049,7 @@ err: if (locked)
* Discard this LSM tree structure. The first operation on the renamed
* tree will create a new one.
*/
- WT_WITH_DHANDLE_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session,
WT_TRET(__lsm_tree_discard(session, lsm_tree, 0)));
return (ret);
}
@@ -1067,7 +1072,7 @@ __wt_lsm_tree_truncate(
locked = 0;
/* Get the LSM tree. */
- WT_WITH_DHANDLE_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session,
ret = __wt_lsm_tree_get(session, name, 1, &lsm_tree));
WT_RET(ret);
@@ -1106,7 +1111,7 @@ err: if (locked)
* the last good version of the metadata will be used, resulting
* in a valid (not truncated) tree.
*/
- WT_WITH_DHANDLE_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session,
WT_TRET(__lsm_tree_discard(session, lsm_tree, 0)));
}
return (ret);
@@ -1204,7 +1209,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
/* Tell __wt_schema_worker not to look inside the LSM tree. */
*skip = 1;
- WT_WITH_DHANDLE_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session,
ret = __wt_lsm_tree_get(session, name, 0, &lsm_tree));
WT_RET(ret);
@@ -1390,7 +1395,7 @@ __wt_lsm_tree_worker(WT_SESSION_IMPL *session,
locked = 0;
exclusive = FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE) ? 1 : 0;
- WT_WITH_DHANDLE_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session,
ret = __wt_lsm_tree_get(session, uri, exclusive, &lsm_tree));
WT_RET(ret);
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
index 74a52ad7402..1145c329639 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
@@ -281,7 +281,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
}
/* Stop if a running transaction needs the chunk. */
- __wt_txn_update_oldest(session);
+ __wt_txn_update_oldest(session, 1);
if (chunk->switch_txn == WT_TXN_NONE ||
!__wt_txn_visible_all(session, chunk->switch_txn)) {
WT_RET(__wt_verbose(session, WT_VERB_LSM,
diff --git a/src/third_party/wiredtiger/src/meta/meta_table.c b/src/third_party/wiredtiger/src/meta/meta_table.c
index a2e4a2f8e9f..227d0fa9a6c 100644
--- a/src/third_party/wiredtiger/src/meta/meta_table.c
+++ b/src/third_party/wiredtiger/src/meta/meta_table.c
@@ -61,6 +61,7 @@ __wt_metadata_cursor(
WT_DECL_RET;
const char *cfg[] =
{ WT_CONFIG_BASE(session, WT_SESSION_open_cursor), config, NULL };
+ int is_dead;
saved_dhandle = session->dhandle;
WT_ERR(__wt_metadata_open(session));
@@ -71,7 +72,11 @@ __wt_metadata_cursor(
* We use the metadata a lot, so we have a handle cached; lock it and
* increment the in-use counter once the cursor is open.
*/
- WT_ERR(__wt_session_lock_dhandle(session, 0, NULL));
+ WT_ERR(__wt_session_lock_dhandle(session, 0, &is_dead));
+
+ /* The metadata should never be closed. */
+ WT_ASSERT(session, !is_dead);
+
WT_ERR(__wt_curfile_create(session, NULL, cfg, 0, 0, cursorp));
__wt_cursor_dhandle_incr_use(session);
diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c
index 62d4df47ff6..5e083d6df5e 100644
--- a/src/third_party/wiredtiger/src/meta/meta_track.c
+++ b/src/third_party/wiredtiger/src/meta/meta_track.c
@@ -386,10 +386,8 @@ __wt_meta_track_fileop(
WT_RET(__meta_track_next(session, &trk));
trk->op = WT_ST_FILEOP;
- if (olduri != NULL)
- WT_RET(__wt_strdup(session, olduri, &trk->a));
- if (newuri != NULL)
- WT_RET(__wt_strdup(session, newuri, &trk->b));
+ WT_RET(__wt_strdup(session, olduri, &trk->a));
+ WT_RET(__wt_strdup(session, newuri, &trk->b));
return (0);
}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_alloc.c b/src/third_party/wiredtiger/src/os_posix/os_alloc.c
index e0613197642..4d04f9ac579 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_alloc.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_alloc.c
@@ -43,6 +43,12 @@ __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp)
void *p;
/*
+ * Defensive: if our caller doesn't handle errors correctly, ensure a
+ * free won't fail.
+ */
+ *(void **)retp = NULL;
+
+ /*
* !!!
* This function MUST handle a NULL WT_SESSION_IMPL handle.
*/
@@ -222,17 +228,6 @@ __wt_strndup(WT_SESSION_IMPL *session, const void *str, size_t len, void *retp)
}
/*
- * __wt_strdup --
- * ANSI strdup function.
- */
-int
-__wt_strdup(WT_SESSION_IMPL *session, const char *str, void *retp)
-{
- return (__wt_strndup(
- session, str, (str == NULL) ? 0 : strlen(str), retp));
-}
-
-/*
* __wt_free_int --
* ANSI free function.
*/
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 5bef5cd2d2d..76d61642bfd 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -483,6 +483,7 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
switch (F_ISSET(mod, WT_PM_REC_MASK)) {
case WT_PM_REC_EMPTY: /* Page is empty */
case WT_PM_REC_REPLACE: /* 1-for-1 page swap */
+ case WT_PM_REC_REWRITE: /* Rewrite */
return (0);
case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
break;
@@ -3229,6 +3230,18 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
WT_RET(__rec_split_init(
session, r, page, page->pg_intl_recno, btree->maxintlpage));
+ /*
+ * We need to mark this page as splitting, as this may be an in-memory
+ * split during a checkpoint.
+ */
+ for (;;) {
+ F_CAS_ATOMIC(page, WT_PAGE_SPLIT_LOCKED, ret);
+ if (ret == 0) {
+ break;
+ }
+ __wt_yield();
+ }
+
/* For each entry in the in-memory page... */
WT_INTL_FOREACH_BEGIN(session, page, ref) {
/* Update the starting record number in case we split. */
@@ -3271,6 +3284,8 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
case WT_PM_REC_REPLACE:
addr = &child->modify->mod_replace;
break;
+ case WT_PM_REC_REWRITE:
+ break;
WT_ILLEGAL_VALUE_ERR(session);
}
} else
@@ -3309,6 +3324,8 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
__rec_copy_incr(session, r, val);
} WT_INTL_FOREACH_END;
+ F_CLR_ATOMIC(page, WT_PAGE_SPLIT_LOCKED);
+
/* Write the remnant page. */
return (__rec_split_finish(session, r));
@@ -4041,6 +4058,18 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
*/
r->cell_zero = 1;
+ /*
+ * We need to mark this page as splitting in order to ensure we don't
+ * deadlock when performing an in-memory split during a checkpoint.
+ */
+ for (;;) {
+ F_CAS_ATOMIC(page, WT_PAGE_SPLIT_LOCKED, ret);
+ if (ret == 0) {
+ break;
+ }
+ __wt_yield();
+ }
+
/* For each entry in the in-memory page... */
WT_INTL_FOREACH_BEGIN(session, page, ref) {
/*
@@ -4199,6 +4228,8 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
__rec_key_state_update(r, ovfl_key);
} WT_INTL_FOREACH_END;
+ F_CLR_ATOMIC(page, WT_PAGE_SPLIT_LOCKED);
+
/* Write the remnant page. */
return (__rec_split_finish(session, r));
@@ -4836,6 +4867,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
case WT_PM_REC_EMPTY: /* Page deleted */
break;
case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
+ case WT_PM_REC_REWRITE: /* Rewrite */
/*
* Discard the multiple replacement blocks.
*/
@@ -4914,7 +4946,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
bnd->dsk = NULL;
mod->mod_multi_entries = 1;
- F_SET(mod, WT_PM_REC_MULTIBLOCK);
+ F_SET(mod, WT_PM_REC_REWRITE);
break;
}
@@ -5064,10 +5096,14 @@ __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
* information (otherwise we might think the backing block is being
* reused on a subsequent reconciliation where we want to free it).
*/
- if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_MULTIBLOCK)
+ switch (F_ISSET(mod, WT_PM_REC_MASK)) {
+ case WT_PM_REC_MULTIBLOCK:
+ case WT_PM_REC_REWRITE:
for (multi = mod->mod_multi,
i = 0; i < mod->mod_multi_entries; ++multi, ++i)
multi->addr.reuse = 0;
+ break;
+ }
/*
* On error, discard blocks we've written, they're unreferenced by the
diff --git a/src/third_party/wiredtiger/src/schema/schema_drop.c b/src/third_party/wiredtiger/src/schema/schema_drop.c
index 03097128ec2..56c6f7b0551 100644
--- a/src/third_party/wiredtiger/src/schema/schema_drop.c
+++ b/src/third_party/wiredtiger/src/schema/schema_drop.c
@@ -29,7 +29,7 @@ __drop_file(
return (EINVAL);
/* Close all btree handles associated with this file. */
- WT_WITH_DHANDLE_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session,
ret = __wt_conn_dhandle_close_all(session, uri, force));
WT_RET(ret);
@@ -59,7 +59,7 @@ __drop_colgroup(
WT_DECL_RET;
WT_TABLE *table;
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_TABLE_LOCKED));
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_TABLE));
/* If we can get the colgroup, detach it from the table. */
if ((ret = __wt_schema_get_colgroup(
diff --git a/src/third_party/wiredtiger/src/schema/schema_open.c b/src/third_party/wiredtiger/src/schema/schema_open.c
index 414722652a0..2b645d5c666 100644
--- a/src/third_party/wiredtiger/src/schema/schema_open.c
+++ b/src/third_party/wiredtiger/src/schema/schema_open.c
@@ -44,7 +44,7 @@ __wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table)
char *cgconfig;
u_int i;
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_TABLE_LOCKED));
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_TABLE));
if (table->cg_complete)
return (0);
@@ -407,7 +407,7 @@ __wt_schema_open_table(WT_SESSION_IMPL *session,
table = NULL;
tablename = NULL;
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_TABLE_LOCKED));
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_TABLE));
WT_ERR(__wt_scr_alloc(session, 0, &buf));
WT_ERR(__wt_buf_fmt(session, buf, "table:%.*s", (int)namelen, name));
diff --git a/src/third_party/wiredtiger/src/schema/schema_rename.c b/src/third_party/wiredtiger/src/schema/schema_rename.c
index 51281eccec5..3e619fe9cff 100644
--- a/src/third_party/wiredtiger/src/schema/schema_rename.c
+++ b/src/third_party/wiredtiger/src/schema/schema_rename.c
@@ -30,7 +30,7 @@ __rename_file(
return (EINVAL);
/* Close any btree handles in the file. */
- WT_WITH_DHANDLE_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session,
ret = __wt_conn_dhandle_close_all(session, uri, 0));
WT_ERR(ret);
diff --git a/src/third_party/wiredtiger/src/schema/schema_truncate.c b/src/third_party/wiredtiger/src/schema/schema_truncate.c
index be9f6bcfb57..0124ec70ca2 100644
--- a/src/third_party/wiredtiger/src/schema/schema_truncate.c
+++ b/src/third_party/wiredtiger/src/schema/schema_truncate.c
@@ -33,7 +33,7 @@ __truncate_file(WT_SESSION_IMPL *session, const char *name)
WT_RET(__wt_session_release_btree(session));
/* Close any btree handles in the file. */
- WT_WITH_DHANDLE_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session,
ret = __wt_conn_dhandle_close_all(session, name, 0));
WT_RET(ret);
diff --git a/src/third_party/wiredtiger/src/schema/schema_worker.c b/src/third_party/wiredtiger/src/schema/schema_worker.c
index e913fcfe69d..76b47a2ccff 100644
--- a/src/third_party/wiredtiger/src/schema/schema_worker.c
+++ b/src/third_party/wiredtiger/src/schema/schema_worker.c
@@ -49,7 +49,7 @@ __wt_schema_worker(WT_SESSION_IMPL *session,
* any open file handles, including checkpoints.
*/
if (FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE)) {
- WT_WITH_DHANDLE_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session,
ret = __wt_conn_dhandle_close_all(
session, uri, 0));
WT_ERR(ret);
@@ -60,11 +60,13 @@ __wt_schema_worker(WT_SESSION_IMPL *session,
WT_SAVE_DHANDLE(session,
ret = file_func(session, cfg));
WT_TRET(__wt_session_release_btree(session));
- } else if (ret == EBUSY)
- /* TODO: Decode checkpoint from cfg. */
- WT_WITH_DHANDLE_LOCK(session,
+ } else if (ret == EBUSY) {
+ WT_ASSERT(session, !FLD_ISSET(
+ open_flags, WT_DHANDLE_EXCLUSIVE));
+ WT_WITH_HANDLE_LIST_LOCK(session,
ret = __wt_conn_btree_apply_single_ckpt(
session, uri, file_func, cfg));
+ }
WT_ERR(ret);
}
} else if (WT_PREFIX_MATCH(uri, "colgroup:")) {
diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c
index ac24ae18c1d..2aa8e924302 100644
--- a/src/third_party/wiredtiger/src/session/session_api.c
+++ b/src/third_party/wiredtiger/src/session/session_api.c
@@ -546,12 +546,12 @@ __session_salvage(WT_SESSION *wt_session, const char *uri, const char *config)
session = (WT_SESSION_IMPL *)wt_session;
SESSION_API_CALL(session, salvage, config, cfg);
+
/* Block out checkpoints to avoid spurious EBUSY errors. */
- __wt_spin_lock(session, &S2C(session)->checkpoint_lock);
- WT_WITH_SCHEMA_LOCK(session,
- ret = __wt_schema_worker(session, uri, __wt_salvage,
- NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_SALVAGE));
- __wt_spin_unlock(session, &S2C(session)->checkpoint_lock);
+ WT_WITH_CHECKPOINT_LOCK(session,
+ WT_WITH_SCHEMA_LOCK(session, ret =
+ __wt_schema_worker(session, uri, __wt_salvage,
+ NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_SALVAGE)));
err: API_END_RET_NOTFOUND_MAP(session, ret);
}
@@ -605,15 +605,11 @@ __session_truncate(WT_SESSION *wt_session,
"the truncate method should not specify any"
"target after the log: URI prefix.");
ret = __wt_log_truncate_files(session, start, cfg);
- } else {
+ } else
/* Wait for checkpoints to avoid EBUSY errors. */
- __wt_spin_lock(session,
- &S2C(session)->checkpoint_lock);
- WT_WITH_SCHEMA_LOCK(session,
- ret = __wt_schema_truncate(session, uri, cfg));
- __wt_spin_unlock(session,
- &S2C(session)->checkpoint_lock);
- }
+ WT_WITH_CHECKPOINT_LOCK(session,
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_truncate(session, uri, cfg)));
goto done;
}
@@ -717,11 +713,10 @@ __session_upgrade(WT_SESSION *wt_session, const char *uri, const char *config)
SESSION_API_CALL(session, upgrade, config, cfg);
/* Block out checkpoints to avoid spurious EBUSY errors. */
- __wt_spin_lock(session, &S2C(session)->checkpoint_lock);
- WT_WITH_SCHEMA_LOCK(session,
- ret = __wt_schema_worker(session, uri, __wt_upgrade,
- NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_UPGRADE));
- __wt_spin_unlock(session, &S2C(session)->checkpoint_lock);
+ WT_WITH_CHECKPOINT_LOCK(session,
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_worker(session, uri, __wt_upgrade,
+ NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_UPGRADE)));
err: API_END_RET_NOTFOUND_MAP(session, ret);
}
@@ -740,11 +735,10 @@ __session_verify(WT_SESSION *wt_session, const char *uri, const char *config)
SESSION_API_CALL(session, verify, config, cfg);
/* Block out checkpoints to avoid spurious EBUSY errors. */
- __wt_spin_lock(session, &S2C(session)->checkpoint_lock);
- WT_WITH_SCHEMA_LOCK(session,
- ret = __wt_schema_worker(session, uri, __wt_verify,
- NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_VERIFY));
- __wt_spin_unlock(session, &S2C(session)->checkpoint_lock);
+ WT_WITH_CHECKPOINT_LOCK(session,
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_worker(session, uri, __wt_verify,
+ NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_VERIFY)));
err: API_END_RET_NOTFOUND_MAP(session, ret);
}
@@ -914,14 +908,12 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config)
* here to ensure we don't get into trouble.
*/
WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 1);
- __wt_spin_lock(session, &S2C(session)->checkpoint_lock);
- ret = __wt_txn_checkpoint(session, cfg);
+ WT_WITH_CHECKPOINT_LOCK(session,
+ ret = __wt_txn_checkpoint(session, cfg));
WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 0);
- __wt_spin_unlock(session, &S2C(session)->checkpoint_lock);
-
err: F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK);
API_END_RET_NOTFOUND_MAP(session, ret);
@@ -953,7 +945,7 @@ __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name,
*sessionp = NULL;
- WT_RET(__wt_open_session(conn, NULL, NULL, &session));
+ WT_RET(__wt_open_session(conn, NULL, NULL, open_metadata, &session));
session->name = name;
/*
@@ -971,19 +963,6 @@ __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name,
if (!uses_dhandles)
F_SET(session, WT_SESSION_NO_DATA_HANDLES);
- /*
- * Acquiring the metadata handle requires the schema lock; we've seen
- * problems in the past where a worker thread has acquired the schema
- * lock unexpectedly, relatively late in the run, and deadlocked. Be
- * defensive, get it now. The metadata file may not exist when the
- * connection first creates its default session or the shared cache
- * pool creates its sessions, let our caller decline this work.
- */
- if (open_metadata) {
- WT_ASSERT(session, !F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
- WT_RET(__wt_metadata_open(session));
- }
-
*sessionp = session;
return (0);
}
@@ -995,7 +974,7 @@ __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name,
*/
int
__wt_open_session(WT_CONNECTION_IMPL *conn,
- WT_EVENT_HANDLER *event_handler, const char *config,
+ WT_EVENT_HANDLER *event_handler, const char *config, int open_metadata,
WT_SESSION_IMPL **sessionp)
{
static const WT_SESSION stds = {
@@ -1131,5 +1110,20 @@ __wt_open_session(WT_CONNECTION_IMPL *conn,
WT_STAT_FAST_CONN_INCR(session, session_open);
err: __wt_spin_unlock(session, &conn->api_lock);
- return (ret);
+ WT_RET(ret);
+
+ /*
+ * Acquiring the metadata handle requires the schema lock; we've seen
+ * problems in the past where a session has acquired the schema lock
+ * unexpectedly, relatively late in the run, and deadlocked. Be
+ * defensive, get it now. The metadata file may not exist when the
+ * connection first creates its default session or the shared cache
+ * pool creates its sessions, let our caller decline this work.
+ */
+ if (open_metadata) {
+ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SCHEMA));
+ WT_RET(__wt_metadata_open(session_ret));
+ }
+
+ return (0);
}
diff --git a/src/third_party/wiredtiger/src/session/session_dhandle.c b/src/third_party/wiredtiger/src/session/session_dhandle.c
index ce5f95a40d0..720f40e8d11 100644
--- a/src/third_party/wiredtiger/src/session/session_dhandle.c
+++ b/src/third_party/wiredtiger/src/session/session_dhandle.c
@@ -31,97 +31,139 @@ __session_add_dhandle(
if (dhandle_cachep != NULL)
*dhandle_cachep = dhandle_cache;
- (void)WT_ATOMIC_ADD4(session->dhandle->session_ref, 1);
-
/* Sweep the handle list to remove any dead handles. */
return (__session_dhandle_sweep(session));
}
/*
* __wt_session_lock_dhandle --
- * Try to lock a handle that is cached in this session. This is the fast
- * path that tries to lock a handle without the need for the schema lock.
+ * Return when the current data handle is either (a) open with the
+ * requested lock mode; or (b) closed and write locked. If exclusive
+ * access is requested and cannot be granted immediately because the
+ * handle is in use, fail with EBUSY.
+ *
+ * Here is a brief summary of how different operations synchronize using
+ * either the schema lock, handle locks or handle flags:
*
- * If the handle can't be locked in the required state, release it and
- * fail with WT_NOTFOUND: we have to take the slow path after acquiring
- * the schema lock.
+ * open -- one thread gets the handle exclusive, reverts to a shared
+ * handle lock once the handle is open;
+ * bulk load -- sets bulk and exclusive;
+ * salvage, truncate, update, verify -- hold the schema lock,
+ * get the handle exclusive, set a "special" flag;
+ * sweep -- gets a write lock on the handle, doesn't set exclusive
+ *
+ * The principle is that some application operations can cause other
+ * application operations to fail (so attempting to open a cursor on a
+ * file while it is being bulk-loaded will fail), but internal or
+ * database-wide operations should not prevent application-initiated
+ * operations. For example, attempting to verify a file should not fail
+ * because the sweep server happens to be in the process of closing that
+ * file.
*/
int
-__wt_session_lock_dhandle(WT_SESSION_IMPL *session, uint32_t flags, int *deadp)
+__wt_session_lock_dhandle(
+ WT_SESSION_IMPL *session, uint32_t flags, int *is_deadp)
{
- enum { NOLOCK, READLOCK, WRITELOCK } locked;
WT_BTREE *btree;
WT_DATA_HANDLE *dhandle;
- uint32_t special_flags;
+ WT_DECL_RET;
+ int is_open, lock_busy, want_exclusive;
+
+ *is_deadp = 0;
- btree = S2BT(session);
dhandle = session->dhandle;
- locked = NOLOCK;
- if (deadp != NULL)
- *deadp = 0;
+ btree = dhandle->handle;
+ lock_busy = 0;
+ want_exclusive = LF_ISSET(WT_DHANDLE_EXCLUSIVE) ? 1 : 0;
/*
- * Special operation flags will cause the handle to be reopened.
- * For example, a handle opened with WT_BTREE_BULK cannot use the same
- * internal data structures as a handle opened for ordinary access.
+ * Check that the handle is open. We've already incremented
+ * the reference count, so once the handle is open it won't be
+ * closed by another thread.
+ *
+ * If we can see the WT_DHANDLE_OPEN flag set while holding a
+ * lock on the handle, then it's really open and we can start
+ * using it. Alternatively, if we can get an exclusive lock
+ * and WT_DHANDLE_OPEN is still not set, we need to do the open.
*/
- special_flags = LF_ISSET(WT_BTREE_SPECIAL_FLAGS);
- WT_ASSERT(session,
- special_flags == 0 || LF_ISSET(WT_DHANDLE_EXCLUSIVE));
+ for (;;) {
+ /* If the handle is dead, give up. */
+ if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) {
+ *is_deadp = 1;
+ return (0);
+ }
+
+ /*
+ * If the handle is already open for a special operation,
+ * give up.
+ */
+ if (F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS))
+ return (EBUSY);
- if (LF_ISSET(WT_DHANDLE_EXCLUSIVE)) {
/*
- * Try to get an exclusive handle lock and fail immediately if
- * it's unavailable. We don't expect exclusive operations on
- * trees to be mixed with ordinary cursor access, but if there
- * is a use case in the future, we could make blocking here
- * configurable.
+ * If the handle is open, get a read lock and recheck.
*
- * Special flags will cause the handle to be reopened, which
- * will get the necessary lock, so don't bother here.
+ * Wait for a read lock if we want exclusive access and failed
+ * to get it: the sweep server may be closing this handle, and
+ * we need to wait for it to release its lock. If we want
+ * exclusive access and find the handle open once we get the
+ * read lock, give up: some other thread has it locked for real.
*/
- if (LF_ISSET(WT_DHANDLE_LOCK_ONLY) || special_flags == 0) {
- WT_RET(__wt_try_writelock(session, dhandle->rwlock));
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
+ (!want_exclusive || lock_busy)) {
+ WT_RET(__wt_readlock(session, dhandle->rwlock));
+ if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) {
+ *is_deadp = 1;
+ WT_RET(
+ __wt_readunlock(session, dhandle->rwlock));
+ return (0);
+ }
+
+ is_open = F_ISSET(dhandle, WT_DHANDLE_OPEN) ? 1 : 0;
+ if (is_open && !want_exclusive)
+ return (0);
+ WT_RET(__wt_readunlock(session, dhandle->rwlock));
+ } else
+ is_open = 0;
+
+ /*
+ * It isn't open or we want it exclusive: try to get an
+ * exclusive lock. There is some subtlety here: if we race
+ * with another thread that successfully opens the file, we
+ * don't want to block waiting to get exclusive access.
+ */
+ if ((ret = __wt_try_writelock(session, dhandle->rwlock)) == 0) {
+ if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) {
+ *is_deadp = 1;
+ WT_RET(
+ __wt_writeunlock(session, dhandle->rwlock));
+ return (0);
+ }
+
+ /*
+ * If it was opened while we waited, drop the write
+ * lock and get a read lock instead.
+ */
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
+ !want_exclusive) {
+ lock_busy = 0;
+ WT_RET(
+ __wt_writeunlock(session, dhandle->rwlock));
+ continue;
+ }
+
+ /* We have an exclusive lock, we're done. */
F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
- locked = WRITELOCK;
+ WT_ASSERT(session, !F_ISSET(dhandle, WT_DHANDLE_DEAD));
+ return (0);
}
- } else if (F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS))
- return (EBUSY);
- else {
- WT_RET(__wt_readlock(session, dhandle->rwlock));
- locked = READLOCK;
- }
-
- /*
- * At this point, we have the requested lock -- if that is all that was
- * required, we're done. Otherwise, check that the handle is open and
- * that no special flags are required.
- */
- if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) {
- WT_ASSERT(session, deadp != NULL);
- *deadp = 1;
- } else if (LF_ISSET(WT_DHANDLE_LOCK_ONLY) ||
- (F_ISSET(dhandle, WT_DHANDLE_OPEN) && special_flags == 0))
- return (0);
+ if (ret != EBUSY || (is_open && want_exclusive))
+ return (ret);
+ lock_busy = 1;
- /*
- * The handle needs to be opened. If we locked the handle above,
- * unlock it before returning.
- */
- switch (locked) {
- case NOLOCK:
- break;
- case READLOCK:
- WT_RET(__wt_readunlock(session, dhandle->rwlock));
- break;
- case WRITELOCK:
- F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
- WT_RET(__wt_writeunlock(session, dhandle->rwlock));
- break;
+ /* Give other threads a chance to make progress. */
+ __wt_yield();
}
-
- /* Treat an unopened handle just like a non-existent handle. */
- return (WT_NOTFOUND);
}
/*
@@ -131,22 +173,21 @@ __wt_session_lock_dhandle(WT_SESSION_IMPL *session, uint32_t flags, int *deadp)
int
__wt_session_release_btree(WT_SESSION_IMPL *session)
{
- enum { NOLOCK, READLOCK, WRITELOCK } locked;
WT_BTREE *btree;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
+ int write_locked;
btree = S2BT(session);
dhandle = session->dhandle;
+ write_locked = F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) ? 1 : 0;
- locked = F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) ? WRITELOCK : READLOCK;
/*
* If we had special flags set, close the handle so that future access
* can get a handle without special flags.
*/
if (F_ISSET(dhandle, WT_DHANDLE_DISCARD_FORCE)) {
- WT_WITH_DHANDLE_LOCK(session,
- ret = __wt_conn_btree_sync_and_close(session, 0, 1));
+ ret = __wt_conn_btree_sync_and_close(session, 0, 1);
F_CLR(dhandle, WT_DHANDLE_DISCARD_FORCE);
} else if (F_ISSET(dhandle, WT_DHANDLE_DISCARD) ||
F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)) {
@@ -155,19 +196,12 @@ __wt_session_release_btree(WT_SESSION_IMPL *session)
F_CLR(dhandle, WT_DHANDLE_DISCARD);
}
- if (F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE))
+ if (write_locked)
F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
- switch (locked) {
- case NOLOCK:
- break;
- case READLOCK:
- WT_TRET(__wt_readunlock(session, dhandle->rwlock));
- break;
- case WRITELOCK:
- WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
- break;
- }
+ WT_TRET(write_locked ?
+ __wt_writeunlock(session, dhandle->rwlock):
+ __wt_readunlock(session, dhandle->rwlock));
session->dhandle = NULL;
return (ret);
@@ -211,7 +245,6 @@ retry: WT_RET(__wt_meta_checkpoint_last_name(
}
ret = __wt_session_get_btree(session, uri, checkpoint, cfg, flags);
-
__wt_free(session, checkpoint);
/*
@@ -248,7 +281,6 @@ __session_discard_btree(
dhandle_cache, __wt_data_handle_cache, hashl);
(void)WT_ATOMIC_SUB4(dhandle_cache->dhandle->session_ref, 1);
-
__wt_overwrite_and_free(session, dhandle_cache);
}
@@ -297,8 +329,10 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session)
if (dhandle != session->dhandle &&
dhandle->session_inuse == 0 &&
(F_ISSET(dhandle, WT_DHANDLE_DEAD) ||
- now - dhandle->timeofdeath > conn->sweep_idle_time)) {
+ (dhandle->timeofdeath != 0 &&
+ now - dhandle->timeofdeath > conn->sweep_idle_time))) {
WT_STAT_FAST_CONN_INCR(session, dh_session_handles);
+ WT_ASSERT(session, !WT_IS_METADATA(dhandle));
__session_discard_btree(session, dhandle_cache);
}
dhandle_cache = dhandle_cache_next;
@@ -307,42 +341,43 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session)
}
/*
- * __session_dhandle_find --
+ * __session_dhandle_find_shared --
* Search for a data handle in the connection and add it to a session's
* cache. Since the data handle isn't locked, this must be called holding
* the handle list lock, and we must increment the handle's reference
* count before releasing it.
*/
static int
-__session_dhandle_find(WT_SESSION_IMPL *session,
- const char *uri, const char *checkpoint, uint32_t flags)
+__session_dhandle_find_shared(
+ WT_SESSION_IMPL *session, const char *uri, const char *checkpoint)
{
- WT_RET(__wt_conn_dhandle_find(session, uri, checkpoint, flags));
- return (__session_add_dhandle(session, NULL));
+ WT_RET(__wt_conn_dhandle_find(session, uri, checkpoint));
+ (void)WT_ATOMIC_ADD4(session->dhandle->session_ref, 1);
+ return (0);
}
-
/*
- * __wt_session_get_btree --
- * Get a btree handle for the given name, set session->dhandle.
+ * __session_dhandle_find --
+ * Search for a data handle, first in the session cache, then in the
+ * connection.
*/
-int
-__wt_session_get_btree(WT_SESSION_IMPL *session,
- const char *uri, const char *checkpoint, const char *cfg[], uint32_t flags)
+static int
+__session_dhandle_find(
+ WT_SESSION_IMPL *session, const char *uri, const char *checkpoint)
{
WT_DATA_HANDLE *dhandle;
WT_DATA_HANDLE_CACHE *dhandle_cache;
WT_DECL_RET;
uint64_t bucket;
- int is_dead;
-
- WT_ASSERT(session, !F_ISSET(session, WT_SESSION_NO_DATA_HANDLES));
- WT_ASSERT(session, !LF_ISSET(WT_DHANDLE_HAVE_REF));
-
- dhandle = NULL;
bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE;
- SLIST_FOREACH(dhandle_cache, &session->dhhash[bucket], hashl) {
+retry: SLIST_FOREACH(dhandle_cache, &session->dhhash[bucket], hashl) {
dhandle = dhandle_cache->dhandle;
+ if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) {
+ WT_ASSERT(session, !WT_IS_METADATA(dhandle));
+ __session_discard_btree(session, dhandle_cache);
+ /* We deleted our entry, retry from the start. */
+ goto retry;
+ }
if (strcmp(uri, dhandle->name) != 0)
continue;
if (checkpoint == NULL && dhandle->checkpoint == NULL)
@@ -352,71 +387,97 @@ __wt_session_get_btree(WT_SESSION_IMPL *session,
break;
}
- if (dhandle_cache != NULL)
+ if (dhandle_cache != NULL) {
session->dhandle = dhandle;
- else {
- /*
- * We didn't find a match in the session cache, now search the
- * shared handle list and cache any handle we find.
- */
- WT_WITH_DHANDLE_LOCK(session, ret =
- __session_dhandle_find(session, uri, checkpoint, flags));
- dhandle = (ret == 0) ? session->dhandle : NULL;
- WT_RET_NOTFOUND_OK(ret);
+ return (0);
}
- if (dhandle != NULL) {
- /* Try to lock the handle; if this succeeds, we're done. */
- if ((ret =
- __wt_session_lock_dhandle(session, flags, &is_dead)) == 0)
- goto done;
+ /*
+ * We didn't find a match in the session cache, search the shared
+ * handle list and cache the handle we find.
+ */
+ WT_WITH_HANDLE_LIST_LOCK(session, ret =
+ __session_dhandle_find_shared(session, uri, checkpoint));
+ if (ret == 0)
+ ret = __session_add_dhandle(session, NULL);
- /* Propagate errors we don't expect. */
- if (ret != WT_NOTFOUND && ret != EBUSY)
- return (ret);
+ return (ret);
+}
+
+/*
+ * __wt_session_get_btree --
+ * Get a btree handle for the given name, set session->dhandle.
+ */
+int
+__wt_session_get_btree(WT_SESSION_IMPL *session,
+ const char *uri, const char *checkpoint, const char *cfg[], uint32_t flags)
+{
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ int is_dead;
+
+ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_NO_DATA_HANDLES));
+
+ for (;;) {
+ WT_RET(__session_dhandle_find(session, uri, checkpoint));
+ dhandle = session->dhandle;
+
+ /* Try to lock the handle. */
+ WT_RET(__wt_session_lock_dhandle(session, flags, &is_dead));
+ if (is_dead)
+ continue;
+
+ /* If the handle is open in the mode we want, we're done. */
+ if (LF_ISSET(WT_DHANDLE_LOCK_ONLY) ||
+ (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
+ !LF_ISSET(WT_BTREE_SPECIAL_FLAGS)))
+ break;
+
+ WT_ASSERT(session, F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE));
/*
- * Don't try harder to get the handle if we're only checking
- * for locks or our caller hasn't allowed us to take the schema
- * lock - they do so on purpose and will handle error returns.
+ * For now, we need the schema lock and handle list locks to
+ * open a file for real.
+ *
+ * Code needing exclusive access (such as drop or verify)
+ * assumes that it can close all open handles, then open an
+ * exclusive handle on the active tree and no other threads can
+ * reopen handles in the meantime. A combination of the schema
+ * and handle list locks are used to enforce this.
*/
- if ((LF_ISSET(WT_DHANDLE_LOCK_ONLY) && ret == EBUSY) ||
- (!F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) &&
- F_ISSET(session,
- WT_SESSION_HANDLE_LIST_LOCKED | WT_SESSION_TABLE_LOCKED)))
- return (ret);
+ if (!F_ISSET(session, WT_SESSION_LOCKED_SCHEMA) ||
+ !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)) {
+ F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
+ WT_RET(__wt_writeunlock(session, dhandle->rwlock));
- /* If we found the handle and it isn't dead, reopen it. */
- if (is_dead) {
- __session_discard_btree(session, dhandle_cache);
- dhandle_cache = NULL;
- session->dhandle = dhandle = NULL;
- } else
- LF_SET(WT_DHANDLE_HAVE_REF);
- }
+ WT_WITH_SCHEMA_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session, ret =
+ __wt_session_get_btree(
+ session, uri, checkpoint, cfg, flags)));
- /*
- * Acquire the schema lock and the data handle lock, find and/or
- * open the handle.
- *
- * We need the schema lock for this call so that if we lock a handle in
- * order to open it, that doesn't race with a schema-changing operation
- * such as drop.
- */
- WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_DHANDLE_LOCK(session, ret =
- __wt_conn_btree_get(session, uri, checkpoint, cfg, flags)));
- WT_RET(ret);
+ return (ret);
+ }
+
+ /* Open the handle. */
+ if ((ret = __wt_conn_btree_open(session, cfg, flags)) == 0 &&
+ LF_ISSET(WT_DHANDLE_EXCLUSIVE))
+ break;
- if (!LF_ISSET(WT_DHANDLE_HAVE_REF))
- WT_RET(__session_add_dhandle(session, NULL));
+ /*
+ * If we got the handle exclusive to open it but only want
+ * ordinary access, drop our lock and retry the open.
+ */
+ F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
+ WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
+ WT_RET(ret);
+ }
+ WT_ASSERT(session, !F_ISSET(dhandle, WT_DHANDLE_DEAD));
WT_ASSERT(session, LF_ISSET(WT_DHANDLE_LOCK_ONLY) ||
- (F_ISSET(session->dhandle, WT_DHANDLE_OPEN) &&
- !F_ISSET(session->dhandle, WT_DHANDLE_DEAD)));
+ F_ISSET(dhandle, WT_DHANDLE_OPEN));
-done: WT_ASSERT(session, LF_ISSET(WT_DHANDLE_EXCLUSIVE) ==
- F_ISSET(session->dhandle, WT_DHANDLE_EXCLUSIVE));
+ WT_ASSERT(session, LF_ISSET(WT_DHANDLE_EXCLUSIVE) ==
+ F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE));
return (0);
}
diff --git a/src/third_party/wiredtiger/src/support/mutex.c b/src/third_party/wiredtiger/src/support/mutex.c
deleted file mode 100644
index fa85cfc33d5..00000000000
--- a/src/third_party/wiredtiger/src/support/mutex.c
+++ /dev/null
@@ -1,255 +0,0 @@
-/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
- * Copyright (c) 2008-2014 WiredTiger, Inc.
- * All rights reserved.
- *
- * See the file LICENSE for redistribution information.
- */
-
-#include "wt_internal.h"
-
-#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
-
-/*
- * __wt_spin_lock_register_lock --
- * Add a lock to the connection's list.
- */
-int
-__wt_spin_lock_register_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
-{
- WT_CONNECTION_IMPL *conn;
- u_int i;
-
- /*
- * There is a spinlock we initialize before we have a connection, the
- * global library lock. In that case, the session will be NULL and
- * we can't track the lock.
- */
- if (session == NULL)
- return (0);
-
- conn = S2C(session);
-
- for (i = 0; i < WT_SPINLOCK_MAX; i++)
- if (conn->spinlock_list[i] == NULL &&
- WT_ATOMIC_CAS(conn->spinlock_list[i], NULL, t))
- return (0);
-
- WT_RET_MSG(session, ENOMEM,
- "spinlock connection registry failed, increase the connection's "
- "spinlock list size");
-}
-
-/*
- * __wt_spin_lock_unregister_lock --
- * Remove a lock from the connection's list.
- */
-void
-__wt_spin_lock_unregister_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
-{
- WT_CONNECTION_IMPL *conn;
- u_int i;
-
- conn = S2C(session);
-
- for (i = 0; i < WT_SPINLOCK_MAX; i++)
- if (conn->spinlock_list[i] == t)
- conn->spinlock_list[i] = NULL;
-
- /*
- * XXX
- * The statistics thread reads through this array, there's a possible
- * race: if that thread reads the pointer then goes to sleep, then we
- * free the spinlock, then the statistics thread wakes up, it can read
- * free'd memory.
- *
- * This is performance debugging code, so we're not fixing the race for
- * now, minimize the window.
- */
- WT_FULL_BARRIER();
-}
-
-/*
- * __spin_lock_next_id --
- * Return the next spinlock caller ID.
- */
-static int
-__spin_lock_next_id(WT_SESSION_IMPL *session, int *idp)
-{
- static int lock_id = 0, next_id = 0;
- WT_DECL_RET;
-
- /* If we've ever registered this location, we already have an ID. */
- if (*idp != WT_SPINLOCK_REGISTER)
- return (0);
-
- /*
- * We can't use the global spinlock to lock the ID allocation (duh!),
- * use a CAS instruction to serialize access to a local variable.
- * This work only gets done once per library instantiation, there
- * isn't a performance concern.
- */
- while (!WT_ATOMIC_CAS(lock_id, 0, 1))
- __wt_yield();
-
- /* Allocate a blocking ID for this location. */
- if (*idp == WT_SPINLOCK_REGISTER) {
- if (next_id < WT_SPINLOCK_MAX_LOCATION_ID)
- *idp = next_id++;
- else
- WT_ERR_MSG(session, ENOMEM,
- "spinlock caller location registry failed, "
- "increase the connection's blocking matrix size");
- }
-
-err: WT_PUBLISH(lock_id, 0);
- return (ret);
-}
-
-/*
- * __wt_spin_lock_register_caller --
- * Register a spin-lock caller's location information in the blocking
- * matrix.
- */
-int
-__wt_spin_lock_register_caller(WT_SESSION_IMPL *session,
- const char *name, const char *file, int line, int *idp)
-{
- WT_CONNECTION_IMPL *conn;
- WT_CONNECTION_STATS_SPINLOCK *p;
-
- conn = S2C(session);
-
- /*
- * The caller's location ID is a static offset into a per-connection
- * structure, and that has problems: first, if there are multiple
- * connections, we'll need to hold some kind of lock to avoid racing
- * when setting that value, and second, if/when there are multiple
- * connections and/or a single connection is closed and re-opened, the
- * variable may be initialized and underlying connection information
- * may not.
- *
- * First, allocate a location ID if needed.
- */
- WT_RET(__spin_lock_next_id(session, idp));
-
- /*
- * Add the caller's information to the blocking matrix. We could race
- * here (if two threads of control register the same lock at the same
- * time), but we don't care as both threads are setting the identical
- * information.
- */
- p = &conn->spinlock_block[*idp];
- p->name = name;
- if ((p->file = strrchr(file, '/')) == NULL)
- p->file = file;
- else
- ++p->file;
- p->line = line;
- return (0);
-}
-
-/*
- * __wt_statlog_dump_spinlock --
- * Log the spin-lock statistics.
- */
-int
-__wt_statlog_dump_spinlock(WT_CONNECTION_IMPL *conn, const char *tag)
-{
- WT_SPINLOCK *spin;
- WT_CONNECTION_STATS_SPINLOCK *p, *t;
- uint64_t block_manager, btree_page, ignore;
- u_int i, j;
-
- /*
- * Ignore rare acquisition of a spinlock using a base value of 10 per
- * second so we don't create graphs we don't care about.
- */
- ignore = (uint64_t)(conn->stat_usecs / 1000000) * 10;
-
- /* Output the number of times each spinlock was acquired. */
- block_manager = btree_page = 0;
- for (i = 0; i < WT_ELEMENTS(conn->spinlock_list); ++i) {
- if ((spin = conn->spinlock_list[i]) == NULL)
- continue;
-
- /*
- * There are two sets of spinlocks we aggregate, the btree page
- * locks and the block manager per-file locks. The reason is
- * the block manager locks grow with the number of files open
- * (and LSM and bloom filters can open a lot of files), and
- * there are 16 btree page locks and splitting them out has not
- * historically been that informative.
- */
- if (strcmp(spin->name, "block manager") == 0) {
- block_manager += spin->counter;
- if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
- spin->counter = 0;
- continue;
- }
- if (strcmp(spin->name, "btree page") == 0) {
- btree_page += spin->counter;
- if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
- spin->counter = 0;
- continue;
- }
-
- WT_RET(__wt_fprintf(session, conn->stat_fp,
- "%s %" PRIu64 " %s spinlock %s: acquisitions\n",
- conn->stat_stamp,
- spin->counter <= ignore ? 0 : spin->counter,
- tag, spin->name));
- if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
- spin->counter = 0;
- }
- WT_RET(__wt_fprintf(session, conn->stat_fp,
- "%s %" PRIu64 " %s spinlock %s: acquisitions\n",
- conn->stat_stamp,
- block_manager <= ignore ? 0 : block_manager,
- tag, "block manager"));
- WT_RET(__wt_fprintf(session, conn->stat_fp,
- "%s %" PRIu64 " %s spinlock %s: acquisitions\n",
- conn->stat_stamp,
- btree_page <= ignore ? 0 : btree_page,
- tag, "btree page"));
-
- /*
- * Output the number of times each location acquires its spinlock and
- * the blocking matrix.
- */
- for (i = 0; i < WT_ELEMENTS(conn->spinlock_block); ++i) {
- p = &conn->spinlock_block[i];
- if (p->name == NULL)
- continue;
-
- WT_RET(__wt_fprintf(session, conn->stat_fp,
- "%s %d %s spinlock %s acquired by %s(%d)\n",
- conn->stat_stamp,
- p->total <= ignore ? 0 : p->total,
- tag,
- p->name, p->file, p->line));
- if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
- p->total = 0;
-
- for (j = 0; j < WT_ELEMENTS(conn->spinlock_block); ++j) {
- t = &conn->spinlock_block[j];
- if (t->name == NULL)
- continue;
-
- WT_RET(__wt_fprintf(session, conn->stat_fp,
- "%s %d %s spinlock %s: %s(%d) blocked by %s(%d)\n",
- conn->stat_stamp,
- p->blocked[j] <= ignore ? 0 : p->blocked[j],
- tag,
- p->name, p->file, p->line,
- t->file, t->line));
- if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
- p->blocked[j] = 0;
- }
- }
-
- WT_FULL_BARRIER(); /* Minimize the window. */
- return (0);
-}
-
-#endif /* SPINLOCK_PTHREAD_MUTEX_LOGGING */
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index fb0a4b7fa6d..05b27cd9a56 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -60,50 +60,29 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session)
txn = &session->txn;
txn_state = &S2C(session)->txn_global.states[session->id];
- if (txn_state->snap_min != WT_TXN_NONE) {
- WT_ASSERT(session,
- session->txn.isolation == TXN_ISO_READ_UNCOMMITTED ||
- !__wt_txn_visible_all(session, txn_state->snap_min));
- txn_state->snap_min = WT_TXN_NONE;
- }
- F_CLR(txn, TXN_HAS_SNAPSHOT);
-}
+ WT_ASSERT(session,
+ txn_state->snap_min == WT_TXN_NONE ||
+ session->txn.isolation == TXN_ISO_READ_UNCOMMITTED ||
+ !__wt_txn_visible_all(session, txn_state->snap_min));
-/*
- * __wt_txn_update_oldest --
- * Sweep the running transactions to update the oldest ID required.
- */
-void
-__wt_txn_update_oldest(WT_SESSION_IMPL *session)
-{
- /*
- * !!!
- * If a data-source is calling the WT_EXTENSION_API.transaction_oldest
- * method (for the oldest transaction ID not yet visible to a running
- * transaction), and then comparing that oldest ID against committed
- * transactions to see if updates for a committed transaction are still
- * visible to running transactions, the oldest transaction ID may be
- * the same as the last committed transaction ID, if the transaction
- * state wasn't refreshed after the last transaction committed. Push
- * past the last committed transaction.
- */
- __wt_txn_refresh(session, 0);
+ txn_state->snap_min = WT_TXN_NONE;
+ F_CLR(txn, TXN_HAS_SNAPSHOT);
}
/*
- * __wt_txn_refresh --
- * Allocate a transaction ID and/or a snapshot.
+ * __wt_txn_get_snapshot --
+ * Allocate a snapshot.
*/
void
-__wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot)
+__wt_txn_get_snapshot(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
WT_TXN_STATE *s, *txn_state;
- uint64_t current_id, id, oldest_id;
+ uint64_t ckpt_id, current_id, id;
uint64_t prev_oldest_id, snap_min;
- uint32_t i, n, oldest_session, session_cnt;
+ uint32_t i, n, session_cnt;
int32_t count;
conn = S2C(session);
@@ -116,10 +95,9 @@ __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot)
/* For pure read-only workloads, avoid scanning. */
if (prev_oldest_id == current_id) {
- if (get_snapshot) {
- txn_state->snap_min = current_id;
- __txn_sort_snapshot(session, 0, current_id);
- }
+ txn_state->snap_min = current_id;
+ __txn_sort_snapshot(session, 0, current_id);
+
/* Check that the oldest ID has not moved in the meantime. */
if (prev_oldest_id == txn_global->oldest_id &&
txn_global->scan_count == 0)
@@ -139,15 +117,14 @@ __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot)
/* The oldest ID cannot change until the scan count goes to zero. */
prev_oldest_id = txn_global->oldest_id;
- current_id = oldest_id = snap_min = txn_global->current;
- oldest_session = 0;
+ current_id = snap_min = txn_global->current;
/* Walk the array of concurrent transactions. */
WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ ckpt_id = txn_global->checkpoint_id;
for (i = n = 0, s = txn_global->states; i < session_cnt; i++, s++) {
/* Skip the checkpoint transaction; it is never read from. */
- if (txn_global->checkpoint_id != WT_TXN_NONE &&
- s->id == txn_global->checkpoint_id)
+ if (ckpt_id != WT_TXN_NONE && ckpt_id == s->id)
continue;
/*
@@ -163,18 +140,104 @@ __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot)
if (s != txn_state &&
(id = s->id) != WT_TXN_NONE &&
TXNID_LE(prev_oldest_id, id)) {
- if (get_snapshot)
- txn->snapshot[n++] = id;
+ txn->snapshot[n++] = id;
if (TXNID_LT(id, snap_min))
snap_min = id;
}
+ }
+
+ /*
+ * If we got a new snapshot, update the published snap_min for this
+ * session.
+ */
+ WT_ASSERT(session, TXNID_LE(prev_oldest_id, snap_min));
+ WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
+ txn_state->snap_min = snap_min;
+
+ /* Update the last running ID if we have a much newer value. */
+ if (snap_min > txn_global->last_running + 100)
+ txn_global->last_running = snap_min;
+
+ WT_ASSERT(session, txn_global->scan_count > 0);
+ (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1);
+
+ __txn_sort_snapshot(session, n, current_id);
+}
+
+/*
+ * __wt_txn_update_oldest --
+ * Sweep the running transactions to update the oldest ID required.
+ * !!!
+ * If a data-source is calling the WT_EXTENSION_API.transaction_oldest
+ * method (for the oldest transaction ID not yet visible to a running
+ * transaction), and then comparing that oldest ID against committed
+ * transactions to see if updates for a committed transaction are still
+ * visible to running transactions, the oldest transaction ID may be
+ * the same as the last committed transaction ID, if the transaction
+ * state wasn't refreshed after the last transaction committed. Push
+ * past the last committed transaction.
+*/
+void
+__wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *oldest_session;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *s;
+ uint64_t ckpt_id, current_id, id, oldest_id, prev_oldest_id, snap_min;
+ uint32_t i, session_cnt;
+ int32_t count;
+ int last_running_moved;
+
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+
+ current_id = snap_min = txn_global->current;
+ oldest_session = NULL;
+ prev_oldest_id = txn_global->oldest_id;
+
+ /*
+ * For pure read-only workloads, or if the update isn't forced and the
+ * oldest ID isn't too far behind, avoid scanning.
+ */
+ if (prev_oldest_id == current_id ||
+ (!force && TXNID_LT(current_id, prev_oldest_id + 100)))
+ return;
+
+ /*
+ * We're going to scan. Increment the count of scanners to prevent the
+ * oldest ID from moving forwards. Spin if the count is negative,
+ * which indicates that some thread is moving the oldest ID forwards.
+ */
+ do {
+ if ((count = txn_global->scan_count) < 0)
+ WT_PAUSE();
+ } while (count < 0 ||
+ !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1));
+
+ /* The oldest ID cannot change until the scan count goes to zero. */
+ prev_oldest_id = txn_global->oldest_id;
+ current_id = oldest_id = snap_min = txn_global->current;
+
+ /* Walk the array of concurrent transactions. */
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ ckpt_id = txn_global->checkpoint_id;
+ for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
+ /* Skip the checkpoint transaction; it is never read from. */
+ if (ckpt_id != WT_TXN_NONE && ckpt_id == s->id)
+ continue;
/*
- * Ignore the session's own snap_min: we are about to update
- * it.
+ * Update the oldest ID.
+ *
+ * Ignore: IDs older than the oldest ID we saw. This can happen
+ * if we race with a thread that is allocating an ID -- the ID
+ * will not be used because the thread will keep spinning until
+ * it gets a valid one.
*/
- if (get_snapshot && s == txn_state)
- continue;
+ if ((id = s->id) != WT_TXN_NONE &&
+ TXNID_LE(prev_oldest_id, id) && TXNID_LT(id, snap_min))
+ snap_min = id;
/*
* !!!
@@ -187,49 +250,31 @@ __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot)
if ((id = s->snap_min) != WT_TXN_NONE &&
TXNID_LT(id, oldest_id)) {
oldest_id = id;
- oldest_session = i;
+ oldest_session = &conn->sessions[i];
}
}
if (TXNID_LT(snap_min, oldest_id))
oldest_id = snap_min;
- if (txn->id != WT_TXN_NONE && TXNID_LT(txn->id, oldest_id))
- oldest_id = txn->id;
-
- /*
- * If we got a new snapshot, update the published snap_min for this
- * session.
- */
- if (get_snapshot) {
- WT_ASSERT(session, TXNID_LE(prev_oldest_id, snap_min));
- WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
- txn_state->snap_min = snap_min;
- }
- /*
- * Update the last running ID if we have a much newer value or we are
- * forcing an update.
- */
- if (!get_snapshot || snap_min > txn_global->last_running + 100)
+ /* Update the last running ID. */
+ if (TXNID_LT(txn_global->last_running, snap_min)) {
txn_global->last_running = snap_min;
+ last_running_moved = 1;
+ } else
+ last_running_moved = 0;
- /*
- * Update the oldest ID if we have a newer ID and we can get exclusive
- * access. During normal snapshot refresh, only do this if we have a
- * much newer value. Once we get exclusive access, do another pass to
- * make sure nobody else is using an earlier ID.
- */
+ /* Update the oldest ID. */
if (TXNID_LT(prev_oldest_id, oldest_id) &&
- (!get_snapshot || oldest_id - prev_oldest_id > 100) &&
WT_ATOMIC_CAS4(txn_global->scan_count, 1, -1)) {
WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ ckpt_id = txn_global->checkpoint_id;
for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
/*
* Skip the checkpoint transaction; it is never read
* from.
*/
- if (txn_global->checkpoint_id != WT_TXN_NONE &&
- s->id == txn_global->checkpoint_id)
+ if (ckpt_id != WT_TXN_NONE && ckpt_id == s->id)
continue;
if ((id = s->id) != WT_TXN_NONE &&
@@ -244,23 +289,19 @@ __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot)
txn_global->scan_count = 0;
} else {
if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
- current_id - oldest_id > 10000 &&
- txn_global->oldest_session != oldest_session) {
+ current_id - oldest_id > 10000 && last_running_moved &&
+ oldest_session != NULL) {
(void)__wt_verbose(session, WT_VERB_TRANSACTION,
"old snapshot %" PRIu64
" pinned in session %d [%s]"
" with snap_min %" PRIu64 "\n",
- oldest_id, oldest_session,
- conn->sessions[oldest_session].lastop,
- conn->sessions[oldest_session].txn.snap_min);
- txn_global->oldest_session = oldest_session;
+ oldest_id, oldest_session->id,
+ oldest_session->lastop,
+ oldest_session->txn.snap_min);
}
WT_ASSERT(session, txn_global->scan_count > 0);
(void)WT_ATOMIC_SUB4(txn_global->scan_count, 1);
}
-
- if (get_snapshot)
- __txn_sort_snapshot(session, n, current_id);
}
/*
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
index 45560ff897a..1ae593fd6be 100644
--- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -185,7 +185,7 @@ __checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[],
session->ckpt_handle[i].dhandle,
ret = (*op)(session, cfg));
else
- WT_WITH_DHANDLE_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session,
ret = __wt_conn_btree_apply_single(session,
session->ckpt_handle[i].name, NULL, op, cfg));
WT_RET(ret);
@@ -376,7 +376,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
*/
WT_WITH_SCHEMA_LOCK(session,
WT_WITH_TABLE_LOCK(session,
- WT_WITH_DHANDLE_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session,
ret = __checkpoint_apply_all(
session, cfg, __wt_checkpoint_list, NULL))));
WT_ERR(ret);
@@ -387,7 +387,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
* This is particularly important for compact, so that all dirty pages
* can be fully written.
*/
- __wt_txn_update_oldest(session);
+ __wt_txn_update_oldest(session, 1);
/* Flush data-sources before we start the checkpoint. */
WT_ERR(__checkpoint_data_source(session, cfg));
@@ -411,7 +411,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync));
/* Acquire the schema lock. */
- F_SET(session, WT_SESSION_SCHEMA_LOCKED);
+ F_SET(session, WT_SESSION_LOCKED_SCHEMA);
__wt_spin_lock(session, &conn->schema_lock);
WT_ERR(__wt_meta_track_on(session));
@@ -568,8 +568,8 @@ err: /*
__wt_free(session, session->ckpt_handle);
session->ckpt_handle_allocated = session->ckpt_handle_next = 0;
- if (F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) {
- F_CLR(session, WT_SESSION_SCHEMA_LOCKED);
+ if (F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)) {
+ F_CLR(session, WT_SESSION_LOCKED_SCHEMA);
__wt_spin_unlock(session, &conn->schema_lock);
}
@@ -1057,7 +1057,7 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
WT_ASSERT(session, session->dhandle->checkpoint == NULL);
/* Should be holding the schema lock. */
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA));
return (__checkpoint_worker(session, cfg, 1));
}
@@ -1107,7 +1107,7 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, int final)
* for active readers.
*/
if (!btree->modified && !bulk) {
- __wt_txn_update_oldest(session);
+ __wt_txn_update_oldest(session, 1);
return (__wt_txn_visible_all(session, btree->rec_max_txn) ?
__wt_cache_op(session, NULL, WT_SYNC_DISCARD) : EBUSY);
}
@@ -1123,7 +1123,7 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, int final)
*/
if (!final)
WT_ASSERT(session,
- bulk || F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+ bulk || F_ISSET(session, WT_SESSION_LOCKED_SCHEMA));
need_tracking = !bulk && !final && !WT_META_TRACKING(session);
if (need_tracking)
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index 571754bf5bf..540b0528995 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -65,7 +65,7 @@ __recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r,
"No file found with ID %u (max %u)",
id, r->nfiles));
r->missing = 1;
- } else if (LOG_CMP(lsnp, &r->files[id].ckpt_lsn) >= 0) {
+ } else if (WT_LOG_CMP(lsnp, &r->files[id].ckpt_lsn) >= 0) {
/*
* We're going to apply the operation. Get the cursor, opening
* one if none is cached.
@@ -423,7 +423,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP) ? 1 : 0;
/* We need a real session for recovery. */
- WT_RET(__wt_open_session(conn, NULL, NULL, &session));
+ WT_RET(__wt_open_session(conn, NULL, NULL, 1, &session));
F_SET(session, WT_SESSION_NO_LOGGING);
r.session = session;