summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/async/async_api.c5
-rw-r--r--src/async/async_worker.c2
-rw-r--r--src/btree/bt_cursor.c136
-rw-r--r--src/btree/bt_debug.c4
-rw-r--r--src/btree/bt_random.c427
-rw-r--r--src/btree/bt_split.c233
-rw-r--r--src/btree/bt_walk.c4
-rw-r--r--src/btree/row_srch.c212
-rw-r--r--src/checksum/power8/crc32_wrapper.c4
-rw-r--r--src/checksum/zseries/crc32-s390x.c26
-rw-r--r--src/config/config_def.c60
-rw-r--r--src/conn/conn_api.c11
-rw-r--r--src/conn/conn_cache.c6
-rw-r--r--src/conn/conn_cache_pool.c8
-rw-r--r--src/conn/conn_ckpt.c26
-rw-r--r--src/conn/conn_dhandle.c55
-rw-r--r--src/conn/conn_handle.c21
-rw-r--r--src/conn/conn_log.c50
-rw-r--r--src/conn/conn_open.c42
-rw-r--r--src/conn/conn_stat.c33
-rw-r--r--src/conn/conn_sweep.c26
-rw-r--r--src/cursor/cur_backup.c8
-rw-r--r--src/cursor/cur_index.c26
-rw-r--r--src/cursor/cur_std.c7
-rw-r--r--src/cursor/cur_table.c2
-rw-r--r--src/docs/cursor-random.dox5
-rw-r--r--src/docs/upgrading.dox6
-rw-r--r--src/docs/wtperf.dox6
-rw-r--r--src/evict/evict_lru.c617
-rw-r--r--src/evict/evict_stat.c2
-rw-r--r--src/include/btmem.h8
-rw-r--r--src/include/btree.i28
-rw-r--r--src/include/cache.h2
-rw-r--r--src/include/cache.i2
-rw-r--r--src/include/connection.h8
-rw-r--r--src/include/dhandle.h18
-rw-r--r--src/include/extern.h25
-rw-r--r--src/include/extern_posix.h4
-rw-r--r--src/include/extern_win.h4
-rw-r--r--src/include/flags.h79
-rw-r--r--src/include/log.h3
-rw-r--r--src/include/misc.i5
-rw-r--r--src/include/mutex.h4
-rw-r--r--src/include/packing.i7
-rw-r--r--src/include/schema.h162
-rw-r--r--src/include/session.h2
-rw-r--r--src/include/stat.h4
-rw-r--r--src/include/wiredtiger.in251
-rw-r--r--src/log/log.c42
-rw-r--r--src/log/log_slot.c206
-rw-r--r--src/lsm/lsm_cursor.c4
-rw-r--r--src/lsm/lsm_manager.c12
-rw-r--r--src/lsm/lsm_stat.c4
-rw-r--r--src/lsm/lsm_tree.c63
-rw-r--r--src/lsm/lsm_work_unit.c4
-rw-r--r--src/lsm/lsm_worker.c2
-rw-r--r--src/os_posix/os_mtx_cond.c28
-rw-r--r--src/os_win/os_mtx_cond.c43
-rw-r--r--src/schema/schema_drop.c2
-rw-r--r--src/schema/schema_list.c2
-rw-r--r--src/schema/schema_rename.c2
-rw-r--r--src/schema/schema_worker.c2
-rw-r--r--src/session/session_api.c56
-rw-r--r--src/session/session_dhandle.c43
-rw-r--r--src/support/cond_auto.c80
-rw-r--r--src/support/rand.c12
-rw-r--r--src/support/stat.c16
-rw-r--r--src/support/thread_group.c2
-rw-r--r--src/txn/txn.c95
-rw-r--r--src/txn/txn_ckpt.c39
-rw-r--r--src/txn/txn_log.c4
-rw-r--r--src/utilities/util.h2
-rw-r--r--src/utilities/util_alter.c9
-rw-r--r--src/utilities/util_compact.c14
-rw-r--r--src/utilities/util_create.c12
-rw-r--r--src/utilities/util_drop.c10
-rw-r--r--src/utilities/util_dump.c26
-rw-r--r--src/utilities/util_list.c21
-rw-r--r--src/utilities/util_load.c2
-rw-r--r--src/utilities/util_load_json.c2
-rw-r--r--src/utilities/util_loadtext.c13
-rw-r--r--src/utilities/util_main.c4
-rw-r--r--src/utilities/util_printlog.c14
-rw-r--r--src/utilities/util_read.c19
-rw-r--r--src/utilities/util_rebalance.c30
-rw-r--r--src/utilities/util_rename.c15
-rw-r--r--src/utilities/util_salvage.c30
-rw-r--r--src/utilities/util_stat.c6
-rw-r--r--src/utilities/util_truncate.c11
-rw-r--r--src/utilities/util_upgrade.c30
-rw-r--r--src/utilities/util_verify.c34
-rw-r--r--src/utilities/util_write.c20
92 files changed, 2160 insertions, 1613 deletions
diff --git a/src/async/async_api.c b/src/async/async_api.c
index 54bcb7cd26c..026a008188c 100644
--- a/src/async/async_api.c
+++ b/src/async/async_api.c
@@ -240,8 +240,7 @@ __async_start(WT_SESSION_IMPL *session)
async = conn->async;
TAILQ_INIT(&async->formatqh);
WT_RET(__wt_spin_init(session, &async->ops_lock, "ops"));
- WT_RET(__wt_cond_alloc(
- session, "async flush", false, &async->flush_cond));
+ WT_RET(__wt_cond_alloc(session, "async flush", &async->flush_cond));
WT_RET(__wt_async_op_init(session));
/*
@@ -541,7 +540,7 @@ retry:
async->flush_op.state = WT_ASYNCOP_READY;
WT_RET(__wt_async_op_enqueue(session, &async->flush_op));
while (async->flush_state != WT_ASYNC_FLUSH_COMPLETE)
- __wt_cond_wait(session, async->flush_cond, 100000);
+ __wt_cond_wait(session, async->flush_cond, 100000, NULL);
/*
* Flush is done. Clear the flags.
*/
diff --git a/src/async/async_worker.c b/src/async/async_worker.c
index b1bc3902f7c..11f59ed14f1 100644
--- a/src/async/async_worker.c
+++ b/src/async/async_worker.c
@@ -107,7 +107,7 @@ __async_flush_wait(WT_SESSION_IMPL *session, WT_ASYNC *async, uint64_t my_gen)
{
while (async->flush_state == WT_ASYNC_FLUSHING &&
async->flush_gen == my_gen)
- __wt_cond_wait(session, async->flush_cond, 10000);
+ __wt_cond_wait(session, async->flush_cond, 10000, NULL);
}
/*
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
index d18b9b76992..5fde2237538 100644
--- a/src/btree/bt_cursor.c
+++ b/src/btree/bt_cursor.c
@@ -76,11 +76,11 @@ __cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt)
}
/*
- * __cursor_valid --
+ * __wt_cursor_valid --
* Return if the cursor references an valid key/value pair.
*/
-static inline bool
-__cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
+bool
+__wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
{
WT_BTREE *btree;
WT_CELL *cell;
@@ -330,7 +330,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt)
WT_ERR(btree->type == BTREE_ROW ?
__cursor_row_search(session, cbt, cbt->ref, false) :
__cursor_col_search(session, cbt, cbt->ref));
- valid = cbt->compare == 0 && __cursor_valid(cbt, &upd);
+ valid = cbt->compare == 0 && __wt_cursor_valid(cbt, &upd);
}
if (!valid) {
WT_ERR(__cursor_func_init(cbt, true));
@@ -338,7 +338,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt)
WT_ERR(btree->type == BTREE_ROW ?
__cursor_row_search(session, cbt, NULL, false) :
__cursor_col_search(session, cbt, NULL));
- valid = cbt->compare == 0 && __cursor_valid(cbt, &upd);
+ valid = cbt->compare == 0 && __wt_cursor_valid(cbt, &upd);
}
if (valid)
@@ -419,14 +419,14 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
* Ignore those cases, it makes things too complicated.
*/
if (cbt->slot != 0 && cbt->slot != cbt->ref->page->entries - 1)
- valid = __cursor_valid(cbt, &upd);
+ valid = __wt_cursor_valid(cbt, &upd);
}
if (!valid) {
WT_ERR(__cursor_func_init(cbt, true));
WT_ERR(btree->type == BTREE_ROW ?
__cursor_row_search(session, cbt, NULL, true) :
__cursor_col_search(session, cbt, NULL));
- valid = __cursor_valid(cbt, &upd);
+ valid = __wt_cursor_valid(cbt, &upd);
}
/*
@@ -462,7 +462,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
WT_ERR(btree->type == BTREE_ROW ?
__cursor_row_search(session, cbt, NULL, true) :
__cursor_col_search(session, cbt, NULL));
- if (__cursor_valid(cbt, &upd)) {
+ if (__wt_cursor_valid(cbt, &upd)) {
exact = cbt->compare;
ret = __wt_kv_return(session, cbt, upd);
} else if ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND)
@@ -537,7 +537,7 @@ retry: WT_RET(__cursor_func_init(cbt, true));
* Fail in that case, the record exists.
*/
if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
- ((cbt->compare == 0 && __cursor_valid(cbt, NULL)) ||
+ ((cbt->compare == 0 && __wt_cursor_valid(cbt, NULL)) ||
(cbt->compare != 0 && __cursor_fix_implicit(btree, cbt))))
WT_ERR(WT_DUPLICATE_KEY);
@@ -552,7 +552,7 @@ retry: WT_RET(__cursor_func_init(cbt, true));
* key/value pair.
*/
if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
- cbt->compare == 0 && __cursor_valid(cbt, NULL))
+ cbt->compare == 0 && __wt_cursor_valid(cbt, NULL))
WT_ERR(WT_DUPLICATE_KEY);
ret = __cursor_row_modify(session, cbt, false);
@@ -682,12 +682,12 @@ retry: WT_RET(__cursor_func_init(cbt, true));
/*
* If we find a matching record, check whether an update would
* conflict. Do this before checking if the update is visible
- * in __cursor_valid, or we can miss conflict.
+ * in __wt_cursor_valid, or we can miss conflict.
*/
WT_ERR(__curfile_update_check(cbt));
/* Remove the record if it exists. */
- if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) {
+ if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) {
if (!__cursor_fix_implicit(btree, cbt))
WT_ERR(WT_NOTFOUND);
/*
@@ -711,7 +711,7 @@ retry: WT_RET(__cursor_func_init(cbt, true));
/* Check whether an update would conflict. */
WT_ERR(__curfile_update_check(cbt));
- if (cbt->compare != 0 || !__cursor_valid(cbt, NULL))
+ if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL))
WT_ERR(WT_NOTFOUND);
ret = __cursor_row_modify(session, cbt, true);
@@ -786,7 +786,8 @@ retry: WT_RET(__cursor_func_init(cbt, true));
*/
if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
WT_ERR(__curfile_update_check(cbt));
- if ((cbt->compare != 0 || !__cursor_valid(cbt, NULL)) &&
+ if ((cbt->compare != 0 ||
+ !__wt_cursor_valid(cbt, NULL)) &&
!__cursor_fix_implicit(btree, cbt))
WT_ERR(WT_NOTFOUND);
}
@@ -800,7 +801,7 @@ retry: WT_RET(__cursor_func_init(cbt, true));
*/
if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
WT_ERR(__curfile_update_check(cbt));
- if (cbt->compare != 0 || !__cursor_valid(cbt, NULL))
+ if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL))
WT_ERR(WT_NOTFOUND);
}
ret = __cursor_row_modify(session, cbt, false);
@@ -830,111 +831,6 @@ err: if (ret == WT_RESTART) {
}
/*
- * __wt_btcur_next_random --
- * Move to a random record in the tree. There are two algorithms, one
- * where we select a record at random from the whole tree on each
- * retrieval and one where we first select a record at random from the
- * whole tree, and then subsequently sample forward from that location.
- * The sampling approach allows us to select reasonably uniform random
- * points from unbalanced trees.
- */
-int
-__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
-{
- WT_BTREE *btree;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
- WT_UPDATE *upd;
- wt_off_t size;
- uint64_t skip;
-
- session = (WT_SESSION_IMPL *)cbt->iface.session;
- btree = cbt->btree;
-
- /*
- * Only supports row-store: applications can trivially select a random
- * value from a column-store, if there were any reason to do so.
- */
- if (btree->type != BTREE_ROW)
- WT_RET_MSG(session, ENOTSUP,
- "WT_CURSOR.next_random only supported by row-store tables");
-
- WT_STAT_CONN_INCR(session, cursor_next);
- WT_STAT_DATA_INCR(session, cursor_next);
-
- /*
- * If retrieving random values without sampling, or we don't have a
- * page reference, pick a roughly random leaf page in the tree.
- */
- if (cbt->ref == NULL || cbt->next_random_sample_size == 0) {
- /*
- * Skip past the sample size of the leaf pages in the tree
- * between each random key return to compensate for unbalanced
- * trees.
- *
- * Use the underlying file size divided by its block allocation
- * size as our guess of leaf pages in the file (this can be
- * entirely wrong, as it depends on how many pages are in this
- * particular checkpoint, how large the leaf and internal pages
- * really are, and other factors). Then, divide that value by
- * the configured sample size and increment the final result to
- * make sure tiny files don't leave us with a skip value of 0.
- *
- * !!!
- * Ideally, the number would be prime to avoid restart issues.
- */
- if (cbt->next_random_sample_size != 0) {
- WT_ERR(btree->bm->size(btree->bm, session, &size));
- cbt->next_random_leaf_skip = (uint64_t)
- ((size / btree->allocsize) /
- cbt->next_random_sample_size) + 1;
- }
-
- /*
- * Choose a leaf page from the tree.
- */
- WT_ERR(__cursor_func_init(cbt, true));
- WT_WITH_PAGE_INDEX(
- session, ret = __wt_row_random_descent(session, cbt));
- WT_ERR(ret);
- } else {
- /*
- * Read through the tree, skipping leaf pages. Be cautious about
- * the skip count: if the last leaf page skipped was also the
- * last leaf page in the tree, it may be set to zero on return
- * with the end-of-walk condition.
- *
- * Pages read for data sampling aren't "useful"; don't update
- * the read generation of pages already in memory, and if a page
- * is read, set its generation to a low value so it is evicted
- * quickly.
- */
- for (skip =
- cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;)
- WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip,
- WT_READ_NO_GEN |
- WT_READ_SKIP_INTL | WT_READ_WONT_NEED));
- }
-
- /*
- * Select a random entry from the leaf page. If it's not valid, move to
- * the next entry, if that doesn't work, move to the previous entry.
- */
- WT_ERR(__wt_row_random_leaf(session, cbt));
- if (__cursor_valid(cbt, &upd))
- WT_ERR(__wt_kv_return(session, cbt, upd));
- else {
- if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND)
- ret = __wt_btcur_prev(cbt, false);
- WT_ERR(ret);
- }
- return (0);
-
-err: WT_TRET(__cursor_reset(cbt));
- return (ret);
-}
-
-/*
* __wt_btcur_compare --
* Return a comparison between two cursors.
*/
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index b62125e069d..d664da2ebd3 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -652,7 +652,7 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref)
page = ref->page;
mod = page->modify;
- WT_RET(ds->f(ds, "%p", (void *)page));
+ WT_RET(ds->f(ds, "%p", (void *)ref));
switch (page->type) {
case WT_PAGE_COL_INT:
@@ -699,8 +699,6 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref)
WT_RET(ds->f(ds, ", evict-lru"));
if (F_ISSET_ATOMIC(page, WT_PAGE_OVERFLOW_KEYS))
WT_RET(ds->f(ds, ", overflow-keys"));
- if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK))
- WT_RET(ds->f(ds, ", split-block"));
if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT))
WT_RET(ds->f(ds, ", split-insert"));
if (F_ISSET_ATOMIC(page, WT_PAGE_UPDATE_IGNORE))
diff --git a/src/btree/bt_random.c b/src/btree/bt_random.c
new file mode 100644
index 00000000000..44de511f787
--- /dev/null
+++ b/src/btree/bt_random.c
@@ -0,0 +1,427 @@
+/*-
+ * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_row_random_leaf --
+ * Return a random key from a row-store leaf page.
+ */
+int
+__wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+ WT_INSERT *ins, **start, **stop;
+ WT_INSERT_HEAD *ins_head;
+ WT_PAGE *page;
+ uint64_t samples;
+ uint32_t choice, entries, i;
+ int level;
+
+ page = cbt->ref->page;
+ start = stop = NULL; /* [-Wconditional-uninitialized] */
+ entries = 0; /* [-Wconditional-uninitialized] */
+
+ __cursor_pos_clear(cbt);
+
+ /* If the page has disk-based entries, select from them. */
+ if (page->entries != 0) {
+ cbt->compare = 0;
+ cbt->slot = __wt_random(&session->rnd) % page->entries;
+
+ /*
+ * The real row-store search function builds the key, so we
+ * have to as well.
+ */
+ return (__wt_row_leaf_key(session,
+ page, page->pg_row + cbt->slot, cbt->tmp, false));
+ }
+
+ /*
+ * If the tree is new (and not empty), it might have a large insert
+ * list.
+ *
+ * Walk down the list until we find a level with at least 50 entries,
+ * that's where we'll start rolling random numbers. The value 50 is
+ * used to ignore levels with only a few entries, that is, levels which
+ * are potentially badly skewed.
+ */
+ F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
+ if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
+ return (WT_NOTFOUND);
+ for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) {
+ start = &ins_head->head[level];
+ for (entries = 0, stop = start;
+ *stop != NULL; stop = &(*stop)->next[level])
+ ++entries;
+
+ if (entries > 50)
+ break;
+ }
+
+ /*
+ * If it's a tiny list and we went all the way to level 0, correct the
+ * level; entries is correctly set.
+ */
+ if (level < 0)
+ level = 0;
+
+ /*
+ * Step down the skip list levels, selecting a random chunk of the name
+ * space at each level.
+ */
+ for (samples = entries; level > 0; samples += entries) {
+ /*
+ * There are (entries) or (entries + 1) chunks of the name space
+ * considered at each level. They are: between start and the 1st
+ * element, between the 1st and 2nd elements, and so on to the
+ * last chunk which is the name space after the stop element on
+ * the current level. This last chunk of name space may or may
+ * not be there: as we descend the levels of the skip list, this
+ * chunk may appear, depending if the next level down has
+ * entries logically after the stop point in the current level.
+ * We can't ignore those entries: because of the algorithm used
+ * to determine the depth of a skiplist, there may be a large
+ * number of entries "revealed" by descending a level.
+ *
+ * If the next level down has more items after the current stop
+ * point, there are (entries + 1) chunks to consider, else there
+ * are (entries) chunks.
+ */
+ if (*(stop - 1) == NULL)
+ choice = __wt_random(&session->rnd) % entries;
+ else
+ choice = __wt_random(&session->rnd) % (entries + 1);
+
+ if (choice == entries) {
+ /*
+ * We selected the name space after the stop element on
+ * this level. Set the start point to the current stop
+ * point, descend a level and move the stop element to
+ * the end of the list, that is, the end of the newly
+ * discovered name space, counting entries as we go.
+ */
+ start = stop;
+ --start;
+ --level;
+ for (entries = 0, stop = start;
+ *stop != NULL; stop = &(*stop)->next[level])
+ ++entries;
+ } else {
+ /*
+ * We selected another name space on the level. Move the
+ * start pointer the selected number of entries forward
+ * to the start of the selected chunk (if the selected
+ * number is 0, start won't move). Set the stop pointer
+ * to the next element in the list and drop both start
+ * and stop down a level.
+ */
+ for (i = 0; i < choice; ++i)
+ start = &(*start)->next[level];
+ stop = &(*start)->next[level];
+
+ --start;
+ --stop;
+ --level;
+
+ /* Count the entries in the selected name space. */
+ for (entries = 0,
+ ins = *start; ins != *stop; ins = ins->next[level])
+ ++entries;
+ }
+ }
+
+ /*
+ * When we reach the bottom level, entries will already be set. Select
+ * a random entry from the name space and return it.
+ *
+ * It should be impossible for the entries count to be 0 at this point,
+ * but check for it out of paranoia and to quiet static testing tools.
+ */
+ if (entries > 0)
+ entries = __wt_random(&session->rnd) % entries;
+ for (ins = *start; entries > 0; --entries)
+ ins = ins->next[0];
+
+ cbt->ins = ins;
+ cbt->ins_head = ins_head;
+ cbt->compare = 0;
+
+ /*
+ * Random lookups in newly created collections can be slow if a page
+ * consists of a large skiplist. Schedule the page for eviction if we
+ * encounter a large skiplist. This is worthwhile because applications
+ * that take a sample often take many samples, so the overhead of
+ * traversing the skip list each time accumulates to real time.
+ */
+ if (samples > 5000)
+ __wt_page_evict_soon(session, cbt->ref);
+
+ return (0);
+}
+
+/*
+ * __wt_random_descent --
+ * Find a random page in a tree for either sampling or eviction.
+ */
+int
+__wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *current, *descent;
+ uint32_t flags, i, entries, retry;
+
+ btree = S2BT(session);
+ current = NULL;
+ retry = 100;
+
+ /* Eviction should not be tapped to do eviction. */
+ if (eviction)
+ flags = WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN |
+ WT_READ_NO_WAIT | WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK;
+ else
+ flags = WT_READ_RESTART_OK;
+
+ if (0) {
+restart: /*
+ * Discard the currently held page and restart the search from
+ * the root.
+ */
+ WT_RET(__wt_page_release(session, current, flags));
+ }
+
+ /* Search the internal pages of the tree. */
+ current = &btree->root;
+ for (;;) {
+ page = current->page;
+ if (!WT_PAGE_IS_INTERNAL(page))
+ break;
+
+ WT_INTL_INDEX_GET(session, page, pindex);
+ entries = pindex->entries;
+
+ /* Eviction just wants any random child. */
+ if (eviction) {
+ descent = pindex->index[
+ __wt_random(&session->rnd) % entries];
+ goto descend;
+ }
+
+ /*
+ * There may be empty pages in the tree, and they're useless to
+ * us. If we don't find a non-empty page in "entries" random
+ * guesses, take the first non-empty page in the tree. If the
+ * search page contains nothing other than empty pages, restart
+ * from the root some number of times before giving up.
+ *
+ * Random sampling is looking for a key/value pair on a random
+ * leaf page, and so will accept any page that contains a valid
+ * key/value pair, so on-disk is fine, but deleted is not.
+ */
+ descent = NULL;
+ for (i = 0; i < entries; ++i) {
+ descent =
+ pindex->index[__wt_random(&session->rnd) % entries];
+ if (descent->state == WT_REF_MEM ||
+ descent->state == WT_REF_DISK)
+ break;
+ }
+ if (i == entries)
+ for (i = 0; i < entries; ++i) {
+ descent = pindex->index[i];
+ if (descent->state == WT_REF_MEM ||
+ descent->state == WT_REF_DISK)
+ break;
+ }
+ if (i == entries || descent == NULL) {
+ if (--retry > 0)
+ goto restart;
+
+ WT_RET(__wt_page_release(session, current, flags));
+ return (WT_NOTFOUND);
+ }
+
+ /*
+ * Swap the current page for the child page. If the page splits
+ * while we're retrieving it, restart the search at the root.
+ *
+ * On other error, simply return, the swap call ensures we're
+ * holding nothing on failure.
+ */
+descend: if ((ret =
+ __wt_page_swap(session, current, descent, flags)) == 0) {
+ current = descent;
+ continue;
+ }
+ if (eviction && (ret == WT_NOTFOUND || ret == WT_RESTART))
+ break;
+ if (ret == WT_RESTART)
+ goto restart;
+ return (ret);
+ }
+
+ /*
+ * There is no point starting with the root page: the walk will exit
+ * immediately. In that case we aren't holding a hazard pointer so
+ * there is nothing to release.
+ */
+ if (!eviction || !__wt_ref_is_root(current))
+ *refp = current;
+ return (0);
+}
+
+/*
+ * __wt_btcur_next_random --
+ * Move to a random record in the tree. There are two algorithms, one
+ * where we select a record at random from the whole tree on each
+ * retrieval and one where we first select a record at random from the
+ * whole tree, and then subsequently sample forward from that location.
+ * The sampling approach allows us to select reasonably uniform random
+ * points from unbalanced trees.
+ */
+int
+__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+ wt_off_t size;
+ uint64_t n, skip;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ btree = cbt->btree;
+
+ /*
+ * Only supports row-store: applications can trivially select a random
+ * value from a column-store, if there were any reason to do so.
+ */
+ if (btree->type != BTREE_ROW)
+ WT_RET_MSG(session, ENOTSUP,
+ "WT_CURSOR.next_random only supported by row-store tables");
+
+ WT_STAT_CONN_INCR(session, cursor_next);
+ WT_STAT_DATA_INCR(session, cursor_next);
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * Under some conditions we end up using the underlying cursor.next to
+ * walk through the object. Since there are multiple calls, we can hit
+ * the cursor-order checks, turn them off.
+ */
+ __wt_cursor_key_order_reset(cbt);
+#endif
+
+ /*
+ * If we don't have a current position in the tree, or if retrieving
+ * random values without sampling, pick a roughly random leaf page in
+ * the tree and return an entry from it.
+ */
+ if (cbt->ref == NULL || cbt->next_random_sample_size == 0) {
+ WT_ERR(__cursor_func_init(cbt, true));
+ WT_WITH_PAGE_INDEX(session,
+ ret = __wt_random_descent(session, &cbt->ref, false));
+ if (ret == 0)
+ goto random_page_entry;
+
+ /*
+ * Random descent may return not-found: the tree might be empty
+ * or have so many deleted items we didn't find any valid pages.
+ * We can't return WT_NOTFOUND to the application unless a tree
+ * is really empty, so fall back to skipping through tree pages.
+ */
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+
+ /*
+ * Cursor through the tree, skipping past the sample size of the leaf
+ * pages in the tree between each random key return to compensate for
+ * unbalanced trees.
+ *
+ * If the random descent attempt failed, we don't have a configured
+ * sample size, use 100 for no particular reason.
+ */
+ if (cbt->next_random_sample_size == 0)
+ cbt->next_random_sample_size = 100;
+
+ /*
+ * If the random descent attempt failed, or it's our first skip attempt,
+ * we haven't yet set the pages to skip, do it now.
+ *
+ * Use the underlying file size divided by its block allocation size as
+ * our guess of leaf pages in the file (this can be entirely wrong, as
+ * it depends on how many pages are in this particular checkpoint, how
+ * large the leaf and internal pages really are, and other factors).
+ * Then, divide that value by the configured sample size and increment
+ * the final result to make sure tiny files don't leave us with a skip
+ * value of 0.
+ *
+ * !!!
+ * Ideally, the number would be prime to avoid restart issues.
+ */
+ if (cbt->next_random_leaf_skip == 0) {
+ WT_ERR(btree->bm->size(btree->bm, session, &size));
+ cbt->next_random_leaf_skip = (uint64_t)
+ ((size / btree->allocsize) /
+ cbt->next_random_sample_size) + 1;
+ }
+
+ /*
+ * Be paranoid about loop termination: first, if the last leaf page
+ * skipped was also the last leaf page in the tree, skip may be set to
+ * zero on return along with the NULL WT_REF end-of-walk condition.
+ * Second, if a tree has no valid pages at all (the condition after
+ * initial creation), we might make no progress at all, or finally, if
+ * a tree has only deleted pages, we'll make progress, but never get a
+ * useful WT_REF. And, of course, the tree can switch from one of these
+ * states to another without warning. Decrement skip regardless of what
+ * is happening in the search, guarantee we eventually quit.
+ *
+ * Pages read for data sampling aren't "useful"; don't update the read
+ * generation of pages already in memory, and if a page is read, set
+ * its generation to a low value so it is evicted quickly.
+ */
+ for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) {
+ n = skip;
+ WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip,
+ WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED));
+ if (n == skip) {
+ if (skip == 0)
+ break;
+ --skip;
+ }
+ }
+
+ /*
+ * We can't return WT_NOTFOUND to the application unless a tree is
+ * really empty, so fall back to a random entry from the first page in
+ * the tree that has anything at all.
+ */
+ if (cbt->ref == NULL)
+ WT_ERR(__wt_btcur_next(cbt, false));
+
+random_page_entry:
+ /*
+ * Select a random entry from the leaf page. If it's not valid, move to
+ * the next entry, if that doesn't work, move to the previous entry.
+ */
+ WT_ERR(__wt_row_random_leaf(session, cbt));
+ if (__wt_cursor_valid(cbt, &upd))
+ WT_ERR(__wt_kv_return(session, cbt, upd));
+ else {
+ if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND)
+ ret = __wt_btcur_prev(cbt, false);
+ WT_ERR(ret);
+ }
+ return (0);
+
+err: WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 6b0b8a08c02..45550ff627f 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -54,6 +54,16 @@ __split_oldest_gen(WT_SESSION_IMPL *session)
}
/*
+ * __wt_split_obsolete --
+ * Check if it is safe to free / evict based on split generation.
+ */
+bool
+__wt_split_obsolete(WT_SESSION_IMPL *session, uint64_t split_gen)
+{
+ return (split_gen < __split_oldest_gen(session));
+}
+
+/*
* __split_stash_add --
* Add a new entry into the session's split stash list.
*/
@@ -187,7 +197,7 @@ __split_safe_free(WT_SESSION_IMPL *session,
#ifdef HAVE_DIAGNOSTIC
/*
* __split_verify_intl_key_order --
- * Verify the key order on an internal page after a split, diagnostic only.
+ * Verify the key order on an internal page after a split.
*/
static void
__split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
@@ -239,6 +249,46 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
break;
}
}
+
+/*
+ * __split_verify_root --
+ * Verify a root page involved in a split.
+ */
+static int
+__split_verify_root(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_DECL_RET;
+ WT_REF *ref;
+
+ /* The split is complete and live, verify all of the pages involved. */
+ __split_verify_intl_key_order(session, page);
+
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ /*
+ * An eviction thread might be attempting to evict the page
+ * (the WT_REF may be WT_REF_LOCKED), or it may be a disk based
+ * page (the WT_REF may be WT_REF_READING), or it may be in
+ * some other state. Acquire a hazard pointer for any
+ * in-memory pages so we know the state of the page.
+ *
+ * Ignore pages not in-memory (deleted, on-disk, being read),
+ * there's no in-memory structure to check.
+ */
+ if ((ret = __wt_page_in(session,
+ ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND)
+ continue;
+ WT_ERR(ret);
+
+ __split_verify_intl_key_order(session, ref->page);
+
+ WT_ERR(__wt_page_release(session, ref, WT_READ_NO_EVICT));
+ } WT_INTL_FOREACH_END;
+
+ return (0);
+
+err: /* Something really bad just happened. */
+ WT_PANIC_RET(session, ret, "fatal error during page split");
+}
#endif
/*
@@ -390,12 +440,12 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
}
/*
- * __split_ref_step1 --
+ * __split_ref_prepare --
* Prepare a set of WT_REFs for a move.
*/
static void
-__split_ref_step1(
- WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first)
+__split_ref_prepare(WT_SESSION_IMPL *session,
+ WT_PAGE_INDEX *pindex, uint64_t split_gen, bool skip_first)
{
WT_PAGE *child;
WT_REF *child_ref, *ref;
@@ -418,30 +468,25 @@ __split_ref_step1(
child = ref->page;
/*
- * Block eviction and splits in newly created pages.
+ * Block eviction in newly created pages.
*
* Once the split is live, newly created internal pages might be
* evicted and their WT_REF structures freed. If that happened
* before all threads exit the index of the page that previously
* "owned" the WT_REF, a thread might see a freed WT_REF. To
- * ensure that doesn't happen, the newly created page's modify
- * structure has a field with a transaction ID that's checked
- * before any internal page is evicted. Unfortunately, we don't
- * know the correct value until we update the original page's
- * index (we need a transaction ID from after that update), but
- * the act of updating the original page's index is what allows
- * the eviction to happen.
+ * ensure that doesn't happen, the newly created page contains
+ * the current split generation and can't be evicted until
+ * all readers have left the old generation.
*
- * Split blocking was because historic versions of the split
- * code didn't update the WT_REF.home field until after the
- * split was live, so the WT_REF.home fields being updated could
- * split again before the update, there's a race between splits
- * as to which would update them first. The current code updates
- * the WT_REF.home fields before going live (in this function),
- * this shouldn't be an issue, but for now splits remain turned
- * off.
+ * Historically, we also blocked splits in newly created pages
+ * because we didn't update the WT_REF.home field until after
+ * the split was live, so the WT_REF.home fields being updated
+ * could split again before the update: there was a race between
+ * splits as to which would update them first. The current code
+ * updates the WT_REF.home fields before going live (in this
+ * function), so this is no longer an issue.
*/
- F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
+ child->pg_intl_split_gen = split_gen;
/*
* We use a page flag to prevent the child from splitting from
@@ -465,64 +510,6 @@ __split_ref_step1(
}
/*
- * __split_ref_step2 --
- * Allow the newly created children to be evicted or split.
- */
-static int
-__split_ref_step2(
- WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first)
-{
- WT_DECL_RET;
- WT_PAGE *child;
- WT_REF *ref;
- uint32_t i;
-
- /*
- * The split has gone live, enable eviction and splits on the newly
- * created internal pages.
- */
- WT_WRITE_BARRIER();
-
- for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) {
- ref = pindex->index[i];
-
- /*
- * We don't hold hazard pointers on created pages, they cannot
- * be evicted because the page-modify transaction value set as
- * they were created prevents eviction. (See above, we reset
- * that value as part of fixing up the page.) But, an eviction
- * thread might be attempting to evict the page (the WT_REF may
- * be WT_REF_LOCKED), or it may be a disk based page (the WT_REF
- * may be WT_REF_READING), or it may be in some other state.
- * Acquire a hazard pointer for any in-memory pages so we know
- * the state of the page. Ignore pages not in-memory (deleted,
- * on-disk, being read), there's no in-memory structure to fix.
- */
- if ((ret = __wt_page_in(session,
- ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND)
- continue;
- WT_ERR(ret);
-
- child = ref->page;
-
- /* The child can now be evicted or split. */
- F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
-
-#ifdef HAVE_DIAGNOSTIC
- WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, child));
-#endif
-
- WT_ERR(__wt_hazard_clear(session, ref));
- }
-
- return (0);
-
-err: /* Something really bad just happened. */
- WT_PANIC_RET(session, ret, "fatal error resolving a split");
-}
-
-/*
* __split_root --
* Split the root page in-memory, deepening the tree.
*/
@@ -653,8 +640,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
+ /* Get a generation for this split, mark the root page. */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ root->pg_intl_split_gen = split_gen;
+
/* Prepare the WT_REFs for the move. */
- __split_ref_step1(session, alloc_index, false);
+ __split_ref_prepare(session, alloc_index, split_gen, false);
/*
* Confirm the root page's index hasn't moved, then update it, which
@@ -662,20 +653,17 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
*/
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(root) == pindex);
WT_INTL_INDEX_SET(root, alloc_index);
+ alloc_index = NULL;
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, root));
+ ret = __split_verify_root(session, root));
+ WT_ERR(ret);
#endif
- /* Finalize the WT_REFs we moved. */
- WT_ERR(__split_ref_step2(session, alloc_index, false));
- /* The split is complete and correct, ignore benign errors. */
+ /* The split is complete and verified, ignore benign errors. */
complete = WT_ERR_IGNORE;
- /* We've installed the allocated page-index, ensure error handling. */
- alloc_index = NULL;
-
/*
* We can't free the previous root's index, there may be threads using
* it. Add to the session's discard list, to be freed once we know no
@@ -686,7 +674,6 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
* fails, we don't roll back that change, because threads may already
* be using the new index.
*/
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
WT_TRET(__split_safe_free(session, split_gen, false, pindex, size));
root_decr += size;
@@ -838,6 +825,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
+ /* Get a generation for this split, mark the parent page. */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ parent->pg_intl_split_gen = split_gen;
+
/*
* Confirm the parent page's index hasn't moved then update it, which
* makes the split visible to threads descending the tree.
@@ -846,11 +837,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
WT_INTL_INDEX_SET(parent, alloc_index);
alloc_index = NULL;
-#ifdef HAVE_DIAGNOSTIC
- WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, parent));
-#endif
-
/*
* If discarding the page's original WT_REF field, reset it to split.
* Threads cursoring through the tree were blocked because that WT_REF
@@ -869,16 +855,25 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
__wt_free(session, ref->page_del);
}
+ /*
+ * Set the discarded WT_REF state to split, ensuring we don't
+ * race with any discard of the WT_REF deleted fields.
+ */
WT_PUBLISH(ref->state, WT_REF_SPLIT);
+
+ /*
+ * Push out the change: not required for correctness, but stops
+ * threads spinning on incorrect page references.
+ */
+ WT_FULL_BARRIER();
}
- /*
- * Push out the changes: not required for correctness, but don't let
- * threads spin on incorrect page references longer than necessary.
- */
- WT_FULL_BARRIER();
+#ifdef HAVE_DIAGNOSTIC
+ WT_WITH_PAGE_INDEX(session,
+ __split_verify_intl_key_order(session, parent));
+#endif
- /* The split is complete and correct, ignore benign errors. */
+ /* The split is complete and verified, ignore benign errors. */
complete = WT_ERR_IGNORE;
/*
@@ -908,7 +903,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
*
* Acquire a new split generation.
*/
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
for (i = 0, deleted_refs = scr->mem; i < deleted_entries; ++i) {
next_ref = pindex->index[deleted_refs[i]];
WT_ASSERT(session, next_ref->state == WT_REF_SPLIT);
@@ -1160,14 +1154,21 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
+ /* Get a generation for this split, mark the page. */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ page->pg_intl_split_gen = split_gen;
+
/* Prepare the WT_REFs for the move. */
- __split_ref_step1(session, alloc_index, true);
+ __split_ref_prepare(session, alloc_index, split_gen, true);
/* Split into the parent. */
WT_ERR(__split_parent(session, page_ref, alloc_index->index,
alloc_index->entries, parent_incr, false, false));
- /* Confirm the page's index hasn't moved, then update it. */
+ /*
+ * Confirm the page's index hasn't moved, then update it, which makes
+ * the split visible to threads descending the tree.
+ */
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
WT_INTL_INDEX_SET(page, replace_index);
@@ -1178,19 +1179,10 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
__split_verify_intl_key_order(session, page));
#endif
- /* Finalize the WT_REFs we moved. */
- WT_ERR(__split_ref_step2(session, alloc_index, true));
-
- /* The split is complete and correct, ignore benign errors. */
+ /* The split is complete and verified, ignore benign errors. */
complete = WT_ERR_IGNORE;
/*
- * Push out the changes: not required for correctness, but no reason
- * to wait.
- */
- WT_FULL_BARRIER();
-
- /*
* We don't care about the page-index we allocated, all we needed was
* the array of WT_REF structures, which has now been split into the
* parent page.
@@ -1207,7 +1199,6 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
* back that change, because threads may already be using the new parent
* page.
*/
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
WT_TRET(__split_safe_free(session, split_gen, false, pindex, size));
page_decr += size;
@@ -1284,10 +1275,6 @@ __split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock,
for (;;) {
parent = ref->home;
- /* Skip pages that aren't ready to split. */
- if (F_ISSET_ATOMIC(parent, WT_PAGE_SPLIT_BLOCK))
- return (EBUSY);
-
if (trylock)
WT_RET(__wt_try_writelock(session, &parent->page_lock));
else
@@ -2086,8 +2073,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
WT_PAGE *parent;
bool hazard;
- __wt_verbose(
- session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref->page);
+ __wt_verbose(session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref);
WT_RET(__split_internal_lock(session, ref, true, &parent, &hazard));
if ((ret = __split_insert(session, ref)) != 0) {
@@ -2178,8 +2164,7 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
WT_PAGE *parent;
bool hazard;
- __wt_verbose(
- session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref->page);
+ __wt_verbose(session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref);
WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard));
if ((ret = __split_multi(session, ref, closing)) != 0 || closing) {
@@ -2207,8 +2192,7 @@ __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
WT_PAGE *parent;
bool hazard;
- __wt_verbose(
- session, WT_VERB_SPLIT, "%p: reverse-split", (void *)ref->page);
+ __wt_verbose(session, WT_VERB_SPLIT, "%p: reverse-split", (void *)ref);
WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard));
ret = __split_parent(session, ref, NULL, 0, 0, false, true);
@@ -2229,8 +2213,7 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, WT_MULTI *multi)
page = ref->page;
- __wt_verbose(
- session, WT_VERB_SPLIT, "%p: split-rewrite", (void *)ref->page);
+ __wt_verbose(session, WT_VERB_SPLIT, "%p: split-rewrite", (void *)ref);
/*
* This isn't a split: a reconciliation failed because we couldn't write
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
index 049700952ee..ddaa2e5f70b 100644
--- a/src/btree/bt_walk.c
+++ b/src/btree/bt_walk.c
@@ -340,9 +340,7 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
* Take a copy of any held page and clear the return value. Remember
* the hazard pointer we're currently holding.
*
- * We may be passed a pointer to btree->evict_page that we are clearing
- * here. We check when discarding pages that we're not discarding that
- * page, so this clear must be done before the page is released.
+ * Clear the returned value, it makes future error handling easier.
*/
couple = couple_orig = ref = *refp;
*refp = NULL;
diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c
index aa299a161da..9c3d467340e 100644
--- a/src/btree/row_srch.c
+++ b/src/btree/row_srch.c
@@ -623,215 +623,3 @@ leaf_match: cbt->compare = 0;
err: WT_TRET(__wt_page_release(session, current, 0));
return (ret);
}
-
-/*
- * __wt_row_random_leaf --
- * Return a random key from a row-store leaf page.
- */
-int
-__wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
-{
- WT_INSERT *ins, **start, **stop;
- WT_INSERT_HEAD *ins_head;
- WT_PAGE *page;
- uint64_t samples;
- uint32_t choice, entries, i;
- int level;
-
- page = cbt->ref->page;
- start = stop = NULL; /* [-Wconditional-uninitialized] */
- entries = 0; /* [-Wconditional-uninitialized] */
-
- __cursor_pos_clear(cbt);
-
- /* If the page has disk-based entries, select from them. */
- if (page->entries != 0) {
- cbt->compare = 0;
- cbt->slot = __wt_random(&session->rnd) % page->entries;
-
- /*
- * The real row-store search function builds the key, so we
- * have to as well.
- */
- return (__wt_row_leaf_key(session,
- page, page->pg_row + cbt->slot, cbt->tmp, false));
- }
-
- /*
- * If the tree is new (and not empty), it might have a large insert
- * list.
- *
- * Walk down the list until we find a level with at least 50 entries,
- * that's where we'll start rolling random numbers. The value 50 is
- * used to ignore levels with only a few entries, that is, levels which
- * are potentially badly skewed.
- */
- F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
- if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
- return (WT_NOTFOUND);
- for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) {
- start = &ins_head->head[level];
- for (entries = 0, stop = start;
- *stop != NULL; stop = &(*stop)->next[level])
- ++entries;
-
- if (entries > 50)
- break;
- }
-
- /*
- * If it's a tiny list and we went all the way to level 0, correct the
- * level; entries is correctly set.
- */
- if (level < 0)
- level = 0;
-
- /*
- * Step down the skip list levels, selecting a random chunk of the name
- * space at each level.
- */
- for (samples = entries; level > 0; samples += entries) {
- /*
- * There are (entries) or (entries + 1) chunks of the name space
- * considered at each level. They are: between start and the 1st
- * element, between the 1st and 2nd elements, and so on to the
- * last chunk which is the name space after the stop element on
- * the current level. This last chunk of name space may or may
- * not be there: as we descend the levels of the skip list, this
- * chunk may appear, depending if the next level down has
- * entries logically after the stop point in the current level.
- * We can't ignore those entries: because of the algorithm used
- * to determine the depth of a skiplist, there may be a large
- * number of entries "revealed" by descending a level.
- *
- * If the next level down has more items after the current stop
- * point, there are (entries + 1) chunks to consider, else there
- * are (entries) chunks.
- */
- if (*(stop - 1) == NULL)
- choice = __wt_random(&session->rnd) % entries;
- else
- choice = __wt_random(&session->rnd) % (entries + 1);
-
- if (choice == entries) {
- /*
- * We selected the name space after the stop element on
- * this level. Set the start point to the current stop
- * point, descend a level and move the stop element to
- * the end of the list, that is, the end of the newly
- * discovered name space, counting entries as we go.
- */
- start = stop;
- --start;
- --level;
- for (entries = 0, stop = start;
- *stop != NULL; stop = &(*stop)->next[level])
- ++entries;
- } else {
- /*
- * We selected another name space on the level. Move the
- * start pointer the selected number of entries forward
- * to the start of the selected chunk (if the selected
- * number is 0, start won't move). Set the stop pointer
- * to the next element in the list and drop both start
- * and stop down a level.
- */
- for (i = 0; i < choice; ++i)
- start = &(*start)->next[level];
- stop = &(*start)->next[level];
-
- --start;
- --stop;
- --level;
-
- /* Count the entries in the selected name space. */
- for (entries = 0,
- ins = *start; ins != *stop; ins = ins->next[level])
- ++entries;
- }
- }
-
- /*
- * When we reach the bottom level, entries will already be set. Select
- * a random entry from the name space and return it.
- *
- * It should be impossible for the entries count to be 0 at this point,
- * but check for it out of paranoia and to quiet static testing tools.
- */
- if (entries > 0)
- entries = __wt_random(&session->rnd) % entries;
- for (ins = *start; entries > 0; --entries)
- ins = ins->next[0];
-
- cbt->ins = ins;
- cbt->ins_head = ins_head;
- cbt->compare = 0;
-
- /*
- * Random lookups in newly created collections can be slow if a page
- * consists of a large skiplist. Schedule the page for eviction if we
- * encounter a large skiplist. This worthwhile because applications
- * that take a sample often take many samples, so the overhead of
- * traversing the skip list each time accumulates to real time.
- */
- if (samples > 5000)
- __wt_page_evict_soon(session, cbt->ref);
-
- return (0);
-}
-
-/*
- * __wt_row_random_descent --
- * Find a random leaf page in a row-store tree.
- */
-int
-__wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
-{
- WT_BTREE *btree;
- WT_DECL_RET;
- WT_PAGE *page;
- WT_PAGE_INDEX *pindex;
- WT_REF *current, *descent;
-
- btree = S2BT(session);
- current = NULL;
-
- if (0) {
-restart: /*
- * Discard the currently held page and restart the search from
- * the root.
- */
- WT_RET(__wt_page_release(session, current, 0));
- }
-
- /* Search the internal pages of the tree. */
- current = &btree->root;
- for (;;) {
- page = current->page;
- if (page->type != WT_PAGE_ROW_INT)
- break;
-
- WT_INTL_INDEX_GET(session, page, pindex);
- descent = pindex->index[
- __wt_random(&session->rnd) % pindex->entries];
-
- /*
- * Swap the current page for the child page. If the page splits
- * while we're retrieving it, restart the search at the root.
- *
- * On other error, simply return, the swap call ensures we're
- * holding nothing on failure.
- */
- if ((ret = __wt_page_swap(
- session, current, descent, WT_READ_RESTART_OK)) == 0) {
- current = descent;
- continue;
- }
- if (ret == WT_RESTART)
- goto restart;
- return (ret);
- }
-
- cbt->ref = current;
- return (0);
-}
diff --git a/src/checksum/power8/crc32_wrapper.c b/src/checksum/power8/crc32_wrapper.c
index ddfa2bdaeb8..a9be9ced1c6 100644
--- a/src/checksum/power8/crc32_wrapper.c
+++ b/src/checksum/power8/crc32_wrapper.c
@@ -1,4 +1,6 @@
#if defined(__powerpc64__)
+#include "wt_internal.h"
+
#define CRC_TABLE
#include "crc32_constants.h"
@@ -68,8 +70,6 @@ out:
}
#endif
-#include "wt_internal.h"
-
/*
* __wt_checksum_hw --
* WiredTiger: return a checksum for a chunk of memory.
diff --git a/src/checksum/zseries/crc32-s390x.c b/src/checksum/zseries/crc32-s390x.c
index f77d6768d42..28b46594220 100644
--- a/src/checksum/zseries/crc32-s390x.c
+++ b/src/checksum/zseries/crc32-s390x.c
@@ -6,8 +6,20 @@
* Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
*
*/
+#include "wt_internal.h"
+
#include <sys/types.h>
#include <endian.h>
+
+#if defined(HAVE_CRC32_HARDWARE)
+
+#include <sys/auxv.h>
+
+/* RHEL 7 has kernel support, but does not define this constant in the libc headers. */
+#ifndef HWCAP_S390_VX
+#define HWCAP_S390_VX 2048
+#endif
+
#include "crc32-s390x.h"
#include "slicing-consts.h"
@@ -69,8 +81,6 @@ unsigned int __wt_crc32c_le(unsigned int crc, const unsigned char *buf, size_t l
/* Main CRC-32 functions */
DEFINE_CRC32_VX(__wt_crc32c_le_vx, __wt_crc32c_le_vgfm_16, __wt_crc32c_le)
-#include "wt_internal.h"
-
/*
* __wt_checksum_hw --
* WiredTiger: return a checksum for a chunk of memory.
@@ -81,6 +91,8 @@ __wt_checksum_hw(const void *chunk, size_t len)
return (~__wt_crc32c_le_vx(0xffffffff, chunk, len));
}
+#endif
+
/*
* __wt_checksum_init --
* WiredTiger: detect CRC hardware and set the checksum function.
@@ -89,8 +101,14 @@ void
__wt_checksum_init(void)
{
#if defined(HAVE_CRC32_HARDWARE)
- __wt_process.checksum = __wt_checksum_hw;
-#else
+ unsigned long caps = getauxval(AT_HWCAP);
+
+ if (caps & HWCAP_S390_VX)
+ __wt_process.checksum = __wt_checksum_hw;
+ else
+ __wt_process.checksum = __wt_checksum_sw;
+
+#else /* !HAVE_CRC32_HARDWARE */
__wt_process.checksum = __wt_checksum_sw;
#endif
}
diff --git a/src/config/config_def.c b/src/config/config_def.c
index 6a93c1d05e2..b11a8d63fdb 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -147,12 +147,12 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = {
confchk_WT_CONNECTION_reconfigure_statistics_log_subconfigs, 5 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
- "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\","
- "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\","
- "\"read\",\"rebalance\",\"reconcile\",\"recovery\","
- "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\","
- "\"temporary\",\"thread_group\",\"transaction\",\"verify\","
- "\"version\",\"write\"]",
+ "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\","
+ "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\","
+ "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\","
+ "\"split\",\"temporary\",\"thread_group\",\"transaction\","
+ "\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ NULL, NULL, NULL, NULL, NULL, 0 }
};
@@ -750,12 +750,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
{ "use_environment_priv", "boolean", NULL, NULL, NULL, 0 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
- "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\","
- "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\","
- "\"read\",\"rebalance\",\"reconcile\",\"recovery\","
- "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\","
- "\"temporary\",\"thread_group\",\"transaction\",\"verify\","
- "\"version\",\"write\"]",
+ "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\","
+ "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\","
+ "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\","
+ "\"split\",\"temporary\",\"thread_group\",\"transaction\","
+ "\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ "write_through", "list",
NULL, "choices=[\"data\",\"log\"]",
@@ -837,12 +837,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
{ "use_environment_priv", "boolean", NULL, NULL, NULL, 0 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
- "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\","
- "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\","
- "\"read\",\"rebalance\",\"reconcile\",\"recovery\","
- "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\","
- "\"temporary\",\"thread_group\",\"transaction\",\"verify\","
- "\"version\",\"write\"]",
+ "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\","
+ "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\","
+ "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\","
+ "\"split\",\"temporary\",\"thread_group\",\"transaction\","
+ "\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ "version", "string", NULL, NULL, NULL, 0 },
{ "write_through", "list",
@@ -919,12 +919,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = {
confchk_wiredtiger_open_transaction_sync_subconfigs, 2 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
- "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\","
- "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\","
- "\"read\",\"rebalance\",\"reconcile\",\"recovery\","
- "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\","
- "\"temporary\",\"thread_group\",\"transaction\",\"verify\","
- "\"version\",\"write\"]",
+ "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\","
+ "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\","
+ "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\","
+ "\"split\",\"temporary\",\"thread_group\",\"transaction\","
+ "\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ "version", "string", NULL, NULL, NULL, 0 },
{ "write_through", "list",
@@ -1001,12 +1001,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = {
confchk_wiredtiger_open_transaction_sync_subconfigs, 2 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
- "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\","
- "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\","
- "\"read\",\"rebalance\",\"reconcile\",\"recovery\","
- "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\","
- "\"temporary\",\"thread_group\",\"transaction\",\"verify\","
- "\"version\",\"write\"]",
+ "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\","
+ "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\","
+ "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\","
+ "\"split\",\"temporary\",\"thread_group\",\"transaction\","
+ "\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ "write_through", "list",
NULL, "choices=[\"data\",\"log\"]",
diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c
index f691a76b1f2..124250a7a7d 100644
--- a/src/conn/conn_api.c
+++ b/src/conn/conn_api.c
@@ -1798,6 +1798,7 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[])
{ "checkpoint", WT_VERB_CHECKPOINT },
{ "compact", WT_VERB_COMPACT },
{ "evict", WT_VERB_EVICT },
+ { "evict_stuck", WT_VERB_EVICT_STUCK },
{ "evictserver", WT_VERB_EVICTSERVER },
{ "fileops", WT_VERB_FILEOPS },
{ "handleops", WT_VERB_HANDLEOPS },
@@ -1987,6 +1988,16 @@ __conn_set_file_system(
CONNECTION_API_CALL(conn, session, set_file_system, config, cfg);
WT_UNUSED(cfg);
+ /*
+ * You can only configure a file system once, and attempting to do it
+ * again probably means the extension argument didn't have early-load
+ * set and we've already configured the default file system.
+ */
+ if (conn->file_system != NULL)
+ WT_ERR_MSG(session, EPERM,
+ "filesystem already configured; custom filesystems should "
+ "enable \"early_load\" configuration");
+
conn->file_system = file_system;
err: API_END_RET(session, ret);
diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c
index 2b0e5081f04..28dd06332e0 100644
--- a/src/conn/conn_cache.c
+++ b/src/conn/conn_cache.c
@@ -187,8 +187,8 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET_MSG(session, EINVAL,
"eviction target must be lower than the eviction trigger");
- WT_RET(__wt_cond_auto_alloc(session, "cache eviction server",
- false, 10000, WT_MILLION, &cache->evict_cond));
+ WT_RET(__wt_cond_auto_alloc(session,
+ "cache eviction server", 10000, WT_MILLION, &cache->evict_cond));
WT_RET(__wt_spin_init(session, &cache->evict_pass_lock, "evict pass"));
WT_RET(__wt_spin_init(session,
&cache->evict_queue_lock, "cache eviction queue"));
@@ -312,7 +312,7 @@ __wt_cache_destroy(WT_SESSION_IMPL *session)
cache->bytes_dirty_intl + cache->bytes_dirty_leaf,
cache->pages_dirty_intl + cache->pages_dirty_leaf);
- WT_TRET(__wt_cond_auto_destroy(session, &cache->evict_cond));
+ WT_TRET(__wt_cond_destroy(session, &cache->evict_cond));
__wt_spin_destroy(session, &cache->evict_pass_lock);
__wt_spin_destroy(session, &cache->evict_queue_lock);
__wt_spin_destroy(session, &cache->evict_walk_lock);
diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c
index 79c2fc23da5..49b766f4602 100644
--- a/src/conn/conn_cache_pool.c
+++ b/src/conn/conn_cache_pool.c
@@ -32,7 +32,7 @@
*/
#define WT_CACHE_POOL_APP_EVICT_MULTIPLIER 3
#define WT_CACHE_POOL_APP_WAIT_MULTIPLIER 6
-#define WT_CACHE_POOL_READ_MULTIPLIER 1
+#define WT_CACHE_POOL_READ_MULTIPLIER 1
static void __cache_pool_adjust(
WT_SESSION_IMPL *, uint64_t, uint64_t, bool, bool *);
@@ -104,8 +104,8 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
TAILQ_INIT(&cp->cache_pool_qh);
WT_ERR(__wt_spin_init(
session, &cp->cache_pool_lock, "cache shared pool"));
- WT_ERR(__wt_cond_alloc(session,
- "cache pool server", false, &cp->cache_pool_cond));
+ WT_ERR(__wt_cond_alloc(
+ session, "cache pool server", &cp->cache_pool_cond));
__wt_process.cache_pool = cp;
__wt_verbose(session,
@@ -733,7 +733,7 @@ __wt_cache_pool_server(void *arg)
F_ISSET(cache, WT_CACHE_POOL_RUN)) {
if (cp->currently_used <= cp->size)
__wt_cond_wait(
- session, cp->cache_pool_cond, WT_MILLION);
+ session, cp->cache_pool_cond, WT_MILLION, NULL);
/*
* Re-check pool run flag - since we want to avoid getting the
diff --git a/src/conn/conn_ckpt.c b/src/conn/conn_ckpt.c
index faeef4e71a2..7797ed4421c 100644
--- a/src/conn/conn_ckpt.c
+++ b/src/conn/conn_ckpt.c
@@ -63,6 +63,16 @@ __ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, bool *startp)
}
/*
+ * __ckpt_server_run_chk --
+ * Check to decide if the checkpoint server should continue running.
+ */
+static bool
+__ckpt_server_run_chk(WT_SESSION_IMPL *session)
+{
+ return (F_ISSET(S2C(session), WT_CONN_SERVER_CHECKPOINT));
+}
+
+/*
* __ckpt_server --
* The checkpoint server thread.
*/
@@ -78,14 +88,18 @@ __ckpt_server(void *arg)
conn = S2C(session);
wt_session = (WT_SESSION *)session;
- while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
- F_ISSET(conn, WT_CONN_SERVER_CHECKPOINT)) {
+ for (;;) {
/*
* Wait...
* NOTE: If the user only configured logsize, then usecs
* will be 0 and this wait won't return until signalled.
*/
- __wt_cond_wait(session, conn->ckpt_cond, conn->ckpt_usecs);
+ __wt_cond_wait(session,
+ conn->ckpt_cond, conn->ckpt_usecs, __ckpt_server_run_chk);
+
+ /* Check if we're quitting or being reconfigured. */
+ if (!__ckpt_server_run_chk(session))
+ break;
/*
* Checkpoint the database if the connection is marked dirty.
@@ -113,7 +127,8 @@ __ckpt_server(void *arg)
* it so we don't do another checkpoint
* immediately.
*/
- __wt_cond_wait(session, conn->ckpt_cond, 1);
+ __wt_cond_wait(
+ session, conn->ckpt_cond, 1, NULL);
}
} else
WT_STAT_CONN_INCR(session, txn_checkpoint_skipped);
@@ -152,8 +167,7 @@ __ckpt_server_start(WT_CONNECTION_IMPL *conn)
"checkpoint-server", true, session_flags, &conn->ckpt_session));
session = conn->ckpt_session;
- WT_RET(__wt_cond_alloc(
- session, "checkpoint server", false, &conn->ckpt_cond));
+ WT_RET(__wt_cond_alloc(session, "checkpoint server", &conn->ckpt_cond));
/*
* Start the thread.
diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c
index b2f4bb04ce4..866b8633f71 100644
--- a/src/conn/conn_dhandle.c
+++ b/src/conn/conn_dhandle.c
@@ -25,21 +25,19 @@ __conn_dhandle_destroy(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle)
}
/*
- * __conn_dhandle_alloc --
+ * __wt_conn_dhandle_alloc --
* Allocate a new data handle and return it linked into the connection's
* list.
*/
-static int
-__conn_dhandle_alloc(WT_SESSION_IMPL *session,
- const char *uri, const char *checkpoint, WT_DATA_HANDLE **dhandlep)
+int
+__wt_conn_dhandle_alloc(
+ WT_SESSION_IMPL *session, const char *uri, const char *checkpoint)
{
WT_BTREE *btree;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
uint64_t bucket;
- *dhandlep = NULL;
-
WT_RET(__wt_calloc_one(session, &dhandle));
__wt_rwlock_init(session, &dhandle->rwlock);
@@ -75,7 +73,7 @@ __conn_dhandle_alloc(WT_SESSION_IMPL *session,
bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE;
WT_CONN_DHANDLE_INSERT(S2C(session), dhandle, bucket);
- *dhandlep = dhandle;
+ session->dhandle = dhandle;
return (0);
err: __conn_dhandle_destroy(session, dhandle);
@@ -122,10 +120,7 @@ __wt_conn_dhandle_find(
}
}
- WT_RET(__conn_dhandle_alloc(session, uri, checkpoint, &dhandle));
-
- session->dhandle = dhandle;
- return (0);
+ return (WT_NOTFOUND);
}
/*
@@ -419,12 +414,11 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, const char *uri,
{
WT_CONNECTION_IMPL *conn;
WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
uint64_t bucket;
conn = S2C(session);
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
-
/*
* If we're given a URI, then we walk only the hash list for that
* name. If we don't have a URI we walk the entire dhandle list.
@@ -432,29 +426,42 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, const char *uri,
if (uri != NULL) {
bucket =
__wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE;
- TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) {
+
+ for (dhandle = NULL;;) {
+ WT_WITH_HANDLE_LIST_READ_LOCK(session,
+ WT_DHANDLE_NEXT(session, dhandle,
+ &conn->dhhash[bucket], hashq));
+ if (dhandle == NULL)
+ return (0);
+
if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
F_ISSET(dhandle, WT_DHANDLE_DEAD) ||
dhandle->checkpoint != NULL ||
strcmp(uri, dhandle->name) != 0)
continue;
- WT_RET(__conn_btree_apply_internal(
- session, dhandle, file_func, name_func, cfg));
+ WT_ERR(__conn_btree_apply_internal(session,
+ dhandle, file_func, name_func, cfg));
}
} else {
- TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
+ for (dhandle = NULL;;) {
+ WT_WITH_HANDLE_LIST_READ_LOCK(session,
+ WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q));
+ if (dhandle == NULL)
+ return (0);
+
if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
F_ISSET(dhandle, WT_DHANDLE_DEAD) ||
dhandle->checkpoint != NULL ||
!WT_PREFIX_MATCH(dhandle->name, "file:") ||
WT_IS_METADATA(dhandle))
continue;
- WT_RET(__conn_btree_apply_internal(
- session, dhandle, file_func, name_func, cfg));
+ WT_ERR(__conn_btree_apply_internal(session,
+ dhandle, file_func, name_func, cfg));
}
}
- return (0);
+err: WT_DHANDLE_RELEASE(dhandle);
+ return (ret);
}
/*
@@ -473,7 +480,8 @@ __wt_conn_dhandle_close_all(
conn = S2C(session);
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
+ WT_ASSERT(session,
+ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE));
WT_ASSERT(session, session->dhandle == NULL);
bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE;
@@ -534,7 +542,8 @@ __conn_dhandle_remove(WT_SESSION_IMPL *session, bool final)
dhandle = session->dhandle;
bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE;
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
+ WT_ASSERT(session,
+ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE));
WT_ASSERT(session, dhandle != conn->cache->evict_file_next);
/* Check if the handle was reacquired by a session while we waited. */
@@ -583,7 +592,7 @@ __wt_conn_dhandle_discard_single(
}
/* Try to remove the handle, protected by the data handle lock. */
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
tret = __conn_dhandle_remove(session, final));
if (set_pass_intr)
(void)__wt_atomic_subv32(&S2C(session)->cache->pass_intr, 1);
diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c
index 3f7fc9bb2a7..287e9ca7b99 100644
--- a/src/conn/conn_handle.c
+++ b/src/conn/conn_handle.c
@@ -53,18 +53,18 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
/* Spinlocks. */
WT_RET(__wt_spin_init(session, &conn->api_lock, "api"));
WT_SPIN_INIT_TRACKED(session, &conn->checkpoint_lock, checkpoint);
- WT_SPIN_INIT_TRACKED(session, &conn->dhandle_lock, handle_list);
WT_RET(__wt_spin_init(session, &conn->encryptor_lock, "encryptor"));
WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list"));
WT_RET(__wt_spin_init(session, &conn->las_lock, "lookaside table"));
WT_SPIN_INIT_TRACKED(session, &conn->metadata_lock, metadata);
WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure"));
WT_SPIN_INIT_TRACKED(session, &conn->schema_lock, schema);
- WT_SPIN_INIT_TRACKED(session, &conn->table_lock, table);
WT_RET(__wt_spin_init(session, &conn->turtle_lock, "turtle file"));
/* Read-write locks */
+ __wt_rwlock_init(session, &conn->dhandle_lock);
__wt_rwlock_init(session, &conn->hot_backup_lock);
+ __wt_rwlock_init(session, &conn->table_lock);
WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS, &conn->page_lock));
for (i = 0; i < WT_PAGE_LOCKS; ++i)
@@ -79,7 +79,7 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
WT_RET(__wt_spin_init(
session, &conn->lsm_manager.switch_lock, "LSM switch queue lock"));
WT_RET(__wt_cond_alloc(
- session, "LSM worker cond", false, &conn->lsm_manager.work_cond));
+ session, "LSM worker cond", &conn->lsm_manager.work_cond));
/*
* Generation numbers.
@@ -109,16 +109,15 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
* __wt_connection_destroy --
* Destroy the connection's underlying WT_CONNECTION_IMPL structure.
*/
-int
+void
__wt_connection_destroy(WT_CONNECTION_IMPL *conn)
{
- WT_DECL_RET;
WT_SESSION_IMPL *session;
u_int i;
/* Check there's something to destroy. */
if (conn == NULL)
- return (0);
+ return;
session = conn->default_session;
@@ -135,7 +134,7 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn)
__wt_spin_destroy(session, &conn->api_lock);
__wt_spin_destroy(session, &conn->block_lock);
__wt_spin_destroy(session, &conn->checkpoint_lock);
- __wt_spin_destroy(session, &conn->dhandle_lock);
+ __wt_rwlock_destroy(session, &conn->dhandle_lock);
__wt_spin_destroy(session, &conn->encryptor_lock);
__wt_spin_destroy(session, &conn->fh_lock);
__wt_rwlock_destroy(session, &conn->hot_backup_lock);
@@ -143,17 +142,12 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn)
__wt_spin_destroy(session, &conn->metadata_lock);
__wt_spin_destroy(session, &conn->reconfig_lock);
__wt_spin_destroy(session, &conn->schema_lock);
- __wt_spin_destroy(session, &conn->table_lock);
+ __wt_rwlock_destroy(session, &conn->table_lock);
__wt_spin_destroy(session, &conn->turtle_lock);
for (i = 0; i < WT_PAGE_LOCKS; ++i)
__wt_spin_destroy(session, &conn->page_lock[i]);
__wt_free(session, conn->page_lock);
- /* Destroy the file-system configuration. */
- if (conn->file_system != NULL && conn->file_system->terminate != NULL)
- WT_TRET(conn->file_system->terminate(
- conn->file_system, (WT_SESSION *)session));
-
/* Free allocated memory. */
__wt_free(session, conn->cfg);
__wt_free(session, conn->home);
@@ -162,5 +156,4 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn)
__wt_stat_connection_discard(session, conn);
__wt_free(NULL, conn);
- return (ret);
}
diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c
index 8f8f8614ba8..c6dd795389d 100644
--- a/src/conn/conn_log.c
+++ b/src/conn/conn_log.c
@@ -174,7 +174,7 @@ __logmgr_config(
WT_RET(__logmgr_sync_cfg(session, cfg));
if (conn->log_cond != NULL)
- __wt_cond_auto_signal(session, conn->log_cond);
+ __wt_cond_signal(session, conn->log_cond);
return (0);
}
@@ -341,7 +341,7 @@ __wt_log_truncate_files(
conn = S2C(session);
if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
return (0);
- if (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
+ if (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN) &&
FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE))
WT_RET_MSG(session, EINVAL,
"Attempt to archive manually while a server is running");
@@ -505,8 +505,7 @@ __log_file_server(void *arg)
locked = false;
__wt_spin_unlock(session, &log->log_sync_lock);
} else {
- __wt_cond_auto_signal(
- session, conn->log_wrlsn_cond);
+ __wt_cond_signal(session, conn->log_wrlsn_cond);
/*
* We do not want to wait potentially a second
* to process this. Yield to give the wrlsn
@@ -517,8 +516,9 @@ __log_file_server(void *arg)
continue;
}
}
+
/* Wait until the next event. */
- __wt_cond_wait(session, conn->log_file_cond, WT_MILLION / 10);
+ __wt_cond_wait(session, conn->log_file_cond, 100000, NULL);
}
if (0) {
@@ -730,12 +730,8 @@ __log_wrlsn_server(void *arg)
if (yield++ < WT_THOUSAND)
__wt_yield();
else
- /*
- * Send in false because if we did any work we would
- * not be on this path.
- */
__wt_cond_auto_wait(
- session, conn->log_wrlsn_cond, did_work);
+ session, conn->log_wrlsn_cond, did_work, NULL);
}
/*
* On close we need to do this one more time because there could
@@ -840,10 +836,9 @@ __log_server(void *arg)
}
/* Wait until the next event. */
-
__wt_epoch(session, &start);
- __wt_cond_auto_wait_signal(session,
- conn->log_cond, did_work, &signalled);
+ __wt_cond_auto_wait_signal(
+ session, conn->log_cond, did_work, NULL, &signalled);
__wt_epoch(session, &now);
timediff = WT_TIMEDIFF_MS(now, start);
}
@@ -904,10 +899,8 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[])
WT_INIT_LSN(&log->write_lsn);
WT_INIT_LSN(&log->write_start_lsn);
log->fileid = 0;
- WT_RET(__wt_cond_alloc(
- session, "log sync", false, &log->log_sync_cond));
- WT_RET(__wt_cond_alloc(
- session, "log write", false, &log->log_write_cond));
+ WT_RET(__wt_cond_alloc(session, "log sync", &log->log_sync_cond));
+ WT_RET(__wt_cond_alloc(session, "log write", &log->log_write_cond));
WT_RET(__wt_log_open(session));
WT_RET(__wt_log_slot_init(session));
@@ -930,6 +923,8 @@ __wt_logmgr_open(WT_SESSION_IMPL *session)
if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
return (0);
+ F_SET(conn, WT_CONN_LOG_SERVER_RUN);
+
/*
* Start the log close thread. It is not configurable.
* If logging is enabled, this thread runs.
@@ -937,8 +932,8 @@ __wt_logmgr_open(WT_SESSION_IMPL *session)
session_flags = WT_SESSION_NO_DATA_HANDLES;
WT_RET(__wt_open_internal_session(conn,
"log-close-server", false, session_flags, &conn->log_file_session));
- WT_RET(__wt_cond_alloc(conn->log_file_session,
- "log close server", false, &conn->log_file_cond));
+ WT_RET(__wt_cond_alloc(
+ conn->log_file_session, "log close server", &conn->log_file_cond));
/*
* Start the log file close thread.
@@ -954,8 +949,7 @@ __wt_logmgr_open(WT_SESSION_IMPL *session)
WT_RET(__wt_open_internal_session(conn, "log-wrlsn-server",
false, session_flags, &conn->log_wrlsn_session));
WT_RET(__wt_cond_auto_alloc(conn->log_wrlsn_session,
- "log write lsn server", false, 10000, WT_MILLION,
- &conn->log_wrlsn_cond));
+ "log write lsn server", 10000, WT_MILLION, &conn->log_wrlsn_cond));
WT_RET(__wt_thread_create(conn->log_wrlsn_session,
&conn->log_wrlsn_tid, __log_wrlsn_server, conn->log_wrlsn_session));
conn->log_wrlsn_tid_set = true;
@@ -969,13 +963,13 @@ __wt_logmgr_open(WT_SESSION_IMPL *session)
if (conn->log_session != NULL) {
WT_ASSERT(session, conn->log_cond != NULL);
WT_ASSERT(session, conn->log_tid_set == true);
- __wt_cond_auto_signal(session, conn->log_cond);
+ __wt_cond_signal(session, conn->log_cond);
} else {
/* The log server gets its own session. */
WT_RET(__wt_open_internal_session(conn,
"log-server", false, session_flags, &conn->log_session));
WT_RET(__wt_cond_auto_alloc(conn->log_session,
- "log server", false, 50000, WT_MILLION, &conn->log_cond));
+ "log server", 50000, WT_MILLION, &conn->log_cond));
/*
* Start the thread.
@@ -1001,6 +995,8 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
conn = S2C(session);
+ F_CLR(conn, WT_CONN_LOG_SERVER_RUN);
+
if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) {
/*
* We always set up the log_path so printlog can work without
@@ -1011,7 +1007,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
return (0);
}
if (conn->log_tid_set) {
- __wt_cond_auto_signal(session, conn->log_cond);
+ __wt_cond_signal(session, conn->log_cond);
WT_TRET(__wt_thread_join(session, conn->log_tid));
conn->log_tid_set = false;
}
@@ -1026,7 +1022,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
conn->log_file_session = NULL;
}
if (conn->log_wrlsn_tid_set) {
- __wt_cond_auto_signal(session, conn->log_wrlsn_cond);
+ __wt_cond_signal(session, conn->log_wrlsn_cond);
WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid));
conn->log_wrlsn_tid_set = false;
}
@@ -1047,9 +1043,9 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
}
/* Destroy the condition variables now that all threads are stopped */
- WT_TRET(__wt_cond_auto_destroy(session, &conn->log_cond));
+ WT_TRET(__wt_cond_destroy(session, &conn->log_cond));
WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond));
- WT_TRET(__wt_cond_auto_destroy(session, &conn->log_wrlsn_cond));
+ WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond));
WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond));
WT_TRET(__wt_cond_destroy(session, &conn->log->log_write_cond));
diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c
index d4ace127bb2..5b20377d437 100644
--- a/src/conn/conn_open.c
+++ b/src/conn/conn_open.c
@@ -25,7 +25,7 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[])
* Tell internal server threads to run: this must be set before opening
* any sessions.
*/
- F_SET(conn, WT_CONN_SERVER_RUN | WT_CONN_LOG_SERVER_RUN);
+ F_SET(conn, WT_CONN_SERVER_RUN);
/* WT_SESSION_IMPL array. */
WT_RET(__wt_calloc(session,
@@ -100,8 +100,12 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
__wt_yield();
}
- /* Clear any pending async ops. */
+ /*
+ * Clear any pending async operations and shut down the async worker
+ * threads and system before closing LSM.
+ */
WT_TRET(__wt_async_flush(session));
+ WT_TRET(__wt_async_destroy(session));
/*
* Shut down server threads other than the eviction server, which is
@@ -110,14 +114,14 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
* exit before files are closed.
*/
F_CLR(conn, WT_CONN_SERVER_RUN);
- WT_TRET(__wt_async_destroy(session));
WT_TRET(__wt_lsm_manager_destroy(session));
- WT_TRET(__wt_sweep_destroy(session));
F_SET(conn, WT_CONN_CLOSING);
-
WT_TRET(__wt_checkpoint_server_destroy(session));
WT_TRET(__wt_statlog_destroy(session, true));
+ WT_TRET(__wt_sweep_destroy(session));
+
+ /* The eviction server is shut down last. */
WT_TRET(__wt_evict_destroy(session));
/* Shut down the lookaside table, after all eviction is complete. */
@@ -126,7 +130,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
/* Close open data handles. */
WT_TRET(__wt_conn_dhandle_discard(session));
- /* Shut down metadata tracking, required before creating tables. */
+ /* Shut down metadata tracking. */
WT_TRET(__wt_meta_track_destroy(session));
/*
@@ -140,7 +144,6 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE))
WT_TRET(__wt_txn_checkpoint_log(
session, true, WT_TXN_LOG_CKPT_STOP, NULL));
- F_CLR(conn, WT_CONN_LOG_SERVER_RUN);
WT_TRET(__wt_logmgr_destroy(session));
/* Free memory for collators, compressors, data sources. */
@@ -159,15 +162,6 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
/* Discard transaction state. */
__wt_txn_global_destroy(session);
- /* Close extensions, first calling any unload entry point. */
- while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) {
- TAILQ_REMOVE(&conn->dlhqh, dlh, q);
-
- if (dlh->terminate != NULL)
- WT_TRET(dlh->terminate(wt_conn));
- WT_TRET(__wt_dlclose(session, dlh));
- }
-
/* Close the lock file, opening up the database to other connections. */
if (conn->lock_fh != NULL)
WT_TRET(__wt_close(session, &conn->lock_fh));
@@ -199,8 +193,22 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
__wt_free(session, s->hazard);
}
+ /* Destroy the file-system configuration. */
+ if (conn->file_system != NULL && conn->file_system->terminate != NULL)
+ WT_TRET(conn->file_system->terminate(
+ conn->file_system, (WT_SESSION *)session));
+
+ /* Close extensions, first calling any unload entry point. */
+ while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) {
+ TAILQ_REMOVE(&conn->dlhqh, dlh, q);
+
+ if (dlh->terminate != NULL)
+ WT_TRET(dlh->terminate(wt_conn));
+ WT_TRET(__wt_dlclose(session, dlh));
+ }
+
/* Destroy the handle. */
- WT_TRET(__wt_connection_destroy(conn));
+ __wt_connection_destroy(conn);
return (ret);
}
diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c
index 3bcdfd7ecb1..d89392b66c6 100644
--- a/src/conn/conn_stat.c
+++ b/src/conn/conn_stat.c
@@ -409,7 +409,6 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp)
struct timespec ts;
struct tm *tm, _tm;
WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
WT_FSTREAM *log_stream;
conn = S2C(session);
@@ -446,12 +445,9 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp)
* Lock the schema and walk the list of open handles, dumping
* any that match the list of object sources.
*/
- if (conn->stat_sources != NULL) {
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_conn_btree_apply(
+ if (conn->stat_sources != NULL)
+ WT_RET(__wt_conn_btree_apply(
session, NULL, __statlog_apply, NULL, NULL));
- WT_RET(ret);
- }
/*
* Walk the list of open LSM trees, dumping any that match the
@@ -485,8 +481,7 @@ __statlog_on_close(WT_SESSION_IMPL *session)
if (!FLD_ISSET(conn->stat_flags, WT_STAT_ON_CLOSE))
return (0);
- if (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
- F_ISSET(conn, WT_CONN_SERVER_STATISTICS))
+ if (F_ISSET(conn, WT_CONN_SERVER_STATISTICS))
WT_RET_MSG(session, EINVAL,
"Attempt to log statistics while a server is running");
@@ -498,6 +493,16 @@ err: __wt_scr_free(session, &tmp);
}
/*
+ * __statlog_server_run_chk --
+ * Check to decide if the statistics log server should continue running.
+ */
+static bool
+__statlog_server_run_chk(WT_SESSION_IMPL *session)
+{
+ return (F_ISSET(S2C(session), WT_CONN_SERVER_STATISTICS));
+}
+
+/*
* __statlog_server --
* The statistics server thread.
*/
@@ -525,10 +530,14 @@ __statlog_server(void *arg)
WT_ERR(__wt_buf_init(session, &path, strlen(conn->stat_path) + 128));
WT_ERR(__wt_buf_init(session, &tmp, strlen(conn->stat_path) + 128));
- while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
- F_ISSET(conn, WT_CONN_SERVER_STATISTICS)) {
+ for (;;) {
/* Wait until the next event. */
- __wt_cond_wait(session, conn->stat_cond, conn->stat_usecs);
+ __wt_cond_wait(session, conn->stat_cond,
+ conn->stat_usecs, __statlog_server_run_chk);
+
+ /* Check if we're quitting or being reconfigured. */
+ if (!__statlog_server_run_chk(session))
+ break;
if (WT_STAT_ENABLED(session))
WT_ERR(__statlog_log_one(session, &path, &tmp));
@@ -563,7 +572,7 @@ __statlog_start(WT_CONNECTION_IMPL *conn)
session = conn->stat_session;
WT_RET(__wt_cond_alloc(
- session, "statistics log server", false, &conn->stat_cond));
+ session, "statistics log server", &conn->stat_cond));
/*
* Start the thread.
diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c
index 7d5cb7d7c72..8c186c63939 100644
--- a/src/conn/conn_sweep.c
+++ b/src/conn/conn_sweep.c
@@ -233,7 +233,7 @@ __sweep_remove_handles(WT_SESSION_IMPL *session)
if (!WT_DHANDLE_CAN_DISCARD(dhandle))
continue;
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
ret = __sweep_remove_one(session, dhandle));
if (ret == 0)
WT_STAT_CONN_INCR(session, dh_sweep_remove);
@@ -246,6 +246,16 @@ __sweep_remove_handles(WT_SESSION_IMPL *session)
}
/*
+ * __sweep_server_run_chk --
+ * Check to decide if the sweep server should continue running.
+ */
+static bool
+__sweep_server_run_chk(WT_SESSION_IMPL *session)
+{
+ return (F_ISSET(S2C(session), WT_CONN_SERVER_SWEEP));
+}
+
+/*
* __sweep_server --
* The handle sweep server thread.
*/
@@ -266,11 +276,15 @@ __sweep_server(void *arg)
/*
* Sweep for dead and excess handles.
*/
- while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
- F_ISSET(conn, WT_CONN_SERVER_SWEEP)) {
+ for (;;) {
/* Wait until the next event. */
- __wt_cond_wait(session,
- conn->sweep_cond, conn->sweep_interval * WT_MILLION);
+ __wt_cond_wait(session, conn->sweep_cond,
+ conn->sweep_interval * WT_MILLION, __sweep_server_run_chk);
+
+ /* Check if we're quitting or being reconfigured. */
+ if (!__sweep_server_run_chk(session))
+ break;
+
__wt_seconds(session, &now);
WT_STAT_CONN_INCR(session, dh_sweeps);
@@ -390,7 +404,7 @@ __wt_sweep_create(WT_SESSION_IMPL *session)
session = conn->sweep_session;
WT_RET(__wt_cond_alloc(
- session, "handle sweep server", false, &conn->sweep_cond));
+ session, "handle sweep server", &conn->sweep_cond));
WT_RET(__wt_thread_create(
session, &conn->sweep_tid, __sweep_server, session));
diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c
index 08b15e6ca5e..61ced8d11e7 100644
--- a/src/cursor/cur_backup.c
+++ b/src/cursor/cur_backup.c
@@ -346,13 +346,9 @@ __backup_stop(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb)
static int
__backup_all(WT_SESSION_IMPL *session)
{
- WT_DECL_RET;
-
/* Build a list of the file objects that need to be copied. */
- WT_WITH_HANDLE_LIST_LOCK(session, ret =
- __wt_meta_apply_all(session, NULL, __backup_list_uri_append, NULL));
-
- return (ret);
+ return (__wt_meta_apply_all(
+ session, NULL, __backup_list_uri_append, NULL));
}
/*
diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c
index 4786b0524bc..6fc01c0421f 100644
--- a/src/cursor/cur_index.c
+++ b/src/cursor/cur_index.c
@@ -240,7 +240,17 @@ __curindex_search(WT_CURSOR *cursor)
found_key = child->key;
if (found_key.size < cursor->key.size)
WT_ERR(WT_NOTFOUND);
- found_key.size = cursor->key.size;
+
+ /*
+ * Custom collators expect to see complete keys, pass an item containing
+ * all the visible fields so it unpacks correctly.
+ */
+ if (cindex->index->collator != NULL &&
+ !F_ISSET(cursor, WT_CURSTD_RAW_SEARCH))
+ WT_ERR(__wt_struct_repack(session, child->key_format,
+ cindex->iface.key_format, &child->key, &found_key));
+ else
+ found_key.size = cursor->key.size;
WT_ERR(__wt_compare(
session, cindex->index->collator, &cursor->key, &found_key, &cmp));
@@ -307,8 +317,18 @@ __curindex_search_near(WT_CURSOR *cursor, int *exact)
* so we flip the sign of the result to match what callers expect.
*/
found_key = child->key;
- if (found_key.size > cursor->key.size)
- found_key.size = cursor->key.size;
+ if (found_key.size > cursor->key.size) {
+ /*
+ * Custom collators expect to see complete keys, pass an item
+ * containing all the visible fields so it unpacks correctly.
+ */
+ if (cindex->index->collator != NULL)
+ WT_ERR(__wt_struct_repack(session,
+ cindex->child->key_format, cindex->iface.key_format,
+ &child->key, &found_key));
+ else
+ found_key.size = cursor->key.size;
+ }
WT_ERR(__wt_compare(
session, cindex->index->collator, &cursor->key, &found_key, exact));
diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c
index 7ace6d49cf0..99a9e373354 100644
--- a/src/cursor/cur_std.c
+++ b/src/cursor/cur_std.c
@@ -633,6 +633,7 @@ __wt_cursor_reconfigure(WT_CURSOR *cursor, const char *config)
int
__wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor)
{
+ WT_DECL_RET;
WT_ITEM key;
/*
@@ -662,9 +663,11 @@ __wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor)
* cursors cannot reference application memory after cursor operations
* and that requirement will save the day.
*/
- WT_RET(cursor->search(cursor));
+ F_SET(cursor, WT_CURSTD_RAW_SEARCH);
+ ret = cursor->search(cursor);
+ F_CLR(cursor, WT_CURSTD_RAW_SEARCH);
- return (0);
+ return (ret);
}
/*
diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c
index 76f7fc5865f..7e8cd153d2d 100644
--- a/src/cursor/cur_table.c
+++ b/src/cursor/cur_table.c
@@ -769,7 +769,7 @@ __curtable_complete(WT_SESSION_IMPL *session, WT_TABLE *table)
return (0);
/* If the table is incomplete, wait on the table lock and recheck. */
- WT_WITH_TABLE_LOCK(session, complete = table->cg_complete);
+ WT_WITH_TABLE_READ_LOCK(session, complete = table->cg_complete);
if (!complete)
WT_RET_MSG(session, EINVAL,
"'%s' not available until all column groups are created",
diff --git a/src/docs/cursor-random.dox b/src/docs/cursor-random.dox
index a0a3212be6d..b6434e3d161 100644
--- a/src/docs/cursor-random.dox
+++ b/src/docs/cursor-random.dox
@@ -20,9 +20,4 @@ cursor configured using \c next_random_sample_size divides the object
into \c next_random_sample_size pieces, and each subsequent retrieval
returns a record from the next one of those pieces.
-For example, setting \c next_random_sample_percent to \c 10 would cause
-the cursor to sequentially return records from each tenth part of the
-object. Setting \c next_random_sample_percent to \c 1000 would cause the
-cursor to sequentially return records from each .1% of the object.
-
*/
diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox
index 4a356f7da61..f463e6bc615 100644
--- a/src/docs/upgrading.dox
+++ b/src/docs/upgrading.dox
@@ -7,6 +7,12 @@
The WiredTiger Utility can now \c truncate an object. Removing all contents
from the specified object.
</dd>
+<dt>Handle list lock statistics</dt>
+<dd>
+In the 2.9.1 release we added statistics tracking handle list lock timing. We
+have since switched that lock from a spin lock to a read-write lock, and
+consequently changed the statistics that track lock-related wait time.
+</dd>
</dl>
@section version_291 Upgrading to Version 2.9.1
diff --git a/src/docs/wtperf.dox b/src/docs/wtperf.dox
index 83aadf8a776..2eac0fef3f4 100644
--- a/src/docs/wtperf.dox
+++ b/src/docs/wtperf.dox
@@ -195,14 +195,14 @@ use pareto distribution for random numbers. Zero to disable, otherwise a percen
number of operations to group into each transaction in the populate phase, zero for auto-commit
@par populate_threads (unsigned int, default=1)
number of populate threads, 1 for bulk load
+@par pre_load_data (boolean, default=false)
+Scan all data prior to starting the workload phase to warm the cache
@par random_range (unsigned int, default=0)
if non zero choose a value from within this range as the key for insert operations
@par random_value (boolean, default=false)
generate random content for the value
@par range_partition (boolean, default=false)
partition data by range (vs hash)
-@par read_range (unsigned int, default=0)
-scan a range of keys after each search
@par readonly (boolean, default=false)
reopen the connection between populate and workload phases in readonly mode. Requires reopen_connection turned on (default). Requires that read be the only workload specified
@par reopen_connection (boolean, default=true)
@@ -228,7 +228,7 @@ number of tables to run operations over. Keys are divided evenly over the table
@par table_count_idle (unsigned int, default=0)
number of tables to create, that won't be populated. Default 0.
@par threads (string, default="")
-workload configuration: each 'count' entry is the total number of threads, and the 'insert', 'read' and 'update' entries are the ratios of insert, read and update operations done by each worker thread; If a throttle value is provided each thread will do a maximum of that number of operations per second; multiple workload configurations may be specified per threads configuration; for example, a more complex threads configuration might be 'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' which would create 2 threads doing nothing but reads and 8 threads each doing 50% inserts and 25% reads and updates. Allowed configuration values are 'count', 'throttle', 'update_delta', 'reads', 'inserts', 'updates', 'truncate', 'truncate_pct' and 'truncate_count'. There are also behavior modifiers, supported modifiers are 'ops_per_txn'
+workload configuration: each 'count' entry is the total number of threads, and the 'insert', 'read' and 'update' entries are the ratios of insert, read and update operations done by each worker thread; If a throttle value is provided each thread will do a maximum of that number of operations per second; multiple workload configurations may be specified per threads configuration; for example, a more complex threads configuration might be 'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' which would create 2 threads doing nothing but reads and 8 threads each doing 50% inserts and 25% reads and updates. Allowed configuration values are 'count', 'throttle', 'update_delta', 'reads', 'read_range', 'inserts', 'updates', 'truncate', 'truncate_pct' and 'truncate_count'. There are also behavior modifiers, supported modifiers are 'ops_per_txn'
@par transaction_config (string, default="")
WT_SESSION.begin_transaction configuration string, applied during the populate phase when populate_ops_per_txn is nonzero
@par table_name (string, default="test")
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index 9b969de9a9e..42fe4d4608e 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -24,40 +24,40 @@ static int __evict_walk_file(
(S2C(s)->evict_threads.current_threads > 1)
/*
- * __evict_lock_dhandle --
- * Try to get the dhandle lock, with yield and sleep back off.
+ * __evict_lock_handle_list --
+ * Try to get the handle list lock, with yield and sleep back off.
* Keep timing statistics overall.
*/
static int
-__evict_lock_dhandle(WT_SESSION_IMPL *session)
+__evict_lock_handle_list(WT_SESSION_IMPL *session)
{
struct timespec enter, leave;
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
- WT_SPINLOCK *dh_lock;
- int64_t **stats;
+ WT_RWLOCK *dh_lock;
u_int spins;
bool dh_stats;
conn = S2C(session);
cache = conn->cache;
dh_lock = &conn->dhandle_lock;
- stats = (int64_t **)conn->stats;
- dh_stats = WT_STAT_ENABLED(session) && dh_lock->stat_count_off != -1;
/*
- * Maintain lock acquisition timing statistics as if this were a
- * regular lock acquisition.
+ * Set up tracking of handle lock acquisition wait time if statistics
+ * are enabled.
*/
+ dh_stats = WT_STAT_ENABLED(session);
+
if (dh_stats)
__wt_epoch(session, &enter);
+
/*
* Use a custom lock acquisition back off loop so the eviction server
* notices any interrupt quickly.
*/
for (spins = 0;
- (ret = __wt_spin_trylock_track(session, dh_lock)) == EBUSY &&
+ (ret = __wt_try_readlock(session, dh_lock)) == EBUSY &&
cache->pass_intr == 0; spins++) {
if (spins < WT_THOUSAND)
__wt_yield();
@@ -70,8 +70,9 @@ __evict_lock_dhandle(WT_SESSION_IMPL *session)
WT_RET(ret);
if (dh_stats) {
__wt_epoch(session, &leave);
- stats[session->stat_bucket][dh_lock->stat_int_usecs_off] +=
- (int64_t)WT_TIMEDIFF_US(leave, enter);
+ WT_STAT_CONN_INCRV(
+ session, lock_handle_list_wait_eviction,
+ (int64_t)WT_TIMEDIFF_US(leave, enter));
}
return (0);
}
@@ -197,8 +198,7 @@ __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref)
}
__wt_spin_unlock(session, &cache->evict_queues[q].evict_lock);
}
- WT_ASSERT(session,
- !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU));
+ WT_ASSERT(session, !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU));
__wt_spin_unlock(session, &cache->evict_queue_lock);
}
@@ -267,7 +267,7 @@ __wt_evict_server_wake(WT_SESSION_IMPL *session)
}
#endif
- __wt_cond_auto_signal(session, cache->evict_cond);
+ __wt_cond_signal(session, cache->evict_cond);
}
/*
@@ -280,12 +280,12 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread)
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
- bool did_work;
+ bool did_work, was_intr;
conn = S2C(session);
cache = conn->cache;
-#ifdef HAVE_DIAGNOSTIC
+#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
/*
* Ensure the cache stuck timer is initialized when starting eviction.
*/
@@ -308,12 +308,28 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread)
ret = __evict_server(session, &did_work);
F_CLR(cache->walk_session, WT_SESSION_LOCKED_PASS);
F_CLR(session, WT_SESSION_LOCKED_PASS);
+ was_intr = cache->pass_intr != 0;
__wt_spin_unlock(session, &cache->evict_pass_lock);
WT_ERR(ret);
+
+ /*
+ * If the eviction server was interrupted, wait until
+ * requests have been processed: the system may
+ * otherwise be busy so don't go to sleep.
+ */
+ if (was_intr) {
+ while (cache->pass_intr != 0 &&
+ F_ISSET(conn, WT_CONN_EVICTION_RUN) &&
+ F_ISSET(thread, WT_THREAD_RUN))
+ __wt_yield();
+ continue;
+ }
+
__wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping");
+
/* Don't rely on signals: check periodically. */
__wt_cond_auto_wait(
- session, cache->evict_cond, did_work);
+ session, cache->evict_cond, did_work, NULL);
__wt_verbose(session, WT_VERB_EVICTSERVER, "waking");
} else
WT_ERR(__evict_lru_pages(session, false));
@@ -353,12 +369,12 @@ err: WT_PANIC_MSG(session, ret, "cache eviction thread error");
static int
__evict_server(WT_SESSION_IMPL *session, bool *did_work)
{
+#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
+ struct timespec now;
+#endif
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
-#ifdef HAVE_DIAGNOSTIC
- struct timespec now;
-#endif
uint64_t orig_pages_evicted;
conn = S2C(session);
@@ -370,7 +386,8 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work)
/* Evict pages from the cache as needed. */
WT_RET(__evict_pass(session));
- if (!F_ISSET(conn, WT_CONN_EVICTION_RUN))
+ if (!F_ISSET(conn, WT_CONN_EVICTION_RUN) ||
+ cache->pass_intr != 0)
return (0);
/*
@@ -378,28 +395,31 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work)
* otherwise we can block applications evicting large pages.
*/
if (!__wt_cache_stuck(session)) {
-
/*
- * If we gave up acquiring the lock, that indicates a
- * session is waiting for us to clear walks. Do that
- * as part of a normal pass (without the handle list
+ * Try to get the handle list lock: if we give up, that
+ * indicates a session is waiting for us to clear walks. Do
+ * that as part of a normal pass (without the handle list
* lock) to avoid deadlock.
*/
- if ((ret = __evict_lock_dhandle(session)) == EBUSY)
+ if ((ret = __evict_lock_handle_list(session)) == EBUSY)
return (0);
WT_RET(ret);
ret = __evict_clear_all_walks(session);
- __wt_spin_unlock(session, &conn->dhandle_lock);
+ __wt_readunlock(session, &conn->dhandle_lock);
WT_RET(ret);
cache->pages_evicted = 0;
} else if (cache->pages_evicted != cache->pages_evict) {
cache->pages_evicted = cache->pages_evict;
-#ifdef HAVE_DIAGNOSTIC
+#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
__wt_epoch(session, &cache->stuck_ts);
} else if (!F_ISSET(conn, WT_CONN_IN_MEMORY)) {
/*
- * After being stuck for 5 minutes, give up.
+ * If we're stuck for 5 minutes in diagnostic mode, or the
+ * verbose evict_stuck flag is configured, log the cache
+ * and transaction state.
+ *
+ * If we're stuck for 5 minutes in diagnostic mode, give up.
*
* We don't do this check for in-memory workloads because
* application threads are not blocked by the cache being full.
@@ -408,11 +428,22 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work)
*/
__wt_epoch(session, &now);
if (WT_TIMEDIFF_SEC(now, cache->stuck_ts) > 300) {
- ret = ETIMEDOUT;
- __wt_err(session, ret,
+#if defined(HAVE_DIAGNOSTIC)
+ __wt_err(session, ETIMEDOUT,
"Cache stuck for too long, giving up");
- WT_TRET(__wt_dump_stuck_info(session, NULL));
+ ret = ETIMEDOUT;
+ WT_TRET(__wt_verbose_dump_txn(session));
+ WT_TRET(__wt_verbose_dump_cache(session));
return (ret);
+#elif defined(HAVE_VERBOSE)
+ if (WT_VERBOSE_ISSET(session, WT_VERB_EVICT_STUCK)) {
+ WT_RET(__wt_verbose_dump_txn(session));
+ WT_RET(__wt_verbose_dump_cache(session));
+
+ /* Reset the timer. */
+ __wt_epoch(session, &cache->stuck_ts);
+ }
+#endif
}
#endif
}
@@ -697,8 +728,8 @@ __evict_pass(WT_SESSION_IMPL *session)
*/
WT_STAT_CONN_INCR(session,
cache_eviction_server_slept);
- __wt_cond_wait(
- session, cache->evict_cond, WT_THOUSAND);
+ __wt_cond_wait(session,
+ cache->evict_cond, WT_THOUSAND, NULL);
continue;
}
@@ -725,7 +756,7 @@ __evict_pass(WT_SESSION_IMPL *session)
* Clear a single walk point.
*/
static int
-__evict_clear_walk(WT_SESSION_IMPL *session, bool count_stat)
+__evict_clear_walk(WT_SESSION_IMPL *session)
{
WT_BTREE *btree;
WT_CACHE *cache;
@@ -742,14 +773,14 @@ __evict_clear_walk(WT_SESSION_IMPL *session, bool count_stat)
if ((ref = btree->evict_ref) == NULL)
return (0);
- if (count_stat)
- WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned);
+ WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned);
/*
- * Clear evict_ref first, in case releasing it forces eviction (we
- * assert we never try to evict the current eviction walk point).
+ * Clear evict_ref before releasing it in case that forces eviction (we
+ * assert that we never try to evict the current eviction walk point).
*/
btree->evict_ref = NULL;
+
WT_WITH_DHANDLE(cache->walk_session, session->dhandle,
(ret = __wt_page_release(cache->walk_session,
ref, WT_READ_NO_EVICT)));
@@ -772,7 +803,7 @@ __evict_clear_all_walks(WT_SESSION_IMPL *session)
TAILQ_FOREACH(dhandle, &conn->dhqh, q)
if (WT_PREFIX_MATCH(dhandle->name, "file:"))
WT_WITH_DHANDLE(session, dhandle,
- WT_TRET(__evict_clear_walk(session, true)));
+ WT_TRET(__evict_clear_walk(session)));
return (ret);
}
@@ -817,7 +848,7 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session)
/* Clear any existing LRU eviction walk for the file. */
WT_WITH_PASS_LOCK(session,
- ret = __evict_clear_walk(session, true));
+ ret = __evict_clear_walk(session));
(void)__wt_atomic_subv32(&cache->pass_intr, 1);
WT_ERR(ret);
@@ -1087,7 +1118,8 @@ __evict_lru_pages(WT_SESSION_IMPL *session, bool is_server)
/* If a worker thread found the queue empty, pause. */
if (ret == WT_NOTFOUND && !is_server &&
F_ISSET(S2C(session), WT_CONN_EVICTION_RUN))
- __wt_cond_wait(session, conn->evict_threads.wait_cond, 10000);
+ __wt_cond_wait(
+ session, conn->evict_threads.wait_cond, 10000, NULL);
return (ret == WT_NOTFOUND ? 0 : ret);
}
@@ -1304,7 +1336,7 @@ retry: while (slot < max_entries) {
* reference count to keep it alive while we sweep.
*/
if (!dhandle_locked) {
- WT_ERR(__evict_lock_dhandle(session));
+ WT_ERR(__evict_lock_handle_list(session));
dhandle_locked = true;
}
@@ -1383,7 +1415,7 @@ retry: while (slot < max_entries) {
(void)__wt_atomic_addi32(&dhandle->session_inuse, 1);
incr = true;
- __wt_spin_unlock(session, &conn->dhandle_lock);
+ __wt_readunlock(session, &conn->dhandle_lock);
dhandle_locked = false;
/*
@@ -1430,7 +1462,7 @@ retry: while (slot < max_entries) {
}
err: if (dhandle_locked) {
- __wt_spin_unlock(session, &conn->dhandle_lock);
+ __wt_readunlock(session, &conn->dhandle_lock);
dhandle_locked = false;
}
@@ -1526,6 +1558,19 @@ __evict_walk_file(WT_SESSION_IMPL *session,
start = queue->evict_queue + *slotp;
remaining_slots = max_entries - *slotp;
total_slots = max_entries - queue->evict_entries;
+ btree_inuse = cache_inuse = 0;
+ target_pages_clean = target_pages_dirty = 0;
+
+ /*
+ * The number of times we should fill the queue by the end of
+ * considering all trees.
+ */
+#define QUEUE_FILLS_PER_PASS 10
+
+ /*
+ * The minimum number of pages we should consider per tree.
+ */
+#define MIN_PAGES_PER_TREE 10
/*
* The target number of pages for this tree is proportional to the
@@ -1534,13 +1579,12 @@ __evict_walk_file(WT_SESSION_IMPL *session,
* cache (and only have to walk it once).
*/
if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) {
- btree_inuse = __wt_btree_bytes_inuse(session);
+ btree_inuse = __wt_btree_bytes_evictable(session);
cache_inuse = __wt_cache_bytes_inuse(cache);
bytes_per_slot = 1 + cache_inuse / total_slots;
target_pages_clean = (uint32_t)(
(btree_inuse + bytes_per_slot / 2) / bytes_per_slot);
- } else
- target_pages_clean = 0;
+ }
if (F_ISSET(cache, WT_CACHE_EVICT_DIRTY)) {
btree_inuse = __wt_btree_dirty_leaf_inuse(session);
@@ -1548,35 +1592,58 @@ __evict_walk_file(WT_SESSION_IMPL *session,
bytes_per_slot = 1 + cache_inuse / total_slots;
target_pages_dirty = (uint32_t)(
(btree_inuse + bytes_per_slot / 2) / bytes_per_slot);
- } else
- target_pages_dirty = 0;
+ }
- target_pages = WT_MAX(target_pages_clean, target_pages_dirty);
+ /*
+ * Weight the number of target pages by the number of times we want to
+ * fill the queue per pass through all the trees. Note that we don't
+ * build this into the calculation above because we don't want to favor
+ * small trees, so round to a whole number of slots (zero for small
+ * trees) before multiplying.
+ */
+ target_pages = WT_MAX(target_pages_clean, target_pages_dirty) *
+ QUEUE_FILLS_PER_PASS;
+ /*
+ * Randomly walk trees with a small fraction of the cache in case there
+ * are so many trees that none of them use enough of the cache to be
+ * allocated slots.
+ *
+ * The chance of walking a tree is equal to the chance that a random
+ * byte in cache belongs to the tree, weighted by how many times we
+ * want to fill queues during a pass through all the trees in cache.
+ */
if (target_pages == 0) {
- /*
- * Randomly walk trees with a tiny fraction of the cache in
- * case there are so many trees that none of them use enough of
- * the cache to be allocated slots. Walk small trees 1% of the
- * time.
- */
- if (__wt_random(&session->rnd) > UINT32_MAX / 100)
+ if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) {
+ btree_inuse = __wt_btree_bytes_evictable(session);
+ cache_inuse = __wt_cache_bytes_inuse(cache);
+ } else {
+ btree_inuse = __wt_btree_dirty_leaf_inuse(session);
+ cache_inuse = __wt_cache_dirty_leaf_inuse(cache);
+ }
+ if (btree_inuse == 0 || cache_inuse == 0)
+ return (0);
+ if (__wt_random64(&session->rnd) % cache_inuse >
+ btree_inuse * QUEUE_FILLS_PER_PASS)
return (0);
- target_pages = 10;
}
+ /*
+ * There is some cost associated with walking a tree. If we're going
+ * to visit this tree, always look for a minimum number of pages.
+ */
+ if (target_pages < MIN_PAGES_PER_TREE)
+ target_pages = MIN_PAGES_PER_TREE;
+
+ /*
+ * If the tree is dead or we're near the end of the queue, fill the
+ * remaining slots.
+ */
if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
target_pages > remaining_slots)
target_pages = remaining_slots;
end = start + target_pages;
- walk_flags =
- WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT;
-
- /* Randomize the walk direction. */
- if (btree->evict_walk_reverse)
- FLD_SET(walk_flags, WT_READ_PREV);
-
/*
* Examine at least a reasonable number of pages before deciding
* whether to give up. When we are only looking for dirty pages,
@@ -1588,8 +1655,41 @@ __evict_walk_file(WT_SESSION_IMPL *session,
min_pages *= 10;
/*
- * Get some more eviction candidate pages.
- *
+ * Choose a random point in the tree if looking for candidates in a
+ * tree with no starting point set. This is mostly aimed at ensuring
+ * eviction fairly visits all pages in trees with a lot of in-cache
+ * content.
+ */
+ if (btree->evict_ref == NULL) {
+ /* Ensure internal page indexes remain valid for our walk. */
+ WT_WITH_PAGE_INDEX(session, ret =
+ __wt_random_descent(session, &btree->evict_ref, true));
+ WT_RET_NOTFOUND_OK(ret);
+
+ /*
+ * Reverse the direction of the walk each time we start at a
+ * random point so both ends of the tree are equally likely to
+ * be visited.
+ */
+ btree->evict_walk_reverse = !btree->evict_walk_reverse;
+ }
+
+ walk_flags =
+ WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT;
+
+ if (btree->evict_walk_reverse)
+ FLD_SET(walk_flags, WT_READ_PREV);
+
+ /*
+ * Get some more eviction candidate pages, starting at the last saved
+ * point. Clear the saved point immediately: when discarding pages we
+ * assert that we're not discarding an eviction point, so this clear
+ * must be complete before the page is released.
+ */
+ ref = btree->evict_ref;
+ btree->evict_ref = NULL;
+
+ /*
* !!! Take care terminating this loop.
*
* Don't make an extra call to __wt_tree_walk after we hit the end of a
@@ -1602,7 +1702,7 @@ __evict_walk_file(WT_SESSION_IMPL *session,
for (evict = start, pages_queued = pages_seen = refs_walked = 0;
evict < end && (ret == 0 || ret == WT_NOTFOUND);
ret = __wt_tree_walk_count(
- session, &btree->evict_ref, &refs_walked, walk_flags)) {
+ session, &ref, &refs_walked, walk_flags)) {
/*
* Check whether we're finding a good ratio of candidates vs
* pages seen. Some workloads create "deserts" in trees where
@@ -1616,7 +1716,7 @@ __evict_walk_file(WT_SESSION_IMPL *session,
if (give_up)
break;
- if ((ref = btree->evict_ref) == NULL) {
+ if (ref == NULL) {
if (++restarts == 2)
break;
WT_STAT_CONN_INCR(
@@ -1706,7 +1806,7 @@ fast: /* If the page can't be evicted, give up. */
++pages_queued;
if (WT_PAGE_IS_INTERNAL(page))
- ++internal_pages;
+ ++internal_pages;
__wt_verbose(session, WT_VERB_EVICTSERVER,
"select: %p, size %" WT_SIZET_FMT,
@@ -1719,12 +1819,10 @@ fast: /* If the page can't be evicted, give up. */
session, cache_eviction_pages_queued, (u_int)(evict - start));
/*
- * If we didn't find any candidates in the file, reverse the direction
- * of the walk and skip it next time.
+ * If we couldn't find the number of pages we were looking for, skip
+ * the tree next time.
*/
- if (give_up)
- btree->evict_walk_reverse = !btree->evict_walk_reverse;
- if (pages_queued == 0 && !urgent_queued)
+ if (pages_queued < target_pages / 2 && !urgent_queued)
btree->evict_walk_period = WT_MIN(
WT_MAX(1, 2 * btree->evict_walk_period), 100);
else if (pages_queued == target_pages)
@@ -1733,6 +1831,8 @@ fast: /* If the page can't be evicted, give up. */
btree->evict_walk_period /= 2;
/*
+ * Give up the walk occasionally.
+ *
* If we happen to end up on the root page or a page requiring urgent
* eviction, clear it. We have to track hazard pointers, and the root
* page complicates that calculation.
@@ -1744,16 +1844,20 @@ fast: /* If the page can't be evicted, give up. */
* If we land on a page requiring forced eviction, move on to the next
* page: we want this page evicted as quickly as possible.
*/
- if ((ref = btree->evict_ref) != NULL) {
- /* Give up the walk occasionally. */
+ if (ref != NULL) {
if (__wt_ref_is_root(ref) || evict == start || give_up ||
ref->page->read_gen == WT_READGEN_OLDEST ||
- ref->page->memory_footprint >= btree->splitmempage)
- WT_RET(__evict_clear_walk(session, restarts == 0));
- else if (ref->page->read_gen == WT_READGEN_OLDEST)
+ ref->page->memory_footprint >= btree->splitmempage) {
+ if (restarts == 0)
+ WT_STAT_CONN_INCR(
+ session, cache_eviction_walks_abandoned);
+ WT_RET(__wt_page_release(cache->walk_session,
+ ref, WT_READ_NO_EVICT));
+ ref = NULL;
+ } else if (ref->page->read_gen == WT_READGEN_OLDEST)
WT_RET_NOTFOUND_OK(__wt_tree_walk_count(
- session, &btree->evict_ref,
- &refs_walked, walk_flags));
+ session, &ref, &refs_walked, walk_flags));
+ btree->evict_ref = ref;
}
WT_STAT_CONN_INCRV(session, cache_eviction_walk, refs_walked);
@@ -2087,8 +2191,8 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full)
break;
case WT_NOTFOUND:
/* Allow the queue to re-populate before retrying. */
- __wt_cond_wait(
- session, conn->evict_threads.wait_cond, 10000);
+ __wt_cond_wait(session,
+ conn->evict_threads.wait_cond, 10000, NULL);
cache->app_waits++;
break;
default:
@@ -2184,226 +2288,140 @@ __wt_evict_priority_clear(WT_SESSION_IMPL *session)
S2BT(session)->evict_priority = 0;
}
-#ifdef HAVE_DIAGNOSTIC
+#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
/*
- * __dump_txn_state --
- * Output debugging information about the global transaction state.
+ * __verbose_dump_cache_single --
+ * Output diagnostic information about a single file in the cache.
*/
static int
-__dump_txn_state(WT_SESSION_IMPL *session, FILE *fp)
+__verbose_dump_cache_single(WT_SESSION_IMPL *session,
+ uint64_t *total_bytesp, uint64_t *total_dirty_bytesp)
{
- WT_CONNECTION_IMPL *conn;
- WT_TXN_GLOBAL *txn_global;
- WT_TXN *txn;
- WT_TXN_STATE *s;
- const char *iso_tag;
- uint64_t id;
- uint32_t i, session_cnt;
-
- conn = S2C(session);
- txn_global = &conn->txn_global;
- WT_ORDERED_READ(session_cnt, conn->session_cnt);
-
- /* Note: odd string concatenation avoids spelling errors. */
- if (fprintf(fp, "==========\n" "transaction state dump\n") < 0)
- return (EIO);
-
- if (fprintf(fp,
- "current ID: %" PRIu64 "\n"
- "last running ID: %" PRIu64 "\n"
- "oldest ID: %" PRIu64 "\n"
- "oldest named snapshot ID: %" PRIu64 "\n",
- txn_global->current, txn_global->last_running,
- txn_global->oldest_id, txn_global->nsnap_oldest_id) < 0)
- return (EIO);
-
- if (fprintf(fp,
- "checkpoint running? %s\n"
- "checkpoint generation: %" PRIu64 "\n"
- "checkpoint pinned ID: %" PRIu64 "\n"
- "checkpoint txn ID: %" PRIu64 "\n"
- "session count: %" PRIu32 "\n",
- txn_global->checkpoint_running ? "yes" : "no",
- txn_global->checkpoint_gen,
- txn_global->checkpoint_pinned,
- txn_global->checkpoint_txnid,
- session_cnt) < 0)
- return (EIO);
-
- if (fprintf(fp, "Dumping transaction state of active sessions\n") < 0)
- return (EIO);
-
- /*
- * Walk each session transaction state and dump information. Accessing
- * the content of session handles is not thread safe, so some
- * information may change while traversing if other threads are active
- * at the same time, which is OK since this is diagnostic code.
- */
- for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
- /* Skip sessions with no active transaction */
- if ((id = s->id) == WT_TXN_NONE && s->pinned_id == WT_TXN_NONE)
- continue;
+ WT_DATA_HANDLE *dhandle;
+ WT_PAGE *page;
+ WT_REF *next_walk;
+ size_t size;
+ uint64_t intl_bytes, intl_bytes_max, intl_dirty_bytes;
+ uint64_t intl_dirty_bytes_max, intl_dirty_pages, intl_pages;
+ uint64_t leaf_bytes, leaf_bytes_max, leaf_dirty_bytes;
+ uint64_t leaf_dirty_bytes_max, leaf_dirty_pages, leaf_pages;
- txn = &conn->sessions[i].txn;
- iso_tag = "INVALID";
- switch (txn->isolation) {
- case WT_ISO_READ_COMMITTED:
- iso_tag = "WT_ISO_READ_COMMITTED";
- break;
- case WT_ISO_READ_UNCOMMITTED:
- iso_tag = "WT_ISO_READ_UNCOMMITTED";
- break;
- case WT_ISO_SNAPSHOT:
- iso_tag = "WT_ISO_SNAPSHOT";
- break;
+ intl_bytes = intl_bytes_max = intl_dirty_bytes = 0;
+ intl_dirty_bytes_max = intl_dirty_pages = intl_pages = 0;
+ leaf_bytes = leaf_bytes_max = leaf_dirty_bytes = 0;
+ leaf_dirty_bytes_max = leaf_dirty_pages = leaf_pages = 0;
+
+ next_walk = NULL;
+ while (__wt_tree_walk(session, &next_walk,
+ WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 &&
+ next_walk != NULL) {
+ page = next_walk->page;
+ size = page->memory_footprint;
+
+ if (WT_PAGE_IS_INTERNAL(page)) {
+ ++intl_pages;
+ intl_bytes += size;
+ intl_bytes_max = WT_MAX(intl_bytes_max, size);
+ if (__wt_page_is_modified(page)) {
+ ++intl_dirty_pages;
+ intl_dirty_bytes += size;
+ intl_dirty_bytes_max =
+ WT_MAX(intl_dirty_bytes_max, size);
+ }
+ } else {
+ ++leaf_pages;
+ leaf_bytes += size;
+ leaf_bytes_max = WT_MAX(leaf_bytes_max, size);
+ if (__wt_page_is_modified(page)) {
+ ++leaf_dirty_pages;
+ leaf_dirty_bytes += size;
+ leaf_dirty_bytes_max =
+ WT_MAX(leaf_dirty_bytes_max, size);
+ }
}
-
- if (fprintf(fp,
- "ID: %6" PRIu64
- ", mod count: %u"
- ", pinned ID: %" PRIu64
- ", snap min: %" PRIu64
- ", snap max: %" PRIu64
- ", metadata pinned ID: %" PRIu64
- ", flags: 0x%08" PRIx32
- ", name: %s"
- ", isolation: %s" "\n",
- id,
- txn->mod_count,
- s->pinned_id,
- txn->snap_min,
- txn->snap_max,
- s->metadata_pinned,
- txn->flags,
- conn->sessions[i].name == NULL ?
- "EMPTY" : conn->sessions[i].name,
- iso_tag) < 0)
- return (EIO);
}
+ dhandle = session->dhandle;
+ if (dhandle->checkpoint == NULL)
+ WT_RET(__wt_msg(session, "%s(<live>):", dhandle->name));
+ else
+ WT_RET(__wt_msg(session, "%s(checkpoint=%s):",
+ dhandle->name, dhandle->checkpoint));
+ if (intl_pages != 0)
+ WT_RET(__wt_msg(session,
+ "internal: "
+ "%" PRIu64 " pages, "
+ "%" PRIu64 "MB, "
+ "%" PRIu64 "/%" PRIu64 " clean/dirty pages, "
+ "%" PRIu64 "/%" PRIu64 " clean/dirty MB, "
+ "%" PRIu64 "MB max page, "
+ "%" PRIu64 "MB max dirty page",
+ intl_pages,
+ intl_bytes / WT_MEGABYTE,
+ intl_pages - intl_dirty_pages,
+ intl_dirty_pages,
+ (intl_bytes - intl_dirty_bytes) / WT_MEGABYTE,
+ intl_dirty_bytes / WT_MEGABYTE,
+ intl_bytes_max / WT_MEGABYTE,
+ intl_dirty_bytes_max / WT_MEGABYTE));
+ if (leaf_pages != 0)
+ WT_RET(__wt_msg(session,
+ "leaf: "
+ "%" PRIu64 " pages, "
+ "%" PRIu64 "MB, "
+ "%" PRIu64 "/%" PRIu64 " clean/dirty pages, "
+ "%" PRIu64 "/%" PRIu64 " clean/dirty MB, "
+ "%" PRIu64 "MB max page, "
+ "%" PRIu64 "MB max dirty page",
+ leaf_pages,
+ leaf_bytes / WT_MEGABYTE,
+ leaf_pages - leaf_dirty_pages,
+ leaf_dirty_pages,
+ (leaf_bytes - leaf_dirty_bytes) / WT_MEGABYTE,
+ leaf_dirty_bytes / WT_MEGABYTE,
+ leaf_bytes_max / WT_MEGABYTE,
+ leaf_dirty_bytes_max / WT_MEGABYTE));
+
+ *total_bytesp += intl_bytes + leaf_bytes;
+ *total_dirty_bytesp += intl_dirty_bytes + leaf_dirty_bytes;
+
return (0);
}
/*
- * __dump_cache --
- * Output debugging information about the size of the files in cache.
+ * __wt_verbose_dump_cache --
+ * Output diagnostic information about the cache.
*/
-static int
-__dump_cache(WT_SESSION_IMPL *session, FILE *fp)
+int
+__wt_verbose_dump_cache(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
- WT_DATA_HANDLE *dhandle, *saved_dhandle;
- WT_PAGE *page;
- WT_REF *next_walk;
- uint64_t intl_bytes, intl_bytes_max, intl_dirty_bytes;
- uint64_t intl_dirty_bytes_max, intl_dirty_pages, intl_pages;
- uint64_t leaf_bytes, leaf_bytes_max, leaf_dirty_bytes;
- uint64_t leaf_dirty_bytes_max, leaf_dirty_pages, leaf_pages;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
uint64_t total_bytes, total_dirty_bytes;
- size_t size;
conn = S2C(session);
total_bytes = total_dirty_bytes = 0;
- /* Note: odd string concatenation avoids spelling errors. */
- if (fprintf(fp, "==========\n" "cache dump\n") < 0)
- return (EIO);
+ WT_RET(__wt_msg(session, "%s", WT_DIVIDER));
+ WT_RET(__wt_msg(session, "cache dump"));
- saved_dhandle = session->dhandle;
- TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
+ for (dhandle = NULL;;) {
+ WT_WITH_HANDLE_LIST_READ_LOCK(session,
+ WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q));
+ if (dhandle == NULL)
+ break;
if (!WT_PREFIX_MATCH(dhandle->name, "file:") ||
!F_ISSET(dhandle, WT_DHANDLE_OPEN))
continue;
- intl_bytes = intl_bytes_max = intl_dirty_bytes = 0;
- intl_dirty_bytes_max = intl_dirty_pages = intl_pages = 0;
- leaf_bytes = leaf_bytes_max = leaf_dirty_bytes = 0;
- leaf_dirty_bytes_max = leaf_dirty_pages = leaf_pages = 0;
-
- next_walk = NULL;
- session->dhandle = dhandle;
- while (__wt_tree_walk(session, &next_walk,
- WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 &&
- next_walk != NULL) {
- page = next_walk->page;
- size = page->memory_footprint;
-
- if (WT_PAGE_IS_INTERNAL(page)) {
- ++intl_pages;
- intl_bytes += size;
- intl_bytes_max = WT_MAX(intl_bytes_max, size);
- if (__wt_page_is_modified(page)) {
- ++intl_dirty_pages;
- intl_dirty_bytes += size;
- intl_dirty_bytes_max =
- WT_MAX(intl_dirty_bytes_max, size);
- }
- } else {
- ++leaf_pages;
- leaf_bytes += size;
- leaf_bytes_max = WT_MAX(leaf_bytes_max, size);
- if (__wt_page_is_modified(page)) {
- ++leaf_dirty_pages;
- leaf_dirty_bytes += size;
- leaf_dirty_bytes_max =
- WT_MAX(leaf_dirty_bytes_max, size);
- }
- }
- }
- session->dhandle = NULL;
-
- if (dhandle->checkpoint == NULL) {
- if (fprintf(fp,
- "%s(<live>): \n", dhandle->name) < 0)
- return (EIO);
- } else {
- if (fprintf(fp, "%s(checkpoint=%s): \n",
- dhandle->name, dhandle->checkpoint) < 0)
- return (EIO);
- }
- if (intl_pages != 0) {
- if (fprintf(fp,
- "\t" "internal: "
- "%" PRIu64 " pages, "
- "%" PRIu64 "MB, "
- "%" PRIu64 "/%" PRIu64 " clean/dirty pages, "
- "%" PRIu64 "/%" PRIu64 " clean/dirty MB, "
- "%" PRIu64 "MB max page, "
- "%" PRIu64 "MB max dirty page\n",
- intl_pages,
- intl_bytes >> 20,
- intl_pages - intl_dirty_pages,
- intl_dirty_pages,
- (intl_bytes - intl_dirty_bytes) >> 20,
- intl_dirty_bytes >> 20,
- intl_bytes_max >> 20,
- intl_dirty_bytes_max >> 20) < 0)
- return (EIO);
- }
- if (leaf_pages != 0) {
- if (fprintf(fp,
- "\t" "leaf: "
- "%" PRIu64 " pages, "
- "%" PRIu64 "MB, "
- "%" PRIu64 "/%" PRIu64 " clean/dirty pages, "
- "%" PRIu64 "/%" PRIu64 " clean/dirty MB, "
- "%" PRIu64 "MB max page, "
- "%" PRIu64 "MB max dirty page\n",
- leaf_pages,
- leaf_bytes >> 20,
- leaf_pages - leaf_dirty_pages,
- leaf_dirty_pages,
- (leaf_bytes - leaf_dirty_bytes) >> 20,
- leaf_dirty_bytes >> 20,
- leaf_bytes_max >> 20,
- leaf_dirty_bytes_max >> 20) < 0)
- return (EIO);
- }
-
- total_bytes += intl_bytes + leaf_bytes;
- total_dirty_bytes += intl_dirty_bytes + leaf_dirty_bytes;
+ WT_WITH_DHANDLE(session, dhandle,
+ ret = __verbose_dump_cache_single(
+ session, &total_bytes, &total_dirty_bytes));
+ if (ret != 0)
+ break;
}
- session->dhandle = saved_dhandle;
+ WT_RET(ret);
/*
* Apply the overhead percentage so our total bytes are comparable with
@@ -2411,39 +2429,16 @@ __dump_cache(WT_SESSION_IMPL *session, FILE *fp)
*/
total_bytes = __wt_cache_bytes_plus_overhead(conn->cache, total_bytes);
- if (fprintf(fp,
+ WT_RET(__wt_msg(session,
"cache dump: "
- "total found: %" PRIu64 "MB vs tracked inuse %" PRIu64 "MB\n"
- "total dirty bytes: %" PRIu64 "MB\n",
- total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20,
- total_dirty_bytes >> 20) < 0)
- return (EIO);
- if (fprintf(fp, "==========\n") < 0)
- return (EIO);
+ "total found: %" PRIu64 "MB vs tracked inuse %" PRIu64 "MB",
+ total_bytes / WT_MEGABYTE,
+ __wt_cache_bytes_inuse(conn->cache) / WT_MEGABYTE));
+ WT_RET(__wt_msg(session,
+ "total dirty bytes: %" PRIu64 "MB",
+ total_dirty_bytes / WT_MEGABYTE));
+ WT_RET(__wt_msg(session, "%s", WT_DIVIDER));
return (0);
}
-
-/*
- * __wt_dump_stuck_info --
- * Dump debugging information to a file (default stderr) about the state
- * of WiredTiger when we have determined that the cache is stuck full.
- */
-int
-__wt_dump_stuck_info(WT_SESSION_IMPL *session, const char *ofile)
-{
- FILE *fp;
- WT_DECL_RET;
-
- if (ofile == NULL)
- fp = stderr;
- else if ((fp = fopen(ofile, "w")) == NULL)
- return (EIO);
-
- WT_ERR(__dump_txn_state(session, fp));
- WT_ERR(__dump_cache(session, fp));
-err: if (ofile != NULL && fclose(fp) != 0)
- return (EIO);
- return (ret);
-}
#endif
diff --git a/src/evict/evict_stat.c b/src/evict/evict_stat.c
index 2dd3b1e83a0..7c2d5722a63 100644
--- a/src/evict/evict_stat.c
+++ b/src/evict/evict_stat.c
@@ -134,5 +134,5 @@ __wt_curstat_cache_walk(WT_SESSION_IMPL *session)
WT_STAT_DATA_SET(session,
cache_state_root_size, btree->root.page->memory_footprint);
- WT_WITH_HANDLE_LIST_LOCK(session, __evict_stat_walk(session));
+ __evict_stat_walk(session);
}
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 43c1a309d52..39ca223aebf 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -483,6 +483,7 @@ struct __wt_page {
*/
struct {
WT_REF *parent_ref; /* Parent reference */
+ uint64_t split_gen; /* Generation of last split */
struct __wt_page_index {
uint32_t entries;
@@ -492,6 +493,8 @@ struct __wt_page {
} intl;
#undef pg_intl_parent_ref
#define pg_intl_parent_ref u.intl.parent_ref
+#undef pg_intl_split_gen
+#define pg_intl_split_gen u.intl.split_gen
/*
* Macros to copy/set the index because the name is obscured to ensure
@@ -593,9 +596,8 @@ struct __wt_page {
#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */
#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
#define WT_PAGE_OVERFLOW_KEYS 0x10 /* Page has overflow keys */
-#define WT_PAGE_SPLIT_BLOCK 0x20 /* Split blocking eviction and splits */
-#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */
-#define WT_PAGE_UPDATE_IGNORE 0x80 /* Ignore updates on page discard */
+#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */
+#define WT_PAGE_UPDATE_IGNORE 0x40 /* Ignore updates on page discard */
uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
uint8_t unused[2]; /* Unused padding */
diff --git a/src/include/btree.i b/src/include/btree.i
index 09fa8df8c56..315efa86fa6 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -71,6 +71,30 @@ __wt_btree_bytes_inuse(WT_SESSION_IMPL *session)
}
/*
+ * __wt_btree_bytes_evictable --
+ * Return the number of bytes that can be evicted (i.e. bytes apart from
+ * the pinned root page).
+ */
+static inline uint64_t
+__wt_btree_bytes_evictable(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_PAGE *root_page;
+ uint64_t bytes_inmem, bytes_root;
+
+ btree = S2BT(session);
+ cache = S2C(session)->cache;
+ root_page = btree->root.page;
+
+ bytes_inmem = btree->bytes_inmem;
+ bytes_root = root_page == NULL ? 0 : root_page->memory_footprint;
+
+ return (bytes_inmem <= bytes_root ? 0 :
+ __wt_cache_bytes_plus_overhead(cache, bytes_inmem - bytes_root));
+}
+
+/*
* __wt_btree_dirty_inuse --
* Return the number of dirty bytes in use.
*/
@@ -1324,8 +1348,8 @@ __wt_page_can_evict(
* discards its WT_REF array, and a thread traversing the original
* parent page index might see a freed WT_REF.
*/
- if (WT_PAGE_IS_INTERNAL(page) &&
- F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK))
+ if (WT_PAGE_IS_INTERNAL(page) && !__wt_split_obsolete(
+ session, page->pg_intl_split_gen))
return (false);
/*
diff --git a/src/include/cache.h b/src/include/cache.h
index 70f6169200d..abd5a1901f7 100644
--- a/src/include/cache.h
+++ b/src/include/cache.h
@@ -83,7 +83,7 @@ struct __wt_cache {
uint64_t worker_evicts; /* Pages evicted by worker threads */
uint64_t evict_max_page_size; /* Largest page seen at eviction */
-#ifdef HAVE_DIAGNOSTIC
+#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
struct timespec stuck_ts; /* Stuck timestamp */
#endif
diff --git a/src/include/cache.i b/src/include/cache.i
index 17ab39e97d2..d71978ccf35 100644
--- a/src/include/cache.i
+++ b/src/include/cache.i
@@ -364,7 +364,7 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool *didworkp)
* block eviction), we don't want to highjack the thread for eviction.
*/
if (F_ISSET(session, WT_SESSION_NO_EVICTION |
- WT_SESSION_LOCKED_HANDLE_LIST | WT_SESSION_LOCKED_SCHEMA))
+ WT_SESSION_LOCKED_HANDLE_LIST_WRITE | WT_SESSION_LOCKED_SCHEMA))
return (0);
/* In memory configurations don't block when the cache is full. */
diff --git a/src/include/connection.h b/src/include/connection.h
index 64ac4271db1..ce483d3291a 100644
--- a/src/include/connection.h
+++ b/src/include/connection.h
@@ -123,12 +123,16 @@ struct __wt_named_extractor {
* main queue and the hashed queue.
*/
#define WT_CONN_DHANDLE_INSERT(conn, dhandle, bucket) do { \
+ WT_ASSERT(session, \
+ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); \
TAILQ_INSERT_HEAD(&(conn)->dhqh, dhandle, q); \
TAILQ_INSERT_HEAD(&(conn)->dhhash[bucket], dhandle, hashq); \
++conn->dhandle_count; \
} while (0)
#define WT_CONN_DHANDLE_REMOVE(conn, dhandle, bucket) do { \
+ WT_ASSERT(session, \
+ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); \
TAILQ_REMOVE(&(conn)->dhqh, dhandle, q); \
TAILQ_REMOVE(&(conn)->dhhash[bucket], dhandle, hashq); \
--conn->dhandle_count; \
@@ -163,13 +167,13 @@ struct __wt_connection_impl {
WT_SPINLOCK api_lock; /* Connection API spinlock */
WT_SPINLOCK checkpoint_lock; /* Checkpoint spinlock */
- WT_SPINLOCK dhandle_lock; /* Data handle list spinlock */
WT_SPINLOCK fh_lock; /* File handle queue spinlock */
WT_SPINLOCK metadata_lock; /* Metadata update spinlock */
WT_SPINLOCK reconfig_lock; /* Single thread reconfigure */
WT_SPINLOCK schema_lock; /* Schema operation spinlock */
- WT_SPINLOCK table_lock; /* Table creation spinlock */
+ WT_RWLOCK table_lock; /* Table list lock */
WT_SPINLOCK turtle_lock; /* Turtle file spinlock */
+ WT_RWLOCK dhandle_lock; /* Data handle list lock */
/*
* We distribute the btree page locks across a set of spin locks. Don't
diff --git a/src/include/dhandle.h b/src/include/dhandle.h
index dcc788f0839..4f318e7bccf 100644
--- a/src/include/dhandle.h
+++ b/src/include/dhandle.h
@@ -37,6 +37,24 @@
#define WT_SESSION_META_DHANDLE(s) \
(((WT_CURSOR_BTREE *)((s)->meta_cursor))->btree->dhandle)
+#define WT_DHANDLE_ACQUIRE(dhandle) \
+ (void)__wt_atomic_add32(&dhandle->session_ref, 1)
+
+#define WT_DHANDLE_RELEASE(dhandle) \
+ (void)__wt_atomic_sub32(&dhandle->session_ref, 1)
+
+#define WT_DHANDLE_NEXT(session, dhandle, head, field) do { \
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));\
+ if (dhandle == NULL) \
+ dhandle = TAILQ_FIRST(head); \
+ else { \
+ WT_DHANDLE_RELEASE(dhandle); \
+ dhandle = TAILQ_NEXT(dhandle, field); \
+ } \
+ if (dhandle != NULL) \
+ WT_DHANDLE_ACQUIRE(dhandle); \
+} while (0)
+
/*
* WT_DATA_HANDLE --
* A handle for a generic named data source.
diff --git a/src/include/extern.h b/src/include/extern.h
index 566eb386c29..19ad9a880df 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -98,6 +98,7 @@ extern void __wt_cursor_key_order_reset(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_A
extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern bool __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_btcur_reset(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
@@ -105,7 +106,6 @@ extern int __wt_btcur_insert(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((w
extern int __wt_btcur_update_check(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_btcur_remove(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_btcur_update(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_btcur_equals(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *equalp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
@@ -150,6 +150,9 @@ extern int __wt_ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, void *cookie
extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint32_t alloc_entries, bool alloc_refs, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern int __wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
@@ -160,6 +163,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
extern int __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern bool __wt_split_obsolete(WT_SESSION_IMPL *session, uint64_t split_gen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_split_stash_discard(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
@@ -192,8 +196,6 @@ extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE
extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_search_insert(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern int __wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_las_stats_update(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_las_create(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_las_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
@@ -254,6 +256,7 @@ extern WT_THREAD_RET __wt_cache_pool_server(void *arg) WT_GCC_FUNC_DECL_ATTRIBUT
extern int __wt_checkpoint_server_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_checkpoint_server_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern int __wt_conn_dhandle_alloc( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_conn_dhandle_find( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_conn_btree_open( WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
@@ -262,7 +265,7 @@ extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *ur
extern int __wt_conn_dhandle_discard_single( WT_SESSION_IMPL *session, bool final, bool force) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern void __wt_connection_destroy(WT_CONNECTION_IMPL *conn) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_logmgr_reconfig(WT_SESSION_IMPL *session, const char **cfg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_log_truncate_files( WT_SESSION_IMPL *session, WT_CURSOR *cursor, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_log_wrlsn(WT_SESSION_IMPL *session, int *yield) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
@@ -352,7 +355,7 @@ extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int
extern bool __wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_evict_priority_clear(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern int __wt_dump_stuck_info(WT_SESSION_IMPL *session, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern int __wt_verbose_dump_cache(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_curstat_cache_walk(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
@@ -403,7 +406,7 @@ extern int __wt_log_slot_switch( WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bo
extern int __wt_log_slot_new(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_log_slot_init(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_log_slot_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern void __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int64_t __wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_clsm_request_switch(WT_CURSOR_LSM *clsm) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
@@ -613,11 +616,9 @@ extern void __wt_session_close_cache(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_
extern int __wt_session_get_btree(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, const char *cfg[], uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern int __wt_cond_auto_alloc( WT_SESSION_IMPL *session, const char *name, bool is_signalled, uint64_t min, uint64_t max, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern void __wt_cond_auto_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern void __wt_cond_auto_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern void __wt_cond_auto_wait( WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern int __wt_cond_auto_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern int __wt_cond_auto_alloc(WT_SESSION_IMPL *session, const char *name, uint64_t min, uint64_t max, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern void __wt_cond_auto_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern void __wt_cond_auto_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool (*run_func)(WT_SESSION_IMPL *)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_decrypt(WT_SESSION_IMPL *session, WT_ENCRYPTOR *encryptor, size_t skip, WT_ITEM *in, WT_ITEM *out) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_encrypt(WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t skip, WT_ITEM *in, WT_ITEM *out) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_encrypt_size(WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t incoming_size, size_t *sizep) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
@@ -689,6 +690,7 @@ extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2) WT_GCC_FUNC_DECL_ATTRIBUT
extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern void __wt_random_init_seed( WT_SESSION_IMPL *session, WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern uint32_t __wt_random(WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
+extern uint64_t __wt_random64(WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
@@ -741,6 +743,7 @@ extern void __wt_txn_stats_update(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATT
extern void __wt_txn_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_txn_global_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern int __wt_verbose_dump_txn(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[], bool waiting) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
diff --git a/src/include/extern_posix.h b/src/include/extern_posix.h
index 5acb7b0ed27..fed7835ada1 100644
--- a/src/include/extern_posix.h
+++ b/src/include/extern_posix.h
@@ -12,8 +12,8 @@ extern int __wt_posix_map(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *mapp
extern int __wt_posix_map_preload(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, const void *map, size_t length, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_posix_map_discard(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *map, size_t length, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_posix_unmap(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *mapped_region, size_t len, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, bool is_signalled, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern void __wt_cond_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern void __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_once(void (*init_routine)(void)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
diff --git a/src/include/extern_win.h b/src/include/extern_win.h
index 11b45f11304..0bfc821c7a6 100644
--- a/src/include/extern_win.h
+++ b/src/include/extern_win.h
@@ -10,8 +10,8 @@ extern int __wt_os_win(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((war
extern int __wt_getenv(WT_SESSION_IMPL *session, const char *variable, const char **envp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_win_map(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, void *mapped_regionp, size_t *lenp, void *mapped_cookiep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_win_unmap(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, void *mapped_region, size_t length, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, bool is_signalled, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern void __wt_cond_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern void __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_once(void (*init_routine)(void)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
diff --git a/src/include/flags.h b/src/include/flags.h
index 2f0c207078a..c1fff920e3b 100644
--- a/src/include/flags.h
+++ b/src/include/flags.h
@@ -53,22 +53,24 @@
#define WT_SESSION_CAN_WAIT 0x00000001
#define WT_SESSION_INTERNAL 0x00000002
#define WT_SESSION_LOCKED_CHECKPOINT 0x00000004
-#define WT_SESSION_LOCKED_HANDLE_LIST 0x00000008
-#define WT_SESSION_LOCKED_METADATA 0x00000010
-#define WT_SESSION_LOCKED_PASS 0x00000020
-#define WT_SESSION_LOCKED_SCHEMA 0x00000040
-#define WT_SESSION_LOCKED_SLOT 0x00000080
-#define WT_SESSION_LOCKED_TABLE 0x00000100
-#define WT_SESSION_LOCKED_TURTLE 0x00000200
-#define WT_SESSION_LOGGING_INMEM 0x00000400
-#define WT_SESSION_LOOKASIDE_CURSOR 0x00000800
-#define WT_SESSION_NO_CACHE 0x00001000
-#define WT_SESSION_NO_DATA_HANDLES 0x00002000
-#define WT_SESSION_NO_EVICTION 0x00004000
-#define WT_SESSION_NO_LOGGING 0x00008000
-#define WT_SESSION_NO_SCHEMA_LOCK 0x00010000
-#define WT_SESSION_QUIET_CORRUPT_FILE 0x00020000
-#define WT_SESSION_SERVER_ASYNC 0x00040000
+#define WT_SESSION_LOCKED_HANDLE_LIST_READ 0x00000008
+#define WT_SESSION_LOCKED_HANDLE_LIST_WRITE 0x00000010
+#define WT_SESSION_LOCKED_METADATA 0x00000020
+#define WT_SESSION_LOCKED_PASS 0x00000040
+#define WT_SESSION_LOCKED_SCHEMA 0x00000080
+#define WT_SESSION_LOCKED_SLOT 0x00000100
+#define WT_SESSION_LOCKED_TABLE_READ 0x00000200
+#define WT_SESSION_LOCKED_TABLE_WRITE 0x00000400
+#define WT_SESSION_LOCKED_TURTLE 0x00000800
+#define WT_SESSION_LOGGING_INMEM 0x00001000
+#define WT_SESSION_LOOKASIDE_CURSOR 0x00002000
+#define WT_SESSION_NO_CACHE 0x00004000
+#define WT_SESSION_NO_DATA_HANDLES 0x00008000
+#define WT_SESSION_NO_EVICTION 0x00010000
+#define WT_SESSION_NO_LOGGING 0x00020000
+#define WT_SESSION_NO_SCHEMA_LOCK 0x00040000
+#define WT_SESSION_QUIET_CORRUPT_FILE 0x00080000
+#define WT_SESSION_SERVER_ASYNC 0x00100000
#define WT_STAT_CLEAR 0x00000001
#define WT_STAT_JSON 0x00000002
#define WT_STAT_ON_CLOSE 0x00000004
@@ -90,28 +92,29 @@
#define WT_VERB_COMPACT 0x00000008
#define WT_VERB_EVICT 0x00000010
#define WT_VERB_EVICTSERVER 0x00000020
-#define WT_VERB_FILEOPS 0x00000040
-#define WT_VERB_HANDLEOPS 0x00000080
-#define WT_VERB_LOG 0x00000100
-#define WT_VERB_LSM 0x00000200
-#define WT_VERB_LSM_MANAGER 0x00000400
-#define WT_VERB_METADATA 0x00000800
-#define WT_VERB_MUTEX 0x00001000
-#define WT_VERB_OVERFLOW 0x00002000
-#define WT_VERB_READ 0x00004000
-#define WT_VERB_REBALANCE 0x00008000
-#define WT_VERB_RECONCILE 0x00010000
-#define WT_VERB_RECOVERY 0x00020000
-#define WT_VERB_RECOVERY_PROGRESS 0x00040000
-#define WT_VERB_SALVAGE 0x00080000
-#define WT_VERB_SHARED_CACHE 0x00100000
-#define WT_VERB_SPLIT 0x00200000
-#define WT_VERB_TEMPORARY 0x00400000
-#define WT_VERB_THREAD_GROUP 0x00800000
-#define WT_VERB_TRANSACTION 0x01000000
-#define WT_VERB_VERIFY 0x02000000
-#define WT_VERB_VERSION 0x04000000
-#define WT_VERB_WRITE 0x08000000
+#define WT_VERB_EVICT_STUCK 0x00000040
+#define WT_VERB_FILEOPS 0x00000080
+#define WT_VERB_HANDLEOPS 0x00000100
+#define WT_VERB_LOG 0x00000200
+#define WT_VERB_LSM 0x00000400
+#define WT_VERB_LSM_MANAGER 0x00000800
+#define WT_VERB_METADATA 0x00001000
+#define WT_VERB_MUTEX 0x00002000
+#define WT_VERB_OVERFLOW 0x00004000
+#define WT_VERB_READ 0x00008000
+#define WT_VERB_REBALANCE 0x00010000
+#define WT_VERB_RECONCILE 0x00020000
+#define WT_VERB_RECOVERY 0x00040000
+#define WT_VERB_RECOVERY_PROGRESS 0x00080000
+#define WT_VERB_SALVAGE 0x00100000
+#define WT_VERB_SHARED_CACHE 0x00200000
+#define WT_VERB_SPLIT 0x00400000
+#define WT_VERB_TEMPORARY 0x00800000
+#define WT_VERB_THREAD_GROUP 0x01000000
+#define WT_VERB_TRANSACTION 0x02000000
+#define WT_VERB_VERIFY 0x04000000
+#define WT_VERB_VERSION 0x08000000
+#define WT_VERB_WRITE 0x10000000
#define WT_VISIBILITY_ERR 0x00000080
/*
* flags section: END
diff --git a/src/include/log.h b/src/include/log.h
index d9fea892c68..a6be3582b4d 100644
--- a/src/include/log.h
+++ b/src/include/log.h
@@ -163,7 +163,7 @@ struct __wt_logslot {
WT_CACHE_LINE_PAD_BEGIN
volatile int64_t slot_state; /* Slot state */
int64_t slot_unbuffered; /* Unbuffered data in this slot */
- int32_t slot_error; /* Error value */
+ int slot_error; /* Error value */
wt_off_t slot_start_offset; /* Starting file offset */
wt_off_t slot_last_offset; /* Last record offset */
WT_LSN slot_release_lsn; /* Slot release LSN */
@@ -254,6 +254,7 @@ struct __wt_log {
#define WT_SLOT_POOL 128
WT_LOGSLOT *active_slot; /* Active slot */
WT_LOGSLOT slot_pool[WT_SLOT_POOL]; /* Pool of all slots */
+ int32_t pool_index; /* Index into slot pool */
size_t slot_buf_size; /* Buffer size for slots */
#ifdef HAVE_DIAGNOSTIC
uint64_t write_calls; /* Calls to log_write */
diff --git a/src/include/misc.i b/src/include/misc.i
index f36be32d6a2..d5692a3f9cf 100644
--- a/src/include/misc.i
+++ b/src/include/misc.i
@@ -11,11 +11,12 @@
* Wait on a mutex, optionally timing out.
*/
static inline void
-__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
+__wt_cond_wait(WT_SESSION_IMPL *session,
+ WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *))
{
bool notused;
- __wt_cond_wait_signal(session, cond, usecs, &notused);
+ __wt_cond_wait_signal(session, cond, usecs, run_func, &notused);
}
/*
diff --git a/src/include/mutex.h b/src/include/mutex.h
index 727a690bb1c..06b8c4a3304 100644
--- a/src/include/mutex.h
+++ b/src/include/mutex.h
@@ -21,8 +21,8 @@ struct __wt_condvar {
int waiters; /* Numbers of waiters, or
-1 if signalled with no waiters. */
/*
- * The following fields are only used for automatically adjusting
- * condition variables. They could be in a separate structure.
+ * The following fields are used for automatically adjusting condition
+ * variable wait times.
*/
uint64_t min_wait; /* Minimum wait duration */
uint64_t max_wait; /* Maximum wait duration */
diff --git a/src/include/packing.i b/src/include/packing.i
index 17ca261bcfc..8ba3dd536ac 100644
--- a/src/include/packing.i
+++ b/src/include/packing.i
@@ -168,10 +168,15 @@ next: if (pack->cur == pack->end)
(int)(pack->end - pack->orig), pack->orig);
return (0);
case 'u':
- case 'U':
/* Special case for items with a size prefix. */
pv->type = (!pv->havesize && *pack->cur != '\0') ? 'U' : 'u';
return (0);
+ case 'U':
+ /*
+ * Don't change the type. 'U' is used internally, so this type
+ * was already changed to explicitly include the size.
+ */
+ return (0);
case 'b':
case 'h':
case 'i':
diff --git a/src/include/schema.h b/src/include/schema.h
index bb116e5cf2f..9a6e1e54e80 100644
--- a/src/include/schema.h
+++ b/src/include/schema.h
@@ -78,6 +78,14 @@ struct __wt_table {
*/
#define WT_COLGROUPS(t) WT_MAX((t)->ncolgroups, 1)
+/* Helpers for the locked state of the handle list and table locks. */
+#define WT_SESSION_LOCKED_HANDLE_LIST \
+ (WT_SESSION_LOCKED_HANDLE_LIST_READ | \
+ WT_SESSION_LOCKED_HANDLE_LIST_WRITE)
+#define WT_SESSION_LOCKED_TABLE \
+ (WT_SESSION_LOCKED_TABLE_READ | \
+ WT_SESSION_LOCKED_TABLE_WRITE)
+
/*
* WT_WITH_LOCK_WAIT --
* Wait for a lock, perform an operation, drop the lock.
@@ -85,7 +93,7 @@ struct __wt_table {
#define WT_WITH_LOCK_WAIT(session, lock, flag, op) do { \
if (F_ISSET(session, (flag))) { \
op; \
- } else { \
+ } else { \
__wt_spin_lock_track(session, lock); \
F_SET(session, (flag)); \
op; \
@@ -122,16 +130,46 @@ struct __wt_table {
&S2C(session)->checkpoint_lock, WT_SESSION_LOCKED_CHECKPOINT, op)
/*
- * WT_WITH_HANDLE_LIST_LOCK --
- * Acquire the data handle list lock, perform an operation, drop the lock.
+ * WT_WITH_HANDLE_LIST_READ_LOCK --
+ * Acquire the data handle list lock in shared mode, perform an operation,
+ * drop the lock. The handle list lock is a read-write lock so the
+ * implementation is different to the other lock macros.
*
* Note: always waits because some operations need the handle list lock to
* discard handles, and we only expect it to be held across short
* operations.
*/
-#define WT_WITH_HANDLE_LIST_LOCK(session, op) \
- WT_WITH_LOCK_WAIT(session, \
- &S2C(session)->dhandle_lock, WT_SESSION_LOCKED_HANDLE_LIST, op)
+#define WT_WITH_HANDLE_LIST_READ_LOCK(session, op) do { \
+ if (F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)) { \
+ op; \
+ } else { \
+ __wt_readlock(session, &S2C(session)->dhandle_lock); \
+ F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \
+ op; \
+ F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \
+ __wt_readunlock(session, &S2C(session)->dhandle_lock); \
+ } \
+} while (0)
+
+/*
+ * WT_WITH_HANDLE_LIST_WRITE_LOCK --
+ * Acquire the data handle list lock in exclusive mode, perform an
+ * operation, drop the lock. The handle list lock is a read-write lock so
+ * the implementation is different to the other lock macros.
+ */
+#define WT_WITH_HANDLE_LIST_WRITE_LOCK(session, op) do { \
+ if (F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)) { \
+ op; \
+ } else { \
+ WT_ASSERT(session, \
+ !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ));\
+ __wt_writelock(session, &S2C(session)->dhandle_lock); \
+ F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \
+ op; \
+ F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \
+ __wt_writeunlock(session, &S2C(session)->dhandle_lock); \
+ } \
+} while (0)
/*
* WT_WITH_METADATA_LOCK --
@@ -165,22 +203,58 @@ struct __wt_table {
} while (0)
/*
- * WT_WITH_TABLE_LOCK, WT_WITH_TABLE_LOCK_NOWAIT --
+ * WT_WITH_TABLE_READ_LOCK, WT_WITH_TABLE_WRITE_LOCK,
+ * WT_WITH_TABLE_WRITE_LOCK_NOWAIT --
* Acquire the table lock, perform an operation, drop the lock.
+ * The table lock is a read-write lock so the implementation is different
+ * to most other lock macros.
+ *
+ * Note: readlock always waits because some operations need the table lock
+ * to discard handles, and we only expect it to be held across short
+ * operations.
*/
-#define WT_WITH_TABLE_LOCK(session, op) do { \
- WT_ASSERT(session, \
- F_ISSET(session, WT_SESSION_LOCKED_TABLE) || \
- !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); \
- WT_WITH_LOCK_WAIT(session, \
- &S2C(session)->table_lock, WT_SESSION_LOCKED_TABLE, op); \
+#define WT_WITH_TABLE_READ_LOCK(session, op) do { \
+ if (F_ISSET(session, WT_SESSION_LOCKED_TABLE)) { \
+ op; \
+ } else { \
+ WT_ASSERT(session, \
+ !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); \
+ __wt_readlock(session, &S2C(session)->table_lock); \
+ F_SET(session, WT_SESSION_LOCKED_TABLE_READ); \
+ op; \
+ F_CLR(session, WT_SESSION_LOCKED_TABLE_READ); \
+ __wt_readunlock(session, &S2C(session)->table_lock); \
+ } \
+} while (0)
+
+#define WT_WITH_TABLE_WRITE_LOCK(session, op) do { \
+ if (F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE)) { \
+ op; \
+ } else { \
+ WT_ASSERT(session, \
+ !F_ISSET(session, WT_SESSION_LOCKED_TABLE_READ | \
+ WT_SESSION_LOCKED_HANDLE_LIST)); \
+ __wt_writelock(session, &S2C(session)->table_lock); \
+ F_SET(session, WT_SESSION_LOCKED_TABLE_WRITE); \
+ op; \
+ F_CLR(session, WT_SESSION_LOCKED_TABLE_WRITE); \
+ __wt_writeunlock(session, &S2C(session)->table_lock); \
+ } \
} while (0)
-#define WT_WITH_TABLE_LOCK_NOWAIT(session, ret, op) do { \
+#define WT_WITH_TABLE_WRITE_LOCK_NOWAIT(session, ret, op) do { \
WT_ASSERT(session, \
- F_ISSET(session, WT_SESSION_LOCKED_TABLE) || \
- !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); \
- WT_WITH_LOCK_NOWAIT(session, ret, \
- &S2C(session)->table_lock, WT_SESSION_LOCKED_TABLE, op); \
+ F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE) || \
+ !F_ISSET(session, WT_SESSION_LOCKED_TABLE_READ | \
+ WT_SESSION_LOCKED_HANDLE_LIST)); \
+ if (F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE)) { \
+ op; \
+ } else if ((ret = __wt_try_writelock(session, \
+ &S2C(session)->table_lock)) == 0) { \
+ F_SET(session, WT_SESSION_LOCKED_TABLE_WRITE); \
+ op; \
+ F_CLR(session, WT_SESSION_LOCKED_TABLE_WRITE); \
+ __wt_writeunlock(session, &S2C(session)->table_lock); \
+ } \
} while (0)
/*
@@ -192,19 +266,31 @@ struct __wt_table {
WT_CONNECTION_IMPL *__conn = S2C(session); \
bool __checkpoint_locked = \
F_ISSET(session, WT_SESSION_LOCKED_CHECKPOINT); \
- bool __handle_locked = \
- F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST); \
- bool __table_locked = \
- F_ISSET(session, WT_SESSION_LOCKED_TABLE); \
+ bool __handle_read_locked = \
+ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \
+ bool __handle_write_locked = \
+ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \
+ bool __table_read_locked = \
+ F_ISSET(session, WT_SESSION_LOCKED_TABLE_READ); \
+ bool __table_write_locked = \
+ F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE); \
bool __schema_locked = \
F_ISSET(session, WT_SESSION_LOCKED_SCHEMA); \
- if (__handle_locked) { \
- F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST); \
- __wt_spin_unlock(session, &__conn->dhandle_lock); \
+ if (__handle_read_locked) { \
+ F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \
+ __wt_readunlock(session, &__conn->dhandle_lock); \
} \
- if (__table_locked) { \
- F_CLR(session, WT_SESSION_LOCKED_TABLE); \
- __wt_spin_unlock(session, &__conn->table_lock); \
+ if (__handle_write_locked) { \
+ F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \
+ __wt_writeunlock(session, &__conn->dhandle_lock); \
+ } \
+ if (__table_read_locked) { \
+ F_CLR(session, WT_SESSION_LOCKED_TABLE_READ); \
+ __wt_readunlock(session, &__conn->table_lock); \
+ } \
+ if (__table_write_locked) { \
+ F_CLR(session, WT_SESSION_LOCKED_TABLE_WRITE); \
+ __wt_writeunlock(session, &__conn->table_lock); \
} \
if (__schema_locked) { \
F_CLR(session, WT_SESSION_LOCKED_SCHEMA); \
@@ -223,12 +309,20 @@ struct __wt_table {
__wt_spin_lock(session, &__conn->schema_lock); \
F_SET(session, WT_SESSION_LOCKED_SCHEMA); \
} \
- if (__table_locked) { \
- __wt_spin_lock(session, &__conn->table_lock); \
- F_SET(session, WT_SESSION_LOCKED_TABLE); \
+ if (__table_read_locked) { \
+ __wt_readlock(session, &__conn->table_lock); \
+ F_SET(session, WT_SESSION_LOCKED_TABLE_READ); \
+ } \
+ if (__table_write_locked) { \
+ __wt_writelock(session, &__conn->table_lock); \
+ F_SET(session, WT_SESSION_LOCKED_TABLE_WRITE); \
+ } \
+ if (__handle_read_locked) { \
+ __wt_readlock(session, &__conn->dhandle_lock); \
+ F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \
} \
- if (__handle_locked) { \
- __wt_spin_lock(session, &__conn->dhandle_lock); \
- F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST); \
+ if (__handle_write_locked) { \
+ __wt_writelock(session, &__conn->dhandle_lock); \
+ F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \
} \
} while (0)
diff --git a/src/include/session.h b/src/include/session.h
index 7dd523aea26..085f871a34f 100644
--- a/src/include/session.h
+++ b/src/include/session.h
@@ -52,8 +52,6 @@ struct __wt_session_impl {
const char *lastop; /* Last operation */
uint32_t id; /* UID, offset in session array */
- WT_CONDVAR *cond; /* Condition variable */
-
WT_EVENT_HANDLER *event_handler;/* Application's event handlers */
WT_DATA_HANDLE *dhandle; /* Current data handle */
diff --git a/src/include/stat.h b/src/include/stat.h
index fd3e3290d95..8b2e78a4ed5 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -392,9 +392,7 @@ struct __wt_connection_stats {
int64_t lock_checkpoint_count;
int64_t lock_checkpoint_wait_application;
int64_t lock_checkpoint_wait_internal;
- int64_t lock_handle_list_count;
- int64_t lock_handle_list_wait_application;
- int64_t lock_handle_list_wait_internal;
+ int64_t lock_handle_list_wait_eviction;
int64_t lock_metadata_count;
int64_t lock_metadata_wait_application;
int64_t lock_metadata_wait_internal;
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index 90989cc679d..c148e759299 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -576,8 +576,9 @@ struct __wt_cursor {
#define WT_CURSTD_OPEN 0x00200
#define WT_CURSTD_OVERWRITE 0x00400
#define WT_CURSTD_RAW 0x00800
-#define WT_CURSTD_VALUE_EXT 0x01000 /* Value points out of the tree. */
-#define WT_CURSTD_VALUE_INT 0x02000 /* Value points into the tree. */
+#define WT_CURSTD_RAW_SEARCH 0x01000
+#define WT_CURSTD_VALUE_EXT 0x02000 /* Value points out of the tree. */
+#define WT_CURSTD_VALUE_INT 0x04000 /* Value points into the tree. */
#define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT)
uint32_t flags;
#endif
@@ -1982,12 +1983,13 @@ struct __wt_connection {
* as a list\, such as <code>"verbose=[evictserver\,read]"</code>., a
* list\, with values chosen from the following options: \c "api"\, \c
* "block"\, \c "checkpoint"\, \c "compact"\, \c "evict"\, \c
- * "evictserver"\, \c "fileops"\, \c "handleops"\, \c "log"\, \c "lsm"\,
- * \c "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c
- * "read"\, \c "rebalance"\, \c "reconcile"\, \c "recovery"\, \c
- * "recovery_progress"\, \c "salvage"\, \c "shared_cache"\, \c "split"\,
- * \c "temporary"\, \c "thread_group"\, \c "transaction"\, \c "verify"\,
- * \c "version"\, \c "write"; default empty.}
+ * "evict_stuck"\, \c "evictserver"\, \c "fileops"\, \c "handleops"\, \c
+ * "log"\, \c "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c
+ * "overflow"\, \c "read"\, \c "rebalance"\, \c "reconcile"\, \c
+ * "recovery"\, \c "recovery_progress"\, \c "salvage"\, \c
+ * "shared_cache"\, \c "split"\, \c "temporary"\, \c "thread_group"\, \c
+ * "transaction"\, \c "verify"\, \c "version"\, \c "write"; default
+ * empty.}
* @configend
* @errors
*/
@@ -2361,7 +2363,7 @@ struct __wt_connection {
* @config{exclusive, fail if the database already exists\, generally used with
* the \c create option., a boolean flag; default \c false.}
* @config{extensions, list of shared library extensions to load (using dlopen).
- * Any values specified to an library extension are passed to
+ * Any values specified to a library extension are passed to
* WT_CONNECTION::load_extension as the \c config parameter (for example\,
* <code>extensions=(/path/ext.so={entry=my_entry})</code>)., a list of strings;
* default empty.}
@@ -2513,12 +2515,13 @@ struct __wt_connection {
* WiredTiger is configured with --enable-verbose. Options are given as a
* list\, such as <code>"verbose=[evictserver\,read]"</code>., a list\, with
* values chosen from the following options: \c "api"\, \c "block"\, \c
- * "checkpoint"\, \c "compact"\, \c "evict"\, \c "evictserver"\, \c "fileops"\,
- * \c "handleops"\, \c "log"\, \c "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c
- * "mutex"\, \c "overflow"\, \c "read"\, \c "rebalance"\, \c "reconcile"\, \c
- * "recovery"\, \c "recovery_progress"\, \c "salvage"\, \c "shared_cache"\, \c
- * "split"\, \c "temporary"\, \c "thread_group"\, \c "transaction"\, \c
- * "verify"\, \c "version"\, \c "write"; default empty.}
+ * "checkpoint"\, \c "compact"\, \c "evict"\, \c "evict_stuck"\, \c
+ * "evictserver"\, \c "fileops"\, \c "handleops"\, \c "log"\, \c "lsm"\, \c
+ * "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c "read"\, \c
+ * "rebalance"\, \c "reconcile"\, \c "recovery"\, \c "recovery_progress"\, \c
+ * "salvage"\, \c "shared_cache"\, \c "split"\, \c "temporary"\, \c
+ * "thread_group"\, \c "transaction"\, \c "verify"\, \c "version"\, \c "write";
+ * default empty.}
* @config{write_through, Use \c FILE_FLAG_WRITE_THROUGH on Windows to write to
* files. Ignored on non-Windows systems. Options are given as a list\, such
* as <code>"write_through=[data]"</code>. Configuring \c write_through requires
@@ -4593,240 +4596,236 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1133
/*! lock: checkpoint lock internal thread wait time (usecs) */
#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1134
-/*! lock: handle-list lock acquisitions */
-#define WT_STAT_CONN_LOCK_HANDLE_LIST_COUNT 1135
-/*! lock: handle-list lock application thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_APPLICATION 1136
-/*! lock: handle-list lock internal thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_INTERNAL 1137
+/*! lock: handle-list lock eviction thread wait time (usecs) */
+#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_EVICTION 1135
/*! lock: metadata lock acquisitions */
-#define WT_STAT_CONN_LOCK_METADATA_COUNT 1138
+#define WT_STAT_CONN_LOCK_METADATA_COUNT 1136
/*! lock: metadata lock application thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1139
+#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1137
/*! lock: metadata lock internal thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1140
+#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1138
/*! lock: schema lock acquisitions */
-#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1141
+#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1139
/*! lock: schema lock application thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1142
+#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1140
/*! lock: schema lock internal thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1143
+#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1141
/*! lock: table lock acquisitions */
-#define WT_STAT_CONN_LOCK_TABLE_COUNT 1144
+#define WT_STAT_CONN_LOCK_TABLE_COUNT 1142
/*!
* lock: table lock application thread time waiting for the table lock
* (usecs)
*/
-#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1145
+#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1143
/*!
* lock: table lock internal thread time waiting for the table lock
* (usecs)
*/
-#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1146
+#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1144
/*! log: busy returns attempting to switch slots */
-#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1147
+#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1145
/*! log: consolidated slot closures */
-#define WT_STAT_CONN_LOG_SLOT_CLOSES 1148
+#define WT_STAT_CONN_LOG_SLOT_CLOSES 1146
/*! log: consolidated slot join races */
-#define WT_STAT_CONN_LOG_SLOT_RACES 1149
+#define WT_STAT_CONN_LOG_SLOT_RACES 1147
/*! log: consolidated slot join transitions */
-#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1150
+#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1148
/*! log: consolidated slot joins */
-#define WT_STAT_CONN_LOG_SLOT_JOINS 1151
+#define WT_STAT_CONN_LOG_SLOT_JOINS 1149
/*! log: consolidated slot unbuffered writes */
-#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1152
+#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1150
/*! log: log bytes of payload data */
-#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1153
+#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1151
/*! log: log bytes written */
-#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1154
+#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1152
/*! log: log files manually zero-filled */
-#define WT_STAT_CONN_LOG_ZERO_FILLS 1155
+#define WT_STAT_CONN_LOG_ZERO_FILLS 1153
/*! log: log flush operations */
-#define WT_STAT_CONN_LOG_FLUSH 1156
+#define WT_STAT_CONN_LOG_FLUSH 1154
/*! log: log force write operations */
-#define WT_STAT_CONN_LOG_FORCE_WRITE 1157
+#define WT_STAT_CONN_LOG_FORCE_WRITE 1155
/*! log: log force write operations skipped */
-#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1158
+#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1156
/*! log: log records compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1159
+#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1157
/*! log: log records not compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1160
+#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1158
/*! log: log records too small to compress */
-#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1161
+#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1159
/*! log: log release advances write LSN */
-#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1162
+#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1160
/*! log: log scan operations */
-#define WT_STAT_CONN_LOG_SCANS 1163
+#define WT_STAT_CONN_LOG_SCANS 1161
/*! log: log scan records requiring two reads */
-#define WT_STAT_CONN_LOG_SCAN_REREADS 1164
+#define WT_STAT_CONN_LOG_SCAN_REREADS 1162
/*! log: log server thread advances write LSN */
-#define WT_STAT_CONN_LOG_WRITE_LSN 1165
+#define WT_STAT_CONN_LOG_WRITE_LSN 1163
/*! log: log server thread write LSN walk skipped */
-#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1166
+#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1164
/*! log: log sync operations */
-#define WT_STAT_CONN_LOG_SYNC 1167
+#define WT_STAT_CONN_LOG_SYNC 1165
/*! log: log sync time duration (usecs) */
-#define WT_STAT_CONN_LOG_SYNC_DURATION 1168
+#define WT_STAT_CONN_LOG_SYNC_DURATION 1166
/*! log: log sync_dir operations */
-#define WT_STAT_CONN_LOG_SYNC_DIR 1169
+#define WT_STAT_CONN_LOG_SYNC_DIR 1167
/*! log: log sync_dir time duration (usecs) */
-#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1170
+#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1168
/*! log: log write operations */
-#define WT_STAT_CONN_LOG_WRITES 1171
+#define WT_STAT_CONN_LOG_WRITES 1169
/*! log: logging bytes consolidated */
-#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1172
+#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1170
/*! log: maximum log file size */
-#define WT_STAT_CONN_LOG_MAX_FILESIZE 1173
+#define WT_STAT_CONN_LOG_MAX_FILESIZE 1171
/*! log: number of pre-allocated log files to create */
-#define WT_STAT_CONN_LOG_PREALLOC_MAX 1174
+#define WT_STAT_CONN_LOG_PREALLOC_MAX 1172
/*! log: pre-allocated log files not ready and missed */
-#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1175
+#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1173
/*! log: pre-allocated log files prepared */
-#define WT_STAT_CONN_LOG_PREALLOC_FILES 1176
+#define WT_STAT_CONN_LOG_PREALLOC_FILES 1174
/*! log: pre-allocated log files used */
-#define WT_STAT_CONN_LOG_PREALLOC_USED 1177
+#define WT_STAT_CONN_LOG_PREALLOC_USED 1175
/*! log: records processed by log scan */
-#define WT_STAT_CONN_LOG_SCAN_RECORDS 1178
+#define WT_STAT_CONN_LOG_SCAN_RECORDS 1176
/*! log: total in-memory size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_MEM 1179
+#define WT_STAT_CONN_LOG_COMPRESS_MEM 1177
/*! log: total log buffer size */
-#define WT_STAT_CONN_LOG_BUFFER_SIZE 1180
+#define WT_STAT_CONN_LOG_BUFFER_SIZE 1178
/*! log: total size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_LEN 1181
+#define WT_STAT_CONN_LOG_COMPRESS_LEN 1179
/*! log: written slots coalesced */
-#define WT_STAT_CONN_LOG_SLOT_COALESCED 1182
+#define WT_STAT_CONN_LOG_SLOT_COALESCED 1180
/*! log: yields waiting for previous log file close */
-#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1183
+#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1181
/*! reconciliation: fast-path pages deleted */
-#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1184
+#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1182
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_CONN_REC_PAGES 1185
+#define WT_STAT_CONN_REC_PAGES 1183
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_CONN_REC_PAGES_EVICTION 1186
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1184
/*! reconciliation: pages deleted */
-#define WT_STAT_CONN_REC_PAGE_DELETE 1187
+#define WT_STAT_CONN_REC_PAGE_DELETE 1185
/*! reconciliation: split bytes currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1188
+#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1186
/*! reconciliation: split objects currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1189
+#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1187
/*! session: open cursor count */
-#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1190
+#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1188
/*! session: open session count */
-#define WT_STAT_CONN_SESSION_OPEN 1191
+#define WT_STAT_CONN_SESSION_OPEN 1189
/*! session: table alter failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1192
+#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1190
/*! session: table alter successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1193
+#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1191
/*! session: table alter unchanged and skipped */
-#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1194
+#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1192
/*! session: table compact failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1195
+#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1193
/*! session: table compact successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1196
+#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1194
/*! session: table create failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1197
+#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1195
/*! session: table create successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1198
+#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1196
/*! session: table drop failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1199
+#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1197
/*! session: table drop successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1200
+#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1198
/*! session: table rebalance failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1201
+#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1199
/*! session: table rebalance successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1202
+#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1200
/*! session: table rename failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1203
+#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1201
/*! session: table rename successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1204
+#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1202
/*! session: table salvage failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1205
+#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1203
/*! session: table salvage successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1206
+#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1204
/*! session: table truncate failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1207
+#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1205
/*! session: table truncate successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1208
+#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1206
/*! session: table verify failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1209
+#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1207
/*! session: table verify successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1210
+#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1208
/*! thread-state: active filesystem fsync calls */
-#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1211
+#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1209
/*! thread-state: active filesystem read calls */
-#define WT_STAT_CONN_THREAD_READ_ACTIVE 1212
+#define WT_STAT_CONN_THREAD_READ_ACTIVE 1210
/*! thread-state: active filesystem write calls */
-#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1213
+#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1211
/*! thread-yield: application thread time evicting (usecs) */
-#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1214
+#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1212
/*! thread-yield: application thread time waiting for cache (usecs) */
-#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1215
+#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1213
/*! thread-yield: page acquire busy blocked */
-#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1216
+#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1214
/*! thread-yield: page acquire eviction blocked */
-#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1217
+#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1215
/*! thread-yield: page acquire locked blocked */
-#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1218
+#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1216
/*! thread-yield: page acquire read blocked */
-#define WT_STAT_CONN_PAGE_READ_BLOCKED 1219
+#define WT_STAT_CONN_PAGE_READ_BLOCKED 1217
/*! thread-yield: page acquire time sleeping (usecs) */
-#define WT_STAT_CONN_PAGE_SLEEP 1220
+#define WT_STAT_CONN_PAGE_SLEEP 1218
/*! transaction: number of named snapshots created */
-#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1221
+#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1219
/*! transaction: number of named snapshots dropped */
-#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1222
+#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1220
/*! transaction: transaction begins */
-#define WT_STAT_CONN_TXN_BEGIN 1223
+#define WT_STAT_CONN_TXN_BEGIN 1221
/*! transaction: transaction checkpoint currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1224
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1222
/*! transaction: transaction checkpoint generation */
-#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1225
+#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1223
/*! transaction: transaction checkpoint max time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1226
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1224
/*! transaction: transaction checkpoint min time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1227
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1225
/*! transaction: transaction checkpoint most recent time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1228
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1226
/*! transaction: transaction checkpoint scrub dirty target */
-#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1229
+#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1227
/*! transaction: transaction checkpoint scrub time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1230
+#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1228
/*! transaction: transaction checkpoint total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1231
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1229
/*! transaction: transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1232
+#define WT_STAT_CONN_TXN_CHECKPOINT 1230
/*!
* transaction: transaction checkpoints skipped because database was
* clean
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1233
+#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1231
/*! transaction: transaction failures due to cache overflow */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1234
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1232
/*!
* transaction: transaction fsync calls for checkpoint after allocating
* the transaction ID
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1235
+#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1233
/*!
* transaction: transaction fsync duration for checkpoint after
* allocating the transaction ID (usecs)
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1236
+#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1234
/*! transaction: transaction range of IDs currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_RANGE 1237
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1235
/*! transaction: transaction range of IDs currently pinned by a checkpoint */
-#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1238
+#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1236
/*!
* transaction: transaction range of IDs currently pinned by named
* snapshots
*/
-#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1239
+#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1237
/*! transaction: transaction sync calls */
-#define WT_STAT_CONN_TXN_SYNC 1240
+#define WT_STAT_CONN_TXN_SYNC 1238
/*! transaction: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1241
+#define WT_STAT_CONN_TXN_COMMIT 1239
/*! transaction: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1242
+#define WT_STAT_CONN_TXN_ROLLBACK 1240
/*!
* @}
diff --git a/src/log/log.c b/src/log/log.c
index da500a74e87..d6caa55f8c7 100644
--- a/src/log/log.c
+++ b/src/log/log.c
@@ -43,11 +43,11 @@ __log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
*/
if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
__wt_spin_unlock(session, &log->log_slot_lock);
- __wt_cond_auto_signal(session, conn->log_wrlsn_cond);
+ __wt_cond_signal(session, conn->log_wrlsn_cond);
if (++yield_count < WT_THOUSAND)
__wt_yield();
else
- __wt_cond_wait(session, log->log_write_cond, 200);
+ __wt_cond_wait(session, log->log_write_cond, 200, NULL);
if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
__wt_spin_lock(session, &log->log_slot_lock);
}
@@ -62,6 +62,8 @@ static int
__log_fs_write(WT_SESSION_IMPL *session,
WT_LOGSLOT *slot, wt_off_t offset, size_t len, const void *buf)
{
+ WT_DECL_RET;
+
/*
* If we're writing into a new log file, we have to wait for all
* writes to the previous log file to complete otherwise there could
@@ -71,7 +73,10 @@ __log_fs_write(WT_SESSION_IMPL *session,
__log_wait_for_earlier_slot(session, slot);
WT_RET(__wt_log_force_sync(session, &slot->slot_release_lsn));
}
- return (__wt_write(session, slot->slot_fh, offset, len, buf));
+ if ((ret = __wt_write(session, slot->slot_fh, offset, len, buf)) != 0)
+ WT_PANIC_MSG(session, ret,
+ "%s: fatal log failure", slot->slot_fh->name);
+ return (ret);
}
/*
@@ -89,7 +94,7 @@ __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn)
log = conn->log;
log->ckpt_lsn = *ckp_lsn;
if (conn->log_cond != NULL)
- __wt_cond_auto_signal(session, conn->log_cond);
+ __wt_cond_signal(session, conn->log_cond);
}
/*
@@ -170,7 +175,7 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn)
*/
while (log->sync_lsn.l.file < min_lsn->l.file) {
__wt_cond_signal(session, S2C(session)->log_file_cond);
- __wt_cond_wait(session, log->log_sync_cond, 10000);
+ __wt_cond_wait(session, log->log_sync_cond, 10000, NULL);
}
__wt_spin_lock(session, &log->log_sync_lock);
WT_ASSERT(session, log->log_dir_fh != NULL);
@@ -915,7 +920,7 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created)
else {
WT_STAT_CONN_INCR(session, log_prealloc_missed);
if (conn->log_cond != NULL)
- __wt_cond_auto_signal(
+ __wt_cond_signal(
session, conn->log_cond);
}
}
@@ -1490,7 +1495,8 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep)
*/
if (log->sync_lsn.l.file < slot->slot_end_lsn.l.file ||
__wt_spin_trylock(session, &log->log_sync_lock) != 0) {
- __wt_cond_wait(session, log->log_sync_cond, 10000);
+ __wt_cond_wait(
+ session, log->log_sync_cond, 10000, NULL);
continue;
}
locked = true;
@@ -2126,7 +2132,11 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
WT_STAT_CONN_INCR(session, log_writes);
- __wt_log_slot_join(session, rdup_len, flags, &myslot);
+ /*
+ * The only time joining a slot should ever return an error is if it
+ * detects a panic.
+ */
+ WT_ERR(__wt_log_slot_join(session, rdup_len, flags, &myslot));
/*
* If the addition of this record crosses the buffer boundary,
* switch in a new slot.
@@ -2160,7 +2170,7 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
* XXX I've seen times when conditions are NULL.
*/
if (conn->log_cond != NULL) {
- __wt_cond_auto_signal(session, conn->log_cond);
+ __wt_cond_signal(session, conn->log_cond);
__wt_yield();
} else
WT_ERR(__wt_log_force_write(session, 1, NULL));
@@ -2169,12 +2179,14 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
/* Wait for our writes to reach the OS */
while (__wt_log_cmp(&log->write_lsn, &lsn) <= 0 &&
myslot.slot->slot_error == 0)
- __wt_cond_wait(session, log->log_write_cond, 10000);
+ __wt_cond_wait(
+ session, log->log_write_cond, 10000, NULL);
} else if (LF_ISSET(WT_LOG_FSYNC)) {
/* Wait for our writes to reach disk */
while (__wt_log_cmp(&log->sync_lsn, &lsn) <= 0 &&
myslot.slot->slot_error == 0)
- __wt_cond_wait(session, log->log_sync_cond, 10000);
+ __wt_cond_wait(
+ session, log->log_sync_cond, 10000, NULL);
}
/*
@@ -2199,12 +2211,12 @@ err:
/*
* If one of the sync flags is set, assert the proper LSN has moved to
- * match.
+ * match on success.
*/
- WT_ASSERT(session, !LF_ISSET(WT_LOG_FLUSH) ||
+ WT_ASSERT(session, ret != 0 || !LF_ISSET(WT_LOG_FLUSH) ||
__wt_log_cmp(&log->write_lsn, &lsn) >= 0);
- WT_ASSERT(session,
- !LF_ISSET(WT_LOG_FSYNC) || __wt_log_cmp(&log->sync_lsn, &lsn) >= 0);
+ WT_ASSERT(session, ret != 0 || !LF_ISSET(WT_LOG_FSYNC) ||
+ __wt_log_cmp(&log->sync_lsn, &lsn) >= 0);
return (ret);
}
diff --git a/src/log/log_slot.c b/src/log/log_slot.c
index a29a34e5652..542f010ea53 100644
--- a/src/log/log_slot.c
+++ b/src/log/log_slot.c
@@ -8,6 +8,49 @@
#include "wt_internal.h"
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __log_slot_dump --
+ * Dump the entire slot state.
+ *
+ * Compiled only when HAVE_DIAGNOSTIC is defined. Walks every entry of
+ * log->slot_pool (WT_SLOT_POOL entries) and reports through __wt_errx:
+ * the slot's state and flags in hex, its start, end and release LSNs as
+ * file/offset pairs, its start and last offsets, and its unbuffered size
+ * and error value. While walking it remembers the index of the slot whose
+ * release LSN compares lowest under __wt_log_cmp and reports that index
+ * last. The slots are only read, and the function returns nothing.
+ */
+static void
+__log_slot_dump(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ WT_LOGSLOT *slot;
+ int earliest, i;
+
+ conn = S2C(session);
+ log = conn->log;
+ earliest = 0; /* Start by assuming slot 0 has the lowest release LSN. */
+ for (i = 0; i < WT_SLOT_POOL; i++) {
+ slot = &log->slot_pool[i];
+ if (__wt_log_cmp(&slot->slot_release_lsn,
+ &log->slot_pool[earliest].slot_release_lsn) < 0)
+ earliest = i; /* Strictly lower release LSN: new candidate. */
+ __wt_errx(session, "Slot %d:", i);
+ __wt_errx(session, " State: %" PRIx64 " Flags: %" PRIx32,
+ slot->slot_state, slot->flags);
+ __wt_errx(session, " Start LSN: %" PRIu32 "/%" PRIu32,
+ slot->slot_start_lsn.l.file, slot->slot_start_lsn.l.offset);
+ __wt_errx(session, " End LSN: %" PRIu32 "/%" PRIu32,
+ slot->slot_end_lsn.l.file, slot->slot_end_lsn.l.offset);
+ __wt_errx(session, " Release LSN: %" PRIu32 "/%" PRIu32,
+ slot->slot_release_lsn.l.file,
+ slot->slot_release_lsn.l.offset);
+ __wt_errx(session, " Offset: start: %" PRIuMAX
+ " last:%" PRIuMAX, (uintmax_t)slot->slot_start_offset,
+ (uintmax_t)slot->slot_last_offset);
+ __wt_errx(session, " Unbuffered: %" PRId64
+ " error: %" PRId32, slot->slot_unbuffered,
+ slot->slot_error);
+ }
+ __wt_errx(session, "Earliest slot: %d", earliest); /* Index into slot_pool. */
+
+}
+#endif
+
/*
* __wt_log_slot_activate --
* Initialize a slot to become active.
@@ -21,7 +64,6 @@ __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
conn = S2C(session);
log = conn->log;
- slot->slot_state = 0;
/*
* !!! slot_release_lsn must be set outside this function because
* this function may be called after a log file switch and the
@@ -30,12 +72,19 @@ __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
* set for closing the file handle on a log file switch. The flags
* are reset when the slot is freed. See log_slot_free.
*/
+ slot->slot_unbuffered = 0;
slot->slot_start_lsn = slot->slot_end_lsn = log->alloc_lsn;
slot->slot_start_offset = log->alloc_lsn.l.offset;
slot->slot_last_offset = log->alloc_lsn.l.offset;
slot->slot_fh = log->log_fh;
slot->slot_error = 0;
- slot->slot_unbuffered = 0;
+ WT_DIAGNOSTIC_YIELD;
+ /*
+ * Set the slot state last. Other threads may have a stale pointer
+ * to this slot and could try to alter the state and other fields once
+ * they see the state cleared.
+ */
+ WT_PUBLISH(slot->slot_state, 0);
}
/*
@@ -50,6 +99,10 @@ __log_slot_close(
WT_CONNECTION_IMPL *conn;
WT_LOG *log;
int64_t end_offset, new_state, old_state;
+#ifdef HAVE_DIAGNOSTIC
+ struct timespec begin, now;
+ int count;
+#endif
WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
WT_ASSERT(session, releasep != NULL);
@@ -101,9 +154,33 @@ retry:
* that value. If the state is unbuffered, wait for the unbuffered
* size to be set.
*/
- while (WT_LOG_SLOT_UNBUFFERED_ISSET(old_state) &&
- slot->slot_unbuffered == 0)
- __wt_yield();
+#ifdef HAVE_DIAGNOSTIC
+ count = 0;
+ __wt_epoch(session, &begin);
+#endif
+ if (WT_LOG_SLOT_UNBUFFERED_ISSET(old_state)) {
+ while (slot->slot_unbuffered == 0) {
+ WT_RET(WT_SESSION_CHECK_PANIC(session));
+ __wt_yield();
+#ifdef HAVE_DIAGNOSTIC
+ ++count;
+ if (count > WT_MILLION) {
+ __wt_epoch(session, &now);
+ if (WT_TIMEDIFF_SEC(now, begin) > 10) {
+ __wt_errx(session, "SLOT_CLOSE: Slot %"
+ PRIu32 " Timeout unbuffered, state 0x%"
+ PRIx64 " unbuffered %" PRIu64,
+ (uint32_t)(slot - &log->slot_pool[0]),
+ slot->slot_state,
+ slot->slot_unbuffered);
+ __log_slot_dump(session);
+ __wt_abort(session);
+ }
+ count = 0;
+ }
+#endif
+ }
+ }
end_offset =
WT_LOG_SLOT_JOINED_BUFFERED(old_state) + slot->slot_unbuffered;
@@ -218,7 +295,11 @@ __wt_log_slot_new(WT_SESSION_IMPL *session)
WT_CONNECTION_IMPL *conn;
WT_LOG *log;
WT_LOGSLOT *slot;
- int32_t i;
+ int32_t i, pool_i;
+#ifdef HAVE_DIAGNOSTIC
+ struct timespec begin, now;
+ int count;
+#endif
WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
conn = S2C(session);
@@ -232,16 +313,22 @@ __wt_log_slot_new(WT_SESSION_IMPL *session)
WT_LOG_SLOT_OPEN(slot->slot_state))
return (0);
+#ifdef HAVE_DIAGNOSTIC
+ count = 0;
+ __wt_epoch(session, &begin);
+#endif
/*
* Keep trying until we can find a free slot.
*/
for (;;) {
/*
- * For now just restart at 0. We could use log->pool_index
- * if that is inefficient.
+ * Rotate among the slots to lessen collisions.
*/
- for (i = 0; i < WT_SLOT_POOL; i++) {
- slot = &log->slot_pool[i];
+ for (i = 0, pool_i = log->pool_index; i < WT_SLOT_POOL;
+ i++, pool_i++) {
+ if (pool_i >= WT_SLOT_POOL)
+ pool_i = 0;
+ slot = &log->slot_pool[pool_i];
if (slot->slot_state == WT_LOG_SLOT_FREE) {
/*
* Acquire our starting position in the
@@ -256,14 +343,28 @@ __wt_log_slot_new(WT_SESSION_IMPL *session)
WT_STAT_CONN_INCR(session,
log_slot_transitions);
log->active_slot = slot;
+ log->pool_index = pool_i;
return (0);
}
}
/*
* If we didn't find any free slots signal the worker thread.
*/
- __wt_cond_auto_signal(session, conn->log_wrlsn_cond);
+ __wt_cond_signal(session, conn->log_wrlsn_cond);
__wt_yield();
+#ifdef HAVE_DIAGNOSTIC
+ ++count;
+ if (count > WT_MILLION) {
+ __wt_epoch(session, &now);
+ if (WT_TIMEDIFF_SEC(now, begin) > 10) {
+ __wt_errx(session,
+ "SLOT_NEW: Timeout free slot");
+ __log_slot_dump(session);
+ __wt_abort(session);
+ }
+ count = 0;
+ }
+#endif
}
/* NOTREACHED */
}
@@ -311,10 +412,13 @@ __wt_log_slot_init(WT_SESSION_IMPL *session)
/*
* We cannot initialize the release LSN in the activate function
* because that function can be called after a log file switch.
+ * The release LSN is usually the same as the slot_start_lsn except
+ * around a log file switch.
*/
slot->slot_release_lsn = log->alloc_lsn;
__wt_log_slot_activate(session, slot);
log->active_slot = slot;
+ log->pool_index = 0;
if (0) {
err: while (--i >= 0)
@@ -361,7 +465,7 @@ __wt_log_slot_destroy(WT_SESSION_IMPL *session)
* __wt_log_slot_join --
* Join a consolidated logging slot.
*/
-void
+int
__wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
uint32_t flags, WT_MYSLOT *myslot)
{
@@ -370,53 +474,63 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
WT_LOGSLOT *slot;
int64_t flag_state, new_state, old_state, released;
int32_t join_offset, new_join;
-#ifdef HAVE_DIAGNOSTIC
- bool unbuf_force;
-#endif
+ bool unbuffered, yld;
conn = S2C(session);
log = conn->log;
WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT));
+ WT_ASSERT(session, mysize != 0);
/*
* There should almost always be a slot open.
*/
+ unbuffered = false;
#ifdef HAVE_DIAGNOSTIC
- unbuf_force = (++log->write_calls % WT_THOUSAND) == 0;
+ yld = (++log->write_calls % 7) == 0;
+ if ((log->write_calls % WT_THOUSAND) == 0 ||
+ mysize > WT_LOG_SLOT_BUF_MAX) {
+#else
+ yld = false;
+ if (mysize > WT_LOG_SLOT_BUF_MAX) {
#endif
+ unbuffered = true;
+ F_SET(myslot, WT_MYSLOT_UNBUFFERED);
+ }
for (;;) {
WT_BARRIER();
+ WT_RET(WT_SESSION_CHECK_PANIC(session));
slot = log->active_slot;
old_state = slot->slot_state;
- /*
- * Try to join our size into the existing size and
- * atomically write it back into the state.
- */
- flag_state = WT_LOG_SLOT_FLAGS(old_state);
- released = WT_LOG_SLOT_RELEASED(old_state);
- join_offset = WT_LOG_SLOT_JOINED(old_state);
-#ifdef HAVE_DIAGNOSTIC
- if (unbuf_force || mysize > WT_LOG_SLOT_BUF_MAX) {
-#else
- if (mysize > WT_LOG_SLOT_BUF_MAX) {
-#endif
- new_join = join_offset + WT_LOG_SLOT_UNBUFFERED;
- F_SET(myslot, WT_MYSLOT_UNBUFFERED);
- myslot->slot = slot;
- } else
- new_join = join_offset + (int32_t)mysize;
- new_state = (int64_t)WT_LOG_SLOT_JOIN_REL(
- (int64_t)new_join, (int64_t)released, (int64_t)flag_state);
-
- /*
- * Check if the slot is open for joining and we are able to
- * swap in our size into the state.
- */
- if (WT_LOG_SLOT_OPEN(old_state) &&
- __wt_atomic_casiv64(
- &slot->slot_state, old_state, new_state))
- break;
+ if (WT_LOG_SLOT_OPEN(old_state)) {
+ /*
+ * Try to join our size into the existing size and
+ * atomically write it back into the state.
+ */
+ flag_state = WT_LOG_SLOT_FLAGS(old_state);
+ released = WT_LOG_SLOT_RELEASED(old_state);
+ join_offset = WT_LOG_SLOT_JOINED(old_state);
+ if (unbuffered)
+ new_join = join_offset + WT_LOG_SLOT_UNBUFFERED;
+ else
+ new_join = join_offset + (int32_t)mysize;
+ new_state = (int64_t)WT_LOG_SLOT_JOIN_REL(
+ (int64_t)new_join, (int64_t)released,
+ (int64_t)flag_state);
+
+ /*
+ * Braces used due to potential empty body warning.
+ */
+ if (yld) {
+ WT_DIAGNOSTIC_YIELD;
+ }
+ /*
+ * Attempt to swap our size into the state.
+ */
+ if (__wt_atomic_casiv64(
+ &slot->slot_state, old_state, new_state))
+ break;
+ }
/*
* The slot is no longer open or we lost the race to
* update it. Yield and try again.
@@ -428,8 +542,7 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
* We joined this slot. Fill in our information to return to
* the caller.
*/
- if (mysize != 0)
- WT_STAT_CONN_INCR(session, log_slot_joins);
+ WT_STAT_CONN_INCR(session, log_slot_joins);
if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC))
F_SET(slot, WT_SLOT_SYNC_DIR);
if (LF_ISSET(WT_LOG_FLUSH))
@@ -444,6 +557,7 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
myslot->slot = slot;
myslot->offset = join_offset;
myslot->end_offset = (wt_off_t)((uint64_t)join_offset + mysize);
+ return (0);
}
/*
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
index a2511f48e2b..60afbc99ade 100644
--- a/src/lsm/lsm_cursor.c
+++ b/src/lsm/lsm_cursor.c
@@ -1692,8 +1692,8 @@ __wt_clsm_open(WT_SESSION_IMPL *session,
bulk = cval.val != 0;
/* Get the LSM tree. */
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_lsm_tree_get(session, uri, bulk, &lsm_tree));
+ ret = __wt_lsm_tree_get(session, uri, bulk, &lsm_tree);
+
/*
* Check whether the exclusive open for a bulk load succeeded, and
* if it did ensure that it's safe to bulk load into the tree.
diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c
index cbd83a5cd30..6dc06146179 100644
--- a/src/lsm/lsm_manager.c
+++ b/src/lsm/lsm_manager.c
@@ -387,8 +387,8 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session)
__wt_sleep(0, 10000);
if (TAILQ_EMPTY(&conn->lsmqh))
continue;
- __wt_spin_lock(session, &conn->dhandle_lock);
- F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST);
+ __wt_readlock(session, &conn->dhandle_lock);
+ F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ);
dhandle_locked = true;
TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) {
if (!lsm_tree->active)
@@ -448,14 +448,14 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session)
session, WT_LSM_WORK_MERGE, 0, lsm_tree));
}
}
- __wt_spin_unlock(session, &conn->dhandle_lock);
- F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST);
+ __wt_readunlock(session, &conn->dhandle_lock);
+ F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ);
dhandle_locked = false;
}
err: if (dhandle_locked) {
- __wt_spin_unlock(session, &conn->dhandle_lock);
- F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST);
+ __wt_readunlock(session, &conn->dhandle_lock);
+ F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ);
}
return (ret);
}
diff --git a/src/lsm/lsm_stat.c b/src/lsm/lsm_stat.c
index 150de968722..21e8991be94 100644
--- a/src/lsm/lsm_stat.c
+++ b/src/lsm/lsm_stat.c
@@ -33,9 +33,7 @@ __curstat_lsm_init(
"checkpoint=" WT_CHECKPOINT, NULL, NULL };
locked = false;
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree));
- WT_RET(ret);
+ WT_RET(__wt_lsm_tree_get(session, uri, false, &lsm_tree));
WT_ERR(__wt_scr_alloc(session, 0, &uribuf));
/* Propagate all, fast and/or clear to the cursors we open. */
diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c
index 71a981a6284..a9275976023 100644
--- a/src/lsm/lsm_tree.c
+++ b/src/lsm/lsm_tree.c
@@ -38,7 +38,7 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final)
/* We may be destroying an lsm_tree before it was added. */
if (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN)) {
WT_ASSERT(session, final ||
- F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
+ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE));
TAILQ_REMOVE(&S2C(session)->lsmqh, lsm_tree, q);
}
@@ -321,9 +321,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session,
metadata = NULL;
/* If the tree can be opened, it already exists. */
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree));
- if (ret == 0) {
+ if ((ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)) == 0) {
__wt_lsm_tree_release(session, lsm_tree);
return (exclusive ? EEXIST : 0);
}
@@ -339,7 +337,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session,
* error: the returned handle is NULL on error, and the metadata
* tracking macros handle cleaning up on failure.
*/
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
ret = __lsm_tree_open(session, uri, true, &lsm_tree));
if (ret == 0)
__wt_lsm_tree_release(session, lsm_tree);
@@ -404,6 +402,9 @@ __lsm_tree_find(WT_SESSION_IMPL *session,
}
*treep = lsm_tree;
+
+ WT_ASSERT(session, lsm_tree->excl_session ==
+ (exclusive ? session : NULL));
return (0);
}
@@ -456,7 +457,8 @@ __lsm_tree_open(WT_SESSION_IMPL *session,
conn = S2C(session);
lsm_tree = NULL;
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
+ WT_ASSERT(session,
+ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE));
/* Start the LSM manager thread if it isn't running. */
if (__wt_atomic_cas32(&conn->lsm_manager.lsm_workers, 0, 1))
@@ -520,14 +522,21 @@ __wt_lsm_tree_get(WT_SESSION_IMPL *session,
{
WT_DECL_RET;
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
-
- ret = __lsm_tree_find(session, uri, exclusive, treep);
+ /*
+ * Dropping and re-acquiring the lock is safe here, since the tree open
+ * call checks to see if another thread beat it to opening the tree
+ * before proceeding.
+ */
+ if (exclusive)
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
+ ret = __lsm_tree_find(session, uri, exclusive, treep));
+ else
+ WT_WITH_HANDLE_LIST_READ_LOCK(session,
+ ret = __lsm_tree_find(session, uri, exclusive, treep));
if (ret == WT_NOTFOUND)
- ret = __lsm_tree_open(session, uri, exclusive, treep);
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
+ ret = __lsm_tree_open(session, uri, exclusive, treep));
- WT_ASSERT(session, ret != 0 ||
- (*treep)->excl_session == (exclusive ? session : NULL));
return (ret);
}
@@ -857,9 +866,7 @@ __wt_lsm_tree_alter(
locked = false;
/* Get the LSM tree. */
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree));
- WT_RET(ret);
+ WT_RET(__wt_lsm_tree_get(session, uri, false, &lsm_tree));
/* Prevent any new opens. */
__wt_lsm_tree_writelock(session, lsm_tree);
@@ -899,9 +906,7 @@ __wt_lsm_tree_drop(
locked = false;
/* Get the LSM tree. */
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_lsm_tree_get(session, name, true, &lsm_tree));
- WT_RET(ret);
+ WT_RET(__wt_lsm_tree_get(session, name, true, &lsm_tree));
WT_ASSERT(session, !lsm_tree->active);
/* Prevent any new opens. */
@@ -934,7 +939,7 @@ __wt_lsm_tree_drop(
WT_ASSERT(session, !lsm_tree->active);
err: if (locked)
__wt_lsm_tree_writeunlock(session, lsm_tree);
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
tret = __lsm_tree_discard(session, lsm_tree, false));
WT_TRET(tret);
return (ret);
@@ -960,9 +965,7 @@ __wt_lsm_tree_rename(WT_SESSION_IMPL *session,
locked = false;
/* Get the LSM tree. */
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_lsm_tree_get(session, olduri, true, &lsm_tree));
- WT_RET(ret);
+ WT_RET(__wt_lsm_tree_get(session, olduri, true, &lsm_tree));
/* Prevent any new opens. */
__wt_lsm_tree_writelock(session, lsm_tree);
@@ -1007,7 +1010,7 @@ err: if (locked)
* Discard this LSM tree structure. The first operation on the renamed
* tree will create a new one.
*/
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
tret = __lsm_tree_discard(session, lsm_tree, false));
WT_TRET(tret);
return (ret);
@@ -1032,9 +1035,7 @@ __wt_lsm_tree_truncate(
locked = false;
/* Get the LSM tree. */
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_lsm_tree_get(session, name, true, &lsm_tree));
- WT_RET(ret);
+ WT_RET(__wt_lsm_tree_get(session, name, true, &lsm_tree));
/* Prevent any new opens. */
__wt_lsm_tree_writelock(session, lsm_tree);
@@ -1068,7 +1069,7 @@ err: if (locked)
* the last good version of the metadata will be used, resulting
* in a valid (not truncated) tree.
*/
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
tret = __lsm_tree_discard(session, lsm_tree, false));
WT_TRET(tret);
}
@@ -1157,9 +1158,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp)
/* Tell __wt_schema_worker not to look inside the LSM tree. */
*skipp = true;
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_lsm_tree_get(session, name, false, &lsm_tree));
- WT_RET(ret);
+ WT_RET(__wt_lsm_tree_get(session, name, false, &lsm_tree));
if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
WT_ERR_MSG(session, EINVAL,
@@ -1356,9 +1355,7 @@ __wt_lsm_tree_worker(WT_SESSION_IMPL *session,
locked = false;
exclusive = FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE);
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_lsm_tree_get(session, uri, exclusive, &lsm_tree));
- WT_RET(ret);
+ WT_RET(__wt_lsm_tree_get(session, uri, exclusive, &lsm_tree));
/*
* We mark that we're busy using the tree to coordinate
diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c
index d9c185a3f58..4349acf7b55 100644
--- a/src/lsm/lsm_work_unit.c
+++ b/src/lsm/lsm_work_unit.c
@@ -276,7 +276,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
!F_ISSET(chunk, WT_LSM_CHUNK_STABLE) &&
!chunk->evicted) {
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
ret = __lsm_discard_handle(session, chunk->uri, NULL));
if (ret == 0)
chunk->evicted = 1;
@@ -517,7 +517,7 @@ __lsm_drop_file(WT_SESSION_IMPL *session, const char *uri)
*
* This will fail with EBUSY if the file is still in use.
*/
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
ret = __lsm_discard_handle(session, uri, WT_CHECKPOINT));
WT_RET(ret);
diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c
index b0d0758775d..ffa00c0a5e7 100644
--- a/src/lsm/lsm_worker.c
+++ b/src/lsm/lsm_worker.c
@@ -154,7 +154,7 @@ __lsm_worker(void *arg)
/* Don't busy wait if there was any work to do. */
if (!progress) {
- __wt_cond_wait(session, cookie->work_cond, 10000);
+ __wt_cond_wait(session, cookie->work_cond, 10000, NULL);
continue;
}
}
diff --git a/src/os_posix/os_mtx_cond.c b/src/os_posix/os_mtx_cond.c
index be8b1abda31..a5ee78f9e3e 100644
--- a/src/os_posix/os_mtx_cond.c
+++ b/src/os_posix/os_mtx_cond.c
@@ -13,8 +13,7 @@
* Allocate and initialize a condition variable.
*/
int
-__wt_cond_alloc(WT_SESSION_IMPL *session,
- const char *name, bool is_signalled, WT_CONDVAR **condp)
+__wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp)
{
WT_CONDVAR *cond;
WT_DECL_RET;
@@ -27,7 +26,7 @@ __wt_cond_alloc(WT_SESSION_IMPL *session,
WT_ERR(pthread_cond_init(&cond->cond, NULL));
cond->name = name;
- cond->waiters = is_signalled ? -1 : 0;
+ cond->waiters = 0;
*condp = cond;
return (0);
@@ -42,8 +41,8 @@ err: __wt_free(session, cond);
* out period expires, let the caller know.
*/
void
-__wt_cond_wait_signal(
- WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled)
+__wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond,
+ uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled)
{
struct timespec ts;
WT_DECL_RET;
@@ -62,6 +61,23 @@ __wt_cond_wait_signal(
WT_ERR(pthread_mutex_lock(&cond->mtx));
locked = true;
+ /*
+ * It's possible to race with threads waking us up. That's not a problem
+ * if there are multiple wakeups because the next wakeup will get us, or
+ * if we're only pausing for a short period. It's a problem if there's
+ * only a single wakeup, as our waker is likely waiting for us to exit.
+ * After acquiring the mutex (so we're guaranteed to be awakened by any
+ * future wakeup call), optionally check if we're OK to keep running.
+ * This won't ensure our caller won't just loop and call us again, but
+ * at least it's not our fault.
+ *
+ * Assert we're not waiting longer than a second if not checking the
+ * run status.
+ */
+ WT_ASSERT(session, run_func != NULL || usecs <= WT_MILLION);
+ if (run_func != NULL && !run_func(session))
+ goto skipping;
+
if (usecs > 0) {
__wt_epoch(session, &ts);
ts.tv_sec += (time_t)
@@ -81,7 +97,7 @@ __wt_cond_wait_signal(
ret == ETIME ||
#endif
ret == ETIMEDOUT) {
- *signalled = false;
+skipping: *signalled = false;
ret = 0;
}
diff --git a/src/os_win/os_mtx_cond.c b/src/os_win/os_mtx_cond.c
index 79c62ccd7f2..0001c6c2322 100644
--- a/src/os_win/os_mtx_cond.c
+++ b/src/os_win/os_mtx_cond.c
@@ -13,8 +13,7 @@
* Allocate and initialize a condition variable.
*/
int
-__wt_cond_alloc(WT_SESSION_IMPL *session,
- const char *name, bool is_signalled, WT_CONDVAR **condp)
+__wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp)
{
WT_CONDVAR *cond;
@@ -26,7 +25,7 @@ __wt_cond_alloc(WT_SESSION_IMPL *session,
InitializeConditionVariable(&cond->cond);
cond->name = name;
- cond->waiters = is_signalled ? -1 : 0;
+ cond->waiters = 0;
*condp = cond;
return (0);
@@ -38,8 +37,8 @@ __wt_cond_alloc(WT_SESSION_IMPL *session,
* out period expires, let the caller know.
*/
void
-__wt_cond_wait_signal(
- WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled)
+__wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond,
+ uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled)
{
BOOL sleepret;
DWORD milliseconds, windows_error;
@@ -59,8 +58,26 @@ __wt_cond_wait_signal(
EnterCriticalSection(&cond->mtx);
locked = true;
+ /*
+ * It's possible to race with threads waking us up. That's not a problem
+ * if there are multiple wakeups because the next wakeup will get us, or
+ * if we're only pausing for a short period. It's a problem if there's
+ * only a single wakeup, as our waker is likely waiting for us to exit.
+ * After acquiring the mutex (so we're guaranteed to be awakened by any
+ * future wakeup call), optionally check if we're OK to keep running.
+ * This won't ensure our caller won't just loop and call us again, but
+ * at least it's not our fault.
+ *
+ * Assert we're not waiting longer than a second if not checking the
+ * run status.
+ */
+ WT_ASSERT(session, run_func != NULL || usecs <= WT_MILLION);
+
+ if (run_func != NULL && !run_func(session))
+ goto skipping;
+
if (usecs > 0) {
- milliseconds64 = usecs / 1000;
+ milliseconds64 = usecs / WT_THOUSAND;
/*
* Check for 32-bit unsigned integer overflow
@@ -90,7 +107,7 @@ __wt_cond_wait_signal(
if (sleepret == 0) {
windows_error = __wt_getlasterror();
if (windows_error == ERROR_TIMEOUT) {
- *signalled = false;
+skipping: *signalled = false;
sleepret = 1;
}
}
@@ -117,17 +134,17 @@ void
__wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
{
WT_DECL_RET;
- bool locked;
-
- locked = false;
__wt_verbose(session, WT_VERB_MUTEX, "signal %s", cond->name);
/*
- * Our callers are often setting flags to cause a thread to exit. Add
- * a barrier to ensure the flags are seen by the threads.
+ * Our callers often set flags to cause a thread to exit. Add a barrier
+ * to ensure exit flags are seen by the sleeping threads, otherwise we
+ * can wake up a thread, it immediately goes back to sleep, and we'll
+ * hang. Use a full barrier (we may not write before waiting on thread
+ * join).
*/
- WT_WRITE_BARRIER();
+ WT_FULL_BARRIER();
/*
* Fast path if we are in (or can enter), a state where the next waiter
diff --git a/src/schema/schema_drop.c b/src/schema/schema_drop.c
index c1a4f257648..49801e4e5f9 100644
--- a/src/schema/schema_drop.c
+++ b/src/schema/schema_drop.c
@@ -30,7 +30,7 @@ __drop_file(
WT_RET(__wt_schema_backup_check(session, filename));
/* Close all btree handles associated with this file. */
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
ret = __wt_conn_dhandle_close_all(session, uri, force));
WT_RET(ret);
diff --git a/src/schema/schema_list.c b/src/schema/schema_list.c
index ea7374b7554..74ef5135a4a 100644
--- a/src/schema/schema_list.c
+++ b/src/schema/schema_list.c
@@ -25,7 +25,7 @@ __schema_add_table(WT_SESSION_IMPL *session,
/* Make sure the metadata is open before getting other locks. */
WT_RET(__wt_metadata_cursor(session, NULL));
- WT_WITH_TABLE_LOCK(session,
+ WT_WITH_TABLE_READ_LOCK(session,
ret = __wt_schema_open_table(
session, name, namelen, ok_incomplete, &table));
WT_RET(ret);
diff --git a/src/schema/schema_rename.c b/src/schema/schema_rename.c
index f512482c162..a374f4c2831 100644
--- a/src/schema/schema_rename.c
+++ b/src/schema/schema_rename.c
@@ -33,7 +33,7 @@ __rename_file(
WT_RET(__wt_schema_backup_check(session, filename));
WT_RET(__wt_schema_backup_check(session, newfile));
/* Close any btree handles in the file. */
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
ret = __wt_conn_dhandle_close_all(session, uri, false));
WT_ERR(ret);
diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c
index fb7f8cec074..e5f71b5d56f 100644
--- a/src/schema/schema_worker.c
+++ b/src/schema/schema_worker.c
@@ -49,7 +49,7 @@ __wt_schema_worker(WT_SESSION_IMPL *session,
* any open file handles, including checkpoints.
*/
if (FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE)) {
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
ret = __wt_conn_dhandle_close_all(
session, uri, false));
WT_ERR(ret);
diff --git a/src/session/session_api.c b/src/session/session_api.c
index fcbfa8809b3..d282c5d0c32 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -162,7 +162,7 @@ __session_alter(WT_SESSION *wt_session, const char *uri, const char *config)
cfg[1] = NULL;
WT_WITH_CHECKPOINT_LOCK(session,
WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session,
+ WT_WITH_TABLE_WRITE_LOCK(session,
ret = __wt_schema_alter(session, uri, cfg))));
err: if (ret != 0)
@@ -234,9 +234,6 @@ __session_close(WT_SESSION *wt_session, const char *config)
/* Release common session resources. */
WT_TRET(__wt_session_release_resources(session));
- /* Destroy the thread's mutex. */
- WT_TRET(__wt_cond_destroy(session, &session->cond));
-
/* The API lock protects opening and closing of sessions. */
__wt_spin_lock(session, &conn->api_lock);
@@ -521,7 +518,7 @@ __wt_session_create(
WT_DECL_RET;
WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session,
+ WT_WITH_TABLE_WRITE_LOCK(session,
ret = __wt_schema_create(session, uri, config)));
return (ret);
}
@@ -769,7 +766,7 @@ __session_rename(WT_SESSION *wt_session,
WT_WITH_CHECKPOINT_LOCK(session,
WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session,
+ WT_WITH_TABLE_WRITE_LOCK(session,
ret = __wt_schema_rename(session, uri, newuri, cfg))));
err: if (ret != 0)
@@ -858,21 +855,22 @@ __session_drop(WT_SESSION *wt_session, const char *uri, const char *config)
if (lock_wait)
WT_WITH_CHECKPOINT_LOCK(session,
WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session, ret =
+ WT_WITH_TABLE_WRITE_LOCK(session, ret =
__wt_schema_drop(session, uri, cfg))));
else
WT_WITH_CHECKPOINT_LOCK_NOWAIT(session, ret,
WT_WITH_SCHEMA_LOCK_NOWAIT(session, ret,
- WT_WITH_TABLE_LOCK_NOWAIT(session, ret, ret =
+ WT_WITH_TABLE_WRITE_LOCK_NOWAIT(session, ret,
+ ret =
__wt_schema_drop(session, uri, cfg))));
} else {
if (lock_wait)
WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session,
+ WT_WITH_TABLE_WRITE_LOCK(session,
ret = __wt_schema_drop(session, uri, cfg)));
else
WT_WITH_SCHEMA_LOCK_NOWAIT(session, ret,
- WT_WITH_TABLE_LOCK_NOWAIT(session, ret,
+ WT_WITH_TABLE_WRITE_LOCK_NOWAIT(session, ret,
ret = __wt_schema_drop(session, uri, cfg)));
}
@@ -1489,6 +1487,20 @@ err: API_END_RET(session, ret);
}
/*
+ * __transaction_sync_run_chk --
+ * Check to decide if the transaction sync call should continue running.
+ * Returns the result of testing WT_CONN_LOG_SERVER_RUN in the flags of
+ * the session's connection. __session_transaction_sync calls this at
+ * the top of its wait loop (giving up with ETIMEDOUT when it is false)
+ * and also passes it to __wt_cond_wait as the run-check callback.
+ */
+static bool
+__transaction_sync_run_chk(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ return (FLD_ISSET(conn->flags, WT_CONN_LOG_SERVER_RUN));
+}
+
+/*
* __session_transaction_sync --
* WT_SESSION->transaction_sync method.
*/
@@ -1502,7 +1514,7 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config)
WT_SESSION_IMPL *session;
WT_TXN *txn;
struct timespec now, start;
- uint64_t timeout_ms, waited_ms;
+ uint64_t remaining_usec, timeout_ms, waited_ms;
bool forever;
session = (WT_SESSION_IMPL *)wt_session;
@@ -1555,22 +1567,20 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config)
__wt_epoch(session, &start);
/*
* Keep checking the LSNs until we find it is stable or we reach
- * our timeout.
+ * our timeout, or there's some other reason to quit.
*/
while (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) > 0) {
+ if (!__transaction_sync_run_chk(session))
+ WT_ERR(ETIMEDOUT);
+
__wt_cond_signal(session, conn->log_file_cond);
__wt_epoch(session, &now);
waited_ms = WT_TIMEDIFF_MS(now, start);
- if (forever || waited_ms < timeout_ms)
- /*
- * Note, we will wait an increasing amount of time
- * each iteration, likely doubling. Also note that
- * the function timeout value is in usecs (we are
- * computing the wait time in msecs and passing that
- * in, unchanged, as the usecs to wait).
- */
- __wt_cond_wait(session, log->log_sync_cond, waited_ms);
- else
+ if (forever || waited_ms < timeout_ms) {
+ remaining_usec = (timeout_ms - waited_ms) * WT_THOUSAND;
+ __wt_cond_wait(session, log->log_sync_cond,
+ remaining_usec, __transaction_sync_run_chk);
+ } else
WT_ERR(ETIMEDOUT);
}
@@ -1825,8 +1835,6 @@ __open_session(WT_CONNECTION_IMPL *conn,
session_ret->name = NULL;
session_ret->id = i;
- WT_ERR(__wt_cond_alloc(session, "session", false, &session_ret->cond));
-
if (WT_SESSION_FIRST_USE(session_ret))
__wt_random_init(&session_ret->rnd);
diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c
index f1251794b89..ee9bddbfc19 100644
--- a/src/session/session_dhandle.c
+++ b/src/session/session_dhandle.c
@@ -44,8 +44,7 @@ __session_discard_dhandle(
TAILQ_REMOVE(&session->dhandles, dhandle_cache, q);
TAILQ_REMOVE(&session->dhhash[bucket], dhandle_cache, hashq);
- (void)__wt_atomic_sub32(&dhandle_cache->dhandle->session_ref, 1);
-
+ WT_DHANDLE_RELEASE(dhandle_cache->dhandle);
__wt_overwrite_and_free(session, dhandle_cache);
}
@@ -412,17 +411,27 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session)
/*
* __session_find_shared_dhandle --
* Search for a data handle in the connection and add it to a session's
- * cache. Since the data handle isn't locked, this must be called holding
- * the handle list lock, and we must increment the handle's reference
- * count before releasing it.
+ * cache. We must increment the handle's reference count while holding
+ * the handle list lock.
+ *
+ * The handle is first looked up under the handle list read lock. Any
+ * result other than WT_NOTFOUND (success or another error) is returned
+ * as-is. Only on WT_NOTFOUND is the handle list write lock taken and
+ * __wt_conn_dhandle_alloc called. In both paths WT_DHANDLE_ACQUIRE is
+ * applied to session->dhandle inside the locked section on success.
*/
static int
__session_find_shared_dhandle(
WT_SESSION_IMPL *session, const char *uri, const char *checkpoint)
{
- WT_RET(__wt_conn_dhandle_find(session, uri, checkpoint));
- (void)__wt_atomic_add32(&session->dhandle->session_ref, 1);
- return (0);
+ WT_DECL_RET;
+
+ WT_WITH_HANDLE_LIST_READ_LOCK(session,
+ if ((ret = __wt_conn_dhandle_find(session, uri, checkpoint)) == 0)
+ WT_DHANDLE_ACQUIRE(session->dhandle));
+
+ if (ret != WT_NOTFOUND)
+ return (ret);
+
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
+ if ((ret = __wt_conn_dhandle_alloc(session, uri, checkpoint)) == 0)
+ WT_DHANDLE_ACQUIRE(session->dhandle));
+
+ return (ret);
}
/*
@@ -450,16 +459,16 @@ __session_get_dhandle(
* We didn't find a match in the session cache, search the shared
* handle list and cache the handle we find.
*/
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __session_find_shared_dhandle(session, uri, checkpoint));
- WT_RET(ret);
+ WT_RET(__session_find_shared_dhandle(session, uri, checkpoint));
/*
* Fixup the reference count on failure (we incremented the reference
* count while holding the handle-list lock).
*/
- if ((ret = __session_add_dhandle(session)) != 0)
- (void)__wt_atomic_sub32(&session->dhandle->session_ref, 1);
+ if ((ret = __session_add_dhandle(session)) != 0) {
+ WT_DHANDLE_RELEASE(session->dhandle);
+ session->dhandle = NULL;
+ }
return (ret);
}
@@ -505,17 +514,15 @@ __wt_session_get_btree(WT_SESSION_IMPL *session,
* reopen handles in the meantime. A combination of the schema
* and handle list locks are used to enforce this.
*/
- if (!F_ISSET(session, WT_SESSION_LOCKED_SCHEMA) ||
- !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)) {
+ if (!F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)) {
dhandle->excl_session = NULL;
dhandle->excl_ref = 0;
F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
__wt_writeunlock(session, &dhandle->rwlock);
WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_session_get_btree(
- session, uri, checkpoint, cfg, flags)));
+ ret = __wt_session_get_btree(
+ session, uri, checkpoint, cfg, flags));
return (ret);
}
diff --git a/src/support/cond_auto.c b/src/support/cond_auto.c
index a3ae67f5baa..600e5eab0ff 100644
--- a/src/support/cond_auto.c
+++ b/src/support/cond_auto.c
@@ -1,29 +1,9 @@
/*-
- * Public Domain 2014-2016 MongoDB, Inc.
- * Public Domain 2008-2014 WiredTiger, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
*
- * This is free and unencumbered software released into the public domain.
- *
- * Anyone is free to copy, modify, publish, use, compile, sell, or
- * distribute this software, either in source code form or as a compiled
- * binary, for any purpose, commercial or non-commercial, and by any
- * means.
- *
- * In jurisdictions that recognize copyright laws, the author or authors
- * of this software dedicate any and all copyright interest in the
- * software to the public domain. We make this dedication for the benefit
- * of the public at large and to the detriment of our heirs and
- * successors. We intend this dedication to be an overt act of
- * relinquishment in perpetuity of all present and future rights to this
- * software under copyright law.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
+ * See the file LICENSE for redistribution information.
*/
#include "wt_internal.h"
@@ -38,13 +18,12 @@
* Allocate and initialize an automatically adjusting condition variable.
*/
int
-__wt_cond_auto_alloc(
- WT_SESSION_IMPL *session, const char *name,
- bool is_signalled, uint64_t min, uint64_t max, WT_CONDVAR **condp)
+__wt_cond_auto_alloc(WT_SESSION_IMPL *session,
+ const char *name, uint64_t min, uint64_t max, WT_CONDVAR **condp)
{
WT_CONDVAR *cond;
- WT_RET(__wt_cond_alloc(session, name, is_signalled, condp));
+ WT_RET(__wt_cond_alloc(session, name, condp));
cond = *condp;
cond->min_wait = min;
@@ -55,33 +34,19 @@ __wt_cond_auto_alloc(
}
/*
- * __wt_cond_auto_signal --
- * Signal a condition variable.
- */
-void
-__wt_cond_auto_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
-{
-
- WT_ASSERT(session, cond->min_wait != 0);
- __wt_cond_signal(session, cond);
-}
-
-/*
* __wt_cond_auto_wait_signal --
* Wait on a mutex, optionally timing out. If we get it before the time
* out period expires, let the caller know.
- * TODO: Can this version of the API be removed, now that we have the
- * auto adjusting condition variables?
*/
void
-__wt_cond_auto_wait_signal(
- WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool *signalled)
+__wt_cond_auto_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond,
+ bool progress, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled)
{
uint64_t delta;
/*
* Catch cases where this function is called with a condition variable
- * that was initialized non-auto.
+ * that wasn't initialized to do automatic adjustments.
*/
WT_ASSERT(session, cond->min_wait != 0);
@@ -94,7 +59,8 @@ __wt_cond_auto_wait_signal(
cond->max_wait, cond->prev_wait + delta);
}
- __wt_cond_wait_signal(session, cond, cond->prev_wait, signalled);
+ __wt_cond_wait_signal(
+ session, cond, cond->prev_wait, run_func, signalled);
if (progress || *signalled)
WT_STAT_CONN_INCR(session, cond_auto_wait_reset);
@@ -108,24 +74,10 @@ __wt_cond_auto_wait_signal(
* out period expires, let the caller know.
*/
void
-__wt_cond_auto_wait(
- WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress)
+__wt_cond_auto_wait(WT_SESSION_IMPL *session,
+ WT_CONDVAR *cond, bool progress, bool (*run_func)(WT_SESSION_IMPL *))
{
- bool signalled;
-
- /*
- * Call the signal version so the wait period is reset if the
- * condition is woken explicitly.
- */
- __wt_cond_auto_wait_signal(session, cond, progress, &signalled);
-}
+ bool notused;
-/*
- * __wt_cond_auto_destroy --
- * Destroy a condition variable.
- */
-int
-__wt_cond_auto_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp)
-{
- return (__wt_cond_destroy(session, condp));
+ __wt_cond_auto_wait_signal(session, cond, progress, run_func, &notused);
}
diff --git a/src/support/rand.c b/src/support/rand.c
index a5b229b9abc..4fae43edc8e 100644
--- a/src/support/rand.c
+++ b/src/support/rand.c
@@ -120,3 +120,15 @@ __wt_random(WT_RAND_STATE volatile * rnd_state)
return ((z << 16) + (w & 65535));
}
+
+/*
+ * __wt_random64 --
+ * Return a 64-bit pseudo-random number.
+ */
+uint64_t
+__wt_random64(WT_RAND_STATE volatile * rnd_state)
+ WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
+{
+ return (((uint64_t)__wt_random(rnd_state) << 32) +
+ __wt_random(rnd_state));
+}
diff --git a/src/support/stat.c b/src/support/stat.c
index 167d17137ce..fd38e1b79ee 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -759,9 +759,7 @@ static const char * const __stats_connection_desc[] = {
"lock: checkpoint lock acquisitions",
"lock: checkpoint lock application thread wait time (usecs)",
"lock: checkpoint lock internal thread wait time (usecs)",
- "lock: handle-list lock acquisitions",
- "lock: handle-list lock application thread wait time (usecs)",
- "lock: handle-list lock internal thread wait time (usecs)",
+ "lock: handle-list lock eviction thread wait time (usecs)",
"lock: metadata lock acquisitions",
"lock: metadata lock application thread wait time (usecs)",
"lock: metadata lock internal thread wait time (usecs)",
@@ -1044,9 +1042,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->lock_checkpoint_count = 0;
stats->lock_checkpoint_wait_application = 0;
stats->lock_checkpoint_wait_internal = 0;
- stats->lock_handle_list_count = 0;
- stats->lock_handle_list_wait_application = 0;
- stats->lock_handle_list_wait_internal = 0;
+ stats->lock_handle_list_wait_eviction = 0;
stats->lock_metadata_count = 0;
stats->lock_metadata_wait_application = 0;
stats->lock_metadata_wait_internal = 0;
@@ -1351,12 +1347,8 @@ __wt_stat_connection_aggregate(
WT_STAT_READ(from, lock_checkpoint_wait_application);
to->lock_checkpoint_wait_internal +=
WT_STAT_READ(from, lock_checkpoint_wait_internal);
- to->lock_handle_list_count +=
- WT_STAT_READ(from, lock_handle_list_count);
- to->lock_handle_list_wait_application +=
- WT_STAT_READ(from, lock_handle_list_wait_application);
- to->lock_handle_list_wait_internal +=
- WT_STAT_READ(from, lock_handle_list_wait_internal);
+ to->lock_handle_list_wait_eviction +=
+ WT_STAT_READ(from, lock_handle_list_wait_eviction);
to->lock_metadata_count += WT_STAT_READ(from, lock_metadata_count);
to->lock_metadata_wait_application +=
WT_STAT_READ(from, lock_metadata_wait_application);
diff --git a/src/support/thread_group.c b/src/support/thread_group.c
index beb143e63e2..2b4b7ad4e61 100644
--- a/src/support/thread_group.c
+++ b/src/support/thread_group.c
@@ -259,7 +259,7 @@ __wt_thread_group_create(
__wt_rwlock_init(session, &group->lock);
WT_ERR(__wt_cond_alloc(
- session, "Thread group cond", false, &group->wait_cond));
+ session, "thread group cond", &group->wait_cond));
cond_alloced = true;
__wt_writelock(session, &group->lock);
diff --git a/src/txn/txn.c b/src/txn/txn.c
index 660d37b17d5..e5e59c2b901 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -803,3 +803,98 @@ __wt_txn_global_destroy(WT_SESSION_IMPL *session)
__wt_rwlock_destroy(session, &txn_global->nsnap_rwlock);
__wt_free(session, txn_global->states);
}
+
+#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
+/*
+ * __wt_verbose_dump_txn --
+ * Output diagnostic information about the global transaction state.
+ */
+int
+__wt_verbose_dump_txn(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN *txn;
+ WT_TXN_STATE *s;
+ const char *iso_tag;
+ uint64_t id;
+ uint32_t i, session_cnt;
+
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+
+ WT_RET(__wt_msg(session, "%s", WT_DIVIDER));
+ WT_RET(__wt_msg(session, "transaction state dump"));
+
+ WT_RET(__wt_msg(session, "current ID: %" PRIu64, txn_global->current));
+ WT_RET(__wt_msg(session,
+ "last running ID: %" PRIu64, txn_global->last_running));
+ WT_RET(__wt_msg(session, "oldest ID: %" PRIu64, txn_global->oldest_id));
+ WT_RET(__wt_msg(session,
+ "oldest named snapshot ID: %" PRIu64, txn_global->nsnap_oldest_id));
+
+ WT_RET(__wt_msg(session, "checkpoint running? %s",
+ txn_global->checkpoint_running ? "yes" : "no"));
+ WT_RET(__wt_msg(session,
+ "checkpoint generation: %" PRIu64, txn_global->checkpoint_gen));
+ WT_RET(__wt_msg(session,
+ "checkpoint pinned ID: %" PRIu64, txn_global->checkpoint_pinned));
+ WT_RET(__wt_msg(session,
+ "checkpoint txn ID: %" PRIu64, txn_global->checkpoint_txnid));
+
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ WT_RET(__wt_msg(session, "session count: %" PRIu32, session_cnt));
+
+ WT_RET(__wt_msg(session, "Transaction state of active sessions:"));
+
+ /*
+ * Walk each session transaction state and dump information. Accessing
+ * the content of session handles is not thread safe, so some
+ * information may change while traversing if other threads are active
+ * at the same time, which is OK since this is diagnostic code.
+ */
+ for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
+ /* Skip sessions with no active transaction */
+ if ((id = s->id) == WT_TXN_NONE && s->pinned_id == WT_TXN_NONE)
+ continue;
+
+ txn = &conn->sessions[i].txn;
+ iso_tag = "INVALID";
+ switch (txn->isolation) {
+ case WT_ISO_READ_COMMITTED:
+ iso_tag = "WT_ISO_READ_COMMITTED";
+ break;
+ case WT_ISO_READ_UNCOMMITTED:
+ iso_tag = "WT_ISO_READ_UNCOMMITTED";
+ break;
+ case WT_ISO_SNAPSHOT:
+ iso_tag = "WT_ISO_SNAPSHOT";
+ break;
+ }
+
+ WT_RET(__wt_msg(session,
+ "ID: %6" PRIu64
+ ", mod count: %u"
+ ", pinned ID: %" PRIu64
+ ", snap min: %" PRIu64
+ ", snap max: %" PRIu64
+ ", metadata pinned ID: %" PRIu64
+ ", flags: 0x%08" PRIx32
+ ", name: %s"
+ ", isolation: %s",
+ id,
+ txn->mod_count,
+ s->pinned_id,
+ txn->snap_min,
+ txn->snap_max,
+ s->metadata_pinned,
+ txn->flags,
+ conn->sessions[i].name == NULL ?
+ "EMPTY" : conn->sessions[i].name,
+ iso_tag));
+ }
+ WT_RET(__wt_msg(session, "%s", WT_DIVIDER));
+
+ return (0);
+}
+#endif
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index 3b19162fd3d..3261c8089f4 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -525,6 +525,17 @@ __checkpoint_verbose_track(WT_SESSION_IMPL *session,
}
/*
+ * __checkpoint_fail_reset --
+ * Reset fields when a failure occurs.
+ */
+static void
+__checkpoint_fail_reset(WT_SESSION_IMPL *session)
+{
+ S2BT(session)->modified = true;
+ S2BT(session)->ckpt = NULL;
+}
+
+/*
* __txn_checkpoint --
* Checkpoint a database or a list of objects in the database.
*/
@@ -543,7 +554,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
void *saved_meta_next;
u_int i;
uint64_t fsync_duration_usecs;
- bool full, idle, logging, tracking;
+ bool failed, full, idle, logging, tracking;
const char *txn_cfg[] = { WT_CONFIG_BASE(session,
WT_SESSION_begin_transaction), "isolation=snapshot", NULL };
@@ -639,10 +650,9 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
*/
WT_ASSERT(session, session->ckpt_handle_next == 0);
WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session,
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __checkpoint_apply_all(
- session, cfg, __wt_checkpoint_get_handles, NULL))));
+ WT_WITH_TABLE_READ_LOCK(session,
+ ret = __checkpoint_apply_all(
+ session, cfg, __wt_checkpoint_get_handles, NULL)));
WT_ERR(ret);
/*
@@ -825,12 +835,13 @@ err: /*
* overwritten the checkpoint, so what ends up on disk is not
* consistent.
*/
- if (ret != 0 && !conn->modified)
+ failed = ret != 0;
+ if (failed)
conn->modified = true;
session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED;
if (tracking)
- WT_TRET(__wt_meta_track_off(session, false, ret != 0));
+ WT_TRET(__wt_meta_track_off(session, false, failed));
cache->eviction_scrub_limit = 0.0;
WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0);
@@ -863,6 +874,13 @@ err: /*
for (i = 0; i < session->ckpt_handle_next; ++i) {
if (session->ckpt_handle[i] == NULL)
continue;
+ /*
+ * If the operation failed, mark all trees dirty so they are
+ * included if a future checkpoint can succeed.
+ */
+ if (failed)
+ WT_WITH_DHANDLE(session, session->ckpt_handle[i],
+ __checkpoint_fail_reset(session));
WT_WITH_DHANDLE(session, session->ckpt_handle[i],
WT_TRET(__wt_session_release_btree(session)));
}
@@ -1341,7 +1359,6 @@ __checkpoint_tree(
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
WT_LSN ckptlsn;
- int was_modified;
bool fake_ckpt;
WT_UNUSED(cfg);
@@ -1352,7 +1369,6 @@ __checkpoint_tree(
conn = S2C(session);
dhandle = session->dhandle;
fake_ckpt = false;
- was_modified = btree->modified;
/*
* Set the checkpoint LSN to the maximum LSN so that if logging is
@@ -1483,10 +1499,9 @@ err: /*
* If the checkpoint didn't complete successfully, make sure the
* tree is marked dirty.
*/
- if (ret != 0 && !btree->modified && was_modified) {
+ if (ret != 0) {
btree->modified = true;
- if (!S2C(session)->modified)
- S2C(session)->modified = true;
+ S2C(session)->modified = true;
}
__wt_meta_ckptlist_free(session, ckptbase);
diff --git a/src/txn/txn_log.c b/src/txn/txn_log.c
index 7ad295f421b..2931dc1ce82 100644
--- a/src/txn/txn_log.c
+++ b/src/txn/txn_log.c
@@ -269,7 +269,7 @@ __wt_txn_checkpoint_logread(WT_SESSION_IMPL *session,
WT_ITEM ckpt_snapshot_unused;
uint32_t ckpt_file, ckpt_offset;
u_int ckpt_nsnapshot_unused;
- const char *fmt = WT_UNCHECKED_STRING(IIIU);
+ const char *fmt = WT_UNCHECKED_STRING(IIIu);
if ((ret = __wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
&ckpt_file, &ckpt_offset,
@@ -297,7 +297,7 @@ __wt_txn_checkpoint_log(
uint8_t *end, *p;
size_t recsize;
uint32_t i, rectype = WT_LOGREC_CHECKPOINT;
- const char *fmt = WT_UNCHECKED_STRING(IIIIU);
+ const char *fmt = WT_UNCHECKED_STRING(IIIIu);
txn = &session->txn;
ckpt_lsn = &txn->ckpt_lsn;
diff --git a/src/utilities/util.h b/src/utilities/util.h
index cf12d7d4aa6..93a96d44219 100644
--- a/src/utilities/util.h
+++ b/src/utilities/util.h
@@ -40,7 +40,6 @@ int util_flush(WT_SESSION *, const char *);
int util_list(WT_SESSION *, int, char *[]);
int util_load(WT_SESSION *, int, char *[]);
int util_loadtext(WT_SESSION *, int, char *[]);
-char *util_name(WT_SESSION *, const char *, const char *);
int util_printlog(WT_SESSION *, int, char *[]);
int util_read(WT_SESSION *, int, char *[]);
int util_read_line(WT_SESSION *, ULINE *, bool, bool *);
@@ -51,5 +50,6 @@ int util_stat(WT_SESSION *, int, char *[]);
int util_str2recno(WT_SESSION *, const char *p, uint64_t *recnop);
int util_truncate(WT_SESSION *, int, char *[]);
int util_upgrade(WT_SESSION *, int, char *[]);
+char *util_uri(WT_SESSION *, const char *, const char *);
int util_verify(WT_SESSION *, int, char *[]);
int util_write(WT_SESSION *, int, char *[]);
diff --git a/src/utilities/util_alter.c b/src/utilities/util_alter.c
index d228c15cd48..ef01a1ed826 100644
--- a/src/utilities/util_alter.c
+++ b/src/utilities/util_alter.c
@@ -34,9 +34,12 @@ util_alter(WT_SESSION *session, int argc, char *argv[])
for (configp = argv;
configp != NULL && *configp != NULL; configp += 2)
if ((ret = session->alter(
- session, configp[0], configp[1])) != 0)
- break;
- return (ret);
+ session, configp[0], configp[1])) != 0) {
+ (void)util_err(session, ret,
+ "session.alter: %s, %s", configp[0], configp[1]);
+ return (1);
+ }
+ return (0);
}
static int
diff --git a/src/utilities/util_compact.c b/src/utilities/util_compact.c
index c114eb207fa..e469b4dce6e 100644
--- a/src/utilities/util_compact.c
+++ b/src/utilities/util_compact.c
@@ -30,21 +30,13 @@ util_compact(WT_SESSION *session, int argc, char *argv[])
/* The remaining argument is the table name. */
if (argc != 1)
return (usage());
- if ((uri = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
- if ((ret = session->compact(session, uri, NULL)) != 0) {
- fprintf(stderr, "%s: compact(%s): %s\n",
- progname, uri, session->strerror(session, ret));
- goto err;
- }
-
- if (0) {
-err: ret = 1;
- }
+ if ((ret = session->compact(session, uri, NULL)) != 0)
+ (void)util_err(session, ret, "session.compact: %s", uri);
free(uri);
-
return (ret);
}
diff --git a/src/utilities/util_create.c b/src/utilities/util_create.c
index 4e609736f2d..7c22a67792b 100644
--- a/src/utilities/util_create.c
+++ b/src/utilities/util_create.c
@@ -15,9 +15,9 @@ util_create(WT_SESSION *session, int argc, char *argv[])
{
WT_DECL_RET;
int ch;
- const char *config, *uri;
+ char *config, *uri;
- config = NULL;
+ config = uri = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "c:")) != EOF)
switch (ch) {
case 'c': /* command-line configuration */
@@ -35,12 +35,14 @@ util_create(WT_SESSION *session, int argc, char *argv[])
if (argc != 1)
return (usage());
- if ((uri = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
if ((ret = session->create(session, uri, config)) != 0)
- return (util_err(session, ret, "%s: session.create", uri));
- return (0);
+ (void)util_err(session, ret, "session.create: %s", uri);
+
+ free(uri);
+ return (ret);
}
static int
diff --git a/src/utilities/util_drop.c b/src/utilities/util_drop.c
index ba41445dfb6..456005d445d 100644
--- a/src/utilities/util_drop.c
+++ b/src/utilities/util_drop.c
@@ -15,8 +15,9 @@ util_drop(WT_SESSION *session, int argc, char *argv[])
{
WT_DECL_RET;
int ch;
- char *name;
+ char *uri;
+ uri = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
switch (ch) {
case '?':
@@ -30,12 +31,13 @@ util_drop(WT_SESSION *session, int argc, char *argv[])
/* The remaining argument is the uri. */
if (argc != 1)
return (usage());
- if ((name = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
- ret = session->drop(session, name, "force");
+ if ((ret = session->drop(session, uri, "force")) != 0)
+ (void)util_err(session, ret, "session.drop: %s", uri);
- free(name);
+ free(uri);
return (ret);
}
diff --git a/src/utilities/util_dump.c b/src/utilities/util_dump.c
index 3f8b4a49dfe..cded40a8b45 100644
--- a/src/utilities/util_dump.c
+++ b/src/utilities/util_dump.c
@@ -37,10 +37,10 @@ util_dump(WT_SESSION *session, int argc, char *argv[])
size_t len;
int ch, i;
bool hex, json, reverse;
- char *checkpoint, *config, *name, *p, *simplename;
+ char *checkpoint, *config, *p, *simpleuri, *uri;
hex = json = reverse = false;
- checkpoint = config = name = simplename = NULL;
+ checkpoint = config = simpleuri = uri = NULL;
cursor = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "c:f:jrx")) != EOF)
switch (ch) {
@@ -89,11 +89,11 @@ util_dump(WT_SESSION *session, int argc, char *argv[])
if (json && i > 0)
if (dump_json_separator(session) != 0)
goto err;
- free(name);
- free(simplename);
- name = simplename = NULL;
+ free(uri);
+ free(simpleuri);
+ uri = simpleuri = NULL;
- if ((name = util_name(session, argv[i], "table")) == NULL)
+ if ((uri = util_uri(session, argv[i], "table")) == NULL)
goto err;
len =
@@ -113,19 +113,19 @@ util_dump(WT_SESSION *session, int argc, char *argv[])
(void)strcat(config, json ? "dump=json" :
(hex ? "dump=hex" : "dump=print"));
if ((ret = session->open_cursor(
- session, name, NULL, config, &cursor)) != 0) {
+ session, uri, NULL, config, &cursor)) != 0) {
fprintf(stderr, "%s: cursor open(%s) failed: %s\n",
- progname, name, session->strerror(session, ret));
+ progname, uri, session->strerror(session, ret));
goto err;
}
- if ((simplename = strdup(name)) == NULL) {
+ if ((simpleuri = strdup(uri)) == NULL) {
(void)util_err(session, errno, NULL);
goto err;
}
- if ((p = strchr(simplename, '(')) != NULL)
+ if ((p = strchr(simpleuri, '(')) != NULL)
*p = '\0';
- if (dump_config(session, simplename, cursor, hex, json) != 0)
+ if (dump_config(session, simpleuri, cursor, hex, json) != 0)
goto err;
if (dump_record(cursor, reverse, json) != 0)
@@ -148,8 +148,8 @@ err: ret = 1;
}
free(config);
- free(name);
- free(simplename);
+ free(uri);
+ free(simpleuri);
if (cursor != NULL && (ret = cursor->close(cursor)) != 0) {
(void)util_err(session, ret, NULL);
ret = 1;
diff --git a/src/utilities/util_list.c b/src/utilities/util_list.c
index e91dbfce05b..f19ba4d1f97 100644
--- a/src/utilities/util_list.c
+++ b/src/utilities/util_list.c
@@ -19,10 +19,10 @@ util_list(WT_SESSION *session, int argc, char *argv[])
WT_DECL_RET;
int ch;
bool cflag, vflag;
- char *name;
+ char *uri;
cflag = vflag = false;
- name = NULL;
+ uri = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "cv")) != EOF)
switch (ch) {
case 'c':
@@ -42,17 +42,16 @@ util_list(WT_SESSION *session, int argc, char *argv[])
case 0:
break;
case 1:
- if ((name = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
break;
default:
return (usage());
}
- ret = list_print(session, name, cflag, vflag);
-
- free(name);
+ ret = list_print(session, uri, cflag, vflag);
+ free(uri);
return (ret);
}
@@ -99,7 +98,7 @@ list_get_allocsize(WT_SESSION *session, const char *key, size_t *allocsize)
* List the high-level objects in the database.
*/
static int
-list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag)
+list_print(WT_SESSION *session, const char *uri, bool cflag, bool vflag)
{
WT_CURSOR *cursor;
WT_DECL_RET;
@@ -120,7 +119,7 @@ list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag)
ret, "%s: WT_SESSION.open_cursor", WT_METADATA_URI));
}
- found = name == NULL;
+ found = uri == NULL;
while ((ret = cursor->next(cursor)) == 0) {
/* Get the key. */
if ((ret = cursor->get_key(cursor, &key)) != 0)
@@ -129,8 +128,8 @@ list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag)
/*
* If a name is specified, only show objects that match.
*/
- if (name != NULL) {
- if (!WT_PREFIX_MATCH(key, name))
+ if (uri != NULL) {
+ if (!WT_PREFIX_MATCH(key, uri))
continue;
found = true;
}
@@ -161,7 +160,7 @@ list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag)
if (ret != WT_NOTFOUND)
return (util_cerr(cursor, "next", ret));
if (!found) {
- fprintf(stderr, "%s: %s: not found\n", progname, name);
+ fprintf(stderr, "%s: %s: not found\n", progname, uri);
return (1);
}
diff --git a/src/utilities/util_load.c b/src/utilities/util_load.c
index ac18df80851..ca77643eb49 100644
--- a/src/utilities/util_load.c
+++ b/src/utilities/util_load.c
@@ -126,7 +126,7 @@ load_dump(WT_SESSION *session)
append ? ",append" : "", no_overwrite ? ",overwrite=false" : "");
if ((ret = session->open_cursor(
session, uri, NULL, config, &cursor)) != 0) {
- ret = util_err(session, ret, "%s: session.open", uri);
+ ret = util_err(session, ret, "%s: session.open_cursor", uri);
goto err;
}
diff --git a/src/utilities/util_load_json.c b/src/utilities/util_load_json.c
index 020a4ed9ba9..1189d49a483 100644
--- a/src/utilities/util_load_json.c
+++ b/src/utilities/util_load_json.c
@@ -242,7 +242,7 @@ json_data(WT_SESSION *session,
LF_ISSET(LOAD_JSON_NO_OVERWRITE) ? ",overwrite=false" : "");
if ((ret = session->open_cursor(
session, uri, NULL, config, &cursor)) != 0) {
- ret = util_err(session, ret, "%s: session.open", uri);
+ ret = util_err(session, ret, "%s: session.open_cursor", uri);
goto err;
}
keyformat = cursor->key_format;
diff --git a/src/utilities/util_loadtext.c b/src/utilities/util_loadtext.c
index f9c5b6e9a1f..7602d43f8c9 100644
--- a/src/utilities/util_loadtext.c
+++ b/src/utilities/util_loadtext.c
@@ -15,9 +15,11 @@ static int usage(void);
int
util_loadtext(WT_SESSION *session, int argc, char *argv[])
{
+ WT_DECL_RET;
int ch;
- const char *uri;
+ char *uri;
+ uri = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "f:")) != EOF)
switch (ch) {
case 'f': /* input file */
@@ -35,10 +37,13 @@ util_loadtext(WT_SESSION *session, int argc, char *argv[])
/* The remaining argument is the uri. */
if (argc != 1)
return (usage());
- if ((uri = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
- return (text(session, uri));
+ ret = text(session, uri);
+
+ free(uri);
+ return (ret);
}
/*
@@ -61,7 +66,7 @@ text(WT_SESSION *session, const char *uri)
*/
if ((ret = session->open_cursor(
session, uri, NULL, "append,overwrite", &cursor)) != 0)
- return (util_err(session, ret, "%s: session.open", uri));
+ return (util_err(session, ret, "%s: session.open_cursor", uri));
/*
* We're about to load strings, make sure the formats match.
diff --git a/src/utilities/util_main.c b/src/utilities/util_main.c
index 001a66d6d9e..7157f0d90fe 100644
--- a/src/utilities/util_main.c
+++ b/src/utilities/util_main.c
@@ -285,11 +285,11 @@ usage(void)
}
/*
- * util_name --
+ * util_uri --
* Build a name.
*/
char *
-util_name(WT_SESSION *session, const char *s, const char *type)
+util_uri(WT_SESSION *session, const char *s, const char *type)
{
size_t len;
char *name;
diff --git a/src/utilities/util_printlog.c b/src/utilities/util_printlog.c
index e7fa2134934..5f3ed43905b 100644
--- a/src/utilities/util_printlog.c
+++ b/src/utilities/util_printlog.c
@@ -14,8 +14,8 @@ int
util_printlog(WT_SESSION *session, int argc, char *argv[])
{
WT_DECL_RET;
- int ch;
uint32_t flags;
+ int ch;
flags = 0;
while ((ch = __wt_getopt(progname, argc, argv, "f:x")) != EOF)
@@ -41,17 +41,9 @@ util_printlog(WT_SESSION *session, int argc, char *argv[])
if (argc != 0)
return (usage());
- ret = __wt_txn_printlog(session, flags);
-
- if (ret != 0) {
- fprintf(stderr, "%s: printlog failed: %s\n",
- progname, session->strerror(session, ret));
- goto err;
- }
+ if ((ret = __wt_txn_printlog(session, flags)) != 0)
+ (void)util_err(session, ret, "printlog");
- if (0) {
-err: ret = 1;
- }
return (ret);
}
diff --git a/src/utilities/util_read.c b/src/utilities/util_read.c
index 2e766377aa9..393949b6a1c 100644
--- a/src/utilities/util_read.c
+++ b/src/utilities/util_read.c
@@ -18,8 +18,9 @@ util_read(WT_SESSION *session, int argc, char *argv[])
uint64_t recno;
int ch;
bool rkey, rval;
- const char *uri, *value;
+ char *uri, *value;
+ uri = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
switch (ch) {
case '?':
@@ -32,13 +33,19 @@ util_read(WT_SESSION *session, int argc, char *argv[])
/* The remaining arguments are a uri followed by a list of keys. */
if (argc < 2)
return (usage());
- if ((uri = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
- /* Open the object. */
- if ((ret = session->open_cursor(
- session, uri, NULL, NULL, &cursor)) != 0)
- return (util_err(session, ret, "%s: session.open", uri));
+ /*
+ * Open the object; free allocated memory immediately to simplify
+ * future error handling.
+ */
+ if ((ret =
+ session->open_cursor(session, uri, NULL, NULL, &cursor)) != 0)
+ (void)util_err(session, ret, "%s: session.open_cursor", uri);
+ free(uri);
+ if (ret != 0)
+ return (ret);
/*
* A simple search only makes sense if the key format is a string or a
diff --git a/src/utilities/util_rebalance.c b/src/utilities/util_rebalance.c
index 45f161487e5..c188ea17d22 100644
--- a/src/utilities/util_rebalance.c
+++ b/src/utilities/util_rebalance.c
@@ -15,9 +15,9 @@ util_rebalance(WT_SESSION *session, int argc, char *argv[])
{
WT_DECL_RET;
int ch;
- char *name;
+ char *uri;
- name = NULL;
+ uri = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
switch (ch) {
case '?':
@@ -30,25 +30,21 @@ util_rebalance(WT_SESSION *session, int argc, char *argv[])
/* The remaining argument is the table name. */
if (argc != 1)
return (usage());
- if ((name = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
- if ((ret = session->rebalance(session, name, NULL)) != 0) {
- fprintf(stderr, "%s: rebalance(%s): %s\n",
- progname, name, session->strerror(session, ret));
- goto err;
+ if ((ret = session->rebalance(session, uri, NULL)) != 0)
+ (void)util_err(session, ret, "session.rebalance: %s", uri);
+ else {
+ /*
+ * Verbose configures a progress counter, move to the next
+ * line.
+ */
+ if (verbose)
+ printf("\n");
}
- /* Verbose configures a progress counter, move to the next line. */
- if (verbose)
- printf("\n");
-
- if (0) {
-err: ret = 1;
- }
-
- free(name);
-
+ free(uri);
return (ret);
}
diff --git a/src/utilities/util_rename.c b/src/utilities/util_rename.c
index aee299c6e63..bb2d40cd103 100644
--- a/src/utilities/util_rename.c
+++ b/src/utilities/util_rename.c
@@ -30,22 +30,15 @@ util_rename(WT_SESSION *session, int argc, char *argv[])
/* The remaining arguments are the object uri and new name. */
if (argc != 2)
return (usage());
- if ((uri = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
newuri = argv[1];
- if ((ret = session->rename(session, uri, newuri, NULL)) != 0) {
- fprintf(stderr, "%s: rename %s to %s: %s\n",
- progname, uri, newuri, session->strerror(session, ret));
- goto err;
- }
-
- if (0) {
-err: ret = 1;
- }
+ if ((ret = session->rename(session, uri, newuri, NULL)) != 0)
+ (void)util_err(
+ session, ret, "session.rename: %s, %s", uri, newuri);
free(uri);
-
return (ret);
}
diff --git a/src/utilities/util_salvage.c b/src/utilities/util_salvage.c
index 679d1074457..6cc2278b846 100644
--- a/src/utilities/util_salvage.c
+++ b/src/utilities/util_salvage.c
@@ -16,10 +16,10 @@ util_salvage(WT_SESSION *session, int argc, char *argv[])
WT_DECL_RET;
int ch;
const char *force;
- char *name;
+ char *uri;
force = NULL;
- name = NULL;
+ uri = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "F")) != EOF)
switch (ch) {
case 'F':
@@ -35,25 +35,21 @@ util_salvage(WT_SESSION *session, int argc, char *argv[])
/* The remaining argument is the file name. */
if (argc != 1)
return (usage());
- if ((name = util_name(session, *argv, "file")) == NULL)
+ if ((uri = util_uri(session, *argv, "file")) == NULL)
return (1);
- if ((ret = session->salvage(session, name, force)) != 0) {
- fprintf(stderr, "%s: salvage(%s): %s\n",
- progname, name, session->strerror(session, ret));
- goto err;
+ if ((ret = session->salvage(session, uri, force)) != 0)
+ (void)util_err(session, ret, "session.salvage: %s", uri);
+ else {
+ /*
+ * Verbose configures a progress counter, move to the next
+ * line.
+ */
+ if (verbose)
+ printf("\n");
}
- /* Verbose configures a progress counter, move to the next line. */
- if (verbose)
- printf("\n");
-
- if (0) {
-err: ret = 1;
- }
-
- free(name);
-
+ free(uri);
return (ret);
}
diff --git a/src/utilities/util_stat.c b/src/utilities/util_stat.c
index 4376f559ceb..1b75d9ea8bf 100644
--- a/src/utilities/util_stat.c
+++ b/src/utilities/util_stat.c
@@ -55,7 +55,7 @@ util_stat(WT_SESSION *session, int argc, char *argv[])
objname = (char *)"";
break;
case 1:
- if ((objname = util_name(session, *argv, "table")) == NULL)
+ if ((objname = util_uri(session, *argv, "table")) == NULL)
return (1);
objname_free = true;
break;
@@ -82,8 +82,8 @@ util_stat(WT_SESSION *session, int argc, char *argv[])
(ret = cursor->next(cursor)) == 0 &&
(ret = cursor->get_value(cursor, &desc, &pval, NULL)) == 0)
if (printf("%s=%s\n", desc, pval) < 0) {
- ret = errno;
- break;
+ (void)util_err(session, errno, "printf");
+ goto err;
}
if (ret == WT_NOTFOUND)
ret = 0;
diff --git a/src/utilities/util_truncate.c b/src/utilities/util_truncate.c
index 9325c0d7e84..35de02345c8 100644
--- a/src/utilities/util_truncate.c
+++ b/src/utilities/util_truncate.c
@@ -15,8 +15,9 @@ util_truncate(WT_SESSION *session, int argc, char *argv[])
{
WT_DECL_RET;
int ch;
- char *name;
+ char *uri;
+ uri = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
switch (ch) {
case '?':
@@ -30,13 +31,13 @@ util_truncate(WT_SESSION *session, int argc, char *argv[])
/* The remaining argument is the uri. */
if (argc != 1)
return (usage());
- if ((name = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
- if ((ret = session->truncate(session, name, NULL, NULL, NULL)) != 0)
- return (util_err(session, ret, "%s: session.truncate", name));
+ if ((ret = session->truncate(session, uri, NULL, NULL, NULL)) != 0)
+ (void)util_err(session, ret, "session.truncate: %s", uri);
- free(name);
+ free(uri);
return (ret);
}
diff --git a/src/utilities/util_upgrade.c b/src/utilities/util_upgrade.c
index 63b23f28c16..f89bd46e133 100644
--- a/src/utilities/util_upgrade.c
+++ b/src/utilities/util_upgrade.c
@@ -15,9 +15,9 @@ util_upgrade(WT_SESSION *session, int argc, char *argv[])
{
WT_DECL_RET;
int ch;
- char *name;
+ char *uri;
- name = NULL;
+ uri = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
switch (ch) {
case '?':
@@ -30,25 +30,21 @@ util_upgrade(WT_SESSION *session, int argc, char *argv[])
/* The remaining argument is the table name. */
if (argc != 1)
return (usage());
- if ((name = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
- if ((ret = session->upgrade(session, name, NULL)) != 0) {
- fprintf(stderr, "%s: upgrade(%s): %s\n",
- progname, name, session->strerror(session, ret));
- goto err;
+ if ((ret = session->upgrade(session, uri, NULL)) != 0)
+ (void)util_err(session, ret, "session.upgrade: %s", uri);
+ else {
+ /*
+ * Verbose configures a progress counter, move to the next
+ * line.
+ */
+ if (verbose)
+ printf("\n");
}
- /* Verbose configures a progress counter, move to the next line. */
- if (verbose)
- printf("\n");
-
- if (0) {
-err: ret = 1;
- }
-
- free(name);
-
+ free(uri);
return (ret);
}
diff --git a/src/utilities/util_verify.c b/src/utilities/util_verify.c
index 82bdd780cd3..d0587fcfc8c 100644
--- a/src/utilities/util_verify.c
+++ b/src/utilities/util_verify.c
@@ -17,10 +17,10 @@ util_verify(WT_SESSION *session, int argc, char *argv[])
size_t size;
int ch;
bool dump_address, dump_blocks, dump_layout, dump_pages;
- char *config, *dump_offsets, *name;
+ char *config, *dump_offsets, *uri;
dump_address = dump_blocks = dump_layout = dump_pages = false;
- config = dump_offsets = name = NULL;
+ config = dump_offsets = uri = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "d:")) != EOF)
switch (ch) {
case 'd':
@@ -55,7 +55,7 @@ util_verify(WT_SESSION *session, int argc, char *argv[])
/* The remaining argument is the table name. */
if (argc != 1)
return (usage());
- if ((name = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
/* Build the configuration string as necessary. */
@@ -69,7 +69,7 @@ util_verify(WT_SESSION *session, int argc, char *argv[])
strlen("dump_offsets[],") +
(dump_offsets == NULL ? 0 : strlen(dump_offsets)) + 20;
if ((config = malloc(size)) == NULL) {
- (void)util_err(session, errno, NULL);
+ ret = util_err(session, errno, NULL);
goto err;
}
snprintf(config, size,
@@ -82,23 +82,19 @@ util_verify(WT_SESSION *session, int argc, char *argv[])
dump_offsets != NULL ? "]," : "",
dump_pages ? "dump_pages," : "");
}
- if ((ret = session->verify(session, name, config)) != 0) {
- fprintf(stderr, "%s: verify(%s): %s\n",
- progname, name, session->strerror(session, ret));
- goto err;
+ if ((ret = session->verify(session, uri, config)) != 0)
+ (void)util_err(session, ret, "session.verify: %s", uri);
+ else {
+ /*
+ * Verbose configures a progress counter, move to the next
+ * line.
+ */
+ if (verbose)
+ printf("\n");
}
- /* Verbose configures a progress counter, move to the next line. */
- if (verbose)
- printf("\n");
-
- if (0) {
-err: ret = 1;
- }
-
- free(config);
- free(name);
-
+err: free(config);
+ free(uri);
return (ret);
}
diff --git a/src/utilities/util_write.c b/src/utilities/util_write.c
index 7d9bce02b36..b931fad064d 100644
--- a/src/utilities/util_write.c
+++ b/src/utilities/util_write.c
@@ -18,10 +18,10 @@ util_write(WT_SESSION *session, int argc, char *argv[])
uint64_t recno;
int ch;
bool append, overwrite, rkey;
- const char *uri;
- char config[100];
+ char *uri, config[100];
append = overwrite = false;
+ uri = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "ao")) != EOF)
switch (ch) {
case 'a':
@@ -47,15 +47,21 @@ util_write(WT_SESSION *session, int argc, char *argv[])
} else
if (argc < 3 || ((argc - 1) % 2 != 0))
return (usage());
- if ((uri = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
- /* Open the object. */
+ /*
+ * Open the object; free allocated memory immediately to simplify
+ * future error handling.
+ */
(void)snprintf(config, sizeof(config), "%s,%s",
append ? "append=true" : "", overwrite ? "overwrite=true" : "");
- if ((ret = session->open_cursor(
- session, uri, NULL, config, &cursor)) != 0)
- return (util_err(session, ret, "%s: session.open", uri));
+ if ((ret =
+ session->open_cursor(session, uri, NULL, config, &cursor)) != 0)
+ (void)util_err(session, ret, "%s: session.open_cursor", uri);
+ free(uri);
+ if (ret != 0)
+ return (ret);
/*
* A simple search only makes sense if the key format is a string or a