summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- bench/wtperf/runners/log-append-zero.wtperf 8
-rw-r--r-- bench/wtperf/runners/multi-btree-stress.wtperf 17
-rw-r--r-- bench/wtperf/wtperf.c 37
-rw-r--r-- dist/api_data.py 67
-rw-r--r-- dist/flags.py 1
-rw-r--r-- dist/s_string.ok 1
-rw-r--r-- dist/stat_data.py 1
-rw-r--r-- src/async/async_api.c 17
-rw-r--r-- src/btree/bt_compact.c 6
-rw-r--r-- src/btree/bt_debug.c 7
-rw-r--r-- src/btree/bt_discard.c 2
-rw-r--r-- src/btree/bt_split.c 47
-rw-r--r-- src/btree/bt_sync.c 3
-rw-r--r-- src/cache/cache_las.c 133
-rw-r--r-- src/config/config.c 7
-rw-r--r-- src/config/config_def.c 110
-rw-r--r-- src/conn/conn_api.c 4
-rw-r--r-- src/conn/conn_cache_pool.c 21
-rw-r--r-- src/conn/conn_ckpt.c 12
-rw-r--r-- src/conn/conn_dhandle.c 14
-rw-r--r-- src/conn/conn_log.c 105
-rw-r--r-- src/conn/conn_open.c 40
-rw-r--r-- src/conn/conn_stat.c 3
-rw-r--r-- src/conn/conn_sweep.c 12
-rw-r--r-- src/evict/evict_lru.c 129
-rw-r--r-- src/include/btmem.h 9
-rw-r--r-- src/include/btree.i 32
-rw-r--r-- src/include/cache.h 7
-rw-r--r-- src/include/connection.h 13
-rw-r--r-- src/include/extern.h 6
-rw-r--r-- src/include/flags.h 23
-rw-r--r-- src/include/hardware.h 23
-rw-r--r-- src/include/mutex.h 18
-rw-r--r-- src/include/mutex.i 88
-rw-r--r-- src/include/serial.i 5
-rw-r--r-- src/include/stat.h 1
-rw-r--r-- src/include/wiredtiger.in 115
-rw-r--r-- src/include/wt_internal.h 2
-rw-r--r-- src/log/log.c 78
-rw-r--r-- src/lsm/lsm_manager.c 8
-rw-r--r-- src/reconcile/rec_write.c 41
-rw-r--r-- src/schema/schema_open.c 2
-rw-r--r-- src/session/session_api.c 119
-rw-r--r-- src/support/stat.c 3
-rw-r--r-- src/txn/txn_recover.c 24
-rw-r--r-- test/suite/test_bug015.py 48
-rw-r--r-- test/suite/test_reconfig01.py (renamed from test/suite/test_reconfig.py) 4
-rw-r--r-- test/suite/test_reconfig02.py 108
-rw-r--r-- test/suite/test_txn02.py 9
49 files changed, 1066 insertions, 524 deletions
diff --git a/bench/wtperf/runners/log-append-zero.wtperf b/bench/wtperf/runners/log-append-zero.wtperf
new file mode 100644
index 00000000000..973d2cddd0d
--- /dev/null
+++ b/bench/wtperf/runners/log-append-zero.wtperf
@@ -0,0 +1,8 @@
+# wtperf options file: Test a log file with a multi-threaded
+# append workload.
+conn_config="cache_size=1G,log=(enabled=true,file_max=20MB,zero_fill=true),checkpoint=(log_size=1G)"
+table_config="type=file"
+icount=50000000
+report_interval=5
+run_time=0
+populate_threads=8
diff --git a/bench/wtperf/runners/multi-btree-stress.wtperf b/bench/wtperf/runners/multi-btree-stress.wtperf
new file mode 100644
index 00000000000..b10b08f6035
--- /dev/null
+++ b/bench/wtperf/runners/multi-btree-stress.wtperf
@@ -0,0 +1,17 @@
+# wtperf options file: multi-database configuration attempting to
+# trigger slow operations by overloading CPU and disk.
+# References Jira WT-2131
+conn_config="cache_size=2GB,eviction=(threads_min=2,threads_max=2),log=(enabled=false),direct_io=(data,checkpoint),buffer_alignment=4096,checkpoint_sync=true,checkpoint=(wait=60)"
+table_config="allocation_size=4k,prefix_compression=false,split_pct=75,leaf_page_max=4k,internal_page_max=16k,leaf_item_max=1433,internal_item_max=3100,type=file"
+# Divide original icount by database_count.
+database_count=5
+icount=50000
+populate_threads=1
+random_range=50000000
+report_interval=5
+run_time=3600
+threads=((count=1,inserts=1),(count=10,reads=1))
+value_sz=100
+max_latency=1000
+sample_interval=5
+sample_rate=1
diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c
index 8dceeab2832..20c30e10482 100644
--- a/bench/wtperf/wtperf.c
+++ b/bench/wtperf/wtperf.c
@@ -600,7 +600,34 @@ worker(void *arg)
if (ret == WT_NOTFOUND)
break;
-op_err: lprintf(cfg, ret, 0,
+op_err: if (ret == WT_ROLLBACK && ops_per_txn != 0) {
+ /*
+ * If we are running with explicit transactions
+ * configured and we hit a WT_ROLLBACK, then we
+ * should rollback the current transaction and
+ * attempt to continue.
+ * This does break the guarantee of insertion
+ * order in cases of ordered inserts, as we
+ * aren't retrying here.
+ */
+ lprintf(cfg, ret, 1,
+ "%s for: %s, range: %"PRIu64, op_name(op),
+ key_buf, wtperf_value_range(cfg));
+ if ((ret = session->rollback_transaction(
+ session, NULL)) != 0) {
+ lprintf(cfg, ret, 0,
+ "Failed rollback_transaction");
+ goto err;
+ }
+ if ((ret = session->begin_transaction(
+ session, NULL)) != 0) {
+ lprintf(cfg, ret, 0,
+ "Worker begin transaction failed");
+ goto err;
+ }
+ break;
+ }
+ lprintf(cfg, ret, 0,
"%s failed for: %s, range: %"PRIu64,
op_name(op), key_buf, wtperf_value_range(cfg));
goto err;
@@ -644,7 +671,7 @@ op_err: lprintf(cfg, ret, 0,
if ((ret = session->begin_transaction(
session, NULL)) != 0) {
lprintf(cfg, ret, 0,
- "Worker transaction commit failed");
+ "Worker begin transaction failed");
goto err;
}
}
@@ -1171,8 +1198,12 @@ monitor(void *arg)
if (latency_max != 0 &&
(read_max > latency_max || insert_max > latency_max ||
update_max > latency_max))
+ /*
+ * Make this a non-fatal error and print WARNING in
+ * the output so Jenkins can flag it as unstable.
+ */
lprintf(cfg, 0, 0,
- "max latency exceeded: threshold %" PRIu32
+ "WARNING: max latency exceeded: threshold %" PRIu32
" read max %" PRIu32 " insert max %" PRIu32
" update max %" PRIu32, latency_max,
read_max, insert_max, update_max);
diff --git a/dist/api_data.py b/dist/api_data.py
index 5652edc4ebe..6fd7dcd0093 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -411,6 +411,41 @@ connection_runtime_config = [
interval in seconds at which to check for files that are
inactive and close them''', min=1, max=100000),
]),
+ Config('log', '', r'''
+ enable logging. Enabling logging uses three sessions from the
+ configured session_max''',
+ type='category', subconfig=[
+ Config('archive', 'true', r'''
+ automatically archive unneeded log files''',
+ type='boolean'),
+ Config('compressor', 'none', r'''
+ configure a compressor for log records. Permitted values are
+ \c "none" or custom compression engine name created with
+ WT_CONNECTION::add_compressor. If WiredTiger has builtin support
+ for \c "bzip2", \c "snappy", \c "lz4" or \c "zlib" compression,
+ these names are also available. See @ref compression for more
+ information'''),
+ Config('enabled', 'false', r'''
+ enable logging subsystem''',
+ type='boolean'),
+ Config('file_max', '100MB', r'''
+ the maximum size of log files''',
+ min='100KB', max='2GB'),
+ Config('path', '', r'''
+ the path to a directory into which the log files are written.
+ If the value is not an absolute path name, the files are created
+ relative to the database home'''),
+ Config('prealloc', 'true', r'''
+ pre-allocate log files.''',
+ type='boolean'),
+ Config('recover', 'on', r'''
+ run recovery or error if recovery needs to run after an
+ unclean shutdown.''',
+ choices=['error','on']),
+ Config('zero_fill', 'false', r'''
+ manually write zeroes into log files''',
+ type='boolean'),
+ ]),
Config('lsm_manager', '', r'''
configure database wide options for LSM tree management. The LSM
manager is started automatically the first time an LSM tree is opened.
@@ -611,38 +646,6 @@ common_wiredtiger_open = [
maximum number of simultaneous hazard pointers per session
handle''',
min='15'),
- Config('log', '', r'''
- enable logging. Enabling logging uses three sessions from the
- configured session_max''',
- type='category', subconfig=[
- Config('archive', 'true', r'''
- automatically archive unneeded log files''',
- type='boolean'),
- Config('compressor', 'none', r'''
- configure a compressor for log records. Permitted values are
- \c "none" or custom compression engine name created with
- WT_CONNECTION::add_compressor. If WiredTiger has builtin support
- for \c "bzip2", \c "snappy", \c "lz4" or \c "zlib" compression,
- these names are also available. See @ref compression for more
- information'''),
- Config('enabled', 'false', r'''
- enable logging subsystem''',
- type='boolean'),
- Config('file_max', '100MB', r'''
- the maximum size of log files''',
- min='100KB', max='2GB'),
- Config('path', '', r'''
- the path to a directory into which the log files are written.
- If the value is not an absolute path name, the files are created
- relative to the database home'''),
- Config('prealloc', 'true', r'''
- pre-allocate log files.''',
- type='boolean'),
- Config('recover', 'on', r'''
- run recovery or error if recovery needs to run after an
- unclean shutdown.''',
- choices=['error','on']),
- ]),
Config('mmap', 'true', r'''
Use memory mapping to access files when possible''',
type='boolean'),
diff --git a/dist/flags.py b/dist/flags.py
index d98f249335e..65b68cf4277 100644
--- a/dist/flags.py
+++ b/dist/flags.py
@@ -92,6 +92,7 @@ flags = {
'CONN_CKPT_SYNC',
'CONN_CLOSING',
'CONN_EVICTION_RUN',
+ 'CONN_LAS_OPEN',
'CONN_LEAK_MEMORY',
'CONN_LOG_SERVER_RUN',
'CONN_LSM_MERGE',
diff --git a/dist/s_string.ok b/dist/s_string.ok
index 021e222919e..d234a3c101f 100644
--- a/dist/s_string.ok
+++ b/dist/s_string.ok
@@ -646,6 +646,7 @@ intrin
inuse
io
ip
+islocked
ispo
iteratively
jnr
diff --git a/dist/stat_data.py b/dist/stat_data.py
index 5bf7000f402..76fdf185137 100644
--- a/dist/stat_data.py
+++ b/dist/stat_data.py
@@ -272,6 +272,7 @@ connection_stats = [
LogStat('log_sync_dir', 'log sync_dir operations'),
LogStat('log_write_lsn', 'log server thread advances write LSN'),
LogStat('log_writes', 'log write operations'),
+ LogStat('log_zero_fills', 'log files manually zero-filled'),
##########################################
# Reconciliation statistics
diff --git a/src/async/async_api.c b/src/async/async_api.c
index 1d819474728..dc26f2d11c3 100644
--- a/src/async/async_api.c
+++ b/src/async/async_api.c
@@ -53,7 +53,7 @@ __async_get_format(WT_CONNECTION_IMPL *conn, const char *uri,
* for the cursor.
*/
WT_RET(__wt_open_internal_session(
- conn, "async-cursor", true, true, &session));
+ conn, "async-cursor", true, 0, &session));
__wt_spin_lock(session, &async->ops_lock);
WT_ERR(__wt_calloc_one(session, &af));
WT_ERR(__wt_strdup(session, uri, &af->uri));
@@ -229,7 +229,7 @@ __async_start(WT_SESSION_IMPL *session)
{
WT_ASYNC *async;
WT_CONNECTION_IMPL *conn;
- uint32_t i;
+ uint32_t i, session_flags;
conn = S2C(session);
conn->async_cfg = 1;
@@ -256,9 +256,9 @@ __async_start(WT_SESSION_IMPL *session)
* workers and we may want to selectively stop some workers
* while leaving the rest running.
*/
- WT_RET(__wt_open_internal_session(conn,
- "async-worker", true, true, &async->worker_sessions[i]));
- F_SET(async->worker_sessions[i], WT_SESSION_SERVER_ASYNC);
+ session_flags = WT_SESSION_SERVER_ASYNC;
+ WT_RET(__wt_open_internal_session(conn, "async-worker",
+ true, session_flags, &async->worker_sessions[i]));
}
for (i = 0; i < conn->async_workers; i++) {
/*
@@ -305,7 +305,7 @@ __wt_async_reconfig(WT_SESSION_IMPL *session, const char *cfg[])
WT_DECL_RET;
WT_SESSION *wt_session;
bool run;
- uint32_t i;
+ uint32_t i, session_flags;
conn = S2C(session);
async = conn->async;
@@ -371,10 +371,9 @@ __wt_async_reconfig(WT_SESSION_IMPL *session, const char *cfg[])
/*
* Each worker has its own session.
*/
+ session_flags = WT_SESSION_SERVER_ASYNC;
WT_RET(__wt_open_internal_session(conn, "async-worker",
- true, true, &async->worker_sessions[i]));
- F_SET(async->worker_sessions[i],
- WT_SESSION_SERVER_ASYNC);
+ true, session_flags, &async->worker_sessions[i]));
}
for (i = conn->async_workers; i < tmp_conn.async_workers; i++) {
/*
diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c
index 18b6860c758..b2c9e4b67f8 100644
--- a/src/btree/bt_compact.c
+++ b/src/btree/bt_compact.c
@@ -55,10 +55,12 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
* The page's modification information can change underfoot if
* the page is being reconciled, serialize with reconciliation.
*/
- F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION);
+ WT_RET(__wt_fair_lock(session, &page->page_lock));
+
ret = bm->compact_page_skip(bm, session,
mod->mod_replace.addr, mod->mod_replace.size, skipp);
- F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION);
+
+ WT_TRET(__wt_fair_unlock(session, &page->page_lock));
WT_RET(ret);
}
return (0);
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index ee2898f60be..15ae93522a7 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -636,7 +636,10 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page)
__dmsg(ds, ": %s\n", __wt_page_type_string(page->type));
__dmsg(ds, "\t" "disk %p, entries %" PRIu32, page->dsk, entries);
- __dmsg(ds, "%s", __wt_page_is_modified(page) ? ", dirty" : ", clean");
+ __dmsg(ds, ", %s", __wt_page_is_modified(page) ? "dirty" : "clean");
+ __dmsg(ds, ", %s", __wt_fair_islocked(
+ session, &page->page_lock) ? "locked" : "unlocked");
+
if (F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS))
__dmsg(ds, ", keys-built");
if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC))
@@ -647,8 +650,6 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page)
__dmsg(ds, ", evict-lru");
if (F_ISSET_ATOMIC(page, WT_PAGE_OVERFLOW_KEYS))
__dmsg(ds, ", overflow-keys");
- if (F_ISSET_ATOMIC(page, WT_PAGE_RECONCILIATION))
- __dmsg(ds, ", reconciliation");
if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT))
__dmsg(ds, ", split-insert");
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
index c27d42d38f4..998667e3e1f 100644
--- a/src/btree/bt_discard.c
+++ b/src/btree/bt_discard.c
@@ -55,7 +55,7 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
*/
WT_ASSERT(session, !__wt_page_is_modified(page));
WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
- WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_RECONCILIATION));
+ WT_ASSERT(session, !__wt_fair_islocked(session, &page->page_lock));
#ifdef HAVE_DIAGNOSTIC
{
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 29153ced178..adda9145ee4 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -866,6 +866,18 @@ __split_parent_lock(
*parentp = NULL;
/*
+ * A checkpoint reconciling this parent page can deadlock with
+ * our split. We have an exclusive page lock on the child before
+ * we acquire the page's reconciliation lock, and reconciliation
+ * acquires the page's reconciliation lock before it encounters
+ * the child's exclusive lock (which causes reconciliation to
+ * loop until the exclusive lock is resolved). If we want to split
+ * the parent, give up to avoid that deadlock.
+ */
+ if (S2BT(session)->checkpointing != WT_CKPT_OFF)
+ return (EBUSY);
+
+ /*
* Get a page-level lock on the parent to single-thread splits into the
* page because we need to single-thread sizing/growing the page index.
* It's OK to queue up multiple splits as the child pages split, but the
@@ -882,32 +894,11 @@ __split_parent_lock(
*/
for (;;) {
parent = ref->home;
- F_CAS_ATOMIC(parent, WT_PAGE_RECONCILIATION, ret);
- if (ret == 0) {
- /*
- * We can race with another thread deepening our parent.
- * To deal with that, read the parent pointer each time
- * we try to lock it, and check it's still correct after
- * it's locked.
- */
- if (parent == ref->home)
- break;
- F_CLR_ATOMIC(parent, WT_PAGE_RECONCILIATION);
- continue;
- }
-
- /*
- * A checkpoint reconciling this parent page can deadlock with
- * our split. We have an exclusive page lock on the child before
- * we acquire the page's reconciliation lock, and reconciliation
- * acquires the page's reconciliation lock before it encounters
- * the child's exclusive lock (which causes reconciliation to
- * loop until the exclusive lock is resolved). If we can't lock
- * the parent, give up to avoid that deadlock.
- */
- if (S2BT(session)->checkpointing != WT_CKPT_OFF)
- return (EBUSY);
- __wt_yield();
+ WT_RET(__wt_fair_lock(session, &parent->page_lock));
+ if (parent == ref->home)
+ break;
+ /* Try again if the page deepened while we were waiting */
+ WT_RET(__wt_fair_unlock(session, &parent->page_lock));
}
/*
@@ -930,7 +921,7 @@ __split_parent_lock(
*parentp = parent;
return (0);
-err: F_CLR_ATOMIC(parent, WT_PAGE_RECONCILIATION);
+err: WT_TRET(__wt_fair_unlock(session, &parent->page_lock));
return (ret);
}
@@ -946,7 +937,7 @@ __split_parent_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard)
if (hazard)
ret = __wt_hazard_clear(session, parent);
- F_CLR_ATOMIC(parent, WT_PAGE_RECONCILIATION);
+ WT_TRET(__wt_fair_unlock(session, &parent->page_lock));
return (ret);
}
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index 247bdef65c8..237d900c3d1 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -140,8 +140,7 @@ __sync_file(WT_SESSION_IMPL *session, int syncop)
*/
if (!WT_PAGE_IS_INTERNAL(page) &&
F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) &&
- WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn) &&
- mod->rec_result != WT_PM_REC_REWRITE) {
+ WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) {
__wt_page_modify_set(session, page);
continue;
}
diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c
index a964ac39874..2eb406c2af8 100644
--- a/src/cache/cache_las.c
+++ b/src/cache/cache_las.c
@@ -24,10 +24,9 @@ __wt_las_stats_update(WT_SESSION_IMPL *session)
/*
* Lookaside table statistics are copied from the underlying lookaside
* table data-source statistics. If there's no lookaside table, values
- * remain 0. In the current system, there's always a lookaside table,
- * but there's no reason not to be cautious.
+ * remain 0.
*/
- if (conn->las_cursor == NULL)
+ if (!F_ISSET(conn, WT_CONN_LAS_OPEN))
return;
/*
@@ -35,7 +34,8 @@ __wt_las_stats_update(WT_SESSION_IMPL *session)
* to it by way of the underlying btree handle, but it's a little ugly.
*/
cstats = conn->stats;
- dstats = ((WT_CURSOR_BTREE *)conn->las_cursor)->btree->dhandle->stats;
+ dstats = ((WT_CURSOR_BTREE *)
+ conn->las_session->las_cursor)->btree->dhandle->stats;
WT_STAT_SET(session, cstats,
cache_lookaside_insert, WT_STAT_READ(dstats, cursor_insert));
@@ -44,40 +44,6 @@ __wt_las_stats_update(WT_SESSION_IMPL *session)
}
/*
- * __las_cursor_create --
- * Open a new lookaside table cursor.
- */
-static int
-__las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp)
-{
- WT_BTREE *btree;
- const char *open_cursor_cfg[] = {
- WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL };
-
- WT_RET(__wt_open_cursor(
- session, WT_LAS_URI, NULL, open_cursor_cfg, cursorp));
-
- /*
- * Set special flags for the lookaside table: the lookaside flag (used,
- * for example, to avoid writing records during reconciliation), also
- * turn off checkpoints and logging.
- *
- * Test flags before setting them so updates can't race in subsequent
- * opens (the first update is safe because it's single-threaded from
- * wiredtiger_open).
- */
- btree = S2BT(session);
- if (!F_ISSET(btree, WT_BTREE_LOOKASIDE))
- F_SET(btree, WT_BTREE_LOOKASIDE);
- if (!F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
- F_SET(btree, WT_BTREE_NO_CHECKPOINT);
- if (!F_ISSET(btree, WT_BTREE_NO_LOGGING))
- F_SET(btree, WT_BTREE_NO_LOGGING);
-
- return (0);
-}
-
-/*
* __wt_las_create --
* Initialize the database's lookaside store.
*/
@@ -85,7 +51,7 @@ int
__wt_las_create(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
+ uint32_t session_flags;
const char *drop_cfg[] = {
WT_CONFIG_BASE(session, WT_SESSION_drop), "force=true", NULL };
@@ -93,30 +59,28 @@ __wt_las_create(WT_SESSION_IMPL *session)
/*
* Done at startup: we cannot do it on demand because we require the
- * schema lock to create and drop the file, and it may not always be
+ * schema lock to create and drop the table, and it may not always be
* available.
*
- * Open an internal session, used for the shared lookaside cursor.
- *
- * Sessions associated with a lookaside cursor should never be tapped
- * for eviction.
+ * Discard any previous incarnation of the table.
*/
- WT_RET(__wt_open_internal_session(
- conn, "lookaside table", true, true, &conn->las_session));
- session = conn->las_session;
- F_SET(session, WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION);
-
- /* Discard any previous incarnation of the file. */
WT_RET(__wt_session_drop(session, WT_LAS_URI, drop_cfg));
- /* Re-create the file. */
+ /* Re-create the table. */
WT_RET(__wt_session_create(session, WT_LAS_URI, WT_LAS_FORMAT));
- /* Open the shared cursor. */
- WT_WITHOUT_DHANDLE(session,
- ret = __las_cursor_create(session, &conn->las_cursor));
+ /*
+ * Open a shared internal session used to access the lookaside table.
+ * This session should never be tapped for eviction.
+ */
+ session_flags = WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION;
+ WT_RET(__wt_open_internal_session(
+ conn, "lookaside table", true, session_flags, &conn->las_session));
- return (ret);
+ /* Flag that the lookaside table has been created. */
+ F_SET(conn, WT_CONN_LAS_OPEN);
+
+ return (0);
}
/*
@@ -138,7 +102,6 @@ __wt_las_destroy(WT_SESSION_IMPL *session)
wt_session = &conn->las_session->iface;
ret = wt_session->close(wt_session, NULL);
- conn->las_cursor = NULL;
conn->las_session = NULL;
return (ret);
@@ -176,6 +139,40 @@ __wt_las_is_written(WT_SESSION_IMPL *session)
}
/*
+ * __wt_las_cursor_create --
+ * Open a new lookaside table cursor.
+ */
+int
+__wt_las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp)
+{
+ WT_BTREE *btree;
+ const char *open_cursor_cfg[] = {
+ WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL };
+
+ WT_RET(__wt_open_cursor(
+ session, WT_LAS_URI, NULL, open_cursor_cfg, cursorp));
+
+ /*
+ * Set special flags for the lookaside table: the lookaside flag (used,
+ * for example, to avoid writing records during reconciliation), also
+ * turn off checkpoints and logging.
+ *
+ * Test flags before setting them so updates can't race in subsequent
+ * opens (the first update is safe because it's single-threaded from
+ * wiredtiger_open).
+ */
+ btree = S2BT(session);
+ if (!F_ISSET(btree, WT_BTREE_LOOKASIDE))
+ F_SET(btree, WT_BTREE_LOOKASIDE);
+ if (!F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
+ F_SET(btree, WT_BTREE_NO_CHECKPOINT);
+ if (!F_ISSET(btree, WT_BTREE_NO_LOGGING))
+ F_SET(btree, WT_BTREE_NO_LOGGING);
+
+ return (0);
+}
+
+/*
* __wt_las_cursor --
* Return a lookaside cursor.
*/
@@ -184,7 +181,6 @@ __wt_las_cursor(
WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags)
{
WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
*cursorp = NULL;
@@ -202,20 +198,15 @@ __wt_las_cursor(
conn = S2C(session);
- /* Eviction and sweep threads have their own lookaside table cursors. */
- if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) {
- if (session->las_cursor == NULL) {
- WT_WITHOUT_DHANDLE(session, ret =
- __las_cursor_create(session, &session->las_cursor));
- WT_RET(ret);
- }
-
+ /*
+ * Some threads have their own lookaside table cursors, else lock the
+ * shared lookaside cursor.
+ */
+ if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR))
*cursorp = session->las_cursor;
- } else {
- /* Lock the shared lookaside cursor. */
+ else {
__wt_spin_lock(session, &conn->las_lock);
-
- *cursorp = conn->las_cursor;
+ *cursorp = conn->las_session->las_cursor;
}
/* Turn caching and eviction off. */
@@ -253,8 +244,8 @@ __wt_las_cursor_close(
F_SET(session, session_flags);
/*
- * Eviction and sweep threads have their own lookaside table cursors;
- * else, unlock the shared lookaside cursor.
+ * Some threads have their own lookaside table cursors, else unlock the
+ * shared lookaside cursor.
*/
if (!F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR))
__wt_spin_unlock(session, &conn->las_lock);
diff --git a/src/config/config.c b/src/config/config.c
index 27de6264a28..505b843aa86 100644
--- a/src/config/config.c
+++ b/src/config/config.c
@@ -745,11 +745,16 @@ __wt_config_gets_def(WT_SESSION_IMPL *session,
*value = false_value;
value->val = def;
+
if (cfg == NULL || cfg[0] == NULL || cfg[1] == NULL)
return (0);
- else if (cfg[2] == NULL)
+
+ if (cfg[2] == NULL) {
WT_RET_NOTFOUND_OK(
__wt_config_getones(session, cfg[1], key, value));
+ return (0);
+ }
+
return (__wt_config_gets(session, cfg, key, value));
}
diff --git a/src/config/config_def.c b/src/config/config_def.c
index a3dc24fafc4..419f4124133 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -66,6 +66,21 @@ static const WT_CONFIG_CHECK
};
static const WT_CONFIG_CHECK
+ confchk_wiredtiger_open_log_subconfigs[] = {
+ { "archive", "boolean", NULL, NULL, NULL, 0 },
+ { "compressor", "string", NULL, NULL, NULL, 0 },
+ { "enabled", "boolean", NULL, NULL, NULL, 0 },
+ { "file_max", "int", NULL, "min=100KB,max=2GB", NULL, 0 },
+ { "path", "string", NULL, NULL, NULL, 0 },
+ { "prealloc", "boolean", NULL, NULL, NULL, 0 },
+ { "recover", "string",
+ NULL, "choices=[\"error\",\"on\"]",
+ NULL, 0 },
+ { "zero_fill", "boolean", NULL, NULL, NULL, 0 },
+ { NULL, NULL, NULL, NULL, NULL, 0 }
+};
+
+static const WT_CONFIG_CHECK
confchk_wiredtiger_open_lsm_manager_subconfigs[] = {
{ "merge", "boolean", NULL, NULL, NULL, 0 },
{ "worker_thread_max", "int", NULL, "min=3,max=20", NULL, 0 },
@@ -116,6 +131,9 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = {
{ "file_manager", "category",
NULL, NULL,
confchk_wiredtiger_open_file_manager_subconfigs, 3 },
+ { "log", "category",
+ NULL, NULL,
+ confchk_wiredtiger_open_log_subconfigs, 8 },
{ "lsm_manager", "category",
NULL, NULL,
confchk_wiredtiger_open_lsm_manager_subconfigs, 2 },
@@ -453,20 +471,6 @@ static const WT_CONFIG_CHECK
};
static const WT_CONFIG_CHECK
- confchk_wiredtiger_open_log_subconfigs[] = {
- { "archive", "boolean", NULL, NULL, NULL, 0 },
- { "compressor", "string", NULL, NULL, NULL, 0 },
- { "enabled", "boolean", NULL, NULL, NULL, 0 },
- { "file_max", "int", NULL, "min=100KB,max=2GB", NULL, 0 },
- { "path", "string", NULL, NULL, NULL, 0 },
- { "prealloc", "boolean", NULL, NULL, NULL, 0 },
- { "recover", "string",
- NULL, "choices=[\"error\",\"on\"]",
- NULL, 0 },
- { NULL, NULL, NULL, NULL, NULL, 0 }
-};
-
-static const WT_CONFIG_CHECK
confchk_wiredtiger_open_transaction_sync_subconfigs[] = {
{ "enabled", "boolean", NULL, NULL, NULL, 0 },
{ "method", "string",
@@ -517,7 +521,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
{ "hazard_max", "int", NULL, "min=15", NULL, 0 },
{ "log", "category",
NULL, NULL,
- confchk_wiredtiger_open_log_subconfigs, 7 },
+ confchk_wiredtiger_open_log_subconfigs, 8 },
{ "lsm_manager", "category",
NULL, NULL,
confchk_wiredtiger_open_lsm_manager_subconfigs, 2 },
@@ -592,7 +596,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
{ "hazard_max", "int", NULL, "min=15", NULL, 0 },
{ "log", "category",
NULL, NULL,
- confchk_wiredtiger_open_log_subconfigs, 7 },
+ confchk_wiredtiger_open_log_subconfigs, 8 },
{ "lsm_manager", "category",
NULL, NULL,
confchk_wiredtiger_open_lsm_manager_subconfigs, 2 },
@@ -665,7 +669,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = {
{ "hazard_max", "int", NULL, "min=15", NULL, 0 },
{ "log", "category",
NULL, NULL,
- confchk_wiredtiger_open_log_subconfigs, 7 },
+ confchk_wiredtiger_open_log_subconfigs, 8 },
{ "lsm_manager", "category",
NULL, NULL,
confchk_wiredtiger_open_lsm_manager_subconfigs, 2 },
@@ -737,7 +741,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = {
{ "hazard_max", "int", NULL, "min=15", NULL, 0 },
{ "log", "category",
NULL, NULL,
- confchk_wiredtiger_open_log_subconfigs, 7 },
+ confchk_wiredtiger_open_log_subconfigs, 8 },
{ "lsm_manager", "category",
NULL, NULL,
confchk_wiredtiger_open_lsm_manager_subconfigs, 2 },
@@ -814,12 +818,14 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"eviction=(threads_max=1,threads_min=1),eviction_dirty_target=80,"
"eviction_dirty_trigger=95,eviction_target=80,eviction_trigger=95"
",file_manager=(close_handle_minimum=250,close_idle_time=30,"
- "close_scan_interval=10),lsm_manager=(merge=,worker_thread_max=4)"
- ",lsm_merge=,shared_cache=(chunk=10MB,name=,quota=0,reserve=0,"
- "size=500MB),statistics=none,statistics_log=(on_close=0,"
+ "close_scan_interval=10),log=(archive=,compressor=,enabled=0,"
+ "file_max=100MB,path=,prealloc=,recover=on,zero_fill=0),"
+ "lsm_manager=(merge=,worker_thread_max=4),lsm_merge=,"
+ "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB),"
+ "statistics=none,statistics_log=(on_close=0,"
"path=\"WiredTigerStat.%d.%H\",sources=,"
"timestamp=\"%b %d %H:%M:%S\",wait=0),verbose=",
- confchk_WT_CONNECTION_reconfigure, 17
+ confchk_WT_CONNECTION_reconfigure, 18
},
{ "WT_CURSOR.close",
"",
@@ -969,13 +975,14 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"file_extend=,file_manager=(close_handle_minimum=250,"
"close_idle_time=30,close_scan_interval=10),hazard_max=1000,"
"log=(archive=,compressor=,enabled=0,file_max=100MB,path=,"
- "prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4),"
- "lsm_merge=,mmap=,multiprocess=0,session_max=100,"
- "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0,"
- "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0"
- ",path=\"WiredTigerStat.%d.%H\",sources=,"
- "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
- ",method=fsync),use_environment_priv=0,verbose=",
+ "prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=,"
+ "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,"
+ "session_max=100,session_scratch_max=2MB,shared_cache=(chunk=10MB"
+ ",name=,quota=0,reserve=0,size=500MB),statistics=none,"
+ "statistics_log=(on_close=0,path=\"WiredTigerStat.%d.%H\","
+ "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
+ "transaction_sync=(enabled=0,method=fsync),use_environment_priv=0"
+ ",verbose=",
confchk_wiredtiger_open, 34
},
{ "wiredtiger_open_all",
@@ -989,14 +996,14 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"file_extend=,file_manager=(close_handle_minimum=250,"
"close_idle_time=30,close_scan_interval=10),hazard_max=1000,"
"log=(archive=,compressor=,enabled=0,file_max=100MB,path=,"
- "prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4),"
- "lsm_merge=,mmap=,multiprocess=0,session_max=100,"
- "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0,"
- "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0"
- ",path=\"WiredTigerStat.%d.%H\",sources=,"
- "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
- ",method=fsync),use_environment_priv=0,verbose=,version=(major=0,"
- "minor=0)",
+ "prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=,"
+ "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,"
+ "session_max=100,session_scratch_max=2MB,shared_cache=(chunk=10MB"
+ ",name=,quota=0,reserve=0,size=500MB),statistics=none,"
+ "statistics_log=(on_close=0,path=\"WiredTigerStat.%d.%H\","
+ "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
+ "transaction_sync=(enabled=0,method=fsync),use_environment_priv=0"
+ ",verbose=,version=(major=0,minor=0)",
confchk_wiredtiger_open_all, 35
},
{ "wiredtiger_open_basecfg",
@@ -1009,13 +1016,14 @@ static const WT_CONFIG_ENTRY config_entries[] = {
",extensions=,file_extend=,file_manager=(close_handle_minimum=250"
",close_idle_time=30,close_scan_interval=10),hazard_max=1000,"
"log=(archive=,compressor=,enabled=0,file_max=100MB,path=,"
- "prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4),"
- "lsm_merge=,mmap=,multiprocess=0,session_max=100,"
- "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0,"
- "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0"
- ",path=\"WiredTigerStat.%d.%H\",sources=,"
- "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
- ",method=fsync),verbose=,version=(major=0,minor=0)",
+ "prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=,"
+ "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,"
+ "session_max=100,session_scratch_max=2MB,shared_cache=(chunk=10MB"
+ ",name=,quota=0,reserve=0,size=500MB),statistics=none,"
+ "statistics_log=(on_close=0,path=\"WiredTigerStat.%d.%H\","
+ "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
+ "transaction_sync=(enabled=0,method=fsync),verbose=,"
+ "version=(major=0,minor=0)",
confchk_wiredtiger_open_basecfg, 31
},
{ "wiredtiger_open_usercfg",
@@ -1028,13 +1036,13 @@ static const WT_CONFIG_ENTRY config_entries[] = {
",extensions=,file_extend=,file_manager=(close_handle_minimum=250"
",close_idle_time=30,close_scan_interval=10),hazard_max=1000,"
"log=(archive=,compressor=,enabled=0,file_max=100MB,path=,"
- "prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4),"
- "lsm_merge=,mmap=,multiprocess=0,session_max=100,"
- "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0,"
- "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0"
- ",path=\"WiredTigerStat.%d.%H\",sources=,"
- "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
- ",method=fsync),verbose=",
+ "prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=,"
+ "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,"
+ "session_max=100,session_scratch_max=2MB,shared_cache=(chunk=10MB"
+ ",name=,quota=0,reserve=0,size=500MB),statistics=none,"
+ "statistics_log=(on_close=0,path=\"WiredTigerStat.%d.%H\","
+ "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
+ "transaction_sync=(enabled=0,method=fsync),verbose=",
confchk_wiredtiger_open_usercfg, 30
},
{ NULL, NULL, NULL, 0 }
diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c
index b5d0e8f2883..b50ad750158 100644
--- a/src/conn/conn_api.c
+++ b/src/conn/conn_api.c
@@ -1051,6 +1051,7 @@ __conn_reconfigure(WT_CONNECTION *wt_conn, const char *config)
WT_ERR(__wt_async_reconfig(session, cfg));
WT_ERR(__wt_cache_config(session, true, cfg));
WT_ERR(__wt_checkpoint_server_create(session, cfg));
+ WT_ERR(__wt_logmgr_reconfig(session, cfg));
WT_ERR(__wt_lsm_manager_reconfig(session, cfg));
WT_ERR(__wt_statlog_create(session, cfg));
WT_ERR(__wt_sweep_config(session, cfg));
@@ -2037,9 +2038,6 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
/* Start the worker threads and run recovery. */
WT_ERR(__wt_connection_workers(session, cfg));
- /* Create the lookaside table. */
- WT_ERR(__wt_las_create(session));
-
WT_STATIC_ASSERT(offsetof(WT_CONNECTION_IMPL, iface) == 0);
*wt_connp = &conn->iface;
diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c
index 6294e3b01a7..aa14e9aadde 100644
--- a/src/conn/conn_cache_pool.c
+++ b/src/conn/conn_cache_pool.c
@@ -243,6 +243,7 @@ __wt_conn_cache_pool_open(WT_SESSION_IMPL *session)
WT_CACHE_POOL *cp;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
+ uint32_t session_flags;
conn = S2C(session);
cache = conn->cache;
@@ -252,8 +253,9 @@ __wt_conn_cache_pool_open(WT_SESSION_IMPL *session)
* Create a session that can be used by the cache pool thread, do
* it in the main thread to avoid shutdown races
*/
+ session_flags = WT_SESSION_NO_DATA_HANDLES;
if ((ret = __wt_open_internal_session(
- conn, "cache-pool", false, false, &cache->cp_session)) != 0)
+ conn, "cache-pool", false, session_flags, &cache->cp_session)) != 0)
WT_RET_MSG(NULL, ret,
"Failed to create session for cache pool");
@@ -275,7 +277,7 @@ __wt_conn_cache_pool_open(WT_SESSION_IMPL *session)
* in each connection saves having a complex election process when
* the active connection shuts down.
*/
- F_SET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE);
+ F_SET(cp, WT_CACHE_POOL_ACTIVE);
F_SET(cache, WT_CACHE_POOL_RUN);
WT_RET(__wt_thread_create(session, &cache->cp_tid,
__wt_cache_pool_server, cache->cp_session));
@@ -366,10 +368,10 @@ __wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session)
if (--cp->refs == 0) {
WT_ASSERT(session, TAILQ_EMPTY(&cp->cache_pool_qh));
- F_CLR_ATOMIC(cp, WT_CACHE_POOL_ACTIVE);
+ F_CLR(cp, WT_CACHE_POOL_ACTIVE);
}
- if (!F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE)) {
+ if (!F_ISSET(cp, WT_CACHE_POOL_ACTIVE)) {
WT_TRET(__wt_verbose(
session, WT_VERB_SHARED_CACHE, "Destroying cache pool"));
__wt_spin_lock(session, &__wt_process.spinlock);
@@ -398,7 +400,7 @@ __wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session)
/* Notify other participants if we were managing */
if (F_ISSET(cache, WT_CACHE_POOL_MANAGER)) {
- F_CLR_ATOMIC(cp, WT_CACHE_POOL_MANAGED);
+ cp->pool_managed = 0;
WT_TRET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
"Shutting down shared cache manager connection"));
}
@@ -438,7 +440,7 @@ __cache_pool_balance(WT_SESSION_IMPL *session, bool forward)
* - Reduce the amount allocated, if we are over the budget
* - Increase the amount used if there is capacity and any pressure.
*/
- while (F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) &&
+ while (F_ISSET(cp, WT_CACHE_POOL_ACTIVE) &&
F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN)) {
WT_ERR(__cache_pool_adjust(
session, highest, bump_threshold, forward, &adjusted));
@@ -728,7 +730,7 @@ __wt_cache_pool_server(void *arg)
cache = S2C(session)->cache;
forward = true;
- while (F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) &&
+ while (F_ISSET(cp, WT_CACHE_POOL_ACTIVE) &&
F_ISSET(cache, WT_CACHE_POOL_RUN)) {
if (cp->currently_used <= cp->size)
WT_ERR(__wt_cond_wait(session,
@@ -738,13 +740,12 @@ __wt_cache_pool_server(void *arg)
* Re-check pool run flag - since we want to avoid getting the
* lock on shutdown.
*/
- if (!F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) &&
+ if (!F_ISSET(cp, WT_CACHE_POOL_ACTIVE) &&
F_ISSET(cache, WT_CACHE_POOL_RUN))
break;
/* Try to become the managing thread */
- F_CAS_ATOMIC(cp, WT_CACHE_POOL_MANAGED, ret);
- if (ret == 0) {
+ if (__wt_atomic_cas8(&cp->pool_managed, 0, 1)) {
F_SET(cache, WT_CACHE_POOL_MANAGER);
WT_ERR(__wt_verbose(session, WT_VERB_SHARED_CACHE,
"Cache pool switched manager thread"));
diff --git a/src/conn/conn_ckpt.c b/src/conn/conn_ckpt.c
index 7fc790d5efa..caf0c3b68f0 100644
--- a/src/conn/conn_ckpt.c
+++ b/src/conn/conn_ckpt.c
@@ -123,22 +123,24 @@ static int
__ckpt_server_start(WT_CONNECTION_IMPL *conn)
{
WT_SESSION_IMPL *session;
+ uint32_t session_flags;
/* Nothing to do if the server is already running. */
if (conn->ckpt_session != NULL)
return (0);
F_SET(conn, WT_CONN_SERVER_CHECKPOINT);
- /* The checkpoint server gets its own session. */
- WT_RET(__wt_open_internal_session(
- conn, "checkpoint-server", true, true, &conn->ckpt_session));
- session = conn->ckpt_session;
/*
+ * The checkpoint server gets its own session.
+ *
* Checkpoint does enough I/O it may be called upon to perform slow
* operations for the block manager.
*/
- F_SET(session, WT_SESSION_CAN_WAIT);
+ session_flags = WT_SESSION_CAN_WAIT;
+ WT_RET(__wt_open_internal_session(conn,
+ "checkpoint-server", true, session_flags, &conn->ckpt_session));
+ session = conn->ckpt_session;
WT_RET(__wt_cond_alloc(
session, "checkpoint server", false, &conn->ckpt_cond));
diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c
index 77e7693042b..0b364b5fd4b 100644
--- a/src/conn/conn_dhandle.c
+++ b/src/conn/conn_dhandle.c
@@ -678,11 +678,15 @@ __wt_conn_dhandle_discard(WT_SESSION_IMPL *session)
conn = S2C(session);
/*
- * Close open data handles: first, everything but the metadata file
- * (as closing a normal file may open and write the metadata file),
- * then the metadata file. This function isn't called often, and I
- * don't want to "know" anything about the metadata file's position on
- * the list, so we do it the hard way.
+ * Empty the session cache: any data handles created in a connection
+ * method may be cached here, and we're about to close them.
+ */
+ __wt_session_close_cache(session);
+
+ /*
+ * Close open data handles: first, everything but the metadata file (as
+ * closing a normal file may open and write the metadata file), then
+ * the metadata file.
*/
restart:
TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c
index eba0a2769d6..9068e7e85a2 100644
--- a/src/conn/conn_log.c
+++ b/src/conn/conn_log.c
@@ -42,7 +42,8 @@ __logmgr_sync_cfg(WT_SESSION_IMPL *session, const char **cfg)
* Parse and setup the logging server options.
*/
static int
-__logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp)
+__logmgr_config(
+ WT_SESSION_IMPL *session, const char **cfg, bool *runp, bool reconfig)
{
WT_CONFIG_ITEM cval;
WT_CONNECTION_IMPL *conn;
@@ -50,22 +51,37 @@ __logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp)
conn = S2C(session);
/*
- * The logging configuration is off by default.
+ * If we're reconfiguring, enabled must match the already
+ * existing setting.
+ *
+ * If it is off and the user is turning it on, or it is on
+ * and the user is turning it off, return an error.
*/
WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval));
+ if (reconfig &&
+ ((cval.val != 0 &&
+ !FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) ||
+ (cval.val == 0 &&
+ FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))))
+ return (EINVAL);
*runp = cval.val != 0;
/*
- * Setup a log path, compression and encryption even if logging is
- * disabled in case we are going to print a log.
+ * Setup a log path and compression even if logging is disabled in case
+ * we are going to print a log. Only do this on creation. Once a
+ * compressor or log path are set they cannot be changed.
*/
- conn->log_compressor = NULL;
- WT_RET(__wt_config_gets_none(session, cfg, "log.compressor", &cval));
- WT_RET(__wt_compressor_config(session, &cval, &conn->log_compressor));
-
- WT_RET(__wt_config_gets(session, cfg, "log.path", &cval));
- WT_RET(__wt_strndup(session, cval.str, cval.len, &conn->log_path));
-
+ if (!reconfig) {
+ conn->log_compressor = NULL;
+ WT_RET(__wt_config_gets_none(
+ session, cfg, "log.compressor", &cval));
+ WT_RET(__wt_compressor_config(
+ session, &cval, &conn->log_compressor));
+
+ WT_RET(__wt_config_gets(session, cfg, "log.path", &cval));
+ WT_RET(__wt_strndup(
+ session, cval.str, cval.len, &conn->log_path));
+ }
/* We are done if logging isn't enabled. */
if (!*runp)
return (0);
@@ -74,28 +90,56 @@ __logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp)
if (cval.val != 0)
FLD_SET(conn->log_flags, WT_CONN_LOG_ARCHIVE);
- WT_RET(__wt_config_gets(session, cfg, "log.file_max", &cval));
- conn->log_file_max = (wt_off_t)cval.val;
- WT_STAT_FAST_CONN_SET(session, log_max_filesize, conn->log_file_max);
+ if (!reconfig) {
+ /*
+ * Ignore if the user tries to change the file size. The
+ * amount of memory allocated to the log slots may be based
+ * on the log file size at creation and we don't want to
+ * re-allocate that memory while running.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "log.file_max", &cval));
+ conn->log_file_max = (wt_off_t)cval.val;
+ WT_STAT_FAST_CONN_SET(session,
+ log_max_filesize, conn->log_file_max);
+ }
- WT_RET(__wt_config_gets(session, cfg, "log.prealloc", &cval));
/*
- * If pre-allocation is configured, set the initial number to one.
+ * If pre-allocation is configured, set the initial number to a few.
* We'll adapt as load dictates.
*/
- if (cval.val != 0) {
- FLD_SET(conn->log_flags, WT_CONN_LOG_PREALLOC);
+ WT_RET(__wt_config_gets(session, cfg, "log.prealloc", &cval));
+ if (cval.val != 0)
conn->log_prealloc = 1;
- }
+
+ /*
+ * Note that it is meaningless to reconfigure this value during
+ * runtime. It only matters on create before recovery runs.
+ */
WT_RET(__wt_config_gets_def(session, cfg, "log.recover", 0, &cval));
if (cval.len != 0 && WT_STRING_MATCH("error", cval.str, cval.len))
FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR);
+ WT_RET(__wt_config_gets(session, cfg, "log.zero_fill", &cval));
+ if (cval.val != 0)
+ FLD_SET(conn->log_flags, WT_CONN_LOG_ZERO_FILL);
+
WT_RET(__logmgr_sync_cfg(session, cfg));
return (0);
}
/*
+ * __wt_logmgr_reconfig --
+ * Reconfigure logging.
+ */
+int
+__wt_logmgr_reconfig(WT_SESSION_IMPL *session, const char **cfg)
+{
+ bool dummy;
+
+ return (__logmgr_config(session, cfg, &dummy, true));
+}
+
+/*
* __log_archive_once --
* Perform one iteration of log archiving. Must be called with the
* log archive lock held.
@@ -216,7 +260,7 @@ __log_prealloc_once(WT_SESSION_IMPL *session)
*/
for (i = reccount; i < (u_int)conn->log_prealloc; i++) {
WT_ERR(__wt_log_allocfile(
- session, ++log->prep_fileid, WT_LOG_PREPNAME, true));
+ session, ++log->prep_fileid, WT_LOG_PREPNAME));
WT_STAT_FAST_CONN_INCR(session, log_prealloc_files);
}
/*
@@ -722,7 +766,7 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[])
conn = S2C(session);
/* Handle configuration. */
- WT_RET(__logmgr_config(session, cfg, &run));
+ WT_RET(__logmgr_config(session, cfg, &run, false));
/* If logging is not configured, we're done. */
if (!run)
@@ -777,6 +821,7 @@ int
__wt_logmgr_open(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
+ uint32_t session_flags;
conn = S2C(session);
@@ -788,8 +833,9 @@ __wt_logmgr_open(WT_SESSION_IMPL *session)
* Start the log close thread. It is not configurable.
* If logging is enabled, this thread runs.
*/
- WT_RET(__wt_open_internal_session(
- conn, "log-close-server", false, false, &conn->log_file_session));
+ session_flags = WT_SESSION_NO_DATA_HANDLES;
+ WT_RET(__wt_open_internal_session(conn,
+ "log-close-server", false, session_flags, &conn->log_file_session));
WT_RET(__wt_cond_alloc(conn->log_file_session,
"log close server", false, &conn->log_file_cond));
@@ -804,19 +850,14 @@ __wt_logmgr_open(WT_SESSION_IMPL *session)
* Start the log write LSN thread. It is not configurable.
* If logging is enabled, this thread runs.
*/
- WT_RET(__wt_open_internal_session(
- conn, "log-wrlsn-server", false, false, &conn->log_wrlsn_session));
+ WT_RET(__wt_open_internal_session(conn, "log-wrlsn-server",
+ false, session_flags, &conn->log_wrlsn_session));
WT_RET(__wt_cond_alloc(conn->log_wrlsn_session,
"log write lsn server", false, &conn->log_wrlsn_cond));
WT_RET(__wt_thread_create(conn->log_wrlsn_session,
&conn->log_wrlsn_tid, __log_wrlsn_server, conn->log_wrlsn_session));
conn->log_wrlsn_tid_set = true;
- /* If no log thread services are configured, we're done. */
- if (!FLD_ISSET(conn->log_flags,
- (WT_CONN_LOG_ARCHIVE | WT_CONN_LOG_PREALLOC)))
- return (0);
-
/*
* If a log server thread exists, the user may have reconfigured
* archiving or pre-allocation. Signal the thread. Otherwise the
@@ -829,8 +870,8 @@ __wt_logmgr_open(WT_SESSION_IMPL *session)
WT_RET(__wt_cond_signal(session, conn->log_cond));
} else {
/* The log server gets its own session. */
- WT_RET(__wt_open_internal_session(
- conn, "log-server", false, false, &conn->log_session));
+ WT_RET(__wt_open_internal_session(conn,
+ "log-server", false, session_flags, &conn->log_session));
WT_RET(__wt_cond_alloc(conn->log_session,
"log server", false, &conn->log_cond));
diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c
index 199cf213e0a..04815c8e152 100644
--- a/src/conn/conn_open.c
+++ b/src/conn/conn_open.c
@@ -38,7 +38,7 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[])
* need to get cleaned up on close.
*/
WT_RET(__wt_open_internal_session(
- conn, "connection", true, false, &session));
+ conn, "connection", false, 0, &session));
/*
* The connection's default session is originally a static structure,
@@ -228,33 +228,45 @@ int
__wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[])
{
/*
- * Start the eviction thread.
- */
- WT_RET(__wt_evict_create(session));
-
- /*
* Start the optional statistics thread. Start statistics first so that
* other optional threads can know if statistics are enabled or not.
*/
WT_RET(__wt_statlog_create(session, cfg));
WT_RET(__wt_logmgr_create(session, cfg));
- /* Run recovery. */
+ /*
+ * Run recovery.
+ * NOTE: This call will start (and stop) eviction if recovery is
+ * required. Recovery must run before the lookaside table is created
+ * (because recovery will update the metadata), and before eviction is
+ * started for real.
+ */
WT_RET(__wt_txn_recover(session));
+ /*
+ * Start the optional logging/archive threads.
+ * NOTE: The log manager must be started before checkpoints so that the
+ * checkpoint server knows if logging is enabled. It must also be
+ * started before any operation that can commit, or the commit can
+ * block.
+ */
+ WT_RET(__wt_logmgr_open(session));
+
+ /* Create the lookaside table. */
+ WT_RET(__wt_las_create(session));
+
+ /*
+ * Start eviction threads.
+ * NOTE: Eviction must be started after the lookaside table is created.
+ */
+ WT_RET(__wt_evict_create(session));
+
/* Start the handle sweep thread. */
WT_RET(__wt_sweep_create(session));
/* Start the optional async threads. */
WT_RET(__wt_async_create(session, cfg));
- /*
- * Start the optional logging/archive thread.
- * NOTE: The log manager must be started before checkpoints so that the
- * checkpoint server knows if logging is enabled.
- */
- WT_RET(__wt_logmgr_open(session));
-
/* Start the optional checkpoint thread. */
WT_RET(__wt_checkpoint_server_create(session, cfg));
diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c
index d8c7227ae61..ec3a630581a 100644
--- a/src/conn/conn_stat.c
+++ b/src/conn/conn_stat.c
@@ -447,9 +447,10 @@ __statlog_start(WT_CONNECTION_IMPL *conn)
return (0);
F_SET(conn, WT_CONN_SERVER_STATISTICS);
+
/* The statistics log server gets its own session. */
WT_RET(__wt_open_internal_session(
- conn, "statlog-server", true, true, &conn->stat_session));
+ conn, "statlog-server", true, 0, &conn->stat_session));
session = conn->stat_session;
WT_RET(__wt_cond_alloc(
diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c
index 2de0cc12069..23846f978fe 100644
--- a/src/conn/conn_sweep.c
+++ b/src/conn/conn_sweep.c
@@ -353,16 +353,13 @@ int
__wt_sweep_create(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
+ uint32_t session_flags;
conn = S2C(session);
/* Set first, the thread might run before we finish up. */
F_SET(conn, WT_CONN_SERVER_SWEEP);
- WT_RET(__wt_open_internal_session(
- conn, "sweep-server", true, true, &conn->sweep_session));
- session = conn->sweep_session;
-
/*
* Handle sweep does enough I/O it may be called upon to perform slow
* operations for the block manager.
@@ -372,8 +369,11 @@ __wt_sweep_create(WT_SESSION_IMPL *session)
*
* Don't tap the sweep thread for eviction.
*/
- F_SET(session, WT_SESSION_CAN_WAIT |
- WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION);
+ session_flags = WT_SESSION_CAN_WAIT |
+ WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION;
+ WT_RET(__wt_open_internal_session(
+ conn, "sweep-server", true, session_flags, &conn->sweep_session));
+ session = conn->sweep_session;
WT_RET(__wt_cond_alloc(
session, "handle sweep server", false, &conn->sweep_cond));
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index 3c00ee30896..f9171900ca4 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -236,27 +236,35 @@ __evict_workers_resize(WT_SESSION_IMPL *session)
WT_DECL_RET;
WT_EVICT_WORKER *workers;
size_t alloc;
- uint32_t i;
+ uint32_t i, session_flags;
conn = S2C(session);
- alloc = conn->evict_workers_alloc * sizeof(*workers);
- WT_RET(__wt_realloc(session, &alloc,
- conn->evict_workers_max * sizeof(*workers), &conn->evict_workctx));
- workers = conn->evict_workctx;
+ if (conn->evict_workers_alloc < conn->evict_workers_max) {
+ alloc = conn->evict_workers_alloc * sizeof(*workers);
+ WT_RET(__wt_realloc(session, &alloc,
+ conn->evict_workers_max * sizeof(*workers),
+ &conn->evict_workctx));
+ workers = conn->evict_workctx;
+ }
for (i = conn->evict_workers_alloc; i < conn->evict_workers_max; i++) {
- WT_ERR(__wt_open_internal_session(conn,
- "eviction-worker", true, false, &workers[i].session));
- workers[i].id = i;
-
/*
- * Eviction worker threads get their own lookaside table cursor.
+ * Eviction worker threads get their own session.
* Eviction worker threads may be called upon to perform slow
* operations for the block manager.
+ *
+ * Eviction worker threads get their own lookaside table cursor
+ * if the lookaside table is open. Note that eviction is also
+ * started during recovery, before the lookaside table is
+ * created.
*/
- F_SET(workers[i].session,
- WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_CAN_WAIT);
+ session_flags = WT_SESSION_CAN_WAIT;
+ if (F_ISSET(conn, WT_CONN_LAS_OPEN))
+ FLD_SET(session_flags, WT_SESSION_LOOKASIDE_CURSOR);
+ WT_ERR(__wt_open_internal_session(conn, "eviction-worker",
+ false, session_flags, &workers[i].session));
+ workers[i].id = i;
if (i < conn->evict_workers_min) {
++conn->evict_workers;
@@ -278,33 +286,37 @@ int
__wt_evict_create(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
+ uint32_t session_flags;
conn = S2C(session);
/* Set first, the thread might run before we finish up. */
F_SET(conn, WT_CONN_EVICTION_RUN);
- /* We need a session handle because we're reading/writing pages. */
- WT_RET(__wt_open_internal_session(
- conn, "eviction-server", true, false, &conn->evict_session));
+ /*
+ * We need a session handle because we're reading/writing pages.
+ *
+ * The eviction server gets its own lookaside table cursor.
+ *
+ * If there's only a single eviction thread, it may be called upon to
+ * perform slow operations for the block manager. (The flag is not
+ * reset if reconfigured later, but I doubt that's a problem.)
+ */
+ session_flags = F_ISSET(conn, WT_CONN_LAS_OPEN) ?
+ WT_SESSION_LOOKASIDE_CURSOR : 0;
+ if (conn->evict_workers_max == 0)
+ FLD_SET(session_flags, WT_SESSION_CAN_WAIT);
+ WT_RET(__wt_open_internal_session(conn,
+ "eviction-server", false, session_flags, &conn->evict_session));
session = conn->evict_session;
/*
* If eviction workers were configured, allocate sessions for them now.
* This is done to reduce the chance that we will open new eviction
* sessions after WT_CONNECTION::close is called.
- *
- * If there's only a single eviction thread, it may be called upon to
- * perform slow operations for the block manager. (The flag is not
- * reset if reconfigured later, but I doubt that's a problem.)
*/
if (conn->evict_workers_max > 0)
WT_RET(__evict_workers_resize(session));
- else
- F_SET(session, WT_SESSION_CAN_WAIT);
-
- /* The eviction server gets its own lookaside table cursor. */
- F_SET(session, WT_SESSION_LOOKASIDE_CURSOR);
/*
* Start the primary eviction server thread after the worker threads
@@ -358,6 +370,8 @@ __wt_evict_destroy(WT_SESSION_IMPL *session)
WT_TRET(__wt_cond_signal(session, cache->evict_waiter_cond));
WT_TRET(__wt_thread_join(session, workers[i].tid));
}
+ conn->evict_workers = 0;
+
/* Handle shutdown when cleaning up after a failed open. */
if (conn->evict_workctx != NULL) {
for (i = 0; i < conn->evict_workers_alloc; i++) {
@@ -367,6 +381,7 @@ __wt_evict_destroy(WT_SESSION_IMPL *session)
}
__wt_free(session, conn->evict_workctx);
}
+ conn->evict_workers_alloc = 0;
if (conn->evict_session != NULL) {
wt_session = &conn->evict_session->iface;
@@ -1457,15 +1472,12 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full)
WT_DECL_RET;
WT_TXN_GLOBAL *txn_global;
WT_TXN_STATE *txn_state;
- int count;
- bool q_found, txn_busy;
+ uint64_t init_evict_count, max_pages_evicted;
+ bool txn_busy;
conn = S2C(session);
cache = conn->cache;
- /* First, wake the eviction server. */
- WT_RET(__wt_evict_server_wake(session));
-
/*
* If the current transaction is keeping the oldest ID pinned, it is in
* the middle of an operation. This may prevent the oldest ID from
@@ -1479,11 +1491,15 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full)
session->nhazard > 0 ||
(txn_state->snap_min != WT_TXN_NONE &&
txn_global->current != txn_global->oldest_id);
- if (txn_busy) {
- if (pct_full < 100)
- return (0);
- busy = true;
- }
+
+ if (txn_busy && pct_full < 100)
+ return (0);
+
+ if (busy == 1)
+ txn_busy = 1;
+
+ /* Wake the eviction server if we need to do work. */
+ WT_RET(__wt_evict_server_wake(session));
/*
* If we're busy, either because of the transaction check we just did,
@@ -1491,9 +1507,11 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full)
* as a page read), limit the work to a single eviction and return. If
* that's not the case, we can do more.
*/
- count = busy ? 1 : 10;
+ init_evict_count = cache->pages_evict;
for (;;) {
+ max_pages_evicted = txn_busy ? 5 : 20;
+
/*
* A pathological case: if we're the oldest transaction in the
* system and the eviction server is stuck trying to find space,
@@ -1507,43 +1525,34 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full)
return (WT_ROLLBACK);
}
+ /* See if eviction is still needed. */
+ if (!__wt_eviction_needed(session, NULL) ||
+ cache->pages_evict > init_evict_count + max_pages_evicted)
+ return (0);
+
/* Evict a page. */
- q_found = false;
switch (ret = __evict_page(session, false)) {
case 0:
cache->app_evicts++;
- if (--count == 0)
+ if (txn_busy)
return (0);
-
- q_found = true;
- break;
+ /* FALLTHROUGH */
case EBUSY:
- continue;
+ break;
case WT_NOTFOUND:
+ /* Allow the queue to re-populate before retrying. */
+ WT_RET(__wt_cond_wait(
+ session, cache->evict_waiter_cond, 100000));
+ cache->app_waits++;
break;
default:
return (ret);
}
- /* See if eviction is still needed. */
- if (!__wt_eviction_needed(session, NULL))
- return (0);
-
- /* If we found pages in the eviction queue, continue there. */
- if (q_found)
- continue;
-
- /* Wait for the queue to re-populate before trying again. */
- WT_RET(
- __wt_cond_wait(session, cache->evict_waiter_cond, 100000));
-
- cache->app_waits++;
- /* Check if things have changed so that we are busy. */
- if (!busy && txn_state->snap_min != WT_TXN_NONE &&
- txn_global->current != txn_global->oldest_id) {
- busy = true;
- count = 1;
- }
+ /* Check if we have become busy. */
+ if (!txn_busy && txn_state->snap_min != WT_TXN_NONE &&
+ txn_global->current != txn_global->oldest_id)
+ txn_busy = true;
}
/* NOTREACHED */
}
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 0302533bb04..41b2c98f9e8 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -578,8 +578,7 @@ struct __wt_page {
#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */
#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
#define WT_PAGE_OVERFLOW_KEYS 0x10 /* Page has overflow keys */
-#define WT_PAGE_RECONCILIATION 0x20 /* Page reconciliation lock */
-#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */
+#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */
uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
/*
@@ -603,6 +602,12 @@ struct __wt_page {
#define WT_READGEN_STEP 100
uint64_t read_gen;
+ /*
+ * Used to protect and co-ordinate splits for internal pages and
+ * reconciliation for all pages.
+ */
+ WT_FAIR_LOCK page_lock;
+
size_t memory_footprint; /* Memory attached to the page */
/* Page's on-disk representation: NULL for pages created in memory. */
diff --git a/src/include/btree.i b/src/include/btree.i
index c7466019e14..14b5303cca9 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -977,7 +977,8 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
WT_BTREE *btree;
WT_INSERT_HEAD *ins_head;
WT_INSERT *ins;
- int i;
+ size_t size;
+ int count;
btree = S2BT(session);
@@ -1007,25 +1008,36 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
return (false);
/*
- * There is no point splitting if the list is small, no deep items is
- * our heuristic for that. A 1/4 probability of adding a new skiplist
- * level, with level-0 always created, means there will be a 5th level
- * entry for roughly every 1024 entries in the list. If there are at
- * least 4 5th level entries (4K items), the list is large enough.
+ * There is no point doing an in-memory split unless there is a lot of
+ * data in the last skiplist on the page. Split if there are enough
+ * items and the skiplist does not fit within a single disk page.
+ *
+ * Rather than scanning the whole list, walk a higher level, which
+ * gives a sample of the items -- at level 0 we have all the items, at
+ * level 1 we have 1/4 and at level 2 we have 1/16th. If we see more
+ * than 30 items and more data than would fit in a disk page, split.
*/
-#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(5, WT_SKIP_MAXDEPTH - 1)
+#define WT_MIN_SPLIT_DEPTH 2
+#define WT_MIN_SPLIT_COUNT 30
+#define WT_MIN_SPLIT_MULTIPLIER 16 /* At level 2, we see 1/16th entries */
+
ins_head = page->pg_row_entries == 0 ?
WT_ROW_INSERT_SMALLEST(page) :
WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
if (ins_head == NULL)
return (false);
- for (i = 0, ins = ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH];
- ins != NULL; ins = ins->next[WT_MIN_SPLIT_SKIPLIST_DEPTH])
- if (++i == 4) {
+ for (count = 0, size = 0, ins = ins_head->head[WT_MIN_SPLIT_DEPTH];
+ ins != NULL; ins = ins->next[WT_MIN_SPLIT_DEPTH]) {
+ count += WT_MIN_SPLIT_MULTIPLIER;
+ size += WT_MIN_SPLIT_MULTIPLIER *
+ (WT_INSERT_KEY_SIZE(ins) + WT_UPDATE_MEMSIZE(ins->upd));
+ if (count > WT_MIN_SPLIT_COUNT &&
+ size > (size_t)btree->maxleafpage) {
WT_STAT_FAST_CONN_INCR(session, cache_inmem_splittable);
WT_STAT_FAST_DATA_INCR(session, cache_inmem_splittable);
return (true);
}
+ }
return (false);
}
diff --git a/src/include/cache.h b/src/include/cache.h
index f199372ea5e..caf8996e68b 100644
--- a/src/include/cache.h
+++ b/src/include/cache.h
@@ -151,7 +151,8 @@ struct __wt_cache_pool {
/* Locked: List of connections participating in the cache pool. */
TAILQ_HEAD(__wt_cache_pool_qh, __wt_connection_impl) cache_pool_qh;
-#define WT_CACHE_POOL_MANAGED 0x01 /* Cache pool has a manager thread */
-#define WT_CACHE_POOL_ACTIVE 0x02 /* Cache pool is active */
- uint8_t flags_atomic;
+ uint8_t pool_managed; /* Cache pool has a manager thread */
+
+#define WT_CACHE_POOL_ACTIVE 0x01 /* Cache pool is active */
+ uint8_t flags;
};
diff --git a/src/include/connection.h b/src/include/connection.h
index 2c20c2f7936..2dfb24a83da 100644
--- a/src/include/connection.h
+++ b/src/include/connection.h
@@ -336,12 +336,12 @@ struct __wt_connection_impl {
const char *stat_stamp; /* Statistics log entry timestamp */
uint64_t stat_usecs; /* Statistics log period */
-#define WT_CONN_LOG_ARCHIVE 0x01 /* Archive is enabled */
-#define WT_CONN_LOG_ENABLED 0x02 /* Logging is enabled */
-#define WT_CONN_LOG_EXISTED 0x04 /* Log files found */
-#define WT_CONN_LOG_PREALLOC 0x08 /* Pre-allocation is enabled */
-#define WT_CONN_LOG_RECOVER_DONE 0x10 /* Recovery completed */
-#define WT_CONN_LOG_RECOVER_ERR 0x20 /* Error if recovery required */
+#define WT_CONN_LOG_ARCHIVE 0x01 /* Archive is enabled */
+#define WT_CONN_LOG_ENABLED 0x02 /* Logging is enabled */
+#define WT_CONN_LOG_EXISTED 0x04 /* Log files found */
+#define WT_CONN_LOG_RECOVER_DONE 0x08 /* Recovery completed */
+#define WT_CONN_LOG_RECOVER_ERR 0x10 /* Error if recovery required */
+#define WT_CONN_LOG_ZERO_FILL 0x20 /* Manually zero files */
uint32_t log_flags; /* Global logging configuration */
WT_CONDVAR *log_cond; /* Log server wait mutex */
WT_SESSION_IMPL *log_session; /* Log server session */
@@ -377,7 +377,6 @@ struct __wt_connection_impl {
*/
WT_SPINLOCK las_lock; /* Lookaside table spinlock */
WT_SESSION_IMPL *las_session; /* Lookaside table session */
- WT_CURSOR *las_cursor; /* Lookaside table cursor */
bool las_written; /* Lookaside table has been written */
WT_ITEM las_sweep_key; /* Sweep server's saved key */
diff --git a/src/include/extern.h b/src/include/extern.h
index cfc1dc8f26e..1f63f07646e 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -186,6 +186,7 @@ extern int __wt_las_create(WT_SESSION_IMPL *session);
extern int __wt_las_destroy(WT_SESSION_IMPL *session);
extern void __wt_las_set_written(WT_SESSION_IMPL *session);
extern bool __wt_las_is_written(WT_SESSION_IMPL *session);
+extern int __wt_las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp);
extern int __wt_las_cursor( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags);
extern int __wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags);
extern int __wt_las_sweep(WT_SESSION_IMPL *session);
@@ -246,6 +247,7 @@ extern int __wt_conn_dhandle_discard_single( WT_SESSION_IMPL *session, bool fina
extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session);
extern int __wt_connection_init(WT_CONNECTION_IMPL *conn);
extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn);
+extern int __wt_logmgr_reconfig(WT_SESSION_IMPL *session, const char **cfg);
extern int __wt_log_truncate_files( WT_SESSION_IMPL *session, WT_CURSOR *cursor, const char *cfg[]);
extern int __wt_log_wrlsn(WT_SESSION_IMPL *session);
extern int __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]);
@@ -335,7 +337,7 @@ extern int __wt_log_get_all_files(WT_SESSION_IMPL *session, char ***filesp, u_in
extern void __wt_log_files_free(WT_SESSION_IMPL *session, char **files, u_int count);
extern int __wt_log_extract_lognum( WT_SESSION_IMPL *session, const char *name, uint32_t *id);
extern int __wt_log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot);
-extern int __wt_log_allocfile( WT_SESSION_IMPL *session, uint32_t lognum, const char *dest, bool prealloc);
+extern int __wt_log_allocfile( WT_SESSION_IMPL *session, uint32_t lognum, const char *dest);
extern int __wt_log_remove(WT_SESSION_IMPL *session, const char *file_prefix, uint32_t lognum);
extern int __wt_log_open(WT_SESSION_IMPL *session);
extern int __wt_log_close(WT_SESSION_IMPL *session);
@@ -592,8 +594,8 @@ extern int __wt_session_copy_values(WT_SESSION_IMPL *session);
extern int __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
extern int __wt_session_create( WT_SESSION_IMPL *session, const char *uri, const char *config);
extern int __wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]);
-extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, bool uses_dhandles, bool open_metadata, WT_SESSION_IMPL **sessionp);
extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, bool open_metadata, WT_SESSION_IMPL **sessionp);
+extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, bool open_metadata, uint32_t session_flags, WT_SESSION_IMPL **sessionp);
extern int __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, bool *skipp);
extern int __wt_session_compact( WT_SESSION *wt_session, const char *uri, const char *config);
extern int __wt_session_lock_dhandle( WT_SESSION_IMPL *session, uint32_t flags, bool *is_deadp);
diff --git a/src/include/flags.h b/src/include/flags.h
index ca3c3c38245..24dccd30913 100644
--- a/src/include/flags.h
+++ b/src/include/flags.h
@@ -6,17 +6,18 @@
#define WT_CONN_CKPT_SYNC 0x00000002
#define WT_CONN_CLOSING 0x00000004
#define WT_CONN_EVICTION_RUN 0x00000008
-#define WT_CONN_LEAK_MEMORY 0x00000010
-#define WT_CONN_LOG_SERVER_RUN 0x00000020
-#define WT_CONN_LSM_MERGE 0x00000040
-#define WT_CONN_PANIC 0x00000080
-#define WT_CONN_SERVER_ASYNC 0x00000100
-#define WT_CONN_SERVER_CHECKPOINT 0x00000200
-#define WT_CONN_SERVER_LSM 0x00000400
-#define WT_CONN_SERVER_RUN 0x00000800
-#define WT_CONN_SERVER_STATISTICS 0x00001000
-#define WT_CONN_SERVER_SWEEP 0x00002000
-#define WT_CONN_WAS_BACKUP 0x00004000
+#define WT_CONN_LAS_OPEN 0x00000010
+#define WT_CONN_LEAK_MEMORY 0x00000020
+#define WT_CONN_LOG_SERVER_RUN 0x00000040
+#define WT_CONN_LSM_MERGE 0x00000080
+#define WT_CONN_PANIC 0x00000100
+#define WT_CONN_SERVER_ASYNC 0x00000200
+#define WT_CONN_SERVER_CHECKPOINT 0x00000400
+#define WT_CONN_SERVER_LSM 0x00000800
+#define WT_CONN_SERVER_RUN 0x00001000
+#define WT_CONN_SERVER_STATISTICS 0x00002000
+#define WT_CONN_SERVER_SWEEP 0x00004000
+#define WT_CONN_WAS_BACKUP 0x00008000
#define WT_EVICTING 0x00000001
#define WT_EVICT_LOOKASIDE 0x00000002
#define WT_EVICT_UPDATE_RESTORE 0x00000004
diff --git a/src/include/hardware.h b/src/include/hardware.h
index 32353072c5b..1ab2c3d39c4 100644
--- a/src/include/hardware.h
+++ b/src/include/hardware.h
@@ -37,29 +37,6 @@
&(p)->flags_atomic, __orig, __orig | (uint8_t)(mask))); \
} while (0)
-#define F_CAS_ATOMIC(p, mask, ret) do { \
- uint8_t __orig; \
- ret = 0; \
- do { \
- __orig = (p)->flags_atomic; \
- if ((__orig & (uint8_t)(mask)) != 0) { \
- ret = EBUSY; \
- break; \
- } \
- } while (!__wt_atomic_cas8( \
- &(p)->flags_atomic, __orig, __orig | (uint8_t)(mask))); \
-} while (0)
-
-#define F_CAS_ATOMIC_WAIT(p, mask) do { \
- int __ret; \
- for (;;) { \
- F_CAS_ATOMIC(p, mask, __ret); \
- if (__ret == 0) \
- break; \
- __wt_yield(); \
- } \
-} while (0)
-
#define F_CLR_ATOMIC(p, mask) do { \
uint8_t __orig; \
do { \
diff --git a/src/include/mutex.h b/src/include/mutex.h
index 1f1bb8f4b5c..b67e5e610e8 100644
--- a/src/include/mutex.h
+++ b/src/include/mutex.h
@@ -52,6 +52,24 @@ struct __wt_rwlock {
};
/*
+ * A lightweight lock that can be used to replace spinlocks if fairness is
+ * necessary. Implements a ticket-based backoff spin lock.
+ * The fields are available as a union to allow for atomically setting
+ * the state of the entire lock.
+ */
+struct __wt_fair_lock {
+ union {
+ uint32_t lock;
+ struct {
+ uint16_t owner; /* Ticket for current owner */
+ uint16_t waiter; /* Last allocated ticket */
+ } s;
+ } u;
+#define fair_lock_owner u.s.owner
+#define fair_lock_waiter u.s.waiter
+};
+
+/*
* Spin locks:
*
* WiredTiger uses spinlocks for fast mutual exclusion (where operations done
diff --git a/src/include/mutex.i b/src/include/mutex.i
index 5ea4583a2ab..54a9cc6f9fd 100644
--- a/src/include/mutex.i
+++ b/src/include/mutex.i
@@ -251,3 +251,91 @@ __wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
#error Unknown spinlock type
#endif
+
+/*
+ * __wt_fair_trylock --
+ * Try to get a lock - give up if it is not immediately available.
+ */
+static inline int
+__wt_fair_trylock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock)
+{
+ WT_FAIR_LOCK new, old;
+
+ WT_UNUSED(session);
+
+ old = new = *lock;
+
+ /* Exit early if a ticket is outstanding (waiter and owner differ). */
+ if (old.fair_lock_waiter != old.fair_lock_owner)
+ return (EBUSY);
+
+ /* The replacement lock value is a result of allocating a new ticket. */
+ ++new.fair_lock_waiter;
+ return (__wt_atomic_cas32(
+ &lock->u.lock, old.u.lock, new.u.lock) ? 0 : EBUSY);
+}
+
+/*
+ * __wt_fair_lock --
+ * Get a lock: take a ticket and wait until that ticket is the owner.
+ */
+static inline int
+__wt_fair_lock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock)
+{
+ uint16_t ticket;
+ int pause_cnt;
+
+ WT_UNUSED(session);
+
+ /*
+ * Possibly wrap: if we have more than 64K lockers waiting, the ticket
+ * value will wrap and two lockers will simultaneously be granted the
+ * lock.
+ */
+ ticket = __wt_atomic_fetch_add16(&lock->fair_lock_waiter, 1);
+ for (pause_cnt = 0; ticket != lock->fair_lock_owner;) {
+ /*
+ * We failed to get the lock; pause before retrying and if we've
+ * paused enough, sleep so we don't burn CPU to no purpose. This
+ * situation happens if there are more threads than cores in the
+ * system and we're thrashing on shared resources.
+ */
+ if (++pause_cnt < 1000)
+ WT_PAUSE();
+ else
+ __wt_sleep(0, 10);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_fair_unlock --
+ * Release a lock by advancing the owner ticket.
+ */
+static inline int
+__wt_fair_unlock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock)
+{
+ WT_UNUSED(session);
+
+ /*
+ * We have exclusive access - the update does not need to be atomic.
+ */
+ ++lock->fair_lock_owner;
+
+ return (0);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_fair_islocked --
+ * Test whether the lock is currently held (diagnostic builds only).
+ */
+static inline bool
+__wt_fair_islocked(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock)
+{
+ WT_UNUSED(session);
+
+ return (lock->fair_lock_waiter != lock->fair_lock_owner);
+}
+#endif
diff --git a/src/include/serial.i b/src/include/serial.i
index 5358b874c06..ca22ce12d81 100644
--- a/src/include/serial.i
+++ b/src/include/serial.i
@@ -316,12 +316,11 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
}
/* If we can't lock it, don't scan, that's okay. */
- F_CAS_ATOMIC(page, WT_PAGE_RECONCILIATION, ret);
- if (ret != 0)
+ if (__wt_fair_trylock(session, &page->page_lock) != 0)
return (0);
obsolete = __wt_update_obsolete_check(session, page, upd->next);
- F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION);
+ WT_RET(__wt_fair_unlock(session, &page->page_lock));
if (obsolete != NULL)
__wt_update_obsolete_free(session, page, obsolete);
diff --git a/src/include/stat.h b/src/include/stat.h
index 3f7d8985a84..1ebe253e5db 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -338,6 +338,7 @@ struct __wt_connection_stats {
int64_t log_sync_dir;
int64_t log_write_lsn;
int64_t log_writes;
+ int64_t log_zero_fills;
int64_t lsm_checkpoint_throttle;
int64_t lsm_merge_throttle;
int64_t lsm_rows_merged;
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index 9078a0e2e99..b7ebb8fbc14 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -1750,6 +1750,33 @@ struct __wt_connection {
* seconds at which to check for files that are inactive and close
* them., an integer between 1 and 100000; default \c 10.}
* @config{ ),,}
+ * @config{log = (, enable logging. Enabling logging uses three
+ * sessions from the configured session_max., a set of related
+ * configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;archive, automatically archive
+ * unneeded log files., a boolean flag; default \c true.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;compressor, configure a compressor
+ * for log records. Permitted values are \c "none" or custom
+ * compression engine name created with WT_CONNECTION::add_compressor.
+ * If WiredTiger has builtin support for \c "bzip2"\, \c "snappy"\, \c
+ * "lz4" or \c "zlib" compression\, these names are also available. See
+ * @ref compression for more information., a string; default \c none.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;enabled, enable logging subsystem., a
+ * boolean flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;file_max, the maximum size of log
+ * files., an integer between 100KB and 2GB; default \c 100MB.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;path, the path to a directory into
+ * which the log files are written. If the value is not an absolute
+ * path name\, the files are created relative to the database home., a
+ * string; default empty.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;prealloc,
+ * pre-allocate log files., a boolean flag; default \c true.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;recover, run recovery or error if
+ * recovery needs to run after an unclean shutdown., a string\, chosen
+ * from the following options: \c "error"\, \c "on"; default \c on.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;zero_fill, manually write zeroes into
+ * log files., a boolean flag; default \c false.}
+ * @config{ ),,}
* @config{lsm_manager = (, configure database wide options for LSM tree
* management. The LSM manager is started automatically the first time
* an LSM tree is opened. The LSM manager uses a session from the
@@ -2212,6 +2239,8 @@ struct __wt_connection {
* @config{&nbsp;&nbsp;&nbsp;&nbsp;recover, run recovery
* or error if recovery needs to run after an unclean shutdown., a string\,
* chosen from the following options: \c "error"\, \c "on"; default \c on.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;zero_fill, manually write zeroes into log
+ * files., a boolean flag; default \c false.}
* @config{ ),,}
* @config{lsm_manager = (, configure database wide options for LSM tree
* management. The LSM manager is started automatically the first time an LSM
@@ -3793,90 +3822,92 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_LOG_WRITE_LSN 1109
/*! log: log write operations */
#define WT_STAT_CONN_LOG_WRITES 1110
+/*! log: log files manually zero-filled */
+#define WT_STAT_CONN_LOG_ZERO_FILLS 1111
/*! LSM: sleep for LSM checkpoint throttle */
-#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1111
+#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1112
/*! LSM: sleep for LSM merge throttle */
-#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1112
+#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1113
/*! LSM: rows merged in an LSM tree */
-#define WT_STAT_CONN_LSM_ROWS_MERGED 1113
+#define WT_STAT_CONN_LSM_ROWS_MERGED 1114
/*! LSM: application work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1114
+#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1115
/*! LSM: merge work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1115
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1116
/*! LSM: tree queue hit maximum */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1116
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1117
/*! LSM: switch work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1117
+#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1118
/*! LSM: tree maintenance operations scheduled */
-#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1118
+#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1119
/*! LSM: tree maintenance operations discarded */
-#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1119
+#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1120
/*! LSM: tree maintenance operations executed */
-#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1120
+#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1121
/*! connection: memory allocations */
-#define WT_STAT_CONN_MEMORY_ALLOCATION 1121
+#define WT_STAT_CONN_MEMORY_ALLOCATION 1122
/*! connection: memory frees */
-#define WT_STAT_CONN_MEMORY_FREE 1122
+#define WT_STAT_CONN_MEMORY_FREE 1123
/*! connection: memory re-allocations */
-#define WT_STAT_CONN_MEMORY_GROW 1123
+#define WT_STAT_CONN_MEMORY_GROW 1124
/*! thread-yield: page acquire busy blocked */
-#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1124
+#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1125
/*! thread-yield: page acquire eviction blocked */
-#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1125
+#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1126
/*! thread-yield: page acquire locked blocked */
-#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1126
+#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1127
/*! thread-yield: page acquire read blocked */
-#define WT_STAT_CONN_PAGE_READ_BLOCKED 1127
+#define WT_STAT_CONN_PAGE_READ_BLOCKED 1128
/*! thread-yield: page acquire time sleeping (usecs) */
-#define WT_STAT_CONN_PAGE_SLEEP 1128
+#define WT_STAT_CONN_PAGE_SLEEP 1129
/*! connection: total read I/Os */
-#define WT_STAT_CONN_READ_IO 1129
+#define WT_STAT_CONN_READ_IO 1130
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_CONN_REC_PAGES 1130
+#define WT_STAT_CONN_REC_PAGES 1131
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_CONN_REC_PAGES_EVICTION 1131
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1132
/*! reconciliation: split bytes currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1132
+#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1133
/*! reconciliation: split objects currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1133
+#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1134
/*! connection: pthread mutex shared lock read-lock calls */
-#define WT_STAT_CONN_RWLOCK_READ 1134
+#define WT_STAT_CONN_RWLOCK_READ 1135
/*! connection: pthread mutex shared lock write-lock calls */
-#define WT_STAT_CONN_RWLOCK_WRITE 1135
+#define WT_STAT_CONN_RWLOCK_WRITE 1136
/*! session: open cursor count */
-#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1136
+#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1137
/*! session: open session count */
-#define WT_STAT_CONN_SESSION_OPEN 1137
+#define WT_STAT_CONN_SESSION_OPEN 1138
/*! transaction: transaction begins */
-#define WT_STAT_CONN_TXN_BEGIN 1138
+#define WT_STAT_CONN_TXN_BEGIN 1139
/*! transaction: transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1139
+#define WT_STAT_CONN_TXN_CHECKPOINT 1140
/*! transaction: transaction checkpoint generation */
-#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1140
+#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1141
/*! transaction: transaction checkpoint currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1141
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1142
/*! transaction: transaction checkpoint max time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1142
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1143
/*! transaction: transaction checkpoint min time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1143
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1144
/*! transaction: transaction checkpoint most recent time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1144
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1145
/*! transaction: transaction checkpoint total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1145
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1146
/*! transaction: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1146
+#define WT_STAT_CONN_TXN_COMMIT 1147
/*! transaction: transaction failures due to cache overflow */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1147
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1148
/*! transaction: transaction range of IDs currently pinned by a checkpoint */
-#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1148
+#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1149
/*! transaction: transaction range of IDs currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_RANGE 1149
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1150
/*! transaction: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1150
+#define WT_STAT_CONN_TXN_ROLLBACK 1151
/*! transaction: transaction sync calls */
-#define WT_STAT_CONN_TXN_SYNC 1151
+#define WT_STAT_CONN_TXN_SYNC 1152
/*! connection: total write I/Os */
-#define WT_STAT_CONN_WRITE_IO 1152
+#define WT_STAT_CONN_WRITE_IO 1153
/*!
* @}
diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h
index 4d46a25b63c..3f4e0ada7f1 100644
--- a/src/include/wt_internal.h
+++ b/src/include/wt_internal.h
@@ -164,6 +164,8 @@ struct __wt_ext;
typedef struct __wt_ext WT_EXT;
struct __wt_extlist;
typedef struct __wt_extlist WT_EXTLIST;
+struct __wt_fair_lock;
+ typedef struct __wt_fair_lock WT_FAIR_LOCK;
struct __wt_fh;
typedef struct __wt_fh WT_FH;
struct __wt_hazard;
diff --git a/src/log/log.c b/src/log/log.c
index ca0b81c4cf6..efe4d22eeca 100644
--- a/src/log/log.c
+++ b/src/log/log.c
@@ -357,6 +357,67 @@ __wt_log_extract_lognum(
}
/*
+ * __log_zero --
+ * Zero a log file.
+ */
+static int
+__log_zero(WT_SESSION_IMPL *session,
+ WT_FH *fh, wt_off_t start_off, wt_off_t len)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(zerobuf);
+ WT_DECL_RET;
+ WT_LOG *log;
+ uint32_t allocsize, bufsz, off, partial, wrlen;
+
+ conn = S2C(session);
+ log = conn->log;
+ allocsize = log->allocsize;
+ zerobuf = NULL;
+ if (allocsize < WT_MEGABYTE)
+ bufsz = WT_MEGABYTE;
+ else
+ bufsz = allocsize;
+ /*
+ * If they're using smaller log files, cap it at the file size.
+ */
+ if (conn->log_file_max < bufsz)
+ bufsz = (uint32_t)conn->log_file_max;
+ WT_RET(__wt_scr_alloc(session, bufsz, &zerobuf));
+ memset(zerobuf->mem, 0, zerobuf->memsize);
+ WT_STAT_FAST_CONN_INCR(session, log_zero_fills);
+
+ /*
+ * Write zeroes in chunks of at most bufsz bytes, beginning at start_off
+ * and stopping once the offset reaches len. Note len is compared with
+ * the file offset, so it acts as an end offset rather than a count.
+ */
+ off = (uint32_t)start_off;
+ while (off < (uint32_t)len) {
+ /*
+ * Typically we start to zero the file after the log header
+ * and the bufsz is a sector-aligned size. So we want to
+ * align our writes when we can.
+ */
+ partial = off % bufsz;
+ if (partial != 0)
+ wrlen = bufsz - partial;
+ else
+ wrlen = bufsz;
+ /*
+ * Check if we're writing a partial amount at the end too.
+ */
+ if ((uint32_t)len - off < bufsz)
+ wrlen = (uint32_t)len - off;
+ WT_ERR(__wt_write(session,
+ fh, (wt_off_t)off, wrlen, zerobuf->mem));
+ off += wrlen;
+ }
+err: __wt_scr_free(session, &zerobuf);
+ return (ret);
+}
+
+/*
* __log_prealloc --
* Pre-allocate a log file.
*/
@@ -370,7 +431,15 @@ __log_prealloc(WT_SESSION_IMPL *session, WT_FH *fh)
conn = S2C(session);
log = conn->log;
ret = 0;
- if (fh->fallocate_available == WT_FALLOCATE_NOT_AVAILABLE ||
+ /*
+ * If the user configured zero filling, pre-allocate the log file
+ * manually. Otherwise use either fallocate or ftruncate to create
+ * and zero the log file based on what is available.
+ */
+ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ZERO_FILL))
+ ret = __log_zero(session, fh,
+ WT_LOG_FIRST_RECORD, conn->log_file_max);
+ else if (fh->fallocate_available == WT_FALLOCATE_NOT_AVAILABLE ||
(ret = __wt_fallocate(session, fh,
WT_LOG_FIRST_RECORD, conn->log_file_max)) == ENOTSUP)
ret = __wt_ftruncate(session, fh,
@@ -753,7 +822,7 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created)
if (create_log) {
log->prep_missed++;
WT_RET(__wt_log_allocfile(
- session, log->fileid, WT_LOG_FILENAME, true));
+ session, log->fileid, WT_LOG_FILENAME));
}
WT_RET(__log_openfile(session,
false, &log->log_fh, WT_LOG_FILENAME, log->fileid));
@@ -904,7 +973,7 @@ err: WT_TRET(__wt_close(session, &log_fh));
*/
int
__wt_log_allocfile(
- WT_SESSION_IMPL *session, uint32_t lognum, const char *dest, bool prealloc)
+ WT_SESSION_IMPL *session, uint32_t lognum, const char *dest)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_ITEM(from_path);
@@ -936,8 +1005,7 @@ __wt_log_allocfile(
WT_ERR(__log_openfile(session, true, &log_fh, WT_LOG_TMPNAME, tmp_id));
WT_ERR(__log_file_header(session, log_fh, NULL, true));
WT_ERR(__wt_ftruncate(session, log_fh, WT_LOG_FIRST_RECORD));
- if (prealloc)
- WT_ERR(__log_prealloc(session, log_fh));
+ WT_ERR(__log_prealloc(session, log_fh));
WT_ERR(__wt_fsync(session, log_fh));
WT_ERR(__wt_close(session, &log_fh));
WT_ERR(__wt_verbose(session, WT_VERB_LOG,
diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c
index bd3adb3a528..1c5124c32af 100644
--- a/src/lsm/lsm_manager.c
+++ b/src/lsm/lsm_manager.c
@@ -203,12 +203,14 @@ __wt_lsm_manager_reconfig(WT_SESSION_IMPL *session, const char **cfg)
int
__wt_lsm_manager_start(WT_SESSION_IMPL *session)
{
+ WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_LSM_MANAGER *manager;
WT_SESSION_IMPL *worker_session;
uint32_t i;
- manager = &S2C(session)->lsm_manager;
+ conn = S2C(session);
+ manager = &conn->lsm_manager;
/*
* We need at least a manager, a switch thread and a generic
@@ -225,7 +227,7 @@ __wt_lsm_manager_start(WT_SESSION_IMPL *session)
*/
for (i = 0; i < WT_LSM_MAX_WORKERS; i++) {
WT_ERR(__wt_open_internal_session(
- S2C(session), "lsm-worker", true, false, &worker_session));
+ conn, "lsm-worker", false, 0, &worker_session));
worker_session->isolation = WT_ISO_READ_UNCOMMITTED;
manager->lsm_worker_cookies[i].session = worker_session;
}
@@ -234,7 +236,7 @@ __wt_lsm_manager_start(WT_SESSION_IMPL *session)
WT_ERR(__wt_thread_create(session, &manager->lsm_worker_cookies[0].tid,
__lsm_worker_manager, &manager->lsm_worker_cookies[0]));
- F_SET(S2C(session), WT_CONN_SERVER_LSM);
+ F_SET(conn, WT_CONN_SERVER_LSM);
if (0) {
err: for (i = 0;
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index 82264f7c58f..40917bebf56 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -44,7 +44,6 @@ typedef struct {
* Track maximum transaction ID seen and first unwritten transaction ID.
*/
uint64_t max_txn;
- uint64_t first_dirty_txn;
/*
* When we can't mark the page clean (for example, checkpoint found some
@@ -292,7 +291,7 @@ typedef struct {
} WT_RECONCILE;
static void __rec_bnd_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *, bool);
-static void __rec_cell_build_addr(
+static void __rec_cell_build_addr(WT_SESSION_IMPL *,
WT_RECONCILE *, const void *, size_t, u_int, uint64_t);
static int __rec_cell_build_int_key(WT_SESSION_IMPL *,
WT_RECONCILE *, const void *, size_t, bool *);
@@ -394,7 +393,7 @@ __wt_reconcile(WT_SESSION_IMPL *session,
* In-memory splits: reconciliation of an internal page cannot handle
* a child page splitting during the reconciliation.
*/
- F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION);
+ WT_RET(__wt_fair_lock(session, &page->page_lock));
/* Reconcile the page. */
switch (page->type) {
@@ -432,7 +431,7 @@ __wt_reconcile(WT_SESSION_IMPL *session,
WT_TRET(__rec_write_wrapup_err(session, r, page));
/* Release the reconciliation lock. */
- F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION);
+ WT_TRET(__wt_fair_unlock(session, &page->page_lock));
/* Update statistics. */
WT_STAT_FAST_CONN_INCR(session, rec_pages);
@@ -538,11 +537,6 @@ __rec_write_status(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
*/
if (r->leave_dirty) {
/*
- * Update the page's first unwritten transaction ID.
- */
- mod->first_dirty_txn = r->first_dirty_txn;
-
- /*
* The page remains dirty.
*
* Any checkpoint call cleared the tree's modified flag before
@@ -880,12 +874,6 @@ __rec_write_init(WT_SESSION_IMPL *session,
r->cache_write_lookaside = r->cache_write_restore = false;
- /*
- * Running transactions may update the page after we write it, so
- * this is the highest ID we can be confident we will see.
- */
- r->first_dirty_txn = conn->txn_global.last_running;
-
return (0);
}
@@ -1083,17 +1071,11 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
if ((txnid = upd->txnid) == WT_TXN_ABORTED)
continue;
- /*
- * Track the largest/smallest transaction IDs on the list and
- * the smallest not-globally-visible transaction on the page.
- */
+ /* Track the largest/smallest transaction IDs on the list. */
if (WT_TXNID_LT(max_txn, txnid))
max_txn = txnid;
if (WT_TXNID_LT(txnid, min_txn))
min_txn = txnid;
- if (WT_TXNID_LT(txnid, r->first_dirty_txn) &&
- !__wt_txn_visible_all(session, txnid))
- r->first_dirty_txn = txnid;
/*
* Find the first update we can use.
@@ -3837,7 +3819,8 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
val->cell_len = 0;
val->len = val->buf.size;
} else
- __rec_cell_build_addr(r, addr->addr, addr->size,
+ __rec_cell_build_addr(session, r,
+ addr->addr, addr->size,
__rec_vtype(addr), ref->key.recno);
WT_CHILD_RELEASE_ERR(session, hazard, ref);
@@ -3883,7 +3866,7 @@ __rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
/* Build the value cell. */
addr = &multi->addr;
- __rec_cell_build_addr(r,
+ __rec_cell_build_addr(session, r,
addr->addr, addr->size, __rec_vtype(addr), r->recno);
/* Boundary: split or write the page. */
@@ -4708,7 +4691,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
vtype = state == WT_CHILD_PROXY ?
WT_CELL_ADDR_DEL : (u_int)vpack->raw;
}
- __rec_cell_build_addr(r, p, size, vtype, WT_RECNO_OOB);
+ __rec_cell_build_addr(session, r, p, size, vtype, WT_RECNO_OOB);
WT_CHILD_RELEASE_ERR(session, hazard, ref);
/*
@@ -4794,8 +4777,8 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
r->cell_zero = false;
addr = &multi->addr;
- __rec_cell_build_addr(
- r, addr->addr, addr->size, __rec_vtype(addr), WT_RECNO_OOB);
+ __rec_cell_build_addr(session, r,
+ addr->addr, addr->size, __rec_vtype(addr), WT_RECNO_OOB);
/* Boundary: split or write the page. */
if (key->len + val->len > r->space_avail)
@@ -5863,13 +5846,15 @@ __rec_cell_build_leaf_key(WT_SESSION_IMPL *session,
* on the page.
*/
static void
-__rec_cell_build_addr(WT_RECONCILE *r,
+__rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r,
const void *addr, size_t size, u_int cell_type, uint64_t recno)
{
WT_KV *val;
val = &r->v;
+ WT_ASSERT(session, size != 0 || cell_type == WT_CELL_ADDR_DEL);
+
/*
* We don't check the address size because we can't store an address on
* an overflow page: if the address won't fit, the overflow page's
diff --git a/src/schema/schema_open.c b/src/schema/schema_open.c
index 42b578946f4..a86cff4d723 100644
--- a/src/schema/schema_open.c
+++ b/src/schema/schema_open.c
@@ -571,7 +571,7 @@ __wt_schema_get_index(WT_SESSION_IMPL *session,
/* Try to find the index in the table. */
for (i = 0; i < table->nindices; i++) {
idx = table->indices[i];
- if (strcmp(idx->name, uri) == 0) {
+ if (idx != NULL && strcmp(idx->name, uri) == 0) {
if (tablep != NULL)
*tablep = table;
else
diff --git a/src/session/session_api.c b/src/session/session_api.c
index 1bb519e80e0..a766829afad 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -1144,47 +1144,12 @@ __session_strerror(WT_SESSION *wt_session, int error)
}
/*
- * __wt_open_internal_session --
- * Allocate a session for WiredTiger's use.
- */
-int
-__wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name,
- bool uses_dhandles, bool open_metadata, WT_SESSION_IMPL **sessionp)
-{
- WT_SESSION_IMPL *session;
-
- *sessionp = NULL;
-
- WT_RET(__wt_open_session(conn, NULL, NULL, open_metadata, &session));
- session->name = name;
-
- /*
- * Public sessions are automatically closed during WT_CONNECTION->close.
- * If the session handles for internal threads were to go on the public
- * list, there would be complex ordering issues during close. Set a
- * flag to avoid this: internal sessions are not closed automatically.
- */
- F_SET(session, WT_SESSION_INTERNAL);
-
- /*
- * Some internal threads must keep running after we close all data
- * handles. Make sure these threads don't open their own handles.
- */
- if (!uses_dhandles)
- F_SET(session, WT_SESSION_NO_DATA_HANDLES);
-
- *sessionp = session;
- return (0);
-}
-
-/*
- * __wt_open_session --
- * Allocate a session handle. The internal parameter is used for sessions
- * opened by WiredTiger for its own use.
+ * __open_session --
+ * Allocate a session handle.
*/
-int
-__wt_open_session(WT_CONNECTION_IMPL *conn,
- WT_EVENT_HANDLER *event_handler, const char *config, bool open_metadata,
+static int
+__open_session(WT_CONNECTION_IMPL *conn,
+ WT_EVENT_HANDLER *event_handler, const char *config,
WT_SESSION_IMPL **sessionp)
{
static const WT_SESSION stds = {
@@ -1324,7 +1289,26 @@ __wt_open_session(WT_CONNECTION_IMPL *conn,
WT_STAT_FAST_CONN_INCR(session, session_open);
err: __wt_spin_unlock(session, &conn->api_lock);
- WT_RET(ret);
+ return (ret);
+}
+
+/*
+ * __wt_open_session --
+ * Allocate a session handle.
+ */
+int
+__wt_open_session(WT_CONNECTION_IMPL *conn,
+ WT_EVENT_HANDLER *event_handler, const char *config,
+ bool open_metadata, WT_SESSION_IMPL **sessionp)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_SESSION *wt_session;
+
+ *sessionp = NULL;
+
+ /* Acquire a session. */
+ WT_RET(__open_session(conn, event_handler, config, &session));
/*
* Acquiring the metadata handle requires the schema lock; we've seen
@@ -1336,8 +1320,59 @@ err: __wt_spin_unlock(session, &conn->api_lock);
*/
if (open_metadata) {
WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SCHEMA));
- WT_RET(__wt_metadata_open(session_ret));
+ if ((ret = __wt_metadata_open(session)) != 0) {
+ wt_session = &session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ return (ret);
+ }
}
+ *sessionp = session;
+ return (0);
+}
+
+/*
+ * __wt_open_internal_session --
+ * Allocate a session for WiredTiger's use.
+ */
+int
+__wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name,
+ bool open_metadata, uint32_t session_flags, WT_SESSION_IMPL **sessionp)
+{
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ WT_SESSION_IMPL *session;
+
+ *sessionp = NULL;
+
+ /* Acquire a session. */
+ WT_RET(__wt_open_session(conn, NULL, NULL, open_metadata, &session));
+ session->name = name;
+
+ /*
+ * Public sessions are automatically closed during WT_CONNECTION->close.
+ * If the session handles for internal threads were to go on the public
+ * list, there would be complex ordering issues during close. Set a
+ * flag to avoid this: internal sessions are not closed automatically.
+ */
+ F_SET(session, session_flags | WT_SESSION_INTERNAL);
+
+ /*
+ * Acquiring the lookaside table cursor requires various locks; we've
+ * seen problems in the past where deadlocks happened because sessions
+ * deadlocked getting the cursor late in the process. Be defensive,
+ * get it now.
+ */
+ if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) {
+ WT_WITHOUT_DHANDLE(session, ret =
+ __wt_las_cursor_create(session, &session->las_cursor));
+ if (ret != 0) {
+ wt_session = &session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ return (ret);
+ }
+ }
+
+ *sessionp = session;
return (0);
}
diff --git a/src/support/stat.c b/src/support/stat.c
index 4e7f54937f4..9e817fad512 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -595,6 +595,7 @@ static const char * const __stats_connection_desc[] = {
"log: log sync_dir operations",
"log: log server thread advances write LSN",
"log: log write operations",
+ "log: log files manually zero-filled",
"LSM: sleep for LSM checkpoint throttle",
"LSM: sleep for LSM merge throttle",
"LSM: rows merged in an LSM tree",
@@ -760,6 +761,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->log_slot_unbuffered = 0;
stats->log_bytes_payload = 0;
stats->log_bytes_written = 0;
+ stats->log_zero_fills = 0;
stats->log_flush = 0;
stats->log_compress_writes = 0;
stats->log_compress_write_fails = 0;
@@ -944,6 +946,7 @@ __wt_stat_connection_aggregate(
to->log_slot_unbuffered += WT_STAT_READ(from, log_slot_unbuffered);
to->log_bytes_payload += WT_STAT_READ(from, log_bytes_payload);
to->log_bytes_written += WT_STAT_READ(from, log_bytes_written);
+ to->log_zero_fills += WT_STAT_READ(from, log_zero_fills);
to->log_flush += WT_STAT_READ(from, log_flush);
to->log_compress_writes += WT_STAT_READ(from, log_compress_writes);
to->log_compress_write_fails +=
diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c
index f2b181711d1..63d86969311 100644
--- a/src/txn/txn_recover.c
+++ b/src/txn/txn_recover.c
@@ -412,11 +412,12 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
WT_RECOVERY r;
struct WT_RECOVERY_FILE *metafile;
char *config;
- bool needs_rec, was_backup;
+ bool eviction_started, needs_rec, was_backup;
conn = S2C(session);
WT_CLEAR(r);
WT_INIT_LSN(&r.ckpt_lsn);
+ eviction_started = false;
was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP);
/* We need a real session for recovery. */
@@ -494,6 +495,15 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
*/
if (needs_rec && FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR))
WT_ERR(WT_RUN_RECOVERY);
+
+ /*
+ * Recovery can touch more data than fits in cache, so it relies on
+ * regular eviction to manage paging. Start eviction threads for
+ * recovery without LAS cursors.
+ */
+ WT_ERR(__wt_evict_create(session));
+ eviction_started = true;
+
/*
* Always run recovery even if it was a clean shutdown.
* We can consider skipping it in the future.
@@ -522,6 +532,18 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
done: FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE);
err: WT_TRET(__recovery_free(&r));
__wt_free(session, config);
+
+ if (ret != 0)
+ __wt_err(session, ret, "Recovery failed");
+
+ /*
+ * Destroy the eviction threads that were started in support of
+ * recovery. They will be restarted once the lookaside table is
+ * created.
+ */
+ if (eviction_started)
+ WT_TRET(__wt_evict_destroy(session));
+
WT_TRET(session->iface.close(&session->iface, NULL));
return (ret);
diff --git a/test/suite/test_bug015.py b/test/suite/test_bug015.py
new file mode 100644
index 00000000000..65b5b8e1755
--- /dev/null
+++ b/test/suite/test_bug015.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2015 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+from helper import copy_wiredtiger_home, key_populate, simple_populate
+
+# test_bug015.py
+# JIRA WT-2162: index drop in a certain order triggers NULL pointer deref (force-drop, then re-create, each of two indices)
+class test_bug015(wttest.WiredTigerTestCase):
+ def test_bug015(self):
+ table = 'table:test_bug015'
+ idx1 = 'index:test_bug015:aab'
+ idx2 = 'index:test_bug015:aaa'
+ self.session.create(table, "columns=(k,v)")
+ self.session.create(idx1, "columns=(v)")
+ self.session.create(idx2, "columns=(v)")
+ self.session.drop(idx1, "force=true")
+ self.session.create(idx1, "columns=(v)")
+ self.session.drop(idx2, "force=true")
+ self.session.create(idx2, "columns=(v)")
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/test/suite/test_reconfig.py b/test/suite/test_reconfig01.py
index b464895f155..2528f856a08 100644
--- a/test/suite/test_reconfig.py
+++ b/test/suite/test_reconfig01.py
@@ -30,9 +30,9 @@ import time
import wiredtiger, wttest
from helper import simple_populate
-# test_reconfig.py
+# test_reconfig01.py
# Smoke-test the connection reconfiguration operations.
-class test_reconfig(wttest.WiredTigerTestCase):
+class test_reconfig01(wttest.WiredTigerTestCase):
def test_reconfig_shared_cache(self):
self.conn.reconfigure("shared_cache=(name=pool,size=300M)")
diff --git a/test/suite/test_reconfig02.py b/test/suite/test_reconfig02.py
new file mode 100644
index 00000000000..e0981a887fb
--- /dev/null
+++ b/test/suite/test_reconfig02.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2015 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import fnmatch, os, time
+import wiredtiger, wttest
+from helper import simple_populate
+
+# test_reconfig02.py
+# Smoke-test reconfiguring the connection's logging options.
+class test_reconfig02(wttest.WiredTigerTestCase):
+ init_config = 'log=(archive=false,enabled,file_max=100K,prealloc=false,zero_fill=false)'
+ uri = "table:reconfig02"
+ entries = 1000
+
+ def setUpConnectionOpen(self, dir):
+ self.conn_config = self.init_config
+ return wttest.WiredTigerTestCase.setUpConnectionOpen(self, dir)
+
+ # Call reconfigure to turn on zero filling of log files. There is
+ # nothing we can actually look for to confirm it did anything.
+ # Also, changing the log file size is a no-op, but should not fail.
+ def test_reconfig02_simple(self):
+ self.conn.reconfigure("log=(zero_fill=true)")
+ self.conn.reconfigure("log=(file_max=1MB)")
+
+ # Test that turning logging off fails with an 'Invalid argument' error.
+ def test_reconfig02_disable(self):
+ msg = 'Invalid argument'
+ gotException = False
+ try:
+ self.conn.reconfigure("log=(enabled=false)")
+ except wiredtiger.WiredTigerError as e:
+ gotException = True
+ self.pr('got exception: ' + str(e))
+ self.assertTrue(str(e).find(msg) >= 0)
+ self.assertTrue(gotException)
+
+ # Logging starts on, but prealloc is off. Verify it is off.
+ # Reconfigure it on and run again, making sure that log files
+ # get pre-allocated (detected as files matching "*Prep*").
+ def test_reconfig02_prealloc(self):
+ # Create a table just to write something into the log. Sleep
+ # to give the worker thread a chance to run (fixed wait; timing-dependent).
+ self.session.create(self.uri, 'key_format=i,value_format=i')
+ time.sleep(2)
+ prep_logs = fnmatch.filter(os.listdir('.'), "*Prep*")
+ # Make sure no pre-allocated log files exist in the current directory.
+ self.assertEqual(0, len(prep_logs))
+
+ # Now turn on pre-allocation. Sleep to give the worker thread
+ # a chance to run and verify pre-allocated log files exist.
+ self.conn.reconfigure("log=(prealloc=true)")
+ time.sleep(2)
+ prep_logs = fnmatch.filter(os.listdir('.'), "*Prep*")
+ self.assertNotEqual(0, len(prep_logs))
+
+ # Logging starts on, but archive is off. Verify it is off.
+ # Reconfigure it on and run again, making sure that log files
+ # get archived (log files are those matching "*Log*").
+ def test_reconfig02_archive(self):
+ self.session.create(self.uri, 'key_format=i,value_format=i')
+ c = self.session.open_cursor(self.uri, None, None)
+ for i in range(self.entries):
+ c[i] = i + 1
+ c.close()
+ # Record the current log files, then close and reopen the connection
+ # to write a checkpoint, move to the next log file and verify that archive did not run.
+ orig_logs = fnmatch.filter(os.listdir('.'), "*Log*")
+ self.reopen_conn()
+ cur_logs = fnmatch.filter(os.listdir('.'), "*Log*")
+ for o in orig_logs:
+ self.assertEqual(True, o in cur_logs)
+
+ # Now turn on archive, sleep a bit to allow the archive thread
+ # to run and then confirm that all original logs are gone.
+ self.conn.reconfigure("log=(archive=true)")
+ time.sleep(2)
+ cur_logs = fnmatch.filter(os.listdir('.'), "*Log*")
+ for o in orig_logs:
+ self.assertEqual(False, o in cur_logs)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/test/suite/test_txn02.py b/test/suite/test_txn02.py
index 83c10f41244..17d0b97b50f 100644
--- a/test/suite/test_txn02.py
+++ b/test/suite/test_txn02.py
@@ -104,9 +104,18 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess):
# deterministic manner.
self.txn_sync = self.sync_list[
self.scenario_number % len(self.sync_list)]
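+ #
+ # Enable zero fill on every third scenario. Other settings, such as
+ # archive or sync, cycle through an even number of options, so an odd
+ # frequency keeps zero fill from always being paired with the same
+ # values of those settings.
+ #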
+ freq = 3
+ zerofill = 'false'
+ if self.scenario_number % freq == 0:
+ zerofill = 'true'
self.backup_dir = os.path.join(self.home, "WT_BACKUP")
conn_params = \
'log=(archive=false,enabled,file_max=%s),' % self.logmax + \
+ 'log=(zero_fill=%s),' % zerofill + \
'create,error_prefix="%s: ",' % self.shortid() + \
'transaction_sync="%s",' % self.txn_sync
# print "Creating conn at '%s' with config '%s'" % (dir, conn_params)